Example #1
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save an existing `corpus` to disk.

        Some formats also support saving the dictionary (`feature_id->word` mapping),
        which can in this case be provided by the optional `id2word` parameter.

        >>> MmCorpus.save_corpus('file.mm', corpus)

        Some corpora also support an index of where each document begins, so
        that the documents on disk can be accessed in O(1) time (see the
        `corpora.IndexedCorpus` base class). In this case, `save_corpus` is called
        automatically by `serialize`, which saves the index alongside the corpus,
        so store the corpus with::

        >>> MmCorpus.serialize('file.mm', corpus) # stores index as well, allowing random access to individual documents

        Calling `serialize()` is preferred to calling `save_corpus()`.

        """
        raise NotImplementedError('cannot instantiate abstract base class')

        # example code:
        logger.info("converting corpus to ??? format: %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            for doc in corpus:  # iterate over the document stream
                fmt = str(doc)  # format the document appropriately...
                fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
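
To make the `save_corpus` / `serialize` split concrete, here is a minimal usage sketch. It assumes gensim's `MmCorpus` and a toy bag-of-words stream; the file paths are illustrative.

    from gensim.corpora import MmCorpus

    corpus = [[(0, 1.0), (1, 2.0)], [(1, 1.0)]]  # toy bag-of-words stream

    # save_corpus: writes only the matrix file
    MmCorpus.save_corpus('/tmp/file.mm', corpus)

    # serialize: writes the matrix file plus an offset index, enabling corpus[docno]
    MmCorpus.serialize('/tmp/file2.mm', corpus)
    mm = MmCorpus('/tmp/file2.mm')
    print(mm[1])  # random access through the stored index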
Example #2
    def docbyoffset(self, offset):
        """Return document at file offset `offset` (in bytes)"""
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        if offset == -1:
            return []
        if isinstance(self.input, str):
            fin, close_fin = utils.smart_open(self.input), True
        else:
            fin, close_fin = self.input, False

        fin.seek(offset)  # works for gzip/bz2 input, too
        previd, document = -1, []
        for line in fin:
            docid, termid, val = line.split()
            if not self.transposed:
                termid, docid = docid, termid
            # -1 because matrix market indexes are 1-based => convert to 0-based
            docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
            assert previd <= docid, "matrix columns must come in ascending order"
            if docid != previd:
                if previd >= 0:
                    break
                previd = docid

            document.append((termid, val))  # add another (termid, value) pair to the current document

        if close_fin:
            fin.close()
        return document
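
A hedged illustration of the offset machinery (attribute names follow gensim's `IndexedCorpus`; treat this as a sketch, not canonical API):

    from gensim.corpora import MmCorpus

    MmCorpus.serialize('/tmp/file2.mm', [[(0, 1.0)], [(1, 2.0)]])
    mm = MmCorpus('/tmp/file2.mm')

    # `serialize` stored a list of byte offsets; corpus[docno] is implemented
    # as docbyoffset(index[docno]) under the hood.
    offset = mm.index[1]           # byte position of document 1 in the .mm file
    print(mm.docbyoffset(offset))  # [(1, 2.0)] -- identical to mm[1]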
Example #3
    @staticmethod
    def load_from_text(fname):
        """
        Load a previously stored Dictionary from a text file.
        Mirror function to `save_as_text`.
        """
        result = Dictionary()
        with utils.smart_open(fname) as f:
            for lineno, line in enumerate(f):
                line = utils.to_unicode(line)
                if lineno == 0:
                    if line.strip().isdigit():
                        # Older versions of save_as_text may not write num_docs on the first line.
                        result.num_docs = int(line.strip())
                        continue
                    else:
                        logging.warning(
                            "Text does not contain num_docs on the first line."
                        )
                try:
                    wordid, word, docfreq = line[:-1].split('\t')
                except Exception:
                    raise ValueError("invalid line in dictionary file %s: %s" %
                                     (fname, line.strip()))
                wordid = int(wordid)
                if word in result.token2id:
                    raise KeyError(
                        'token %s is defined as ID %d and as ID %d' %
                        (word, wordid, result.token2id[word]))
                result.token2id[word] = wordid
                result.dfs[wordid] = int(docfreq)
        return result
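
For reference, a toy file that `load_from_text` can parse, mirroring the layout written by `save_as_text` (Example #6); fields are tab-separated, and the leading num_docs line may be absent in files written by older versions:

    # 2
    # 0[TAB]computer[TAB]2
    # 1[TAB]human[TAB]1
    # 2[TAB]interface[TAB]1
    #
    # First line: num_docs; each following line: token id, UTF-8 word, document frequency.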
Example #4
    def __init__(self, fname):
        self.fname = fname
        if fname.endswith(('.gz', '.bz2')):
            raise NotImplementedError("compressed output not supported with MmWriter")
        self.fout = utils.smart_open(self.fname, 'wb+')  # open for both reading and writing
        self.headers_written = False
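
The `'wb+'` mode matters because an MM writer cannot know the final matrix shape up front: it streams documents first, then seeks back to patch the header. A generic sketch of that placeholder-header pattern (illustrative, not `MmWriter`'s exact code):

    fout = open('/tmp/matrix.mm', 'wb+')
    fout.write(b'%%MatrixMarket matrix coordinate real general\n')
    header_offset = fout.tell()
    fout.write(b' ' * 50 + b'\n')  # reserve space for "num_docs num_terms num_nnz"
    # ... stream the entries here, counting docs/terms/non-zeros as you go ...
    fout.seek(header_offset)
    fout.write(b'100 200 5000')    # patch the real shape in place
    fout.close()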
Example #5
    def test_mz_keywords(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path,
                                           "head500.noblanks.cor")) as f:
            text = utils.to_unicode(f.read())
        text = u' '.join(text.split()[:10240])
        kwds = mz_keywords(text)
        self.assertTrue(kwds.startswith('autism'))
        self.assertTrue(kwds.endswith('uk'))
        self.assertTrue(len(kwds.splitlines()))

        kwds_lst = mz_keywords(text, split=True)
        self.assertTrue(len(kwds_lst))
        # Automatic thresholding selects words with more than
        # n_blocks / (n_blocks + 1) bits of entropy. For this text, n_blocks = 10.
        n_blocks = 10.
        kwds_auto = mz_keywords(text,
                                scores=True,
                                weighted=False,
                                threshold='auto')
        self.assertTrue(kwds_auto[-1][1] > (n_blocks / (n_blocks + 1.)))
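
For context, the call patterns exercised above (import path as in gensim's summarization package; the input file name is hypothetical):

    from gensim.summarization.mz_entropy import mz_keywords

    text = read_file('document.txt')  # any longish plain text (see Example #7)
    print(mz_keywords(text))                                    # newline-separated keywords
    print(mz_keywords(text, scores=True, split=True))           # [(word, score), ...]
    print(mz_keywords(text, weighted=False, threshold='auto'))  # entropy-based cutoff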
Example #6
    def save_as_text(self, fname, sort_by_word=True):
        """
        Save this Dictionary to a text file, in format:
        `num_docs`
        `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word,
        or by decreasing document frequency.

        Note: the text format should be used for corpus inspection only. Use
        `save`/`load` to store in binary format (pickle) for improved performance.
        """
        logger.info("saving dictionary mapping to %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            numdocs_line = "%d\n" % self.num_docs
            fout.write(utils.to_utf8(numdocs_line))
            if sort_by_word:
                for token, tokenid in sorted(self.token2id.items()):
                    line = "%i\t%s\t%i\n" % (tokenid, token,
                                             self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(self.dfs.items(),
                                            key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))
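
A round-trip check tying Examples #3 and #6 together (toy corpus; the exact ids depend on gensim's assignment order):

    from gensim.corpora import Dictionary

    d = Dictionary([['human', 'computer'], ['computer', 'interface']])
    d.save_as_text('/tmp/dict.txt')

    d2 = Dictionary.load_from_text('/tmp/dict.txt')
    assert d2.token2id == d.token2id and d2.num_docs == d.num_docs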
Example #7
def read_file(path):
    with utils.smart_open(path) as fin:
        return fin.read()
Example #8
    def _get_text_from_test_data(self, file):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        with utils.smart_open(os.path.join(pre_path, file), mode="r") as f:
            return f.read()