def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save an existing `corpus` to disk.

    Some formats also support saving the dictionary (`feature_id->word` mapping),
    which can in this case be provided by the optional `id2word` parameter.

    >>> MmCorpus.save_corpus('file.mm', corpus)

    Some corpora also support an index of where each document begins, so
    that the documents on disk can be accessed in O(1) time (see the
    `corpora.IndexedCorpus` base class). In this case, `save_corpus` is
    automatically called internally by `serialize`, which does `save_corpus`
    plus saves the index at the same time, so you want to store the corpus
    with::

    >>> MmCorpus.serialize('file.mm', corpus)  # stores index as well, allowing random access to individual documents

    Calling `serialize()` is preferred to calling `save_corpus()`.

    """
    raise NotImplementedError('cannot instantiate abstract base class')

    # example code:
    logger.info("converting corpus to ??? format: %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        for doc in corpus:  # iterate over the document stream
            fmt = str(doc)  # format the document appropriately...
            fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
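# A minimal sketch of what a concrete override of `save_corpus` might look like
# for a hypothetical one-document-per-line text format; the format and the
# function name are illustrative, not part of the library.
from gensim import utils

def save_corpus_plaintext(fname, corpus, id2word=None, metadata=False):
    with utils.smart_open(fname, 'wb') as fout:
        for doc in corpus:  # each doc is a sparse vector: [(termid, weight), ...]
            fmt = ' '.join("%d:%g" % (termid, weight) for termid, weight in doc)
            fout.write(utils.to_utf8("%s\n" % fmt))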
def docbyoffset(self, offset):
    """Return document at file offset `offset` (in bytes)"""
    # empty documents are not stored explicitly in MM format, so the index marks
    # them with a special offset, -1.
    if offset == -1:
        return []
    if isinstance(self.input, str):
        fin, close_fin = utils.smart_open(self.input), True
    else:
        fin, close_fin = self.input, False

    fin.seek(offset)  # works for gzip/bz2 input, too
    previd, document = -1, []
    for line in fin:
        docid, termid, val = line.split()
        if not self.transposed:
            termid, docid = docid, termid
        # -1 because matrix market indexes are 1-based => convert to 0-based
        docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
        assert previd <= docid, "matrix columns must come in ascending order"
        if docid != previd:
            if previd >= 0:
                break
            previd = docid

        document.append((termid, val,))  # add another field to the current document

    if close_fin:
        fin.close()
    return document
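# Hedged usage sketch: for a corpus stored with `MmCorpus.serialize` (so the
# byte-offset index exists on disk), `docbyoffset` gives O(1) random access.
# The `index` attribute comes from `corpora.IndexedCorpus`; 'file.mm' is an
# illustrative path.
from gensim.corpora import MmCorpus

mm = MmCorpus('file.mm')              # also loads 'file.mm.index' if present
doc = mm.docbyoffset(mm.index[3])     # document #3 as a list of (termid, weight)
# equivalently: doc = mm[3], via corpora.IndexedCorpus.__getitem__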
def load_from_text(fname):
    """
    Load a previously stored Dictionary from a text file.
    Mirror function to `save_as_text`.
    """
    result = Dictionary()
    with utils.smart_open(fname) as f:
        for lineno, line in enumerate(f):
            line = utils.to_unicode(line)
            if lineno == 0:
                if line.strip().isdigit():
                    # Older versions of save_as_text may not write num_docs on first line.
                    result.num_docs = int(line.strip())
                    continue
                else:
                    logging.warning("Text does not contain num_docs on the first line.")
            try:
                wordid, word, docfreq = line[:-1].split('\t')
            except Exception:
                raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
            wordid = int(wordid)
            if word in result.token2id:
                raise KeyError(
                    'token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
            result.token2id[word] = wordid
            result.dfs[wordid] = int(docfreq)
    return result
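# Hedged usage sketch: hand-write a tiny dictionary file in the layout produced
# by `save_as_text` below (a num_docs header, then id<TAB>token<TAB>docfreq
# lines), and load it back. The path and contents are illustrative.
from gensim import utils
from gensim.corpora import Dictionary

with utils.smart_open('/tmp/tiny.dict.txt', 'wb') as fout:
    fout.write(utils.to_utf8("2\n"))               # num_docs header
    fout.write(utils.to_utf8("0\tcomputer\t2\n"))  # id, token, document frequency
    fout.write(utils.to_utf8("1\thuman\t1\n"))

d = Dictionary.load_from_text('/tmp/tiny.dict.txt')
print(d.token2id)   # {'computer': 0, 'human': 1}
print(d.num_docs)   # 2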
def test_mz_keywords(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "head500.noblanks.cor")) as f:
        text = utils.to_unicode(f.read())
    text = u' '.join(text.split()[:10240])
    kwds = mz_keywords(text)
    self.assertTrue(kwds.startswith('autism'))
    self.assertTrue(kwds.endswith('uk'))
    self.assertTrue(len(kwds.splitlines()))

    kwds_lst = mz_keywords(text, split=True)
    self.assertTrue(len(kwds_lst))
    # Automatic thresholding selects words with n_blocks / (n_blocks + 1)
    # bits of entropy. For this text, n_blocks = 10, so the threshold is
    # 10/11, roughly 0.91 bits.
    n_blocks = 10.
    kwds_auto = mz_keywords(text, scores=True, weighted=False, threshold='auto')
    self.assertTrue(kwds_auto[-1][1] > (n_blocks / (n_blocks + 1.)))
def save_as_text(self, fname, sort_by_word=True):
    """
    Save this Dictionary to a text file, in format:
    `num_docs`
    `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word,
    or by decreasing word frequency.

    Note: text format should be used for corpus inspection. Use `save`/`load`
    to store in binary format (pickle) for improved performance.
    """
    logger.info("saving dictionary mapping to %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        numdocs_line = "%d\n" % self.num_docs
        fout.write(utils.to_utf8(numdocs_line))
        if sort_by_word:
            for token, tokenid in sorted(self.token2id.items()):
                line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                fout.write(utils.to_utf8(line))
        else:
            for tokenid, freq in sorted(self.dfs.items(), key=lambda item: -item[1]):
                line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                fout.write(utils.to_utf8(line))
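# Hedged usage sketch: build a Dictionary from tokenized documents and dump it
# in the text format above; `sort_by_word=False` orders entries by decreasing
# document frequency instead of alphabetically. Paths are illustrative.
from gensim.corpora import Dictionary

texts = [['human', 'computer', 'interface'], ['computer', 'survey']]
d = Dictionary(texts)
d.save_as_text('/tmp/example.dict.txt')                         # sorted by word
d.save_as_text('/tmp/example.by_freq.txt', sort_by_word=False)  # sorted by docfreq
restored = Dictionary.load_from_text('/tmp/example.dict.txt')
assert restored.token2id == d.token2id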
def read_file(path):
    with utils.smart_open(path) as fin:
        return fin.read()
def __init__(self, fname):
    self.fname = fname
    if fname.endswith(".gz") or fname.endswith('.bz2'):
        raise NotImplementedError("compressed output not supported with MmWriter")
    self.fout = utils.smart_open(self.fname, 'wb+')  # open for both reading and writing
    self.headers_written = False
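# Hedged usage sketch: MmWriter is normally driven indirectly through
# `MmCorpus.serialize`, which writes the Matrix Market file (plus its offset
# index) to an uncompressed path; per the check above, a '.gz' or '.bz2'
# target would raise NotImplementedError. Paths and data are illustrative.
from gensim.corpora import MmCorpus

bow_corpus = [[(0, 1.0), (1, 2.0)], [], [(1, 1.0)]]    # toy sparse documents
MmCorpus.serialize('/tmp/toy.mm', bow_corpus)           # OK: plain file
# MmCorpus.serialize('/tmp/toy.mm.gz', bow_corpus)      # would fail: compressed output unsupported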
def _get_text_from_test_data(self, file):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    with utils.smart_open(os.path.join(pre_path, file), mode="r") as f:
        return f.read()