def save(self, fname, *args, **kwargs):
    """Save model to file.

    Parameters
    ----------
    fname : str
        Path to output file.

    """
    if os.path.exists(self._model_filename):
        # Vowpal Wabbit uses its own binary model file, read this into
        # variable before serialising this object - keeps all data
        # self contained within a single serialised file
        logger.debug("Reading model bytes from '%s'", self._model_filename)
        with utils.smart_open(self._model_filename, 'rb') as fhandle:
            self._model_data = fhandle.read()

    if os.path.exists(self._topics_filename):
        logger.debug("Reading topic bytes from '%s'", self._topics_filename)
        with utils.smart_open(self._topics_filename, 'rb') as fhandle:
            self._topics_data = fhandle.read()

    if 'ignore' not in kwargs:
        kwargs['ignore'] = frozenset(['_topics', 'tmp_dir'])
    super(LdaVowpalWabbit, self).save(fname, *args, **kwargs)
def load(cls, fname, *args, **kwargs):
    """Load model from `fname`.

    Parameters
    ----------
    fname : str
        Path to file with :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`.

    """
    lda_vw = super(LdaVowpalWabbit, cls).load(fname, *args, **kwargs)
    lda_vw._init_temp_dir(prefix=lda_vw.tmp_prefix)

    if lda_vw._model_data:
        # Vowpal Wabbit operates on its own binary model file - deserialise
        # to file at load time, making it immediately ready for use
        logger.debug("Writing model bytes to '%s'", lda_vw._model_filename)
        with utils.smart_open(lda_vw._model_filename, 'wb') as fhandle:
            fhandle.write(lda_vw._model_data)
        lda_vw._model_data = None  # no need to keep in memory after this

    if lda_vw._topics_data:
        logger.debug("Writing topic bytes to '%s'", lda_vw._topics_filename)
        with utils.smart_open(lda_vw._topics_filename, 'wb') as fhandle:
            fhandle.write(lda_vw._topics_data)
        lda_vw._topics_data = None

    return lda_vw
def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    `fname` is the file used to save the vectors in.
    `fvocab` is an optional file used to save the vocabulary.
    `binary` is an optional boolean indicating whether the data is to be saved
    in binary word2vec format (default: False).
    `total_vec` is an optional parameter to explicitly specify total no. of vectors
    (in case word vectors are appended with document vectors afterwards).

    """
    if total_vec is None:
        total_vec = len(self.vocab)
    vector_size = self.syn0.shape[1]
    if fvocab is not None:
        logger.info("storing vocabulary in %s", fvocab)
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
    logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
    assert (len(self.vocab), vector_size) == self.syn0.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
            row = self.syn0[vocab.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
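A minimal round-trip sketch for the method above; `kv` stands for any KeyedVectors-style object exposing it, and the paths are made up. `KeyedVectors.load_word2vec_format` is the standard gensim loader, though its exact signature varies between gensim versions.

# Hypothetical round-trip: save vectors in C word2vec format, then reload them.
# `kv` is assumed to be a trained KeyedVectors-like object.
from gensim.models import KeyedVectors

kv.save_word2vec_format('/tmp/vectors.bin', fvocab='/tmp/vocab.txt', binary=True)
reloaded = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', fvocab='/tmp/vocab.txt', binary=True)
print(reloaded.most_similar('king', topn=3))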
def get_texts(self):
    total_docs = 0
    if os.path.isdir(self.input):
        # Read two levels of files
        filenames = glob.glob('{}/*'.format(self.input))
        for filename in filenames:
            if os.path.isdir(filename):
                filenames += glob.glob('{}/*'.format(filename))
        for filename in filenames:
            if not os.path.isdir(filename):
                with utils.smart_open(filename) as f:
                    docId = filename
                    docContent = u' '.join(f.read().decode('utf-8', 'ignore').splitlines())
                    tokens = self.tokenRegex.findall(docContent)
                    tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
                    yield tokens
                    self.docIds.append(docId)
                    total_docs += 1
    else:
        with utils.smart_open(self.input) as f:
            for line in f:
                docId, docContent = line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
                tokens = self.tokenRegex.findall(docContent)
                tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
                yield tokens
                self.docIds.append(docId)
                total_docs += 1
    self.length = total_docs
def test_corpus_summarization(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
        text = f.read()

    # Generate the corpus.
    sentences = text.split("\n")
    tokens = [sentence.split() for sentence in sentences]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

    # Extract the most important documents.
    selected_documents = summarize_corpus(corpus)

    # They are compared to the method reference.
    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
        summary = f.read()
        summary = summary.split('\n')

    # Each sentence in the document selection has to be in the model summary.
    for doc_number, document in enumerate(selected_documents):
        # Retrieves all words from the document.
        words = [dictionary[token_id] for (token_id, count) in document]

        # Asserts that all of them are in a sentence from the model reference.
        self.assertTrue(any(all(word in sentence for word in words) for sentence in summary))
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save a corpus in the LDA-C format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    This function is automatically called by `BleiCorpus.serialize`; don't
    call it directly, call `serialize` instead.

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        num_terms = 1 + max([-1] + id2word.keys())

    logger.info("storing corpus in Blei's LDA-C format into %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            doc = list(doc)
            offsets.append(fout.tell())
            parts = ["%i:%s" % p for p in doc if abs(p[1]) > 1e-7]
            fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = fname + '.vocab'
    logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in xrange(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    return offsets
def testLineSentenceWorksWithNormalFile(self):
    """Does LineSentence work with a file object argument, rather than filename?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
            sentences = word2vec.LineSentence(fin)
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split())
def testPathLineSentences(self):
    """Does PathLineSentences work with a path argument?"""
    with utils.smart_open(os.path.join(datapath('PathLineSentences'), '1.txt')) as orig1,\
            utils.smart_open(os.path.join(datapath('PathLineSentences'), '2.txt.bz2')) as orig2:
        sentences = word2vec.PathLineSentences(datapath('PathLineSentences'))
        orig = orig1.readlines() + orig2.readlines()
        orig_counter = 0  # to go through orig while matching PathLineSentences
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig[orig_counter]).split())
            orig_counter += 1
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the LDA-C format.

    Notes
    -----
    There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, float)
        Input corpus in BoW format.
    id2word : dict of (int, str), optional
        Mapping id -> word for `corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Returns
    -------
    list of int
        Offsets for each line in file (in bytes).

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    elif id2word:
        num_terms = 1 + max(id2word)
    else:
        num_terms = 0

    logger.info("storing corpus in Blei's LDA-C format into %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            doc = list(doc)
            offsets.append(fout.tell())
            parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
            fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in range(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    return offsets
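As the docstring notes, `serialize` is the intended entry point rather than `save_corpus`; a small hedged example follows (toy corpus and output path invented).

# Hypothetical usage: serialize a tiny BoW corpus in Blei's LDA-C format.
from gensim.corpora import BleiCorpus

corpus = [[(0, 1.0), (1, 2.0)], [(1, 1.0), (2, 5.0)]]  # two BoW documents
BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)       # also writes the .vocab and offset index files
loaded = BleiCorpus('/tmp/corpus.lda-c')
print(list(loaded))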
def test_text_summarization(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
        text = f.read()

    # Makes a summary of the text.
    generated_summary = summarize(text)

    # To be compared to the method reference.
    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
        summary = f.read()

    self.assertEqual(generated_summary, summary)
def test_text_keywords_pos(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
        text = f.read()

    # calculate keywords using only certain parts of speech
    generated_keywords_nnvbjj = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True)

    # To be compared to the reference.
    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
        kw = f.read().strip().split("\n")

    self.assertEqual({str(x) for x in generated_keywords_nnvbjj}, {str(x) for x in kw})
def test_text_keywords(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
        text = f.read()

    # calculate keywords
    generated_keywords = keywords(text, split=True)

    # To be compared to the reference.
    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
        kw = f.read().strip().split("\n")

    self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw})
def __init__(self, input):
    """

    Parameters
    ----------
    input : str
        Path to file in UCI format.

    """
    logger.info('Initializing corpus reader from %s', input)

    self.input = input

    with utils.smart_open(self.input) as fin:
        self.num_docs = self.num_terms = self.num_nnz = 0
        try:
            self.num_docs = int(next(fin).strip())
            self.num_terms = int(next(fin).strip())
            self.num_nnz = int(next(fin).strip())
        except StopIteration:
            pass

    logger.info(
        "accepted corpus with %i documents, %i features, %i non-zero entries",
        self.num_docs, self.num_terms, self.num_nnz
    )
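For reference, a sketch of the UCI Bag-of-Words file layout this reader expects: the three header values parsed above, followed by `docID wordID count` triples (the numbers here are invented).

# Example docword file layout (invented numbers):
#   3          <- num_docs
#   10         <- num_terms
#   4          <- num_nnz
#   1 2 5      <- docID wordID count triples follow
#   1 7 1
#   2 3 2
#   3 10 1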
def __iter__(self):
    for source in sources:
        with utils.smart_open(source) as fin:
            title = ''
            conference = ''
            field = ''
            for line in fin.readlines():
                line = line.decode("utf-8")
                if line == '\r\n':
                    yield LabeledSentence(
                        preprocess_string(title),
                        tags=[conference, field])
                    title = ''
                    conference = ''
                    field = ''
                if line.startswith('#*'):
                    title = str(line).strip()[2:]
                if line.startswith('#c'):
                    conference = str(line).strip()[2:]
                if line.startswith('#f'):
                    field = str(line).strip()[2:]
def docbyoffset(self, offset):
    """Return document at file offset `offset` (in bytes)."""
    # empty documents are not stored explicitly in MM format, so the index marks
    # them with a special offset, -1.
    if offset == -1:
        return []
    if isinstance(self.input, string_types):
        fin = utils.smart_open(self.input)
    else:
        fin = self.input

    fin.seek(offset)  # works for gzip/bz2 input, too
    previd, document = -1, []
    for line in fin:
        docid, termid, val = line.split()
        if not self.transposed:
            termid, docid = docid, termid
        # -1 because matrix market indexes are 1-based => convert to 0-based
        docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
        assert previd <= docid, "matrix columns must come in ascending order"
        if docid != previd:
            if previd >= 0:
                return document
            previd = docid

        document.append((termid, val,))  # add another field to the current document
    return document
def __init__(self, fname, fname_vocab=None):
    """
    Parameters
    ----------
    fname : str
        Path to corpus in UCI format.
    fname_vocab : str, optional
        Path to vocab.

    Examples
    --------
    >>> from gensim.corpora import UciCorpus
    >>> from gensim.test.utils import datapath
    >>>
    >>> corpus = UciCorpus(datapath('testcorpus.uci'))
    >>> for document in corpus:
    ...     pass

    """
    IndexedCorpus.__init__(self, fname)
    UciReader.__init__(self, fname)

    if fname_vocab is None:
        fname_vocab = utils.smart_extension(fname, '.vocab')

    self.fname = fname
    with utils.smart_open(fname_vocab) as fin:
        words = [word.strip() for word in fin]
    self.id2word = dict(enumerate(words))

    self.transposed = True
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
    """
    Save a corpus in the UCI Bag-of-Words format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    This function is automatically called by `UciCorpus.serialize`; don't
    call it directly, call `serialize` instead.

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        num_terms = 1 + max([-1] + id2word.keys())

    # write out vocabulary
    fname_vocab = fname + '.vocab'
    logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in xrange(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname)

    return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
def testPathLineSentencesOneFile(self):
    """Does PathLineSentences work with a single file argument?"""
    test_file = os.path.join(datapath('PathLineSentences'), '1.txt')
    with utils.smart_open(test_file) as orig:
        sentences = word2vec.PathLineSentences(test_file)
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
def _load_vw_topics(self):
    """Read topics file generated by Vowpal Wabbit, convert to numpy array.

    Output consists of many header lines, followed by a number of lines of:
    <word_id> <topic_1_gamma> <topic_2_gamma> ...
    """
    topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)

    with utils.smart_open(self._topics_filename) as topics_file:
        found_data = False

        for line in topics_file:
            # look for start of data
            if not found_data:
                if line.startswith(b'0 ') and b':' not in line:
                    found_data = True
                else:
                    continue

            fields = line.split()
            word_id = int(fields[0])

            # output contains entries for 2**b terms, where b was set
            # by the '-b' option, ignore anything past num_terms
            if word_id >= self.num_terms:
                break

            topics[:, word_id] = fields[1:]

    # normalise to probability distribution
    self._topics = topics / topics.sum(axis=1, keepdims=True)
def test_get_offsets_and_start_doctags_win(self):
    # Each line takes 7 bytes (including '\n' character which is actually '\r\n' on Windows)
    lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n']
    tmpf = get_tmpfile('gensim_doc2vec.tst')

    with utils.smart_open(tmpf, 'wb', encoding='utf8') as fout:
        for line in lines:
            fout.write(utils.any2unicode(line))

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 1)
    self.assertEqual(offsets, [0])
    self.assertEqual(start_doctags, [0])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 2)
    self.assertEqual(offsets, [0, 14])
    self.assertEqual(start_doctags, [0, 2])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 3)
    self.assertEqual(offsets, [0, 7, 21])
    self.assertEqual(start_doctags, [0, 1, 3])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 4)
    self.assertEqual(offsets, [0, 7, 14, 21])
    self.assertEqual(start_doctags, [0, 1, 2, 3])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 5)
    self.assertEqual(offsets, [0, 7, 14, 21, 28])
    self.assertEqual(start_doctags, [0, 1, 2, 3, 4])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 6)
    self.assertEqual(offsets, [0, 0, 7, 14, 14, 21])
    self.assertEqual(start_doctags, [0, 0, 1, 2, 2, 3])
def docbyoffset(self, offset):
    """
    Return the document stored at file position `offset`.
    """
    with utils.smart_open(self.fname) as f:
        f.seek(offset)
        return self.line2doc(f.readline())
def __init__(self, fname, fname_vocab=None):
    """
    Initialize the corpus from a file.

    `fname_vocab` is the file with vocabulary; if not specified, it defaults to
    `fname.vocab`.
    """
    IndexedCorpus.__init__(self, fname)
    logger.info("loading corpus from %s" % fname)

    if fname_vocab is None:
        fname_base, _ = path.splitext(fname)
        fname_dir = path.dirname(fname)
        for fname_vocab in [
                    fname + '.vocab',
                    fname + '/vocab.txt',
                    fname_base + '.vocab',
                    fname_dir + '/vocab.txt',
                ]:
            if path.exists(fname_vocab):
                break
        else:
            raise IOError('BleiCorpus: could not find vocabulary file')

    self.fname = fname
    with utils.smart_open(fname_vocab) as fin:
        words = [utils.to_unicode(word).rstrip() for word in fin]
    self.id2word = dict(enumerate(words))
    self.length = None
def to_array(self):
    self.sentences = []
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                self.sentences.append(
                    TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
    return self.sentences
def docbyoffset(self, offset):
    """Get the document stored in file by `offset` position.

    Parameters
    ----------
    offset : int
        Offset (in bytes) to begin of document.

    Returns
    -------
    list of (int, int)
        Document in BoW format (+"document_id" and "lang" if metadata=True).

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import datapath
        >>> from gensim.corpora import MalletCorpus
        >>>
        >>> data = MalletCorpus(datapath("testcorpus.mallet"))
        >>> data.docbyoffset(1)  # end of first line
        [(3, 1), (4, 1)]
        >>> data.docbyoffset(4)  # start of second line
        [(4, 1)]

    """
    with utils.smart_open(self.fname) as f:
        f.seek(offset)
        return self.line2doc(f.readline())
def to_array(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                self.sentences.append(
                    LabeledSentence(words=utils.to_unicode(line).split(), tags=[prefix + '_%s' % str(item_no)]))
    return self.sentences
def convert_input(self, corpus, infer=False):
    """
    Serialize documents (lists of unicode tokens) to a temporary text file,
    then convert that text file to MALLET format `outfile`.

    """
    logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
    # write out the corpus in a file format that MALLET understands: one document per line:
    # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens
    with utils.smart_open(self.fcorpustxt(), "wb") as fout:
        for docno, doc in enumerate(corpus):
            if self.id2word:
                tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
            else:
                tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), [])
            fout.write(utils.to_utf8("%s 0 %s\n" % (docno, " ".join(tokens))))

    # convert the text file above into MALLET's internal format
    cmd = (
        self.mallet_path +
        " import-file --preserve-case --keep-sequence --remove-stopwords"
        " --token-regex '\\S+' --input %s --output %s"
    )
    if infer:
        cmd += " --use-pipe-from " + self.fcorpusmallet()
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + ".infer")
    else:
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
    logger.info("converting temporary corpus to MALLET format with %s" % cmd)
    call(cmd, shell=True)
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save an existing `corpus` to disk.

    Some formats also support saving the dictionary (`feature_id->word` mapping),
    which can in this case be provided by the optional `id2word` parameter.

    >>> MmCorpus.save_corpus('file.mm', corpus)

    Some corpora also support an index of where each document begins, so
    that the documents on disk can be accessed in O(1) time (see the
    `corpora.IndexedCorpus` base class). In this case, `save_corpus` is automatically
    called internally by `serialize`, which does `save_corpus` plus saves the index
    at the same time, so you want to store the corpus with::

    >>> MmCorpus.serialize('file.mm', corpus)  # stores index as well, allowing random access to individual documents

    Calling `serialize()` is preferred to calling `save_corpus()`.

    """
    raise NotImplementedError('cannot instantiate abstract base class')

    # example code:
    logger.info("converting corpus to ??? format: %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        for doc in corpus:  # iterate over the document stream
            fmt = str(doc)  # format the document appropriately...
            fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
def write_corpus_as_vw(corpus, filename):
    """Convert `corpus` to Vowpal Wabbit format and save it to `filename`.

    Parameters
    ----------
    corpus : iterable of list of (int, int)
        Collection of texts in BoW format.
    filename : str
        Path to output file.

    Returns
    -------
    int
        Number of lines in `filename`.

    """
    logger.debug("Writing corpus to: %s", filename)

    corpus_size = 0
    with utils.smart_open(filename, 'wb') as corpus_file:
        for line in corpus_to_vw(corpus):
            corpus_file.write(line.encode('utf-8') + b'\n')
            corpus_size += 1

    return corpus_size
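A short usage sketch, assuming `write_corpus_as_vw` (and the `corpus_to_vw` helper it relies on) is importable from the same module; the corpus and output path are invented.

# Hypothetical usage: dump a tiny BoW corpus to a VW-formatted text file.
bow_corpus = [[(0, 2), (3, 1)], [(1, 1)]]
n_lines = write_corpus_as_vw(bow_corpus, '/tmp/corpus.vw')
assert n_lines == len(bow_corpus)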
def load_word_topics(self):
    """Load words X topics matrix from :meth:`gensim.models.wrappers.ldamallet.LdaMallet.fstate` file.

    Returns
    -------
    numpy.ndarray
        Matrix words X topics.

    """
    logger.info("loading assigned topics from %s", self.fstate())
    word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float64)
    if hasattr(self.id2word, 'token2id'):
        word2id = self.id2word.token2id
    else:
        word2id = revdict(self.id2word)

    with utils.smart_open(self.fstate()) as fin:
        _ = next(fin)  # header
        self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
        assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
        _ = next(fin)  # noqa:F841 beta
        for lineno, line in enumerate(fin):
            line = utils.to_unicode(line)
            doc, source, pos, typeindex, token, topic = line.split(" ")
            if token not in word2id:
                continue
            tokenid = word2id[token]
            word_topics[int(topic), tokenid] += 1.0
    return word_topics
def _predict(self, chunk):
    """Run given chunk of documents against currently trained model.

    Parameters
    ----------
    chunk : iterable of list of (int, int)
        Sequence of documents in BoW format.

    Returns
    -------
    predictions : ndarray
        Prediction matrix, one row per document and one column per topic.
    vw_data : dict
        Vowpal Wabbit data.

    """
    corpus_size = write_corpus_as_vw(chunk, self._corpus_filename)
    cmd = self._get_vw_predict_command(corpus_size)
    vw_data = _parse_vw_output(_run_vw_command(cmd))
    vw_data['corpus_size'] = corpus_size

    predictions = numpy.zeros((corpus_size, self.num_topics), dtype=numpy.float32)

    with utils.smart_open(self._predict_filename) as fhandle:
        for i, line in enumerate(fhandle):
            predictions[i, :] = line.split()

    predictions = predictions / predictions.sum(axis=1, keepdims=True)

    return predictions, vw_data
def __init__(self, fname):
    self.fname = fname
    if fname.endswith(".gz") or fname.endswith('.bz2'):
        raise NotImplementedError("compressed output not supported with MmWriter")
    self.fout = utils.smart_open(self.fname, 'wb+')  # open for both reading and writing
    self.headers_written = False
def _calculate_num_docs(self):
    with utils.smart_open(self.fname) as fin:
        result = sum([1 for x in fin])
    return result
def my_fake_header(self, num_docs, num_terms, num_nnz):
    self.fout.close()
    self.fout = utils.smart_open(self.fname, 'r+b')
    super(MyMmWriter, self).fake_headers(num_docs, num_terms, num_nnz)
    self.fout.close()
    self.fout = utils.smart_open(self.fname, 'ab+')
def _calculate_num_docs(self):
    # the first line in input data is the number of documents (integer). throws exception on bad input.
    with utils.smart_open(self.fname) as fin:
        result = int(fin.readline())
    return result
def __iter__(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
def read_doctopics(self, fname, eps=1e-6, renorm=True):
    """Get document topic vectors from MALLET's "doc-topics" format, as sparse gensim vectors.

    Parameters
    ----------
    fname : str
        Path to input file with document topics.
    eps : float, optional
        Threshold for probabilities.
    renorm : bool, optional
        If True - explicitly re-normalize distribution.

    Raises
    ------
    RuntimeError
        If any line is in an invalid format.

    Yields
    ------
    list of (int, float)
        LDA vectors for document.

    """
    mallet_version = self.get_version(self.mallet_path)
    with utils.smart_open(fname) as fin:
        for lineno, line in enumerate(fin):
            if lineno == 0 and line.startswith(b"#doc "):
                continue  # skip the header line if it exists

            parts = line.split()[2:]  # skip "doc" and "source" columns
            # the MALLET doctopic format changed in 2.0.8 to exclude the id,
            # this handles the file differently dependent on the pattern
            if len(parts) == 2 * self.num_topics:
                doc = [
                    (int(id_), float(weight)) for id_, weight in zip(*[iter(parts)] * 2)
                    if abs(float(weight)) > eps
                ]
            elif len(parts) == self.num_topics and mallet_version != '2.0.7':
                doc = [(id_, float(weight)) for id_, weight in enumerate(parts) if abs(float(weight)) > eps]
            else:
                if mallet_version == "2.0.7":
                    # MALLET 2.0.7 can emit lines such as:
                    #   1 1 0 1.0780612802674239 30.005575655428533364 2 0.005575655428533364
                    #   2 2 0 0.9184413079632608 40.009062076892971008 3 0.009062076892971008
                    # i.e. a mix of the two formats handled above: there are neither `2 * num_topics`
                    # nor `num_topics` fields, some topics can be missing entirely, and a topic id can
                    # be fused with its weight (e.g. "40.009062076892971008"), so the branches above
                    # cannot parse it even though MALLET produced valid results.
                    count = 0
                    doc = []
                    if len(parts) > 0:
                        while count < len(parts):
                            if float(parts[count]) == int(parts[count]):
                                # separate "<topic> <weight>" pair: the current field is an integer
                                # topic id, so the weight is the next field
                                if float(parts[count + 1]) > eps:
                                    doc.append((int(parts[count]), float(parts[count + 1])))
                                count += 2
                            else:
                                # fused "<topic><weight>" field such as "20.034": split the integer
                                # part (topic id) from the fractional part (weight)
                                if float(parts[count]) - int(parts[count]) > eps:
                                    doc.append((int(parts[count]) % 10, float(parts[count]) - int(parts[count])))
                                count += 1
                else:
                    raise RuntimeError("invalid doc topics format at line %i in %s" % (lineno + 1, fname))

            if renorm:
                # explicitly normalize weights to sum up to 1.0, just to be sure...
                total_weight = float(sum([weight for _, weight in doc]))
                if total_weight:
                    doc = [(id_, float(weight) / total_weight) for id_, weight in doc]
            yield doc
def load_word2vec_format(cls=gensim.models.KeyedVectors, fname='', fvocab=None, binary=False, encoding='utf8',
                         unicode_errors='strict', limit=None, datatype=REAL):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).

    If you trained the C model using non-utf8 encoding for words, specify that
    encoding in `encoding`.

    `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
    argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
    file may include word tokens truncated in the middle of a multibyte unicode character
    (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.

    `limit` sets a maximum number of word-vectors to read from the file. The default,
    None, means read all.

    `datatype` (experimental) can coerce dimensions to a non-default float type (such as
    np.float16) to save memory. (Such types may result in much slower bulk operations or
    incompatibility with optimized routines.)

    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls()
        result.vector_size = vector_size
        result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning("vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.syn0[word_id] = weights
            result.index2word.append(word)

        if binary:
            # TODO: delegate
            pass
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                if '"' in utils.to_unicode(line, encoding=encoding, errors=unicode_errors):
                    line = utils.to_unicode(line, encoding=encoding, errors=unicode_errors)
                    label = line.split('"', 1)[1].rsplit('"')[0].strip()
                    other = line.rsplit('"', 1)[1].strip().split(' ')
                    parts = [label] + other
                else:
                    parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                word, weights = parts[0], list(map(REAL, parts[1:]))
                add_word(word, weights)

    if result.syn0.shape[0] != len(result.vocab):
        logger.info(
            "duplicate words detected, shrinking matrix size from %i to %i",
            result.syn0.shape[0], len(result.vocab)
        )
        result.syn0 = ascontiguousarray(result.syn0[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.syn0.shape

    logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
    return result
def load_word2vec_format(fname, fvocab=None, binary=False, norm_only=True, encoding='utf8'):
    """
    !!! Code modified from gensim.models.Word2Vec.load_word2vec_format: the original version
    cannot load files created by the original C word2vec if the vocabulary contains words
    which are not correct Unicode byte sequences - this can happen due to corpora encoding
    issues. !!!

    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).

    If you trained the C model using non-utf8 encoding for words, specify that
    encoding in `encoding`.

    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
        result = Word2Vec(size=vector_size)
        result.syn0 = zeros((vocab_size, vector_size), dtype=REAL)
        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for line_no in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                try:
                    word = utils.to_unicode(b''.join(word), encoding=encoding)
                except UnicodeDecodeError as e:
                    logger.warning(
                        "Couldn't convert whole word to unicode: trying to convert first %d bytes only ..." % e.start)
                    word = utils.to_unicode(b''.join(word[:e.start]), encoding=encoding)
                    logger.warning("... first %d bytes converted to '%s'" % (e.start, word))

                if counts is None:
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
        else:
def load_binary_data(self, encoding='utf8'):
    """Loads data from the output binary file created by FastText training"""
    with utils.smart_open(self.file_name, 'rb') as f:
        self.load_model_params(f)
        self.load_dict(f, encoding=encoding)
        self.load_vectors(f)
def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True): """ Load the input-hidden weight matrix from the original C word2vec-tool format. Note that the information stored in the file is incomplete (the binary tree is missing), so while you can query for word similarity etc., you cannot continue training with a model loaded this way. `binary` is a boolean indicating whether the data is in binary word2vec format. `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory. Word counts are read from `fvocab` filename, if set (this is the file generated by `-save-vocab` flag of the original C tool). """ counts = None if fvocab is not None: logger.info("loading word counts from %s" % (fvocab)) counts = {} with utils.smart_open(fvocab) as fin: for line in fin: word, count = utils.to_unicode(line).strip().split() counts[word] = int(count) logger.info("loading projection weights from %s" % (fname)) with utils.smart_open(fname) as fin: header = utils.to_unicode(fin.readline()) vocab_size, layer1_size = map(int, header.split()) # throws for invalid file format result = Word2Vec(size=layer1_size) result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL) if binary: binary_len = dtype(REAL).itemsize * layer1_size for line_no in xrange(vocab_size): # mixed text and binary: read text first, then binary word = [] while True: ch = fin.read(1) if ch == b' ': break if ch != b'\n': # ignore newlines in front of words (some binary files have newline, some don't) word.append(ch) word = utils.to_unicode(b''.join(word)) if counts is None: result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no) elif word in counts: result.vocab[word] = Vocab(index=line_no, count=counts[word]) else: logger.warning("vocabulary file is incomplete") result.vocab[word] = Vocab(index=line_no, count=None) result.index2word.append(word) result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL) else: for line_no, line in enumerate(fin): parts = utils.to_unicode(line).split() if len(parts) != layer1_size + 1: raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no)) word, weights = parts[0], map(REAL, parts[1:]) if counts is None: result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no) elif word in counts: result.vocab[word] = Vocab(index=line_no, count=counts[word]) else: logger.warning("vocabulary file is incomplete") result.vocab[word] = Vocab(index=line_no, count=None) result.index2word.append(word) result.syn0[line_no] = weights logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname)) result.init_sims(norm_only) return result
def enseemble_results_extra(self, questions, topn):
    """Return a list of the results from an accuracy test."""
    ok_vocab = self.get_vocabulary()
    new_vocab = [(w, self.model.wv.vocab[w]) for w in ok_vocab]
    new_vocab = {w.upper(): v for w, v in new_vocab}
    new_vocab = dict(new_vocab)

    results = []
    for line_no, line in enumerate(utils.smart_open(questions)):
        # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
        line = utils.to_unicode(line)
        if line.startswith(': '):
            continue
        try:
            a, b, c, expected = [word.upper() for word in line.split()]
        except ValueError:
            logger.info("skipping invalid line #%i in %s", line_no, questions)
            continue
        if a not in new_vocab or b not in new_vocab or c not in new_vocab or expected not in new_vocab:
            logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip())
            results.append(None)
            continue

        original_vocab = self.get_vocabulary()
        self.set_vocabulary(new_vocab)
        ignore = {a, b, c}  # input words to be ignored

        # find the most likely prediction, ignoring OOV words and input words
        sims = self.most_similar(positive_words=[b, c], negative_words=[a], topn=topn)
        self.set_vocabulary(original_vocab)

        inner_results = []
        for predict in sims:
            predicted = predict[0].upper()
            inner_results.append((predicted, predict[1]))
        results.append(inner_results)

    return results
def special_danish_accuracy(self, questions):
    """
    Compute accuracy of the model. `questions` is a filename where lines are
    6-tuples of words, split into sections by ": SECTION NAME" lines; the layout
    follows questions-words.txt from
    https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip
    (with six words per line instead of four).

    The accuracy is reported (=printed to log and returned as a list) for each
    section separately, plus there's one aggregate summary at the end.

    All words in the questions and the vocabulary are converted to their uppercase form
    before evaluating the accuracy, which is useful in case of case-mismatch between
    training tokens and question words. In case of multiple case variants of a single word,
    the vector for the first occurrence (also the most frequent if vocabulary is sorted) is taken.

    This method corresponds to the `compute-accuracy` script of the original C word2vec.

    """
    ok_vocab = self.get_vocabulary()
    new_vocab = [(w, self.model.wv.vocab[w]) for w in ok_vocab]
    new_vocab = {w.upper(): v for w, v in new_vocab}
    new_vocab = dict(new_vocab)

    sections, section = [], None
    wrong_predictions = []
    for line_no, line in enumerate(utils.smart_open(questions)):
        # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
        line = utils.to_unicode(line)
        if line.startswith(': '):
            # a new section starts => store the old section
            if section:
                sections.append(section)
                self.log_accuracy(section)
            section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
        else:
            if not section:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
            try:
                a, b, c, d, e, expected = [word.upper() for word in line.split()]
            except ValueError:
                logger.info("skipping invalid line #%i in %s", line_no, questions)
                continue
            if a not in new_vocab or b not in new_vocab or c not in new_vocab \
                    or d not in new_vocab or e not in new_vocab or expected not in new_vocab:
                logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip())
                continue

            original_vocab = self.get_vocabulary()
            self.set_vocabulary(new_vocab)
            ignore = {a, b, c, d, e}  # input words to be ignored

            # find the most likely prediction, ignoring OOV words and input words
            sims = self.most_similar(positive_words=[c, d, e], negative_words=[a, b])
            self.set_vocabulary(original_vocab)

            predicted = sims[0][0].upper()
            if predicted == expected:
                section['correct'].append((a, b, c, d, e, expected))
            else:
                wrong_message = (
                    a + " " + b + " " + c + " " + d + " " + e +
                    ", predicted: " + predicted + ", should have been: " + expected
                )
                section['incorrect'].append((a, b, c, d, e, expected))
                wrong_predictions.append(wrong_message)

    if section:
        # store the last section, too
        sections.append(section)
        self.log_accuracy(section)

    total = {
        'section': 'total',
        'correct': sum((s['correct'] for s in sections), []),
        'incorrect': sum((s['incorrect'] for s in sections), []),
    }
    self.log_accuracy(total)
    sections.append(total)

    print(wrong_predictions)
    return sections
data_prefix = data_path.split('/')[-1].split('.')[0]

# Load the trained Doc2Vec model
windowsize = int(sys.argv[1])
dimension = int(sys.argv[4])
nepoch = int(sys.argv[2])
mode = sys.argv[3]

name_tuple = (data_prefix.strip('DATA').lower(), windowsize, nepoch)
model = Doc2Vec.load(
    './models/' + mode + '/' + str(dimension) + 'd' + '/semeval-%s-lc-ns-%dw-%de.d2v' % name_tuple)

nsamp = 0
sqerr = 0.0
nsqerr = 0.0
sentences = []
with utils.smart_open(data_path) as fin:
    for item_no, line in enumerate(fin):
        sentences.append(line)
        words = preprocessor(line)
        model_v = model.docvecs[data_prefix + '_%s' % item_no]
        infer_v = model.infer_vector(words)
        sim = dot(model_v, infer_v)
        sqerr += ((1 - sim) * (1 - sim))
        model_v /= norm(model_v)
        infer_v /= norm(infer_v)
        sim = dot(model_v, infer_v)
        nsqerr += ((1 - sim) * (1 - sim))
        nsamp += 1

rsqerr = 0.0
rnsqerr = 0.0
def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'):
    """
    Merge the input-hidden weight matrix from the original C word2vec-tool format
    given, where it intersects with the current vocabulary. (No words are added to the
    existing vocabulary, but intersecting words adopt the file's weights, and
    non-intersecting words are left alone.)

    `binary` is a boolean indicating whether the data is in binary word2vec format.

    `lockf` is a lock-factor value to be set for any imported word-vectors; the
    default value of 0.0 prevents further updating of the vector during subsequent
    training. Use 1.0 to allow further training updates of merged vectors.

    """
    overlap_count = 0
    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
        if not vector_size == self.vector_size:
            raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname))
            # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)?
        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for line_no in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL)
                if word in self.wv.vocab:
                    overlap_count += 1
                    self.wv.syn0[self.wv.vocab[word].index] = weights
                    self.syn0_lockf[self.wv.vocab[word].index] = lockf  # lock-factor: 0.0 stops further changes
        else:
            for line_no, line in enumerate(fin):
                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                word, weights = parts[0], list(map(REAL, parts[1:]))
                if word in self.wv.vocab:
                    overlap_count += 1
                    self.wv.syn0[self.wv.vocab[word].index] = weights
    logger.info("merged %d vectors into %s matrix from %s" % (overlap_count, self.wv.syn0.shape, fname))
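A hedged usage sketch: seeding an already-built Word2Vec vocabulary with pretrained C-format vectors, while keeping the merged vectors trainable via `lockf=1.0`. `my_sentences` and the vector path are placeholders, and keyword names such as `size` differ between gensim versions.

# Hypothetical usage: merge pretrained vectors into an existing vocabulary, then keep training.
from gensim.models import Word2Vec

model = Word2Vec(size=300, min_count=5)
model.build_vocab(my_sentences)          # `my_sentences` is assumed to exist
model.intersect_word2vec_format('/data/pretrained.bin', lockf=1.0, binary=True)
model.train(my_sentences, total_examples=model.corpus_count, epochs=5)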
def evaluate_synsets(emb_model, pairs,flag_emb,flag_nv, our_logger, delimiter='\t', dummy4unknown=False): ok_vocab = [(w, emb_model.vocab[w]) for w in emb_model.index2word] ok_vocab = dict(ok_vocab) similarity_gold = [] similarity_model = [] oov = 0 original_vocab = emb_model.vocab emb_model.vocab = ok_vocab for line_no, line in enumerate(utils.smart_open(pairs)): line = utils.to_unicode(line) if line.startswith('#'): # May be a comment continue else: try: a, b, sim = [word for word in line.split(delimiter)] sim = float(sim) except (ValueError, TypeError): our_logger.info('Skipping invalid line #%d in %s', line_no, pairs) continue # Finding correct synsets if flag_nv: synsets_a = wn.synsets(a.strip(), 'n') synsets_b = wn.synsets(b.strip(), 'n') else: synsets_a = wn.synsets(a.strip(), 'v') synsets_b = wn.synsets(b.strip(), 'v') if len(list(synsets_a)) == 0 or len(list(synsets_b)) == 0: oov += 1 if dummy4unknown: our_logger.debug('Zero similarity for line #%d with words with no synsets: %s', line_no, line.strip()) similarity_model.append(0.0) similarity_gold.append(sim) continue else: our_logger.debug('Skipping line #%d with words with no synsets: %s', line_no, line.strip()) continue best_pair = None best_sim = 0.0 for pair in product(synsets_a, synsets_b): if flag_emb: possible_similarity = emb_model.similarity(pair[0].lemmas()[0].key(), pair[1].lemmas()[0].key()) else: possible_similarity = emb_model.similarity(pair[0].name(), pair[1].name()) if possible_similarity > best_sim: best_pair = pair best_sim = possible_similarity our_logger.debug('Original words: %s', line.strip()) our_logger.debug('Synsets chosen: %s with similarity %f', best_pair, best_sim) similarity_model.append(best_sim) # Similarity from the model similarity_gold.append(sim) # Similarity from the dataset emb_model.vocab = original_vocab spearman = stats.spearmanr(similarity_gold, similarity_model) pearson = stats.pearsonr(similarity_gold, similarity_model) if dummy4unknown: oov_ratio = float(oov) / len(similarity_gold) * 100 else: oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100 our_logger.debug('Pearson correlation coefficient against %s: %f with p-value %f', pairs, pearson[0], pearson[1]) our_logger.debug( 'Spearman rank-order correlation coefficient against %s: %f with p-value %f', pairs, spearman[0], spearman[1]) our_logger.debug('Pairs with unknown words: %d', oov) return pearson, spearman, oov_ratio
from gensim import utils
import json
import re
import logging

logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)

# iterate over the plain text data we just created
output = open('/data/yechen/bert/wiki.en.article.txt', 'w', encoding='utf8')
exclude_sections = {
    'See also', 'References', 'Further reading', 'External links', 'Sources', 'Bibliography'
}
with utils.smart_open('/data/yechen/bert/enwiki-20201101-pages-articles-multistream.json.gz', 'rb') as f:
    numart = 0
    numsec = 0
    for line in f:
        output_text = ''
        numart = numart + 1
        article = json.loads(line)
        section_titles = article['section_titles']
        if not section_titles:
            continue
        i = -1
        for section_text in article['section_texts']:
            i = i + 1
            if (section_titles[i] in exclude_sections):
                continue
            numsec = numsec + 1
def read_file(path):
    with utils.smart_open(path) as fin:
        return fin.read()
def _get_text_from_test_data(self, file):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    with utils.smart_open(os.path.join(pre_path, file), mode="r") as f:
        return f.read()
def load_binary_data(self, model_binary_file):
    """Loads data from the output binary file created by FastText training"""
    with utils.smart_open(model_binary_file, 'rb') as f:
        self.load_model_params(f)
        self.load_dict(f)
        self.load_vectors(f)
def save_as_text(self, fname, sort_by_word=True): """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file. Parameters ---------- fname : str Path to output file. sort_by_word : bool, optional Sort words in lexicographical order before writing them out? Notes ----- Format:: num_docs id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE] id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE] .... id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE] This text format is great for corpus inspection and debugging. As plaintext, it's also easily portable to other tools and frameworks. For better performance and to store the entire object state, including collected corpus statistics, use :meth:`~gensim.corpora.dictionary.Dictionary.save` and :meth:`~gensim.corpora.dictionary.Dictionary.load` instead. See Also -------- :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text` Load :class:`~gensim.corpora.dictionary.Dictionary` from text file. Examples -------- .. sourcecode:: pycon >>> from gensim.corpora import Dictionary >>> from gensim.test.utils import get_tmpfile >>> >>> tmp_fname = get_tmpfile("dictionary") >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]] >>> >>> dct = Dictionary(corpus) >>> dct.save_as_text(tmp_fname) >>> >>> loaded_dct = Dictionary.load_from_text(tmp_fname) >>> assert dct.token2id == loaded_dct.token2id """ logger.info("saving dictionary mapping to %s", fname) with utils.smart_open(fname, 'wb') as fout: numdocs_line = "%d\n" % self.num_docs fout.write(utils.to_utf8(numdocs_line)) if sort_by_word: for token, tokenid in sorted(iteritems(self.token2id)): line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0)) fout.write(utils.to_utf8(line)) else: for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]): line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq) fout.write(utils.to_utf8(line))
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the Mallet format.

    Warnings
    --------
    This function is automatically called by :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize`,
    don't call it directly, call :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize` instead.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format.
    id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
        Mapping between word_ids (integers) and words (strings).
        If not provided, the mapping is constructed directly from `corpus`.
    metadata : bool, optional
        If True, each document is expected to be a `(doc, (doc_id, doc_lang))` pair, and the
        given id and language are written out instead of the generated defaults.

    Return
    ------
    list of int
        List of offsets in resulting file for each document (in bytes),
        can be used for :meth:`~gensim.corpora.malletcorpus.MalletCorpus.docbyoffset`.

    Notes
    -----
    The document id will be generated by enumerating the corpus.
    That is, it will range between 0 and number of documents in the corpus.

    Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
    If the language needs to be saved, post-processing will be required.

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)

    logger.info("storing corpus in Mallet format into %s", fname)

    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        for doc_id, doc in enumerate(corpus):
            if metadata:
                doc_id, doc_lang = doc[1]
                doc = doc[0]
            else:
                doc_lang = '__unknown__'

            words = []
            for wordid, value in doc:
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([utils.to_unicode(id2word[wordid])] * int(value))
            offsets.append(fout.tell())
            fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words))))

    if truncated:
        logger.warning(
            "Mallet format can only save vectors with integer elements; "
            "%i float entries were truncated to integer value", truncated
        )
    return offsets
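For reference, the one-document-per-line layout this writer produces, i.e. `doc_id language tokens...` (the tokens below are invented).

# Example output lines (invented tokens):
#   0 __unknown__ human interface computer
#   1 __unknown__ survey user computer system response time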
def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): """ Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter'. An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html. The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient between the similarities from the dataset and the similarities produced by the model itself. The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words). Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab` words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency. If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization is performed. Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before evaluating the model (default True). Useful when you expect case-mismatch between training tokens and words pairs in the dataset. If there are multiple case variants of a single word, the vector for the first occurrence (also the most frequent if vocabulary is sorted) is taken. Use `dummy4unknown=True' to produce zero-valued similarities for pairs with out-of-vocabulary words. Otherwise (default False), these pairs are skipped entirely. """ ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] ok_vocab = dict((w.upper(), v) for w, v in reversed( ok_vocab)) if case_insensitive else dict(ok_vocab) similarity_gold = [] similarity_model = [] oov = 0 original_vocab = self.vocab self.vocab = ok_vocab for line_no, line in enumerate(utils.smart_open(pairs)): line = utils.to_unicode(line) if line.startswith('#'): # May be a comment continue else: try: if case_insensitive: a, b, sim = [ word.upper() for word in line.split(delimiter) ] else: a, b, sim = [word for word in line.split(delimiter)] sim = float(sim) except: logger.info('skipping invalid line #%d in %s', line_no, pairs) continue if a not in ok_vocab or b not in ok_vocab: oov += 1 if dummy4unknown: similarity_model.append(0.0) similarity_gold.append(sim) continue else: logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip()) continue similarity_gold.append(sim) # Similarity from the dataset similarity_model.append(self.similarity( a, b)) # Similarity from the model self.vocab = original_vocab spearman = stats.spearmanr(similarity_gold, similarity_model) pearson = stats.pearsonr(similarity_gold, similarity_model) oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100 logger.debug( 'Pearson correlation coefficient against %s: %f with p-value %f', pairs, pearson[0], pearson[1]) logger.debug( 'Spearman rank-order correlation coefficient against %s: %f with p-value %f', pairs, spearman[0], spearman[1]) logger.debug('Pairs with unknown words: %d' % oov) self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs) return pearson, spearman, oov_ratio
def testLineSentenceWorksWithCompressedFile(self):
    """Does LineSentence work with a compressed file object argument?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
                          limit=None, datatype=REAL):
    """Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    Parameters
    ----------
    fname : str
        The file path to the saved word2vec-format file.
    fvocab : str, optional
        File path to the vocabulary. Word counts are read from `fvocab`, if set
        (this is the file generated by the `-save-vocab` flag of the original C tool).
    binary : bool, optional
        If True, the data is in binary word2vec format.
    encoding : str, optional
        If you trained the C model using a non-utf8 encoding for words, specify that encoding here.
    unicode_errors : str, optional
        Default 'strict'; a string suitable to be passed as the `errors` argument to the
        unicode() (Python 2.x) or str() (Python 3.x) function. If your source file may include
        word tokens truncated in the middle of a multibyte unicode character (as is common from
        the original word2vec.c tool), 'ignore' or 'replace' may help.
    limit : int, optional
        Sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.
    datatype : type, optional
        (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`)
        to save memory. Such types may result in much slower bulk operations or incompatibility
        with optimized routines.

    Returns
    -------
    object
        Returns the loaded model as an instance of :class:`cls`.

    """
    from gensim.models.keyedvectors import Vocab
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls(vector_size)
        result.vector_size = vector_size
        result.vectors = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning("vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.vectors[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                with utils.ignore_deprecation_warning():
                    # TODO use frombuffer or something similar
                    weights = fromstring(fin.read(binary_len), dtype=REAL).astype(datatype)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                word, weights = parts[0], [datatype(x) for x in parts[1:]]
                add_word(word, weights)
    if result.vectors.shape[0] != len(result.vocab):
        logger.info(
            "duplicate words detected, shrinking matrix size from %i to %i",
            result.vectors.shape[0], len(result.vocab)
        )
        result.vectors = ascontiguousarray(result.vectors[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.vectors.shape

    logger.info("loaded %s matrix from %s", result.vectors.shape, fname)
    return result
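
# A minimal usage sketch for the loader above. In gensim, the internal
# _load_word2vec_format is normally reached through KeyedVectors.load_word2vec_format;
# 'vectors.bin' is a placeholder path, not a file shipped with the library.
from gensim.models import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format('vectors.bin', binary=True, limit=500000)
print(word_vectors.most_similar('king', topn=3))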
def __iter__(self):
    with utils.smart_open(self.source) as fin:
        for item_no, line in enumerate(fin):
            # drop the trailing field, keep the document text, tag by line number
            text = line.rsplit(None, 1)[0]
            yield TaggedDocument(text.split(), [item_no])
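
# A corpus object exposing an __iter__ like the one above can be streamed into
# Doc2Vec. A minimal sketch, assuming a hypothetical TaggedLineCorpus class that
# wraps the method above and a placeholder file 'docs.txt'; parameter names follow
# the older gensim API used elsewhere in this file.
from gensim.models import Doc2Vec

corpus = TaggedLineCorpus('docs.txt')  # hypothetical wrapper around the __iter__ above
model = Doc2Vec(corpus, size=50, min_count=2, iter=10)
print(model.docvecs[0][:5])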
def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
    """
    Compute accuracy of the model. `questions` is a filename where lines are
    4-tuples of words, split into sections by ": SECTION NAME" lines.
    See questions-words.txt in
    https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip
    for an example.

    The accuracy is reported (=printed to log and returned as a list) for each
    section separately, plus there's one aggregate summary at the end.

    Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab`
    words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
    In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then
    case normalization is performed.

    Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before
    evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens
    and question words. In case of multiple case variants of a single word, the vector for the first
    occurrence (also the most frequent if vocabulary is sorted) is taken.

    This method corresponds to the `compute-accuracy` script of the original C word2vec.

    """
    ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
    ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab)

    sections, section = [], None
    for line_no, line in enumerate(utils.smart_open(questions)):
        # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
        line = utils.to_unicode(line)
        if line.startswith(': '):
            # a new section starts => store the old section
            if section:
                sections.append(section)
                self.log_accuracy(section)
            section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
        else:
            if not section:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
            try:
                if case_insensitive:
                    a, b, c, expected = [word.upper() for word in line.split()]
                else:
                    a, b, c, expected = [word for word in line.split()]
            except:
                logger.info("skipping invalid line #%i in %s" % (line_no, questions))
                continue
            if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                logger.debug("skipping line #%i with OOV words: %s" % (line_no, line.strip()))
                continue

            original_vocab = self.vocab
            self.vocab = ok_vocab
            ignore = set([a, b, c])  # input words to be ignored
            predicted = None
            # find the most likely prediction, ignoring OOV words and input words
            sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab)
            self.vocab = original_vocab
            for index in matutils.argsort(sims, reverse=True):
                predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index]
                if predicted in ok_vocab and predicted not in ignore:
                    if predicted != expected:
                        logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted)
                    break
            if predicted == expected:
                section['correct'].append((a, b, c, expected))
            else:
                section['incorrect'].append((a, b, c, expected))
    if section:
        # store the last section, too
        sections.append(section)
        self.log_accuracy(section)

    total = {
        'section': 'total',
        'correct': sum((s['correct'] for s in sections), []),
        'incorrect': sum((s['incorrect'] for s in sections), []),
    }
    self.log_accuracy(total)
    sections.append(total)
    return sections
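
# The analogy file read above is plain text: ": SECTION NAME" headers followed by
# four-word lines such as "Athens Greece Baghdad Iraq". A minimal sketch of calling
# the method; the trained `model` object and the 'questions-words.txt' path are
# placeholders, not artifacts defined in this file.
sections = model.accuracy('questions-words.txt', restrict_vocab=30000, case_insensitive=True)
total = sections[-1]  # the aggregate summary appended at the end
print("correct: %d, incorrect: %d" % (len(total['correct']), len(total['incorrect'])))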
def __iter__(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                # tag each cleaned tweet with its source prefix and line number
                yield LabeledSentence(clean_tweet(line), [prefix + '_%s' % item_no])
def testLineSentenceWorksWithFilename(self):
    """Does LineSentence work with a filename argument?"""
    with utils.smart_open(datapath('lee_background.cor')) as orig:
        sentences = word2vec.LineSentence(datapath('lee_background.cor'))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar):
    """
    Compute accuracy of the model. `questions` is a filename where lines are
    4-tuples of words, split into sections by ": SECTION NAME" lines.
    See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

    The accuracy is reported (=printed to log and returned as a list) for each
    section separately, plus there's one aggregate summary at the end.

    Use `restrict_vocab` to ignore all questions containing a word that is not among
    the top-N most frequent words (default top 30,000).

    This method corresponds to the `compute-accuracy` script of the original C word2vec.

    """
    ok_vocab = dict(sorted(iteritems(self.vocab), key=lambda item: -item[1].count)[:restrict_vocab])
    ok_index = set(v.index for v in itervalues(ok_vocab))

    sections, section = [], None
    for line_no, line in enumerate(utils.smart_open(questions)):
        # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
        line = utils.to_unicode(line)
        if line.startswith(': '):
            # a new section starts => store the old section
            if section:
                sections.append(section)
                self.log_accuracy(section)
            section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
        else:
            if not section:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
            try:
                # TODO assumes vocabulary preprocessing uses lowercase, too...
                a, b, c, expected = [word.lower() for word in line.split()]
            except:
                logger.info("skipping invalid line #%i in %s" % (line_no, questions))
                continue  # malformed line -- nothing to evaluate
            if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                logger.debug("skipping line #%i with OOV words: %s" % (line_no, line.strip()))
                continue

            ignore = set(self.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
            predicted = None
            # find the most likely prediction, ignoring OOV words and input words
            for index in argsort(most_similar(self, positive=[b, c], negative=[a], topn=False))[::-1]:
                if index in ok_index and index not in ignore:
                    predicted = self.index2word[index]
                    if predicted != expected:
                        logger.debug("%s: expected %s, predicted %s" % (line.strip(), expected, predicted))
                    break
            if predicted == expected:
                section['correct'].append((a, b, c, expected))
            else:
                section['incorrect'].append((a, b, c, expected))
    if section:
        # store the last section, too
        sections.append(section)
        self.log_accuracy(section)

    total = {
        'section': 'total',
        'correct': sum(len(s['correct']) for s in sections),
        'incorrect': sum(len(s['incorrect']) for s in sections),
    }
    self.log_accuracy(total)
    sections.append(total)
    return sections
def load_word2vec_format(fname, fvocab=None, binary=False, norm_only=True, encoding='utf8'):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by the `-save-vocab` flag of the original C tool).

    If you trained the C model using a non-utf8 encoding for words, specify that
    encoding in `encoding`.

    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
        result = Word2Vec(size=vector_size)
        result.wv.syn0 = zeros((vocab_size, vector_size), dtype=REAL)
        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for line_no in range(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                try:
                    word = utils.to_unicode(b''.join(word), encoding=encoding)
                except UnicodeDecodeError as e:
                    logger.warning(
                        "Couldn't convert whole word to unicode: trying to convert first %d bytes only ..." % e.start)
                    word = utils.to_unicode(b''.join(word[:e.start]), encoding=encoding)
                    logger.warning("... first %d bytes converted to '%s'" % (e.start, word))
                word = word.replace('_NOUN', '').replace('_VERB', '').replace('_ADJ', '')
                if counts is None:
                    result.wv.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.wv.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.wv.vocab[word] = Vocab(index=line_no, count=None)
                result.wv.index2word.append(word)
                result.wv.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
        else:
            for line_no, line in enumerate(fin):
                parts = utils.to_unicode(line[:-1], encoding=encoding).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                word, weights = parts[0], list(map(REAL, parts[1:]))
                word = word.replace('_NOUN', '').replace('_VERB', '').replace('_ADJ', '')
                if counts is None:
                    result.wv.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.wv.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.wv.vocab[word] = Vocab(index=line_no, count=None)
                result.wv.index2word.append(word)
                result.wv.syn0[line_no] = weights
    logger.info("loaded %s matrix from %s" % (result.wv.syn0.shape, fname))
    result.init_sims(norm_only)
    return result
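
# A minimal usage sketch for the loader above, assuming a binary vectors file
# 'lemma-tagged-vectors.bin' (placeholder name) whose words carry _NOUN/_VERB/_ADJ
# suffixes that the function strips while loading:
model = load_word2vec_format('lemma-tagged-vectors.bin', binary=True, norm_only=True)
print(model.wv.most_similar('cat', topn=5))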