def save_word2vec(fname, vocab, vectors, binary=False):
    """Store the weight matrix in the same format used by the original C word2vec-tool,
    for compatibility.

    Parameters
    ----------
    fname : str
        The file path used to save the vectors in.
    vocab : dict
        The vocabulary of words.
    vectors : numpy.array
        The vectors to be stored.
    binary : bool, optional
        If True, the data will be saved in binary word2vec format, else in plain text.

    """
    if not (vocab or vectors):
        raise RuntimeError("no input")
    vector_size = vectors.shape[1]
    logging.info("storing %sx%s projection weights into %s", len(vocab), vector_size, fname)
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (len(vocab), vector_size)))
        for el in sorted(vocab.keys()):
            row = vectors[vocab[el]]
            if binary:
                row = row.astype(np.float32)
                fout.write(utils.to_utf8(el) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (el, ' '.join(repr(val) for val in row))))
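# A minimal usage sketch for save_word2vec above, with hypothetical data; it
# assumes gensim is installed so the snippet's module-level `utils`, `np` and
# `logging` names resolve.
import numpy as np

vocab = {"apple": 0, "banana": 1}                  # word -> row index into `vectors`
vectors = np.random.rand(2, 4).astype(np.float32)  # 2 words, 4 dimensions
save_word2vec("/tmp/tiny.w2v.txt", vocab, vectors, binary=False)
# the file now starts with the "<vocab_size> <vector_size>" header line, here "2 4"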
def write_log(f, commit, changes):
    for change in changes:
        f.write(change.type.upper()[0] + ' ')
        path = '/dev/null'
        if change.old.path and change.new.path:
            path = change.new.path
        elif change.old.path:
            path = change.old.path
        elif change.new.path:
            path = change.new.path
        f.write(to_utf8(path.strip() + '\n'))
    if changes:
        f.write('\n')
    f.write('commit %s\n' % commit.id)
    raw_lines = commit.as_raw_string().splitlines()
    # tab the message like git
    tab = False
    for line in raw_lines:
        if not tab and not line:
            tab = True
        if tab and line:
            f.write(' ')
        if line:
            f.write(to_utf8(line))
        f.write('\n')
    f.write('\n')
def save_mini_fasttext_format(model, fname, words_dict, binary=False):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility, keeping only the words present in `words_dict`.

    `fname` is the file used to save the vectors in
    `words_dict` is the collection of words whose vectors are kept
    `binary` is an optional boolean indicating whether the data is to be saved
    in binary word2vec format (default: False)
    """
    # count only the words that will actually be written, so the header matches
    # the body (the original used len(model.vocab), which overstates the count)
    total_vec = sum(1 for word in model.vocab if word in words_dict)
    vector_size = model.syn0.shape[1]
    print("storing %sx%s projection weights into %s" % (total_vec, vector_size, fname))
    assert (len(model.vocab), vector_size) == model.syn0.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(model.vocab), key=lambda item: -item[1].count):
            if word in words_dict:
                row = model.syn0[vocab.index]
                if binary:
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
def word2vec2tsv(word2vec_model, tensor_filename, vocab=None, binary=False):
    """Dump `word2vec_model` to TSV files for the TensorFlow embedding projector.

    Keyword Arguments:
        word2vec_model -- the trained model to export
        vocab -- only visualize this input vocab (defaults to the model's full vocabulary)
    """
    from gensim.utils import to_utf8
    model = word2vec_model
    if not os.path.exists(tensor_filename):
        logger.info("create dir %s" % tensor_filename)
        os.makedirs(tensor_filename)
    outfiletsv = complete_dir_path(tensor_filename) + 'tensor.tsv'
    outfiletsvmeta = complete_dir_path(tensor_filename) + 'metadata.tsv'
    if vocab:
        # filter into new lists instead of removing while iterating
        # (the original mutated `vocab` inside the loop, which skips elements)
        absent_vocab = [token for token in vocab if token not in model]
        vocab = [token for token in vocab if token in model]
        logger.debug("absent vocabulary in the model %s" % absent_vocab)
    else:
        vocab = model.index2word
    # write tensor values
    with open(outfiletsv, 'w+') as file_vector:
        with open(outfiletsvmeta, 'wb+') as file_metadata:  # binary mode: to_utf8 returns bytes
            for word in vocab:
                file_metadata.write(to_utf8(word) + to_utf8('\n'))
                vector_row = '\t'.join(str(x) for x in model[word])
                file_vector.write(vector_row + '\n')
    return vocab
def review_cleaner(corpus, logging_filename, preprocessed_file, review_tag_field,
                   review_field, review_id, delimiter):
    logger = logging.getLogger(__name__)
    logging.basicConfig(filename=logging_filename, filemode='w',
                        format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # opening the files for reading and writing
    infile = smart_open(corpus, 'rb')
    outfile = smart_open(preprocessed_file, 'wb')
    # calling the Tokenizer class
    tok = Tokenizer(preserve_case=False)
    # iterating over the review file
    for line_num, line in enumerate(infile):
        if line_num % 100 == 0:  # the original used `is 0`, which tests identity, not equality
            logger.info("processing line number {}".format(line_num))
        line = to_unicode(line)
        cols_list = line.strip().split(delimiter)
        # sanity check on the line format
        if len(cols_list) != 11:
            logger.info("incorrect format of line {}".format(line_num))
            continue
        # reading the input_id, review_tag and review
        input_id = cols_list[0:review_id + 1]
        input_tag = cols_list[review_tag_field]
        review = cols_list[review_field]
        # tokenizing the tag and review
        tok_tag = tok.tokenize(input_tag)
        tok_review = tok.tokenize(review)
        new_tag = " ".join(tok_tag)
        new_review = " ".join(tok_review)
        # replace 2 or more occurrences of periods with a single period
        # (the original pattern `[..]+` also matched single periods)
        regex = r"\.{2,}"
        replace_regex = "."
        new_tag = re.sub(regex, replace_regex, new_tag)
        new_review = re.sub(regex, replace_regex, new_review)
        input_id = to_unicode(delimiter.join(input_id))
        # writing the review tag to the output file (bytes, since the file is binary)
        outfile.write(to_utf8(delimiter.join([input_id, new_tag])))
        outfile.write(b"\n")
        # writing the review to the output file
        outfile.write(to_utf8(delimiter.join([input_id, new_review])))
        outfile.write(b"\n")
    infile.close()
    outfile.close()
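# The period-collapsing step above, isolated and verified: the fixed pattern
# replaces runs of two or more dots with a single dot, leaving single dots alone.
import re

assert re.sub(r"\.{2,}", ".", "wow... nice.. ok.") == "wow. nice. ok."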
def write_headers(self, num_docs, num_terms, num_nnz):
    """Write headers to file.

    Parameters
    ----------
    num_docs : int
        Number of documents in corpus.
    num_terms : int
        Number of terms in corpus.
    num_nnz : int
        Number of non-zero elements in corpus.

    """
    self.fout.write(MmWriter.HEADER_LINE)
    if num_nnz < 0:
        # we don't know the matrix shape/density yet, so only log a general line
        logger.info("saving sparse matrix to %s", self.fname)
        self.fout.write(utils.to_utf8(' ' * 50 + '\n'))  # 48 digits must be enough for everybody
    else:
        logger.info(
            "saving sparse %sx%s matrix with %i non-zero entries to %s",
            num_docs, num_terms, num_nnz, self.fname
        )
        self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz)))
    self.last_docno = -1
    self.headers_written = True
def save_word2vec_format(vectors, vocab, save_filename, fvocab=None, binary=True):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    `vectors` is the numpy 2d array of context vectors to save.
    `vocab` is the vocabulary for the vectors.
    `save_filename` is the file used to save the vectors in
    `fvocab` is an optional file used to save the vocabulary
    `binary` is an optional boolean indicating whether the data is to be saved
    in binary word2vec format (default: True)
    """
    if fvocab is not None:
        # logger.info("storing vocabulary in %s" % (fvocab))
        with utils.smart_open(fvocab, 'wb') as vout:
            # use a fresh loop variable: the original rebound `vocab` here,
            # clobbering the dict before the second loop below
            for word, voc in sorted(iteritems(vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, voc.count)))
    # logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), self.vector_size, fname))
    # assert (len(vocab), self.vector_size) == self.syn0.shape
    with utils.smart_open(save_filename, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % vectors.shape))
        # store in sorted order: most frequent words at the top
        for word, voc in sorted(iteritems(vocab), key=lambda item: -item[1].count):
            row = vectors[voc.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
def save_word2vec_format(self, fname, fvocab=None, binary=False):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.
    """
    if fvocab is not None:
        logger.info("Storing vocabulary in %s" % (fvocab))
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
    logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), self.layer1_size, fname))
    assert (len(self.vocab), self.layer1_size) == self.syn0.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % self.syn0.shape))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
            row = self.syn0[vocab.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save a corpus in the List-of-words format.

    This function is automatically called by `LowCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
    logger.info("storing corpus in List-Of-Words format into %s" % fname)
    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8('%i\n' % len(corpus)))
        for doc in corpus:
            words = []
            for wordid, value in doc:
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([utils.to_unicode(id2word[wordid])] * int(value))
            offsets.append(fout.tell())
            fout.write(utils.to_utf8('%s\n' % ' '.join(words)))
    if truncated:
        logger.warning(
            "List-of-words format can only save vectors with "
            "integer elements; %i float entries were truncated to integer value" % truncated)
    return offsets
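# Hedged sketch of a List-of-Words body line as written by save_corpus above:
# each word id is expanded into its word, repeated `count` times. Data here
# is hypothetical.
doc = [(0, 2.0), (1, 1.0)]
id2word = {0: "cat", 1: "dog"}
words = []
for wordid, value in doc:
    words.extend([id2word[wordid]] * int(value))
assert ' '.join(words) == "cat cat dog"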
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save a corpus in the LDA-C format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    This function is automatically called by `BleiCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        num_terms = 1 + max([-1] + id2word.keys())
    logger.info("storing corpus in Blei's LDA-C format into %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            doc = list(doc)
            offsets.append(fout.tell())
            parts = ["%i:%s" % p for p in doc if abs(p[1]) > 1e-7]
            fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))
    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = fname + '.vocab'
    logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in xrange(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))
    return offsets
def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    `fname` is the file used to save the vectors in
    `fvocab` is an optional file used to save the vocabulary
    `binary` is an optional boolean indicating whether the data is to be saved
    in binary word2vec format (default: False)
    `total_vec` is an optional parameter to explicitly specify total no. of vectors
    (in case word vectors are appended with document vectors afterwards)
    """
    if total_vec is None:
        total_vec = len(self.vocab)
    vector_size = self.syn0.shape[1]
    if fvocab is not None:
        logger.info("storing vocabulary in %s", fvocab)
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
    logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
    assert (len(self.vocab), vector_size) == self.syn0.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
            row = self.syn0[vocab.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    `fname` is the file used to save the vectors in
    `fvocab` is an optional file used to save the vocabulary
    `binary` is an optional boolean indicating whether the data is to be saved
    in binary word2vec format (default: False)
    `total_vec` is an optional parameter to explicitly specify total no. of vectors
    (in case word vectors are appended with document vectors afterwards)
    """
    if total_vec is None:
        total_vec = len(self.vocab)
    vector_size = self.syn0.shape[1]
    if fvocab is not None:
        logger.info("storing vocabulary in %s", fvocab)
        with utils.open(fvocab, 'wb') as vout:
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
    logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
    assert (len(self.vocab), vector_size) == self.syn0.shape
    with utils.open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
            row = self.syn0[vocab.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
def save_to_txt(self, path, node_list_path):
    columns = ["entity_id", "cover_text"]
    self.vertext = pd.read_csv(node_list_path, sep='\t', index_col=0, header=None, comment='#')
    self.vertext.columns = columns
    vector_size = self.model.wv.vectors.shape[1]
    total_vec = len(self.model.wv.vocab)
    with utils.smart_open(path + "/" + self.name + ".txt", 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab_ in sorted(iteritems(self.model.wv.vocab), key=lambda item: -item[1].count):
            word_label = self.vertext.loc[self.vertext['entity_id'] == word].values
            if len(word_label) > 0:  # skip words without an entry in the node list
                word_label = word_label[0][1]
                row = self.model.wv.vectors[vocab_.index]
                fout.write(utils.to_utf8("%s %s\n" % (word_label, ' '.join("%f" % val for val in row))))
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save a corpus in the List-of-words format.

    This function is automatically called by `LowCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
    logger.info("storing corpus in List-Of-Words format into %s" % fname)
    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8('%i\n' % len(corpus)))
        for doc in corpus:
            words = []
            for wordid, value in doc:
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([utils.to_unicode(id2word[wordid])] * int(value))
            offsets.append(fout.tell())
            fout.write(utils.to_utf8('%s\n' % ' '.join(words)))
    if truncated:
        logger.warning("List-of-words format can only save vectors with "
                       "integer elements; %i float entries were truncated to integer value" % truncated)
    return offsets
def save_word2vec_format(fname, vocab, vectors, binary=False):
    """Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    Parameters
    ----------
    fname : str
        The file path used to save the vectors in
    vocab : dict
        The vocabulary of words with their ranks
    vectors : numpy.array
        The vectors to be stored
    binary : bool
        If True, the data will be saved in binary word2vec format, else in plain text.

    """
    if not (vocab or vectors):
        raise RuntimeError('no input')
    total_vec = len(vocab)
    vector_size = vectors.shape[1]
    print('storing %dx%d projection weights into %s' % (total_vec, vector_size, fname))
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8('%s %s\n' % (total_vec, vector_size)))
        position = 0
        for element in sorted(vocab, key=lambda word: vocab[word]):
            row = vectors[position]
            if binary:
                row = row.astype(real)
                fout.write(utils.to_utf8(element) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8('%s %s\n' % (element, ' '.join(repr(val) for val in row))))
            position += 1
def save_as_text(self, fname, sort_by_word=True):
    """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

    Parameters
    ----------
    fname : str
        Path to output file.
    sort_by_word : bool, optional
        Sort words in lexicographical order before writing them out?

    Notes
    -----
    Format::

        num_docs
        id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
        id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
        ....
        id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

    This text format is great for corpus inspection and debugging. As plaintext,
    it's also easily portable to other tools and frameworks. For better performance
    and to store the entire object state, including collected corpus statistics,
    use :meth:`~gensim.corpora.dictionary.Dictionary.save` and
    :meth:`~gensim.corpora.dictionary.Dictionary.load` instead.

    See Also
    --------
    :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`
        Load :class:`~gensim.corpora.dictionary.Dictionary` from text file.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora import Dictionary
        >>> from gensim.test.utils import get_tmpfile
        >>>
        >>> tmp_fname = get_tmpfile("dictionary")
        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
        >>>
        >>> dct = Dictionary(corpus)
        >>> dct.save_as_text(tmp_fname)
        >>>
        >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
        >>> assert dct.token2id == loaded_dct.token2id

    """
    logger.info("saving dictionary mapping to %s", fname)
    with utils.open(fname, 'wb') as fout:
        numdocs_line = "%d\n" % self.num_docs
        fout.write(utils.to_utf8(numdocs_line))
        if sort_by_word:
            for token, tokenid in sorted(iteritems(self.token2id)):
                line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                fout.write(utils.to_utf8(line))
        else:
            for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                fout.write(utils.to_utf8(line))
def save(self, np2vec_model_file="np2vec.model", binary=False, word2vec_format=True): """ Save the np2vec model. Args: np2vec_model_file (str): the file containing the np2vec model to load binary (bool): boolean indicating whether the np2vec model to load is in binary format word2vec_format(bool): boolean indicating whether to save the model in original word2vec format. """ if self.word_embedding_type == "fasttext" and self.word_ngrams == 1: if not binary: logger.error( "if word_embedding_type is fasttext and word_ngrams is 1, " "binary should be set to True.") sys.exit(0) # not relevant to prune fasttext subword model self.model.save(np2vec_model_file) else: # prune non NP terms if self.prune_non_np: logger.info("pruning np2vec model") total_vec = 0 vector_size = self.model.vector_size for word in self.model.wv.vocab.keys(): if self.is_marked(word) and len(word) > 1: total_vec += 1 logger.info( "storing %sx%s projection weights for NP's into %s", total_vec, vector_size, np2vec_model_file, ) with smart_open(np2vec_model_file, "wb") as fout: fout.write( utils.to_utf8("%s %s\n" % (total_vec, vector_size))) # store NP vectors in sorted order: most frequent NP's at the top for word, vocab in sorted(iteritems(self.model.wv.vocab), key=lambda item: -item[1].count): if self.is_marked(word) and len( word) > 1: # discard empty marked np's embedding_vec = self.model.wv.syn0[vocab.index] if binary: fout.write( utils.to_utf8(word) + b" " + embedding_vec.tostring()) else: fout.write( utils.to_utf8("%s %s\n" % (word, " ".join( "%f" % val for val in embedding_vec)))) if not word2vec_format: # pylint: disable=attribute-defined-outside-init self.model = KeyedVectors.load_word2vec_format( np2vec_model_file, binary=binary) if not word2vec_format: self.model.save(np2vec_model_file)
def save_as_text(self, fname, sort_by_word=True):
    """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

    Parameters
    ----------
    fname : str
        Path to output file.
    sort_by_word : bool, optional
        Sort words in lexicographical order before writing them out?

    Notes
    -----
    Format::

        num_docs
        id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
        id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
        ....
        id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

    This text format is great for corpus inspection and debugging. As plaintext,
    it's also easily portable to other tools and frameworks. For better performance
    and to store the entire object state, including collected corpus statistics,
    use :meth:`~gensim.corpora.dictionary.Dictionary.save` and
    :meth:`~gensim.corpora.dictionary.Dictionary.load` instead.

    See Also
    --------
    :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`
        Load :class:`~gensim.corpora.dictionary.Dictionary` from text file.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora import Dictionary
        >>> from gensim.test.utils import get_tmpfile
        >>>
        >>> tmp_fname = get_tmpfile("dictionary")
        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
        >>>
        >>> dct = Dictionary(corpus)
        >>> dct.save_as_text(tmp_fname)
        >>>
        >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
        >>> assert dct.token2id == loaded_dct.token2id

    """
    logger.info("saving dictionary mapping to %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        numdocs_line = "%d\n" % self.num_docs
        fout.write(utils.to_utf8(numdocs_line))
        if sort_by_word:
            for token, tokenid in sorted(iteritems(self.token2id)):
                line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                fout.write(utils.to_utf8(line))
        else:
            for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                fout.write(utils.to_utf8(line))
def save_as_text(self, fname, sort_by_word=True):
    """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

    Parameters
    ----------
    fname : str
        Path to output file.
    sort_by_word : bool, optional
        if True - sort by word in lexicographical order.

    Notes
    -----
    Format::

        num_docs
        id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
        id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
        ....
        id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

    Warnings
    --------
    Text format should be used for corpus inspection. Use
    :meth:`~gensim.corpora.dictionary.Dictionary.save` and
    :meth:`~gensim.corpora.dictionary.Dictionary.load` to store in binary format (pickle)
    for better performance.

    See Also
    --------
    :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`

    Examples
    --------
    >>> from gensim.corpora import Dictionary
    >>> from gensim.test.utils import get_tmpfile
    >>>
    >>> tmp_fname = get_tmpfile("dictionary")
    >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
    >>>
    >>> dct = Dictionary(corpus)
    >>> dct.save_as_text(tmp_fname)
    >>>
    >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
    >>> assert dct.token2id == loaded_dct.token2id

    """
    logger.info("saving dictionary mapping to %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        numdocs_line = "%d\n" % self.num_docs
        fout.write(utils.to_utf8(numdocs_line))
        if sort_by_word:
            for token, tokenid in sorted(iteritems(self.token2id)):
                line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                fout.write(utils.to_utf8(line))
        else:
            for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                fout.write(utils.to_utf8(line))
def save_word_embedding(self, words, embeddings, out_path):
    with smart_open(out_path, 'wb') as fout:
        fout.write(to_utf8("%s %s\n" % embeddings.shape))
        for index, word in enumerate(words):
            row = embeddings[index]
            fout.write(to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the LDA-C format.

    Notes
    -----
    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, float)
        Input corpus in BoW format.
    id2word : dict of (str, str), optional
        Mapping id -> word for `corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Returns
    -------
    list of int
        Offsets for each line in file (in bytes).

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    elif id2word:
        num_terms = 1 + max(id2word)
    else:
        num_terms = 0
    logger.info("storing corpus in Blei's LDA-C format into %s", fname)
    with utils.open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            doc = list(doc)
            offsets.append(fout.tell())
            parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
            fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))
    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.open(fname_vocab, 'wb') as fout:
        for featureid in range(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))
    return offsets
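# Sketch of a single LDA-C line as produced above: "<num_terms> id:weight ...",
# with near-zero weights dropped. Note the leading count is len(doc), i.e. it
# still counts the dropped entry, mirroring the snippet. Data is hypothetical.
doc = [(0, 1.0), (3, 2.5), (7, 1e-9)]
parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
line = "%i %s\n" % (len(doc), ' '.join(parts))
assert line == "3 0:1 3:2.5\n"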
def save_syn1neg_vectors(model, fname='syn1.vectors', binary=False):
    # save output (context) vectors
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % model.syn1neg.shape))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(model.vocab), key=lambda item: -item[1].count):
            row = model.syn1neg[vocab.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, total_vec=None):
    """Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    Parameters
    ----------
    fname : str
        The file path used to save the vectors in
    vocab : dict
        The vocabulary of words
    vectors : numpy.array
        The vectors to be stored
    fvocab : str
        Optional file path used to save the vocabulary
    binary : bool
        If True, the data will be saved in binary word2vec format,
        else it will be saved in plain text.
    total_vec : int
        Optional parameter to explicitly specify total no. of vectors
        (in case word vectors are appended with document vectors afterwards)

    """
    if not (vocab or vectors):
        raise RuntimeError("no input")
    if total_vec is None:
        total_vec = len(vocab)
    vector_size = vectors.shape[1]
    if fvocab is not None:
        logger.info("storing vocabulary in %s", fvocab)
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count)))
    logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
            row = vectors[vocab_.index]
            if binary:
                row = row.astype(REAL)
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))
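# Complementary sketch: reading one record back from the binary layout written
# above (a text header line, then back-to-back "<word> " + raw float32 bytes).
# `read_binary_record` is a hypothetical helper, not part of the snippet.
import numpy as np

def read_binary_record(fin, vector_size):
    word = b""
    while True:
        ch = fin.read(1)
        if not ch or ch == b" ":  # stop at the separator (or EOF)
            break
        word += ch
    vec = np.frombuffer(fin.read(4 * vector_size), dtype=np.float32)
    return word.decode("utf8"), vec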
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the GibbsLda++ format.

    Warnings
    --------
    This function is automatically called by :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize`,
    don't call it directly, call :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize` instead.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format.
    id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
        Mapping between word_ids (integers) and words (strings).
        If not provided, the mapping is constructed directly from `corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Returns
    -------
    list of int
        List of offsets in resulting file for each document (in bytes),
        can be used for :meth:`~gensim.corpora.lowcorpus.LowCorpus.docbyoffset`.

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
    logger.info("storing corpus in List-Of-Words format into %s" % fname)
    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8('%i\n' % len(corpus)))
        for doc in corpus:
            words = []
            for wordid, value in doc:
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([utils.to_unicode(id2word[wordid])] * int(value))
            offsets.append(fout.tell())
            fout.write(utils.to_utf8('%s\n' % ' '.join(words)))
    if truncated:
        logger.warning(
            "List-of-words format can only save vectors with integer elements; "
            "%i float entries were truncated to integer value", truncated)
    return offsets
def write_headers(self, num_docs, num_terms, num_nnz):
    self.fout.write(MmWriter.HEADER_LINE)
    if num_nnz < 0:
        # we don't know the matrix shape/density yet, so only log a general line
        logger.info("saving sparse matrix to %s" % self.fname)
        self.fout.write(utils.to_utf8(' ' * 50 + '\n'))  # 48 digits must be enough for everybody
    else:
        logger.info("saving sparse %sx%s matrix with %i non-zero entries to %s" %
                    (num_docs, num_terms, num_nnz, self.fname))
        self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz)))
    self.last_docno = -1
    self.headers_written = True
def convert_input(self, corpus, time_slices):
    """
    Serialize documents in LDA-C format to a temporary text file.
    """
    logger.info("serializing temporary corpus to %s", self.fcorpustxt())
    # write out the corpus in a file format that DTM understands:
    corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)
    with utils.smart_open(self.ftimeslices(), 'wb') as fout:
        fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))
        for sl in time_slices:
            fout.write(utils.to_utf8(str(sl) + "\n"))
def convert_input(self, corpus, time_slices):
    """
    Serialize documents in LDA-C format to a temporary text file.
    """
    logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
    # write out the corpus in a file format that DTM understands:
    corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)
    with utils.smart_open(self.ftimeslices(), 'wb') as fout:
        fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))
        for sl in time_slices:
            fout.write(utils.to_utf8(str(sl) + "\n"))
def save_word2vec_format(page_vec, p2v_path=p2v_path, binary=True):
    total_vec = len(page_vec)
    vector_size = _len  # module-level vector dimensionality
    with utils.open(p2v_path, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        for word, row in page_vec.items():
            if binary:
                row = row.astype(REAL)
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the GibbsLda++ format.

    Warnings
    --------
    This function is automatically called by :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize`,
    don't call it directly, call :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize` instead.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format.
    id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
        Mapping between word_ids (integers) and words (strings).
        If not provided, the mapping is constructed directly from `corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Returns
    -------
    list of int
        List of offsets in resulting file for each document (in bytes),
        can be used for :meth:`~gensim.corpora.lowcorpus.LowCorpus.docbyoffset`.

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
    logger.info("storing corpus in List-Of-Words format into %s" % fname)
    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8('%i\n' % len(corpus)))
        for doc in corpus:
            words = []
            for wordid, value in doc:
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([utils.to_unicode(id2word[wordid])] * int(value))
            offsets.append(fout.tell())
            fout.write(utils.to_utf8('%s\n' % ' '.join(words)))
    if truncated:
        logger.warning(
            "List-of-words format can only save vectors with integer elements; "
            "%i float entries were truncated to integer value", truncated
        )
    return offsets
def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False):
    """
    Store the input-hidden weight matrix.

    `fname` is the file used to save the vectors in
    `doctag_vec` is an optional boolean indicating whether to store document vectors
    `word_vec` is an optional boolean indicating whether to store word vectors
    (if both doctag_vec and word_vec are True, then both vectors are stored in the same file)
    `prefix` to uniquely identify doctags from word vocab, and avoid collision
    in case of repeated string in doctag and word vocab
    `fvocab` is an optional file used to save the vocabulary
    `binary` is an optional boolean indicating whether the data is to be saved
    in binary word2vec format (default: False)
    """
    total_vec = len(self.wv.vocab) + len(self.docvecs)
    # save word vectors
    if word_vec:
        if not doctag_vec:
            total_vec = len(self.wv.vocab)
        KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec)
    # save document vectors
    if doctag_vec:
        with utils.smart_open(fname, 'ab') as fout:
            if not word_vec:
                # no word vectors were written, so this file still needs a header
                total_vec = len(self.docvecs)
                logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname)
                fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size)))
            # store as in input order
            for i in range(len(self.docvecs)):
                doctag = u"%s%s" % (prefix, self.docvecs.index_to_doctag(i))
                row = self.docvecs.doctag_syn0[i]
                if binary:
                    fout.write(utils.to_utf8(doctag) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (doctag, ' '.join("%f" % val for val in row))))
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the LDA-C format.

    Notes
    -----
    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, float)
        Input corpus in BoW format.
    id2word : dict of (str, str), optional
        Mapping id -> word for `corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Returns
    -------
    list of int
        Offsets for each line in file (in bytes).

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    elif id2word:
        num_terms = 1 + max(id2word)
    else:
        num_terms = 0
    logger.info("storing corpus in Blei's LDA-C format into %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            doc = list(doc)
            offsets.append(fout.tell())
            parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
            fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))
    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in range(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))
    return offsets
def convert_dictionary_to_words2vec(self, fname):
    """
    We need a dictionary in word2vec format to be able to use
    words2vec.most_similar() for printing the n_top words of each topic.
    """
    vocab = self.words2vec_ny
    vector_size = self.word_vec_size
    total_vec = len(self.words2vec_ny)
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        for word, row in vocab.items():
            fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))
    words2vec2 = KeyedVectors.load_word2vec_format(fname)
    return words2vec2
def convert_input(self, corpus, infer=False):
    """
    Serialize documents (lists of unicode tokens) to a temporary text file,
    then convert that text file to MALLET format `outfile`.
    """
    logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
    # write out the corpus in a file format that MALLET understands: one document per line:
    # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens
    with utils.smart_open(self.fcorpustxt(), 'wb') as fout:
        for docno, doc in enumerate(corpus):
            if self.id2word:
                tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
            else:
                tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), [])
            fout.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens))))
    # convert the text file above into MALLET's internal format
    cmd = self.mallet_path + \
        " import-file --keep-sequence --remove-stopwords --token-regex '\\S+' --input %s --output %s"
    if infer:
        cmd += ' --use-pipe-from ' + self.fcorpusmallet()
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
    else:
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
    logger.info("converting temporary corpus to MALLET format with %s" % cmd)
    call(cmd, shell=True)
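# Sketch of one MALLET input line emitted above ("<docno> <label> <tokens>"),
# with BoW counts expanded by repetition; the ids and words are hypothetical.
doc = [(0, 2), (5, 1)]
id2word = {0: "cat", 5: "dog"}
tokens = sum(([id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
assert "%s 0 %s\n" % (0, ' '.join(tokens)) == "0 0 cat cat dog\n"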
def save_as_text(self, fname):
    """Save this HashDictionary to a text file.

    Parameters
    ----------
    fname : str
        Path to output file.

    Notes
    -----
    The format is:
    `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`.

    Examples
    --------
    >>> from gensim.corpora import HashDictionary
    >>> from gensim.test.utils import get_tmpfile
    >>>
    >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
    >>> data = HashDictionary(corpus)
    >>> data.save_as_text(get_tmpfile("dictionary_in_text_format"))

    """
    logger.info("saving HashDictionary mapping to %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        for tokenid in self.keys():
            words = sorted(self[tokenid])
            if words:
                words_df = [(word, self.dfs_debug.get(word, 0)) for word in words]
                words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda x: -x[1])]
                words_df = '\t'.join(words_df)
                fout.write(utils.to_utf8("%i\t%i\t%s\n" % (tokenid, self.dfs.get(tokenid, 0), words_df)))
def convert_input(self, corpus, infer=False):
    """
    Serialize documents (lists of unicode tokens) to a temporary text file,
    then convert that text file to MALLET format `outfile`.
    """
    logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
    # write out the corpus in a file format that MALLET understands: one document per line:
    # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens
    with utils.smart_open(self.fcorpustxt(), "wb") as fout:
        for docno, doc in enumerate(corpus):
            if self.id2word:
                tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
            else:
                tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), [])
            fout.write(utils.to_utf8("%s 0 %s\n" % (docno, " ".join(tokens))))
    # convert the text file above into MALLET's internal format
    cmd = (
        self.mallet_path +
        " import-file --preserve-case --keep-sequence --remove-stopwords --token-regex '\\S+' --input %s --output %s"
    )
    if infer:
        cmd += " --use-pipe-from " + self.fcorpusmallet()
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + ".infer")
    else:
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
    logger.info("converting temporary corpus to MALLET format with %s" % cmd)
    call(cmd, shell=True)
def main():
    tokens_c = Counter()
    articles = 0
    pool = Pool(cpus)
    entites = get_embedded_entities(temp_path)
    model = load_model(wordvector_path)
    idfdict = load_idf(idf_path)
    mapping = load_ID_mapping(mapping_path)
    with utils.open(temp_path, 'ab') as fout:
        # progress total: the original passed len(mapping_path), i.e. the length
        # of the path string; the number of mapping entries is presumably meant
        tbar = tqdm.tqdm(total=len(mapping))
        for _id, tokens in pool.imap_unordered(process_line, get_lines(wikidata_path)):
            articles += 1
            tbar.update(1)
            tokens_c.update(tokens)
            wikeID = get_wikidataID(_id, mapping)
            if wikeID in entites:
                continue
            vec = get_p2v(tokens, model, idfdict)
            fout.write(utils.to_utf8("%s %s\n" % (wikeID, ' '.join(repr(val) for val in vec))))
    pool.terminate()
    print("Finished, Done %d articles." % articles)  # the original passed logging-style args to print()
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save an existing `corpus` to disk.

    Some formats also support saving the dictionary (`feature_id->word` mapping),
    which can in this case be provided by the optional `id2word` parameter.

    >>> MmCorpus.save_corpus('file.mm', corpus)

    Some corpora also support an index of where each document begins, so that
    the documents on disk can be accessed in O(1) time (see the
    `corpora.IndexedCorpus` base class). In this case, `save_corpus` is
    automatically called internally by `serialize`, which does `save_corpus`
    plus saves the index at the same time, so you want to store the corpus with::

    >>> MmCorpus.serialize('file.mm', corpus) # stores index as well, allowing random access to individual documents

    Calling `serialize()` is preferred to calling `save_corpus()`.
    """
    raise NotImplementedError('cannot instantiate abstract base class')

    # example code:
    logger.info("converting corpus to ??? format: %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        for doc in corpus:  # iterate over the document stream
            fmt = str(doc)  # format the document appropriately...
            fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
def write_vector(self, docno, vector):
    """Write a single sparse vector to the file.

    Parameters
    ----------
    docno : int
        Number of document.
    vector : list of (int, float)
        Vector in BoW format.

    Returns
    -------
    (int, int)
        Max word index in vector and len of vector.
        If vector is empty, return (-1, 0).

    """
    assert self.headers_written, "must write Matrix Market file headers before writing data!"
    assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (self.last_docno, docno)
    vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12)  # ignore near-zero entries
    for termid, weight in vector:  # write term ids in sorted order
        # +1 because MM format starts counting from 1
        self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight)))
    self.last_docno = docno
    return (vector[-1][0], len(vector)) if vector else (-1, 0)
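# The 1-based index shift in write_vector above, in isolation: docno 0 with the
# single entry (termid=2, weight=0.5) becomes the Matrix Market line "1 3 0.5".
docno, termid, weight = 0, 2, 0.5
assert "%i %i %s\n" % (docno + 1, termid + 1, weight) == "1 3 0.5\n"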
def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
    """Save a corpus in the SVMlight format.

    The SVMlight `<target>` class tag is taken from the `labels` array, or set
    to 0 for all documents if `labels` is not supplied.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, float)
        Corpus in BoW format.
    id2word : dict of (str, str), optional
        Mapping id -> word.
    labels : list or False
        An SVMlight `<target>` class tags or False if not present.
    metadata : bool
        ARGUMENT WILL BE IGNORED.

    Returns
    -------
    list of int
        Offsets for each line in file (in bytes).

    """
    logger.info("converting corpus to SVMlight format: %s", fname)
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        for docno, doc in enumerate(corpus):
            label = labels[docno] if labels else 0  # target class is 0 by default
            offsets.append(fout.tell())
            fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
    return offsets
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save an existing `corpus` to disk.

    Some formats also support saving the dictionary (`feature_id->word` mapping),
    which can in this case be provided by the optional `id2word` parameter.

    >>> MmCorpus.save_corpus('file.mm', corpus)

    Some corpora also support an index of where each document begins, so that
    the documents on disk can be accessed in O(1) time (see the
    `corpora.IndexedCorpus` base class). In this case, `save_corpus` is
    automatically called internally by `serialize`, which does `save_corpus`
    plus saves the index at the same time, so you want to store the corpus with::

    >>> MmCorpus.serialize('file.mm', corpus) # stores index as well, allowing random access to individual documents

    Calling `serialize()` is preferred to calling `save_corpus()`.
    """
    raise NotImplementedError('cannot instantiate abstract base class')

    # example code:
    logger.info("converting corpus to ??? format: %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        for doc in corpus:  # iterate over the document stream
            fmt = str(doc)  # format the document appropriately...
            fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
    """
    Save a corpus in the UCI Bag-of-Words format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    This function is automatically called by `UciCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        num_terms = 1 + max([-1] + id2word.keys())
    # write out vocabulary
    fname_vocab = fname + '.vocab'
    logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in xrange(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))
    logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname)
    return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000):
    """
    Save a corpus in the UCI Bag-of-Words format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    This function is automatically called by `UciCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        num_terms = 1 + max([-1] + id2word.keys())
    # write out vocabulary
    fname_vocab = fname + '.vocab'
    logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
    with open(fname_vocab, 'wb') as fout:
        for featureid in xrange(num_terms):
            # encode the whole line; the original interpolated utf8 bytes into a str
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))
    logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname)
    return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
def save(self, np2vec_model_file='np2vec.model', binary=False):
    """
    Save the np2vec model.

    Args:
        np2vec_model_file (str): the file containing the np2vec model to load
        binary (bool): boolean indicating whether the np2vec model to load is in binary format
    """
    if self.word_embedding_type == 'fasttext' and self.word_ngrams == 1:
        if not binary:
            logger.error(
                "if word_embedding_type is fasttext and word_ngrams is 1, "
                "binary should be set to True.")
            sys.exit(0)
        # not relevant to prune fasttext subword model
        self.model.save(np2vec_model_file)
    else:
        # prune non NP terms
        logger.info('pruning np2vec model')
        total_vec = 0
        vector_size = self.model.vector_size
        for word in self.model.wv.vocab.keys():
            if self.is_marked(word):
                total_vec += 1
        logger.info(
            "storing %sx%s projection weights for NP's into %s" %
            (total_vec, vector_size, np2vec_model_file))
        with utils.smart_open(np2vec_model_file, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
            # store NP vectors in sorted order: most frequent NP's at the top
            for word, vocab in sorted(iteritems(self.model.wv.vocab), key=lambda item: -item[1].count):
                if self.is_marked(word):
                    embedding_vec = self.model.wv.syn0[vocab.index]
                    if binary:
                        fout.write(utils.to_utf8(word) + b" " + embedding_vec.tostring())
                    else:
                        fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in embedding_vec))))
def restricted_hash(self, token):
    """
    Calculate id of the given token.
    Also keep track of what words were mapped to what ids, for debugging reasons.
    """
    h = self.myhash(utils.to_utf8(token)) % self.id_range
    if self.debug:
        self.token2id[token] = h
        self.id2token.setdefault(h, set()).add(token)
    return h
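# The hashing trick above, in isolation: any token maps into a fixed id range
# without a growing vocabulary. zlib.adler32 stands in for self.myhash here
# (gensim's HashDictionary uses it as the default hash).
import zlib

id_range = 32000
h = zlib.adler32(u"example".encode("utf8")) % id_range
assert 0 <= h < id_range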
def prep_text(self, p_num, sentences, outfile):
    output = open(outfile, 'w')
    # distribute textfiles
    for i, sen in enumerate(sentences):
        sentence = [utils.any2utf8(w) for w in sen]
        for word_a, word_b in zip(sentence, sentence[1:]):
            word_a = re.sub("[^a-zA-Z]+", "", word_a)
            word_b = re.sub("[^a-zA-Z]+", "", word_b)
            if not word_a:
                continue
            phrase = word_a + "_" + word_b
            if phrase in self.vocab:
                output.write(utils.to_utf8(self.vocab[phrase] + ' '))
            else:
                output.write(utils.to_utf8(self.vocab[word_a] + ' '))
        if i % 10000 == 0:
            logger.info("PROGRESS: at sentence #%i" % i)
    logger.info("PROGRESS: at sentence #%i" % i)
    output.close()
def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, total_vec=None):
    """Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    Parameters
    ----------
    fname : str
        The file path used to save the vectors in
    vocab : dict
        The vocabulary of words
    vectors : numpy.array
        The vectors to be stored
    fvocab : str
        Optional file path used to save the vocabulary
    binary : bool
        If True, the data will be saved in binary word2vec format,
        else it will be saved in plain text.
    total_vec : int
        Optional parameter to explicitly specify total no. of vectors
        (in case word vectors are appended with document vectors afterwards)

    """
    if not (vocab or vectors):
        raise RuntimeError("no input")
    if total_vec is None:
        total_vec = len(vocab)
    vector_size = vectors.shape[1]
    if fvocab is not None:
        logger.info("storing vocabulary in %s", fvocab)
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count)))
    logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
            row = vectors[vocab_.index]
            if binary:
                row = row.astype(REAL)
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))
def save_as_text(self, fname, sort_by_word=True):
    """
    Save this Dictionary to a text file, in format:
    `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word,
    or by decreasing word frequency.

    Note: text format should be used for corpus inspection. Use `save`/`load`
    to store in binary format (pickle) for improved performance.
    """
    logger.info("saving dictionary mapping to %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        if sort_by_word:
            for token, tokenid in sorted(iteritems(self.token2id)):
                line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                fout.write(utils.to_utf8(line))
        else:
            for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                fout.write(utils.to_utf8(line))
def update_headers(self, num_docs, num_terms, num_nnz):
    """Update headers with actual values."""
    offset = 0
    values = [utils.to_utf8(str(n)) for n in [num_docs, num_terms, num_nnz]]
    for value in values:
        if len(value) > len(self.FAKE_HEADER):
            raise ValueError('Invalid header: value too large!')
        self.fout.seek(offset)
        self.fout.write(value)
        offset += len(self.FAKE_HEADER)
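# The reserve-then-overwrite pattern behind write_headers/update_headers above,
# demoed on a plain file (hypothetical path): reserve a fixed-width placeholder,
# stream the data, then seek back and fill in the real counts.
with open("/tmp/mm_header_demo.mm", "w+b") as f:
    f.write(b"%%MatrixMarket matrix coordinate real general\n")
    header_pos = f.tell()
    f.write(b" " * 50 + b"\n")   # placeholder wide enough for any counts
    f.write(b"1 3 0.5\n")        # ... stream the actual entries ...
    f.seek(header_pos)
    f.write(b"2 10 1")           # num_docs num_terms num_nnz, overwriting the spaces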
def convert_input(self, corpus, time_slices):
    """Convert `corpus` into LDA-C format via :class:`~gensim.corpora.bleicorpus.BleiCorpus`
    and save it to a temp file; the time-slice counts go to the temporary file produced by
    :meth:`~gensim.models.wrappers.dtmmodel.DtmModel.ftimeslices`.

    Parameters
    ----------
    corpus : iterable of iterable of (int, float)
        Corpus in BoW format.
    time_slices : list of int
        Sequence of timestamps.

    """
    logger.info("serializing temporary corpus to %s", self.fcorpustxt())
    # write out the corpus in a file format that DTM understands:
    corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)
    with utils.smart_open(self.ftimeslices(), 'wb') as fout:
        fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))
        for sl in time_slices:
            fout.write(utils.to_utf8(str(sl) + "\n"))
def corpus2mallet(self, corpus, file_like):
    """
    Write out `corpus` in a file format that MALLET understands: one document per line:

      document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens[NEWLINE]
    """
    for docno, doc in enumerate(corpus):
        if self.id2word:
            tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
        else:
            tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), [])
        file_like.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens))))
def Perform_LDA(i):
    NewFile = []
    tokens = [utils.to_utf8(token) for token in utils.tokenize(' '.join(i), lower=True, errors='ignore')]
    for tagged in nltk.pos_tag(tokens):  # was `i`, which shadowed the parameter
        if findPOS(tagged) == '':
            NewFile.append(tagged[0])
        else:
            NewFile.append(wordnet_lemmatizer.lemmatize(tagged[0], pos=findPOS(tagged)))
    texts = [[word.encode('utf-8')] for word in NewFile]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus, models.ldamodel.LdaModel(corpus=corpus, passes=30, id2word=dictionary, num_topics=3)
def save_as_text(self, fname, sort_by_word=True):
    """
    Save this Dictionary to a text file, in format:
    `id[TAB]fid[TAB]sid[TAB]document frequency[NEWLINE]`;
    `_unidict` is a usual gensim dictionary and is saved alongside as `fname.index`.
    """
    self._unidict.save_as_text(fname + '.index', sort_by_word)
    with utils.smart_open(fname, 'wb') as fout:
        # no word to display in bidict
        for fid_sid, id in sorted(iteritems(self.fid_sid2bid)):
            line = "%i\t%i\t%i\t%i\n" % (id, fid_sid[0], fid_sid[1], self.dfs.get(id, 0))
            fout.write(utils.to_utf8(line))
def save_word2vec_format(self, fname, fvocab=None, binary=False):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.
    """
    if fvocab is not None:
        logger.info("Storing vocabulary in %s" % (fvocab))
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
    logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), self.layer1_size, fname))
    assert (len(self.vocab), self.layer1_size) == self.syn0.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % self.syn0.shape))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
            row = self.syn0[vocab.index]
            # encode every write: the file is opened in binary mode, so plain
            # str writes (as in the original) would fail under Python 3
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
def write_vector(self, docno, vector):
    """
    Write a single sparse vector to the file.

    Sparse vector is any iterable yielding (field id, field value) pairs.
    """
    assert self.headers_written, "must write Matrix Market file headers before writing data!"
    assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (self.last_docno, docno)
    vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12)  # ignore near-zero entries
    for termid, weight in vector:  # write term ids in sorted order
        # +1 because MM format starts counting from 1
        self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight)))
    self.last_docno = docno
    return (vector[-1][0], len(vector)) if vector else (-1, 0)
def test_loadFromText(self):
    """`Dictionary` can be loaded from textfile."""
    tmpf = get_tmpfile('load_dict_test.txt')
    no_num_docs_serialization = to_utf8("2\n1\tprvé\t1\n2\tslovo\t2\n")
    with open(tmpf, "wb") as file:
        file.write(no_num_docs_serialization)
    d = Dictionary.load_from_text(tmpf)
    self.assertEqual(d.token2id[u"prvé"], 1)
    self.assertEqual(d.token2id[u"slovo"], 2)
    self.assertEqual(d.dfs[1], 1)
    self.assertEqual(d.dfs[2], 2)
    self.assertEqual(d.num_docs, 2)
def build_vocab(self, sentences):
    """
    Build vocabulary from a sequence of sentences (can be a once-only generator stream).
    Each sentence must be a list of utf8 strings.
    """
    logger.info("collecting all words and their counts")
    sentence_no, vocab = -1, {}
    total_words = lambda: sum(v.count for v in vocab.itervalues())
    for sentence_no, sentence in enumerate(sentences):
        if sentence_no % 10000 == 0:
            logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                        (sentence_no, total_words(), len(vocab)))
        for word in sentence:
            if word in vocab:
                vocab[word].count += 1
            else:
                vocab[word] = Vocab(count=1)
    logger.info("collected %i word types from a corpus of %i words and %i sentences" %
                (len(vocab), total_words(), sentence_no + 1))
    # assign a unique index to each word
    self.vocab, self.index2word = {}, []
    for word, v in vocab.iteritems():
        if self.min_count <= v.count <= self.max_count:
            try:
                utils.to_utf8(word)
            except Exception:  # skip words that cannot be encoded (the original used a bare except around the whole block)
                continue
            v.index = len(self.vocab)
            self.index2word.append(word)
            self.vocab[word] = v
            self.vocab[word].count_power = pow(v.count, self.c_power)
            self.vocab[word].count_power_2 = pow(v.count, 2 * self.c_power)
    logger.info("total %i word types after removing those with count<%s" %
                (len(self.vocab), self.min_count))
    # add info about each word's Huffman encoding
    self.create_binary_tree()
    self.reset_weights()
def doc2bow(self, document, allow_update=False, return_missing=False):
    """
    Convert `document` (a list of words) into the bag-of-words format = list
    of `(token_id, token_count)` 2-tuples. Each word is assumed to be
    a **tokenized and normalized** utf-8 encoded string. No further preprocessing
    is done on the words in `document`; apply tokenization, stemming etc. before
    calling this method.

    If `allow_update` is set, then also update dictionary in the process: create ids
    for new words. At the same time, update document frequencies -- for each word
    appearing in this document, increase its document frequency (`self.dfs`) by one.

    If `allow_update` is **not** set, this function is `const`, aka read-only.
    """
    result = {}
    missing = {}
    if isinstance(document, string_types):
        raise TypeError("doc2bow expects an array of utf8 tokens on input, not a string")
    document = sorted(utils.to_utf8(token) for token in document)
    # construct (word, frequency) mapping. in python3 this is done simply
    # using Counter(), but here i use itertools.groupby() for the job
    for word_norm, group in itertools.groupby(document):
        frequency = len(list(group))  # how many times does this word appear in the input document
        tokenid = self.token2id.get(word_norm, None)
        if tokenid is None:
            # first time we see this token (~normalized form)
            if return_missing:
                missing[word_norm] = frequency
            if not allow_update:  # if we aren't allowed to create new tokens, continue with the next unique token
                continue
            tokenid = len(self.token2id)
            # new id = number of ids made so far; NOTE this assumes there are no gaps in the id sequence!
            self.token2id[word_norm] = tokenid
        # update how many times a token appeared in the document
        result[tokenid] = frequency
    if allow_update:
        self.num_docs += 1
        self.num_pos += len(document)
        self.num_nnz += len(result)
        # increase document count for each unique token that appeared in the document
        for tokenid in iterkeys(result):
            self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
    # return tokenids, in ascending id order
    result = sorted(iteritems(result))
    if return_missing:
        return result, missing
    else:
        return result
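# The counting core of doc2bow, in isolation: groupby over a sorted token list
# yields (token, frequency) pairs without needing a Counter.
import itertools

document = sorted([b"fox", b"the", b"fox"])
freqs = {tok: len(list(grp)) for tok, grp in itertools.groupby(document)}
assert freqs == {b"fox": 2, b"the": 1}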