def save_word2vec_format(self, fname, fvocab=None, binary=False):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    `fname` is the output file; `fvocab`, if given, receives the vocabulary
    (word + count) as a separate plain-text file. With `binary=True` vectors
    are written as raw float bytes, otherwise as space-separated text floats.
    """
    if fvocab is not None:
        # optionally store the vocabulary too, most frequent words first
        logger.info("Storing vocabulary in %s" % (fvocab))
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                vout.write("%s %s\n" % (word, vocab.count))
    logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), self.layer1_size, fname))
    # sanity check: weight matrix must match vocabulary size x layer size
    assert (len(self.vocab), self.layer1_size) == self.syn0.shape
    with utils.smart_open(fname, 'wb') as fout:
        # header line: "<vector count> <dimensionality>"
        fout.write("%s %s\n" % self.syn0.shape)
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
            word = utils.to_utf8(word)  # always store in utf8
            row = self.syn0[vocab.index]
            if binary:
                # word, space, then the raw vector bytes
                # NOTE(review): "%s" on the bytes relies on py2 str semantics — confirm before porting to py3
                fout.write("%s %s\n" % (word, row.tostring()))
            else:
                # word followed by the vector as space-separated text floats
                fout.write("%s %s\n" % (word, ' '.join("%f" % val for val in row)))
def create_dictionary(self):
    """
    Build and return a gensim-style Dictionary, derived directly from the
    corpus and vocabulary data of this object.
    """
    result = Dictionary()
    # replace dfs with defaultdict to avoid downstream KeyErrors:
    # uci vocabularies may contain terms that are not used in the document data
    result.dfs = defaultdict(int)
    result.id2token = self.id2word
    result.token2id = dict((token, tokenid) for tokenid, token in iteritems(self.id2word))
    result.num_docs = self.num_docs
    result.num_nnz = self.num_nnz
    for docno, doc in enumerate(self):
        if docno % 10000 == 0:
            logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs))
        for wordid, count in doc:
            # one more document contains this word; count all its positions
            result.dfs[wordid] += 1
            result.num_pos += count
    return result
def build_vocab(self, sentences):
    """
    Build vocabulary from a sequence of sentences (can be a once-only
    generator stream). Each sentence must be a list of utf8 strings.
    """
    logger.info("collecting all words and their counts")
    raw_vocab = {}
    total_words = 0
    sentence_no = -1  # stays -1 if `sentences` is empty => "0 sentences" logged
    for sentence_no, sentence in enumerate(sentences):
        if sentence_no % 10000 == 0:
            logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                        (sentence_no, total_words, len(raw_vocab)))
        for word in sentence:
            total_words += 1
            try:
                raw_vocab[word].count += 1
            except KeyError:
                raw_vocab[word] = Vocab(count=1)
    logger.info("collected %i word types from a corpus of %i words and %i sentences" %
                (len(raw_vocab), total_words, sentence_no + 1))

    # assign a unique index to each word surviving the min_count threshold
    self.vocab, self.index2word = {}, []
    for word, entry in iteritems(raw_vocab):
        if entry.count >= self.min_count:
            entry.index = len(self.vocab)
            self.index2word.append(word)
            self.vocab[word] = entry
    logger.info("total %i word types after removing those with count<%s" %
                (len(self.vocab), self.min_count))

    # add info about each word's Huffman encoding, then initialize the network weights
    self.create_binary_tree()
    self.reset_weights()
def revdict(d):
    """
    Reverse a dictionary mapping, i.e. map values back to their keys.

    When two keys map to the same value, only one of them survives in the
    result (which one is kept is arbitrary).
    """
    return {value: key for key, value in d.items()}
def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
    """
    Clear document frequency statistics for tokens that appear in

    1. fewer than `no_below` documents (absolute number), or
    2. more than `no_above` documents (fraction of total corpus size, *not*
       absolute number), or
    3. beyond the `keep_n` most frequent tokens surviving (1) and (2)
       (keep all if `None`).

    **Note:** since HashDictionary's id range is fixed and doesn't depend on
    the number of tokens seen, this doesn't really "remove" anything. It only
    clears some supplementary statistics, for easier debugging and a smaller
    RAM footprint.
    """
    # translate the fractional ceiling into an absolute document count
    no_above_abs = int(no_above * self.num_docs)
    candidates = [(word, freq) for word, freq in self.dfs_debug.items()
                  if no_below <= freq <= no_above_abs]
    candidates.sort(key=lambda pair: -pair[1])
    kept = frozenset(word for word, _ in candidates[:keep_n])

    self.dfs_debug = dict((word, freq) for word, freq in self.dfs_debug.items()
                          if word in kept)
    self.token2id = dict((token, tokenid) for token, tokenid in self.token2id.items()
                         if token in self.dfs_debug)
    self.id2token = dict((tokenid, set(tok for tok in tokens if tok in self.dfs_debug))
                         for tokenid, tokens in self.id2token.items())
    # drop dfs entries for ids whose token set became empty above
    self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.items()
                    if self.id2token.get(tokenid, set()))  # for word->document frequency
    logger.info("kept statistics for which were in no less than %i and no more than %i (=%.1f%%) documents" % (no_below, no_above_abs, 100.0 * no_above))
def doc2bow(self, document, allow_update=False, return_missing=False):
    """
    Convert `document` (a list of words) into the bag-of-words format = list
    of `(token_id, token_count)` 2-tuples. Each word is assumed to be
    a **tokenized and normalized** utf-8 encoded string. No further preprocessing
    is done on the words in `document`; apply tokenization, stemming etc. before
    calling this method.

    If `allow_update` is set, then also update dictionary in the process: create ids
    for new words. At the same time, update document frequencies -- for each word
    appearing in this document, increase its document frequency (`self.dfs`) by one.

    If `allow_update` is **not** set, this function is `const`, aka read-only.

    If `return_missing` is set, also return the out-of-vocabulary words and
    their in-document frequencies as a second return value.
    """
    result = {}
    missing = {}
    if isinstance(document, string_types):
        raise TypeError("doc2bow expects an array of utf8 tokens on input, not a string")
    document = sorted(utils.to_utf8(token) for token in document)
    # construct (word, frequency) mapping. in python3 this is done simply
    # using Counter(), but here i use itertools.groupby() for the job
    for word_norm, group in itertools.groupby(document):
        frequency = len(list(group))  # how many times does this word appear in the input document
        tokenid = self.token2id.get(word_norm, None)
        if tokenid is None:
            # first time we see this token (~normalized form)
            if return_missing:
                missing[word_norm] = frequency
            if not allow_update:  # if we aren't allowed to create new tokens, continue with the next unique token
                continue
            tokenid = len(self.token2id)
            self.token2id[word_norm] = tokenid  # new id = number of ids made so far; NOTE this assumes there are no gaps in the id sequence!

        # update how many times a token appeared in the document
        result[tokenid] = frequency

    if allow_update:
        self.num_docs += 1
        self.num_pos += len(document)
        self.num_nnz += len(result)
        # increase document count for each unique token that appeared in the document
        for tokenid in iterkeys(result):
            self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

    # return tokenids, in ascending id order
    result = sorted(iteritems(result))
    if return_missing:
        return result, missing
    else:
        return result
def save_as_text(self, fname):
    """
    Save this Dictionary to a text file, one entry per line, in the format:
    `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`.

    Note: use `save`/`load` to store in binary format instead (pickle).
    """
    logger.info("saving dictionary mapping to %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        # write tokens in lexicographic order
        for token in sorted(self.token2id):
            tokenid = self.token2id[token]
            fout.write("%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0)))
def compactify(self):
    """
    Assign new, consecutive word ids to all words.

    Useful after tokens have been removed via :func:`filter_tokens`, which
    leaves gaps in the id series; calling this method closes those gaps.
    """
    logger.debug("rebuilding dictionary, shrinking gaps")

    # map each old id to a new compact id, in current iteration order
    idmap = {}
    for new_id, old_id in enumerate(self.token2id.values()):
        idmap[old_id] = new_id

    # reassign all mappings to the new ids
    self.token2id = dict((token, idmap[old_id]) for token, old_id in self.token2id.items())
    self.id2token = {}  # cleared; the reverse mapping is recomputed on demand
    self.dfs = dict((idmap[old_id], freq) for old_id, freq in self.dfs.items())
def compactify(self):
    """
    Assign new word ids to all words.

    This is done to make the ids more compact, e.g. after some tokens have
    been removed via :func:`filter_tokens` and there are gaps in the id series.
    Calling this method will remove the gaps.
    """
    logger.debug("rebuilding dictionary, shrinking gaps")

    # build mapping from old id -> new id (new ids are consecutive from 0)
    idmap = dict(izip(itervalues(self.token2id), xrange(len(self.token2id))))

    # reassign mappings to new ids
    self.token2id = dict((token, idmap[tokenid]) for token, tokenid in iteritems(self.token2id))
    # clear the reverse mapping; it is rebuilt lazily from token2id when accessed
    self.id2token = {}
    self.dfs = dict((idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs))
def cossim(vec1, vec2):
    """
    Return the cosine similarity of two sparse vectors, each given as a list
    of `(index, value)` 2-tuples (or anything `dict()` accepts).

    Returns 0.0 when either vector is empty. Explicit zero entries are not
    allowed (they would make a vector's norm zero).
    """
    vec1, vec2 = dict(vec1), dict(vec2)
    if not vec1 or not vec2:
        return 0.0
    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    assert norm1 > 0.0 and norm2 > 0.0, "sparse documents must not contain any explicit zero entries"
    # iterate over the shorter vector when computing the dot product
    shorter, longer = (vec1, vec2) if len(vec1) <= len(vec2) else (vec2, vec1)
    dot = sum(weight * longer.get(idx, 0.0) for idx, weight in shorter.items())
    return dot / (norm1 * norm2)  # rescale by vector lengths
def doc2bow(self, document, allow_update=False, return_missing=False):
    """
    Convert `document` (a list of words) into the bag-of-words format: a
    sorted list of `(token_id, token_count)` 2-tuples.

    Each word is assumed to be a **tokenized and normalized** utf-8 encoded
    string; apply tokenization, stemming etc. before calling this method.

    If `allow_update` is set, also update the dictionary in the process:
    create ids for new words and, for each word appearing in this document,
    increase its document frequency (`self.dfs`) by one. Otherwise the call
    is `const`, aka read-only. If `return_missing` is set, also return the
    unknown words and their in-document frequencies.
    """
    if isinstance(document, string_types):
        raise TypeError("doc2bow expects an array of utf8 tokens on input, not a string")

    counts = {}
    missing = {}
    document = sorted(utils.to_utf8(token) for token in document)
    # count occurrences of each unique token; python3 would use Counter(),
    # itertools.groupby() over the sorted tokens does the same job here
    for token, occurrences in itertools.groupby(document):
        freq = sum(1 for _ in occurrences)
        tokenid = self.token2id.get(token, None)
        if tokenid is None:
            # token not seen before (~normalized form)
            if return_missing:
                missing[token] = freq
            if not allow_update:
                # not allowed to grow the vocabulary => skip this token
                continue
            # new id = number of ids made so far; assumes a gapless id sequence!
            tokenid = len(self.token2id)
            self.token2id[token] = tokenid
        counts[tokenid] = freq

    if allow_update:
        self.num_docs += 1
        self.num_pos += len(document)
        self.num_nnz += len(counts)
        # each unique token in this document counts as one more document occurrence
        for tokenid in iterkeys(counts):
            self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

    # return tokenids, in ascending id order
    bow = sorted(iteritems(counts))
    if return_missing:
        return bow, missing
    return bow
def filter_tokens(self, bad_ids=None, good_ids=None):
    """
    Remove the selected `bad_ids` tokens from all dictionary mappings, or
    keep only the selected `good_ids` and remove the rest.

    `bad_ids` and `good_ids` are collections of word ids to remove / keep.
    """
    if bad_ids is not None:
        bad_ids = set(bad_ids)
        self.token2id = dict((token, tokenid) for token, tokenid in self.token2id.items()
                             if tokenid not in bad_ids)
        self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.items()
                        if tokenid not in bad_ids)
    if good_ids is not None:
        good_ids = set(good_ids)
        self.token2id = dict((token, tokenid) for token, tokenid in self.token2id.items()
                             if tokenid in good_ids)
        self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.items()
                        if tokenid in good_ids)
def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
    """
    Remove document frequency statistics for tokens that appear in

    1. less than `no_below` documents (absolute number) or
    2. more than `no_above` documents (fraction of total corpus size, *not*
       absolute number).
    3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or
       keep all if `None`).

    **Note:** since HashDictionary's id range is fixed and doesn't depend on
    the number of tokens seen, this doesn't really "remove" anything. It only
    clears some supplementary statistics, for easier debugging and a smaller RAM
    footprint.
    """
    no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold
    # tokens whose document frequency falls within [no_below, no_above_abs]
    ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs]
    # of those, keep only the `keep_n` most frequent ones
    ok = frozenset(word for word, freq in sorted(ok, key=lambda item: -item[1])[:keep_n])

    self.dfs_debug = dict((word, freq) for word, freq in iteritems(self.dfs_debug) if word in ok)
    self.token2id = dict((token, tokenid) for token, tokenid in iteritems(self.token2id) if token in self.dfs_debug)
    # shrink each id's token set to the surviving tokens; empty sets lose their dfs entry below
    self.id2token = dict((tokenid, set(token for token in tokens if token in self.dfs_debug)) for tokenid, tokens in iteritems(self.id2token))
    self.dfs = dict((tokenid, freq) for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, set()))  # for word->document frequency

    logger.info("kept statistics for which were in no less than %i and no more than %i (=%.1f%%) documents" % (no_below, no_above_abs, 100.0 * no_above))
def doc2bow(self, document, allow_update=False, return_missing=False):
    """
    Convert `document` (a list of words) into the bag-of-words format: a
    sorted list of `(token_id, token_count)` 2-tuples, where token ids come
    from `self.restricted_hash` (so several words may share one id).

    Each word is assumed to be a **tokenized and normalized** utf-8 encoded
    string; no further preprocessing is done on the words.

    If `allow_update` or `self.allow_update` is set, also update overall
    corpus statistics and document frequencies: for each id appearing in
    this document, increase its document frequency (`self.dfs`) by one.
    """
    bow = {}
    missing = {}
    document = sorted(document)  # groupby below needs equal tokens adjacent
    for token, occurrences in itertools.groupby(document):
        freq = sum(1 for _ in occurrences)  # occurrences of this word in the document
        tokenid = self.restricted_hash(token)
        bow[tokenid] = bow.get(tokenid, 0) + freq
        if self.debug:
            # document count per unique *token* (pre-hashing), for debugging
            self.dfs_debug[token] = self.dfs_debug.get(token, 0) + 1

    if allow_update or self.allow_update:
        self.num_docs += 1
        self.num_pos += len(document)
        self.num_nnz += len(bow)
        if self.debug:
            # document count per unique token *id*; done here because several
            # words may map to the same tokenid
            for tokenid in bow:
                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

    bow = sorted(bow.items())  # return tokenids, in ascending id order
    if return_missing:
        return bow, missing
    return bow
def merge_with(self, other):
    """
    Merge another dictionary into this dictionary, mapping same tokens to the
    same ids and new tokens to new ids. The purpose is to merge two corpora
    created using two different dictionaries, one from `self` and one from `other`.

    `other` can be any id=>word mapping (a dict, a Dictionary object, ...).

    Return a transformation object which, when accessed as
    `result[doc_from_other_corpus]`, will convert documents from a corpus built
    using the `other` dictionary into a document using the new, merged
    dictionary (see :class:`gensim.interfaces.TransformationABC`).

    Example:

    >>> dict1 = Dictionary(some_documents)
    >>> dict2 = Dictionary(other_documents)  # ids not compatible with dict1!
    >>> dict2_to_dict1 = dict1.merge_with(dict2)
    >>> # now we can merge corpora from the two incompatible dictionaries into one
    >>> merged_corpus = itertools.chain(some_corpus_from_dict1, dict2_to_dict1[some_corpus_from_dict2])

    """
    old2new = {}
    for other_id, other_token in iteritems(other):
        if other_token in self.token2id:
            new_id = self.token2id[other_token]
        else:
            new_id = len(self.token2id)
            self.token2id[other_token] = new_id
            self.dfs[new_id] = 0
        old2new[other_id] = new_id
        # was a bare `except:` — narrowed so real bugs (and KeyboardInterrupt/
        # SystemExit) are no longer silently swallowed
        try:
            self.dfs[new_id] += other.dfs[other_id]
        except (AttributeError, KeyError):
            # `other` isn't a Dictionary (probably just a dict) => ignore dfs, keep going
            pass
    try:
        self.num_docs += other.num_docs
        self.num_nnz += other.num_nnz
        self.num_pos += other.num_pos
    except AttributeError:
        # plain mappings carry no corpus statistics => nothing to merge
        pass

    import gensim.models
    return gensim.models.VocabTransform(old2new)
def doc2bow(self, document, allow_update=False, return_missing=False):
    """
    Convert `document` (a list of words) into the bag-of-words format: a
    sorted list of `(token_id, token_count)` 2-tuples. Token ids are produced
    by `self.restricted_hash`, so distinct words may collide on one id.

    Each word is assumed to be a **tokenized and normalized** utf-8 encoded
    string; apply tokenization, stemming etc. before calling this method.

    If `allow_update` or `self.allow_update` is set, then also update
    dictionary in the process: update overall corpus statistics and document
    frequencies. For each id appearing in this document, increase its
    document frequency (`self.dfs`) by one.
    """
    result = {}
    missing = {}
    # plain sorted list, so that identical tokens are adjacent for groupby
    document = sorted(document)
    for word, grouped in itertools.groupby(document):
        count = len(list(grouped))  # occurrences of `word` in this document
        hashed = self.restricted_hash(word)
        if hashed in result:
            result[hashed] += count
        else:
            result[hashed] = count
        if self.debug:
            # per-word (pre-hash) document frequency, kept for debugging
            self.dfs_debug[word] = self.dfs_debug.get(word, 0) + 1

    if allow_update or self.allow_update:
        self.num_docs += 1
        self.num_pos += len(document)
        self.num_nnz += len(result)
        if self.debug:
            # per-id document frequency; several words may share one id,
            # hence counted here rather than inside the token loop
            for hashed in result:
                self.dfs[hashed] = self.dfs.get(hashed, 0) + 1

    result = sorted(result.items())  # ascending id order
    if return_missing:
        return result, missing
    else:
        return result
def __init__(self, fname, id2word=None, line2words=split_on_space):
    """
    Initialize the corpus from a file.

    `id2word` and `line2words` are optional parameters.

    If provided, `id2word` is a dictionary mapping between word_ids (integers)
    and words (strings). If not provided, the mapping is constructed from
    the documents.

    `line2words` is a function which converts lines into tokens. Defaults to
    simple splitting on spaces.
    """
    IndexedCorpus.__init__(self, fname)
    logger.info("loading corpus from %s" % fname)

    self.fname = fname  # input file, see class doc for format
    self.line2words = line2words  # how to translate lines into words (simply split on space by default)
    self.num_docs = self._calculate_num_docs()

    if not id2word:
        # build a list of all word types in the corpus (distinct words)
        logger.info("extracting vocabulary from the corpus")
        all_terms = set()
        self.use_wordids = False  # return documents as (word, wordCount) 2-tuples
        # iterating over `self` here reads the raw (word, count) documents,
        # since use_wordids is still False at this point
        for doc in self:
            all_terms.update(word for word, wordCnt in doc)
        all_terms = sorted(all_terms)  # sort the list of all words; rank in that list = word's integer id
        self.id2word = dict(izip(xrange(len(all_terms)), all_terms))  # build a mapping of word id(int) -> word (string)
    else:
        logger.info("using provided word mapping (%i ids)" % len(id2word))
        self.id2word = id2word
    self.word2id = dict((v, k) for k, v in iteritems(self.id2word))
    self.num_terms = len(self.word2id)
    self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples

    logger.info("loaded corpus with %i documents and %i terms from %s" % (self.num_docs, self.num_terms, fname))
def __init__(self, fname, id2word=None, line2words=split_on_space):
    """
    Initialize the corpus from a file.

    `id2word` and `line2words` are optional parameters.

    If provided, `id2word` is a dictionary mapping between word_ids (integers)
    and words (strings). If not provided, the mapping is constructed from
    the documents.

    `line2words` is a function which converts lines into tokens. Defaults to
    simple splitting on spaces.
    """
    IndexedCorpus.__init__(self, fname)
    logger.info("loading corpus from %s" % fname)

    self.fname = fname  # input file, see class doc for format
    self.line2words = line2words  # how to translate lines into words (simply split on space by default)
    self.num_docs = self._calculate_num_docs()

    if not id2word:
        # no mapping given => build a list of all word types in the corpus (distinct words)
        logger.info("extracting vocabulary from the corpus")
        all_terms = set()
        self.use_wordids = False  # return documents as (word, wordCount) 2-tuples
        for doc in self:  # reads raw (word, count) docs while use_wordids is still False
            all_terms.update(word for word, wordCnt in doc)
        all_terms = sorted(all_terms)  # sort the list of all words; rank in that list = word's integer id
        self.id2word = dict(izip(xrange(len(all_terms)), all_terms))  # build a mapping of word id(int) -> word (string)
    else:
        logger.info("using provided word mapping (%i ids)" % len(id2word))
        self.id2word = id2word
    self.word2id = dict((v, k) for k, v in iteritems(self.id2word))
    self.num_terms = len(self.word2id)
    self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples

    logger.info("loaded corpus with %i documents and %i terms from %s" % (self.num_docs, self.num_terms, fname))
def save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset()):
    """
    Save the object to file (also see `load`).

    If `separately` is None, automatically detect large numpy/scipy.sparse
    arrays in the object being stored, and store them into separate files.
    This avoids pickle memory errors and allows mmap'ing large arrays back
    on load efficiently.

    You can also set `separately` manually, in which case it must be a list
    of attribute names to be stored in separate files. The automatic check
    is not performed in this case.

    `ignore` is a set of attribute names to *not* serialize (file handles,
    caches etc). On subsequent load() these attributes will be set to None.
    """
    logger.info("saving %s object under %s, separately %s" % (self.__class__.__name__, fname, separately))
    subname = lambda suffix: fname + '.' + suffix + '.npy'
    tmp = {}
    if separately is None:
        # auto-detect large arrays worth storing as standalone .npy files
        separately = []
        for attrib, val in iteritems(self.__dict__):
            if isinstance(val, numpy.ndarray) and val.size >= sep_limit:
                separately.append(attrib)
            elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and val.nnz >= sep_limit:
                separately.append(attrib)
    # whatever's in `separately` or `ignore` at this point won't get pickled anymore
    for attrib in separately + list(ignore):
        if hasattr(self, attrib):
            tmp[attrib] = getattr(self, attrib)
            delattr(self, attrib)
    try:
        numpys, scipys, ignoreds = [], [], []
        for attrib, val in iteritems(tmp):
            if isinstance(val, numpy.ndarray) and attrib not in ignore:
                numpys.append(attrib)
                logger.info("storing numpy array '%s' to %s" % (attrib, subname(attrib)))
                numpy.save(subname(attrib), numpy.ascontiguousarray(val))
            elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore:
                scipys.append(attrib)
                logger.info("storing scipy.sparse array '%s' under %s" % (attrib, subname(attrib)))
                # store the three CSR/CSC component arrays separately...
                numpy.save(subname(attrib) + '.data.npy', val.data)
                numpy.save(subname(attrib) + '.indptr.npy', val.indptr)
                numpy.save(subname(attrib) + '.indices.npy', val.indices)
                # ...then pickle the sparse object with its arrays detached,
                # restoring them afterwards no matter what
                data, indptr, indices = val.data, val.indptr, val.indices
                val.data, val.indptr, val.indices = None, None, None
                try:
                    # NOTE(review): `pickle` is used as a callable here — presumably a
                    # project helper (e.g. utils.pickle), not the stdlib module; confirm
                    pickle(val, subname(attrib))  # store array-less object
                finally:
                    val.data, val.indptr, val.indices = data, indptr, indices
            else:
                logger.info("not storing attribute %s" % (attrib))
                ignoreds.append(attrib)
        # remember which attributes went where, so load() can reattach them
        self.__dict__['__numpys'] = numpys
        self.__dict__['__scipys'] = scipys
        self.__dict__['__ignoreds'] = ignoreds
        pickle(self, fname)
    finally:
        # restore the attributes removed above, even if pickling failed
        for attrib, val in iteritems(tmp):
            setattr(self, attrib, val)
def __getitem__(self, tokenid): if len(self.id2token) != len(self.token2id): # the word->id mapping has changed (presumably via add_documents); # recompute id->word accordingly self.id2token = dict((v, k) for k, v in iteritems(self.token2id)) return self.id2token[tokenid] # will throw for non-existent ids
def precompute_idfs(wglobal, dfs, total_docs):
    """Precompute the inverse document frequency mapping for all terms.

    Not strictly necessary -- could be computed on the fly in
    TfidfModel.__getitem__ -- this just speeds things up a little.
    """
    return {termid: wglobal(df, total_docs) for termid, df in dfs.items()}
def precompute_idfs(wglobal, dfs, total_docs):
    """Precompute the inverse document frequency mapping for all terms.

    Not strictly necessary (could be computed lazily in
    TfidfModel.__getitem__); precomputing just speeds things up a little.
    """
    result = {}
    for termid in dfs:
        result[termid] = wglobal(dfs[termid], total_docs)
    return result
def accuracy(self, questions, restrict_vocab=30000):
    """
    Compute accuracy of the model.

    `questions` is a filename where lines are 4-tuples of words, split into
    sections by ": SECTION NAME" lines. See
    https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt
    for an example.

    The accuracy is reported (=printed to log and returned as a list) for each
    section separately, plus there's one aggregate summary at the end.

    Use `restrict_vocab` to ignore all questions containing a word whose
    frequency is not in the top-N most frequent words (default top 30,000).

    This method corresponds to the `compute-accuracy` script of the original
    C word2vec.
    """
    ok_vocab = dict(sorted(iteritems(self.vocab), key=lambda item: -item[1].count)[:restrict_vocab])
    ok_index = set(v.index for v in itervalues(ok_vocab))

    def log_accuracy(section):
        correct, incorrect = section['correct'], section['incorrect']
        if correct + incorrect > 0:
            logger.info("%s: %.1f%% (%i/%i)" %
                        (section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect))

    sections, section = [], None
    # `with` closes the questions file (it was previously left open)
    with open(questions) as fin:
        for line_no, line in enumerate(fin):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    log_accuracy(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': 0, 'incorrect': 0}
            else:
                if not section:
                    raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
                try:
                    a, b, c, expected = [word.lower() for word in line.split()]  # TODO assumes vocabulary preprocessing uses lowercase, too...
                except ValueError:
                    # malformed question: `continue` was missing here before, so the
                    # code fell through and reused stale (or undefined) a/b/c/expected
                    logger.info("skipping invalid line #%i in %s" % (line_no, questions))
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    logger.debug("skipping line #%i with OOV words: %s" % (line_no, line))
                    continue

                ignore = set(self.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
                predicted = None
                # find the most likely prediction, ignoring OOV words and input words
                for index in argsort(self.most_similar(positive=[b, c], negative=[a], topn=False))[::-1]:
                    if index in ok_index and index not in ignore:
                        predicted = self.index2word[index]
                        if predicted != expected:
                            logger.debug("%s: expected %s, predicted %s" % (line.strip(), expected, predicted))
                        break
                section['correct' if predicted == expected else 'incorrect'] += 1

    if section:
        # store the last section, too
        sections.append(section)
        log_accuracy(section)

    total = {
        'section': 'total',
        'correct': sum(s['correct'] for s in sections),
        'incorrect': sum(s['incorrect'] for s in sections),
    }
    log_accuracy(total)
    sections.append(total)
    return sections
def save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset()):
    """
    Save the object to file (also see `load`).

    If `separately` is None, automatically detect large numpy/scipy.sparse
    arrays in the object being stored, and store them into separate files.
    This avoids pickle memory errors and allows mmap'ing large arrays back
    on load efficiently.

    You can also set `separately` manually, in which case it must be a list
    of attribute names to be stored in separate files. The automatic check
    is not performed in this case.

    `ignore` is a set of attribute names to *not* serialize (file handles,
    caches etc). On subsequent load() these attributes will be set to None.
    """
    logger.info("saving %s object under %s, separately %s" % (self.__class__.__name__, fname, separately))
    subname = lambda suffix: fname + '.' + suffix + '.npy'
    tmp = {}
    if separately is None:
        # auto-detect attributes large enough to warrant standalone .npy storage
        separately = []
        for attrib, val in iteritems(self.__dict__):
            if isinstance(val, numpy.ndarray) and val.size >= sep_limit:
                separately.append(attrib)
            elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and val.nnz >= sep_limit:
                separately.append(attrib)
    # whatever's in `separately` or `ignore` at this point won't get pickled anymore
    for attrib in separately + list(ignore):
        if hasattr(self, attrib):
            tmp[attrib] = getattr(self, attrib)
            delattr(self, attrib)
    try:
        numpys, scipys, ignoreds = [], [], []
        for attrib, val in iteritems(tmp):
            if isinstance(val, numpy.ndarray) and attrib not in ignore:
                numpys.append(attrib)
                logger.info("storing numpy array '%s' to %s" % (attrib, subname(attrib)))
                numpy.save(subname(attrib), numpy.ascontiguousarray(val))
            elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore:
                scipys.append(attrib)
                logger.info("storing scipy.sparse array '%s' under %s" % (attrib, subname(attrib)))
                # write the CSR/CSC component arrays as separate .npy files...
                numpy.save(subname(attrib) + '.data.npy', val.data)
                numpy.save(subname(attrib) + '.indptr.npy', val.indptr)
                numpy.save(subname(attrib) + '.indices.npy', val.indices)
                # ...then pickle the sparse object without its arrays, and
                # reattach them afterwards regardless of success
                data, indptr, indices = val.data, val.indptr, val.indices
                val.data, val.indptr, val.indices = None, None, None
                try:
                    # NOTE(review): `pickle` is called as a function — presumably a
                    # project helper (e.g. utils.pickle), not the stdlib module; confirm
                    pickle(val, subname(attrib))  # store array-less object
                finally:
                    val.data, val.indptr, val.indices = data, indptr, indices
            else:
                logger.info("not storing attribute %s" % (attrib))
                ignoreds.append(attrib)
        # record which attributes were externalized, for load() to reattach
        self.__dict__['__numpys'] = numpys
        self.__dict__['__scipys'] = scipys
        self.__dict__['__ignoreds'] = ignoreds
        pickle(self, fname)
    finally:
        # restore the detached attributes, even if pickling raised
        for attrib, val in iteritems(tmp):
            setattr(self, attrib, val)