def similar_to(self, word, n=10, should_enrich_with_web=False):
    """Return the ``n`` words most similar to ``word``.

    The query word is normalized through WordTransformer, looked up in
    the vector corpus, optionally enriched with web-derived counts,
    tf-idf weighted, and run through the similarity index.

    :param word: query word (normalized before lookup).
    :param n: number of best matches to return.
    :param should_enrich_with_web: when True, augment the word's vector
        via ``self._enrich_vec_with_web`` before the similarity query.
    :return: list of ``(word, score)`` tuples, best matches first.
    """
    # NOTE(review): the original defined two inner helpers (_max_dim,
    # _unvec) that were never called; they have been removed as dead code.
    wt = WordTransformer()
    word = wt.transform(word)
    # limit the similarity index to the n best matches
    self.sim.numBest = n
    vec = self.vector_corpus[word]
    if should_enrich_with_web:
        vec = self._enrich_vec_with_web(vec, word)
    vec = self.tfidf[vec]
    # map internal document ids back to their words
    return [(self.docid2word[docid], score)
            for docid, score in self.sim[vec]]
def index_database(corpus_file_or_folder, index_file, remove_once=True):
    """Index every word occurrence in a corpus into an on-disk index.

    Walks ``corpus_file_or_folder``, records for each distinct word per
    line the (word, file, byte-offset) context, and renders a progress
    bar with an ETA while doing so.

    :param corpus_file_or_folder: file or directory tree to index.
    :param index_file: path of the index database to create.
    :param remove_once: if True, words that appear in the corpus only
        once are filtered out at the end.
    """
    from datetime import datetime
    from terminal import ProgressBar

    wt = WordTransformer()
    # single-argument parenthesized print: identical output on Python 2,
    # and also valid Python 3 syntax
    print("Initing database (%s)..." % index_file)
    conn = init_index(index_file)
    word_ids = dict()

    print("Calculating total corpus size...")
    filenames = list(walk(corpus_file_or_folder))
    total_offset = 0
    total_bytes = sum(float(os.stat(f).st_size) for f in filenames)

    pb = ProgressBar(width=20, color='green')
    start = datetime.now()
    print("Beginning indexing...")
    for fileid, filename in enumerate(filenames):
        offset = 0
        with open(filename) as f:
            add_file(conn, filename, fileid)
            for j, line in enumerate(f):
                line_bytes = len(line)
                if line.strip() == '.START':
                    # special token in the wsj corpus file
                    offset += line_bytes
                    continue
                words = wt.tokenize(line)
                # set membership is O(1); the original used a list,
                # making the duplicate check O(n) per word
                processed = set()
                for word in words:
                    if word in processed:
                        # no need to record when a word appears twice
                        # that'll fall out later
                        continue
                    processed.add(word)
                    wordid = add_word(conn, word, word_ids)
                    add_context(conn, wordid, fileid, offset)
                offset += line_bytes
                total_offset += line_bytes
                if j % 2500 == 0:
                    # progress + ETA extrapolated from bytes processed so far
                    pct = float(total_offset) / total_bytes
                    eta = ((datetime.now() - start) / total_offset) * int(total_bytes - total_offset)
                    msg = "indexing - ETA %s" % (str(eta)[:10])
                    pb.render(pct, msg)
    msg = "completed in %s" % (datetime.now() - start)
    pb.render(1, msg)
    if remove_once:
        print("filtering words appearing only once...")
        remove_singletons(conn)
    print("syncing to disk... (almost done!)")
    close_index(conn)