def __iter__(self):
    """Yield a word-context vector for every word in the corpus.

    Side effects: increments the pass counter and (re)builds both
    directions of the docid <-> word lookup tables while streaming.
    A progress bar is refreshed every 10 words and a completion
    message is rendered once the pass finishes.
    """
    from datetime import datetime
    from terminal import ProgressBar

    bar = ProgressBar('green', width=20)
    self.passes += 1
    began = datetime.now()
    total = float(len(self))
    for docid, token in enumerate(self._words_iter()):
        # keep both lookup directions in sync
        self.docid2word[docid] = token
        self.word2docid[token] = docid
        if (docid + 1) % 10 == 0:
            # elapsed time scaled by the ratio of remaining to done
            remaining = (datetime.now() - began) * int((total - docid)) // (docid + 1)
            bar.render(docid / total,
                       "corpus pass #%d / ETA %s" % (self.passes, remaining))
        yield self._get_word_context_vector(token)
    elapsed = datetime.now() - began
    bar.render(1, "completed pass #%d in %s" % (self.passes, elapsed))
def index_database(corpus_file_or_folder, index_file, remove_once=True): # if remove once is True, the words that appear in the corpus only # once will not be indexed. wt = WordTransformer() print "Initing database (%s)..." % index_file conn = init_index(index_file) word_ids = dict() print "Calculating total corpus size..." filenames = list(walk(corpus_file_or_folder)) total_offset = 0 total_bytes = sum(float(os.stat(f).st_size) for f in filenames) from datetime import datetime from terminal import ProgressBar pb = ProgressBar(width=20, color='green') start = datetime.now() print "Beginning indexing..." for fileid, filename in enumerate(filenames): offset = 0 with open(filename) as f: add_file(conn, filename, fileid) for j, line in enumerate(f): line_bytes = len(line) if line.strip() == '.START': # special token in the wsj corpus file offset += line_bytes continue words = wt.tokenize(line) processed = [] for word in words: if word in processed: # no need to record when a word appears twice # that'll fall out later continue processed.append(word) wordid = add_word(conn, word, word_ids) add_context(conn, wordid, fileid, offset) offset += line_bytes total_offset += line_bytes if j % 2500 == 0: pct = float(total_offset) / total_bytes eta = ((datetime.now() - start) / total_offset) * int(total_bytes - total_offset) msg = "indexing - ETA %s" % (str(eta)[:10]) pb.render(pct, msg) msg = "completed in %s" % (datetime.now() - start) pb.render(1, msg) if remove_once: print "filtering words appearing only once..." remove_singletons(conn) print "syncing to disk... (almost done!)" close_index(conn)