def build_ta_index(self):
    clock = Timer()
    # collect "index_title_author" rows in a list; joining once at the end
    # is much faster than repeated string concatenation
    ta_rows = []
    for index, book in enumerate(iter_data_to_stream(self.fname)):
        if index % 50 == 0:
            print("Processing book index #" + str(index))
            clock.print_lap()
        title, author = book[0], book[1]
        ta_rows.append(str(index) + "_" + str(title) + "_" + str(author))
    print("\n    Done!\n")
    clock.print_lap()
    print("\n    Writing to file " + str(self.ta_indexfile) + " ...\n")
    with open(self.ta_indexfile, "w") as text_file:
        text_file.write("\n".join(ta_rows) + "\n")
tokenized_corpus = CorpusPGFin(datafile)

# save title/author index
print "\n    Saving T/A index to disk.\n"
tokenized_corpus.build_ta_index()
# testing
# print(tokenized_corpus.read_ta_index(12))
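
# read_ta_index isn't shown in this section; here is a minimal standalone
# sketch (an assumption, not necessarily the class's own implementation)
# that parses the "index_title_author" lines written by build_ta_index.
# note: it assumes the title contains no "_" (the author may, since the
# author takes the remainder of the line).
def read_ta_index_from_file(ta_indexfile, wanted_index):
    with open(ta_indexfile, "r") as text_file:
        for line in text_file:
            index, title, author = line.rstrip("\n").split("_", 2)
            if int(index) == wanted_index:
                return title, author
    return None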


print "\n    Starting document stream."
doc_stream = (tokenized_corpus.get_token_stream())
print "        doc_stream type: " + str(type(doc_stream))
print "\n"

clock = Timer()

print "\n   Creating dictionary."
id2word_pgfin = gensim.corpora.Dictionary(doc_stream)
print (id2word_pgfin)

clock.print_lap()
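
# note: if get_token_stream() returns a one-shot generator, building the
# Dictionary just exhausted it; recreate the stream before any later
# full pass over the corpus, e.g.:
# doc_stream = tokenized_corpus.get_token_stream()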

# filter tokens: discard those that appear in fewer than 10 documents
# and those that appear in more than 70% of documents;
# with the 20-document test set this keeps roughly 10% of the vocabulary
id2word_pgfin.filter_extremes(no_below=10, no_above=0.7)
print(id2word_pgfin)
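
# testing the filtered dictionary: doc2bow() maps a token list to
# (token_id, count) pairs and silently drops tokens that were filtered out
# (the sample tokens below are made up for illustration)
# print(id2word_pgfin.doc2bow(["ja", "kirja", "kirja"]))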

print "\n    Saving dictionary to disk.\n"
# id2word_pgfin.save('./data/pgfintestdata20.dictionary')
id2word_pgfin.save('./data/pgfin.dictionary')
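
# the saved dictionary can be loaded back in a later session with
# gensim's standard load() classmethod:
# id2word_pgfin = gensim.corpora.Dictionary.load('./data/pgfin.dictionary')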