else:
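    # no explicit vocabulary size given; fall back to the default cap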
    keep_words = DEFAULT_DICT_SIZE

if (os.path.exists(outp + '_wordids.txt.bz2')
        and os.path.exists(outp + '_corpus.pkl.bz2')):
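    # both artifacts exist from a previous run; reuse them instead of rebuilding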
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    wiki = TextCorpus.load(outp + '_corpus.pkl.bz2')
else:
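    # first run: parse the raw input and build the vocabulary from scratch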
    wiki = TextCorpus(inp)
    # prune the vocabulary: drop tokens appearing in fewer than 20 documents
    # or in more than 10% of all documents, then keep at most `keep_words`
    # of the most frequent remaining tokens
    wiki.dictionary.filter_extremes(no_below=20,
                                    no_above=0.1,
                                    keep_n=keep_words)
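    # persist both artifacts so later runs can take the fast path above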
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    # load the id->word mapping back from disk; reloading appears to use less
    # memory than holding on to the wiki.dictionary object built above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# build the tf-idf representation (or reuse a previously serialized one)
if os.path.exists(outp + '_tfidf.mm'):
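    # the tf-idf matrix is already on disk; just memory-map it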
    mm = MmCorpus(outp + '_tfidf.mm')
else:
    tfidf = TfidfModel(wiki, id2word=dictionary, normalize=True)
    # uncomment to also persist the fitted model itself:
    # tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    mm = tfidf[wiki]
    MmCorpus.serialize(outp + '_tfidf.mm', mm, progress_cnt=10000)
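    # a small addition (assumption: downstream code wants fast random access):
    # reload the freshly serialized matrix so `mm` is an indexed, memory-mapped
    # MmCorpus on both branches, rather than the lazy TransformedCorpus that
    # `tfidf[wiki]` returns
    mm = MmCorpus(outp + '_tfidf.mm')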