def topic_word_vectorizer(data=SST_KAGGLE):
    # Diagnostic: count how many documents come back from the LDA model with
    # exactly 50 (topic, probability) pairs. gensim's lda[bow] drops topics
    # whose probability falls below minimum_probability, so this is
    # effectively a check on how sparse the inferred distributions are.
    lda = load_lda(data=data)
    train_doc, _, test_doc = read_sst_kaggle_pickle(use_textblob=True)
    docs = train_doc + test_doc
    corpus = MyCorpus(documents=docs)
    count = 0
    for doc in docs:
        bow_vec = corpus.dictionary.doc2bow(doc)
        topics = lda[bow_vec]
        if len(topics) == 50:
            count += 1
    print(count)
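

# A minimal sketch (not part of the original pipeline): the length-50 check
# above exists because lda[bow] omits topics below minimum_probability. If a
# dense, fixed-length topic vector is wanted instead, something like the
# helper below could be used; `dense_topic_vector` is a hypothetical name
# introduced here for illustration.
def dense_topic_vector(lda, bow_vec):
    # get_document_topics with minimum_probability=0.0 returns (topic_id,
    # probability) pairs for essentially every topic; any id still missing
    # defaults to 0.0 when building the fixed-length list.
    probs = dict(lda.get_document_topics(bow_vec, minimum_probability=0.0))
    return [probs.get(i, 0.0) for i in range(lda.num_topics)]
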
def topic_vectorizer(data=SST_KAGGLE):
    # Build dense topic-probability feature matrices for the train and test
    # documents using the trained LDA model.
    lda = load_lda(data=data)
    train_doc, _, test_doc = read_sst_kaggle_pickle(use_textblob=True)
    docs = train_doc + test_doc
    corpus = MyCorpus(documents=docs)

    # Map each document to a {topic_id: probability} dict via the LDA model.
    input_sent = []
    for doc in docs:
        bow_vec = corpus.dictionary.doc2bow(doc)
        dict_vec = dict(lda[bow_vec])
        input_sent.append(dict_vec)

    print("vectorizing topic probabilities...")
    dv = DictVectorizer()
    all_x = dv.fit_transform(input_sent)

    # Split the combined feature matrix back into its train and test parts.
    train_len = len(train_doc)
    train_x = all_x[:train_len]
    test_x = all_x[train_len:]

    return train_x.toarray(), test_x.toarray()
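

# Usage sketch (assumption, not part of the original module): feed the dense
# topic-probability features into an off-the-shelf classifier. It is assumed
# here that the second value returned by read_sst_kaggle_pickle() holds the
# training labels; `_topic_feature_baseline` is a hypothetical name used only
# for illustration.
def _topic_feature_baseline(data=SST_KAGGLE):
    from sklearn.linear_model import LogisticRegression

    train_x, test_x = topic_vectorizer(data=data)
    _, train_y, _ = read_sst_kaggle_pickle(use_textblob=True)

    # A plain logistic-regression baseline over the topic features; any
    # scikit-learn classifier could be swapped in here.
    clf = LogisticRegression()
    clf.fit(train_x, train_y)
    return clf.predict(test_x)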