# DictVectorizer is used below; load_lda, read_sst_kaggle_pickle, MyCorpus and
# SST_KAGGLE are project-local helpers assumed to be defined/imported elsewhere
# in this module.
from sklearn.feature_extraction import DictVectorizer


def topic_word_vectorizer(data=SST_KAGGLE):
    # Diagnostic: count how many documents receive a full 50-topic
    # distribution from the trained LDA model.
    lda = load_lda(data=data)
    train_doc, _, test_doc = read_sst_kaggle_pickle(use_textblob=True)
    docs = train_doc + test_doc
    corpus = MyCorpus(documents=docs)
    count = 0
    for doc in docs:
        bow_vec = corpus.dictionary.doc2bow(doc)
        topics = lda[bow_vec]
        if len(topics) == 50:
            count += 1
    print(count)
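# Illustrative sketch (not part of the original file): gensim's `lda[bow_vec]`
# only returns topics whose probability exceeds the model's
# `minimum_probability` threshold, which is why `len(topics) == 50` rarely
# holds above (assuming the model was trained with 50 topics). A near-complete
# distribution can be requested explicitly via get_document_topics; this
# helper reuses the same project-local load_lda / read_sst_kaggle_pickle /
# MyCorpus helpers assumed above.
def full_topic_distribution(data=SST_KAGGLE):
    lda = load_lda(data=data)
    train_doc, _, test_doc = read_sst_kaggle_pickle(use_textblob=True)
    docs = train_doc + test_doc
    corpus = MyCorpus(documents=docs)
    for doc in docs[:1]:  # inspect the first document only
        bow_vec = corpus.dictionary.doc2bow(doc)
        # minimum_probability=0 asks gensim for effectively every topic,
        # not just the ones above the model's probability threshold.
        full = lda.get_document_topics(bow_vec, minimum_probability=0)
        print(len(full), lda.num_topics)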
def topic_vectorizer(data=SST_KAGGLE):
    # Map each document to its LDA topic-probability distribution and return
    # dense train/test feature matrices.
    lda = load_lda(data=data)
    train_doc, _, test_doc = read_sst_kaggle_pickle(use_textblob=True)
    docs = train_doc + test_doc
    corpus = MyCorpus(documents=docs)
    input_sent = []
    for doc in docs:
        bow_vec = corpus.dictionary.doc2bow(doc)
        # lda[bow_vec] yields (topic_id, probability) pairs; store them as a
        # dict so DictVectorizer can align topic ids across documents.
        dict_vec = dict(lda[bow_vec])
        input_sent.append(dict_vec)
    print("vectorizing topic probabilities...")
    dv = DictVectorizer()
    all_x = dv.fit_transform(input_sent)
    # Split the combined matrix back into its train and test blocks.
    train_len = len(train_doc)
    train_x = all_x[:train_len]
    test_x = all_x[train_len:]
    return train_x.toarray(), test_x.toarray()
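# Usage sketch (illustrative, not from the original file): the dense
# topic-probability matrices returned by topic_vectorizer can serve as
# features for any scikit-learn classifier. The label vector `train_y` is
# assumed to be loaded elsewhere in the project and is passed in here.
def classify_with_topic_features(train_y, data=SST_KAGGLE):
    from sklearn.linear_model import LogisticRegression

    train_x, test_x = topic_vectorizer(data=data)
    clf = LogisticRegression()
    clf.fit(train_x, train_y)   # fit on LDA topic probabilities
    return clf.predict(test_x)  # predictions for the test split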