def train(data=SST_KAGGLE, alg='log'):
    """Fit a classifier on the doc2vec features and save its test predictions.

    Labels are the second element returned by ``vectorize_text(data=data)``;
    features are the ``(train_x, test_x)`` pair returned by
    ``read_doc2vec_pickle(dm=False)``.  Note that ``data`` is only forwarded
    to ``vectorize_text``; the feature pickle is loaded the same way
    regardless of ``data``.

    Args:
        data: Dataset selector forwarded to ``vectorize_text``.
        alg: Classifier to use: ``'svm'`` (``SVC``), ``'log'``
            (``LogisticRegression``) or ``'nb'`` (``MultinomialNB``).

    Returns:
        None.  The predictions for ``test_x`` are passed to ``save_csv``.

    Raises:
        NotImplementedError: If ``alg`` is not one of the supported names.
    """
    # Build the classifier first so an unsupported `alg` fails immediately,
    # before any pickles are loaded.
    if alg == 'svm':
        clf = SVC(verbose=1)
    elif alg == 'log':
        clf = LogisticRegression(verbose=1)  # 61.756, no phrase,
    elif alg == 'nb':
        # NOTE(review): MultinomialNB rejects negative feature values;
        # confirm the doc2vec features are compatible before using 'nb'.
        clf = MultinomialNB()
    else:
        raise NotImplementedError(
            "unsupported alg %r; expected 'svm', 'log' or 'nb'" % (alg,))

    _, train_y, _ = vectorize_text(data=data)
    train_x, test_x = read_doc2vec_pickle(dm=False)

    # Alternative feature set (disabled):
    # train_x_1, test_x_1 = senti_lexicon_vectorizor(data=data, tfidf=True)
    # train_x_2, test_x_2 = senti_wordnet_vectorizer(data=data, tfidf=True)
    #
    # train_x = sparse.hstack((train_x_1, train_x_2))
    # test_x = sparse.hstack((test_x_1, test_x_2))

    # Single-argument print(...) emits the same text under Python 2's print
    # statement and Python 3's print function.
    print("shape for training data is %s" % (train_x.shape,))

    print("training...")
    clf.fit(train_x, train_y)
    predicted = clf.predict(test_x)
    save_csv(predicted)
def doc2vec_kaggle_dataset():
    """Assemble the doc2vec features together with the Kaggle SST labels.

    Features come from ``read_doc2vec_pickle(dm=True, concat=False)`` and the
    labels are the second element returned by ``read_sst_kaggle_pickle()``.

    Returns:
        A ``(train_x, train_y, test_x)`` tuple.
    """
    doc_vectors = read_doc2vec_pickle(dm=True, concat=False)
    train_features, test_features = doc_vectors

    # Only the middle element of the SST triple is needed here.
    _first, train_labels, _last = read_sst_kaggle_pickle()

    return train_features, train_labels, test_features