Пример #1
0
def train(data=SST_KAGGLE, alg='log'):
    """Fit a classifier on doc2vec features and save its test predictions.

    Training labels are the second value returned by
    ``vectorize_text(data=data)``; the train/test feature matrices are the
    two values returned by ``read_doc2vec_pickle(dm=False)``.  Note that
    ``data`` only influences the labels: the features are loaded by
    ``read_doc2vec_pickle`` without reference to it.

    The predictions for the test features are handed to ``save_csv``;
    nothing is returned.

    Args:
        data: Dataset identifier forwarded to ``vectorize_text``.
        alg: Classifier to use: ``'svm'`` (``SVC``), ``'log'``
            (``LogisticRegression``) or ``'nb'`` (``MultinomialNB``).

    Raises:
        NotImplementedError: If ``alg`` is not one of the supported names.
            Raised before any data is loaded, so a typo fails immediately.
    """
    # Pick the classifier first: constructing it is cheap, and an invalid
    # ``alg`` should not cost a full load of the labels and features.
    if alg == 'svm':
        clf = SVC(verbose=1)
    elif alg == 'log':
        # 61.756, no phrase,  (presumably a recorded score -- TODO confirm)
        clf = LogisticRegression(verbose=1)
    elif alg == 'nb':
        clf = MultinomialNB()
    else:
        raise NotImplementedError(
            "unsupported alg %r; expected 'svm', 'log' or 'nb'" % (alg,))

    _, train_y, _ = vectorize_text(data=data)
    train_x, test_x = read_doc2vec_pickle(dm=False)
    # Alternative feature set (sentiment lexicon + SentiWordNet), disabled:
    # train_x_1, test_x_1 = senti_lexicon_vectorizor(data=data, tfidf=True)
    # train_x_2, test_x_2 = senti_wordnet_vectorizer(data=data, tfidf=True)
    #
    # train_x = sparse.hstack((train_x_1, train_x_2))
    # test_x = sparse.hstack((test_x_1, test_x_2))

    # Single-argument parenthesised prints produce the same output as the
    # former print statements on Python 2 and are also valid on Python 3.
    print("shape for training data is %s" % (train_x.shape,))

    print("training...")
    clf.fit(train_x, train_y)
    predicted = clf.predict(test_x)
    save_csv(predicted)
Пример #2
0
def doc2vec_kaggle_dataset():
    """Assemble the doc2vec features together with the training labels.

    Features are the two values returned by
    ``read_doc2vec_pickle(dm=True, concat=False)``; the labels are the
    second of the three values returned by ``read_sst_kaggle_pickle()``.

    Returns:
        A ``(train_x, train_y, test_x)`` tuple.
    """
    doc_vectors = read_doc2vec_pickle(dm=True, concat=False)
    features_train, features_test = doc_vectors
    # Only the middle element is needed; the other two are discarded.
    _first, labels_train, _last = read_sst_kaggle_pickle()
    return features_train, labels_train, features_test