# --- Averaged word-vector features (Word2Vec) ---
# NOTE(review): the opening of this first call was cut off in this chunk;
# reconstructed from the parallel duplicate snippet that follows
# (corpus=TOKENIZED_CORPUS) -- confirm against the full file.
avg_word_vec_features = averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,
                                                 model=model.wv,
                                                 num_features=10)
print(np.round(avg_word_vec_features, 3))

# Same featurizer applied to the tokenized new/unseen document.
nd_avg_word_vec_features = averaged_word_vectorizer(corpus=tokenized_new_doc,
                                                    model=model.wv,
                                                    num_features=10)
print(np.round(nd_avg_word_vec_features, 3))

# Using tfidf weighted average of word vectors in a document
from feature_extractors import tfidf_weighted_averaged_word_vectorizer

# NOTE(review): 'tdidf_features' looks like a typo for 'tfidf_features';
# it must match the variable actually defined earlier in the file -- verify
# before renaming, since only that spelling is visible in this chunk.
corpus_tfidf = tdidf_features
vocab = tfidf_vectorizer.vocabulary_

# TF-IDF-weighted average of the word vectors for the training corpus.
wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(
    corpus=TOKENIZED_CORPUS,
    tfidf_vectors=corpus_tfidf,
    tfidf_vocabulary=vocab,
    model=model.wv,
    num_features=10)
print(np.round(wt_tfidf_word_vec_features, 3))

# ...and for the new document, using its own tf-idf vector (nd_tfidf).
nd_wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(
    corpus=tokenized_new_doc,
    tfidf_vectors=nd_tfidf,
    tfidf_vocabulary=vocab,
    model=model.wv,
    num_features=10)
print(np.round(nd_wt_tfidf_word_vec_features, 3))
# --- Duplicate of the averaged word-vector snippet, originally written with
# Python 2 `print` statements (a syntax error under Python 3 and inconsistent
# with the print() calls elsewhere in this file). Converted to print() calls;
# all computation is unchanged.
# NOTE(review): this copy passes `model` where the other copy passes
# `model.wv`; gensim >= 4 expects the KeyedVectors (`model.wv`) -- confirm
# which gensim version this file targets.
avg_word_vec_features = averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,
                                                 model=model,
                                                 num_features=10)
print(np.round(avg_word_vec_features, 3))

nd_avg_word_vec_features = averaged_word_vectorizer(corpus=tokenized_new_doc,
                                                    model=model,
                                                    num_features=10)
print(np.round(nd_avg_word_vec_features, 3))

from feature_extractors import tfidf_weighted_averaged_word_vectorizer

# NOTE(review): 'tdidf_features' looks like a typo for 'tfidf_features';
# kept as-is because only this spelling is visible in this chunk -- verify.
corpus_tfidf = tdidf_features
vocab = tfidf_vectorizer.vocabulary_

# TF-IDF-weighted average of word vectors for the training corpus.
wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(
    corpus=TOKENIZED_CORPUS,
    tfidf_vectors=corpus_tfidf,
    tfidf_vocabulary=vocab,
    model=model,
    num_features=10)
print(np.round(wt_tfidf_word_vec_features, 3))

# ...and for the new document, using its own tf-idf vector (nd_tfidf).
nd_wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(
    corpus=tokenized_new_doc,
    tfidf_vectors=nd_tfidf,
    tfidf_vocabulary=vocab,
    model=model,
    num_features=10)
print(np.round(nd_wt_tfidf_word_vec_features, 3))
min_count=30, sample=1e-3) # averaged word vector features avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train, model=model, num_features=500) avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test, model=model, num_features=500) # tfidf weighted averaged word vector features vocab = tfidf_vectorizer.vocabulary_ tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer( corpus=tokenized_train, tfidf_vectors=tfidf_train_features, tfidf_vocabulary=vocab, model=model, num_features=500) tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer( corpus=tokenized_test, tfidf_vectors=tfidf_test_features, tfidf_vocabulary=vocab, model=model, num_features=500) from sklearn import metrics import numpy as np def get_metrics(true_labels, predicted_labels):
# averaged word vector features avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train, model=model, num_features=500) avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test, model=model, num_features=500) # tfidf weighted averaged word vector features vocab = tfidf_vectorizer.vocabulary_ tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_train, tfidf_vectors=tfidf_train_features, tfidf_vocabulary=vocab, model=model, num_features=500) tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_test, tfidf_vectors=tfidf_test_features, tfidf_vocabulary=vocab, model=model, num_features=500) from sklearn import metrics import numpy as np def get_metrics(true_labels, predicted_labels): print 'Accuracy:', np.round(