# NOTE(review): whitespace-mangled FRAGMENT of a LinearSVC training script.
# It begins mid-file (the imports that would define `stopwords`, `svm`,
# `helper`, `train_test_split`, `confusion_matrix` are not visible here) and
# is cut off mid-call at the end (the `train_prep_emot.csv` load inside the
# `if False:` block is incomplete). What is visible: builds an English
# stop-word list, constructs a class-weighted LinearSVC (weights keyed by
# string labels '1'..'5'), loads bigram TF-IDF features via
# helper.get_train_data, then fits on a train_test_split and prints a
# confusion matrix. Uses a Python-2 `print` statement. Left byte-identical
# pending recovery of the missing surrounding text -- TODO confirm original
# line structure before reformatting.
stopwordlist = stopwords.words('english') clf = svm.LinearSVC() if True: # #clf = svm.linearSVC()#92009 clf = svm.LinearSVC(class_weight={ '1': 0.1, '2': 0.5, '3': 0.12, '4': 0.2, '5': 0.08 }) #91236 target, data = helper.get_train_data('../../Data/train_prep.csv', vectorizer=helper.get_vectorizer( stop_words=stopwordlist, min_df=3, ngram_range=(2, 2)), pred_pos=0, text_pos=1, tf_idf=True, remove_header=False) print "LOW + MIN3 + BIG + TFIDF + NS" x_train, x_test, y_train, y_test = train_test_split(data, target, random_state=0) clf.fit(x_train, y_train) y_predicted = clf.predict(x_test) print(confusion_matrix(y_test, y_predicted)) if False: #[ 0.82027483 0.82085535 0.82068147] target, data = helper.get_train_data('../../Data/train_prep_emot.csv',
"""Cross-validate an AdaBoost text classifier on the prepped training set.

Loads labels (column 0) and raw text (column 1) from train_prep.csv via the
project's `helpers` module, vectorizes with TF-IDF (English stop words
removed, terms in fewer than 3 documents pruned), and prints the mean
cross-validation accuracy.
"""
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
import helpers as helper
from nltk.corpus import stopwords

stopwordlist = stopwords.words('english')

clf = AdaBoostClassifier(n_estimators=100)

x_target, x_data = helper.get_train_data(
    '../../Data/train_prep.csv',
    vectorizer=helper.get_vectorizer(stop_words=stopwordlist, min_df=3),
    pred_pos=0,
    text_pos=1,
    tf_idf=True,
    remove_header=False)

# Default CV folds, as in the original run.
scores = cross_val_score(clf, x_data, x_target)
# Call form of print (single argument) behaves identically under Python 2
# and Python 3; the original used the Python-2-only statement form.
print(scores.mean())
"""K-nearest-neighbours baseline on the prepped review data.

Active branch (`if True`): bigram TF-IDF features with English stop words
removed and min_df=3, train/test split, fit, and confusion matrix.
The `if False` branches are earlier cross-validation experiments kept for
reference; their recorded scores are in the trailing comments.
"""
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
import helpers as helper
# BUGFIX: the original imported "KneighborsClassifier" (misspelled) and then
# instantiated it as `svm.KneighborsClassifier()` -- `svm` is never imported
# and KNN does not live in sklearn.svm. The real class is
# sklearn.neighbors.KNeighborsClassifier.
from sklearn.neighbors import KNeighborsClassifier
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix
# `sklearn.cross_validation` is deprecated/removed; use model_selection
# consistently (the script already imported train_test_split from it).
from sklearn.model_selection import train_test_split, cross_val_score

stopwordlist = stopwords.words('english')

clf = KNeighborsClassifier()

if True:
    target, data = helper.get_train_data(
        '../../Data/train_prep.csv',
        vectorizer=helper.get_vectorizer(stop_words=stopwordlist,
                                         min_df=3,
                                         ngram_range=(2, 2)),
        pred_pos=0,
        text_pos=1,
        tf_idf=True,
        remove_header=False)
    print("LOW + MIN3 + BIG + TFIDF + NS")
    x_train, x_test, y_train, y_test = train_test_split(data, target,
                                                        random_state=0)
    clf.fit(x_train, y_train)
    y_predicted = clf.predict(x_test)
    print(confusion_matrix(y_test, y_predicted))

if False:  # [ 0.82027483  0.82085535  0.82068147]
    target, data = helper.get_train_data(
        '../../Data/train_prep_emot.csv',
        vectorizer=helper.get_vectorizer(min_df=3, ngram_range=(2, 2)),
        pred_pos=0,
        text_pos=1,
        tf_idf=True,
        remove_header=True)
    print("EMO + NSP + NHTML + LOW + MIN3 + BIG + TFIDF")
    this_scores = cross_val_score(clf, data, target, cv=3)
    print(this_scores)

if False:  # [ 0.82023524  0.82082237  0.82073424]
    target, data = helper.get_train_data(
        '../../Data/train_prep.csv',
        vectorizer=helper.get_vectorizer(min_df=3, ngram_range=(2, 2)),
        pred_pos=0,
        text_pos=1,
        tf_idf=True,
        remove_header=True)
    print("NSP + NHTML + LOW + MIN3 + BIG + TFIDF")
# NOTE(review): whitespace-mangled FRAGMENT of a KMeans benchmarking script
# (appears adapted from the scikit-learn digits-clustering example). It loads
# TF-IDF features via helper.get_train_data, counts distinct labels, prints a
# metrics header, and begins defining bench_k_means(estimator, name, data) --
# but the def is CUT OFF mid-print-statement at the end of this line
# (`estimator.inertia_,` has no closing arguments). Left byte-identical:
# the truncated function cannot be reformatted without guessing its tail.
print(__doc__) from time import time import numpy as np import matplotlib.pyplot as plt from sklearn import metrics from sklearn.cluster import KMeans from sklearn.datasets import load_digits from sklearn.decomposition import PCA from sklearn.preprocessing import scale import helpers as helper target, data = helper.get_train_data('../../Data/train_prep.csv', vectorizer = helper.get_vectorizer(min_df = 3), pred_pos=0, text_pos=1,tf_idf=True, remove_header=False) n_digits = len(np.unique(target)) labels = target sample_size = 300 print(79 * '_') print('% 9s' % 'init' ' time inertia h**o compl v-meas ARI AMI silhouette') def bench_k_means(estimator, name, data): t0 = time() estimator.fit(data) print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f' % (name, (time() - t0), estimator.inertia_,
# NOTE(review): whitespace-mangled FRAGMENT of a vocabulary-filtering +
# MultinomialNB script. It begins MID-LOOP (`voc`, `c`, `count`, `train_path`,
# `test_path` and the imports for csv/helper/MultinomialNB/confusion_matrix/
# accuracy_score are defined before the visible text) and ends MID-BLOCK
# inside nested `with` statements (after `reader5 = csv.reader(csv5)`).
# Visible behavior: collects terms present in voc[c] but absent from voc['5'],
# writes them to 'voc--.csv', then trains MultinomialNB(alpha=0.01) on TF-IDF
# features restricted to that vocabulary and prints confusion matrix and
# accuracy on a held-out file. Left byte-identical: both edges are truncated,
# so any reformatting would require guessing the missing context.
for x in voc[c]: if count % 1000 == 0: print count if x not in voc['5'] and x not in vocab: vocab[x] = 1 count += 1 vocabulario = [] with open(train_path + 'voc--.csv', 'wb') as csvo: writero = csv.writer(csvo) for x in vocab: vocabulario.append(x) writero.writerow([x]) if True: print "Obtengo los datos" x_target, x_data = helper.get_train_data(train_path + '.csv', vectorizer = helper.get_vectorizer(vocabulary=vocabulario, min_df = 3), pred_pos=0, text_pos=1,tf_idf=True, remove_header=False) y_target, y_data = helper.get_train_data(test_path + '.csv', vectorizer = helper.get_vectorizer(vocabulary=vocabulario ,min_df = 3), pred_pos=0, text_pos=1,tf_idf=True, remove_header=False) print "LOW + MIN3 + BIG + TFIDF" clf = MultinomialNB(alpha=0.01) clf.fit(x_data, x_target) y_predicted = clf.predict(y_data) print(confusion_matrix(y_target, y_predicted)) print(accuracy_score(y_target, y_predicted)) if False: count = 0 with open('../../Data/train_prep_voc1.csv', 'r') as csv4: with open('../../Data/train_prep_voc5.csv', 'r') as csv5: with open('../../Data/train_prep_voc1-5.csv', 'wb') as csvo: reader4 = csv.reader(csv4) reader5 = csv.reader(csv5)