def run(vector_size, window, iter, min_df, max_df):
    """Train, evaluate, and export Kaggle predictions for one hyper-parameter setting.

    Args mirror the doc2vec/vectorizer knobs passed through to read_files.
    Accuracies are reported on stdout; always returns 0.
    """
    print("Reading data")
    data_loc = "data/"
    speech = read_files(data_loc, vector_size, window, iter, min_df, max_df)

    print("Training classifier")
    cls = classify.train_classifier(speech.train_doc_vec, speech.trainy)

    print("Evaluating")
    train_acc = classify.evaluate(speech.train_doc_vec, speech.trainy, cls)
    dev_acc = classify.evaluate(speech.dev_doc_vec, speech.devy, cls)

    print("Writing Kaggle pred file")
    write_pred_kaggle_file(cls, "data/speech-pred.csv", speech)

    # Summary banner with the setting and the resulting accuracies.
    banner = "================================="
    print(banner)
    print("size: " + str(vector_size) + " window: " + str(window) + " iter: " + str(iter))
    print("min_df: " + str(min_df) + " max_df: " + str(max_df))
    print("train_acc: " + str(train_acc))
    print("dev_acc: " + str(dev_acc))
    print(banner)
    return 0
def semi_supervised_learning(unlabeled, sentiment, f, iters, threshold=3.5):
    """Self-train: fold confidently pseudo-labeled unlabeled examples into the train set.

    Args:
        unlabeled: object with .data (raw texts) and .X (vectorized feature rows).
        sentiment: object with train_data/trainX/trainy, devX/devy and tfidf_vect.
        f: batch ("partition") size — unlabeled examples considered per round.
        iters: number of self-training rounds.
        threshold: minimum |decision_function| margin for a pseudo-label to be
            trusted (was a hard-coded 3.5; default preserves old behavior).

    Returns:
        The classifier trained on the final, expanded training set.

    Fixes vs. the original: removed unused imports (sklearn shuffle, matplotlib)
    and a dead `cls.predict(unlabeled.X)` over the whole pool whose result was
    never used, plus the commented-out debug prints.
    """
    import classify
    import numpy as np

    # Initial model trained on the labeled data only.
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    factor = f  # roughly about 10% of the corpus
    unlabeled.data_temp = unlabeled.data
    for i in range(iters):
        # Slice the next batch of raw texts and their matching feature rows.
        end_index = min(len(unlabeled.data), (i * factor) + factor)
        partition = unlabeled.data_temp[i * factor:end_index]
        partition_matrix = unlabeled.X[i * factor:end_index]
        yp = cls.predict(partition_matrix)            # pseudo-labels for the batch
        decisions = cls.decision_function(partition_matrix)  # confidence margins
        # Keep only predictions whose margin clears the confidence threshold.
        for j in range(len(partition)):
            if abs(decisions[j]) > threshold:
                sentiment.train_data.append(partition[j])
                sentiment.trainy = np.append(sentiment.trainy, yp[j])
        # Re-vectorize the grown corpus and retrain.
        sentiment.trainX = sentiment.tfidf_vect.transform(sentiment.train_data)
        cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
        classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
        classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    return cls
def train(self):
    """Load the news data, record vocabulary/stop-word diagnostics, fit the model,
    and cache its intercept plus the index-ordered vocabulary on self.

    Returns:
        (sentiment, cls): the populated data holder and the fitted classifier.
    """
    importlib.reload(sentimentinterface)

    print("Reading data")
    tarfname = "data/news.tar.gz"
    sentiment = sentimentinterface.read_data(tarfname)
    sentiment.stop_words = sentimentinterface.generate_stop_words(
        sentiment, diff=0.4)

    from sklearn.feature_extraction.text import CountVectorizer

    # Record which terms each pruning rule would discard.
    for kwargs, attr in ((dict(min_df=3), 'mindf_stop_words'),
                         (dict(max_df=0.2), 'maxdf_stop_words')):
        sentiment.cv = CountVectorizer(**kwargs)
        sentiment.cv.fit_transform(sentiment.train_data)
        setattr(sentiment, attr, sentiment.cv.stop_words_)

    # Full (unpruned) training-set vocabulary.
    sentiment.cv = CountVectorizer()
    sentiment.cv.fit_transform(sentiment.train_data)
    sentiment.training_set_vocabulary = sentiment.cv.vocabulary_

    sentimentinterface.vectorize_data(sentiment,
                                      stop_words=sentiment.stop_words,
                                      max_df=0.2, min_df=3)
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy, C=3.7)
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

    # Logistic-regression intercept (first component).
    self.intercept = copy.deepcopy(cls.intercept_)[0]
    # Vocabulary words ordered by their feature index.
    vocab = sentiment.count_vect.vocabulary_
    self.cv = [word for _, word in sorted((idx, word)
                                          for word, idx in vocab.items())]
    return sentiment, cls
def semi_supervised():
    """Search over confidence percentages for semi-supervised expansion, then
    retrain on the best one and write Kaggle predictions.

    Bug fixed vs. the original: the final model was re-trained with
    `expand(sentiment, unlabeled, best_index)` where best_index is the LIST
    INDEX (0-5, and -1 when every accuracy was <= 0) rather than the actual
    percentage — the printout even labels it "percent". We now pass
    percent_list[best_index].
    """
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)
    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)

    # try different percentage for best result
    percent_list = [0.5, 0.6, 0.7, 0.8, 0.9, 1]
    import classify
    results = []
    for percent in percent_list:
        # do semi_supervised search on each percentage
        cls = expand(sentiment, unlabeled, percent)
        # evaluate on dev
        print("\nEvaluating")
        acc = classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
        results.append(acc)

    # Pick the percentage with the best dev accuracy (first one on ties).
    best = max(results)
    best_index = results.index(best)
    best_percent = percent_list[best_index]
    print("Best result is {} when processing {} percent as confident".format(
        best, best_percent))

    # Re-train on the winning percentage.
    cls = expand(sentiment, unlabeled, best_percent)
    print("\nEvaluating on best percentage...")
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    print("Writing predictions to a file")
    write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv",
                           sentiment)
def supervised():
    """Fully supervised baseline: train on the labeled data, report train/dev
    accuracy, write Kaggle predictions, and print the most decisive features.

    Fix vs. the original: removed leftover debug code that vstacked trainX with
    the entire unlabeled matrix solely to print the stacked shape — it built a
    large sparse matrix for no functional purpose.
    """
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)

    print("\nTraining classifier")
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)

    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)

    print("Writing predictions to a file")
    write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv",
                           sentiment)
    decisive_features(cls, sentiment)
def expand_data(speech):
    """Self-training loop: repeatedly pseudo-label 100-example batches from the
    shuffled unlabeled pool, fold them into the training set, and return the
    classifier with the best dev accuracy observed along the way.
    """
    pool = sklearn.utils.shuffle(speech.unlabeledX)
    X, y = speech.trainX, speech.trainy
    total_unlabeled_count = pool.shape[0]

    best_clf, best_acc, best_i = None, 0, 0
    unlabeled_results = {}

    n_samples = 100
    n_iterations = int(total_unlabeled_count / n_samples)
    print("Doing ", n_iterations, " iterations, with a sample size of ",
          n_samples)

    for i in range(n_iterations):
        # Model trained on everything accumulated so far.
        clf = classify.train_classifier(X, y)
        # Pop the next batch off the pool and pseudo-label it.
        newX, pool = pool[:n_samples], pool[n_samples:]
        newy = clf.predict(newX)
        X = scipy.sparse.vstack([X, newX])
        y = numpy.concatenate([y, newy])
        # Dev accuracy of the model trained BEFORE this batch was added.
        acc = classify.evaluate(speech.devX, speech.devy, clf)
        unlabeled_results[(i + 1) * n_samples] = acc
        if acc > best_acc:
            best_acc, best_clf, best_i = acc, clf, i
        print("Iteration: ", i, " Accuracy: ", acc)

    util.print_dict_tofile(unlabeled_results)
    print("Best accuracy: ", best_acc, " samples of unlabeled data used",
          (best_i + 1) * n_samples)
    return best_clf
def training_and_evaluation(sentiment, iteration, confidence):
    """Iteratively grow the training set with confidently-predicted unlabeled
    rows (10%, 20%, ... of the pool), retraining and evaluating each round,
    then print up to 10 misclassified dev examples.

    Args:
        sentiment: data holder with trainX/trainy, devX/devy, dev_data.
        iteration: number of 10%-steps of the unlabeled pool to consume.
        confidence: probability cutoff for accepting a pseudo-label.

    Returns:
        The classifier from the last round (the baseline when iteration == 0).

    Fixes vs. the original: the trailing mismatch-printing loop had no bound on
    its scan index and raised IndexError when the dev set held fewer than 10
    mismatches; `yp_dev` was also a NameError when iteration == 0.
    NOTE(review): `tarfname` is read from module scope — verify it is defined
    where this function is used.
    """
    # Fractions 0.1, 0.2, ..., iteration * 0.1 of the unlabeled pool.
    fractions = [0.1 * k for k in range(1, iteration + 1)]
    unlabeled = read_unlabeled(tarfname, sentiment)
    unlabeled_size = unlabeled.X.shape[0]

    # Baseline: train the classifier only on the labeled training data.
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

    yp_dev = []  # dev predictions of the last round; empty if no rounds run
    for i in fractions:
        print('\nUnlabeled Data: ' + str(i * 100) + '%')
        unlabeled_y = write_pred_kaggle_file(unlabeled, cls,
                                             "data/sentiment-pred.csv",
                                             sentiment)
        # Rows of the slice predicted with probability above `confidence`.
        class_probabilities = cls.predict_proba(
            unlabeled.X[0:int(i * unlabeled_size)])
        idx = np.where(class_probabilities > confidence)
        C = unlabeled.X[0:int(i * unlabeled_size)]
        D = C.tocsr()[idx[0], :]
        # Expanded training set = labeled data + confident pseudo-labeled rows.
        new_trainX = vstack((sentiment.trainX, D))
        new_trainy = np.concatenate((sentiment.trainy, unlabeled_y[idx[0]]),
                                    axis=0)
        print(new_trainX.shape)
        print(new_trainy.shape)
        # Retrain on the expanded data and evaluate.
        cls = classify.train_classifier(new_trainX, new_trainy)
        print("Evaluating")
        yp_train = classify.evaluate(new_trainX, new_trainy, cls, 'train')
        yp_dev = classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
        interpretation(cls, sentiment, yp_train, yp_dev)

    # Show up to 10 misclassified dev examples; j is bounded so fewer than 10
    # mismatches no longer raises IndexError.
    shown = 0
    j = 0
    while shown < 10 and j < len(yp_dev):
        if yp_dev[j] != sentiment.devy[j]:
            print(sentiment.dev_data[j])
            shown += 1
        j += 1
    return cls
# Define a pipeline text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3))), ('tfidf', TfidfTransformer(use_idf=True, smooth_idf=False, sublinear_tf=True)), ('clf', LogisticRegression(random_state=0, C=512, solver='saga', max_iter=1000))]) print("\nTraining Supervised classifier") text_clf.fit(sentiment.train_data, sentiment.trainy) classify.evaluate(sentiment.train_data, sentiment.trainy, text_clf, 'train') classify.evaluate(sentiment.dev_data, sentiment.devy, text_clf, 'dev') print('\nTraining Word2Vec') w2v = gensim.models.Word2Vec(list(unlabeled.data), size=200, window=10, min_count=3, iter=20) train_data = [sentence_vector(sent, w2v) for sent in sentiment.train_data] dev_data = [sentence_vector(sent, w2v) for sent in sentiment.dev_data] print("\nTraining Word2Vec Supervised classifier") clf = LogisticRegression(random_state=0, C=100, solver='saga',
if __name__ == "__main__":
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)

    print("\nTraining classifier")
    import classify

    # Accuracy logs: the *_acc pair holds the l1 runs, the unsuffixed pair l2.
    test_acc, dev_acc, max_dev_acc, best_c, best_p = [], [], 0.0, 0.0, 'l2'
    testacc, devacc = [], []

    # Sweep regularization strength and penalty, tracking the best dev score.
    for c in [0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0]:
        for p in ['l1', 'l2']:
            cls = classify.train_classifier(sentiment.trainX,
                                            sentiment.trainy, c, p)
            print("\nEvaluating at C = ", c, " , Penalty = ", p)
            t_acc = classify.evaluate(sentiment.trainX, sentiment.trainy,
                                      cls, 'train')
            d_acc = classify.evaluate(sentiment.devX, sentiment.devy,
                                      cls, 'dev')
            # Route the scores to the list pair for this penalty.
            (test_acc if p == 'l1' else testacc).append(t_acc)
            (dev_acc if p == 'l1' else devacc).append(d_acc)
            if d_acc > max_dev_acc:
                max_dev_acc, best_c, best_p = d_acc, c, p

    print("\nBest c: ", best_c, ", Best penalty: ", best_p, " | Accuracy: ",
          max_dev_acc)
f.write("\n")
f.close()
# NOTE(review): the two statements above look like the tail of a file-writing
# helper whose `def` line is outside this chunk — indentation level (and the
# enclosing function) cannot be verified from here.


if __name__ == "__main__":
    # Expect exactly one mode argument after the program name.
    # (The message counts the program name itself as an argument.)
    if(len(sys.argv) != 2):
        print("Please enter two arguments")
        sys.exit(1)
    if(sys.argv[1] == "run_model"):
        # Supervised baseline followed by semi-supervised refinement.
        print("Reading data")
        tarfname = "data/sentiment.tar.gz"
        sentiment = read_files(tarfname)
        print("\nTraining classifier")
        import classify
        cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
        print("\nEvaluating")
        classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
        classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
        print("\nReading unlabeled data")
        unlabeled = read_unlabeled(tarfname, sentiment)
        # NOTE(review): `lexicon_stuff` is not defined anywhere in this chunk —
        # presumably a module-level value; verify before running.
        print(lexicon_stuff)
        cls = semi_supervised_learning(unlabeled, sentiment)
        print("Writing predictions to a file")
        write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv",
                               sentiment)
        #write_basic_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-basic.csv")
        # You can't run this since you do not have the true labels
        # print "Writing gold file"
        # write_gold_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-gold.csv")
    if(sys.argv[1] == "final"):
        print("Reading data")
        # NOTE(review): this branch appears truncated at the chunk boundary.
x_train, x_test, y_train, y_test = preprocess.run(votes, test_clip) # Classify using SVM with different kernels y_pred_linear = classify.SVM_linear(x_train, x_test, y_train) y_pred_rbf = classify.SVM_rbf(x_train, x_test, y_train) # Combine classifiers y_final = classify.combine(y_pred_linear, y_pred_rbf) # Make very engaged -> engaged. y_final = ignore_very(y_final) y_test = ignore_very(y_test) # Evaluate model accuracy_svm += classify.evaluate(y_final, y_test, "SVM", test_clip) * len(y_test) # Classify using dummy to get baseline y_dummy = classify.dummy(x_train, x_test, y_train) # Evaluate dummy accuracy_dummy += classify.evaluate(y_dummy, y_test, "Dummy", test_clip) * len(y_test) # Count number of clips number_of_clips += len(y_test) # matrix classify.matrix(y_test, y_final) print("Final results:")
def greedy_searchpara(text_clf, sentiment, tarfname):
    """Grid-search C for the pipeline's classifier, plot train/test score curves
    over C, then evaluate the refit model and write Kaggle predictions.

    Args:
        text_clf: sklearn Pipeline ending in a LogisticRegression ('clf' step).
        sentiment: data holder with train_data/trainy, dev_data/devy.
        tarfname: tarball path passed through to read_unlabeled.
    """
    # Greedy Search Parameter — previously-explored axes are kept commented
    # with their winning values for reference.
    parameters = {
        # 'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (5, 5)],  # (1, 3) is best
        # 'tfidf__use_idf': [(True, False), (True, True), (False, True), ((False, False))],
        'clf__C': [2**(i) for i in range(-10, 15)],  # 512 is best
        # 'clf__class_weight': [None, 'balanced'],  # None is better
        # 'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # 'saga' is better
        # 'clf__max_iter': [10**i for i in range(2, 8)],  # iteration 1000
    }
    from sklearn.metrics import make_scorer
    from sklearn.metrics import accuracy_score
    scoring = {'Accuracy': make_scorer(accuracy_score)}
    # NOTE(review): `iid` was removed from GridSearchCV in newer scikit-learn
    # releases — verify the pinned sklearn version supports it.
    gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1,
                          scoring=scoring, refit='Accuracy',
                          return_train_score=True)
    gs_clf = gs_clf.fit(sentiment.train_data, sentiment.trainy)
    print(gs_clf.best_score_)
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
    results = gs_clf.cv_results_

    # plotting the result
    plt.figure(figsize=(13, 13))
    # plt.title("GridSearchCV evaluating using multiple scorers simultaneously",
    #           fontsize=16)
    plt.xlabel(
        "the inverse of regularization strength for LogisticRegression Model")
    plt.ylabel("Score")
    ax = plt.gca()
    # Get the regular numpy array from the MaskedArray
    X_axis = np.array(results['param_clf__C'].data, dtype=float)
    # One color per scorer; dashed line = train scores, solid = test scores.
    for scorer, color in zip(sorted(scoring), ['g', 'k']):
        for sample, style in (('train', '--'), ('test', '-')):
            sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
            sample_score_std = results['std_%s_%s' % (sample, scorer)]
            # Shaded band = +/- one std around the mean CV score.
            ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                            sample_score_mean + sample_score_std,
                            alpha=0.1 if sample == 'test' else 0, color=color)
            ax.plot(X_axis, sample_score_mean, style, color=color,
                    alpha=1 if sample == 'test' else 0.7,
                    label="%s (%s)" % (scorer, sample))
        best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
        best_score = results['mean_test_%s' % scorer][best_index]
        # Plot a dotted vertical line at the best score for that scorer marked by x
        ax.plot([X_axis[best_index], ] * 2, [0, best_score], linestyle='-.',
                color=color, marker='x', markeredgewidth=3, ms=8)
        # Annotate the best score for that scorer
        ax.annotate("%0.2f" % best_score,
                    (X_axis[best_index], best_score + 0.005))
    plt.xscale('log')
    plt.legend(loc="best")
    plt.grid(False)
    plt.show()

    # Evaluate on the refit model
    classify.evaluate(sentiment.train_data, sentiment.trainy, gs_clf, 'train')
    classify.evaluate(sentiment.dev_data, sentiment.devy, gs_clf, 'dev')
    # Predict on the unlabeled data and export for Kaggle.
    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)
    print("Writing predictions to a file")
    write_pred_kaggle_file(unlabeled, gs_clf, "data/sentiment-pred.csv",
                           sentiment)
# NOTE(review): script fragment — `speech` and `tarfname` come from enclosing
# code outside this view; the indentation of the trailing statements (inside or
# after the loops) is a best guess.
# Full grids are declared but the loops below only exercise one point each
# (saga / C=10 / tfidf on), presumably after an earlier sweep.
C_range = [1, 10, 100, 1000]
solvers = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
for solver in ["saga"]:
    print("Using " + solver)
    for c in [10]:
        print("Evaluating at C=" + str(c))
        for tfidf in [True]:
            print("With tfidf" if tfidf else "Without tfidf")
            # Choose the tf-idf or raw-count feature matrix per the flag.
            cls = classify.train_classifier(
                speech.trainX_tfidf if tfidf else speech.trainX,
                speech.trainy, c=c, solver=solver)
            print("Acc on Training Data")
            classify.evaluate(
                speech.trainX_tfidf if tfidf else speech.trainX,
                speech.trainy, cls)
            print("Acc on Dev Data")
            classify.evaluate(speech.devX_tfidf if tfidf else speech.devX,
                              speech.devy, cls)
            print("\n")
print("Reading unlabeled data")
unlabeled = read_unlabeled(tarfname, speech)
# Abandoned batching experiment kept for reference:
# numBatches = 10
# labeledXBatches = np.split(speech.trainX_tfidf.toarray(), numBatches)
# labeledYBatches = np.split(speech.trainy, numBatches)
# unlabeledXBatches = np.split(
#     unlabeled.X.toarray()[:-2], numBatches)
# trainXBatches = [None] * numBatches
# trainYBatches = [None] * numBatches
def semi_supervise(sentiment, unlabeled, iter, num_conf):
    """Stochastic self-training: each round, sample `num_conf` unlabeled
    examples (weighted by a sampled class probability), pseudo-label them,
    move them into the training set, and retrain.

    Args:
        sentiment: data holder with train_data/trainX/trainy, devX/devy,
            count_vect.
        unlabeled: object with .data (raw texts) and .X (feature rows).
        iter: number of rounds (shadows the builtin; kept for caller
            compatibility).
        num_conf: examples promoted from unlabeled to labeled per round.

    Returns:
        (unlabeled, cls, sentiment, best dev accuracy after the first round).

    Fix vs. the original: the leftover-index filter tested `i not in conf_idx`
    against a NumPy array, an O(n) scan per element (O(n^2) total); membership
    now goes through a set. NOTE(review): `max(best_dev)` raises ValueError
    when iter <= 1 since the first round's accuracy is never recorded — verify
    callers always pass iter >= 2.
    """
    import classify
    best_dev = []
    for i in range(iter):
        print("\nTraining classifier")
        sentiment = tfidfvectorizer_feat(sentiment)
        # Print the 10 highest tf-idf terms for inspection.
        # reference: https://stackoverflow.com/questions/45232671
        index_value = {
            pair[1]: pair[0]
            for pair in sentiment.count_vect.vocabulary_.items()
        }
        fully_indexed = {}
        for row in sentiment.trainX:
            for (column, value) in zip(row.indices, row.data):
                fully_indexed[index_value[column]] = value
        print(
            sorted(fully_indexed.items(), key=lambda x: x[1],
                   reverse=True)[:10])

        unlabeled.X = sentiment.count_vect.transform(unlabeled.data)
        cls = classify.train_classifier(sentiment.trainX, sentiment.trainy,
                                        1000)
        acc = classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
        if i != 0:
            best_dev.append(acc)

        # For each unlabeled row, sample one of its class probabilities with
        # probability proportional to itself, then threshold at 0.5 to get a
        # pseudo-label.
        conf_score = np.apply_along_axis(
            lambda x: np.random.choice(x, 1, p=x)[0], 1,
            cls.predict_proba(unlabeled.X))
        preds = np.array([int(s >= 0.5) for s in conf_score])

        # Draw num_conf indices weighted by the (normalized) sampled scores.
        conf_score = conf_score / np.sum(conf_score)
        conf_idx = np.random.choice(list(range(len(conf_score))), num_conf,
                                    p=conf_score)
        new_labeled_X = np.array(unlabeled.data)[conf_idx]
        new_labeled_y = preds[conf_idx]

        # O(1) membership instead of scanning the index array per element.
        chosen = set(conf_idx.tolist())
        tmp_idx = [k for k in range(len(conf_score)) if k not in chosen]

        # Promote the sampled rows to the training set; drop them from the pool.
        sentiment.train_data = np.concatenate(
            (sentiment.train_data, new_labeled_X))
        sentiment.trainy = np.concatenate((sentiment.trainy, new_labeled_y))
        unlabeled.data = np.array(unlabeled.data)[tmp_idx]
    return unlabeled, cls, sentiment, max(best_dev)
if __name__ == "__main__":
    # Train a speech classifier at C=1, report train/dev accuracy, then print
    # the 10 most positive features for each class.
    print("Reading data")
    tarfname = "data/speech.tar.gz"
    speech = read_files(tarfname)
    print("Training classifier")
    import classify

    train_accs = []
    test_accs = []
    c = 1
    cls = classify.train_classifier(speech.trainX, speech.trainy, c)
    confusion_mtrx, train_acc = classify.evaluate(speech.trainX,
                                                  speech.trainy, cls)
    train_accs.append(train_acc)
    confusion_mtrx, test_acc = classify.evaluate(speech.devX, speech.devy,
                                                 cls)
    test_accs.append(test_acc)

    # Get important features for each class.
    # Fix: invert the vocabulary ONCE (feature index -> word). The original
    # rescanned the entire vocabulary for every one of the 10 indices of every
    # class — O(classes * 10 * |V|) string comparisons.
    index_to_word = {idx: word
                     for word, idx in speech.count_vect.vocabulary_.items()}
    for i in range(0, cls.coef_.shape[0]):
        top10_indices = np.argsort(cls.coef_[i])[-10:]
        print(speech.le.classes_[i])
        top10_feature = [index_to_word[idx] for idx in top10_indices]
        print(top10_feature)
if __name__ == "__main__": tarfname = "data/sentiment.tar.gz" maxdf = 1.0 mindf = 1 solve_name = 'sag' penalty = 'l2' print("Reading data") tarfname = "data/sentiment.tar.gz" sentiment = read_files(tarfname, min_df=mindf, max_df=maxdf) print("\nTraining classifier") import classify cls = classify.train_classifier(sentiment.trainX, sentiment.trainy) print("\nEvaluating") classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train') classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev') print("\nReading unlabeled data") unlabeled = read_unlabeled(tarfname, sentiment) #probability =[0.6,0.7,0.75,0.8,0.85,0.9,0.95,0.98] #for p in probability: cls = train_classifier(sentiment.trainX, sentiment.trainy, penalty=penalty, solver=solve_name) acc = evaluate(sentiment.devX, sentiment.devy, cls, 'dev data') print('when using using min_df = {}, MAX_DF ={},acc : {}'.format(
f.close()
# NOTE(review): the statement above looks like the tail of a writer helper
# whose `def` line is outside this chunk — indentation level unverifiable.


def read_instance(tar, ifname):
    """Return the stripped contents of one member of an open tarfile.

    Note: tarfile.extractfile returns a binary stream, so the returned value
    is bytes, not str — presumably decoded by the caller; verify.
    """
    inst = tar.getmember(ifname)
    ifile = tar.extractfile(inst)
    content = ifile.read().strip()
    return content


if __name__ == "__main__":
    # Train on the labeled speech data, evaluate, and export predictions.
    print("Reading data")
    tarfname = "data/speech.tar.gz"
    speech = read_files(tarfname)
    print("Training classifier")
    import classify
    cls = classify.train_classifier(speech.trainX, speech.trainy)
    print("Evaluating")
    classify.evaluate(speech.trainX, speech.trainy, cls)
    classify.evaluate(speech.devX, speech.devy, cls)
    print("Reading unlabeled data")
    unlabeled = read_unlabeled(tarfname, speech)
    print("Writing pred file")
    write_pred_kaggle_file(unlabeled, cls, "data/speech-pred.csv", speech)
    # You can't run this since you do not have the true labels
    # print "Writing gold file"
    # write_gold_kaggle_file("data/speech-unlabeled.tsv", "data/speech-gold.csv")
    # write_basic_kaggle_file("data/speech-unlabeled.tsv", "data/speech-basic.csv")
f.write(",") f.write("POSITIVE") f.write("\n") f.close() if __name__ == "__main__": print("Reading data") tarfname = "data/sentiment.tar.gz" sentiment = read_files(tarfname) print("\nTraining classifier") import classify test_acc, dev_acc, max_dev_acc, best_c, best_p = [], [], 0.0, 0.0, 'l2' cls = classify.train_classifier(sentiment.trainX, sentiment.trainy, 5.0, 'l2') classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train') classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev') cls_nb = classify.train_classifier_2(sentiment.trainX, sentiment.trainy) cls_svm = classify.train_classifier_3(sentiment.trainX, sentiment.trainy) print("\nReading unlabeled data") unlabeled = read_unlabeled(tarfname, sentiment) print("Unlabeled data***", len(unlabeled.data)) lab, unlab = add_unlabeled(unlabeled, cls, cls_nb, cls_svm, sentiment) print("Len labeled data: ", len(lab)) test_acc, dev_acc = [], [] lens_ = [] val_10 = 9152 for i in range(10):
# NOTE(review): the statements below look like the interior of a per-line loop
# in a Kaggle-file writer whose `def`/`for` lines are outside this chunk; the
# indentation (and whether f.close() sits outside the loop) cannot be verified
# here. Every row is written with the constant label "POSITIVE".
(label, review) = line.strip().split("\t")
i += 1
f.write(str(i))
f.write(",")
f.write("POSITIVE")
f.write("\n")
f.close()


if __name__ == "__main__":
    # Supervised sentiment pipeline: train, evaluate, export predictions.
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)
    print("\nTraining classifier")
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)
    print("Writing predictions to a file")
    write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv",
                           sentiment)
    #write_basic_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-basic.csv")
    # You can't run this since you do not have the true labels
    # print "Writing gold file"
    # write_gold_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-gold.csv")
elif y == 'FALSE': X.append(x) Y.append(0) # The first 90% are train data, the last 10% are test data n = int(len(X) * .9) XY_train = list(zip(X, Y))[:n] XY_test = list(zip(X, Y))[n:] data_train, y_train = [x for x, y in XY_train], [y for x, y in XY_train] data_test, y_test = [x for x, y in XY_test], [y for x, y in XY_test] print("Train data has %d positive reviews" % y_train.count(1)) print("Train data has %d negative reviews" % y_train.count(0)) print("Test data has %d positive reviews" % y_test.count(1)) print("Test data has %d negative reviews" % y_test.count(0)) # Testing print("Testing CountVectorizer...") count_vect = CountVectorizer() count_vect.fit(data) X_train = count_vect.transform(data_train) X_test = count_vect.transform(data_test) cls = classify.train_classifier(X_train, y_train) classify.evaluate(X_test, y_test, cls, 'test') print("Testing TfidfVectorizer...") tfidf_vect = TfidfVectorizer() tfidf_vect.fit(data) X_train = tfidf_vect.transform(data_train) X_test = tfidf_vect.transform(data_test) cls = classify.train_classifier(X_train, y_train) classify.evaluate(X_test, y_test, cls, 'test')