def runModel(X, y, model_name): nFolders = 5 accs = [] precs = [] recalls = [] F1s = [] n = X.shape[0] for exp in range(0, nFolders): print '\n\n============================================================================================\nexperiment', exp ### 2.1 split training and testing data start = (int)((1 - (exp + 1) * 1.0 / nFolders) * n) end = (int)((1 - exp * 1.0 / nFolders) * n) #print n, start, end X_train, y_train, X_test, y_test = splitTrainTest(X, y, start, end) print 'Running', model_name if model_name == 'SVM': ### 2.2 build classifier clf = LinearSVC(penalty="l1", dual=False, tol=1e-7) clf.fit(X_train, y_train) ### 2.3 predict y_pred = clf.predict(X_test) if model_name == 'SVM_new': ### 2.2 build classifier clf = svm.SVC(C=1.0, gamma=1.0, class_weight='auto') clf.fit(X_train, y_train) ### 2.3 predict y_pred = clf.predict(X_test) elif model_name == 'NaiveBayes': clf = GaussianNB() clf.fit(X_train.todense(), y_train) y_pred = clf.predict(X_test.todense()) elif model_name == 'LogisticRegression': clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01) clf.fit(X_train.toarray(), y_train) y_pred = clf.predict(X_test.toarray()) else: raise Exception("The model name is incorrect!!!") ### 2.4 eval acc, prec, recall, F1 = eval(y_test, y_pred) print 'Acc = ', acc print 'Precision =', prec print 'Recall=', recall print 'F1 =', F1 accs.append(acc) precs.append(prec) recalls.append(recall) F1s.append(F1) print '\n\n\n' print 'avg Acc = ', sum(accs) / len(accs) print 'avg Precision = ', sum(precs) / len(precs) print 'avg Recall = ', sum(recalls) / len(recalls) print 'avg F1 = ', sum(F1s) / len(F1s) return sum(accs) / len(accs), sum(precs) / len(precs), sum(recalls) / len( recalls), sum(F1s) / len(F1s)
def runModel(X, y, model_name): nFolders = 5 accs = [] precs = [] recalls = [] F1s = [] n = X.shape[0] for exp in range(0, nFolders): print '\n\n============================================================================================\nexperiment' , exp ### 2.1 split training and testing data start = (int)((1-(exp+1) * 1.0/nFolders)*n) end = (int)((1-exp * 1.0/nFolders)*n) #print n, start, end X_train, y_train, X_test, y_test = splitTrainTest(X, y, start, end) print 'Running', model_name if model_name == 'SVM': ### 2.2 build classifier clf = LinearSVC(penalty="l1", dual=False, tol=1e-7) clf.fit(X_train, y_train) ### 2.3 predict y_pred = clf.predict(X_test) if model_name == 'SVM_new': ### 2.2 build classifier clf = svm.SVC(C = 1.0, gamma = 1.0, class_weight = 'auto') clf.fit(X_train, y_train) ### 2.3 predict y_pred = clf.predict(X_test) elif model_name == 'NaiveBayes': clf = GaussianNB() clf.fit(X_train.todense(), y_train) y_pred = clf.predict(X_test.todense()) elif model_name == 'LogisticRegression': clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01) clf.fit(X_train.toarray(), y_train) y_pred = clf.predict(X_test.toarray()) else: raise Exception("The model name is incorrect!!!") ### 2.4 eval acc, prec, recall, F1 = eval(y_test, y_pred) print 'Acc = ', acc; print 'Precision =', prec; print 'Recall=', recall; print 'F1 =', F1 accs.append(acc) precs.append(prec) recalls.append(recall) F1s.append(F1) print '\n\n\n' print 'avg Acc = ', sum(accs)/len(accs) print 'avg Precision = ', sum(precs)/len(precs) print 'avg Recall = ', sum(recalls)/len(recalls) print 'avg F1 = ', sum(F1s)/len(F1s) return sum(accs)/len(accs), sum(precs)/len(precs), sum(recalls)/len(recalls), sum(F1s)/len(F1s)
class Expander_LDA_multiclass(Expander_LDA_cossim):
    """
    Multi-class classifier over LDA vectors of labelled articles: decides
    which label the LDA vector of a test text belongs to.
    """

    def __init__(self, ldaModelAll,
                 expander_type=AcronymExpanderEnum.LDA_multiclass):
        # Base-class setup first, then attach the linear SVM used by
        # fit() / predict().
        Expander_LDA_cossim.__init__(self, ldaModelAll, expander_type)
        self.classifier = LinearSVC()

    def transform(self, X):
        """Run the parent transform and densify every resulting vector."""
        dense_vectors = []
        for sparse_vec in Expander_LDA_cossim.transform(self, X):
            dense_vectors.append(self._getDenseVector(sparse_vec))
        return dense_vectors

    def _getDenseVector(self, sparse_vec):
        """Expand a sparse vector to length ``self.ldaModel.num_topics``."""
        topic_count = self.ldaModel.num_topics
        return sparse2full(sparse_vec, topic_count)

    def fit(self, X_train, y_train):
        """Train the wrapped classifier on the given vectors and labels."""
        self.classifier.fit(X_train, y_train)

    def predict(self, X_test, acronym):
        """Return ``(labels, confidences)`` for ``X_test``.

        ``acronym`` is accepted but not used by this implementation.
        """
        model = self.classifier
        labels = model.predict(X_test)
        decisions = model.decision_function(X_test)
        return labels, self._getConfidencesFromDecisionFunction(
            labels, decisions)
class LinearSVCImpl():
    """Thin wrapper that forwards hyperparameters and calls to ``SKLModel``."""

    def __init__(self, penalty='l2', loss='squared_hinge', dual=True,
                 tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True,
                 intercept_scaling=1, class_weight='balanced', verbose=0,
                 random_state=None, max_iter=1000):
        # Keep the hyperparameters around as a plain dict and hand exactly
        # that dict to the wrapped model as keyword arguments.
        self._hyperparams = dict(
            penalty=penalty,
            loss=loss,
            dual=dual,
            tol=tol,
            C=C,
            multi_class=multi_class,
            fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight,
            verbose=verbose,
            random_state=random_state,
            max_iter=max_iter,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is only forwarded when given.

        Returns ``self`` so calls can be chained.
        """
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        wrapped = self._wrapped_model
        return wrapped.predict(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's ``decision_function``."""
        wrapped = self._wrapped_model
        return wrapped.decision_function(X)
def classification_linear_svm(tweets, train_index, test_index, labels_train,
                              random_state=None):
    """Classify tweets with a linear SVM on TF-IDF features.

    The TF-IDF vocabulary is learned from the training tweets only and then
    applied to the test tweets; the predicted labels of the test tweets are
    returned.
    """
    # Representation
    vectorizer = TfidfVectorizer(tokenizer=tokenize, lowercase=False,
                                 analyzer='word')

    train_docs = []
    for idx in train_index:
        train_docs.append(tweets[idx])
    test_docs = []
    for idx in test_index:
        test_docs.append(tweets[idx])

    train_features = vectorizer.fit_transform(train_docs)
    test_features = vectorizer.transform(test_docs)

    svm_model = LinearSVC(multi_class="ovr", random_state=random_state)
    print("Start SVM training")
    fitted_model = svm_model.fit(train_features, labels_train)
    print("Finish SVM training")

    return fitted_model.predict(test_features)
def applyModelandfit(tweet_list, tweet_label_list,all_tweet,model_name,filename): X, y, tweet_id_list = buildMatrixTrainAndTest(tweet_list, tweet_label_list, all_tweet) X_train = X[:len(y),:] y_train = y X_test = X[len(y):,:] tweet_id_list_test = tweet_id_list[len(y):] print "number of training tweets are ", X_train.shape, len(y_train) if model_name == 'SVM': clf = LinearSVC(penalty="l1", dual=False, tol=1e-7) clf.fit(X_train, y_train) elif model_name == 'NaiveBayes': clf = GaussianNB() clf.fit(X_train.todense(), y_train) elif model_name == 'LogisticRegression': clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01) clf.fit(X_train.toarray(), y_train) else: raise Exception("The model name is incorrect!!!") y_pred = clf.predict(X_test) print 'length of predict data is ', len(y_pred) with open(RESULT_FOLDER+'/'+filename+'_c.csv','wb') as fp: writer = csv.writer(fp, delimiter =",",quoting=csv.QUOTE_MINIMAL) for i, tweetid in enumerate(tweet_id_list_test): writer.writerow([tweetid, all_tweet[tweetid], y_pred[i]])
def runModel(X, y, S_data, model_name): f = open('r_' + "_" + model_name + '.txt', 'w') auc_score_all = [] fold = S_data[1] Index_gen = S_data[0] label = ['0.0', '1.0'] # note that python does not copy the generator, # so when it's in the end of the for loop the generator for S_data is also extruded! print 'Running', model_name for exp in range(0, fold): print "=" * 80, "\n", "experiment =", exp # getting one fold of indices from the index generator train_index, test_index = Index_gen.next() X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :] y_train, y_test = y.iloc[train_index], y.iloc[test_index] # fitting the model # 1 fit a model # 2 prediction if model_name == 'SVM': # LinearSVC take care of the multi class response by using one vs others method clf = LinearSVC(random_state=0).fit(X_train, y_train) y_pred = clf.predict(X_test) elif model_name == 'NaiveBayes': clf = GaussianNB() clf.fit(X_train.to_dense(), y_train) y_pred = clf.predict(X_test.to_dense()) elif model_name == 'LogisticRegression': clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01) clf.fit(X_train.as_matrix(), y_train) y_pred = clf.predict(X_test.as_matrix()) else: raise Exception("The model name is incorrect!!!") ### 2.4 eval auc_score = roc_auc_score(y_test, y_pred, average=None) auc_score_all.append(auc_score) auc_ave = mean(array(auc_score_all), 0) print >> f, model_name, '\n', "=" * 80 print >> f, 'avg auc = ', auc_ave
def get_svm_score(w, b_h, dataset):
    """
    Score a trained RBM by the accuracy of a linear SVM fitted on its hidden
    representation.

    :param w: Weights
    :param b_h: Hidden biases
    :param dataset: A Dataset object (provides training_set and test_set,
        each with input and target)
    :return: A scalar score
    """

    def hidden_activation(visible):
        # Hidden representation: sigm(visible . w + b_h)
        return sigm(visible.dot(w) + b_h)

    train_hidden = hidden_activation(dataset.training_set.input)
    svm_model = LinearSVC()
    svm_model.fit(train_hidden, dataset.training_set.target)

    test_hidden = hidden_activation(dataset.test_set.input)
    test_predictions = svm_model.predict(test_hidden)
    return percent_correct(dataset.test_set.target, test_predictions)
def single_model_tuning(modelname, fold_nr): """ The thread function that can be used for finding the best model hyperparameters, for a single, non-ensemble model, for a fixed preprocessor, this method requires the data to be split in folds first. parameters: :param str modelname: The name of the model to test. :param int fold_nr: The number of the fold. :return list<dict> results: A list of dictionaries containing the parameter setting and the mae. """ # Init a best mae so far (for printing purposes) best = 10 try: log('Fold: ' + str(fold_nr) + ': Loaded the cached preprocessed data.') X_train, X_val, y_train, y_val, rev_val = load_fold(fold_nr) except IOError: log('Fold: ' + str(fold_nr) + 'run "python kfold_prepr.py" first') results = [] # Tune a model based on the command line argument if modelname == 'log': par = ParameterGrid({ 'logistic__C': np.logspace(-5.0, 5.0, num=11), 'logistic__tol': np.logspace(-5.0, 5.0, num=11) }) for a in list(par): logistic = LogisticRegression(solver='sag', n_jobs=NUM_THEADS, C=a['logistic__C'], tol=a['logistic__tol']) logistic.fit(X_train, y_train) predictions_val = logistic.predict(X_val) mae = mean_absolute_error(predictions_val, y_val) results.append({ 'logistic__C': a['logistic__C'], 'logistic__tol': a['logistic__tol'], 'mae': mae }) elif modelname == 'ridge': par = ParameterGrid({'ridge__alpha': np.logspace(-5.0, 5.0, num=11)}) for a in list(par): ridge = OrdinalRidge(a['ridge__alpha']) ridge.fit(X_train, y_train) predictions_val = ridge.predict(X_val) mae = mean_absolute_error(predictions_val, y_val) results.append({'ridge__alpha': a['ridge__alpha'], 'mae': mae}) elif modelname == 'svc': par = ParameterGrid({ 'svc__C': np.logspace(-5.0, 5.0, num=11), 'svc__tol': np.logspace(-5.0, 5.0, num=11) }) for a in list(par): svc = LinearSVC(C=a['svc__C'], tol=a['svc__tol']) svc.fit(X_train, y_train) predictions_val = svc.predict(X_val) mae = mean_absolute_error(predictions_val, y_val) results.append({ 'svc__C': a['svc__C'], 
'svc__tol': a['svc__tol'], 'mae': mae }) elif modelname == 'lad': par = ParameterGrid({ 'lad__C': np.logspace(-5.0, 5.0, num=11), 'lad__tol': np.logspace(-5.0, 5.0, num=11) }) for a in list(par): svr_ = svm.LinearSVR(loss='squared_epsilon_insensitive') svr = LAD(svr_) # use mord for rounding and clipping svr.fit(X_train, y_train) predictions_val = svr.predict(X_val) mae = mean_absolute_error(predictions_val, y_val) results.append({ 'lad__C': a['lad__C'], 'lad__tol': a['lad__tol'], 'mae': mae }) elif modelname == 'final': # This is the tuning of the final ensemble, with fixing 0 rating predictions par = ParameterGrid({ 'logistic_lbfgs__C': np.logspace(-5.0, 5.0, num=11), 'logistic_lbfgs__tol': np.logspace(-5.0, 5.0, num=11), 'logistic_lbfgs_multinom__C': np.logspace(-5.0, 5.0, num=11), 'logistic_lbfgs_multinom__tol': np.logspace(-5.0, 5.0, num=11), 'logistic_sag_balanced__C': np.logspace(-5.0, 5.0, num=11), 'logistic_sag_balanced__tol': np.logspace(-5.0, 5.0, num=11) }) ensemble = VotingClassifier(estimators=[ ('logistic_lbfgs', LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS, C=5, tol=0.01)), ('logistic_lbfgs_multinom', LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS, C=5, tol=0.01, multi_class='multinomial')), ('logistic_sag_balanced', LogisticRegression(solver='sag', n_jobs=NUM_THEADS, C=5, tol=0.01, class_weight='balanced')), ], voting='soft', weights=[1, 1, 1]) for a in list(par): ensemble.set_params(**a) ensemble.fit(X_train, y_train) predictions_val = ensemble.predict(X_val) predictions_val = fix_zero_predictions(predictions_val, rev_val) mae = mean_absolute_error(predictions_val, y_val) temp = a temp['mae'] = mae if mae < best: print temp best = mae results.append(temp) elif modelname == 'lbfgs_bal': clf = LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS, class_weight='balanced') par = ParameterGrid({ 'C': np.logspace(-1.0, 1.0, num=5), 'tol': np.logspace(-3.0, -1.0, num=3) }) for a in list(par): clf.set_params(**a) clf.fit(X_train, y_train) 
predictions_val = clf.predict(X_val) predictions_val = fix_zero_predictions(predictions_val, rev_val) mae = mean_absolute_error(predictions_val, y_val) temp = a temp['mae'] = mae if mae < best: print temp best = mae results.append(temp) elif modelname == 'lbfgs_multi': clf = LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS, multi_class='multinomial') par = ParameterGrid({ 'C': np.logspace(-5.0, 5.0, num=11), 'tol': np.logspace(-5.0, 5.0, num=11) }) for a in list(par): clf.set_params(**a) clf.fit(X_train, y_train) predictions_val = clf.predict(X_val) predictions_val = fix_zero_predictions(predictions_val, rev_val) mae = mean_absolute_error(predictions_val, y_val) temp = a temp['mae'] = mae if mae < best: print temp best = mae results.append(temp) elif modelname == 'sag_bal': clf = LogisticRegression(solver='sag', n_jobs=NUM_THEADS, class_weight='balanced') par = ParameterGrid({ 'C': np.logspace(-5.0, 5.0, num=11), 'tol': np.logspace(-5.0, 5.0, num=11) }) for a in list(par): clf.set_params(**a) clf.fit(X_train, y_train) predictions_val = clf.predict(X_val) predictions_val = fix_zero_predictions(predictions_val, rev_val) mae = mean_absolute_error(predictions_val, y_val) temp = a temp['mae'] = mae if mae < best: print temp best = mae results.append(temp) elif modelname == 'nb': clf = MultinomialNB() par = ParameterGrid( {'alpha': [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 1.5]}) for a in list(par): clf.set_params(**a) clf.fit(X_train, y_train) predictions_val = clf.predict(X_val) predictions_val = fix_zero_predictions(predictions_val, rev_val) mae = mean_absolute_error(predictions_val, y_val) temp = a temp['mae'] = mae if mae < best: print temp best = mae results.append(temp) else: print "model name not defined" return None return results
print "tain time: ", round(train_r1 - train_r0, 3), "s"
print "prediction time: ", round(test_r1 - test_r0, 3), "s"
print "#################################"
'''
# NOTE(review): the ''' above presumably closes a block string opened before
# this excerpt (which would make the three prints above disabled code) --
# confirm against the full file.

#SVC lib_linear
# Train and evaluate a LinearSVC, timing fit() and predict() separately.
print("lib_linear")
clf_lib=LinearSVC()
# training
train_l0 = time()
clf_lib.fit(features_train, labels_train)
train_l1 = time()
# prediction or testing
test_l0 = time()
# `predict` is only computed for timing; it is not read again in this excerpt.
predict = clf_lib.predict(features_test)
test_l1 = time()
print "accuracy: ", clf_lib.score(features_test, labels_test)
print "#################################"
print "tain time: ", round(train_l1 - train_l0, 3), "s"
print "prediction time: ", round(test_l1 - test_l0, 3), "s"
######################################
# Classify a single hand-written input text.
text = 'اى هبل'
Ifeatures_train,Ifeatures_test,Ilabels_train=preprocess_input([text])
# NOTE(review): this re-fits clf_lib on what preprocess_input returns,
# replacing the model trained above -- confirm that is intended.
clf_lib.fit(Ifeatures_train,Ilabels_train)
# str(...)[1] takes the second character of the printed prediction array.
# NOTE(review): if this file runs under Python 2 (as the print statements
# above suggest), the parenthesised form below prints a tuple -- confirm.
print ("prediction of ",str(clf_lib.predict(Ifeatures_test))[1])
#print "prediction of ", clf.predict(preprocess_input(text))
# print str(clf.predict(Ifeatures_test))[1]
# Disabled experiment kept for reference:
#test_encode = []
#cont = 0
#while cont < 10:
#    test_encode.append(dataset.letra[cont].split())
#    cont += 1
#teste = label_encoder.transform(test_encode[0])

# Prepare the inputs for the OneHotEncoder: each integer-encoded array is
# reshaped into a single column, the encoder is fitted on the training
# column and then applied to the prediction column.
onehot = OneHotEncoder()
int_encoded_fit = int_encoded_fit.reshape(len(int_encoded_fit), 1)
int_encoded_pred = int_encoded_pred.reshape(len(int_encoded_pred), 1)
letra_fit = onehot.fit_transform(int_encoded_fit)
letra_pred = onehot.transform(int_encoded_pred)

# Train the SVM on the one-hot features and predict.
clf.fit(letra_fit, label_train)
prediction = clf.predict(letra_pred)

# Report the weighted metrics one after another, then the plain accuracy.
print()
_weighted_reports = (
    ("Recall {}", recall_score),
    ("Precision {}", precision_score),
    ("F1 {}", f1_score),
)
for _template, _scorer in _weighted_reports:
    print(_template.format(
        _scorer(label_test, prediction, average='weighted')))
print("Accuracy {}".format(accuracy_score(label_test, prediction)))
print "tain time: ", round(train_r1 - train_r0, 3), "s"
print "prediction time: ", round(test_r1 - test_r0, 3), "s"
print "#################################"
'''
# NOTE(review): the ''' above presumably closes a block string opened before
# this excerpt (which would make the three prints above disabled code) --
# confirm against the full file.

#SVC lib_linear
# Train and evaluate a LinearSVC, timing fit() and predict() separately.
print("lib_linear")
clf_lib = LinearSVC()
# training
train_l0 = time()
clf_lib.fit(features_train, labels_train)
train_l1 = time()
# prediction or testing
test_l0 = time()
# `predict` is only computed for timing; it is not read again in this excerpt.
predict = clf_lib.predict(features_test)
test_l1 = time()
print "accuracy: ", clf_lib.score(features_test, labels_test)
print "#################################"
print "tain time: ", round(train_l1 - train_l0, 3), "s"
print "prediction time: ", round(test_l1 - test_l0, 3), "s"
######################################
# Classify a single hand-written input text.
text = 'اى هبل'
Ifeatures_train, Ifeatures_test, Ilabels_train = preprocess_input([text])
# NOTE(review): this re-fits clf_lib on what preprocess_input returns,
# replacing the model trained above -- confirm that is intended.
clf_lib.fit(Ifeatures_train, Ilabels_train)
# str(...)[1] takes the second character of the printed prediction array.
# NOTE(review): if this file runs under Python 2 (as the print statements
# above suggest), the parenthesised form below prints a tuple -- confirm.
print("prediction of ", str(clf_lib.predict(Ifeatures_test))[1])
#print "prediction of ", clf.predict(preprocess_input(text))
# print str(clf.predict(Ifeatures_test))[1]
# NOTE(review): this `return` is the last statement of a function whose
# header lies before this excerpt (presumably readData) -- keep it at that
# function's body indentation.
return docs, t_docs, t_docsCategories


# Load the data set. Assuming the tuple returned above is readData's result,
# index 0 holds the documents to classify, index 1 the training documents
# and index 2 the training categories -- TODO confirm.
data = readData('hackerrank/documentClassification.txt')
X_train = np.array(data[1])
y_train = np.array(data[2])
X_test = np.array(data[0])
print("Extracting features from the training dataset using a sparse vectorizer")
#vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
# TF-IDF over unigrams and bigrams; the vocabulary is fitted on the training
# documents only and re-used unchanged for the test documents.
vectorizer = TfidfVectorizer(min_df=2,
                             ngram_range=(1, 2),
                             stop_words='english',
                             strip_accents='unicode',
                             norm='l2')
X_train = vectorizer.fit_transform(X_train)
#vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
#                             stop_words='english')
#X2_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(X_test)
# Fit three classifiers on the same features.
nb_classifier = MultinomialNB().fit(X_train, y_train)
svm_classifier = LinearSVC().fit(X_train, y_train)
maxent_classifier = LogisticRegression().fit(X_train, y_train)
# Print each classifier's predictions in turn. The variable y_nb_predicted
# is re-used for all three, so after this section it holds the
# logistic-regression predictions.
y_nb_predicted = nb_classifier.predict(X_test)
print(y_nb_predicted)
y_nb_predicted = svm_classifier.predict(X_test)
print(y_nb_predicted)
y_nb_predicted = maxent_classifier.predict(X_test)
print(y_nb_predicted)