def classification_linear_svm(tweets, train_index, test_index, labels_train, random_state=None):
    """Train a linear SVM on TF-IDF features and predict the test tweets.

    The TF-IDF vectorizer is fitted on the training tweets only and then
    applied unchanged to the test tweets.

    :param tweets: indexable collection of tweets
    :param train_index: indices into ``tweets`` used for training
    :param test_index: indices into ``tweets`` to predict
    :param labels_train: labels handed to ``LinearSVC.fit`` with the training features
    :param random_state: forwarded to ``LinearSVC``
    :return: the result of ``LinearSVC.predict`` on the test features
    """
    def _select(indices):
        # Pull the tweets addressed by the given indices, in index order.
        return [tweets[position] for position in indices]

    # Representation
    vectorizer = TfidfVectorizer(tokenizer=tokenize, lowercase=False, analyzer='word')
    train_docs = _select(train_index)
    test_docs = _select(test_index)
    train_features = vectorizer.fit_transform(train_docs)
    test_features = vectorizer.transform(test_docs)

    svm = LinearSVC(multi_class="ovr", random_state=random_state)
    print("Start SVM training")
    svm = svm.fit(train_features, labels_train)
    print("Finish SVM training")
    return svm.predict(test_features)
class Expander_LDA_multiclass(Expander_LDA_cossim):
    """
    Multi-class classification over the LDA vectors of labelled articles,
    used to decide where the LDA vector of the test text belongs.
    """

    def __init__(self, ldaModelAll, expander_type=AcronymExpanderEnum.LDA_multiclass):
        Expander_LDA_cossim.__init__(self, ldaModelAll, expander_type)
        self.classifier = LinearSVC()

    def transform(self, X):
        """Run the parent transform and turn every result into a dense vector."""
        dense_vectors = []
        for sparse_vec in Expander_LDA_cossim.transform(self, X):
            dense_vectors.append(self._getDenseVector(sparse_vec))
        return dense_vectors

    def _getDenseVector(self, sparse_vec):
        """Expand a sparse vector to length ``self.ldaModel.num_topics``."""
        topic_count = self.ldaModel.num_topics
        return sparse2full(sparse_vec, topic_count)

    def fit(self, X_train, y_train):
        """Fit the wrapped LinearSVC; returns None."""
        self.classifier.fit(X_train, y_train)

    def predict(self, X_test, acronym):
        """Return ``(labels, confidences)`` for ``X_test``.

        ``acronym`` is accepted but not used by this implementation.
        """
        model = self.classifier
        labels = model.predict(X_test)
        decisions = model.decision_function(X_test)
        return labels, self._getConfidencesFromDecisionFunction(labels, decisions)
def fit(self, data, args):
    """Fit a fresh LinearSVC on ``data.X_train`` / ``data.y_train``.

    The model is stored on ``self.model``. ``args`` is not used here.

    :return: the ``interval`` attribute of the ``Timer`` wrapped around the fit
    """
    classifier = LinearSVC()
    self.model = classifier
    with Timer() as stopwatch:
        classifier.fit(data.X_train, data.y_train)
    return stopwatch.interval
def applyModelandfit(tweet_list, tweet_label_list,all_tweet,model_name,filename): X, y, tweet_id_list = buildMatrixTrainAndTest(tweet_list, tweet_label_list, all_tweet) X_train = X[:len(y),:] y_train = y X_test = X[len(y):,:] tweet_id_list_test = tweet_id_list[len(y):] print "number of training tweets are ", X_train.shape, len(y_train) if model_name == 'SVM': clf = LinearSVC(penalty="l1", dual=False, tol=1e-7) clf.fit(X_train, y_train) elif model_name == 'NaiveBayes': clf = GaussianNB() clf.fit(X_train.todense(), y_train) elif model_name == 'LogisticRegression': clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01) clf.fit(X_train.toarray(), y_train) else: raise Exception("The model name is incorrect!!!") y_pred = clf.predict(X_test) print 'length of predict data is ', len(y_pred) with open(RESULT_FOLDER+'/'+filename+'_c.csv','wb') as fp: writer = csv.writer(fp, delimiter =",",quoting=csv.QUOTE_MINIMAL) for i, tweetid in enumerate(tweet_id_list_test): writer.writerow([tweetid, all_tweet[tweetid], y_pred[i]])
class LinearSVCImpl():
    """Wrapper that stores hyperparameters and builds an ``SKLModel`` on fit."""

    def __init__(self, penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight='balanced', verbose=0, random_state=None, max_iter=1000):
        # Kept as a plain dict so fit() can splat it into the model constructor.
        self._hyperparams = dict(
            penalty=penalty,
            loss=loss,
            dual=dual,
            tol=tol,
            C=C,
            multi_class=multi_class,
            fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight,
            verbose=verbose,
            random_state=random_state,
            max_iter=max_iter,
        )

    def fit(self, X, y=None):
        """Build a new ``SKLModel`` from the stored hyperparameters and fit it.

        ``y`` is only forwarded when it is not None. Returns ``self``.
        """
        self._sklearn_model = SKLModel(**self._hyperparams)
        fit_args = (X,) if y is None else (X, y)
        self._sklearn_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the fitted model's ``predict``."""
        fitted = self._sklearn_model
        return fitted.predict(X)
def get_svm_score(w, b_h, dataset):
    """
    Given a trained RBM, score a linear SVM trained on the hidden representation.

    Inputs are projected as ``sigm(input.dot(w) + b_h)``; the SVM is fitted on
    the projected training set and evaluated on the projected test set.

    :param w: Weights
    :param b_h: Hidden biases
    :param dataset: A Dataset object (uses ``training_set`` / ``test_set`` with
        ``input`` and ``target``)
    :return: The value of ``percent_correct(test targets, predicted labels)``
    """
    def _project(inputs):
        # Hidden-layer activation for a batch of inputs.
        return sigm(inputs.dot(w)+b_h)

    hidden_train = _project(dataset.training_set.input)
    svm = LinearSVC()
    svm.fit(hidden_train, dataset.training_set.target)

    hidden_test = _project(dataset.test_set.input)
    return percent_correct(dataset.test_set.target, svm.predict(hidden_test))
class CreateLinearSVC(CreateModel):
    """LinearSVC model whose fit and predict steps report their Timer interval."""

    def fit(self, data, args):
        """Fit a fresh LinearSVC on the training split; ``args`` is not used here.

        :return: ``interval`` of the Timer wrapped around the fit
        """
        classifier = LinearSVC()
        self.model = classifier
        with Timer() as stopwatch:
            classifier.fit(data.X_train, data.y_train)
        return stopwatch.interval

    def predict(self, data):
        """Store ``self.test(data)`` on ``self.predictions``.

        :return: ``interval`` of the Timer wrapped around the call
        """
        assert self.model is not None
        with Timer() as stopwatch:
            outcome = self.test(data)
            self.predictions = outcome
        return stopwatch.interval
def fit(self, X, y=None):
    """Build a new ``SKLModel`` from ``self._hyperparams`` and fit it.

    ``y`` is only forwarded to the model when it is not None.

    :return: ``self``
    """
    self._sklearn_model = SKLModel(**self._hyperparams)
    fit_args = (X,) if y is None else (X, y)
    self._sklearn_model.fit(*fit_args)
    return self
def test_model_within_optimizer(self):
    """A Porter must be constructible from a fitted GridSearchCV over a pipeline.

    The test fails when ``Porter(grid, language='java')`` raises ``ValueError``.
    """
    n_features_options = [2, 4, 8]
    c_options = [1, 10, 100, 1000]

    # First grid swaps the reducer between PCA and NMF.
    decomposition_grid = {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': n_features_options,
        'classify__C': c_options
    }
    # Second grid uses chi2-based feature selection as the reducer.
    selection_grid = {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': n_features_options,
        'classify__C': c_options
    }

    pipe = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())])
    grid = GridSearchCV(pipe, cv=3, n_jobs=1,
                        param_grid=[decomposition_grid, selection_grid])
    digits = load_digits()
    grid.fit(digits.data, digits.target)

    porter_created = True
    try:
        Porter(grid, language='java')
    except ValueError:
        porter_created = False
    self.assertTrue(porter_created)
def train_with_svm(self):
    """Fit an RBM -> LinearSVC pipeline on ``self.X`` / ``self.Y``.

    The fitted pipeline is stored on ``self.classifier`` and dumped to
    "rbm.pkl" with joblib.
    """
    rbm = BernoulliRBM(random_state=0, verbose=False)
    # More components tend to give better prediction performance, but larger
    # fitting time
    rbm_settings = (('learning_rate', 0.05), ('n_iter', 30), ('n_components', 100))
    for attribute, value in rbm_settings:
        setattr(rbm, attribute, value)

    svc = LinearSVC(C=10.0,class_weight='balanced',max_iter=100)
    pipeline = Pipeline(steps=[('rbm', rbm), ('svm', svc)])
    pipeline.fit(self.X, self.Y)

    self.classifier = pipeline
    joblib.dump(pipeline,"rbm.pkl")
def runandsaveModel(tweet_list, tweet_label_list,model_name): X, y, vectorizer= buildMatrixTrain(tweet_list, tweet_label_list) print "number of training tweets are ", X.shape, len(y) #trainning the model if model_name == 'SVM': clf = LinearSVC(penalty="l1", dual=False, tol=1e-7) clf.fit(X, y) elif model_name == 'NaiveBayes': clf = GaussianNB() clf.fit(X.todense(), y) elif model_name == 'LogisticRegression': clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01) clf.fit(X.toarray(), y) else: raise Exception("The model name is incorrect!!!") #save the model model = Model(model_name, clf, vectorizer) with open(RESULT_FOLDER+"/"+model_name+"_model.m","wb") as pf: pickle.dump(model,pf) print model_name, "is saved at", RESULT_FOLDER+"/"+model_name+"_model.m"
def __init__(self, penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight='balanced', verbose=0, random_state=None, max_iter=1000):
    """Record the hyperparameters and build the wrapped ``SKLModel`` from them."""
    self._hyperparams = dict(
        penalty=penalty,
        loss=loss,
        dual=dual,
        tol=tol,
        C=C,
        multi_class=multi_class,
        fit_intercept=fit_intercept,
        intercept_scaling=intercept_scaling,
        class_weight=class_weight,
        verbose=verbose,
        random_state=random_state,
        max_iter=max_iter,
    )
    self._wrapped_model = SKLModel(**self._hyperparams)
def single_model_tuning(modelname, fold_nr): """ The thread function that can be used for finding the best model hyperparameters, for a single, non-ensemble model, for a fixed preprocessor, this method requires the data to be split in folds first. parameters: :param str modelname: The name of the model to test. :param int fold_nr: The number of the fold. :return list<dict> results: A list of dictionaries containing the parameter setting and the mae. """ # Init a best mae so far (for printing purposes) best = 10 try: log('Fold: ' + str(fold_nr) + ': Loaded the cached preprocessed data.') X_train, X_val, y_train, y_val, rev_val = load_fold(fold_nr) except IOError: log('Fold: ' + str(fold_nr) + 'run "python kfold_prepr.py" first') results = [] # Tune a model based on the command line argument if modelname == 'log': par = ParameterGrid({ 'logistic__C': np.logspace(-5.0, 5.0, num=11), 'logistic__tol': np.logspace(-5.0, 5.0, num=11) }) for a in list(par): logistic = LogisticRegression(solver='sag', n_jobs=NUM_THEADS, C=a['logistic__C'], tol=a['logistic__tol']) logistic.fit(X_train, y_train) predictions_val = logistic.predict(X_val) mae = mean_absolute_error(predictions_val, y_val) results.append({ 'logistic__C': a['logistic__C'], 'logistic__tol': a['logistic__tol'], 'mae': mae }) elif modelname == 'ridge': par = ParameterGrid({'ridge__alpha': np.logspace(-5.0, 5.0, num=11)}) for a in list(par): ridge = OrdinalRidge(a['ridge__alpha']) ridge.fit(X_train, y_train) predictions_val = ridge.predict(X_val) mae = mean_absolute_error(predictions_val, y_val) results.append({'ridge__alpha': a['ridge__alpha'], 'mae': mae}) elif modelname == 'svc': par = ParameterGrid({ 'svc__C': np.logspace(-5.0, 5.0, num=11), 'svc__tol': np.logspace(-5.0, 5.0, num=11) }) for a in list(par): svc = LinearSVC(C=a['svc__C'], tol=a['svc__tol']) svc.fit(X_train, y_train) predictions_val = svc.predict(X_val) mae = mean_absolute_error(predictions_val, y_val) results.append({ 'svc__C': a['svc__C'], 
'svc__tol': a['svc__tol'], 'mae': mae }) elif modelname == 'lad': par = ParameterGrid({ 'lad__C': np.logspace(-5.0, 5.0, num=11), 'lad__tol': np.logspace(-5.0, 5.0, num=11) }) for a in list(par): svr_ = svm.LinearSVR(loss='squared_epsilon_insensitive') svr = LAD(svr_) # use mord for rounding and clipping svr.fit(X_train, y_train) predictions_val = svr.predict(X_val) mae = mean_absolute_error(predictions_val, y_val) results.append({ 'lad__C': a['lad__C'], 'lad__tol': a['lad__tol'], 'mae': mae }) elif modelname == 'final': # This is the tuning of the final ensemble, with fixing 0 rating predictions par = ParameterGrid({ 'logistic_lbfgs__C': np.logspace(-5.0, 5.0, num=11), 'logistic_lbfgs__tol': np.logspace(-5.0, 5.0, num=11), 'logistic_lbfgs_multinom__C': np.logspace(-5.0, 5.0, num=11), 'logistic_lbfgs_multinom__tol': np.logspace(-5.0, 5.0, num=11), 'logistic_sag_balanced__C': np.logspace(-5.0, 5.0, num=11), 'logistic_sag_balanced__tol': np.logspace(-5.0, 5.0, num=11) }) ensemble = VotingClassifier(estimators=[ ('logistic_lbfgs', LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS, C=5, tol=0.01)), ('logistic_lbfgs_multinom', LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS, C=5, tol=0.01, multi_class='multinomial')), ('logistic_sag_balanced', LogisticRegression(solver='sag', n_jobs=NUM_THEADS, C=5, tol=0.01, class_weight='balanced')), ], voting='soft', weights=[1, 1, 1]) for a in list(par): ensemble.set_params(**a) ensemble.fit(X_train, y_train) predictions_val = ensemble.predict(X_val) predictions_val = fix_zero_predictions(predictions_val, rev_val) mae = mean_absolute_error(predictions_val, y_val) temp = a temp['mae'] = mae if mae < best: print temp best = mae results.append(temp) elif modelname == 'lbfgs_bal': clf = LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS, class_weight='balanced') par = ParameterGrid({ 'C': np.logspace(-1.0, 1.0, num=5), 'tol': np.logspace(-3.0, -1.0, num=3) }) for a in list(par): clf.set_params(**a) clf.fit(X_train, y_train) 
predictions_val = clf.predict(X_val) predictions_val = fix_zero_predictions(predictions_val, rev_val) mae = mean_absolute_error(predictions_val, y_val) temp = a temp['mae'] = mae if mae < best: print temp best = mae results.append(temp) elif modelname == 'lbfgs_multi': clf = LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS, multi_class='multinomial') par = ParameterGrid({ 'C': np.logspace(-5.0, 5.0, num=11), 'tol': np.logspace(-5.0, 5.0, num=11) }) for a in list(par): clf.set_params(**a) clf.fit(X_train, y_train) predictions_val = clf.predict(X_val) predictions_val = fix_zero_predictions(predictions_val, rev_val) mae = mean_absolute_error(predictions_val, y_val) temp = a temp['mae'] = mae if mae < best: print temp best = mae results.append(temp) elif modelname == 'sag_bal': clf = LogisticRegression(solver='sag', n_jobs=NUM_THEADS, class_weight='balanced') par = ParameterGrid({ 'C': np.logspace(-5.0, 5.0, num=11), 'tol': np.logspace(-5.0, 5.0, num=11) }) for a in list(par): clf.set_params(**a) clf.fit(X_train, y_train) predictions_val = clf.predict(X_val) predictions_val = fix_zero_predictions(predictions_val, rev_val) mae = mean_absolute_error(predictions_val, y_val) temp = a temp['mae'] = mae if mae < best: print temp best = mae results.append(temp) elif modelname == 'nb': clf = MultinomialNB() par = ParameterGrid( {'alpha': [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 1.5]}) for a in list(par): clf.set_params(**a) clf.fit(X_train, y_train) predictions_val = clf.predict(X_val) predictions_val = fix_zero_predictions(predictions_val, rev_val) mae = mean_absolute_error(predictions_val, y_val) temp = a temp['mae'] = mae if mae < best: print temp best = mae results.append(temp) else: print "model name not defined" return None return results
def main_preprop(): """ The main method that can be used for finding the best preprocessor hyperparameters, for a fixed model. """ # Initialize the result array results = [] # Set the search range in a parameter grid, all possible combinations will be tested params_preprocessor = { 'a_value': range(10, 15), 'epsilon': [0.1, 0.01, 0.001], 'reduction_level': [0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5] } # Create a preprocessor object (tweaked for tuning hyperparameters) preprocessor_object = Preprocessor2() # Define the models logistic = LogisticRegression(solver='sag', n_jobs=NUM_THEADS, C=5, tol=0.01) ridge = OrdinalRidge(10) svc = LinearSVC(C=0.5) svr = svm.LinearSVR(loss='squared_epsilon_insensitive') lad = LAD(svr) ensemble = VotingClassifier(estimators=[('logistic', logistic), ('ridge', ridge), ('svc', svc), ('lad', lad)], voting='hard', weights=[1, 1, 1, 1]) # load the data train_data, test_data = preprocessor_object.load() # Convert the data to tf-idf matrices X_train_full, X_val_full, y_train, y_val = preprocessor_object.fit_data( train_data) # Test the preprocessor for all possible hyperparameter combinations for x in params_preprocessor['a_value']: for y in params_preprocessor['epsilon']: sigma_values = preprocessor_object.compute_sigma_values( X_train_full, y_train, x, y) for z in params_preprocessor['reduction_level']: X_train, X_val, X_test = preprocessor_object.remove_features( sigma_values, z, X_train_full, X_val_full, test_data) ensemble.fit(X_train, y_train) predictions_val = ensemble.predict(X_val) mae = mean_absolute_error(predictions_val, y_val) print strftime("%H:%M:%S") + ' -> a: ' + str( x) + '; epsilon: ' + str(y) + '; red lvl: ' + str( z) + '; MAE: ' + str(mae) results.append({ 'a_value': x, 'epsilon': y, 'reduction_level': z, 'mae': mae }) print results log('That\'s all folks!')
def runModel(X, y, S_data, model_name): f = open('r_' + "_" + model_name + '.txt', 'w') auc_score_all = [] fold = S_data[1] Index_gen = S_data[0] label = ['0.0', '1.0'] # note that python does not copy the generator, # so when it's in the end of the for loop the generator for S_data is also extruded! print 'Running', model_name for exp in range(0, fold): print "=" * 80, "\n", "experiment =", exp # getting one fold of indices from the index generator train_index, test_index = Index_gen.next() X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :] y_train, y_test = y.iloc[train_index], y.iloc[test_index] # fitting the model # 1 fit a model # 2 prediction if model_name == 'SVM': # LinearSVC take care of the multi class response by using one vs others method clf = LinearSVC(random_state=0).fit(X_train, y_train) y_pred = clf.predict(X_test) elif model_name == 'NaiveBayes': clf = GaussianNB() clf.fit(X_train.to_dense(), y_train) y_pred = clf.predict(X_test.to_dense()) elif model_name == 'LogisticRegression': clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01) clf.fit(X_train.as_matrix(), y_train) y_pred = clf.predict(X_test.as_matrix()) else: raise Exception("The model name is incorrect!!!") ### 2.4 eval auc_score = roc_auc_score(y_test, y_pred, average=None) auc_score_all.append(auc_score) auc_ave = mean(array(auc_score_all), 0) print >> f, model_name, '\n', "=" * 80 print >> f, 'avg auc = ', auc_ave
#path = '/media/robbis/DATA/fmri/monks' path = '/home/carlos/mount/megmri03/monks' subjects = os.listdir(path) subjects = [s for s in subjects if s.find('.') == -1 and s.find('_') == -1] # Load monk data in the form of n_samples x n_voxels x n_time ds, _, _ = load_subject_ds( path, subjects[:1], #os.path.join(path, 'subjects.csv'), 'meditation_permut1.conf', 'fmri', prepro=MonksPreprocessingPipeline(), roi_labels=atlas_dict) clf = make_pipeline(StandardScaler(), LinearSVC(C=1)) time_gen = GeneralizingEstimator(clf, scoring='accuracy', n_jobs=20) ds = SampleSlicer({'group': ['E']}).transform(ds) scores_dict = {} # Generalization of time for network in os.listdir(path_templates): network = network[:-21] ds_network = FeatureSlicer({network: ['!0']}).transform(ds) n_samples, n_voxels = ds_network.shape data = ds_network.samples.reshape(-1, 135, n_voxels) X = np.rollaxis(data, 1, 3) y = np.arange(data.shape[0]) % 2
def setUp(self):
    """Run the parent setUp, then install the LinearSVC estimator under test."""
    super(LinearSVCJavaTest, self).setUp()
    svc_settings = {'C': 1., 'random_state': 0}
    self.estimator = LinearSVC(**svc_settings)
def __init__(self, ldaModelAll, expander_type=AcronymExpanderEnum.LDA_multiclass):
    """Initialise the cosine-similarity base class and attach a LinearSVC.

    :param ldaModelAll: forwarded unchanged to ``Expander_LDA_cossim.__init__``
    :param expander_type: forwarded unchanged to ``Expander_LDA_cossim.__init__``
    """
    Expander_LDA_cossim.__init__(self, ldaModelAll, expander_type)
    linear_svm = LinearSVC()
    self.classifier = linear_svm
'KernelRidge':KernelRidge(), 'LSHForest':LSHForest(), 'LabelPropagation':LabelPropagation(), 'LabelSpreading':LabelSpreading(), 'Lars':Lars(), 'LarsCV':LarsCV(), 'Lasso':Lasso(), 'LassoCV':LassoCV(), 'LassoLars':LassoLars(), 'LassoLarsCV':LassoLarsCV(), 'LassoLarsIC':LassoLarsIC(), 'LatentDirichletAllocation':LatentDirichletAllocation(), 'LedoitWolf':LedoitWolf(), 'LinearDiscriminantAnalysis':LinearDiscriminantAnalysis(), 'LinearRegression':LinearRegression(), 'LinearSVC':LinearSVC(), 'LinearSVR':LinearSVR(), 'LocallyLinearEmbedding':LocallyLinearEmbedding(), 'LogisticRegression':LogisticRegression(), 'LogisticRegressionCV':LogisticRegressionCV(), 'MDS':MDS(), 'MLPClassifier':MLPClassifier(), 'MLPRegressor':MLPRegressor(), 'MaxAbsScaler':MaxAbsScaler(), 'MeanShift':MeanShift(), 'MinCovDet':MinCovDet(), 'MinMaxScaler':MinMaxScaler(), 'MiniBatchDictionaryLearning':MiniBatchDictionaryLearning(), 'MiniBatchKMeans':MiniBatchKMeans(), 'MiniBatchSparsePCA':MiniBatchSparsePCA(), 'MultiTaskElasticNet':MultiTaskElasticNet(),
def setUp(self):
    """Run the parent setUp, then port a LinearSVC(C=1., random_state=0)."""
    super(LinearSVCTest, self).setUp()
    self._port_model(LinearSVC(C=1., random_state=0))
def runModel(X, y, model_name): nFolders = 5 accs = [] precs = [] recalls = [] F1s = [] n = X.shape[0] for exp in range(0, nFolders): print '\n\n============================================================================================\nexperiment' , exp ### 2.1 split training and testing data start = (int)((1-(exp+1) * 1.0/nFolders)*n) end = (int)((1-exp * 1.0/nFolders)*n) #print n, start, end X_train, y_train, X_test, y_test = splitTrainTest(X, y, start, end) print 'Running', model_name if model_name == 'SVM': ### 2.2 build classifier clf = LinearSVC(penalty="l1", dual=False, tol=1e-7) clf.fit(X_train, y_train) ### 2.3 predict y_pred = clf.predict(X_test) elif model_name == 'NaiveBayes': clf = GaussianNB() clf.fit(X_train.todense(), y_train) y_pred = clf.predict(X_test.todense()) elif model_name == 'LogisticRegression': clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01) clf.fit(X_train.toarray(), y_train) y_pred = clf.predict(X_test.toarray()) else: raise Exception("The model name is incorrect!!!") ### 2.4 eval acc, prec, recall, F1 = eval(y_test, y_pred) print 'Acc = ', acc; print 'Precision =', prec; print 'Recall=', recall; print 'F1 =', F1 accs.append(acc) precs.append(prec) recalls.append(recall) F1s.append(F1) print '\n\n\n' print 'avg Acc = ', sum(accs)/len(accs) print 'avg Precision = ', sum(precs)/len(precs) print 'avg Recall = ', sum(recalls)/len(recalls) print 'avg F1 = ', sum(F1s)/len(F1s) return sum(accs)/len(accs), sum(precs)/len(precs), sum(recalls)/len(recalls), sum(F1s)/len(F1s)
n_classes = 2 FEAT_F33 = "F33" ROLL_LEN = 10 PANTHEON_SIZE = 10 n_users = 1000 max_runs = None #10000 percTest = 0.1 predictors = [ # DummyClassifier(strategy="stratified"), # DummyClassifier(strategy="uniform"), # BernoulliNB(), # SVC(kernel='rbf', max_iter=10000, class_weight="balanced", verbose=1), LinearSVC(max_iter=50), MLPClassifier(max_iter=50, nesterovs_momentum=True, early_stopping=True), #, activation="logistic"), # LogisticRegression(class_weight='balanced'), RandomForestClassifier(class_weight="balanced"), # ExtraTreeClassifier(), # AdaBoostClassifier(), # DecisionTreeClassifier(), ] predictor_params = [ # None, # None, #{'n_iter':50, 'alpha': numpy.logspace(-3, 2) }, # {'name':'RBFSVC', 'n_iter':50,'C': numpy.logspace(-2, 6), 'gamma': numpy.logspace(-9, 3)}, {
# Building the dictionary (disabled)
#dict = []
#cont = 0
#for t in dataset.letra[:]:
#    for w in t.split():
#        dict.append(w)
#dict = set(dict) # dict now contains every word that exists in the dataset
#dict_array = list(dict) # must be converted because LabelEncoder.fit does not accept a set

# Split the dataset into one part for fit and another for predict
letras_train, letras_test, label_train, label_test = train_test_split(
    dataset.letra, dataset.label, test_size=0.30, random_state=42)

# Use LinearSVC. The arguments below (loss / penalty / dual) belong to
# LinearSVC; passing them to SVC raises a TypeError, so the linear estimator
# is constructed here. Imported locally so this fragment does not depend on
# an import that may be missing at the top of the file.
from sklearn.svm import LinearSVC
clf = LinearSVC(C=1, loss='squared_hinge', penalty='l1', dual=False)
#print(dict_array)

# Routines that feed the LabelEncoder
label_encoder = LabelEncoder()
int_encoded_fit = label_encoder.fit_transform(letras_train)
# NOTE(review): this re-fits the encoder on the test lyrics, so the integer
# codes are not shared with the training encoding - confirm this is intended.
int_encoded_pred = label_encoder.fit_transform(letras_test)
#dict_encode = label_encoder.fit(dict_array)
#test_encode = []
#cont = 0
#while cont < 10:
#    test_encode.append(dataset.letra[cont].split())
#    cont += 1
from sklearn.datasets import make_blobs
import mglearn
import numpy as np
import matplotlib.pyplot as plt
# Import from the public package: `sklearn.svm.classes` was a private module
# that newer scikit-learn releases deprecated and then removed.
from sklearn.svm import LinearSVC
# NOTE(review): color_config is not used in this script and this path only
# resolves when a CPython source tree is on sys.path - looks like an
# accidental IDE auto-import; confirm and drop.
from Lib.idlelib.colorizer import color_config

# Three-class toy dataset with two features.
X,y = make_blobs(random_state=42)
mglearn.discrete_scatter(X[:,0],X[:,1],y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.legend(["Class 0","Class 1","Class 2"])
plt.show()

## Train a LinearSVC classifier on this dataset
linear_svm = LinearSVC().fit(X,y)
print("Coefficient shape: ",linear_svm.coef_.shape)
print("Intercept shape: ",linear_svm.intercept_.shape)
#Coefficient shape: (3, 2)
#Intercept shape: (3,)
# Each row of coef_ holds the coefficient vector of one of the three classes;
# each column holds the coefficient of one feature (two features here).
# intercept_ is a one-dimensional array storing the intercept of each class.

# Visualize the lines given by the 3 classifiers
mglearn.discrete_scatter(X[:,0],X[:,1],y)
line = np.linspace(-15,15)
for coef,intercept,color in zip(linear_svm.coef_,linear_svm.intercept_,['b','r','g']):
    # Boundary coef[0]*x0 + coef[1]*x1 + intercept = 0, solved for x1.
    plt.plot(line,-(line*coef[0]+intercept)/coef[1],c=color)
plt.ylim(-10,15)
plt.xlim((-10,8))
def setUp(self):
    """Run the parent setUp, create a Ruby porter and port a LinearSVC."""
    super(LinearSVCTest, self).setUp()
    self.porter = Porter(language='ruby')
    estimator = LinearSVC(C=1., random_state=0)
    self._port_model(estimator)
op.add_option('--use_available_classifiers', action='store_true', dest='available_classifiers', help='Uses previously generated classifiers. If does not exists, new classifier models are \ generated.') op.add_option("--random_state", action='store', type='int', dest='random_state', help='Use random value of type int to reproduce the results.') (opts, args) = op.parse_args() if len(args) > 0: op.error("this script takes no arguments.") sys.exit(1) if __doc__: print(__doc__) op.print_help() clfs = ( (LinearSVC(), "Linear SVC"), ) # total results from classifier def calculateScore(result, n_folds): print 'x' * 70 n_classifier = len(clfs) for x in xrange(n_classifier): a=0.0 p=0.0 r=0.0 f1=0.0 for i in xrange(x * n_folds, (x * n_folds) + n_folds): a += result[i][0] p += result[i][1] r += result[i][2]
# Encoding switches and width.
# NOTE(review): their meaning is not visible in this excerpt - confirm against the code that reads them.
QENC_QUAL=False
QENC_DIFF=False
qenc_width = 33

n_classes = 3
FEAT_F33 = "F33"

n_users = 1000
max_runs = None #10000
percTest = 0.20

# Candidate estimators; commented-out entries are disabled alternatives.
predictors = [
    # DummyClassifier(strategy="stratified"),
    # DummyClassifier(strategy="uniform"),
    # BernoulliNB(),
    LinearSVC(max_iter=100, class_weight="balanced"),
    MLPClassifier(max_iter=100, nesterovs_momentum=True, early_stopping=True), #, activation="logistic"),
    LogisticRegression(class_weight='balanced'),
    # GaussianNB(),
]

# One parameter dict per active entry of `predictors`.
# NOTE(review): presumably matched by position and consumed by a parameter
# search, with 'n_iter' as the number of search iterations - confirm.
predictor_params = [
    # None,
    # None,
    # {'n_iter':50, 'alpha': numpy.logspace(-3, 2) },
    {'n_iter':50,'C': numpy.logspace(-3, 2)},
    {'n_iter':125,'hidden_layer_sizes':[(100,), (66,10)], 'learning_rate_init':[0.001, 0.01, 0.1], 'alpha': numpy.logspace(-6,2) },
    {'n_iter':50,'C': numpy.logspace(-3, 2)},
    # None,
]
return docs, t_docs, t_docsCategories data = readData('hackerrank/documentClassification.txt') X_train = np.array(data[1]) y_train = np.array(data[2]) X_test = np.array(data[0]) print("Extracting features from the training dataset using a sparse vectorizer") #vectorizer = HashingVectorizer(stop_words='english', non_negative=True) vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', strip_accents='unicode', norm='l2') X_train = vectorizer.fit_transform(X_train) #vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, # stop_words='english') #X2_train = vectorizer.fit_transform(data_train.data) X_test = vectorizer.transform(X_test) nb_classifier = MultinomialNB().fit(X_train, y_train) svm_classifier = LinearSVC().fit(X_train, y_train) maxent_classifier = LogisticRegression().fit(X_train, y_train) y_nb_predicted = nb_classifier.predict(X_test) print(y_nb_predicted) y_nb_predicted = svm_classifier.predict(X_test) print(y_nb_predicted) y_nb_predicted = maxent_classifier.predict(X_test) print(y_nb_predicted)
def setUp(self):
    """Run the parent setUp, create a C porter and register a LinearSVC."""
    super(LinearSVCTest, self).setUp()
    self.porter = Porter(language='c')
    estimator = LinearSVC(C=1., random_state=0)
    self.set_classifier(estimator)
train_r1 = time() # prediction or testing test_r0 = time() predict = clf_rbf.predict(features_test) test_r1 = time() print "accuracy: ", clf_rbf.score(features_test, labels_test) print "#################################" print "tain time: ", round(train_r1 - train_r0, 3), "s" print "prediction time: ", round(test_r1 - test_r0, 3), "s" print "#################################" ''' #SVC lib_linear print("lib_linear") clf_lib=LinearSVC() # training train_l0 = time() clf_lib.fit(features_train, labels_train) train_l1 = time() # prediction or testing test_l0 = time() predict = clf_lib.predict(features_test) test_l1 = time() print "accuracy: ", clf_lib.score(features_test, labels_test) print "#################################" print "tain time: ", round(train_l1 - train_l0, 3), "s" print "prediction time: ", round(test_l1 - test_l0, 3), "s"
def setUp(self):
    """Run the parent setUp, then install the LinearSVC model under test."""
    super(LinearSVCPHPTest, self).setUp()
    svc_settings = {'C': 1., 'random_state': 0}
    self.mdl = LinearSVC(**svc_settings)
# Input CSV files; both are parsed as float32 with ',' as the delimiter.
train_file = '/home/jvujjini/Kaggle/ForestCoverTypePrediction/train.csv'
test_file = '/home/jvujjini/Kaggle/ForestCoverTypePrediction/test.csv'
train_data = np.loadtxt(train_file, np.float32, delimiter=',')
test_data = np.loadtxt(test_file, np.float32, delimiter=',')
#training_data, training_label, test_data, test_label = train_data[:15000,:-1], data[:15000,-1], data[15000:,:-1], data[15000:,-1]

# The last training column is the label; every test column is a feature.
train_X = train_data[:, :-1]
train_y = train_data[:, -1]
test_X = test_data
print "starting..."

# One-vs-rest linear SVM: fit on the training data, predict the test data.
predict_label = OneVsRestClassifier(LinearSVC(random_state=0)).fit( train_X, train_y).predict(test_X)

# The bare string below is disabled code kept by the author; it is never executed.
'''print "Started Training..." clf.fit(train_X, train_y) print "Done Training" print "Started Predicting..." predict_label = clf.predict(test_X)'''

# Write one predicted label per line.
output_file = '/home/jvujjini/Kaggle/ForestCoverTypePrediction/output.csv'
with open(output_file, 'w') as thefile:
    print "File Opened..."
    for item in predict_label:
        thefile.write("%s\n" % item)
print "Success!"
def runModel(X, y, model_name): nFolders = 5 accs = [] precs = [] recalls = [] F1s = [] n = X.shape[0] for exp in range(0, nFolders): print '\n\n============================================================================================\nexperiment', exp ### 2.1 split training and testing data start = (int)((1 - (exp + 1) * 1.0 / nFolders) * n) end = (int)((1 - exp * 1.0 / nFolders) * n) #print n, start, end X_train, y_train, X_test, y_test = splitTrainTest(X, y, start, end) print 'Running', model_name if model_name == 'SVM': ### 2.2 build classifier clf = LinearSVC(penalty="l1", dual=False, tol=1e-7) clf.fit(X_train, y_train) ### 2.3 predict y_pred = clf.predict(X_test) if model_name == 'SVM_new': ### 2.2 build classifier clf = svm.SVC(C=1.0, gamma=1.0, class_weight='auto') clf.fit(X_train, y_train) ### 2.3 predict y_pred = clf.predict(X_test) elif model_name == 'NaiveBayes': clf = GaussianNB() clf.fit(X_train.todense(), y_train) y_pred = clf.predict(X_test.todense()) elif model_name == 'LogisticRegression': clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01) clf.fit(X_train.toarray(), y_train) y_pred = clf.predict(X_test.toarray()) else: raise Exception("The model name is incorrect!!!") ### 2.4 eval acc, prec, recall, F1 = eval(y_test, y_pred) print 'Acc = ', acc print 'Precision =', prec print 'Recall=', recall print 'F1 =', F1 accs.append(acc) precs.append(prec) recalls.append(recall) F1s.append(F1) print '\n\n\n' print 'avg Acc = ', sum(accs) / len(accs) print 'avg Precision = ', sum(precs) / len(precs) print 'avg Recall = ', sum(recalls) / len(recalls) print 'avg F1 = ', sum(F1s) / len(F1s) return sum(accs) / len(accs), sum(precs) / len(precs), sum(recalls) / len( recalls), sum(F1s) / len(F1s)
train_acc = metrics.accuracy_score(y_train, model.predict(x_train)) test_acc = metrics.accuracy_score(y_test, y_hat) print u'训练集准确率:%.2f%%' % (100 * train_acc) print u'测试集准确率:%.2f%%' % (100 * test_acc) return t_train, t_test, 1 - train_acc, 1 - test_acc, name #开始传提参数 clfs = [[RidgeClassifier(), 'Ridge'], [KNeighborsClassifier(), 'KNN'], [MultinomialNB(), 'MultinomialNB'], [BernoulliNB(), 'BernoulliNB'], [RandomForestClassifier(n_estimators=200), 'RandomForest'], [SVC(), 'SVM'], [ LinearSVC(loss='squared_hinge', penalty='l1', dual=False, tol=1e-4), 'LinearSVC-l1' ], [ LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-4), 'LinearSVC-l2' ]] #开始训练 result = [] for clf, name in clfs: a = benchmark(clf, name) result.append(a) print '\n' result = np.array(result)
# Every column except 'title' and 'plot' is a genre label column.
genres = list(data_df.drop(['title', 'plot'], axis=1).columns.values)
# Features: the 'plot' column only; targets: the genre columns.
data_x = data_df[['plot']].as_matrix()
data_y = data_df.drop(['title', 'plot'], axis=1).as_matrix()

# NOTE(review): stratified_split is not used anywhere in this excerpt; the
# split below is a plain train_test_split - confirm whether it is still needed.
stratified_split = StratifiedShuffleSplit(n_splits=2, test_size=0.33)
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.33, random_state=42)

# transform matrix of plots into lists to pass to a TfidfVectorizer
train_x = [x[0].strip() for x in x_train.tolist()]
test_x = [x[0].strip() for x in x_test.tolist()]

stop_words = set(stopwords.words('english'))

## http://michelleful.github.io/code-blog/2015/06/20/pipelines/
## learn feature union to add more features (time, region)
# TF-IDF features feed a one-vs-rest wrapper around LinearSVC.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])
# Grid keys follow the pipeline step names; 'clf__estimator__*' addresses the
# LinearSVC wrapped inside OneVsRestClassifier.
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
}
grid_search(train_x, y_train, test_x, y_test, genres, parameters, pipeline)
train_r1 = time() # prediction or testing test_r0 = time() predict = clf_rbf.predict(features_test) test_r1 = time() print "accuracy: ", clf_rbf.score(features_test, labels_test) print "#################################" print "tain time: ", round(train_r1 - train_r0, 3), "s" print "prediction time: ", round(test_r1 - test_r0, 3), "s" print "#################################" ''' #SVC lib_linear print("lib_linear") clf_lib = LinearSVC() # training train_l0 = time() clf_lib.fit(features_train, labels_train) train_l1 = time() # prediction or testing test_l0 = time() predict = clf_lib.predict(features_test) test_l1 = time() print "accuracy: ", clf_lib.score(features_test, labels_test) print "#################################" print "tain time: ", round(train_l1 - train_l0, 3), "s" print "prediction time: ", round(test_l1 - test_l0, 3), "s"