def test_predict_on_toy_problem():
    """Manually check predicted class labels for toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5],
                  [-1.2, -1.4],
                  [-3.4, -2.2],
                  [1.1, 1.2],
                  [2.1, 1.4],
                  [3.1, 2.3]])
    y = np.array([1, 1, 1, 2, 2, 2])

    # Compare the full prediction arrays element-wise; wrapping both sides in
    # all() would collapse each side to a single boolean and make the
    # assertion vacuous.
    assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
    assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
    assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])

    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                            voting='hard', weights=[1, 1, 1])
    assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])

    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                            voting='soft', weights=[1, 1, 1])
    assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
def train(self, inverse_regularisation=1.0, verbose=True):
    self.trained_classifiers = {}

    if verbose:
        print('=' * 120)
        print('Training')

    for n, clser in enumerate(sorted(self.classifiers)):
        if verbose:
            print('-' * 120)
            print("Training classifier: ", clser, ' #', n + 1, '/', len(self.classifiers))
            print("  Matrix: ", (len(self.classifiers_outputs[clser]),
                                 len(self.classifiers_features_list[clser])))

        classifier_input = np.zeros((len(self.classifiers_outputs[clser]),
                                     len(self.classifiers_features_list[clser])))
        for i, feat in enumerate(self.classifiers_features[clser]):
            classifier_input[i] = feat.get_feature_vector(self.classifiers_features_mapping[clser])

        lr = LogisticRegression(penalty='l2', C=inverse_regularisation, tol=1e-6)
        lr.fit(classifier_input, self.classifiers_outputs[clser])
        self.trained_classifiers[clser] = lr

        if verbose:
            mean_accuracy = lr.score(classifier_input, self.classifiers_outputs[clser])
            print("  Prediction mean accuracy on the training data: %6.2f" % (100.0 * mean_accuracy, ))
            print("  Size of the params:", lr.coef_.shape)
def test_cross_val_predict_with_method():
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=0)
    classes = len(set(y))

    kfold = KFold(len(iris.target))

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        predictions = cross_val_predict(est, X, y, method=method)
        assert_equal(len(predictions), len(y))

        expected_predictions = np.zeros([len(y), classes])
        func = getattr(est, method)

        # Naive loop (should be same as cross_val_predict):
        for train, test in kfold.split(X, y):
            est.fit(X[train], y[train])
            expected_predictions[test] = func(X[test])

        predictions = cross_val_predict(est, X, y, method=method, cv=kfold)
        assert_array_almost_equal(expected_predictions, predictions)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('dataset',
                        help='dataset must have data, target, target_names attributes')
    parser.add_argument('-c', '--classifier', default='logistic_regression',
                        help='now supports logistic_regression only')
    parser.add_argument('-O', '--output', default='clf.pkl.gz',
                        help='saving clf filename')
    args = parser.parse_args(sys.argv[1:])

    print('loading dataset')
    with gzip.open(args.dataset, 'rb') as f:
        dataset = pickle.load(f)
    X = dataset.data
    y = dataset.target
    target_names = dataset.target_names

    # create train and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=np.random.randint(1234))

    # train and test
    if args.classifier == 'logistic_regression':
        clf = LogisticRegression()
    else:
        raise ValueError('unsupported classifier')
    print('fitting {0}'.format(args.classifier))
    clf.fit(X_train, y_train)
    clf.target_names_ = target_names
    with gzip.open(args.output, 'wb') as f:
        pickle.dump(clf, f)

    y_pred = clf.predict(X_test)
    print('score of classifier: {}'.format(accuracy_score(y_test, y_pred)))
    print(classification_report(y_test, y_pred, target_names=target_names))
def training_stage3(dftrain, dfvalid, cat1, i):
    fname = ddir + 'joblib/stage3_' + str(cat1) + ext
    df = dftrain[dftrain.Categorie1 == cat1].reset_index(drop=True)
    dfv = dfvalid[dfvalid.Categorie1 == cat1].reset_index(drop=True)
    labels = np.unique(df.Categorie3)
    if len(labels) == 1:
        joblib.dump((labels, None, None), fname)
        scv = -1
        sct = -1
        print('training', cat1, '\t\t(', i, ') : N=', len(df), 'K=', len(labels))
        print('training', cat1, '\t\t(', i, ') : training=', sct, 'validation=', scv)
        return (sct, scv)
    vec, X = vectorizer_stage3(df.txt)
    Y = df['Categorie3'].values
    cla = LogisticRegression(C=best_regularisation.get(cat1, 100))
    cla.fit(X, Y)
    labels = np.unique(df.Categorie3)
    sct = cla.score(X[:min(10000, len(df))], Y[:min(10000, len(df))])
    if len(dfv) == 0:
        scv = -1
    else:
        Xv = vec.transform(dfv.txt)
        Yv = dfv['Categorie3'].values
        scv = cla.score(Xv, Yv)
    print('training', cat1, '\t\t(', i, ') : N=', len(df), 'K=', len(labels))
    print('training', cat1, '\t\t(', i, ') : training=', sct, 'validation=', scv)
    joblib.dump((labels, vec, cla), fname)
    del vec, cla
    return (sct, scv)
def test_string_labels_refit_false():
    np.random.seed(123)
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()

    y_str = y.copy()
    y_str = y_str.astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'

    clf1.fit(X, y_str)
    clf2.fit(X, y_str)
    clf3.fit(X, y_str)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)
    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='soft',
                                  refit=False)
    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97
def test_performance_in_other_dataset(self):
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import precision_recall_fscore_support

    compr_matrix = self.W0s.get_value().T  # currently best compression
    AT_X_compr = np.dot(compr_matrix, AT_X.T).T
    # penalty='l1' requires a compatible solver such as liblinear
    clf = LogisticRegression(penalty='l1', solver='liblinear')
    # StratifiedShuffleSplit now lives in sklearn.model_selection; the old
    # cross_validation API (y=..., n_iter=...) was removed.
    folder = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    acc_list = []
    prfs_list = []
    for train_inds, test_inds in folder.split(AT_X_compr, AT_labels):
        clf.fit(AT_X_compr[train_inds, :], AT_labels[train_inds])
        pred_y = clf.predict(AT_X_compr[test_inds, :])

        acc = (pred_y == AT_labels[test_inds]).mean()
        prfs_list.append(precision_recall_fscore_support(
            AT_labels[test_inds], pred_y))
        acc_list.append(acc)

    compr_mean_acc = np.mean(acc_list)
    prfs = np.asarray(prfs_list).mean(axis=0)
    return compr_mean_acc, prfs
def check_lambda(dirnm, datanm_train, datanm_valid, datanm_orig_train,
                 datanm_orig_valid, samples_per_class, Cs, num_classes):
    spct = 10 * 70
    tdata, tlabels = load_full(dirnm + datanm_train, spct)
    print(tdata.shape, tlabels.shape)

    spct = 10
    otdata, otlabels = load_full(dirnm + datanm_orig_train, spct)

    spct = 10 * 30
    vdata, vlabels = load_full(dirnm + datanm_valid, spct)

    spct = 10
    ovdata, ovlabels = load_full(dirnm + datanm_orig_valid, spct)

    # artif
    ans = np.zeros((len(Cs), 4))
    for i, C in enumerate(Cs):
        clf = LogisticRegression(C=C, penalty='l2', multi_class='ovr',
                                 tol=0.001, n_jobs=-1, verbose=0,
                                 solver='newton-cg')
        clf.fit(tdata, tlabels)

        out_train = clf.predict_proba(tdata)
        out_valid = clf.predict_proba(vdata)
        out_train_real = clf.predict_proba(otdata)
        out_valid_real = clf.predict_proba(ovdata)

        ans[i, 0] += log_loss(tlabels, out_train)
        ans[i, 1] += log_loss(vlabels, out_valid)
        ans[i, 2] += log_loss(otlabels, out_train_real)
        ans[i, 3] += log_loss(ovlabels, out_valid_real)

    np.savez("logreg_lambda", ans=ans, Cs=Cs, num_classes=num_classes,
             samples_per_class=samples_per_class)
    return ans
def predict_lr(train_features, test_features, train_labels, test_labels):
    model = LogisticRegression()
    model.fit(train_features, train_labels)

    predictions = model.predict(train_features)
    print(get_accuracy(predictions, train_labels))

    predictions = model.predict(test_features)
    print(get_accuracy(predictions, test_labels))
def logit_clf(dataset, DV, train):
    start = time.time()
    # Load data into pandas
    data = pd.read_csv(dataset, index_col=0)
    data.columns = [camel_to_snake(col) for col in data.columns]
    # DV
    y = data[str(DV)]
    # Index subtraction (data.columns - [...]) was removed from pandas;
    # use difference() to drop the DV column.
    X = data[data.columns.difference([str(DV)])]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=0)
    model = LogisticRegression()
    if train == 'yes':
        model1 = model.fit(X_train, y_train)
        print("Classifier: Logistic Regression")
        print(pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_)))))
        end = time.time()
        print("Runtime, base model: %.3f" % (end - start), "seconds.")
        return model1
    elif train == 'no':
        model2 = model.fit(X, y)
        print("Classifier: Logistic Regression")
        print(pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_)))))
        end = time.time()
        print("Runtime, base model: %.3f" % (end - start), "seconds.")
        return model2
def check_vb(dirnm, datanm_train, datanm_valid, C, num_classes):
    spct = 10 * 70
    tdata, tlabels = load_full(dirnm + datanm_train, spct)
    # print(tdata.shape, tlabels.shape)

    spct = 10 * 30
    vdata, vlabels = load_full(dirnm + datanm_valid, spct)

    h = np.arange(0, 310, 10)
    h[0] += 1

    # artif
    ans = np.zeros((h.size, 2))
    tind = kget(tlabels, num_classes, h[-1])
    vind = kget(vlabels, num_classes, h[-1])
    for l in range(0, h.size):
        clf = LogisticRegression(C=C, penalty='l2', multi_class='ovr',
                                 tol=0.001, n_jobs=-1, verbose=0,
                                 solver='newton-cg')
        clf.fit(tdata[tind[:h[l] * num_classes]],
                tlabels[tind[:h[l] * num_classes]])

        out_train = clf.predict_proba(tdata[tind[:h[l] * num_classes]])
        out_valid = clf.predict_proba(vdata[vind[:h[l] * num_classes]])

        ans[l, 0] += log_loss(tlabels[tind[:h[l] * num_classes]], out_train)
        ans[l, 1] += log_loss(vlabels[vind[:h[l] * num_classes]], out_valid)

    np.savez("logreg_bv", ans=ans, C=C, num_classes=num_classes)
    return ans
def Predict_Survivors():
    data = open_data('Data/Titanic/train.csv')
    target = data[:, 0].astype(float)
    features = [row[1:] for row in data]

    # filter out class and sex and age as main features
    # [1, 2, 3 are first/second/third class] [0 is male, 1 is female] [0 is child, 1 is adult]
    filtered_features = strip_array(features)

    log_reg = LogisticRegression().fit(filtered_features, target)
    # predict_proba expects a 2-D array of samples
    print(log_reg.predict_proba([[3, 0]]))  # probability of third class male adult?
    print(log_reg.predict_proba([[1, 1]]))  # probability of first class female adult?

    data_test = open_data('Data/Titanic/test.csv')
    data_test_array = [row[0:] for row in data_test]
    data_test_array_stripped = strip_array(data_test_array)
    predicted = log_reg.predict(data_test_array_stripped)

    cv = cross_validation.cross_val_score(log_reg, filtered_features, target,
                                          cv=3, n_jobs=2)
    print(cv)

    np.savetxt('Data/Titanic/submission_titanic_class_gender_age.csv',
               predicted, delimiter=',', fmt='%d')
def readout_sk(self, X_train, X_test, y_train, y_test, **kwargs):
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression(**kwargs)
    lr.fit(X_train.T, y_train.T)
    y_train_predictions = lr.predict(X_train.T)
    y_test_predictions = lr.predict(X_test.T)
    # accuracy_score takes (y_true, y_pred); accuracy is symmetric in its
    # arguments, but keep the conventional order.
    return (accuracy_score(y_train.T, y_train_predictions),
            accuracy_score(y_test.T, y_test_predictions))
def test_ratio(self):
    '''
    Compare several competing methods changing the ratio of the positive
    class in the dataset. We use a binary class dataset for ease of
    interpretation.
    '''
    dataset = rcv1_binary_reader.toNumpy()
    set_size = 100

    X_train_full, y_train_full, X_test, y_test = dataset
    X_train, y_train = self.get_sub_set_with_size([X_train_full, y_train_full],
                                                  set_size)
    assert(len(y_train) == set_size)

    train_set = (X_train, y_train)
    test_set_original = (X_test, y_test)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    p = Prior(clf)

    for r in np.arange(0.05, 1.0, 0.05):
        # Generate a new test set with desired positive proportions.
        X_test_new, y_test_new = SetGen.with_pos_ratio(test_set_original, r,
                                                       pos_label=1)
        test_set = [X_test_new, y_test_new]

        true_pos = DE.arrayToDist(y_test_new)[1]
        p.fit(X_train, y_train, {-1: 1 - true_pos, 1: true_pos})
        y_pred = p.predict(X_test_new)
        cm = confusion_matrix(y_test_new, y_pred)
        acc = self.accuracy(cm)
        print(r, acc)
def get_predictions_and_actual_outcomes():
    """Get predictions for a particular model and the actual outcomes."""
    df_train_set, df_test_set = get_train_set_and_test_set_dataframes()
    y_true_train = df_train_set['decision'].values
    x_train = df_train_set.drop(['docket', 'decision'], axis=1).values
    y_true_test = df_test_set['decision'].values
    x_test = df_test_set.drop(['docket', 'decision'], axis=1).values

    lr_model = LogisticRegression()
    lr_model.fit(x_train, y_true_train)
    probs = lr_model.predict_proba(x_test)[:, 1]
    threshold = 0.65
    y_pred = probs > threshold

    predictions_and_actual_outcomes = []
    for docket, prediction in zip(df_test_set['docket'].tolist(), y_pred):
        if prediction:
            predicted_winning_side = 'petitioner'
        else:
            predicted_winning_side = 'respondent'
        actual_outcome = df_test_set[df_test_set['docket'] == docket]['decision'].values[0]
        if actual_outcome:
            actual_winning_side = 'petitioner'
        else:
            actual_winning_side = 'respondent'
        print(actual_winning_side)
        predictions_and_actual_outcomes.append(docket +
                                               ':' + predicted_winning_side +
                                               ':' + actual_winning_side)
    return '\n'.join(predictions_and_actual_outcomes)
def logit(X_train, y_train, arg):
    logr = LogisticRegression(C=arg)
    clf = logr.fit(X_train, y_train)
    return clf
def train_and_test(domain_dir, sentences):
    train_dir = os.path.join(domain_dir, "train")
    test_dir = os.path.join(domain_dir, "test")
    X_train, y_train = load_svmlight_file(os.path.join(train_dir, "feature_vector"))
    X_test, y_test = load_svmlight_file(os.path.join(test_dir, "feature_vector"))
    clf = LogisticRegression(C=1.0, intercept_scaling=1, dual=False,
                             fit_intercept=True, penalty="l2", tol=0.0001)
    print("fit..")
    clf.fit(X_train, y_train)
    print("fit end...")
    y_train_predict = clf.predict(X_train)
    print(f1_score(y_train, y_train_predict))
    y = clf.predict(X_test)
    f = open(os.path.join(test_dir, "relation.classifier"), "w", encoding="utf8")
    i = 0
    for sentence in sentences:
        flag = False
        str_list = []
        str_list.append("S\t{0}".format(sentence.text))
        for pair in sentence.candidate_relation:
            if y[i] != 0:
                flag = True
                str_list.append("R\t{0}\t{1}\t{2}\t{3}".format(
                    sentence.print_phrase(pair[0]).lower(),
                    sentence.print_phrase(pair[1]).lower(),
                    list(pair[0]), list(pair[1])))
            i += 1
        if flag:
            for s in str_list:
                print(s, file=f)
    f.close()
def Predict():
    USERAW = False
    clf = LogisticRegression(C=2.3, class_weight='auto')
    if USERAW:
        fio = fileio.RawInput('../data/alldata.csv', usePairs=True, useTrips=True)
        fio.df.to_csv('../data/tripsFractions.csv', index=False)
    else:
        fio = fileio.Preprocessed('../data/tripsFractions.csv')

    base = [201, 294, 260, 67, 220, 235, 7, 176, 290, 48, 309, 156, 66, 263,
            138, 262, 35, 18, 233, 208, 240, 338, 0, 210, 9, 295, 317]  # seed 410
    for b in base:
        print("%d. %s" % (b, fio.df.columns[b]))
    return  # NOTE: early return leaves the remainder unreachable

    fio.encode(base)
    train, truth = fio.transformTrain(base)
    c = classifier.Classifier(train, truth)
    prefix = 'lib/logr'
    c.validate(clf, nFolds=10, out=prefix + '.csv')
    score = c.holdout(clf, nFolds=10, fraction=0.2)
    print(score)
    if True:
        test = fio.transformTest(base)
        clf.fit(train, truth)
        y_ = clf.predict_proba(test)[:, 1]
        writeSubmission(y_, filename=prefix + 'Test.csv')
    return
def thresholds():
    for name in ['ant', 'ivy', 'jedit', 'lucene', 'poi']:
        print("##", name)
        train, test = explore(dir='../Data/Jureczko/', name=name)
        data_DF = csv2DF(train, toBin=True)
        metrics = [s[1:] for s in data_DF[data_DF.columns[:-1]]]
        ubr = LogisticRegression()
        X = data_DF[data_DF.columns[:-1]].values
        y = data_DF[data_DF.columns[-1]].values
        ubr.fit(X, y)
        inter, coef, pVal = ubr.intercept_[0], ubr.coef_[0], f_classif(X, y)[1]

        table = texttable.Texttable()
        table.set_cols_align(["l", "l", "l"])
        table.set_cols_valign(["m", "m", "m"])
        table.set_cols_dtype(['t', 't', 't'])
        table_rows = [["Metric", "Threshold", "P-Value"]]

        for i in range(len(metrics)):
            if VARL(coef[i], inter, p0=0.05) > 0 and pVal[i] < 0.05:
                thresh = "%0.2f" % VARL(coef[i], inter, p0=0.1)
                table_rows.append([metrics[i], thresh, "%0.3f" % pVal[i]])

        table.add_rows(table_rows)
        print(table.draw())

    # === DEBUG ===
    set_trace()
    return None
def test_curve_diffs(self):
    np.random.seed(0)
    clf = LogisticRegression()
    scikitplot.classifier_factory(clf)
    ax_micro = clf.plot_precision_recall_curve(self.X, self.y, curves='micro')
    ax_class = clf.plot_precision_recall_curve(self.X, self.y, curves='each_class')
    self.assertNotEqual(ax_micro, ax_class)
def test_topic_distribution(doc_topic_weights_filename, annotated_data_filename,
                            k, train_prop, num_repeat, column_of_interest):
    (X, Y) = process_dataset(doc_topic_weights_filename, annotated_data_filename,
                             k, column_of_interest)
    num_train = int(X.shape[0] * train_prop)

    # We repeat the experiments and report the average
    scores = []
    for i in range(num_repeat):
        print("Iteration: %d" % i)
        rng = np.random.RandomState(i)
        indices = np.arange(len(X))
        rng.shuffle(indices)

        # Divide the set into train and test sets
        # (slice from num_train, not num_train + 1, so no sample is skipped)
        X_train = X[indices[:num_train]]
        Y_train = Y[indices[:num_train]]
        X_test = X[indices[num_train:]]
        Y_test = Y[indices[num_train:]]

        # Build a classifier
        clf = LogisticRegression().fit(X_train, Y_train)

        # Make prediction
        predicted_labels = clf.predict(X_test)

        # Report the accuracy; f1_score expects (y_true, y_pred)
        true_labels = Y_test
        score = f1_score(true_labels, predicted_labels)
        scores.append(score)

    return sum(scores) / len(scores)
def evaluate(model, k=10, seed=1234, evalcv=True, evaltest=False):
    """
    Run experiment
    k: number of CV folds
    test: whether to evaluate on test set
    """
    print('Preparing data...')
    traintext, testtext = load_data()
    train, train_labels = prepare_data(traintext)
    test, test_labels = prepare_data(testtext)
    train_labels = prepare_labels(train_labels)
    test_labels = prepare_labels(test_labels)
    train, train_labels = shuffle(train, train_labels, random_state=seed)

    print('Computing training skipthoughts...')
    trainF = skipthoughts.encode(model, train, verbose=False, use_eos=False)

    if evalcv:
        print('Running cross-validation...')
        interval = [2 ** t for t in range(0, 9, 1)]  # coarse-grained
        C = eval_kfold(trainF, train_labels, k=k, scan=interval, seed=seed)

    if evaltest:
        if not evalcv:
            C = 128  # Best parameter found from CV

        print('Computing testing skipthoughts...')
        testF = skipthoughts.encode(model, test, verbose=False, use_eos=False)

        print('Evaluating...')
        clf = LogisticRegression(C=C)
        clf.fit(trainF, train_labels)
        yhat = clf.predict(testF)
        print('Test accuracy: ' + str(clf.score(testF, test_labels)))
def giniGrowth(df, woeVarsInfo, badFlag):
    woeTable = woeVarsInfo.copy()
    woeTable.variable = woeTable.variable.apply(lambda x: x + '_WOE')
    IV = getIVfromWOE(woeTable)
    columns = IV.variable
    columnsForModeling = []
    giniTest = []
    giniTrain = []
    y = df[badFlag].values
    for col in columns:
        columnsForModeling.append(col)
        X = df[columnsForModeling].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                            random_state=3)
        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        pr_test = lr.predict_proba(X_test)[:, 1]
        pr_train = lr.predict_proba(X_train)[:, 1]
        # Gini = 2 * AUC - 1
        rocGiniTest = met.roc_auc_score(y_test, pr_test) * 2 - 1
        rocGiniTrain = met.roc_auc_score(y_train, pr_train) * 2 - 1
        giniTest.append(rocGiniTest)
        giniTrain.append(rocGiniTrain)
    # Incremental gain in Gini from adding each successive variable
    trainDiff = [a - b for a, b in zip(giniTrain, [0] + giniTrain[:-1])]
    testDiff = [a - b for a, b in zip(giniTest, [0] + giniTest[:-1])]
    dfOut = pd.DataFrame({'variable': columns,
                          'giniTrain': giniTrain,
                          'giniTest': giniTest,
                          'trainDiff': trainDiff,
                          'testDiff': testDiff,
                          'informationValue': list(IV.InformationValue)})
    # reindex_axis was removed from pandas; reindex(columns=...) replaces it
    dfOut = dfOut.reindex(columns=['variable', 'informationValue', 'testDiff',
                                   'trainDiff', 'giniTest', 'giniTrain'])
    return dfOut
def test_scoring():
    X, y = iris_data()
    clf1 = LogisticRegression(random_state=1,
                              solver='liblinear',
                              multi_class='ovr')
    clf2 = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.5, random_state=123)

    score1 = clf1.fit(X_train, y_train).score(X_test, y_test)
    score2 = clf2.fit(X_train, y_train).score(X_test, y_test)

    assert round(score1, 2) == 0.96, round(score1, 2)
    assert round(score2, 2) == 0.91, round(score2, 2)

    t, p = paired_ttest_kfold_cv(estimator1=clf1,
                                 estimator2=clf2,
                                 X=X, y=y,
                                 scoring='accuracy',
                                 random_seed=1)

    assert round(t, 3) == -1.861, t
    assert round(p, 3) == 0.096, p

    t, p = paired_ttest_kfold_cv(estimator1=clf1,
                                 estimator2=clf2,
                                 X=X, y=y,
                                 scoring='recall_micro',
                                 random_seed=1)

    assert round(t, 3) == -1.861, t
    assert round(p, 3) == 0.096, p
def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags,
                 dual, C, penalty, fit_intercept, multi_class):
    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects

    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)

    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ,
                                            sparse=SPARSE_WD_FEATS)
    td_X, vd_X = (feature_transformer.fit_transform(td_feats),
                  feature_transformer.transform(vd_feats))

    """ compute most common tags per word for training only (but not for evaluation) """
    wd_td_ys = get_wordlevel_mostfrequent_ys(td_tags, wd_train_tags, tag_freq)

    """ TRAIN Tagger """
    solver = 'liblinear'
    if multi_class == 'multinomial':
        solver = "lbfgs"
    model = LogisticRegression(dual=dual, C=C, penalty=penalty,
                               fit_intercept=fit_intercept,
                               multi_class=multi_class, solver=solver)
    if fold == 0:
        print(model)

    model.fit(td_X, wd_td_ys)

    wd_td_pred = model.predict(td_X)
    wd_vd_pred = model.predict(vd_X)

    """ TEST Tagger """
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_td_pred,
                                                                      wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_vd_pred,
                                                                      wd_test_tags)

    """ Get Actual Ys by code (dict of label to predictions) """
    wd_td_ys_by_code = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_by_code = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

    return (td_wd_predictions_by_code, vd_wd_predictions_by_code,
            wd_td_ys_by_code, wd_vd_ys_by_code)
class SimpleGB(BaseEstimator):
    def __init__(self, tree_params_dict, iters, tau):
        self.tree_params_dict = tree_params_dict
        self.iters = iters
        self.tau = tau

    def fit(self, X_data, y_data):
        self.base_algo = LogisticRegression(C=0.0005).fit(X_data, y_data)
        self.estimators = []
        # p = 1 / (1 + exp(-a))  =>  a = -ln(1 / p - 1) = ln(p / (1 - p))
        curr_pred = -np.log(1. / self.base_algo.predict_proba(X_data)[:, 1] - 1)
        for iter_num in range(self.iters):
            # y is 0 or 1
            # a is the raw prediction (logit)
            # f(a) = 1 / (1 + exp(-a)) converts it to a probability
            # f'(a) = f(a) * (1 - f(a))
            # the log loss is -(y log f(a) + (1 - y) log(1 - f(a)))
            # its derivative w.r.t. a is
            #   -f'(a) * (y / f(a) - (1 - y) / (1 - f(a))) = f(a) - y
            fa = 1. / (1 + np.exp(-curr_pred))
            grad = -fa * (1. - fa) * (y_data / fa - (1. - y_data) / (1. - fa))
            # fit each tree to the negative gradient (the residual y - f(a))
            algo = DecisionTreeRegressor(**self.tree_params_dict).fit(X_data, -grad)
            self.estimators.append(algo)
            curr_pred += self.tau * algo.predict(X_data)
        return self

    def predict(self, X_data):
        res = -np.log(1. / self.base_algo.predict_proba(X_data)[:, 1] - 1)
        for estimator in self.estimators:
            res += self.tau * estimator.predict(X_data)
        # this threshold can be tuned to improve the metric
        return res > 0.1
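# A minimal usage sketch for SimpleGB on synthetic data, assuming the class
# above is in scope along with its imports (numpy, LogisticRegression,
# DecisionTreeRegressor). The tree parameters, iteration count, and learning
# rate tau below are illustrative, not tuned values from the original code.
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_toy, y_toy = make_classification(n_samples=1000, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_toy, y_toy, random_state=0)

gb = SimpleGB(tree_params_dict={'max_depth': 3}, iters=50, tau=0.1)
gb.fit(X_tr, y_tr)
# predict() returns booleans (res > 0.1), which accuracy_score accepts
print('held-out accuracy:', accuracy_score(y_te, gb.predict(X_te)))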
def test_multiclass_classifier_class_weight():
    """tests multiclass with classweights for each class"""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]

    coef1 = []
    intercept1 = []
    coef2 = []
    intercept2 = []
    for cl in classes:
        y_encoded = np.ones(n_samples)
        y_encoded[y != cl] = -1

        spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight)
        spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight,
                                              sparse=True)
        coef1.append(spweights1)
        intercept1.append(spintercept1)
        coef2.append(spweights2)
        intercept2.append(spintercept2)

    coef1 = np.vstack(coef1)
    intercept1 = np.array(intercept1)
    coef2 = np.vstack(coef2)
    intercept2 = np.array(intercept2)

    for i, cl in enumerate(classes):
        assert_array_almost_equal(clf1.coef_[i].ravel(),
                                  coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)

        assert_array_almost_equal(clf2.coef_[i].ravel(),
                                  coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
def make_classifier(min_word_size, stemmer):
    print("Training the classifier...")
    # Build X matrix of vector representations of review files,
    # and y vector of labels
    pos_file_names = get_file_names('pos')
    neg_file_names = get_file_names('neg')

    # m is the number of training examples
    m_pos = len(pos_file_names)
    m_neg = len(neg_file_names)
    m = m_pos + m_neg

    pos_labels = np.ones(m_pos)
    neg_labels = -np.ones(m_neg)
    y = np.concatenate((pos_labels, neg_labels), axis=0)

    # get dimensions of data
    dimensions = len(vocab)

    # initialize X
    X = np.zeros((m, dimensions))

    message = "{:.2%} percent done\r"

    # build X
    for i in range(m_pos):
        X[i, :] = vectorize(pos_file_names[i], min_word_size, stemmer)
        sys.stdout.write(message.format(i / float(m)))
        sys.stdout.flush()
    for j in range(m_neg):
        X[j + m_pos, :] = vectorize(neg_file_names[j], min_word_size, stemmer)
        sys.stdout.write(message.format((m_pos + j) / float(m)))
        sys.stdout.flush()

    # make the logistic regression function
    lr = LR()
    lr.fit(X, y)
    return lr
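# A hedged alternative sketch, not part of the original code: scikit-learn's
# CountVectorizer builds the same kind of document-term matrix as the manual
# vectorize() loop above, without a hand-maintained vocab. File-reading
# details are assumptions; get_file_names() is the helper used by
# make_classifier(), and files are assumed to contain raw review text.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression as LR
import numpy as np

def make_classifier_cv():
    pos_file_names = get_file_names('pos')
    neg_file_names = get_file_names('neg')
    texts = [open(fn).read() for fn in pos_file_names + neg_file_names]
    y = np.concatenate((np.ones(len(pos_file_names)),
                        -np.ones(len(neg_file_names))))
    X = CountVectorizer().fit_transform(texts)  # sparse document-term matrix
    return LR().fit(X, y)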
def fit(x, y):
    # evaluate the model by splitting into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                        random_state=0)
    model = LogisticRegression()
    model.fit(x_train, y_train)

    # predict class labels for the test set
    predicted = model.predict(x_test)
    print(predicted)

    # generate class probabilities
    pr = model.predict_proba(x_test)
    print(pr)

    # generate evaluation metrics
    print(metrics.accuracy_score(y_test, predicted))
    print(metrics.roc_auc_score(y_test, pr[:, 1]))
    print(metrics.confusion_matrix(y_test, predicted))
    print(metrics.classification_report(y_test, predicted))

    # evaluate the model using 10-fold cross-validation
    scores = cross_val_score(LogisticRegression(), x, y, scoring='accuracy', cv=10)
    print(scores)
    print(scores.mean())
def test_do_cv(self):
    np.random.seed(0)
    clf = LogisticRegression()
    scikitplot.classifier_factory(clf)
    ax = clf.plot_precision_recall_curve(self.X, self.y)
    self.assertRaises(AttributeError, clf.plot_precision_recall_curve,
                      self.X, self.y, do_cv=False)
# The runClassification function will accept a list with the classifiers that #
# we wish to run.                                                             #
################################################################################
# TODO: Abstract these following classifiers so they can be passed into
# runClassification with their own parameters
clfs = {
    'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
    'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
    'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME", n_estimators=200),
    'LR': LogisticRegression(penalty='l1', C=1e5),
    'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
    'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                     max_depth=6, n_estimators=10),
    'NB': GaussianNB(),
    'DT': DecisionTreeClassifier()
}

attributes = {
    'study_hrs': 0,
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# TF-IDF (fit on the training split only, to avoid leaking test-set statistics)
vectorizer = TfidfVectorizer(min_df=10)
vectorizer = vectorizer.fit(X_train)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Dimensionality Reduction
lda = LinearDiscriminantAnalysis(n_components=10)
lda = lda.fit(X_train_tfidf.toarray(), y_train)
X_train_lda = lda.transform(X_train_tfidf.toarray())
X_test_lda = lda.transform(X_test_tfidf.toarray())

# Machine Learning
clf = LogisticRegression(max_iter=10000).fit(X_train_lda, y_train)

# Results (predict returns class labels directly, so no rounding is needed)
X_train_pred = clf.predict(X_train_lda)
X_test_pred = clf.predict(X_test_lda)
print(classification_report(y_train, X_train_pred))
print(classification_report(y_test, X_test_pred))

#%% Alternate Result Printing
results = pd.DataFrame(zip(y_test, X_test_pred))
results[2] = results[1] - results[0]
results = pd.DataFrame(dict(Counter(results[2])).items()).sort_values(1)
results[0] = results[0].apply(np.abs)
results = results.groupby(0).sum()
total = results.sum().item()
results["diff"] = results[1] / total
def test_check_estimator():
    # tests that the estimator actually fails on "bad" estimators.
    # not a complete test of all checks, which are very extensive.

    # check that we have a set_params and can clone
    msg = "it does not implement a 'get_params' methods"
    assert_raises_regex(TypeError, msg, check_estimator, object)
    assert_raises_regex(TypeError, msg, check_estimator, object())
    # check that values returned by get_params match set_params
    msg = "get_params result does not match what was passed to set_params"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ModifiesValueInsteadOfRaisingError())
    assert_warns(UserWarning, check_estimator, RaisesErrorInSetParams())
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ModifiesAnotherValue())
    # check that we have a fit method
    msg = "object has no attribute 'fit'"
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator)
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator())
    # check that fit does input validation
    msg = "ValueError not raised"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier())
    # check that sample_weights in fit accepts pandas.Series type
    try:
        from pandas import Series  # noqa
        msg = ("Estimator NoSampleWeightPandasSeriesType raises error if "
               "'sample_weight' parameter is of type pandas.Series")
        assert_raises_regex(ValueError, msg, check_estimator,
                            NoSampleWeightPandasSeriesType)
    except ImportError:
        pass
    # check that predict does input validation (doesn't accept dicts in input)
    msg = "Estimator doesn't check for NaN and inf in predict"
    assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        NoCheckinPredict())
    # check that estimator state does not change
    # at transform/predict/predict_proba time
    msg = 'Estimator changes __dict__ during predict'
    assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict)
    # check that `fit` only changes attributes that
    # are private (start with an _ or end with a _).
    msg = ('Estimator ChangesWrongAttribute should not change or mutate '
           'the parameter wrong_attribute from 0 to 1 during fit.')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ChangesWrongAttribute)
    check_estimator(ChangesUnderscoreAttribute)
    # check that `fit` doesn't add any public attribute
    msg = (r'Estimator adds public attribute\(s\) during the fit method.'
           ' Estimators are only allowed to add private attributes'
           ' either started with _ or ended'
           ' with _ but wrong_attribute added')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        SetsWrongAttribute)
    # check for invariant method
    name = NotInvariantPredict.__name__
    method = 'predict'
    msg = ("{method} of {name} is not invariant when applied "
           "to a subset.").format(method=method, name=name)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        NotInvariantPredict)
    # check for sparse matrix input handling
    name = NoSparseClassifier.__name__
    msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name
    # the check for sparse input handling prints to the stdout,
    # instead of raising an error, so as not to remove the original traceback.
    # that means we need to jump through some hoops to catch it.
    old_stdout = sys.stdout
    string_buffer = StringIO()
    sys.stdout = string_buffer
    try:
        check_estimator(NoSparseClassifier)
    except Exception:
        pass
    finally:
        sys.stdout = old_stdout
    assert msg in string_buffer.getvalue()

    # Large indices test on bad estimator
    msg = ('Estimator LargeSparseNotSupportedClassifier doesn\'t seem to '
           r'support \S{3}_64 matrix, and is not failing gracefully.*')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        LargeSparseNotSupportedClassifier)

    # does error on binary_only untagged estimator
    msg = 'Only 2 classes are supported'
    assert_raises_regex(ValueError, msg, check_estimator,
                        UntaggedBinaryClassifier)

    # non-regression test for estimators transforming to sparse data
    check_estimator(SparseTransformer())

    # doesn't error on actual estimator
    check_estimator(LogisticRegression)
    check_estimator(LogisticRegression(C=0.01))
    check_estimator(MultiTaskElasticNet)
    check_estimator(MultiTaskElasticNet())

    # doesn't error on binary_only tagged estimator
    check_estimator(TaggedBinaryClassifier)

    # Check regressor with requires_positive_y estimator tag
    check_estimator(RequiresPositiveYRegressor)
def model():
    training = pd.read_excel('model/Training.xlsx')
    try:
        training.drop(columns=['year'], inplace=True)
    except KeyError:
        pass

    # In[2]:
    y = training['Promotion']
    X = training
    X.drop(columns=['Promotion'], inplace=True)
    clf = LogisticRegression(random_state=0, max_iter=60000).fit(X, y)

    # In[9]:
    r = []
    z = clf.intercept_
    for x in X.columns.values:
        if x not in r:
            r.append(x)
    print(r)
    Model = pd.DataFrame(clf.coef_, columns=[r])
    Model["Interception"] = z
    Model.to_excel('model/Coeficients.xlsx')

    # In[8]:
    test = pd.read_excel('model/Testing.xlsx')
    try:
        test.drop(columns=['year', 'Unnamed: 0'], inplace=True)
    except KeyError:
        pass
    y1 = test['Promotion']
    X1 = test
    X1.drop(columns=['Promotion'], inplace=True)

    # Add any training columns missing from the test set, filled with zeros
    r1 = list(X.columns.values)
    r2 = list(X1.columns.values)
    r = list(set(r) - set(r2))
    for x in r:
        X1[x] = 0

    # In[29]:
    X1 = X1.reindex(columns=r1)
    for x in r1:
        X1[x].fillna(0, inplace=True)

    test_score = clf.score(X1, y1)
    training_score = clf.score(X, y)
    E = []
    E.append(training_score)
    E.append(test_score)
    Errors = pd.DataFrame(E, columns=["error"], index=["Training", "Testing"])
    Errors.to_excel('model/Errors.xlsx')

    # In[30]:
    y_P = clf.predict(X)
    f1 = f1_score(y, y_P, average='macro')
    recal = recall_score(y, y_P, average='macro')
    CM = confusion_matrix(y, y_P)

    y_P = clf.predict(X1)
    f1_t = f1_score(y1, y_P, average='macro')
    recal_t = recall_score(y1, y_P, average='macro')
    CM_T = confusion_matrix(y1, y_P)

    Va = []
    Va.append(recal)
    Va.append(f1)
    Va.append(recal_t)
    Va.append(f1_t)
    Stats = pd.DataFrame(Va, columns=["Value"],
                         index=["Recall", "F1", "Recall Test", "F1 Test"])
    Stats.to_excel('model/Stats.xlsx')
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train[num_col] = scaler.fit_transform(X_train[num_col])

# ### Logistic Regression

# In[38]:

# Import 'LogisticRegression' and create a LogisticRegression object
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(class_weight='balanced')

# In[39]:

# Import RFE and select 15 variables
from sklearn.feature_selection import RFE
rfe = RFE(logreg, n_features_to_select=15)  # running RFE with 15 variables as output
rfe = rfe.fit(X_train, y_train)

# In[40]:
print(classification_report(y_test, y_pred))

# Building a logistic regression model
# Import the necessary modules
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                    random_state=42)

# Create the classifier: logreg
logreg = LogisticRegression()

# Fit the classifier to the training data
logreg.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = logreg.predict(X_test)

# Compute and print the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Plotting an ROC curve
# Import necessary modules
from sklearn.metrics import roc_curve
vectorizer = DictVectorizer()
X_train_vec = vectorizer.fit_transform(train_as_dicts)  # 380855 x 26828
test_as_dicts = [dict(r.items()) for _, r in X_test.fillna(-1).iterrows()]
X_test_vec = vectorizer.transform(test_as_dicts)

def Label_enc(df):
    le = LabelEncoder()
    le.fit(df.unique())
    vec = le.transform(df.values)
    return vec

y_train_vec = Label_enc(y_train)
y_test_vec = Label_enc(y_test)

#%%
logistic = LogisticRegression(multi_class='multinomial', solver='sag')
logistic.fit(X_train_vec, y_train_vec)
y_pred_log = logistic.predict(X_test_vec)

target_names = ['Charged Off', 'Current', 'Default', 'Fully Paid',
                'In Grace Period', 'Issued', 'Late (16-30 days)',
                'Late (31-120 days)']
print(classification_report(y_test_vec, y_pred_log, target_names=target_names))

#%%
# Compute ROC curve and ROC area for each class
y_test_bin = label_binarize(y_test_vec, classes=[0, 1, 2, 3, 4, 5, 6, 7])
y_score = logistic.decision_function(X_test_vec)
fpr = dict()
tpr = dict()
roc_auc = dict()
    n_samples=data_points
)  # to match minority class
# reproducible results
data = pd.concat([data_downsampled, alt_data])

x_data = data[training_head]
x_data = x_data.values.astype(float)
y_data = data["actual_use"]
y_data = y_data.values.astype(float)

# alternatives tried here: tree.DecisionTreeClassifier(), SVC(kernel="linear")
estimator = LogisticRegression(fit_intercept=True)

rfecv = RFECV(estimator, step=1, cv=StratifiedKFold(2))
rfecv.fit(x_data, y_data)
print('number of features selected:', rfecv.n_features_)
x_new = rfecv.transform(x_data)

selected_inds = rfecv.get_support(indices=True)
selected_ranks = rfecv.ranking_
selected_feats = [training_head[ind] for ind in selected_inds]
#print(selected_feats)
#print(rfecv.estimator_.coef_)
y_cv.value_counts()

# %%
y_test.value_counts()

# %% [markdown]
# ### 4.1.1.1. Logistic Regression

# %%
C = [math.pow(base, i) for i in range(-6, 6)]
# H = [round(math.log(i,10)) for i in C]
tuned_parameters = [{'C': C},
                    {'penalty': ['l1', 'l2']},
                    {'class_weight': [None, 'balanced']}]
C = [round(math.log(i, base)) for i in C]

clf = GridSearchCV(LogisticRegression(),
                   tuned_parameters,
                   cv=cv,
                   scoring='recall',
                   n_jobs=7,
                   verbose=10)
clf.fit(X_train_std, y_train)
# plot_grid_search(clf, X_train, y_train, C)
print(clf.best_estimator_)
print(clf.best_params_)

best_estimator = clf.best_estimator_
calib = CalibratedClassifierCV(best_estimator, cv=cv, method='sigmoid')
calib.fit(X_train_std, y_train)
plot_confusion_matrix(y_train, calib.predict(X_train_std),
                      y_test, calib.predict(X_test_std))

# %%
threshold, cost = plot_precision_recall_costs(calib, X_cv_std, y_cv)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    stratify=y, random_state=5)

model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

print("Results\n")
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))
feature_names = ['statuses_count', 'followers_count', 'followees_count',
                 'favorites_count', 'listed_count', 'betweenness', 'sentiment',
                 'subjectivity', 'time_diff', 'time_diff_median', 'tweet number',
                 'retweet number', 'quote number', 'number urls',
                 'number hashtags', 'status length', 'baddies', 'mentions']
x = df_clean[feature_names]
y = df_clean['hate']

# drop rows containing NaNs
mask = ~np.any(np.isnan(x), axis=1)
x = x[mask]
y = y[mask]

scaler = RobustScaler()
x = scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
print(classification_report(y_test, y_pred))
plot_confusion_matrix(confusion_matrix(y_test, y_pred))
with open('settings/specifications.json') as f:
    specifications = json.load(f)

raw_train = pd.read_csv(TRAIN_CSV)
x_columns = specifications['description']['X']
y_column = specifications['description']['y']

x_raw = raw_train[x_columns]
loader = DataLoader()
loader.fit(x_raw)
X = loader.load_data()
y = raw_train.Response

model = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
model.fit(X, y)
with open('models/log_reg.pickle', 'wb') as f:
    pickle.dump(model, f)

import pickle
import json
import pandas as pd
from sklearn.svm import SVC
from utils.dataloader import DataLoader
yTrain = copy.deepcopy(labels_train)
yTest = copy.deepcopy(labels_test)

# Binarize the labels: 1 for the current class i, 0 for all others
print('=======>yTrain')
for j in range(0, 60000):
    if yTrain[j] == i:
        yTrain[j] = 1
    else:
        yTrain[j] = 0

# print('=======>yTest')
for j in range(0, 10000):
    if yTest[j] == i:
        yTest[j] = 1
    else:
        yTest[j] = 0

logitL1 = LogisticRegression(penalty='l1', solver='liblinear', C=100)
logitL1.fit(images_train, yTrain)
print('Class:', i)
# Score the one-vs-rest classifier against the binarized labels;
# scoring against the original multiclass labels would be meaningless.
trainScore = logitL1.score(np.array(images_train).reshape(60000, 784),
                           np.array(yTrain).reshape(60000, 1))
score = logitL1.score(np.array(images_test).reshape(10000, 784),
                      np.array(yTest).reshape(10000, 1))
print('Train Score:', trainScore)
print('Test Score:', score)
print('`````````````````````````````````````````````')
def predictor_func(val1, val2):
    # To add a new cell, type '# %%'
    # To add a new markdown cell, type '# %% [markdown]'
    # %%
    from IPython import get_ipython

    # %% [markdown]
    # # Logistic Regression
    #
    # Logistic Regression is a statistical method for predicting binary outcomes from data.
    #
    # Examples of this are "yes" vs "no" or "young" vs "old".
    #
    # These are categories that translate to probability of being a 0 or a 1.
    #
    # Source: [Logistic Regression](https://towardsdatascience.com/real-world-implementation-of-logistic-regression-5136cefb8125)

    # %% [markdown]
    # We can calculate logistic regression by adding an activation function as the final step to our linear model.
    #
    # This converts the linear regression output to a probability.

    # %%
    # get_ipython().run_line_magic('matplotlib', 'inline')
    import matplotlib.pyplot as plt
    import pandas as pd

    # %% [markdown]
    # Linear Regression:
    # <br>Y = β0 + β1X
    # <br>Depending on the values of X (explanatory variable), the predicted values for Y (response variable) may fall outside of [0, 1]
    # <br>Changes in X have a linear effect on estimated probabilities
    # <br>Coefficients are easy to interpret, i.e., the change in Y when X increases by one unit
    #
    # <br>Logistic Regression:
    # <br>P(Y=1) = e^(β0+β1X) / (1 + e^(β0+β1X))
    # <br>Predicted values always fall in [0, 1]
    # <br>Changes in X can have a different effect on probabilities for different levels of X
    # <br>So, how to interpret coefficients?
    # <br>The odds ratio for the estimated coefficient b1 is e^b1
    # <br>
    # <br>Probabilities (Wins / (Wins + Losses))
    # <br>vs
    # <br>Odds (Wins / Losses)
    # <br>
    # <br>0.50  1/2 --- 0.50/(1-0.50)  1
    # <br>0.33  1/3 --- 0.33/(1-0.33)  1/2
    # <br>0.66  2/3 --- 0.66/(1-0.66)  2
    # <br>0.20  1/5 --- 0.20/(1-0.20)  1/4

    # %%
    from sklearn.datasets import make_blobs
    X, y = make_blobs(centers=2, random_state=42)

    print(f"Labels: {y[:10]}")
    print(f"Data: {X[:10]}")

    # %%
    # Visualizing both classes
    plt.scatter(X[:, 0], X[:, 1], c=y)

    # %% [markdown]
    # Split our data into training and testing

    # %%
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # %% [markdown]
    # Create a Logistic Regression Model

    # %%
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression()
    classifier

    # %% [markdown]
    # Fit (train) our model using the training data

    # %%
    classifier.fit(X_train, y_train)

    # %% [markdown]
    # Validate the model using the test data

    # %%
    # Mean Accuracy
    print(f"Training Data Score: {classifier.score(X_train, y_train)}")
    print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

    # %% [markdown]
    # Make predictions

    # %%
    # Generate new data points (the red circles)
    import numpy as np
    new_data1 = np.array([[-2, 6]])
    new_data2 = np.array([[-1, 6]])
    new_data3 = np.array([[1, 6]])
    new_data4 = np.array([[3, 6]])
    new_data5 = np.array([[val1, val2]])
    plt.scatter(X[:, 0], X[:, 1], c=y)
    plt.scatter(new_data1[0, 0], new_data1[0, 1], c="r", marker="o", s=100)
    plt.scatter(new_data2[0, 0], new_data2[0, 1], c="r", marker="o", s=100)
    plt.scatter(new_data3[0, 0], new_data3[0, 1], c="r", marker="o", s=100)
    plt.scatter(new_data4[0, 0], new_data4[0, 1], c="r", marker="o", s=100)

    # Predict the class (purple or yellow) of the new data points;
    # odds = p / (1 - p)
    prediction1 = classifier.predict(new_data1)
    pred_prob1 = classifier.predict_proba(new_data1)[:, 1]
    pred_odds1 = pred_prob1 / (1 - pred_prob1)

    prediction2 = classifier.predict(new_data2)
    pred_prob2 = classifier.predict_proba(new_data2)[:, 1]
    pred_odds2 = pred_prob2 / (1 - pred_prob2)

    prediction3 = classifier.predict(new_data3)
    pred_prob3 = classifier.predict_proba(new_data3)[:, 1]
    pred_odds3 = pred_prob3 / (1 - pred_prob3)

    prediction4 = classifier.predict(new_data4)
    pred_prob4 = classifier.predict_proba(new_data4)[:, 1]
    pred_odds4 = pred_prob4 / (1 - pred_prob4)

    prediction5 = classifier.predict(new_data5)
    pred_prob5 = classifier.predict_proba(new_data5)[:, 1]
    pred_odds5 = pred_prob5 / (1 - pred_prob5)

    # %%
    print("Classes are either 0 (purple) or 1 (yellow)")
    print(f"The new point estimated probability is: {pred_prob1} {pred_prob2} {pred_prob3} {pred_prob4} {pred_prob5}")
    print(f"The new point estimated odds is: {pred_odds1} {pred_odds2} {pred_odds3} {pred_odds4} {pred_odds5}")
    print(f"The new point was classified as: {prediction1} {prediction2} {prediction3} {prediction4} {prediction5}")

    # %%
    result_response = {
        "probability": f'{round(float(pred_prob5[0]), 4) * 100}%',
        "prediction": int(prediction5[0])
    }

    # %%
    return result_response
def train_emotions(train, test, input):
    x_train = train['Testo_stringa']
    y_train = train['Genere']
    x_test = test['Testo_stringa']
    y_test = test['Genere']

    if input == "MNB":
        print("Multinomial Naive Bayes Classifier")
        mnb_model = Pipeline(steps=[
            ("combined_features", TfidfVectorizer(ngram_range=(1, 2))),
            ("classifier", MultinomialNB()),
        ])
        mnb_model.fit(x_train, y_train)
        y_pred = mnb_model.predict(x_test)
        print("Classification report: %s" % (classification_report(y_test, y_pred)))
        print("accuracy for multinomial naive bayes: %s" % mnb_model.score(x_test, y_test))
        cm = confusion_matrix(y_test, y_pred)
        conf_matr(input, cm, y_test, y_pred)

    if input == "LR":
        print("Logistic Regression Classifier")
        lr_model = Pipeline(steps=[
            ("features", TfidfVectorizer(ngram_range=(1, 2))),
            ("classifier", LogisticRegression(solver="liblinear", multi_class="ovr")),
        ])
        lr_model.fit(x_train, y_train)
        y_pred = lr_model.predict(x_test)
        print("Classification report: %s" % (classification_report(y_test, y_pred)))
        print("accuracy for LogisticRegression: %s" % (lr_model.score(x_test, y_test)))
        cm = confusion_matrix(y_test, y_pred)
        conf_matr(input, cm, y_test, y_pred)

    if input == 'DT':
        print("Decision Tree Classifier")
        dt_model = Pipeline(steps=[
            ("features", TfidfVectorizer(ngram_range=(1, 2))),
            ("classifier", DecisionTreeClassifier(max_depth=2)),
        ])
        dt_model.fit(x_train, y_train)
        y_pred = dt_model.predict(x_test)
        print("Classification report: %s" % (classification_report(y_test, y_pred)))
        print("accuracy for Decision Tree %s" % (dt_model.score(x_test, y_test)))
        cm = confusion_matrix(y_test, y_pred)
        conf_matr(input, cm, y_test, y_pred)

    if input == 'SVC':
        print("Support Vector Classifier")
        svc_model = Pipeline(steps=[
            ("features", TfidfVectorizer(ngram_range=(1, 2))),
            ("classifier", SVC(kernel='linear', C=1)),
        ])
        svc_model.fit(x_train, y_train)
        y_pred = svc_model.predict(x_test)
        print("Classification report: %s" % (classification_report(y_test, y_pred)))
        print("accuracy for Support Vector Classifier %s" % (svc_model.score(x_test, y_test)))
        cm = confusion_matrix(y_test, y_pred)
        conf_matr(input, cm, y_test, y_pred)

    if input == 'KNN':
        print("K-Neighbors Classifier")
        knn_model = Pipeline(steps=[
            ("features", TfidfVectorizer(ngram_range=(1, 2))),
            ("classifier", KNeighborsClassifier(n_neighbors=7)),
        ])
        knn_model.fit(x_train, y_train)
        y_pred = knn_model.predict(x_test)
        print("Classification report: %s" % (classification_report(y_test, y_pred)))
        print("accuracy for K-Neighbors Classifier %s" % (knn_model.score(x_test, y_test)))
        cm = confusion_matrix(y_test, y_pred)
        conf_matr(input, cm, y_test, y_pred)

    return
                              random_state=0)
model.fit(data)
score = consensus_score(model.biclusters_, (rows[:, row_idx], columns[:, col_idx]))

print("consensus score: {:.1f}".format(score))

fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
                     np.sort(model.column_labels_) + 1),
            cmap=plt.cm.Blues)
plt.title("Checkerboard structure of rearranged data")
plt.show()

X, y = make_classification(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)  # apply scaling on training data
# Pipeline(steps=[('standardscaler', StandardScaler()),
#                 ('logisticregression', LogisticRegression())])
pipe.score(X_test, y_test)  # apply scaling on testing data, without leaking training data
def LOG(cls, X_train, Y_train):
    log = LogisticRegression()
    log.fit(X_train, Y_train)
    cls.save(log, 'LOG')
    return log
separator.predict_proba(X_test)[0:5]
# confusion_matrix expects (y_true, y_pred)
confusion_matrix(y_test, yhat)

# # Logistic Regression

# In[30]:
from sklearn.linear_model import LogisticRegression

# In[31]:
LR = LogisticRegression(C=0.01, solver='lbfgs').fit(X_train, y_train)
yhat = LR.predict(X_test)
y_prob = LR.predict_proba(X_test)

# In[32]:
# Plot non-normalized confusion matrix
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
plt.figure()
print('Accuracy score for training set: {:.2f}'.format(
    accuracy_score(y_true=y_train, y_pred=LR.predict(X_train))))
print('Accuracy score for test set: {:.2f}'.format(
    accuracy_score(y_true=y_test, y_pred=yhat)))
print(classification_report(y_test, yhat))
confusion_matrix(y_test, yhat, labels=['COLLECTION', 'PAIDOFF'])
gs = GridSearchCV(estimator=Ridge(), param_grid=param_grid, cv=10)
result = gs.fit(diabetes.data, diabetes.target)

print("Best score: {}".format(result.best_score_))
print("Best parameters: {}".format(result.best_params_))
print(gs.best_estimator_)
pd.DataFrame(result.cv_results_)

# GridSearchCV using multiprocessing
import multiprocessing
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
param_grid = [{'penalty': ['l1', 'l2'],
               'C': [1.5, 2.0, 2.5, 3.0, 3.5]}]
gs = GridSearchCV(estimator=LogisticRegression(),
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=multiprocessing.cpu_count())
result = gs.fit(iris.data, iris.target)

print("Best score: {}".format(result.best_score_))
print("Best parameters: {}".format(result.best_params_))
print(gs.best_estimator_)
pd.DataFrame(result.cv_results_)

# preprocessing: the data preprocessing module
# - Standardization and normalization are the usual approaches to feature scaling
# - scikit-learn's normalization rescales individual vectors to a common norm
plt.title('Years spent in the \n company without leaving (STAYING)')
plt.xlabel('Years')
plt.ylabel('No. of employees')
plt.show()

# Prediction Analysis
feats = ['sales', 'salary']
df_final = pd.get_dummies(data, columns=feats, drop_first=True)

X = df_final.drop(['left'], axis=1).values
y = df_final['left'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Rf = RandomForestClassifier()
Rf.fit(X_train, y_train)
Lr = LogisticRegression()
Lr.fit(X_train, y_train)
clf = neighbors.KNeighborsClassifier()
clf.fit(X_train, y_train)
svc_linear = SVC()
svc_linear.fit(X_train, y_train)

print("Random Forest Classifier accuracy :", Rf.score(X_test, y_test))
print("Logistic Regression accuracy :", Lr.score(X_test, y_test))
print("KNeighborsClassifier accuracy :", clf.score(X_test, y_test))
print("SVC accuracy :", svc_linear.score(X_test, y_test))

new_pred = np.array([[0.26, 0.7, 3., 238., 6., 0., 0., 0., 0., 0., 0., 0., 0.,
                      0., 1., 0., 0., 1.]])
prediction = Rf.predict(new_pred)
print('RandomForest new_pred :', prediction)
print_classifier_metrics(y_test, y_pred_cv, name="Best Gamma SVM")
cv_cm = confusion_matrix(y_test, y_pred_cv)  # Best Gamma SVM confusion matrix
plt.figure()
plot_confusion_matrix(cv_cm, classes=class_names,
                      title='Best Gamma SVM Confusion Matrix')
plot_roc_curve(y_test, clf.best_estimator_.decision_function(X_test_LSI),
               name="Best Gamma SVM")  # Best Gamma SVM ROC curve

########################################################## QUESTION 5
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=10**10,
                        random_state=42)  # Logistic regression without regularization
y_pred_lr = lr.fit(X_train_LSI, y_train).predict(X_test_LSI)
print("Coefficients learned by logistic regression without regularization: ",
      lr.coef_)
print_classifier_metrics(y_test, y_pred_lr,
                         name="Logistic Regression without regularization")
lr_cm = confusion_matrix(
    y_test, y_pred_lr)  # logistic regression without regularization confusion matrix
plt.figure()
plot_confusion_matrix(lr_cm, classes=class_names,
                      title='Logistic Regression Confusion Matrix')
plot_roc_curve(y_test, lr.decision_function(X_test_LSI),
pred_statsmod = result.predict(X_statsmod)
# Code admission as 1 if probability is greater than .5.
pred_y_statsmod = np.where(pred_statsmod < .5, 0, 1)

# Accuracy table.
table = pd.crosstab(df['admit'], pred_y_statsmod)

print('\n Accuracy by admission status')
print(table)
print('\n Percentage accuracy')
print((table.iloc[0, 0] + table.iloc[1, 1]) / (table.sum().sum()))

# Declare a logistic regression classifier.
# Parameter regularization coefficient C described above.
lr = LogisticRegression()  # C=1e9
y = df['admit']
X = df[['gpa', 'gre']]

# Fit the model.
fit = lr.fit(X, y)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X)

print('\n Accuracy by admission status')
print(pd.crosstab(pred_y_sklearn, y))
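# Hedged sketch, not part of the original analysis: the commented-out C=1e9
# above would effectively disable sklearn's default L2 penalty, which is one
# way to approximate the unregularized statsmodels fit. Comparing the two
# settings on the same X and y defined above:
lr_unreg = LogisticRegression(C=1e9).fit(X, y)
print('default (L2-regularized) coefficients:', fit.coef_)
print('C=1e9 (nearly unregularized) coefficients:', lr_unreg.coef_)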
clf_rf = RandomForestClassifier()
clf_rf.fit(rescaled_asmd_train, severity_train)
rf_score = cross_val_score(clf_rf, rescaled_asmd, array_severity, cv=10)
#print(rf_score.mean())  # 0.769

clf_svc = svm.SVC(kernel='linear', C=1.0)  # kernel='rbf', kernel='sigmoid', kernel='poly'
svm_cv_scores = cross_val_score(clf_svc, rescaled_asmd, array_severity, cv=10)
#print(svm_cv_scores.mean())  # 80.3

for n in range(1, 50):
    clf_knn = neighbors.KNeighborsClassifier(n_neighbors=n)
    knn_cv_scores = cross_val_score(clf_knn, rescaled_asmd, array_severity, cv=10)
    #print(n, knn_cv_scores.mean())

scaler = preprocessing.MinMaxScaler().fit(array_asmd)
rescaled_asmd = scaler.transform(array_asmd)
clf_nb = MultinomialNB()
nb_cv_scores = cross_val_score(clf_nb, rescaled_asmd, array_severity, cv=10)
#print(nb_cv_scores.mean())  # 78.42

clf_lr = LogisticRegression()
lr_cv_scores = cross_val_score(clf_lr, rescaled_asmd, array_severity, cv=10)
print(lr_cv_scores.mean())
def predict_user_labels(n_clicks, model_data_json, std_features_json,
                        model_labels_binzd, model_genres_binzd,
                        binarized_user_data_json, user_genres_binzd,
                        user_data_json, threshold_slider, playlists):
    if n_clicks:
        model_data = pd.read_json(model_data_json)
        user_data_binzd = pd.read_json(binarized_user_data_json)
        user_data = pd.read_json(user_data_json)
        std_features = pd.read_json(std_features_json)
        print('model labels:', model_labels_binzd)

        # Reconcile gaps in binary genre columns between user and training dataset
        model_genres_add = []
        user_genres_add = []
        model_genres_add = [genre for genre in user_genres_binzd
                            if genre not in model_genres_binzd]
        user_genres_add = [genre for genre in model_genres_binzd
                           if genre not in user_genres_binzd]
        for genre in model_genres_add:
            model_data[genre] = 0
        for genre in user_genres_add:
            user_data_binzd[genre] = 0
        bin_user_cols = user_data_binzd.columns.to_list()
        model_data = model_data[bin_user_cols + model_labels_binzd]
        user_genres = [col for col in bin_user_cols
                       if col not in features and col != 'trackid']
        model_genres = user_genres

        # Remainder of user data prep for model prediction
        user_track_ids_col = user_data_binzd['trackid']
        std_user_data = user_data_binzd.drop('trackid', axis=1)
        std_user_data = np.concatenate(
            (stdscaler.fit_transform(std_user_data[features]),
             std_user_data[user_genres].to_numpy()),
            axis=1)

        # Model training
        X = pd.concat([std_features, model_data[model_genres]],
                      join='inner', axis=1, ignore_index=True)
        y = model_data[model_labels_binzd]
        multilogreg = OneVsRestClassifier(LogisticRegression(max_iter=500),
                                          n_jobs=-1)
        multilogreg.fit(X, y)

        # Model application to user data
        user_data_probas = pd.DataFrame(multilogreg.predict_proba(std_user_data))
        pl_pred_raw = user_data_probas.applymap(
            lambda x: 1 if x > threshold_slider else 0)
        pl_pred_raw.columns = model_labels_binzd
        pl_model_output = pd.concat([user_track_ids_col, pl_pred_raw],
                                    join='inner', axis=1)
        pl_predictions = pd.DataFrame(columns=['trackid', 'label'])

        # Pivot predicted binary label columns back into a single categorical column
        for pl in model_labels_binzd:
            pl_category = pl_model_output[pl_model_output[pl] == 1]
            for index, row in pl_category.iterrows():
                pl_predictions = pl_predictions.append(
                    {'trackid': row['trackid'],
                     'label': pl.replace('label_', '')},
                    ignore_index=True)

        # Filter categorized user data only to songs that belong in the
        # playlists selected by the user
        user_data_predicted = user_data[['trackid'] + features + ['genre']].merge(
            pl_predictions, how='inner', on='trackid')
        user_data_predicted_final = user_data_predicted[
            user_data_predicted['label'].isin(playlists)].reset_index(drop=True)
        user_data_predicted_viz = user_data_predicted_final.drop(
            'trackid', axis=1).reset_index(drop=True)
        user_data_std_features = pd.DataFrame(
            stdscaler.fit_transform(user_data_predicted[features]),
            columns=features)

        return (user_data_predicted_final.to_json(),
                user_data_predicted_viz.to_json(),
                user_data_std_features.to_json(),
                {},
                {'display': 'block'},
                {'display': 'block', 'text-align': 'center',
                 'align-items': 'center', 'justify-content': 'center',
                 'width': '60%'})
    else:
        return {}, {}, {}, {}, {'display': 'none'}, {'display': 'none'}
    nRuns = 9
elif subjectDay == 3:
    nRuns = 8
# nruns = len(cfg.session.Runs) - 1
for runId in np.arange(1, nRuns):
    print(runId)
    runDir = 'run' + str(runId) + '/'
    pyModelFn = utils.findNewestFile(
        pyDataDir, 'trainedModel_r' + str(runId) + '*_py.mat')
    # To find what matModel includes, use matModel.keys() --> trainedModel,
    # trainPats, trainLabels.
    # For each model we have W [nVoxel x 2 classes] and biases [1 x 2 classes].
    # We can't apply this model to any of the examples in this run, but let's
    # apply it to the first 4 blocks of the next run.
    # Now load testing data from the next run to test it on.
    pyModel_train = utils.loadMatFile(pyModelFn)
    # INSTEAD MAKE NEW MODEL
    lrc1 = LogisticRegression(penalty='l2', solver='saga', max_iter=300)
    lrc2 = LogisticRegression(penalty='l2', solver='saga', max_iter=300)
    lrc1.fit(pyModel_train.trainPats[:, ROI_indices],
             pyModel_train.trainLabels[:, 0])
    lrc2.fit(pyModel_train.trainPats[:, ROI_indices],
             pyModel_train.trainLabels[:, 1])
    newTrainedModel = utils.MatlabStructDict({}, 'trainedModel')
    newTrainedModel.trainedModel = StructDict({})
    newTrainedModel.trainedModel.weights = np.concatenate(
        (lrc1.coef_.T, lrc2.coef_.T), axis=1)
    newTrainedModel.trainedModel.biases = np.concatenate(
        (lrc1.intercept_, lrc2.intercept_)).reshape(1, 2)
    newTrainedModel.trainPats = pyModel_train.trainPats[:, ROI_indices]
    newTrainedModel.trainLabels = pyModel_train.trainLabels
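# Because the new model is persisted as raw weights and biases, downstream
# code can score patterns without sklearn. A minimal sketch on synthetic data
# showing that the stored W and b reproduce the classifier's decision scores:
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.standard_normal((40, 5))
y = (X[:, 0] + 0.1 * rng.standard_normal(40) > 0).astype(int)

lrc = LogisticRegression(penalty='l2', solver='saga', max_iter=300).fit(X, y)
W = lrc.coef_.T            # shape (nVoxel, 1) for a binary problem
b = lrc.intercept_

# Manual scores match sklearn's decision_function.
manual = X @ W + b
assert np.allclose(manual.ravel(), lrc.decision_function(X))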
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split


def gbdt_lr(self, data, label, epoch=10):
    '''
    GBDT + LR for ad click-through-rate prediction; see
    https://research.fb.com/wp-content/uploads/2016/11/practical-lessons-from-predicting-clicks-on-ads-at-facebook.pdf
    and https://zhuanlan.zhihu.com/p/113350563 for reference.

    input : data (dataframe), columns = ['user_id', 'title', 'age', 'gender']
    gbdt feats: gbdt_leaf_0 ~ gbdt_leaf_n
    onehot feats = gbdt_leaf_0 : [1, 0, 0, ..., 0]
    new input = input + onehot feats
    '''
    data_train, data_val, label_train, label_val = train_test_split(
        data, label, test_size=0.33, random_state=10)
    gbm = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                             subsample=0.8, min_child_weight=0.5,
                             colsample_bytree=0.7, num_leaves=100,
                             max_depth=3, learning_rate=0.01, n_estimators=50)
    gbm.fit(data_train, label_train,
            eval_set=[(data_train, label_train), (data_val, label_val)],
            eval_names=['train', 'test'],
            eval_metric='binary_logloss')
    model = gbm.booster_
    # One leaf index per tree for every sample.
    gbdt_feats_train = model.predict(data_train, pred_leaf=True)
    gbdt_feats_test = model.predict(data_val, pred_leaf=True)
    gbdt_feats_name = ['gbdt_leaf_' + str(i)
                       for i in range(gbdt_feats_train.shape[1])]
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train,
                                       columns=gbdt_feats_name)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test,
                                      columns=gbdt_feats_name)
    # ndarray -> dataframe, then append the leaf-index features
    data_train = pd.DataFrame(data_train).reset_index(drop=True)
    data_val = pd.DataFrame(data_val).reset_index(drop=True)
    train = pd.concat([data_train, df_train_gbdt_feats], axis=1)
    val = pd.concat([data_val, df_test_gbdt_feats], axis=1)
    train_len = train.shape[0]
    data = pd.concat([train, val], ignore_index=True)
    # Keep labels in the same (train + val) order as the stacked data; reusing
    # the original `label` here would misalign rows and labels.
    label = np.concatenate([np.asarray(label_train), np.asarray(label_val)])

    # One-hot encode the leaf indices.
    for col in gbdt_feats_name:
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        del data[col]
        data = pd.concat([data, onehot_feats], axis=1)

    x_train, x_val, y_train, y_val = train_test_split(data, label,
                                                      test_size=0.2,
                                                      random_state=10)
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    tr_logloss = log_loss(y_train, lr.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, lr.predict_proba(x_val)[:, 1])

    # Normalized entropy (NE) from the Facebook paper; the (1 +/- y) / 2 terms
    # assume labels in {-1, +1}.
    # NE = (-1) / len(y_pred_test) * sum(((1+y_test)/2 * np.log(y_pred_test[:,1]) + (1-y_test)/2 * np.log(1 - y_pred_test[:,1])))
    y_pred_train = lr.predict_proba(x_train)[:, 1]
    y_pred_val = lr.predict_proba(x_val)[:, 1]
    val_ne = (-1) / len(y_pred_val) * sum(
        (1 + y_val) / 2 * np.log(y_pred_val) +
        (1 - y_val) / 2 * np.log(1 - y_pred_val))
    tr_ne = (-1) / len(y_pred_train) * sum(
        (1 + y_train) / 2 * np.log(y_pred_train) +
        (1 - y_train) / 2 * np.log(1 - y_pred_train))
    print('tr-logloss: %2.2f , val logloss: %2.2f' % (tr_logloss, val_logloss))
    print('tr-ne: %2.2f , val ne: %2.2f' % (tr_ne, val_ne))
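# For readers without lightgbm installed, the same GBDT + LR transformation
# can be sketched with plain scikit-learn: GradientBoostingClassifier.apply()
# yields the per-tree leaf index, which is one-hot encoded and fed to a
# LogisticRegression. Following the usual advice, the two stages are fitted
# on disjoint halves of the data. A minimal sketch on synthetic data:
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=2000, random_state=0)
X_gbdt, X_lr, y_gbdt, y_lr = train_test_split(X, y, test_size=0.5,
                                              random_state=0)

gbdt = GradientBoostingClassifier(n_estimators=50, max_depth=3)
gbdt.fit(X_gbdt, y_gbdt)

# apply() returns the leaf each sample lands in, per tree; for a binary
# problem the trailing class axis has size 1.
leaves_lr = gbdt.apply(X_lr)[:, :, 0]          # shape (n_samples, n_trees)
enc = OneHotEncoder(handle_unknown='ignore')
lr = LogisticRegression(max_iter=500)
lr.fit(enc.fit_transform(leaves_lr), y_lr)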
def run_logisticreg(self):
    self.__obj = LogisticRegression(random_state=self.seed, C=self.c)
    self.__fit()
    return self.__predict()
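# C in LogisticRegression is the inverse regularisation strength (smaller C
# means a stronger L2 penalty), so a value like self.c above is worth tuning.
# A minimal sketch, on synthetic data, of picking C by cross-validation:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, random_state=0)
for c in (0.01, 0.1, 1.0, 10.0):
    scores = cross_val_score(LogisticRegression(C=c, max_iter=500), X, y, cv=5)
    print('C=%g: %.3f' % (c, scores.mean()))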
    labels=['low_level', 'med minus', 'med_level', 'med plus', 'high_level'])
df = pl.binarize_categ_var(df, 'MonthlyIncome_discretize')
df.head()

# # Step 5: Build Classifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = pl.split_data(df, X, y, 0.2)
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier()
]
for method in classifiers:
    pl.test_model(X_train, y_train, features, method)
    pl.predict_model(X_train, y_train, X_test, y, features, method)

# # Step 6: Evaluate Classifier

for method in classifiers:
    pl.eval_model(X_train, y_train, X_test, y_test, features, method)
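# The pl.test_model / pl.eval_model helpers above are project-specific; the
# same five-classifier comparison can be sketched with cross_val_score alone.
# A minimal, self-contained version (hypothetical helper name):
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

def compare_classifiers(X, y, cv=5):
    for clf in (LogisticRegression(max_iter=500), KNeighborsClassifier(),
                DecisionTreeClassifier(), RandomForestClassifier(),
                GradientBoostingClassifier()):
        scores = cross_val_score(clf, X, y, cv=cv)
        print('%-30s %.3f' % (type(clf).__name__, scores.mean()))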
for j, clf in enumerate(clfs):
    # print(j, clf)
    dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        # print("Fold", i)
        X_train = X[train]
        y_train = y[train]
        X_test = X[test]
        # y_test = y[test]
        clf.fit(X_train, y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

print("Blending.")
clf = LogisticRegression()
clf.fit(dataset_blend_train, y)
# y_submission = clf.predict_proba(dataset_blend_test)[:, 1]
y_submission = clf.predict(dataset_blend_test)
print("Linear stretch of predictions to [0,1]")
# y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
y_pred = y_submission
# y_test here must be the held-out labels for X_submission; the result is
# named blend_accuracy to avoid shadowing sklearn's accuracy_score function.
blend_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Blend : ", blend_accuracy)
# print
# "Saving Results."
# tmp = np.vstack([range(1, len(y_submission) + 1), y_submission]).T
# np.savetxt(fname='submission.csv', X=tmp, fmt='%d,%0.9f',
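# The loop above hand-rolls out-of-fold stacking; scikit-learn (>= 0.22)
# ships the same idea as StackingClassifier, which builds the level-one
# features with internal cross-validation and fits a meta-learner on top.
# A minimal sketch on synthetic data:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=1000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

stack = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(random_state=0)),
                ('svc', SVC(probability=True, random_state=0))],
    final_estimator=LogisticRegression(),
    stack_method='predict_proba')
stack.fit(X_train, y_train)
print('Stacked accuracy: %.3f' % stack.score(X_test, y_test))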
train_base[feature + '_mean'] = (train_base[feature + '_mean'] /
                                 train_base[feature + '_mean'].max())
test_base[feature + '_mean'] = np.log(test_base[feature]).replace(-np.inf, 0)
test_base[feature + '_mean'] = (test_base[feature + '_mean'] /
                                test_base[feature + '_mean'].max())
# list for remembering the names of our modified continuous features
continuous_feature_list_mean += [feature + '_mean']

# For categorical features I suggest applying mean encoding, where we
# transform each category into its target-mean representation. Of course, to
# avoid overfitting we should use only mean encodings computed on the train
# dataset: we have to transfer encodings from train to test and NOT calculate
# them separately on the test dataset.
for feature in train_base.drop(['income', 'fnlwgt'] + continuous_feature_list +
                               continuous_feature_list_mean, axis=1).columns:
    db_group = train_base.groupby(feature)['income'].mean()
    train_base[feature + '_mean'] = train_base[feature].replace(
        list(db_group.index.values), list(db_group.values))
    test_base[feature + '_mean'] = test_base[feature].replace(
        list(db_group.index.values), list(db_group.values))

# datasets cleaning
train_x = train_base.dropna().loc[:, train_base.columns.str.contains('mean')]
train_y = train_base.dropna().loc[:, 'income']
test_x = test_base.dropna().loc[:, test_base.columns.str.contains('mean')]
test_y = test_base.dropna().loc[:, 'income']

# calling the training lib - LogisticRegression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(train_x, train_y)

# printing results for train and test
print('Train score = ' + str(model.score(train_x, train_y)))
print('Test score = ' + str(model.score(test_x, test_y)))
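# The mean encoding above copies raw per-category target means onto the test
# set; a common refinement is to smooth rare categories toward the global
# mean so a category seen only a couple of times does not get an extreme
# encoding. A minimal sketch of that variant (hypothetical smoothing weight m):
import pandas as pd

def smoothed_mean_encode(train, test, col, target, m=10):
    """Map each category to a smoothed target mean learned on train only."""
    global_mean = train[target].mean()
    stats = train.groupby(col)[target].agg(['mean', 'count'])
    smoothed = ((stats['count'] * stats['mean'] + m * global_mean) /
                (stats['count'] + m))
    # Categories unseen in train fall back to the global mean on test.
    return (train[col].map(smoothed),
            test[col].map(smoothed).fillna(global_mean))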