def test_predict_on_toy_problem():
    """Manually check predicted class labels for toy dataset.

    Each base classifier, and a ``VotingClassifier`` built from them with
    both hard and soft voting, must reproduce the training labels exactly.
    """
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()

    X = np.array([[-1.1, -1.5],
                  [-1.2, -1.4],
                  [-3.4, -2.2],
                  [1.1, 1.2],
                  [2.1, 1.4],
                  [3.1, 2.3]])

    y = np.array([1, 1, 1, 2, 2, 2])
    expected = [1, 1, 1, 2, 2, 2]

    # Compare the label sequences themselves. Wrapping both sides in all()
    # collapses them to ``True == True``, which passes for any prediction
    # made of non-zero labels and therefore checks nothing.
    for clf in (clf1, clf2, clf3):
        assert_equal(clf.fit(X, y).predict(X).tolist(), expected)

    for voting in ('hard', 'soft'):
        eclf = VotingClassifier(estimators=[
                                ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                                voting=voting,
                                weights=[1, 1, 1])
        assert_equal(eclf.fit(X, y).predict(X).tolist(), expected)
示例#2
0
    def train(self, inverse_regularisation=1.0, verbose=True):
        self.trained_classifiers = {}

        if verbose:
            print '=' * 120
            print 'Training'

        for n, clser in enumerate(sorted(self.classifiers)):
            if verbose:
                print '-' * 120
                print "Training classifier: ", clser, ' #', n+1 , '/', len(self.classifiers)
                print "  Matrix:            ", (len(self.classifiers_outputs[clser]), len(self.classifiers_features_list[clser]))

            classifier_input = np.zeros((len(self.classifiers_outputs[clser]), len(self.classifiers_features_list[clser])))
            for i, feat in enumerate(self.classifiers_features[clser]):
                classifier_input[i] = feat.get_feature_vector(self.classifiers_features_mapping[clser])

            lr = LogisticRegression('l2', C=inverse_regularisation, tol=1e-6)

            lr.fit(classifier_input, self.classifiers_outputs[clser])
            self.trained_classifiers[clser] = lr

            if verbose:
                mean_accuracy = lr.score(classifier_input, self.classifiers_outputs[clser])
                print "  Prediction mean accuracy on the training data: %6.2f" % (100.0 * mean_accuracy, )
                print "  Size of the params:", lr.coef_.shape
示例#3
0
def test_cross_val_predict_with_method():
    """Check ``cross_val_predict(method=...)`` against a hand-written loop.

    For ``decision_function``, ``predict_proba`` and ``predict_log_proba``
    the output must have one row per sample, and with an explicit ``cv`` it
    must match the values obtained by fitting and scoring fold by fold.
    """
    iris = load_iris()
    X, y = shuffle(iris.data, iris.target, random_state=0)
    n_classes = len(set(y))

    kfold = KFold(len(iris.target))

    for method in ('decision_function', 'predict_proba', 'predict_log_proba'):
        est = LogisticRegression()

        # Default cv: only the number of returned rows is checked.
        predictions = cross_val_predict(est, X, y, method=method)
        assert_equal(len(predictions), len(y))

        # Reference values: fit on each training fold and fill in the rows
        # of the matching test fold.
        expected_predictions = np.zeros([len(y), n_classes])
        scorer = getattr(est, method)
        for train_idx, test_idx in kfold.split(X, y):
            est.fit(X[train_idx], y[train_idx])
            expected_predictions[test_idx] = scorer(X[test_idx])

        predictions = cross_val_predict(est, X, y, method=method, cv=kfold)
        assert_array_almost_equal(expected_predictions, predictions)
def main():
    """Train a classifier on a gzip-pickled dataset and save the fitted model.

    Command line: ``dataset [-c CLASSIFIER] [-O OUTPUT]``. The dataset object
    must expose ``data``, ``target`` and ``target_names``. The fitted
    classifier gets a ``target_names_`` attribute, is pickled (gzip) to the
    output path, and its accuracy and classification report on the held-out
    split are printed.

    Raises ValueError for any classifier name other than
    ``logistic_regression``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('dataset',
        help='dataset must have data, target, target_names attributes')
    parser.add_argument('-c', '--classifier', default='logistic_regression',
                        help='now supports logistic_regression only')
    parser.add_argument('-O', '--output', default='clf.pkl.gz',
                        help='saving clf filename')
    args = parser.parse_args(sys.argv[1:])

    print('loading dataset')
    # NOTE(review): pickle.load can execute arbitrary code; only pass dataset
    # files that come from a trusted source.
    with gzip.open(args.dataset, 'rb') as dataset_file:
        dataset = pickle.load(dataset_file)

    target_names = dataset.target_names

    # The split seed is drawn at random, so the split differs between runs.
    split_seed = np.random.randint(1234)
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, random_state=split_seed)

    if args.classifier != 'logistic_regression':
        raise ValueError('unsupported classifier')
    clf = LogisticRegression()

    print('fitting {0}'.format(args.classifier))
    clf.fit(X_train, y_train)
    clf.target_names_ = target_names
    with gzip.open(args.output, 'wb') as model_file:
        pickle.dump(clf, model_file)

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('score of classifier: {}'.format(accuracy))
    report = classification_report(y_test, y_pred, target_names=target_names)
    print(report)
示例#5
0
def training_stage3(dftrain,dfvalid,cat1,i):
    fname = ddir + 'joblib/stage3_'+str(cat1)+ext
    df = dftrain[dftrain.Categorie1 == cat1].reset_index(drop=True)
    dfv = dfvalid[dfvalid.Categorie1 == cat1].reset_index(drop=True)
    labels = np.unique(df.Categorie3)
    if len(labels)==1:
        joblib.dump((labels,None,None),fname)
        scv = -1
        sct = -1
        print 'training',cat1,'\t\t(',i,') : N=',len(df),'K=',len(labels)
        print 'training',cat1,'\t\t(',i,') : training=',sct,'validation=',scv
        return (sct,scv)
    vec,X = vectorizer_stage3(df.txt)
    Y = df['Categorie3'].values
    cla = LogisticRegression(C=best_regularisation.get(cat1,100))
    cla.fit(X,Y)
    labels = np.unique(df.Categorie3)
    sct = cla.score(X[:min(10000,len(df))],Y[:min(10000,len(df))])
    if len(dfv)==0:
        scv = -1
    else:
        Xv = vec.transform(dfv.txt)
        Yv = dfv['Categorie3'].values
        scv = cla.score(Xv,Yv)
    print 'training',cat1,'\t\t(',i,') : N=',len(df),'K=',len(labels)
    print 'training',cat1,'\t\t(',i,') : training=',sct,'validation=',scv
    joblib.dump((labels,vec,cla),fname)
    del vec,cla
    return (sct,scv)
def test_string_labels_refit_false():
    """Check ``EnsembleVoteClassifier(refit=False)`` with string labels.

    The three base classifiers are fitted beforehand on labels 'a', 'b' and
    'c'; the ensemble must then score 0.97 (rounded to two decimals) on the
    same data with both hard and soft voting.
    """
    np.random.seed(123)
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()

    # astype returns a new array, so the module-level y is left untouched.
    y_str = y.astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'

    for clf in (clf1, clf2, clf3):
        clf.fit(X, y_str)

    for voting in ('hard', 'soft'):
        eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                      voting=voting,
                                      refit=False)
        eclf.fit(X, y_str)
        assert round(eclf.score(X, y_str), 2) == 0.97
示例#7
0
    def test_performance_in_other_dataset(self):
        """Score the current compression matrix on the ``AT_X`` / ``AT_labels`` data.

        ``AT_X`` is projected with the transposed value of ``self.W0s`` and an
        L1-penalised logistic regression is evaluated on five stratified
        shuffle splits (20% test, fixed random state).

        Returns ``(mean_accuracy, prfs)`` where ``prfs`` is the per-split
        output of ``precision_recall_fscore_support`` averaged over splits.
        """
        from sklearn.linear_model import LogisticRegression
        from sklearn.cross_validation import StratifiedShuffleSplit

        projection = self.W0s.get_value().T  # currently best compression
        compressed = np.dot(projection, AT_X.T).T
        clf = LogisticRegression(penalty='l1')
        splitter = StratifiedShuffleSplit(y=AT_labels, n_iter=5, test_size=0.2,
                                          random_state=42)

        accuracies = []
        prfs_per_split = []
        for train_idx, test_idx in splitter:
            clf.fit(compressed[train_idx, :], AT_labels[train_idx])
            predicted = clf.predict(compressed[test_idx, :])
            actual = AT_labels[test_idx]

            accuracies.append((predicted == actual).mean())
            prfs_per_split.append(
                precision_recall_fscore_support(actual, predicted))

        return np.mean(accuracies), np.asarray(prfs_per_split).mean(axis=0)
示例#8
0
def check_lambda(dirnm, datanm_train, datanm_valid, datanm_orig_train, datanm_orig_valid, samples_per_class, Cs, num_classes):
    spct = 10*70
    tdata, tlabels = load_full(dirnm+datanm_train, spct)
    print tdata.shape, tlabels.shape

    spct = 10
    otdata, otlabels = load_full(dirnm+datanm_orig_train, spct)

    spct = 10*30
    vdata, vlabels = load_full(dirnm+datanm_valid, spct)

    spct = 10
    ovdata, ovlabels = load_full(dirnm+datanm_orig_valid, spct)

    # artif
    ans = np.zeros((len(Cs), 4))

    for i, C in enumerate(Cs):
        clf = LogisticRegression(C  =C,     penalty='l2', multi_class = 'ovr',
                                 tol=0.001, n_jobs = -1, verbose = 0, solver = 'newton-cg')
        clf.fit(tdata, tlabels)

        out_train = clf.predict_proba(tdata)
        out_valid = clf.predict_proba(vdata)
        out_train_real = clf.predict_proba(otdata)
        out_valid_real = clf.predict_proba(ovdata)

        ans[i, 0] += log_loss(tlabels, out_train)
        ans[i, 1] += log_loss(vlabels, out_valid)
        ans[i, 2] += log_loss(otlabels, out_train_real)
        ans[i, 3] += log_loss(ovlabels, out_valid_real)

    np.savez("logreg_lambda", ans= ans, Cs = Cs, num_classes = num_classes, samples_per_class = samples_per_class)
    return ans
示例#9
0
def predict_lr(train_features, test_features, train_labels, test_labels):
  model = LogisticRegression()
  model.fit(train_features, train_labels)
  predictions = model.predict(train_features)
  print get_accuracy(predictions, train_labels)
  predictions = model.predict(test_features)
  print get_accuracy(predictions, test_labels)
示例#10
0
def logit_clf(dataset, DV, train):
	start = time.time()
	# Load Data to Pandas
	data = pd.read_csv(dataset, index_col=0)
	data.columns = [camel_to_snake(col) for col in data.columns]

	#DV
	y = data[str(DV)]
	X = data[data.columns - [str(DV)]]
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

	model = LogisticRegression()
	
	if train=='yes':
		model1 = model.fit(X_train, y_train)
		print "Classifier: Logistic Regression"
		print pd.DataFrame(zip(X.columns, np.transpose(model.coef_)))
		end = time.time()
		print "Runtime, base model: %.3f" % (end-start), "seconds."
		return model1
	elif train=='no':
		model2 = model.fit(X, y)
		print "Classifier: Logistic Regression"
		print pd.DataFrame(zip(X.columns, np.transpose(model.coef_)))
		end = time.time()
		print "Runtime, base model: %.3f" % (end-start), "seconds."
		return model2
示例#11
0
def check_vb(dirnm, datanm_train, datanm_valid, C, num_classes):
    """Record train/validation log loss for growing training subsets.

    For each size ``s`` in ``1, 10, 20, ..., 300`` the first
    ``s * num_classes`` indices returned by ``kget`` select the training and
    validation rows; a logistic regression with the given ``C`` is fitted on
    the training rows and its log loss on both selections is stored. The
    result is saved to ``logreg_bv`` via ``np.savez``.

    Returns an array with one row per size and columns (train, valid).
    """
    tdata, tlabels = load_full(dirnm+datanm_train, 10*70)
    vdata, vlabels = load_full(dirnm+datanm_valid, 10*30)

    h = np.arange(0, 310, 10)
    h[0] +=1  # the first size is 1 rather than 0
    ans = np.zeros((h.size, 2))

    tind = kget(tlabels, num_classes, h[-1])
    vind = kget(vlabels, num_classes, h[-1])

    for row, size in enumerate(h):
        n_rows = size*num_classes
        train_rows = tind[:n_rows]
        valid_rows = vind[:n_rows]

        clf = LogisticRegression(C=C, penalty='l2', multi_class='ovr',
                                 tol=0.001, n_jobs=-1, verbose=0, solver='newton-cg')
        clf.fit(tdata[train_rows], tlabels[train_rows])

        out_train = clf.predict_proba(tdata[train_rows])
        out_valid = clf.predict_proba(vdata[valid_rows])

        ans[row, 0] += log_loss(tlabels[train_rows], out_train)
        ans[row, 1] += log_loss(vlabels[valid_rows], out_valid)

    np.savez("logreg_bv", ans= ans, C = C, num_classes = num_classes)
    return ans
def Predict_Survivors():
    
    data = open_data('Data/Titanic/train.csv')
    
    target = data[::,0].astype(np.float)
    features = [row[1::] for row in data] 
    
    #filter out class and sex and age as main features
    # [1,2,3 are first/second/third class]  [0 is male, 1 is female] [0 is child, 1 is adult]
    filtered_features = strip_array(features)
     
    log_reg = LogisticRegression().fit(filtered_features,target)
      
    print log_reg.predict_proba([3,0]) #probability of third class male adult?
    print log_reg.predict_proba([1,1]) #probability of first class female adult?
    
    data_test     = open_data('Data/Titanic/test.csv')
    data_test_array = [row[0::] for row in data_test]
    
    data_test_array_stripped = strip_array(data_test_array)
    predicted = log_reg.predict(data_test_array_stripped)
    
    cv = cross_validation.cross_val_score(log_reg, filtered_features, target, cv = 3, n_jobs = 2)
    print cv
    
    np.savetxt('Data/Titanic/submission_titanic_class_gender_age.csv', predicted, delimiter=',', fmt='%d')
 def readout_sk(self, X_train, X_test, y_train, y_test, **kwargs):
     """Fit a logistic-regression readout and return (train, test) accuracy.

     All inputs are transposed before use; ``kwargs`` are forwarded to the
     ``LogisticRegression`` constructor.
     """
     from sklearn.linear_model import LogisticRegression
     train_inputs, train_targets = X_train.T, y_train.T
     readout = LogisticRegression(**kwargs)
     readout.fit(train_inputs, train_targets)
     predicted_train = readout.predict(train_inputs)
     predicted_test = readout.predict(X_test.T)
     train_accuracy = accuracy_score(predicted_train, train_targets)
     test_accuracy = accuracy_score(predicted_test, y_test.T)
     return train_accuracy, test_accuracy
示例#14
0
文件: prior.py 项目: pyongjoo/ende
    def test_ratio(self):
        '''
        Compare several competing methods changing the ratio of the positive
        class in the dataset. We use binary class dataset for the easy of
        interpretation.
        '''
        dataset = rcv1_binary_reader.toNumpy()
        set_size = 100

        X_train_full, y_train_full, X_test, y_test = dataset
        X_train, y_train = self.get_sub_set_with_size([X_train_full, y_train_full], set_size)
        assert(len(y_train) == set_size)

        train_set = (X_train, y_train)
        test_set_original = (X_test, y_test)

        clf = LogisticRegression()
        clf.fit(X_train, y_train)

        p = Prior(clf)

        for r in np.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(test_set_original, r, pos_label=1)
            test_set = [X_test_new, y_test_new]

            true_pos = DE.arrayToDist(y_test_new)[1]

            p.fit(X_train, y_train, {-1:1-true_pos, 1:true_pos})
            y_pred = p.predict(X_test_new)
            cm = confusion_matrix(y_test_new, y_pred)
            acc = self.accuracy(cm)

            print r, acc
示例#15
0
def get_predictions_and_actual_outcomes():
    """Get predictions for a particular model and the actual outcomes."""
    df_train_set, df_test_set = get_train_set_and_test_set_dataframes()

    y_true_train = df_train_set['decision'].values
    x_train = df_train_set.drop(['docket', 'decision'], axis=1).values

    y_true_test = df_test_set['decision'].values
    x_test = df_test_set.drop(['docket', 'decision'], axis=1).values

    lr_model = LogisticRegression()
    lr_model.fit(x_train, y_true_train)
    probs = lr_model.predict_proba(x_test)[:, 1]
    threshold = 0.65
    y_pred = probs > threshold

    predictions_and_actual_outcomes = []
    for docket, prediction in zip(df_test_set['docket'].tolist(), y_pred):
        if prediction == True:
            predicted_winning_side = 'petitioner'
        else:
            predicted_winning_side = 'respondent'
        
        actual_outcome = df_test_set[df_test_set['docket'] == docket]['decision'].values[0]
        if actual_outcome == True:
            actual_winning_side = 'petitioner'
        else:
            actual_winning_side = 'respondent'
        print actual_winning_side

        predictions_and_actual_outcomes.append(docket + \
                                               ':' + predicted_winning_side + \
                                               ':' + actual_winning_side)
    return '\n'.join(predictions_and_actual_outcomes)
示例#16
0
def logit(X_train, y_train,arg):
    """Return a ``LogisticRegression`` with ``C=arg`` fitted on the given data."""
    return LogisticRegression(C=arg).fit(X_train, y_train)
示例#17
0
def train_and_test(domain_dir, sentences):
    """Train a relation classifier and write the predicted relations to disk.

    Feature vectors are read in svmlight format from
    ``<domain_dir>/train/feature_vector`` and
    ``<domain_dir>/test/feature_vector``. The F1 score on the training data
    is printed. The test predictions are consumed in order, one per entry of
    ``sentence.candidate_relation`` over all ``sentences``; every sentence
    with at least one non-zero prediction is written to
    ``<domain_dir>/test/relation.classifier`` as an ``S`` line followed by
    one ``R`` line per predicted relation.

    :param domain_dir: directory containing the ``train`` and ``test`` folders
    :param sentences: sentence objects providing ``text``,
        ``candidate_relation`` and ``print_phrase``
    """
    train_dir = os.path.join(domain_dir, "train")
    test_dir = os.path.join(domain_dir, "test")
    X_train, y_train = load_svmlight_file(os.path.join(train_dir, "feature_vector"))
    X_test, y_test = load_svmlight_file(os.path.join(test_dir, "feature_vector"))
    clf = LogisticRegression(C=1.0, intercept_scaling=1, dual=False,
                             fit_intercept=True, penalty="l2", tol=0.0001)
    print("fit..")
    clf.fit(X_train, y_train)
    print("fit end...")
    y_train_predict = clf.predict(X_train)
    print(f1_score(y_train, y_train_predict))
    y = clf.predict(X_test)

    output_path = os.path.join(test_dir, "relation.classifier")
    i = 0  # position in y; advances once per candidate pair
    # The context manager closes the file even if writing fails part-way.
    with open(output_path, "w", encoding="utf8") as f:
        for sentence in sentences:
            flag = False
            str_list = ["S\t{0}".format(sentence.text)]
            for pair in sentence.candidate_relation:
                if y[i] != 0:
                    flag = True
                    str_list.append("R\t{0}\t{1}\t{2}\t{3}".format(
                        sentence.print_phrase(pair[0]).lower(),
                        sentence.print_phrase(pair[1]).lower(),
                        list(pair[0]),
                        list(pair[1])))
                i += 1
            # Sentences without any predicted relation are not written.
            if flag:
                for s in str_list:
                    print(s, file=f)
示例#18
0
def Predict():
	"""Print the index and name of each selected feature column.

	NOTE(review): the bare ``return`` directly after the printing loop makes
	everything below it unreachable (encoding, validation, hold-out scoring
	and writing the test submission). It looks like a debugging shortcut --
	confirm whether it should be removed.
	"""
	USERAW = False
	clf = LogisticRegression(C=2.3,class_weight='auto')
	# With USERAW the raw CSV is loaded and its frame is written to
	# tripsFractions.csv; otherwise that preprocessed file is read.
	if USERAW:
		fio = fileio.RawInput('../data/alldata.csv',usePairs=True,useTrips=True)
		fio.df.to_csv('../data/tripsFractions.csv',index=False)
	else:
		fio = fileio.Preprocessed('../data/tripsFractions.csv')

	# Column indices into fio.df that make up the feature set.
	base = [201, 294, 260, 67, 220, 235, 7, 176, 290, 48, 309, 156, 66, 263, 138, 262, 35, 18, 233, 208, 240, 338, 0, 210, 9, 295, 317] # seed 410
	for b in base:
		print "%d. %s" %(b,fio.df.columns[b])
	# Early exit: nothing below this line is executed.
	return
	fio.encode(base)
	train, truth = fio.transformTrain(base)
	c = classifier.Classifier(train, truth)
	prefix = 'lib/logr'
	c.validate(clf,nFolds=10,out=prefix+'.csv')
	score = c.holdout(clf,nFolds=10,fraction=0.2)
	print score

	if True:
		test = fio.transformTest(base)
		clf.fit(train,truth)
		# Column 1 of predict_proba is what gets written to the submission.
		y_ = clf.predict_proba(test)[:,1]
		writeSubmission(y_,filename=prefix+'Test.csv')
		return
		return
示例#19
0
文件: thresholds.py 项目: rahlk/RAAT
def thresholds():
  """Print a table of metric thresholds for each Jureczko project.

  For every project a logistic regression is fitted on the training data
  (last column is the target). A metric is listed when ``VARL`` at
  ``p0=0.05`` is positive and its ``f_classif`` p-value is below 0.05; the
  reported threshold is ``VARL`` at ``p0=0.1``.
  """
  for name in ['ant', 'ivy', 'jedit', 'lucene', 'poi']:
    print("##", name)
    train, _ = explore(dir='../Data/Jureczko/', name=name)
    data_DF = csv2DF(train, toBin=True)
    feature_frame = data_DF[data_DF.columns[:-1]]
    # Column names with their first character dropped.
    metrics = [column[1:] for column in feature_frame]
    ubr = LogisticRegression()
    X = feature_frame.values
    y = data_DF[data_DF.columns[-1]].values
    ubr.fit(X, y)
    inter = ubr.intercept_[0]
    coef = ubr.coef_[0]
    pVal = f_classif(X, y)[1]

    table = texttable.Texttable()
    table.set_cols_align(["l", "l", "l"])
    table.set_cols_valign(["m", "m", "m"])
    table.set_cols_dtype(['t', 't', 't'])
    table_rows = [["Metric", "Threshold", "P-Value"]]

    for i, metric in enumerate(metrics):
      if VARL(coef[i], inter, p0=0.05)>0 and pVal[i]<0.05:
        thresh = "%0.2f"%VARL(coef[i], inter, p0=0.1)
        table_rows.append([metric, thresh, "%0.3f"%pVal[i]])

    table.add_rows(table_rows)
    print(table.draw())

  # === DEBUG ===
  # NOTE(review): leftover debugger call that runs after the last table has
  # been printed -- confirm it is still wanted.
  set_trace()
  return None
示例#20
0
 def test_curve_diffs(self):
     """The 'micro' and 'each_class' precision-recall plots must not compare equal."""
     np.random.seed(0)
     clf = LogisticRegression()
     scikitplot.classifier_factory(clf)
     ax_micro, ax_class = [
         clf.plot_precision_recall_curve(self.X, self.y, curves=kind)
         for kind in ('micro', 'each_class')
     ]
     self.assertNotEqual(ax_micro, ax_class)
def test_topic_distribution(doc_topic_weights_filename, annotated_data_filename, k, train_prop, num_repeat, column_of_interest):
	(X, Y) = process_dataset(doc_topic_weights_filename, annotated_data_filename, k, column_of_interest)
	num_train = int(X.shape[0] * train_prop)

	# We repeat the experiments and report the average
	scores = []
	for i in range(num_repeat):
		print "Iteration: %d" % i
		rng = np.random.RandomState(i)
		indices = np.arange(len(X))
		rng.shuffle( indices )

		# Divide the set into train and test sets
		X_train = X[indices[:num_train]]
		Y_train = Y[indices[:num_train]]

		X_test = X[indices[num_train+1:]]
		Y_test = Y[indices[num_train+1:]]

		# Build a classifier
		clf = LogisticRegression().fit(X_train, Y_train)

		# Make prediction
		predicted_labels = clf.predict(X_test)

		# Report the accuracy
		true_labels = Y_test
		score = f1_score(predicted_labels, true_labels)	
		scores.append( score )

	return sum(scores) / len(scores)
示例#22
0
def evaluate(model, k=10, seed=1234, evalcv=True, evaltest=False):
    """
    Run experiment
    k: number of CV folds
    test: whether to evaluate on test set
    """
    print 'Preparing data...'
    traintext, testtext = load_data()
    train, train_labels = prepare_data(traintext)
    test, test_labels = prepare_data(testtext)
    train_labels = prepare_labels(train_labels)
    test_labels = prepare_labels(test_labels)
    train, train_labels = shuffle(train, train_labels, random_state=seed)

    print 'Computing training skipthoughts...'
    trainF = skipthoughts.encode(model, train, verbose=False, use_eos=False)
    
    if evalcv:
        print 'Running cross-validation...'
        interval = [2**t for t in range(0,9,1)]     # coarse-grained
        C = eval_kfold(trainF, train_labels, k=k, scan=interval, seed=seed)

    if evaltest:
        if not evalcv:
            C = 128     # Best parameter found from CV

        print 'Computing testing skipthoughts...'
        testF = skipthoughts.encode(model, test, verbose=False, use_eos=False)

        print 'Evaluating...'
        clf = LogisticRegression(C=C)
        clf.fit(trainF, train_labels)
        yhat = clf.predict(testF)
        print 'Test accuracy: ' + str(clf.score(testF, test_labels))
示例#23
0
def giniGrowth(df,woeVarsInfo,badFlag):
    """Measure how the Gini coefficient grows as WOE variables are added one by one.

    Variable names from ``woeVarsInfo`` get the suffix ``_WOE`` and are taken
    in the order returned by ``getIVfromWOE``. After each added variable a
    logistic regression is fitted on a fixed 67/33 split and the Gini
    (``2 * AUC - 1``) is computed on both parts.

    :param df: frame holding the ``*_WOE`` columns and the target column
    :param woeVarsInfo: WOE table with a ``variable`` column
    :param badFlag: name of the target column in ``df``
    :return: DataFrame with columns ``variable``, ``informationValue``,
        ``testDiff``, ``trainDiff``, ``giniTest``, ``giniTrain``; the
        ``*Diff`` columns hold the gain over the previous step (the first
        step is compared against 0)
    """
    woeTable = woeVarsInfo.copy()
    woeTable.variable = woeTable.variable.apply(lambda x: x + '_WOE')
    IV = getIVfromWOE(woeTable)
    columns = IV.variable
    columnsForModelling = []
    giniTest = []
    giniTrain = []
    y = df[badFlag].values
    for col in columns:
        columnsForModelling.append(col)
        X = df[columnsForModelling].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=3)
        lr = LogisticRegression()
        lr.fit(X_train,y_train)
        pr_test = lr.predict_proba(X_test)[:,1]
        pr_train = lr.predict_proba(X_train)[:,1]
        giniTest.append(met.roc_auc_score(y_test,pr_test) * 2 - 1)
        giniTrain.append(met.roc_auc_score(y_train,pr_train) * 2 - 1)
    # Separate names in the comprehensions so the target vector ``y`` is not
    # rebound (list-comprehension variables leak on Python 2).
    trainDiff = [cur - prev for cur, prev in zip(giniTrain, [0] + giniTrain[:-1])]
    testDiff = [cur - prev for cur, prev in zip(giniTest, [0] + giniTest[:-1])]
    dfOut = pd.DataFrame({'variable':columns, 'giniTrain' : giniTrain,'giniTest': giniTest,'trainDiff':trainDiff,'testDiff':testDiff,'informationValue':list(IV.InformationValue)})
    # reindex(columns=...) replaces reindex_axis(..., axis=1), which has been
    # removed from pandas.
    return dfOut.reindex(columns=['variable','informationValue','testDiff','trainDiff','giniTest','giniTrain'])
示例#24
0
def test_scoring():
    """Check ``paired_ttest_kfold_cv`` with the 'accuracy' and 'recall_micro' scorers.

    The hold-out scores of both estimators are pinned first; the paired
    t-test must then yield the same statistic and p-value for either
    scoring name.
    """
    X, y = iris_data()
    clf1 = LogisticRegression(random_state=1,
                              solver='liblinear',
                              multi_class='ovr')
    clf2 = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=123)

    score1 = clf1.fit(X_train, y_train).score(X_test, y_test)
    score2 = clf2.fit(X_train, y_train).score(X_test, y_test)

    assert round(score1, 2) == 0.96, round(score1, 2)
    assert round(score2, 2) == 0.91, round(score2, 2)

    for scoring in ('accuracy', 'recall_micro'):
        t, p = paired_ttest_kfold_cv(estimator1=clf1,
                                     estimator2=clf2,
                                     X=X, y=y,
                                     scoring=scoring,
                                     random_seed=1)

        assert round(t, 3) == -1.861, t
        assert round(p, 3) == 0.096, p
def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags,
                 dual, C, penalty, fit_intercept, multi_class):
    """Train a word-level logistic-regression tagger and predict on both partitions.

    ``essays_TD`` and ``essays_VD`` are lists of Essay objects; their
    sentences are lists of featureextractortransformer.Word objects. The
    model is printed for fold 0 only. The solver is ``lbfgs`` for
    ``multi_class == 'multinomial'`` and ``liblinear`` otherwise.

    Returns a 4-tuple: training predictions by code, validation predictions
    by code, actual training labels by code, actual validation labels by code.
    """
    # Data partitioning and feature extraction
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X = feature_transformer.fit_transform(td_feats)
    vd_X = feature_transformer.transform(vd_feats)

    # Most common tags per word, used for training only (not for evaluation)
    wd_td_ys = get_wordlevel_mostfrequent_ys(td_tags, wd_train_tags, tag_freq)

    # Train the tagger
    solver = "lbfgs" if multi_class == 'multinomial' else 'liblinear'
    model = LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept, multi_class=multi_class, solver=solver)
    if fold == 0:
        print(model)

    model.fit(td_X, wd_td_ys)

    # Predictions per code for both partitions
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(model.predict(td_X), wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(model.predict(vd_X), wd_test_tags)

    # Actual ys by code (dict of label to values)
    wd_td_ys_by_code = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_by_code = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)
    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_by_code, wd_vd_ys_by_code
class SimpleGB(BaseEstimator):
    """Gradient boosting of regression trees on top of a logistic-regression start.

    The raw score starts from the logit of a strongly regularised logistic
    regression. Each of the ``iters`` rounds fits a ``DecisionTreeRegressor``
    to the negative gradient of the log loss and adds ``tau`` times its
    prediction to the score.

    tree_params_dict: keyword arguments for ``DecisionTreeRegressor``
    iters: number of boosting rounds
    tau: step size applied to every tree
    """

    def __init__(self, tree_params_dict, iters, tau):
        self.tree_params_dict = tree_params_dict
        self.iters = iters
        self.tau = tau

    def _base_logit(self, X_data):
        """Return the base model's positive-class probability as a raw score."""
        # p = 1 / (1 + exp(-a)) => a = - ln (1 / p - 1) = ln(p / (1 - p))
        return - np.log(1. / self.base_algo.predict_proba(X_data)[:, 1] - 1)

    def fit(self, X_data, y_data):
        """Fit the base model and the boosting trees; ``y_data`` holds 0/1 labels."""
        self.base_algo = LogisticRegression(C=0.0005).fit(X_data, y_data)
        self.estimators = []
        curr_pred = self._base_logit(X_data)

        for iter_num in range(self.iters):
            # y is 0 or 1
            # a is the raw prediction
            # f(a) = 1 / (1 + exp(-a)) turns it into a probability
            # f'(a) = f(a) (1 - f(a))
            # log loss is -(y log f(a) + (1 - y) log(1 - f(a)))
            # d/da of the log loss = -f'(a) (y / f(a) - (1 - y) / (1 - f(a)))
            #                      = f(a) - y
            fa = 1. / (1 + np.exp(-curr_pred))
            # Negative gradient in closed form. The expanded expression
            # divides by fa and (1 - fa) and yields NaN once the sigmoid
            # saturates at 0 or 1.
            neg_grad = y_data - fa
            algo = DecisionTreeRegressor(**self.tree_params_dict).fit(X_data, neg_grad)
            self.estimators.append(algo)
            curr_pred += self.tau * algo.predict(X_data)
        return self

    def predict(self, X_data):
        """Return a boolean array: True where the boosted raw score exceeds 0.1."""
        res = self._base_logit(X_data)
        for estimator in self.estimators:
            res += self.tau * estimator.predict(X_data)
        return res > 0.1 # this threshold can be varied to improve the metric
示例#27
0
def test_multiclass_classifier_class_weight():
    """Check the SAG solver with per-class weights against ``sag_sparse``.

    A ``LogisticRegression(solver='sag')`` is fitted on dense and on sparse
    input. For every class the reference ``sag_sparse`` is run on a one-vs-rest
    encoding with the matching sample weights, and the coefficients and
    intercepts must agree.
    """
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    clf_dense = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                                   max_iter=max_iter, tol=tol, random_state=77,
                                   fit_intercept=fit_intercept,
                                   class_weight=class_weight)
    clf_sparse = clone(clf_dense)
    clf_dense.fit(X, y)
    clf_sparse.fit(sp.csr_matrix(X), y)

    # Expand the per-class weights into one weight per sample.
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[LabelEncoder().fit_transform(y)]

    shared_kwargs = dict(n_iter=max_iter, dloss=log_dloss,
                         sample_weight=sample_weight)
    dense_refs = []
    sparse_refs = []
    for cl in classes:
        # One-vs-rest targets: +1 for the current class, -1 for the others.
        y_encoded = np.where(y == cl, 1., -1.)
        dense_refs.append(sag_sparse(X, y_encoded, step_size, alpha,
                                     **shared_kwargs))
        sparse_refs.append(sag_sparse(X, y_encoded, step_size, alpha,
                                      sparse=True, **shared_kwargs))

    coef1 = np.vstack([weights for weights, _ in dense_refs])
    intercept1 = np.array([intercept for _, intercept in dense_refs])
    coef2 = np.vstack([weights for weights, _ in sparse_refs])
    intercept2 = np.array([intercept for _, intercept in sparse_refs])

    for i in range(len(classes)):
        assert_array_almost_equal(clf_dense.coef_[i].ravel(),
                                  coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf_dense.intercept_[i], intercept1[i], decimal=1)

        assert_array_almost_equal(clf_sparse.coef_[i].ravel(),
                                  coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf_sparse.intercept_[i], intercept2[i], decimal=1)
示例#28
0
def make_classifier(min_word_size, stemmer):
    print "Training the classifier..."
    # Build X matrix of vector representations of review files,
    # and y vector of labels
    pos_file_names = get_file_names('pos')
    neg_file_names = get_file_names('neg')
    # m is the number of training examples
    m_pos = len(pos_file_names)
    m_neg = len(neg_file_names)
    m = m_pos + m_neg
    pos_labels = np.ones(m_pos)
    neg_labels = -np.ones(m_neg)
    y = np.concatenate((pos_labels, neg_labels), axis=0)
    # get dimensions of data
    dimensions = len(vocab)

    # initialize X
    X = np.zeros((m, dimensions))
    message = "{:.2%} percent done\r"
    # build X
    for i in xrange(m_pos):
        X[i, :] = vectorize(pos_file_names[i], min_word_size, stemmer)
        sys.stdout.write(message.format(i / float(m)))
        sys.stdout.flush()
    for j in xrange(m_neg):
        X[j + m_pos, :] = vectorize(neg_file_names[j], min_word_size, stemmer)
        sys.stdout.write(message.format((m_pos + j) / float(m)))
        sys.stdout.flush()
    # make the logistic regression function
    lr = LR()
    lr.fit(X, y)

    return lr
示例#29
0
def fit(x, y):
    """Fit a logistic regression on a 70/30 split and print its diagnostics.

    Printed, in order: predicted labels for the held-out rows, their class
    probabilities, accuracy, ROC AUC (computed from the second probability
    column), the confusion matrix, the classification report, and finally
    the per-fold and mean accuracy of a 10-fold cross-validation run on all
    of ``x``/``y``.  Nothing is returned.
    """
    # Hold out 30% of the rows; random_state pins the split.
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        x, y, test_size=0.3, random_state=0)
    estimator = LogisticRegression()
    estimator.fit(features_fit, target_fit)

    # Hard class labels for the held-out rows.
    labels = estimator.predict(features_eval)
    print(labels)

    # Per-class probabilities for the held-out rows.
    probabilities = estimator.predict_proba(features_eval)
    print(probabilities)

    # Held-out evaluation metrics.
    print(metrics.accuracy_score(target_eval, labels))
    print(metrics.roc_auc_score(target_eval, probabilities[:, 1]))
    print(metrics.confusion_matrix(target_eval, labels))
    print(metrics.classification_report(target_eval, labels))

    # Independent check: 10-fold cross-validated accuracy on the full data.
    fold_scores = cross_val_score(
        LogisticRegression(), x, y, scoring='accuracy', cv=10)
    print(fold_scores)
    print(fold_scores.mean())
示例#30
0
 def test_do_cv(self):
     """Passing ``do_cv=False`` to the patched plotting method raises AttributeError."""
     np.random.seed(0)
     classifier = LogisticRegression()
     scikitplot.classifier_factory(classifier)
     # Plain call first; its return value is not inspected.
     classifier.plot_precision_recall_curve(self.X, self.y)
     with self.assertRaises(AttributeError):
         classifier.plot_precision_recall_curve(self.X, self.y, do_cv=False)
# The runClassification function will accept a list with the classifiers that       #
# we wish to run.                                                                   #
#####################################################################################

# TODO: Abstract these following classifiers so they can be passed into runClassification with their own parameters
# Registry of pre-configured (unfitted) estimators, keyed by a short code.
clfs = {
    'RF':
    RandomForestClassifier(n_estimators=50, n_jobs=-1),
    'ET':
    ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
    'AB':
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                       algorithm="SAMME",
                       n_estimators=200),
    # The L1 penalty needs a solver that supports it.  'liblinear' does, and
    # it was the implicit default before scikit-learn 0.22, so naming it keeps
    # the old behaviour and avoids the error raised with the newer default.
    'LR':
    LogisticRegression(penalty='l1', C=1e5, solver='liblinear'),
    'SVM':
    svm.SVC(kernel='linear', probability=True, random_state=0),
    'GB':
    GradientBoostingClassifier(learning_rate=0.05,
                               subsample=0.5,
                               max_depth=6,
                               n_estimators=10),
    'NB':
    GaussianNB(),
    'DT':
    DecisionTreeClassifier()
}

attributes = {
    'study_hrs': 0,
示例#32
0
# Hold out 10% of the documents for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# TF-IDF
# Learn the vocabulary and IDF weights from the training split only, so that
# no statistics from the held-out documents leak into the features.
vectorizer = TfidfVectorizer(min_df=10)
vectorizer = vectorizer.fit(X_train)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Dimensionality Reduction
# The sparse TF-IDF matrices are densified before being handed to LDA.
lda = LinearDiscriminantAnalysis(n_components=10)
lda = lda.fit(X_train_tfidf.toarray(), y_train)
X_train_lda = lda.transform(X_train_tfidf.toarray())
X_test_lda = lda.transform(X_test_tfidf.toarray())

# Machine Learning
clf = LogisticRegression(max_iter=10000).fit(X_train_lda, y_train)

# Results
# Despite the X_ prefix these hold predicted labels, not features.
X_train_pred = [np.round(i) for i in clf.predict(X_train_lda)]
X_test_pred = [np.round(i) for i in clf.predict(X_test_lda)]
print(classification_report(y_train, X_train_pred))
print(classification_report(y_test, X_test_pred))

#%% Alternate Result Printing
# Column 0: true label, column 1: predicted label, column 2: signed error.
results = pd.DataFrame(zip(y_test, X_test_pred))
results[2] = results[1] - results[0]
# Count how often each signed error occurs (column 0: error, column 1: count).
results = pd.DataFrame(dict(Counter(results[2])).items()).sort_values(1)
# Merge +k and -k into a single absolute-error bucket.
results[0] = results[0].apply(np.abs)
results = results.groupby(0).sum()
# Named `total` so the builtin sum() is not shadowed for the rest of the module.
total = results.sum().item()
results["diff"] = results[1] / total
def test_check_estimator():
    # tests that the estimator actually fails on "bad" estimators.
    # not a complete test of all checks, which are very extensive.

    # check that we have a set_params and can clone
    msg = "it does not implement a 'get_params' methods"
    assert_raises_regex(TypeError, msg, check_estimator, object)
    assert_raises_regex(TypeError, msg, check_estimator, object())
    # check that values returned by get_params match set_params
    msg = "get_params result does not match what was passed to set_params"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ModifiesValueInsteadOfRaisingError())
    assert_warns(UserWarning, check_estimator, RaisesErrorInSetParams())
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ModifiesAnotherValue())
    # check that we have a fit method
    msg = "object has no attribute 'fit'"
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator)
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator())
    # check that fit does input validation
    msg = "ValueError not raised"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier())
    # check that sample_weights in fit accepts pandas.Series type
    # (skipped silently when pandas is not installed)
    try:
        from pandas import Series  # noqa
        msg = ("Estimator NoSampleWeightPandasSeriesType raises error if "
               "'sample_weight' parameter is of type pandas.Series")
        assert_raises_regex(ValueError, msg, check_estimator,
                            NoSampleWeightPandasSeriesType)
    except ImportError:
        pass
    # check that predict does input validation (doesn't accept dicts in input)
    msg = "Estimator doesn't check for NaN and inf in predict"
    assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        NoCheckinPredict())
    # check that estimator state does not change
    # at transform/predict/predict_proba time
    msg = 'Estimator changes __dict__ during predict'
    assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict)
    # check that `fit` only changes attributes that
    # are private (start with an _ or end with a _).
    msg = ('Estimator ChangesWrongAttribute should not change or mutate  '
           'the parameter wrong_attribute from 0 to 1 during fit.')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ChangesWrongAttribute)
    check_estimator(ChangesUnderscoreAttribute)
    # check that `fit` doesn't add any public attribute
    msg = (r'Estimator adds public attribute\(s\) during the fit method.'
           ' Estimators are only allowed to add private attributes'
           ' either started with _ or ended'
           ' with _ but wrong_attribute added')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        SetsWrongAttribute)
    # check for invariant method
    name = NotInvariantPredict.__name__
    method = 'predict'
    msg = ("{method} of {name} is not invariant when applied "
           "to a subset.").format(method=method, name=name)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        NotInvariantPredict)
    # check for sparse matrix input handling
    name = NoSparseClassifier.__name__
    msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name
    # the check for sparse input handling prints to the stdout,
    # instead of raising an error, so as not to remove the original traceback.
    # that means we need to jump through some hoops to catch it.
    old_stdout = sys.stdout
    string_buffer = StringIO()
    sys.stdout = string_buffer
    try:
        check_estimator(NoSparseClassifier)
    except Exception:
        # Only the captured output matters here.  Catching Exception rather
        # than using a bare except lets KeyboardInterrupt / SystemExit
        # propagate instead of being swallowed.
        pass
    finally:
        sys.stdout = old_stdout
    assert msg in string_buffer.getvalue()

    # Large indices test on bad estimator
    msg = ('Estimator LargeSparseNotSupportedClassifier doesn\'t seem to '
           r'support \S{3}_64 matrix, and is not failing gracefully.*')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        LargeSparseNotSupportedClassifier)

    # does error on binary_only untagged estimator
    msg = 'Only 2 classes are supported'
    assert_raises_regex(ValueError, msg, check_estimator,
                        UntaggedBinaryClassifier)

    # non-regression test for estimators transforming to sparse data
    check_estimator(SparseTransformer())

    # doesn't error on actual estimator
    check_estimator(LogisticRegression)
    check_estimator(LogisticRegression(C=0.01))
    check_estimator(MultiTaskElasticNet)
    check_estimator(MultiTaskElasticNet())

    # doesn't error on binary_only tagged estimator
    check_estimator(TaggedBinaryClassifier)

    # Check regressor with requires_positive_y estimator tag
    check_estimator(RequiresPositiveYRegressor)
示例#34
0
def model():
    """Fit a promotion classifier from Excel data and export its diagnostics.

    Reads ``model/Training.xlsx`` and ``model/Testing.xlsx``, fits a
    ``LogisticRegression`` on the training sheet with ``Promotion`` as the
    target, and writes three workbooks into ``model/``:

    * ``Coeficients.xlsx`` -- fitted coefficients plus the intercept,
    * ``Errors.xlsx`` -- ``clf.score`` on the training and testing data,
    * ``Stats.xlsx`` -- macro recall and macro F1 on both data sets.

    Returns nothing.
    """
    training = pd.read_excel('model/Training.xlsx')
    # 'year' may be absent; errors='ignore' skips it explicitly instead of
    # hiding every possible failure behind a bare except.
    training.drop(columns=['year'], errors='ignore', inplace=True)

    y = training['Promotion']
    X = training
    X.drop(columns=['Promotion'], inplace=True)
    clf = LogisticRegression(random_state=0, max_iter=60000).fit(X, y)

    # Feature names in column order, without repeats.
    r = []
    for x in X.columns.values:
        if x not in r:
            r.append(x)
    print(r)
    Model = pd.DataFrame(clf.coef_, columns=[r])
    Model["Interception"] = clf.intercept_
    Model.to_excel('model/Coeficients.xlsx')

    test = pd.read_excel('model/Testing.xlsx')
    test.drop(columns=['year', 'Unnamed: 0'], errors='ignore', inplace=True)

    y1 = test['Promotion']
    X1 = test
    X1.drop(columns=['Promotion'], inplace=True)

    # Give the test features the training layout: same columns in the same
    # order, columns missing from the test sheet filled with 0, extra columns
    # dropped.  The result of fillna is assigned back because a chained
    # ``X1[col].fillna(..., inplace=True)`` does not reliably modify X1.
    r1 = list(X.columns.values)
    X1 = X1.reindex(columns=r1, fill_value=0).fillna(0)

    # NOTE(review): the column is labelled "error" but holds clf.score values.
    Test = clf.score(X1, y1)
    Trainig = clf.score(X, y)
    Errors = pd.DataFrame([Trainig, Test],
                          columns=["error"],
                          index=["Training", "Testing"])
    Errors.to_excel('model/Errors.xlsx')

    # Macro-averaged recall / F1 and confusion matrix on the training data.
    y_P = clf.predict(X)
    f1 = f1_score(y, y_P, average='macro')
    recal = recall_score(y, y_P, average='macro')
    CM = confusion_matrix(y, y_P)

    # The same metrics on the aligned test data.
    y_P = clf.predict(X1)
    f1_t = f1_score(y1, y_P, average='macro')
    recal_t = recall_score(y1, y_P, average='macro')
    CM_T = confusion_matrix(y1, y_P)

    Stats = pd.DataFrame([recal, f1, recal_t, f1_t],
                         columns=["Value"],
                         index=["Recall", "F1", "Recall Test", "F1 Test"])
    Stats.to_excel('model/Stats.xlsx')
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Rescale the columns selected by num_col, overwriting them in X_train.
X_train[num_col] = scaler.fit_transform(X_train[num_col])


# ### Logistic Regression

# In[38]:


# Import 'LogisticRegression' and create a LogisticRegression object

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(class_weight='balanced')


# In[39]:


# Import RFE and select 15 variables

from sklearn.feature_selection import RFE
# n_features_to_select is passed by keyword: current scikit-learn releases
# reject it as a positional argument, and older ones accept the keyword too.
rfe = RFE(logreg, n_features_to_select=15)  # running RFE with 15 variables as output
rfe = rfe.fit(X_train, y_train)


# In[40]:

print(classification_report(y_test, y_pred))

# --- Building a logistic regression model ---

# Modules needed for the model and its evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# 60/40 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# Create the classifier and fit it on the training portion
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Labels predicted for the held-out portion
y_pred = logreg.predict(X_test)

# Report the confusion matrix, then the per-class metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

#Plotting an ROC curve

# Import necessary modules
from sklearn.metrics import roc_curve
# Fit the DictVectorizer on the training rows, then reuse it for the test rows.
vectorizer = DictVectorizer()
X_train_vec = vectorizer.fit_transform(train_as_dicts) #n380855x26828
# Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
test_as_dicts = [dict(r.items()) for _, r in X_test.fillna(-1).iterrows()]
X_test_vec = vectorizer.transform(test_as_dicts)

def Label_enc(df):
    """Return integer codes for ``df`` from a LabelEncoder fitted on its unique values.

    NOTE(review): every call fits its own encoder, so codes from two calls
    line up only when both inputs contain the same set of labels.
    """
    encoder = LabelEncoder().fit(df.unique())
    return encoder.transform(df.values)

# Integer-encode the targets.  The second call used to be misspelled
# ("Lahel_enc"), which raised NameError before any model was trained.
y_train_vec = Label_enc(y_train)
y_test_vec = Label_enc(y_test)

#%%
logistic = LogisticRegression(multi_class = 'multinomial', solver = 'sag')
logistic.fit(X_train_vec, y_train_vec)

y_pred_log = logistic.predict(X_test_vec)
# y_test_vec is already encoded above, so it is not recomputed here.
target_names = ['Charged Off', 'Current', 'Default', 'Fully Paid', 'In Grace Period', 'Issued', 'Late (16-30 days)', 'Late (31-120 days)']
print(classification_report(y_test_vec, y_pred_log, target_names = target_names))

#%%
# Compute ROC curve and ROC area for each class
y_test_bin = label_binarize(y_test_vec, classes = [0,1,2,3,4,5,6,7])
y_score = logistic.decision_function(X_test_vec)

# Per-class containers, filled by code that follows this cell.
fpr = dict()
tpr = dict()
roc_auc = dict()
示例#38
0
        n_samples=data_points
    )  # to match minority class # reproducible results

    # Stack the downsampled frame and alt_data into one frame.
    data = pd.concat([data_downsampled, alt_data])

    # Feature matrix: the columns named in training_head, as a float array.
    x_data = data[training_head]

    x_data = x_data.values.astype(float)

    # Target vector: the "actual_use" column, as a float array.
    y_data = data["actual_use"]

    y_data = y_data.values.astype(float)

    # Base estimator for the feature elimination; the trailing comment lists
    # alternatives that were tried.
    #tree.DecisionTreeClassifier()#LogisticRegression()
    estimator = LogisticRegression(
        fit_intercept=True
    )  #tree.DecisionTreeClassifier() #LogisticRegression()   #SVC(kernel="linear")  #tree.DecisionTreeClassifier() #LogisticRegression()#
    # Recursive feature elimination, one feature per step, scored with
    # 2-fold stratified cross-validation.
    rfecv = RFECV(estimator, step=1, cv=StratifiedKFold(2))

    rfecv.fit(x_data, y_data)
    print('number of features selected:', rfecv.n_features_)

    # Feature matrix reduced to the selected columns.
    x_new = rfecv.transform(x_data)

    # Column positions of the kept features, and the ranking of every feature.
    selected_inds = rfecv.get_support(indices=True)
    selected_ranks = rfecv.ranking_

    # Map the kept positions back to their names in training_head.
    selected_feats = [training_head[ind] for ind in selected_inds]
    #print(selected_feats)

    #print(rfecv.estimator_.coef_)
示例#39
0
y_cv.value_counts()

# %%
y_test.value_counts()

# %% [markdown]
# ### 4.1.1.1. Logistic Regression

# %%
# Candidate regularisation strengths: base**-6 ... base**5.
C = [math.pow(base, exponent) for exponent in range(-6, 6)]
# H = [round(math.log(i,10)) for i in C]

# NOTE(review): a list of dicts makes GridSearchCV search each dict as its own
# grid, so C, penalty and class_weight are tuned one at a time, not jointly.
# Confirm that is intended.
# NOTE(review): penalty 'l1' needs a solver that supports it (e.g. liblinear);
# verify against the scikit-learn version in use.
tuned_parameters = [
    {'C': C},
    {'penalty': ['l1', 'l2']},
    {'class_weight': [None, 'balanced']},
]

# From here on C holds the rounded exponents; the grid above keeps the powers.
C = [round(math.log(value, base)) for value in C]
clf = GridSearchCV(
    LogisticRegression(),
    tuned_parameters,
    cv=cv,
    scoring='recall',
    n_jobs=7,
    verbose=10,
)
clf.fit(X_train_std, y_train)

# plot_grid_search(clf, X_train, y_train, C)
print(clf.best_estimator_)
print(clf.best_params_)
best_estimator = clf.best_estimator_

# Calibrate the winning estimator's probabilities with the sigmoid method.
calib = CalibratedClassifierCV(best_estimator, cv=cv, method='sigmoid')
calib.fit(X_train_std, y_train)
plot_confusion_matrix(
    y_train, calib.predict(X_train_std),
    y_test, calib.predict(X_test_std),
)


# %%
threshold, cost = plot_precision_recall_costs(calib, X_cv_std, y_cv)
示例#40
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Load the iris data as pandas objects.
X, y = load_iris(return_X_y=True, as_frame=True)

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=5)

# Fit on the training part and predict the held-out part.
model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Report: heading, confusion matrix, per-class metrics.
print("Resultados\n")
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))
# Columns of df_clean used as model inputs.
feature_names = [
    'statuses_count', 'followers_count', 'followees_count',
    'favorites_count', 'listed_count', 'betweenness', 'sentiment',
    'subjectivity', 'time_diff', 'time_diff_median', 'tweet number',
    'retweet number', 'quote number', 'number urls', 'number hashtags',
    'status length', 'baddies', 'mentions',
]
x = df_clean[feature_names]
y = df_clean['hate']

# Keep only the rows that have no NaN in any feature column.
mask = ~np.any(np.isnan(x), axis=1)
x = x[mask]
y = y[mask]

# NOTE(review): the scaler is fitted on all rows before the split, so the
# held-out rows influence the scaling -- confirm this is acceptable.
scaler = RobustScaler()
x = scaler.fit_transform(x)

# 80/20 split; no seed is given, so the split differs between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt

print(classification_report(y_test, y_pred))
plot_confusion_matrix(confusion_matrix(y_test, y_pred))
    # Load the dataset description (feature / target column names) from JSON.
    with open('settings/specifications.json') as f:
        specifications = json.load(f)

    raw_train = pd.read_csv(TRAIN_CSV)
    x_columns = specifications['description']['X']
    # NOTE(review): y_column is read but not used below; the target is taken
    # from raw_train.Response instead -- confirm the two agree.
    y_column = specifications['description']['y']

    x_raw = raw_train[x_columns]

    # Project-specific loader: fitted on the raw feature columns, then asked
    # for the matrix to train on.
    loader = DataLoader()
    loader.fit(x_raw)
    X = loader.load_data()
    y = raw_train.Response

    # L1-penalised logistic regression; liblinear is a solver that supports L1.
    model = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
    model.fit(X, y)
    # Persist the fitted model for later use.
    with open('models/log_reg.pickle', 'wb')as f:
        pickle.dump(model, f)






    import pickle
    import json
    import pandas as pd
    from sklearn.svm import SVC

    from utils.dataloader import DataLoader
示例#43
0
    # One-vs-rest step for class `i`: work on copies so the original
    # multiclass labels stay untouched.
    yTrain = copy.deepcopy(labels_train)
    yTest = copy.deepcopy(labels_test)

    # Binarise the training labels: 1 for class i, 0 for everything else.
    print('=======>yTrain')
    for j in range(0, 60000):
        if (yTrain[j] == i):
            yTrain[j] = 1
        else:
            yTrain[j] = 0

    # Same binarisation for the test labels.
    print('=======>yTest')
    for j in range(0, 10000):
        if (yTest[j] == i):
            yTest[j] = 1
        else:
            yTest[j] = 0

    logitL1 = LogisticRegression(penalty='l1', solver='liblinear', C=100)
    logitL1.fit(images_train, yTrain)

    print('Class:', i)
    # Score against the binarised labels the model was trained on.  Comparing
    # its 0/1 predictions with the multiclass labels_train, as before, gave a
    # meaningless training accuracy.
    trainScore = logitL1.score(np.array(images_train).reshape(60000, 784), np.array(yTrain).reshape(60000, 1))
    score = logitL1.score(np.array(images_test).reshape(10000, 784), np.array(yTest).reshape(10000, 1))
    print('Train Score:',trainScore)
    print('Test Score:', score)
    print('`````````````````````````````````````````````')
示例#44
0
def predictor_func(val1, val2):
    """Classify the point ``(val1, val2)`` with a freshly trained logistic regression.

    Each call builds a two-cluster toy data set with ``make_blobs``, splits
    it, fits a ``LogisticRegression`` and prints the training/testing scores.
    Four fixed reference points and the caller's point are then classified,
    and their probabilities, odds and classes are printed.  The data and the
    four reference points are also drawn on the current matplotlib figure.

    Background: logistic regression models
    ``P(Y=1) = e^(b0 + b1*X) / (1 + e^(b0 + b1*X))``, so predictions always
    fall in [0, 1].  Odds are ``p / (1 - p)``; for example a probability of
    0.50 is odds of 1, 0.33 is odds of 1/2 and 0.66 is odds of 2.

    Args:
        val1: First coordinate of the point to classify.
        val2: Second coordinate of the point to classify.

    Returns:
        dict: ``"probability"`` is the class-1 probability of the caller's
        point as a percentage string (two decimals at most, e.g. ``"56.79%"``)
        and ``"prediction"`` is the predicted class as an ``int``.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.datasets import make_blobs
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    X, y = make_blobs(centers=2, random_state=42)

    print(f"Labels: {y[:10]}")
    print(f"Data: {X[:10]}")

    # Visualizing both classes
    plt.scatter(X[:, 0], X[:, 1], c=y)

    # Split the data into training and testing parts, then fit the model.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    # Mean accuracy on both parts.
    print(f"Training Data Score: {classifier.score(X_train, y_train)}")
    print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

    # Four fixed reference points plus the caller's point.
    new_data1 = np.array([[-2, 6]])
    new_data2 = np.array([[-1, 6]])
    new_data3 = np.array([[1, 6]])
    new_data4 = np.array([[3, 6]])
    new_data5 = np.array([[val1, val2]])

    # Redraw the data and mark the reference points as red circles.
    plt.scatter(X[:, 0], X[:, 1], c=y)
    for reference in (new_data1, new_data2, new_data3, new_data4):
        plt.scatter(reference[0, 0], reference[0, 1], c="r", marker="o", s=100)

    def _assess(point):
        """Return (predicted class, P(class 1), odds of class 1) for one point."""
        label = classifier.predict(point)
        prob = classifier.predict_proba(point)[:, 1]
        return label, prob, prob / (1 - prob)

    prediction1, pred_prob1, pred_odds1 = _assess(new_data1)
    prediction2, pred_prob2, pred_odds2 = _assess(new_data2)
    prediction3, pred_prob3, pred_odds3 = _assess(new_data3)
    prediction4, pred_prob4, pred_odds4 = _assess(new_data4)
    prediction5, pred_prob5, pred_odds5 = _assess(new_data5)

    print("Classes are either 0 (purple) or 1 (yellow)")
    print(f"The new point estimated probability is: {pred_prob1} {pred_prob2} {pred_prob3} {pred_prob4}  {pred_prob5}")
    print(f"The new point estimated odds is: {pred_odds1} {pred_odds2} {pred_odds3} {pred_odds4} {pred_odds5}")
    print(f"The new point was classified as: {prediction1} {prediction2} {prediction3} {prediction4} {prediction5}")

    # Scale to a percentage first and round last.  Rounding before the
    # multiplication re-introduced float noise such as "56.790000000000006%".
    result_response = {
        "probability": f'{round(float(pred_prob5[0]) * 100, 2)}%',
        "prediction": int(prediction5[0])
    }

    return result_response
示例#45
0
def train_emotions(train, test, input):
    """Train and evaluate the text classifier selected by ``input``.

    ``train`` and ``test`` supply the text in column ``'Testo_stringa'`` and
    the label in column ``'Genere'``.  ``input`` picks the model: ``"MNB"``,
    ``"LR"``, ``'DT'``, ``'SVC'`` or ``'KNN'``; any other value does nothing.
    The chosen classifier is placed behind a uni+bigram TF-IDF step, fitted
    on the training text and evaluated on the test text.  The classification
    report and accuracy are printed and the confusion matrix is passed to
    ``conf_matr``.  Returns ``None``.
    """
    x_train = train['Testo_stringa']
    y_train = train['Genere']

    x_test = test['Testo_stringa']
    y_test = test['Genere']

    # (selector, banner, TF-IDF step name, classifier factory, accuracy line)
    # Factories keep construction lazy: only the selected model is built.
    recipes = (
        ("MNB", "Multinomial Naive Bayes Classifier", "combined_features",
         lambda: MultinomialNB(),
         "accuracy for multinomial naive bayes: %s"),
        ("LR", "Logistic Regression Classifier", "features",
         lambda: LogisticRegression(solver="liblinear", multi_class="ovr"),
         "accuracy for LogisticRegression: %s"),
        ('DT', "Decision Tree Classifier", "features",
         lambda: DecisionTreeClassifier(max_depth=2),
         "accuracy for Decision Tree %s"),
        ('SVC', "Support Vector Classifier", "features",
         lambda: SVC(kernel='linear', C=1),
         "accuracy for Support Vector Classifier %s"),
        ('KNN', "K-Neighbors Classifier", "features",
         lambda: KNeighborsClassifier(n_neighbors=7),
         "accuracy for K-Neighbors Classifier %s"),
    )

    for selector, banner, step_name, build_classifier, accuracy_line in recipes:
        if input == selector:
            print(banner)
            pipeline = Pipeline(steps=[
                (step_name, TfidfVectorizer(ngram_range=(1, 2))),
                ("classifier", build_classifier()),
            ])
            pipeline.fit(x_train, y_train)
            y_pred = pipeline.predict(x_test)

            print("Classification report: %s" %
                  (classification_report(y_test, y_pred)))
            print(accuracy_line % (pipeline.score(x_test, y_test)))

            cm = confusion_matrix(y_test, y_pred)
            # print('Confusion Matrix', cm)
            conf_matr(input, cm, y_test, y_pred)

    return
示例#46
0
                             random_state=0)
model.fit(data)
# Compare the recovered biclusters with the (shuffled) reference ones.
score = consensus_score(
    model.biclusters_, (rows[:, row_idx], columns[:, col_idx]))

print("consensus score: {:.1f}".format(score))

# Reorder the rows, then the columns, so equal labels become adjacent.
fit_data = data[np.argsort(model.row_labels_)][:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

# Outer product of the sorted (1-based) row and column labels.
plt.matshow(
    np.outer(np.sort(model.row_labels_) + 1,
             np.sort(model.column_labels_) + 1),
    cmap=plt.cm.Blues)
plt.title("Checkerboard structure of rearranged data")

plt.show()

# Synthetic classification data and a seeded train/test split.
X, y = make_classification(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Scaler and classifier chained into a single estimator.
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)  # apply scaling on training data
#Pipeline(steps=[('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression())])

# apply scaling on testing data, without leaking training data.
pipe.score(X_test, y_test)
    def LOG(cls, X_train, Y_train):
        """Fit a LogisticRegression, hand it to ``cls.save`` under 'LOG', and return it."""
        fitted = LogisticRegression().fit(X_train, Y_train)
        cls.save(fitted, 'LOG')
        return fitted
separator.predict_proba(X_test)[:5]
confusion_matrix(yhat, y_test)


# # Logistic Regression

# In[30]:


from sklearn.linear_model import LogisticRegression


# In[31]:


# Fit with C=0.01, then keep both hard predictions and probabilities.
LR = LogisticRegression(C=0.01, solver='lbfgs')
LR.fit(X_train, y_train)
yhat = LR.predict(X_test)
y_prob = LR.predict_proba(X_test)


# In[32]:


# Plot non-normalized confusion matrix
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
plt.figure()
print('Accuracy score for training set: {:.2f}'.format(
    accuracy_score(y_true=y_train, y_pred=LR.predict(X_train))))
print('Accuracy score for test set: {:.2f}'.format(
    accuracy_score(y_true=y_test, y_pred=yhat)))
print(classification_report(y_test, yhat))
confusion_matrix(y_test, yhat, labels=['COLLECTION', 'PAIDOFF'])
示例#49
0
# 10-fold grid search over the Ridge parameter grid defined earlier.
gs = GridSearchCV(estimator=Ridge(), param_grid=param_grid, cv=10)
result = gs.fit(diabetes.data, diabetes.target)

print("최적 점수: {}".format(result.best_score_))
print("최적 파라미터: {}".format(result.best_params_))
print(gs.best_estimator_)
pd.DataFrame(result.cv_results_)

# GridSearchCV using multiprocessing
import multiprocessing
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
# NOTE(review): penalty 'l1' needs a solver that supports it; with the
# default solver of recent scikit-learn those candidates may fail to fit --
# verify against the installed version.
param_grid = [{'penalty': ['l1', 'l2'], 'C': [1.5, 2.0, 2.5, 3.0, 3.5]}]
gs = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=multiprocessing.cpu_count(),  # one worker per CPU
)

result = gs.fit(iris.data, iris.target)

print("최적 점수: {}".format(result.best_score_))
print("최적 파라미터: {}".format(result.best_params_))
print(gs.best_estimator_)
pd.DataFrame(result.cv_results_)

# preprocessing: the data preprocessing module
# - Standardization and normalization are the methods used for feature scaling of the data
# - In scikit-learn, normalization means rescaling each individual vector to a common size
示例#50
0
plt.title('Years spent in the \n company without leaving(STAYING)')
plt.xlabel('Years')
plt.ylabel('No. of employees')
plt.show()
#Prediction Analysis
# One-hot encode the categorical columns, dropping the first level of each.
feats = ['sales', 'salary']
df_final = pd.get_dummies(data, columns=feats, drop_first=True)
X = df_final.drop(['left'], axis=1).values
y = df_final['left'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# Standardise with statistics learned from the training rows only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
Rf = RandomForestClassifier()
Rf.fit(X_train, y_train)
Lr = LogisticRegression()
Lr.fit(X_train, y_train)
clf = neighbors.KNeighborsClassifier()
clf.fit(X_train, y_train)
# NOTE(review): named "linear" but SVC() is built with its default kernel.
svc_linear = SVC()
svc_linear.fit(X_train, y_train)
print("Random Forest Classifier accuracy :", Rf.score(X_test, y_test))
print("Logistic Regression accuracy :", Lr.score(X_test, y_test))
print("KNeighborsClassifier accuracy :", clf.score(X_test, y_test))
print("SVC accuracy :", svc_linear.score(X_test, y_test))
new_pred = np.array([[
    0.26, 0.7, 3., 238., 6., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.
]])
# The forest was trained on standardised features, so the raw sample has to
# go through the same fitted scaler before it is classified.
prediction = Rf.predict(sc.transform(new_pred))
print('RandomForest new_pred :', prediction)
示例#51
0
print_classifier_metrics(y_test, y_pred_cv, name="Best Gamma SVM")

# Best Gamma SVM confusion matrix
cv_cm = confusion_matrix(y_test, y_pred_cv)
plt.figure()
plot_confusion_matrix(
    cv_cm, classes=class_names, title='Best Gamma SVM Confusion Matrix')

# Best Gamma SVM ROC curve
plot_roc_curve(
    y_test,
    clf.best_estimator_.decision_function(X_test_LSI),
    name="Best Gamma SVM")

########################################################## QUESTION 5
from sklearn.linear_model import LogisticRegression

# Logistic regression without regularization (a huge C stands in for "none").
lr = LogisticRegression(C=10**10, random_state=42)
lr.fit(X_train_LSI, y_train)
y_pred_lr = lr.predict(X_test_LSI)
print("Coefficients learned by logistic regression without regularization: ",
      lr.coef_)
print_classifier_metrics(
    y_test, y_pred_lr, name="Logistic Regression without regularization")

# Confusion matrix of the unregularized logistic regression
lr_cm = confusion_matrix(y_test, y_pred_lr)
plt.figure()
plot_confusion_matrix(
    lr_cm, classes=class_names, title='Logistic Regression Confusion Matrix')
plot_roc_curve(y_test,
               lr.decision_function(X_test_LSI),
示例#52
0
# Predicted values from the fitted statsmodels result.
pred_statsmod = result.predict(X_statsmod)

# Code admission as 0 when the predicted value is below .5, otherwise 1.
pred_y_statsmod = np.where(pred_statsmod < .5, 0, 1)

# Accuracy table: actual admission status (rows) vs. predicted (columns).
table = pd.crosstab(df['admit'], pred_y_statsmod)

print('\n Accuracy by admission status')
print(table)
print('\n Percentage accuracy')
# Share of observations on the diagonal of the table.
n_correct = table.iloc[0, 0] + table.iloc[1, 1]
n_total = table.sum().sum()
print(n_correct / n_total)

# Declare a logistic regression classifier with default settings
# (a large C, e.g. 1e9, was tried previously and left disabled).
lr = LogisticRegression()  #C=1e9)
y = df['admit']
X = df[['gpa', 'gre']]

# Fit the model.
fit = lr.fit(X, y)

# Display the learned coefficients and intercept.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X)

# Cross-tabulate predictions (rows) against the actual outcome (columns).
print('\n Accuracy by admission status')
sklearn_table = pd.crosstab(pred_y_sklearn, y)
print(sklearn_table)
# Compare several classifiers on the severity data with 10-fold cross-validation.
# NOTE(review): rescaled_asmd, rescaled_asmd_train, severity_train, array_asmd
# and array_severity are defined outside this excerpt.

# Random forest: fitted once on the training split, then cross-validated.
clf_rf = RandomForestClassifier()
clf_rf.fit(rescaled_asmd_train, severity_train)

rf_score = cross_val_score(clf_rf, rescaled_asmd, array_severity, cv=10)
#print(rf_score.mean()) #0.769

# Linear-kernel SVM; the other kernels tried are listed in the trailing comment.
clf_svc = svm.SVC(kernel='linear',
                  C=1.0)  #kernel='rbf', kernel='sigmoid', kernel='poly'
svm_cv_scores = cross_val_score(clf_svc, rescaled_asmd, array_severity, cv=10)

#print(svm_cv_scores.mean()) # 80.3

# Sweep k = 1..49 for k-nearest neighbours. With the print commented out, only
# the scores of the final iteration remain in knn_cv_scores after the loop.
for n in range(1, 50):
    clf_knn = neighbors.KNeighborsClassifier(n_neighbors=n)
    knn_cv_scores = cross_val_score(clf_knn,
                                    rescaled_asmd,
                                    array_severity,
                                    cv=10)
    #print(n,knn_cv_scores.mean())

# Rebind rescaled_asmd to a min-max scaled copy of array_asmd; every model
# below this point is evaluated on this version, the ones above on the earlier one.
scaler = preprocessing.MinMaxScaler().fit(array_asmd)
rescaled_asmd = scaler.transform(array_asmd)

# Multinomial naive Bayes on the min-max scaled features.
clf_nb = MultinomialNB()
nb_cv_scores = cross_val_score(clf_nb, rescaled_asmd, array_severity, cv=10)
#print(nb_cv_scores.mean()) #78.42

# Logistic regression; the only model whose mean CV score is actually printed.
clf_lr = LogisticRegression()
lr_cv_scores = cross_val_score(clf_lr, rescaled_asmd, array_severity, cv=10)
print(lr_cv_scores.mean())
示例#54
0
def predict_user_labels(n_clicks, model_data_json, std_features_json,
                        model_labels_binzd, model_genres_binzd,
                        binarized_user_data_json, user_genres_binzd,
                        user_data_json, threshold_slider, playlists):
    """Fit a one-vs-rest logistic model and assign labels to the user's tracks.

    Nothing is computed until ``n_clicks`` is truthy; in that case empty
    placeholders and hidden styles are returned.

    Relies on the module-level names ``features`` (list of feature column
    names) and ``stdscaler`` (a scaler exposing ``fit_transform``).

    Args:
        n_clicks: Click counter; a falsy value means "not triggered yet".
        model_data_json: JSON of the training frame; it must contain the
            user's binarized columns and the columns in ``model_labels_binzd``.
        std_features_json: JSON of the training feature frame that is joined
            with the genre columns (presumably already standardized, going by
            the name; verify against the caller).
        model_labels_binzd: List of binarized label column names; a
            ``label_`` substring is stripped when labels are reported.
        model_genres_binzd: Genre column names present in the training data.
        binarized_user_data_json: JSON of the user's tracks with a
            ``trackid`` column, the feature columns and genre columns.
        user_genres_binzd: Genre column names present in the user data.
        user_data_json: JSON of the user's tracks with ``trackid``, the
            feature columns and ``genre``.
        threshold_slider: A label is assigned when its predicted probability
            is strictly greater than this value.
        playlists: Labels to keep in the final output.

    Returns:
        A 6-tuple: JSON of the labelled tracks restricted to ``playlists``,
        JSON of the same rows without ``trackid``, JSON of the scaled
        features of all labelled tracks, an empty dict, and two style dicts.
    """
    if not n_clicks:
        return {}, {}, {}, {}, {'display': 'none'}, {'display': 'none'}

    model_data = pd.read_json(model_data_json)
    user_data_binzd = pd.read_json(binarized_user_data_json)
    user_data = pd.read_json(user_data_json)
    std_features = pd.read_json(std_features_json)

    print('model labels:', model_labels_binzd)

    # Reconcile gaps in binary genre columns between user and training
    # dataset: a genre known to only one side becomes an all-zero column on
    # the other side.
    known_model_genres = set(model_genres_binzd)
    known_user_genres = set(user_genres_binzd)
    for genre in user_genres_binzd:
        if genre not in known_model_genres:
            model_data[genre] = 0
    for genre in model_genres_binzd:
        if genre not in known_user_genres:
            user_data_binzd[genre] = 0

    # Give the training frame the same column order as the user frame.
    bin_user_cols = user_data_binzd.columns.to_list()
    model_data = model_data[bin_user_cols + model_labels_binzd]

    # Every user column that is neither a feature nor the id is a genre column.
    genre_cols = [
        col for col in bin_user_cols
        if col not in features and col != 'trackid'
    ]

    # Remainder of user data prep for model prediction
    user_track_ids_col = user_data_binzd['trackid']
    std_user_data = user_data_binzd.drop('trackid', axis=1)
    # NOTE(review): the scaler is re-fitted on the user data here rather than
    # reusing the training fit; confirm that this is intended.
    std_user_data = np.concatenate(
        (stdscaler.fit_transform(std_user_data[features]),
         std_user_data[genre_cols].to_numpy()),
        axis=1)

    # Model training
    X = pd.concat([std_features, model_data[genre_cols]],
                  join='inner',
                  axis=1,
                  ignore_index=True)
    y = model_data[model_labels_binzd]

    multilogreg = OneVsRestClassifier(LogisticRegression(max_iter=500),
                                      n_jobs=-1)
    multilogreg.fit(X, y)

    # Model application to user data: threshold the per-label probabilities
    # into 0/1 flags in one vectorized step.
    user_data_probas = pd.DataFrame(multilogreg.predict_proba(std_user_data))
    pl_pred_raw = (user_data_probas > threshold_slider).astype(int)
    pl_pred_raw.columns = model_labels_binzd

    pl_model_output = pd.concat([user_track_ids_col, pl_pred_raw],
                                join='inner',
                                axis=1)

    # Pivot predicted binary label columns back into a single categorical
    # column. Rows are collected in a list and the frame is built once;
    # DataFrame.append no longer exists in pandas >= 2.0 and copied the whole
    # frame on every call.
    records = []
    for pl in model_labels_binzd:
        label = pl.replace('label_', '')
        flagged = pl_model_output.loc[pl_model_output[pl] == 1, 'trackid']
        records.extend({'trackid': trackid, 'label': label}
                       for trackid in flagged)
    pl_predictions = pd.DataFrame(records, columns=['trackid', 'label'])

    # Filter categorized user data only to songs that belong in the playlists
    # selected by the user
    user_data_predicted = user_data[['trackid'] + features + ['genre']].merge(
        pl_predictions, how='inner', on='trackid')
    user_data_predicted_final = user_data_predicted[
        user_data_predicted['label'].isin(playlists)].reset_index(drop=True)
    user_data_predicted_viz = user_data_predicted_final.drop(
        'trackid', axis=1).reset_index(drop=True)
    # NOTE(review): scaled over all labelled tracks, not only the
    # playlist-filtered ones; confirm that this is intended.
    user_data_std_features = pd.DataFrame(
        stdscaler.fit_transform(user_data_predicted[features]),
        columns=features)

    visible_style = {'display': 'block'}
    centered_style = {
        'display': 'block',
        'text-align': 'center',
        'align-items': 'center',
        'justify-content': 'center',
        'width': '60%'
    }
    return (user_data_predicted_final.to_json(),
            user_data_predicted_viz.to_json(),
            user_data_std_features.to_json(), {}, visible_style,
            centered_style)
示例#55
0
        nRuns = 9
    elif subjectDay == 3:
        nRuns = 8
    #nruns = len(cfg.session.Runs) - 1
    # For each run 1 .. nRuns-1, load that run's saved training data and refit
    # two logistic-regression models restricted to the ROI voxels.
    for runId in np.arange(1, nRuns):
        print(runId)
        runDir = 'run' + str(runId) + '/'
        # Locate the newest saved model file for this run.
        pyModelFn = utils.findNewestFile(
            pyDataDir, 'trainedModel_r' + str(runId) + '*_py.mat')
        # to find what matModel includes use matModel.keys() --> trainedModel, trainPats, trainLabels
        # for each model we have W [ nVoxel x 2 classes], biases [ 1 x 2 classes]
        # we can't apply this model to any of the examples in this run, but let's apply it to the first 4 blocks of the next run
        # now load testing data from the next run to test it on
        pyModel_train = utils.loadMatFile(pyModelFn)
        # INSTEAD MAKE NEW MODEL
        # One L2-penalised classifier per label column (columns 0 and 1).
        lrc1 = LogisticRegression(penalty='l2', solver='saga', max_iter=300)
        lrc2 = LogisticRegression(penalty='l2', solver='saga', max_iter=300)

        # Both models see the same ROI-restricted patterns and differ only in
        # which label column they are trained against.
        lrc1.fit(pyModel_train.trainPats[:, ROI_indices],
                 pyModel_train.trainLabels[:, 0])
        lrc2.fit(pyModel_train.trainPats[:, ROI_indices],
                 pyModel_train.trainLabels[:, 1])
        # Repackage the two fits in the same weights/biases layout described
        # above: one weight column and one bias entry per model.
        newTrainedModel = utils.MatlabStructDict({}, 'trainedModel')
        newTrainedModel.trainedModel = StructDict({})
        newTrainedModel.trainedModel.weights = np.concatenate(
            (lrc1.coef_.T, lrc2.coef_.T), axis=1)
        newTrainedModel.trainedModel.biases = np.concatenate(
            (lrc1.intercept_, lrc2.intercept_)).reshape(1, 2)
        # Keep the ROI-restricted patterns and the untouched labels alongside.
        newTrainedModel.trainPats = pyModel_train.trainPats[:, ROI_indices]
        newTrainedModel.trainLabels = pyModel_train.trainLabels
示例#56
0
    def gbdt_lr(self, data, label, epoch=10):
        '''
        Train a GBDT + logistic-regression stack and print its losses.

        A LightGBM classifier is fitted first; the index of the leaf each
        sample falls into, per tree, is one-hot encoded and appended to the
        original features, and a logistic regression is trained on the result.

        References:
        https://research.fb.com/wp-content/uploads/2016/11/practical-lessons-from-predicting-clicks-on-ads-at-facebook.pdf
            (GBDT + LR for ad click-through-rate prediction)
        https://zhuanlan.zhihu.com/p/113350563

        data  : feature matrix (DataFrame or array), e.g. columns
                ['user_id', 'title', 'age', 'gender']
        label : binary target aligned row-by-row with ``data``
        epoch : unused; kept for interface compatibility

        gbdt feats   : gbdt_leaf_0 ~ gbdt_leaf_n
        onehot feats : gbdt_leaf_0 -> [1,0,0,...,0]
        new input    : input + onehot feats

        Returns nothing; train/validation log-loss and the "ne" figures are
        printed. The logistic regression is evaluated on the same hold-out
        split as the GBDT so that features and labels stay aligned.
        '''
        data_train, data_val, label_train, label_val = train_test_split(
            data, label, test_size=0.33, random_state=10)
        gbm = lgb.LGBMClassifier(boosting_type='gbdt',
                                 objective='binary',
                                 subsample=0.8,
                                 min_child_weight=0.5,
                                 colsample_bytree=0.7,
                                 num_leaves=100,
                                 max_depth=3,
                                 learning_rate=0.01,
                                 n_estimators=50)
        gbm.fit(data_train,
                label_train,
                eval_set=[(data_train, label_train), (data_val, label_val)],
                eval_names=['train', 'test'],
                eval_metric='binary_logloss')
        model = gbm.booster_
        # pred_leaf=True returns, for every sample, the leaf index per tree.
        gbdt_feats_train = model.predict(data_train, pred_leaf=True)
        gbdt_feats_test = model.predict(data_val, pred_leaf=True)

        gbdt_feats_name = [
            'gbdt_leaf_' + str(i) for i in range(gbdt_feats_train.shape[1])
        ]
        df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train,
                                           columns=gbdt_feats_name)
        df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test,
                                          columns=gbdt_feats_name)

        # The leaf frames carry a fresh 0..n-1 index, while a split DataFrame
        # keeps its shuffled original index. Reset it so the column-wise
        # concat pairs rows by position instead of by mismatched labels.
        data_train = pd.DataFrame(data_train).reset_index(drop=True)
        data_val = pd.DataFrame(data_val).reset_index(drop=True)
        train = pd.concat([data_train, df_train_gbdt_feats], axis=1)
        val = pd.concat([data_val, df_test_gbdt_feats], axis=1)
        train_len = train.shape[0]

        # One-hot encode over train+val together so both share one set of
        # dummy columns; the original columns stay in front, the dummies follow.
        data = pd.concat([train, val], ignore_index=True)
        data = pd.get_dummies(data, columns=gbdt_feats_name)
        # Array input yields integer column names; make all names strings so
        # the estimator is not handed mixed-type feature names.
        data.columns = data.columns.astype(str)

        # Split back at the train/val boundary. The rows are in
        # [label_train, label_val] order, so re-splitting against the
        # original ``label`` would pair features with the wrong targets.
        x_train, x_val = data.iloc[:train_len], data.iloc[train_len:]
        y_train, y_val = np.asarray(label_train), np.asarray(label_val)

        lr = LogisticRegression()
        lr.fit(x_train, y_train)
        y_pred_train = lr.predict_proba(x_train)[:, 1]
        y_pred_val = lr.predict_proba(x_val)[:, 1]
        tr_logloss = log_loss(y_train, y_pred_train)
        val_logloss = log_loss(y_val, y_pred_val)

        # NOTE(review): this expression follows the {-1, +1} label convention
        # ((1 + y) / 2 is 0 or 1 only for such labels) and is not divided by
        # the background entropy; confirm the label encoding before relying
        # on the printed values.
        val_ne = (-1) / len(y_pred_val) * np.sum(
            (1 + y_val) / 2 * np.log(y_pred_val) +
            (1 - y_val) / 2 * np.log(1 - y_pred_val))
        tr_ne = (-1) / len(y_pred_train) * np.sum(
            (1 + y_train) / 2 * np.log(y_pred_train) +
            (1 - y_train) / 2 * np.log(1 - y_pred_train))
        print('tr-logloss: %2.2f , val logloss:%2.2f  ' %
              (tr_logloss, val_logloss))
        print('tr-ne: %2.2f , val ne:%2.2f  ' % (tr_ne, val_ne))
示例#57
0
 def run_logisticreg(self):
     """Install a logistic-regression estimator, fit it and return the predictions.

     The estimator is configured from ``self.seed`` and ``self.c``; fitting and
     predicting are delegated to the private ``__fit`` / ``__predict`` helpers.
     """
     estimator = LogisticRegression(random_state=self.seed, C=self.c)
     self.__obj = estimator
     self.__fit()
     predictions = self.__predict()
     return predictions
示例#58
0
    labels=['low_level', 'med minus', 'med_level', 'med plus', 'high_level'])
df = pl.binarize_categ_var(df, 'MonthlyIncome_discretize')

df.head()

# # Step 5: Build Classifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# Split the data via the project helper; 0.2 is passed as the last argument
# (presumably the test fraction; confirm against pl.split_data).
X_train, X_test, y_train, y_test = pl.split_data(df, X, y, 0.2)

# Candidate models, all with default hyper-parameters.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier()
]

# Run the project's test/predict helpers once per candidate model.
for method in classifiers:
    pl.test_model(X_train, y_train, features, method)
    # NOTE(review): the fourth argument here is `y`, whereas the parallel
    # pl.eval_model call below passes `y_test`; confirm which one
    # pl.predict_model expects.
    pl.predict_model(X_train, y_train, X_test, y, features, method)

# # Step 6: Evaluate Classifier

# Evaluate every candidate against the held-out split.
for method in classifiers:
    pl.eval_model(X_train, y_train, X_test, y_test, features, method)
示例#59
0
    # Stacking: column j of dataset_blend_train / dataset_blend_test holds the
    # class-1 probabilities produced by base classifier j.
    for j, clf in enumerate(clfs):
        # print(j, clf)
        # One column of submission-set probabilities per fold for this classifier.
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            # print("Fold", i)
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            # y_test = y[test]
            clf.fit(X_train, y_train)
            # Predictions on the held-out fold fill the training meta-features.
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        # Average the per-fold submission predictions into one meta-feature.
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print("Blending.")
    # Meta-learner trained on the base classifiers' held-out predictions.
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    # y_submission = clf.predict_proba(dataset_blend_test)[:, 1]
    y_submission = clf.predict(dataset_blend_test)

    # NOTE(review): the message below is still printed although the stretching
    # step itself is commented out.
    print("Linear stretch of predictions to [0,1]")
    # y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
    y_pred = y_submission
    # NOTE(review): y_test is not assigned in this fragment (the per-fold
    # assignment is commented out); it presumably holds the true labels for
    # X_submission and comes from the enclosing scope. Confirm.
    # The local name accuracy_score is distinct from metrics.accuracy_score.
    accuracy_score = metrics.accuracy_score(y_pred, y_test)
    print("Blend : ", accuracy_score)
    # print
    # "Saving Results."
    # tmp = np.vstack([range(1, len(y_submission) + 1), y_submission]).T
    # np.savetxt(fname='submission.csv', X=tmp, fmt='%d,%0.9f',
示例#60
0
    train_base[feature+'_mean'] = train_base[feature+'_mean']/train_base[feature+'_mean'].max()
    test_base[feature + '_mean'] = np.log(test_base[feature]).replace(-inf, 0)
    test_base[feature + '_mean'] = test_base[feature + '_mean'] / test_base[feature + '_mean'].max()
    # list for remembering names of our modified continuous features
    continuous_feature_list_mean += [feature+'_mean']

# For categorical features, apply mean (target) encoding: each category is
# replaced by the mean of the target over the training rows in that category.
# To avoid overfitting, the encodings are computed on the train dataset only
# and transferred to the test dataset, never recomputed on test.
categorical_features = train_base.drop(
    ['income', 'fnlwgt'] + continuous_feature_list + continuous_feature_list_mean,
    axis=1).columns
for feature in categorical_features:
    db_group = train_base.groupby(feature)['income'].mean()
    # map() looks every category up in the train-derived table. A category
    # that never occurs in train becomes NaN and is removed by dropna() below,
    # instead of being left in the encoded column as its raw value.
    train_base[feature + '_mean'] = train_base[feature].map(db_group)
    test_base[feature + '_mean'] = test_base[feature].map(db_group)

# Datasets cleaning: drop incomplete rows once per dataset, then keep the
# encoded ('mean') columns as inputs and 'income' as the target.
train_clean = train_base.dropna()
train_x = train_clean.loc[:, train_base.columns.str.contains('mean')]
train_y = train_clean.loc[:, 'income']

test_clean = test_base.dropna()
test_x = test_clean.loc[:, test_base.columns.str.contains('mean')]
test_y = test_clean.loc[:, 'income']

# calling training lib - LogisticRegression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

model.fit(train_x, train_y)

# printing results for train and test
print('Train score = '+str(model.score(train_x, train_y)))
print('Test score = ' + str(model.score(test_x, test_y)))