age_scale_param) fare_scale_param = scaler.fit(df['Fare'].reshape(-1, 1)) df['Fare_scaled'] = scaler.fit_transform(df['Fare'].reshape(-1, 1), fare_scale_param) train_df = df.filter( regex= 'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*' ) train_np = train_df.as_matrix() y = train_np[:, 0] X = train_np[:, 1:] clf = linear_model.LogisticRegression(C=1.0, random_state=0, penalty='l1', tol=0.000001) clf.fit(X, y) #------------------------------------------------------------------------------------- data_test.loc[(data_test.Fare.isnull()), 'Fare'] = 0 tmp_df = data_test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']] null_age = tmp_df[data_test.Age.isnull()].as_matrix() test_X = null_age[:, 1:] predictedAges = rfr.predict(test_X) data_test.loc[(data_test.Age.isnull()), 'Age'] = predictedAges data_test = set_Cabin_type(data_test) dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin') dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
def get_data(filename):
    """Load one pickled dataset from the ``grantData_hw3`` directory.

    Args:
        filename: Base name of the pickle file, without the ``.pickle``
            suffix.

    Returns:
        Whatever object is stored in ``grantData_hw3/<filename>.pickle``.
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle, which was previously leaked.
    with open('grantData_hw3/' + filename + '.pickle', 'rb') as handle:
        return picles.Unpickler(handle).load()


train = get_data('training')
testing = get_data('testing')
reduced_set = get_data('reduced')

# Training: fit on the columns listed under reduced_set['x'].
x = train[reduced_set['x']]
y = train['Class']
classifier = linear_model.LogisticRegression(solver='liblinear')
classifier.fit(x, y)

# Testing: predict on the same columns of the held-out set.
test_x = testing[reduced_set['x']]
test_y = testing['Class']
predic_y = classifier.predict(test_x)
# NOTE(review): test_y[n] looks labels up by n; this assumes test_y is
# indexed 0..len-1 -- confirm against the pickled data.
assertion = [test_y[n] == item for n, item in enumerate(predic_y)]
print('Logistic Regression:')
corrects = assertion.count(True)
wrongs = assertion.count(False)
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

# Print small floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True)

# The training CSV must provide a "text" column (features) and an "author"
# column (target); both are read below.
train_df = pd.read_csv("train.csv")
y = train_df["author"]
logreg = linear_model.LogisticRegression(C=1e5)

# Work out how many tokens we should keep: for each vocabulary size, build
# TF-IDF features and record the cross-validated log loss.
token_range = [
    10, 50, 75, 100, 300, 500, 750, 1000, 1250, 1500, 1750, 2000, 2500, 3000
]
token_scores = []
for tokens in token_range:
    vectorizer = TfidfVectorizer(max_df=0.95,
                                 min_df=2,
                                 max_features=tokens,
                                 stop_words='english')
    X = vectorizer.fit_transform(train_df["text"])
    scores = cross_val_score(logreg, X, y, cv=10, scoring='neg_log_loss')
    # 'neg_log_loss' is negated; abs() stores the mean as a positive loss.
    token_scores.append(abs(scores.mean()))

# The backend is selected before pyplot is imported.
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
# Mean log loss (y) against vocabulary size (x).
plt.plot(token_range, token_scores)
for file_name in file_list: df = pd.read_table(file_path + file_name, index_col=0) target = 'Disease' cells = df.columns.tolist() train_score_dict = {} test_score_dict, test_fscore_dict = {}, {} for i in range(0, len(cells) - 1): # for i in range(0, 5): features = cells[i:i + 1] # can use for loop to do one cell iteration print(''.join(features)) train_df, test_df = train_test_split(df, train_size=0.8, random_state=1) # print(train_df.shape, test_df.shape) regularization = 1.0 # 1e5 logreg = linear_model.LogisticRegression(C=regularization) logreg.fit(train_df[features], train_df[target]) preds = logreg.predict_proba(test_df[features]) preds_1 = preds[:, 1] cell = ''.join(features) fpr[cell], tpr[cell], _ = metrics.roc_curve(test_df[target], preds_1) roc_auc[cell] = metrics.auc(fpr[cell], tpr[cell]) p_list = logreg.predict(test_df[features]).tolist() t_list = test_df[target].tolist() print(p_list, t_list, sep="\n") score_Train = logreg.score(train_df[features], train_df[target]) score_Test = logreg.score(test_df[features], test_df[target]) # scoreTestA = metrics.accuracy_score(t_list,p_list) # fraction of correctly classified samples scoreTestF = metrics.f1_score(t_list, p_list) print(features, score_Train, score_Test,
ridge_classifier.fit(train_data, train_lables) # Use the classifier: ridge_prediction = ridge_classifier.predict(test_data) # Quality control: metrics.accuracy_score(test_labels,ridge_prediction) # 0.8666666666666667 good! print(ridge_classifier.coef_) # weights: [[-0.0854443 -0.07273219]] print(ridge_classifier.intercept_) # coef before free member [-0.31250723] plt.show() # LogisticRegression: log_regressor = linear_model.LogisticRegression(random_state=1) log_regressor.fit(train_data,train_lables) lr_predictions = log_regressor.predict(test_data) lr_predictions_proba = log_regressor.predict_proba(test_data) # probability of prediction # print(test_labels) # print(lr_predictions) # print(lr_predictions_proba) accuracy_score = metrics.accuracy_score(test_labels,lr_predictions) # 0.8 # Quality control on cross-validation: ridge_scoring1 = cross_val_score(ridge_classifier, blobs[0], blobs[1], scoring="accuracy", cv = 10) lr_scoring = cross_val_score(log_regressor, blobs[0], blobs[1], scoring="accuracy", cv = 10)
print "PARAMS:" print " targets - target labels which are as positive, in \"s0,s1,s2,...\" format" print " config_file - config file of learning & architecture parameters" print " patts_file - training patterns file in metis format" print " model_file - out model file as \"Perceptron\" in metis format" print "" exit(-1) learn_params = MetisParams.LearnParams() arch_params = MetisParams.ArchParams() targets = map(lambda x: int(x), sys.argv[1].split(",")) learn_params.readFromConfig(sys.argv[2]) clf = linear_model.LogisticRegression(penalty=learn_params._regula, C=learn_params._alpha, max_iter=learn_params._max_iter, tol=learn_params._epsilon) # read patterns from file time0 = datetime.datetime.now() X_train, Y_train = MetisParams.readPatts(sys.argv[3], targets) arch_params._input = len(X_train[0]) arch_params._output = 2 print "Load %d patterns from %s" % (len(Y_train), sys.argv[3]) # train model time1 = datetime.datetime.now() clf.fit(X_train, Y_train) print "Training Completed, number of iterations is %d" % clf.n_iter_ # save model
test_data[:, 0] - 1)) / float(num_test) ctr += 1 dw[ind] = 0 if mode == 1: ind = np.argmax(loss_change) elif mode == 2: ind = np.random.random_integers(0, 41) print 'w = ', w print 'loss = ', loss(w, train_data, reg_term) print 'accuracy = ', accuracy[ctr - 1] regr = linear_model.LogisticRegression(solver='lbfgs', multi_class='multinomial', C=reg_term, tol=1e-8, max_iter=num_iterations) regr.fit(train_data[:, 1:14], train_data[:, 0]) w_regr = np.column_stack((regr.coef_, regr.intercept_)).reshape(-1) print 'w_regr = ', w_regr print 'regr_loss = ', loss(w_regr, train_data, reg_term) print 'regr_accuracy = ', regr.score(test_data[:, 1:14], test_data[:, 0]) if mode == 1: np.save('losses_custom.npy', loss1) np.save('accuracy_custom.npy', accuracy) elif mode == 2: np.save('losses_random.npy', loss1) np.save('accuracy_random.npy', accuracy)
preds = KNN_clf.predict(test) print(preds) print(accuracy_score(test_labels, preds)) # rss=((X-y)**2).sum() # mse=np.mean((X-y)**2) # print("Final rmse value is =",np.sqrt(np.mean((X-y)**2))) AB_clf = AdaBoostClassifier(n_estimators=300, random_state=2) model = AB_clf.fit(train, train_labels) preds = AB_clf.predict(test) print(preds) print(accuracy_score(test_labels, preds)) LOG_clf = linear_model.LogisticRegression(multi_class="ovr", solver="sag", class_weight='balanced') model = LOG_clf.fit(train, train_labels) preds = LOG_clf.predict(test) print(preds) print(accuracy_score(test_labels, preds)) R_F = RandomForestClassifier(n_estimators=300, max_depth=3, random_state=2) model = R_F.fit(train, train_labels) preds = R_F.predict(test) print(preds) print(accuracy_score(test_labels, preds)) scores = [] num_features = len(train.columns) for i in range(num_features): col = train.columns[i]
def get_sparse_classifiers(ncv_set, cvp_set):
    """Return the catalogue of sparse / linear classifiers to evaluate.

    Args:
        ncv_set: Object placed in the third tuple slot of the entries built
            around ``BinClassifier`` (except 'Shrunken Centroids UCV').
        cvp_set: Object placed in the third tuple slot of the entries built
            around ``RegClassifier`` / ``RelaxedLinear`` and of
            'Shrunken Centroids UCV'.

    Returns:
        dict mapping a display name to a 5-tuple
        ``(param_grid, estimator, cv_set, True, key_param)`` where
        ``param_grid`` is a dict of candidate values and ``key_param`` is the
        name of one of its keys.
        NOTE(review): the meaning of the constant ``True`` in slot four is
        decided by the caller and is not visible here.
    """
    return {
        'L1 Logistic Regression': ({
            'proc__C': [
                0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 0.8, 1, 1.5, 2, 3, 5, 10,
                15, 20, 25, 50
            ],
            'severity': [True, False]
        }, BinClassifier(
            # liblinear supports the L1 penalty; the default solver of recent
            # scikit-learn releases (lbfgs) rejects it.
            linear_model.LogisticRegression(penalty='l1',
                                            solver='liblinear',
                                            class_weight='balanced')),
           ncv_set, True, 'proc__C'),
        'L2 Logistic Regression': ({
            'proc__C': [
                0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 0.8, 1, 1.5, 2, 3, 5, 10,
                15, 20, 25, 50
            ],
            'severity': [True, False]
        }, BinClassifier(
            linear_model.LogisticRegression(penalty='l2',
                                            class_weight='balanced')),
           ncv_set, True, 'proc__C'),
        'Lasso': ({
            'reg__alpha': [2, 1.5, 1, 0.8, 0.7, 0.5, 0.3, 0.2, 0.1, 0.01],
            'severity': [True, False],
            'thres': [0, 0.2, 0.75, 0.8, 0.85, 0.9, 0.95]
        }, RegClassifier(linear_model.Lasso()), cvp_set, True, 'reg__alpha'),
        'Linear Regression': ({
            'severity': [True, False],
            'thres': [0]
        }, RegClassifier(linear_model.LinearRegression()), cvp_set, True,
           'thres'),
        'Ridge': ({
            'reg__alpha': [2, 1.5, 1, 0.8, 0.7, 0.5, 0.3, 0.2, 0.1, 0.01],
            'severity': [True, False],
            'thres': [0, 0.2, 0.75, 0.8, 0.85, 0.9, 0.95]
        }, RegClassifier(linear_model.Ridge()), cvp_set, True, 'reg__alpha'),
        'Relaxed Lasso': ({
            'reg__alpha': [0.005, 0.01, 0.1, 0.2, 0.5],
            'severity': [True, False],
            'thres': [0, 0.4, 0.6, 0.75],
            'first_reg__reg__alpha': [1, 0.5, 0.2, 0.1, 0.01],
            'first_reg__severity': [False],
            'first_reg__thres': [0, 0.2, 0.75]
        }, RelaxedLinear(first_reg=RegClassifier(reg=linear_model.Lasso()),
                         reg=linear_model.Lasso()),
           cvp_set, True, 'first_reg__reg__alpha'),
        'Elastic Net': ({
            'reg__alpha': [2, 1.5, 1, 0.8, 0.7, 0.5, 0.3, 0.2, 0.1, 0.01],
            'severity': [True, False],
            'thres': [0, 0.75, 0.85, 0.9, 0.95],
            'reg__l1_ratio': [0.2, 0.4, 0.5, 0.6, 0.7, 0.8]
        }, RegClassifier(linear_model.ElasticNet()), cvp_set, True,
           'reg__alpha'),
        'Shrunken Centroids OCV': ({
            'proc__shrink_threshold':
            [2, 1.5, 1.3, 1.2, 1.1, 1, 0.8, 0.5, 0.1, 0.01],
            'proc__metric': ['euclidean', 'manhattan', 'cosine'],
            'severity': [False]
        }, BinClassifier(ShrunkenCentroidClassifier()), ncv_set, True,
           'proc__shrink_threshold'),
        'Shrunken Centroids UCV': ({
            'proc__shrink_threshold':
            [2, 1.5, 1.3, 1.2, 1.1, 1, 0.8, 0.5, 0.1, 0.01],
            'proc__metric': ['euclidean', 'manhattan', 'cosine'],
            'severity': [False]
        }, BinClassifier(ShrunkenCentroidClassifier()), cvp_set, True,
           'proc__shrink_threshold'),
        'SVM enet': ({
            'proc__loss': ['modified_huber'],
            'proc__alpha': [2, 1, 0.5, 0.4, 0.2, 0.1, 0.05],
            'proc__l1_ratio': [0.1, 0.2, 0.3, 0.5]
        }, BinClassifier(
            linear_model.SGDClassifier(penalty='elasticnet',
                                       class_weight='balanced')),
           ncv_set, True, 'proc__alpha'),
        'L1 Linear SVM': ({
            # NOTE(review): 0.5 appears twice and the values are unsorted;
            # kept as-is so the searched grid is unchanged.
            'proc__C':
            [0.0000001, 0.001, 0.01, 0.5, 0.7, 0.1, 0.2, 0.5, 1, 1.5, 2]
        }, BinClassifier(
            svm.LinearSVC(penalty='l1', dual=False,
                          class_weight='balanced')),
           ncv_set, True, 'proc__C')
    }
def solver(self, solver):
    """Record *solver* and rebuild ``self.classifier`` around it.

    The new estimator reuses the C and multi_class values already stored on
    the instance; any previously fitted state of ``self.classifier`` is
    replaced by the fresh, unfitted estimator.
    """
    self.__solver = solver
    settings = {
        'solver': self.__solver,
        'C': self.__C,
        'multi_class': self.__multi_class,
    }
    self.classifier = linear_model.LogisticRegression(**settings)
def multi_class(self, multi_class):
    """Record *multi_class* and rebuild ``self.classifier`` around it.

    The new estimator reuses the solver and C values already stored on the
    instance; any previously fitted state of ``self.classifier`` is replaced
    by the fresh, unfitted estimator.
    """
    self.__multi_class = multi_class
    settings = {
        'solver': self.__solver,
        'C': self.__C,
        'multi_class': self.__multi_class,
    }
    self.classifier = linear_model.LogisticRegression(**settings)
qeds.themes.mpl_style() # Logistic Regression data_url = "https://raw.githubusercontent.com/propublica/compas-analysis" data_url += "/master/compas-scores-two-years.csv" df = pd.read_csv(data_url) df.head() X = df[["decile_score"]] y = df["two_year_recid"] X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.25) logistic_model = linear_model.LogisticRegression(solver="lbfgs") logistic_model.fit(X_train, y_train) beta_0 = logistic_model.intercept_[0] beta_1 = logistic_model.coef_[0][0] print(f"Fit model: p(recid) = L({beta_0:.4f} + {beta_1:.4f} decile_score)") # Decesion boundaries X = df[["decile_score", "age"]] X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.25, random_state=42) logistic_age_model = linear_model.LogisticRegression(solver="lbfgs") logistic_age_model.fit(X_train, y_train)
'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 2, 3, 4, 5, 10], 'solver': ['liblinear'], 'max_iter': [100, 500], 'tol': [0.0001, 0.00001, 0.00001], 'class_weight': [None, 'balanced'] } #survived = y[y == 1] #isCabin_surv = data.iloc[survived.index]['Cabin'].map(lambda x: 0 if x != x else 1) #n_survived = y[y == 0] #isCabin_n_surv = data.iloc[n_survived.index]['Cabin'].map(lambda x: 0 if x != x else 1) #colors = ['blue','green'] #plt.hist([isCabin_surv, isCabin_n_surv], histtype='bar', color=colors, stacked=True, fill=True, label = ['survived', 'not survived']) #plt.legend() linear_model_titanic = linear_model.LogisticRegression(random_state=0) linear_search_res = ms.GridSearchCV(linear_model_titanic, param_log) linear_model_titanic.fit(whole_train_matrix, y) linear_search_res.fit(whole_train_matrix, y) #predictions = linear_search_res.best_estimator_.predict_proba(whole_test_matrix) #predictions = [1 if prediction[1] > 0.5 else 0 for prediction in predictions] score_logistic = ms.cross_val_score(linear_model_titanic, whole_train_matrix, y, cv=3).mean() score_search_logistic = ms.cross_val_score(linear_search_res.best_estimator_, whole_train_matrix, y, cv=3).mean() #answer = pd.DataFrame() #answer['PassengerId'] = test_data['PassengerId']
print(train.columns)
sns.countplot(x='target', data=train)

# Features are every column from the third onwards; the target is cast to int.
train1 = train.iloc[:, 2:]
y = train['target'].astype(int)
X_train, X_eval, y_train, y_eval = model_selection.train_test_split(
    train1, y, test_size=0.1, random_state=1)

# Impute -> drop zero-variance columns -> RFE with a linear SVM -> logistic
# regression.
# NOTE(review): preprocessing.Imputer was removed in scikit-learn 0.22 in
# favour of sklearn.impute.SimpleImputer -- confirm the pinned version.
stages = [('imputer', preprocessing.Imputer()),
          ('zv_filter', feature_selection.VarianceThreshold()),
          ('feature_selector',
           feature_selection.RFE(svm.LinearSVC(max_iter=10000))),
          # liblinear handles both penalties searched below ('l1' and 'l2');
          # the default solver of recent scikit-learn releases rejects 'l1'.
          ('classifier', linear_model.LogisticRegression(solver='liblinear'))]
# NOTE(review): this rebinds the name `pipeline` from the module to the
# Pipeline instance; kept because code outside this view may rely on it.
pipeline = pipeline.Pipeline(stages)
pipeline_grid = {
    'feature_selector__n_features_to_select': [10, 20],
    'classifier__C': [0.001, 0.01, 0.1, 0.2, 0.5],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__class_weight': ['balanced', None]
}
pipeline_generated = cutils.grid_search_best_model(pipeline,
                                                   pipeline_grid,
                                                   X_train,
                                                   y_train,
                                                   scoring="roc_auc")
final_estimator = pipeline_generated.named_steps['classifier']
print(pipeline_generated.score(X_eval, y_eval))
import numpy as np
from sklearn import linear_model
from sklearn.metrics import accuracy_score as acc
from utils.mnist_loader import load_mnist

# Fixed seed so the shuffle (and therefore the split) is reproducible.
np.random.seed(21)

## DATA
# Only the "t10k" image/label files are loaded; train and test subsets are
# both carved out of them below.
X_path = 'mnist/t10k-images-idx3-ubyte'
y_path = 'mnist/t10k-labels-idx1-ubyte'
X, y = load_mnist(X_path, y_path)

## FEATURE
# Shuffle the sample indices, then take the first 6000 for training and the
# last 4000 for testing.
id_shuffle = np.random.permutation(len(y))
id_trn = id_shuffle[:6000]
id_tst = id_shuffle[-4000:]
# Flatten each sample to a vector and rescale.
# NOTE(review): the /127.5 - 1 mapping presumably assumes 0..255 pixel
# values, giving a [-1, 1] range -- confirm against load_mnist.
X = X.reshape(X.shape[0],-1)/127.5 - 1.
X_trn, y_trn = X[id_trn], y[id_trn]
X_tst, y_tst = X[id_tst], y[id_tst]
print('train: ', X_trn.shape, y_trn.shape)
print('test: ', X_tst.shape, y_tst.shape)

## MODEL
# One-vs-rest logistic regression with the liblinear solver.
model = linear_model.LogisticRegression(C=1e5,
                                        solver='liblinear',
                                        multi_class='ovr')
model.fit(X_trn, y_trn)
y_prd = model.predict(X_tst)
accuracy = acc(y_tst, y_prd)
print('accuracy: ', accuracy)
def __init__(self):
    """Create the wrapped logistic-regression estimator.

    The estimator is built with ``class_weight='balanced'`` and stored,
    unfitted, on ``self.clf``.
    """
    balanced_logreg = linear_model.LogisticRegression(class_weight='balanced')
    self.clf = balanced_logreg
def get_classifier(self):
    """Return a new one-vs-rest wrapper around an lbfgs logistic regression.

    A fresh, unfitted estimator is built on every call.
    """
    return OneVsRestClassifier(
        linear_model.LogisticRegression(solver='lbfgs'))
def fitL1LogisticWithNFeat(**kwargs):
    """Run ``fitModelWithNFeat`` with an L1-penalised logistic regression.

    Args:
        **kwargs: Forwarded unchanged to ``fitModelWithNFeat``; must not
            contain ``fitter``, which is supplied here.

    Returns:
        Whatever ``fitModelWithNFeat`` returns.
    """
    return fitModelWithNFeat(
        fitter=linear_model.LogisticRegression(penalty="l1",
                                               C=1,
                                               solver="saga"),
        **kwargs)
# NOTE: Python 2 source (print statements).
print df
my_tags = ['m', 'f']
# Class balance of the target column.
print df.gender.value_counts()

# Hold out 25% of the rows; the seed keeps the split reproducible.
train_data, test_data = train_test_split(df, test_size=0.25, random_state=42)
print(len(test_data), len(train_data))

print "\n---BAG OF WORDS---"
# Word-level counts tokenised with NLTK, capped at 4000 features.
count_vectorizer = CountVectorizer(analyzer="word",
                                   tokenizer=nltk.word_tokenize,
                                   preprocessor=None,
                                   stop_words='english',
                                   max_features=4000)
train_data_features = count_vectorizer.fit_transform(train_data['text'])
logreg_model = linear_model.LogisticRegression(n_jobs=1, C=1e5)
logreg_model = logreg_model.fit(train_data_features, train_data['gender'])
print word_embeddings.predict(count_vectorizer, logreg_model, test_data,
                              my_tags)

print "\n---N-GRAMS---"
# Character n-grams of length 2 to 5, capped at 4000 features.
n_gram_vectorizer = CountVectorizer(analyzer="char",
                                    ngram_range=([2, 5]),
                                    tokenizer=None,
                                    preprocessor=None,
                                    max_features=4000)
charn_model = linear_model.LogisticRegression(n_jobs=1, C=1e5)
# Rebinds train_data_features to the character n-gram matrix.
train_data_features = n_gram_vectorizer.fit_transform(train_data['text'])
def fitLogisticWithNFeat(**kwargs):
    """Run ``fitModelWithNFeat`` with an L2 logistic regression at C=1e10.

    Args:
        **kwargs: Forwarded unchanged to ``fitModelWithNFeat``; must not
            contain ``fitter``, which is supplied here.

    Returns:
        Whatever ``fitModelWithNFeat`` returns.
    """
    return fitModelWithNFeat(
        fitter=linear_model.LogisticRegression(penalty="l2", C=1e10),
        **kwargs)
def get_logistic_regr_score(X_train, Y_train, X_test, Y_test):
    """Fit a newton-cg logistic regression and score it on the test split.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Test features.
        Y_test: Test labels.

    Returns:
        The value of the estimator's ``score`` method on the test split.
    """
    fitted = linear_model.LogisticRegression(
        solver='newton-cg', random_state=42).fit(X_train, Y_train)
    return fitted.score(X_test, Y_test)
# Loads pandas import pandas # Loads numpy import numpy as np from xgboost import XGBClassifier dataframe = pandas.read_csv("USDJPY,5multiclass.csv", header=None) dataset = dataframe.values # split into input (X) and output (Y) variables #X = dataset[:,0:4050].astype(float) X = dataset[:,0:59] scaler = StandardScaler() scaler.fit(X) X = scaler.transform(X) y = dataset[:,59] clf0 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=200) clf1 = lm.LogisticRegression(penalty = "l1", C = 9081) clf2 = RandomForestClassifier(random_state=1, n_estimators=200) clf3 = lm.LogisticRegression(penalty = "l2", C = 5000) clf4 = MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08, hidden_layer_sizes=(30, 30, 30), learning_rate='constant', learning_rate_init=0.001, max_iter=2000, momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=None, shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False, warm_start=False) eclf = EnsembleVoteClassifier(clfs=[clf0, clf1, clf2, clf3, clf4], weights=[1,1,1,3,1]) labels = ['GBC','Lasso', 'Random Forest', 'Ridge', 'MLP','Ensemble'] for clf, label in zip([clf0, clf1, clf2, clf3, clf4, eclf], labels): scores = model_selection.cross_val_score(clf, X, y,
def prediction_step(background_train, background_test, job_training_data,
                    challengeID_train):
    """Fit the cross-validation winner and write its test probabilities.

    ``cross_validate_model`` returns the index of the method to use; that
    method is fitted on the training data and the second column of its
    ``predict_proba`` output for the test data is written, one value per
    line, to ``predict_job_training_<method_label>.csv``.

    Args:
        background_train: Training features; ``.values`` is read.
        background_test: Test features; ``.values`` is read.
        job_training_data: Training target; flattened to 1-D.
        challengeID_train: Not used by this function.

    Returns:
        None. The result is the file written to the working directory.
    """
    # DataFrame.as_matrix() was removed from pandas; .values is available in
    # every version. Plain ndarrays are used instead of np.matrix, which
    # recent scikit-learn releases no longer accept.
    # NOTE(review): cross_validate_model now receives ndarrays rather than
    # np.matrix objects -- confirm it does not rely on matrix semantics.
    background_train_np = np.asarray(background_train.values)
    background_test_np = np.asarray(background_test.values)

    # The target must be a 1-D array for the estimators below.
    job_training_data_np = np.ravel(job_training_data.values)

    # Optional preprocessing steps, currently disabled:
    #background_train_np, background_test_np = select_feature(background_train_np, background_test_np, job_training_data_np)
    #background_train_np, background_test_np = select_k_best(background_train_np, background_test_np, job_training_data_np)
    #background_train_np, background_test_np = perform_pca(background_train_np, background_test_np, job_training_data_np)
    # predict_job_training = perform_one_hotencoding(background_train_np, background_test_np, job_training_data_np)

    # Cross validation picks the index of the best performing method.
    position = cross_validate_model(background_train_np, job_training_data_np)

    ####################################################
    ## Set up the same methods used in cross validation
    ## Fitting twice gives an error hence this way
    ####################################################
    clf_quaddis = discriminant_analysis.QuadraticDiscriminantAnalysis()
    # liblinear supports the L1 penalty; the default solver of recent
    # scikit-learn releases rejects it.
    clf_logreg = sklinear.LogisticRegression(penalty='l1', solver='liblinear')
    clf_random_forest = ensemble.RandomForestClassifier(n_estimators=50)
    clf_adaboost = ensemble.AdaBoostClassifier(n_estimators=50)
    clf_mlpc = neural_network.MLPClassifier()
    clf_extra_tree = ensemble.ExtraTreesClassifier(n_estimators=50,
                                                   bootstrap=True)

    # Same order as the labels below; `position` indexes both lists.
    methods = [clf_quaddis, clf_logreg, clf_random_forest, clf_adaboost,
               clf_mlpc, clf_extra_tree]
    methods_label = ['clf_quaddis', 'clf_logreg', 'clf_random_forest',
                     'clf_adaboost', 'clf_mlpc', 'clf_extra_tree']

    method = methods[position]
    method_label = methods_label[position]
    print('The chosen method is : %s' % (method_label))

    # Predict based on the chosen method.
    method.fit(background_train_np, job_training_data_np)
    predict_job_training = method.predict_proba(background_test_np)

    filename = 'predict_job_training_' + method_label + '.csv'
    # Mode "w" truncates any earlier output, replacing the previous
    # remove-then-append sequence that reopened the file for every row.
    with open(filename, "w") as out_file:
        for row in predict_job_training:
            out_file.write("%f \r\n" % (row[1]))
def run(self):
    """Stack the required tasks' outputs with a logistic regression.

    Steps, in order:
      1. Load every required task's training output into one frame and add
         the ``is_duplicate`` target.
      2. Fit a plain logistic regression and print its coefficients.
      3. Run 10-fold CV on normalised, degree-2 polynomial features and
         print each fold's weighted log loss and the mean.
      4. Refit on all training rows with sample weights, predict the test
         rows and write them to ``cache/stacked_pred.csv.gz``.

    Raises:
        AssertionError: if a task's test output has a different column count
            than its training output.
    """
    data = OrderedDict()
    shapes = {}
    for r in self.requires():
        x = r.load().squeeze()
        data[r.task_id] = x
        # Column count per task; 1-D outputs count as a single column.
        shapes[r.task_id] = x.shape[1] if len(x.shape) == 2 else 1

    data = pandas.DataFrame(data)[list(data.keys())]
    data['is_duplicate'] = Dataset().load()[1].is_duplicate
    # axis is passed by keyword: the positional form was removed in pandas 2.
    X = data.drop('is_duplicate', axis=1).values
    print(X.max(), X.min(), np.isnan(X).sum())
    y = data.is_duplicate.values
    np.savetxt('cache/Ry.csv',
               data.is_duplicate,
               header='is_duplicate',
               delimiter=',')
    # Per-row sample weight looked up from the row's class label.
    weights = core.weights[y]
    scores = []

    # Untransformed fit, used only to print the coefficient per input.
    cls = linear_model.LogisticRegression(C=10)
    cls.fit(X, y)
    print(pandas.Series(cls.coef_[0],
                        data.drop('is_duplicate', axis=1).columns))

    polytransform = preprocessing.PolynomialFeatures(2)
    scaletransform = preprocessing.Normalizer()
    transform = pipeline.Pipeline([('scale', scaletransform),
                                   ('poly', polytransform)])

    for train_index, test_index in model_selection.KFold(
            n_splits=10).split(X, y):
        cls = linear_model.LogisticRegression(C=10)
        #cls = TorchLogit()
        X_train, X_test = X[train_index], X[test_index]
        X_train = transform.fit_transform(X_train)
        X_test = transform.transform(X_test)
        y_train, y_test = y[train_index], y[test_index]
        # Weights are applied when scoring only; the fold fit is unweighted.
        w_test = weights[test_index]
        cls.fit(X_train.copy(), y_train.copy())
        pred = cls.predict_proba(X_test)
        score = metrics.log_loss(y_test, pred, sample_weight=w_test)
        print(score)
        scores.append(score)

    print(colors.yellow | '!----++++++----!')
    print(colors.yellow | colors.bold | '|' + str(np.mean(scores)) + '|')
    print(colors.yellow | '¡----++++++----¡')

    # Final model: all training rows, this time with sample weights.
    X = transform.transform(X)
    cls.fit(X, y, sample_weight=weights)

    data = OrderedDict()
    for r in self.requires():
        x = r.load_test().squeeze()
        data[r.task_id] = x
        expected = shapes[r.task_id]
        actual = x.shape[1] if len(x.shape) == 2 else 1
        # The conditional is evaluated before the comparison; previously the
        # whole comparison sat inside the conditional, so 1-D outputs always
        # passed. The message now uses str.format to match its {} fields.
        assert expected == actual, \
            "Shape: {} did not match expected {}".format(x.shape, expected)
        #print(r.__class__.__name__, '\t', x.shape, type(x))

    data = pandas.DataFrame.from_dict(data)
    X = data.values
    X = transform.transform(X)
    index = pandas.Index(np.arange(X.shape[0]), name='test_id')
    pred = pandas.Series(cls.predict_proba(X)[:, 1],
                         index=index,
                         name='is_duplicate').to_frame()
    print(colors.green | str(pred.head()))

    # Write to a temporary name, then rename it over the final path.
    with gzip.open('cache/stacked_pred.csv.gz.tmp', 'wt') as f:
        pred.to_csv(f)
    os.rename('cache/stacked_pred.csv.gz.tmp', 'cache/stacked_pred.csv.gz')
def _build_classifier(model, c, regularizer):
    """Return an unfitted estimator for *model* using hyper-parameter *c*."""
    if model == 'LOGISTIC':
        # liblinear supports the L1 penalty; the default solver of recent
        # scikit-learn releases rejects it.
        return linear_model.LogisticRegression(penalty=regularizer,
                                               C=c,
                                               solver='liblinear')
    if model == 'KNN':
        return KNeighborsClassifier(n_neighbors=c,
                                    metric='euclidean',
                                    weights='uniform')
    return tree.DecisionTreeClassifier(max_leaf_nodes=c)


def ml(dfin):
    """Cross-validate KNN, logistic regression and a decision tree.

    For each model and each stratified fold, the first quarter of the fold's
    training indices is held out as a dev set to choose the hyper-parameter.
    The model is then refitted on the whole training fold with the chosen
    value and scored on the train and test indices. Results are printed;
    nothing is returned.

    Args:
        dfin: Data convertible to a float DataFrame. Must contain a 'y'
            column (cast to bool); every other column is used as a feature.
            Rows containing NaN are dropped.

    Raises:
        AssertionError: if there is no 'y' column.
    """
    df = pd.DataFrame(dfin, dtype=float)
    assert ('y' in df.columns)
    df = df.dropna()
    if df.shape[0] < 20:
        N_FOLDS = 2
    else:
        N_FOLDS = 10
    ACC_THRESH = 0.01  # dev set accuracy must be x% better to use new param
    models = ['KNN', 'LOGISTIC', 'TREE']
    for model in models:
        print('\nMODEL: ', model)
        # Candidate hyper-parameter values: C, n_neighbors or max_leaf_nodes.
        if model == 'LOGISTIC':
            c_l = [
                0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000, 3000,
                10000
            ]
        elif model == 'KNN':
            # NOTE(review): n_neighbors larger than the number of training
            # rows makes KNN raise on small inputs -- confirm input sizes.
            c_l = [50, 40, 35, 30, 25, 20, 18, 15]
        else:
            c_l = [3, 4, 5, 6, 7, 8]
        regularizer = 'l1'
        X_nd = df.drop('y', axis=1).values
        #X_nd = scale(X_nd)  # magnitude has useful info?
        y_n = df['y'].values.astype(bool)
        skf = StratifiedKFold(shuffle=True, n_splits=N_FOLDS)
        acc_test_a = np.zeros(N_FOLDS)
        acc_train_a = np.zeros(N_FOLDS)
        for i, (train, test) in enumerate(skf.split(X_nd, y_n)):
            train_n = len(train)
            dev = train[:int(train_n / 4)]  # empirically found that dev 1/4 is good
            sub_train = train[int(train_n / 4):]  # this is temporary train set
            best_acc = 0
            best_c = None
            # in this loop we find best hyper parameter for this split
            for c in c_l:
                clf = _build_classifier(model, c, regularizer)
                clf.fit(X_nd[sub_train], y_n[sub_train])
                y_pred = clf.predict(X_nd[dev])
                acc = metrics.accuracy_score(y_n[dev], y_pred)
                if (acc > best_acc + ACC_THRESH):
                    best_acc = acc
                    best_c = c
            if best_c is None:
                # No candidate cleared the threshold; fall back to the first
                # one instead of passing None to the estimator.
                best_c = c_l[0]
            # retrain with all train data and best_c
            print('fold:',
                  i,
                  ' best c:',
                  best_c,
                  ' dev:%.2f' % best_acc,
                  ' dev_ones:%.2f' % (y_n[dev].sum() / len(dev)),
                  end='')
            clf = _build_classifier(model, best_c, regularizer)
            clf.fit(X_nd[train], y_n[train])
            y_pred = clf.predict(X_nd)
            acc_test_a[i] = metrics.accuracy_score(y_pred[test], y_n[test])
            acc_train_a[i] = metrics.accuracy_score(y_pred[train],
                                                    y_n[train])
            print(' test:%.2f' % acc_test_a[i],
                  ' train:%.2f' % acc_train_a[i])
        print('Avg test acc:%.3f' % acc_test_a.mean(),
              'Avg train acc:%.3f' % acc_train_a.mean())
# with torch.no_grad(): # loss, acc = validation(model, test_dataset, criterion) # print("Loss ", loss, "Accuracy", acc) model = Network(1470, [400], 1) batch_size = 60 epoch = 3 criterion = nn.BCELoss() optimizer = optim.Adam(model.parameters(), lr=0.02) df = pd.read_csv('Features.csv') feat = df.drop(df.columns[[0, -1]], axis=1, inplace=False) Xd = feat.values.astype(float) Y = df['Class'].values Yd = (Y == 'M').astype(float) m1 = sksvm.SVC() m2 = ske.RandomForestClassifier() m3 = skl.LogisticRegression() svmScore = 0 rfScore = 0 logScore = 0 nnscore = 0 for i in range(10): X_train, X_test, Y_train, Y_test = skm.train_test_split(Xd, Yd, test_size=0.3) trainset = data(X_train, Y_train, batch_size) model = Network(1470, [400], 1) batch_size = 60 epoch = 3 criterion = nn.BCELoss() optimizer = optim.Adam(model.parameters(), lr=0.02)
# (4) Feature engineering - scaling
# Next comes some more preprocessing: features with a wide range of values
# are standardised (zero mean, unit variance), which speeds up the
# convergence of logistic regression.
import sklearn.preprocessing as preprocessing

scaler = preprocessing.StandardScaler()
# scikit-learn transformers expect 2-D input, hence the double brackets;
# ravel() flattens the (n, 1) result so it can be stored as one column.
# NOTE(review): fit() returns the scaler itself, so age_scale_param and
# fare_scale_param are the same object and end up holding the Fare
# statistics -- confirm before reusing them on the test set.
age_scale_param = scaler.fit(df[['Age']])
df['Age_scaled'] = scaler.fit_transform(df[['Age']]).ravel()
fare_scale_param = scaler.fit(df[['Fare']])
df['Fare_scaled'] = scaler.fit_transform(df[['Fare']]).ravel()

# (5) Feature engineering - feature extraction
# Select the feature columns we need and convert them to a numpy array so
# they can be modelled with scikit-learn's LogisticRegression.
train_df = df.filter(
    regex=
    'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)
#train_df.to_csv("processed_titanic.csv" , encoding = "utf-8")
# DataFrame.as_matrix() was removed from pandas; .values works everywhere.
train_np = train_df.values

# y is the Survived outcome (first selected column)
y = train_np[:, 0]

# X holds the feature values (all remaining columns)
X = train_np[:, 1:]

# (6) Model construction and training
# liblinear supports the L1 penalty; the default solver of recent
# scikit-learn releases rejects it.
clf = linear_model.LogisticRegression(C=1.0,
                                      penalty='l1',
                                      solver='liblinear',
                                      tol=1e-6)
clf.fit(X, y)

# (7) Plot the learning curve
plot_learning_curve(clf, u"学习曲线", X, y)
# The module is `sklearn.neural_network` (singular); the previous plural
# spelling does not exist and failed at import time.
from sklearn.neural_network import BernoulliRBM
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20
# (replaced by sklearn.model_selection) -- confirm the pinned version.
from sklearn.cross_validation import cross_val_score
from sklearn import linear_model

#constants
TRAINING = 'train.csv'
TEST = 'test.csv'


def read_in_csv(filename):
    """Read a comma-separated file and return its rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one list of string fields per row, header included.
    """
    # NOTE(review): binary mode is the Python 2 convention for csv; Python 3
    # needs text mode with newline='' -- confirm the target interpreter.
    with open(filename, 'rb') as input_data:
        filereader = csv.reader(input_data, delimiter=',')
        return list(filereader)


train_examples = read_in_csv(TRAINING)
train_examples.pop(0)  #get rid of column headers
train_labels = []

#extract labels: the first field of each row is removed and kept as its label
for row in train_examples:
    train_labels.append(row.pop(0))

#models we will use
linear_classifier = linear_model.LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)
from sklearn import datasets, neighbors, linear_model

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

n_samples = int(len(X_digits))
# Slice bounds must be integers; the float product used previously raises
# TypeError. The first 90% of the samples train, the remainder tests.
n_train = int(.9 * n_samples)

X_train = X_digits[:n_train]
y_train = y_digits[:n_train]
X_test = X_digits[n_train:]
y_test = y_digits[n_train:]

knn = neighbors.KNeighborsClassifier()
logistic = linear_model.LogisticRegression()

# Fit each model on the training part and report its test accuracy.
print('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test))
print('LogisticRegression score: %f' %
      logistic.fit(X_train, y_train).score(X_test, y_test))
def runIrisFlowersTool():
    """Train scikit-learn's logistic regression on the iris data and report.

    Reads and splits the data with ``readDataTool`` / ``splitData``,
    standardises the inputs with statistics fitted on the training split,
    fits the model, then prints the intercept and first four coefficients
    for each of the three classes, the test predictions, the accuracy and
    the classification error.
    """
    print("\nIris Flowers with tool\n")
    inputs, outputs = readDataTool()
    inputTrain, outputTrain, inputTest, outputTest = splitData(inputs, outputs)

    # normalise the data
    scaler = StandardScaler()
    if isinstance(inputTrain[0], list):
        scaler.fit(inputTrain)
        normalisedTrainInput = scaler.transform(inputTrain)
        normalisedTestInput = scaler.transform(inputTest)
    else:
        # Single-feature samples: wrap each value in a list for the scaler,
        # then unwrap the scaled values again.
        inputTrain = [[value] for value in inputTrain]
        inputTest = [[value] for value in inputTest]
        scaler.fit(inputTrain)
        normalisedTrainInput = [
            scaled[0] for scaled in scaler.transform(inputTrain)
        ]
        normalisedTestInput = [
            scaled[0] for scaled in scaler.transform(inputTest)
        ]

    # normalised data: normalisedTrainInput, normalisedTestInput
    logisticRegressionTool = linear_model.LogisticRegression(max_iter=1000)
    logisticRegressionTool.fit(normalisedTrainInput, outputTrain)

    # One row of intercept_ / coef_ per class, in this order.
    headers = ('Model SETOSA: w0 = ', 'Model VERSICOLOR: w0 = ',
               'Model VIRGINICA: w0 = ')
    for row, header in enumerate(headers):
        w0 = logisticRegressionTool.intercept_[row]
        w1, w2, w3, w4 = [
            logisticRegressionTool.coef_[row][col] for col in range(4)
        ]
        print(header, w0, ' w1 = ', w1, ' w2 = ', w2, ' w3 = ', w3,
              ' w4 = ', w4)
    print()

    # The fitted model is deterministic, so one predict call serves all
    # three reports.
    predictions = logisticRegressionTool.predict(normalisedTestInput)
    print('Prediction (tool): ', predictions)
    accuracy = accuracy_score(outputTest, predictions)
    print("Accuracy (tool): ", accuracy)
    error = 1 - accuracy
    print("Classification Error (tool): ", error)