def model(neural_data, run_onset, det_window, penalty, neuron_num=None, pca_com=None):
    """Fit a cross-validated logistic-regression decoder and record its accuracy.

    Features come either from ``neuron_num`` randomly drawn neuron indices or
    from ``pca_com`` PCA components; ``neuron_num`` takes precedence when both
    are given.  The fitted decoder and its train/test accuracies are stored in
    the module-level ``decoders``, ``train_acc`` and ``test_acc`` dicts under
    the key ``(neuron_num or pca_com, det_window, penalty)``.

    Returns:
        ``abs(train accuracy - test accuracy)`` when the test accuracy is
        above 0.7, otherwise the sentinel value ``100``.

    Raises:
        ValueError: if neither ``neuron_num`` nor ``pca_com`` is provided.
    """
    if neuron_num is None and pca_com is None:
        raise ValueError("Either `neuron_num` or `pca_com` must be provided.")

    if neuron_num is not None:
        # NOTE(review): randint draws with replacement, so an index may repeat.
        neurons_idx = np.random.randint(0, len(neural_data), neuron_num)
        feat = extract_features(neural_data, neurons_idx=neurons_idx)
        key = neuron_num
    else:
        feat = extract_features(neural_data, pca_comp_num=pca_com)
        key = pca_com

    # 0.2 is presumably the held-out fraction -- confirm against prepare_data.
    X_train, y_train, X_test, y_test = prepare_data(feat, run_onset, det_window, 0.2)

    decoder = LogisticRegressionCV(
        Cs=np.logspace(-8, 0, 50),
        penalty=penalty,
        solver='liblinear',
        max_iter=100,
    )
    decoder.fit(X_train, y_train)
    acc_test = decoder.score(X_test, y_test)
    acc_train = decoder.score(X_train, y_train)

    result_key = (key, det_window, penalty)
    decoders[result_key] = decoder
    train_acc[result_key] = acc_train
    test_acc[result_key] = acc_test

    if acc_test > 0.7:
        return np.abs(acc_train - acc_test)
    return 100
def SimGraphConv(A, X, Y, k, penalty='l2'):
    """Simple Graph Convolution followed by cross-validated logistic regression.

    The adjacency matrix is augmented with self-loops and symmetrically
    normalised (``S = D^-1/2 (A + I) D^-1/2``); the features are then
    propagated as ``S**k`` times ``X`` and a logistic-regression classifier is
    fitted on a random 70/30 train/test split of the nodes.

    Arguments:
        A: Sparse adjacency matrix [n, n] (n is the number of nodes)
        X: Sparse feature matrix [n, d] (d is the number of features)
        Y: Numpy array with labels, indexable by node index
        k: number of propagation steps (power applied to ``S``)
        penalty: 'l1' or 'l2', the norm used in the penalization.

    Return:
        Tuple ``(train_accuracy, test_accuracy, coef_)`` of the fitted
        classifier.
    """
    A_hat = A + sparse.eye(A.shape[0])
    D_hat = np.asarray(A_hat.sum(axis=0)).astype(np.float64)[0]
    # Self-loops should make every degree positive; guards the inverse sqrt.
    assert (D_hat > 0).all()
    D_hat_invsqrt = sparse.diags(D_hat ** (-0.5))
    S = D_hat_invsqrt.dot(A_hat).dot(D_hat_invsqrt)

    train_ind, test_ind = train_test_split(
        np.arange(X.shape[0]), train_size=0.7, test_size=0.3)

    # Compute the propagation operator once instead of once per split.
    S_k = S ** k
    X_train = S_k[train_ind, :].dot(X)
    Y_train = Y[train_ind]
    X_test = S_k[test_ind, :].dot(X)
    Y_test = Y[test_ind]

    # Honour the caller's penalty (it was previously hard-coded to 'l2').
    logfit = LogisticRegressionCV(
        cv=2, penalty=penalty, solver='liblinear', random_state=1, max_iter=100,
    ).fit(X_train, Y_train)
    return logfit.score(X_train, Y_train), logfit.score(X_test, Y_test), logfit.coef_
def fit_logistic_cv(X_train, X_test, y_train, y_test, cv=5):
    """Fit a cross-validated logistic regression and report test-set metrics.

    Trains ``LogisticRegressionCV`` with ``cv`` folds on the training data,
    predicts the test data, then prints

    1. the classification report for the test predictions, and
    2. the accuracy on the same test set, e.g. ``accuracy: 0.788643533123``.

    The numbers vary between runs when the data split is random.  Nothing is
    returned.
    """
    from sklearn.linear_model import LogisticRegressionCV

    # train model using cross-validation
    model = LogisticRegressionCV(cv=cv).fit(X_train, y_train)

    # make prediction
    pred_y = model.predict(X_test)

    # evaluate the prediction results; accuracy is measured on the test set so
    # that it is consistent with the classification report above it.
    print(metrics.classification_report(y_test, pred_y))
    print("accuracy: %s" % model.score(X_test, y_test))
def sub_cancer_type(x, y, t):
    """Train a low-vs-high classifier for the samples of one cancer type.

    Row 0 of ``x`` is carried along as a header row; rows ``1:`` are aligned
    with the rows of ``y``.  Column 0 of ``y`` is compared against ``t`` to
    pick the samples, column 1 is the value that gets binarised: values at or
    below the 33rd percentile become 0, values at or above the 66th percentile
    become 1, and everything in between is dropped.

    Returns:
        ``(train_accuracy, test_accuracy, selected_feature_mask, coef_)``.
    """
    # Samples belonging to the requested cancer type.
    is_type = y[:, 0] == t
    y_type = y[is_type]
    x_type = np.concatenate(
        (x[0].reshape(1, len(x[0])), x[1:][is_type]), axis=0)

    # Binarise the second label column at its 33rd / 66th percentiles.
    low_cut, high_cut = np.percentile(y_type[:, 1], (33, 66))
    target = np.array(y_type[:, 1], copy=True)
    is_low = target <= low_cut
    is_high = target >= high_cut
    target[is_low] = 0
    target[is_high] = 1
    keep = np.logical_or(is_low, is_high)
    target = target[keep]
    x_kept = np.concatenate(
        (x_type[0].reshape(1, len(x_type[0])), x_type[1:][keep]), axis=0)

    # Feature selection with chi2: keep columns whose p-value is at most 0.1.
    selector = SelectPercentile(chi2, percentile=50)
    selector.fit(x_kept[1:], target)
    idg = selector.pvalues_ <= 0.1
    x_selected = x_kept[:, idg]

    # Cross-validated logistic regression on an 80/20 split.
    x_train, x_test, y_train, y_test = train_test_split(
        x_selected[1:], target, test_size=0.2)
    clf = LogisticRegressionCV(
        Cs=[0.001, 0.01, 0.1, 1., 10., 100., 1000.],
        penalty='l2',
        max_iter=1000,
        cv=10,
    ).fit(x_train, y_train)
    train_accuracy = clf.score(x_train, y_train)
    test_accuracy = clf.score(x_test, y_test)
    return (train_accuracy, test_accuracy, idg, clf.coef_)
def main(unused_argv):
    """Train a default LogisticRegressionCV, print its metrics and pickle it.

    Reads the module-level ``exp_name``, ``random_seed``, ``model_dir`` and
    ``dataset``.  The validation split returned by ``load_data`` is not used.

    Raises:
        Exception: if ``unused_argv`` holds anything besides a single entry.
    """
    print('Running statistics on %s' % exp_name)
    if len(unused_argv) != 1:
        # Leftover entries mean the flags were entered incorrectly.
        raise Exception("Problem with flags: %s" % unused_argv)

    start_time = time.time()
    np.random.seed(random_seed)
    train_x, train_y, _val_x, _val_y, test_x, test_y = load_data()

    lr = LogisticRegressionCV()
    lr.fit(train_x, train_y)

    # Accuracy on both splits.
    print(lr.score(train_x, train_y))
    print(lr.score(test_x, test_y))

    # Per-class reports on both splits.
    train_pred = lr.predict(train_x)
    test_pred = lr.predict(test_x)
    print('Training eval')
    print(metrics.classification_report(train_y, train_pred))
    print('Testing eval')
    print('-----------------------------------------------')
    print(metrics.classification_report(test_y, test_pred))

    model_path = os.path.join(model_dir, dataset + '.pkl')
    with open(model_path, 'wb') as f:
        dill.dump(lr, f)
    util.print_execution_time(start_time)
def logistic():
    """Fit the baseline LogisticRegressionCV and register it for comparison.

    Uses the module-level ``X_train``/``y_train``/``X_test``/``y_test`` data,
    prints both accuracies when the module-level ``verbose`` flag is truthy,
    and appends ``('LR base', model, 'LR')`` to ``final_models``.
    """
    baseline = LogisticRegressionCV(random_state=0, max_iter=10000)
    baseline.fit(X_train, y_train)

    if verbose:
        print('LR Base training accuracy:', baseline.score(X_train, y_train))
        print('LR Base Test accuracy:', baseline.score(X_test, y_test))

    # Add to our final models to compare
    final_models.append(('LR base', baseline, 'LR'))
def multi_logsitc_cv_with_all():
    """Fit a multinomial LogisticRegressionCV on every column except ``thal``.

    The frame returned by ``thal_data()`` is split 70/30 (``random_state=5``);
    the train and test accuracies are appended to the module-level
    ``train_score`` and ``test_score`` lists.
    """
    global train_score, test_score

    frame = thal_data()
    features = frame.drop(['thal'], axis=1)
    target = frame['thal']
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=5)

    clf = LogisticRegressionCV(
        cv=10, random_state=0, multi_class='multinomial', max_iter=10000)
    clf.fit(x_train, y_train)

    train_score.append(clf.score(x_train, y_train))
    test_score.append(clf.score(x_test, y_test))
def one_vs_rest_multi_logsitc_cv_without_log():
    """Fit a one-vs-rest LogisticRegressionCV without the ``log_*`` columns.

    Drops ``thal`` (the target) and the four ``log_*`` columns from the frame
    returned by ``thal_data()``, splits 70/30 (``random_state=5``), and appends
    the train and test accuracies to the module-level ``train_score`` and
    ``test_score`` lists.
    """
    global train_score, test_score

    excluded = ['thal', 'log_pressure', 'log_cholestoral', 'log_age', 'log_heart_rate']
    frame = thal_data()
    features = frame.drop(excluded, axis=1)
    target = frame['thal']
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=5)

    clf = LogisticRegressionCV(
        cv=5, random_state=0, multi_class='ovr', max_iter=10000)
    clf.fit(x_train, y_train)

    train_score.append(clf.score(x_train, y_train))
    test_score.append(clf.score(x_test, y_test))
def one_vs_rest_multi_logisitc_selected_feature():
    """Fit a one-vs-rest LogisticRegressionCV on a hand-picked feature subset.

    Drops the listed columns (including the target ``thal``) from the frame
    returned by ``thal_data()``, splits 70/30 (``random_state=5``), and appends
    the train and test accuracies to the module-level ``train_score`` and
    ``test_score`` lists.
    """
    global train_score, test_score

    excluded = ['sugar', 'age', 'cardiographic', 'angina', 'slope', 'thal', 'log_cholestoral']
    frame = thal_data()
    features = frame.drop(excluded, axis=1)
    target = frame['thal']
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=5)

    clf = LogisticRegressionCV(
        cv=10, random_state=0, multi_class='ovr', max_iter=10000)
    clf.fit(x_train, y_train)

    train_score.append(clf.score(x_train, y_train))
    test_score.append(clf.score(x_test, y_test))
def logistic(dataset, out):
    """Fit LogisticRegressionCV on the ``x``/``y`` columns and log two scores.

    The data is split 60/40 (``random_state=0``).  The "best score" is the
    maximum entry of ``scores_[1]`` (the per-fold, per-C scores stored for
    class label ``1``); the "test score" is the accuracy on the held-out
    split.  Both are printed and written to ``out`` as one CSV row.
    """
    print('logistic')
    features = dataset[['x', 'y']]
    labels = dataset.label
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.4, random_state=0)

    logreg = LogisticRegressionCV(Cs=[0.1, 0.5, 1, 5, 10, 50, 100], cv=5)
    logreg.fit(X_train, y_train)

    best_text = str(logreg.scores_[1].max())
    test_text = str(logreg.score(X_test, y_test))
    print('best score: ' + best_text)
    print('test score: ' + test_text)
    out.write('logistic,' + best_text + ',' + test_text + '\n')
def logistic(X_train, X_test, y_train, y_test):
    """Fit a class-balanced one-vs-rest LogisticRegressionCV and predict.

    Prints the training and testing scores and returns the predictions for
    ``X_test``.
    """
    settings = dict(
        multi_class="ovr",
        fit_intercept=True,
        Cs=10,
        cv=3,
        penalty="l2",
        solver="lbfgs",
        tol=0.01,
        class_weight='balanced',
    )
    classifier = LogisticRegressionCV(**settings)
    classifier.fit(X_train, y_train)

    print("Training score:%f" % (classifier.score(X_train, y_train)))
    print("Testing score:%f" % (classifier.score(X_test, y_test)))
    return classifier.predict(X_test)
def run_logistic_regression():
    """Fit a log-loss-tuned LogisticRegressionCV on the wine data and report it.

    Uses the module-level ``wine_train_X``/``wine_train_y`` and
    ``wine_test_X``/``wine_test_y`` arrays.  Prints the test accuracy and the
    confusion matrix; returns nothing.
    """
    print("~ Logistic Regression ~")

    # 10 candidate C values, 10 CV folds, model selection by log loss.
    model = LogisticRegressionCV(Cs=10, cv=10, scoring='neg_log_loss')

    # Fit the model
    model.fit(wine_train_X, wine_train_y.ravel())

    # Predict y with test data
    predict_y = model.predict(wine_test_X)

    # Accuracy is computed from the predictions: LogisticRegressionCV.score()
    # applies the configured ``scoring`` and would return the negative log
    # loss here rather than the accuracy.
    accuracy = (predict_y == wine_test_y.ravel()).mean()

    # Create a confusion matrix
    cm = confusion_matrix(wine_test_y, predict_y)

    # Print findings
    print("Accuracy: ", accuracy)
    print("Confusion Matix:")
    print(cm)
def classify(_char):
    """Train a LogisticRegressionCV for one character and print its score.

    Returns early (``None``) when there are fewer than 10 correct samples, when
    the training or test labels are empty, when only one class is present, or
    when too few training samples are available (10 before and 50 after the
    negative samples are fetched).  Training errors are printed together with
    their traceback instead of being raised.
    """
    print('to fetch data')
    start_time = time.time()

    char_count = Character.objects.filter(char=_char, is_correct=1).count()
    if char_count < 10:
        return

    char_lst = Character.objects.filter(char=_char)
    y, X, ty, tX, t_charid_lst, test_accuracy_lst = prepare_data_with_database(char_lst)
    if len(y) == 0 or len(ty) == 0:
        return
    if 1 == len(set(y)) or len(y) < 10:
        return

    # The return value is ignored; presumably X and y are extended in place,
    # which is why the size checks are repeated below -- confirm against helper.
    fetch_negative_samples(_char, X, y)
    if len(y) == 0 or len(ty) == 0:
        return
    if 1 == len(set(y)) or len(y) < 50:
        return

    print("fetch data done, spent %s seconds." % int(time.time() - start_time))
    start_time = time.time()
    print("traning: data size: %d" % len(y))

    model = LogisticRegressionCV(cv=5, solver='liblinear', n_jobs=1)
    try:
        model.fit(X, y)
        print("training done, spent %s seconds." % int(time.time() - start_time))
        print('%s %s' % ('score: ', model.score(X, y)))
    except Exception as e:
        # Best effort: report the failure and carry on without a model.
        print('%s %s' % ('except: ', e))
        traceback.print_exc()
        return
def selectThreshold(alpha, dataPath):
    """Pick the probability threshold with the lowest weighted error cost.

    A LogisticRegressionCV is fitted on 70% of the first frame returned by
    ``getData(dataPath, 0.5)``; the remaining 30% is the validation set.  The
    last column of each frame is the label, the other columns are features.
    For each threshold in 0.1, 0.2, ..., 0.9 the cost
    ``alpha * FPR + (1 - alpha) * FNR`` is computed on the validation set.

    Returns:
        The threshold with the smallest cost (the first one on ties).
    """
    trainData, _ = getData(dataPath, 0.5)
    trainData, valData = train_test_split(trainData, train_size=0.7)

    val_features = valData.iloc[:, :-1]
    y_true = valData.iloc[:, -1]

    clf = LogisticRegressionCV(cv=10, penalty='l2')
    clf.fit(trainData.iloc[:, :-1], trainData.iloc[:, -1])
    print("Validation accuracy: %.6f" % clf.score(val_features, y_true))

    ## find optimal threshold on validation data
    is_positive = np.asarray(y_true == 1)
    is_negative = np.asarray(y_true == 0)
    numPositive = int(np.count_nonzero(is_positive))
    numNegative = int(np.count_nonzero(is_negative))
    y_pred = clf.predict_proba(val_features)[:, 1]

    costs = []
    for th in np.linspace(0.1, 0.9, 9):
        false_positives = int(np.count_nonzero((y_pred > th) & is_negative))
        false_negatives = int(np.count_nonzero((y_pred < th) & is_positive))
        FPR = false_positives / numNegative
        FNR = false_negatives / numPositive
        cost = alpha * FPR + (1 - alpha) * FNR
        print("threshold: %.2f cost: %.2f" % (th, cost))
        costs.append((th, cost))

    ## return optimal th
    best_threshold, _best_cost = min(costs, key=lambda pair: pair[1])
    return best_threshold
class LogisticRegressionCV_(ProbabilisticModel):
    """LogisticRegressionCV Classifier

    Thin adapter that forwards every call to a wrapped scikit-learn
    ``LogisticRegressionCV`` stored in ``self.model``.
    """

    def __init__(self, *args, **kwargs):
        """Create the wrapped estimator; all arguments are forwarded to it."""
        self.model = LogisticRegressionCV(*args, **kwargs)
        self.name = "lrcv"

    def train(self, dataset, *args, **kwargs):
        """Fit on ``dataset.format_sklearn()`` plus any extra arguments."""
        return self.model.fit(*(dataset.format_sklearn() + args), **kwargs)

    def predict(self, feature, *args, **kwargs):
        """Return the predicted labels for ``feature``."""
        return self.model.predict(feature, *args, **kwargs)

    def score(self, testing_dataset, *args, **kwargs):
        """Score the model on ``testing_dataset.format_sklearn()``."""
        return self.model.score(*(testing_dataset.format_sklearn() + args),
                                **kwargs)

    def predict_real(self, feature, *args, **kwargs):
        """Return one decision value per class for every sample.

        In the binary case ``decision_function`` yields a single column, so it
        is expanded to two columns ``(-d, d)``.
        """
        dvalue = self.model.decision_function(feature, *args, **kwargs)
        if len(np.shape(dvalue)) == 1:  # n_classes == 2
            return np.vstack((-dvalue, dvalue)).T
        return dvalue

    def predict_proba(self, feature, *args, **kwargs):
        """Return the class probabilities for ``feature``."""
        return self.model.predict_proba(feature, *args, **kwargs)

    def feature_importances_(self):
        """Return the flattened coefficient array of the fitted model."""
        return self.model.coef_.ravel()

    def get_params(self):
        """Return the wrapped estimator's parameters as a dict."""
        # Call the method; the bound method object itself used to be returned.
        return self.model.get_params()
def main():
    """Run the text-classification pipeline end to end and print the results.

    Steps: read and pre-process the input texts, extract features/labels,
    split 75/25, vectorise, fit a 3-fold LogisticRegressionCV, print the test
    accuracy and the per-feature coefficients sorted by value.
    """
    # --- pre-process input text ---
    # NOTE(review): processedText is rebuilt for every text, so only the
    # paragraphs of the last text reach feature extraction.
    for text in readText():
        processedText = [processText(paragraph)
                         for paragraph in splitParagraphs(text)]  # corpus without label

    # --- extract ngram features and labels ---
    dataset = extractFeaturesLabels(processedText)
    y = dataset[1]
    X = dataset[0]
    dataset.columns = ['texte', 'partie']

    # --- split training and testing dataset ---
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

    # --- select features and train model ---
    vectorizer = selectFeatures()
    X_train = vectorizer.fit_transform(X_train)
    clf = LogisticRegressionCV(cv = 3)
    clf.fit(X_train, list(y_train))

    # --- predict outcomes and test model ---
    X_test = vectorizer.transform(X_test)
    accuracy = clf.score(X_test, list(y_test))
    print('accuracy = ' + str(accuracy))

    # --- find coefficient weights and evaluate results ---
    features = vectorizer.get_feature_names()
    coef = clf.coef_
    model_coef = pd.DataFrame([features, coef.T]).T
    model_coef.columns = ['feature', 'coef']
    print(model_coef.sort_values(by=['coef'], inplace=False))
class LogisticRegressionDensityRatioEstimator(DensityRatioBase):
    """Density-ratio estimator backed by a cross-validated logistic regression.

    The two sample sets are turned into a classification problem by
    ``make_classification_dataset``; the classifier's decision function serves
    as the logit.
    """

    def __init__(self, Cs=10, solver='lbfgs', epochs=100, seed=None,
                 *args, **kwargs):
        """Build the classifier; extra ``args``/``kwargs`` are accepted but unused."""
        self.model = LogisticRegressionCV(
            Cs=Cs,
            solver=solver,
            max_iter=epochs,
            random_state=seed,
        )

    def logit(self, X, y=None):
        """Return the classifier's decision values for ``X`` (``y`` is unused)."""
        return self.model.decision_function(X)

    def fit(self, X_top, X_bot, *args, **kwargs):
        """Fit the classifier on the dataset built from the two sample sets."""
        features, labels = make_classification_dataset(X_top, X_bot)
        return self.model.fit(features, labels, *args, **kwargs)

    def evaluate(self, X_top, X_bot, *args, **kwargs):
        """Score the classifier on the dataset built from the two sample sets."""
        features, labels = make_classification_dataset(X_top, X_bot)
        return self.model.score(features, labels, *args, **kwargs)
def apply_lr_cross_val(key, training_data, validation_data):
    """Fit a 5-fold LogisticRegressionCV and store its validation score.

    Labels come from the module-level ``y_train`` and ``y_val``; the score is
    saved in the module-level ``accuracy_dict_LR_COUNT`` under ``key``.
    """
    print('Applying LRC to', key, '...')
    classifier = LogisticRegressionCV(cv=5, max_iter=100000, n_jobs=-1)
    classifier.fit(training_data, y_train)
    accuracy_dict_LR_COUNT[key] = classifier.score(validation_data, y_val)
class LogisticRegression():
    """Multinomial logistic regression whose C is chosen by cross-validation.

    Wraps a ``LogisticRegressionCV`` that searches C over ``[3, 4)`` in steps
    of 0.001 using 15 folds.
    """

    def __init__(self):
        """Initialise the underlying model."""
        candidate_cs = np.arange(3, 4, 1e-3)
        self.logistic_regression = LogisticRegressionCV(
            Cs=candidate_cs,
            cv=15,
            random_state=0,
            solver='lbfgs',
            multi_class='multinomial',
            max_iter=1000,
        )

    def fit(self, x_train, t_train):
        """Train the model on the given data and return the fitted estimator."""
        fitted = self.logistic_regression.fit(x_train, t_train)
        return fitted

    def predict(self, x_train):
        """Return the predicted classes for the given data."""
        predictions = self.logistic_regression.predict(x_train)
        return predictions

    def score(self, x_train, t_train):
        """Return the model's score for the given data and target classes."""
        result = self.logistic_regression.score(x_train, t_train)
        return result

    def get_best_param(self):
        """Return the selected hyper-parameter ``C_``."""
        return self.logistic_regression.C_
def logistic_fidelity(self):
    """Log the in-place logistic-regression fidelity of ground vs. excited data.

    Real and imaginary parts of ``self.ground_data`` and ``self.excited_data``
    are used as features (one sample per column of the input).  A
    cross-validated logistic regression is fitted and scored on the same
    samples, and the accuracy is logged together with a 95% confidence
    interval obtained from the inverse regularised incomplete beta function.
    Nothing is returned.
    """
    # group data and assign state labels
    gnd_features = np.hstack([np.real(self.ground_data.T),
                              np.imag(self.ground_data.T)])
    ex_features = np.hstack([np.real(self.excited_data.T),
                             np.imag(self.excited_data.T)])
    # liblinear wants arrays in C order
    features = np.ascontiguousarray(np.vstack([gnd_features, ex_features]))
    state = np.ascontiguousarray(np.hstack([np.zeros(self.ground_data.shape[1]),
                                            np.ones(self.excited_data.shape[1])]))

    # Set up logistic regression with cross-validation using liblinear.
    # Cs sets the inverse of the regularization strength, which will be
    # optimized through cross-validation with 3 folds.
    # This is set up to be as consistent with the MATLAB implementation
    # as I can make it. --GJR
    Cs = np.logspace(-1, 2, 5)
    # Cs must be passed by keyword: the constructor is keyword-only in
    # current scikit-learn releases.
    logreg = LogisticRegressionCV(Cs=Cs, cv=3, solver='liblinear')
    logreg.fit(features, state)              # fit the model
    predictions = logreg.predict(features)   # in-place classification
    score = logreg.score(features, state)    # mean accuracy of classification

    N = len(predictions)
    S = np.sum(predictions == state)         # how many we got right

    # now calculate confidence intervals
    c = 0.95
    flo = betaincinv(S + 1, N - S + 1, (1 - c) / 2.)
    fhi = betaincinv(S + 1, N - S + 1, (1 + c) / 2.)
    logger.info(("In-place logistic regression fidelity: " +
                 "{:.2f}% ({:.2f}, {:.2f})".format(100*score, 100*flo, 100*fhi)))
def fit_logistic_regression(lr, Xt, corpus):
    """Standardise ``Xt``, fit a fresh LogisticRegressionCV and report accuracy.

    NOTE(review): the incoming ``lr`` argument is discarded -- a new default
    ``LogisticRegressionCV`` is always created.

    Returns:
        ``(fitted_model, scaled_Xt)``.
    """
    scaled = preprocessing.scale(Xt)
    targets, _label_encoder = load_Y(corpus)

    classifier = LogisticRegressionCV()
    classifier.fit(scaled, targets)
    print("Accuracy = {}".format(classifier.score(scaled, targets)))
    return classifier, scaled
def logistic(Xd, yd, Xt, yt):
    """Fit a multinomial LogisticRegressionCV and print its score on ``Xt``.

    Each row of ``yd`` / ``yt`` is converted to a class index by taking the
    position of its first non-zero entry (presumably the rows are one-hot
    encoded -- confirm with the caller).
    """
    train_labels = []
    for row in range(yd.shape[0]):
        train_labels.append(np.nonzero(yd[row])[0][0])

    clf = LogisticRegressionCV(cv=10, random_state=0, multi_class='multinomial')
    clf = clf.fit(Xd, train_labels)

    test_labels = []
    for row in range(yt.shape[0]):
        test_labels.append(np.nonzero(yt[row])[0][0])
    print(clf.score(Xt, test_labels))
def main(args):
    """Train, evaluate and save a cross-validated logistic-regression model.

    Reads ``args.train_data_path`` as CSV, uses the comma-separated
    ``args.features`` columns as inputs and ``args.label`` as target, holds
    out 20% for testing, prints the test accuracy and saves the model via
    ``ModelUtils.save_model``.
    """
    np.random.seed(452346324)

    # load data
    columns = args.features.split(",")
    raw_df = pd.read_csv(args.train_data_path)
    data = raw_df[columns].values
    targets = raw_df[args.label].values
    x_train, x_test, y_train, y_test = train_test_split(data, targets, train_size=0.8)

    # Standardise the data and train the model.
    ss = StandardScaler()
    x_train = ss.fit_transform(x_train)
    lr = LogisticRegressionCV(fit_intercept=True, Cs=np.logspace(-2, 2, 20), cv=2,
                              penalty='l2', solver='lbfgs', tol=0.01)
    lr.fit(x_train, y_train)

    # Reuse the statistics learned on the training split; re-fitting the
    # scaler on the test split would scale the two splits inconsistently.
    x_test = ss.transform(x_test)
    r = lr.score(x_test, y_test)
    print("R值(准确率):", r)

    # NOTE(review): only the model is passed on; the fitted scaler is not.
    ModelUtils.save_model(columns, lr, args.model_path)
def train(trainingData, pklFile):
    """Fit a LogisticRegressionCV on ``trainingData`` and persist it.

    Args:
        trainingData: 2-D array whose last column is the label and whose
            remaining columns are the features.
        pklFile: destination path for the dumped model.  When empty, the
            ``learntModel`` directory is recreated and
            ``learntModel/learntModel.pkl`` is used.

    Returns:
        Dict with the fitted ``intercept``, ``coef``, the selected ``alpha``
        (the estimator's ``C_``) and the training ``accuracy``.
    """
    import shutil

    # STEP 1. Define the output file for the learnt model.
    if (pklFile == ''):
        # Remove and recreate sequentially.  The previous shell command
        # ("rm -rf ... & mkdir ...") ran rm in the background, racing mkdir.
        shutil.rmtree('learntModel', ignore_errors=True)
        os.makedirs('learntModel')
        pklFile = 'learntModel/learntModel.pkl'

    # STEP 2. Split the data into features and labels.
    n_features = len(trainingData[0]) - 1
    x = trainingData[:, :n_features]
    y = trainingData[:, n_features]

    # STEP 3. Candidate regularisation values (passed to the estimator as Cs).
    alphas = np.logspace(-10, -2, 500)

    # STEP 4. Fit with cross-validated selection and save the model.
    clf = LogisticRegressionCV(Cs=alphas)
    clf.fit(x, y)
    joblib.dump(clf, pklFile)

    return {"intercept": clf.intercept_, "coef": clf.coef_,
            "alpha": clf.C_, "accuracy": clf.score(x, y)}
def classify(_char):
    """Train a LogisticRegressionCV for one character and print its score.

    Returns early (``None``) when there are fewer than 10 correct samples, when
    the training or test labels are empty, when only one class is present, or
    when too few training samples are available (10 before and 50 after the
    negative samples are fetched).  Training errors are printed together with
    their traceback instead of being raised.
    """
    print('to fetch data')
    start_time = time.time()

    char_count = Character.objects.filter(char=_char, is_correct=1).count()
    if char_count < 10:
        return

    char_lst = Character.objects.filter(char=_char)
    y, X, ty, tX, t_charid_lst, test_accuracy_lst = prepare_data_with_database(char_lst)
    if len(y) == 0 or len(ty) == 0:
        return
    if 1 == len(set(y)) or len(y) < 10:
        return

    # The return value is ignored; presumably X and y are extended in place,
    # which is why the size checks are repeated below -- confirm against helper.
    fetch_negative_samples(_char, X, y)
    if len(y) == 0 or len(ty) == 0:
        return
    if 1 == len(set(y)) or len(y) < 50:
        return

    print("fetch data done, spent %s seconds." % int(time.time() - start_time))
    start_time = time.time()
    print("traning: data size: %d" % len(y))

    model = LogisticRegressionCV(cv=5, solver='liblinear', n_jobs=1)
    try:
        model.fit(X, y)
        print("training done, spent %s seconds." % int(time.time() - start_time))
        print('%s %s' % ('score: ', model.score(X, y)))
    except Exception as e:
        # Best effort: report the failure and carry on without a model.
        print('%s %s' % ('except: ', e))
        traceback.print_exc()
        return
def evalute(matx, label):
    """Fit a one-vs-rest LogisticRegressionCV and print its evaluation.

    The data is split 90/10 and scaled without centring
    (``with_mean=False``).  Prints the test-set class probabilities, the
    training accuracy and the test-set log loss.  Returns nothing.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(matx, label, test_size=0.1)
    print('split finish...')

    ss = StandardScaler(with_mean=False)
    X_train = ss.fit_transform(X_train)
    # Apply the training-set scaling to the test set; re-fitting the scaler
    # here would scale the two splits inconsistently.
    X_test = ss.transform(X_test)

    logistic = LogisticRegressionCV(Cs=np.logspace(-4, 1, 50), fit_intercept=True,
                                    penalty='l2', solver='lbfgs', tol=0.01,
                                    multi_class='ovr')
    logistic.fit(X_train, Y_train)
    print('training is finish ')
    print(logistic.predict_proba(X_test))

    # Report the results of the logistic model.
    logistic_r = logistic.score(X_train, Y_train)
    print('%s %s' % ("Logistic算法R值(准确率):", logistic_r))

    logistic_r_predict = logistic.predict_proba(X_test)
    print('%s %s' % ("log_loss value is : ", log_loss(Y_test, logistic_r_predict)))
def execute_recursive_elimination_feature_selection(X_scaled, y):
    """Select features by recursive feature elimination with logistic regression.

    ``RFE`` is run with its default number of features to select, using a
    3-fold liblinear ``LogisticRegressionCV`` as estimator.  The same
    estimator is then re-fitted on the reduced data and its training accuracy
    is printed, followed by the feature ranking and the selected columns.

    Returns:
        ``(selected_column_names, ranking_series)`` where the series holds the
        column names indexed by ``ranking_ - 1`` and sorted by that index.
    """
    print("Recursive elimination")
    estimator = LogisticRegressionCV(solver='liblinear', cv=3)
    print("Start Recursive Elimination. Fit model with {} examples.".format(
        X_scaled.shape[0]))

    # Run the elimination and reduce the data to the surviving columns.
    selector = RFE(estimator)
    X_reduced = selector.fit_transform(X_scaled, y)

    # Re-fit on the reduced data to report its accuracy.
    estimator.fit(X_reduced, y)
    print("Best accuracy score using built-in Logistic Regression: ",
          estimator.score(X_reduced, y))

    print("Ranking")
    ranking_index = selector.ranking_ - 1
    rfe_coef = pd.Series(X_scaled.columns, index=ranking_index).sort_index()
    print(rfe_coef)

    print("Selected columns")
    print(X_scaled.columns[selector.support_].values)
    return X_scaled.columns[selector.support_].values, rfe_coef
def do_sentence_encoding_experiment_libri_speech(activations_dir, sentence_data):
    """Probe each layer's activations for sentence length (LibriSpeech).

    For every ``.npy`` file in ``activations_dir`` (except the hard-coded
    excluded id) the label is the number of space-separated tokens of the
    matching entry in ``sentence_data``; only the ten hard-coded labels are
    kept.  Per layer, the activations are averaged over axis 0 and
    L2-normalised, then a LogisticRegressionCV is trained on a 75/25 split
    using the module-level ``random_state``.

    Returns:
        Dict mapping ``'layer_<i>'`` to the test accuracy for that layer.
    """
    # Labels kept for the experiment (hard-coded; the derivation is not shown).
    top_10_labels = [9, 7, 10, 8, 11, 12, 17, 13, 6, 14]

    labels = []
    activations_per_layer = {}
    npy_names = [name for name in os.listdir(activations_dir)
                 if name.endswith('.npy')]

    for npy_name in npy_names:
        path = npy_name[:-4]
        if path == '2961-961-0022':
            continue

        # Use the number of space-separated tokens as label.
        label = len(sentence_data[path].split(' '))
        if label not in top_10_labels:
            continue
        labels.append(label)

        item = np.load('{}/{}.npy'.format(activations_dir, path))
        for i, layer_act in enumerate(item):
            # Average activations over axis 0 and L2 normalize.
            mean_activations = np.mean(layer_act, axis=0)
            norm = np.sqrt(np.sum(mean_activations**2))
            layer_name = 'layer_{}'.format(i)
            activations_per_layer.setdefault(layer_name, []).append(
                mean_activations / norm)

    results = {}
    for name, activations in activations_per_layer.items():
        print('Training Logistic Regression for {} activations'.format(name))
        X_train, X_test, y_train, y_test = train_test_split(
            activations, labels, test_size=0.25, random_state=random_state)

        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        classifier = LogisticRegressionCV(
            Cs=5, max_iter=500, random_state=random_state)
        classifier.fit(X_train, y_train)

        test_accuracy = classifier.score(X_test, y_test)
        print('Accuracy for layer {}: {}'.format(name, test_accuracy))
        results[name] = test_accuracy
    return results
def train(df):
    """Fit a 10-fold LogisticRegressionCV on ``df`` and return it.

    The target is the second column of ``df``; the features come from
    ``get_feature_columns(df)``.  The training score is printed.
    """
    target = df.iloc[:, 1]
    features = get_feature_columns(df)

    classifier = LogisticRegressionCV(cv=10, random_state=0)
    classifier = classifier.fit(features, target)
    print('score', classifier.score(features, target))
    return classifier
def do_sentence_length_encoding_experiment_common_voice(sets, activations_dir):
    """Probe each layer's activations for sentence length (Common Voice).

    Every item of every set contributes one activation file
    (``<activations_dir>/<set_id>/<path without last 4 chars>.npy``) and one
    label: the number of space-separated tokens of its sentence after removing
    punctuation.  Per layer, the activations are averaged over axis 0 and
    L2-normalised, then a LogisticRegressionCV is trained on a 75/25 split
    using the module-level ``random_state``.

    Returns:
        Dict mapping ``'layer_<i>'`` to the test accuracy for that layer.
    """
    # Punctuation stripped from sentences before counting tokens.
    not_allowed = [',', '.', '!', '?', '"', '-', ':', ';']

    data = []
    labels = []
    print('{} sets to process...'.format(len(sets)))
    for item_set in sets:
        for item in item_set['set_items']:
            path = item['path'][:-4]
            npy_path = '{}/{}/{}.npy'.format(
                activations_dir, item_set['set_id'], path)
            data.append(np.load(npy_path))

            sentence_clean = item['sentence']
            for symbol in not_allowed:
                sentence_clean = sentence_clean.replace(symbol, '')
            # The integer token count is used as a class label.
            labels.append(len(sentence_clean.split(' ')))
    print('{} files found'.format(len(data)))

    activations_per_layer = {}
    for item in data:
        for i, layer_act in enumerate(item):
            # Average activations over axis 0 and L2 normalize.
            mean_activations = np.mean(layer_act, axis=0)
            norm = np.sqrt(np.sum(mean_activations**2))
            layer_name = 'layer_{}'.format(i)
            activations_per_layer.setdefault(layer_name, []).append(
                mean_activations / norm)

    results = {}
    for name, activations in activations_per_layer.items():
        print('Training Logistic Regression model for {} activations'.format(
            name))
        X_train, X_test, y_train, y_test = train_test_split(
            activations, labels, test_size=0.25, random_state=random_state)

        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        classifier = LogisticRegressionCV(
            Cs=5, max_iter=500, random_state=random_state)
        classifier.fit(X_train, y_train)

        test_accuracy = classifier.score(X_test, y_test)
        print('Accuracy for layer {}: {}'.format(name, test_accuracy))
        results[name] = test_accuracy
    return results
def cross_validation(penalty):
    """Fit a liblinear LogisticRegressionCV with ``penalty`` and print results.

    Training data comes from ``read_train_data()`` and evaluation data from
    ``read_test_data()``; the other values those helpers return are unused.
    Prints the evaluation accuracy followed by the estimator parameters.
    """
    # read in train and test data
    _, X, y = read_train_data()
    _, _vocab, X_vals, Y_vals = read_test_data()

    # train model with penalty
    clf = LogisticRegressionCV(
        penalty=penalty, solver='liblinear', max_iter = 10000)
    clf.fit(X, y)

    # print model accuracy, then the model parameters
    print(clf.score(X_vals, Y_vals))
    print(clf.get_params())
def bestLogisticModel(X, y):
    """Fit LogisticRegressionCV on standardised ``X`` and print a summary.

    Prints the maximum entry of ``scores_[1.0]`` (the cross-validation scores
    stored for class label ``1.0``) and the fitted coefficients.  The training
    score is computed but not reported; nothing is returned.
    """
    X_scaled = preprocessing.StandardScaler().fit(X).transform(X)
    targets = y.ravel()

    model = LogisticRegressionCV(max_iter=100)
    model.fit(X_scaled, targets)
    score = model.score(X_scaled, y.ravel())

    best_cv_score = model.scores_[1.0].max()
    print(
        f'highest accuracy for logistic regression: {best_cv_score}'
    )
    print(f'coefficients for logistic regression model{model.coef_}')
def classify(self, mp, x_train, y_train, x_test):
    """Fit a 5-fold LogisticRegressionCV and return predictions for ``x_test``.

    A constant column is added to both feature sets with ``sm.add_constant``
    before fitting.  Progress, the estimator parameters and the training
    accuracy (in percent) are reported through ``log_to_info``.  ``mp`` is
    not used.
    """
    train_features = sm.add_constant(x_train)
    test_features = sm.add_constant(x_test)

    classifier = LogisticRegressionCV(verbose=1, cv=5)
    log_to_info('Fitting a Logistic Regression to labeled training data...')
    classifier = classifier.fit(train_features, y_train)

    log_to_info('Training details')
    log_to_info('Classifier parameters: {}'.format(classifier.get_params()))
    training_pct = classifier.score(train_features, y_train) * 100.0
    log_to_info('On training: {}'.format(training_pct))

    log_to_info('Predicting test value')
    predictions = classifier.predict(test_features)
    log_to_info('Done!')
    return predictions
def classify_with_random_samples(char, positive_sample_count, auto_apply=False, random_sample=0):
    """Train a LogisticRegressionCV for ``char`` on a capped sample set.

    Positive and negative samples are each capped at
    ``positive_sample_count``: randomly when ``random_sample`` is non-zero
    (and the count is positive), otherwise by ordering on each sample's third
    field (descending for positives, ascending for negatives).  Each sample's
    first field is used as the feature vector and its second as the label.

    Returns ``None`` in all cases; returns early when only one class is
    present, when there are fewer than 10 training samples or when there is no
    test data.  Training errors are printed with their traceback instead of
    being raised.  ``auto_apply`` is not used in this function.
    """
    print('%s %s' % (char, positive_sample_count))
    started = timezone.now()
    start_time = time.time()

    query = Character.objects.filter(char=char)
    positive_samples, negative_samples, test_X, test_y, test_char_id_lst, test_accuracy_lst = \
        prepare_data_with_database2(query)

    if random_sample != 0:
        if positive_sample_count > 0:
            if len(positive_samples) > positive_sample_count:
                positive_samples = random.sample(positive_samples, positive_sample_count)
            if len(negative_samples) > positive_sample_count:
                negative_samples = random.sample(negative_samples, positive_sample_count)
    else:
        if len(positive_samples) > positive_sample_count:
            positive_samples.sort(key=itemgetter(2), reverse=True)
            positive_samples = positive_samples[:positive_sample_count]
        if len(negative_samples) > positive_sample_count:
            negative_samples.sort(key=itemgetter(2))
            negative_samples = negative_samples[:positive_sample_count]

    # Positives first, then negatives.
    X = []
    y = []
    for sample in positive_samples:
        X.append(sample[0])
        y.append(sample[1])
    for sample in negative_samples:
        X.append(sample[0])
        y.append(sample[1])

    train_count = len(y)
    predict_count = len(test_y)
    if 1 == len(set(y)) or train_count < 10 or predict_count == 0:
        return

    fetch_spent = int(time.time() - start_time)
    print("fetch data done, spent %s seconds." % fetch_spent)
    start_time = time.time()
    print("traning: data size: %d" % len(y))

    model = LogisticRegressionCV(cv=5, solver='liblinear', n_jobs=1)
    try:
        model.fit(X, y)
        training_spent = int(time.time() - start_time)
        print("training done, spent %s seconds." % training_spent)
        print('%s %s' % ('score: ', model.score(X, y)))
    except Exception as e:
        # Best effort: report the failure and carry on without a model.
        print('%s %s' % ('except: ', e))
        traceback.print_exc()
        return
# ***************************** generate predictions on validation sets
def predict_features(base_estimators, X, scaledX):
    """Return a frame with one column of predictions per base estimator.

    Estimators flagged ``need_scale`` predict on ``scaledX``, the others on
    ``X``.  The frame is indexed like ``X`` and its columns are the
    estimators' names.
    """
    column_names = []
    predictions = []
    for base in base_estimators:
        inputs = scaledX if base.need_scale else X
        predictions.append(base.estimator.predict(inputs))
        column_names.append(base.name)
    return pd.DataFrame(
        np.asarray(predictions).T,
        index=X.index,
        columns=column_names,
    )


# ***************************** fit advanced features to validation target
validate_basepredicts = predict_features(base_estimators, Xvalidate, Xvalidate_scaled)

# Cross-validated search for the regularisation strength of the stacker.
lrcv = LogisticRegressionCV(Cs=30, cv=10)
lrcv.fit(validate_basepredicts, yvalidate)
lrcv.score(validate_basepredicts, yvalidate)
common.make_coefs_frame(validate_basepredicts.columns, lrcv.coef_.ravel())

# fit again with the selected C on the same validation predictions
basepredict_lr = LogisticRegression(C=lrcv.C_[0])
basepredict_lr.fit(validate_basepredicts, yvalidate)
basepredict_lr.score(validate_basepredicts, yvalidate)
common.make_coefs_frame(validate_basepredicts.columns, basepredict_lr.coef_.ravel())

# ***************************** test
test_df = pd.read_csv("test_processed.csv", index_col="PassengerId")
Xtest = test_df[feature_names]
Xtest_scaled = scaler.transform(Xtest)
test_basepredict = predict_features(base_estimators, Xtest, Xtest_scaled)
final_predictions = basepredict_lr.predict(test_basepredict)
feature_names = ["Pclass","Age","SibSp","Parch","Fare","IsMale", 'EmbarkC','EmbarkQ', 'EmbarkS', "Ticket-4digit","Ticket-5digit","Ticket-6digit","Ticket-7digit","Ticket-A","Ticket-C","Ticket-F","Ticket-Others","Ticket-P","Ticket-S","Ticket-W"]
Xtrain = train_df[feature_names]
ytrain = train_df["Survived"]

# --------------------------- scale train data
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)

# --------------------------- LR
lrcv = LogisticRegressionCV(Cs=30,cv=10)
lrcv.fit(Xtrain_scaled,ytrain)
lrcv.C_
lrcv.score(Xtrain_scaled,ytrain)


def pretty_print_coef(coefs, names=None, sort=False):
    """Format coefficients as ``"c1 * name1 + c2 * name2 + ..."``.

    Args:
        coefs: sequence of coefficients.
        names: matching names; defaults to ``X0``, ``X1``, ...
        sort: when true, order the terms by decreasing absolute coefficient.

    Returns:
        The formatted string, with coefficients rounded to 3 decimals.
    """
    # Identity test: ``== None`` is evaluated element-wise on array-like names.
    if names is None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst, key = lambda x:-np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                      for coef, name in lst)


pretty_print_coef(lrcv.coef_.ravel(),feature_names,True)

# Coefficient table ordered by decreasing absolute value.
coefs = pd.DataFrame({"names":feature_names,"coefs":lrcv.coef_.ravel()},columns=["names","coefs"])
coefs["rank"] = np.abs(coefs.coefs)
# sort_index(by=...) is no longer available in pandas; sort_values replaces it.
coefs.sort_values(by="rank",inplace=True,ascending=False)
del coefs["rank"]
hh = np.array(np.asarray(test).reshape(-1)) print hh.dtype hhh = np.logical_not( np.isfinite(np.asarray(test).reshape(-1)) ) print hh[hhh] """ enc = OneHotEncoder(categorical_features=[4]) enc.fit(train) train = enc.transform(train) test = enc.transform(test) solver = LogRegCV(n_jobs=-1) solver.fit(train, data_train.Survived) res = solver.predict(test) res = pd.DataFrame({"PassengerId": data_test.PassengerId, "Survived": res}) res.to_csv("../output/logic_0.csv", index=False) print solver.score(train, data_train.Survived) solver = LogRegCV(n_jobs=-1, scoring='roc_auc') solver.fit(train, data_train.Survived) res = solver.predict(test) res = pd.DataFrame({"PassengerId": data_test.PassengerId, "Survived": res}) res.to_csv("../output/logic_1.csv", index=False) print solver.score(train, data_train.Survived) solver = LogRegCV(n_jobs=-1, scoring='average_precision') solver.fit(train, data_train.Survived) res = solver.predict(test) res = pd.DataFrame({"PassengerId": data_test.PassengerId, "Survived": res}) res.to_csv("../output/logic_2.csv", index=False)
# modeling with categorical: add dummy columns for 'alchemy_category'
dummies = pd.get_dummies(data['alchemy_category'])
second_model = pd.concat([X, dummies], axis = 1)
X2_train, X2_test, y2_train, y2_test = train_test_split(second_model, y)

# plain logistic regression
lr2 = LogisticRegression()
lr2.fit(X2_train, y2_train)
lr2.predict(X2_test)
lr2.score(X2_test, y2_test)

# modeling with cross_validation on the same split
lrCV = LogisticRegressionCV()
lrCV.fit(X2_train, y2_train)
lrCV.predict(X2_test)
lrCV.score(X2_test, y2_test)

# models with pre normalized values & inclusion of ALL categorical variables
dummies2 = pd.get_dummies(data[maskCat])
data2 = normalize(pd.concat([X, dummies2], axis = 1), norm = 'l2')
X3_train, X3_test, y3_train, y3_test = train_test_split(data2, y)

lr3 = LogisticRegression()
lr3.fit(X3_train, y3_train)
lr3.predict(X3_test)
lr3.score(X3_test, y3_test)

lrCV2 = LogisticRegressionCV()
lrCV2.fit(X3_train, y3_train)
# Prepare data: the first four columns are features, the fifth is the label.
iris = sns.load_dataset("iris")
X = iris.values[:, 0:4]
y = iris.values[:, 4]

# Make test and train set (50/50, fixed seed)
train_X, test_X, train_y, test_y = train_test_split(
    X, y, train_size=0.5, random_state=0)

################################
# Evaluate Logistic Regression
################################
lr = LogisticRegressionCV()
lr.fit(train_X, train_y)
pred_y = lr.predict(test_X)
lr_accuracy = lr.score(test_X, test_y)
print("Test fraction correct (LR-Accuracy) = {:.2f}".format(lr_accuracy))

################################
# Evaluate Keras Neural Network
################################


# Make ONE-HOT
def one_hot_encode_object_array(arr):
    '''One hot encode a numpy array of objects (e.g. strings).

    Each distinct value is mapped to its index among the sorted unique
    values, and the indices are expanded with ``np_utils.to_categorical``.
    '''
    uniques, ids = np.unique(arr, return_inverse=True)
    n_classes = len(uniques)
    return np_utils.to_categorical(ids, n_classes)


train_y_ohe = one_hot_encode_object_array(train_y)
test_y_ohe = one_hot_encode_object_array(test_y)
# Engineered features.
x['SexCallClass']=x['SexN']*x['Call']*x['Pclass']
##x['AgeClass']=x['Age']*x['Pclass']
x['Family']=x['Parch']+x['SibSp']
##x['SexAge']=x['SexN']*x['Age']

# Standardise the features.
x = (x-sp.mean(x))/sp.std(x)

# Fixed split at row 500 (overwritten by the fold-wise splits below).
n_train = 500
x_train = x.iloc[:n_train,:]
y_train = y.iloc[:n_train]
x_test = x.iloc[n_train:,:]
y_test = y.iloc[n_train:]
##x_test = x_test[~pd.isnull(x_test.Age)]
##y_test = y_test[~pd.isnull(x_test.Age)]

# 10-fold evaluation of LogisticRegressionCV: accuracy and PR-AUC per fold.
cv = KFold(n=len(x), n_folds=10)
clf = LogisticRegressionCV()
scores = []
aucs=[]
for train, test in cv:
    x_train, y_train = x.iloc[train,:], y.iloc[train]
    x_test, y_test = x.iloc[test,:], y.iloc[test]
    clf.fit(x_train, y_train)
    pr = clf.predict_proba(x_test)[:,1]
    scores.append(clf.score(x_test, y_test))
    # Build the curve from the predicted probabilities; hard 0/1 predictions
    # collapse it to a single operating point.
    precision, recall, thres = precision_recall_curve(y_test, pr)
    aucs.append(auc(recall, precision))
print("Score = %s, Auc = %s"%(sp.mean(scores), sp.mean(aucs)))
# Drop the first row of both arrays and convert everything to float.
X = x[1:, :]
y = y[1:]
# The ``np.float`` alias was removed from NumPy; the builtin is equivalent.
X = X.astype(float)
y = y.astype(float)

##################### Logistic Reg CV ############
# For the grid of Cs values (that are set by default to be ten values in a logarithmic scale between 1e-4 and 1e4)
# If Cs is as an int, then a grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4.
# Like in support vector machines, smaller values specify stronger regularization.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
lr_cv = LogisticRegressionCV(Cs = 10, cv=5)
lr_cv = lr_cv.fit(X_train, y_train)
print('%s %s' % ('Logistic Regression train accuracy', lr_cv.score(X_train, y_train)))
print('%s %s' % ('Logistic Regression CV test accuracy', lr_cv.score(X_test, y_test)))

######### Logistic Regression Grid Search for C ############
print('******** Logistic Reg *********')
tuned_parameters = {'C': np.linspace(0.1, 10, 10)}
scores = ['precision', 'recall']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=4, scoring='%s_weighted' % score)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")