Example No. 1
def model(neural_data,
          run_onset,
          det_window,
          penalty,
          neuron_num=None,
          pca_com=None):
    if neuron_num is not None:
        # sample a subset of neurons without replacement (randint could repeat indices)
        neurons_idx = np.random.choice(len(neural_data), neuron_num, replace=False)
        feat = extract_features(neural_data, neurons_idx=neurons_idx)
        key = neuron_num
    elif pca_com is not None:
        feat = extract_features(neural_data, pca_comp_num=pca_com)
        key = pca_com
    else:
        raise ValueError("Either `neuron_num` or `pca_com` must be provided.")

    X_train, y_train, X_test, y_test = prepare_data(feat, run_onset,
                                                    det_window, 0.2)
    C = np.logspace(-8, 0, 50)

    decoder = LogisticRegressionCV(Cs=C,
                                   penalty=penalty,
                                   solver='liblinear',
                                   max_iter=100)
    decoder.fit(X_train, y_train)
    acc_test = decoder.score(X_test, y_test)
    acc_train = decoder.score(X_train, y_train)

    # cache results in module-level dicts keyed by (subset size, window, penalty)
    decoders[(key, det_window, penalty)] = decoder
    train_acc[(key, det_window, penalty)] = acc_train
    test_acc[(key, det_window, penalty)] = acc_test
    # objective: train/test gap, heavily penalized when test accuracy is weak
    return np.abs(acc_train - acc_test) if acc_test > 0.7 else 100
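`decoders`, `train_acc`, and `test_acc` are module-level caches, and the return value is a minimization objective (the train/test accuracy gap, penalized when the decoder fails to generalize). A minimal usage sketch, assuming `neural_data` and `run_onset` are already loaded and with a purely illustrative `det_window` value:

decoders, train_acc, test_acc = {}, {}, {}

# Hypothetical sweep: pick the neuron count with the smallest train/test gap.
gap, best_n = min(
    (model(neural_data, run_onset, det_window=1.0, penalty='l1', neuron_num=n), n)
    for n in (10, 20, 50, 100)
)
print('best neuron_num:', best_n, 'gap:', gap)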
def SimGraphConv(A, X, Y, k, penalty='l2'):
    """
    Simple Graph Convolution Algorithm.
    
    Arguments:
    A: Sparse adjacency matrix [n, n] (n is the number of nodes)
    X: Sparse feature matrix [n, d] (d is the number of features)
    Y: Numpy array with labels [n,1]
    k: number of layers
    penalty: 'l1', 'l2' specify the norm used in the penalization.

    Return:
    (train_score, test_score, coef): training accuracy, test accuracy, and
    the coefficients of the fitted logistic model.
    """

    I = sparse.eye(A.shape[0])
    A_hat = A + I
    D_hat = np.asarray(A_hat.sum(axis=0)).astype(np.float64)[0]
    assert((D_hat>0).all())
    invsqrt = lambda x: x**(-0.5)
    D_hat_invsqrt = sparse.diags(invsqrt(D_hat))
    S = D_hat_invsqrt.dot(A_hat).dot(D_hat_invsqrt)
    train_ind, test_ind = train_test_split(np.arange(X.shape[0]), train_size=0.7, test_size=0.3)
    X_train = (S**k)[train_ind,:].dot(X)
    Y_train = Y[train_ind]
    X_test = (S**k)[test_ind,:].dot(X)
    Y_test = Y[test_ind]

    logfit = LogisticRegressionCV(cv=2, penalty=penalty, solver='liblinear', random_state=1,
                                  max_iter=100).fit(X_train, Y_train)
    return logfit.score(X_train, Y_train), logfit.score(X_test, Y_test), logfit.coef_
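A toy invocation, not from the source, that matches the documented input types, using a symmetric ring-graph adjacency and random sparse features:

import numpy as np
from scipy import sparse
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

n = 100
rows = np.arange(n)
A = sparse.csr_matrix((np.ones(n), (rows, (rows + 1) % n)), shape=(n, n))
A = A + A.T  # undirected: make the adjacency symmetric
X = sparse.random(n, 16, density=0.3, format='csr')
Y = np.random.randint(0, 2, size=n)  # binary node labels

train_score, test_score, coef = SimGraphConv(A, X, Y, k=2)
print(train_score, test_score)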
def fit_logistic_cv(X_train, X_test, y_train, y_test, cv=5):
    pred_y = None
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn import metrics
    '''
    Your code here... Please follow the German credit example.
    First fit the model and obtain pred_y values. You need to figure out how to do 5-fold cross validation.
    then
    1. print classification report
    2. print accuracy. You can find how to get model accuracy by consulting the documentation of sklearn logistic regression. Hint: you need use score()
    Your code should print the measures as follows. The numbers you get could be different because of random sampling

                 precision    recall  f1-score   support

              0       0.80      0.90      0.85      3741
              1       0.75      0.59      0.66      1965

    avg / total       0.78      0.79      0.78      5706

    accuracy: 0.788643533123
    '''

    # train model using cross-validation
    model = LogisticRegressionCV(cv=cv).fit(X_train, y_train)
    # make prediction
    pred_y = model.predict(X_test)

    # evaluate the prediction results
    print(metrics.classification_report(y_test, pred_y))
    print('accuracy:', model.score(X_test, y_test))
Example No. 4
def sub_cancer_type(x, y, t):
    # Select one cancer type; idt marks the samples whose type equals t.
    idt = y[:, 0] == t
    yt = y[idt]
    # x[0] is a header row (feature ids); keep it and subset the data rows.
    xt = np.r_[x[0].reshape(1, len(x[0])), x[1:][idt]]
    # Dichotomize the continuous target at the 33rd/66th percentiles,
    # dropping the middle tercile.
    yth1, yth2 = np.percentile(yt[:, 1], (33, 66))
    yr = np.array(yt[:, 1], copy=True)
    id1 = yr <= yth1
    id2 = yr >= yth2
    yr[id1] = 0
    yr[id2] = 1
    yr = yr[np.logical_or(id1, id2)]
    xr = np.array(xt, copy=True)
    xr = np.r_[xr[0].reshape(1, len(xr[0])), xr[1:][np.logical_or(id1, id2)]]

    # Feature selection with the chi2 test (keep features with p <= 0.1).
    sp = SelectPercentile(chi2, percentile=50)
    sp.fit(xr[1:], yr)
    idg = sp.pvalues_ <= 0.1
    xr2 = xr[:, idg]

    # Logistic regression with cross-validated C.
    x_train, x_test, y_train, y_test = train_test_split(xr2[1:], yr, test_size=0.2)
    clf = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1., 10., 100., 1000.],
                               penalty='l2', max_iter=1000, cv=10).fit(x_train, y_train)
    accuracy = clf.score(x_train, y_train)
    scores = clf.score(x_test, y_test)
    return (accuracy, scores, idg, clf.coef_)
Example No. 5
def main(unused_argv):
    print('Running statistics on %s' % exp_name)

    if len(unused_argv) != 1:  # fail fast if any unexpected flags were passed
        raise Exception("Problem with flags: %s" % unused_argv)
    start_time = time.time()
    np.random.seed(random_seed)

    train_x, train_y, val_x, val_y, test_x, test_y = load_data()

    lr = LogisticRegressionCV()
    lr.fit(train_x, train_y)
    train_acc = lr.score(train_x, train_y)
    print(train_acc)
    test_acc = lr.score(test_x, test_y)
    print(test_acc)
    train_y_pred = lr.predict(train_x)
    y_pred = lr.predict(test_x)

    print('Training eval')
    print(metrics.classification_report(train_y, train_y_pred))

    print('Testing eval')
    print('-----------------------------------------------')
    print(metrics.classification_report(test_y, y_pred))

    with open(os.path.join(model_dir, dataset + '.pkl'), 'wb') as f:
        dill.dump(lr, f)

    util.print_execution_time(start_time)
def logistic():
    lr_base = LogisticRegressionCV(random_state=0, max_iter=10000)
    lr_base.fit(X_train, y_train)
    if verbose:
        print('LR Base training accuracy:', lr_base.score(X_train, y_train))
        print('LR Base Test accuracy:', lr_base.score(X_test, y_test))
    # Add to our final models to compare
    final_models.append(('LR base', lr_base, 'LR'))
Example No. 7
def multi_logistic_cv_with_all():
    raw_frame=thal_data()
    x=raw_frame.drop(['thal'],axis=1)
    y=raw_frame['thal']
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=5)
    clf = LogisticRegressionCV(cv=10, random_state=0, multi_class='multinomial',max_iter=10000).fit(x_train, y_train)
    global train_score
    train_score.append(clf.score(x_train,y_train))
    global test_score
    test_score.append(clf.score(x_test,y_test))
Example No. 8
def one_vs_rest_multi_logistic_cv_without_log():
    raw_frame=thal_data()
    x=raw_frame.drop(['thal','log_pressure','log_cholestoral','log_age','log_heart_rate'],axis=1)
    y=raw_frame['thal']
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=5)
    clf = LogisticRegressionCV(cv=5, random_state=0, multi_class='ovr',max_iter=10000).fit(x_train, y_train)
    global train_score
    train_score.append(clf.score(x_train,y_train))
    global test_score
    test_score.append(clf.score(x_test,y_test))
Example No. 9
def one_vs_rest_multi_logistic_selected_feature():
    raw_frame=thal_data()
    x=raw_frame.drop(['sugar','age','cardiographic','angina','slope','thal','log_cholestoral'],axis=1)
    y=raw_frame['thal']
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=5)
    clf = LogisticRegressionCV(cv=10, random_state=0, multi_class='ovr',max_iter=10000).fit(x_train, y_train)
    global train_score
    train_score.append(clf.score(x_train,y_train))
    global test_score
    test_score.append(clf.score(x_test,y_test))
def logistic(dataset, out):
    print('logistic')
    X = dataset[['x', 'y']]
    y = dataset.label
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.4,
                                                        random_state=0)
    logreg = LogisticRegressionCV(Cs=[0.1, 0.5, 1, 5, 10, 50, 100], cv=5)
    logreg.fit(X_train, y_train)
    print('best score: ' + str(logreg.scores_[1].max()))
    print('test score: ' + str(logreg.score(X_test, y_test)))
    out.write('logistic,' + str(logreg.scores_[1].max()) + ',' +
              str(logreg.score(X_test, y_test)) + '\n')
Example No. 11
def logistic(X_train, X_test, y_train, y_test):
    lr = LogisticRegressionCV(multi_class="ovr",
                              fit_intercept=True,
                              Cs=10,
                              cv=3,
                              penalty="l2",
                              solver="lbfgs",
                              tol=0.01,
                              class_weight='balanced')
    #lr = LogisticRegression(C = 2.0, class_weight = 'balanced')
    lr.fit(X_train, y_train)
    print("Training score:%f" % (lr.score(X_train, y_train)))
    print("Testing score:%f" % (lr.score(X_test, y_test)))
    y_pred = lr.predict(X_test)
    return y_pred
def run_logistic_regression():
    print("~ Logistic Regression ~")

    # Create Logistic Regression
    # with c value under consideration
    # with cross-validation folds under consideration
    # and evaluation metric of log loss
    model = LogisticRegressionCV(Cs=10,
                                 cv=10,
                                 scoring='neg_log_loss')

    # Fit the model
    model.fit(wine_train_X, wine_train_y.ravel())

    # Predict y with test data
    predict_y = model.predict(wine_test_X)

    # Find the accuracy score
    accuracy = model.score(wine_test_X, wine_test_y)

    # Create a confusion matrix
    cm = confusion_matrix(wine_test_y, predict_y)

    # Print findings
    print("Accuracy: ", accuracy)
    print("Confusion Matix:")
    print(cm)
Example No. 13
def classify(_char):
    print('to fetch data')
    start_time = time.time()
    char_count = Character.objects.filter(char=_char, is_correct=1).count()
    if char_count < 10:
        return
    char_lst = Character.objects.filter(char=_char)
    y, X, ty, tX, t_charid_lst, test_accuracy_lst = prepare_data_with_database(
        char_lst)
    if len(y) == 0 or len(ty) == 0:
        return
    if 1 == len(set(y)) or len(y) < 10:
        return
    fetch_negative_samples(_char, X, y)
    if len(y) == 0 or len(ty) == 0:
        return
    if 1 == len(set(y)) or len(y) < 50:
        return

    print("fetch data done, spent %s seconds." % int(time.time() - start_time))
    start_time = time.time()
    print("training: data size: %d" % len(y))
    model = LogisticRegressionCV(cv=5, solver='liblinear', n_jobs=1)
    try:
        model.fit(X, y)
        print("training done, spent %s seconds." % int(time.time() - start_time))
        # print('params:')
        # for k, v in model.get_params().items():
        #     print('\t', k, ':', v)
        print('score: ', model.score(X, y))
    except Exception as e:
        print('except: ', e)
        traceback.print_exc()
        return
def selectThreshold(alpha, dataPath):
    trainData, _ = getData(dataPath, 0.5)
    trainData, valData = train_test_split(trainData, train_size=0.7)
    clf = LogisticRegressionCV(cv=10, penalty='l2')
    clf.fit(trainData.iloc[:, :-1], trainData.iloc[:, -1])
    val_score = clf.score(valData.iloc[:, :-1], valData.iloc[:, -1])
    print("Validation accuracy:  %.6f" % val_score)
    ## find optimal threshold on validation data
    y_true = valData.iloc[:, -1]
    y_positive_idx = set(np.where(y_true == 1)[0])
    y_negative_idx = set(np.where(y_true == 0)[0])
    numPositive = len(y_positive_idx)
    numNegative = len(y_negative_idx)
    y_pred = clf.predict_proba(valData.iloc[:, :-1])[:, 1]

    ret = []
    for th in np.linspace(0.1, 0.9, 9):
        FPR = len(set(np.where(y_pred > th)[0]) & y_negative_idx) / numNegative
        FNR = len(set(np.where(y_pred < th)[0]) & y_positive_idx) / numPositive
        cost = alpha * FPR + (1 - alpha) * FNR
        print("threshold: %.2f  cost: %.2f" % (th, cost))
        ret.append((th, cost))
    ## return optimal th
    optTh = min(ret, key=lambda x: x[1])[0]
    return optTh
Example No. 15
class LogisticRegressionCV_(ProbabilisticModel):

    """LogisticRegressionCV Classifier
    """

    def __init__(self, *args, **kwargs):
        self.model = LogisticRegressionCV(*args, **kwargs)
        self.name = "lrcv"        

    def train(self, dataset, *args, **kwargs):
        return self.model.fit(*(dataset.format_sklearn() + args), **kwargs)

    def predict(self, feature, *args, **kwargs):
        return self.model.predict(feature, *args, **kwargs)

    def score(self, testing_dataset, *args, **kwargs):
        return self.model.score(*(testing_dataset.format_sklearn() + args),
                                **kwargs)
    
    def predict_real(self, feature, *args, **kwargs):
        dvalue = self.model.decision_function(feature, *args, **kwargs)
        if len(np.shape(dvalue)) == 1:  # n_classes == 2
            return np.vstack((-dvalue, dvalue)).T
        else:
            return dvalue
    
    def predict_proba(self, feature, *args, **kwargs):
        return self.model.predict_proba(feature, *args, **kwargs)
    
    def feature_importances_(self):
        return self.model.coef_.ravel()
    
    def get_params(self):
        return self.model.get_params()
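The wrapper presumably targets a libact-style dataset whose `format_sklearn()` returns an `(X, y)` pair; the `ArrayDataset` below is a hypothetical stand-in for that interface (an assumption, not the library's class), shown only to illustrate the calling pattern:

import numpy as np

class ArrayDataset:
    # Minimal stand-in exposing the format_sklearn() interface the wrapper uses.
    def __init__(self, X, y):
        self.X, self.y = X, y
    def format_sklearn(self):
        return (self.X, self.y)

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 3))
y = (X[:, 0] > 0).astype(int)

clf = LogisticRegressionCV_(cv=3)
clf.train(ArrayDataset(X, y))
print(clf.score(ArrayDataset(X, y)))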
Example No. 16
def main():
    ''' pre-process input text '''
    textList = readText()
    processedText = []
    for text in textList:
        for paragraph in splitParagraphs(text):
            processedText.append(processText(paragraph)) # corpus without label
    ''' extract ngram features and labels '''
    dataset = extractFeaturesLabels(processedText)
    y = dataset[1] # dataset['partie']
    X = dataset[0] # selectFeatures(dataset['texte'])
    dataset.columns = ['texte', 'partie']
    # dataset.to_csv('corpus.csv')
    ''' split training and testing dataset '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
    ''' select features and train model '''
    vectorizer = selectFeatures()
    X_train = vectorizer.fit_transform(X_train)
    # print(vectorizer.get_feature_names())
    # print(vectorizer.get_stop_words())
    # print(X_train.shape) # dimension = (153, 300)
    clf = LogisticRegressionCV(cv = 3) # SGDClassifier(loss = "hinge", penalty = "l2")
    clf.fit(X_train, list(y_train))
    ''' predict outcomes and test model '''
    X_test = vectorizer.transform(X_test)
    # y_predicted = clf.predict(X_test)
    print('accuracy = ' + str(clf.score(X_test, list(y_test)) ) )
    ''' examine coefficient weights and evaluate results '''
    features = vectorizer.get_feature_names()
    coef = clf.coef_ # an array of shape (1, n_feature)
    model_coef = pd.DataFrame([features, coef.T]).T # dataframe of 1000 rows, 2 cols
    model_coef.columns = ['feature', 'coef']
    # model_coef.to_csv('model_coef.csv')
    print(model_coef.sort_values(by=['coef'], inplace=False))
Example No. 17
class LogisticRegressionDensityRatioEstimator(DensityRatioBase):
    def __init__(self,
                 Cs=10,
                 solver='lbfgs',
                 epochs=100,
                 seed=None,
                 *args,
                 **kwargs):

        self.model = LogisticRegressionCV(Cs=Cs,
                                          solver=solver,
                                          max_iter=epochs,
                                          random_state=seed)

    def logit(self, X, y=None):

        return self.model.decision_function(X)

    def fit(self, X_top, X_bot, *args, **kwargs):

        X, y = make_classification_dataset(X_top, X_bot)
        return self.model.fit(X, y, *args, **kwargs)

    def evaluate(self, X_top, X_bot, *args, **kwargs):

        X, y = make_classification_dataset(X_top, X_bot)
        return self.model.score(X, y, *args, **kwargs)
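`make_classification_dataset` is not defined in this snippet; a plausible sketch of it (an assumption, not the source's implementation) stacks the two samples with labels 1 and 0, so that for equal sample sizes `exp(decision_function(x))` estimates the density ratio p_top(x) / p_bot(x):

import numpy as np

def make_classification_dataset(X_top, X_bot):
    # Assumed behavior: stack the two samples and label top=1, bottom=0.
    X = np.vstack([X_top, X_bot])
    y = np.concatenate([np.ones(len(X_top)), np.zeros(len(X_bot))])
    return X, y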
Example No. 18
def apply_lr_cross_val(key, training_data, validation_data):
    print('Applying LRC to', key, '...')
    LR = LogisticRegressionCV(cv=5, max_iter=100000, n_jobs=-1)
    LR.fit(training_data, y_train)  # y_train/y_val and the results dict are module-level globals

    score = LR.score(validation_data, y_val)
    accuracy_dict_LR_COUNT[key] = score
Example No. 19
class LogisticRegression():
    def __init__(self):
        # Initialize the model
        self.logistic_regression = LogisticRegressionCV(
            Cs=np.arange(3, 4, 1e-3),
            cv=15,
            random_state=0,
            solver='lbfgs',
            multi_class='multinomial',
            max_iter=1000)

    def fit(self, x_train, t_train):
        # Fit the model to the training data
        return self.logistic_regression.fit(x_train, t_train)

    def predict(self, x_train):
        # Return predictions for the given data
        return self.logistic_regression.predict(x_train)

    def score(self, x_train, t_train):
        # Return the mean accuracy on the given data and labels
        return self.logistic_regression.score(x_train, t_train)

    def get_best_param(self):
        # Return the best regularization value found (C_)
        return self.logistic_regression.C_
Example No. 20
 def logistic_fidelity(self):
     #group data and assign state labels
     gnd_features = np.hstack([np.real(self.ground_data.T),
                             np.imag(self.ground_data.T)])
     ex_features = np.hstack([np.real(self.excited_data.T),
                             np.imag(self.excited_data.T)])
     #liblinear wants arrays in C order
     features = np.ascontiguousarray(np.vstack([gnd_features, ex_features]))
     state = np.ascontiguousarray(np.hstack([np.zeros(self.ground_data.shape[1]),
                                             np.ones(self.excited_data.shape[1])]))
     #Set up logistic regression with cross-validation using liblinear.
     #Cs sets the inverse of the regularization strength, which will be optimized
     #through cross-validation. Uses the default Stratified K-Folds
     #CV generator, with 3 folds.
     #This is set up to be as consistent with the MATLAB implementation
     #as I can make it. --GJR
     Cs = np.logspace(-1,2,5)
     logreg = LogisticRegressionCV(Cs, cv=3, solver='liblinear')
     logreg.fit(features, state) #fit the model
     predictions = logreg.predict(features) #in-place classification
     score = logreg.score(features,state) #mean accuracy of classification
     N = len(predictions)
     S = np.sum(predictions == state) #how many we got right
     #now compute a central 95% interval from the Beta(S+1, N-S+1)
     #posterior over the success probability (uniform prior)
     c = 0.95
     flo = betaincinv(S+1, N-S+1, (1-c)/2.)
     fhi = betaincinv(S+1, N-S+1, (1+c)/2.)
     logger.info(("In-place logistic regression fidelity: " +
             "{:.2f}% ({:.2f}, {:.2f})".format(100*score, 100*flo, 100*fhi)))
def fit_logistic_regression(lr, Xt, corpus):
    Xt = preprocessing.scale(Xt)
    lr = LogisticRegressionCV()  # note: the estimator passed in is discarded and refit
    Y, le = load_Y(corpus)
    lr.fit(Xt, Y)
    print("Accuracy = {}".format(lr.score(Xt, Y)))
    return lr, Xt
Example No. 22
def logistic(Xd, yd, Xt, yt):
    yd1 = np.argmax(yd, axis=1)  # convert one-hot labels to class indices
    clf = LogisticRegressionCV(cv=10,
                               random_state=0,
                               multi_class='multinomial').fit(Xd, yd1)
    yt1 = np.argmax(yt, axis=1)
    print(clf.score(Xt, yt1))
Example No. 23
def main(args):
    np.random.seed(452346324)
    # load data
    columns = args.features.split(",")
    raw_df = pd.read_csv(args.train_data_path)
    data = raw_df[columns].values
    targets = raw_df[args.label].values
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        targets,
                                                        train_size=0.8)

    # fit the model
    ss = StandardScaler()
    x_train = ss.fit_transform(x_train)  # fit the scaler on training data and standardize
    lr = LogisticRegressionCV(fit_intercept=True,
                              Cs=np.logspace(-2, 2, 20),
                              cv=2,
                              penalty='l2',
                              solver='lbfgs',
                              tol=0.01)
    lr.fit(x_train, y_train)
    x_test = ss.transform(x_test)  # reuse the training-set scaling; do not refit on test data
    r = lr.score(x_test, y_test)
    print("Accuracy:", r)

    ModelUtils.save_model(columns, lr, args.model_path)
Example No. 24
def train(trainingData, pklFile):
	# ========================================================================= #
	# =============== STEP 1. DEFINE OUTPUT LEARNT MODEL FILE ================= #
	# ========================================================================= #
	if (pklFile == ''):
		os.system('rm -rf learntModel && mkdir learntModel')
		pklFile = 'learntModel/learntModel.pkl'
	
	# ========================================================================= #
	# ================= STEP 2. PREPARE AND FORMATTING DATA =================== #
	# ========================================================================= #
	NUMBER_OF_FEATURES = len(trainingData[0]) - 1
	NUMBER_OF_TRAINING_POINTS = len(trainingData)

	x = trainingData[:, range(0, NUMBER_OF_FEATURES)]
	y = trainingData[:, NUMBER_OF_FEATURES]
	
	# ========================================================================= #
	# ================ STEP 3. DEFINE THE REGULARIZATION GRID ================= #
	# ========================================================================= #
	alphas = np.logspace(-10, -2, 500)
	
	# ========================================================================= #
	# ===== STEP 4. PERFORM FITTING WITH THE BEST ALPHA AND SAVE THE MODEL ==== #
	# ========================================================================= #
	clf = LogisticRegressionCV(Cs=alphas)
	clf.fit(x, y)
	joblib.dump(clf, pklFile)
	
	return {"intercept": clf.intercept_, "coef":clf.coef_, "alpha":clf.C_, "accuracy":clf.score(x,y)}
def evaluate(matx, label):
    X_train, X_test, Y_train, Y_test = train_test_split(matx,
                                                        label,
                                                        test_size=0.1)
    print('split finish...')
    ss = StandardScaler(with_mean=False)
    X_train = ss.fit_transform(X_train)
    X_test = ss.transform(X_test)  # reuse the training-set scaling; do not refit
    # print(len(X_test))
    logistic = LogisticRegressionCV(Cs=np.logspace(-4, 1, 50),
                                    fit_intercept=True,
                                    penalty='l2',
                                    solver='lbfgs',
                                    tol=0.01,
                                    multi_class='ovr')
    logistic.fit(X_train, Y_train)
    print('training is finished')
    print(logistic.predict_proba(X_test))
    # report logistic regression performance
    logistic_r = logistic.score(X_train, Y_train)
    print("Logistic regression accuracy:", logistic_r)
    # print("Sparsity of coefficients: %.2f%%" % (np.mean(logistic.coef_.ravel() == 0) * 100))
    # print("Coefficients:", logistic.coef_)
    # print("Intercept:", logistic.intercept_)
    logistic_r_predict = logistic.predict_proba(X_test)
    print("log_loss value is:", log_loss(Y_test, logistic_r_predict))
Example No. 27
def execute_recursive_elimination_feature_selection(X_scaled, y):
    '''Select features by recursive feature elimination, then fit a
    cross-validated logistic model on the retained features.'''

    print("Recursive elimination")
    model = LogisticRegressionCV(solver='liblinear', cv=3)
    print("Start Recursive Elimination. Fit model with {} examples.".format(
        X_scaled.shape[0]))
    # Initialize the RFE model (by default it keeps half of the features)
    rfe = RFE(model)
    # Transforming data using RFE
    X_rfe = rfe.fit_transform(X_scaled, y)
    # Fitting the data to model
    model.fit(X_rfe, y)

    print("Best accuracy score using built-in Logistic Regression: ",
          model.score(X_rfe, y))
    print("Ranking")
    rfe_coef = pd.Series(X_scaled.columns, index=rfe.ranking_ - 1).sort_index()
    print(rfe_coef)

    print("Selected columns")
    print(X_scaled.columns[rfe.support_].values)

    return X_scaled.columns[rfe.support_].values, rfe_coef
Example No. 28
def do_sentence_encoding_experiment_libri_speech(activations_dir,
                                                 sentence_data):
    activations_per_layer = {}
    labels = []
    top_10_labels = [9, 7, 10, 8, 11, 12, 17, 13, 6, 14]
    files = [f for f in os.listdir(activations_dir) if f.endswith('.npy')]
    for file in files:
        path = file[:-4]
        if path == '2961-961-0022': continue

        label = len(sentence_data[path].split(' '))
        if label not in top_10_labels: continue

        # label = number of whitespace-separated words (computed above)
        labels.append(label)

        item = np.load('{}/{}.npy'.format(activations_dir, path))
        for i, layer_act in enumerate(item):
            # Average activations over timesteps and L2 normalize
            mean_activations = np.mean(layer_act, axis=0)
            l2_activations = mean_activations / np.sqrt(
                np.sum(mean_activations**2))

            layer_name = 'layer_{}'.format(i)
            if layer_name not in activations_per_layer:
                activations_per_layer[layer_name] = []
            activations_per_layer[layer_name].append(l2_activations)

    # counter = {}
    # for label in set(labels):
    #     counter[label] = labels.count(label)

    # sorted_counter = sorted(counter, key = counter.get, reverse = True)
    # top_10_most_occuring_labels = sorted_counter[:10]
    # result: ['9', '7', '10', '8', '11', '12', '17', '13', '6', '14']

    results = {}

    for name, activations in activations_per_layer.items():
        print('Training Logistic Regression for {} activations'.format(name))
        X_train, X_test, y_train, y_test = train_test_split(
            activations, labels, test_size=0.25, random_state=random_state)

        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        classifier = LogisticRegressionCV(Cs=5,
                                          max_iter=500,
                                          random_state=random_state).fit(
                                              X_train, y_train)
        test_accuracy = classifier.score(X_test, y_test)
        print('Accuracy for layer {}: {}'.format(name, test_accuracy))

        results[name] = test_accuracy

    return results
Example No. 29
def train(df):
    y = df.iloc[:, 1]
    X = get_feature_columns(df)

    clf = LogisticRegressionCV(cv=10, random_state=0).fit(X, y)

    print('score', clf.score(X, y))

    return clf
Example No. 30
def do_sentence_length_encoding_experiment_common_voice(sets, activations_dir):
    data = []
    labels = []
    print('{} sets to process...'.format(len(sets)))
    for set in sets:
        for item in set['set_items']:
            path = item['path'][:-4]
            data.append(
                np.load('{}/{}/{}.npy'.format(activations_dir, set['set_id'],
                                              path)))

            # Clean up sentences from punctuation
            not_allowed = [',', '.', '!', '?', '"', '-', ':', ';']
            sentence_clean = item['sentence']
            for c in not_allowed:
                sentence_clean = sentence_clean.replace(c, '')

            # Use the number of whitespace-separated words as the (categorical) label
            labels.append(len(sentence_clean.split(' ')))

    print('{} files found'.format(len(data)))

    activations_per_layer = {}
    results = {}
    for item in data:
        for i, layer_act in enumerate(item):
            # Average activations over timesteps and L2 normalize
            mean_activations = np.mean(layer_act, axis=0)
            l2_activations = mean_activations / np.sqrt(
                np.sum(mean_activations**2))

            layer_name = 'layer_{}'.format(i)
            if layer_name not in activations_per_layer:
                activations_per_layer[layer_name] = []
            activations_per_layer[layer_name].append(l2_activations)

    for name, activations in activations_per_layer.items():
        print('Training Logistic Regression model for {} activations'.format(
            name))
        X_train, X_test, y_train, y_test = train_test_split(
            activations, labels, test_size=0.25, random_state=random_state)

        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        classifier = LogisticRegressionCV(Cs=5,
                                          max_iter=500,
                                          random_state=random_state).fit(
                                              X_train, y_train)
        test_accuracy = classifier.score(X_test, y_test)
        print('Accuracy for layer {}: {}'.format(name, test_accuracy))

        results[name] = test_accuracy

    return results
Example No. 31
def cross_validation(penalty):
    #read in train and test data
    model, X, y = read_train_data()
    model, vocab, X_vals, Y_vals = read_test_data()
    #train model with penalty
    clf = LogisticRegressionCV(penalty=penalty, solver='liblinear', max_iter = 10000).fit(X, y)
    #print model accuracy
    print(clf.score(X_vals,Y_vals))
    #get model parameters
    print(clf.get_params())
Example No. 32
def bestLogisticModel(X, y):
    scaler = preprocessing.StandardScaler().fit(X)
    X_scaled = scaler.transform(X)
    model = LogisticRegressionCV(max_iter=100)
    model.fit(X_scaled, y.ravel())
    score = model.score(X_scaled, y.ravel())
    print(
        f'highest accuracy for logistic regression: {model.scores_[1.0].max()}'
    )
    print(f'coefficients for logistic regression model: {model.coef_}')
 def classify(self, mp, x_train, y_train, x_test):
     x_train = sm.add_constant(x_train)
     x_test = sm.add_constant(x_test)
     clf = LogisticRegressionCV(verbose=1, cv=5)
     log_to_info('Fitting a Logistic Regression to labeled training data...')
     clf = clf.fit(x_train, y_train)
     log_to_info('Training details')
     log_to_info('Classifier parameters: {}'.format(clf.get_params()))
     log_to_info('On training: {}'.format(clf.score(x_train, y_train) * 100.0))
     log_to_info('Predicting test value')
     y_test = clf.predict(x_test)
     log_to_info('Done!')
     return y_test
Example No. 34
def classify_with_random_samples(char, positive_sample_count, auto_apply=False, random_sample=0):
    print(char, positive_sample_count)
    started = timezone.now()
    start_time = time.time()
    query = Character.objects.filter(char=char)
    positive_samples, negative_samples, test_X, test_y, test_char_id_lst, test_accuracy_lst = \
        prepare_data_with_database2(query)
    X = []
    y = []
    if random_sample != 0:
        if positive_sample_count > 0:
            if len(positive_samples) > positive_sample_count:
                positive_samples = random.sample(positive_samples, positive_sample_count)
            if len(negative_samples) > positive_sample_count:
                negative_samples = random.sample(negative_samples, positive_sample_count)
    else:
        if len(positive_samples) > positive_sample_count:
            positive_samples.sort(key=itemgetter(2), reverse=True)
            positive_samples = positive_samples[:positive_sample_count]
        if len(negative_samples) > positive_sample_count:
            negative_samples.sort(key=itemgetter(2))
            negative_samples = negative_samples[:positive_sample_count]
    for sample in positive_samples:
        X.append(sample[0])
        y.append(sample[1])
    for sample in negative_samples:
        X.append(sample[0])
        y.append(sample[1])
    train_count = len(y)
    predict_count = len(test_y)
    if 1 == len(set(y)) or train_count < 10 or predict_count == 0:
        return
    fetch_spent = int(time.time() - start_time)
    print("fetch data done, spent %s seconds." % fetch_spent)
    start_time = time.time()
    print("training: data size: %d" % len(y))
    model = LogisticRegressionCV(cv=5, solver='liblinear', n_jobs=1)
    try:
        model.fit(X, y)
        training_spent = int(time.time() - start_time)
        print("training done, spent %s seconds." % training_spent)
        # print('params:')
        # for k, v in model.get_params().items():
        #     print('\t', k, ':', v)
        print('score: ', model.score(X, y))
    except Exception as e:
        print('except: ', e)
        traceback.print_exc()
        return
Example No. 35
# ***************************** generate predictions on validation sets
def predict_features(base_estimators, X, scaledX):
    basepredicts = [
        estimator.estimator.predict(scaledX) if estimator.need_scale else estimator.estimator.predict(X)
        for estimator in base_estimators
    ]
    return pd.DataFrame(
        np.asarray(basepredicts).T, index=X.index, columns=[estimator.name for estimator in base_estimators]
    )


# ***************************** fit a meta-model on the base predictions for the validation set
validate_basepredicts = predict_features(base_estimators, Xvalidate, Xvalidate_scaled)
lrcv = LogisticRegressionCV(Cs=30, cv=10)
lrcv.fit(validate_basepredicts, yvalidate)
lrcv.score(validate_basepredicts, yvalidate)
common.make_coefs_frame(validate_basepredicts.columns, lrcv.coef_.ravel())

# fit again with whole data
basepredict_lr = LogisticRegression(C=lrcv.C_[0])
basepredict_lr.fit(validate_basepredicts, yvalidate)
basepredict_lr.score(validate_basepredicts, yvalidate)
common.make_coefs_frame(validate_basepredicts.columns, basepredict_lr.coef_.ravel())

# ***************************** test
test_df = pd.read_csv("test_processed.csv", index_col="PassengerId")
Xtest = test_df[feature_names]
Xtest_scaled = scaler.transform(Xtest)

test_basepredict = predict_features(base_estimators, Xtest, Xtest_scaled)
final_predictions = basepredict_lr.predict(test_basepredict)
Example No. 36
feature_names = ["Pclass","Age","SibSp","Parch","Fare","IsMale",
                 'EmbarkC','EmbarkQ', 'EmbarkS',
                 "Ticket-4digit","Ticket-5digit","Ticket-6digit","Ticket-7digit","Ticket-A","Ticket-C","Ticket-F","Ticket-Others","Ticket-P","Ticket-S","Ticket-W"]
Xtrain = train_df[feature_names]
ytrain = train_df["Survived"]

# --------------------------- scale train data
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)

# --------------------------- LR
lrcv = LogisticRegressionCV(Cs=30,cv=10)
lrcv.fit(Xtrain_scaled,ytrain)

lrcv.C_
lrcv.score(Xtrain_scaled,ytrain)

def pretty_print_coef(coefs, names=None, sort=False):
    if names is None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst, key=lambda x: -np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name) for coef, name in lst)
pretty_print_coef(lrcv.coef_.ravel(),feature_names,True)

coefs = pd.DataFrame({"names": feature_names, "coefs": lrcv.coef_.ravel()}, columns=["names", "coefs"])
coefs["rank"] = np.abs(coefs.coefs)
coefs.sort_values(by="rank", inplace=True, ascending=False)
del coefs["rank"]
Example No. 37
# leftover debugging: check for non-finite values in the test matrix
# hh = np.asarray(test).reshape(-1)
# print(hh.dtype)
# hhh = np.logical_not(np.isfinite(hh))
# print(hh[hhh])
enc = OneHotEncoder(categorical_features=[4])
enc.fit(train)
train = enc.transform(train)
test = enc.transform(test)

solver = LogRegCV(n_jobs=-1)
solver.fit(train, data_train.Survived)
res = solver.predict(test)
res = pd.DataFrame({"PassengerId": data_test.PassengerId, "Survived": res})
res.to_csv("../output/logic_0.csv", index=False)
print(solver.score(train, data_train.Survived))

solver = LogRegCV(n_jobs=-1, scoring='roc_auc')
solver.fit(train, data_train.Survived)
res = solver.predict(test)
res = pd.DataFrame({"PassengerId": data_test.PassengerId, "Survived": res})
res.to_csv("../output/logic_1.csv", index=False)
print(solver.score(train, data_train.Survived))


solver = LogRegCV(n_jobs=-1, scoring='average_precision')
solver.fit(train, data_train.Survived)
res = solver.predict(test)

res = pd.DataFrame({"PassengerId": data_test.PassengerId, "Survived": res})
res.to_csv("../output/logic_2.csv", index=False)
Example No. 38
# modeling with categorical
dummies = pd.get_dummies(data['alchemy_category'])
second_model = pd.concat([X, dummies], axis = 1)

X2_train, X2_test, y2_train, y2_test = train_test_split(second_model, y)
lr2 = LogisticRegression()
lr2.fit(X2_train, y2_train)
lr2.predict(X2_test)
lr2.score(X2_test, y2_test)

# modeling with cross_validation
lrCV = LogisticRegressionCV()
lrCV.fit(X2_train, y2_train)
lrCV.predict(X2_test)
lrCV.score(X2_test, y2_test)

# models with pre normalized values & inclusion of ALL categorical variables
dummies2 = pd.get_dummies(data[maskCat])
data2 = pd.concat([X, dummies2], axis = 1)
data2 = normalize(data2, norm = 'l2')

X3_train, X3_test, y3_train, y3_test = train_test_split(data2, y)
lr3 = LogisticRegression()
lr3.fit(X3_train, y3_train)
lr3.predict(X3_test)
lr3.score(X3_test, y3_test)

lrCV2 = LogisticRegressionCV()
lrCV2.fit(X3_train, y3_train)
Example No. 39
# Prepare data
iris = sns.load_dataset("iris")
X = iris.values[:, 0:4]
y = iris.values[:, 4]

# Make test and train set
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.5, random_state=0)

################################
# Evaluate Logistic Regression
################################
lr = LogisticRegressionCV()
lr.fit(train_X, train_y)
pred_y = lr.predict(test_X)
print("Test fraction correct (LR-Accuracy) = {:.2f}".format(lr.score(test_X, test_y)))


################################
# Evaluate Keras Neural Network
################################

# Make ONE-HOT
def one_hot_encode_object_array(arr):
    '''One-hot encode a numpy array of objects (e.g. strings)'''
    uniques, ids = np.unique(arr, return_inverse=True)
    return np_utils.to_categorical(ids, len(uniques))


train_y_ohe = one_hot_encode_object_array(train_y)
test_y_ohe = one_hot_encode_object_array(test_y)
Example No. 40
x['SexCallClass']=x['SexN']*x['Call']*x['Pclass']

##x['AgeClass']=x['Age']*x['Pclass']
x['Family']=x['Parch']+x['SibSp']
##x['SexAge']=x['SexN']*x['Age']
x = (x-sp.mean(x))/sp.std(x)


n_train = 500
x_train = x.iloc[:n_train,:]
y_train = y.iloc[:n_train]
x_test = x.iloc[n_train:,:]
y_test = y.iloc[n_train:]

##x_test = x_test[~pd.isnull(x_test.Age)]
##y_test = y_test[~pd.isnull(x_test.Age)]

cv = KFold(n_splits=10)
clf = LogisticRegressionCV()
scores = []
aucs = []
for train, test in cv.split(x):
    x_train, y_train = x.iloc[train, :], y.iloc[train]
    x_test, y_test = x.iloc[test, :], y.iloc[test]
    clf.fit(x_train, y_train)
    pr = clf.predict_proba(x_test)[:, 1]
    scores.append(clf.score(x_test, y_test))
    # use predicted probabilities (not hard labels) for the precision-recall curve
    precision, recall, thres = precision_recall_curve(y_test, pr)
    aucs.append(auc(recall, precision))
print("Score = %s, Auc = %s" % (sp.mean(scores), sp.mean(aucs)))
Example No. 41
X = x[1:, :]
y = y[1:]
X = X.astype(float)
y = y.astype(float)

##################### Logistic Reg CV ############

# Cs sets the grid of inverse-regularization values. When Cs is an int, that many
# values are chosen on a logarithmic scale between 1e-4 and 1e4.
# As in support vector machines, smaller values specify stronger regularization.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

lr_cv = LogisticRegressionCV(Cs = 10, cv=5)
lr_cv = lr_cv.fit(X_train, y_train)
print('Logistic Regression CV train accuracy', lr_cv.score(X_train, y_train))
print('Logistic Regression CV test accuracy', lr_cv.score(X_test, y_test))

######### Logistic Regression Grid Search for C ############

print('******** Logistic Reg *********')

tuned_parameters = {'C': np.linspace(0.1, 10, 10)}

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=4, scoring='%s_weighted' % score)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")