def main():
    df = pd.read_csv('dataset/ElectionsData.csv')
    df['split'] = 0
    # use two of the five shuffled test folds as test (20%) and validation (20%);
    # iterate the public (train, test) interface instead of the private _iter_test_indices()
    folds = (test_idx for _, test_idx in KFold(n=len(df), n_folds=5, shuffle=True))
    df.loc[next(folds), 'split'] = 1
    df.loc[next(folds), 'split'] = 2
    raw_data = df.copy()

    raw_data[raw_data['split']==0].drop('split', axis=1).to_csv('dataset/raw_train.csv', index=False)
    raw_data[raw_data['split']==1].drop('split', axis=1).to_csv('dataset/raw_test.csv', index=False)
    raw_data[raw_data['split']==2].drop('split', axis=1).to_csv('dataset/raw_validation.csv', index=False)

    all_features, discrete_features, continuous_features, categorical_features, numeric_features = split_features_by_type(df)
    features_to_keep = {'Yearly_ExpensesK', 'Yearly_IncomeK', 'Overall_happiness_score', 'Most_Important_Issue',
                        'Avg_Residancy_Altitude', 'Will_vote_only_large_party', 'Financial_agenda_matters'}
    df = mark_negative_values_as_nan(df)
    df = outlier_detection(df, continuous_features)

    # fill missing values using correlated features
    fill_f1_by_f2_linear(df, 'Yearly_ExpensesK', 'Avg_monthly_expense_on_pets_or_plants')
    fill_f1_by_f2_linear(df, 'Yearly_IncomeK', 'Avg_size_per_room')
    fill_f1_by_f2_linear(df, 'Overall_happiness_score', 'Political_interest_Total_Score')  # not perfectly correlated, but better than nothing
    fill_f1_by_f2_discrete(df, 'Most_Important_Issue', 'Last_school_grades')
    fill_f1_by_f2_linear(df, 'Avg_Residancy_Altitude', 'Avg_monthly_expense_when_under_age_21')
    fill_f1_by_f2_discrete(df, 'Will_vote_only_large_party', 'Looking_at_poles_results')
    fill_f1_by_f2_discrete(df, 'Financial_agenda_matters', 'Vote')
    for c in features_to_keep:
        rows_to_fix = df[c].isnull()
        for row, value in enumerate(rows_to_fix):
            if value:
                # fill any remaining gaps with the feature mean within the same 'Vote' group
                df.loc[row, c] = df[df.Vote == df.Vote[row]][c].mean()

    df=df[list(features_to_keep)+['Vote', 'split']]
    reduce_Most_Important_Issue(df)
    z_score_scaling(df, list(features_to_keep.intersection(set(continuous_features))))
    l_encoder = label_encoder(df)
    df = categorical_features_transformation(df)

    with open('encoder.pickle', 'wb') as f:  # pickle files must be opened in binary mode
        pickle.dump(l_encoder, f)
    df[df['split'] == 0].drop('split', axis=1).to_csv('dataset/transformed_train.csv', index=False)
    df[df['split'] == 1].drop('split', axis=1).to_csv('dataset/transformed_test.csv', index=False)
    df[df['split']==2].drop('split', axis=1).to_csv('dataset/transformed_validation.csv', index=False)
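For reference, a minimal sketch of the same 60/20/20 split written against the modern sklearn.model_selection API (the file path and column names are taken from the snippet above; everything else is an assumption, not the author's code):

import pandas as pd
from sklearn.model_selection import KFold

df = pd.read_csv('dataset/ElectionsData.csv')
df['split'] = 0
# two of the five shuffled test folds become test (20%) and validation (20%)
folds = (test_idx for _, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(df))
df.loc[next(folds), 'split'] = 1
df.loc[next(folds), 'split'] = 2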
Example #2
def cv_valid(data, cutoff, folds, make_syn):
    
    log.info('Creating CV splits.')
    
    y = get_labels(data,cutoff)
    valid_idx = []
    for i,y_val in enumerate(y):
        if y_val==0 or y_val==1:
            valid_idx.append(i)
            
    log.info('Data label distribution: total={0}, benign={1}, malware={2}, ambiguous={3}, client_unlabeled={4}, no_vt={5}, unknown={6}.'.format(len(y), np.sum(y==0), np.sum(y==1), np.sum(y==-2), np.sum(y==-1), np.sum(y==-3), np.sum(np.isnan(y))))

    first_seen = data['time_seen']
    cuckoo_idx, splunk_idx = valid_by_types(data, valid_idx)
    
    log.info('Time split stats: min(days)={}, max(days)={}, std(days)={}.'.format(np.min(first_seen[cuckoo_idx])/86400.0, np.max(first_seen[cuckoo_idx])/86400.0, np.std(first_seen[cuckoo_idx])/86400.0))
    
    #first seen time
    time_cut = np.median(first_seen[valid_idx])
    train = []
    test = []
    for i in cuckoo_idx:
        if first_seen[i]<time_cut:
            train.append(i)
        else:
            test.append(i)
            
    cv_time = [[np.array(train), np.array(test)]]
    
    cv_cuckoo = StratifiedKFold(y[cuckoo_idx], n_folds=folds, shuffle=True)
    if len(splunk_idx)>=folds:
        cv_splunk = KFold(len(splunk_idx), n_folds=folds+1, shuffle=True)
    else:
        cv_splunk = []
        for i in xrange(folds+1):
            cv_splunk.append([[],[]])
    
    # tuples: indices from the extra splunk fold, set aside when make_syn is enabled
    syn_touples = []
    cv_sandbox = []
    cv_enterprise = []
    count = 1
    for cuckoo,splunk in zip(cv_cuckoo, cv_splunk):
        train = []
        test_sandbox = []
        test_enterprise = []
        
        train.extend([cuckoo_idx[i] for i in cuckoo[0]])
        train.extend([splunk_idx[i] for i in splunk[0]])
        
        test_sandbox.extend([cuckoo_idx[i] for i in cuckoo[1]])
        test_enterprise.extend([splunk_idx[i] for i in splunk[1]])
        
        #add the malware from cuckoo box
        test_enterprise.extend([cuckoo_idx[i] for i in cuckoo[1] if y[cuckoo_idx[i]]==1])
                    
        train = np.array(train)
        curr_sandbox = [train,np.array(test_sandbox)]
        curr_enterprise = [train,np.array(test_enterprise)]
        cv_sandbox.append(curr_sandbox)
        cv_enterprise.append(curr_enterprise)
        
        cuckoo_idx_c, splunk_idx_c = valid_by_types(data, curr_sandbox[0]) 
        cuckoo_idx_d, splunk_idx_d = valid_by_types(data, curr_sandbox[1]) 
        
        log.info('Created sandbox split %d: training size=%d (benign=%d,malware=%d,cuckoo=%d,splunk=%d), testing size=%d (benign=%d,malware=%d,cuckoo=%d,splunk=%d).' % (count, len(curr_sandbox[0]), np.sum(y[curr_sandbox[0]]==0), np.sum(y[curr_sandbox[0]]==1), len(cuckoo_idx_c), len(splunk_idx_c), len(curr_sandbox[1]), np.sum(y[curr_sandbox[1]]==0), np.sum(y[curr_sandbox[1]]==1), len(cuckoo_idx_d), len(splunk_idx_d)))
        if len(curr_enterprise[1])>0: 
            log.info('Created enterprise split %d: training size=%d (benign=%d,malware=%d), testing size=%d (benign=%d,malware=%d).' % (count, len(curr_enterprise[0]), np.sum(y[curr_enterprise[0]]==0), np.sum(y[curr_enterprise[0]]==1), len(curr_enterprise[1]), np.sum(y[curr_enterprise[1]]==0), np.sum(y[curr_enterprise[1]]==1)))
        
        count+=1

    if make_syn:
        splunk_count = 0
        for train,test in cv_splunk:
            if splunk_count==folds:
                syn_touples.extend(test)
            splunk_count+=1
        syn_touples = np.array(syn_touples)
    
    return cv_sandbox, cv_enterprise, syn_touples, cv_time
Example #3
from sklearn import datasets
from sklearn.gaussian_process import GaussianProcess
from sklearn.cross_validation import cross_val_score, KFold

# Load the dataset from scikit's data sets
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# Instantiate a GP model
gp = GaussianProcess(regr='constant',
                     corr='absolute_exponential',
                     theta0=[1e-4] * 10,
                     thetaL=[1e-12] * 10,
                     thetaU=[1e-2] * 10,
                     nugget=1e-2,
                     optimizer='Welch')

# Fit the GP model to the data performing maximum likelihood estimation
gp.fit(X, y)

# Deactivate maximum likelihood estimation for the cross-validation loop
gp.theta0 = gp.theta_  # Given correlation parameter = MLE
gp.thetaL, gp.thetaU = None, None  # None bounds deactivate MLE

# Perform a cross-validation estimate of the coefficient of determination using
# the cross_validation module (raise n_jobs to use all CPUs on the machine)
K = 20  # folds
R2 = cross_val_score(gp, X, y=y, cv=KFold(y.size, K), n_jobs=1).mean()
print("The %d-Folds estimate of the coefficient of determination is R2 = %s" %
      (K, R2))
Example #4
et = SklearnHelper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostRegressor, seed=SEED, params=ada_params)
gb_regressor = SklearnHelper(clf=GradientBoostingRegressor, seed=SEED, params=gb_regressor_params)
svm = SklearnHelper(clf=LinearSVR, seed=SEED, params=svm_params)

# -------------------------------------------------------------------------------------------------


# Note: our results differ from the original author's because we don't have the full 3M records with engineered features

ntrain = x_train.shape[0]
print(ntrain)
ntest = x_test_201610.shape[0] #need the size of a test set
print(ntest)

kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# -------------------------------------------------------------------------------------------------

svm_oof_train, svm_oof_test_201610, svm_oof_test_201611, svm_oof_test_201612, svm_oof_test_201710, svm_oof_test_201711, svm_oof_test_201712 = get_oof(svm,x_train, y_train, x_test_201610, x_test_201611, x_test_201612, x_test_201710, x_test_201711, x_test_201712) # Support Vector Classifier
print("SVM Training is complete")

# -------------------------------------------------------------------------------------------------

et_oof_train, et_oof_test_201610, et_oof_test_201611, et_oof_test_201612, et_oof_test_201710, et_oof_test_201711, et_oof_test_201712 = get_oof(et, x_train, y_train, x_test_201610, x_test_201611, x_test_201612, x_test_201710, x_test_201711, x_test_201712) # Extra Trees
print("Extra Trees Regressor Training is complete")

# -------------------------------------------------------------------------------------------------

rf_oof_train, rf_oof_test_201610, rf_oof_test_201611, rf_oof_test_201612, rf_oof_test_201710, rf_oof_test_201711, rf_oof_test_201712 = get_oof(rf,x_train, y_train, x_test_201610, x_test_201611, x_test_201612, x_test_201710, x_test_201711, x_test_201712) # Random Forest
print("Random Forest Regressor Training is complete")
Example #5
def crossValidation(data):
    k_fold = KFold(n=len(data), n_folds=10)

    Mat = np.zeros((users, items))
    for e in data:
        Mat[e[0] - 1][e[1] - 1] = e[2]

    sim_item_cosine, sim_item_jaccard, sim_item_pearson = similarity_item(Mat)
    #sim_item_cosine, sim_item_jaccard, sim_item_pearson = np.random.rand(items,items), np.random.rand(items,items), np.random.rand(items,items)
    '''sim_item_cosine = np.zeros((items,items))
	sim_item_jaccard = np.zeros((items,items))
	sim_item_pearson = np.zeros((items,items))
	f_sim_i = open("sim_item_based.txt", "r")
	for row in f_sim_i:
		r = row.strip().split(',')
		sim_item_cosine[int(r[0])][int(r[1])] = float(r[2])
		sim_item_jaccard[int(r[0])][int(r[1])] = float(r[3])
		sim_item_pearson[int(r[0])][int(r[1])] = float(r[4])
	f_sim_i.close()'''

    rmse_cosine = []
    rmse_jaccard = []
    rmse_pearson = []

    for train_indices, test_indices in k_fold:
        train = [data[i] for i in train_indices]
        test = [data[i] for i in test_indices]

        M = np.zeros((users, items))

        for e in train:
            M[e[0] - 1][e[1] - 1] = e[2]

        true_rate = []
        pred_rate_cosine = []
        pred_rate_jaccard = []
        pred_rate_pearson = []

        for e in test:
            user = e[0]
            item = e[1]
            true_rate.append(e[2])

            pred_cosine = 3.0
            pred_jaccard = 3.0
            pred_pearson = 3.0

            #item-based
            if np.count_nonzero(M[:, item - 1]):
                sim_cosine = sim_item_cosine[item - 1]
                sim_jaccard = sim_item_jaccard[item - 1]
                sim_pearson = sim_item_pearson[item - 1]
                ind = (M[user - 1] > 0)
                #ind[item-1] = False
                normal_cosine = np.sum(np.absolute(sim_cosine[ind]))
                normal_jaccard = np.sum(np.absolute(sim_jaccard[ind]))
                normal_pearson = np.sum(np.absolute(sim_pearson[ind]))
                if normal_cosine > 0:
                    pred_cosine = np.dot(sim_cosine,
                                         M[user - 1]) / normal_cosine

                if normal_jaccard > 0:
                    pred_jaccard = np.dot(sim_jaccard,
                                          M[user - 1]) / normal_jaccard

                if normal_pearson > 0:
                    pred_pearson = np.dot(sim_pearson,
                                          M[user - 1]) / normal_pearson

            if pred_cosine < 0:
                pred_cosine = 0

            if pred_cosine > 5:
                pred_cosine = 5

            if pred_jaccard < 0:
                pred_jaccard = 0

            if pred_jaccard > 5:
                pred_jaccard = 5

            if pred_pearson < 0:
                pred_pearson = 0

            if pred_pearson > 5:
                pred_pearson = 5

            #print str(user) + "\t" + str(item) + "\t" + str(e[2]) + "\t" + str(pred_cosine) + "\t" + str(pred_jaccard) + "\t" + str(pred_pearson)
            pred_rate_cosine.append(pred_cosine)
            pred_rate_jaccard.append(pred_jaccard)
            pred_rate_pearson.append(pred_pearson)

        rmse_cosine.append(
            sqrt(mean_squared_error(true_rate, pred_rate_cosine)))
        rmse_jaccard.append(
            sqrt(mean_squared_error(true_rate, pred_rate_jaccard)))
        rmse_pearson.append(
            sqrt(mean_squared_error(true_rate, pred_rate_pearson)))

        print(
            str(sqrt(mean_squared_error(true_rate, pred_rate_cosine))) + "\t" +
            str(sqrt(mean_squared_error(true_rate, pred_rate_jaccard))) +
            "\t" + str(sqrt(mean_squared_error(true_rate, pred_rate_pearson))))
        #raw_input()

    #print sum(rms) / float(len(rms))
    rmse_cosine = sum(rmse_cosine) / float(len(rmse_cosine))
    rmse_pearson = sum(rmse_pearson) / float(len(rmse_pearson))
    rmse_jaccard = sum(rmse_jaccard) / float(len(rmse_jaccard))

    print(
        str(rmse_cosine) + "\t" + str(rmse_jaccard) + "\t" + str(rmse_pearson))

    f_rmse = open("rmse_item.txt", "w")
    f_rmse.write(
        str(rmse_cosine) + "\t" + str(rmse_jaccard) + "\t" +
        str(rmse_pearson) + "\n")

    rmse = [rmse_cosine, rmse_jaccard, rmse_pearson]
    req_sim = rmse.index(min(rmse))

    print(req_sim)
    f_rmse.write(str(req_sim))
    f_rmse.close()

    if req_sim == 0:
        sim_mat_item = sim_item_cosine

    if req_sim == 1:
        sim_mat_item = sim_item_jaccard

    if req_sim == 2:
        sim_mat_item = sim_item_pearson

    #predictRating(Mat, sim_mat_item)
    return Mat, sim_mat_item
        test = admissions[admissions['fold'] == fold]
        lr.fit(train[['gpa']], train['actual_label'])
        test['predicted_label'] = lr.predict(test[['gpa']])
        correct_predictions = test[(test['predicted_label']) == (
            test['actual_label'])]
        fold_accuracies.append(len(correct_predictions) / len(test))
    return (fold_accuracies)


accuracies = train_and_test(admissions, fold_ids)
print(accuracies)
average_accuracy = np.mean(accuracies)
print(average_accuracy)

## 5. Sklearn ##

from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score

admissions = pd.read_csv("admissions.csv")
admissions["actual_label"] = admissions["admit"]
admissions = admissions.drop("admit", axis=1)
kf = KFold(len(admissions), 5, shuffle=True, random_state=8)
lr = LogisticRegression()
accuracies = cross_val_score(lr,
                             admissions[['gpa']],
                             admissions['actual_label'],
                             scoring='accuracy',
                             cv=kf)
average_accuracy = sum(accuracies) / len(accuracies)
print(accuracies, average_accuracy)
loans.info()  # DataFrame.info() prints its summary itself

# In[ ]:

# In[96]:

# Use logistic regression to analyse the data; logistic regression is a classic binary classification method
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict, KFold

lr = LogisticRegression()
cols = loans.columns
train_cols = cols.drop("loan_status")
features = loans[train_cols]
target = loans["loan_status"]
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# In[98]:

#False positive
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
#True Positive
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
#False negative
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
#True negative
def stacking_classifier(folds, models):    
    # Level 1 regression models
    regrs = models

    # 5-fold cross validation
    kf = list(KFold(len(target_train_bin), n_folds=folds, shuffle = True, random_state = 1991))

   
    # Pre-allocate the data
    blend_train = np.zeros((regressors_train_pca.shape[0], len(regrs)))     # Number of training data x Number of classifiers
    blend_test = np.zeros((regressors_validation_pca.shape[0], len(regrs)))       # Number of testing data x Number of classifiers
                  
    
    # For each classifier, we train it once per fold, i.e. len(kf) times
    for j, clf in enumerate(regrs):
        print('Training Regression Model [{}] - {}'.format(j, clf))
        blend_test_j = np.zeros((regressors_validation_pca.shape[0], len(kf))) # Number of testing data x Number of folds , we will take the mean of the predictions later
        for i, (train_index, cv_index) in enumerate(kf):
            print('Fold [{}]'.format(i))
            
            # This is the training and validation set
            X_train = regressors_train_pca[train_index]
            #Y_train = target_train_bin.iloc[train_index]
            Y_train = target_train_bin[train_index]
            X_cv = regressors_train_pca[cv_index]
            
            if(j == 0):
                # ANN
                Y_train = to_categorical(Y_train)
                clf.fit(X_train, Y_train, validation_split=0.2, epochs=5, batch_size=16, verbose=2)
            else:  
                clf.fit(X_train, Y_train)

            
            # This output will be the basis for our blended classifier to train against,
            # which is also the output of level 1 Regressors
            if(j==0):
                blend_train[cv_index, j] = clf.predict_classes(X_cv).flatten()
                blend_test_j[:, i] = clf.predict_classes(regressors_validation_pca).flatten()
            else:
                blend_train[cv_index, j] = clf.predict(X_cv).flatten()
                blend_test_j[:, i] = clf.predict(regressors_validation_pca).flatten()
           
        # Take the mean of the predictions of the cross validation set
        blend_test[:, j] = blend_test_j.mean(1)
    
    
    # Blending (predict Level 2 based on predictions on the train set)
 
#    ridgecv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', normalize=False)
#    ridgecv.fit(blend_train, target_train)
#    ridgecv.alpha_
    # Fit Ridge model with best alpha

#    bclf = Ridge(alpha=ridgecv.alpha_, normalize=False, max_iter=10000)
#    bclf.fit(blend_train, target_train)

    bclf =  LogisticRegression()
    bclf.fit(blend_train, target_train_bin)
    #bclf =NN_CLF_model(len(regrs))
    #bclf.fit(blend_train, to_categorical(target_train_bin), validation_split=0.2, epochs=5, batch_size=16, verbose=2)
    # Predict now
    predicted_level2_bin = bclf.predict(blend_test)
    #predicted_level2_bin = bclf.predict_classes(blend_test)
    score = accuracy_score(target_validation_bin, predicted_level2_bin)
    return score, predicted_level2_bin
Example #9

def loadDataSet(filename):
    strArr = [line.strip().split('\t') for line in open(filename).readlines()]
    dataSet = [map(float, line) for line in strArr]
    dataMat = np.mat(dataSet)
    m, n = np.shape(dataMat)
    return dataMat[:, :n - 1], dataMat[:, -1]


if __name__ == "__main__":
    x, y = loadDataSet('../2-knn/datingTestSet2.txt')

    # split the dataset
    m = np.shape(x)[0]
    kf = KFold(m, n_folds=5, shuffle=True)  # split the 1000 samples into 5 folds
    clf = neighbors.KNeighborsClassifier(n_neighbors=3)
    for iteration, data in enumerate(kf, start=1):
        clf.fit(x[data[0]], np.ravel(y[data[0]]))
        answer = clf.predict(x[data[1]])
        print 'iteration', iteration
        print(classification_report(y[data[1]], answer))

    # train the KNN classifier
    # x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
    # clf = neighbors.KNeighborsClassifier(n_neighbors=3)
    # clf.fit(x_train,np.ravel(y_train))
    # answer = clf.predict(x_test)
    # print(classification_report(y_test,answer))

    # precision,recall,thresholds = precision_recall_curve(y_test,answer)  # for a binary classification problem
Example #10
def run_cross_validation_create_models(nfolds=10):
    # input image dimensions
    batch_size = 16
    nb_epoch = 25
    random_state = 51
    restore_from_last_checkpoint = 1

    train_data, train_target, train_id, driver_id, unique_drivers = read_and_normalize_train_data(
    )

    yfull_train = dict()
    kf = KFold(len(unique_drivers),
               n_folds=nfolds,
               shuffle=True,
               random_state=random_state)
    num_fold = 0
    sum_score = 0
    for train_drivers, test_drivers in kf:
        model = VGG_16()
        unique_list_train = [unique_drivers[i] for i in train_drivers]
        X_train, Y_train, train_index = copy_selected_drivers(
            train_data, train_target, driver_id, unique_list_train)
        unique_list_valid = [unique_drivers[i] for i in test_drivers]
        X_valid, Y_valid, test_index = copy_selected_drivers(
            train_data, train_target, driver_id, unique_list_valid)

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))
        print('Train drivers: ', unique_list_train)
        print('Test drivers: ', unique_list_valid)

        kfold_weights_path = os.path.join(
            'cache', 'weights_kfold_vgg16_' + str(num_fold) + '.h5')
        if not os.path.isfile(
                kfold_weights_path) or restore_from_last_checkpoint == 0:
            callbacks = [
                EarlyStoppingByLossVal(monitor='val_loss',
                                       value=0.00001,
                                       verbose=1),
                EarlyStopping(monitor='val_loss', patience=5, verbose=0),
                ModelCheckpoint(kfold_weights_path,
                                monitor='val_loss',
                                save_best_only=True,
                                verbose=0),
            ]
            model.fit(X_train,
                      Y_train,
                      batch_size=batch_size,
                      nb_epoch=nb_epoch,
                      shuffle=True,
                      verbose=1,
                      validation_data=(X_valid, Y_valid),
                      callbacks=callbacks)
        if os.path.isfile(kfold_weights_path):
            model.load_weights(kfold_weights_path)

        # score = model.evaluate(X_valid, Y_valid, show_accuracy=True, verbose=0)
        # print('Score log_loss: ', score[0])

        predictions_valid = model.predict(X_valid.astype('float32'),
                                          batch_size=batch_size,
                                          verbose=1)
        score = log_loss(Y_valid, predictions_valid)
        print('Score log_loss: ', score)
        sum_score += score * len(test_index)

        # Store valid predictions
        for i in range(len(test_index)):
            yfull_train[test_index[i]] = predictions_valid[i]

    score = sum_score / len(train_data)
    print("Log_loss train independent avg: ", score)

    predictions_valid = get_validation_predictions(train_data, yfull_train)

    print('Final log_loss: {}, nfolds: {} epoch: {}'.format(
        score, nfolds, nb_epoch))
    info_string = 'loss_' + str(score) \
                  + '_folds_' + str(nfolds) \
                  + '_ep_' + str(nb_epoch)

    save_useful_data(predictions_valid, train_id, model, info_string)

    score1 = log_loss(train_target, predictions_valid)
    if abs(score1 - score) > 0.0001:
        print('Check error: {} != {}'.format(score, score1))
Example #11
feats = df_train.drop("revenue", axis=1)
X = feats.values  #features
y = df_train["revenue"].values  #target

# drop extreme-revenue outliers using a boolean mask (numpy arrays have no pop())
mask = y <= 10000000
X, y = X[mask], y[mask]

### Linear Regression ###

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

kf = KFold(len(y), n_folds=15, shuffle=True)

y_pred = np.zeros(len(y), dtype=y.dtype)  # where we'll accumulate predictions
lr = LinearRegression()

# CV Loop
for train_index, test_index in kf:
    # for each iteration of the for loop we'll do a test train split
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    t = StandardScaler()
    X_train = t.fit_transform(X_train)
    lr.fit(X_train, y_train)  # Train on the training data

    X_test = t.transform(X_test)
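    # (the snippet is cut off here; presumably the held-out predictions are
    #  accumulated next, e.g. y_pred[test_index] = lr.predict(X_test), which is
    #  what the "where we'll accumulate predictions" comment above suggests)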
Example #12
9. others
from shutil import copyfile
	copyfile(src,file)
# make and write file
script=open(os.path.join(output_file,'Dodge_'+str(idx)+'.txt'),'a')
script.write('1'+'\n')
script.close()
	
10. glob
path = os.path.join('..','data','train',fld,'*jpg')
files = glob.glob(path)

11. sklearn
#K-Folds cross validation iterator
from sklearn.cross_validation import KFold
	kf = KFold(len(X_train), n_folds=n_fold, shuffle=True, random_state=random_state)
	for train_idx, cv_idx in kf: 
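		# typical loop body (not part of the original note; a sketch that assumes
		# X_train and Y_train are numpy arrays):
		X_tr, X_cv = X_train[train_idx], X_train[cv_idx]
		Y_tr, Y_cv = Y_train[train_idx], Y_train[cv_idx]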

12. keras
callbacks = [EarlyStopping(monitor='val_loss', patience=3, verbose=0)]
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
      shuffle=True, verbose=2, validation_data=(X_valid, Y_valid),
      callbacks=callbacks)


model.add(Convolution2D(12,4,4, border_mode='same',trainable=False))

13. pandas
result1 = pd.DataFrame(predictions, columns=['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT'])
result1.loc[:, 'image'] = pd.Series(test_id, index=result1.index)
Example #13
#%%
# Lasso
model = linear_model.Lasso(alpha = 0.001)
model.fit(trainX, trainY)
prediction = model.predict(testX)
print("Lasso Accuracy: ", model.score(testX,testY))

#%%
#Ridge
model = linear_model.Ridge(alpha = 0.05, normalize=True)
model.fit(trainX, trainY)
prediction = model.predict(testX)
print("Ridge Accuracy: ", model.score(testX,testY))

#%%
kfold = KFold(n=len(wine), n_folds=10, shuffle=True, random_state=10)  # old-style KFold takes the sample count, not the fold count, as its first argument

#%%
cvMean = []
results = []
classifiers = ['Linear Svm','Radial Svm','Logistic Regression','Decision Tree','KNN']
models = [svm.SVC(kernel='linear'),svm.SVC(kernel='rbf'),LogisticRegression(),DecisionTreeClassifier(),KNeighborsClassifier(n_neighbors=3)]
for i in models:
    model = i
    result = cross_val_score(model, wine[wine.columns[:11]], wine['quality'],cv=kfold, scoring='accuracy')
    results.append(result)
    cvMean.append(result.mean())
new_models_df = pd.DataFrame(cvMean, index=classifiers)
new_models_df.columns = ['CV Mean']
new_models_df
    def get_ten_fold_crossvalid_perfermance(self, fisher_mode, settings=None):
        analysis_scr = []
        predicted_score = False
        reduce_ratio = 1
        #for seq_no in range(1, self.ddi_obj.total_number_of_sequences+1):
        #subset_size = math.floor(self.ddi_obj.total_number_of_sequences / 10.0)
        kf = KFold(self.ddi_obj.total_number_of_sequences, n_folds=10)
        #for subset_no in range(1, 11):
        for ((train_index, test_index), subset_no) in izip(kf, range(1, 11)):
            #for train_index, test_index in kf;
            print("Subset:", subset_no)
            print("Train index: ", train_index)
            print("Test index: ", test_index)
            #logger.info('subset number: ' + str(subset_no))
            if 1:
                print "SVM"
                #start_index = int((subset_no - 1) * subset_size + 1)
                #if subset_no == 10:
                #    end_index  = int(max(start_index + subset_size, self.ddi_obj.total_number_of_sequences))
                #else:
                #    end_index  = int(start_index + subset_size)
                #print  start_index, end_index
                #(train_X_10fold, train_y_10fold),(train_X_reduced, train_y_reduced), (test_X, test_y) = self.ddi_obj.get_ten_fold_crossvalid_one_subset(start_index, end_index, reduce_ratio = reduce_ratio)
                (train_X_10fold,
                 train_y_10fold), (train_X_reduced, train_y_reduced), (
                     test_X,
                     test_y) = self.ddi_obj.get_ten_fold_crossvalid_one_subset(
                         train_index, test_index, reduce_ratio=reduce_ratio)
                standard_scaler = preprocessing.StandardScaler().fit(
                    train_X_reduced)
                scaled_train_X = standard_scaler.transform(train_X_reduced)
                scaled_test_X = standard_scaler.transform(test_X)
                Linear_SVC = LinearSVC(C=1, penalty="l2")
                Linear_SVC.fit(scaled_train_X, train_y_reduced)
                predicted_test_y = Linear_SVC.predict(scaled_test_X)
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SVM', isTest) + tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new

                predicted_train_y = Linear_SVC.predict(scaled_train_X)
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SVM', isTest) + tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))

            # direct deep learning
            min_max_scaler = Precessing_Scaler_0_9()
            X_train_pre_validation_minmax = min_max_scaler.fit(train_X_reduced)
            X_train_pre_validation_minmax = min_max_scaler.transform(
                train_X_reduced)
            x_test_minmax = min_max_scaler.transform(test_X)
            pretraining_X_minmax = min_max_scaler.transform(train_X_10fold)
            x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(
                X_train_pre_validation_minmax,
                train_y_reduced,
                test_size=0.4,
                random_state=42)
            finetune_lr = 1
            batch_size = 100
            pretraining_epochs = cal_epochs(5000,
                                            x_train_minmax,
                                            batch_size=batch_size)
            #pretrain_lr=0.001
            pretrain_lr = 0.001
            training_epochs = 1500
            hidden_layers_sizes = [100, 100]
            corruption_levels = [0.1, 0.1]
            if 1:
                print "direct deep learning"
                sda = trainSda(x_train_minmax, y_train_minmax,
                             x_validation_minmax, y_validation_minmax ,
                             x_test_minmax, test_y,
                             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                             training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                             pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                 )
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append((
                    self.ddi, subset_no, fisher_mode, 'DL', isTest
                ) + tuple(
                    performance_score(y_train, training_predicted).values()))

                test_predicted = sda.predict(x_test_minmax)
                y_test = test_y
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL', isTest) +
                    tuple(performance_score(y_test, test_predicted).values()))

            if 0:
                # deep learning using unlabeled data for pretraining
                print 'deep learning with unlabeled data'
                pretraining_epochs = cal_epochs(5000,
                                                pretraining_X_minmax,
                                                batch_size=batch_size)
                sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                             x_validation_minmax, y_validation_minmax ,
                             x_test_minmax, test_y,
                             pretraining_X_minmax = pretraining_X_minmax,
                             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                             training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                             pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                 )
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda_unlabel.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_U', isTest) + tuple(
                        performance_score(y_train, training_predicted,
                                          predicted_score).values()))

                test_predicted = sda_unlabel.predict(x_test_minmax)
                y_test = test_y

                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_U', isTest) + tuple(
                        performance_score(y_test, test_predicted,
                                          predicted_score).values()))
            if 0:
                # deep learning using split network
                print 'deep learning using split network'
                # get the new representation for A set. first 784-D
                pretraining_epochs = 5000
                hidden_layers_sizes = [100, 100, 100]
                corruption_levels = [0, 0, 0]

                x = x_train_minmax[:, :x_train_minmax.shape[1] / 2]
                print "original shape for A", x.shape
                a_MAE_A = train_a_MultipleAEs(
                    x,
                    pretraining_epochs=pretraining_epochs,
                    pretrain_lr=pretrain_lr,
                    batch_size=batch_size,
                    hidden_layers_sizes=hidden_layers_sizes,
                    corruption_levels=corruption_levels)
                new_x_train_minmax_A = a_MAE_A.transform(
                    x_train_minmax[:, :x_train_minmax.shape[1] / 2])
                x = x_train_minmax[:, x_train_minmax.shape[1] / 2:]

                print "original shape for B", x.shape
                a_MAE_B = train_a_MultipleAEs(
                    x,
                    pretraining_epochs=pretraining_epochs,
                    pretrain_lr=pretrain_lr,
                    batch_size=batch_size,
                    hidden_layers_sizes=hidden_layers_sizes,
                    corruption_levels=corruption_levels)
                new_x_train_minmax_B = a_MAE_B.transform(
                    x_train_minmax[:, x_train_minmax.shape[1] / 2:])

                new_x_test_minmax_A = a_MAE_A.transform(
                    x_test_minmax[:, :x_test_minmax.shape[1] / 2])
                new_x_test_minmax_B = a_MAE_B.transform(
                    x_test_minmax[:, x_test_minmax.shape[1] / 2:])
                new_x_validation_minmax_A = a_MAE_A.transform(
                    x_validation_minmax[:, :x_validation_minmax.shape[1] / 2])
                new_x_validation_minmax_B = a_MAE_B.transform(
                    x_validation_minmax[:, x_validation_minmax.shape[1] / 2:])
                new_x_train_minmax_whole = np.hstack(
                    (new_x_train_minmax_A, new_x_train_minmax_B))
                new_x_test_minmax_whole = np.hstack(
                    (new_x_test_minmax_A, new_x_test_minmax_B))
                new_x_validationt_minmax_whole = np.hstack(
                    (new_x_validation_minmax_A, new_x_validation_minmax_B))

                finetune_lr = 1
                batch_size = 100
                pretraining_epochs = cal_epochs(5000,
                                                x_train_minmax,
                                                batch_size=batch_size)
                #pretrain_lr=0.001
                pretrain_lr = 0.001
                training_epochs = 1500
                hidden_layers_sizes = [100, 100, 100]
                corruption_levels = [0, 0, 0]

                sda_transformed = trainSda(new_x_train_minmax_whole, y_train_minmax,
                     new_x_validationt_minmax_whole, y_validation_minmax ,
                     new_x_test_minmax_whole, y_test,
                     hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                     training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                     pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                     )

                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda_transformed.predict(
                    new_x_train_minmax_whole)
                y_train = y_train_minmax

                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S', isTest) + tuple(
                        performance_score(y_train, training_predicted,
                                          predicted_score).values()))

                test_predicted = sda_transformed.predict(
                    new_x_test_minmax_whole)
                y_test = test_y

                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S', isTest) + tuple(
                        performance_score(y_test, test_predicted,
                                          predicted_score).values()))

        report_name = filename + '_' + '_test10fold_'.join(
            map(str, hidden_layers_sizes)
        ) + '_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(
            reduce_ratio) + '_' + str(training_epochs) + '_' + current_date
        saveAsCsv(predicted_score, report_name,
                  performance_score(y_test, test_predicted, predicted_score),
                  analysis_scr)
Example #15
lasagne.layers.set_all_param_values(net['prob'], d['param values'])
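# (kf is not defined in this excerpt; presumably an old-style KFold over the
#  training labels, e.g. kf = KFold(len(y), n_folds=..., shuffle=True))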
for i, (tr_ix, val_ix) in enumerate(kf):
    print('CV Fold', i)
    X_tr = X[tr_ix]
    y_tr = y[tr_ix]
    X_val = X[val_ix]
    y_val = y[val_ix]

    #net['new_output'] = DenseLayer(net['pool5/7x7_s1'], num_units=10, nonlinearity=softmax, W=lasagne.init.Normal(0.01))
    lasagne.layers.set_all_param_values(net['prob'], d['param values'])
    learning_rate.set_value(0.0002)

    for epoch in range(2):

        kf2 = KFold(len(y_tr),
                    n_folds=np.floor(len(y_tr) / BATCH_SIZE),
                    shuffle=True,
                    random_state=1)
        progbar = Progbar(np.floor(len(y_tr) / BATCH_SIZE))
        for j, (_, ix) in enumerate(kf2):
            loss, acc = train_batch(ix)
            progbar.add(1)

        learning_rate.set_value(learning_rate.get_value() *
                                learning_rate_decay)

        v_ix = range(len(y_val))
        t_ix = range(len(y_tr))
        np.random.shuffle(v_ix)
        np.random.shuffle(t_ix)

        tr_loss_tot = 0.
data = featureFormat(my_dataset, features_list)

### split into labels and features (this line assumes that the first
### feature in the array is the label, which is why "poi" must always
### be first in features_list)
labels, features = targetFeatureSplit(data)

### machine learning goes here!
### please name your classifier clf for easy export below

### deploying feature selection
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    features, labels, test_size=0.1, random_state=42)

### use KFold for split and validate algorithm
kf = KFold(len(labels), 3)
for train_indices, test_indices in kf:
    #make training and testing sets
    features_train = [features[ii] for ii in train_indices]
    features_test = [features[ii] for ii in test_indices]
    labels_train = [labels[ii] for ii in train_indices]
    labels_test = [labels[ii] for ii in test_indices]
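# note: as written, the loop above only keeps the split from the last KFold
# iteration, so the decision tree below is trained and scored on a single fold
# rather than averaged over all three folds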

t0 = time()

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
score = clf.score(features_test, labels_test)
print 'accuracy before tuning ', score

print "Decision tree algorithm time:", round(time() - t0, 3), "s"
Example #17
process_mask[:, 30:] = 0
process_mask_img = nibabel.Nifti1Image(process_mask, mask_img.get_affine())

### Searchlight computation ###################################################

# Make processing parallel
# /!\ As each thread will print its progress, n_jobs > 1 could mess up the
#     information output.
n_jobs = 1

### Define the cross-validation scheme used for validation.
# Here we use a KFold cross-validation on the session, which corresponds to
# splitting the samples into 4 folds and making 4 runs, using each fold as a
# test set once and the others as learning sets
from sklearn.cross_validation import KFold
cv = KFold(y.size, n_folds=4)

import nilearn.decoding
# The radius is the one of the Searchlight sphere that will scan the volume
searchlight = nilearn.decoding.SearchLight(mask_img,
                                           process_mask_img=process_mask_img,
                                           radius=5.6,
                                           n_jobs=n_jobs,
                                           verbose=1,
                                           cv=cv)
searchlight.fit(fmri_img, y)

### F-scores computation ######################################################
from nilearn.input_data import NiftiMasker

# For decoding, standardizing is often very important
Example #18
from sklearn.externals import joblib
import time
from sklearn.naive_bayes import MultinomialNB

filename = '/Users/jzhy/Downloads/train.csv'
data = pd.read_csv(filename)

X = numpy.zeros((len(data.x), 4))
X[:, 0] = data.x
X[:, 1] = data.y
X[:, 2] = data.accuracy
X[:, 3] = data.time
Y = numpy.zeros((len(data.x), 1))
Y = data.place_id

XX = preprocessing.scale(X)
YY = numpy.unique(Y)

kf = KFold(len(X), n_folds=len(Y) / 10000 + 1)

clf = MultinomialNB()

i = 0
for train, test in kf:
    # each ~10k-row 'test' fold is used as a batch for incremental fitting;
    # the classes argument should be the 1-D array of unique labels
    clf.partial_fit(X[test, :], Y[test], classes=YY)
    i = i + 1
    print i
joblib.dump(clf, 'MultinomialNB.pkl')

exit()
Example #19
# Create the dataframe containing the unsorted preictal samples features
ix_1 = 1990401 # Index of the first sample of the category
second_preictal_index = df[df['index'] >= ix_1].index.tolist()
#print "second_preictal_list", len(second_preictal_index) [debug]

# Create two dataframes to store separately the preictal and the interictal samples
preictal_df = df.loc[(df['class'] == 1) & (df['index'] < ix_1)]
preictal_df = preictal_df.sort_values('index', axis = 0)
# print "preictal_df", preictal_df.shape [debug]

interictal_df = (df.loc[df['class'] == 0])
interictal_df = interictal_df.sort_values('index',axis = 0)
# print "interictal_df", interictal_df.shape [debug]

# Create the train test splits for the preictal and the interictal samples
preictal_folds = list(KFold(n = preictal_df.shape[0], n_folds=25, shuffle=False))
interictal_folds = list(KFold(n = interictal_df.shape[0], n_folds=24, shuffle=False))


# Create the test set
test_i = [] # create the list for temporary storage of interictal sample indices for the test set
test_p = [] # create the list for temporary storage of preictal sample indices for the test set

# Compose the list of indices for the test set
for i in range(25): 
    if ((i+1)%5) == 0: 
        tr_p, tt_p = preictal_folds[i]
        test_p = test_p + list(tt_p+interictal_df.shape[0])
    if ((i+1)%6) == 0 and i < 24: 
        tr_i, tt_i = interictal_folds[i] 
        test_i = test_i + list(tt_i) 
Example #20
def main():
	if len(sys.argv) == 1:
		print 'need filename'
		sys.exit(-1)
	else:
		infilename = sys.argv[-1]
		print infilename

	#npzfile = np.load('data/unigram_bigram_ner_senti_pos_lda_data.npz')
	npzfile = np.load(infilename)
	X = npzfile['X'];
	y = npzfile['y'];
	#split the data into 8:2 -> training:testing 
	trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.2, random_state=0)

	print 'feature size: '+str(np.shape(X))

	
	feature_index = set() # store the best feature indices
	

	kf = KFold(trainX.shape[0], n_folds=5) # kfold on training set for feature selection
	for train_index, test_index in kf: 
		trainX_train, trainX_test = trainX[train_index], trainX[test_index]
		trainy_train, trainy_test = trainy[train_index], trainy[test_index]  # index trainy, not the full y: train_test_split shuffled the rows

		auc_best_global = 0; # best auc in each cross validation
		xtrainBest = []	# store the best feature matrix for the training section of trainX
		xtestBest = [] # store the best feature matrix for the testing section of trainX
		residual_col_indices = set() # residual column indices to check for each iteration when adding new features
		for i in range(0,X.shape[1]): #init the set with all col indices
			residual_col_indices.add(i)

		for i in range(0, X.shape[1]):

			colInd_best = -1;
			auc_best_local = 0 # init to 0
			for colInd in residual_col_indices:

				if i == 0: # if it's the first feature to add
					xtrainCur = trainX_train[:,colInd].reshape(trainX_train.shape[0],-1) #convert to a column vector
					xtestCur = trainX_test[:,colInd].reshape(trainX_test.shape[0],-1)
				else: 
					xtrainCur = np.hstack((xtrainBest, trainX_train[:,colInd].reshape(trainX_train.shape[0],-1) ))
					xtestCur =  np.hstack((xtestBest, trainX_test[:,colInd].reshape(trainX_test.shape[0],-1) ))

				clf = LogisticRegression();
				clf.fit(xtrainCur, trainy_train)


				y_true, y_pred = trainy_test, clf.predict(xtestCur)
				auc = roc_auc_score(y_true, y_pred) # auc score
				
				if auc_best_local < auc:
					auc_best_local = auc
					colInd_best = colInd
					print 'auc = ' + str(auc_best_local) + '\tcolInd_best = '+str(colInd_best)
					

			if auc_best_global < auc_best_local : # if auc is increasing by adding new features
				if i == 0: # if it's the first feature to add
					xtrainBest = trainX_train[:,colInd_best].reshape(trainX_train.shape[0],-1)
					xtestBest = trainX_test[:,colInd_best].reshape(trainX_test.shape[0],-1)
				else:
					xtrainBest = np.hstack((xtrainBest,trainX_train[:,colInd_best].reshape(trainX_train.shape[0],-1)))
					xtestBest = np.hstack((xtestBest,trainX_test[:,colInd_best].reshape(trainX_test.shape[0],-1)))

				print 'feature index to add: '+str(colInd_best)
				feature_index.add(colInd_best) # union of all features selected during each k-fold CV
				residual_col_indices.remove(colInd_best)
				auc_best_global = auc_best_local

				if auc_best_global == 1:
					break;
			else: 
				break;
				

		print 'auc_best_global found on current trainX_test fold: '+str(auc_best_global)

	print '# features selected = '+str(len(feature_index)) 
	feature_index = list(feature_index)
	print 'feature_index = ' + str(feature_index)
	# should NOT sort feature_index before test!

	outfilename = infilename[0:-8] +'selected.npz' 
	np.savez(outfilename,X = X[:,feature_index], y = y)

	clf.fit(trainX[:,feature_index], trainy)
	testy_true, testy_pred = testy, clf.predict(testX[:,feature_index])
	auc_test = roc_auc_score(testy_true, testy_pred)		
	print 'auc test = '+str(auc_test)

	# ---------------------------------- tune params ----------------------------------

	# Set the parameters by cross-validation
	tuned_parameters = [{},
						{'penalty': ['l2'], 'C':np.logspace(-5, 4, 10), 'solver': ['sag'] ,'max_iter':[500] },
						{'penalty': ['l2'], 'C':np.logspace(-5, 4, 10), 'solver': ['newton-cg'] ,'max_iter':[500] },
						{'penalty': ['l2'], 'C':np.logspace(-5, 4, 10), 'solver': ['lbfgs'] ,'max_iter':[500] },
						{'penalty': ['l2','l1'], 'C':np.logspace(-5, 4, 10), 'solver': ['liblinear'] ,'max_iter':[500] }
						]
	clf = GridSearchCV(LogisticRegression(class_weight= 'balanced'), tuned_parameters, cv=5, scoring= None)

	clf.fit(trainX[:,feature_index], trainy)

	print("Best parameters set found on development set:")
	print(clf.best_params_)
	y_true, y_pred = testy, clf.predict(testX[:,feature_index])
	auc = roc_auc_score(y_true, y_pred)  # score the tuned model's predictions
	print 'accuracy = ' + str(accuracy_score(y_true, y_pred))
	print 'auc = ' + str(auc)
Example #21
miz = aud_model.Functional_Model(input_neurons=input_neurons,
                                 dropout1=dropout1,
                                 cross_validation=cross_validation,
                                 act1=act1,
                                 act2=act2,
                                 act3=act3,
                                 nb_filter=nb_filter,
                                 filter_length=filter_length,
                                 num_classes=num_classes,
                                 model=model,
                                 dimx=dimx,
                                 dimy=dimy)

np.random.seed(68)
if cross_validation:
    kf = KFold(len(tr_X), folds, shuffle=True, random_state=42)
    results = []
    for train_indices, test_indices in kf:
        train_x = [tr_X[ii] for ii in train_indices]
        train_y = [tr_y[ii] for ii in train_indices]
        test_x = [tr_X[ii] for ii in test_indices]
        test_y = [tr_y[ii] for ii in test_indices]
        #train_y = to_categorical(train_y,num_classes=len(labels))
        #test_y = to_categorical(test_y,num_classes=len(labels))

        train_x = np.array(train_x)
        train_y = np.array(train_y)
        test_x = np.array(test_x)
        test_y = np.array(test_y)
        print "Development Mode"
Example #22
def main():
    parser = argparse.ArgumentParser(
        description="Transform csv files into numpy array")
    parser.add_argument('-d',
                        '--data',
                        required=True,
                        help="The data directory")
    args = parser.parse_args()
    learning_rate = 0.0001
    L1_reg = 0.00
    L2_reg = 0.0001
    n_epochs = 1000
    batch_size = 32
    n_hidden = 1000
    ds = pickle.load(open(os.path.join(args.data, 'ds.npy'), 'rb'))  # open pickled data in binary mode
    trI = ds['trI']
    trX = ds['trX'].toarray()
    trY = ds['trY'].astype(np.int32)
    teI = ds['teI']
    teX = ds['teX'].toarray()
    allX = np.vstack((trX, teX))
    means, stds = calculate_mean_and_std(allX)
    normailize_by_zvalue(means, stds, allX)
    #normailize_by_minmax(allX);
    trX = allX[0:trX.shape[0], :]
    teX = allX[trX.shape[0]:trX.shape[0] + teX.shape[0], :]
    kf = KFold(trX.shape[0], n_folds=5)
    trainIds = None
    testIds = None
    for train, test in kf:
        trainIds = train
        testIds = test
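    # note: only the split from the last KFold iteration is kept, so the network
    # below is validated on a single fold rather than with full 5-fold CV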
    cv_train_X = theano.shared(trX[trainIds, :], 'cv_train_X')
    cv_test_X = theano.shared(trX[testIds, :], 'cv_test_X')
    cv_train_Y = theano.shared(trY[trainIds], 'cv_train_Y')
    cv_test_Y = theano.shared(trY[testIds], 'cv_test_Y')
    ncvtr = len(trainIds)
    ncvte = len(testIds)
    nte = teX.shape[0]
    n_train_batches = int(np.ceil(len(trainIds) * 1.0 / batch_size))
    n_valid_batches = int(np.ceil(len(testIds) * 1.0 / batch_size))
    n_test_batches = int(np.ceil(teX.shape[0] * 1.0 / batch_size))
    teX = theano.shared(teX)
    rng = np.random.RandomState(1234)
    print('... building the model')
    left = T.lscalar()
    right = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')
    classifier = MLP(rng=rng, input=x, n_in=149, n_hidden=n_hidden, n_out=2)
    cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 +
            L2_reg * classifier.L2_sqr)
    validate_model = theano.function(inputs=[left, right],
                                     outputs=classifier.errors(y),
                                     givens={
                                         x: cv_test_X[left:right],
                                         y: cv_test_Y[left:right]
                                     })
    gparams = [T.grad(cost, param) for param in classifier.params]
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]
    train_model = theano.function(inputs=[left, right],
                                  outputs=cost,
                                  updates=updates,
                                  givens={
                                      x: cv_train_X[left:right],
                                      y: cv_train_Y[left:right]
                                  })
    test_model = theano.function(inputs=[left, right],
                                 outputs=classifier.output,
                                 givens={
                                     x: teX[left:right],
                                 })
    print('... training')

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            minibatch_avg_cost = train_model(
                minibatch_index * batch_size,
                min((minibatch_index + 1) * batch_size, ncvtr))
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                validation_losses = [
                    validate_model(i * batch_size,
                                   min((i + 1) * batch_size, ncvte))
                    for i in range(n_valid_batches)
                ]
                this_validation_loss = np.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    probs = np.vstack([
        test_model(i * batch_size, min((i + 1) * batch_size, nte))
        for i in range(n_test_batches)
    ])
    fid = open('/local/db/uqdxingz/Santander/sub/mlp.csv', 'w')
    fid.write('ID,TARGET\n')
    for i in range(len(teI)):
        fid.write('%d,%.9f\n' % (int(teI[i]), probs[i][1]))
    fid.close()
Example #23
     xs = xyz[0][count]
     ys = xyz[1][count]
     zs = xyz[2][count]
     count = count+1
     ax.scatter(xs, ys, zs, c=plt.cm.coolwarm(zs), alpha=.4)
 plt.show()
      
 # training
 xy = mat['dados_rbf'][:,:2]
 z = mat['dados_rbf'][:,2].reshape(-1,1)
 ntd = xy.shape[0]   # number of samples
 #nf = x.shape[1]   # number of features
 
 # hyperparameters
 n = 8  # number of RBF neurons
 kf = KFold(ntd, n_folds=3)
 
 i = 1
 # cross-validation
 for train_index, test_index in kf:
     #print("TRAIN:", train_index, "TEST:", test_index)
     X_train, X_test = xy[train_index], xy[test_index]
     y_train, y_test = z[train_index], z[test_index]    
 
     # rbf regression
     rbf = RBF(2, n, 1)
     rbf.train(X_train, y_train)
     zest = rbf.test(X_test)
     
         
     MSE = mean_squared_error(y_test, zest)
Example #24
train_feat['id'] = train_feat['id'].apply(lambda x: 0 - int(x[1:])
                                          if 'p' in x else int(x[1:]))
test_feat['id'] = test_feat['id'].apply(lambda x: 0 - int(x[1:])
                                        if 'p' in x else int(x[1:]))

predictors = train_feat.columns.drop(
    ['label', 'enddate', 'hy_16.0', 'hy_91.0', 'hy_94.0'])

print('Starting 5-fold CV training...')
scores = []
t0 = time.time()
mean_score = []
train_preds = np.zeros(len(train_feat))
test_preds = np.zeros(len(test_feat))
kf = KFold(len(train_feat), n_folds=5, shuffle=True, random_state=520)
for i, (train_index, test_index) in enumerate(kf):
    lgb_train = lgb.Dataset(train_feat[predictors].iloc[train_index],
                            train_feat['label'].iloc[train_index])
    lgb_test = lgb.Dataset(train_feat[predictors].iloc[test_index],
                           train_feat['label'].iloc[test_index])

    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 20,
        'num_leaves': 150,
        'learning_rate': 0.01,
        'subsample': 0.7,
Пример #25
0
max_auc = 0.62  # best auc so far 
best_parameter = {'nodes': 200, 'weight decay': 1e-5} 
average_aucs = np.array([]) # array to store all average aucs for different parameters
sd_aucs = np.array([]) # to store all standard deviation of aucs
for i in range(len(grid)):    
    
    print('---'*30)
    print('remaining iterations:', len(grid) - i)
    print('best auc so far is:', max_auc)
    print('best set of parameters are', best_parameter)
    
    next_parameters = {'nodes': grid[i][0], 'weight decay': grid[i][1],'regulization': grid[i][2]}
    print('Now try: ', next_parameters)
    
    nb_folds = 5
    kfolds = KFold(len(y), nb_folds)
    #av_roc = 0.
    
    auc = np.array([]) # array to store all aucs in each fold
    f = 0
    for train, valid in kfolds:
        
        print('---'*20)
        print('Fold', f+1)
        
        # counting folds
        f += 1
        # splitting the folds
        X_train = X[train]
        X_valid = X[valid]
        Y_train = Y[train]
Пример #26
0
def make_mf_sliced_classification(subset_tr,
                                  subset_te,
                                  clf,
                                  n_round=3,
                                  target_col='median_relevance'):
    print '\n [make_mf_slice]'
    print clf
    mf_tr = np.zeros(len(subset_tr))
    mf_te = np.zeros(len(subset_te))

    #query-slice
    for cur_query in subset_tr.query_stem.value_counts().index:
        mask_tr = subset_tr.query_stem == cur_query
        mask_te = subset_te.query_stem == cur_query

        # build Bow
        vect = CountVectorizer(min_df=1, ngram_range=(1, 2))

        txts = (list((subset_tr[mask_tr]['title_ext']).values) + list(
            (subset_te[mask_te]['title_ext']).values))
        vect.fit(txts)

        X_loc_base = vect.transform(
            list((subset_tr[mask_tr]['title_ext']).values)).todense()
        X_loc_hold = vect.transform(
            list((subset_te[mask_te]['title_ext']).values)).todense()
        y_loc_train = subset_tr[mask_tr][target_col].values
        # intersect terms
        feat_counts = np.array(np.sum(X_loc_base, axis=0))[0] * np.array(
            np.sum(X_loc_hold, axis=0))[0]
        feat_mask = np.where(feat_counts > 0)[0]
        # build final feats matrix
        X_loc_base = np.hstack(
            (X_loc_base[:, feat_mask], subset_tr[mask_tr][feat_list]))
        X_loc_hold = np.hstack(
            (X_loc_hold[:, feat_mask], subset_te[mask_te][feat_list]))

        # metafeatures iterators
        tmp_tr = np.zeros(sum(mask_tr))
        tmp_te = np.zeros(sum(mask_te))

        #print y_loc_train.shape, X_loc_base.shape

        for i in range(n_round):
            kf = KFold(len(y_loc_train),
                       n_folds=2,
                       shuffle=True,
                       random_state=42 + i * 1000)
            for ind_tr, ind_te in kf:
                X_tr = X_loc_base[ind_tr]
                X_te = X_loc_base[ind_te]
                y_tr = y_loc_train[ind_tr]
                y_te = y_loc_train[ind_te]

                clf.fit(X_tr, y_tr)
                tmp_tr[ind_te] += clf.predict(X_te)
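                # each of the 2 folds contributes half a prediction for the
                # hold-out set, so tmp_te accumulates one full hold-out
                # prediction per round (averaged over n_round below)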
                tmp_te += clf.predict(X_loc_hold) * 0.5
        mf_tr[mask_tr.values] = tmp_tr / n_round
        mf_te[mask_te.values] = tmp_te / n_round

    y_valid = subset_tr[target_col].values
    kappa = pykappa.quadratic_weighted_kappa(y_valid, np.round(mf_tr))
    acc = np.mean(y_valid == np.round(mf_tr))
    print '[{}] kappa:{}, acc:{}'.format(i, kappa, acc)
    return (mf_tr, mf_te)
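
# A hedged usage sketch (not part of the original source): the helper above
# expects train/test DataFrames carrying 'query_stem', 'title_ext', the target
# column and a module-level `feat_list` of extra numeric columns; the names
# `train_df`/`test_df` and the SVC settings below are illustrative only.
#
#     from sklearn.svm import SVC
#     mf_tr, mf_te = make_mf_sliced_classification(train_df, test_df,
#                                                  clf=SVC(C=10.0, kernel='rbf'),
#                                                  n_round=3)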
Пример #27
0
def cross_validate(X, y, pca_reduce=True):
    if pca_reduce == True:
        X = pd.DataFrame(dimensionality_reduction(X, y))
    kf = KFold(len(X), n_folds=10, shuffle=True)
    accuracies = []
    conf = []
    precisions = []
    recalls = []
    for train, test in kf:
        X_train, X_test = X.as_matrix()[train], X.as_matrix()[test]
        y_train, y_test = y.as_matrix()[train], y.as_matrix()[test]
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        conf.append(confusion_matrix(y_test, predictions))
        recalls.append(recall_score(y_test, predictions))
        precisions.append(precision_score(y_test, predictions))
        accuracies.append(clf.score(X_test, y_test))
    print 'Accuracy: ' + str(np.average(accuracies))
    print 'Precision: ' + str(np.average(precisions))
    print 'Recall: ' + str(np.average(recalls))
    confusion = np.zeros((2, 2))
    for i in range(len(conf)):
        confusion += conf[i]
    print confusion

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    fpr[0], tpr[0], _ = roc_curve(y_test[:], predictions[:])
    roc_auc[0] = auc(fpr[0], tpr[0])

    plt.figure()
    lw = 2
    plt.plot(fpr[0],
             tpr[0],
             color='darkorange',
             lw=lw,
             label='ROC curve (area = %0.2f)' % roc_auc[0])
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    average_precision = dict()
    precision = dict()
    recall = dict()
    precision[0], recall[0], _ = precision_recall_curve(
        y_test[:], predictions[:])
    average_precision[0] = average_precision_score(y_test[:], predictions[:])
    plt.clf()
    plt.plot(recall[0],
             precision[0],
             lw=lw,
             color='navy',
             label='Precision-Recall curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall example: AUC={0:0.2f}'.format(
        average_precision[0]))
    plt.legend(loc="lower left")
    plt.show()
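
# A hedged usage sketch (not part of the original source): cross_validate
# expects a pandas DataFrame X and a pandas Series y (it calls .as_matrix()),
# plus the surrounding module's dimensionality_reduction helper when
# pca_reduce=True. A minimal call on synthetic data, skipping that PCA step:
#
#     import pandas as pd
#     from sklearn.datasets import make_classification
#     X_arr, y_arr = make_classification(n_samples=300, n_features=10, random_state=0)
#     cross_validate(pd.DataFrame(X_arr), pd.Series(y_arr), pca_reduce=False)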
Пример #28
0
parser.add_argument('-b','--minbin', help='Minimum categorical bin size', type=int, default=1)
parser.add_argument('-cv','--cv', action='store_true')
parser.add_argument('-codetest','--codetest', action='store_true')
parser.add_argument('-getcached', '--getcached', action='store_true')
parser.add_argument('-extra', '--extra', action='store_true')
m_params = vars(parser.parse_args())

# Load data
X, y, X_sub, ids = data.load(m_params)

print("BNP Parabas: classification...\n") 
clf = ExtraTreesRegressor(n_estimators=700, max_features=60, min_samples_split= 4, max_depth=40, n_jobs=-1, min_samples_leaf=2)

if m_params['cv']:
	# do cross validation scoring
	kf = KFold(X.shape[0], n_folds=4, shuffle=True, random_state=1)
	scr = np.zeros([len(kf)])
	oob_pred = np.zeros(X.shape[0])

	for i, (tr_ix, val_ix) in enumerate(kf):
		clf.fit(X[tr_ix], y[tr_ix])
		pred = clf.predict(X[val_ix])
		oob_pred[val_ix] = np.array(pred)
		scr[i] = log_loss(y[val_ix], np.array(pred))
		print('Train score is:', scr[i])
	print(log_loss(y, oob_pred))
	print(oob_pred[1:10])
	oob_filename = '../output/oob_pred_extrees_' + str(np.mean(scr)) + '.p'
	pkl.dump(oob_pred, open(oob_filename, 'wb'))

else:
Пример #29
0
                                              x_valid, "lr", 19)
    return xgb_train, xgb_test, cv_scores


def nbsvm(x_train, y_train, x_valid):
    xgb_train, xgb_test, cv_scores = stacking(lightgbm, x_train, y_train,
                                              x_valid, "nbsvm", 19)
    return xgb_train, xgb_test, cv_scores


import lightgbm
from sklearn.cross_validation import KFold
folds = 5
seed = 2018

kf = KFold(train_x.shape[0], n_folds=folds, shuffle=True, random_state=seed)
lgb_train, lgb_test, m = nbsvm(train_x, train_y, test_x)

score = f1_score(train_y,
                 np.argmax(lgb_train, axis=1),
                 labels=range(0, 19),
                 average='macro')
score = str(score)[:7]
print(score)
# save the prediction probability file
train_prob = pd.DataFrame(lgb_train)
train_prob.columns = [
    "class_prob_%s" % i for i in range(1, lgb_test.shape[1] + 1)
]
train_prob["id"] = list(train_id["id"])
train_prob.to_csv('../sub_prob/train_prob_nblr_cv_%s.csv' % score, index=None)
# -*- coding: utf-8 -*-
"""
Created on Tue Sep  6 10:10:40 2016
"""
import numpy as np
from sklearn.cross_validation import KFold

y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2])

print("kfold")
kf = KFold(9, n_folds=3)
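# note: this KFold is built over only 9 samples, so the loops below touch just
# y[:9]; the StratifiedKFold further down is built from the full 16-element y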
print(len(kf))

for train, test in kf:
    print(train, test)

print()
for train, test in kf:
    print(y[train], y[test])

print()

from sklearn.cross_validation import StratifiedKFold
print("StratifiedKFold")
kf = StratifiedKFold(y, n_folds=3)
print(len(kf))
for train, test in kf:
    print(train, test)

print()
for train, test in kf:
    print(y[train], y[test])
Пример #31
0
# 2. Compute TF-IDF features for all of the texts. Note that in this assignment
# we suggest computing TF-IDF over the whole dataset. With this approach the
# features of the training set end up using information from the test set, but
# that is perfectly legitimate here, since we never use the target values from
# the test set. In practice it is quite common for the features of the test
# objects to be known at training time, so they can be used when fitting the
# algorithm.

vectorizer = TfidfVectorizer()
vectorizer.fit_transform(X)

# 3. Using 5-fold cross-validation, pick the smallest best value of the
# parameter C from the set [10^-5, 10^-4, ..., 10^4, 10^5] for an SVM with a
# linear kernel (kernel='linear'). Set random_state=241 both for the SVM and
# for KFold. Use the fraction of correct answers (accuracy) as the quality
# measure.

grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(y.size, n_folds=5, shuffle=True, random_state=241)
model = SVC(kernel='linear', random_state=241)
gs = grid_search.GridSearchCV(model, grid, scoring='accuracy', cv=cv)
gs.fit(vectorizer.transform(X), y)

score = 0
C = 0
for attempt in gs.grid_scores_:
    if attempt.mean_validation_score > score:
        score = attempt.mean_validation_score
        C = attempt.parameters['C']

# 4. Train the SVM on the whole dataset with the optimal C found in the previous step.

model = SVC(kernel='linear', random_state=241, C=C)
model.fit(vectorizer.transform(X), y)
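
# A hedged aside (not part of the original solution): sklearn.cross_validation
# and sklearn.grid_search were removed in scikit-learn 0.20; on current versions
# the same search can be written with sklearn.model_selection, reusing the
# vectorizer, X and y defined above. KFold is aliased to avoid shadowing the
# deprecated class used earlier in this example.
from sklearn.model_selection import GridSearchCV, KFold as MSKFold
from sklearn.svm import SVC

cv = MSKFold(n_splits=5, shuffle=True, random_state=241)
gs = GridSearchCV(SVC(kernel='linear', random_state=241),
                  {'C': np.power(10.0, np.arange(-5, 6))},
                  scoring='accuracy', cv=cv)
gs.fit(vectorizer.transform(X), y)
best_C = gs.best_params_['C']  # replaces the manual grid_scores_ loop above
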
 metricas = ("Metricas del modelo " + nombreClasificador).capitalize()
 imprimirTextoCentrado(metricas, tamanoConsola)
 mostrarMetricasGenerales(model, x_train, y_train, y_pred_train, "train")
 mostrarMetricasGenerales(model, x_test, y_test, y_pred_test, "test")
 imprimirTextoCentrado("", tamanoConsola, "*")
 imprimirTextoCentrado("Metricas importantes para clasificación",
                       tamanoConsola, "*")
 confusion_matrix_train = mostrarMetricasClasificacion(
     model, x_train, y_train, y_pred_train, "train")
 imprimirTextoCentrado("", tamanoConsola, "#")
 confusion_matrix_test = mostrarMetricasClasificacion(
     model, x_test, y_test, y_pred_test, "test")
 # create a k-fold cross-validation iterator
 # note: by default, the score used is the one returned by the
 #       estimator's scoring method (accuracy)
 cv = KFold(n=len(y_train), n_folds=5, shuffle=True, random_state=0)
 scores = cross_val_score(model, x_train, y_train, cv=cv)
 print("Scores: ", (scores))
 print("Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores),
                                                 sem(scores)))
 print(
     "*******************************************************************")
 ###########################################################################
 if generarCompar:
     # plotting
     # this section plots the obtained results: the emotion classification
     # is plotted against the independent variables, which in this case are
     # the pixels of the face images.
     mostrarGraficacionPrediVsReal(x_train, y_train, y_pred_train, "train",
                                   nombreClasificador)