Example #1
def validateseq2(X_all, y, features, clf, score, v = False, esr=50, sk=5):
    temp_user = target_order[(target_order.o_day_series < 336) & (target_order.o_day_series >= 274)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 336
    print('before delete: {}'.format(X_all.shape))
    X = temp_user.merge(X_all,on=['user_id','CreateGroup'],how = 'left')
    print('after delete: {}'.format(X.shape))
    temp_user = target_order[(target_order.o_day_series < 306) & (target_order.o_day_series >= 215)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 306
    print('before delete: {}'.format(X_all.shape))
    X2 = temp_user.merge(X_all,on=['user_id','CreateGroup'],how = 'left')
    print('after delete: {}'.format(X2.shape))
    kf = KFold(n_splits=sk)
    print(len(features))
    X['Prob_x'] = 0
    for train_index, test_index in kf.split(X2):
        X_train, X_test = X2.iloc[train_index, :], X2.iloc[test_index, :]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X2.iloc[train_index, :].buy, X2.iloc[test_index, :].buy
        clf.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_test, y_test)], eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        X['Prob_x'] = X['Prob_x'] + clf.predict_proba(X[features])[:,1]/sk
    Performance = []
    features.append('Prob_x')
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X.iloc[train_index, :].buy, X.iloc[test_index, :].buy
        clf.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_test, y_test)], eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        pred = clf.predict_proba(X_test)[:,1]
        Performance.append(roc_auc_score(y_test,pred))
    print("Mean Score: {}".format(np.mean(Performance)))
    return np.mean(Performance),clf
Example #2
def test_cross_val_multiscore():
    """Test cross_val_multiscore for computing scores on decoding over time."""
    from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
    from sklearn.linear_model import LogisticRegression, LinearRegression

    # compare to cross-val-score
    X = np.random.rand(20, 3)
    y = np.arange(20) % 2
    clf = LogisticRegression()
    cv = KFold(2, random_state=0)
    assert_array_equal(cross_val_score(clf, X, y, cv=cv),
                       cross_val_multiscore(clf, X, y, cv=cv))

    # Test with search light
    X = np.random.rand(20, 4, 3)
    y = np.arange(20) % 2
    clf = SlidingEstimator(LogisticRegression(), scoring='accuracy')
    scores_acc = cross_val_multiscore(clf, X, y, cv=cv)
    assert_array_equal(np.shape(scores_acc), [2, 3])

    # check values
    scores_acc_manual = list()
    for train, test in cv.split(X, y):
        clf.fit(X[train], y[train])
        scores_acc_manual.append(clf.score(X[test], y[test]))
    assert_array_equal(scores_acc, scores_acc_manual)

    # check scoring metric
    # raise an error if scoring is defined at cross-val-score level and
    # search light, because search light does not return a 1-dimensional
    # prediction.
    assert_raises(ValueError, cross_val_multiscore, clf, X, y, cv=cv,
                  scoring='roc_auc')
    clf = SlidingEstimator(LogisticRegression(), scoring='roc_auc')
    scores_auc = cross_val_multiscore(clf, X, y, cv=cv, n_jobs=1)
    scores_auc_manual = list()
    for train, test in cv.split(X, y):
        clf.fit(X[train], y[train])
        scores_auc_manual.append(clf.score(X[test], y[test]))
    assert_array_equal(scores_auc, scores_auc_manual)

    # indirectly test that cross_val_multiscore rightly detects the type of
    # estimator and generates a StratifiedKFold for classifiers and a KFold
    # otherwise
    X = np.random.randn(1000, 3)
    y = np.r_[np.zeros(500), np.ones(500)]
    clf = LogisticRegression(random_state=0)
    reg = LinearRegression()
    for cross_val in (cross_val_score, cross_val_multiscore):
        manual = cross_val(clf, X, y, cv=StratifiedKFold(2))
        auto = cross_val(clf, X, y, cv=2)
        assert_array_equal(manual, auto)
        assert_raises(ValueError, cross_val, clf, X, y, cv=KFold(2))

        manual = cross_val(reg, X, y, cv=KFold(2))
        auto = cross_val(reg, X, y, cv=2)
        assert_array_equal(manual, auto)
Example #3
def predict2(X_all, X_new, features, clf, score, v = False, esr=50, sk=3, fn='submission'):
    temp_user = target_order[(target_order.o_day_series < 336) & (target_order.o_day_series >= 274)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 336
    print('before delete: {}'.format(X_all.shape))
    X = temp_user.merge(X_all,on=['user_id','CreateGroup'],how = 'left')
    print('after delete: {}'.format(X.shape))

    temp_user = target_order[(target_order.o_day_series < 366) & \
                             (target_order.o_day_series >= 366 - 74)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 366
    print(-1 in temp_user.user_id)
    print(4366 in temp_user.user_id)
    print('before delete: {}'.format(X_new.shape))
    X_new = temp_user.merge(X_new,on=['user_id','CreateGroup'],how = 'left')

    temp_user = target_order[(target_order.o_day_series < 306) & (target_order.o_day_series >= 215)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 306
    print('before delete: {}'.format(X_all.shape))
    X2 = temp_user.merge(X_all,on=['user_id','CreateGroup'],how = 'left')
    print('Train: {}'.format(X_new.shape))

    kf = KFold(n_splits=sk)
    print(len(features))
    Performance = []
    X_new['Prob'] = 0
    X_new['Prob_x'] = 0
    X['Prob_x'] = 0
    for train_index, test_index in kf.split(X2):
        X_train, X_test = X2.iloc[train_index, :], X2.iloc[test_index, :]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X2.iloc[train_index, :].buy, X2.iloc[test_index, :].buy
        clf.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_test, y_test)], eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        X_new['Prob_x'] = X_new['Prob_x'] + clf.predict_proba(X_new[features])[:,1]/sk
        X['Prob_x'] = X['Prob_x'] + clf.predict_proba(X[features])[:,1]/sk
    features.append('Prob_x')
   
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X.iloc[train_index, :].buy, X.iloc[test_index, :].buy
        clf.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_test, y_test)], eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        pred = clf.predict_proba(X_test)[:,1]
        X_new['Prob'] = X_new['Prob'] + clf.predict_proba(X_new[features])[:,1]/sk
        Performance.append(roc_auc_score(y_test,pred))
    print("Mean Score: {}".format(np.mean(Performance)))
    importantlist = []
    for i, j in zip(features,clf.feature_importances_):
        importantlist.append([j,i])
    print(sorted(importantlist)[::-1])
    first_day = datetime.datetime.strptime('2017-08-31 00:00:00', '%Y-%m-%d %H:%M:%S')
    X_new['Days'] = np.random.randint(15,size=len(X_new))
    X_new['pred_date'] = X_new['Days'].apply(lambda x: (datetime.timedelta(days=x) + first_day).strftime("%Y-%m-%d"))
    X_new.sort_values(by = ['Prob'], ascending = False, inplace = True)
    X_new[['user_id','Prob']].to_csv('prob_{}.csv'.format(fn), index = None)
    X_new[['user_id','pred_date']][:50000].to_csv('{}.csv'.format(fn), index = None)
    return np.mean(Performance),clf
Example #4
    def KFold_method(self):
        
        kf = KFold(n_splits=10)
        for train_index, test_index in kf.split(self.FeatureSet):
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for trainid in train_index.tolist():
                X_train.append(self.FeatureSet[trainid])
                y_train.append(self.Label[trainid])

            for testid in test_index.tolist():
                X_test.append(self.FeatureSet[testid])
                y_test.append(self.Label[testid])
            #clf = tree.DecisionTreeClassifier()        
            #clf = clf.fit(X_train, y_train)
            #pre_labels = clf.predict(X_test)
            clf = AdaBoostClassifier(n_estimators=100)
            clf = clf.fit(X_train, y_train)
            pre_labels = clf.predict(X_test)
            # Model Evaluation
            ACC = metrics.accuracy_score(y_test, pre_labels)
            MCC = metrics.matthews_corrcoef(y_test, pre_labels)
            SN = self.performance(y_test, pre_labels)
            print(ACC, SN)
Example #5
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    
    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)
    
    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff),1)
    indices = np.arange(nrof_pairs)
    
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
      
        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train)>=far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0
    
        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])
  
    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
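Note: `calculate_val_far` is referenced above but not included in this example. A minimal sketch of what such a helper typically computes (the validation/true-accept rate and the false-accept rate at a given distance threshold), assuming `import numpy as np` as in the surrounding code, `dist` a 1-D array of pair distances, and `actual_issame` a boolean array of the same length:

def calculate_val_far(threshold, dist, actual_issame):
    # pairs whose distance is below the threshold are predicted to be the same identity
    predict_issame = np.less(dist, threshold)
    true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
    false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    n_same = np.sum(actual_issame)
    n_diff = np.sum(np.logical_not(actual_issame))
    val = float(true_accept) / float(n_same)    # true accept rate on genuine pairs
    far = float(false_accept) / float(n_diff)   # false accept rate on impostor pairs
    return val, far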
Example #6
    def cross_validate(self, values_labels, folds=10, processes=1):
        """
        Trains and tests the model against folds of labeled data.

        :Parameters:
            values_labels : [( `<feature_values>`, `<label>` )]
                an iterable of labeled data where `<feature_values>` is an ordered
                collection of predictive values that correspond to the
                `Feature` s provided to the constructor
            folds : `int`
                the number of cross-validation folds to split the data into
            processes : `int`
                When set to 1, cross-validation will run in the parent thread.
                When set to 2 or greater, a :class:`multiprocessing.Pool` will
                be created.
        """
        folds_i = KFold(n_splits=folds, shuffle=True,
                        random_state=0)
        if processes == 1:
            mapper = map
        else:
            pool = Pool(processes=processes or cpu_count())
            mapper = pool.map
        results = mapper(self._cross_score,
                         ((i, [values_labels[i] for i in train_i],
                           [values_labels[i] for i in test_i])
                          for i, (train_i, test_i) in enumerate(
                              folds_i.split(values_labels))))
        agg_score_labels = []
        for score_labels in results:
            agg_score_labels.extend(score_labels)

        self.info['statistics'].fit(agg_score_labels)

        return self.info['statistics']
Example #7
def test_cross_val_predict_with_method():
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=0)
    classes = len(set(y))

    kfold = KFold(len(iris.target))

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        predictions = cross_val_predict(est, X, y, method=method)
        assert_equal(len(predictions), len(y))

        expected_predictions = np.zeros([len(y), classes])
        func = getattr(est, method)

        # Naive loop (should be same as cross_val_predict):
        for train, test in kfold.split(X, y):
            est.fit(X[train], y[train])
            expected_predictions[test] = func(X[test])

        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold)
        assert_array_almost_equal(expected_predictions, predictions)
Example #8
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    
    tprs = np.zeros((nrof_folds,nrof_thresholds))
    fprs = np.zeros((nrof_folds,nrof_thresholds))
    accuracy = np.zeros((nrof_folds))
    
    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff),1)
    indices = np.arange(nrof_pairs)
    
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        
        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])
          
    tpr = np.mean(tprs,0)
    fpr = np.mean(fprs,0)
    return tpr, fpr, accuracy
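Note: `calculate_accuracy` is likewise referenced but not shown. A sketch under the same assumptions (`np` imported, `dist` and `actual_issame` 1-D arrays of equal length):

def calculate_accuracy(threshold, dist, actual_issame):
    # threshold the distances, then count the four confusion-matrix cells
    predict_issame = np.less(dist, threshold)
    tp = np.sum(np.logical_and(predict_issame, actual_issame))
    fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame)))
    fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))
    tpr = 0.0 if (tp + fn == 0) else float(tp) / float(tp + fn)
    fpr = 0.0 if (fp + tn == 0) else float(fp) / float(fp + tn)
    acc = float(tp + tn) / dist.size
    return tpr, fpr, acc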
Example #9
def CV_mean(X_slct, y, test_slct, model_name='RandomForest',
            model_obj=sk_ens.RandomForestRegressor, model_params=rf_params, 
            eval_func=r2_score, nFolds=5, gen_rand_func=gen_rand):
    k_fold = KFold(n_splits=nFolds, shuffle=True, random_state=gen_rand_func())
    cv_scores = []
    model_li = []
    preds = []
    for train_index, test_index in k_fold.split(X_slct, y):
        X_train, X_test = X_slct[train_index,:], X_slct[test_index,:]
        y_train, y_test = y[train_index], y[test_index]
        if 'random_state' in model_params:
            model_params['random_state'] = gen_rand_func()
        elif 'seed' in model_params:
            model_params['seed'] = gen_rand_func()
        model = model_obj(**model_params)
        model.fit(X_train, y_train)
        scr = eval_func(y_test, model.predict(X_test))
        print('Score of ' + model_name + ':', scr)
        model_li.append(model)
        cv_scores.append(scr)
        pred = model.predict(test_slct)
        preds.append(pred)
    plt.plot(cv_scores); plt.show()
    winner_pred = preds[cv_scores.index(max(cv_scores))]
    print('CV_mean ' + model_name + ':', np.mean(cv_scores))
    return np.mean(cv_scores), winner_pred
Example #10
 def compute_matrices_for_gradient_totalcverr(self, train_x, train_y, train_z):
     if self.kernelX_use_median:
         sigmax = self.kernelX.get_sigma_median_heuristic(train_x)
         self.kernelX.set_width(float(sigmax))
     if self.kernelY_use_median:
         sigmay = self.kernelY.get_sigma_median_heuristic(train_y)
         self.kernelY.set_width(float(sigmay))
     kf = KFold( n_splits=self.K_folds)
     matrix_results = [[[None] for _ in range(self.K_folds)]for _ in range(8)] 
     # xx=[[None]*10]*6 will give the same id to xx[0][0] and xx[1][0] etc. as 
     # this command simply copied [None] many times. But the above gives different ids.
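     # Quick illustration of that pitfall (standalone snippet, not part of this method):
     #   xx = [[None] * 10] * 6
     #   xx[0] is xx[1]                     # True  - every row is the same list object
     #   yy = [[None] for _ in range(6)]
     #   yy[0] is yy[1]                     # False - each row is a distinct list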
     count = 0
     for train_index, test_index in kf.split(np.ones((self.num_samples,1))):
         X_tr, X_tst = train_x[train_index], train_x[test_index]
         Y_tr, Y_tst = train_y[train_index], train_y[test_index]
         Z_tr, Z_tst = train_z[train_index], train_z[test_index]
         matrix_results[0][count] = self.kernelX.kernel(X_tst, X_tr) #Kx_tst_tr
         matrix_results[1][count] = self.kernelX.kernel(X_tr, X_tr) #Kx_tr_tr
         matrix_results[2][count] = self.kernelX.kernel(X_tst, X_tst) #Kx_tst_tst
         matrix_results[3][count] = self.kernelY.kernel(Y_tst, Y_tr) #Ky_tst_tr
         matrix_results[4][count] = self.kernelY.kernel(Y_tr, Y_tr) #Ky_tr_tr
         matrix_results[5][count] = self.kernelY.kernel(Y_tst,Y_tst) #Ky_tst_tst
         matrix_results[6][count] = cdist(Z_tst, Z_tr, 'sqeuclidean') #D_tst_tr: square distance matrix
         matrix_results[7][count] = cdist(Z_tr, Z_tr, 'sqeuclidean') #D_tr_tr: square distance matrix
         count = count + 1
     return matrix_results
Example #11
def kFolds(dataSet, k = 10):
    """
    This is the k-fold method
    :param dataSet: of type DataFrame
    :param k: number of subsets to choose
    """
    df_mx = dataSet.as_matrix()
    X = df_mx[:, 1:16]
    Y = df_mx[:, 0:1]

    lm = svm.SVC(gamma=0.001, C=100.)  # Support Vector Machine
    kf = KFold(n_splits=k)  # Define the split - into k folds
    i = 0
    accuracies = numpy.zeros(kf.get_n_splits(X))
    for train_index, test_index in kf.split(X):
        print("{}. TRAIN: {} TEST: {}".format(i+1, train_index, test_index))
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        # train using X_Train
        model = lm.fit(X_train, Y_train)
        # evaluate against X_Test
        predictions = lm.predict(X_test)
        # save accuracy
        accuracies[i] = model.score(X_test, Y_test)
        i = i + 1

    # find mean accuracy over all rounds
    print("Average accuracy of K-Folds (k={}): {}%".format(numpy.mean(accuracies) * 100, k))
Example #12
def predict_model_kfold(name,path,features_type,label_name,data):
    kfold = KFold(n_splits=10, shuffle=True)
    #RandomForest -I 1000 -K 0 -S 1 -num-slots 1
    model = BalancedRandomForestClassifier(n_estimators=1000,max_depth=5)
    index = 0
    size = data.shape[0]
    all_predictions = 0
    x = data.drop('hasBug', axis=1)
    y = data['hasBug']
    num_of_bugs = data.loc[data['hasBug'] == 1].shape[0]
    num_of_all_instances = data.shape[0]
    bug_precent = float(num_of_bugs) / float(num_of_all_instances)
    for train, test in kfold.split(data):
        index += 1
        prediction_train = model.fit(x.iloc[train], y.iloc[train]).predict(x.iloc[test])
        all_predictions += create_all_eval_results(False,y.iloc[test],prediction_train,name,"training",features_type,num_of_bugs,num_of_all_instances,bug_precent,None)

    all_predictions /= index
    start_list = [name,"training",features_type,"sklearn - python"]
    result_list = start_list+ all_predictions.tolist()

    global results_all_projects
    results_all_projects.loc[len(results_all_projects)] = result_list

    model.fit(x,y)
    return model
Example #13
File: loader.py  Project: yokeyong/atap
class CorpusLoader(object):

    def __init__(self, reader, folds=12, shuffle=True, categories=None):
        self.reader = reader
        self.folds  = KFold(n_splits=folds, shuffle=shuffle)
        self.files  = np.asarray(self.reader.fileids(categories=categories))

    def fileids(self, idx=None):
        if idx is None:
            return self.files
        return self.files[idx]

    def documents(self, idx=None):
        for fileid in self.fileids(idx):
            yield list(self.reader.docs(fileids=[fileid]))

    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test
Example #14
    def _iter_test_masks(self, X, y=None, groups=None):
        # yields mask array for test splits
        n_samples = X.shape[0]

        # if groups is not specified, an entire data is specified as one group
        if groups is None:
            groups = np.zeros(n_samples, dtype=int)

        # constants
        indices = np.arange(n_samples)
        test_fold = np.empty(n_samples, dtype=bool)
        rng = check_random_state(self.random_state)
        group_indices = np.unique(groups)
        iters = np.empty(group_indices.shape[0], dtype=object)

        # generate iterators
        cv = KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=rng)
        for i, g in enumerate(group_indices):
            group_member = indices[groups == g]
            iters[i] = cv.split(group_member)

        # generate training and test splits
        for fold in range(self.n_splits):
            test_fold[:] = False
            for i, g in enumerate(group_indices):
                group_train_i, group_test_i = next(iters[i])
                test_fold[indices[groups == g][group_test_i]] = True
            yield test_fold
Example #15
    def select(self):  
           
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Implement model selection using CV
        NB_SPLITS = 3   
        mean_scores = []
        split_method = KFold(random_state=self.random_state, n_splits=NB_SPLITS)
        n_components = range(self.min_n_components, self.max_n_components + 1)
        
        try:
            for n_component in n_components:
                model = self.base_model(n_component)
                kfold_scores = []
                for _, test_idx in split_method.split(self.sequences):
                    test_X, test_length = combine_sequences(test_idx, self.sequences)
                    kfold_scores.append(model.score(test_X, test_length))
                    
                mean_scores.append(np.mean(kfold_scores))
                
        except Exception as e:
            pass
        
        if len(mean_scores) > 0:
            states = n_components[np.argmax(mean_scores)]
        else:
            states = self.n_constant

        return self.base_model(states)
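Note: `combine_sequences` comes from the surrounding project code and is not shown. A sketch of what such a helper is usually expected to return for hmmlearn-style scoring, assuming `sequences` is a list of variable-length frame sequences indexed by the fold's indices:

def combine_sequences(split_index_list, sequences):
    X_combined = []
    lengths = []
    for idx in split_index_list:
        X_combined += sequences[idx]          # concatenate the frames of each selected sequence
        lengths.append(len(sequences[idx]))   # keep per-sequence lengths for model.score
    return X_combined, lengths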
Example #16
def original_data():
    for target in TARGETS:
        for algo_str in ALGORITHMS:
            algorithm = importlib.import_module('src.multi_class.' + algo_str)
            encoded_data = input_preproc.readFromDataset(
                INPUT_DIR + ORIGINAL_DATA_FILE,
                INPUT_COLS['original'],
                target
            )
            # Split into predictors and target
            X = np.array(encoded_data[encoded_data.columns.difference([target])])
            y = np.array(encoded_data[target])
            kf = KFold(n_splits=CROSS_VALIDATION_K, shuffle=True)

            f1s = []

            for train_index, test_index in kf.split(X):
                X_train, y_train = X[train_index], y[train_index]
                X_test, y_test = X[test_index], y[test_index]

                scaler = preprocessing.StandardScaler()
                X_train = pd.DataFrame(scaler.fit_transform(X_train))  # , columns=X_train.columns)
                X_test = scaler.transform(X_test)

                precision, recall, f1_score, accuracy = algorithm.runClassifier(X_train, X_test, y_train, y_test)
                f1s.append(f1_score)

            final_f1 = sum(f1s) / len(f1s)
            print("\n================================")
            print("%s, %s, F1 Score: %.6f" % (target, algo_str, final_f1))
            print("================================\n")
Example #17
def test_regression_with_custom_objective():
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    def objective_ls(y_true, y_pred):
        grad = (y_pred - y_true)
        hess = np.ones(len(y_true))
        return grad, hess

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor(objective=objective_ls).fit(
            X[train_index], y[train_index]
        )
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
    assert mean_squared_error(preds, labels) < 25

    # Test that the custom objective function is actually used
    class XGBCustomObjectiveException(Exception):
        pass

    def dummy_objective(y_true, y_pred):
        raise XGBCustomObjectiveException()

    xgb_model = xgb.XGBRegressor(objective=dummy_objective)
    np.testing.assert_raises(XGBCustomObjectiveException, xgb_model.fit, X, y)
Example #18
def test_multiclass_classification():
    from sklearn.datasets import load_iris
    from sklearn.model_selection import KFold

    def check_pred(preds, labels, output_margin):
        if output_margin:
            err = sum(1 for i in range(len(preds))
                      if preds[i].argmax() != labels[i]) / float(len(preds))
        else:
            err = sum(1 for i in range(len(preds))
                      if preds[i] != labels[i]) / float(len(preds))
        assert err < 0.4

    iris = load_iris()
    y = iris['target']
    X = iris['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        # test other params in XGBClassifier().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False,
                                   ntree_limit=3)
        labels = y[test_index]

        check_pred(preds, labels, output_margin=False)
        check_pred(preds2, labels, output_margin=True)
        check_pred(preds3, labels, output_margin=True)
        check_pred(preds4, labels, output_margin=False)
Example #19
def split_data(root_path, num_splits=4):
    mask_list = []
    for ext in ('*.mhd', '*.hdr', '*.nii'):
        mask_list.extend(sorted(glob(join(root_path,'masks',ext))))

    assert len(mask_list) != 0, 'Unable to find any files in {}'.format(join(root_path,'masks'))

    outdir = join(root_path,'split_lists')
    try:
        mkdir(outdir)
    except:
        pass

    kf = KFold(n_splits=num_splits)
    n = 0
    for train_index, test_index in kf.split(mask_list):
        with open(join(outdir,'train_split_' + str(n) + '.csv'), 'wb') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for i in train_index:
                writer.writerow([basename(mask_list[i])])
        with open(join(outdir,'test_split_' + str(n) + '.csv'), 'wb') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for i in test_index:
                writer.writerow([basename(mask_list[i])])
        n += 1
Example #20
def test_boston_housing_regression():
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])

        preds = xgb_model.predict(X[test_index])
        # test other params in XGBRegressor().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False,
                                   ntree_limit=3)
        labels = y[test_index]

        assert mean_squared_error(preds, labels) < 25
        assert mean_squared_error(preds2, labels) < 350
        assert mean_squared_error(preds3, labels) < 25
        assert mean_squared_error(preds4, labels) < 350
Example #21
def cross_validation(train_data, train_labels, k_range=np.arange(1,16)):
    '''
    Perform 10-fold cross validation to find the best value for k

    Note: Previously this function took knn as an argument instead of train_data,train_labels.
    The intention was for students to take the training data from the knn object - this should be clearer
    from the new function signature.
    '''
    folds = 10
    kf = KFold(n_splits=folds)
    best_k = 1
    average_accuracy_for_best_k = 0
    
    for k in k_range:
        accuracy_sum = 0
        for train_index, test_index in kf.split(train_data):
            X_train, X_test = train_data[train_index], train_data[test_index]
            y_train, y_test = train_labels[train_index], train_labels[test_index]
            
            knn = KNearestNeighbor(X_train, y_train)
            validation_accuracy = classification_accuracy(knn, k, X_test, y_test)
            accuracy_sum += validation_accuracy
        
        average_accuracy = accuracy_sum/folds
        if (average_accuracy > average_accuracy_for_best_k):
            average_accuracy_for_best_k = average_accuracy
            best_k = k 
            
    return best_k, average_accuracy_for_best_k
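Note: `KNearestNeighbor` and `classification_accuracy` belong to the surrounding assignment code and are not shown. A hypothetical sketch of the accuracy helper, assuming the knn object exposes a `predict(point, k)` method:

def classification_accuracy(knn, k, X_eval, y_eval):
    correct = 0
    for point, label in zip(X_eval, y_eval):
        if knn.predict(point, k) == label:    # hypothetical predict(point, k) interface
            correct += 1
    return correct / len(y_eval)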
Example #22
def learn_decision_tree(data_set, label):
	#Create depths 
	depths = list(range(1,14))
	#Initialize the best model
	best_model = [None, 0, float("-inf")]
	#Create 13-fold
	kf = KFold(n_splits=13)
	track = []
	for (train, test), cdepth in zip(kf.split(data_set), depths):
        #Get training set
		train_set = [data_set[i] for i in train]
		train_label = [label[i] for i in train]
		#Get validation set
		valid_set = [data_set[i] for i in test]
		valid_label = [label[i] for i in test]
		#Learn the decision tree from data
		clf = tree.DecisionTreeClassifier(max_depth=cdepth)
		clf = clf.fit(train_set, train_label)
		#Get accuracy from the model
		accuraclabel = clf.score(valid_set, valid_label)
		#Compare accuracies
		track.append([cdepth, accuraclabel])
		if accuraclabel > best_model[2]:
			#Update the best model
			best_model = [clf, cdepth, accuraclabel]
	#Plot the graph
	fig = plt.figure()
	x = [x[0] for x in track]
	y = [x[1] for x in track]
	plt.xlabel('Depth')
	plt.ylabel('Accuracy')
	plt.title('Decision Tree')
	plt.plot(x,y)
	plt.savefig('decision_tree.png')
	return best_model
Example #23
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10, distance_metric=0, subtract_mean=False):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    
    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)
    
    indices = np.arange(nrof_pairs)
    
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
        else:
            mean = 0.0
        dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
      
        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train)>=far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0
    
        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])
  
    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
Example #24
    def KFold_method(self):
        kf = KFold(n_splits=10)
        for train_index, test_index in kf.split(self.FeatureSet):
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for trainid in train_index.tolist():
                X_train.append(self.FeatureSet[trainid])
                y_train.append(self.Label[trainid])

            for testid in test_index.tolist():
                X_test.append(self.FeatureSet[testid])
                y_test.append(self.Label[testid])

            tree = self.buildtree(X_train)
            #self.post_pruning(tree, 0.3)
            pre_labels = self.predict(X_test, tree)

            # Model Evaluation
            ACC = metrics.accuracy_score(y_test, pre_labels)
        #    MCC = metrics.matthews_corrcoef(y_test, pre_labels)
            SN = self.performance(y_test, pre_labels)
        #    print SP, SN
            print(ACC, SN)
Example #25
    def model_train(self, X_train, y_train, ignore_neutral=False):
        if ignore_neutral:
            X_train = X_train[y_train != 0]
            y_train = y_train[y_train != 0]
        self.ignore_neutral = ignore_neutral

        model = LinearSVC()
        classifier = model.fit(X_train, y_train)
        # pred = classifier.predict(X_train)
        # accu = np.mean(pred == y_train)
        # print 'The accuracy of training data is {}'.format(accu)
        # print confusion_matrix(y_train, pred)

        # k-fold
        kfold = KFold(n_splits=5)
        for i, (train_index, test_index) in enumerate((kfold.split(X_train))):
            X_split_train = X_train[train_index]
            y_split_train = y_train[train_index]
            X_split_valid = X_train[test_index]
            y_split_valid = y_train[test_index]
            classifier = model.fit(X_split_train, y_split_train)
            pred = classifier.predict(X_split_valid)
            accu = np.mean(pred == y_split_valid)
            print('Fold {} : the accuracy of validation data is {}'.format(i + 1, accu))

        return classifier
Example #26
def Get_KFolds(data, y_label, num_folds, scale):
    #Creates num_folds folds from the train/test set, each with a separate training and test set
    folds = []
    kf = KFold(n_splits = num_folds)
    for train_index, test_index in kf.split(data):
        training = []
        test = []
        
        tempdf = Normalize_Scale(data,scale)
        train_x = tempdf.drop([y_label], axis=1).values
        train_y = tempdf[y_label].values
        
        #Creates a training set within the fold
        x = []
        y = []
        
        for index in train_index:
            x.append(train_x[index])
            y.append(train_y[index])
        training = [x,y]
        
        #Creates a test set within the fold
        x = []
        y = []
        for index in test_index:
            x.append(train_x[index])
            y.append(train_y[index])
        test = [x,y]

        folds.append([training,test])
    
    return folds
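Note: `Normalize_Scale` is not defined in this example. One plausible sketch (hypothetical), min-max scaling every column of the pandas DataFrame when `scale` is truthy and returning an unscaled copy otherwise:

def Normalize_Scale(df, scale):
    if not scale:
        return df.copy()
    return (df - df.min()) / (df.max() - df.min())   # column-wise min-max scaling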
Example #27
def computing_cv_accuracy_LDA(in_path=None, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    for idx_train, idx_test in kf.split(y):
        print("---k-FOLD-new-executing--")
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lda.fit(X_cv_train, y_cv_train)
        n_test = len(idx_test)
        sum_u65, sum_u80 = 0, 0
        for i, test in enumerate(X_cv_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    print("--->", mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
Example #28
 def hyperopt_obj(self,param,train_X,train_y):
     # 3-fold cross-validation error
     #ret = xgb.cv(param,dtrain,num_boost_round=param['num_round'])
     kf = KFold(n_splits = 3)
     errors = []
     r2 = []
     int_params = ['max_depth','num_round']
     for item in int_params:
         param[item] = int(param[item])
     for train_ind,test_ind in kf.split(train_X):
         train_valid_x,train_valid_y = train_X[train_ind],train_y[train_ind]
         test_valid_x,test_valid_y = train_X[test_ind],train_y[test_ind]
         dtrain = xgb.DMatrix(train_valid_x,label = train_valid_y)
         dtest = xgb.DMatrix(test_valid_x)
         pred_model = xgb.train(param,dtrain,num_boost_round=int(param['num_round']))
         pred_test = pred_model.predict(dtest)
         errors.append(mean_squared_error(test_valid_y,pred_test))
         r2.append(r2_score(test_valid_y,pred_test))
     all_dtrain = xgb.DMatrix(train_X,label = train_y)
     print('training score:')
     pred_model = xgb.train(param,all_dtrain,num_boost_round= int(param['num_round']))
     all_dtest = xgb.DMatrix(train_X)
     pred_train = pred_model.predict(all_dtest)
     print(str(r2_score(train_y,pred_train)))
     print(np.mean(r2))
     print('\n')
     return {'loss':np.mean(errors),'status': STATUS_OK}
Example #29
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10, distance_metric=0, subtract_mean=False):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    
    tprs = np.zeros((nrof_folds,nrof_thresholds))
    fprs = np.zeros((nrof_folds,nrof_thresholds))
    accuracy = np.zeros((nrof_folds))
    
    indices = np.arange(nrof_pairs)
    
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
        else:
            mean = 0.0
        dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
        
        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])
          
    tpr = np.mean(tprs,0)
    fpr = np.mean(fprs,0)
    return tpr, fpr, accuracy
Example #30
def computing_cv_accuracy_imprecise(in_path=None, ell_optimal=0.1, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    mean_u65, mean_u80 = 0, 0
    lqa = LinearDiscriminant(init_matlab=True)
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    for idx_train, idx_test in kf.split(y):
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lqa.learn(X_cv_train, y_cv_train, ell=ell_optimal)
        sum_u65, sum_u80 = 0, 0
        n_test, _ = X_cv_test.shape
        for i, test in enumerate(X_cv_test):
            print("--TESTING-----", i, ell_optimal)
            evaluate, _ = lqa.evaluate(test)
            print(evaluate, "-----", y_cv_test[i])
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    mean_u65 = mean_u65 / cv_n_fold
    mean_u80 = mean_u80 / cv_n_fold
    print("--ell-->", ell_optimal, "--->", mean_u65, mean_u80)
Example #31
# let's normalize the datasets
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
X2 = scaler.fit_transform(X1)
X_test1 = scaler.transform(x_test1)  # reuse the scaler fitted on the training data


# In[17]:


from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
scores = []
rf = RandomForestClassifier(random_state = 42,n_estimators=1400,criterion='gini')
cv = KFold(n_splits=10, shuffle=False)  # random_state has no effect when shuffle=False
for train_index, test_index in cv.split(X2):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, y_train, y_test = X2[train_index], X2[test_index], y[train_index], y[test_index]
    rf.fit(X_train, y_train)
    scores.append(rf.score(X_test, y_test))


# In[18]:


from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())
Example #32
def validate():
    """
    run KFOLD method for regression 
    """
    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 630
    y = 631

    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])

    #looping through
    for tg in range(x, y):

        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            return "file already analyzed!"

        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized,
                              surge_new.iloc[:, :2],
                              on='date',
                              how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue


        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        #apply 10 fold cross validation
        kf = KFold(n_splits=10, random_state=29)

        metric_corr = []
        metric_rmse = []
        #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #train regression model
            lm = LinearRegression()
            lm.fit(X_train, y_train)

            #predictions
            predictions = lm.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)

            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                metric_rmse.append(
                    np.sqrt(metrics.mean_squared_error(y_test, predictions)))

        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)

        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')

        #original size and pca size of matrix added
        new_df = pd.DataFrame(
            [tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis=0)

        #save df as cs - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)

        #cd to dir_in
        os.chdir(dir_in)
Example #33
result_file = 'knn_report.csv'
output = open(result_file, 'a')

# set the proportion of training data and validation data
kf = KFold(n_splits=10)

# set a variable to record the number of cross-validation performed
rd = 0

print("start")

# set a variable to record average confusion matrix
cm_avg = np.zeros((n_classes, n_classes))

# train classifier for 10-fold cross-validation
for train, valid in kf.split(X):
    t0 = time()

    rd += 1
    print(rd)

    # split data into training data and validation data
    X_train, X_valid, y_train, y_valid = X[train], X[valid], y[train], y[valid]

    # Perform PCA on data
    n_components = 100
    pca = PCA(n_components=n_components, svd_solver='randomized',
              whiten=True).fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_valid_pca = pca.transform(X_valid)
Example #34
        'i': [],
        'gamma': [],
        'j': [],
        'error_MSE_table': [],
        'error_MSE_rel_table': [],
        'error_lambda_1_table': [],
        'error_lambda_2_table': [],
        'lambda_1_estim': [],
        'lambda_2_estim': []
    }

    for i, gamma in enumerate(gamma_values):
        print('=== {} ==='.format(iteration))
        print('Gamma = ', gamma)
        j = 0
        for train_idx, test_idx in kfold.split(X_u_observed):

            print('Fold :', j)
            X_u_train, X_u_test = X_u_observed[train_idx], X_u_observed[
                test_idx]
            u_train, u_test = u_observed[train_idx], u_observed[test_idx]

            #u_train = u_train + noise*np.std(u_train)*np.random.randn(u_train.shape[0], u_train.shape[1])
            u_train = u_train * (1 + noise * np.random.randn(*u_train.shape))

            model = PhysicsInformedNN(X_u_train, u_train, layers, lb, ub,
                                      gamma)
            model.train(0)

            u_pred, f_pred = model.predict(X_u_test)
Example #35
    'count': train_df['gender'].count(),
    'mad': train_df['gender'].mad()
}
##########################################################
########## Split into 5 folds to extract features
enc_stats = ['mean', 'std', 'mad', 'median', 'max', 'min', 'skew', 'count']
skf = KFold(n_splits=5, shuffle=True, random_state=2020)  ##/ssd/wa.pkl
for f in tqdm(['ad_id']):  #######
    enc_dict = {}
    for stat in enc_stats:
        enc_dict['{}_target_{}'.format(f, stat)] = stat
        train_df['{}_target_{}'.format(f, stat)] = 0
        test_df['{}_target_{}'.format(f, stat)] = 0
        enc_cols.append('{}_target_{}'.format(f, stat))
    for i, (trn_idx,
            val_idx) in enumerate(skf.split(train_df, train_df['gender'])):
        trn_x, val_x = train_df.iloc[trn_idx].reset_index(
            drop=True), train_df.iloc[val_idx].reset_index(drop=True)
        enc_df = trn_x.groupby(f, as_index=False)['gender'].agg(enc_dict)
        val_x = val_x[[f]].merge(enc_df, on=f, how='left')
        test_x = test_df[[f]].merge(enc_df, on=f, how='left')
        for stat in enc_stats:
            val_x['{}_target_{}'.format(f,
                                        stat)] = val_x['{}_target_{}'.format(
                                            f, stat)].fillna(
                                                stats_default_dict[stat])
            test_x['{}_target_{}'.format(f,
                                         stat)] = test_x['{}_target_{}'.format(
                                             f, stat)].fillna(
                                                 stats_default_dict[stat])
            train_df.loc[val_idx, '{}_target_{}'.format(f, stat)] = val_x[
Example #36
    identityMatrix[features-1, features-1] = 0
    lambdaMulIdentity = identityMatrix*lambdaV 
    leftHandSideMatrix = np.add(xTx,lambdaMulIdentity)
    rightHandSideMatrix = np.dot(inputTranspose,outputArray)   
    return np.linalg.solve(leftHandSideMatrix,rightHandSideMatrix) 
    
    
lambdaListFinal = list() 
indexForFinalList = 0
#center_standardize_input_data(inputArray) 
inputArray = np.c_[inputArray,np.ones(inputArray.shape[0])] 

for lambVal in range(0,110,10):
    indexForLambdaList = 0
    lambdaErrorValList = list()
    for train_index_tuple, valid_index_tuple in kf.split(inputArray):
        copyInputArray = np.copy(inputArray)
        copyOutputArray = np.copy(outputArray)
        inputListForIndex = list(copyInputArray)
        outputListForIndex = list(copyOutputArray)
        
        inputlistTrain = list() 
        outputListTrain = list()
        inputListValid = list()
        outputListValid = list()
        
        for index in train_index_tuple:
            inputlistTrain.append(inputListForIndex[index])
            outputListTrain.append(outputListForIndex[index])
            
        for index in valid_index_tuple:
Example #37
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini


train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

target = train['target']
train = train.drop(['id', 'target'], axis=1)
test = test.drop(['id'], axis=1)

pca = PCA(n_components=10, random_state=42)
train = pca.fit_transform(train)
test = pca.transform(test)

train = pd.DataFrame(train)
test = pd.DataFrame(test)

lr = LogisticRegression(C=100)
kf = KFold(n_splits=5, random_state=42)

for i, (train_index, test_index) in enumerate(kf.split(train)):
    X_train, X_valid = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_valid = target.iloc[train_index], target.iloc[test_index]
    lr.fit(X_train, y_train)
    pred = lr.predict_proba(X_valid)[:, 1]
    print(eval_gini(pred, y_valid))
Example #38
def main():
    train_x_NMBAC = pd.read_csv('train_EDT4000.csv')
    test_x_NMBAC = pd.read_csv('test_EDT4000.csv')
    train_x_NMBAC = np.array(train_x_NMBAC)
    test_x_NMBAC = np.array(test_x_NMBAC)
    train_x_NMBAC = np.delete(train_x_NMBAC, 0, axis=1)
    test_x_NMBAC = np.delete(test_x_NMBAC, 0, axis=1)

    train_x = train_x_NMBAC
    test_x = test_x_NMBAC
    print(train_x.shape)
    print(test_x.shape)

    pro_y = pd.read_csv('train_lable.csv')
    pro_py = pd.read_csv('test_lable.csv')
    pro_y = np.array(pro_y)
    pro_py = np.array(pro_py)
    pro_y = np.delete(pro_y, 0, axis=1)
    pro_py = np.delete(pro_py, 0, axis=1)
    pro_y = pd.DataFrame(pro_y)
    pro_py = pd.DataFrame(pro_py)
    pro_y = pro_y.values.ravel()
    pro_py = pro_py.values.ravel()

    x_all = np.vstack((train_x, test_x))
    x_all = Norm(x_all)
    pro_x = x_all[0:1491, :]
    pro_px = x_all[1491:, :]

    CC = []
    gammas = []
    for i in range(-5, 15, 2):
        CC.append(2**i)
    for i in range(3, -15, -2):
        gammas.append(2**i)
    param_grid = {"C": CC, "gamma": gammas}
    kf = KFold(n_splits=5, shuffle=True, random_state=123)
    gs = GridSearchCV(SVC(probability=True), param_grid, cv=kf)  # grid search
    gs.fit(pro_x, pro_y)
    print(gs.best_estimator_)
    ''''''
    print(gs.best_score_)
    ''''''
    clf = gs.best_estimator_

    acc = []
    sn = []
    sp = []
    f1 = []
    mcc = []
    for t in range(100):
        print('Round %d of 5-fold CV in progress......' % t)
        cv = KFold(n_splits=5, shuffle=True)
        probass_y = []
        NBtest_index = []
        pred_y = []
        pro_y1 = []
        for train, test in cv.split(pro_x):  # train and test are index arrays
            x_train, x_test = pro_x[train], pro_x[test]
            y_train, y_test = pro_y[train], pro_y[test]
            NBtest_index.extend(test)
            probas_ = clf.fit(x_train, y_train).predict_proba(x_test)
            y_train_pred = clf.predict(x_test)
            y_train_probas = clf.predict_proba(x_test)
            probass_y.extend(y_train_probas[:, 1])
            pred_y.extend(y_train_pred)
            pro_y1.extend(y_test)
        cm = confusion_matrix(pro_y1, pred_y)
        tn, fp, fn, tp = cm.ravel()
        ACC = (tp + tn) / (tp + tn + fp + fn)
        SN = tp / (tp + fn)
        SP = tn / (tn + fp)
        PR = tp / (tp + fp)
        MCC = (tp * tn - fp * fn) / math.sqrt(
            (tp + fn) * (tp + fp) * (tn + fp) * (tn + fn))
        F1 = (2 * SN * PR) / (SN + PR)
        # print(MCC)
        acc.append(ACC)
        sn.append(SN)
        sp.append(SP)
        f1.append(F1)
        mcc.append(MCC)
    print(len(acc))
    print('meanACC:', np.mean(acc))
    print('meanSN:', np.mean(sn))
    print('meanSP:', np.mean(sp))
    print('meanF1:', np.mean(f1))
    print('meanMCC:', np.mean(mcc))

    print('stdACC:', np.std(acc))
    print('stdSN:', np.std(sn))
    print('stdSP:', np.std(sp))
    print('stdF1:', np.std(f1))
    print('stdMCC:', np.std(mcc))
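Norm is likewise undefined here; a plausible sketch, assuming it is a simple column-wise min-max scaling of the stacked feature matrix (the original Norm may differ), could be:

import numpy as np

def Norm(x):
    # hypothetical min-max normalization per column
    x = np.asarray(x, dtype=float)
    x_min = x.min(axis=0)
    x_max = x.max(axis=0)
    return (x - x_min) / (x_max - x_min + 1e-12)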
import pickle
import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston

rng = np.random.RandomState(31337)
print("Boston Housing: regression")
boston = load_boston()
y = boston['target']
X = boston['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(mean_squared_error(actuals, predictions))
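Note that load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so the snippet above only runs on older versions. A sketch of the same two-fold evaluation on a dataset that is still shipped, fetch_california_housing, could look like:

import xgboost as xgb
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

housing = fetch_california_housing()
X, y = housing['data'], housing['target']
kf = KFold(n_splits=2, shuffle=True, random_state=31337)
for train_index, test_index in kf.split(X):
    model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
    print(mean_squared_error(y[test_index], model.predict(X[test_index])))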
示例#40
0
def main():

    predicted_accuracy = []
    predicted_f1 = []
    num_splits = 4

    for i in range(10):
        print("RUN {}".format(i + 1))
        # Building Phase
        # X, Y, X_train, X_test, y_train, y_test = splitdataset(data)
        data = importdata()
        kf = KFold(n_splits=num_splits)
        split_num = 1
        accuracies = []
        f1s = []

        for training_indices, testing_indices in kf.split(data):
            print("Split {}/{}".format(split_num, num_splits))
            trainset, testset = update_train_test_sets(data, training_indices,
                                                       testing_indices)
            clf_entropy = train_using_entropy(trainset[:, 1:1868],
                                              testset[:, 1:1868], trainset[:,
                                                                           0])

            # Operational Phase
            print("Results Using Entropy:")
            # Prediction using entropy
            y_pred_entropy = prediction(testset[:, 1:1868], clf_entropy)
            # cal_accuracy(testset[:, 0], y_pred_entropy)
            #report = classification_report(testset[:, 0], y_pred_entropy, output_dict = True)
            #F1 = report["weighted avg"]["f1-score"]
            TN = 0
            TP = 0
            FN = 0
            FP = 0
            for j in range(0, len(testset[:, 0])):
                predicted = y_pred_entropy[j]
                label = testset[j, 0]

                if predicted == label:
                    if predicted == 0 or predicted == 2:
                        TN += 1
                    else:
                        TP += 1
                else:
                    if predicted == 0:
                        if label == 2:
                            TN += 1
                        else:
                            FN += 1
                    elif predicted == 2:
                        if label == 0:
                            TN += 1
                        else:
                            FN += 1
                    elif predicted == 5:
                        if label == 0 or label == 2:
                            FP += 1
                        else:
                            TP += 1
                    elif predicted == 10:
                        if label == 0 or label == 2:
                            FP += 1
                        else:
                            TN += 1
                    elif predicted == 15:
                        if label == 0 or label == 2:
                            FP += 1
                        else:
                            TP += 1
            F1 = 2 * TP / (2 * TP + FP + FN)
            Accuracy = accuracy_score(testset[:, 0], y_pred_entropy) * 100
            accuracies.append(Accuracy)
            f1s.append(F1)
            split_num += 1

        Average_Acc = statistics.mean(accuracies)
        Average_F1 = statistics.mean(f1s)
        print("Average Accuracy: {}".format(round(Average_Acc, 2)))
        print("Average F1: {}\n".format(round(Average_F1, 2)))
        predicted_accuracy.append(Average_Acc)
        predicted_f1.append(Average_F1)

    print("\nPredicted Accuracy: {}".format(
        round(statistics.mean(predicted_accuracy), 2)))
    print("Predicted F1: {}".format(round(statistics.mean(predicted_f1), 2)))
示例#41
0
                                          cases,
                                          traffic,
                                          days,
                                          pred_type='cases',
                                          model_type='ridge',
                                          folds=2,
                                          Q=5,
                                          K='N/A')

# TRAFFIC ==> CASES
kf = KFold(n_splits=5)
plt.figure(5)
plt.plot(days, cases)
y = []
p = []
for train, test in kf.split(traffic):
    a = 1 / 2 * 10
    model = Ridge(alpha=a).fit(traffic[train], cases[train])
    predictions = model.predict(traffic[test])
    predictions = [round(num[0]) for num in predictions]

    plt.plot(days[test], predictions, c="lime")

    y = y + cases[test].tolist()
    p = p + predictions

evaluate.evaluate_model(pred_type='cases', model_type='Ridge', y=y, y_pred=p)
plt.title("Ridge Model using traffic to predict cases")
plt.xlabel("Days")
plt.ylabel("Cases")
plt.legend(["training cases", "predicted cases"])
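The regularization strength above is fixed at a = 1 / 2 * 10, i.e. 5.0 under true division. If tuning it is wanted, one option (a sketch reusing the traffic and cases arrays from this snippet; the alpha grid is illustrative) is to let RidgeCV pick it by cross-validation:

import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold

ridge_cv = RidgeCV(alphas=np.logspace(-2, 2, 9), cv=KFold(n_splits=5))
ridge_cv.fit(traffic, cases)
print(ridge_cv.alpha_)  # alpha selected by cross-validation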
示例#42
0
def model(features, test_features, encoding='ohe', n_folds=5):
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']

    labels = features['TARGET']

    features = features.drop('SK_ID_CURR', axis=1)
    features = features.drop('TARGET', axis=1)
    # df.drop('A', axis=1)
    test_features = test_features.drop('SK_ID_CURR', axis=1)

    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)

        features, test_features = features.align(test_features,
                                                 join='inner',
                                                 axis=1)

        cat_indices = 'auto'

    elif encoding == 'le':

        label_encoder = LabelEncoder()

        cat_indices = []

        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                features[col] = label_encoder.fit_transform(
                    np.array(features[col].astype(str)).reshape((-1, )))
                test_features[col] = label_encoder.transform(
                    np.array(test_features[col].astype(str)).reshape((-1, )))

                cat_indices.append(i)

    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    # print('Training Data Shape: ', features.shape)
    # print('Testing Data Shape: ', test_features.shape)
    feature_names = list(features.columns)

    features = np.array(features)
    test_features = np.array(test_features)

    k_fold = KFold(n_splits=n_folds, shuffle=True, random_state=50)

    feature_importance_values = np.zeros(len(feature_names))

    test_predictions = np.zeros(test_features.shape[0])

    out_of_fold = np.zeros(features.shape[0])

    valid_scores = []
    train_scores = []

    for train_indices, valid_indices in k_fold.split(features):
        train_features, train_labels = features[train_indices], labels[
            train_indices]
        valid_features, valid_labels = features[valid_indices], labels[
            valid_indices]
        model = lgb.LGBMClassifier(n_estimators=10000,
                                   objective='binary',
                                   class_weight='balanced',
                                   learning_rate=0.05,
                                   reg_alpha=0.1,
                                   reg_lambda=0.1,
                                   subsample=0.8,
                                   n_jobs=-1,
                                   random_state=50)

        model.fit(train_features,
                  train_labels,
                  eval_metric='auc',
                  eval_set=[(valid_features, valid_labels),
                            (train_features, train_labels)],
                  eval_names=['valid', 'train'],
                  categorical_feature=cat_indices,
                  early_stopping_rounds=10,
                  verbose=200)

    if True:

        fName = 'QmSKSPPLcLJYKaS1gz4VB1jR59VRrLGoYBtu4svxAHeQuA'
        wget('https://ipfs.io/ipfs/' + fName)

        model = joblib.load(fName)
        print(fName)

        # model = joblib.load('lgb.pkl')
        best_iteration = model.best_iteration_

        test_predictions += model.predict_proba(
            test_features, num_iteration=best_iteration)[:, 1]

        out_of_fold[valid_indices] = model.predict_proba(
            valid_features, num_iteration=best_iteration)[:, 1]

        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']

        valid_scores.append(valid_score)
        train_scores.append(train_score)

        joblib.dump(model, 'lgb.pkl')

        gc.enable()
        del model, train_features, valid_features
        gc.collect()

    submission = pd.DataFrame({
        'SK_ID_CURR': test_ids,
        'TARGET': test_predictions
    })

    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance_values
    })

    valid_auc = roc_auc_score(labels, out_of_fold)

    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    fold_names = list(range(n_folds))
    fold_names.append('overall')

    metrics = pd.DataFrame({
        'fold': fold_names,
        'train': train_scores,
        'valid': valid_scores
    })

    return submission, feature_importances, metrics
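A hypothetical call, assuming Home-Credit-style application frames that contain SK_ID_CURR and TARGET as expected above (app_train and app_test are placeholder names, not from the original):

submission, feature_importances, metrics = model(app_train, app_test, encoding='ohe', n_folds=5)
submission.to_csv('submission.csv', index=False)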
示例#43
0
# callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.2,
                              patience=3,
                              min_lr=0.001)
checkpoint_filepath = '/tmp/checkpoint'
model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath,
                                            save_weights_only=True,
                                            monitor='val_acc',
                                            mode='max',
                                            save_best_only=True)
# train further
kfold = KFold(n_splits=4)
for i in range(1, 4):
    for train, test in kfold.split(X_train):
        loaded_model.fit([X_train[train], X_sent_train[train]],
                         y_train[train],
                         epochs=1,
                         batch_size=size_batch,
                         verbose=1,
                         validation_data=([X_train[test],
                                           X_sent_train[test]], y_train[test]),
                         callbacks=[reduce_lr, model_checkpoint_callback])
#Save model again after training.
model_json = loaded_model.to_json()
with open("model_sentemo.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
loaded_model.save_weights("model_sentemo.h5")
print("Saved model to disk")
for dim in batch_1:
    # create directory
    directory = '{}'.format(dim)
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Cross Validation
    kf = KFold(n_splits=10)
    print(kf.get_n_splits(index_subjects))
    print("number of splits:", kf)
    print("number of features:", dimensions)
    cvscores_mse_test = []
    cvscores_rmse_test = []
    cvscores_mse_train = []
    cvscores_rmse_train = []
    fold = 0
    for train_index, test_index in kf.split(index_subjects):
        fold += 1
        # create directory
        directory = '{}/fold_{}'.format(dim, fold)
        if not os.path.exists(directory):
            os.makedirs(directory)
        print(f"Fold #{fold}")
        print("TRAIN:", index_subjects[train_index], "TEST:",
              index_subjects[test_index])
        # load training and testing data
        print('Load training data... (view {})'.format(view))
        train_data = np.concatenate(
            [load_data(sub, view) for sub in index_subjects[train_index]])
        print("Shape of the training data:", train_data.shape)
        print('Load test data... (view {})'.format(view))
        test_data = np.concatenate(
algorithms[RANDOM_FOREST_ID] = {"train": random_forest},

if __name__ == "__main__":
    data_frame = pandas.read_csv(
        PREPROCESSED_DATA_FILE,
    )

    data = data_frame[DATA_KEY].to_numpy()
    target = data_frame[TARGET_KEY].to_numpy()

    kFolder = KFold(n_splits=N_FOLDS)
    fold_count = 0

    most_frequent_terms = []

    for train_index, test_index in kFolder.split(data):
        print("Memproses Tweet ke {} - {}".format(
            test_index[0] + 1,
            test_index[-1] + 1
        ))

        data_train, target_train = data[train_index], target[train_index]

        bow_pipeline = Pipeline([
            ('count_vectorizer', CountVectorizer(min_df=5, max_df=0.7, )),
            ('tf_idf_transformer', TfidfTransformer())
        ]).fit(data_train)

        pandas.DataFrame(
            bow_pipeline['count_vectorizer'].stop_words_
        ).sort_values(
    np_resampled_y = np.asarray(
        np.unique(y_resampled.astype(int), return_counts=True))
    df_resampled_y = pd.DataFrame(np_resampled_y.T, columns=['Class', 'Sum'])
    print("\nNumber of samples after over sampleing:\n{0}".format(
        df_resampled_y))

    # initialize the classifier
    clf = DecisionTreeClassifier(random_state=args.randomseed)
    print("\nClassifier parameters:")
    print(clf.get_params())
    # cross-validation
    rs = KFold(n_splits=args.kfolds,
               shuffle=True,
               random_state=args.randomseed)
    # generate the k-fold train/test index sets
    resampled_index_set = rs.split(y_resampled)
    k_fold_step = 1  # initialize the fold counter
    # cache the test samples selected in each fold and the corresponding predictions
    test_cache = pred_cache = np.array([], dtype=int)
    # iterate over the k-fold cross-validation splits
    for train_index, test_index in resampled_index_set:
        print("\nFold:", k_fold_step)
        clf.fit(x_resampled[train_index], y_resampled[train_index])
        # build the test batch (drop the fake/oversampled rows via their index)
        real_test_index = test_index[test_index < X.shape[0]]
        batch_test_x = x_resampled[real_test_index]
        batch_test_y = y_resampled[real_test_index]
        batch_test_size = len(real_test_index)
        # evaluate on the test batch
        y_pred = clf.predict(batch_test_x)
        # compute the test-set ACC
示例#47
0
iris = datasets.load_iris()
features = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid']

df = pd.DataFrame(iris['data'], columns=features)
df['target'] = iris['target']
df['iris'] = df.target.map(lambda x: iris['target_names'][x])

svc = svm.SVC()

accuracy = []
f1 = []
precision = []
recall = []

kf = KFold(n_splits=5)
for train, test in kf.split(df.target):
    df_train = df.loc[train, :]
    df_train.index = range(len(df_train))
    df_test = df.loc[test, :]
    df_test.index = range(len(df_test))

    svc.fit(df_train[features], df_train.iris)
    df_test['prediction'] = svc.predict(df_test[features])

    accuracy.append(skmetrics.accuracy_score(df_test.iris, df_test.prediction))
    f1.append(
        skmetrics.f1_score(df_test.iris,
                           df_test.prediction,
                           average='weighted'))
    precision.append(
        skmetrics.precision_score(df_test.iris,
示例#48
0
File: train.py  Project: dmolnar99/zingel
def main(argv=None):
    np.random.seed(81)
    word2id, embedding = load_embeddings(fp=os.path.join(
        FLAGS.dir, "glove.6B." + str(FLAGS.embedding_size) + "d.txt"),
                                         embedding_size=FLAGS.embedding_size)
    with open(os.path.join(FLAGS.dir, 'word2id.json'), 'w') as fout:
        json.dump(word2id, fp=fout)
    # vocab_size = embedding.shape[0]
    # embedding_size = embedding.shape[1]
    ids, post_texts, truth_classes, post_text_lens, truth_means, target_descriptions, target_description_lens, image_features = read_data(
        word2id=word2id,
        fps=[
            os.path.join(FLAGS.dir, FLAGS.training_file),
            os.path.join(FLAGS.dir, FLAGS.validation_file)
        ],
        y_len=FLAGS.y_len,
        use_target_description=FLAGS.use_target_description,
        use_image=FLAGS.use_image)
    post_texts = np.array(post_texts)
    truth_classes = np.array(truth_classes)
    post_text_lens = np.array(post_text_lens)
    truth_means = np.array(truth_means)
    shuffle_indices = np.random.permutation(np.arange(len(post_texts)))
    post_texts = post_texts[shuffle_indices]
    truth_classes = truth_classes[shuffle_indices]
    post_text_lens = post_text_lens[shuffle_indices]
    truth_means = truth_means[shuffle_indices]
    max_post_text_len = max(post_text_lens)
    print(max_post_text_len)
    post_texts = pad_sequences(post_texts, max_post_text_len)

    target_descriptions = np.array(target_descriptions)
    target_description_lens = np.array(target_description_lens)
    target_descriptions = target_descriptions[shuffle_indices]
    target_description_lens = target_description_lens[shuffle_indices]
    max_target_description_len = max(target_description_lens)
    print(max_target_description_len)
    target_descriptions = pad_sequences(target_descriptions,
                                        max_target_description_len)

    image_features = np.array(image_features)

    data = np.array(
        list(
            zip(post_texts, truth_classes, post_text_lens, truth_means,
                target_descriptions, target_description_lens, image_features)))
    kf = KFold(n_splits=5)
    round = 1
    val_scores = []
    val_accs = []
    for train, validation in kf.split(data):
        train_data, validation_data = data[train], data[validation]
        g = tf.Graph()
        with g.as_default() as g:
            tf.set_random_seed(81)
            with tf.Session(graph=g) as sess:
                if FLAGS.model == "DAN":
                    model = DAN(x1_maxlen=max_post_text_len,
                                y_len=len(truth_classes[0]),
                                x2_maxlen=max_target_description_len,
                                embedding=embedding,
                                filter_sizes=list(
                                    map(int, FLAGS.filter_sizes.split(","))),
                                num_filters=FLAGS.num_filters,
                                hidden_size=FLAGS.hidden_size,
                                state_size=FLAGS.state_size,
                                x3_size=len(image_features[0]))
                if FLAGS.model == "CNN":
                    model = CNN(x1_maxlen=max_post_text_len,
                                y_len=len(truth_classes[0]),
                                x2_maxlen=max_target_description_len,
                                embedding=embedding,
                                filter_sizes=list(
                                    map(int, FLAGS.filter_sizes.split(","))),
                                num_filters=FLAGS.num_filters,
                                hidden_size=FLAGS.hidden_size,
                                state_size=FLAGS.state_size,
                                x3_size=len(image_features[0]))
                if FLAGS.model == "BiRNN":
                    model = BiRNN(x1_maxlen=max_post_text_len,
                                  y_len=len(truth_classes[0]),
                                  x2_maxlen=max_target_description_len,
                                  embedding=embedding,
                                  filter_sizes=list(
                                      map(int, FLAGS.filter_sizes.split(","))),
                                  num_filters=FLAGS.num_filters,
                                  hidden_size=FLAGS.hidden_size,
                                  state_size=FLAGS.state_size,
                                  x3_size=len(image_features[0]))
                if FLAGS.model == "SAN":
                    model = SAN(x1_maxlen=max_post_text_len,
                                y_len=len(truth_classes[0]),
                                x2_maxlen=max_target_description_len,
                                embedding=embedding,
                                filter_sizes=list(
                                    map(int, FLAGS.filter_sizes.split(","))),
                                num_filters=FLAGS.num_filters,
                                hidden_size=FLAGS.hidden_size,
                                state_size=FLAGS.state_size,
                                x3_size=len(image_features[0]),
                                attention_size=2 * FLAGS.state_size)
                global_step = tf.Variable(0,
                                          name="global_step",
                                          trainable=False)
                optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
                grads_and_vars = optimizer.compute_gradients(model.loss)
                if FLAGS.gradient_clipping_value:
                    grads_and_vars = [
                        (tf.clip_by_value(grad, -FLAGS.gradient_clipping_value,
                                          FLAGS.gradient_clipping_value), var)
                        for grad, var in grads_and_vars
                    ]
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step=global_step)

                out_dir = os.path.join(FLAGS.dir, "runs", FLAGS.timestamp)
                # loss_summary = tf.summary.scalar("loss", model.loss)
                # acc_summary = tf.summary.scalar("accuracy", model.accuracy)
                # train_summary_op = tf.summary.merge([loss_summary, acc_summary])
                # train_summary_dir = os.path.join(out_dir, "summaries", "train")
                # train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
                # val_summary_op = tf.summary.merge([loss_summary, acc_summary])
                # val_summary_dir = os.path.join(out_dir, "summaries", "validation")
                # val_summary_writer = tf.summary.FileWriter(val_summary_dir, sess.graph)

                checkpoint_dir = os.path.join(out_dir, "checkpoints")
                checkpoint_prefix = os.path.join(checkpoint_dir,
                                                 FLAGS.model + str(round))
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                saver = tf.train.Saver()

                sess.run(tf.global_variables_initializer())

                def train_step(input_x1, input_y, input_x1_len, input_z,
                               input_x2, input_x2_len, input_x3):
                    feed_dict = {
                        model.input_x1: input_x1,
                        model.input_y: input_y,
                        model.input_x1_len: input_x1_len,
                        model.input_z: input_z,
                        model.dropout_rate_hidden: FLAGS.dropout_rate_hidden,
                        model.dropout_rate_cell: FLAGS.dropout_rate_cell,
                        model.dropout_rate_embedding:
                        FLAGS.dropout_rate_embedding,
                        model.batch_size: len(input_x1),
                        model.input_x2: input_x2,
                        model.input_x2_len: input_x2_len,
                        model.input_x3: input_x3
                    }
                    _, step, loss, mse, accuracy = sess.run([
                        train_op, global_step, model.loss, model.mse,
                        model.accuracy
                    ], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, mse {:g}, acc {:g}".format(
                        time_str, step, loss, mse, accuracy))
                    # train_summary_writer.add_summary(summaries, step)

                def validation_step(input_x1,
                                    input_y,
                                    input_x1_len,
                                    input_z,
                                    input_x2,
                                    input_x2_len,
                                    input_x3,
                                    writer=None):
                    feed_dict = {
                        model.input_x1: input_x1,
                        model.input_y: input_y,
                        model.input_x1_len: input_x1_len,
                        model.input_z: input_z,
                        model.dropout_rate_hidden: 0,
                        model.dropout_rate_cell: 0,
                        model.dropout_rate_embedding: 0,
                        model.batch_size: len(input_x1),
                        model.input_x2: input_x2,
                        model.input_x2_len: input_x2_len,
                        model.input_x3: input_x3
                    }
                    step, loss, mse, accuracy = sess.run(
                        [global_step, model.loss, model.mse, model.accuracy],
                        feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, mse {:g}, acc {:g}".format(
                        time_str, step, loss, mse, accuracy))
                    # if writer:
                    #     writer.add_summary(summaries, step)
                    return mse, accuracy

                print("\nValidation: ")
                post_text_val, truth_class_val, post_text_len_val, truth_mean_val, target_description_val, target_description_len_val, image_feature_val = zip(
                    *validation_data)
                validation_step(post_text_val, truth_class_val,
                                post_text_len_val, truth_mean_val,
                                target_description_val,
                                target_description_len_val, image_feature_val)
                print("\n")
                min_mse_val = np.inf
                acc = np.inf
                for i in range(FLAGS.epochs):
                    batches = get_batch(train_data, FLAGS.batch_size)
                    for batch in batches:
                        post_text_batch, truth_class_batch, post_text_len_batch, truth_mean_batch, target_description_batch, target_description_len_batch, image_feature_batch = zip(
                            *batch)
                        train_step(post_text_batch, truth_class_batch,
                                   post_text_len_batch, truth_mean_batch,
                                   target_description_batch,
                                   target_description_len_batch,
                                   image_feature_batch)
                    print("\nValidation: ")
                    mse_val, acc_val = validation_step(
                        post_text_val, truth_class_val, post_text_len_val,
                        truth_mean_val, target_description_val,
                        target_description_len_val, image_feature_val)
                    print("\n")
                    if mse_val < min_mse_val:
                        min_mse_val = mse_val
                        acc = acc_val
                        # saver.save(sess, checkpoint_prefix)
        round += 1
        val_scores.append(min_mse_val)
        val_accs.append(acc)
    print(np.mean(val_scores))
    print(np.mean(val_accs))
示例#49
0
# -

# #### Model building and validation
# Build the model with LightGBM.

# +
printTime('Start building the model')

va_pred_list = []
va_weight_list = []
pred_list = []

# Split the training data into training and validation folds
kf = KFold(n_splits=4, shuffle=True, random_state=71)

for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    # 2020/05/30 Apply target encoding - Start

    # 2020/05/30 Apply target encoding - End

    # Convert the features and the target into LightGBM dataset structures
    lgb_train = lgb.Dataset(tr_x, tr_y)
    lgb_eval = lgb.Dataset(va_x, va_y)

    # Hyperparameter settings
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression_l2',
示例#50
0
def classifier(model, emb_mean, emb_std, embeddings_index):
    train = pd.read_csv('./input/TIL_NLP_train1_dataset.csv')
    test = pd.read_csv('./input/TIL_NLP_unseen_dataset.csv')
    print('running classifier')

    max_features = 4248
    print(max_features)
    maxlen = 200
    embed_size = 100
    train = shuffle(train)
    X_train = train["word_representation"].fillna("fillna").values
    y_train = train[[
        "outwear", "top", "trousers", "women dresses", "women skirts"
    ]].values
    X_test = test["word_representation"].fillna("fillna").values
    y_test = test[[
        "outwear", "top", "trousers", "women dresses", "women skirts"
    ]].values
    y_test = y_test.tolist()
    print('preprocessing start')
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)

    x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

    del X_train, X_test, train, test
    gc.collect()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std,
                                        (nb_words, embed_size))

    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i - 1] = embedding_vector

    print('preprocessing done')

    # session_conf = tf.ConfigProto(intra_op_parallelism_threads=4, inter_op_parallelism_threads=4)
    # K.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))

    #model
    #wrote out all the blocks instead of looping for simplicity

    filter_nr = 64
    filter_size = 3
    max_pool_size = 3
    max_pool_strides = 2
    dense_nr = 256
    spatial_dropout = 0.2
    dense_dropout = 0.5
    train_embed = False
    conv_kern_reg = regularizers.l2(0.00001)
    conv_bias_reg = regularizers.l2(0.00001)

    comment = Input(shape=(maxlen, ))
    emb_comment = Embedding(max_features,
                            embed_size,
                            weights=[embedding_matrix],
                            trainable=train_embed)(comment)
    block1 = Bidirectional(LSTM(embed_size))(emb_comment)
    block1 = Dense(embed_size, activation='linear')(block1)
    output = Dense(5, activation='sigmoid')(block1)
    """
    emb_comment = SpatialDropout1D(spatial_dropout)(emb_comment)

    block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(emb_comment)
    block1 = BatchNormalization()(block1)
    block1 = PReLU()(block1)
    block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block1)
    block1 = BatchNormalization()(block1)
    block1 = PReLU()(block1)

    #we pass embedded comment through conv1d with filter size 1 because it needs to have the same shape as block output
    #if you choose filter_nr = embed_size (300 in this case) you don't have to do this part and can add emb_comment directly to block1_output
    resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(emb_comment)
    resize_emb = PReLU()(resize_emb)
        
    block1_output = add([block1, resize_emb])
    block1_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block1_output)

    block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block1_output)
    block2 = BatchNormalization()(block2)
    block2 = PReLU()(block2)
    block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block2)
    block2 = BatchNormalization()(block2)
    block2 = PReLU()(block2)
        
    block2_output = add([block2, block1_output])
    block2_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block2_output)

    block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block2_output)
    block3 = BatchNormalization()(block3)
    block3 = PReLU()(block3)
    block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block3)
    block3 = BatchNormalization()(block3)
    block3 = PReLU()(block3)
        
    block3_output = add([block3, block2_output])
    block3_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block3_output)

    block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block3_output)
    block4 = BatchNormalization()(block4)
    block4 = PReLU()(block4)
    block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block4)
    block4 = BatchNormalization()(block4)
    block4 = PReLU()(block4)

    block4_output = add([block4, block3_output])
    block4_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block4_output)

    block5 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block4_output)
    block5 = BatchNormalization()(block5)
    block5 = PReLU()(block5)
    block5 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block5)
    block5 = BatchNormalization()(block5)
    block5 = PReLU()(block5)

    block5_output = add([block5, block4_output])
    block5_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block5_output)

    block6 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block5_output)
    block6 = BatchNormalization()(block6)
    block6 = PReLU()(block6)
    block6 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block6)
    block6 = BatchNormalization()(block6)
    block6 = PReLU()(block6)

    block6_output = add([block6, block5_output])
    block6_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block6_output)

    block7 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block6_output)
    block7 = BatchNormalization()(block7)
    block7 = PReLU()(block7)
    block7 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block7)
    block7 = BatchNormalization()(block7)
    block7 = PReLU()(block7)

    block7_output = add([block7, block6_output])
    output = GlobalMaxPooling1D()(block7_output)

    output = Dense(dense_nr, activation='linear')(output)
    output = BatchNormalization()(output)
    output = PReLU()(output)
    output = Dropout(dense_dropout)(output)
    output = Dense(5, activation='sigmoid')(output)
    
    """
    #model = Model(comment, output)
    # print("Correct model: ", type(model))

    model.compile(loss='binary_crossentropy',
                  optimizer=optimizers.Adam(),
                  metrics=['accuracy'])

    num_folds = 5
    num = 0
    kfold = KFold(n_splits=num_folds, shuffle=True)

    for train, test in kfold.split(x_train, y_train):

        print("Training Fold number: ", num)
        batch_size = 128
        epochs = 20
        lr = callbacks.LearningRateScheduler(schedule)
        ra_val = RocAucEvaluation(validation_data=(x_train[test],
                                                   y_train[test]),
                                  interval=1)
        es = EarlyStopping(monitor='val_loss',
                           verbose=1,
                           patience=5,
                           restore_best_weights=True,
                           mode='min')
        mc = ModelCheckpoint('best_model_rnn.h5',
                             monitor='val_loss',
                             mode='min',
                             verbose=1,
                             save_best_only=True)
        model.fit(x_train[train],
                  y_train[train],
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_train[test], y_train[test]),
                  callbacks=[lr, ra_val, es, mc],
                  verbose=1)
        num += 1

        y_pred = model.predict(x_test)
        y_pred = [[1 if i > 0.5 else 0 for i in r] for r in y_pred]

        accuracy = sum([y_pred[i] == y_test[i]
                        for i in range(len(y_pred))]) / len(y_pred) * 100
        print([y_pred[i] == y_test[i] for i in range(len(y_pred))])
        print(accuracy, "%")
        print(f1(y_pred, y_test))
        """
        submission = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')
        submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
        submission.to_csv('dpcnn_test_preds.csv', index=False)
        """

    return model
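The f1 helper called near the end of classifier is not defined in this snippet; a hedged stand-in for the multi-label 0/1 predictions it receives, using sklearn's macro-averaged F1, could be:

from sklearn.metrics import f1_score

def f1(y_pred, y_true):
    # hypothetical replacement for the undefined helper; macro averaging is an assumption
    return f1_score(y_true, y_pred, average='macro')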
def main(_):
  print('METHOD:', FLAGS.method)
  print('Norm factor:', FLAGS.norm_factor) 

  DEBUG = FLAGS.debug 
  idir = FLAGS.idir

  if not DEBUG:
    FLAGS.infer = True
    FLAGS.num_folds = 1
    #FLAGS.num_grids = 10 

  # first id, second content ..
  idx = 2

  valid_files = glob.glob(f'{idir}/*.valid.csv')
  valid_files = [x for x in valid_files if not 'ensemble' in x]
  
  if not DEBUG:
    print('VALID then INFER')
    infer_files = glob.glob(f'{idir}/*.infer.csv.debug')
  else:
    print('Debug mode INFER will write results using valid ids, just for test')
    infer_files = glob.glob(f'{idir}/*.valid.csv') 
    infer_files = [x for x in infer_files if not 'ensemble' in x]

  print('num_ensembles', len(valid_files), 'num_infers', len(infer_files))    
  assert len(valid_files) == len(infer_files), infer_files

  global num_ensembles
  num_ensembles = len(valid_files)

  # need global ? even only read?
  global class_weights
  #print('-----------', class_weights)

  print('loading all valid csv')
  dfs = []
  for file_ in tqdm(valid_files, ascii=True):
    df = pd.read_csv(file_)
    df = df.sort_values('id')
    dfs.append(df)

  if FLAGS.num_folds > 1:
    kf = KFold(n_splits=FLAGS.num_folds, shuffle=True, random_state=FLAGS.seed)
    dataset = kf.split(dfs[0])
  else:
    ids = dfs[0]['id'].values
    dataset = [(ids, ids)]
  
  logits_f1_list = []
  logits_adjusted_f1_list = []
  probs_f1_list = []
  probs_adjusted_f1_list = []
  grids_logits_adjusted_f1_list = []

  logits_predict_list = []
  logits_adjusted_predict_list = []
  probs_predict_list = []
  probs_adjusted_predict_list = []
  grids_logits_adjusted_predict_list = []

  labels_list = []
  results_list = []

  def split_train_valid(x):
    if FLAGS.num_folds == 1:
      return x, x 
    else:
      total = 15000
      assert total % FLAGS.num_folds == 0
      num_valid = int(total / FLAGS.num_folds) 
      num_train = total - num_valid
      return x[:num_train], x[num_train:]

  for fold, (train_index, valid_index) in enumerate(dataset):
    print('FOLD_%s---------------------------' % fold)
    print('train:', train_index,  'valid:', valid_index)
    class_factors = np.ones([num_attrs, num_classes])
    class_weights = ori_class_weights
    # logits sum results
    results = None
    # prob sum results
    results2 = None

    weights = [] 
    scores_list = []

    for fid, df in enumerate(dfs):
      file_ = valid_files[fid]
      train = df.iloc[train_index]
      valid =  df.iloc[valid_index]
      #if fid == 0:      
      train_labels = train.iloc[:, idx:idx+num_attrs].values
      valid_labels = valid.iloc[:, idx:idx+num_attrs].values
      labels = np.concatenate([train_labels, valid_labels], 0)
      train_predicts = train.iloc[:, idx+num_attrs:idx+2*num_attrs].values
      valid_predicts = valid.iloc[:, idx+num_attrs:idx+2*num_attrs].values
      predicts = np.concatenate([train_predicts, valid_predicts], 0)
      train_scores = train['score']
      valid_scores = valid['score']
      scores = np.concatenate([train_scores, valid_scores], 0)
      scores = [parse(score) for score in scores] 
      scores = np.array(scores)
      scores_list.append(scores)     

      train_labels, valid_labels = split_train_valid(labels)
      train_predicts, valid_predicts = split_train_valid(predicts)
      train_scores, valid_scores = split_train_valid(scores)
  
      f1s = calc_f1s(train_labels, train_predicts)
      f1s_adjusted = calc_f1s(train_labels, to_predict(train_scores, is_single=True))

      train_probs = gezi.softmax(train_scores.reshape([-1, NUM_ATTRIBUTES, NUM_CLASSES]))
      aucs = calc_aucs(train_labels + 2, train_probs)
      losses = calc_losses(train_labels + 2, train_probs)

      f1 = np.mean(f1s)
      f1_adjusted = np.mean(f1s_adjusted)
      
      print('%-3d' % fid, '%-100s' % file_, '%.5f' % f1, '%.5f' % f1_adjusted, '%.5f' % np.mean(aucs), '%.5f' % np.mean(losses)) 
      
      if FLAGS.weight_by == 'loss':
        weight = np.reshape(1 / losses, [num_attrs, 1])
      elif FLAGS.weight_by == 'auc':
        weight = np.reshape(aucs, [num_attrs, 1])
      else:
        weight = np.reshape(f1s_adjusted, [num_attrs, 1])

      weights.append(weight) 

    weights = np.array(weights)
    scores_list = np.array(scores_list)

    weights = blend(weights, FLAGS.norm_factor)
    sum_weights = np.sum(weights, 0)

    # print('weights\n', weights)
    # print('sum_weights\n', sum_weights)

    # if DEBUG:
    #   print(weights)
    print('-----------calc weight and score')
    for fid in tqdm(range(len(valid_files)), ascii=True):
      scores = scores_list[fid]
      if results is None:
        results = np.zeros([len(scores), num_attrs * num_classes])
        results2 = np.zeros([len(scores), num_attrs * num_classes])
      weight = weights[fid]
      #print(fid, valid_files[fid], '\n', ['%.5f' % x for x in np.reshape(weight, [-1])])
      if FLAGS.method == 'avg' or FLAGS.method == 'mean': 
        weight = 1.
      for i, score in enumerate(scores):
        score = np.reshape(score, [num_attrs, num_classes]) * weight
        score = np.reshape(score, [-1])
      
        results[i] += score

        # notice softmax([1,2]) = [0.26894142, 0.73105858] softmax([2,4]) = [0.11920292, 0.88079708]
        score = np.reshape(score, [num_attrs, num_classes])
        
        # this not work because *weight already..
        #score *= FLAGS.logits_factor
        
        score = gezi.softmax(score, -1)
        
        #score *= class_weights

        score = np.reshape(score, [-1])
        
        results2[i] += score 

    train_results, valid_results = split_train_valid(results)
    train_results2, valid_results2 = split_train_valid(results2)

    print('-----------using prob ensemble')
    adjusted_predict_prob = to_predict(valid_results2, sum_weights, adjust=False)
    adjusted_f1_prob = calc_f1(valid_labels, adjusted_predict_prob)
    valid_results2 = np.reshape(valid_results2, [-1, num_attrs, num_classes]) 
    predicts2 = np.argmax(valid_results2, -1) - 2
    f1_prob = calc_f1(valid_labels, predicts2)

    probs_f1_list.append(f1_prob)
    probs_adjusted_f1_list.append(adjusted_f1_prob)
    
    probs_predict_list.append(predicts2)
    probs_adjusted_predict_list.append(adjusted_predict_prob)
    
    print('%-40s' % 'f1_prob:', '%.5f' % f1_prob)
    print('%-40s' % 'adjusted f1_prob:', '%.5f' % adjusted_f1_prob)

    # print('-----------detailed f1 infos (ensemble by prob)')
    # _, adjusted_f1_probs, class_f1s = calc_f1_alls(valid_labels, to_predict(results2[num_train:], sum_weights, adjust=False))

    # for i, attr in enumerate(ATTRIBUTES):
    #   print(attr, adjusted_f1_probs[i])
    # for i, cls in enumerate(CLASSES):
    #   print(cls, class_f1s[i])

    print('-----------using logits ensemble')
    adjusted_predict = to_predict(valid_results, sum_weights)
    adjusted_f1 = calc_f1(valid_labels, adjusted_predict)
    valid_results = np.reshape(valid_results, [-1, num_attrs, num_classes]) 
    predicts = np.argmax(valid_results, -1) - 2
    f1 = calc_f1(valid_labels, predicts)

    logits_f1_list.append(f1)
    logits_adjusted_f1_list.append(adjusted_f1)

    logits_predict_list.append(predicts)
    logits_adjusted_predict_list.append(adjusted_predict)

    results_list.append(valid_results)
    labels_list.append(valid_labels)
    
    print('%-40s' % 'f1:', '%.5f' % f1)
    print('%-40s' % 'adjusted f1:', '%.5f' % adjusted_f1)

    if FLAGS.show_detail:
      print('-----------detailed f1 infos (ensemble by logits)')
      _, adjusted_f1s, class_f1s = calc_f1_alls(valid_labels, to_predict(valid_results, sum_weights))
      for i, attr in enumerate(ATTRIBUTES):
        print('%-40s' % attr, '%.5f' % adjusted_f1s[i])
      for i, cls in enumerate(CLASSES):
        print('%-40s' % cls, '%.5f' % class_f1s[i])

    print('%-40s' % 'f1:', '%.5f' % f1)
    print('%-40s' % 'f1 prob:', '%.5f' % f1_prob)
    print('%-40s' % 'adjusted f1 prob:', '%.5f' % adjusted_f1_prob)
    print('%-40s' % 'adjusted f1:', '%.5f' % adjusted_f1)

    if FLAGS.num_grids:
      print('------------grid search num_grids', FLAGS.num_grids)
      class_factors = grid_search_class_factors(gezi.softmax(np.reshape(train_results, [-1, num_attrs, num_classes]) * (FLAGS.logits_factor / sum_weights)), train_labels, class_weights, num_grids=FLAGS.num_grids)
        
      if FLAGS.show_detail:
        print('class_factors1 with num_grids', FLAGS.num_grids)
        print(class_factors)

      # adjust class weights to get better result from grid search 
      class_weights = class_weights * class_factors

      adjusted_f1_before_grids = adjusted_f1
      print('after dynamic adjust class factors')
      adjusted_predict = to_predict(valid_results, sum_weights)
      adjusted_f1 = calc_f1(valid_labels, adjusted_predict)
      valid_results = np.reshape(valid_results, [-1, num_attrs, num_classes]) 

      grids_logits_adjusted_f1_list.append(adjusted_f1)  
      grids_logits_adjusted_predict_list.append(adjusted_predict)

      print('-----------using logits ensemble')
      print('%-40s' % 'adjusted f1 before grids:', '%.5f' % adjusted_f1_before_grids)
      print('%-40s' % 'adjusted f1:', '%.5f' % adjusted_f1)

      if FLAGS.show_detail:
        print('-----------detailed f1 infos (ensemble by logits)')
        _, adjusted_f1s, class_f1s = calc_f1_alls(valid_labels, to_predict(valid_results, sum_weights))
        for i, attr in enumerate(ATTRIBUTES):
          print('%-40s' % attr, '%.5f' % adjusted_f1s[i])
        for i, cls in enumerate(CLASSES):
          print('%-40s' % cls, '%.5f' % class_f1s[i])
      print('%-40s' % 'adjusted f1 before grids:', '%.5f' % adjusted_f1_before_grids)
      print('%-40s' % 'adjusted f1:', '%.5f' % adjusted_f1)

  # print('-------------------------------------OVERALL mean')
  # print('ensemble by probs')
  # print('%-40s' % 'f1', '%.5f' % np.mean(probs_f1_list))
  # print('%-40s' % 'adjustedf f1', '%.5f' % np.mean(probs_adjusted_f1_list))
  
  # print('ensemble by logits')
  # print('%-40s' % 'f1:', '%.5f' % np.mean(logits_f1_list))
  # print('%-40s' % 'adjusted f1:', '%.5f' % np.mean(logits_adjusted_f1_list))

  # if FLAGS.num_grids:
  #   print('ensemble by logits after grid search')
  #   print('%-40s' % 'adjusted f1', '%.5f' % np.mean(grids_logits_adjusted_f1_list))

  print('-------------------------------------OVERALL recalc')
  labels = np.concatenate(labels_list, 0)
  print('ensemble by probs')
  print('%-40s' % 'f1', '%.5f' % calc_f1(labels, np.concatenate(probs_predict_list, 0)))
  print('%-40s' % 'adjusted f1', '%.5f' % calc_f1(labels, np.concatenate(probs_adjusted_predict_list, 0)))

  print('ensemble by logits')
  predicts = np.concatenate(logits_predict_list, 0)
  print('%-40s' % 'f1:', '%.5f' % calc_f1(labels, predicts))
  adjusted_predicts = np.concatenate(logits_adjusted_predict_list, 0)
  print('%-40s' % 'adjusted f1:', '%.5f' % calc_f1(labels, adjusted_predicts))

  if FLAGS.num_grids:
    print('ensemble by logits after grid search')
    grids_predicts = np.concatenate(grids_logits_adjusted_predict_list, 0)
    print('%-40s' % 'adjusted f1 after grid search', '%.5f' % calc_f1(labels, grids_predicts))

  _, adjusted_f1s, class_f1s = calc_f1_alls(labels, adjusted_predicts)
  for i, attr in enumerate(ATTRIBUTES):
    print('%-40s' % attr, '%.5f' % adjusted_f1s[i])
  for i, cls in enumerate(CLASSES):
    print('%-40s' % cls, '%.5f' % class_f1s[i])
  print('%-40s' % 'f1', '%.5f' % calc_f1(labels, predicts))
  print('%-40s' % 'adjusted f1', '%.5f' % calc_f1(labels, adjusted_predicts))
  if FLAGS.num_grids:
    print('%-40s' % 'adjusted f1 after grid search', '%.5f' % calc_f1(labels, grids_predicts))

  results = np.concatenate(results_list, 0)
  results = results.reshape([-1, NUM_ATTRIBUTES, NUM_CLASSES]) 
  #factor =  FLAGS.logits_factor / sum_weights
  #print('%-40s' % '* factor loss', '%.5f' % calc_loss(labels, gezi.softmax(results * factor)))
  ## directly do softmax on results since sum weights is 1
  loss = calc_loss(labels, gezi.softmax(results))
  print('%-40s' % 'loss', '%.5f' % loss)

  print('f1:class predictions distribution')
  counts = get_distribution(predicts)
  for attr, count in zip(ATTRIBUTES, counts):
    print('%-40s' % attr, ['%.5f' % (x / len(predicts)) for x in count])
  #print_confusion_matrix(labels, predicts)

  print('adjusted f1:class predictions distribution')
  counts = get_distribution(adjusted_predicts)
  for attr, count in zip(ATTRIBUTES, counts):
    print('%-40s' % attr, ['%.5f' % (x / len(predicts)) for x in count])
  #print_confusion_matrix(labels, adjusted_predicts)

  if FLAGS.num_grids:
    print('adjusted f1:class predictions distribution after grids search')
    counts = get_distribution(grids_predicts)
    for attr, count in zip(ATTRIBUTES, counts):
      print('%-40s' % attr, ['%.5f' % (x / len(grids_predicts)) for x in count])
    #print_confusion_matrix(labels, grids_predicts)

  DEBUG = FLAGS.debug
  if FLAGS.infer:
    print('------------infer')
    ofile = os.path.join(idir, 'ensemble.infer.csv')
    file_ = gezi.strip_suffix(file_, '.debug')
    df = pd.read_csv(file_)

    idx = 2
    results = None
    results2 = None
    for fid, file_ in enumerate(infer_files):
      df = pd.read_csv(file_)
      df = df.sort_values('id')
      print(fid, file_, len(df))
      if not FLAGS.debug:
        assert len(df) == 200000
      if results is None:
        results = np.zeros([len(df), num_attrs * num_classes])
        results2 = np.zeros([len(df), num_attrs * num_classes])
      scores = df['score']
      scores = [parse(score) for score in scores]
      scores = np.array(scores) 
      weight = weights[fid] 
      if FLAGS.method == 'avg' or FLAGS.method == 'mean':  # 'and' could never be true; mirror the valid-set branch above
        weight = 1.
      for i, score in enumerate(scores):
        score = np.reshape(np.reshape(score, [num_attrs, num_classes]) * weight, [-1])
        results[i] += score
        score = gezi.softmax(np.reshape(score, [num_attrs, num_classes]), -1)
        score = np.reshape(score, [-1])
        results2[i] += score 

    #predicts = to_predict(results2, sum_weights)
    predicts = to_predict(results, sum_weights)

    counts = get_distribution(predicts)
    for attr, count in zip(ATTRIBUTES, counts):
      print('%-40s' % attr, ['%.5f' % (x / len(predicts)) for x in count])

    if not DEBUG:
      columns = df.columns[idx:idx + num_attrs].values
    else:
      columns = df.columns[idx + num_attrs:idx + 2 * num_attrs].values

    if not DEBUG:
      ofile = os.path.join(idir, 'ensemble.infer.csv')
    else:
      ofile = os.path.join(idir, 'ensemble.valid.csv')

    if not DEBUG:
      file_ = gezi.strip_suffix(file_, '.debug')
      print('temp csv using for write', file_)
      df = pd.read_csv(file_)
    else:
      print('debug test using file', valid_files[-1])
      df = pd.read_csv(valid_files[-1])

    # to be safe, sort by id
    df = df.sort_values('id')

    # TODO better ? not using loop ?
    for i, column in enumerate(columns):
      df[column] = predicts[:, i]

    if DEBUG:
      print('check blend result', calc_f1(df.iloc[:, idx:idx + num_attrs].values, predicts))
    print(f'adjusted f1_prob:[{adjusted_f1_prob}]')
    print(f'adjusted f1:[{adjusted_f1}]')
    print(f'loss:[{loss}]')

    print('out:', ofile)
    if not DEBUG:
      df.to_csv(ofile, index=False, encoding="utf_8_sig")

    print('---------------results', results.shape)
    df['score'] = [x for x in results] 

    if not DEBUG:
      ofile = os.path.join(idir, 'ensemble.infer.debug.csv')
    else:
      ofile = os.path.join(idir, 'ensemble.valid.csv')
    print('out debug:', ofile)
    df.to_csv(ofile, index=False, encoding="utf_8_sig")
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

classifier = DecisionTreeClassifier()
model = classifier.fit(X_train,y_train)

# TODO: Make predictions on the test data
y_pred = model.predict(X_test)
# TODO: Calculate the accuracy and assign it to the variable acc on the test data.
from sklearn.metrics import accuracy_score,r2_score,median_absolute_error
acc = accuracy_score(y_test, y_pred)
yP = model.predict(X_test)
score_r2 = r2_score(y_test, yP)
score_MedAE = median_absolute_error(y_test, yP)

 

# do the same thing via sklearn
from sklearn.model_selection import KFold, cross_val_score
svc = SVC(C=1, kernel='linear')
svc.fit(X_train,y_train).score(X_train,y_train)

k_fold = KFold(n_splits=3)
[svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test])
   for train, test in k_fold.split(X_digits)]

# or simplify it to one line
cross_val_score(svc, X_digits, y_digits, cv=k_fold, n_jobs=-1)
#n_jobs=-1 means that the computation will be dispatched on all the CPUs of the computer.
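cross_val_score also takes an explicit scoring argument when something other than the estimator's default .score is wanted; a sketch reusing the same objects:

cross_val_score(svc, X_digits, y_digits, cv=k_fold, scoring='f1_macro', n_jobs=-1)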
示例#53
0
def main():
    #########################################################
    # Test my decision tree classifier
    #########################################################

    classifier = decisionTreeClassifier()

    uciCarEvaluationDataObject = UciCarEvaluation()
    data = uciCarEvaluationDataObject.data
    labels = uciCarEvaluationDataObject.targets
    label_names = uciCarEvaluationDataObject.target_names

    model = classifier.fit(data, labels)

    accuracy_list = []
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(data, labels):

        # Build the data/target lists
        training_data = data.iloc[train_index]
        training_labels = labels.iloc[train_index]
        testing_data = data.iloc[test_index]
        testing_labels = labels.iloc[test_index]

        # Build the model
        model = classifier.fit(training_data, training_labels)

        # # Predict
        predicted_classes = model.predict(testing_data)

        accuracy_list.append(accuracy_score(testing_labels, predicted_classes))

    print "The custom decision tree predicted the auto dataset's classes with an average of",
    print sum(accuracy_list) / float(len(accuracy_list)) * 100,
    print "percent accuracy."

    #########################################################
    # Compare the SK-learn decision tree classifier
    #########################################################

    classifier = tree.DecisionTreeClassifier()

    model = classifier.fit(data, labels)

    accuracy_list = []
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(data, labels):

        # Build the data/target lists
        training_data = data.iloc[train_index]
        training_labels = labels.iloc[train_index]
        testing_data = data.iloc[test_index]
        testing_labels = labels.iloc[test_index]

        # Build the model
        model = classifier.fit(training_data, training_labels)

        # # Predict
        predicted_classes = model.predict(testing_data)

        accuracy_list.append(accuracy_score(testing_labels, predicted_classes))
    print("The sk-learn decision tree predicted the auto dataset's classes with an average of",
          sum(accuracy_list) / float(len(accuracy_list)) * 100,
          "percent accuracy.")
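
    # As a rough sketch (assuming `data` and `labels` are the numeric pandas objects
    # loaded above), the manual fold loop for the sk-learn tree could be condensed with
    # cross_val_score; note that for classifiers it defaults to stratified folds, so the
    # splits differ slightly from the plain KFold used above.
    from sklearn.model_selection import cross_val_score
    sk_scores = cross_val_score(classifier, data, labels, cv=10)
    print("cross_val_score mean accuracy:", sk_scores.mean() * 100, "percent")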
Example #54
    print('===========================================================')
    print('===========================================================') 

    csv = []

    for k in range(1,10):
        print("\n\nEvaluating with k-mer:", k, " ==========================")

        data = fe.generateLabeledData(dataset_path + dataset + "/data.fa", dataset_path  + dataset + "/class.csv")         
        
        kf = KFold(n_splits = 5, shuffle = True, random_state=1)
        i = 0

        metrics_castor = []
        metrics_kameris = []
        for train_index, test_index in kf.split(np.zeros(len(data))):            

            data_train = split_data(data, train_index)
            data_test = split_data(data, test_index)                       

            acc, pre, recall, fscore, number_features = kameris(data_train, data_test, k, dimention_reduction)
            metrics_kameris.append([acc, pre, recall, fscore, number_features])
            print("k-fold ", i, "metrics_kameris: ", acc, pre, recall, fscore)

            acc, pre, recall, fscore, number_features = castor(data_train, data_test, k, dimention_reduction)
            metrics_castor.append([acc, pre, recall, fscore, number_features])
            print("k-fold ", i, "metrics_castor:  ", acc, pre, recall, fscore)

            i += 1
            
        metrics_kameris         = np.matrix(metrics_kameris)
Example #55
#	param_grid=param_grid, scoring=None,
#	cv=n_folds, n_jobs=n_jobs, verbose=verbose_grid)
#gs = gs.fit(X_new, y)
#print(gs.scorer_)
#print('best score from grid search: %.3f' % gs.best_score_)
#print(gs.best_params_)
#best = gs.best_params_
#n_estimators_gs = best['n_estimators']
#max_depth_gs = best['max_depth']
#max_features_gs = best['max_features']

# run some cross validation
print('running cross validation to determine accuracy of model...')
scores = []
splits = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
for train, test in splits.split(X):
    tree.fit(X[train], y[train])
    predicted = tree.predict(X[test])
    score = mean_absolute_error(y[test], predicted)
    scores.append(score)
print(scores)
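
# As a sketch, the manual loop above could also be written with cross_val_score using the
# built-in 'neg_mean_absolute_error' scorer (it returns negated MAE per fold, so the sign
# is flipped back here); `tree`, `X`, `y` and `splits` are the objects defined above.
from sklearn.model_selection import cross_val_score
mae_scores = -cross_val_score(tree, X, y, cv=splits, scoring='neg_mean_absolute_error')
print(mae_scores)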

# determine which features to write to the file
n_estimators = n_estimators_def
max_depth = max_depth_def
max_features = max_features_def
score = np.mean(scores)

print('writing the data to file...')
params = (n_folds, n_estimators, max_depth, max_features, score)
write_hyperparams(params, hyperParamFile)
        input_x.append([float(x) / 255 for x in row[:len(row) - 1]])
        input_y.append(float(row[len(row) - 1]))
    print("Gone through all the data")
    datax = np.array(input_x)
    number_of_features = datax.shape[1]
    datax_temp = np.array(input_x)
    datay = np.array(input_y)
    datay_temp = np.array(input_y)
    number_of_training = datax.shape[0]
gamma = [0.00001, 0.001, 1, 5, 10]
c1 = 0
kf = KFold(n_splits=5)
kf.get_n_splits(datax)
score_max = 0
index = 0
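# note: the loop below tries a different gamma on each of the 5 folds, so the fold split
# and the gamma choice are conflated; each gamma value is only ever scored on a single
# train/test split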
for train_index, test_index in kf.split(datax):
    X_train, X_test = datax[train_index], datax[test_index]
    y_train, y_test = datay[train_index], datay[test_index]
    clf1 = SVC(kernel='rbf', gamma=gamma[c1])
    clf1.fit(X_train, y_train)
    y_pred = clf1.predict(X_test)
    score1 = metrics.accuracy_score(y_test, y_pred)
    if score1 > score_max:
        score_max = score1
        index = c1  # remember which gamma gave the best fold accuracy
    c1 += 1
    print(score1)

clf1 = SVC(kernel='rbf', gamma=gamma[index])
clf1.fit(datax, datay)
end_training_time = time.time() - start_time
Example #57
def fSplitDataset(allPatches,
                  allY,
                  allPats,
                  sSplitting,
                  patchSize,
                  patchOverlap,
                  split_ratio,
                  sFolder,
                  nfolds=0):
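    """
    Split the patch dataset in one of three modes:
    1. normal: randomly hold out a split_ratio fraction of the patches as test data (no cross validation)
    2. crossvalidation_data: k-fold cross validation on mixed patient data (nfolds folds, or one fold per patient if nfolds == 0)
    3. crossvalidation_patient: leave-one-patient-out cross validation
    Depending on expecting(), the folds are either written to .h5 files under sFolder
    or returned as per-fold lists of (X_train, y_train, X_test, y_test).
    """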
    # TODO: adapt path
    iReturn = expecting()

    # bring the sample axis to the front if the patches are stored with it last
    if len(patchSize) == 3:
        if (allPatches.shape[0] == patchSize[0]
                and allPatches.shape[1] == patchSize[1]
                and allPatches.shape[2] == patchSize[2]):
            allPatches = np.transpose(allPatches, (3, 0, 1, 2))
            print(allPatches.shape)
    else:
        if (allPatches.shape[0] == patchSize[0]
                and allPatches.shape[1] == patchSize[1]):
            allPatches = np.transpose(allPatches, (2, 0, 1))
            print(allPatches.shape)

    if sSplitting == "normal":
        print("Done")
        nPatches = allPatches.shape[0]
        dVal = math.floor(split_ratio * nPatches)
        rand_num = np.random.permutation(np.arange(nPatches))
        rand_num = rand_num[0:int(dVal)].astype(int)
        print(rand_num)
        if len(patchSize) == 3:
            X_test = allPatches[rand_num, :, :, :]
        else:
            X_test = allPatches[rand_num, :, :]
        y_test = allY[rand_num]
        X_train = allPatches
        X_train = np.delete(X_train, rand_num, axis=0)
        y_train = allY
        y_train = np.delete(y_train, rand_num)
        print(X_train.shape)
        print(X_test.shape)
        print(y_train.shape)
        print(y_test.shape)

        if iReturn == 0:
            if len(patchSize) == 3:
                sizeStr = str(patchSize[0]) + str(patchSize[1]) + str(patchSize[2])
            else:
                sizeStr = str(patchSize[0]) + str(patchSize[1])
            folder = sFolder + os.sep + sizeStr
            Path = folder + os.sep + 'normal_' + str(patchSize[0]) + str(patchSize[1]) + '.h5'

            if not os.path.isdir(folder):
                os.makedirs(folder)

            print(Path)
            with h5py.File(Path, 'w') as hf:
                hf.create_dataset('X_train', data=X_train)
                hf.create_dataset('X_test', data=X_test)
                hf.create_dataset('y_train', data=y_train)
                hf.create_dataset('y_test', data=y_test)
                hf.create_dataset('patchSize', data=patchSize)
                hf.create_dataset('patchOverlap', data=patchOverlap)
        else:
            return [X_train], [y_train], [X_test], [y_test]  # embed in a 1-fold list

    elif sSplitting == "crossvalidation_data":
        if nfolds == 0:
            kf = KFold(n_splits=len(np.unique(allPats)))
        else:
            kf = KFold(n_splits=nfolds)
        ind_split = 0
        X_trainFold = []
        X_testFold = []
        y_trainFold = []
        y_testFold = []

        for train_index, test_index in kf.split(allPatches):
            X_train, X_test = allPatches[train_index], allPatches[test_index]
            y_train, y_test = allY[train_index], allY[test_index]

            if iReturn == 0:
                if len(patchSize) == 3:
                    sizeStr = str(patchSize[0]) + str(patchSize[1]) + str(patchSize[2])
                else:
                    sizeStr = str(patchSize[0]) + str(patchSize[1])
                folder = sFolder + os.sep + sizeStr
                Path = folder + os.sep + 'crossVal_data' + str(ind_split) + '_' + sizeStr + '.h5'
                if not os.path.isdir(folder):
                    os.makedirs(folder)

                with h5py.File(Path, 'w') as hf:
                    hf.create_dataset('X_train', data=X_train)
                    hf.create_dataset('X_test', data=X_test)
                    hf.create_dataset('y_train', data=y_train)
                    hf.create_dataset('y_test', data=y_test)
                    hf.create_dataset('patchSize', data=patchSize)
                    hf.create_dataset('patchOverlap', data=patchOverlap)
            else:
                X_trainFold.append(X_train)
                X_testFold.append(X_test)
                y_trainFold.append(y_train)
                y_testFold.append(y_test)

            ind_split += 1

        X_trainFold = np.asarray(X_trainFold)
        X_testFold = np.asarray(X_testFold)
        y_trainFold = np.asarray(y_trainFold)
        y_testFold = np.asarray(y_testFold)

        if iReturn > 0:
            return X_trainFold, y_trainFold, X_testFold, y_testFold

    elif sSplitting == "crossvalidation_patient":
        unique_pats = np.unique(allPats)

        X_trainFold = []
        X_testFold = []
        y_trainFold = []
        y_testFold = []

        for ind_split in unique_pats:
            train_index = np.where(allPats != ind_split)[0]
            test_index = np.where(allPats == ind_split)[0]
            X_train, X_test = allPatches[train_index], allPatches[test_index]
            y_train, y_test = allY[train_index], allY[test_index]

            if iReturn == 0:
                if len(patchSize) == 3:
                    sizeStr = str(patchSize[0]) + str(patchSize[1]) + str(patchSize[2])
                else:
                    sizeStr = str(patchSize[0]) + str(patchSize[1])
                folder = sFolder + os.sep + sizeStr
                Path = folder + os.sep + 'crossVal' + str(ind_split) + '_' + sizeStr + '.h5'
                if not os.path.isdir(folder):
                    os.makedirs(folder)

                with h5py.File(Path, 'w') as hf:
                    hf.create_dataset('X_train', data=X_train)
                    hf.create_dataset('X_test', data=X_test)
                    hf.create_dataset('y_train', data=y_train)
                    hf.create_dataset('y_test', data=y_test)
                    hf.create_dataset('patchSize', data=patchSize)
                    hf.create_dataset('patchOverlap', data=patchOverlap)

            else:
                X_trainFold.append(X_train)
                X_testFold.append(X_test)
                y_trainFold.append(y_train)
                y_testFold.append(y_test)

        X_trainFold = np.asarray(X_trainFold, dtype='f')
        X_testFold = np.asarray(X_testFold, dtype='f')
        y_trainFold = np.asarray(y_trainFold, dtype='f')
        y_testFold = np.asarray(y_testFold, dtype='f')

        if iReturn > 0:
            return X_trainFold, y_trainFold, X_testFold, y_testFold
    # Filter down to most common 100 questions
    questions = data_temp.groupby(
        'question')['student'].nunique().sort_values().iloc[-100:].index.values
    data_temp = data_temp[data_temp['question'].isin(questions)].drop(
        columns='index')

    # Set up the algorithms
    glicko = Glicko2()
    irt_1pl = IRT3PL(**{'model_name': '1pl'})
    algorithms = [glicko, irt_1pl]

    # Now do a five fold cross validation on the data
    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    fold = 0
    for train_idx, test_idx in kf.split(data_temp):
        df_train = data_temp.iloc[train_idx]
        df_test = data_temp.iloc[test_idx]

        for alg in algorithms:
            alg.fit(df_train)
            df_out = alg.predict(df_test)
            df_out = df_out.merge(df_test)
            accuracy = 1 - np.mean(
                np.abs(np.round(df_out['p']) - df_out['score']))
            dft = pd.DataFrame({
                'course': [course],
                'test_idx': [fold],
                'algorithm': [alg.model_name],
                'accuracy': [accuracy]
            })
Example #59
    print(cm_df)

    for i, cls in enumerate(clf_LinearSVC.classes_):
        print("\nFeature weights for class : ", cls, "\n")
        df = pd.DataFrame(data={
            "Features": vec.feature_names_,
            "weights": clf_LinearSVC.coef_[i]
        })
        df = df.sort_values(axis=0, by='weights', ascending=False)
        print(df[:50])

    kfold = KFold(n_splits=5, shuffle=True, random_state=1234)
    preds = []
    truths = []
    #y = np.array(labels)
    for train, test in kfold.split(X):
        gnb = LogisticRegression()  # note: despite the name, this is logistic regression, not naive Bayes
        # alternative classifiers/settings (commented out):
        # LogisticRegression(C=0.001, fit_intercept=True, max_iter=20, solver="lbfgs", class_weight='balanced')
        # MLPClassifier(hidden_layer_sizes=(5, 10), solver='sgd', learning_rate='adaptive', activation='logistic')
        clf = gnb.fit(X[train], Y[train])
        #print(clf.class_count_)
        preds.extend(clf.predict(X[test]))
        truths.extend(Y[test])
    acc = accuracy_score(truths, preds)
    print('accuracy : %0.3f' % acc)
    cnf_matrix = confusion_matrix(
        truths, preds, labels=['Not Relevant', 'Deceptive', 'Relevant'])
    print(cnf_matrix)

#evaluate_combinations(X_train, Y_train)
Example #60
def fSplitDatasetCorrection(sSplitting, dRefPatches, dArtPatches, allPats,
                            split_ratio, nfolds, test_index):
    """
    Split dataset with three options:
    1. normal: randomly split data according to the split_ratio without cross validation
    2. crossvalidation_data: perform crossvalidation with mixed patient data
    3. crossvalidation_patient: perform crossvalidation with separate patient data
    @param sSplitting: splitting mode 'normal', 'crossvalidation_data' or 'crossvalidation_patient'
    @param dRefPatches: reference patches
    @param dArtPatches: artifact patches
    @param allPats: patient index
    @param split_ratio: fraction of patches to hold out as test data (normal mode only)
    @param nfolds: number of folds for cross validation (0 = one fold per patient)
    @param test_index: patient index to hold out for testing (-1 = iterate over all patients); used in crossvalidation_patient mode
    @return: training and testing reference and artifact patches, one entry per fold
    """
    train_ref_fold = []
    test_ref_fold = []
    train_art_fold = []
    test_art_fold = []

    # normal splitting
    if sSplitting == 'normal':
        nPatches = dRefPatches.shape[0]
        dVal = math.floor(split_ratio * nPatches)
        rand_num = np.random.permutation(np.arange(nPatches))
        rand_num = rand_num[0:int(dVal)].astype(int)

        test_ref_fold.append(dRefPatches[rand_num, :, :])
        train_ref_fold.append(np.delete(dRefPatches, rand_num, axis=0))
        test_art_fold.append(dArtPatches[rand_num, :, :])
        train_art_fold.append(np.delete(dArtPatches, rand_num, axis=0))

    # crossvalidation with mixed patient
    if sSplitting == "crossvalidation_data":
        if nfolds == 0:
            kf = KFold(n_splits=len(np.unique(allPats)))
        else:
            kf = KFold(n_splits=nfolds)

        for train_index, test_index in kf.split(dRefPatches):
            train_ref, test_ref = dRefPatches[train_index], dRefPatches[test_index]
            train_art, test_art = dArtPatches[train_index], dArtPatches[test_index]

            train_ref_fold.append(train_ref)
            train_art_fold.append(train_art)
            test_ref_fold.append(test_ref)
            test_art_fold.append(test_art)

    # crossvalidation with separate patient
    elif sSplitting == 'crossvalidation_patient':
        if test_index == -1:
            unique_pats = np.unique(allPats)
        else:
            unique_pats = [test_index]
        for ind_split in unique_pats:
            train_index = np.where(allPats != ind_split)[0]
            test_index = np.where(allPats == ind_split)[0]
            train_ref, test_ref = dRefPatches[train_index], dRefPatches[test_index]
            train_art, test_art = dArtPatches[train_index], dArtPatches[test_index]

            train_ref_fold.append(train_ref)
            train_art_fold.append(train_art)
            test_ref_fold.append(test_ref)
            test_art_fold.append(test_art)

    train_ref_fold = np.asarray(train_ref_fold, dtype='f')
    train_art_fold = np.asarray(train_art_fold, dtype='f')
    test_ref_fold = np.asarray(test_ref_fold, dtype='f')
    test_art_fold = np.asarray(test_art_fold, dtype='f')

    return train_ref_fold, test_ref_fold, train_art_fold, test_art_fold
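

# A minimal usage sketch (not from the original pipeline): the array names, shapes and
# patient layout below are illustrative assumptions, used only to show how the splitter
# is called in leave-one-patient-out mode.
if __name__ == '__main__':
    import numpy as np
    rng = np.random.RandomState(0)
    dRef = rng.rand(20, 8, 8)          # 20 reference patches of size 8x8 (assumed)
    dArt = rng.rand(20, 8, 8)          # 20 matching artifact patches (assumed)
    pats = np.repeat(np.arange(4), 5)  # 4 patients with 5 patches each (assumed)
    tr_ref, te_ref, tr_art, te_art = fSplitDatasetCorrection(
        'crossvalidation_patient', dRef, dArt, pats,
        split_ratio=0.2, nfolds=0, test_index=-1)
    print(tr_ref.shape, te_ref.shape)  # one fold per held-out patient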