def stacking_proba(clf,X_train,y,X_test,nfolds=5,random_seed=2017,return_score=False,
                   shuffle=True,metric='acc',clf_name='Unknown'):
    folds = StratifiedKFold(n_splits=nfolds, shuffle=shuffle, random_state=random_seed)
    folds.get_n_splits(X_train,y)
    # out-of-fold stacking probabilities for the train set
    train_stacking_proba=np.zeros((X_train.shape[0],np.unique(y).shape[0]))
    score=0
    for i,(train_index, validate_index) in enumerate(folds.split(X_train, y)):
        # print(str(clf_name)+" folds:"+str(i+1)+"/"+str(nfolds))
        X_train_fold=X_train[train_index,:]
        y_train_fold=y[train_index]
        X_validate_fold=X_train[validate_index,:]
        y_validate_fold=y[validate_index]
        clf.fit(X_train_fold,y_train_fold)
        fold_preds=clf.predict_proba(X_validate_fold)
        train_stacking_proba[validate_index,:]=fold_preds
        #validation
        fold_preds_a = np.argmax(fold_preds, axis=1)
        fold_score=np.mean(y_validate_fold == fold_preds_a)  # fold accuracy
        # print('validate '+metric+":"+str(fold_score))
        score+=fold_score
    score/=nfolds
    # stacking probabilities for the test set (model refit on the full train set)
    clf.fit(X_train,y)
    test_stacking_proba=clf.predict_proba(X_test)

    if np.unique(y).shape[0] == 2:  # for binary classification, return only the positive-class probability
        train_stacking_proba=train_stacking_proba[:,1]
        test_stacking_proba=test_stacking_proba[:,1]
    if return_score:
        return train_stacking_proba,test_stacking_proba,score
    else:
        return train_stacking_proba,test_stacking_proba
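# Illustrative usage of stacking_proba() above (a sketch, not part of the
# original example; the excerpt assumes numpy and StratifiedKFold are already
# imported). Any estimator exposing fit()/predict_proba() works:
def _demo_stacking_proba():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    X, y = make_classification(n_samples=200, random_state=0)
    clf = LogisticRegression(max_iter=1000)
    oof_proba, test_proba, cv_acc = stacking_proba(
        clf, X[:150], y[:150], X[150:], nfolds=5,
        return_score=True, clf_name='logreg')
    # binary problem: 1-D arrays of positive-class probabilities
    print(oof_proba.shape, test_proba.shape, cv_acc)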
def _get_fold_generator(target_values):
    if params.stratified_cv:
        cv = StratifiedKFold(n_splits=params.n_cv_splits, shuffle=True, random_state=cfg.RANDOM_SEED)
        cv.get_n_splits(target_values)
        fold_generator = cv.split(target_values, target_values)
    else:
        cv = KFold(n_splits=params.n_cv_splits, shuffle=True, random_state=cfg.RANDOM_SEED)
        fold_generator = cv.split(target_values)
    return fold_generator
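# Note (illustrative, not from the original source): `params` and `cfg` are
# module-level configuration objects that this excerpt does not define
# (params.stratified_cv, params.n_cv_splits, cfg.RANDOM_SEED). A minimal stand-in:
from types import SimpleNamespace
params = SimpleNamespace(stratified_cv=True, n_cv_splits=5)
cfg = SimpleNamespace(RANDOM_SEED=42)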
Example #3
    def stratified_cross_validate(self, k):
        attributes = np.append(self.training_attributes, self.testing_attributes, axis=0)
        labels = np.append(self.training_labels, self.testing_labels, axis=0)

        all_data = np.array([np.append(attributes[i], labels[i]) for i in range(len(attributes))])

        #print("all data : %s" % all_data)
        #print("")

        np.random.shuffle(all_data)

        X = all_data[:, :-1]
        y = all_data[:, -1]
        print(X.shape, y.shape)
        skf = StratifiedKFold(n_splits=2)
        print(skf.get_n_splits(X, y))
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            yield (X_train, y_train, X_test, y_test)

        #print("shuffled data : %s" % all_data)
        #print("")

        for i in range(k):
            split = len(all_data) // k  # integer division: float indices break slicing in Python 3
            #print("split : %s" % split)

            test_data = all_data[i * split:(i + 1) * split, :]
            train_data = np.delete(all_data, np.arange(i * split, (i + 1) * split), axis=0)

            train_input, train_output = train_data[:, :-1], train_data[:, -1]
            test_input, test_output = test_data[:, :-1], test_data[:, -1]

            yield (train_input, train_output, test_input, test_output)
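    # Illustrative usage (not from the original source): when iterated to
    # exhaustion, this generator first yields the 2 sklearn StratifiedKFold
    # folds and then the k manual splits computed above, e.g.:
    #   for X_tr, y_tr, X_te, y_te in obj.stratified_cross_validate(k=5):
    #       ...fit and evaluate...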
def train_val(X_trainval, y_trainval, temp_dir, current_ratio, test_count,
              hyper_count):

    # construct the validation set by stratified cross-validation
    skf_train_val = StratifiedKFold(n_splits=VAL_FOLD,
                                    random_state=RANDOM_STATE,
                                    shuffle=True)

    skf_train_val.get_n_splits(X_trainval, y_trainval)
    fold_count = 1

    all_train_loss = []
    all_val_loss = []

    all_val_acc = []
    all_val_f1 = []
    all_val_precision = []
    all_val_recall = []

    all_num_train = []
    all_num_val = []
    all_num_under_val = []

    for train_index, val_index in skf_train_val.split(X_trainval, y_trainval):

        X_train = X_trainval.iloc[train_index]
        y_train = y_trainval.iloc[train_index]

        X_val = X_trainval.iloc[val_index]
        y_val = y_trainval.iloc[val_index]

        # calculating the required OS & US samples
        print("-" * 70)
        print("START  SAMPLING TRAIN SET [{}] ".format(fold_count))
        start_time = time.time()
        num_class0, num_class1 = y_train.value_counts()
        diff = num_class0 - num_class1
        num_os_instance = int(diff * current_ratio)
        num_us_instance = int(diff * (1 - current_ratio))

        # performing OS & US by Resampling
        sample_train = pd.concat([y_train, X_train], axis=1)
        sample_train_over = oversampling(sample_train, num_os_instance)
        sample_train_over_under = undersampling(sample_train_over,
                                                num_us_instance)

        end_time = time.time()
        print("FINISH SAMPLING TRAIN SET [{}]: {}".format(
            fold_count, (end_time - start_time)))

        # undersampling val set by resampling
        num_class0, num_class1 = y_val.value_counts()
        all_num_val.append([num_class0, num_class1])
        sample_val = pd.concat([y_val, X_val], axis=1)
        num_us_instance = num_class0 - num_class1
        under_sample_val = undersampling(sample_val, num_us_instance)

        # saving undersampled val data
        under_val_path = temp_dir + '/sample_val_under.tsv'
        under_sample_val.to_csv(under_val_path,
                                sep='\t',
                                encoding="utf-8",
                                index=False,
                                header=False)

        num_class0, num_class1 = under_sample_val['y'].value_counts()
        all_num_under_val.append([num_class0, num_class1])

        #  saving sampled train data
        train_path = temp_dir + '/sample_train_aug.tsv'
        sample_train_over_under.to_csv(train_path,
                                       sep='\t',
                                       encoding="utf-8",
                                       index=False,
                                       header=False)

        num_class0, num_class1 = sample_train_over_under['y'].value_counts()
        all_num_train.append([num_class0, num_class1])

        del sample_train, X_train, y_train
        del sample_val, X_val, y_val
        del sample_train_over, sample_train_over_under

        # processing to model classifier (BERT)
        history = bert(train_path,
                       under_val_path,
                       INPUT_EPOCH,
                       EVAL_STEPS,
                       test_count,
                       hyper_count,
                       fold_count,
                       predict=False)

        # calculating average train loss
        all_train_loss.append(history['train_loss'])

        # calculating average val loss, accuracy, f1, precision and recall
        all_val_loss.append(history['val_loss'])
        all_val_acc.append(history['val_acc'])
        all_val_f1.append(history['val_f1'])
        all_val_precision.append(history['val_precision'])
        all_val_recall.append(history['val_recall'])

        # increment the fold index and repeat for the next fold
        fold_count = fold_count + 1

    # return the inner cross-validation (ICV) log
    results = {
        'final_all_train_loss': all_train_loss,
        'final_all_val_loss': all_val_loss,
        'final_all_val_acc': all_val_acc,
        'final_all_val_f1': all_val_f1,
        'final_all_val_precision': all_val_precision,
        'final_all_val_recall': all_val_recall,
        'final_avg_val_loss': np.mean(all_val_loss),
        'train_distribution': all_num_train,
        'val_distribution': all_num_val,
        'under_val_distribution': all_num_under_val
    }

    return results
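# The oversampling()/undersampling() helpers called in train_val() are not shown
# in this excerpt. A minimal pandas-based sketch of the assumed contract
# (illustrative only; the label column is named 'y' as above, class 0 = majority):
def oversampling(sample_df, num_instances):
    # duplicate num_instances randomly drawn minority-class (1) rows
    minority = sample_df[sample_df['y'] == 1]
    extra = minority.sample(n=num_instances, replace=True, random_state=0)
    return pd.concat([sample_df, extra], ignore_index=True)

def undersampling(sample_df, num_instances):
    # drop num_instances randomly drawn majority-class (0) rows
    majority = sample_df[sample_df['y'] == 0]
    drop_idx = majority.sample(n=num_instances, random_state=0).index
    return sample_df.drop(index=drop_idx)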
Example #5
    def cross_validation(self):
        ''' do a 6-fold cross-validation and draw the ROC curve
        '''

        self._load_data()

        mean_tpr = 0.0
        mean_fpr = numpy.linspace(0, 1, 100)
        colors = ['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange']
        lw = 2
        i = 0

        #pdf = PdfPages('../data/cnn_cv.pdf')
        plt.figure(figsize = (10,10))

        cvscores = []
        kfold = StratifiedKFold(n_splits=6, shuffle=True, random_state=seed)
        for (train, test), color in zip(kfold.split(self.X_train, self.y_train), colors):
            
            self._init_model(verbose=False)
            # Fit the model
            self.model.fit(self.X_train[train], self.y_train[train],
                           nb_epoch=self.nb_epoch, batch_size=self.batch_size,
                           verbose=self.verbose)
            # evaluate the model
            scores = self.model.evaluate(
                self.X_train[test], self.y_train[test], verbose=self.verbose)
            print("%s: %.2f%%" %
                  (self.model.metrics_names[1], scores[1] * 100))
            cvscores.append(scores[1] * 100)

            # Compute ROC curve and area the curve, mean ROC using interpolation
            probas_ = self.model.predict(self.X_train[test])
            fpr, tpr, thresholds = roc_curve(self.y_train[test], probas_[:, 0])
            mean_tpr += interp(mean_fpr, fpr, tpr)
            mean_tpr[0] = 0.0
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=lw, color=color,
                     label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
            i += 1

        cv_results = "%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores))

        plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
        mean_tpr /= kfold.get_n_splits(self.X_train, self.y_train)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
                 label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
        
        df = pd.read_csv(self.fname)
        y_true = df.pop('target')
        plot_roc(df, y_true)
        
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Training cross-validation ROC\nAccuracy:' + cv_results)
        plt.legend(loc="lower right")
        plt.show()
        #plt.savefig('../data/cnn_cv_' + prefix + '_' + datetime.datetime.now().strftime('%Y%m%d-%H.%M.%S') + '.eps', format = 'eps', dpi=600) #, bbox_inches='tight')   
        plt.savefig('../data/cnn_cv_'+ datetime.datetime.now().strftime('%Y%m%d-%H.%M.%S') +'.eps', format = 'eps', dpi = 600)
        #pdf.close()
        plt.close()
        print('Saving ROC plot in .eps in data folder...')
Example #6
    def kfold_plot(self, train, ytrain, model):

        #     kf = StratifiedKFold(y=ytrain, n_folds=5)
        kf = StratifiedKFold(n_splits=5)
        scores = []
        mean_tpr = 0.0
        mean_fpr = np.linspace(0, 1, 100)
        exe_time = []

        colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue'])
        lw = 2

        i = 0
        for (train_index, test_index), color in zip(kf.split(train, ytrain),
                                                    colors):
            X_train, X_test = train.iloc[train_index], train.iloc[test_index]
            y_train, y_test = ytrain.iloc[train_index], ytrain.iloc[test_index]
            begin_t = time.time()
            predictions = model(X_train, X_test, y_train)
            end_t = time.time()
            exe_time.append(round(end_t - begin_t, 3))
            scores.append(roc_auc_score(y_test.astype(float), predictions))
            fpr, tpr, thresholds = roc_curve(y_test, predictions)
            mean_tpr += interp(mean_fpr, fpr, tpr)
            mean_tpr[0] = 0.0
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr,
                     tpr,
                     lw=lw,
                     color=color,
                     label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
            i += 1
        plt.plot([0, 1], [0, 1],
                 linestyle='--',
                 lw=lw,
                 color='k',
                 label='Luck')

        mean_tpr /= kf.get_n_splits(train, ytrain)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        plt.plot(mean_fpr,
                 mean_tpr,
                 color='g',
                 linestyle='--',
                 label='Mean ROC (area = %0.2f)' % mean_auc,
                 lw=lw)

        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc='lower right')
        plt.show()

        #     print 'scores: ', scores
        print('mean scores: ', np.mean(scores))
        print('mean model process time: ', np.mean(exe_time), 's')

        return scores, np.mean(scores), np.mean(exe_time)
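    # Note (illustrative, not from the original source): kfold_plot() expects
    # `model` to be a callable taking (X_train, X_test, y_train) and returning
    # scores/probabilities for X_test, e.g.:
    #   def model(X_train, X_test, y_train):
    #       clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    #       return clf.predict_proba(X_test)[:, 1]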
def load_data(fold=1, n_workers=N_WORKERS, spec_dir="specs_train_v1", train_verified=True, train_unverified=True,
              normalize=False, fix_lengths=True, max_len=None, min_len=None, validate_verified=True,
              train_file="train.csv", load_test=True, train_on_all=False):
    """ load data """

    if not min_len:
        min_len = N_FRAMES

    # get annotations
    files, labels, verified = get_files_and_labels(os.path.join(DATA_ROOT, train_file), spec_dir)
    _, _, verified_for_val = get_files_and_labels(os.path.join(DATA_ROOT, "train.csv"), spec_dir)

    # stratified split
    np.random.seed(4711)
    r_idx = np.random.permutation(len(files))
    files, labels, verified = files[r_idx], labels[r_idx], verified[r_idx]
    verified_for_val = verified_for_val[r_idx]

    verified_indices = np.nonzero(verified)[0]
    unverified_indices = np.nonzero(~verified)[0]

    from sklearn.model_selection import StratifiedKFold
    sss = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
    sss.get_n_splits(files[verified], labels[verified])
    for i_fold, (train_index_ver, test_index_ver) in enumerate(sss.split(files[verified], labels[verified])):
        if i_fold + 1 == fold:
            break

    sss = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
    sss.get_n_splits(files[~verified], labels[~verified])
    for i_fold, (train_index_unver, test_index_unver) in enumerate(sss.split(files[~verified], labels[~verified])):
        if i_fold + 1 == fold:
            break

    train_index = np.concatenate((verified_indices[train_index_ver], unverified_indices[train_index_unver]))
    test_index = np.concatenate((verified_indices[test_index_ver], unverified_indices[test_index_unver]))

    if train_on_all:
        train_index = np.concatenate((train_index, test_index))

    # split into train and validation data
    tr_files, tr_labels, tr_verified = files[train_index], labels[train_index], verified[train_index]
    va_files, va_labels, va_verified = files[test_index], labels[test_index], verified_for_val[test_index]

    # select training examples according to the verified/unverified flags
    train_idx = np.zeros_like(tr_verified, dtype=bool)
    if train_verified:
        train_idx = train_idx | tr_verified
    if train_unverified:
        train_idx = train_idx | (~tr_verified)
    tr_files = tr_files[train_idx]
    tr_labels = tr_labels[train_idx]

    # keep only verified examples for validation
    if validate_verified:
        va_files = va_files[va_verified]
        va_labels = va_labels[va_verified]

    # create data pools
    pool = AugmentedAudioFileClassificationDataPool

    train_pool = pool(tr_files, tr_labels, None, n_workers=n_workers, shuffle=True, use_cache=True)
    valid_pool = pool(va_files, va_labels, None, n_workers=n_workers, shuffle=False, use_cache=True)

    if load_test:
        test_pool = load_data_test(spec_dir=spec_dir.replace("train", "test"))["test"]
    else:
        test_pool = None

    # fix spectrogram lengths
    print("Fixing spectrogram lengths ...")
    if max_len is None:
        max_len = np.max([s.shape[-1] for s in train_pool.cache.values()])

    def fix_pool(pool, test_mode):

        for k in pool.cache.keys():

            # copy spectrogram
            spec = pool.cache[k].copy()
            tmp = spec.copy()

            while spec.shape[-1] < max_len:

                if test_mode and spec.shape[-1] >= min_len:
                    break

                spec = np.concatenate((spec, tmp), axis=-1)

            # clip spectrogram if too long
            pool.cache[k] = spec[:, :, 0:max_len]

        return pool

    if fix_lengths:
        train_pool = fix_pool(train_pool, test_mode=False)
        valid_pool = fix_pool(valid_pool, test_mode=False)
        if load_test:
            test_pool = fix_pool(test_pool, test_mode=True)

    # normalize data
    if normalize:
        print("Normalizing data ...")

        specs = list(train_pool.cache.values())
        specs = np.concatenate(specs, axis=2).astype(np.float32)

        sub = specs.mean(axis=(0, 2), keepdims=True)[0]
        div = specs.std(axis=(0, 2), keepdims=True)[0]

        # sub = specs.min()
        # div = np.max(specs - sub)

        for key in train_pool.cache.keys():
            train_pool.cache[key] -= sub
            train_pool.cache[key] /= div

        for key in valid_pool.cache.keys():
            valid_pool.cache[key] -= sub
            valid_pool.cache[key] /= div

        if load_test:
            for key in test_pool.cache.keys():
                test_pool.cache[key] -= sub  # [0:1]
                test_pool.cache[key] /= div  # [0:1]

    print("Train %d" % train_pool.shape[0])
    print("Valid %d" % valid_pool.shape[0])
    if load_test:
        print("Test  %d" % test_pool.shape[0])

    return {'train': train_pool, 'valid': valid_pool, 'test': test_pool}
Example #8
def k_fold_cross_validation(k, hiddenLayers, numEpochs):

    # load dataset
    noShow = ds.import_data_df([ds._FILE_PATHS['merged']])
    noShow_X, noShow_y = noShow.iloc[:, :-1].values, noShow.iloc[:, -1].values
    noShow_y = np.array([[i] for i in noShow_y])

    # Stratified k-fold
    skf = StratifiedKFold(n_splits=k, shuffle=True)
    skf.get_n_splits(noShow_X, noShow_y)
    #print(skf)

    # store results
    losses = []
    accuracies = []
    f1s = []

    fold = 1
    for train_index, test_index in skf.split(noShow_X, noShow_y):
        #print("Fold: "+str(fold))
        fold += 1
        trainX, testX = noShow_X[train_index], noShow_X[test_index]
        trainY, testY = noShow_y[train_index], noShow_y[test_index]

        # separate training in validation and training set
        trainX, valX, trainY, valY = train_test_split(trainX,
                                                      trainY,
                                                      test_size=0.15,
                                                      random_state=42,
                                                      stratify=trainY)

        # number of features
        numFeatures = trainX.shape[1]
        # number of classes
        numLabels = trainY.shape[1]

        # init
        NN = nn.NN_Sigmoid(hiddenLayers,
                           numFeatures,
                           numLabels,
                           learning_rate=0.05,
                           cross_entropy_weight=4,
                           optimizer="Adam")
        # train
        NN.train(numEpochs,
                 trainX,
                 trainY,
                 valX=valX,
                 valY=valY,
                 val_epochs=25,
                 val_patience=5)
        # test
        _, loss, acc, f1 = NN.predict(testX, testY)
        # close tf session
        NN.close_session()

        losses.append(loss)
        accuracies.append(acc)
        f1s.append(f1)

    return losses, accuracies, f1s
Example #9
def _performCV(X, y, sel_SAVs, n_estimators=1000, max_features='auto',
               n_splits=10, ROC_fig='ROC.png', feature_names=None,
               CVseed=666, stratification=None, **kwargs):

    assert stratification in [None, 'protein', 'residue']

    # set classifier
    classifier = RandomForestClassifier(
        n_estimators=n_estimators, max_features=max_features,
        oob_score=True, n_jobs=-1, class_weight='balanced')

    # define folds
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=CVseed)
    CV_folds = []
    for train, test in cv.split(X, y):
        CV_folds.append([train, test])

    # protein stratification: the same protein should not appear in
    # both the training and test sets
    if stratification is not None:
        # for each fold, count occurrences of each protein/residue
        occurrences = {}
        if stratification == 'protein':
            # e.g. 'P01112'
            accs = np.array([s.split()[0] for s in sel_SAVs['SAV_coords']])
        else:
            # e.g. P01112 99
            accs = np.array([' '.join(s.split()[:2])
                             for s in sel_SAVs['SAV_coords']])
        for k, (train, test) in enumerate(CV_folds):
            counts = Counter(accs[test])
            for acc, count in counts.items():
                occurrences.setdefault(acc, np.zeros(n_splits, dtype=int))
                occurrences[acc][k] = count
        # for each acc. number, find fold with largest occurrences
        best_fold = {a: np.argmax(c) for a, c in occurrences.items()}
        new_folds = np.array([best_fold[a] for a in accs])
        # update folds
        for k in range(n_splits):
            CV_folds[k][0] = np.where(new_folds != k)[0]
            CV_folds[k][1] = np.where(new_folds == k)[0]

    # cross-validation loop
    CV_info = {k: [] for k in [
        'AUROC', 'AUPRC', 'OOB score', 'optimal cutoff', 'MCC',
        'precision (0)', 'recall (0)', 'F1 score (0)',
        'precision (1)', 'recall (1)', 'F1 score (1)',
        'precision', 'recall', 'F1 score',
        'feat. importances', 'predictions_0', 'predictions_1']}
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 20)
    i = 0
    for train, test in CV_folds:
        # create training and test datasets
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        # train Random Forest classifier
        classifier.fit(X_train, y_train)
        # calculate probabilities over decision trees
        y_pred = classifier.predict_proba(X_test)[:, 1]

        # compute ROC, AUROC, optimal cutoff (argmax of Youden's index), etc.
        sm = calcScoreMetrics(y_test, y_pred)
        for stat in ['AUROC', 'AUPRC', 'optimal cutoff']:
            CV_info[stat].append(sm[stat])
        # compute Matthews corr. coeff., precision/recall, etc. on classes
        y_pred_binary = np.where(y_pred > sm['optimal cutoff'], 1, 0)
        cm = calcClassMetrics(y_test, y_pred_binary)
        for stat in cm.keys():
            CV_info[stat].append(cm[stat])
        # other info
        mean_tpr += np.interp(mean_fpr, sm['ROC']['FPR'], sm['ROC']['TPR'])
        CV_info['OOB score'].append(classifier.oob_score_)
        CV_info['feat. importances'].append(
            np.array(classifier.feature_importances_))
        CV_info['predictions_0'].extend(y_pred[y_test == 0])
        CV_info['predictions_1'].extend(y_pred[y_test == 1])
        # print log
        i += 1
        LOGGER.info('CV iteration #{:2d}:   '.format(i) +
                    'AUROC = {:.3f}   '.format(sm['AUROC']) +
                    'AUPRC = {:.3f}   '.format(sm['AUPRC']) +
                    'OOB score = {:.3f}'.format(classifier.oob_score_))

    # compute average ROC curves
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[0] = 0.0
    mean_tpr[-1] = 1.0
    # compute average ROC, optimal cutoff and other stats
    stats = {}
    for s in CV_info.keys():
        if s in ['predictions_0', 'predictions_1']:
            continue
        stats[s] = (np.mean(CV_info[s], axis=0), np.std(CV_info[s], axis=0))

    LOGGER.info('-'*60)
    LOGGER.info('Cross-validation summary:')
    LOGGER.info(f'training dataset size:   {len(y):<d}')
    LOGGER.info(f'fraction of positives:   {sum(y)/len(y):.3f}')
    for s in ['AUROC', 'AUPRC', 'OOB score', 'optimal cutoff']:
        if s == 'optimal cutoff':
            fields = ('optimal cutoff*:', stats[s][0], stats[s][1])
        else:
            fields = (f'mean {s}:', stats[s][0], stats[s][1])
        LOGGER.info('{:24} {:.3f} +/- {:.3f}'.format(*fields))
    LOGGER.info("(* argmax of Youden's index)")

    n_feats = len(stats['feat. importances'][0])
    if feature_names is None:
        feature_names = [f'feature {i}' for i in range(n_feats)]
    LOGGER.info('feature importances:')
    for i, feat_name in enumerate(feature_names):
        LOGGER.info('{:>23s}: {:.3f}'.format(
            feat_name, stats['feat. importances'][0][i]))
    LOGGER.info('-'*60)

    path_prob = calcPathogenicityProbs(CV_info, **kwargs)
    CV_summary = {
        'dataset size': len(y),
        'dataset bias': sum(y)/len(y),
        'mean ROC': list(zip(mean_fpr, mean_tpr)),
        'optimal cutoff': stats['optimal cutoff'],
        'feat. importances': stats['feat. importances'],
        'path. probability': path_prob,
        'training dataset': sel_SAVs,
        'folds': CV_folds
    }
    for s in ['AUROC', 'AUPRC', 'OOB score', 'MCC',
              'precision (0)', 'recall (0)', 'F1 score (0)',
              'precision (1)', 'recall (1)', 'F1 score (1)',
              'precision', 'recall', 'F1 score']:
        CV_summary['mean ' + s] = stats[s]

    # plot average ROC
    if ROC_fig is not None:
        print_ROC_figure(ROC_fig, mean_fpr, mean_tpr, stats['AUROC'])

    return CV_summary
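# Note (illustrative, not the original code): the protein stratification above,
# which keeps every SAV of a given protein inside a single fold, can also be
# expressed with sklearn's GroupKFold:
def _group_folds_sketch(X, y, sel_SAVs, n_splits=10):
    from sklearn.model_selection import GroupKFold
    # one group per Uniprot accession, e.g. 'P01112'
    groups = [s.split()[0] for s in sel_SAVs['SAV_coords']]
    return list(GroupKFold(n_splits=n_splits).split(X, y, groups=groups))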
			else:
				Y_raw.append(0)
			X_raw.append(float(row[6]))

print(len(X_raw))
print(len(Y_raw))
X = np.array(X_raw)
X = np.reshape(X, (-1, 1))
Y = np.array(Y_raw)

print(len(X))
print(len(Y))
# print X

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=40)
skf.get_n_splits(X,Y)

# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 42)

index = 0

precision_score_list_LR = list()
recall_score_list_LR = list()
precision_score_list_SVC_poly = list()
recall_score_list_SVC_poly = list()
precision_score_list_RF = list()
recall_score_list_RF = list()
for train_index, test_index in skf.split(X,Y):
	print "########################"
	X_train, X_test = X[train_index], X[test_index]
	# X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    3,
    labels=['normal', 'prediabetes', 'diabetes'])
df['DiabetesPedigreeFunction'] = df['DiabetesPedigreeFunction'].map({
    "normal":
    0,
    "prediabetes":
    1,
    "diabetes":
    2
})
y = df.Outcome
x = df.drop('Outcome', axis=1)

accuracy = []
skf = StratifiedKFold(n_splits=10, random_state=None)
skf.get_n_splits(x, y)

# x is the feature set and y is the target
for train_index, test_index in skf.split(x, y):
    #print ("Train:", train_index, "validation:", test_index)
    X1_train, X1_test = x.iloc[train_index], x.iloc[test_index]
    y1_train, y1_test = y.iloc[train_index], y.iloc[test_index]
    ## standard scaler
    st_x = StandardScaler()
    X1_train = st_x.fit_transform(X1_train)
    X1_test = st_x.transform(X1_test)
    ##PCA
    pca = PCA()
    X1_train = pca.fit_transform(X1_train)
    X1_test = pca.transform(X1_test)
    explained_variance = pca.explained_variance_ratio_
Example #12
def train_stage(df_path, lgb_path, xgb_path, cb_path):

    print('Load Train Data.')
    df = pd.read_csv(df_path)
    print('\nShape of Train Data: {}'.format(df.shape))

    y_df = np.array(df['target'])
    df_ids = np.array(df.index)
    df.drop(['ID_code', 'target'], axis=1, inplace=True)

    lgb_cv_result = np.zeros(df.shape[0])
    xgb_cv_result = np.zeros(df.shape[0])
    cb_cv_result = np.zeros(df.shape[0])

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    skf.get_n_splits(df_ids, y_df)

    print('\nModel Fitting...')
    for counter, ids in enumerate(skf.split(df_ids, y_df)):
        print('\nFold {}'.format(counter + 1))
        X_fit, y_fit = df.values[ids[0]], y_df[ids[0]]
        X_val, y_val = df.values[ids[1]], y_df[ids[1]]

        print('LightGBM')
        lgb_cv_result[ids[1]] += fit_lgb(X_fit,
                                         y_fit,
                                         X_val,
                                         y_val,
                                         counter,
                                         lgb_path,
                                         name='lgb')
        print('XGBoost')
        xgb_cv_result[ids[1]] += fit_xgb(X_fit,
                                         y_fit,
                                         X_val,
                                         y_val,
                                         counter,
                                         xgb_path,
                                         name='xgb')
        print('CatBoost')
        cb_cv_result[ids[1]] += fit_cb(X_fit,
                                       y_fit,
                                       X_val,
                                       y_val,
                                       counter,
                                       cb_path,
                                       name='cb')

        del X_fit, X_val, y_fit, y_val
        gc.collect()

    auc_lgb = round(roc_auc_score(y_df, lgb_cv_result), 4)
    auc_xgb = round(roc_auc_score(y_df, xgb_cv_result), 4)
    auc_cb = round(roc_auc_score(y_df, cb_cv_result), 4)
    auc_mean = round(
        roc_auc_score(y_df,
                      (lgb_cv_result + xgb_cv_result + cb_cv_result) / 3), 4)
    auc_mean_lgb_cb = round(
        roc_auc_score(y_df, (lgb_cv_result + cb_cv_result) / 2), 4)
    print('\nLightGBM VAL AUC: {}'.format(auc_lgb))
    print('XGBoost  VAL AUC: {}'.format(auc_xgb))
    print('Catboost VAL AUC: {}'.format(auc_cb))
    print('Mean Catboost+LightGBM VAL AUC: {}'.format(auc_mean_lgb_cb))
    print('Mean XGBoost+Catboost+LightGBM, VAL AUC: {}\n'.format(auc_mean))

    return 0
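# fit_lgb/fit_xgb/fit_cb are helpers not shown in this excerpt. A minimal sketch
# of the assumed contract for fit_lgb (illustrative only, using LightGBM's
# sklearn API): train on the fold and return validation-fold probabilities.
def fit_lgb_sketch(X_fit, y_fit, X_val, y_val, counter, lgb_path, name='lgb'):
    from lightgbm import LGBMClassifier
    model = LGBMClassifier(n_estimators=500, random_state=42)
    model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)])
    # the real helper presumably also saves the fitted model under lgb_path
    return model.predict_proba(X_val)[:, 1]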
Example #13
del df['length']
del tdf['length']
#del df['doc']
#del tdf['doc']

#Getting the Test and Train data

#Doing a 10 fold split using StratifiedKFold
y_col = 'person'
target = df[y_col].values
test_target = tdf[y_col].values
del df[y_col]
del tdf[y_col]
train_data = df.values
test_data = tdf.values
skf = StratifiedKFold(n_splits=10, shuffle=True)  # skf was not defined in this excerpt; 10 folds per the comment above
skf.get_n_splits(df, target)

#Decision Tree Classfier

decisiontree_classifier = tree.DecisionTreeClassifier()
precision_list = []
recall_list = []
fscore_list = []
for train_index, test_index in skf.split(train_data, target):
    decisiontree_classifier.fit(train_data[train_index], target[train_index])
    y_pred = decisiontree_classifier.predict(train_data[test_index])
    precision, recall, fscore, support = precision_recall_fscore_support(
        target[test_index], y_pred, average='macro')
    precision_list.append(precision)
    recall_list.append(recall)
    fscore_list.append(fscore)
Example #14
def _performCV(X,
               y,
               n_estimators=1000,
               max_features='auto',
               n_splits=10,
               ROC_fig='ROC.png',
               feature_names=None,
               **kwargs):

    # set classifier
    classifier = RandomForestClassifier(n_estimators=n_estimators,
                                        max_features=max_features,
                                        oob_score=True,
                                        class_weight='balanced',
                                        n_jobs=-1)

    # set cross-validation procedure
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=666)

    # cross-validation loop
    CV_info = {
        'AUROC': [],
        'AUPRC': [],
        'feat_importance': [],
        'OOB_score': [],
        'Youden_cutoff': [],
        'predictions_0': [],
        'predictions_1': []
    }
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    i = 0
    for train, test in cv.split(X, y):
        # create training and test datasets
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        # train Random Forest classifier
        classifier.fit(X_train, y_train)
        # calculate probabilities over decision trees
        y_pred = classifier.predict_proba(X_test)
        # compute ROC, AUROC, optimal cutoff (argmax of Youden's index), etc...
        d = calcMetrics(y_test, y_pred[:, 1])
        auroc = d['AUROC']
        auprc = d['AUPRC']
        J_opt = d['optimal cutoff']
        # store other info and metrics for each iteration
        mean_tpr += np.interp(mean_fpr, d['FPR'], d['TPR'])
        CV_info['AUROC'].append(auroc)
        CV_info['AUPRC'].append(auprc)
        CV_info['feat_importance'].append(classifier.feature_importances_)
        CV_info['OOB_score'].append(classifier.oob_score_)
        CV_info['Youden_cutoff'].append(J_opt)
        CV_info['predictions_0'].extend(y_pred[np.where(y_test == 0), 1][0])
        CV_info['predictions_1'].extend(y_pred[np.where(y_test == 1), 1][0])
        # print log
        i += 1
        LOGGER.info(f'CV iteration #{i:2d}:   AUROC = {auroc:.3f}   ' + \
        f'AUPRC = {auprc:.3f}   OOB score = {classifier.oob_score_:.3f}')

    # compute average ROC, optimal cutoff and other stats
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[0] = 0.0
    mean_tpr[-1] = 1.0
    mean_auroc = auc(mean_fpr, mean_tpr)
    mean_auprc = np.mean(CV_info['AUPRC'])
    mean_oob = np.mean(CV_info['OOB_score'])
    avg_J_opt = np.mean(CV_info['Youden_cutoff'])
    std_J_opt = np.std(CV_info['Youden_cutoff'])
    avg_feat_imp = np.mean(np.array(CV_info['feat_importance']), axis=0)
    LOGGER.info('-' * 60)
    LOGGER.info('Cross-validation summary:')
    LOGGER.info(f'training dataset size:   {len(y):<d}')
    LOGGER.info(f'fraction of positives:   {sum(y)/len(y):.3f}')
    LOGGER.info(f'mean AUROC:              {mean_auroc:.3f}')
    LOGGER.info(f'mean AUPRC:              {mean_auprc:.3f}')
    LOGGER.info(f'mean OOB score:          {mean_oob:.3f}')
    LOGGER.info(
        f'optimal cutoff*:         {avg_J_opt:.3f} +/- {std_J_opt:.3f}')
    LOGGER.info("(* argmax of Youden's index)")
    LOGGER.info('feature importances:')
    if feature_names is None:
        feature_names = [f'feature {i}' for i in range(len(avg_feat_imp))]
    for feat_name, importance in zip(feature_names, avg_feat_imp):
        LOGGER.info(f'{feat_name:>23s}: {importance:.3f}')
    LOGGER.info('-' * 60)
    path_prob = calcPathogenicityProbs(CV_info, **kwargs)
    CV_summary = {
        'dataset size': len(y),
        'dataset bias': sum(y) / len(y),
        'mean AUROC': mean_auroc,
        'mean AUPRC': mean_auprc,
        'mean OOB score': mean_oob,
        'mean ROC': list(zip(mean_fpr, mean_tpr)),
        'optimal cutoff': (avg_J_opt, std_J_opt),
        'feat. importance': avg_feat_imp,
        'path. probability': path_prob
    }

    # plot average ROC
    if ROC_fig is not None:
        print_ROC_figure(ROC_fig, mean_fpr, mean_tpr, mean_auroc)

    return CV_summary
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-file', type=str, default="data/train.csv", help="")
    parser.add_argument('--test-file', type=str, default="data/test.csv", help="")
    parser.add_argument('--model-type', type=str, default="SVM", help="")
    parser.add_argument('--train-or-test', type=str, default="train", help="")
    parser.add_argument('--result-file', type=str, default="result/result.csv", help="")
    args = parser.parse_args()
    train_file=args.train_file
    test_file=args.test_file
    model_type=args.model_type
    train_or_test=args.train_or_test
    result_file=args.result_file

    print("Loading training data...")
    X, Y = read_train(train_file)
    X = preprocessing.scale(X)
    #pca = PCA(n_components=2048)
    #pca.fit(X)
    #print(pca.explained_variance_ratio_)
    #X = pca.transform(X)
    print("Finish loading training data!")
    # model_type = {RC: RidgeClassifer, KNN:KNeighbors, GNB:Gaussian Naive-Bayes, LR:Logistic Regression, LDA:Linear Discriminant Analysis, SVM:Support Vector Machine, MLP:Multi-layer Perceptron, EL:Ensemble Learning}
    if model_type == 'RC':
        ALPHA=10                                                                #50:67.32, 40:66.02, 30:64.71, 25:64.38, 20:64.43 10:66.35, 1:72.49
        print("alpha: {}".format(ALPHA))
        model = RidgeClassifier(alpha=ALPHA, normalize=True)                     #0.10,0,11:0.986923, 0.12,0.13,0.14:0.987179, 0.15:0.987051
    elif model_type == 'KNN':
        model = KNeighborsClassifier(n_neighbors=12, n_jobs=4)                      #0.975000,
    elif model_type == 'GNB':
        model = GaussianNB()                                                        #0.925897,
    elif model_type == 'LR':
        C=0.0005
        print("C: {}".format(C))
        model = LogisticRegression(C=C)                                             #1.0:0.9923077
    elif model_type == 'LDA':
        model = LinearDiscriminantAnalysis() #QuadraticDiscriminantAnalysis()       #0.978077
    elif model_type == 'SVM':
        #model = SVC(C=3.0, kernel='rbf', gamma='auto')                              #0.985890,0.987180,0.987436
        C=1e-4
        print("C: {}".format(C))
        model = LinearSVC(C=C)                                                  #0.001:0.988590,0.00075:0.988718
    elif model_type == 'MLP':
        model = MLPClassifier(random_state=1, max_iter=500, tol=1e-4, hidden_layer_sizes=(256,256), activation='relu',
                         solver='adam', alpha=1e-4, batch_size=256, learning_rate_init=0.0005, learning_rate='adaptive')  #0.986667,0.987051
    else: # EL:Ensemble Learning
        model = VotingClassifier( estimators=[("LR", LogisticRegression(C=0.001)), ("RC", RidgeClassifier(alpha=10, normalize=True)), ("SVM", LinearSVC(C=1e-4))],
                 voting="hard", n_jobs=-1)

    '''
    X1, Y1 = shuffle(X, Y, random_state=1)
    N=780
    X_train, X_dev = X1[N:], X1[:N]
    Y_train, Y_dev = Y1[N:], Y1[:N]
    model.fit(X_train, Y_train)
    acc = model.score(X_dev,Y_dev)
    print("Accuracy: {}".format(acc))
    #'''
    
    if train_or_test == 'train':
        k=10 #kFold
        skf=StratifiedKFold(n_splits=k, random_state=1, shuffle=True)
        #skf=KFold(n_splits=k)
        skf.get_n_splits(X,Y)
        print(skf)
        sum_acc = 0.0
        fold = 1
        for train_index, dev_index in skf.split(X,Y):
            #print("Train Index:", train_index, ",dev Index:", dev_index)
            X_train, X_dev = X[train_index], X[dev_index]
            Y_train, Y_dev = Y[train_index], Y[dev_index]
            model.fit(X_train, Y_train)
            acc = model.score(X_dev,Y_dev)
            sum_acc += acc
            print("Fold {} accuracy: {}".format(fold, acc))
            fold += 1
        average_acc = sum_acc/k
        print("Average accuracy: {}".format(average_acc))
    else:
        print("Training...")
        model.fit(X, Y)
        print("Finish training. Now testing...")
        ids, X_test = read_test(test_file)
        X_test = preprocessing.scale(X_test)
        #X_test = pca.transform(X_test)
        Y_pred = model.predict(X_test)
        writeResultCsv(ids,Y_pred,result_file)
        print("Finish testing!")
Example #16
def gbdt_cv_modeling():
    """

    :return:
    """

    '''Data input'''
    data_b_train = pd.read_csv('../data/B_train_final.csv', index_col='no')
    data_test = pd.read_csv('../data/B_test_final.csv', index_col='no')

    data_train = data_b_train

    data_train_without_label = data_train.drop('flag', axis=1)
    frames = [data_train_without_label, data_test]

    '''Shuffle the training set with a fixed random seed'''
    s = 0
    np.random.seed(s)
    sampler = np.random.permutation(len(data_train.values))
    data_train_randomized = data_train.take(sampler)

    feature_name = list(data_train.columns.values)
    '''Fill missing values'''
    data_train_filled = data_train_randomized.fillna(value=10)

    '''Construct the training and test sets'''
    x_temp = data_train_filled.iloc[:, :-1].values  # features
    y = data_train_filled.iloc[:, -1].values  # target

    '''Feature selection'''
    X, dropped_feature_name, len_feature_choose = lgb_feature_selection(feature_name, x_temp, y, '0.1*mean')

    '''Process the validation set B_test'''
    data_test_filled = data_test.fillna(value=10)
    data_test_filled_after_feature_selection = data_test_feature_drop(data_test_filled, dropped_feature_name)

    '''Split train/test data sets'''
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # stratified sampling; cv stands for cross-validation

    '''Choose a classification model'''
    parameter_n_estimators = 400
    classifier = GradientBoostingClassifier(n_estimators=parameter_n_estimators)

    '''Model fit, predict and ROC'''
    colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
    lw = 2
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 500)
    i_of_roc = 0
    a = 0

    probability_set_of_b_test = []

    for (train_indice, test_indice), color in zip(cv.split(X, y), colors):
        a_model = classifier.fit(X[train_indice], y[train_indice])

        probas_ = a_model.predict_proba(X[test_indice])

        prob_of_b_test = a_model.predict_proba(data_test_filled_after_feature_selection)  # predict on B_test

        probability_set_of_b_test.append(prob_of_b_test[:, 1])

        fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1])

        a += 1  # increment the fold counter

        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0

        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color, label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc))
        i_of_roc += 1

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')

    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('mean_auc=' + str(mean_auc))
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--', label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw)

    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    plt.title('ROC_rd_' + str(s) + '_gbdt_' + str(len_feature_choose) + '_features')
    plt.legend(loc="lower right")
    plt.show()

    avg_prob = (probability_set_of_b_test[0] + probability_set_of_b_test[1] + probability_set_of_b_test[2] +
                probability_set_of_b_test[3] + probability_set_of_b_test[4]) * 1.0 / 5

    result_file_name = '../result/B_test_gbdt_predict_cv_fillna_10_rd_' + str(s) + '_N_' + str(parameter_n_estimators) + '_features_' + \
                       str(len_feature_choose) + '.csv'
Example #17
#with open ('index_label_tuples.json') as fh:
#    index_label_tuples = json.load(fh)

train_size = int(1* len(index_label_tuples))
y = np.array([index_label_tuples[i][1] for i in range(len(index_label_tuples))])
x = np.array([index_label_tuples[i][0] for i in range(train_size)])
print('before padding', x.shape)
print('Pad sequences (samples x time)')
x = sequence.pad_sequences(x, maxlen=maxlen)
print('x shape:', x.shape)


print('Build model...')
# k-fold cross-validation on the model
kfold = StratifiedKFold(n_splits=2, shuffle=True)
kfold.get_n_splits(x,y)
cvscores= []
print(kfold)
D1 = pd.DataFrame([])
D2 = pd.DataFrame([])
for train, test in kfold.split(x,y):
    y_test = y[test]
    model = Sequential()
    model.add(Embedding(max_features, 128, dropout=0.2))
    model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
    model.add(Dense(3, activation='softmax'))

# try using different optimizers and different optimizer configs
    #D2 = D2.append(pd.DataFrame(y[test]))
    model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
Example #18
    def split_files_cv(self,
                       fasta,
                       top_peaks_pwm,
                       num_splits,
                       name,
                       bed=False):
        def correction(line):
            try:
                regular = re.match(
                    r'([0-9]+)::(.+):([0-9]+)-([0-9]+)\([+,-]\)', line)
                return 'chr{}'.format(
                    regular.group(2)), regular.group(3), regular.group(4)
            except:
                print(
                    "this line doesn't match the regular expression for bed cg%: ",
                    line)

        def create_files(filetype, n_CV, dataframe):
            with open(filetype + name + str(n_CV) + '.bed', 'w') as finalbed:
                with open(filetype + name + str(n_CV) + '.fa', 'w') as finalfa:
                    for index, row in dataframe.iterrows():
                        finalbed.write('{}\t{}\t{}\t{}\n'.format(
                            row.chr, row.start, row.end, row.id))
                        finalfa.write('>{}\n{}\n'.format(row.id, row.sequence))

        with open(fasta, 'r') as totalfastafile:
            df = pandas.read_csv(totalfastafile,
                                 sep='>',
                                 names=['sequence', 'id'])
        dfid = df['id'].dropna().reset_index(drop=True)
        dfsequence = df['sequence'].dropna().reset_index(drop=True)
        dfasta = pandas.merge(dfid,
                              dfsequence,
                              left_index=True,
                              right_index=True)

        if 'foreground' in name:

            with open(bed, 'r') as totalbedfile:
                dfbed = pandas.read_csv(totalbedfile,
                                        sep='\t',
                                        names=['chr', 'start', 'end', 'id'])
            df = pandas.merge(dfbed, dfasta, on="id")
            del dfid, dfsequence, dfasta, dfbed
            self.n_samples, _ = df.shape

        elif 'cg' in name:

            if self.n_samples is None:
                raise ValueError(
                    'total number of peaks in foreground is not defined, '
                    'run foreground before background')
            df = dfasta
            # create all the bed columns based on >names from random sequences fasta in the dataframe
            df['chr'], df['start'], df['end'] = zip(*df['id'].map(correction))
            true_mean_gc = round(self.gc_total)
            print(true_mean_gc, 'rounded total mean')
            # from the 1000000-sequence list, keep only sequences whose rounded GC content equals the peaks' mean
            df = df.loc[df['sequence'].apply(lambda x: round(GC(x))) ==
                        true_mean_gc].reset_index(drop=True)
            print(df.shape, 'hola')
            print(df.head())
            # keep only as much sequences as in seq peaks file
            df = df.loc[:self.n_samples - 1]
            print(df.shape, 'qtal')
            print(df.head())

        df['class'] = np.hstack(([1] * self.n_samples))
        # delete the peaks used to create the PWM
        df = df.loc[int(top_peaks_pwm - 1):].reset_index(drop=True)
        print(df.shape)
        skf = StratifiedKFold(n_splits=num_splits, shuffle=True)
        skf.get_n_splits(df, df['class'])
        for count, (train_index,
                    test_index) in enumerate(skf.split(df, df['class'])):
            X_train, X_test = df.loc[train_index], df.loc[test_index]
            create_files('training_', count, X_train)
            create_files('testing_', count, X_test)
Example #19
def authentication(data,data_flip,labels,thread_cnt,data_filename):
    print("Authentication")

    # Get k-fold split of dataset (k=2)
    cv = StratifiedKFold(n_splits=2, shuffle=False)
    cv.get_n_splits(data,labels)

    ### Perform k-fold cross validation
    y_prob = np.array([])
    y_pred = np.array([])
    y_true = np.array([])
    for k,(train_index,test_index) in enumerate(cv.split(data,labels)):
        print("     Fold - " + str(k))

        # Get training and testing sets
        train = np.vstack([data[train_index,:],data_flip[train_index,:]])
        train_labels = np.append(labels[train_index],labels[train_index])
        test = data[test_index,:]
        test_labels = labels[test_index]

        # Normalize to z-scores
        mu = np.mean(train,axis=0)
        std = np.std(train,axis=0)
        train = (train - mu) / std
        test = (test - mu) / std

        # Get training classes
        classes = np.unique(train_labels)
        classes_split = list(split_list(classes.tolist(),thread_cnt))

        ### TRAINING
        # Binary SVM for each class
        class_svms = []
        c_idxes = []
        threads = []
        que = Queue()

        # Thread to train each class binary SVM
        for li in classes_split:
            for i,c in enumerate(li):
                threads.append(Thread(target=authentication_train,args=(c,train,train_labels,que)))
                threads[-1].start()
            
            # Collect training thread results
            _ = [ t.join() for t in threads ]
            while not que.empty():
                (c_idx,svm) = que.get()
                c_idxes.append(c_idx)
                class_svms.append(svm)

        ### TESTING
        threads = []
        que = Queue()
        for li in classes_split:
            for i,c in enumerate(li):
                c_idx = c_idxes.index(c)
                threads.append(Thread(target=authentication_test,args=(c,class_svms[c_idx],test,test_labels,que)))
                threads[-1].start()

            # Collect testing thread results
            _ = [ t.join() for t in threads ]
            while not que.empty():
                result = que.get()

                c = int(result[2])
                c_prob = result[0]
                c_true = result[1]
                c_pred = np.zeros(c_prob.shape[0])
                c_pred[c_prob<0.5] = 1

                y_prob = np.append(y_prob,c_prob)
                y_true = np.append(y_true,c_true)
                y_pred = np.append(y_pred,c_pred)
    
    print()

    ### OVERALL RESULTS    
    TP, FN, FP, TN = metrics.confusion_matrix(y_true,y_pred,labels=[0,1]).ravel()
    ACC = (TP + TN) / (TP + TN + FP + FN)
    FAR = FP / (FP + TN)
    FRR = FN / (FN + TP)

    fpr, tpr, thresholds = metrics.roc_curve(y_true,y_prob,pos_label=0)
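    # EER is the operating point where FPR equals FNR (= 1 - TPR); brentq finds
    # the root of 1 - x - TPR(x) along the interpolated ROC curve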
    EER = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
    EER_thresh = interp1d(fpr, thresholds)(EER)
    y_prob = np.ones(y_prob.shape) - y_prob
    AUC = metrics.roc_auc_score(y_true,y_prob)
    
    # Print results
    print(data_filename)
    print("--------------------------------------------------------------------------------------")
    print("Authentication Results:")
    print("TP: " + str(TP) + "\n" +
    "FP: " + str(FP) + "\n" +
    "FN: " + str(FN) + "\n" +
    "TN: " + str(TN) + "\n" +
    "ACC: " + str(ACC) + "\n" +
    "FAR: " + str(FAR) + "\n" +
    "FRR: " + str(FRR) + "\n" +
    "AUC: " + str(AUC) + "\n" +
    "EER: " + str(EER) + "\n" +
    "EER_thresh " + str(EER_thresh))
    print()
Example #20
                          }}




all_data , y_train = encode_dataset(train=train,test=test,meta=meta,target_model='lightgbm')
print("*****************************")
print(all_data.head())
train_obs = len(y_train)
train = all_data[:train_obs]
test = all_data[train_obs:]
train_ids = train.index
test_ids  = test.index

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skf.get_n_splits(train_ids, y_train)

lgb_test_result  = np.zeros(test_ids.shape[0])
#lgb_train_result = np.zeros(train_ids.shape[0])
#xgb_test_result  = np.zeros(test_ids.shape[0])
#xgb_train_result = np.zeros(train_ids.shape[0])
counter = 0
#Transform data using small groups to reduce memory usage
m = 100000
print('\nLightGBM\n')

for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}\n'.format(counter + 1))
    print("**************************") 
    print("train_index:",train_index)
    print("**************************")
Example #21
            for p in range(samples):
                data[cont, :, :, :] = getPatch(img, patch_dim, rows[p],
                                               cols[p])
                labels[cont] = i - 1
                cont += 1

data /= 255

crossval_splits = 5
accuracy = numpy.zeros(crossval_splits)
sensitivity = numpy.zeros(crossval_splits)
specificity = numpy.zeros(crossval_splits)
cont = 0

skf = StratifiedKFold(n_splits=crossval_splits, shuffle=True, random_state=123)
skf.get_n_splits(data, labels)

for train_index, test_index in skf.split(data, labels):
    train_data, test_data = data[train_index], data[test_index]
    train_labels, test_labels = labels[train_index], labels[test_index]
    train_labels = keras.utils.to_categorical(train_labels,
                                              num_classes=no_classes)

    #Convolutional Neural Network
    # create model
    kernel1 = 3
    kernel2 = 5
    no_filters1 = 20
    no_filters2 = 40
    model = keras.models.Sequential()
    #First Convolutional Layer
Example #22
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = ["a", "b", "c", "d"]
y = [1, 1, 2, 2]
skf = StratifiedKFold(n_splits=2)
#for train, test in skf.split(X, y):
#    print("%s %s" % (train, test))

splits = skf.get_n_splits(X,y)

print(splits)
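# Iterating the folds (a working version of the commented-out loop above):
for train_index, test_index in skf.split(X, y):
    print("train: %s test: %s" % (train_index, test_index))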
Example #23
    def validation(self,
                   X,
                   Y,
                   cat_features,
                   method=1,
                   verbose=False,
                   n_folds=5,
                   short=True):
        """
        validation method, you can choose between different validation strategies

        Args:
            X: pandas.DataFrame, shape = (, 24)
            Y: pandas.Series
            method number: [1,2,3] # deprecated for ensemble
            cat_features: [9,10,11] see .train docstring
            n_folds: > 2


        Always uses k-fold; if n_folds < 2 it is automatically bumped to 2.

        NOTE:
        https://www.youtube.com/watch?v=pA6uXzrDSUs&index=23&list=PLpQWTe-45nxL3bhyAJMEs90KF_gZmuqtm
        """
        if verbose:
            print("{} [{}.validation] start validation method {}".format(
                ctime(), self.name, method))
        validation_score = 0

        if n_folds < 2: n_folds = 2

        from sklearn.model_selection import StratifiedKFold
        splitclass = StratifiedKFold(n_splits=n_folds)

        # the following 20 lines come from sklearn docs example
        # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
        for train_index, test_index in splitclass.split(X, Y):

            train_X, train_Y = X.loc[train_index], Y.loc[train_index]
            validation_X, validation_Y = X.loc[test_index], Y.loc[test_index]

            assert train_X.shape[0] == train_Y.shape[0]
            assert validation_X.shape[0] == validation_Y.shape[0]

            train_X.reset_index(drop=True, inplace=True)
            train_Y.reset_index(drop=True, inplace=True)
            validation_X.reset_index(drop=True, inplace=True)
            validation_Y.reset_index(drop=True, inplace=True)
            self.meta_predict(train_X,
                              train_Y,
                              validation_X,
                              cat_features,
                              short=short)
            score = self.evaluate(validation_Y)

            if verbose:
                print("{} [{}.validation] single score = {} ".format(
                    ctime(), self.name, score))
            validation_score += score

        # the total validation score is an average of the single validation scores
        validation_score /= splitclass.get_n_splits(X)
        self.validation_score = validation_score

        if verbose:
            print("{} [{}.validation] validation score = {} ".format(
                ctime(), self.name, validation_score))
        if verbose:
            print("{} [{}.validation] finished validation method {}".format(
                ctime(), self.name, method))

        return validation_score
Example #24
                               callbacks=[early_stopping])
    Y_score = bilstm_model.predict(X[test])
    histories.append(history)
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(encoded_Y[test], Y_score)
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    pyp.plot(fpr,
             tpr,
             lw=lw,
             color=color,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

pyp.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
mean_tpr /= kfold.get_n_splits(X, encoded_Y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
print('ROC AUC: %.2f' % mean_auc)

#pyp.plot(mean_fpr, mean_tpr, color='g', linestyle='--',label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
#pyp.xlim([0, 1.0])
#pyp.ylim([0, 1.0])
#pyp.xlabel('False Positive Rate')
#pyp.ylabel('True Positive Rate')
#pyp.title('Receiver operating characteristic example')
#pyp.legend(loc="lower right")
#pyp.show()
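
# The fold-averaging pattern used above, isolated as a minimal runnable sketch
# (synthetic folds; the names below are illustrative, not from the source):
import numpy as np
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(0)
folds = [(rng.randint(0, 2, 100), rng.rand(100)) for _ in range(5)]

mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for y_true, y_score in folds:
    fpr, tpr, _ = roc_curve(y_true, y_score)
    mean_tpr += np.interp(mean_fpr, fpr, tpr)  # resample each fold's ROC onto a common grid
    mean_tpr[0] = 0.0
mean_tpr /= len(folds)
mean_tpr[-1] = 1.0
print('mean AUC: %.2f' % auc(mean_fpr, mean_tpr))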

#over all events
#momentum_input['signal'] = np.zeros((len(full_event['signal']), n_cand_per_jet,
def xgb_lgb_cv_modeling():
    """
    :return:
    """

    '''Data input'''
    data_train = pd.read_csv('../data/train.csv', index_col='ID')
    data_predict = pd.read_csv('../data/pred.csv', index_col='ID')

    '''trainset feature engineering: adapt this to the specific dataset'''
    data_train_without_label = data_train.drop('Label', axis=1)
    
    '''Sample'''
    # s = 0
    # np.random.seed(s)
    # sampler = np.random.permutation(len(data_train_without_label.values))
    # data_train_randomized = data_train_without_label.take(sampler)

    feature_name = list(data_train_without_label.columns.values)
    data_predict_user_id = list(data_predict.index.values)

    '''fillna'''
    frames = [data_train_without_label, data_predict]
    data_all = pd.concat(frames)
    data_train_filled = data_train_without_label.fillna(value=data_all.median())

    '''construct train and test dataset'''
    x_temp = data_train_filled.iloc[:, :].values  # independent variables
    y = data_train.iloc[:, -1].values  # dependent variable

    '''Feature selection'''
    X, dropped_feature_name, len_feature_choose = xgb_feature_selection(feature_name, x_temp, y, '0.1*mean')
    # '0.1*mean' selects 10 features
    # '0.00001*mean' selects 14 features

    '''online test dataset -- B_test'''
    # del data_predict['V17']
    # data_predict['UserInfo_242x40'] = data_predict['UserInfo_242'] * data_predict['UserInfo_40']

    data_predict_filled = data_predict.fillna(value=data_all.median())
    data_predict_filled_after_feature_selection = data_test_feature_drop(data_predict_filled, dropped_feature_name)

    '''Split train/test data sets'''
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # stratified sampling; "cv" stands for cross-validation

    '''Choose a classification model'''
    parameter_n_estimators = 100
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators, learning_rate=0.1)

    '''hyperparameter optimization'''
    # param = {
    #     'max_depth': 6,
    #     'num_leaves': 64,
    #     'learning_rate': 0.03,
    #     'scale_pos_weight': 1,
    #     'num_threads': 40,
    #     'objective': 'binary',
    #     'bagging_fraction': 0.7,
    #     'bagging_freq': 1,
    #     'min_sum_hessian_in_leaf': 100
    # }
    #
    # param['is_unbalance'] = 'true'
    # param['metric'] = 'auc'

    # (1) num_leaves
    #
    # LightGBM grows trees leaf-wise, so tree complexity is tuned through num_leaves rather than max_depth.
    #
    # Rough conversion: num_leaves = 2 ^ (max_depth)
    #
    # (2) Imbalanced class distributions: set param['is_unbalance'] = 'true'
    #
    # (3) Bagging parameters: bagging_fraction + bagging_freq (must be set together), feature_fraction
    #
    # (4) min_data_in_leaf, min_sum_hessian_in_leaf
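
    # A hedged illustration of notes (1)-(4) above as a LightGBM param dict
    # (values are assumptions for illustration, not tuned for this dataset):
    # param_example = {
    #     'num_leaves': 2 ** 6,             # (1) roughly equivalent to max_depth = 6
    #     'is_unbalance': 'true',           # (2) for imbalanced class distributions
    #     'bagging_fraction': 0.8,          # (3) only takes effect together with bagging_freq
    #     'bagging_freq': 5,
    #     'feature_fraction': 0.9,
    #     'min_data_in_leaf': 20,           # (4) leaf-level regularization
    #     'min_sum_hessian_in_leaf': 100,
    # }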

    '''Model fit, predict and ROC'''
    colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
    lw = 2
    mean_f1 = 0.0
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 500)
    i_of_roc = 0
    a = 0

    th = 0.5

    for (train_indice, test_indice), color in zip(cv.split(X, y), colors):
        a_model = classifier.fit(X[train_indice], y[train_indice])

        # y_predict_label = a_model.predict(X[test_indice])

        probas_ = a_model.predict_proba(X[test_indice])

        fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1])

        a += 1

        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0

        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color, label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc))
        i_of_roc += 1

        # binarize the positive-class probabilities with threshold th
        lt = (probas_[:, 1] > th).astype('int32')
        f1 = f1_score(y[test_indice], lt)
        mean_f1 += f1

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')

    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('mean_auc=' + str(mean_auc))
    print('mean_f1=' + str(mean_f1/5))
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--', label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw)
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate (mean F1: ' + str(mean_f1 / 5) + ')')
    plt.ylabel('True Positive Rate')

    plt.title('ROC_gbdt_' + str(len_feature_choose) + '_features_f1_' + str(mean_f1/5))
    plt.legend(loc="lower right")
    plt.savefig('../result/pred_ROC_XL' + '_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) +
                '_proba_to_label_using_th_' + str(th) + '.png')
    # plt.show()

    a_model = classifier.fit(X, y)

    # label_predict = a_model.predict(data_predict_filled_after_feature_selection)  # 对B_test进行预测
    proba_predict = a_model.predict_proba(data_predict_filled_after_feature_selection)

    '''proba result'''
    result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + '_proba.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id, proba_predict[:, 1].tolist())

    # '''write out the results for submission'''
    # result_file_name = '../result/pred_result_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + '.csv'
    # write_predict_results_to_csv(result_file_name, data_predict_user_id, label_predict.tolist())

    '''results file'''
    # binarize the submission probabilities with threshold th
    lt = (proba_predict[:, 1] > th).astype('int32')
    result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + \
                       '_proba_to_label_using_th_' + str(th) + '.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id, lt.tolist())
Example #26
0
              np.sum(np.abs(prediction_result2 - data_labels)))

        # Cross validate - kNN - All data
        knn_all = KNeighborsClassifier(n_neighbors=knn_k)
        knn_scores_all = cross_val_score(knn_all,
                                         sat_data,
                                         data_labels,
                                         cv=crossval_kfold)
        # Add to output dict
        print('Accuracy, mean of ' + str(crossval_split_k) + '-fold split= ',
              np.mean(knn_scores_all))
        knn_mean_acc[dataset_use] = np.mean(knn_scores_all)

        # Get split for confusion matrix calculation
        skf = StratifiedKFold(n_splits=crossval_split_k)
        skf.get_n_splits(sat_data, labels)
        # Initialize output confusion matrix and kappa
        knn_all_confmat = np.zeros((n_classes, n_classes))
        knn_all_kappa = []
        # Use split
        for train_index, test_index in skf.split(sat_data, labels):
            # Split into training and test set
            y_train, y_test = labels[train_index], labels[test_index]
            X_train, X_test = sat_data[train_index], sat_data[test_index]
            # Fit classifier
            knn_all.fit(X_train, y_train)
            # Do prediction
            y_pred = knn_all.predict(X_test)
            # Calculate confusion matrix
            conf_mat_temp = confusion_matrix(y_test, y_pred)
            # Add contribution to overall confusion matrix
            knn_all_confmat += conf_mat_temp
Example #27
0
def search_models(training_i, x, train_columns):
    """
        This function computes the performance using 5 baseline models. 
        It was done through stratified k-folds cross-validation using the complication's respective training set, with k= 3. 
        We performed random hyperparameter search for all the hyperparameters over 20 iterations. 
        We finally selected the top two set of hyperparameters that achieved the highest average area under the receiving operator characteristic curve (AUROC) on the validation sets, 
        resulting with 6 final models per complication. The function below returns the 6 models for each respective training subset.
        """

    baselines = ["Logistic Regression", "KNN", "LGBM", "SVM"]
    baselines_need_transforms = ["Logistic Regression"]
    baselines_need_transforms2 = ["KNN", "SVM"]

    X = training_i[train_columns]
    Y = training_i[x]

    n_iterations = 30  # number of iterations for random search
    top_n = 2  # select top n parameter sets
    all_ = {}

    # prepare indexes for stratified cross validation
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    #         skf = ShuffleSplit(n_splits= 3, random_state=0)

    skf.get_n_splits(X, Y)

    print("Stratified K Fold into 3 splits ...")

    for base in (baselines):
        print(f"Search in progress : {base}")

        roc_auc_mean, auprc_mean, dict_list, model_list, train_list, val_list = (
            [] for i in range(6))
        models_1, vals_1, trains_1 = ([] for i in range(3))

        for i in range(0, n_iterations):
            if ((i + 1) % 10 == 0):
                print(f"Random search {i+1}...")

            skf_split = skf.split(X, Y)
            param_dictionary, model, X_ = choose_baseline(base, X)

            roc_in_k, pr_in_k, clf_in_k, train_in_k, val_in_k = (
                [] for i in range(5))
            j = 0

            for train_index, val_index in skf_split:
                X_train = X_.iloc[train_index]
                y_train = Y.iloc[train_index]

                X_val = X_.iloc[val_index]
                y_val = Y.iloc[val_index]
                if (base in baselines_need_transforms):  # compare the baseline name, not the model object

                    X_val, X_train = apply_transforms_MinMax_Scaler(
                        X_val, X_train, train_columns)

                if (base in baselines_need_transforms2):

                    X_val, X_train = apply_transforms_STD_Scaler(
                        X_val, X_train, train_columns)

                clf = model(**param_dictionary)

                clf = clf.fit(X_train, y_train)

                # predicting
                y_pred = clf.predict_proba(X_val)[:, 1]

                # calculate performance across folds
                roc = roc_auc_score(y_val, y_pred)
                AUPRC = average_precision_score(y_val, y_pred)
                pr_in_k.append(AUPRC)
                roc_in_k.append(roc)
                roc_array = np.asarray(roc_in_k)
                pr_array = np.asarray(pr_in_k)
                clf_in_k.append(clf)
                train_in_k.append(train_index)
                val_in_k.append(val_index)
                j = j + 1

            # append the lists for each hyperparameter search

            roc_auc_mean.append(roc_array.mean())
            auprc_mean.append(pr_array.mean())
            dict_list.append(param_dictionary)
            val_list.append(val_in_k)
            train_list.append(train_in_k)
            model_list.append(clf_in_k)
            gc.collect()

    # Storing results for this model
        print(f"Storing results of top models for {base}")
        results_pd = pd.DataFrame({
            "avg_roc_auc": roc_auc_mean,
            "avg_auprc": auprc_mean,
            "clf_s": model_list,
            "validation_sets": val_list,
            "train_sets": train_list
        })

        results_pd.sort_values("avg_roc_auc",
                               ascending=False,
                               axis=0,
                               inplace=True)
        top_pd = results_pd.head(top_n)
        models_1.append(top_pd['clf_s'].values[0:3][:6])
        vals_1.append(top_pd['validation_sets'].values[0:3][:6])
        trains_1.append(top_pd['train_sets'].values[0:3][:6])
        param_df = pd.DataFrame()
        param_df["models"] = models_1
        param_df["vals"] = vals_1
        param_df["trains"] = trains_1
        param_df["auc_val"] = top_pd.avg_roc_auc.mean()
        param_df["auprc_val"] = top_pd.avg_auprc.mean()

        val_sets, train_sets, models_ = ([] for i in range(3))

        for i in range(len(param_df.vals[0])):
            for j in (param_df.vals[0][i]):
                val_sets.append(j)
        for i in range(len(param_df.trains[0])):
            for j in (param_df.trains[0][i]):
                train_sets.append(j)

        for i in range(len(param_df.models[0])):
            for j in (param_df.models[0][i]):
                models_.append(j)


        # store top models and performance for all the different types of models
        all_[base] = [
            models_, param_df["auc_val"].values, param_df["auprc_val"].values,
            val_sets, train_sets
        ]

    return (all_)
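
# A hedged usage sketch for search_models (names are hypothetical: training_i
# is a complication-specific training DataFrame, 'label_col' its target column,
# and train_columns its feature columns):
# results = search_models(training_i, 'label_col', train_columns)
# lr_models = results['Logistic Regression'][0]  # the six fitted models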
Example #28
0
def predict():

    #importing our already trained model using pickle
    #    with open ('log_model', 'rb') as f:
    #        lr_model = pickle.load(f)

    #importing the dataset as a corpus using the pandas library
    data = pd.read_csv('data.csv')
    data.head()

    #inspecting the data to see what it looks like
    data['Body'][0]

    data['Body'][:7]

    #Looking at the data, there are some missing columns, so let's take care of that
    data = data.fillna('Article unavailable')  # fillna returns a copy; assign it back

    #data cleaning with text preprocessing techniques
    #data cleaning first round
    #using regular expressions and string to clean

    #function for first round of data cleaning
    def clean_text_round1(text):
        text = str(text).lower()  #making all text lowercase
        text = re.sub(r'\[.*?\]', '', text)  #removing text in square brackets
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  #removing punctuation
        text = re.sub(r'\w*\d\w*', '', text)  #removing words that contain digits
        return text

    round1 = lambda x: clean_text_round1(x)

    #Let's take a look at the updated text
    data_clean = pd.DataFrame(data.Body.apply(round1))
    data_clean

    #let's apply a second round of cleaning because some nonsensical text was ignored in the first clean
    def clean_text_round2(text):
        text = re.sub('[' '""...]', '', text)  #removing quotation marks and ellipsis dots
        text = re.sub('\n', '', text)  #removing newlines
        return text

    round2 = lambda x: clean_text_round2(x)

    #let's take a look at the updated text again
    data_clean = pd.DataFrame(data_clean.Body.apply(round2))
    data_clean['Body'][0]

    #Concatenating our cleaned data to our corpus
    data['clean_body'] = data_clean
    data['clean_body'][0]

    #Extract features and target variables
    import numpy as np

    X = np.array(data['clean_body'])  #feature variable (the cleaned article text)
    y = np.array(data['Label'])  #target variable

    y = list(map(int, y))

    #Split the data into folds
    from sklearn.model_selection import StratifiedKFold

    kf = StratifiedKFold(n_splits=2)
    kf.get_n_splits(X)  # note: kf is not used below; train_test_split performs the split instead

    #Split the data into train and test
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    #create a document-term matrix for the train and test data using tfidf vectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    tf = TfidfVectorizer(stop_words='english',
                         max_df=0.7)  #removing all English stop words
    tf_train = tf.fit_transform(X_train)
    tf_test = tf.transform(X_test)

    #get feature names
    #tf.get_feature_names()

    #Now we feed our data into our classifiers to develop our model.
    #First we try the Naive Bayes classifier
    from sklearn.naive_bayes import MultinomialNB

    nb = MultinomialNB()

    #training the model
    nb.fit(tf_train, y_train)

    #predicting
    nb_pred = nb.predict(tf_test)
    nb_pred[0:10]

    #Evaluating the accuracy of the model
    nb_score = nb.score(tf_test, y_test)
    print('accuracy: %0.3f' % nb_score)

    #Next let's build another model using logistic Regression
    from sklearn.linear_model import LogisticRegression

    lr = LogisticRegression()

    #training the model
    lr.fit(tf_train, y_train)

    #predicting
    lr_pred = lr.predict(tf_test)
    lr_pred[0:10]

    #Evaluating the accuracy of the logistic regression model
    lr_score = lr.score(tf_test, y_test)
    print('accuracy: %0.3f' % lr_score)

    #Using the random forest classifier
    from sklearn.ensemble import RandomForestClassifier

    rf = RandomForestClassifier()

    #training the model
    rf.fit(tf_train, y_train)

    #predicting
    rf_pred = rf.predict(tf_test)
    rf_pred[0:10]

    #Evaluating the accuracy of the model
    rf_score = rf.score(tf_test, y_test)
    print('accuracy: %0.3f' % rf_score)

    #let's pickle the data_dtm for future use
    #data_dtm.to_pickle("dtm.pkl")

    #Since the logistic regression model is the most accurate classifier, lets save it and test it with
    #other news articles

    #saving the model

    with open('log_model', 'wb') as f:
        pickle.dump(lr, f)

    #importing the model and testing

    with open('log_model', 'rb') as f:
        lr_model = pickle.load(f)

    if request.method == 'POST':
        article = request.form['article']
        input_article = [article]
        vect = tf.transform(input_article).toarray()
        lr_predict = lr_model.predict(vect)

    return render_template('result.html', prediction=lr_predict)
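
# A hedged alternative sketch for the flow above: bundling the vectorizer and
# the best classifier in one sklearn Pipeline, so a single pickled object both
# vectorizes and predicts (data columns as in the function above):
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.7)),
    ('lr', LogisticRegression()),
])
# text_clf.fit(X_train, y_train)
# pickle.dump(text_clf, open('log_model', 'wb'))
# text_clf.predict(['some new article text'])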
Example #29
0
from sklearn.preprocessing import MinMaxScaler,StandardScaler


scaler = StandardScaler()
X_name = scaler.fit_transform(X_name)


acc_sum_color = 0
acc_sum_face = 0
acc_sum_name = 0
acc_sum_hybrid = 0


kf = StratifiedKFold(n_splits=10)
kf.get_n_splits(X_color, Y)
X_hybrid = [None]*(count)
for train_index, test_index in kf.split(X_color, Y):
    X_color_train, X_color_test = X_color[train_index], X_color[test_index]
    X_face_train, X_face_test = X_face[train_index], X_face[test_index]
    X_name_train, X_name_test = X_name[train_index], X_name[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    pnn = algorithms.PNN()
    pnn.fit(X_color_train, y_train)
    predicted_color_prob = pnn.predict_proba(X_color_test)

    pnn = algorithms.PNN()
    pnn.fit(X_face_train, y_train)
    predicted_face_prob = pnn.predict_proba(X_face_test)
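    # Hedged continuation sketch -- the source snippet truncates here. The
    # accumulators above (acc_sum_name, acc_sum_hybrid) suggest the intent:
    # score the name modality as well, then fuse the per-modality
    # probabilities, e.g. by averaging. The argmax-to-label mapping assumes
    # the proba columns correspond to integer classes 0..K-1.
    pnn = algorithms.PNN()
    pnn.fit(X_name_train, y_train)
    predicted_name_prob = pnn.predict_proba(X_name_test)

    hybrid_prob = (predicted_color_prob + predicted_face_prob + predicted_name_prob) / 3.0
    acc_sum_hybrid += np.mean(np.argmax(hybrid_prob, axis=1) == y_test)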
    #Getting the mean accuracy and standard deviation of accuracy score
    mean_score=np.mean(cvscores)
    std_score=np.std(cvscores)

    #printing the results.
    print("####################################")
    print("Accuracy:")
    print(mean_score)
    print("+/-")
    print(std_score)

    print("####################################")
    print("Confusion Matrix:")
    print(conf)
    print("####################################")
    print("ROC AND AUC")
    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k',label='Luck')
    mean_tpr /= kfold.get_n_splits(x, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC CURVE FOR SIFT D = '+str(p)+' pixels')
    plt.legend(loc="lower right")
    plt.show()
    print("##################################")
Example #31
0
def poly(data, label, n_folds=10, scale=True, exclude=[],
         feature_selection=False, save=True, scoring='auc',
         project_name='', concurrency=1, verbose=True):
    '''
    Input
    data         = numpy matrix with as many rows as samples
    label        = numpy vector that labels each data row
    n_folds      = number of folds to run
    scale        = whether to scale data or not
    exclude      = list of classifiers to exclude from the analysis
    feature_selection = whether to use feature selection or not (anova)
    save         = whether to save intermediate steps or not
    scoring      = type of score to use ['auc', 'f1']
    project_name = prefix used to save the intermediate steps
    concurrency  = number of parallel jobs to run
    verbose      = whether to print results or not

    Output
    scores       = matrix with scores for each fold and classifier
    confusions   = confusion matrix for each classifier
    predictions  = cross-validated predictions for each classifier
    '''

    assert label.shape[0] == data.shape[0],\
        "Label dimensions do not match data number of rows"
    _le = LabelEncoder()
    _le.fit(label)
    label = _le.transform(label)
    n_class = len(np.unique(label))

    if save and not os.path.exists('poly_{}/models'.format(project_name)):
        os.makedirs('poly_{}/models'.format(project_name))

    if not verbose:
        logger.setLevel(logging.ERROR)
    logger.info('Building classifiers ...')
    classifiers = build_classifiers(exclude, scale,
                                    feature_selection,
                                    data.shape[1])

    scores = pd.DataFrame(columns=pd.MultiIndex.from_product(
        [classifiers.keys(), ['train', 'test']]),
        index=range(n_folds))
    predictions = pd.DataFrame(columns=classifiers.keys(),
                               index=range(data.shape[0]))
    test_prob = pd.DataFrame(columns=classifiers.keys(),
                             index=range(data.shape[0]))
    confusions = {}
    coefficients = {}
    # !fitted_clfs =
    # pd.DataFrame(columns=classifiers.keys(), index = range(n_folds))

    logger.info('Initialization, done.')

    skf = StratifiedKFold(n_splits=n_folds)  # random_state dropped: it has no effect (and errors in newer sklearn) when shuffle=False
    skf.get_n_splits(np.zeros(data.shape[0]), label)
    kf = list(skf.split(np.zeros(data.shape[0]), label))

    # Parallel processing of tasks
    manager = Manager()
    args = manager.list()
    args.append({})  # Store inputs
    shared = args[0]
    shared['kf'] = kf
    shared['X'] = data
    shared['y'] = label
    args[0] = shared

    args2 = []
    for clf_name, val in classifiers.items():
        for n_fold in range(n_folds):
            args2.append((args, clf_name, val, n_fold, project_name,
                          save, scoring))

    if concurrency == 1:
        result = list(starmap(fit_clf, args2))
    else:
        pool = Pool(processes=concurrency)
        result = pool.starmap(fit_clf, args2)
        pool.close()

    fitted_clfs = {key: [] for key in classifiers}

    # Gather results
    for clf_name in classifiers:
        coefficients[clf_name] = []
        temp = np.zeros((n_class, n_class))
        temp_pred = np.zeros((data.shape[0], ))
        temp_prob = np.zeros((data.shape[0], ))
        clfs = fitted_clfs[clf_name]
        for n in range(n_folds):
            train_score, test_score, prediction, prob, confusion,\
                coefs, fitted_clf = result.pop(0)
            clfs.append(fitted_clf)
            scores.loc[n, (clf_name, 'train')] = train_score
            scores.loc[n, (clf_name, 'test')] = test_score
            temp += confusion
            temp_prob[kf[n][1]] = prob
            temp_pred[kf[n][1]] = _le.inverse_transform(prediction)
            coefficients[clf_name].append(coefs)

        confusions[clf_name] = temp
        predictions[clf_name] = temp_pred
        test_prob[clf_name] = temp_prob

    # Voting
    fitted_clfs = pd.DataFrame(fitted_clfs)
    scores['Voting', 'train'] = np.zeros((n_folds, ))
    scores['Voting', 'test'] = np.zeros((n_folds, ))
    temp = np.zeros((n_class, n_class))
    temp_pred = np.zeros((data.shape[0], ))
    for n, (train, test) in enumerate(kf):
        clf = MyVoter(fitted_clfs.loc[n].values)
        X, y = data[train, :], label[train]
        scores.loc[n, ('Voting', 'train')] = _scorer(clf, X, y)
        X, y = data[test, :], label[test]
        scores.loc[n, ('Voting', 'test')] = _scorer(clf, X, y)
        temp_pred[test] = clf.predict(X)
        temp += confusion_matrix(y, temp_pred[test])

    confusions['Voting'] = temp
    predictions['Voting'] = temp_pred
    test_prob['Voting'] = temp_pred
    ######

    # saving confusion matrices
    if save:
        with open('poly_' + project_name + '/confusions.pkl', 'wb') as f:
            p.dump(confusions, f, protocol=2)

    if verbose:
        print(scores.astype('float').describe().transpose()
              [['mean', 'std', 'min', 'max']])
    return Report(scores, confusions, predictions, test_prob, coefficients)
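
# A hedged usage sketch for poly() (synthetic data; the Report fields follow
# the docstring above):
# import numpy as np
# data = np.random.rand(100, 5)
# label = np.random.randint(0, 2, 100)
# report = poly(data, label, n_folds=5, project_name='demo', concurrency=1)
# print(report.scores.mean())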
    test_acc = []
    test_f1 = []
    test_precision = []
    test_recall = []

    all_num_trainval = []
    all_num_test = []
    all_num_under_test = []

    # constructing stratified K-Folds for Outer-Cross-Validation (OCV)
    # splitting into trainval & test datasets
    skf = StratifiedKFold(n_splits=TEST_FOLD,
                          random_state=RANDOM_STATE,
                          shuffle=True)

    skf.get_n_splits(X, y)
    test_count = 1
    for trainval_index, test_index in skf.split(X, y):
        X_trainval, X_test = X[trainval_index], X[test_index]
        y_trainval, y_test = y[trainval_index], y[test_index]

        # undersampling test set
        print("-" * 70)
        print("START PROCESSING TEST SET")
        num_class0, num_class1 = y_test.value_counts()
        all_num_test.append([num_class0, num_class1])
        num_us_instance = num_class0 - num_class1
        sample_test = pd.concat([y_test, X_test], axis=1)
        under_sample_test = undersampling(sample_test, num_us_instance)

        num_class0, num_class1 = under_sample_test['y'].value_counts()
Example #33
0
def load_data_stratified(filename, fold, ispca, n_component):
    tanggal = strftime("%d%m%y-%H%M%S")
    text_file = open("extract/data-" + tanggal + ".txt", "w")
    t0 = time()
    with open(filename, 'r') as csvfile:  # text mode: csv.reader expects str lines in Python 3
        lines = csv.reader(csvfile)
        dataset = list(lines)
    trainingSetFold = []
    trainingSetTFold = []
    testSetDFold = []
    testSetTFold = []

    trainSet = []
    trainLabel = []

    for x in range(len(dataset)):
        for y in range(len(dataset[0]) - 1):
            dataset[x][y] = float(dataset[x][y])
        trainSet.append(dataset[x][:-1])
        trainLabel.append(dataset[x][-1])

    skf = StratifiedKFold(n_splits=10)
    skf.get_n_splits(trainSet, trainLabel)

    for train_index, test_index in skf.split(trainSet, trainLabel):
        trainingSetD = []
        trainingSetT = []
        testSetT = []
        testSetD = []
        for y in train_index:
            trainingSetD.append(trainSet[y])
            trainingSetT.append(trainLabel[y])
        for y in test_index:
            testSetD.append(trainSet[y])
            testSetT.append(trainLabel[y])
        if ispca:
            from sklearn.decomposition import PCA
            t0 = time()
            pca = PCA(n_components=n_component)
            trainingSet = pca.fit_transform(trainingSetD)
            testSet = pca.transform(testSetD)
            timepreprocesss = ("%0.3fs" % (time() - t0))
            print("PCA from " + str(len(dataset[0]) - 1) + " to " +
                  str(n_component) + " done in %s" % timepreprocesss)
            text_file.write(
                "PCA from %s to %s done in %s\n" %
                (str(len(dataset[0]) - 1), str(n_component), timepreprocesss))
            trainingSet = trainingSet.tolist()
            trainingSetD = trainingSet
            testSet = testSet.tolist()
            testSetD = testSet
        trainingSetFold.append(trainingSetD)
        trainingSetTFold.append(trainingSetT)
        testSetDFold.append(testSetD)
        testSetTFold.append(testSetT)
    timeload = ("%0.5fs" % (time() - t0))
    print "Load time > " + timeload + ", Dimension > " + str(
        len(dataset)) + "*" + str(len(dataset[0]))
    text_file.write("Load time > %s ---- Dimension > %s * %s\n" %
                    (timeload, str(len(dataset)), str(len(dataset[0]))))
    return trainingSetFold, trainingSetTFold, testSetDFold, testSetTFold
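
# A hedged usage sketch (argument names per the signature above; note that the
# fold argument is accepted but the split count is hard-coded to 10 inside):
# trainX_folds, trainY_folds, testX_folds, testY_folds = \
#     load_data_stratified('dataset.csv', 10, True, 20)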
]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

for name, pipeline in pipelines:
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    for train, test in cv.split(X, y):
        probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)

    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, linestyle='--',
             label='{} (area = %0.2f)'.format(name) % mean_auc, lw=LW)

plt.plot([0, 1], [0, 1], linestyle='--', lw=LW, color='k',
         label='Luck')

# make nice plotting
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['left'].set_position(('outward', 10))
ax.spines['bottom'].set_position(('outward', 10))

if __name__ == '__main__':

    with open(op_file, 'w') as of:

        x_data, y_data = read_data(ip_txt_file)

        ext_feature = read_external_features(ip_txt_file, ip_feat_file)

        cv_count = 0
        k_score = []

        # Stratified cross-validation
        skf = StratifiedKFold(n_splits=sys_params['cross_val'])
        skf.get_n_splits(x_data, y_data)

        # Run the model for each splits
        for train_index, test_index in skf.split(x_data, y_data):
            cv_count += 1
            print('\nRunning Stratified Cross Validation: {0}/{1}...'.format(
                cv_count, sys_params['cross_val']))

            x_train, x_test = x_data[train_index], x_data[test_index]
            y_train, y_test = y_data[train_index], y_data[test_index]

            # Convert the class labels into categorical
            y_train, y_test = to_categorical(y_train), to_categorical(y_test)

            # Reshape the data for CNN
            x_train = x_train.reshape(x_train.shape[0], x_train.shape[1],
Example #36
0
i = 0
for (train, test), color in zip(cv.split(X, y), colors):
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    assert isinstance(probas_, np.ndarray)
    print(probas_.shape)
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k',
         label='Luck')

mean_tpr /= cv.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.savefig(os.path.join(local_path, 'plot_roc_crossval.png'))
Example #37
0
def main():
    feature_array_all=np.loadtxt('x_1170.txt',dtype=np.float32)
    f = open("y.txt", "rb")
    label_vector= f.read().decode()
    label_vector=list(label_vector)
    f.close()
    label_vector = np.array(label_vector,dtype=np.float32)

    #The independent testing dataset is held out and does not participate in the 5-fold CV
    X_trainset,X_testset,y_trainset,y_testset=train_test_split(feature_array_all,label_vector,test_size=0.2,random_state=0,stratify=label_vector)

    X=X_trainset
    y=y_trainset
    skf = StratifiedKFold(n_splits=5, random_state=2, shuffle=True)
    skf.get_n_splits(X, y)

    ACC_sum=0
    roc_auc_sum=0
    Sn_sum=0
    Sp_sum=0
    F1_sum=0
    MCC_sum=0
    cnt=1
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf=svm.SVC(probability=True,C=14.251026703029963,gamma=0.007196856730011528)
        clf=clf.fit(X_train,y_train)
        score_r=clf.score(X_test,y_test)
        predict_y_test = clf.predict(X_test)
 
        TP=0
        TN=0
        FP=0
        FN=0
        for i in range(0,len(y_test)):
            if int(y_test[i])==1 and int(predict_y_test[i])==1:
                TP=TP+1
            elif int(y_test[i])==1 and int(predict_y_test[i])==0:
                FN=FN+1
            elif int(y_test[i])==0 and int(predict_y_test[i])==0:
                TN=TN+1
            elif int(y_test[i])==0 and int(predict_y_test[i])==1:
                FP=FP+1
        Sn=float(TP)/(TP+FN)
        Sp=float(TN)/(TN+FP)
        ACC=float((TP+TN))/(TP+TN+FP+FN)
        prob_predict_y_test = clf.predict_proba(X_test)
        predictions_test = prob_predict_y_test[:, 1]       
        
        y_validation=np.array(y_test,dtype=int)
        fpr, tpr, thresholds =metrics.roc_curve(y_validation, predictions_test,pos_label=1)
        roc_auc = auc(fpr, tpr)
        F1=metrics.f1_score(y_validation, np.array(predict_y_test, int))
        MCC=metrics.matthews_corrcoef(y_validation, np.array(predict_y_test, int))
        print('Times=%s'%cnt)
        print('svm ACC:%s'%ACC)
        print('svm AUC:%s'%roc_auc)
        print('svm Sn:%s'%Sn)
        print('svm Sp:%s'%Sp)
        print('svm F1:%s'%F1)
        print('svm MCC:%s'%MCC)
        ACC_sum+=ACC
        roc_auc_sum+=roc_auc
        Sn_sum+=Sn
        Sp_sum+=Sp
        F1_sum+=F1
        MCC_sum+=MCC
        cnt+=1

    ACC=ACC_sum/5
    roc_auc=roc_auc_sum/5
    Sn=Sn_sum/5
    Sp=Sp_sum/5
    F1=F1_sum/5
    MCC=MCC_sum/5
    print('')
    print('5-Fold cross validation_Conclusion')
    print('svm ACC:%s'%ACC)
    print('svm AUC:%s'%roc_auc)
    print('svm Sn:%s'%Sn)
    print('svm Sp:%s'%Sp)
    print('svm F1:%s'%F1)
    print('svm MCC:%s'%MCC)
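
# A hedged, self-contained sketch of the manual TP/TN/FP/FN counting above,
# using sklearn.metrics.confusion_matrix (toy labels for illustration only):
import numpy as np
from sklearn.metrics import confusion_matrix

y_true_demo = np.array([1, 0, 1, 1, 0, 0])
y_pred_demo = np.array([1, 0, 0, 1, 0, 1])
tn, fp, fn, tp = confusion_matrix(y_true_demo, y_pred_demo, labels=[0, 1]).ravel()
Sn_demo = float(tp) / (tp + fn)  # sensitivity, as computed in the loop above
Sp_demo = float(tn) / (tn + fp)  # specificity
print(tn, fp, fn, tp, Sn_demo, Sp_demo)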
Example #38
0
def identification(data,data_flip,labels,thread_cnt,data_filename):
    print("Identification")

    # Get k-fold split of dataset (k=5)
    cv = StratifiedKFold(n_splits=5, shuffle=False)  # random_state dropped: it has no effect when shuffle=False
    cv.get_n_splits(data,labels)

    ### Perform k-fold cross validation
    y_prob_list = []
    y_pred = np.array([])
    y_true = np.array([])
    for k,(train_index,test_index) in enumerate(cv.split(data,labels)):
        print("     Fold - " + str(k))

        # Get training and testing sets
        train = np.vstack([data[train_index,:],data_flip[train_index,:]])
        train_labels = np.append(labels[train_index],labels[train_index])
        test = data[test_index,:]
        test_labels = labels[test_index]

        # Normalize to z-scores
        mu = np.mean(train,axis=0)
        std = np.std(train,axis=0)
        train = (train - mu) / std
        test = (test - mu) / std

        # Get training classes
        classes = np.unique(train_labels)

        ### TRAINING
        svm = SVC(kernel='linear', probability=True)
        svm.fit(train,train_labels)

        ### TESTING
        prediction = svm.predict(test)
        prob = svm.predict_proba(test)

        for i,label in enumerate(test_labels):
            j = int(label-1)
            y_prob_list.append(prob[i,j]) 

        y_true = np.append(y_true,test_labels)
        y_pred = np.append(y_pred,prediction)
    
    print()

    ### OVERALL RESULTS    
    confusion_matrix = metrics.confusion_matrix(y_true,y_pred)
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    for i in range(confusion_matrix.shape[0]):
        TP_i = confusion_matrix[i,i]
        FP_i = np.sum(confusion_matrix[i,:]) - TP_i
        FN_i = np.sum(confusion_matrix[:,i]) - TP_i
        TN_i = np.sum(np.sum(confusion_matrix)) - TP_i - FP_i - FN_i

        TP = TP + TP_i
        FP = FP + FP_i
        FN = FN + FN_i
        TN = TN + TN_i

    ACC = (TP + TN) / (TP + TN + FP + FN)
    FAR = FP / (FP + TN)
    FRR = FN / (FN + TP)

    # Print results
    print(data_filename)
    print("--------------------------------------------------------------------------------------")
    print("Identification Results:")
    print("TP: " + str(TP) + "\n" +
    "FP: " + str(FP) + "\n" +
    "FN: " + str(FN) + "\n" +
    "TN: " + str(TN) + "\n" +
    "ACC: " + str(ACC) + "\n" +
    "FAR: " + str(FAR) + "\n" +
    "FRR: " + str(FRR))
    print(str(min(y_prob_list)))
    print()
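
    # A hedged, vectorized equivalent of the per-class accumulation above
    # (mirroring the loop's row/column convention for FP/FN):
    # cm = metrics.confusion_matrix(y_true, y_pred)
    # tp = np.diag(cm); fp = cm.sum(axis=1) - tp; fn = cm.sum(axis=0) - tp
    # tn = cm.sum() - tp - fp - fn
    # TP, FP, FN, TN = tp.sum(), fp.sum(), fn.sum(), tn.sum()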
Example #39
0
def make_cnn(params):
    
    common_features = []
    conv_activation = params['activ']
    kernel_size = params['kern']
    num_filters = params['nfilt']
    pool_size = params['pool_size']
    dense_layer_size = params['dense_size']
    num_dense_layers = params['num_dense_layers']
    dense_activation = params['dense_activ']
    num_conv_layers = params['num_conv']
    num_pool_layers = params['num_pool']
    
    train_images = params['train_images']
    train_labels = params['train_labels']
    num_images = params['num_images']
    image_width = params['image_width']
    stride = params['stride']
    
    num_train_images = int(num_images * 4/5)
    num_test_images = int(num_images * 1/5)
    
    try:
        #construct CNN
        
        for i in range(0, num_conv_layers):
            if i==0:
                common_features.append(Conv2D(num_filters, kernel_size=kernel_size, activation=conv_activation, input_shape=(image_width,image_width,1), strides=(stride, stride)))
            else:
                common_features.append(Conv2D(num_filters, kernel_size=kernel_size, activation=conv_activation, strides=(stride, stride)))
        
            if i<num_pool_layers:
                common_features.append(MaxPooling2D(pool_size = (pool_size, pool_size), strides=(stride, stride)))
            
            if kernel_size > 3:
                kernel_size -= 1
               
        for i in range(0, num_dense_layers):
            if i==0:
                common_features.append(Flatten())
                 
            common_features.append(Dense(dense_layer_size, activation=dense_activation))
        common_features.append(Dense(2, activation='sigmoid'))
        
        
        # measure 5-fold classification accuracy
        
        num_folds = 5
        kfold = StratifiedKFold(n_splits = num_folds, shuffle=True, random_state=1)
        kfold.get_n_splits(train_images, train_labels)
        average_train_performance = 0
        
        for train_indices, test_indices in kfold.split(train_images, train_labels):
            folded_train_images = []
            folded_train_labels = []
            folded_test_images = []
            folded_test_labels = []
            
            for train_index in train_indices:
                folded_train_images.append(train_images[train_index])
                folded_train_labels.append(train_labels[train_index])
                
            for test_index in test_indices:
                folded_test_images.append(train_images[test_index])
                folded_test_labels.append(train_labels[test_index])
                
            # NOTE: Sequential reuses the layer objects in common_features, so
            # weights persist across folds instead of being reinitialized
            cnn_model = Sequential(common_features)
            cnn_model.summary()
            cnn_model.compile(optimizer='sgd', loss='categorical_crossentropy',metrics=['accuracy'],)
            
            # note: these reshapes assume every fold has exactly num_images * 4/5
            # training and num_images * 1/5 validation samples
            cnn_model.fit(np.reshape(folded_train_images, [num_train_images, image_width, image_width, 1]), np.reshape(to_categorical(folded_train_labels), [num_train_images, 2]), epochs=20, batch_size=16,)
            
            # measure classification accuracy for the validation fold
            train_performance = cnn_model.evaluate(np.reshape(folded_test_images, [num_test_images, image_width, image_width, 1]), np.reshape(to_categorical(folded_test_labels), [num_test_images, 2]))
            average_train_performance += train_performance[1]
            print("Accuracy on Train set: {0}".format(train_performance[1]))
        
        # return avg classification accuracy over 5 folds
        average_train_performance /= 5
        return average_train_performance*-1
    
    # if an error happens, return a big number. this can be triggered sometimes, for instance if there are more pool layers than conv layers
    except Exception as e:
        print("ERROR: {}".format(e))
        return 1000
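
# A hedged invocation sketch for make_cnn; every value below is an assumption
# for illustration (train_images / train_labels are hypothetical arrays):
# params = {
#     'activ': 'relu', 'kern': 5, 'nfilt': 32, 'pool_size': 2,
#     'dense_size': 64, 'num_dense_layers': 1, 'dense_activ': 'relu',
#     'num_conv': 2, 'num_pool': 1,
#     'train_images': train_images, 'train_labels': train_labels,
#     'num_images': len(train_images), 'image_width': 28, 'stride': 1,
# }
# neg_acc = make_cnn(params)  # returns -(mean fold accuracy), or 1000 on error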
        ycat.name = ycat.name + '_cat'

    ##########################################################################################
    # <PLACEHOLDER FOR NON-GENERIC CODE: INSERT CODE HERE>
    X = X.dropna()
    y = np.log(200 + y)
    ycat = pd.qcut(y, quantiles)
    ycat.name = ycat.name + '_cat'
    # <PLACEHOLDER FOR NON-GENERIC CODE: INSERT CODE HERE>
    ##########################################################################################

    # Get the first iteration of the k-fold indices, use it for the train-validation split
    # Other iterations may be used later
    # print('Splitting training data into training and validation sets...')
    skf = StratifiedKFold(n_splits=int(1. / validation_size), shuffle=True)
    skf.get_n_splits(X, y)
    train_indices, valid_indices = next(iter(skf.split(X, ycat)))
    # Scale the numeric columns if required.
    X = X.join(pd.Series('TRAIN', index=train_indices, name='rowtype').append(pd.Series('VALID', index=valid_indices, name='rowtype')))
    X_test = test_dataset.join(pd.Series('TEST', index=test_dataset.index, name='rowtype'))

    # Combine train, valid and test covariates to create a consolidated covariate set
    covariates = pd.concat([X, X_test], axis=0, ignore_index=True)
    # If an id column does not exist, create one.
    if (idcol is None) or (len(idcol) == 0):
        idcol = 'id'
        covariates = covariates.join(pd.Series(range(1, len(covariates) + 1, 1), index=covariates.index, name=idcol))

    # Find and add columns with zero std deviation to irrelevant columns - these add no information.
    irrelevant_cols = irrelevant_cols + (covariates.std(axis=0, numeric_only=True) < 0.5)[(covariates.std(axis=0) == 0.0)].index.tolist()
    
Example #41
0
def main():

    CID = opts.cluster

    if (opts.load != 'none'): CID = opts.load

    X_train, X_test, Y_train, Y_test, X, X2, X3, enc = f.get_data_pro(
        testsize=0.2)

    #Y_inv = decode_y(Y_train)

    ranges = np.linspace(.1, 1.0, 10)

    for size in ranges:

        x_train, x_placeholder, y_train, yplaceholder = train_test_split(
            X_train, Y_train, test_size=1 - size, random_state=0)

        skf = StratifiedKFold(n_splits=10)

        y_train_dec = decode_y(y_train)

        skf.get_n_splits(x_train, y_train_dec)

        accues = []
        aucs = []

        for train_index, validate_index in skf.split(x_train, y_train_dec):
            print("TRAIN:", train_index, "TEST:", validate_index)
            x_cvtrain, x_validate = x_train[train_index], x_train[
                validate_index]
            y_cvtrain, y_validate = y_train[train_index], y_train[
                validate_index]

            model = bulid_model(x_cvtrain,
                                x_validate,
                                y_cvtrain,
                                y_validate,
                                X,
                                X2,
                                X3,
                                CID,
                                fromfile=opts.load)

            Y_de = decode_y(y_validate, features=enc.active_features_)
            Y_pred = model.predict(x_validate)
            Y_score = model.predict_proba(x_validate)

            fpr, tpr, thplaceholder = roc_curve(Y_de, Y_score[:, 1])
            Y_depred = decode_y(Y_pred, features=enc.active_features_)

            accues.append(accuracy_score(Y_depred, Y_de))
            aucs.append(auc(fpr, tpr))

        print("###########################")
        print(accues)
        print(aucs)

    # model, history = bulid_model(
    #     X_train, X_test, Y_train, Y_test, X, X2, X3, CID, fromfile=opts.load)

    # #newData = X_test.reshape(X_test.shape[0], 1, 100, 20)

    # Y_score = model.predict_proba(X_test)

    # roc.roc_plot(
    #     Y_test,
    #     Y_score,
    #     2,
    #     filepath=os.path.join('figures', CID + opts.title + 'roc.svg'),
    #     fmt='svg',
    #     title=opts.title)

    # plt.close()

    # print(history.history.keys())
    # # summarize history for accuracy

    # plt.plot(history.history['acc'])
    # plt.plot(history.history['val_acc'])
    # plt.title('model accuracy')
    # plt.ylabel('accuracy')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    # plt.savefig(os.path.join('figures', CID + opts.title + 'learning-c.svg'),format='svg')
    # plt.close()

    #Y_de = decode_y(Y_test, features=enc.active_features_)
    #Y_pred = model.predict(X_test)
    #Y_depred = decode_y(Y_pred, features=enc.active_features_)
    #print(classification_report(Y_de, Y_depred))

    return