예제 #1
0
    # Per-label column totals over the development frame.
    # Presumably each label column is a 0/1 indicator, which would make these
    # totals the number of positive samples per label -- TODO confirm.
    split_train = {l:0 for l in labels}
    for l in labels:
        split_train[l] = sum(developement_df[l].values)

    # Same per-label totals, computed over the testing frame.
    split_test = {l:0 for l in labels}
    for l in labels:
        split_test[l] = sum(testing_df[l].values)

    # Row counts of the two frames.
    n_samples_train = len(developement_df)
    n_samples_test = len(testing_df)

    # One Statistics container per phase (training / validation / testing),
    # created once, before the iteration loop.
    training_stats = Statistics()
    validation_stats = Statistics()
    testing_stats = Statistics()
    # Presumably one seed per iteration (the loop below reads seeds[i]);
    # verify against create_seeds.
    seeds = create_seeds(iterations)
    # Smallest per-label total in the development frame. min() raises
    # ValueError here if `labels` is empty.
    min_class_freq = min(split_train.values())
    # Cap the requested number of folds at the smallest per-label total.
    # NOTE(review): if some label total is 0 or 1 this makes cv_folds < 2;
    # confirm the splitter accepts that or guard upstream.
    cv_folds = min([min_class_freq, cv_folds])
    statistics_objects = []
    # Per-label record, initialised to score 0.0 and an empty parameter dict.
    best_params = {l: {'score': 0.0, 'params': {}} for l in labels}

    print("Running Supervised Ensemble Classification...")
    # def do_iteration(i):
    for i in range(iterations):
        print("Iteration " + str(i+1))
        rng = np.random.RandomState()
        rng.seed(seeds[i])
        dev_df_i = developement_df.copy(deep=True)
        test_df_i = testing_df.copy(deep=True)

        folds_i = list(DataFrameStratifiedKFold(
예제 #2
0
    # Nothing selected: print a usage hint and exit with status 1.
    if len(selection) == 0:
        print("Please select some features using the command line args. Use --help or -h for help.")
        sys.exit(1)
    print(selection)


    # ---------------------- THRESHOLD TESTING ---------------------------- #
    # Only the first value returned by prep_data_frames is kept; the second
    # is discarded.
    developement_df, _ = prep_data_frames(selection, load_interactome=False)
    # Grid starting at 0 with step 0.1 and an exclusive stop of 1.1, so 1.0 is
    # intended to be the last value. NOTE(review): np.arange with a float step
    # can be off by one element; check the length if the end point matters.
    thresholds = np.arange(0, 1.1, step=0.1)
    # Splits are generated from the 'label' column and materialised into a
    # list once, so the same folds can be iterated repeatedly.
    # NOTE(review): shuffle=True with random_state=None -- presumably the fold
    # assignment differs between runs even though seeds are created below;
    # confirm against DataFrameStratifiedKFold.
    folds = list(DataFrameStratifiedKFold(
        n_splits=cv_folds, shuffle=True, random_state=None
    ).split(developement_df, y=developement_df['label'].values))
    statistics = Statistics()
    params = sk_generate_params('lr', columns=None)
    labels = get_labels_from_file('data/labels.tsv')
    # As many seeds requested as there are labels; pr_curve below reads
    # seeds[i] for labels[i].
    seeds = create_seeds(len(labels))

    # Starts empty; where it is filled is not visible in this part of the file.
    things = {}
    def pr_curve(i):
        label = labels[i]
        statistics_l = Statistics()
        print('Doing label {}'.format(label))

        for train_idx, valid_idx in folds:
            rng = np.random.RandomState()
            rng.seed(seeds[i])
            training_fold = developement_df.loc[train_idx, ]
            training_fold = training_fold.reset_index(drop=True)
            validation_fold = developement_df.loc[valid_idx, ]
            validation_fold = validation_fold.reset_index(drop=True)
            base_estimators = make_classifiers(method, balanced, labels, random_state=rng)