import math

from skrebate import MultiSURF

def select_multisurf(X, y, percentile=10):
    # Keep the top `percentile` percent of features (columns, hence shape[1]).
    num = math.ceil(X.shape[1] * percentile / 100)
    selector = MultiSURF(n_features_to_select=num,
                         discrete_threshold=3,
                         n_jobs=-1)
    selector.fit(X, y)
    return selector
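A minimal usage sketch on synthetic data; `X_demo` and `y_demo` are illustrative names, not part of the original example:

import numpy as np

X_demo = np.random.rand(100, 20)            # 100 samples, 20 features
y_demo = np.random.randint(0, 2, size=100)  # binary class labels
sel = select_multisurf(X_demo, y_demo, percentile=10)
print(sel.top_features_[:2])                # ceil(20 * 10 / 100) = 2 features kept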
Example #2
    def fit(self, X, y=None, **kwargs):
        """Fit MultiSURF and record the indices of the selected features."""
        X, y = self.check_X_y(X, y)
        self.check_params(X, y)

        selector = MultiSURF(n_features_to_select=self.num_features)
        selector.fit(X, y)

        # top_features_ is ordered by importance; keep the first num_features
        _support = selector.top_features_[:self.num_features]
        self.support = self.check_support(_support)

        return self
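This `fit` belongs to a selector class that supplies `check_X_y`, `check_params`, and `check_support`. Below is a minimal sketch of such a host class, assuming scikit-learn validation helpers; everything here besides the `fit` method above is hypothetical:

import numpy as np
from sklearn.utils import check_X_y as sk_check_X_y

class MultiSURFSelector:
    # Hypothetical host class; the fit() method above would be defined inside it.
    def __init__(self, num_features=10):
        self.num_features = num_features

    def check_X_y(self, X, y):
        return sk_check_X_y(X, y)  # standard scikit-learn input validation

    def check_params(self, X, y):
        if not 0 < self.num_features <= X.shape[1]:
            raise ValueError('num_features must be in (0, n_features]')

    def check_support(self, support):
        return np.asarray(support, dtype=int)  # selected column indices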
Example #3

import numpy as np
from skrebate import MultiSURF

def multisurf_fs(X_df, X_train_all, X_test_all, y_train):
    '''MultiSURF for feature selection'''
    fs = MultiSURF(discrete_threshold=1000, n_jobs=1)
    fs.fit(X_train_all, y_train)

    # Keep every feature with a non-negative MultiSURF importance score
    feature_scores = fs.feature_importances_
    feature_ids = np.where(feature_scores >= 0)[0]
    selected_features = np.array(X_df.columns[feature_ids])

    # New X_train and X_test matrices restricted to the selected columns
    X_train = X_train_all[:, feature_ids]
    X_test = X_test_all[:, feature_ids]

    return selected_features, feature_scores, X_train, X_test
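A usage sketch for `multisurf_fs`, assuming a pandas DataFrame and a train/test split; the data and names below are illustrative:

import pandas as pd
from sklearn.model_selection import train_test_split

X_df = pd.DataFrame(np.random.rand(60, 8), columns=[f'f{i}' for i in range(8)])
y = np.random.randint(0, 2, size=60)
X_tr, X_te, y_tr, y_te = train_test_split(X_df.values, y, random_state=0)
names, scores, X_train, X_test = multisurf_fs(X_df, X_tr, X_te, y_tr)
print(names, X_train.shape)  # kept feature names and the reduced train matrix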
Example #4
import csv
import os
import pickle
import time

import numpy as np
from skrebate import MultiSURF
from skExSTraCS import ExSTraCS

def job(experiment_path, cv):
    job_start_time = time.time()

    # Load the settings and CV partitions saved by phase 1
    with open(experiment_path + '/phase1pickle', 'rb') as file:
        phase1_pickle = pickle.load(file)

    # Unpack experiment settings (list positions fixed by the phase 1 writer)
    cv_info = phase1_pickle[0]
    learning_iterations = phase1_pickle[3]
    N = phase1_pickle[4]
    nu = phase1_pickle[5]
    attribute_tracking_method = phase1_pickle[6]
    random_state = phase1_pickle[7]
    class_label = phase1_pickle[8]
    feature_selection_sample_size = phase1_pickle[10]
    rule_compaction_method = phase1_pickle[11]
    data_headers = phase1_pickle[1][2]

    # Per-fold training and testing arrays plus their label columns
    train_data_features = cv_info[cv][0]
    train_data_phenotypes = cv_info[cv][1]
    train_instance_labels = cv_info[cv][2]
    train_group_labels = cv_info[cv][3]
    test_data_features = cv_info[cv][4]
    test_data_phenotypes = cv_info[cv][5]
    test_instance_labels = cv_info[cv][6]
    test_group_labels = cv_info[cv][7]
    inst_label = cv_info[cv][8]
    group_label = cv_info[cv][9]

    # Create CV directory
    if not os.path.exists(experiment_path + '/CV_' + str(cv)):
        os.mkdir(experiment_path + '/CV_' + str(cv))

    # MultiSURF feature scoring on a random subsample of the training data
    merged = np.insert(train_data_features, train_data_features.shape[1], train_data_phenotypes, 1)
    rb_sample = np.random.choice(merged.shape[0], min(feature_selection_sample_size, merged.shape[0]), replace=False)
    new_data = merged[rb_sample]
    data_featuresR = np.delete(new_data, -1, axis=1)
    data_phenotypesR = new_data[:, -1]
    featureimportance_model = MultiSURF()
    featureimportance_model.fit(data_featuresR, data_phenotypesR)
    scores = featureimportance_model.feature_importances_

    # Train ExSTraCS Model
    model = ExSTraCS(learning_iterations=learning_iterations, N=N, nu=nu,
                     attribute_tracking_method=attribute_tracking_method,
                     rule_compaction=rule_compaction_method, random_state=random_state,
                     do_correct_set_subsumption=False, expert_knowledge=scores)
    model.fit(train_data_features, train_data_phenotypes)

    with open(experiment_path + '/CV_' + str(cv) + '/model', 'wb') as outfile:
        pickle.dump(model, outfile)

    # Export Testing Accuracy for each instance
    predicted_data_phenotypes = model.predict(test_data_features)
    equality = np.equal(predicted_data_phenotypes, test_data_phenotypes)
    with open(experiment_path + '/CV_' + str(cv) + '/instTestingAccuracy.csv', mode='w', newline='') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([inst_label, 'isCorrect'])
        for i in range(len(test_instance_labels)):
            writer.writerow([test_instance_labels[i], 1 if equality[i] else 0])

    # Export Aggregate Testing Accuracy
    with open(experiment_path + '/CV_' + str(cv) + '/testingAccuracy.txt', mode='w') as outfile:
        outfile.write(str(model.score(test_data_features, test_data_phenotypes)))

    # Save train and testing datasets into csvs
    with open(experiment_path + '/CV_' + str(cv) + '/trainDataset.csv', mode='w', newline='') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(list(data_headers) + [class_label, inst_label, group_label])
        for i in range(len(train_instance_labels)):
            writer.writerow(list(train_data_features[i]) + [train_data_phenotypes[i]]
                            + [train_instance_labels[i]] + [train_group_labels[i]])

    with open(experiment_path + '/CV_' + str(cv) + '/testDataset.csv', mode='w', newline='') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(list(data_headers) + [class_label, inst_label, group_label])
        for i in range(len(test_instance_labels)):
            writer.writerow(list(test_data_features[i]) + [test_data_phenotypes[i]]
                            + [test_instance_labels[i]] + [test_group_labels[i]])

    # Get AT Scores for each instance
    AT_scores = model.get_attribute_tracking_scores(instance_labels=np.array(train_instance_labels))

    # Normalize each instance's AT scores to [0, 1] by its maximum score
    normalized_AT_scores = []
    for i in range(len(AT_scores)):
        normalized = AT_scores[i][1]  # attribute tracking vector for instance i
        max_score = max(normalized)
        for j in range(len(normalized)):
            if max_score != 0:
                normalized[j] /= max_score
            else:
                normalized[j] = 0
        normalized_AT_scores.append(list(normalized))

    # Save Normalized AT Scores
    with open(experiment_path + '/CV_' + str(cv) + '/normalizedATScores.csv', mode='w', newline='') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([inst_label] + list(data_headers))
        for i in range(len(train_instance_labels)):
            writer.writerow([train_instance_labels[i]] + normalized_AT_scores[i])

    # Save Runtime
    with open(experiment_path + '/CV_' + str(cv) + '/runtime.txt', 'w') as runtime_file:
        runtime_file.write(str(time.time() - job_start_time))

    # Print completion
    print('CV ' + str(cv) + ' phase 1 complete')
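For context, a minimal dispatch sketch, not from the original source: the experiment path and fold count below are assumptions, and the phase1pickle file written by phase 1 must already exist under that path:

# Illustrative only: experiment_path and the fold count are assumptions.
if __name__ == '__main__':
    experiment_path = 'output/my_experiment'   # hypothetical phase 1 output dir
    for cv in range(3):                        # one job per cross-validation fold
        job(experiment_path, cv)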