Example #1
0
def train_logistic(train_features, train_labels, test_features,
                   scikit_balancing, train_size, skip_feature_selection,
                   skip_grid_search, penalty, cost, dual, tol, num_jobs):
    """
  Performs all the data transformations on test data and returns the trained model and the
  transformed test data
  """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    if not skip_feature_selection:
        # feature selector expects scaled features
        (scaled_train_features,
         scaled_test_features) = utils.scale_data(train_features,
                                                  test_features, 'minmax')
        feature_selector_obj = feature_selection.feature_selector(
            scaled_train_features, train_labels, len(train_labels),
            scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # Grid search was requested: find the parameters that achieve the highest average recall.
    if not skip_grid_search:
        algorithm = "logistic"
        clf = grid_search.grid_search("macro-recall", train_features,
                                      train_labels, scikit_balancing,
                                      algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        penalty = params['penalty']
        cost = params['C']

    # Now train on the full train data; evaluation happens on the test data.
    model = LogisticRegression(penalty=penalty,
                               dual=dual,
                               C=cost,
                               tol=tol,
                               max_iter=5000,
                               class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)

    return (model, train_features, train_labels, test_features)
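A minimal usage sketch (not part of the original source) may help clarify the call above: the feature and label arrays are assumed to be numpy arrays prepared elsewhere, and every hyperparameter value shown (penalty, cost, tol, train_size, num_jobs) is illustrative.

# Hypothetical caller; arrays and hyperparameter values are placeholders.
model, train_X, train_y, test_X = train_logistic(
    train_features, train_labels, test_features,
    scikit_balancing=True, train_size=-1,
    skip_feature_selection=False, skip_grid_search=False,
    penalty='l2', cost=1.0, dual=False, tol=1e-4, num_jobs=-1)
test_predictions = model.predict(test_X)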
Example #2
0
def main():
  (train_features, train_labels, test_features, test_labels, class_values, class_names,
   feature_label_names) = utils.prepare_data(args.input_filename,
                                             args.label_column,
                                             args.train_size,
                                             args.test_size,
                                             args.imbalanced_data)
  # now that we have limited the data to requested train size, scale data, since svm
  # needs scaled features
  (train_features, test_features) = utils.scale_data(train_features,
                                                     test_features,
                                                     args.scaling_method)
  
  # We let scikit use its balancing scheme if it is explicitly requested
  penalty_weights = 'balanced' if args.imbalanced_data else None
 
  # feature selection if requested
  if args.feature_selection_algo:
    feature_selector_obj =  feature_selection.feature_selector(args.evaluation,
                                                               train_features,
                                                               train_labels,
                                                               feature_label_names,
                                                               -1,
                                                               penalty_weights,
                                                               args.feature_selection_algo,
                                                               args.num_jobs)
    train_features = feature_selector_obj.transform(train_features)
    test_features = feature_selector_obj.transform(test_features)
    print "Selected " + str(len(feature_selector_obj.get_selected_features())) + " features"
    print "Top 10 features: " + str(feature_selector_obj.get_top_features(10))


  # ovr only works for linear svm
  multi_class = 'ovr' if args.kernel == 'linear' else args.multi_class
  model = models.train_svm(train_features,
                           train_labels,
                           penalty_weights,
                           args.skip_grid_search,
                           args.evaluation,
                           args.num_jobs,
                           args.kernel,
                           args.cost,
                           args.gamma,
                           args.degree,
                           multi_class)

  # Predict test and report full stats
  y_true, y_pred = test_labels, model.predict(test_features)
  print("\n*****************************\n")
  print('MAE: ' +
        str(metrics.mean_absolute_error(y_true, y_pred, multioutput='uniform_average')))
  print('MSE: ' +
        str(metrics.mean_squared_error(y_true, y_pred, multioutput='uniform_average')))
  
  print('Classification report:')
  print(metrics.classification_report(y_true, y_pred, class_values, class_names))
  print('Precision Recall')
  print(metrics.precision_recall_fscore_support(y_true, y_pred, labels=class_values,
                                                pos_label=None,
                                                average='weighted'))

  # print and plot confusion matrix
  print('Confusion Matrix Without Normalization')
  numpy.set_printoptions(precision=2)
  cm = metrics.confusion_matrix(y_true, y_pred, class_values)
  print(cm)
  print('Confusion Matrix With Normalization')
  cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
  print(cm_normalized)
  
  plt.figure()
  plt.subplot(2, 1, 1)
  utils.plot_confusion_matrix(cm, class_names, 'Unnormalized confusion matrix')

  # Normalize the confusion matrix by row (i.e. by the number of samples
  # in each class)
  plt.subplot(2, 1, 2)
  utils.plot_confusion_matrix(cm_normalized, class_names, 'Normalized confusion matrix')

  #plt.savefig(args.output_figure + '.pdf')
  pdf = PdfPages(args.output_figure + '.pdf')
  plt.savefig(pdf, format='pdf')
  pdf.close()
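The main() above reads its configuration from a module-level args object that is not shown in this excerpt. The sketch below is a hedged reconstruction of the argparse setup it appears to assume: the option names come from the args.* references in the code, while argument kinds, types, and defaults are illustrative guesses.

import argparse

# Hypothetical parser covering the args.* fields referenced above; defaults are illustrative.
parser = argparse.ArgumentParser()
parser.add_argument('input_filename')
parser.add_argument('output_figure')
parser.add_argument('--label_column', type=int, default=-1)
parser.add_argument('--train_size', type=float, default=-1)
parser.add_argument('--test_size', type=float, default=0.2)
parser.add_argument('--imbalanced_data', action='store_true')
parser.add_argument('--scaling_method', default='minmax')
parser.add_argument('--feature_selection_algo', default=None)
parser.add_argument('--evaluation', default='macro-recall')
parser.add_argument('--skip_grid_search', action='store_true')
parser.add_argument('--kernel', default='linear')
parser.add_argument('--cost', type=float, default=1.0)
parser.add_argument('--gamma', default='auto')
parser.add_argument('--degree', type=int, default=3)
parser.add_argument('--multi_class', default='ovr')
parser.add_argument('--num_jobs', type=int, default=-1)
args = parser.parse_args()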
Example #3
def main():
    add_log_vars = True
    (train_features, train_labels, test_features, test_labels, class_values,
     class_names, feature_label_names) = utils.prepare_data(
         args.input_filename, args.label_column, args.train_size,
         args.test_size, args.imbalanced_data, add_log_vars)
    print("Label is {}".format(feature_label_names[-1]))

    # We let scikit use its balancing scheme if it is explicitly requested
    penalty_weights = 'balanced' if args.imbalanced_data else None

    # feature selection if requested
    if args.feature_selection_algo:
        feature_selector_obj = feature_selection.feature_selector(
            args.evaluation, train_features, train_labels, feature_label_names,
            -1, penalty_weights, args.feature_selection_algo, args.num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print "Selected " + str(
            len(feature_selector_obj.get_selected_features())) + " features"
        print "Top 10 features: " + str(
            feature_selector_obj.get_top_features(10))

    # liblinear does not support multinomial, so use lbfgs unless ovr was requested
    solver = 'liblinear' if args.multi_class == 'ovr' else 'lbfgs'
    model = models.train_logistic(train_features, train_labels,
                                  penalty_weights, args.skip_grid_search,
                                  args.evaluation, args.num_jobs, args.penalty,
                                  args.cost, args.multi_class, solver)

    # Predict test and report full stats
    y_true = test_labels
    y_pred_prob = model.predict_proba(test_features)
    df = pd.DataFrame(data=y_pred_prob, columns=model.classes_)
    df['max_prob'] = df.max(axis=1)
    df['max_prob_class'] = df.idxmax(axis=1)
    df['true'] = y_true
    y_pred = df['max_prob_class']

    print("\n*****************************\n")
    print('MAE on test: {}'.format(
        mean_absolute_error(y_true, y_pred, multioutput='uniform_average')))

    print('Test Accuracy: {}'.format(accuracy_score(y_true, y_pred) * 100.))
    print('Classification report:')
    print(classification_report(y_true, y_pred, class_values))
    print('Weighted Precision Recall:')
    print(
        precision_recall_fscore_support(y_true,
                                        y_pred,
                                        labels=class_values,
                                        pos_label=None,
                                        average='weighted'))
    print('Unweighted Precision Recall:')
    print(
        precision_recall_fscore_support(y_true,
                                        y_pred,
                                        labels=class_values,
                                        pos_label=None,
                                        average='macro'))

    # print and plot confusion matrix
    print('Confusion Matrix Without Normalization')
    numpy.set_printoptions(precision=2)
    cm = metrics.confusion_matrix(y_true, y_pred, class_values)
    print(cm)
    print('Confusion Matrix With Normalization')
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
    print(cm_normalized)

    plt.figure()
    plt.subplot(2, 1, 1)
    utils.plot_confusion_matrix(cm, class_names,
                                'Unnormalized confusion matrix')

    # Normalize the confusion matrix by row (i.e. by the number of samples
    # in each class)
    plt.subplot(2, 1, 2)
    utils.plot_confusion_matrix(cm_normalized, class_names,
                                'Normalized confusion matrix')

    #plt.savefig(args.output_figure + '.pdf')
    pdf = PdfPages(args.output_figure + '.pdf')
    plt.savefig(pdf, format='pdf')
    pdf.close()

    # Now print stats on subsets based on confidence of max_prob_class. Sort predictions
    # by confidence in descending order and take subsets from the top of the sorted df
    df = df.sort_values(by='max_prob', ascending=False)
    print(','.join([
        'Probability Threshold', 'Percentage Predicted', 'Accuracy',
        'AverageRecall', 'AveragePrecision', 'AverageFscore'
    ]))
    for percent_to_predict in range(1, 100):
        lowest_idx = int(percent_to_predict * len(df.index) / 100.0)
        df_subset = df.iloc[0:lowest_idx]
        prob_threshold = df_subset['max_prob'].min()
        accuracy = accuracy_score(df_subset['true'],
                                  df_subset['max_prob_class'])

        (precision, recall, fscore,
         support) = precision_recall_fscore_support(df_subset['true'],
                                                    df_subset['max_prob_class'],
                                                    labels=class_values,
                                                    pos_label=None,
                                                    average='macro')
        print(','.join(
            map(str, [
                prob_threshold, percent_to_predict, accuracy, recall,
                precision, fscore
            ])))
Example #4
0
def train_knn(train_features, train_labels, test_features, imbalanced_data,
              train_size, scaling_method, minmax_min, minmax_max,
              skip_feature_selection, skip_grid_search, n_neighbors, weights,
              algorithm, metric, num_jobs):
    """
  Performs all the data transformations on test data and returns the trained model and the
  transformed test data
  """
    # balance the train data set and create requested train size. Here, instead of
    # scikit balancing, we use the imbalanced_data flag and discard the last output since
    # it is irrelevant to knn. To leave the data unbalanced, the third argument must be
    # true (simulating scikit balancing), so we pass the imbalanced_data flag in place of
    # scikit_balancing.
    train_features, train_labels, dummy = utils.prepare_train_data(
        train_features, train_labels, imbalanced_data, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # now that we have limited the data to requested train size, scale data
    (train_features,
     test_features) = utils.scale_data(train_features, test_features,
                                       scaling_method, minmax_min, minmax_max)

    if not skip_feature_selection:
        feature_selector_obj = feature_selection.feature_selector(
            train_features, train_labels, len(train_labels), imbalanced_data)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # Grid search was requested: find the parameters that achieve the highest average recall.
    if not skip_grid_search:
        algorithm = "knn"
        clf = grid_search.grid_search("macro-recall", train_features,
                                      train_labels, imbalanced_data, algorithm,
                                      num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        n_neighbors = params['n_neighbors']
        weights = params['weights']
        algorithm = params['algorithm']
        metric = params['metric']

    # Now train on the full train data; evaluation happens on the test data.
    model = KNeighborsClassifier(n_neighbors=n_neighbors,
                                 weights=weights,
                                 algorithm=algorithm,
                                 metric=metric)
    model = model.fit(train_features, train_labels)

    return (model, train_features, train_labels, test_features)
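As with the other trainers, a hedged usage sketch may help; here grid search and feature selection are skipped so the explicitly supplied KNN hyperparameters are used directly. All values are illustrative and the input arrays are assumed to exist.

# Hypothetical caller with explicit KNN hyperparameters (grid search skipped).
model, train_X, train_y, test_X = train_knn(
    train_features, train_labels, test_features,
    imbalanced_data=False, train_size=-1,
    scaling_method='minmax', minmax_min=0, minmax_max=1,
    skip_feature_selection=True, skip_grid_search=True,
    n_neighbors=15, weights='distance', algorithm='auto',
    metric='euclidean', num_jobs=-1)
test_predictions = model.predict(test_X)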
Example #5
0
def train_svm(train_features, train_labels, test_features, scikit_balancing,
              train_size, scaling_method, minmax_min, minmax_max,
              skip_feature_selection, skip_grid_search, kernel, gamma, cost,
              degree, num_jobs):
    """ Balances, extracts the requested train size, imputes, scales and finally performs
  features selection on the train data. Then it performs grid search, train a model using
  the best parameters.

  Performs all the data transformations on test data and returns the trained model and the
  transformed test data
  """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # now that we have limited the data to requested train size, scale data
    (train_features,
     test_features) = utils.scale_data(train_features, test_features,
                                       scaling_method, minmax_min, minmax_max)

    if not skip_feature_selection:
        feature_selector_obj = feature_selection.feature_selector(
            train_features, train_labels, len(train_labels), scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    # Grid search was requested: find the parameters that achieve the highest average recall.
    if not skip_grid_search:
        algorithm = "linear-svm" if kernel == "linear" else "kernel-svm"
        clf = grid_search.grid_search("macro-recall", train_features,
                                      train_labels, scikit_balancing,
                                      algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        if 'kernel' in params:
            kernel = params['kernel']
        if 'gamma' in params:
            gamma = params['gamma']
        if 'C' in params:
            cost = params['C']
        if 'degree' in params:
            degree = params['degree']

    # Now train on the full train data; evaluation happens on the test data.
    # We enable probability estimates, so that we can identify the top samples.
    model = svm.SVC(tol=0.05,
                    cache_size=6000,
                    class_weight=penalty_weights,
                    kernel=kernel,
                    gamma=gamma,
                    C=cost,
                    degree=degree,
                    probability=True)
    model = model.fit(train_features, train_labels)

    return (model, train_features, train_labels, test_features)
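Because probability=True is set above, the returned SVC can rank test samples by confidence (the "top samples" the comment refers to). A hedged sketch of that use follows; the arrays and parameter values are placeholders.

import numpy

# Hypothetical caller followed by a confidence-based ranking of the test samples.
model, train_X, train_y, test_X = train_svm(
    train_features, train_labels, test_features,
    scikit_balancing=True, train_size=-1,
    scaling_method='minmax', minmax_min=0, minmax_max=1,
    skip_feature_selection=False, skip_grid_search=False,
    kernel='rbf', gamma='auto', cost=1.0, degree=3, num_jobs=-1)
probabilities = model.predict_proba(test_X)           # shape (n_test, n_classes)
max_prob = probabilities.max(axis=1)                  # confidence of the predicted class
top_sample_indices = numpy.argsort(-max_prob)[:100]   # 100 most confident test samples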
Example #6
0
def train_random_forest(train_features, train_labels, test_features,
                        scikit_balancing, train_size, skip_feature_selection,
                        skip_grid_search, max_features, n_estimators,
                        criterion, min_samples_split, min_samples_leaf,
                        num_jobs):
    """
  Performs all the data transformations on test data and returns the trained model and the
  transformed test data
  """
    # balance the train data set and create requested train size.
    train_features, train_labels, penalty_weights = utils.prepare_train_data(
        train_features, train_labels, scikit_balancing, train_size)

    # Impute the data and replace missing values
    imputer = Imputer(missing_values="NaN",
                      strategy='mean',
                      axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    if not skip_feature_selection:
        # feature selector expects scaled features
        (scaled_train_features,
         scaled_test_features) = utils.scale_data(train_features,
                                                  test_features, 'minmax')
        feature_selector_obj = feature_selection.feature_selector(
            scaled_train_features, train_labels, len(train_labels),
            scikit_balancing)
        feature_selector_obj.select_optimal_set(num_jobs)
        train_features = feature_selector_obj.transform(train_features)
        test_features = feature_selector_obj.transform(test_features)
        print("Selected %d features for grid search and final test." %
              len(feature_selector_obj.get_selected_features()))

    max_features = utils.extract_max_features(max_features)
    # Grid search was requested: find the parameters that achieve the highest average recall.
    if not skip_grid_search:
        algorithm = "random-forest"
        clf = grid_search.grid_search("macro-recall", train_features,
                                      train_labels, scikit_balancing,
                                      algorithm, num_jobs)
        params = clf.best_params_
        print("Best Parameters are: {} ".format(params))
        print("Best Cross Validation Score (mean, std): ({},{})".format(
            clf.cv_results_['mean_test_score'][clf.best_index_],
            clf.cv_results_['std_test_score'][clf.best_index_]))
        n_estimators = max(params['n_estimators'], n_estimators)
        criterion = params['criterion']
        max_features = params['max_features']
        min_samples_split = params['min_samples_split']
        min_samples_leaf = params['min_samples_leaf']

    # Now train on the full train data; evaluation happens on the test data.
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   n_jobs=num_jobs,
                                   criterion=criterion,
                                   max_features=max_features,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   class_weight=penalty_weights)
    model = model.fit(train_features, train_labels)

    return (model, train_features, train_labels, test_features)
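A hedged follow-up sketch: scikit-learn's RandomForestClassifier exposes feature_importances_ after fitting, so the model returned above can be inspected directly. All call values below are illustrative.

# Hypothetical caller followed by a quick look at feature importances.
model, train_X, train_y, test_X = train_random_forest(
    train_features, train_labels, test_features,
    scikit_balancing=True, train_size=-1,
    skip_feature_selection=True, skip_grid_search=True,
    max_features='sqrt', n_estimators=200, criterion='gini',
    min_samples_split=2, min_samples_leaf=1, num_jobs=-1)
ranked = sorted(enumerate(model.feature_importances_),
                key=lambda pair: pair[1], reverse=True)
print("Top 10 (feature index, importance): %s" % ranked[:10])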
Example #7
    def optimise_step(self,
                      df_train,
                      df_target,
                      npoints=1,
                      nrandom=1,
                      n_iter=50,
                      set_callbacks=True):
        """Evaluates the data.
        Build the pipeline. If no parameters are set, default configuration for
        each step is used
        Parameters
        ----------
        space : dict, default = None.
        df_train : pandas dataframe of shape = (n_train, n_features)
            The train dataset with numerical features.
        y_train : pandas series of shape = (n_train,)
            The numerical encoded target for classification tasks.
        max_evals : int, default = 20, max evaluation times
        set_callbacks (opt): bool,default: True
             If callable then callback(res) is called after each call to func. If list of callables, then each callable in the list is called.
        ----------
        Returns
        ---------
        result : dict
            - result['best_score'] : Best Score after Tuning
            - result['best_score_std'] : Standar Divation of best score
            - result['best_parmas'] : Best parameters
            - result['params'] : all paramsters (# = checked candicated)
            - result['time_cost(s)'] : total time of finding out the best parameters
            - result['all_cv_results'] : all cv results
            - result['mean_score_time'] : time for each cv result
        """
        # check parallel strategy

        ce = Categorical_encoder()
        X = ce.fit_transform(df_train, df_target)

        if len(df_train.dtypes[df_train.dtypes == 'float'].index) != 0:
            scal = Scaler()
            X = scal.fit_transform(X, df_target)
            self.perform_scaling = True

        mid_result = {}
        tuning_result = {}
        if len(pd.DataFrame(X).columns) > 20:
            search_space_LGB = Classifier(
                strategy="LightGBM").get_search_spaces(
                    need_feature_selection=True)
            search_space_SVC = Classifier(strategy="SVC").get_search_spaces(
                need_feature_selection=True)
            search_spaces = [search_space_SVC, search_space_LGB]
        else:
            search_space_LGB = Classifier(
                strategy="LightGBM").get_search_spaces(
                    need_feature_selection=False)
            search_space_SVC = Classifier(strategy="SVC").get_search_spaces(
                need_feature_selection=False)
            search_spaces = [search_space_SVC, search_space_LGB]

        # Initialize a pipeline: enable the feature selection step if any search space
        # tunes an fs__ parameter.
        fs = None
        for space in search_spaces:
            space_params = space[0] if isinstance(space, tuple) else space
            if any(p.startswith("fs__") for p in space_params.keys()):
                fs = feature_selector()
        if fs is None:
            print(">> Number of Features < 20, ignore feature selection")

        # Do we need to cache transformers?
        cache = False
        if fs is not None:
            for space in search_spaces:
                space_params = space[0] if isinstance(space, tuple) else space
                if ("fs__strategy" in space_params
                        and space_params["fs__strategy"] != "variance"):
                    cache = True
        mprint('Start tuning hyperparameters ....')
        print("")
        print(">>> Categorical Features have encoded with :" +
              str({'strategy': ce.strategy}))
        print("")
        if self.perform_scaling is True:
            print(">>> Numerical Features have encoded with :" +
                  scal.__class__.__name__)
            print("")

        for baseestimator in self.baseEstimator:
            # Pipeline creation

            lgb = Classifier(strategy="LightGBM").get_estimator()
            #  rf = Classifier(strategy="RandomForest").get_estimator()
            #  svc = Classifier(strategy="SVC").get_estimator()

            if (fs is not None):
                if cache:
                    pipe = Pipeline([('fs', fs), ('model', lgb)],
                                    memory=self.to_path)
                else:
                    pipe = Pipeline([('fs', fs), ('model', lgb)])
            else:
                if cache:
                    pipe = Pipeline([('model', lgb)], memory=self.to_path)
                else:
                    pipe = Pipeline([('model', lgb)])

            # Use all cores only when the parallel strategy is enabled
            n_jobs = -1 if self.parallel_strategy is True else 1
            opt = BayesSearchCV(pipe,
                                search_spaces=search_spaces,
                                scoring=self.scoring,
                                cv=self.cv,
                                npoints=npoints,
                                n_jobs=n_jobs,
                                n_iter=n_iter,
                                nrandom=nrandom,
                                return_train_score=False,
                                optimizer_kwargs={
                                    'base_estimator': baseestimator,
                                    "acq_func": "EI"
                                },
                                random_state=self.random_state,
                                verbose=self.verbose,
                                refit=self.refit)

            # Key results by the surrogate model's name: GaussianProcessRegressor
            # instances by their class name, other surrogates by the string they were
            # passed as.
            if isinstance(baseestimator, GaussianProcessRegressor):
                surrogate_name = baseestimator.__class__.__name__
            else:
                surrogate_name = baseestimator
            if set_callbacks is True:
                mid_result = self.report_perf(
                    opt,
                    X,
                    df_target,
                    ' with Surrogate Model: ' + surrogate_name,
                    callbacks=[
                        self.on_step,
                        DeadlineStopper(60 *
                                        60)  # ,DeltaYStopper(0.000001)
                    ])
            else:
                mid_result = self.report_perf(
                    opt,
                    X,
                    df_target,
                    ' with Surrogate Model: ' + surrogate_name,
                )
            tuning_result[surrogate_name] = mid_result

        bests = pd.DataFrame()
        for key in tuning_result.keys():
            if tuning_result[key]['best_score'] == max(
                    d['best_score'] for d in tuning_result.values()):
                bests = bests.append(
                    {
                        'best_score': tuning_result[key]['best_score'],
                        'best_SM': key,
                        'time': tuning_result[key]['Time_cost']
                    },
                    ignore_index=True)
                bests = bests.sort_values(
                    by=['time'], ascending=True).reset_index(drop=True)
                best_base_estimator = bests['best_SM'][0]
                best_param = tuning_result[best_base_estimator]['best_parmas']

        print("")
        print('######## Congratulations! Here are the best parameters: #######')
        print('Best Score is:',
              tuning_result[best_base_estimator]['best_score'])
        try:
            print('with Surrogate Model ' + best_base_estimator)
        except:
            print('with Surrogate Model ' +
                  best_base_estimator.__class__.__name__)
        pprint.pprint(best_param)

        self.best_param_ = best_param

        return best_param, tuning_result
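The enclosing class is not shown in this excerpt, so the sketch below uses a placeholder instance named tuner, and the CSV path and target column are also placeholders. It only illustrates how optimise_step is meant to be called and what it returns.

import pandas as pd

# Hypothetical usage of the method above; 'tuner' stands in for a constructed instance.
df = pd.read_csv('train.csv')        # placeholder path
df_target = df.pop('target')         # placeholder target column
best_param, tuning_result = tuner.optimise_step(df, df_target,
                                                npoints=2, nrandom=5,
                                                n_iter=30, set_callbacks=True)
print(best_param)
print(sorted(tuning_result.keys()))  # one entry per surrogate model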
Example #8
0
def main():
    df = pandas.read_csv(args.input_filename, index_col=False, header=0)
    data = df.values
    column_names = df.columns.values.tolist()

    # Impute the data and replace missing values
    imputer = preprocessing.Imputer(missing_values="NaN",
                                    strategy='mean',
                                    axis=0,
                                    copy=False)
    imputer.fit(data)
    data = imputer.transform(data)

    # Extract features/labels and their names from raw data
    features = data[:, 0:args.label_column]
    labels = data[:, args.label_column].astype(int)
    feature_names = column_names[0:args.label_column]
    label_name = column_names[args.label_column]

    # scale data no matter what, since the feature selector is L1-SVM
    (scaled_features, dummy) = utils.scale_data(features, None, 'minmax')

    # open output file and write header with max_num_features selected features
    output_file = open(args.output_filename, 'w')
    output_file_writer = csv.writer(output_file)
    header = [
        "num_features_selected", "test_size", "avg_true_positive",
        "avg_false_positive", "avg_true_negative", "avg_false_negative",
        "avg_accuracy", "avg_pos_f1", "avg_neg_f1", "avg_average_f1",
        "avg_pos_precision", "avg_neg_precision", "avg_average_precision",
        "avg_pos_recall", "avg_neg_recall", "avg_average_recall"
    ]

    for i in range(1, args.max_num_features + 1):
        header.extend(["feature" + str(i), "feature" + str(i) + "_weight"])
    output_file_writer.writerow(header)

    feature_selector_obj = feature_selection.feature_selector(
        scaled_features, labels, args.num_samples, args.scikit_balancing)

    for num_features in range(args.min_num_features,
                              args.max_num_features + 1):
        # Before anything else, restrict the feature selector to the top num_features.
        feature_selector_obj.select_top_features(num_features)
        selected_features = feature_selector_obj.get_selected_features(
            feature_names)

        # Print the selected features and their weights.
        print('\nSelected Feature,Weight')
        for feature, feature_coef in selected_features:
            print(feature + "," + str(feature_coef))

        # Now transform and restrict the features to those only selected by the L1-svm
        transformed_scaled_features = feature_selector_obj.transform(
            scaled_features)
        transformed_features = feature_selector_obj.transform(features)

        print('\n' + str(len(selected_features)) + ' out of ' +
              str(features.shape[1]) + ' features are selected.\n')

        # Now perform the learning task using the top features and report results. Make
        # sure to pass scaled features to svm
        num_test_trials = 10
        test_size = args.test_size if args.test_size <= 1.0 else int(
            args.test_size)
        if args.learning_algorithm == 'random-forest':
            rf_max_features = utils.extract_max_features(args.rf_max_features)
            metrics = perform_random_forest(
                transformed_features, labels, args.rf_num_trees,
                args.rf_criterion, rf_max_features, args.rf_min_samples_split,
                args.rf_min_samples_leaf, args.scikit_balancing, test_size,
                num_test_trials)

        elif args.learning_algorithm == 'svm':
            metrics = perform_svm(transformed_scaled_features, labels,
                                  args.svm_kernel, args.svm_gamma,
                                  args.svm_cost, args.svm_degree,
                                  args.scikit_balancing, test_size,
                                  num_test_trials)
        elif args.learning_algorithm == 'logistic':
            metrics = perform_logistic(transformed_features, labels,
                                       args.logistic_penalty,
                                       args.logistic_cost,
                                       args.scikit_balancing, test_size,
                                       num_test_trials)
        elif args.learning_algorithm == 'knn':
            metrics = perform_knn(transformed_scaled_features, labels,
                                  args.knn_num_neighbors, args.knn_weights,
                                  args.knn_algorithm, args.knn_metric,
                                  args.knn_imbalanced_data, test_size,
                                  num_test_trials)

        # write a row for num_features selected to output file
        output_row = [len(selected_features)]
        output_row.extend(metrics)
        for feature, feature_coef in selected_features:
            output_row.extend([feature, feature_coef])
        output_row.extend([''] * (len(header) - len(output_row)))
        output_file_writer.writerow(output_row)

        print('******************************\n')

    output_file.close()
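A hedged follow-up sketch: the CSV written above has one row per feature-set size, so it can be reloaded to pick the size with the best average F1. The file name below is a placeholder for the args.output_filename path used in main(); the column names come from the header written there.

import pandas

# Hypothetical post-processing of the results file written by main().
results = pandas.read_csv('feature_selection_results.csv')  # placeholder path
best_row = results.loc[results['avg_average_f1'].idxmax()]
print("Best number of features: %d (avg_average_f1 = %.3f)" %
      (best_row['num_features_selected'], best_row['avg_average_f1']))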