Example #1
def perform_knn(features, labels, knn_num_neighbors, knn_weights,
                knn_algorithm, knn_metric, knn_imbalanced_data, test_size,
                num_test_trials):
    """Runs multiple versions of KNN in parallel with selected features.

    The number of parallel processes is at most num_test_trials and never
    exceeds the number of available cores - 2.
    """
    finished_trials = 0
    metrics = list()
    while finished_trials < num_test_trials:
        num_processes = min(multiprocessing.cpu_count() - 2,
                            num_test_trials - finished_trials)
        print('Running ' + str(num_processes) + ' parallel processes')
        # Replicate the data so each process gets its own argument tuple.
        replicated_inputs = (
            (features, labels, knn_num_neighbors, knn_weights, knn_algorithm,
             knn_metric, knn_imbalanced_data, test_size), ) * num_processes
        pool = multiprocessing.Pool(processes=num_processes)
        metrics.extend(pool.map(perform_single_knn, replicated_inputs))
        pool.close()
        finished_trials += num_processes

    # Take the element-wise average of each metric across trials.
    mean_metrics = [numpy.mean(values) for values in zip(*metrics)]
    print("Mean Test Accuracy: %0.2f%%\n" % mean_metrics[5])
    return mean_metrics
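Pool.map hands each replicated tuple to the worker as a single positional argument, so the worker has to unpack it itself. The body of perform_single_knn is not shown here; the sketch below is only a guess at its shape, assuming it trains a scikit-learn KNeighborsClassifier on a random train/test split and returns a fixed-length tuple of metrics with the test accuracy at index 5 (the slot read by the caller). The metric layout and the handling of knn_imbalanced_data are assumptions.

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


def perform_single_knn(args):
    # Pool.map supplies one argument, so unpack the replicated tuple here.
    (features, labels, num_neighbors, weights, algorithm, metric,
     imbalanced_data, test_size) = args  # imbalanced_data handling omitted

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size)

    model = KNeighborsClassifier(n_neighbors=num_neighbors, weights=weights,
                                 algorithm=algorithm, metric=metric)
    model.fit(x_train, y_train)

    train_accuracy = 100.0 * model.score(x_train, y_train)
    test_accuracy = 100.0 * model.score(x_test, y_test)
    # Hypothetical layout: test accuracy at index 5 to match mean_metrics[5]
    # in the caller; the remaining slots are placeholders.
    return (len(y_train), len(y_test), num_neighbors, 0.0, train_accuracy,
            test_accuracy)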
Example #2
def perform_random_forest(features, labels, rf_num_trees, rf_criterion,
                          rf_max_features, rf_min_samples_split,
                          rf_min_samples_leaf, scikit_balancing, test_size,
                          num_test_trials):
    """Runs multiple versions of random forest in parallel with selected features.

    The number of parallel processes is at most num_test_trials and never
    exceeds the number of available cores - 2.
    """
    finished_trials = 0
    metrics = list()
    while finished_trials < num_test_trials:
        num_processes = min(multiprocessing.cpu_count() - 2,
                            num_test_trials - finished_trials)
        print('Running ' + str(num_processes) + ' parallel processes')
        # Replicate the data so each process gets its own argument tuple.
        replicated_inputs = (
            (features, labels, rf_num_trees, rf_criterion, rf_max_features,
             rf_min_samples_split, rf_min_samples_leaf, scikit_balancing,
             test_size), ) * num_processes
        pool = multiprocessing.Pool(processes=num_processes)
        metrics.extend(
            pool.map(perform_single_random_forest, replicated_inputs))
        pool.close()
        finished_trials += num_processes

    # Take the element-wise average of each metric across trials.
    mean_metrics = [numpy.mean(values) for values in zip(*metrics)]
    print("Mean Test Accuracy: %0.2f%%\n" % mean_metrics[5])
    return mean_metrics
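Both helpers reduce the per-trial metric tuples the same way: zip(*metrics) groups the i-th element of every trial together, and numpy.mean collapses each group. A tiny illustration with made-up numbers (two trials, each reporting a train and a test accuracy):

import numpy

metrics = [(91.0, 85.0), (89.0, 83.0)]
mean_metrics = [numpy.mean(values) for values in zip(*metrics)]
print(mean_metrics[0], mean_metrics[1])  # 90.0 84.0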
Example #3
def __init__(self, name: Text) -> None:
    metrics = _build_default_metrics(binary=True)
    metrics.extend([
        Metric('precision',
               sklearn.metrics.precision_score,
               binary_only=True),
        Metric('recall', sklearn.metrics.recall_score, binary_only=True),
        Metric('f1', sklearn.metrics.f1_score, binary_only=True),
    ])
    super(ClassificationPerformanceMetrics, self).__init__(name,
                                                           metrics=metrics)
Example #4
def _build_default_metrics(binary: bool) -> List[Metric]:
    """Builds and returns the default set of `Metric`s."""

    metrics = [
        Metric('num', lambda y_true, y_pred: len(y_true), binary_only=binary)
    ]

    if binary:
        metrics.extend([
            Metric('auc', sklearn.metrics.roc_auc_score, binary_only=True),
            Metric('auprc',
                   sklearn.metrics.average_precision_score,
                   binary_only=True),
            TopPercentileMetric('freq',
                                compute_frequency,
                                binary_only=True,
                                top_percentile=100),
        ])
        for top_percentile in [10, 5, 1]:
            metrics.append(
                TopPercentileMetric('freq @{:04.1f}'.format(top_percentile),
                                    compute_frequency,
                                    binary_only=True,
                                    top_percentile=top_percentile))
    else:
        metrics.extend([
            Metric('pearson', sp.stats.pearsonr, binary_only=False),
            Metric('spearman', sp.stats.spearmanr, binary_only=False),
            Metric('mse',
                   sklearn.metrics.mean_squared_error,
                   binary_only=False),
            Metric('mae',
                   sklearn.metrics.mean_absolute_error,
                   binary_only=False),
        ])
    return metrics
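The callables wrapped by these Metric objects are ordinary scikit-learn and SciPy scorers that take the true labels and the predictions (or scores) directly. A minimal check of the two binary scorers on dummy arrays (the numbers are invented for illustration; TopPercentileMetric and compute_frequency are project-specific and not shown here):

import sklearn.metrics

y_true = [0, 0, 1, 1]
y_score = [0.1, 0.4, 0.35, 0.8]

print(sklearn.metrics.roc_auc_score(y_true, y_score))            # 0.75
print(sklearn.metrics.average_precision_score(y_true, y_score))  # ~0.83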