Example #1
def validate_feature_linear(features,
                            labels,
                            classes,
                            n_folds=5,
                            print_folds=True,
                            print_absolute=True,
                            print_logloss=True):
    kfold = cv.LabelKFold(labels, n_folds)
    model = lda.LDA()
    # Compute the accuracy scores unconditionally: the per-fold printout
    # needs them even when the absolute summary is suppressed.
    score = cross_validation.cross_val_score(model,
                                             features,
                                             classes,
                                             cv=kfold)
    if print_absolute: print("absolute scores")
    if print_folds: print("\tfolds:", score)
    if print_absolute: print("\tmean:", score.mean(), "std:", numpy.std(score))

    scores = score_calculation.loglossKFold(features,
                                            classes,
                                            model,
                                            kfold,
                                            given_kfold=True)
    if print_logloss: print("logloss scores")
    if print_folds: print("\tfolds", scores)
    if print_logloss:
        print("\tmean:", numpy.mean(scores), "std:", numpy.std(scores))
Example #2
def split_and_write(count_df, experiment_proto, save_dir):
  """Splits into folds, write each test fold to an sstable.

  Only the test folds are written out, so the train table for any fold is the
  union of all the other sstables. This avoids writing the same data out
  multiple times.

  Args:
    count_df: dataframe where the index is the sequence and the rest of the
      columns are counts in each round of selection.
    experiment_proto: selection_pb2.Experiment proto describing the experimental
      configuration and results.
    save_dir: string base name for the directory in which to save output.
  """
  experiment_proto = copy.deepcopy(experiment_proto)
  # These FASTQ files describe all the experiment data, not the split train /
  # test data.
  _remove_fastq_paths(experiment_proto)

  label_kfold = cross_validation.LabelKFold(count_df.cluster, n_folds=5)
  for i, (train, test) in enumerate(label_kfold):
    logging.info("Fold %d has %d train and %d test", i, len(train), len(test))

    test_counts = count_df.iloc[test]
    train_counts = count_df.iloc[train]

    for split_name, subcounts in [("test", test_counts),
                                  ("train", train_counts)]:
      update_experiment_read_counts(experiment_proto, subcounts)
      path = os.path.join(save_dir,
                          "experiment_fold_%d_%s.pbtxt" % (i, split_name))
      with gfile.GFile(path, "w") as f:
        f.write(text_format.MessageToString(experiment_proto))

    # HDF5 can be quickly read and written from Python.
    logging.info("Saving count table as HDF5.")
    path = os.path.join(save_dir, "table_fold_%d.h5" % i)
    io_utils.write_dataframe_to_hdf5(test_counts, path)

    # We use the SSTable of examples for TensorFlow.
    logging.info("Saving SSTable of TensorFlow example protos.")
    path = os.path.join(save_dir, "examples_fold_%d.sstable" % i)
    write_sstable(test_counts, path,
                  experiment_proto.forward_primer,
                  experiment_proto.reverse_primer)
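
Since only the test folds are written, the train table for fold i can be rebuilt by concatenating the other folds' HDF5 tables. A sketch of that read path (a hypothetical helper, assuming the saved tables can be read back with pandas.read_hdf):

import os
import pandas as pd

def read_train_table(save_dir, i, n_folds=5):
  # Concatenate every fold's saved table except fold i's own test table.
  parts = [pd.read_hdf(os.path.join(save_dir, "table_fold_%d.h5" % j))
           for j in range(n_folds) if j != i]
  return pd.concat(parts)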
Example #3
def test_label_kfold():
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_labels = 15
    n_samples = 1000
    n_folds = 5

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    labels = rng.randint(0, n_labels, n_samples)
    folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
    ideal_n_labels_per_fold = n_samples // n_folds

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in cval.LabelKFold(labels, n_folds=n_folds):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Construct the test data
    labels = [
        'Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert',
        'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura',
        'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert',
        'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary',
        'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi',
        'Silvia'
    ]
    labels = np.asarray(labels, dtype=object)

    n_labels = len(np.unique(labels))
    n_samples = len(labels)
    n_folds = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
    ideal_n_labels_per_fold = n_samples // n_folds

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    for train, test in cval.LabelKFold(labels, n_folds=n_folds):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Should fail if there are more folds than labels
    labels = np.array([1, 1, 1, 2, 2])
    assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
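
A quick illustration of the invariant these assertions check, assuming scikit-learn 0.17 (where cross_validation.LabelKFold is available): every sample with a given label lands on exactly one side of each split.

import numpy as np
from sklearn import cross_validation as cval

labels = np.array([1, 1, 2, 2, 3, 3, 4, 4])
for train, test in cval.LabelKFold(labels, n_folds=2):
    # No label appears in both the train and test halves.
    print(labels[train], labels[test])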
Example #4
# Initializing the classifiers (all are tree-based classifiers)

dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators=51)
extree = ExtraTreeClassifier()
classifier_list = [dt, rf, extree]
classifier_name_list = ["Decision Tree", "Random Forests", "Extra Trees"]

data = dataFrame.values

# Initializing Cross Validation Models

kf = cross_validation.KFold(len(labels), n_folds=5)
stratifiedkf = cross_validation.StratifiedKFold(labels, n_folds=4)
labeledkf = cross_validation.LabelKFold(labels, n_folds=4)
leavePout = cross_validation.LeavePOut(len(labels), p=100)
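# Note: LeavePOut enumerates every size-p test subset, i.e. C(len(labels), p)
# splits, which grows combinatorially and is intractable for p=100.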
cross_validation_model_list = [kf, stratifiedkf, labeledkf, leavePout]
cross_validation_model_names = [
    "K-Fold", "Stratified K-fold", "Labeled K-Fold", "Leave P Out"
]

# Cross validating each given classifier with each tractable CV strategy
# (LeavePOut is skipped; see the note above)

for classifier, classifier_name in zip(classifier_list, classifier_name_list):
    print "-------- For Classifier : ", classifier_name, " ---------------"
    for cv_model, cv_name in zip(cross_validation_model_list[:3],
                                 cross_validation_model_names[:3]):
        scores = cross_validation.cross_val_score(classifier, data, labels,
                                                  cv=cv_model)
        print "CV Strategy : ", cv_name
        print "Score Array : ", scores
        print "Mean Score : ", scores.mean()
        print "Standard Deviation : ", scores.std()
    print "------------------------------------------------------"
Example #5
    def cross_validate(self,
                       X_train,
                       y_train,
                       folds=3,
                       normalize_predictions=False):
        '''Assumes self wraps an sklearn-style classifier implementing
        predict_proba (used here via self.fit / self.predict_proba).

        X_train - list of lists of pandas dataframes; each sub-list contains
                  dataframes for related data (kept in the same fold), and
                  each dataframe contains index-related points for windowed
                  calculations. All dataframes must have the same columns;
                  missing data is indicated with np.nan.
        y_train - truth labels for each datapoint, organized in the same
                  general structure as X_train: a list of lists of np.arrays.

        Returns: AUCs for each fold.'''

        aucs = []

        group_labels = range(len(X_train))
        pred_points = []
        pred_groups = []
        pred_subgroups = []

        # Somewhat unusual use of LabelKFold: each group is its own fold.
        for train_indices, test_indices in cross_validation.LabelKFold(
                group_labels, folds):
            train_x = [x for i, x in enumerate(X_train) if i in train_indices]
            train_y = [y for i, y in enumerate(y_train) if i in train_indices]
            self.fit(train_x, train_y)

            test_x = [x for i, x in enumerate(X_train) if i in test_indices]
            test_y = [y for i, y in enumerate(y_train) if i in test_indices]
            predictions = self.predict_proba(test_x)

            # Option to compute AUCs on normalized predictions across
            # prediction groups.
            if normalize_predictions:
                for group in range(len(predictions)):
                    for subgroup in range(len(predictions[group])):
                        predictions[group][subgroup] = normalize(
                            predictions[group][subgroup])

            # Book-keeping of how many groups and sub-groups could be
            # predicted for.
            concat_predictions = []
            group_finite = 0
            subgroup_finite = 0
            for group in predictions:
                group_has_finite = False
                for subgroup in group:
                    concat_predictions.append(subgroup)
                    if np.any(np.isfinite(subgroup)):
                        subgroup_finite += 1
                        group_has_finite = True
                if group_has_finite:
                    group_finite += 1

            #linearize predictions
            concat_truths = np.hstack(itertools.chain(*test_y))
            concat_predictions = np.hstack(concat_predictions)

            # Drop truths/predictions where the prediction is NaN.
            concat_truths = concat_truths[np.where(
                np.isfinite(concat_predictions))]
            concat_predictions = concat_predictions[np.where(
                np.isfinite(concat_predictions))]

            # Calculate AUC for this fold and record the bookkeeping counts.
            aucs.append(roc_auc_score(concat_truths, concat_predictions))
            pred_groups.append(group_finite)
            pred_subgroups.append(subgroup_finite)
            # Number of points actually predicted, with NaNs dropped.
            pred_points.append(len(concat_predictions))

        return {
            'aucs': aucs,
            'loss': self.cv_loss(aucs),
            'pred_groups': pred_groups,
            'pred_subgroups': pred_subgroups,
            'pred_points': pred_points
        }
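
A sketch of the expected input nesting (hypothetical data): X_train is a list of groups, each group a list of DataFrames sharing the same columns, and y_train mirrors that structure with one np.array per DataFrame.

import numpy as np
import pandas as pd

X_train = [
    [pd.DataFrame({'a': [1.0, 2.0], 'b': [0.5, np.nan]})],  # group 0
    [pd.DataFrame({'a': [3.0, 4.0], 'b': [1.5, 2.5]})],     # group 1
    [pd.DataFrame({'a': [5.0, 6.0], 'b': [3.5, 4.5]})],     # group 2
]
y_train = [[np.array([0, 1])], [np.array([1, 0])], [np.array([0, 0])]]
# model.cross_validate(X_train, y_train, folds=3)  # one group per fold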
Example #6
def run(inputFilename,
        metricFilename,
        cv=3,
        neighbors=10,
        threshold=1.0,
        gene_coeff=1,
        gene_exp=1,
        geo_coeff=1,
        geo_exp=1,
        feat_coeff=1,
        feat_exp=1,
        verbose=0,
        limit=-1,
        labeled=0,
        jobs=1,
        weighted=True,
        algorithm="knn",
        prefix=""):

    if verbose:
        print("Running " + repr(cv) + "-fold cross-validation:")

        if algorithm == "knn":
            print("  algorithm = knn")
            print("  neighbors = " + repr(neighbors))
            print("  distance threshold = " + repr(threshold))
            print("  gene_coeff = " + repr(gene_coeff))
            print("  gene_exp = " + repr(gene_exp))
            print("  geo_ceff = " + repr(geo_coeff))
            print("  geo_exp = " + repr(geo_exp))
            print("  feat_coeff = " + repr(feat_coeff))
            print("  feat_exp = " + repr(feat_exp))
            print("  weighted = " + repr(weighted))
        elif algorithm == "random":
            print("  algorithm = random")
        elif algorithm == "zero":
            print("  algorithm = zero")
        elif algorithm == "majority":
            print("  algorithm = majority")
        print("Loading feature dataset")

    inputDataset = Dataset()
    inputDataset.loadFromNPZ(inputFilename)
    inputFeatureList = inputDataset.getFeatureList()

    # pick out the features of interest
    inputFeatureList = [((l, f, s), v) for ((l, f, s), v) in inputFeatureList
                        if f.startswith(prefix)]

    if limit > 0:
        inputFeatureList = inputFeatureList[:limit]

    print("Using " + repr(len(inputFeatureList)) + " features")

    np.random.shuffle(inputFeatureList)
    inputData, inputTarget = zip(*inputFeatureList)
    inputData = np.array(inputData)
    inputTarget = np.array(inputTarget)

    if verbose:
        print("Making classifier")

    if algorithm == 'knn':
        distanceDataset = DistanceDataset()
        distanceDataset.loadFromNPZ(metricFilename)
        rc = GeoMetricKNNClassifier(distanceDataset,
                                    k=neighbors,
                                    threshold=threshold,
                                    verbose=verbose,
                                    gene_coeff=gene_coeff,
                                    gene_exp=gene_exp,
                                    geo_coeff=geo_coeff,
                                    geo_exp=geo_exp,
                                    weighted=weighted)
    elif algorithm == 'baggingKNN':
        distanceDataset = DistanceDataset()
        distanceDataset.loadFromNPZ(metricFilename)
        rc = GeoMetricKNNClassifier(distanceDataset,
                                    k=neighbors,
                                    threshold=threshold,
                                    verbose=verbose,
                                    gene_coeff=gene_coeff,
                                    gene_exp=gene_exp,
                                    geo_coeff=geo_coeff,
                                    geo_exp=geo_exp,
                                    weighted=weighted)
        rc = BaggingClassifier(rc, n_estimators=3, bootstrap=False)
    elif algorithm == 'random':
        rc = RandomClassifier()
    elif algorithm == 'majority':
        rc = AverageClassifier()
    elif algorithm == 'zero':
        rc = ZeroClassifier()
    elif algorithm == 'als':
        geocoordDataset = Dataset()
        geocoordDataset.loadFromNPZ(
            "../../../data/results/geodata/geocoord_features.npz")
        rc = MatrixFactorClassifier(algorithm='als', additionalDatasets=[])
    elif algorithm == 'svd':
        rc = MatrixFactorClassifier(algorithm='svd')
    elif algorithm == "featureKNNdivided":
        rc = DividedFeatureMetricKNNClassifier(k=neighbors,
                                               threshold=threshold,
                                               verbose=verbose,
                                               weighted=weighted)
    elif algorithm == "featureKNN":
        rc = FeatureMetricKNNClassifier(k=neighbors,
                                        threshold=threshold,
                                        verbose=verbose,
                                        weighted=weighted)
    elif algorithm == "KNNdivided":
        distanceDataset = DistanceDataset()
        distanceDataset.loadFromNPZ(metricFilename)
        rc = DividedKNNClassifier(distanceDataset,
                                  k=neighbors,
                                  threshold=threshold,
                                  verbose=verbose,
                                  gene_coeff=gene_coeff,
                                  gene_exp=gene_exp,
                                  geo_coeff=geo_coeff,
                                  geo_exp=geo_exp,
                                  feat_coeff=feat_coeff,
                                  feat_exp=feat_exp,
                                  weighted=weighted)
    elif algorithm == "ensemble":
        distanceDataset = DistanceDataset()
        distanceDataset.loadFromNPZ(metricFilename)
        rc = EnsembleClassifier()
        c = SavedMetricKNNClassifier(distanceDataset,
                                     "GENETIC",
                                     k=neighbors,
                                     threshold=threshold,
                                     verbose=verbose,
                                     weighted=weighted)
        rc.addClassifier(c)
        c = SavedMetricKNNClassifier(distanceDataset,
                                     "GEOGRAPHIC",
                                     k=neighbors,
                                     threshold=threshold,
                                     verbose=verbose,
                                     weighted=weighted)
        rc.addClassifier(c)
        c = DividedFeatureMetricKNNClassifier(k=neighbors,
                                              threshold=threshold,
                                              verbose=verbose,
                                              weighted=weighted)
        #c = DividedFeatureMetricKNNClassifier(k=neighbors, threshold=threshold, verbose=verbose, weighted=weighted)
        rc.addClassifier(c)
        #c = MatrixFactorClassifier(algorithm='als')
        #rc.addClassifier(c)
    else:
        sys.exit("ERROR: No algorithm named " + algorithm)

    if verbose:
        print("Cross validating")

    if labeled:
        labels = [inputDataset.languageCodes.index(l) for l, _, _ in inputData]
        kf = cross_validation.LabelKFold(labels, n_folds=cv)
    else:
        kf = cross_validation.KFold(len(inputData), n_folds=cv)

    scores = cross_validation.cross_val_score(rc,
                                              inputData,
                                              inputTarget,
                                              cv=kf,
                                              scoring="accuracy",
                                              n_jobs=jobs,
                                              verbose=verbose)
    print("Accuracy: %0.2f (+/- %0.2f)" %
          (scores.mean() * 100, scores.std() * 2 * 100))
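
A hypothetical invocation (file names are placeholders): 3-fold, label-aware cross-validation with the weighted kNN over genetic and geographic distances.

run("features.npz", "distances.npz",
    cv=3, neighbors=10, threshold=1.0,
    labeled=1, algorithm="knn", weighted=True, verbose=1)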