Example #1
def tags_report(feats, num=15):
    # compute a score for each tag and pick the ones that meet some threshold.
    tags = library.tags()
    meansquare = sum(len(v)**2 for v in tags.values()) / len(tags)
    significance = int(meansquare**0.5)
    tags = [(k, v) for k, v in tags.items() if len(v) > significance]
    # compute the mean and standard deviation for each feature.
    # for each tag, compute the mean for each track associated with that tag.
    # select features whose tag mean is more distant from the library mean
    # than the standard deviation.
    lib_mean = feats.mean(axis=0)
    lib_std = feats.std(axis=0)
    threshold = lib_std * 1.5
    # get the index for each track
    track_map = dict()
    for i, t in enumerate(library.tracks()):
        track_map[t.hash] = i
    # for each tag, make a mask with the indexes of its tracks
    names = features.names()
    for tag, vals in tags:
        print("tag %s is associated with %d tracks" % (tag, len(vals)))
        indexes = np.array([track_map[t.hash] for t in vals])
        tag_mean = feats[indexes, :].mean(axis=0)
        outliers = np.argwhere(np.absolute(tag_mean - lib_mean) > threshold)
        for i in outliers[..., 0]:
            print("    %s local mean=%.2f; library mean=%.2f" %
                  (names[i], tag_mean[i], lib_mean[i]))
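
The core of the report is the outlier test: a feature is flagged for a tag when the tag-local mean sits more than 1.5 library standard deviations away from the library-wide mean. A minimal, self-contained sketch of just that step, using synthetic data in place of the real library (the shapes and the 1.5 multiplier mirror the code above; everything else is invented for illustration):

import numpy as np

rng = np.random.default_rng(0)
feats = rng.normal(size=(200, 8))   # 200 tracks, 8 features
feats[:20, 3] += 3.0                # tracks 0..19 share a "tag" that shifts feature 3

lib_mean = feats.mean(axis=0)
threshold = feats.std(axis=0) * 1.5

tag_indexes = np.arange(20)         # indexes of the tagged tracks
tag_mean = feats[tag_indexes, :].mean(axis=0)
outliers = np.argwhere(np.absolute(tag_mean - lib_mean) > threshold)
print(outliers[..., 0])             # should single out feature 3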
Example #2
def correlation_report(feats, num=20):
    R = np.corrcoef(feats, rowvar=False)

    fig = plt.figure(1, figsize=(1280 / 64, 1280 / 64), dpi=96)
    plt.matshow(R, fignum=1)
    plt.gca().set_aspect(1.)
    plt.gca().axis('off')
    plt.savefig("correlation.png", dpi=96, bbox_inches='tight')

    # we only need half of this matrix, because it is symmetrical; mark the
    # lower triangle and the diagonal as invalid so the finite-value filter
    # below drops them from the ordering
    R[np.tril_indices_from(R)] = np.nan
    # we only care about magnitude of correlation, not direction
    flatR = R.ravel()
    np.absolute(flatR, out=flatR, where=np.isfinite(flatR))
    ordering = np.argsort(flatR)
    ordering = np.compress(np.isfinite(flatR[ordering]), ordering)
    names = features.names()

    print("top %d most highly correlated variables" % num)
    for flat in ordering[::-1][:num]:
        pair = np.unravel_index(flat, R.shape)
        coeff = R[pair]
        print("    %s . %s: %s" % (names[pair[0]], names[pair[1]], ns(coeff)))
    print("bottom %d least highly correlated variables" % num)
    for flat in ordering[:num]:
        pair = np.unravel_index(flat, R.shape)
        coeff = R[pair]
        print("    %s . %s: %s" % (names[pair[0]], names[pair[1]], ns(coeff)))
Example #3
def mean_stdev_limits_report(feats, *args, **kwargs):
    print("mean, stdev, and limits for each feature")
    names = features.names()
    for i in np.arange(feats.shape[-1]):
        feat = feats[:, i]
        minv, maxv = feat.min(), feat.max()
        meanv, stdv = feat.mean(), feat.std()
        print("%s: (%s .. %s); mean=%s, stdev=%s " %
              (names[i], ns(minv), ns(maxv), ns(meanv), ns(stdv)))
Example #4
def normaltest_report(feats, num=20):
    # to what degree does each feature represent a normal distribution?
    numfeats = feats.shape[-1]
    statistic = np.zeros(numfeats)
    pvalue = np.zeros(numfeats)
    for i in np.arange(numfeats):
        s, p = scipy.stats.normaltest(feats[:, i])
        statistic[i] = s
        pvalue[i] = p
        print("    %s s=%s, p=%s" % (features.names()[i], ns(s), ns(p)))
Example #5
def kurtosis_report(feats, num=20):
    # which are the most and the least gaussian features present?
    mean = feats.mean(axis=0)
    var = feats.var(axis=0)
    diffmean = feats - mean
    indexes = np.arange(feats.shape[-1])
    usable = (mean != 0) & (var != 0)
    mean = np.compress(usable, mean)
    var = np.compress(usable, var)
    diffmean = np.compress(usable, diffmean, axis=1)
    indexes = np.compress(usable, indexes)

    # excess kurtosis: zero for a normal distribution, so rank by magnitude
    kurt = (1. / feats.shape[0]) * np.sum(diffmean**4, axis=0) / (var**2) - 3.0
    ordering = np.argsort(np.absolute(kurt))
    print("top %d most gaussian features" % num)
    names = features.names()
    for i in ordering[:num]:
        print("    %s (%s)" % (names[indexes[i]], ns(kurt[i])))
    print("bottom %d least gaussian features" % num)
    for i in ordering[::-1][:num]:
        print("    %s (%s)" % (names[indexes[i]], ns(kurt[i])))
Example #6
def scaled_mean_stdev_report(feats, *args, **kwargs):
    print("mean, stdev for each feature after minmax and power scaling")
    names = features.names()
    # scale the limits so that all values fall within 0..1 for each feature
    scaled = feats.copy()
    scaled -= scaled.min(axis=0)
    maxv = scaled.max(axis=0)
    nonzero = maxv != 0
    scaled[:, nonzero] /= maxv[nonzero]

    # compute the linear average, then get the logarithm in that base of the
    # value 0.5. We will correct for distribution nonlinearity by raising every
    # scaled value to this power.
    meanv = scaled.mean(axis=0)
    powers = np.ones_like(meanv)
    powers[meanv.nonzero()] = np.log(0.5) / np.log(meanv[meanv.nonzero()])
    curved = scaled**powers

    # print out a little report of what we found
    for i in np.arange(feats.shape[-1]):
        lmean, lstd = curved[:, i].mean(), curved[:, i].std()
        print("%s: %s**%s = %s, dev=%s" %
              (names[i], ns(meanv[i]), ns(powers[i]), ns(lmean), ns(lstd)))

    # Plot the scaled and curved feature matrices.
    figsize = (feats.shape[0] / 96, 2 * feats.shape[1] / 96)
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=figsize)
    axes[0].matshow(scaled, cmap='gray')
    axes[0].axis('off')
    axes[0].set_aspect(1.0)
    axes[1].matshow(curved, cmap='gray')
    axes[1].axis('off')
    axes[1].set_aspect(1.0)
    plt.savefig("scaled_featmatrix.png", dpi=96, bbox_inches='tight')

    # Plot histograms of the scaled and curved features.
    hist_bins = 16
    figsize = (4, feats.shape[1] / 96)
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=figsize)
    histogram = np.zeros((feats.shape[1], hist_bins), dtype=float)
    for i in np.arange(feats.shape[1]):
        hist, edges = np.histogram(scaled[:, i],
                                   bins=hist_bins,
                                   range=(0, 1),
                                   density=True)
        histogram[i] = hist / hist.max()
    histogram = np.repeat(histogram, 128 // hist_bins, axis=1)
    histogram = np.pad(histogram, (8, 8),
                       'constant',
                       constant_values=(0.5, 0.5))
    axes[0].matshow(histogram, cmap='gray')
    axes[0].axis('off')
    axes[0].set_aspect(1.0)
    histogram = np.zeros((feats.shape[1], hist_bins), dtype=float)
    for i in np.arange(feats.shape[1]):
        hist, edges = np.histogram(curved[:, i],
                                   bins=hist_bins,
                                   range=(0, 1),
                                   density=True)
        histogram[i] = hist / hist.max()
    histogram = np.repeat(histogram, 128 // hist_bins, axis=1)
    histogram = np.pad(histogram, (8, 8),
                       'constant',
                       constant_values=(0.5, 0.5))
    axes[1].matshow(histogram, cmap='gray')
    axes[1].axis('off')
    axes[1].set_aspect(1.0)
    plt.savefig("scaled_featdist.png", dpi=96, bbox_inches='tight')
Example #7
def print_feat(i, feats):
    name = features.names()[i]
    avg = feats[:, i].mean()
    dev = feats[:, i].std()
    scale = (dev / np.abs(avg)) * 100.0
    print("    %s (%s . %s, %.2f%%)" % (name, ns(avg), ns(dev), scale))
Example #8
def train(num_labels=None,
          gridcv=False,
          randomcv=False,
          kbest=None,
          rfecv=False):
    # Load the track library. Collect metadata labels. Generate a target
    # matrix. Load features for each track in the target matrix.
    libtracks = library.tracks()
    labels = collect_labels(libtracks, num_labels)
    tracklist, target = generate_target(labels)
    data = Dataset(features.normalize(features.matrix(tracklist)), target)
    feat_names = features.names()

    if kbest:
        reduce_kbest(data, feat_names, kbest)

    if rfecv:
        reduce_rfecv(data, feat_names)

    train, test = split_dataset(data, test_size=0.4, random_state=0)
    # A random forest should be able to handle the excessive dimensionality
    # of our dataset relative to the number of samples.
    clf = RandomForestClassifier(n_estimators=120, n_jobs=-1, verbose=1)

    if randomcv:
        print "random parameter search..."
        randomsearch(
            clf, train, 20, {
                "max_depth": [3, None],
                "max_features": scipy.stats.randint(50, 100),
                "min_samples_split": scipy.stats.randint(2, 11),
                "min_samples_leaf": scipy.stats.randint(1, 11),
                "bootstrap": [True, False],
                "criterion": ["gini", "entropy"]
            })

    if gridcv:
        print "grid parameter search..."
        gridsearch(
            clf, train, {
                "max_depth": [3, None],
                "max_features": [50, 75, 100],
                "min_samples_split": [2, 3, 10],
                "min_samples_leaf": [1, 3, 10],
                "bootstrap": [True, False],
                "criterion": ["gini", "entropy"]
            })

    print("training classifier...")
    clf.fit(*train)
    mean_importance = clf.feature_importances_.mean()
    # Measure prediction accuracy for the original training run.
    pred_target = clf.predict(test.input)
    orig_score = accuracy_score(test.target, pred_target)
    print("accuracy score with %d features: %.2f%%" %
          (len(feat_names), orig_score * 100.0))

    # Reduce the feature set.
    print("selecting best features...")
    sfm = SelectFromModel(clf, threshold='1.5*mean')
    sfm.fit(*train)
    # Print the names of the most important features
    feature_subset = sfm.get_support(indices=True)
    for i in feature_subset:
        importance = clf.feature_importances_[i] / mean_importance
        print "    %.1f: '%s'" % (importance, feat_names[i])

    # make a new training set with just the useful features.
    print("preparing new training subset...")
    slim_train = transform_input(sfm, train)
    slim_test = transform_input(sfm, test)
    feat_names = [feat_names[i] for i in feature_subset]

    # train a new classifier using the reduced feature set.
    print("training subset classifier...")
    clf_slim = RandomForestClassifier(n_estimators=120, n_jobs=-1, verbose=1)
    clf_slim.fit(*slim_train)

    # measure accuracy of the retrained models
    pred_slim = clf_slim.predict(slim_test.input)
    slim_score = accuracy_score(slim_test.target, pred_slim)
    print("subset accuracy with %d features: %.2f%%" %
          (len(feature_subset), slim_score * 100.0))
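
The SelectFromModel step keeps only the features whose importance in the fitted forest exceeds 1.5 times the mean importance. A self-contained sketch of the same pattern on scikit-learn's synthetic classification data (the estimator and the threshold string mirror the code above; the dataset and its sizes are invented):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=400, n_features=50,
                           n_informative=5, random_state=0)
clf = RandomForestClassifier(n_estimators=120, n_jobs=-1, random_state=0)

sfm = SelectFromModel(clf, threshold='1.5*mean')
sfm.fit(X, y)                    # fits a clone of clf, then thresholds its importances
kept = sfm.get_support(indices=True)
X_slim = sfm.transform(X)
print("kept %d of %d features: %s" % (X_slim.shape[1], X.shape[1], kept))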