Пример #1
0
def classify_clusters(data, clf, haz_test, nohaz_test, dens_layers):
    """Classify data by density-cluster IDs and estimate the hazardous share.

    For every ``(eps, min_samples)`` pair in ``dens_layers`` the working data
    set is clustered with ``density_clusters`` and handed to
    ``merge_clusters`` together with the layer index.  The second value
    returned by ``merge_clusters`` replaces the working data set for the next
    layer.  A final ``merge_clusters(..., tail=True)`` call is made with the
    index following the last layer.  All merged arrays are concatenated,
    shuffled and used to fit ``clf``; the fitted classifier then predicts a
    class for every row of ``haz_test`` and ``nohaz_test``.

    Args:
        data: data set passed to the first clustering layer.
        clf: classifier exposing ``fit(X, y)`` and ``predict(X)``.
        haz_test: test rows of hazardous objects, passed to ``clf.predict``.
        nohaz_test: test rows of non-hazardous objects, passed to
            ``clf.predict``.
        dens_layers: iterable of ``(eps, min_samples)`` pairs, one per
            density layer.

    Returns:
        Tuple ``(merged_clusters, merged_px, clf, haz_prob)``.
        ``merged_clusters`` is the list of arrays returned by
        ``merge_clusters``; ``merged_px`` is the feature part of the shuffled
        training set; ``clf`` is the fitted classifier; ``haz_prob[i]`` is the
        fraction of test rows predicted as class ``i`` that came from
        ``haz_test`` (``nan`` when no test row was predicted as class ``i``).

    Raises:
        ValueError: if ``dens_layers`` is empty.
    """
    data_ = data
    merged_clusters = []
    densclust = None
    class_id = -1
    for class_id, (eps, min_samples) in enumerate(dens_layers):
        densclust = density_clusters(data_, eps=eps, min_samples=min_samples)
        labels = densclust.labels_
        # Single pre-formatted string: prints the same text on Python 2 and 3.
        print("len(densclust.labels_): %s %s" % (len(labels), type(labels)))
        print(np.unique(labels))
        merged, data_ = merge_clusters(data_, labels, class_id)
        merged_clusters.append(merged)

    if densclust is None:
        raise ValueError(
            "dens_layers must contain at least one (eps, min_samples) pair")

    # NOTE(review): the tail call reuses the labels of the last layer although
    # data_ was already replaced by that layer's merge -- confirm that this is
    # what merge_clusters expects.
    merged, data_ = merge_clusters(data_,
                                   densclust.labels_,
                                   class_id + 1,
                                   tail=True)
    merged_clusters.append(merged)

    merged_p = np.random.permutation(np.concatenate(tuple(merged_clusters)))

    # presumably splits off the last column as the target -- verify in ld
    merged_px, merged_py = ld.split_by_lastcol(merged_p)
    clf.fit(merged_px, merged_py)

    haz_counts = np.bincount(predict_ids(clf, haz_test)) if False else \
        np.bincount(clf.predict(haz_test).astype(int))
    nohaz_counts = np.bincount(clf.predict(nohaz_test).astype(int))

    # bincount only extends to the largest id it has seen, so the two arrays
    # may differ in length; walk the union of ids instead of zip(), which
    # would silently drop the classes present in only one of them.
    nclasses = max(len(haz_counts), len(nohaz_counts))
    haz_prob = []
    for cid in range(nclasses):
        haz = haz_counts[cid] if cid < len(haz_counts) else 0
        nohaz = nohaz_counts[cid] if cid < len(nohaz_counts) else 0
        total = haz + nohaz
        haz_prob.append(haz / float(total) if total else float("nan"))
    print("haz_prob: %s" % (haz_prob,))

    return merged_clusters, merged_px, clf, haz_prob
Пример #2
0
def sgmask_clf2d_fit(clf, cutcol, inner, outer, scales):
    """Fit a classifier separating subgroup asteroids from the rest.

    Both data frames are reduced to the two ``cutcol`` columns, cropped to
    the rectangle given by ``scales`` (bounds inclusive) and normalised with
    ``ld.normalize_dataset`` using those same bounds.  Rows from ``inner``
    are labelled 1, rows from ``outer`` are labelled 0, and the shuffled
    union is used to fit ``clf``.

    Args:
        clf: classifier exposing ``fit(X, y)``.
        cutcol: the two column names ``[x, y]`` to train on.
        inner: DataFrame with the objects belonging to the subgroup.
        outer: DataFrame with the remaining objects.
        scales: ``[(xmin, xmax), (ymin, ymax)]`` window for the two columns.

    Returns:
        Whatever ``clf.fit`` returns (the fitted estimator for
        scikit-learn style classifiers).
    """
    x, y = cutcol
    xmin, xmax = scales[0]
    ymin, ymax = scales[1]

    def _crop(frame):
        """Return the cutcol columns of the rows inside the window."""
        sub = frame[cutcol]
        inside = ((sub[x] >= xmin) & (sub[x] <= xmax) &
                  (sub[y] >= ymin) & (sub[y] <= ymax))
        # .values instead of .as_matrix(): the latter was removed in
        # pandas 1.0, while .values is available in old and new releases.
        return sub[inside].values

    bounds = np.asarray(scales).T

    # The second return value of normalize_dataset is not needed here.
    inner_cut, _ = ld.normalize_dataset(_crop(inner), bounds=bounds)
    outer_cut, _ = ld.normalize_dataset(_crop(outer), bounds=bounds)

    # Append the class label as the last column: 1 = inner, 0 = outer.
    inner_cut_id = np.append(inner_cut, np.ones((len(inner_cut), 1)), axis=1)
    outer_cut_id = np.append(outer_cut, np.zeros((len(outer_cut), 1)), axis=1)

    together = np.random.permutation(
        np.concatenate((inner_cut_id, outer_cut_id)))

    # presumably splits off the label column appended above -- verify in ld
    xtrain, ytrain = ld.split_by_lastcol(together)
    return clf.fit(xtrain, ytrain)
Пример #3
0
def sgmask_clf(hazdf, nohazdf, hazdf_rest, nohazdf_rest, clf, cutcol):
    """Fit a classifier separating subgroup asteroids from the rest.

    The subgroup is the concatenation of ``hazdf`` and ``nohazdf``; it is
    labelled 1.  The concatenation of ``hazdf_rest`` and ``nohazdf_rest`` is
    cropped to the bounding box of the subgroup in the two ``cutcol``
    columns (bounds inclusive) and labelled 0.  Each set is normalised with
    ``ld.normalize_dataset`` and the shuffled union is used to fit ``clf``.

    Args:
        hazdf: DataFrame with hazardous objects of the subgroup.
        nohazdf: DataFrame with non-hazardous objects of the subgroup.
        hazdf_rest: DataFrame with hazardous objects outside the subgroup.
        nohazdf_rest: DataFrame with non-hazardous objects outside the
            subgroup.
        clf: classifier exposing ``fit(X, y)``.
        cutcol: list whose first two entries are the column names to use.

    Returns:
        Whatever ``clf.fit`` returns (the fitted estimator for
        scikit-learn style classifiers).
    """
    df = pd.concat((hazdf, nohazdf))
    x, y = cutcol[0], cutcol[1]
    # Series.min()/max() run vectorised and skip NaN, unlike the builtins.
    xmin, xmax = df[x].min(), df[x].max()
    ymin, ymax = df[y].min(), df[y].max()

    # .values instead of .as_matrix(): the latter was removed in pandas 1.0,
    # while .values is available in old and new releases.
    datacut, _ = ld.normalize_dataset(df[cutcol].values)
    datacut_ = np.append(datacut, np.ones((len(datacut), 1)), axis=1)

    rest = pd.concat((hazdf_rest, nohazdf_rest))
    inside = ((rest[x] >= xmin) & (rest[x] <= xmax) &
              (rest[y] >= ymin) & (rest[y] <= ymax))
    restcut = rest[inside][cutcol].values

    # NOTE(review): the rest is normalised independently of the subgroup, so
    # the two classes may end up on different scales; sgmask_clf2d_fit passes
    # shared ``bounds`` to normalize_dataset instead -- confirm which is
    # intended.
    restcut, _ = ld.normalize_dataset(restcut)
    restcut_ = np.append(restcut, np.zeros((len(restcut), 1)), axis=1)

    data_rest = np.random.permutation(np.concatenate((datacut_, restcut_)))

    # presumably splits off the label column appended above -- verify in ld
    xtrain, ytrain = ld.split_by_lastcol(data_rest)
    return clf.fit(xtrain, ytrain)
Пример #4
0
def sgmask_clf2d_fitcut(clf, inner_cut, outer_cut):
    """
    Train ``clf`` to tell subgroup members apart from all other asteroids.

    ``inner_cut`` rows receive label 1 and ``outer_cut`` rows label 0; the
    label is appended as an extra last column, the two sets are stacked and
    shuffled, and the result is split again with ``ld.split_by_lastcol``
    before fitting.  Returns the value of ``clf.fit``.
    """
    # Label columns: ones for the subgroup, zeros for everything else.
    member_flags = np.ones((len(inner_cut), 1))
    other_flags = np.zeros((len(outer_cut), 1))

    labelled_members = np.append(inner_cut, member_flags, axis=1)
    labelled_others = np.append(outer_cut, other_flags, axis=1)

    shuffled = np.random.permutation(
        np.concatenate((labelled_members, labelled_others)))

    features, targets = ld.split_by_lastcol(shuffled)
    return clf.fit(features, targets)
Пример #5
0
def classify_dbclusters(clusters, clf, haz_test, nohaz_test):
    """
    Classify data by density-cluster IDs and estimate the PHA share per class.

    The arrays in ``clusters`` are concatenated, shuffled and split with
    ``ld.split_by_lastcol`` into features and targets used to fit ``clf``.
    The fitted classifier then predicts a class for every row of
    ``haz_test`` and ``nohaz_test``.

    Args:
        clusters: sequence of arrays to concatenate into the training set.
        clf: classifier exposing ``fit(X, y)`` and ``predict(X)``.
        haz_test: test rows of hazardous objects, passed to ``clf.predict``.
        nohaz_test: test rows of non-hazardous objects, passed to
            ``clf.predict``.

    Returns:
        Tuple ``(mixed_x, clf, haz_prob)``: the training features, the value
        returned by ``clf.fit`` and a list where ``haz_prob[i]`` is the
        fraction of test rows predicted as class ``i`` that came from
        ``haz_test`` (``nan`` when no test row was predicted as class ``i``).
    """
    mixed = np.random.permutation(np.concatenate(tuple(clusters)))
    # presumably splits off the last column as the target -- verify in ld
    mixed_x, mixed_y = ld.split_by_lastcol(mixed)
    clf = clf.fit(mixed_x, mixed_y)

    haz_counts = np.bincount(clf.predict(haz_test).astype(int))
    nohaz_counts = np.bincount(clf.predict(nohaz_test).astype(int))

    # bincount only extends to the largest id it has seen, so the two arrays
    # may differ in length; walk the union of ids instead of zip(), which
    # would silently drop the classes present in only one of them.
    nclasses = max(len(haz_counts), len(nohaz_counts))
    haz_prob = []
    for cid in range(nclasses):
        haz = haz_counts[cid] if cid < len(haz_counts) else 0
        nohaz = nohaz_counts[cid] if cid < len(nohaz_counts) else 0
        total = haz + nohaz
        haz_prob.append(haz / float(total) if total else float("nan"))

    return mixed_x, clf, haz_prob