def classify_clusters(data, clf, haz_test, nohaz_test, dens_layers):
    """Label data by density layer and estimate the hazardous share per class.

    For every ``(eps, min_samples)`` pair in ``dens_layers`` the data still
    remaining is clustered with ``density_clusters`` and handed to
    ``merge_clusters`` together with the layer index as class ID; the second
    value returned by ``merge_clusters`` becomes the input of the next layer.
    After the last layer one more ``merge_clusters(..., tail=True)`` call is
    made with the next class ID.  All merged arrays are concatenated,
    shuffled with ``np.random.permutation`` and used to fit ``clf``, which
    then predicts a class for every row of ``haz_test`` and ``nohaz_test``.

    Args:
        data: dataset passed to ``density_clusters`` for the first layer.
        clf: classifier exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        haz_test: samples of hazardous asteroids to classify.
        nohaz_test: samples of non-hazardous asteroids to classify.
        dens_layers: iterable of ``(eps, min_samples)`` pairs, at least one.

    Returns:
        Tuple ``(merged_clusters, merged_px, clf, haz_prob)``:
        the list of arrays returned by ``merge_clusters``, the training
        features returned by ``ld.split_by_lastcol``, the classifier, and a
        list indexed by predicted class ID holding
        ``n_haz / (n_haz + n_nohaz)`` for the test samples predicted into
        that class.  A class that receives no test sample at all divides
        zero by zero and therefore yields NaN under NumPy's rules.

    Raises:
        ValueError: if ``dens_layers`` yields no layer.
    """
    remaining = data
    merged_clusters = []
    densclust = None
    for class_id, (eps, min_samples) in enumerate(dens_layers):
        densclust = density_clusters(remaining, eps=eps,
                                     min_samples=min_samples)
        # Single-argument print() calls behave the same on Python 2 and 3.
        print("len(densclust.labels_): %s %s"
              % (len(densclust.labels_), type(densclust.labels_)))
        print(np.unique(densclust.labels_))
        merged, remaining = merge_clusters(remaining, densclust.labels_,
                                           class_id)
        merged_clusters.append(merged)

    if densclust is None:
        # Without a layer there are no labels for the tail merge below.
        raise ValueError(
            "dens_layers must contain at least one (eps, min_samples) pair")

    # NOTE(review): the tail call reuses the labels computed for the last
    # layer although `remaining` has already been reduced by that layer's
    # merge -- confirm merge_clusters(tail=True) expects exactly this.
    merged, remaining = merge_clusters(remaining, densclust.labels_,
                                       class_id + 1, tail=True)
    merged_clusters.append(merged)

    merged_p = np.random.permutation(np.concatenate(tuple(merged_clusters)))
    # presumably the last column holds the class IDs -- see ld.split_by_lastcol
    merged_px, merged_py = ld.split_by_lastcol(merged_p)
    clf.fit(merged_px, merged_py)

    haz_ids = clf.predict(haz_test).astype(int)
    nohaz_ids = clf.predict(nohaz_test).astype(int)

    # Count over the union of both prediction sets so that a class predicted
    # for only one of them is not silently dropped by zip() truncation.
    classnum_all = np.bincount(np.concatenate((haz_ids, nohaz_ids)))
    # minlength is kept >= 1 because some older NumPy releases reject 0;
    # zip() below trims the surplus bin when there are no predictions.
    classnum_haz = np.bincount(haz_ids,
                               minlength=max(len(classnum_all), 1))
    haz_prob = [haz / float(total)
                for haz, total in zip(classnum_haz, classnum_all)]
    print("haz_prob: %s" % (haz_prob,))

    return merged_clusters, merged_px, clf, haz_prob
def sgmask_clf2d_fit(clf, cutcol, inner, outer, scales):
    """Fit a classifier separating subgroup asteroids from the rest in 2D.

    Both frames are reduced to the two columns in ``cutcol``, clipped to the
    closed rectangle given by ``scales`` and normalised with
    ``ld.normalize_dataset`` using the same bounds.  Rows from ``inner`` are
    labelled 1, rows from ``outer`` are labelled 0; the labelled rows are
    concatenated, shuffled with ``np.random.permutation`` and used to fit
    ``clf``.

    Args:
        clf: classifier exposing ``fit(X, y)``.
        cutcol: the two column names ``[x, y]`` to train on.
        inner: pandas DataFrame with the subgroup members.
        outer: pandas DataFrame with the remaining asteroids.
        scales: ``[(xmin, xmax), (ymin, ymax)]`` limits of the rectangle.

    Returns:
        The value returned by ``clf.fit`` (the fitted estimator itself for
        scikit-learn style classifiers).
    """
    x, y = cutcol
    xmin, xmax = scales[0]
    ymin, ymax = scales[1]

    def _rows_in_box(frame):
        # Select the two columns and keep rows inside the closed rectangle.
        cut = frame[cutcol]
        keep = ((cut[x] >= xmin) & (cut[x] <= xmax) &
                (cut[y] >= ymin) & (cut[y] <= ymax))
        # .values replaces DataFrame.as_matrix(), which pandas 1.0 removed.
        return cut[keep].values

    inner_rows = _rows_in_box(inner)
    outer_rows = _rows_in_box(outer)

    # Transposed so the rows read (xmin, ymin) and (xmax, ymax).
    bounds = np.asarray(scales).T
    inner_cut, _ = ld.normalize_dataset(inner_rows, bounds=bounds)
    outer_cut, _ = ld.normalize_dataset(outer_rows, bounds=bounds)

    # Append the subgroup flag as the last column: 1 = inner, 0 = outer.
    inner_cut_id = np.append(inner_cut, np.ones((len(inner_cut), 1)), axis=1)
    outer_cut_id = np.append(outer_cut, np.zeros((len(outer_cut), 1)), axis=1)

    together = np.random.permutation(
        np.concatenate((inner_cut_id, outer_cut_id)))
    # presumably splits features from the flag column -- see ld.split_by_lastcol
    xtrain, ytrain = ld.split_by_lastcol(together)
    return clf.fit(xtrain, ytrain)
def sgmask_clf(hazdf, nohazdf, hazdf_rest, nohazdf_rest, clf, cutcol):
    """Fit a classifier separating subgroup asteroids from the rest.

    The subgroup is ``hazdf`` plus ``nohazdf``; its extent in the first two
    columns of ``cutcol`` defines a bounding rectangle.  The remaining
    asteroids (``hazdf_rest`` plus ``nohazdf_rest``) are clipped to that
    closed rectangle.  Both sets are normalised with
    ``ld.normalize_dataset``, labelled 1 (subgroup) and 0 (rest),
    concatenated, shuffled with ``np.random.permutation`` and used to fit
    ``clf``.

    Args:
        hazdf: pandas DataFrame of hazardous subgroup members.
        nohazdf: pandas DataFrame of non-hazardous subgroup members.
        hazdf_rest: pandas DataFrame of hazardous asteroids outside the
            subgroup.
        nohazdf_rest: pandas DataFrame of non-hazardous asteroids outside
            the subgroup.
        clf: classifier exposing ``fit(X, y)``.
        cutcol: list of column names to train on; the first two define the
            bounding rectangle.

    Returns:
        The value returned by ``clf.fit`` (the fitted estimator itself for
        scikit-learn style classifiers).
    """
    df = pd.concat((hazdf, nohazdf))
    x, y = cutcol[0], cutcol[1]
    # Series.min()/max() skip NaN; the builtin min()/max() give a result
    # that depends on where a NaN happens to sit in the column.
    xmin, xmax = df[x].min(), df[x].max()
    ymin, ymax = df[y].min(), df[y].max()

    # .values replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    datacut, _ = ld.normalize_dataset(df[cutcol].values)
    datacut_ = np.append(datacut, np.ones((len(datacut), 1)), axis=1)

    rest = pd.concat((hazdf_rest, nohazdf_rest))
    in_box = ((rest[x] >= xmin) & (rest[x] <= xmax) &
              (rest[y] >= ymin) & (rest[y] <= ymax))
    # NOTE(review): subgroup and rest are normalised independently here,
    # whereas sgmask_clf2d_fit passes shared bounds to normalize_dataset --
    # confirm the two sets really end up on the same scale.
    restcut, _ = ld.normalize_dataset(rest[in_box][cutcol].values)
    restcut_ = np.append(restcut, np.zeros((len(restcut), 1)), axis=1)

    data_rest = np.random.permutation(np.concatenate((datacut_, restcut_)))
    # presumably splits features from the flag column -- see ld.split_by_lastcol
    xtrain, ytrain = ld.split_by_lastcol(data_rest)
    return clf.fit(xtrain, ytrain)
def sgmask_clf2d_fitcut(clf, inner_cut, outer_cut):
    """Fit a classifier on already cut subgroup / non-subgroup samples.

    Rows of ``inner_cut`` get the label 1 and rows of ``outer_cut`` the
    label 0, appended as an extra last column.  The labelled rows are
    stacked, shuffled with ``np.random.permutation`` and split again by
    ``ld.split_by_lastcol`` before being passed to ``clf.fit``.

    Args:
        clf: classifier exposing ``fit(X, y)``.
        inner_cut: 2D array of samples belonging to the subgroup.
        outer_cut: 2D array of samples outside the subgroup.

    Returns:
        The value returned by ``clf.fit``.
    """
    def _tagged(rows, make_column):
        # Glue an (n, 1) column of constant labels to the right of `rows`.
        count = len(rows)
        return np.concatenate((rows, make_column((count, 1))), axis=1)

    labelled_inner = _tagged(inner_cut, np.ones)
    labelled_outer = _tagged(outer_cut, np.zeros)

    shuffled = np.random.permutation(
        np.vstack((labelled_inner, labelled_outer)))
    features, labels = ld.split_by_lastcol(shuffled)
    return clf.fit(features, labels)
def classify_dbclusters(clusters, clf, haz_test, nohaz_test):
    """Classify test data by cluster ID and compute the hazardous share.

    The arrays in ``clusters`` are concatenated, shuffled with
    ``np.random.permutation`` and split by ``ld.split_by_lastcol`` into
    training features and labels for ``clf``.  The fitted classifier then
    predicts a class for every row of ``haz_test`` and ``nohaz_test``.

    Args:
        clusters: sequence of arrays to concatenate into the training set.
        clf: classifier exposing ``fit(X, y)`` and ``predict(X)``.
        haz_test: samples of hazardous asteroids to classify.
        nohaz_test: samples of non-hazardous asteroids to classify.

    Returns:
        Tuple ``(mixed_x, clf, haz_prob)``: the training features, the
        value returned by ``clf.fit``, and a list indexed by predicted
        class ID holding ``n_haz / (n_haz + n_nohaz)`` for the test samples
        predicted into that class.  A class that receives no test sample at
        all divides zero by zero and therefore yields NaN under NumPy's
        rules.
    """
    mixed = np.random.permutation(np.concatenate(tuple(clusters)))
    # presumably the last column holds the class IDs -- see ld.split_by_lastcol
    mixed_x, mixed_y = ld.split_by_lastcol(mixed)
    clf = clf.fit(mixed_x, mixed_y)

    haz_ids = clf.predict(haz_test).astype(int)
    nohaz_ids = clf.predict(nohaz_test).astype(int)

    # Count over the union of both prediction sets so that a class predicted
    # for only one of them is not silently dropped by zip() truncation.
    classnum_all = np.bincount(np.concatenate((haz_ids, nohaz_ids)))
    # minlength is kept >= 1 because some older NumPy releases reject 0;
    # zip() below trims the surplus bin when there are no predictions.
    classnum_haz = np.bincount(haz_ids,
                               minlength=max(len(classnum_all), 1))
    haz_prob = [haz / float(total)
                for haz, total in zip(classnum_haz, classnum_all)]

    return mixed_x, clf, haz_prob