def test_dbscan(self):
    dbs = dbscan.DBSCAN(eps=0.4, min_samples=5)
    clusters = dbs.fit_predict(self.X)
    self.assertEqual(len(np.unique(clusters[clusters >= 0])), 2)

    dbs = dbscan.DBSCAN(eps=0.2)
    clusters = dbs.fit_predict(self.X)
    self.assertGreater(len(np.unique(clusters[clusters >= 0])), 3)
    self.assertGreater(np.sum(clusters < 0), 3)
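# Illustrative sketch, not from the original test file: self.X is assumed to be a
# small 2-D blob dataset built in the TestCase's setUp. A fixture along these lines
# would exercise the assertions above, though the exact blob parameters would need
# tuning so that eps=0.4 yields two clusters while eps=0.2 fragments them and
# leaves noise. All names and parameters here are assumptions.
import numpy as np
from sklearn import datasets

def setUp(self):  # would live inside the same unittest.TestCase as test_dbscan
    self.X, _ = datasets.make_blobs(n_samples=200, centers=2,
                                    cluster_std=0.4, random_state=0)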
def process_aggregation(subj, filename, clas, boxes, tips, notches):
    # scale a suitable eps for clustering the boxes from the median box width
    w = []
    for idn in range(0, len(boxes)):
        w.append(boxes[idn][2])
    if len(w) > 0:
        typ_width = median(w)
    else:
        typ_width = 60
    # scale a suitable min_points from the number of classifications
    min_point = max(int(clas * .405), 2)

    # cluster the box centroids
    epb = max(typ_width * .20, 20)
    scanb = dbscan.DBSCAN(epb, min_point)
    scanb.cluster(boxes)
    sorted_boxes = sorted(scanb.points, key=operator.itemgetter(0), reverse=True)
    bc_p = json.dumps(sorted_boxes)
    bclusters = json.dumps(scanb.clusters)

    # cluster the tip and notch points
    ept = max(typ_width * .15, 20)
    scant = dbscan.DBSCAN(ept, min_point)
    scant.cluster(tips)
    sorted_tips = sorted(scant.points, key=operator.itemgetter(1))
    tc_p = json.dumps(sorted_tips)
    tclusters = json.dumps(scant.clusters)

    scann = dbscan.DBSCAN(ept, min_point)
    scann.cluster(notches)
    nc_p = json.dumps(scann.points)
    nclusters = json.dumps(scann.clusters)

    # clean up the clusters to settle on boxes with exactly two enclosed tip points
    fluke_positions = fluke_pos(subj, sorted_tips, sorted_boxes)

    # prepare a row to write the resolved fluke boxes and points to file
    new_row = {'subject_ids': subj,
               'filename': filename,
               'classifications': clas,
               'boxes': json.dumps(boxes),
               'box_clusters': bc_p,
               'bclusters': bclusters,
               'tips': json.dumps(tips),
               'tip_clusters': tc_p,
               'tclusters': tclusters,
               'notches': json.dumps(notches),
               'notch_clusters': nc_p,
               'nclusters': nclusters,
               'flukes': json.dumps(fluke_positions)}
    return new_row
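# Illustrative usage, not from the original: the returned row is shaped for a
# csv.DictWriter whose fieldnames match the keys of new_row. The output file name
# and the collected_subjects iterable below are hypothetical.
import csv

fieldnames = ['subject_ids', 'filename', 'classifications', 'boxes', 'box_clusters',
              'bclusters', 'tips', 'tip_clusters', 'tclusters', 'notches',
              'notch_clusters', 'nclusters', 'flukes']
with open('aggregated_flukes.csv', 'w', newline='') as out_file:
    writer = csv.DictWriter(out_file, fieldnames=fieldnames)
    writer.writeheader()
    # collected_subjects stands for an iterable of per-subject annotation tuples
    for subj, filename, clas, boxes, tips, notches in collected_subjects:
        writer.writerow(process_aggregation(subj, filename, clas, boxes, tips, notches))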
def process_aggregation(subj, image, clas, ep, min_point, h_palms, flowring, leafles):
    if clas > 10:  # require more than 10 valid classifications
        scanh = dbscan.DBSCAN(ep, min_point)
        scanh.cluster(h_palms)
        hc_p = json.dumps(scanh.points)
        count_h = len(scanh.points)
        hclusters = json.dumps(scanh.clusters)
        hnoise = json.dumps(scanh.noise)

        scanf = dbscan.DBSCAN(ep, min_point)
        scanf.cluster(flowring)
        fc_p = json.dumps(scanf.points)
        count_f = len(scanf.points)
        fclusters = json.dumps(scanf.clusters)
        fnoise = json.dumps(scanf.noise)

        scanl = dbscan.DBSCAN(ep, min_point)
        scanl.cluster(leafles)
        lc_p = json.dumps(scanl.points)
        count_l = len(scanl.points)
        lclusters = json.dumps(scanl.clusters)
        lnoise = json.dumps(scanl.noise)

        print(subj)
        new_row = {'subject_ids': subj,
                   'image': image,
                   'classifications': clas,
                   'Count_h_palms': count_h,
                   'H_palm_clusters': hc_p,
                   'Hclusters': hclusters,
                   'Hnoise': hnoise,
                   'Count_flowering': count_f,
                   'flowering_clusters': fc_p,
                   'fclusters': fclusters,
                   'fnoise': fnoise,
                   'Count_leafless': count_l,
                   'leafless_clusters': lc_p,
                   'lclusters': lclusters,
                   'lnoise': lnoise}
        writer.writerow(new_row)  # writer is a csv.DictWriter defined at module scope
        return True
    else:
        return False
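# Illustrative sketch, not from the original: process_aggregation above writes
# through a module-level csv.DictWriter. A setup consistent with the keys of
# new_row could look like this (the output file name is an assumption).
import csv

out_file = open('aggregated_palms.csv', 'w', newline='')
writer = csv.DictWriter(out_file, fieldnames=[
    'subject_ids', 'image', 'classifications',
    'Count_h_palms', 'H_palm_clusters', 'Hclusters', 'Hnoise',
    'Count_flowering', 'flowering_clusters', 'fclusters', 'fnoise',
    'Count_leafless', 'leafless_clusters', 'lclusters', 'lnoise'])
writer.writeheader()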
def main():
    FileName = 'DBSCAN_data.csv'
    eps = 5
    MinPts = 2
    print("\nImporting csv file", FileName, "...")
    D = dbscan.dataRead(FileName)
    print("IMPORTING COMPLETE\n")

    obj = dbscan.DBSCAN(D, eps, MinPts)
    print("The input dataset for our clustering is:")
    obj.displayDataset()

    print("Running DBSCAN clustering...")
    obj.runDBSCAN()
    print("CLUSTERING COMPLETE\n")

    print("The clusters are:")
    obj.displayClusters()
    print("The noise obtained after clustering is:")
    obj.displayNoise()

    ClusterList = obj.createClusterList()
    NoiseList = obj.createNoiseList()
    graphplot.plot(ClusterList, NoiseList)
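# Standard entry-point guard (an addition, not in the original snippet) so the
# pipeline above runs when this file is executed directly.
if __name__ == '__main__':
    main()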
print(len(a))
print(ag.GetLabels(len(a)))

from sklearn import datasets, cluster

blobs, _ = datasets.make_blobs(n_samples=300, random_state=10)
ag = agnes.Agnes(3)
skk = cluster.AgglomerativeClustering(n_clusters=3)  # assumed reference clusterer; 'skk' was not defined in this fragment
skk.fit(blobs)
print("sklearn clusters")
print(skk.labels_[:30])
ag.Fit(blobs)
print('agnes clusters')
print(ag.GetLabels(len(blobs)).astype(int)[:30])

# DBSCAN TESTING
sys.stderr.write("Importing DBSCAN\n")
import dbscan
sys.stderr.write("Importing Finished\n")

sys.stderr.write("Initting\n")
db = dbscan.DBSCAN(0.5, 5)
sys.stderr.write("Finished Init\n")

sys.stderr.write("Fitting\n")
db.Fit(a)  # 'a' is the dataset built earlier in this script
sys.stderr.write("Finished Fitting\n")

sys.stderr.write("Cluster Representation\n")
print(len(a))
print(db.GetLabels(len(a)))
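# Illustrative addition, not in the original script: a sanity check mirroring the
# agnes/sklearn comparison above, running sklearn's DBSCAN and the custom dbscan
# module on the same blobs so the label arrays can be compared side by side.
from sklearn.cluster import DBSCAN as SKDBSCAN

sk_db = SKDBSCAN(eps=0.5, min_samples=5)
sk_db.fit(blobs)
print('sklearn dbscan clusters')
print(sk_db.labels_[:30])

db_blobs = dbscan.DBSCAN(0.5, 5)
db_blobs.Fit(blobs)
print('custom dbscan clusters')
print(db_blobs.GetLabels(len(blobs))[:30])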
# time the custom Agnes implementation
for i in range(n):
    c = agnes.Agnes(3, 'wards')
    t0 = time.time()
    c.Fit(blobs)
    t1 = time.time()
    t_total += (t1 - t0)
aimpl = t_total / n

# time sklearn's DBSCAN as the reference
t_total = 0.0
for i in range(n):
    c = cluster.DBSCAN(eps=1.5, min_samples=5)
    t0 = time.time()
    c.fit(blobs)
    t1 = time.time()
    t_total += (t1 - t0)
dref = t_total / n

# time the custom DBSCAN implementation
t_total = 0.0
for i in range(n):
    c = dbscan.DBSCAN(1.5, 5)
    t0 = time.time()
    c.Fit(blobs)
    t1 = time.time()
    t_total += (t1 - t0)
dimpl = t_total / n

# num, kref, kimpl and aref are produced by the earlier timing loops in this script
print('{:<5d}\t{:<10.4f}\t{:<10.4f}\t{:<10.4f}\t{:<10.4f}\t{:<10.4f}\t{:<10.4f}'.format(
    num, kref, kimpl, aref, aimpl, dref, dimpl))
def epsOpt(k, D):
    # the def line and the initialisation of `distance` were missing from this
    # fragment; they are reconstructed here from how the function is called below
    distance = numpy.zeros((len(D), len(D)))
    for p in range(0, len(D)):
        for q in range(0, len(D)):
            distance[p, q] = numpy.linalg.norm(D[p] - D[q])
    return numpy.sort(distance, axis=0)


# Reading and extracting data
data = pd.read_csv('cho.txt', header=None, sep='\t')
#data = pd.read_csv('iyer.txt', header=None, sep='\t')
#data = pd.read_csv('new_dataset_1.txt', header=None, sep='\t')
data = data.values
data_ground_truth = data[:, 1]
data_features = data[:, 2:]

# Determining eps
for i in range(3, 20, 1):
    # i is MinPts; a representative range of 3 to 20 is used throughout
    epsOpt(i - 1, data_features)  # obtain the sorted-distance plot by running epsOpt
# determine eps by taking the average of the best eps for each MinPts from 3 to 20 on the plot;
# the best eps for each MinPts is at the gap point

# Determining MinPts by iteration
# after determining eps in the first run, substitute eps into the DBSCAN call below and run again
for j in range(3, 20, 1):
    data_id = dbscan.DBSCAN(data_features, 1.3, j)
    ARI = dbscan.adjusted_rand_score(data_ground_truth, data_id)
    print('The Rand Index of eps {} MinPts {} is {}'.format(1.3, j, ARI))
# choose the MinPts-eps pair with the largest Rand index
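# Illustrative sketch, not from the original: one way to visualise the k-distance
# curve built from epsOpt's sorted distance matrix and eyeball the gap that
# suggests eps. Assumes matplotlib is available; row k of the sorted matrix holds
# each point's distance to its k-th nearest neighbour.
import numpy
import matplotlib.pyplot as plt

def plot_k_distance(k, D):
    sorted_dist = epsOpt(k, D)             # distance matrix, each column sorted ascending
    k_dist = numpy.sort(sorted_dist[k])    # k-th nearest-neighbour distance for every point
    plt.plot(k_dist)
    plt.xlabel('points sorted by distance')
    plt.ylabel('{}-NN distance'.format(k))
    plt.show()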
# as usual the module must be in the current directory or on the sys path.
import json
import dbscan

# data is a list of [x, y] points (some entries here are tuples or carry an extra label)
data = [(629.1, 187.4), [636.5, 73.7], [474.4, 300.0], [541.7, 476.9],
        [544.9, 471.6], [529.1, 494.8, 'label'], (533.8, 473.2),
        [508.0, 362.1, 'label'], [485.9, 246.3], [484.9, 251.6],
        [370.1, 253.7], [604.7, 271.6], (607.0, 288.4), [603.8, 297.9],
        [719.6, 333.7]]

# determine a suitable eps and min_points
eps = 30
min_points = 3
print('epsilon =', eps, ' min_points =', min_points)

# and plug them into an instance of the module's DBSCAN class:
scan = dbscan.DBSCAN(eps, min_points)

# pass the data to the cluster function
scan.cluster(data)

# all done! Get the clustered data back:
print('clusters found:', scan.clusters)
print('number_of_clusters =', len(scan.clusters))
print('noise ie points in no cluster:', scan.noise)

# to save it in a known format convert to json strings.
# note subtle changes in brackets and quotes will occur!
clusters = json.dumps(scan.clusters)
noise = json.dumps(scan.noise)
print('ready to write clusters =', clusters)
print('ready to write noise =', noise)
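# Illustrative addition, not from the original: the JSON strings round-trip back to
# Python lists with json.loads, which is handy when reading the saved output back
# for analysis (json turns tuples into lists, hence the bracket changes noted above).
recovered_clusters = json.loads(clusters)
recovered_noise = json.loads(noise)
print('recovered', len(recovered_clusters), 'clusters and', len(recovered_noise), 'noise points')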