def dummy_func(args):
    """Cluster one subject's markings; worker entry point for pool mapping.

    Accepts a single tuple argument. The original signature used Python 2
    tuple parameter unpacking (``def dummy_func((count, ...))``), which was
    removed in Python 3 (PEP 3113); unpacking is now done explicitly, so
    callers that pass one tuple are unaffected.

    Tuple contents:
        count         -- progress counter, used only for the log line
        zooniverse_id -- subject identifier, echoed back in the result
        markings      -- list of markings to cluster; may be empty
        user_ids      -- per-marking user ids
        tools         -- per-marking tool ids
        num_users     -- number of users who saw the subject

    Returns (zooniverse_id, clustering_results, num_users); when there are
    no markings, clustering_results is the sentinel (-1, 0).
    """
    count, zooniverse_id, markings, user_ids, tools, num_users = args
    # Nothing to cluster -- return the sentinel result without building an engine.
    if markings == []:
        return zooniverse_id, (-1, 0), num_users
    clustering_engine = Agglomerative(None, None, {})
    # NOTE(review): markings is passed twice, mirroring the original call --
    # confirm against Agglomerative.__cluster__'s parameter list.
    clustering_results = clustering_engine.__cluster__(markings, user_ids, tools, markings, None, None)
    print("++ " + str(count))
    return zooniverse_id, clustering_results, num_users
def agglomerative_clustering(filename, n_clusters, linkage, affinity):
    """Read samples from *filename*, run agglomerative clustering with the
    given number of clusters / linkage / affinity, and print the result."""
    data = read_data(filename)
    model = Agglomerative(n_clusters, linkage, affinity)
    model.fit(data)
    result = model.get_clusters()
    print(linkage, affinity)
    print("Cluster Result: \n", result)
# Tail of ROI-file parsing (the loop over `l` / `roi_dict` setup lies before
# this chunk): the first field of the line is the image path, the remaining
# fields are comma-separated x,y pairs, divided by 1.92 -- presumably a
# display-to-original image scale factor; TODO confirm.
path = l[0]
t = [r.split(",") for r in l[1:] if r != ""]
roi_dict[path] = [(int(x)/1.92,int(y)/1.92) for (x,y) in t]

# Connect to the local MongoDB "penguin" database and grab both collections.
client = pymongo.MongoClient()
db = client['penguin']
classification_collection = db["penguin_classifications"]
subject_collection = db["penguin_subjects"]

# One-off migration, kept for reference: copy each classification's
# zooniverse_id up to a top-level field.
# for c in classification_collection.find():
#     _id = c["_id"]
#     zooniverse_id = c["subjects"][0]["zooniverse_id"]
#
#     classification_collection.update_one({"_id":_id},{"$set":{"zooniverse_id":zooniverse_id}})

clustering_engine = Agglomerative(None,None,{})

# result = db.profiles.create_index([('zooniverse_id', pymongo.ASCENDING)],unique=False)
# print result

# Walk every subject and gather its markings; the per-subject accumulators
# below are reset on each iteration (the loop body continues past this chunk).
for subject in subject_collection.find():
    # _id = c["_id"]
    zooniverse_id = subject["subjects"][0]["zooniverse_id"]
    print(zooniverse_id)
    markings = []
    user_ids = []
    tools = []
    num_users = 0
    path = subject["metadata"]["path"]
# Tail of an unseen ROI helper (its def, `width`, `height`, and `lb_roi`
# lie before this chunk): return the lower/upper ROI boundary segments.
    ub_roi = [[0, height], [width, height]]
    return lb_roi, ub_roi


project = PlanktonPortal()
# project.__top_users__()
#project.__set_subjects__([u'APK00011p5', u'APK0001bw9', u'APK0001dj4', u'APK00019zu', u'APK00018ri', u'APK0001dxl', u'APK0001ana', u'APK0000ppu', u'APK0000dvx', u'APK0000pyd', u'APK00019ol', u'APK00072zo', u'APK0000h5h', u'APK00001fk', u'APK0000a69', u'APK0000km2', u'APK000175z', u'APK00019yw', u'APK0000e39', u'APK0000kga'])
# project.__random_gold_sample__(max_subjects=50)
# Sample up to 200 gold-standard subjects: expert "yshish" vs. the listed
# top volunteers.
project.__gold_sample__(["yshish"], [
    "ElisabethB", "Damon22", "MingMing", "elizabeth", "JasonJason",
    "rlb66xyz", "planetari7", "fermor332002", "artman40", "Quia"
], max_subjects=200)
clustering = Agglomerative(project)
# clustering2 = GoldClustering(project)
classifier = ExpertClassification(project, clustering)
#
# NOTE(review): the `print` statements below are Python 2 syntax -- this
# chunk predates a Python 3 port.
for subject_id in project.gold_standard_subjects:
    print subject_id
    clustering.__fit__(subject_id, gold_standard=True)
    print clustering.goldResults[subject_id]
    classifier.__classify__([subject_id], True)
    print
    # project.__store_annotations__(subject_id,max_users=20)
    # clustering.__fit__(subject_id)
    # clustering.__fit__(subject_id,gold_standard=True)
#
#
#
# clustering.__check__()
#
#
# Tail of ROI-file parsing (the loop over `l` / `roi_dict` setup lies before
# this chunk): the first field of the line is the image path, the remaining
# fields are comma-separated x,y pairs, divided by 1.92 -- presumably a
# display-to-original image scale factor; TODO confirm.
path = l[0]
t = [r.split(",") for r in l[1:] if r != ""]
roi_dict[path] = [(int(x) / 1.92, int(y) / 1.92) for (x, y) in t]

# Connect to the local MongoDB "penguin" database and grab both collections.
client = pymongo.MongoClient()
db = client['penguin']
classification_collection = db["penguin_classifications"]
subject_collection = db["penguin_subjects"]

# One-off migration, kept for reference: copy each classification's
# zooniverse_id up to a top-level field.
# for c in classification_collection.find():
#     _id = c["_id"]
#     zooniverse_id = c["subjects"][0]["zooniverse_id"]
#
#     classification_collection.update_one({"_id":_id},{"$set":{"zooniverse_id":zooniverse_id}})

clustering_engine = Agglomerative(None, None, {})

# result = db.profiles.create_index([('zooniverse_id', pymongo.ASCENDING)],unique=False)
# print result

# Walk every subject and gather its markings; the per-subject accumulators
# below are reset on each iteration (the loop body continues past this chunk).
for subject in subject_collection.find():
    # _id = c["_id"]
    zooniverse_id = subject["subjects"][0]["zooniverse_id"]
    print(zooniverse_id)
    markings = []
    user_ids = []
    tools = []
    num_users = 0
    path = subject["metadata"]["path"]
def test_agglomerative(self):
    """Compare the project's Agglomerative implementation (single, complete,
    average, and average_group linkage) against sklearn's
    AgglomerativeClustering on the Iris data set, printing an accuracy
    figure for each variant."""
    # Iris CSV: columns 0-3 are features, column 4 is the class label.
    df = pd.read_csv('iris.data', header=None)
    x = df.drop([4], axis=1)
    y = df[4]
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=79)

    # One model per linkage strategy; the two-argument form presumably
    # defaults to single linkage -- TODO confirm against Agglomerative.
    agglomerative_single_model = Agglomerative(X_train, 3)
    agglomerative_complete_model = Agglomerative(X_train, 3, 'complete')
    agglomerative_average_model = Agglomerative(X_train, 3, 'average')
    agglomerative_average_group_model = Agglomerative(X_train, 3, 'average_group')

    # Raw cluster assignments for the training data.
    agglomerative_single_label = agglomerative_single_model.get_cluster()
    agglomerative_complete_label = agglomerative_complete_model.get_cluster()
    agglomerative_average_label = agglomerative_average_model.get_cluster()
    agglomerative_average_group_label = agglomerative_average_group_model.get_cluster()

    # Map each raw cluster id to a true class label (3 clusters).
    agglomerative_single_map = self.get_mapping_to_label(3, agglomerative_single_label, y_train)
    agglomerative_complete_map = self.get_mapping_to_label(3, agglomerative_complete_label, y_train)
    agglomerative_average_map = self.get_mapping_to_label(3, agglomerative_average_label, y_train)
    agglomerative_average_group_map = self.get_mapping_to_label(3, agglomerative_average_group_label, y_train)

    # Predict a cluster for every held-out row, per linkage variant.
    agglomerative_single_pred = []
    agglomerative_complete_pred = []
    agglomerative_average_pred = []
    agglomerative_average_group_pred = []
    for i, row in X_test.iterrows():
        agglomerative_single_pred.append(agglomerative_single_model.predict(row))
        agglomerative_complete_pred.append(agglomerative_complete_model.predict(row))
        agglomerative_average_pred.append(agglomerative_average_model.predict(row))
        agglomerative_average_group_pred.append(agglomerative_average_group_model.predict(row))

    # Translate predicted cluster ids to labels and score against y_test
    # (index reset so positions line up). Output strings are Indonesian:
    # "Akurasi ... dengan" = "Accuracy ... with".
    print(
        'Akurasi agglomerative dengan single linkage = ',
        self.get_accuracy(
            self.apply_map_to_cluster(agglomerative_single_pred, agglomerative_single_map),
            y_test.reset_index(drop=True)))
    print(
        'Akurasi agglomerative dengan complete linkage = ',
        self.get_accuracy(
            self.apply_map_to_cluster(agglomerative_complete_pred, agglomerative_complete_map),
            y_test.reset_index(drop=True)))
    print(
        'Akurasi agglomerative dengan average linkage = ',
        self.get_accuracy(
            self.apply_map_to_cluster(agglomerative_average_pred, agglomerative_average_map),
            y_test.reset_index(drop=True)))
    print(
        'Akurasi agglomerative dengan average_group linkage = ',
        self.get_accuracy(
            self.apply_map_to_cluster(agglomerative_average_group_pred, agglomerative_average_group_map),
            y_test.reset_index(drop=True)))

    # Reference: sklearn's implementation, fitted on the same training data.
    model_single = AgglomerativeClustering(3, linkage="single").fit(X_train)
    model_complete = AgglomerativeClustering(3, linkage="complete").fit(X_train)
    model_average = AgglomerativeClustering(3, linkage="average").fit(X_train)
    model_ward = AgglomerativeClustering(3, linkage="ward").fit(X_train)

    # NOTE(review): these four maps are computed but never used below --
    # the sklearn scores rely on v_measure_score instead.
    agg_single_map = self.get_mapping_to_label(3, model_single.labels_, y_train)
    agg_complete_map = self.get_mapping_to_label(3, model_complete.labels_, y_train)
    agg_average_map = self.get_mapping_to_label(3, model_average.labels_, y_train)
    agg_ward_map = self.get_mapping_to_label(3, model_ward.labels_, y_train)

    print('Akurasi agglomerative sklearn dengan single linkage = %0.3f' %
          metrics.v_measure_score(y_train, model_single.labels_))
    print('Akurasi agglomerative sklearn dengan complete linkage = %0.3f' %
          metrics.v_measure_score(y_train, model_complete.labels_))
    print('Akurasi agglomerative sklearn dengan average linkage = %0.3f' %
          metrics.v_measure_score(y_train, model_average.labels_))
    print('Akurasi agglomerative sklearn dengan ward linkage = %0.3f' %
          metrics.v_measure_score(y_train, model_ward.labels_))
# Tail of an unseen method (its def lies before this chunk): look up the
# subject's cutout dimensions and return the lower/upper ROI boundary
# segments ([[x0,y0],[x1,y1]] line endpoints).
    subject = self.subject_collection.find_one({"zooniverse_id":subject_id})
    cutout = subject["metadata"]["cutout"]
    width = cutout["width"]
    height = cutout["height"]
    lb_roi = [[0,0],[width,0]]
    ub_roi = [[0,height],[width,height]]
    return lb_roi,ub_roi


project = PlanktonPortal()
# project.__top_users__()
#project.__set_subjects__([u'APK00011p5', u'APK0001bw9', u'APK0001dj4', u'APK00019zu', u'APK00018ri', u'APK0001dxl', u'APK0001ana', u'APK0000ppu', u'APK0000dvx', u'APK0000pyd', u'APK00019ol', u'APK00072zo', u'APK0000h5h', u'APK00001fk', u'APK0000a69', u'APK0000km2', u'APK000175z', u'APK00019yw', u'APK0000e39', u'APK0000kga'])
# project.__random_gold_sample__(max_subjects=50)
# Sample up to 200 gold-standard subjects: expert "yshish" vs. the listed
# top volunteers.
project.__gold_sample__(["yshish"],["ElisabethB","Damon22","MingMing","elizabeth","JasonJason","rlb66xyz","planetari7","fermor332002","artman40","Quia"],max_subjects=200)
clustering = Agglomerative(project)
# clustering2 = GoldClustering(project)
classifier = ExpertClassification(project,clustering)
#
# NOTE(review): the `print` statements below are Python 2 syntax -- this
# chunk predates a Python 3 port.
for subject_id in project.gold_standard_subjects:
    print subject_id
    clustering.__fit__(subject_id,gold_standard=True)
    print clustering.goldResults[subject_id]
    classifier.__classify__([subject_id],True)
    print
    # project.__store_annotations__(subject_id,max_users=20)
    # clustering.__fit__(subject_id)
    # clustering.__fit__(subject_id,gold_standard=True)
#
#
#
# clustering.__check__()
#
#
data = data.dropna(how='any') # deleted any missing value # del data['education'] # deleted categorial education because it's same with education-num the_data = data.loc[:, 'age':'hours-per-week'] label = data.loc[:, 'class'] encoder = preprocessing.LabelEncoder() label = encoder.fit_transform(label) # for col in ["workclass", "marital-status", "occupation", "relationship", "race", "sex", # "native-country"]: # the_data[col] = encoder.fit_transform(the_data[col]) # print(the_data) # print(the_data.shape) testing_count = 1000 #model = AgglomerativeClustering(linkage="ward", n_clusters=2) #model.fit(the_data.head(n=testing_count)) testing = label[0:testing_count] model2 = Agglomerative(linkage="average") model2.fit(the_data.head(n=testing_count)) print(model2.labels_) # print(testing) # print(model.labels_) count = 0 for idx, label in enumerate(model2.labels_): if label == testing[idx]: count += 1 print("Correct: ", count, " From: ", testing_count) print("Accuracy: ", count * 100.0 / testing_count)
# ax1.imshow(image) # plt.show() user_buckets = {5: 0.05, 4: 0.25, 3: 0.5, 2: 0.75, 1: 0.95} alg_threshold = range(100, 250, 10) results = {t: [] for t in alg_threshold} user_results = [] with AggregationAPI(11, "development") as whales: whales.__setup__() # whales.__migrate__() rectangle_clustering = BlobClustering("rectangle", whales, {}) point_clustering = Agglomerative("point", {}) postgres_cursor = whales.postgres_session.cursor() select = "SELECT classification_subjects.subject_id from classifications INNER JOIN classification_subjects ON classification_subjects.classification_id = classifications.id where workflow_id = 84" postgres_cursor.execute(select) # for subject_id in postgres_cursor.fetchall()[:50]: subject_id = subject_id[0] print subject_id # subject_id = 494953 # T1 - rectangle outline # t2 - points classifications, markings, _, _ = whales.__sort_annotations__(