def run(): rows = csv_util.query_csv_for_rows('labeled_data/entities.csv') labeled_entities = set() resolved_entities = set() for row in rows: entity_id = row[0]+'_'+row[3] # (i.e., "surfaceform_shorttext") label = row[2] if label == 'Y': labeled_entities.add(entity_id) resolved_entities.add(entity_id) elif label=='N': labeled_entities.add(entity_id) print str(len(labeled_entities))+" annotated entities." print str(len(resolved_entities))+" unanimously annotated entities."
def run(): rows = csv_util.query_csv_for_rows('labeled_data/entities.csv') labeled_entities = set() resolved_entities = set() for row in rows: entity_id = row[0] + '_' + row[3] # (i.e., "surfaceform_shorttext") label = row[2] if label == 'Y': labeled_entities.add(entity_id) resolved_entities.add(entity_id) elif label == 'N': labeled_entities.add(entity_id) print str(len(labeled_entities)) + " annotated entities." print str(len(resolved_entities)) + " unanimously annotated entities."
def get_resolved_ambiguous_entities():
    """ Returns the ambiguous entities for which the intended meaning has been
    unanimously resolved by human annotators.

    Returns a dict mapping entity ID -> ResolvedEntity for every entity that
    (a) has more than one candidate meaning and (b) has at least one candidate
    labeled as the intended meaning by an annotator. """
    all_entities = defaultdict(list)
    correct_meaning_label = "Y"
    row_count = -1
    labeled_entities_dataset = csv_util.query_csv_for_rows("labeled_data/entities.csv", False)
    for candidate_row in labeled_entities_dataset:
        row_count = row_count + 1
        if row_count == 0:
            # header row: resolve column positions by name
            surfaceform_col = candidate_row.index("surface_form")
            shorttext_col = candidate_row.index("short_text")
            candidate_meaning_col = candidate_row.index("candidate_meaning")
            candidate_label_col = candidate_row.index("candidate_is_relevant")
            userkey_col = candidate_row.index("user_key")
            continue

        # use "surfaceform_shorttext" as ID for entity
        surfaceform = candidate_row[surfaceform_col]
        shorttext = candidate_row[shorttext_col]
        entity_id = surfaceform + "_" + shorttext

        meaning = candidate_row[candidate_meaning_col]
        label = candidate_row[candidate_label_col]
        userkey = candidate_row[userkey_col]
        all_entities[entity_id].append((meaning, label, surfaceform, shorttext, userkey))

    # test if entity is ambiguous (i.e., has more than one candidate meaning) and
    # if so if entity has been resolved (i.e., has at least one candidate labeled
    # as the intended meaning)
    resolved_entities = {}
    for entity in all_entities:
        entity_tuple_list = all_entities[entity]
        if len(entity_tuple_list) < 2:
            continue

        candidate_meanings = []
        intended_meanings = []
        user = None
        for (meaning, label, surfaceform, shorttext, userkey) in entity_tuple_list:
            # title of a potential meaning of the ambiguous entity
            if not meaning in candidate_meanings:
                candidate_meanings.append(meaning)
            # annotated label indicating whether this candidate
            # meaning is the intended meaning of the entity
            if label == correct_meaning_label and not meaning in intended_meanings:
                intended_meanings.append(meaning)
            if user is None:
                user = userkey

        # BUG FIX: ambiguity must be judged by the number of *candidate*
        # meanings. The original tested len(intended_meanings) > 1 (and,
        # redundantly, > 0), which contradicts the comment above and
        # discarded every entity resolved to exactly one intended meaning.
        if len(candidate_meanings) > 1 and len(intended_meanings) > 0 and user is not None:
            # this entity is ambiguous, has been manually resolved,
            # and we know the user who wrote it
            entity_obj = ResolvedEntity(candidate_meanings, intended_meanings,
                                        surfaceform, shorttext, user)
            entity_id = entity_obj.get_id()
            resolved_entities[entity_id] = entity_obj
    return resolved_entities
def get_bridged_usernames():
    """ Returns a dict mapping anonymized user keys to their bridged usernames,
    loaded from the anonymized userhash CSV.

    BUG FIX: the original built the mapping but never returned it (the function
    implicitly returned None), making it useless to callers. """
    usernames = {}
    userhashes = csv_util.query_csv_for_rows("labeled_data/user_privacy/anonymized_userhash.csv")
    for (userkey, username) in userhashes:
        usernames[userkey] = username
    return usernames
def get_resolved_ambiguous_entities():
    ''' Returns the ambiguous entities for which the intended meaning has been
    unanimously resolved by human annotators.

    Returns a dict mapping entity ID -> ResolvedEntity for every entity with
    more than one candidate meaning and at least one candidate labeled as the
    intended meaning. '''
    all_entities = defaultdict(list)
    correct_meaning_label = 'Y'
    row_count = -1
    labeled_entities_dataset = csv_util.query_csv_for_rows('labeled_data/entities.csv', False)
    for candidate_row in labeled_entities_dataset:
        row_count = row_count+1
        if row_count==0:
            # header row: resolve column positions by name
            surfaceform_col = candidate_row.index('surface_form')
            shorttext_col = candidate_row.index('short_text')
            candidate_meaning_col = candidate_row.index('candidate_meaning')
            candidate_label_col = candidate_row.index('candidate_is_relevant')
            userkey_col = candidate_row.index('user_key')
            continue

        # use "surfaceform_shorttext" as ID for entity
        surfaceform = candidate_row[surfaceform_col]
        shorttext = candidate_row[shorttext_col]
        entity_id = surfaceform+'_'+shorttext

        meaning = candidate_row[candidate_meaning_col]
        label = candidate_row[candidate_label_col]
        userkey = candidate_row[userkey_col]
        all_entities[entity_id].append((meaning, label, surfaceform, shorttext, userkey))

    # test if entity is ambiguous (i.e., has more than one candidate meaning) and
    # if so if entity has been resolved (i.e., has at least one candidate labeled
    # as the intended meaning)
    resolved_entities = {}
    for entity in all_entities:
        entity_tuple_list = all_entities[entity]
        if len(entity_tuple_list) < 2:
            continue

        candidate_meanings = []
        intended_meanings = []
        user = None
        for (meaning, label, surfaceform, shorttext, userkey) in entity_tuple_list:
            # title of a potential meaning of the ambiguous entity
            if not meaning in candidate_meanings:
                candidate_meanings.append(meaning)
            # annotated label indicating whether this candidate
            # meaning is the intended meaning of the entity
            if label==correct_meaning_label and not meaning in intended_meanings:
                intended_meanings.append(meaning)
            if user is None:
                user = userkey

        # BUG FIX: the ambiguity test must count *candidate* meanings; the
        # original checked len(intended_meanings)>1 (and redundantly >0),
        # which dropped every entity resolved to a single intended meaning.
        if len(candidate_meanings)>1 and len(intended_meanings)>0 and user is not None:
            # this entity is ambiguous, has been manually resolved,
            # and we know the user who wrote it
            entity_obj = ResolvedEntity(candidate_meanings, intended_meanings,
                                        surfaceform, shorttext, user)
            entity_id = entity_obj.get_id()
            resolved_entities[entity_id] = entity_obj
    return resolved_entities
def get_bridged_usernames():
    ''' Returns a dict mapping anonymized user keys to their bridged usernames,
    loaded from the anonymized userhash CSV.

    BUG FIX: the original populated the dict but fell off the end without
    returning it, so callers always received None. '''
    usernames = {}
    userhashes = csv_util.query_csv_for_rows('labeled_data/user_privacy/anonymized_userhash.csv')
    for (userkey, username) in userhashes:
        usernames[userkey] = username
    return usernames
def run(): username_rows = csv_util.query_csv_for_rows('labeled_data/user_identity.csv') total_flickr = 0 total_twitter = 0 total_youtube = 0 exists_flickrs = 0 exists_twitters = 0 exists_youtubes = 0 FP_Flickr = 0 FP_Twitter = 0 FP_Youtube = 0 TP_Flickr = 0 TP_Twitter = 0 TP_Youtube = 0 for row in username_rows: #username = row[0] flickr_label = row[1] twitter_label = row[2] youtube_label = row[3] exists_wikipedia = row[4] exists_flickr = row[5] exists_twitter = row[6] exists_youtube = row[7] if exists_flickr=='TRUE' and exists_wikipedia=='TRUE': exists_flickrs+=1 if exists_twitter=='TRUE' and exists_wikipedia=='TRUE': exists_twitters+=1 if exists_youtube=='TRUE' and exists_wikipedia=='TRUE': exists_youtubes+=1 if flickr_label!='': total_flickr+=1 if twitter_label!='': total_twitter+=1 if youtube_label!='': total_youtube+=1 if flickr_label=='TP': TP_Flickr+=1 if twitter_label=='TP': TP_Twitter+=1 if youtube_label=='TP': TP_Youtube+=1 if flickr_label=='FP': FP_Flickr+=1 if twitter_label=='FP': FP_Twitter+=1 if youtube_label=='FP': FP_Youtube+=1 print "Table 3" print "Initial Sample Twitter: "+str(total_twitter) print "Reused: "+str(exists_twitters)+" "+str(100*float(exists_twitters)/total_twitter) print " " print "Initial Sample YouTube: "+str(total_youtube) print "Reused: "+str(exists_youtubes)+" "+str(100*float(exists_youtubes)/total_youtube) print " " print "Initial Sample Flickr: "+str(total_flickr) print "Reused: "+str(exists_flickrs)+" "+str(100*float(exists_flickrs)/total_flickr) print " " print "Twitter Bridged: "+str(TP_Twitter)+" "+str(100*float(TP_Twitter)/(TP_Twitter+FP_Twitter)) print "Youtube Bridged: "+str(TP_Youtube)+" "+str(100*float(TP_Youtube)/(TP_Youtube+FP_Youtube)) print "Flickr Bridged: "+str(TP_Flickr)+" "+str(100*float(TP_Flickr)/(TP_Flickr+FP_Flickr))