import random
from collections import defaultdict

# The project-local modules referenced below (csv_util, entity_dataset_mgr,
# short_text_websites, crosssite_username_dataset_mgr, ResolvedEntity) and the
# module-level csv path constants are assumed to be imported/defined elsewhere
# in this file.

def make_tweet_entities_csv_for_turk():
    twitter_site = short_text_websites.get_twitter_site()
    entities_to_evaluate = entity_dataset_mgr.get_valid_ne_candidates(twitter_site)
    if entities_to_evaluate is None:
        print "No ambiguous entities + candidates in cache. Run run_all_dataset_generators "+\
              "script and choose to first fetch and store more entities from short texts."
        return

    # Load the (entity id, candidate link) pairs that already have judgments
    judged_row_plus_headers = csv_util.query_csv_for_rows(__entities_results_csv_path__, False)
    judged_row_num = 0
    already_judged = [] # list of (entity id, candidate link)
    for judge_row in judged_row_plus_headers:
        try:
            if judged_row_num==0: # row 0 is the header
                entity_id_col = judge_row.index('Input.entity_id')
                candidate_link_col = judge_row.index('Input.candidate_link')
            else:
                judged_tuple = (judge_row[entity_id_col], judge_row[candidate_link_col])
                if judged_tuple not in already_judged:
                    already_judged.append(judged_tuple)
            judged_row_num = judged_row_num+1
        except (ValueError, IndexError):
            continue # just ignore a problematic row

    # Determine which entity+candidate tasks we actually want to write to a
    # spreadsheet and send to mturk, since we don't have resources for
    # unlimited mturk tasks
    tasks = {} # entity id -> candidate links we still want judged
    user_entities = defaultdict(list) # username -> [NamedEntity obj]
    done_shorttexts = [] # list of shorttext ids
    random.shuffle(entities_to_evaluate) # so we get a random subset of a user's entities
    for ne_obj in entities_to_evaluate:

        # "40 nouns usually enough to establish statistically significant
        # differences between WSD algorithms" (Santamaria et al., 2010)
        username = ne_obj.username
        if len(user_entities[username]) > 50:
            continue # have enough entities for this user

        # limit our dataset to one named entity per short text
        shorttext_id = ne_obj.shorttext_id
        if shorttext_id in done_shorttexts:
            continue

        # no need to create tasks for candidates we already have annotator judgments for
        entity_id = ne_obj.get_entity_id()
        candidate_URLs = ne_obj.get_candidate_wikiURLs()
        valid_candidate_tasks = []
        for candidate_URL in candidate_URLs:
            if (entity_id, candidate_URL) in already_judged:
                continue
            valid_candidate_tasks.append(candidate_URL)
        if len(valid_candidate_tasks)==0:
            continue # already have annotator judgments for all of this entity's candidates
        if len(candidate_URLs) < 2:
            # a non-ambiguous entity (fewer than two candidates); we should never
            # reach this point because such entities should have been filtered out by now
            raise ValueError("Non-ambiguous entity should have been filtered out: "+str(entity_id))
        tasks[entity_id] = valid_candidate_tasks
        user_entities[username].append(ne_obj)
        done_shorttexts.append(shorttext_id)

    # put valid entities + candidates in the spreadsheet until we reach our limit of tasks
    task_max = 1400
    rows = []
    headers = ['entity_id', 'short_text', 'ambiguous_entity', 'candidate_link']
    rows.append(headers)
    for username in user_entities:

        # add users until we reach our limit on the number of tasks we can afford,
        # but break at this point in the loop rather than in the inner loop to
        # ensure that we do have at least 50 entities per user (even if this
        # means we go over our task limit a little in order to reach that amount)
        if len(rows) > task_max:
            break

        # bypass users who haven't written the minimum number of valid entities
        # required to establish statistical significance between the algorithms
        if len(user_entities[username]) < 50:
            continue

        # should be 50 NamedEntity objects per user, and we make tasks for their candidates
        for ne_obj in user_entities[username]:
            entity_id = ne_obj.get_entity_id()

            # make sure the entity presented to a Turker looks the same as it
            # appears in the short text (i.e. with the same capitalization)
            original_shorttext = ne_obj.shorttext_str.decode('latin-1')
            surface_form = ne_obj.surface_form
            if surface_form not in original_shorttext:
                surface_form = __match_appearance__(surface_form, original_shorttext)

            # shuffle the candidates so that they don't appear in
            # wikiminer's/dbpedia's ranking order and bias the turker
            candidate_URLs = tasks[entity_id]
            random.shuffle(candidate_URLs)
            choices = candidate_URLs[:] # copy (list slicing)

            # make a separate row for each candidate link
            # rather than putting all links in a single cell
            for choice in choices:
                row = [entity_id, original_shorttext, surface_form, choice]
                rows.append(row)

            if len(rows)%50==0:
                # write the rows every once in a while in case we hit an error
                print "Updating spreadsheet..."+str(len(rows))
                csv_util.write_to_spreadsheet(__entities_to_judge_csv_path__, rows)

    # dump to csv
    csv_util.write_to_spreadsheet(__entities_to_judge_csv_path__, rows)
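# __match_appearance__ is called above but defined elsewhere in this module.
# For reference, a minimal sketch of the behavior the call site needs (find the
# surface form in the short text case-insensitively and return it with the
# capitalization actually used there) might look like the following; the
# _sketch suffix is only to avoid clashing with the real helper:
import re

def __match_appearance_sketch__(surface_form, original_shorttext):
    # case-insensitive search; re.escape guards against regex
    # metacharacters appearing in the surface form itself
    match = re.search(re.escape(surface_form), original_shorttext, re.IGNORECASE)
    if match is not None:
        return match.group(0) # the surface form as capitalized in the short text
    return surface_form # no match found; fall back to the form we already have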
def run_all_algorithms(RESLVE_alg, site, use_cache):
    '''
    @param RESLVE_alg: A constructed reslve_algorithm object
    @param site: The short-text site whose entities we want to resolve
    @param use_cache: False if still working on algorithms and boosting their
    performance and therefore not ready to cache their rankings in a file yet;
    True if ready to cache the algorithms' rankings
    '''
    # Valid entities and their labels annotated by Mechanical Turk workers
    entities_to_evaluate = entity_dataset_mgr.get_valid_ne_candidates(site)
    entity_judgments = entity_dataset_mgr.get_entity_judgements(site)
    if (entities_to_evaluate is None or len(entities_to_evaluate)==0
        or entity_judgments is None or len(entity_judgments)==0):
        print "No labeled ambiguous entities + candidates available. Run appropriate scripts first."
        return {}

    # entities that have been labeled by human judges
    entities_to_resolve = [ne_obj for ne_obj in entities_to_evaluate
                           if ne_obj.get_entity_id() in entity_judgments]
    print str(len(entities_to_evaluate))+" entities and "+str(len(entity_judgments))+\
          " judgments available, resulting in "+str(len(entities_to_resolve))+" entities to resolve"

    # Usernames that do not belong to the same individual on the site and on
    # Wikipedia and that we'll use as a baseline for no background knowledge
    nonmatch_usernames = crosssite_username_dataset_mgr.get_confirmed_nonmatch_usernames(site)

    resolved_entities = []
    for ne_obj in entities_to_resolve:
        print str(len(resolved_entities))+" out of "+\
              str(len(entities_to_resolve))+" resolved.."

        entity_id = ne_obj.get_entity_id()
        evaluated_candidates = entity_judgments[entity_id]

        # construct a ResolvedEntity object to represent this
        # ambiguous entity and its various candidate rankings
        resolved_entity = ResolvedEntity(ne_obj, evaluated_candidates)
        resolved_entities.append(resolved_entity)

        reslve_algorithms = [RESLVE_alg]
        for reslve_alg in reslve_algorithms:
            print "Ranking candidates using RESLVE's "+str(reslve_alg.alg_type)+" algorithm..."
            candidate_titles = ne_obj.get_candidate_titles()

            # perform the RESLVE ranking..
            reslve_ranking_user_match = reslve_alg.rank_candidates(candidate_titles, ne_obj.username)

            # perform the same algorithm's ranking again but this time use a
            # non-match user's interest model as background information, which
            # according to our hypothesis should provide less relevant semantic
            # background knowledge and thus have lower performance
            random_nonmatch_username = random.choice(nonmatch_usernames)
            reslve_ranking_user_nonmatch = reslve_alg.rank_candidates(candidate_titles,
                                                                      random_nonmatch_username)

            resolved_entity.add_reslve_ranking(reslve_alg.alg_id,
                                               reslve_ranking_user_match,
                                               reslve_ranking_user_nonmatch)

        # cache intermittently in case we need to exit..
        __save_resolved_entities__(resolved_entities, site, use_cache)

    __save_resolved_entities__(resolved_entities, site, use_cache) # cache resolved entities
    return resolved_entities
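# __save_resolved_entities__ is likewise defined elsewhere in this module. A
# minimal sketch of the caching behavior the docstring describes (skip the
# write while use_cache is False, otherwise persist the rankings to disk)
# could look like the following; the pickle format, the cache path, and the
# site.siteName attribute are assumptions, not confirmed from this file:
import pickle

def __save_resolved_entities_sketch__(resolved_entities, site, use_cache):
    if not use_cache:
        return # still tuning the algorithms, so don't persist their rankings yet
    cache_path = 'resolved_entities_'+str(site.siteName)+'.pkl' # hypothetical path
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(resolved_entities, cache_file)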