def build_wikipedia_editor_username_cache():
    ''' Fetches a large number of active Wikipedia editors who have made edits
    recently and stores them in a cache, which we can access while attempting
    to find usernames that exist on both Wikipedia and a short text source site.
    (The occurrence of such cross-site username matches may be low, so we want
    to have a large cache of Wikipedia editors to draw upon.)
    The cache is a list of editor usernames. '''
    
    # Load the Wikipedia usernames+edits cache
    output_str = "Wikipedia editor usernames and their edited pages..."
    editor_usernames = pkl_util.load_pickle(output_str, 
                                            __wikipedia_editors_cache_path__)
    if editor_usernames is None:
        editor_usernames = []

    # Prompt for how many Wikipedia usernames to fetch, then query Wikipedia until that many have been retrieved
    desired_num_editors = prompt_and_print.prompt_num_entries_to_build("active Wikipedia editors", 
                                                                       editor_usernames)
    pre_fetch_len = len(editor_usernames)
    wikipedia_api_util.query_editors_of_recentchanges(desired_num_editors, editor_usernames)
    print "Fetched "+str(len(editor_usernames)-pre_fetch_len)+" more recent and active Wikipedia editors"
    
    # make sure all usernames are lowercase
    editor_usernames = [u.lower() for u in editor_usernames]
    
    # Update cache
    print "Cached a total of "+str(len(editor_usernames))+" Wikipedia editor usernames"
    pkl_util.write_pickle(output_str, editor_usernames, __wikipedia_editors_cache_path__)
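# A minimal sketch (not the project's wikipedia_api_util implementation) of how
# query_editors_of_recentchanges could pull one batch of editor usernames from
# MediaWiki's public recentchanges API; the function name and accumulation
# strategy below are assumptions made for illustration only.
import json
import urllib
import urllib2

def __sketch_query_recentchanges_editors__(desired_num_editors, editor_usernames):
    params = {'action':'query', 'list':'recentchanges',
              'rcprop':'user', 'rclimit':500, 'format':'json'}
    url = 'http://en.wikipedia.org/w/api.php?' + urllib.urlencode(params)
    response = json.load(urllib2.urlopen(url))
    for change in response['query']['recentchanges']:
        editor = change.get('user')
        # skip entries with no user field and usernames already collected
        if editor is not None and editor not in editor_usernames:
            editor_usernames.append(editor)
        if len(editor_usernames) >= desired_num_editors:
            break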
def get_annotator_decisions(site):
    ''' Loads the cache of turker IDs and their candidate decisions
    so we can compute measures of inter-annotator agreement. '''
    annotator_decisions = pkl_util.load_pickle(__annotator_output_str__, __get_annotator_cache_path__(site)) 
    if annotator_decisions is None:
        print "No cache of annotator decisions available. Run unresolved_entities_task.py first."
    return annotator_decisions    
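# A minimal sketch of how pairwise inter-annotator agreement could be summarized
# as simple observed agreement; the shape of the decisions (two dicts mapping
# entity id -> chosen candidate) is an assumption made for illustration only.
def __sketch_observed_agreement__(decisions_a, decisions_b):
    shared = set(decisions_a.keys()) & set(decisions_b.keys())
    if len(shared)==0:
        return 0.0
    num_agree = sum(1 for entity_id in shared
                    if decisions_a[entity_id]==decisions_b[entity_id])
    return float(num_agree) / len(shared)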
def get_edits_by_user(username):
    editor_names_to_edits_cache = pkl_util.load_pickle("Wikipedia editor usernames to their edited pages+counts",
                                                       __edits_cache_path__)
    if editor_names_to_edits_cache is None:
        return []
    try:
        return editor_names_to_edits_cache[username]
    except KeyError:
        return []
def get_valid_ne_candidates(site):
    ''' Returns the ambiguous entities mapped to their possible candidates  
    from which humans need to manually choose the correct candidate. '''
    ne_objs = pkl_util.load_pickle(__get_detected_entities_output_str__(site),
                                   __get_ne_cache_path__(site))
    if ne_objs is None:
        return None
    return __filter_invalid_entities__(site, ne_objs)   
def get_resolved_entities(site, use_cache):
    if use_cache:
        # in test mode, so we want to re-run the ranking algorithms and return the results
        return run_all_algorithms(site, use_cache)
       
    # try to read the RESLVE system's results from cache; if the cache is
    # unavailable, re-run the algorithms and write to cache
    resolved_entities = pkl_util.load_pickle(__resolved_entities_output_str__, 
                                             __get_resolved_entities_cache_path__(site))
    if resolved_entities is None:
        return run_all_algorithms(site, use_cache)
    return resolved_entities
def build_crosssite_username_dataset(site):
    ''' Searches the given site for the Wikipedia editor usernames we have previously cached.
    Does so until we have a sufficient set of unique users who have both active Wikipedia accounts
    and active accounts on the given site. Saves those users in a csv file and a pkl cache,
    and also writes to cache the usernames that are determined to NOT exist on the given
    site so we don't bother searching for them again in the future.
    @param site: Should be a Site object
    '''
    siteNameStr = str(site.siteName)
    
    # Load or create/initialize the spreadsheet of usernames
    usernames_csv_path = __get_usernames_csv_path__(site)
    csv_string = 'usernames that exist on both Wikipedia and '+siteNameStr
    headers = [COLUMN_USERNAME, __COLUMN_SAME_INDIVIDUAL__]
    usernames_in_csv = csv_util.load_or_initialize_csv(usernames_csv_path, csv_string, headers, COLUMN_USERNAME)

    # Load the caches of Wikipedia usernames:
    editor_names_cache = pkl_util.load_pickle("Wikipedia editor usernames",
                                          __wikipedia_editors_cache_path__)
    editor_usernames = [] if (editor_names_cache is None) else editor_names_cache
    # editor usernames that do NOT exist on the given site
    nonexistent_usernames_path = __get_nonexistent_usernames_cache_path__(site)
    nonexistent_usernames_cache = pkl_util.load_pickle("Wikipedia usernames that do NOT exist on "+siteNameStr+"...", 
                                                       nonexistent_usernames_path)
    if nonexistent_usernames_cache is None:
        nonexistent_usernames_cache = []
    
    # only need to analyze those usernames that we haven't 
    # already determined do or do not exist on given site
    usernames_todo = __get_remaining_todo__(editor_usernames, 
                                        [usernames_in_csv, nonexistent_usernames_cache])
    
    # Prompt how many matching usernames to fetch from the given site  
    desired_num_usernames = prompt_and_print.prompt_num_entries_to_build(csv_string, usernames_in_csv)
    num_to_append = desired_num_usernames - len(usernames_in_csv)
    if len(usernames_todo) < num_to_append:
        print "Only "+str(len(usernames_todo))+" unanalyzed Wikipedia usernames in cache. If you "+\
        "want "+str(desired_num_usernames)+" total in the cross-site usernames csv, you'll have to "+\
        "re-run script and choose to first fetch more Wikipedia editor usernames."
    
    prompt_count = 0
    while len(usernames_in_csv) < desired_num_usernames and len(usernames_todo) > 0:
        
        # Intermittently prompt user whether to continue fetching matching usernames or exit script
        if prompt_count >= __PROMPT_COUNT__:
            continue_searching = prompt_and_print.prompt_continue_building(csv_string, usernames_in_csv, desired_num_usernames)
            if not continue_searching:
                break
            prompt_count = 0 # reset count
        prompt_count = prompt_count + 1
        
        # get lists of usernames that do or do not also exist on the given site
        match_response = site.fetching_existence_status(usernames_todo, desired_num_usernames)
        existing = match_response[site.get_existing_response_key()]
        nonexisting = match_response[site.get_nonexisting_response_key()]
        
        print "Found "+str(len(existing))+" existing and active usernames on "+siteNameStr

        # update the spreadsheet with any new usernames that have been fetched
        existing_rows = [[username, __VALUE_UNCONFIRMED__] for username in existing]
        csv_util.append_to_spreadsheet(csv_string, usernames_csv_path, 
                                       usernames_in_csv, existing_rows)
        # and update the list of usernames in the csv so we know how 
        # many more we still need to fetch to reach the desired num
        usernames_in_csv.extend(existing)
    
        # Also update the cache of Wikipedia usernames that do NOT exist on the given site
        nonexistent_usernames_cache.extend(nonexisting)
        nonexistent_write_str = "usernames that DO NOT exist on both Wikipedia and "+siteNameStr+"..."
        pkl_util.write_pickle(nonexistent_write_str, nonexistent_usernames_cache, nonexistent_usernames_path)
        
        # remove any usernames that we now determined do not exist on given site
        usernames_todo = __get_remaining_todo__(usernames_todo, [existing, nonexistent_usernames_cache])
      
        rate_limited = match_response[site.get_rate_limit_key()]
        if rate_limited:
            break # reached rate limit, so break
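# A minimal sketch (an assumption, not the Site class's actual fetching_existence_status
# method) of how usernames could be bucketed by whether a profile page exists on the site,
# mirroring the existing/nonexisting/rate-limit response keys the loop above reads; the
# profile URL pattern and response keys here are hypothetical.
import urllib2

def __sketch_existence_status__(usernames_todo, profile_url_pattern):
    existing = []
    nonexisting = []
    for username in usernames_todo:
        try:
            urllib2.urlopen(profile_url_pattern % username)
            existing.append(username) # profile page loaded, so the account exists
        except urllib2.HTTPError as e:
            if e.code==404:
                nonexisting.append(username) # no such profile on the site
            elif e.code==429:
                # hit the site's rate limit, so stop and report it
                return {'existing':existing, 'nonexisting':nonexisting, 'rate_limited':True}
    return {'existing':existing, 'nonexisting':nonexisting, 'rate_limited':False}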
def get_nltk_entity_cache(site):
    shorttext_entities = pkl_util.load_pickle(__output_str__, __get_nltk_entities_cache_path__(site))
    if shorttext_entities is None:
        shorttext_entities = {}    
    return shorttext_entities
def build_wikipedia_edits_dataset(crosssite_usernames, prompt=True):
    
    # Load or create/initialize the spreadsheet of users' wikipedia edits
    csv_string = 'Wikipedia edits made by usernames that also exist on a site that is a source of short texts'
    headers = [COLUMN_USERNAME, __COLUMN_ARTICLE_ID__, __COLUMN_NUM_EDITS__]
    usernames_in_csv = csv_util.load_or_initialize_csv(__edits_csv_path__, csv_string, headers, COLUMN_USERNAME)
    
    # Load the cache of edits, a dict: { username -> {edited page -> num edits } }
    editor_names_to_edits_cache = pkl_util.load_pickle("Wikipedia editor usernames to their edited pages+counts",
                                                       __edits_cache_path__)
    if editor_names_to_edits_cache is None:
        editor_names_to_edits_cache = {}

    # only need to fetch the edits for usernames that we haven't already done
    editors_todo = [u for u in crosssite_usernames if u not in usernames_in_csv]
    
    # Exit if all available names are done
    if len(editors_todo)==0:
        print "Wikipedia edit data fetched and stored for all "+\
        str(len(crosssite_usernames))+" confirmed cross-site editors. Exiting."
        return 
    
    print str(len(crosssite_usernames))+" cross-site editors total, and "+\
    str(len(editors_todo))+" editors not yet in spreadsheet of edits "
    
    # Prompt how many users to fetch edits for
    if prompt:
        desired_num_editors = prompt_and_print.prompt_num_entries_to_build(csv_string, usernames_in_csv)
        num_to_append = desired_num_editors - len(usernames_in_csv)
        if len(editors_todo) < num_to_append:
            print "Only "+str(len(editors_todo))+" cross-site usernames available. If you want "+\
            "want "+str(desired_num_editors)+" total editors' edits in the edits csv, you'll have to "+\
            "re-run script and choose to first fetch more cross-site usernames."
    else:
        desired_num_editors = 1
    
    edits_rows = []
    #prompt_count = 0
    progress_count = 1
    for username in editors_todo:
        
        if len(usernames_in_csv) >= desired_num_editors:
            # have enough so exit
            break
        
        '''
        # Intermittently prompt user whether to continue fetching matching usernames or exit script
        if prompt and prompt_count >= __PROMPT_COUNT__:
            continue_searching = prompt_and_print.prompt_continue_building(csv_string, usernames_in_csv, desired_num_editors)
            if not continue_searching:
                break
            prompt_count = 0 # reset count
        prompt_count = prompt_count + 1
        '''
        
        if progress_count%10==0:
            print "Querying for pages edited by cross-site usernames... Number of "+\
            "usernames whose edits have been fetched so far: "+str(progress_count)
        progress_count = progress_count+1
        
        user_edits = wikipedia_api_util.query_usercontribs(username, False)
        for article_id in user_edits:
            num_times_edited = user_edits[article_id]
            edits_row = [username, article_id, num_times_edited]
            edits_rows.append(edits_row)
            
        # keep track that we'll be adding this username to the csv
        usernames_in_csv.append(username)
        editor_names_to_edits_cache[username] = user_edits # add that user+edits to cache
                
    # update the spreadsheet with any new editors' edits that have been fetched
    csv_util.append_to_spreadsheet(csv_string, __edits_csv_path__, usernames_in_csv, edits_rows, False)  
        
    # update the edit mapping cache
    pkl_util.write_pickle("user edits to file...", 
                          editor_names_to_edits_cache, __edits_cache_path__)
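# A minimal sketch (not the project's wikipedia_api_util.query_usercontribs) of how the
# { edited article id -> number of edits } mapping cached above could be tallied from one
# batch of MediaWiki's public usercontribs API; the function name and single-batch
# strategy are assumptions made for illustration only.
import json
import urllib
import urllib2

def __sketch_query_usercontribs__(username):
    params = {'action':'query', 'list':'usercontribs', 'ucuser':username,
              'ucprop':'ids|title', 'uclimit':500, 'format':'json'}
    url = 'http://en.wikipedia.org/w/api.php?' + urllib.urlencode(params)
    response = json.load(urllib2.urlopen(url))
    edits = {}
    for contrib in response['query'].get('usercontribs', []):
        page_id = contrib['pageid']
        edits[page_id] = edits.get(page_id, 0) + 1 # count repeat edits to the same page
    return edits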
        
def get_entityless_shorttexts(site):
    entityless_shorttexts = pkl_util.load_pickle(__entityless_output_str__, __get_entityless_cache_path__(site))
    if entityless_shorttexts is None:
        entityless_shorttexts = [] 
    return entityless_shorttexts
def get_num_cached_ne_objs(site):
    ne_objs = pkl_util.load_pickle(__get_detected_entities_output_str__(site),
                                   __get_ne_cache_path__(site))
    if ne_objs is None:
        return 0
    return len(ne_objs)
def get_entity_judgements(site):
    judgments = pkl_util.load_pickle(__candidate_judgments_output_str__, 
                                     __get_candidate_judgments_cache_path__(site)) 
    if judgments is None:
        print "No cache of judgments available. Run unresolved_entities_task.py first."
    return judgments
def build_entities_dataset(shorttext_rows, site):
    
    siteNameStr = str(site.siteName)
    
    # Load or create/initialize the spreadsheet of users' short texts
    entity_csv_path = __get_entities_csv_path__(site)
    output_str = __get_detected_entities_output_str__(site)
    headers = [COLUMN_ENTITY_ID, __COLUMN_ENTITY_STRING__, COLUMN_SHORTTEXT_ID, COLUMN_SHORTTEXT_STRING, COLUMN_USERNAME]
    entities_in_csv = csv_util.load_or_initialize_csv(entity_csv_path, output_str, headers, COLUMN_ENTITY_ID)
    shorttexts_in_csv = csv_util.get_all_column_values(entity_csv_path, COLUMN_SHORTTEXT_ID)
    print "A total of "+str(len(shorttext_rows))+" short texts available to detect and resolve entities in..."
    
    # Load the cache of ambiguous entity objects
    ne_objs = pkl_util.load_pickle(output_str, __get_ne_cache_path__(site))
    if ne_objs is None:
        ne_objs = []
    
    # Load the cache of short texts that contain no entities
    # and that we don't need to keep querying services with
    entityless_shorttexts = get_entityless_shorttexts(site)
        
    # Load the cache of problematic short texts that we can 
    # go back and look at later..
    problematic_shorttexts = get_problematic_shorttexts(site)
    
    # Prompt for how many entities to detect and resolve
    desired_num_entities = prompt_and_print.prompt_num_entries_to_build(output_str, shorttexts_in_csv)
    
    entities_rows = []
    progress_count = 1
    all_shorttexts_done = True
    for shorttext_row in shorttext_rows:
        
        shorttext_id = shorttext_row[0]
        if shorttext_id in shorttexts_in_csv or shorttext_id in entityless_shorttexts or shorttext_id in problematic_shorttexts:
            # already did entities for this shorttext (and either successfully 
            # detected some, successfully detected none, or encountered an error)
            continue
        all_shorttexts_done = False
        
        try:
            if len(entities_in_csv) >= desired_num_entities:
                # have enough so exit
                break
            
            if progress_count%10==0:
                print "Detecting named entities in short texts posted on "+siteNameStr+\
                " by cross-site usernames... Number of entities detected so far: \n"+\
                str(len(entities_in_csv))
            progress_count = progress_count+1
            
            original_shorttext = shorttext_row[1]
            username = shorttext_row[2]
            
            # get the entities contained in each short text
            # clean the short text before attempting to detect entities in it
            clean_shorttext = text_util.format_text_for_NER(original_shorttext, site)
            if clean_shorttext=='':
                # whole string was invalid, perhaps a URL or 
                # some other content that gets totally filtered
                problematic_shorttexts.append(shorttext_id)
                continue
            
            detected_entities = named_entity_finder.find_and_construct_named_entities(shorttext_id, original_shorttext, username, site)
            if len(detected_entities)==0:
                entityless_shorttexts.append(shorttext_id)
                
            for ne_obj in detected_entities:
                # cache this entity object
                ne_objs.append(ne_obj)
                
                # make a row in the spreadsheet for this entity
                ne_id = ne_obj.get_entity_id()
                entity_row = [ne_id, ne_obj.surface_form, 
                              shorttext_id, original_shorttext,
                              username]
                entities_rows.append(entity_row)
                
                # keep track that we'll be adding this entity to the csv
                entities_in_csv.append(ne_id)
        except Exception as st_e:
            print "Problematic short text "+str(shorttext_row[1]), st_e
            if 'referenced before assignment' in str(st_e):
                raise # it's a server error so we need to stop 
            problematic_shorttexts.append(shorttext_id)
            continue
                
    # update the spreadsheet with any new users' short texts that have been fetched
    csv_util.append_to_spreadsheet(output_str, entity_csv_path, entities_in_csv, entities_rows, False)  
    
    # update the cache of ambiguous surface form objects
    pkl_util.write_pickle(output_str, ne_objs, __get_ne_cache_path__(site))
    pkl_util.write_pickle(__entityless_output_str__, entityless_shorttexts, __get_entityless_cache_path__(site))
    pkl_util.write_pickle(__problematic_output_str__, problematic_shorttexts, __get_problematic_cache_path__(site))
    print "Cached a total of "+str(len(ne_objs))+" ambiguous named entities"
    if all_shorttexts_done:
        print "Completed detecting and resolving entities in all short texts available."
    else:
        print "More short texts available to detect and resolve entities for."
def get_problematic_shorttexts(site):
    problematic_shorttexts = pkl_util.load_pickle(__problematic_output_str__, __get_problematic_cache_path__(site))
    if problematic_shorttexts is None:
        problematic_shorttexts = []     
    return problematic_shorttexts