def build_all_datasets():
    """Interactively build every dataset for one cross-posting site.

    Drives the full pipeline, prompting before each (potentially expensive)
    build step: the Wikipedia editor username cache, the cross-site username
    spreadsheet, the Wikipedia edits spreadsheet, the short-text spreadsheet,
    the nltk entity cache, and finally the named-entity spreadsheet.
    Returns early if the user enters an unrecognized site.
    """
    
    # Build up the cache of Wikipedia editor usernames
    if prompt_and_print.prompt_for_build_wikipedia_username_cache():
        crosssite_username_dataset_mgr.build_wikipedia_editor_username_cache()
        
    # Prompt to ask from which site we want to build a dataset
    try:
        site = prompt_and_print.prompt_for_site()
    except KeyError:
        # The prompt looks the entered label up in a mapping; an unknown
        # site surfaces as KeyError. Exit cleanly rather than crash.
        # Parenthesized single-arg print is valid in both Python 2 and 3.
        print("Sorry, that is not a recognized site. Exiting.")
        return
    
    # Build up the spreadsheet of usernames that
    # exist on both Wikipedia and the passed site
    if prompt_and_print.prompt_for_build_username_csv():
        crosssite_username_dataset_mgr.build_crosssite_username_dataset(site)
        
    # Get the confirmed usernames from the spreadsheet since these will 
    # be the usernames from which Wikipedia edits and short texts are fetched 
    crosssite_usernames = crosssite_username_dataset_mgr.get_confirmed_usernames(site)
    
    # Build the spreadsheet of articles that
    # these usernames have edited on Wikipedia
    if prompt_and_print.prompt_for_build_edits_csv():
        wikipedia_edits_dataset_mgr.build_wikipedia_edits_dataset(crosssite_usernames, site)
        
    # Build the spreadsheet of short texts that
    # these usernames have posted on the input site
    if prompt_and_print.prompt_for_build_shorttexts_csv(site):
        short_text_dataset_mgr.build_shorttexts_dataset(crosssite_usernames, site)
     
    # Get the shorttexts fetched from the given site   
    shorttext_rows = short_text_dataset_mgr.get_shorttext_rows(site)
    
    # Cache the nouns and Named Entities detected by nltk, which are used when validating entities later
    if prompt_and_print.prompt_for_cache_nltk_entities(site):
        nltk_extraction_dataset_mgr.extract_entities(shorttext_rows, site)
        
    # Build the spreadsheet of named entities that are
    # contained within these short texts on the given site
    if prompt_and_print.prompt_for_build_entity_csv(site):
        entity_dataset_mgr.build_entities_dataset(shorttext_rows, site) 
# Example 2
def __filter_invalid_entities__(site, ne_objs):
    """Return the subset of *ne_objs* that are valid entities for *site*.

    Validity is decided by each object's own is_valid_entity check, given
    the site's English-language confirmed users and the cached nltk entities.
    """
    # Confirmed cross-site usernames, narrowed to English-language users.
    usernames = crosssite_username_dataset_mgr.get_confirmed_usernames(site)
    english_users = site.get_en_lang_users(usernames)
    # Cache of entities previously detected/validated via nltk extraction.
    entity_cache = nltk_extraction_dataset_mgr.get_nltk_entity_cache(site)
    kept = []
    for candidate in ne_objs:
        if candidate.is_valid_entity(english_users, entity_cache):
            kept.append(candidate)
    return kept