def build_all_datasets():
    """Interactively drive the full dataset-building pipeline for one site.

    Each stage is gated by a yes/no prompt so a partially built pipeline can
    be resumed without redoing earlier stages. Stages, in order:
      1. Cache Wikipedia editor usernames.
      2. Build the cross-site username spreadsheet for the chosen site.
      3. Build the spreadsheet of Wikipedia articles those users edited.
      4. Build the spreadsheet of short texts those users posted on the site.
      5. Cache nltk-detected nouns/Named Entities for later validation.
      6. Build the spreadsheet of named entities found in the short texts.

    Returns early (after printing a message) if the user enters an
    unrecognized site name.
    """
    # Build up the cache of Wikipedia editor usernames
    if prompt_and_print.prompt_for_build_wikipedia_username_cache():
        crosssite_username_dataset_mgr.build_wikipedia_editor_username_cache()

    # Prompt to ask from which site we want to build a dataset.
    # prompt_for_site raises KeyError for an unknown site name.
    try:
        site = prompt_and_print.prompt_for_site()
    except KeyError:
        # print(...) prints identically under Python 2 (parenthesized
        # expression) and Python 3, unlike the bare print statement.
        print("Sorry, that is not a recognized site. Exiting.")
        return

    # Build up the spreadsheet of usernames that
    # exist on both Wikipedia and the passed site
    if prompt_and_print.prompt_for_build_username_csv():
        crosssite_username_dataset_mgr.build_crosssite_username_dataset(site)

    # Get the confirmed usernames from the spreadsheet since these will
    # be the usernames from which Wikipedia edits and short texts are fetched
    crosssite_usernames = crosssite_username_dataset_mgr.get_confirmed_usernames(site)

    # Build the spreadsheet of articles that
    # these usernames have edited on Wikipedia
    if prompt_and_print.prompt_for_build_edits_csv():
        wikipedia_edits_dataset_mgr.build_wikipedia_edits_dataset(crosssite_usernames, site)

    # Build the spreadsheet of short texts that
    # these usernames have posted on the input site
    if prompt_and_print.prompt_for_build_shorttexts_csv(site):
        short_text_dataset_mgr.build_shorttexts_dataset(crosssite_usernames, site)

    # Get the shorttexts fetched from the given site
    shorttext_rows = short_text_dataset_mgr.get_shorttext_rows(site)

    # Cache the nouns and Named Entities detected by nltk, which are
    # used when validating entities later
    if prompt_and_print.prompt_for_cache_nltk_entities(site):
        nltk_extraction_dataset_mgr.extract_entities(shorttext_rows, site)

    # Build the spreadsheet of named entities that are
    # contained within these short texts on the given site
    if prompt_and_print.prompt_for_build_entity_csv(site):
        entity_dataset_mgr.build_entities_dataset(shorttext_rows, site)
def __filter_invalid_entities__(site, ne_objs):
    """Return the subset of *ne_objs* that are valid named entities.

    Validity is delegated to each object's ``is_valid_entity`` method,
    which is given the site's confirmed English-language users and the
    cached nltk entity data for the site.
    """
    # Resolve the confirmed cross-site usernames down to English-language users.
    confirmed = crosssite_username_dataset_mgr.get_confirmed_usernames(site)
    english_users = site.get_en_lang_users(confirmed)
    # Previously cached nltk nouns/Named Entities for this site.
    entity_cache = nltk_extraction_dataset_mgr.get_nltk_entity_cache(site)
    return [ne for ne in ne_objs if ne.is_valid_entity(english_users, entity_cache)]