def query_page_content_text(page_title):
    try:
        if "#" in page_title:
            # ignore anchor tags (for example Microbrewery#Craft beer) and just get the main page title
            page_title = page_title[:page_title.index("#")]
        content_query = 'titles='+(str(page_title).replace(' ', '_'))+'&prop=revisions&rvprop=content&format=xml'
        content_xml = __query_wiki__(content_query)
        dom = parseString(content_xml)
        content = dom.getElementsByTagName('rev')[0].childNodes[0].data

        if "#REDIRECT" in content:
            # this is a redirect page, so we need the original page with the actual content
            orig_page_title = content[content.index('[[')+2:content.index(']]')]
            return query_page_content_text(orig_page_title)

        # clean out wiki markup
        content = __clean_wikimarkup__(content)

        # clean out newlines, non-printable characters, etc.
        content = text_util.format_text_for_NER(content, None)
        content = content.encode("utf-8")
        return content
    except Exception as e:
        print "Problem retrieving page content of page "+str(page_title), e
        return ''
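# __query_wiki__ (used above) is defined elsewhere in this module. The sketch
# below is only a minimal illustration of what such a helper could look like,
# assuming it issues a GET against the English Wikipedia MediaWiki action API
# and returns the raw XML response; the endpoint URL and the lack of
# URL-escaping, retries, or rate limiting are simplifying assumptions, not the
# project's actual code.
def __query_wiki_sketch__(query):
    import urllib2
    url = 'https://en.wikipedia.org/w/api.php?action=query&' + query
    response = urllib2.urlopen(url)
    try:
        # the caller (query_page_content_text) parses this XML with minidom
        return response.read()
    finally:
        response.close()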
def find_and_construct_named_entities(shorttext_id, original_shorttext, username, site=None):
    # use Wikipedia Miner and DBpedia Spotlight to detect
    # named entities and their candidate resources
    detected_entities = []

    clean_shorttext = text_util.format_text_for_NER(original_shorttext, site)
    try:
        sf_to_candidates_wikiminer = find_candidates_wikipedia_miner(clean_shorttext)
    except Exception:
        sf_to_candidates_wikiminer = {}
    try:
        sf_to_candidates_dbpedia = find_candidates_dbpedia(clean_shorttext)
    except Exception:
        sf_to_candidates_dbpedia = {}
    all_detected_surface_forms = set(sf_to_candidates_wikiminer.keys()).union(sf_to_candidates_dbpedia.keys())

    # now construct a NamedEntity object for each detected surface form
    for surface_form in all_detected_surface_forms:
        ne_obj = NamedEntity(surface_form, shorttext_id, original_shorttext, username, site)

        # set the NamedEntity's baseline candidate rankings
        if surface_form in sf_to_candidates_wikiminer:
            ne_obj.set_wikipedia_miner_ranking(sf_to_candidates_wikiminer[surface_form])
        if surface_form in sf_to_candidates_dbpedia:
            ne_obj.set_dbpedia_spotlight_ranking(sf_to_candidates_dbpedia[surface_form])

        detected_entities.append(ne_obj)
    return detected_entities
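# Illustrative call (hypothetical values: the short text id, text, and username
# below are made up; `site` is omitted and defaults to None, as in the signature
# above). The surface_form attribute and get_entity_id() accessor are the same
# ones used by build_entities_dataset below.
#
#   entities = find_and_construct_named_entities('st-001',
#                                                'I toured a microbrewery in Portland',
#                                                'sample_user')
#   for ne_obj in entities:
#       print ne_obj.surface_form, ne_obj.get_entity_id()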
def build_entities_dataset(shorttext_rows, site):
    siteNameStr = str(site.siteName)

    # Load or create/initialize the spreadsheet of users' short texts
    entity_csv_path = __get_entities_csv_path__(site)
    output_str = __get_detected_entities_output_str__(site)
    headers = [COLUMN_ENTITY_ID, __COLUMN_ENTITY_STRING__, COLUMN_SHORTTEXT_ID, COLUMN_SHORTTEXT_STRING, COLUMN_USERNAME]
    entities_in_csv = csv_util.load_or_initialize_csv(entity_csv_path, output_str, headers, COLUMN_ENTITY_ID)
    shorttexts_in_csv = csv_util.get_all_column_values(entity_csv_path, COLUMN_SHORTTEXT_ID)
    print "A total of "+str(len(shorttext_rows))+" short texts available to detect and resolve entities in..."

    # Load the cache of ambiguous entity objects
    ne_objs = pkl_util.load_pickle(output_str, __get_ne_cache_path__(site))
    if ne_objs is None:
        ne_objs = []

    # Load the cache of short texts that contain no entities,
    # so we don't need to keep querying services with them
    entityless_shorttexts = get_entityless_shorttexts(site)

    # Load the cache of problematic short texts that we can
    # go back and look at later
    problematic_shorttexts = get_problematic_shorttexts(site)

    # Prompt for the desired number of entities to detect and add to the csv
    desired_num_entities = prompt_and_print.prompt_num_entries_to_build(output_str, shorttexts_in_csv)

    entities_rows = []
    progress_count = 1
    all_shorttexts_done = True
    for shorttext_row in shorttext_rows:
        shorttext_id = shorttext_row[0]
        if shorttext_id in shorttexts_in_csv or shorttext_id in entityless_shorttexts or shorttext_id in problematic_shorttexts:
            # already did entities for this short text (and either successfully
            # detected some, successfully detected none, or encountered an error)
            continue
        all_shorttexts_done = False

        try:
            if len(entities_in_csv) >= desired_num_entities:
                # have enough, so exit
                break

            if progress_count % 10 == 0:
                print "Detecting named entities in short texts posted on "+siteNameStr+\
                    " by cross-site usernames... \nNumber of short texts whose entities have been fetched so far: \n"+\
                    str(len(entities_in_csv))
            progress_count = progress_count + 1

            original_shorttext = shorttext_row[1]
            username = shorttext_row[2]

            # get the entities contained in each short text;
            # clean the short text before attempting to detect entities in it
            clean_shorttext = text_util.format_text_for_NER(original_shorttext, site)
            if clean_shorttext == '':
                # whole string was invalid, perhaps a URL or
                # some other content that gets totally filtered
                problematic_shorttexts.append(shorttext_id)
                continue

            detected_entities = named_entity_finder.find_and_construct_named_entities(shorttext_id, original_shorttext, username, site)
            if len(detected_entities) == 0:
                entityless_shorttexts.append(shorttext_id)

            for ne_obj in detected_entities:
                # cache this entity object
                ne_objs.append(ne_obj)

                # make a row in the spreadsheet for this entity
                ne_id = ne_obj.get_entity_id()
                entity_row = [ne_id, ne_obj.surface_form, shorttext_id, original_shorttext, username]
                entities_rows.append(entity_row)

                # keep track that we'll be adding this entity to the csv
                entities_in_csv.append(ne_id)
        except Exception as st_e:
            print "Problematic short text "+str(shorttext_row[1]), st_e
            if 'referenced before assignment' in str(st_e):
                raise # it's a server error, so we need to stop
            problematic_shorttexts.append(shorttext_id)
            continue

    # update the spreadsheet with any new users' short texts that have been fetched
    csv_util.append_to_spreadsheet(output_str, entity_csv_path, entities_in_csv, entities_rows, False)

    # update the caches of ambiguous surface form objects,
    # entityless short texts, and problematic short texts
    pkl_util.write_pickle(output_str, ne_objs, __get_ne_cache_path__(site))
    pkl_util.write_pickle(__entityless_output_str__, entityless_shorttexts, __get_entityless_cache_path__(site))
    pkl_util.write_pickle(__problematic_output_str__, problematic_shorttexts, __get_problematic_cache_path__(site))
    print "Cached a total of "+str(len(ne_objs))+" ambiguous named entities"

    if all_shorttexts_done:
        print "Completed detecting and resolving entities in all short texts available."
    else:
        print "More short texts available to detect and resolve entities for."
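# Illustrative invocation of build_entities_dataset (hypothetical: the
# [shorttext_id, original_shorttext, username] row layout follows how
# shorttext_row is indexed above, but the example values and the `site`
# object, which only needs a siteName attribute here, are made up):
#
#   rows = [['st-001', 'I toured a microbrewery in Portland', 'sample_user']]
#   build_entities_dataset(rows, site)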