def main():
    """
    Produce the simplified SpatialML corpus.

    Every file in config.SPATIALML_CORPUS_DIR whose name ends with config.SPATIALML_FILE_SUFFIX is parsed as xml,
    has its unneeded tags unwrapped (each tag replaced by its contents) and is written out, under the same file
    name, to config.SPATIALML_SIMPLE_DIR.
    """

    # tags that are always unwrapped, in the order they are handled
    unneeded_tag_names = ('LINK', 'RLINK', 'SIGNAL')

    # PLACE tags carrying any of these attribute values are unwrapped too:
    #   form="NOM"         - nominal references, e.g. 'city'
    #   predicative="true" - e.g. 'Japanese' rather than 'Japan'
    unwanted_place_attrs = ({'form': 'NOM'}, {'predicative': 'true'})

    corpus_dir = config.SPATIALML_CORPUS_DIR

    # keep only the corpus files of the required format
    wanted = []
    for entry in os.listdir(corpus_dir):
        if entry.endswith(config.SPATIALML_FILE_SUFFIX):
            wanted.append(entry)

    for filename in wanted:

        # parse as xml
        soup = BeautifulSoup(utilities.read_from_file(corpus_dir + filename), 'xml')

        # unwrap all unneeded tags (replace with contents)
        for tag_name in unneeded_tag_names:
            for tag in soup.find_all(tag_name):
                tag.unwrap()

        # unwrap the unwanted kinds of PLACE tag
        for attrs in unwanted_place_attrs:
            for tag in soup.find_all('PLACE', attrs=attrs):
                tag.unwrap()

        # write to file with same name in simple SpatialML directory
        simplified = str(soup)
        utilities.write_to_file(config.SPATIALML_SIMPLE_DIR + filename, simplified)
def evaluate():
    """
    Evaluate the pipeline against the SpatialML corpus and print the results.

    For every file in SPATIALML_RAW_DIR the text is tagged and run through identify(), and the resulting locations
    are scored against the locations extracted from the file of the same name in SPATIALML_SIMPLE_DIR. The four
    values returned per file are summed and then averaged over the files that were actually scored:

        index 0 - recognition precision       index 2 - recognition recall
        index 1 - disambiguation precision    index 3 - disambiguation recall

    The totals, the averaged recognition precision/recall and both F-measures are printed. Nothing is returned.
    """

    totals = [0, 0, 0, 0]

    # Count scored files explicitly. Removing skipped files from the list being iterated made the loop jump over
    # the file following every skipped one.
    num_tests = 0

    for spatialml_file in os.listdir(SPATIALML_RAW_DIR):

        print("Evaluating {}...".format(spatialml_file))

        # process file to obtain list of identified locations
        text = read_from_file(SPATIALML_RAW_DIR + spatialml_file)
        corenlp_tagged_text = corenlp_tag_text(text)
        identified_locations = identify(corenlp_tagged_text)

        # extract the "gold standard" locations from corresponding file in simplified corpus
        # NOTE(review): the file *contents* are passed here - confirm that is what the helper in scope expects.
        spatialml = read_from_file(SPATIALML_SIMPLE_DIR + spatialml_file)
        corpus_locations = get_locations_from_spatialml(spatialml)

        # TODO handle better - currently just skip where no matches either way
        if len(identified_locations) == 0 or len(corpus_locations) == 0:
            continue

        results = evaluate_identified_locs_against_corpus_locs(identified_locations, corpus_locations)
        for index, value in enumerate(results):
            totals[index] += value
        num_tests += 1

    print(totals)

    # nothing was scored, so there is nothing to average (and dividing would raise ZeroDivisionError)
    if num_tests == 0:
        print("No files could be evaluated.")
        return

    averages = [value / num_tests for value in totals]

    av_recog_prec = averages[0]
    av_recog_recall = averages[2]
    print(av_recog_prec, av_recog_recall)
    print("Recog F-measure ", harmonic_mean(av_recog_prec, av_recog_recall))

    av_disambig_prec = averages[1]
    av_disambig_recall = averages[3]
    print("Disambig F-measure ", harmonic_mean(av_disambig_prec, av_disambig_recall))
def main():
    """
    Produce the raw (plain text) SpatialML corpus.

    Every file in config.SPATIALML_CORPUS_DIR whose name ends with config.SPATIALML_FILE_SUFFIX is parsed as xml
    and just its text is written, under the same file name, to config.SPATIALML_RAW_DIR.
    """

    corpus_dir = config.SPATIALML_CORPUS_DIR

    # keep only the corpus files of the required format
    wanted = []
    for entry in os.listdir(corpus_dir):
        if entry.endswith(config.SPATIALML_FILE_SUFFIX):
            wanted.append(entry)

    for filename in wanted:
        # parse the markup and keep only the text it contains
        markup = utilities.read_from_file(corpus_dir + filename)
        plain_text = BeautifulSoup(markup, "xml").get_text()

        # write to file with same name in raw SpatialML directory
        utilities.write_to_file(config.SPATIALML_RAW_DIR + filename, plain_text)
def get_locations_from_spatialml(spatialml_file):
    """
    Process all place tags from some stripped SpatialML text (as obtained using strip_spatialml.py) into a list of
    location objects.

    :param spatialml_file: name of a file inside config.SPATIALML_SIMPLE_DIR; it is read and parsed as xml here.
    :return: list of CorpusLocation, one per PLACE element that is a direct child of the SpatialML element, in
             document order. start/stop are offsets counted in non-tag characters from the start of the SpatialML
             element's content.
    :raises Exception: if the SpatialML element has a child that is neither a PLACE tag nor a NavigableString.
    """

    # process the spatial_ml text as xml
    spatialml_text = utilities.read_from_file(config.SPATIALML_SIMPLE_DIR + spatialml_file)
    soup = BeautifulSoup(spatialml_text, 'xml')

    # iterate through all the child elements of the SpatialML tag (both Tags and NavigableStrings) keeping track of
    # non-tag chars covered
    chars_processed = 0
    locations = []

    for child in soup.find('SpatialML').children:

        # if reach a place tag process this as a CorpusLocation
        if child.name == 'PLACE':

            # .string is None for an empty tag or one with several children; fall back to the concatenated text so
            # the length below is still the number of non-tag chars covered rather than a TypeError
            name = child.string
            if name is None:
                name = child.get_text()

            gazref = child.get('gazref')
            country = child.get('country')
            coordinate = process_latLong(child['latLong'], spatialml_file) if child.has_attr('latLong') else None

            start = chars_processed
            chars_processed += len(name)
            stop = chars_processed

            # add new location to list
            locations.append(CorpusLocation(name, start, stop, gazref, country, coordinate))

        # otherwise just add length of the string to the chars processed
        elif isinstance(child, NavigableString):
            chars_processed += len(child)

        # should only be place tags or NavigableStrings as children so raise error
        else:
            raise Exception("Something went wrong...")

    return locations
def identify_spatialml_raw_locations(disambiguation_function, pickled_dir):
    """
    Run the pipeline over every raw SpatialML file and pickle what it finds.

    Each file in config.SPATIALML_RAW_DIR is tagged by CoreNLP and passed to identification.identify() together with
    the given disambiguation function. The resulting list of locations is pickled to a file of the same name inside
    pickled_dir (which is concatenated directly with the file name).
    """

    banner = "Running pipeline on raw SpatialML files using disambiguation function {}...\n"
    print(banner.format(disambiguation_function))

    raw_dir = config.SPATIALML_RAW_DIR

    for spatialml_file in os.listdir(raw_dir):

        print("Processing {}...".format(spatialml_file))

        # run text in file through pipeline to get list of IdentifiedLocations
        raw_text = utilities.read_from_file(raw_dir + spatialml_file)
        tagged_text = corenlp_interface.corenlp_tag_text(raw_text)
        # TODO deal with using different disambig methods better
        found = identification.identify(tagged_text, disambiguation_function)

        # pickle locations to corresponding file in corresponding dir
        target = pickled_dir + spatialml_file
        with open(target, 'wb') as pickle_file:
            pickle.dump(found, pickle_file)

    print("\n\n")
def map_locations(url=None, file=None, display_map=False):
    """
    Main logic of program, perform entire pipeline on the text indicated by the command line arguments given, writing
    each stage of the pipeline to files in the results directory.

    :param url: url of an article to fetch through the Readability API; only used when no file is given.
    :param file: path of a file to read the article text from; takes precedence over url.
    :param display_map: when true, open the generated map html in a new browser tab.

    Files written to the results directory: 01_content.txt, 02_corenlp_tagged.xml, 04_kml.kml, 05_map_view.html and
    06_identified_locs.html.

    Exits the process with status 1 if neither url nor file is given, or if the connection to the CoreNLP server is
    refused.
    """

    # imported here so the function does not depend on these being imported at module level
    import sys
    import traceback

    # exit if neither url nor file given
    if url is None and file is None:
        print("A url or file must be given to read content to process from, see help (-h or --help option) for more "
              "information.")
        sys.exit(1)

    # starting message
    loc = url if file is None else file
    print("Starting map_locations for {}...".format(loc))

    # obtain the content to process
    if file is not None:
        # read content from file
        print("Reading article from file...")
        title = file
        content = utilities.read_from_file(file)
    else:
        # make request to Readability API for url (url cannot be None here, see the check above)
        print("Obtaining article from url...")
        readability_response = readability_interface.readability_request(url)
        title = readability_response['title']
        html_content = readability_response['content']
        content = BeautifulSoup(html_content).get_text()

    # form results directory for article
    print("Forming results directory for article...")
    results_dir = make_results_dir(title)

    # store content of article
    print("Writing article content to file...")
    content_file = results_dir + '01_content.txt'
    utilities.write_to_file(content_file, content)

    # tag file using Stanford CoreNLP server
    print("Tagging named entities in article...")
    try:
        corenlp_tagged_text = corenlp_interface.corenlp_tag_text(content)
    except ConnectionRefusedError:
        # print (most likely) reason for error, trace, and quit - nothing below can run without the tagged text
        print("Stanford CoreNLP server must be run to tag named entities! \n(settings in config.py)")
        traceback.print_exc()
        sys.exit(1)

    # store tagged article
    print("Writing tagged article to file...")
    corenlp_tagged_file = results_dir + '02_corenlp_tagged.xml'
    utilities.write_to_file(corenlp_tagged_file, corenlp_tagged_text)

    # disambiguate identified locations to find most likely candidate (candidates written to files in disambiguate())
    print("Disambiguating identified locations...")
    identified_locations = identification.identify(corenlp_tagged_text, results_dir)

    # form kml for identified locations
    print("Creating kml for article locations...")
    kml = kml_generation.create_kml(identified_locations)

    print("Writing kml to file...")
    relative_kml_file = '04_kml.kml'
    kml_file = results_dir + relative_kml_file
    utilities.write_to_file(kml_file, kml)

    print("Creating html files for map...")

    # map html file (the kml is referenced relative to the html file, both live in results_dir)
    with open(config.CONTEXT_DIR + config.MAP_VIEW_TEMPLATE) as template_file:
        template = string.Template(template_file.read())
    html = template.substitute(kml_file=relative_kml_file, title=title)
    map_html_file = results_dir + '05_map_view.html'
    utilities.write_to_file(map_html_file, html)

    # article html file
    with open(config.CONTEXT_DIR + config.ARTICLE_TEMPLATE) as template_file:
        template = string.Template(template_file.read())

    # Form article content html, adding bold tags around identified locations.
    # NOTE(review): the article text is placed into the html unescaped - confirm whether that is acceptable.
    content_html_list = list(content)

    # map each start position to its stop position (a later location with the same start replaces an earlier one)
    positions = {}
    for ided_loc in identified_locations:
        positions[ided_loc.start] = ided_loc.stop

    # add the tags from the last location to the first so earlier positions don't shift; the closing tag goes in
    # before the opening one for the same reason
    # NOTE(review): the "- 1" presumably converts 1-based offsets to list indices - confirm against identify().
    for start_pos in sorted(positions, reverse=True):
        stop_pos = positions[start_pos]
        content_html_list.insert(stop_pos - 1, '</b>')
        content_html_list.insert(start_pos - 1, '<b>')

    # replace newlines with paragraphs
    for index, el in enumerate(content_html_list):
        if el == '\n':
            content_html_list[index] = '<p>'

    content_html = ''.join(content_html_list)

    # create and save the html
    html = template.substitute(article_title=title, article_content=content_html)
    article_html_file = results_dir + '06_identified_locs.html'
    utilities.write_to_file(article_html_file, html)

    if display_map:
        print("Opening map...")
        webbrowser.open_new_tab(map_html_file)

    print("Map: file://" + map_html_file)

    print("map_locations successfully completed for {}.\n".format(loc))