def main():
    """ Produce a simplified copy of every SpatialML corpus file: LINK, RLINK and SIGNAL tags, nominal PLACE tags
        and predicative PLACE tags are replaced by their contents, and the result is written under the same
        filename in the simple SpatialML directory.
    """

    # only corpus files carrying the configured suffix are processed
    corpus_filenames = [name for name in os.listdir(config.SPATIALML_CORPUS_DIR)
                        if name.endswith(config.SPATIALML_FILE_SUFFIX)]

    for corpus_filename in corpus_filenames:

        # load the file and parse it as xml
        raw_xml = utilities.read_from_file(config.SPATIALML_CORPUS_DIR + corpus_filename)
        document = BeautifulSoup(raw_xml, 'xml')

        # tags that are never wanted: collect them all first, then replace each with its contents
        unneeded_tags = [found
                         for tag_name in ('LINK', 'RLINK', 'SIGNAL')
                         for found in document.find_all(tag_name)]
        for unneeded_tag in unneeded_tags:
            unneeded_tag.unwrap()

        # PLACE tags are dropped when nominal (e.g. 'city') or predicative (e.g. 'Japanese' rather than 'Japan');
        # each filter is searched only after the previous one has been unwrapped, so a tag matching both filters
        # is found (and unwrapped) just once
        for place_filter in ({'form': 'NOM'}, {'predicative': 'true'}):
            for place_tag in document.find_all('PLACE', attrs=place_filter):
                place_tag.unwrap()

        # save the stripped document under the same name in the simple SpatialML directory
        simplified_xml = str(document)
        utilities.write_to_file(config.SPATIALML_SIMPLE_DIR + corpus_filename, simplified_xml)
# Example #2
# 0
def evaluate():
    """ Run the identification pipeline over every raw SpatialML file, compare the identified locations with the
        locations from the corresponding simplified corpus file, and print the averaged recognition figures and
        the recognition and disambiguation F-measures.

        Files for which either side yields no locations are skipped and do not count towards the averages. If no
        file contributes at all, a message is printed and nothing is averaged.
    """
    # for each original file in the SpatialML corpus
    all_files = os.listdir(SPATIALML_RAW_DIR)

    # running sums, in the order: recog precision, disambig precision, recog recall, disambig recall
    totals = [0, 0, 0, 0]

    # Number of files that actually contributed to the totals. This is counted separately rather than removing
    # skipped files from all_files: removing items from a list while iterating over it makes the loop skip the
    # item that follows each removed one.
    num_tests = 0

    for spatialml_file in all_files:

        print("Evaluating {}...".format(spatialml_file))

        # process file to obtain list of identified locations
        text = read_from_file(SPATIALML_RAW_DIR + spatialml_file)
        corenlp_tagged_text = corenlp_tag_text(text)
        identified_locations = identify(corenlp_tagged_text)

        # extract the "gold standard" locations from corresponding file in simplified corpus
        spatialml = read_from_file(SPATIALML_SIMPLE_DIR + spatialml_file)
        corpus_locations = get_locations_from_spatialml(spatialml)

        # TODO handle better - currently just skip where no matches either way
        if len(identified_locations) > 0 and len(corpus_locations) > 0:
            results = evaluate_identified_locs_against_corpus_locs(identified_locations, corpus_locations)
            for index, result in enumerate(results):
                totals[index] += result
            num_tests += 1

        print(totals)

    # nothing to average - avoid dividing by zero
    if num_tests == 0:
        print("No files could be evaluated.")
        return

    averages = [value / num_tests for value in totals]

    av_recog_prec = averages[0]
    av_recog_recall = averages[2]
    print(av_recog_prec, av_recog_recall)
    print("Recog F-measure ", harmonic_mean(av_recog_prec, av_recog_recall))

    av_disambig_prec = averages[1]
    av_disambig_recall = averages[3]
    print("Disambig F-measure ", harmonic_mean(av_disambig_prec, av_disambig_recall))
# Example #3
# 0
def main():
    """ Write the plain text of every SpatialML corpus file (all markup removed) to a file with the same name in
        the raw SpatialML directory.
    """

    for corpus_filename in os.listdir(config.SPATIALML_CORPUS_DIR):

        # skip anything that is not a SpatialML file of the required format
        if not corpus_filename.endswith(config.SPATIALML_FILE_SUFFIX):
            continue

        # parse the file as xml and keep only its text
        raw_xml = utilities.read_from_file(config.SPATIALML_CORPUS_DIR + corpus_filename)
        plain_text = BeautifulSoup(raw_xml, "xml").get_text()

        utilities.write_to_file(config.SPATIALML_RAW_DIR + corpus_filename, plain_text)
def get_locations_from_spatialml(spatialml_file):
    """ Process all place tags from some stripped SpatialML text (as obtained using strip_spatialml.py) into a list
    of location objects.

    spatialml_file is the name of a file inside the simple SpatialML directory. Returns a list of CorpusLocation
    objects in document order; each one's start/stop are offsets into the concatenated text of the SpatialML
    element's children (start inclusive, stop exclusive). Raises an Exception if a child of the SpatialML element
    is neither a PLACE tag nor a NavigableString.
    """

    # process the spatial_ml text as xml
    spatialml_text = utilities.read_from_file(config.SPATIALML_SIMPLE_DIR + spatialml_file)
    soup = BeautifulSoup(spatialml_text, 'xml')

    # iterate through all the child elements of the SpatialML tag (both Tags and NavigableStrings) keeping track of
    # non-tag chars covered
    chars_processed = 0
    locations = []
    for child in soup.find('SpatialML').children:

        # if reach a place tag process this as a CorpusLocation
        if child.name == 'PLACE':

            # get_text() rather than .string: .string is None for an empty tag or a tag with several children,
            # which would make the length calculation below fail
            name = child.get_text()

            # attributes are optional - Tag.get gives None when one is absent
            gazref = child.get('gazref')
            country = child.get('country')
            coordinate = process_latLong(child['latLong'], spatialml_file) if child.has_attr('latLong') else None

            start = chars_processed
            chars_processed += len(name)
            stop = chars_processed

            # add new location to list
            locations.append(CorpusLocation(name, start, stop, gazref, country, coordinate))

        # otherwise just add length of the string to the chars processed
        elif isinstance(child, NavigableString):
            chars_processed += len(child)

        # should only be place tags or NavigableStrings as children so raise error
        else:
            raise Exception("Something went wrong...")

    return locations
def identify_spatialml_raw_locations(disambiguation_function, pickled_dir):
    """ Run the pipeline, with the given disambiguation function, over every file in the raw SpatialML directory
        and pickle each resulting list of locations to a file of the same name in pickled_dir.
    """

    banner = "Running pipeline on raw SpatialML files using disambiguation function {}...\n"
    print(banner.format(disambiguation_function))

    raw_dir = config.SPATIALML_RAW_DIR
    for raw_filename in os.listdir(raw_dir):

        print("Processing {}...".format(raw_filename))

        # tag the raw text, then identify the locations in the tagged text
        raw_text = utilities.read_from_file(raw_dir + raw_filename)
        tagged_text = corenlp_interface.corenlp_tag_text(raw_text)
        identified = identification.identify(tagged_text, disambiguation_function)

        # TODO deal with using different disambig methods better
        # store the identified locations under the same filename in the pickle directory
        pickle_path = pickled_dir + raw_filename
        with open(pickle_path, 'wb') as pickle_out:
            pickle.dump(identified, pickle_out)

    print("\n\n")
# Example #6
# 0
def map_locations(url=None, file=None, display_map=False):
    """ Main logic of program, perform entire pipeline on the text indicated by the arguments given, writing each
        stage of the pipeline to files in the results directory.

        url -- address of an article to obtain through the Readability interface (only used when no file is given)
        file -- path of a file to read the article text from; takes precedence over url
        display_map -- when true, open the generated map page in a new browser tab

        Exits with status 1 if neither url nor file is given. A ConnectionRefusedError raised while tagging the
        text is re-raised after a hint about the CoreNLP server has been printed.
    """

    # exit if neither url nor file given
    if url is None and file is None:
        print("A url or file must be given to read content to process from, see help (-h or --help option) for more "
              "information.")
        raise SystemExit(1)

    # starting message
    loc = url if file is None else file
    print("Starting map_locations for {}...".format(loc))

    # obtain the content to process
    if file is not None:
        # read content from file
        print("Reading article from file...")
        title = file
        content = utilities.read_from_file(file)

    else:
        # make request to Readability API for url
        print("Obtaining article from url...")
        readability_response = readability_interface.readability_request(url)
        title = readability_response['title']
        html_content = readability_response['content']
        # NOTE(review): no parser is named here, so the parser used depends on what is installed - confirm
        content = BeautifulSoup(html_content).get_text()

    # form results directory for article
    print("Forming results directory for article...")
    results_dir = make_results_dir(title)

    # store content of article
    print("Writing article content to file...")
    content_file = results_dir + '01_content.txt'
    utilities.write_to_file(content_file, content)

    # tag file using Stanford CoreNLP server
    print("Tagging named entities in article...")
    try:
        corenlp_tagged_text = corenlp_interface.corenlp_tag_text(content)
    except ConnectionRefusedError:
        # print (most likely) reason for error, then re-raise: the pipeline cannot continue without the tagged
        # text, and re-raising keeps the original traceback
        print("Stanford CoreNLP server must be run to tag named entities! (settings in config.py)")
        raise

    # store tagged article
    print("Writing tagged article to file...")
    corenlp_tagged_file = results_dir + '02_corenlp_tagged.xml'
    utilities.write_to_file(corenlp_tagged_file, corenlp_tagged_text)

    # disambiguate identified locations to find most likely candidate (candidates written to files in disambiguate())
    print("Disambiguating identified locations...")
    identified_locations = identification.identify(corenlp_tagged_text, results_dir)

    # form kml for identified locations
    print("Creating kml for article locations...")
    kml = kml_generation.create_kml(identified_locations)

    print("Writing kml to file...")
    relative_kml_file = '04_kml.kml'
    kml_file = results_dir + relative_kml_file
    utilities.write_to_file(kml_file, kml)

    print("Creating html files for map...")

    # map html file
    with open(config.CONTEXT_DIR + config.MAP_VIEW_TEMPLATE) as template_file:
        template = string.Template(template_file.read())
    html = template.substitute(kml_file=relative_kml_file, title=title)
    map_html_file = results_dir + '05_map_view.html'
    utilities.write_to_file(map_html_file, html)

    # article html file
    with open(config.CONTEXT_DIR + config.ARTICLE_TEMPLATE) as template_file:
        template = string.Template(template_file.read())

    # Form article content html, adding bold tags around identified locations.
    # One list element per character, so tags can be inserted by position.
    content_html_list = list(content)

    # start -> stop of every identified location (a later location with the same start replaces an earlier one)
    positions = {ided_loc.start: ided_loc.stop for ided_loc in identified_locations}

    # add the tags from the last location to the first so earlier positions don't shift
    # NOTE(review): the -1 offsets look like they assume 1-based start/stop values - confirm against the
    # identified location objects; with a start of 0, insert(-1, ...) would place '<b>' before the last element
    for start_pos in sorted(positions, reverse=True):
        stop_pos = positions[start_pos]
        content_html_list.insert(stop_pos-1, '</b>')
        content_html_list.insert(start_pos-1, '<b>')

    # replace newlines with paragraphs
    content_html = ''.join('<p>' if el == '\n' else el for el in content_html_list)

    # create and save the html
    html = template.substitute(article_title=title, article_content=content_html)
    article_html_file = results_dir + '06_identified_locs.html'
    utilities.write_to_file(article_html_file, html)

    if display_map:
        print("Opening map...")
        webbrowser.open_new_tab(map_html_file)

    print("Map: file://" + map_html_file)

    print("map_locations successfully completed for {}.\n".format(loc))