def update_countries_with_regions(entities, countries, text):
    """Add countries implied by recognised subdivisions to ``countries``.

    Every distinct entity tagged ``'LOCATION'`` is looked up in the
    ``subdivision`` column of the module-level ``almost_everything`` frame.
    A subdivision that maps to exactly one country contributes that country
    with a fixed probability of 0.8; a subdivision that maps to several
    countries is passed to ``context_adjustment`` and each candidate country
    contributes the count/probability returned for it.

    Args:
        entities: iterable of indexable items where ``item[0]`` is the
            entity text and ``item[1]`` is its tag.
        countries: dict mapping country name to
            ``{'count': ..., 'probability': ...}``. It is updated in place.
        text: the source text, forwarded to ``context_adjustment``.

    Returns:
        A ``(countries, ambiguous_locations)`` tuple. ``ambiguous_locations``
        maps each multi-country place name to
        ``{'possible_countries': [...]}``.
    """
    ambiguous_locations = {}

    def merge(country, count, probability):
        # Fold new evidence for `country` into any prior entry.
        if country in countries:
            priors = countries[country]
            countries[country] = {
                'count': priors['count'] + count,
                'probability': probabilities.independent_either_probability(
                    priors['probability'], probability),
            }
        else:
            countries[country] = {'count': count, 'probability': probability}

    # Gather the matching rows for each distinct LOCATION entity and
    # concatenate once, rather than growing a frame inside the loop.
    matches = []
    for entity in {i[0] for i in entities if i[1] == 'LOCATION'}:
        rows = almost_everything[almost_everything.subdivision == entity]
        if not rows.empty:
            matches.append(rows)
    if not matches:
        return countries, ambiguous_locations

    subs = pd.concat(matches, ignore_index=True)
    subs['country_name'] = subs['country_name'].apply(standardize_country_name)
    no_dupes = subs.drop_duplicates(['country_name', 'subdivision'])

    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for place, count in no_dupes.subdivision.value_counts().items():
        if count == 1:
            # only one country exists for a single subdivision
            probability = 0.8  # correcting for imperfect entity parsing
            # Taken from `subs` (duplicates included), so the count reflects
            # every matching row, not just the unique pair.
            possible_countries = subs[subs.subdivision == place].country_name.tolist()
            merge(possible_countries[0], len(possible_countries), probability)
        else:
            # multiple countries exist for a single subdivision
            probability = 1.0 / count
            possible_countries = no_dupes[no_dupes.subdivision == place].country_name.tolist()
            new_probabilities = context_adjustment(place, possible_countries, probability, text)
            ambiguous_locations[place] = {'possible_countries': possible_countries}
            for country in possible_countries:
                adjusted = new_probabilities[country]
                merge(country, adjusted['count'], adjusted['probability'])
    return countries, ambiguous_locations
# Example #2
0
def update_countries_with_regions(entities, countries, text):
    """Add countries implied by recognised subdivisions to ``countries``.

    Every distinct entity tagged ``'LOCATION'`` is looked up in the
    ``subdivision`` column of the module-level ``almost_everything`` frame.
    A subdivision that maps to exactly one country contributes that country
    with a fixed probability of 0.8; a subdivision that maps to several
    countries is passed to ``context_adjustment`` and each candidate country
    contributes the count/probability returned for it.

    Args:
        entities: iterable of indexable items where ``item[0]`` is the
            entity text and ``item[1]`` is its tag.
        countries: dict mapping country name to
            ``{'count': ..., 'probability': ...}``. It is updated in place.
        text: the source text, forwarded to ``context_adjustment``.

    Returns:
        A ``(countries, ambiguous_locations)`` tuple. ``ambiguous_locations``
        maps each multi-country place name to
        ``{'possible_countries': [...]}``.
    """
    ambiguous_locations = {}

    def merge(country, count, probability):
        # Fold new evidence for `country` into any prior entry.
        if country in countries:
            priors = countries[country]
            countries[country] = {
                'count': priors['count'] + count,
                'probability': probabilities.independent_either_probability(
                    priors['probability'], probability),
            }
        else:
            countries[country] = {'count': count, 'probability': probability}

    # Gather the matching rows for each distinct LOCATION entity and
    # concatenate once, rather than growing a frame inside the loop.
    matches = []
    for entity in {i[0] for i in entities if i[1] == 'LOCATION'}:
        rows = almost_everything[almost_everything.subdivision == entity]
        if not rows.empty:
            matches.append(rows)
    if not matches:
        return countries, ambiguous_locations

    subs = pd.concat(matches, ignore_index=True)
    subs['country_name'] = subs['country_name'].apply(standardize_country_name)
    no_dupes = subs.drop_duplicates(['country_name', 'subdivision'])

    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for place, count in no_dupes.subdivision.value_counts().items():
        if count == 1:
            # only one country exists for a single subdivision
            probability = 0.8  # correcting for imperfect entity parsing
            # Taken from `subs` (duplicates included), so the count reflects
            # every matching row, not just the unique pair.
            possible_countries = subs[subs.subdivision ==
                                      place].country_name.tolist()
            merge(possible_countries[0], len(possible_countries), probability)
        else:
            # multiple countries exist for a single subdivision
            probability = 1.0 / count
            possible_countries = no_dupes[no_dupes.subdivision ==
                                          place].country_name.tolist()
            new_probabilities = context_adjustment(place,
                                                   possible_countries,
                                                   probability, text)
            ambiguous_locations[place] = {
                'possible_countries': possible_countries
            }
            for country in possible_countries:
                adjusted = new_probabilities[country]
                merge(country, adjusted['count'], adjusted['probability'])
    return countries, ambiguous_locations
# Example #3
0
def context_adjustment(place, possible_countries, probability, text):
    """Re-weight the candidate countries of an ambiguous place name.

    For each occurrence of ``place`` in ``text`` a window of up to 60
    characters on either side is inspected. Its alphabetic tokens (plus
    their bigrams and trigrams) are matched against the ``subdivision``
    column of the module-level ``almost_everything`` frame. The first
    window that yields any match is handed to ``adjust_probabilities``;
    later windows are ignored so that repeated copies of the same
    misleading clue are not counted several times.

    Args:
        place: the ambiguous place name.
        possible_countries: candidate country names for ``place``.
        probability: the prior probability assigned to each candidate.
        text: the full text in which ``place`` occurs.

    Returns:
        dict mapping country name to
        ``{'count': ..., 'probability': ...}``. When no contextual clue is
        found, every candidate gets ``count`` 1 and the prior
        ``probability`` unchanged.
    """
    window = 60
    text_length = len(text)
    # get contextual windows revolving around ambiguous place name
    contexts = [
        text[max(i - window, 0):min(i + window, text_length)]
        for i in textual.find_all(text, place)
    ]

    new_probabilities = []
    for context in contexts:
        context = textual.remove_word(context, place)
        tokens = nltk.word_tokenize(context)
        # All-uppercase alphabetic tokens are kept so that codes can be
        # restored to upper case after lower-casing below.
        codes = {t for t in tokens if t == t.upper() and t.isalpha()}

        # Edge tokens are likely not whole words.
        # NOTE(review): this slice drops the first token and the last TWO
        # tokens; confirm whether [1:-1] was intended.
        words = [token.lower() for token in tokens if token.isalpha()][1:-2]
        candidates = (
            words
            + [' '.join(t) for t in bigrams(words)]
            + [' '.join(t) for t in trigrams(words)]
        )

        # fix capitalization of state codes
        candidates = [
            c.upper() if c.upper() in codes else textual.titlecase(c)
            for c in candidates
        ]

        # check whether contextual token is a country subdivision
        # NOTE(review): country names are taken from almost_everything as-is;
        # if possible_countries holds normalised names the lookups below may
        # miss -- confirm against the caller.
        context_countries = []
        for candidate in candidates:
            rows = almost_everything[almost_everything.subdivision == candidate]
            if not rows.empty:
                context_countries.extend(rows.country_name.tolist())

        # use the number of contextual countries that are the same as the
        # ambiguous countries to compute new probabilities
        if context_countries:
            context_count = Counter(context_countries)
            # A real list (not a one-shot zip iterator) so the helper may
            # iterate it more than once.
            ambiguous_country_counts = [
                (country, context_count[country])
                for country in possible_countries
            ]
            new_probabilities.extend(
                adjust_probabilities(probability, ambiguous_country_counts))
            break  # stop at the first context that provides a clue

    if not new_probabilities:
        # No contextual clue at all: keep the prior for every candidate.
        return {
            country: {'count': 1, 'probability': probability}
            for country in possible_countries
        }

    # combine multiple entries into a single count and probability per country
    combined = {}
    for country in {i[0] for i in new_probabilities}:
        probs = [i[1] for i in new_probabilities if i[0] == country]
        merged = probs[0]
        for extra in probs[1:]:
            merged = probabilities.independent_either_probability(merged, extra)
        combined[country] = {'count': len(probs), 'probability': merged}
    return combined
def context_adjustment(place, possible_countries, probability, text):
    """Re-weight the candidate countries of an ambiguous place name.

    For each occurrence of ``place`` in ``text`` a window of up to 60
    characters on either side is inspected. Its alphabetic tokens (plus
    their bigrams and trigrams) are matched against the ``subdivision``
    column of the module-level ``almost_everything`` frame. The first
    window that yields any match is handed to ``adjust_probabilities``;
    later windows are ignored so that repeated copies of the same
    misleading clue are not counted several times.

    Args:
        place: the ambiguous place name.
        possible_countries: candidate country names for ``place``.
        probability: the prior probability assigned to each candidate.
        text: the full text in which ``place`` occurs.

    Returns:
        dict mapping country name to
        ``{'count': ..., 'probability': ...}``. When no contextual clue is
        found, every candidate gets ``count`` 1 and the prior
        ``probability`` unchanged.
    """
    window = 60
    text_length = len(text)
    # get contextual windows revolving around ambiguous place name
    contexts = [
        text[max(i - window, 0):min(i + window, text_length)]
        for i in textual.find_all(text, place)
    ]

    new_probabilities = []
    for context in contexts:
        context = textual.remove_word(context, place)
        tokens = nltk.word_tokenize(context)
        # All-uppercase alphabetic tokens are kept so that codes can be
        # restored to upper case after lower-casing below.
        codes = {t for t in tokens if t == t.upper() and t.isalpha()}

        # Edge tokens are likely not whole words.
        # NOTE(review): this slice drops the first token and the last TWO
        # tokens; confirm whether [1:-1] was intended.
        words = [token.lower() for token in tokens if token.isalpha()][1:-2]
        candidates = (
            words
            + [' '.join(t) for t in bigrams(words)]
            + [' '.join(t) for t in trigrams(words)]
        )

        # fix capitalization of state codes
        candidates = [
            c.upper() if c.upper() in codes else textual.titlecase(c)
            for c in candidates
        ]

        # check whether contextual token is a country subdivision
        # NOTE(review): country names are taken from almost_everything as-is;
        # if possible_countries holds normalised names the lookups below may
        # miss -- confirm against the caller.
        context_countries = []
        for candidate in candidates:
            rows = almost_everything[almost_everything.subdivision == candidate]
            if not rows.empty:
                context_countries.extend(rows.country_name.tolist())

        # use the number of contextual countries that are the same as the
        # ambiguous countries to compute new probabilities
        if context_countries:
            context_count = Counter(context_countries)
            # A real list (not a one-shot zip iterator) so the helper may
            # iterate it more than once.
            ambiguous_country_counts = [
                (country, context_count[country])
                for country in possible_countries
            ]
            new_probabilities.extend(
                adjust_probabilities(probability, ambiguous_country_counts))
            break  # stop at the first context that provides a clue

    if not new_probabilities:
        # No contextual clue at all: keep the prior for every candidate.
        return {
            country: {'count': 1, 'probability': probability}
            for country in possible_countries
        }

    # combine multiple entries into a single count and probability per country
    combined = {}
    for country in {i[0] for i in new_probabilities}:
        probs = [i[1] for i in new_probabilities if i[0] == country]
        merged = probs[0]
        for extra in probs[1:]:
            merged = probabilities.independent_either_probability(merged, extra)
        combined[country] = {'count': len(probs), 'probability': merged}
    return combined