def test_negate_flag():
    """
    Test that a <NEGATE_FLAG> token is appended once per negation when negate_token is enabled
    """
    ## Test Statements
    statements = [
        "I do not want to go out tonight. Not again.",
        "I can not continue living this way",
        "I can not not wait until tomorrow!"
    ]
    ## Initialize Tokenizer
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["stopwords"] = set()
    init_params["negate_handling"] = True
    init_params["negate_token"] = True
    tokenizer = Tokenizer(**init_params)
    ## Check
    assert [tokenizer.tokenize(i) for i in statements] == \
        [['i', 'do', 'not_want', 'to', 'go', 'out', 'tonight', 'not_again', '<NEGATE_FLAG>', '<NEGATE_FLAG>'],
         ['i', 'can', 'not_continue', 'living', 'this', 'way', '<NEGATE_FLAG>'],
         ['i', 'can', 'not_not_wait', 'until', 'tomorrow', '<NEGATE_FLAG>', '<NEGATE_FLAG>']]

def test_keep_retweets():
    """
    Test preservation and removal of the retweet token
    """
    ## Test Statement
    test_statement = "RT: @Friend1 Time to get ready for school #Sucks #IDontWantToLearn"
    ## Initialize Tokenizer (Preserving Retweet)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_retweets"] = True
    init_params["keep_case"] = True
    tokenizer = Tokenizer(**init_params)
    assert tokenizer.tokenize(test_statement) == \
        ['<RETWEET>', '<USER_MENTION>', 'Time', 'get', 'ready', 'school', 'Sucks', 'IDontWantToLearn']
    ## Initialize Tokenizer (Preserving Retweet, Dropping Case)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_retweets"] = True
    init_params["keep_case"] = False
    tokenizer = Tokenizer(**init_params)
    assert tokenizer.tokenize(test_statement) == \
        ['<RETWEET>', '<USER_MENTION>', 'time', 'get', 'ready', 'school', 'sucks', 'idontwanttolearn']
    ## Initialize Tokenizer (Dropping Retweet)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_retweets"] = False
    init_params["keep_case"] = False
    tokenizer = Tokenizer(**init_params)
    assert tokenizer.tokenize(test_statement) == \
        ['<USER_MENTION>', 'time', 'get', 'ready', 'school', 'sucks', 'idontwanttolearn']

def test_expand_contractions():
    """
    Test contraction expansion and its interaction with negation handling
    """
    ## Initialize Tokenizer
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["expand_contractions"] = False
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    ## Test
    test_statement = "I can't wait to go to the movies later. Should've gone yesterday."
    assert tokenizer.tokenize(test_statement) == \
        ['i', 'can', 'not_wait', 'to', 'go', 'to', 'the', 'movies', 'later', "should've", 'gone', 'yesterday']
    ## Initialize Tokenizer Again (Without Negation Handling)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["expand_contractions"] = False
    init_params["negate_handling"] = False
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    assert tokenizer.tokenize(test_statement) == \
        ['i', "can't", 'wait', 'to', 'go', 'to', 'the', 'movies', 'later', "should've", 'gone', 'yesterday']
    ## Initialize One Last Time (With Expansion)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["expand_contractions"] = True
    init_params["negate_handling"] = False
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    assert tokenizer.tokenize(test_statement) == \
        ['i', 'can', 'not', 'wait', 'to', 'go', 'to', 'the', 'movies', 'later', 'should', 'have', 'gone', 'yesterday']

def test_keep_punctuation():
    """
    Test preservation of punctuation tokens
    """
    ## Initialize Tokenizer
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_punctuation"] = True
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    ## Test
    test_statement = "HOW CAN YOU NOT KNOW THAT!?!"
    assert tokenizer.tokenize(test_statement) == \
        ['how', 'can', 'you', 'not_know', 'that', '!?!']

def test_upper_flag():
    """
    Test that an <UPPER_FLAG> token is appended for fully upper-case statements
    """
    ## Initialize Tokenizer
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["upper_flag"] = True
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    ## Test
    test_statement = "HOW CAN YOU NOT KNOW THAT!"
    assert tokenizer.tokenize(test_statement) == \
        ['how', 'can', 'you', 'not_know', 'that', '<UPPER_FLAG>']

def test_keep_numbers():
    """
    Test that numbers are replaced by a <NUMERIC> token when preserved
    """
    ## Initialize Tokenizer
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_numbers"] = True
    init_params["keep_punctuation"] = False
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    ## Test
    test_statement = "HOW CAN YOU 2 NOT KNOW THAT in 2019!?!"
    assert tokenizer.tokenize(test_statement) == \
        ['how', 'can', 'you', '<NUMERIC>', 'not_know', 'that', 'in', '<NUMERIC>']

def test_filter_stopwords():
    """
    Test stopword removal using the default stopword set
    """
    ## Initialize Tokenizer
    tokenizer = Tokenizer(**DEFAULT_TOKENIZER_INIT)
    ## Test Statements
    statements = [
        "I can't wait to go to the movies later. 💔 me some Ashton Kucher!",
        "You have to be kidding me.",
        "OH NO! Not the dog!"
    ]
    expected_output = [
        ["not_wait", "go", "movies", "later", "💔", "ashton", "kucher"],
        ["kidding"],
        ["oh", "dog"]
    ]
    ## Check
    for s, e in zip(statements, expected_output):
        s_tokenized = tokenizer.tokenize(s)
        assert s_tokenized == e

def test_keep_case():
    """
    Test case preservation
    """
    ## Initialize Tokenizer
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_case"] = True
    tokenizer = Tokenizer(**init_params)
    ## Test Statements
    statements = [
        "I can't wait to go to the movies later. 💔 me some Ashton Kucher!",
        "You have to be kidding me.",
        "OH NO! Not the dog!"
    ]
    expected_output = [
        ["not_wait", "go", "movies", "later", "💔", "Ashton", "Kucher"],
        ["kidding"],
        ["OH", "dog"]
    ]
    ## Check
    for s, e in zip(statements, expected_output):
        s_tokenized = tokenizer.tokenize(s)
        assert s_tokenized == e

def test_negate_handling():
    """
    Test merging of negation terms with the tokens that follow them
    """
    ## Test Statements
    statements = [
        "I do not want to go out tonight",
        "I can not continue living this way",
        "I can not not wait until tomorrow!"
    ]
    ## Initialize Tokenizer Without Handling
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["stopwords"] = set()
    init_params["negate_handling"] = False
    tokenizer = Tokenizer(**init_params)
    ## Check
    assert [tokenizer.tokenize(i) for i in statements] == \
        [['i', 'do', 'not', 'want', 'to', 'go', 'out', 'tonight'],
         ['i', 'can', 'not', 'continue', 'living', 'this', 'way'],
         ['i', 'can', 'not', 'not', 'wait', 'until', 'tomorrow']]
    ## Initialize Tokenizer With Handling
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["stopwords"] = set()
    init_params["negate_handling"] = True
    tokenizer = Tokenizer(**init_params)
    assert [tokenizer.tokenize(i) for i in statements] == \
        [['i', 'do', 'not_want', 'to', 'go', 'out', 'tonight'],
         ['i', 'can', 'not_continue', 'living', 'this', 'way'],
         ['i', 'can', 'not_not_wait', 'until', 'tomorrow']]

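## The negation tests above expect "not" to be merged into the token that
## follows it ("not want" -> "not_want", "not not wait" -> "not_not_wait").
## The helper below is a minimal illustrative sketch of that merging step,
## assuming only the behavior visible in the expected outputs; it is not the
## Tokenizer's actual implementation.
def _example_merge_negations(tokens, negation_terms=("not",)):
    """
    Illustrative only: merge negation terms into the following token.
    """
    merged = []
    i = 0
    while i < len(tokens):
        if tokens[i].lower() in negation_terms and i + 1 < len(tokens):
            ## Chain consecutive negations (e.g. "not not wait" -> "not_not_wait")
            prefix = tokens[i]
            j = i + 1
            while j < len(tokens) and tokens[j].lower() in negation_terms:
                prefix = prefix + "_" + tokens[j]
                j += 1
            if j < len(tokens):
                merged.append(prefix + "_" + tokens[j])
            else:
                merged.append(prefix)
            i = j + 1
        else:
            merged.append(tokens[i])
            i += 1
    return merged
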
def test_keep_url():
    """
    Test URL replacement and removal
    """
    ## Test Statements
    test_statements = [
        "Just found a really cool website to help with transportation http://ts.jhu.edu/Shuttles/",
        "Just found a really cool website to help with transportation ts.jhu.edu/Shuttles/"
    ]
    ## Initialize Tokenizer (Preserving URL)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_url"] = True
    tokenizer = Tokenizer(**init_params)
    ## Check
    for t in test_statements:
        assert tokenizer.tokenize(t) == \
            ['found', 'really', 'cool', 'website', 'help', 'transportation', '<URL_TOKEN>']
    ## Initialize Tokenizer (Dropping URL)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_url"] = False
    tokenizer = Tokenizer(**init_params)
    ## Check
    for t in test_statements:
        assert tokenizer.tokenize(t) == \
            ['found', 'really', 'cool', 'website', 'help', 'transportation']

def test_keep_user_mentions():
    """
    Test preservation and removal of user mentions (Twitter and Reddit formats)
    """
    ## Test Statements
    test_statements = [
        "Going to the movies later with @Friend1.",  # Twitter
        "Calling u/Friend1 to chime in here."  # Reddit
    ]
    ## Initialize Tokenizer (Dropping User Mentions)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_user_mentions"] = False
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    assert [tokenizer.tokenize(i) for i in test_statements] == \
        [['going', 'to', 'the', 'movies', 'later', 'with'],
         ['calling', 'to', 'chime', 'in', 'here']]
    ## Initialize Tokenizer (Keeping User Mentions)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_user_mentions"] = True
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    assert [tokenizer.tokenize(i) for i in test_statements] == \
        [['going', 'to', 'the', 'movies', 'later', 'with', '<USER_MENTION>'],
         ['calling', '<USER_MENTION>', 'to', 'chime', 'in', 'here']]

def test_keep_hashtag():
    """
    Test preservation and removal of hashtags
    """
    ## Test Statement
    test_statement = "Time to get ready for school #Sucks #IDontWantToLearn"
    ## Initialize Tokenizer (Preserving Hashtags)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_hashtags"] = True
    init_params["keep_case"] = True
    tokenizer = Tokenizer(**init_params)
    ## Check
    assert tokenizer.tokenize(test_statement) == \
        ['Time', 'get', 'ready', 'school', 'Sucks', 'IDontWantToLearn']
    ## Initialize Tokenizer (Dropping Hashtags)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_hashtags"] = False
    init_params["keep_case"] = True
    tokenizer = Tokenizer(**init_params)
    ## Check
    assert tokenizer.tokenize(test_statement) == \
        ['Time', 'get', 'ready', 'school']

def test_emoji_handling():
    """
    Test emoji handling options (default, replace, strip, invalid argument)
    """
    ## Sample Text
    text = 'RT @lav09rO5KgJS: Tell em J.T. ! 😂😍http://t.co/Tc_qbFYmFYm'
    ## Test 1 (No Special Handling)
    tokenizer = Tokenizer(**DEFAULT_TOKENIZER_INIT)
    tokens_no_handle = tokenizer.tokenize(text)
    assert tokens_no_handle == \
        ['<USER_MENTION>', 'tell', 'em', 'j.t.', '😂', '😍', '<URL_TOKEN>']
    ## Test 2 (Replace)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["emoji_handling"] = "replace"
    tokenizer = Tokenizer(**init_params)
    tokens_replace = tokenizer.tokenize(text)
    assert tokens_replace == \
        ['<USER_MENTION>', 'tell', 'em', 'j.t.', '<EMOJI>', '<EMOJI>', '<URL_TOKEN>']
    ## Test 3 (Strip)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["emoji_handling"] = "strip"
    tokenizer = Tokenizer(**init_params)
    tokens_strip = tokenizer.tokenize(text)
    assert tokens_strip == \
        ['<USER_MENTION>', 'tell', 'em', 'j.t.', '<URL_TOKEN>']
    ## Test 4 (Error)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["emoji_handling"] = "FAKE_ARGUMENT"
    tokenizer = Tokenizer(**init_params)
    with pytest.raises(ValueError):
        _ = tokenizer.tokenize(text)

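## The tests in this module share a DEFAULT_TOKENIZER_INIT dictionary of
## keyword arguments defined elsewhere in the project. The dictionary below is
## a hypothetical sketch of its shape only: the keys mirror the Tokenizer
## parameters exercised in these tests, while the values are assumptions and
## not necessarily the project's actual defaults.
EXAMPLE_TOKENIZER_INIT = {
    "stopwords": {"i", "to", "the", "me"},  # assumption: placeholder subset; the real default set is larger
    "keep_case": False,
    "negate_handling": True,
    "negate_token": False,
    "upper_flag": False,
    "keep_punctuation": False,
    "keep_numbers": False,
    "expand_contractions": True,  # assumption
    "keep_user_mentions": True,
    "keep_pronouns": True,        # assumption
    "keep_url": True,
    "keep_hashtags": True,        # assumption
    "keep_retweets": False,
    "emoji_handling": None,       # assumption: emojis kept as-is by default
}
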
class LocationExtractor(object):

    """
    Location Extractor. Find locations in free-text strings
    """

    def __init__(self):
        """
        Location Extractor. Find locations in free-text strings

        Args:
            None
        """
        ## Class Initialization
        self._initialize_class_resources()
        ## Compile Dictionaries
        self._compile_gazeteer()
        self._compile_abbreviations()
        self._compile_geolocation_hierarchy()

    def __repr__(self):
        """
        Return clean human-readable name of class.

        Args:
            None

        Returns:
            desc (str): Description of the class
        """
        return "LocationExtractor()"

    def _initialize_class_resources(self):
        """
        Initialize resources and methods useful for identifying location
        strings in free text. References global objects.

        Args:
            None

        Returns:
            None
        """
        ## Geo Resources
        self._geo_resources = geo_resources.copy()
        ## Abbreviations
        self._geo_abbr = geo_abbreviations.copy()
        ## Affixes
        self._geo_affixes = geo_affixes.copy()
        ## Common Words
        self._common_words = set(coca_words) | \
                             set(nltk_stopwords) | \
                             set(ignore_words)
        ## Tokenizer
        self.tokenizer = Tokenizer(stopwords=None,
                                   keep_case=True,
                                   negate_handling=False,
                                   negate_token=False,
                                   upper_flag=False,
                                   keep_punctuation=True,
                                   keep_numbers=False,
                                   expand_contractions=False,
                                   keep_user_mentions=False,
                                   keep_pronouns=True,
                                   keep_url=False,
                                   keep_hashtags=False,
                                   keep_retweets=False,
                                   emoji_handling="strip")

    def _compile_gazeteer(self):
        """
        Combine strings into a single gazetteer

        Args:
            None

        Returns:
            None
        """
        ## Initialize Gazetteer
        self.gazeteer = set()
        ## Add City Names
        self.gazeteer.update(
            self._geo_resources["city_ascii"].map(_safe_decode).str.lower())
        ## Add County Names
        self.gazeteer.update(
            self._geo_resources["county_ascii"].map(_safe_decode).dropna().str.lower())
        ## Add State Names
        self.gazeteer.update(
            self._geo_resources["state_ascii"].map(_safe_decode).dropna().str.lower())
        ## Add Country Names
        self.gazeteer.update(
            self._geo_resources["country"].map(_safe_decode).dropna().str.lower())
        ## Add Continent Names
        self.gazeteer.update(
            self._geo_resources["continent"].map(_safe_decode).dropna().str.lower())
        ## Filter Out Common Words
        for cw in self._common_words:
            if cw in self.gazeteer:
                self.gazeteer.remove(cw)
        ## Filter Out Small Words
        self.gazeteer = set([i for i in self.gazeteer if len(i) > 3])

    def _compile_abbreviations(self):
        """
        Create a mapping between abbreviations and their full name

        Args:
            None

        Returns:
            None
        """
        self.abbr_map = dict(
            (y, x) for _, (x, y) in self._geo_abbr[["name", "abbreviation"]].iterrows())

    def _compile_geolocation_hierarchy(self):
        """
        Create a dictionary that maps location strings to other location
        strings at higher geographic levels (e.g. city -> state, country)

        Args:
            None

        Returns:
            None
        """
        self.geo_hierarchy = {
            "city": {},
            "county": {},
            "state": {},
            "country": {}
        }
        ## Country -> Continent
        for _, (country, continent) in self._geo_resources[["country", "continent"]].iterrows():
            self.geo_hierarchy["country"][country.lower()] = set([continent.lower()])
        ## State -> Country, Continent
        for _, (state, country, continent) in self._geo_resources[[
                "state_ascii", "country", "continent"]].iterrows():
            if pd.isnull(state):
                continue
            if state.lower() not in self.geo_hierarchy["state"]:
                self.geo_hierarchy["state"][state.lower()] = set()
            self.geo_hierarchy["state"][state.lower()].add(country.lower())
            self.geo_hierarchy["state"][state.lower()].add(continent.lower())
        ## County -> State, Country, Continent
        for _, (county, state, country, continent) in self._geo_resources[[
                "county_ascii", "state_ascii", "country", "continent"]].iterrows():
            if pd.isnull(county):
                continue
            if county.lower() not in self.geo_hierarchy["county"]:
                self.geo_hierarchy["county"][county.lower()] = set()
            if not pd.isnull(state):
                self.geo_hierarchy["county"][county.lower()].add(state.lower())
            self.geo_hierarchy["county"][county.lower()].add(country.lower())
            self.geo_hierarchy["county"][county.lower()].add(continent.lower())
        ## City -> County, State, Country, Continent
        for _, (city, county, state, country, continent) in self._geo_resources[[
                "city_ascii", "county_ascii", "state_ascii", "country", "continent"]].iterrows():
            ca = city.lower()
            if ca not in self.geo_hierarchy["city"]:
                self.geo_hierarchy["city"][ca] = set()
            if not pd.isnull(county):
                self.geo_hierarchy["city"][ca].add(county.lower())
            if not pd.isnull(state):
                self.geo_hierarchy["city"][ca].add(state.lower())
            self.geo_hierarchy["city"][ca].add(country.lower())
            self.geo_hierarchy["city"][ca].add(continent.lower())

    def _filter_out_substrings(self, strings):
        """
        Filter out strings in a list which are substrings of another item
        in the list

        Args:
            strings (list): List of strings to filter

        Returns:
            filtered_strings (list): List of strings without substrings
        """
        strings = sorted(set(strings), key=lambda x: len(x))
        filtered_strings = []
        n = len(strings)
        for i in range(n):
            matches_ahead = False
            str_i = strings[i]
            for j in range(i + 1, n):
                if str_i in strings[j]:
                    matches_ahead = True
                    break
            if not matches_ahead:
                filtered_strings.append(str_i)
        return filtered_strings

    def _look_for_exact_match(self, tokens):
        """
        Create n-grams of a token list and find all that match to our
        gazetteer.

        Args:
            tokens (list): List of tokens

        Returns:
            matches_filtered (list): List of string matches to the gazetteer
        """
        ## Get Lowercase N-Grams
        ngrams = get_ngrams([t.lower() for t in tokens], 1, 4)
        ngrams = list(map(lambda n: " ".join(list(n)), ngrams))
        ## Identify Exact Matches
        matches = []
        for n in ngrams:
            if n in self.gazeteer:
                matches.append(n)
        ## Remove Substrings
        matches_filtered = self._filter_out_substrings(matches)
        return matches_filtered

    def _combine_syntax_matches(self, matches):
        """
        Combine syntax-based location matches that occur next to each other

        Args:
            matches (list): List of syntax-based location matches

        Returns:
            combined_syntax_matches (list): Combined location strings
        """
        n_matches = len(matches)
        if n_matches == 1:
            return matches
        combined_syntax_matches = []
        j = 0
        while j < n_matches - 1:
            match_j = matches[j].split(", ")
            k = j + 1
            while k < n_matches and matches[k].split(", ")[0] == match_j[-1]:
                match_j.append(matches[k].split(", ")[1])
                k += 1
            j = k
            combined_syntax_matches.append(", ".join(match_j))
        return combined_syntax_matches

    def _look_for_syntax_match(self, tokens, window=4):
        """
        Look for span of tokens that reflects a standard syntax location
        reference (e.g. City, State, Country)

        Args:
            tokens (list): List of tokens in a sentence
            window (int): How many tokens to allow to left and right when
                          checking for start/end of match

        Returns:
            syntax_matches (list): List of possible syntax matches
        """
        n = len(tokens)
        j = 1
        syntax_matches = []
        while j < n - 1:
            if tokens[j] != ",":
                j += 1
            else:
                before_window = []
                after_window = []
                of_cache = ""
                ## Walk backwards from the comma to find the start of the candidate span
                for b in tokens[max(0, j - window):j][::-1]:
                    if (b.istitle() and not b.startswith(":")) or b.lower() == "of":
                        if b.lower() not in self._common_words:
                            before_window.append(b + of_cache)
                            of_cache = ""
                        elif b.lower() in self._geo_affixes["suffix"] and len(before_window) == 0:
                            before_window.append(b + of_cache)
                            of_cache = ""
                        elif b.lower() in self._geo_affixes["prefix"] and len(before_window) > 0:
                            before_window.append(b + of_cache)
                            of_cache = ""
                        elif b.lower() == "of":
                            of_cache += " of"
                        else:
                            of_cache = ""
                            break
                    else:
                        of_cache = ""
                        break
                before_window = before_window[::-1]
                if all(b.lower() in self._common_words or
                       b.lower() in self._geo_affixes["suffix"] or
                       b.lower() in self._geo_affixes["prefix"] for b in before_window):
                    before_window = []
                of_cache = ""
                ## Walk forwards from the comma to find the end of the candidate span
                for a in tokens[j + 1:min(j + 1 + window, n)]:
                    if (a.istitle() and not a.startswith(":")) or (a.lower() == "of"):
                        if a.lower() not in self._common_words:
                            after_window.append(of_cache + a)
                            of_cache = ""
                        elif a.lower() in self._geo_affixes["prefix"] and len(after_window) == 0:
                            after_window.append(of_cache + a)
                            of_cache = ""
                        elif a.lower() in self._geo_affixes["suffix"] and len(after_window) > 0:
                            after_window.append(of_cache + a)
                            of_cache = ""
                        elif a.lower() == "of":
                            of_cache += "of "
                        else:
                            of_cache = ""
                            break
                    else:
                        of_cache = ""
                        break
                if all(a.lower() in self._common_words or
                       a.lower() in self._geo_affixes["suffix"] or
                       a.lower() in self._geo_affixes["prefix"] for a in after_window):
                    after_window = []
                if len(before_window) == 0 or len(after_window) == 0:
                    j += 1
                else:
                    same_level_match = False
                    for level in ["state", "country"]:
                        if any(b.lower() in self.geo_hierarchy[level] for b in before_window) and \
                           any(a.lower() in self.geo_hierarchy[level] for a in after_window):
                            same_level_match = True
                    if not same_level_match:
                        before_window_comb = " ".join(before_window)
                        after_window_comb = " ".join(after_window)
                        if before_window_comb.lower() in self.gazeteer or \
                           after_window_comb.lower() in self.gazeteer:
                            syntax_matches.append(before_window_comb + ", " + after_window_comb)
                    j += len(after_window)
        ## Normalize Case
        syntax_matches = [s.lower() for s in syntax_matches]
        ## Combine Syntax Matches
        syntax_matches = self._combine_syntax_matches(syntax_matches)
        return syntax_matches

    def _expand_abbreviations(self, tokens):
        """
        Transform abbreviations into their full name

        Args:
            tokens (list): List of tokens in a sentence

        Returns:
            expanded_tokens (list): Original list of tokens with any matched
                                    abbreviations expanded to their full name
        """
        expanded_tokens = [tokens[0]]
        for i in range(1, len(tokens)):
            if tokens[i - 1] == "," and tokens[i].replace(".", "") in self.abbr_map:
                expanded_tokens.append(self.abbr_map[tokens[i].replace(".", "")])
            else:
                expanded_tokens.append(tokens[i])
        expanded_tokens = flatten(i.split(" ") for i in expanded_tokens)
        return expanded_tokens

    def _find_sub_list(self, sl, l):
        """
        Find lists within lists

        Args:
            sl (list): List to look for
            l (list): Large list of items to search within

        Returns:
            results (list): Start, end index spans of matches
        """
        results = []
        sll = len(sl)
        for ind in (i for i, e in enumerate(l) if e == sl[0]):
            if l[ind:ind + sll] == sl:
                results.append((ind, ind + sll - 1))
        return results

    def _append_affixes(self, tokens, matches):
        """
        For any location string matches, look for modifiers before and after
        in the affix dictionary

        Args:
            tokens (list): List of tokens in a sentence
            matches (list): List of existing location-matches

        Returns:
            affixed_matches (list): List of matches with any affixes
                                    identified in the token list
        """
        n = len(tokens)
        tokens_lower = list(map(lambda i: i.lower(), tokens))
        tokens_lower = flatten([i.split(" ") for i in tokens_lower])
        affixed_matches = []
        for m in matches:
            ## Re-Tokenize Match
            m_toks = self.tokenizer.tokenize(m)
            ## Look For Span Matches
            m_spans = self._find_sub_list(m_toks, tokens_lower)
            ## Check Spans for Affixes
            for span_start, span_end in m_spans:
                affixed_span = m_toks.copy()
                if span_start > 0:
                    if tokens[span_start - 1].istitle() and \
                       tokens[span_start - 1].lower() in self._geo_affixes["prefix"]:
                        affixed_span = tokens_lower[span_start - 1:span_start] + affixed_span
                if span_end < n - 2:
                    if tokens[span_end + 1].istitle() and \
                       tokens[span_end + 1].lower() in self._geo_affixes["suffix"]:
                        affixed_span = affixed_span + tokens_lower[span_end + 1:span_end + 2]
                affixed_matches.append(" ".join(affixed_span).replace(" ,", ","))
        return affixed_matches

    def _find_locations(self, sent):
        """
        Find location mentions within a sentence

        Args:
            sent (str): Free-form text sentence

        Returns:
            affixed_matches (list): List of identified location matches
        """
        ## Tokenize Sentence
        tokens = self.tokenizer.tokenize(sent)
        if len(tokens) == 0:
            return []
        ## Expand Abbreviations
        tokens = self._expand_abbreviations(tokens)
        ## Look for Exact Matches
        exact_matches = self._look_for_exact_match(tokens)
        ## Look for Syntax Matches
        syntax_matches = self._look_for_syntax_match(tokens)
        ## Consolidate Matches
        combined_matches = self._filter_out_substrings(syntax_matches + exact_matches)
        ## Append Affixes
        affixed_matches = self._append_affixes(tokens, combined_matches)
        return affixed_matches

    def _merge_overlap(self, matches):
        """
        Combine location matches that share a string
        (e.g. "los angeles, california" and "california, US")

        Args:
            matches (list): List of matched location strings

        Returns:
            merged_matches (list): Combined match list
        """
        merged_matches = []
        seen = set()
        for m, match in enumerate(matches):
            match_split = match.split(", ")
            match_set = [match]
            seen.add(m)
            for m2, match2 in enumerate(matches):
                if m == m2 or m2 in seen:
                    continue
                match2_split = match2.split(", ")
                if set(match_split) & set(match2_split) != set():
                    match_set.append(match2)
                    seen.add(m2)
            match_set = self._combine_syntax_matches(match_set)
            merged_matches.extend(match_set)
        return self._filter_out_substrings(merged_matches)

    def _combine_using_hierarchy(self, matches):
        """
        Combine string location matches based on the gazetteer geographic
        hierarchy (e.g. city, state, country, continent)

        Args:
            matches (list): List of proposed string location matches

        Returns:
            reg_matches (list): Matches, combining any that fall in the same
                                location hierarchy
        """
        combined_matches = []
        reg_matches = []
        for m, match in enumerate(matches):
            match_split = match.split(", ")
            match_level = None
            proposed_match = None
            for m2, match2 in enumerate(matches):
                if m == m2:
                    continue
                match2_split = match2.split(", ")
                levels = ["city", "county", "state", "country"]
                for tl, topy_level in enumerate(levels):
                    if match_split[-1] in self.geo_hierarchy[topy_level]:
                        if match2_split[0] in self.geo_hierarchy[topy_level][match_split[-1]]:
                            if match_level is None or topy_level in levels[:match_level]:
                                proposed_match = (m, m2)
                                match_level = tl
            if proposed_match is None:
                reg_matches.append(match)
            else:
                combined_matches.append((proposed_match, match_level))
        combined_matches = sorted(combined_matches, key=lambda x: x[1])
        for (ml, mr), _ in combined_matches:
            reg_matches.append(matches[ml] + ", " + matches[mr])
        ## Filter Duplicates
        reg_matches = self._filter_out_substrings(reg_matches)
        ## Merge Overlap
        reg_matches = self._merge_overlap(reg_matches)
        return reg_matches

    def _is_all_commons(self, match):
        """
        Helper noting if all tokens in a matched hierarchy are made of
        common words

        Args:
            match (str): Proposed location match

        Returns:
            is_match (bool): Whether all tokens in the match are common words
        """
        if all(m in self._common_words for m in match.replace(", ", "").split()):
            return True
        return False

    def find_locations(self, text):
        """
        Find locations within a string of text

        Args:
            text (str): Input free-form text (potentially a paragraph)

        Returns:
            locations (list): List of recognized location strings
        """
        ## Translate to Ascii
        text = unidecode(text)
        ## Split Up Sentences
        sentences = sent_tokenize(text)
        ## Look for Locations
        locations = []
        for sent in sentences:
            sent_locs = self._find_locations(sent)
            locations.extend(sent_locs)
        ## Combine Using Hierarchy
        locations = self._combine_using_hierarchy(locations)
        ## Filter Commons
        locations = [l for l in locations if not self._is_all_commons(l)]
        return locations