def test_negate_flag():
    """
    Test that a <NEGATE_FLAG> token is appended once per negation when negate_token is enabled
    """
    ## Test Statements
    statements = [
        "I do not want to go out tonight. Not again.",
        "I can not continue living this way",
        "I can not not wait until tomorrow!"
    ]
    ## Initialize Tokenizer
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["stopwords"] = set()
    init_params["negate_handling"] = True
    init_params["negate_token"] = True
    tokenizer = Tokenizer(**init_params)
    ## Check
    assert [tokenizer.tokenize(i) for i in statements] == \
        [['i', 'do', 'not_want', 'to', 'go', 'out', 'tonight', 'not_again', '<NEGATE_FLAG>', '<NEGATE_FLAG>'],
         ['i', 'can', 'not_continue', 'living', 'this', 'way', '<NEGATE_FLAG>'],
         ['i', 'can', 'not_not_wait', 'until', 'tomorrow', '<NEGATE_FLAG>', '<NEGATE_FLAG>']]

def test_keep_retweets():
    """
    Test preservation and removal of the retweet token
    """
    ## Test Statement
    test_statement = "RT: @Friend1 Time to get ready for school #Sucks #IDontWantToLearn"
    ## Initialize Tokenizer (Preserving Retweet)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_retweets"] = True
    init_params["keep_case"] = True
    tokenizer = Tokenizer(**init_params)
    assert tokenizer.tokenize(test_statement) == \
        ['<RETWEET>', '<USER_MENTION>', 'Time', 'get', 'ready', 'school', 'Sucks', 'IDontWantToLearn']
    ## Initialize Tokenizer (Preserving Retweet, Dropping Case)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_retweets"] = True
    init_params["keep_case"] = False
    tokenizer = Tokenizer(**init_params)
    assert tokenizer.tokenize(test_statement) == \
        ['<RETWEET>', '<USER_MENTION>', 'time', 'get', 'ready', 'school', 'sucks', 'idontwanttolearn']
    ## Initialize Tokenizer (Dropping Retweet)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_retweets"] = False
    init_params["keep_case"] = False
    tokenizer = Tokenizer(**init_params)
    assert tokenizer.tokenize(test_statement) == \
        ['<USER_MENTION>', 'time', 'get', 'ready', 'school', 'sucks', 'idontwanttolearn']

def test_expand_contractions():
    """
    Test contraction expansion and its interaction with negation handling
    """
    ## Initialize Tokenizer
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["expand_contractions"] = False
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    ## Test
    test_statement = "I can't wait to go to the movies later. Should've gone yesterday."
    assert tokenizer.tokenize(test_statement) == \
        ['i', 'can', 'not_wait', 'to', 'go', 'to', 'the', 'movies', 'later', "should've", 'gone', 'yesterday']
    ## Initialize Tokenizer Again (Without Negation Handling)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["expand_contractions"] = False
    init_params["negate_handling"] = False
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    assert tokenizer.tokenize(test_statement) == \
        ['i', "can't", 'wait', 'to', 'go', 'to', 'the', 'movies', 'later', "should've", 'gone', 'yesterday']
    ## Initialize One Last Time (With Expansion)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["expand_contractions"] = True
    init_params["negate_handling"] = False
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    assert tokenizer.tokenize(test_statement) == \
        ['i', 'can', 'not', 'wait', 'to', 'go', 'to', 'the', 'movies', 'later', 'should', 'have', 'gone', 'yesterday']

def test_keep_punctuation():
    """
    Test preservation of punctuation tokens
    """
    ## Initialize Tokenizer
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_punctuation"] = True
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    ## Test
    test_statement = "HOW CAN YOU NOT KNOW THAT!?!"
    assert tokenizer.tokenize(test_statement) == \
        ['how', 'can', 'you', 'not_know', 'that', '!?!']

def test_upper_flag():
    """
    Test that an <UPPER_FLAG> token is appended for fully upper-case statements
    """
    ## Initialize Tokenizer
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["upper_flag"] = True
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    ## Test
    test_statement = "HOW CAN YOU NOT KNOW THAT!"
    assert tokenizer.tokenize(test_statement) == \
        ['how', 'can', 'you', 'not_know', 'that', '<UPPER_FLAG>']

def test_keep_numbers():
    """
    Test that numbers are replaced by a <NUMERIC> token when preserved
    """
    ## Initialize Tokenizer
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_numbers"] = True
    init_params["keep_punctuation"] = False
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    ## Test
    test_statement = "HOW CAN YOU 2 NOT KNOW THAT in 2019!?!"
    assert tokenizer.tokenize(test_statement) == \
        ['how', 'can', 'you', '<NUMERIC>', 'not_know', 'that', 'in', '<NUMERIC>']

def test_filter_stopwords():
    """
    Test stopword removal using the default stopword set
    """
    ## Initialize Tokenizer
    tokenizer = Tokenizer(**DEFAULT_TOKENIZER_INIT)
    ## Test Statements
    statements = [
        "I can't wait to go to the movies later. 💔 me some Ashton Kucher!",
        "You have to be kidding me.",
        "OH NO! Not the dog!"
    ]
    expected_output = [
        ["not_wait", "go", "movies", "later", "💔", "ashton", "kucher"],
        ["kidding"],
        ["oh", "dog"]
    ]
    ## Check
    for s, e in zip(statements, expected_output):
        s_tokenized = tokenizer.tokenize(s)
        assert s_tokenized == e

def test_keep_case():
    """
    Test case preservation
    """
    ## Initialize Tokenizer
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_case"] = True
    tokenizer = Tokenizer(**init_params)
    ## Test Statements
    statements = [
        "I can't wait to go to the movies later. 💔 me some Ashton Kucher!",
        "You have to be kidding me.",
        "OH NO! Not the dog!"
    ]
    expected_output = [
        ["not_wait", "go", "movies", "later", "💔", "Ashton", "Kucher"],
        ["kidding"],
        ["OH", "dog"]
    ]
    ## Check
    for s, e in zip(statements, expected_output):
        s_tokenized = tokenizer.tokenize(s)
        assert s_tokenized == e

def test_negate_handling():
    """
    Test merging of negation terms with the tokens that follow them
    """
    ## Test Statements
    statements = [
        "I do not want to go out tonight",
        "I can not continue living this way",
        "I can not not wait until tomorrow!"
    ]
    ## Initialize Tokenizer Without Handling
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["stopwords"] = set()
    init_params["negate_handling"] = False
    tokenizer = Tokenizer(**init_params)
    ## Check
    assert [tokenizer.tokenize(i) for i in statements] == \
        [['i', 'do', 'not', 'want', 'to', 'go', 'out', 'tonight'],
         ['i', 'can', 'not', 'continue', 'living', 'this', 'way'],
         ['i', 'can', 'not', 'not', 'wait', 'until', 'tomorrow']]
    ## Initialize Tokenizer With Handling
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["stopwords"] = set()
    init_params["negate_handling"] = True
    tokenizer = Tokenizer(**init_params)
    assert [tokenizer.tokenize(i) for i in statements] == \
        [['i', 'do', 'not_want', 'to', 'go', 'out', 'tonight'],
         ['i', 'can', 'not_continue', 'living', 'this', 'way'],
         ['i', 'can', 'not_not_wait', 'until', 'tomorrow']]

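## The negation tests above expect "not" to be merged into the token that
## follows it ("not want" -> "not_want", "not not wait" -> "not_not_wait").
## The helper below is a minimal illustrative sketch of that merging step,
## assuming only the behavior visible in the expected outputs; it is not the
## Tokenizer's actual implementation.
def _example_merge_negations(tokens, negation_terms=("not",)):
    """
    Illustrative only: merge negation terms into the following token.
    """
    merged = []
    i = 0
    while i < len(tokens):
        if tokens[i].lower() in negation_terms and i + 1 < len(tokens):
            ## Chain consecutive negations (e.g. "not not wait" -> "not_not_wait")
            prefix = tokens[i]
            j = i + 1
            while j < len(tokens) and tokens[j].lower() in negation_terms:
                prefix = prefix + "_" + tokens[j]
                j += 1
            if j < len(tokens):
                merged.append(prefix + "_" + tokens[j])
            else:
                merged.append(prefix)
            i = j + 1
        else:
            merged.append(tokens[i])
            i += 1
    return merged
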
def test_keep_url():
    """
    Test URL replacement and removal
    """
    ## Test Statements
    test_statements = [
        "Just found a really cool website to help with transportation http://ts.jhu.edu/Shuttles/",
        "Just found a really cool website to help with transportation ts.jhu.edu/Shuttles/"
    ]
    ## Initialize Tokenizer (Preserving URL)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_url"] = True
    tokenizer = Tokenizer(**init_params)
    ## Check
    for t in test_statements:
        assert tokenizer.tokenize(t) == \
            ['found', 'really', 'cool', 'website', 'help', 'transportation', '<URL_TOKEN>']
    ## Initialize Tokenizer (Dropping URL)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_url"] = False
    tokenizer = Tokenizer(**init_params)
    ## Check
    for t in test_statements:
        assert tokenizer.tokenize(t) == \
            ['found', 'really', 'cool', 'website', 'help', 'transportation']

def test_keep_user_mentions():
    """
    Test preservation and removal of user mentions (Twitter and Reddit formats)
    """
    ## Test Statements
    test_statements = [
        "Going to the movies later with @Friend1.",  # Twitter
        "Calling u/Friend1 to chime in here."  # Reddit
    ]
    ## Initialize Tokenizer (Dropping User Mentions)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_user_mentions"] = False
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    assert [tokenizer.tokenize(i) for i in test_statements] == \
        [['going', 'to', 'the', 'movies', 'later', 'with'],
         ['calling', 'to', 'chime', 'in', 'here']]
    ## Initialize Tokenizer (Keeping User Mentions)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_user_mentions"] = True
    init_params["stopwords"] = set()
    tokenizer = Tokenizer(**init_params)
    assert [tokenizer.tokenize(i) for i in test_statements] == \
        [['going', 'to', 'the', 'movies', 'later', 'with', '<USER_MENTION>'],
         ['calling', '<USER_MENTION>', 'to', 'chime', 'in', 'here']]

def test_keep_hashtag():
    """
    Test preservation and removal of hashtags
    """
    ## Test Statement
    test_statement = "Time to get ready for school #Sucks #IDontWantToLearn"
    ## Initialize Tokenizer (Preserving Hashtags)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_hashtags"] = True
    init_params["keep_case"] = True
    tokenizer = Tokenizer(**init_params)
    ## Check
    assert tokenizer.tokenize(test_statement) == \
        ['Time', 'get', 'ready', 'school', 'Sucks', 'IDontWantToLearn']
    ## Initialize Tokenizer (Dropping Hashtags)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["keep_hashtags"] = False
    init_params["keep_case"] = True
    tokenizer = Tokenizer(**init_params)
    ## Check
    assert tokenizer.tokenize(test_statement) == \
        ['Time', 'get', 'ready', 'school']

def test_emoji_handling():
    """
    Test emoji handling options (default, replace, strip, invalid argument)
    """
    ## Sample Text
    text = 'RT @lav09rO5KgJS: Tell em J.T. ! 😂😍http://t.co/Tc_qbFYmFYm'
    ## Test 1 (No Special Handling)
    tokenizer = Tokenizer(**DEFAULT_TOKENIZER_INIT)
    tokens_no_handle = tokenizer.tokenize(text)
    assert tokens_no_handle == \
        ['<USER_MENTION>', 'tell', 'em', 'j.t.', '😂', '😍', '<URL_TOKEN>']
    ## Test 2 (Replace)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["emoji_handling"] = "replace"
    tokenizer = Tokenizer(**init_params)
    tokens_replace = tokenizer.tokenize(text)
    assert tokens_replace == \
        ['<USER_MENTION>', 'tell', 'em', 'j.t.', '<EMOJI>', '<EMOJI>', '<URL_TOKEN>']
    ## Test 3 (Strip)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["emoji_handling"] = "strip"
    tokenizer = Tokenizer(**init_params)
    tokens_strip = tokenizer.tokenize(text)
    assert tokens_strip == \
        ['<USER_MENTION>', 'tell', 'em', 'j.t.', '<URL_TOKEN>']
    ## Test 4 (Error)
    init_params = DEFAULT_TOKENIZER_INIT.copy()
    init_params["emoji_handling"] = "FAKE_ARGUMENT"
    tokenizer = Tokenizer(**init_params)
    with pytest.raises(ValueError):
        _ = tokenizer.tokenize(text)

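## The tests in this module share a DEFAULT_TOKENIZER_INIT dictionary of
## keyword arguments defined elsewhere in the project. The dictionary below is
## a hypothetical sketch of its shape only: the keys mirror the Tokenizer
## parameters exercised in these tests, while the values are assumptions and
## not necessarily the project's actual defaults.
EXAMPLE_TOKENIZER_INIT = {
    "stopwords": {"i", "to", "the", "me"},  # assumption: placeholder subset; the real default set is larger
    "keep_case": False,
    "negate_handling": True,
    "negate_token": False,
    "upper_flag": False,
    "keep_punctuation": False,
    "keep_numbers": False,
    "expand_contractions": True,  # assumption
    "keep_user_mentions": True,
    "keep_pronouns": True,        # assumption
    "keep_url": True,
    "keep_hashtags": True,        # assumption
    "keep_retweets": False,
    "emoji_handling": None,       # assumption: emojis kept as-is by default
}
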
class LocationExtractor(object):

    """
    Location Extractor. Find locations in free-text strings
    """

    def __init__(self):
        """
        Location Extractor. Find locations in free-text strings

        Args:
            None
        """
        ## Class Initialization
        self._initialize_class_resources()
        ## Compile Dictionaries
        self._compile_gazeteer()
        self._compile_abbreviations()
        self._compile_geolocation_hierarchy()

    def __repr__(self):
        """
        Return clean human-readable name of class.

        Args:
            None

        Returns:
            desc (str): Description of the class
        """
        return "LocationExtractor()"

    def _initialize_class_resources(self):
        """
        Initialize resources and methods useful for identifying location
        strings in free text. References global objects.

        Args:
            None

        Returns:
            None
        """
        ## Geo Resources
        self._geo_resources = geo_resources.copy()
        ## Abbreviations
        self._geo_abbr = geo_abbreviations.copy()
        ## Affixes
        self._geo_affixes = geo_affixes.copy()
        ## Common Words
        self._common_words = set(coca_words) | \
                             set(nltk_stopwords) | \
                             set(ignore_words)
        ## Tokenizer
        self.tokenizer = Tokenizer(stopwords=None,
                                   keep_case=True,
                                   negate_handling=False,
                                   negate_token=False,
                                   upper_flag=False,
                                   keep_punctuation=True,
                                   keep_numbers=False,
                                   expand_contractions=False,
                                   keep_user_mentions=False,
                                   keep_pronouns=True,
                                   keep_url=False,
                                   keep_hashtags=False,
                                   keep_retweets=False,
                                   emoji_handling="strip")

    def _compile_gazeteer(self):
        """
        Combine strings into a single gazetteer

        Args:
            None

        Returns:
            None
        """
        ## Initialize Gazetteer
        self.gazeteer = set()
        ## Add City Names
        self.gazeteer.update(
            self._geo_resources["city_ascii"].map(_safe_decode).str.lower())
        ## Add County Names
        self.gazeteer.update(
            self._geo_resources["county_ascii"].map(_safe_decode).dropna().str.lower())
        ## Add State Names
        self.gazeteer.update(
            self._geo_resources["state_ascii"].map(_safe_decode).dropna().str.lower())
        ## Add Country Names
        self.gazeteer.update(
            self._geo_resources["country"].map(_safe_decode).dropna().str.lower())
        ## Add Continent Names
        self.gazeteer.update(
            self._geo_resources["continent"].map(_safe_decode).dropna().str.lower())
        ## Filter Out Common Words
        for cw in self._common_words:
            if cw in self.gazeteer:
                self.gazeteer.remove(cw)
        ## Filter Out Small Words
        self.gazeteer = set([i for i in self.gazeteer if len(i) > 3])

    def _compile_abbreviations(self):
        """
        Create a mapping between abbreviations and their full name

        Args:
            None

        Returns:
            None
        """
        self.abbr_map = dict(
            (y, x) for _, (x, y) in self._geo_abbr[["name", "abbreviation"]].iterrows())

    def _compile_geolocation_hierarchy(self):
        """
        Create a dictionary that maps location strings to other location
        strings at higher geographic levels (e.g. city -> state, country)

        Args:
            None

        Returns:
            None
        """
        self.geo_hierarchy = {
            "city": {},
            "county": {},
            "state": {},
            "country": {}
        }
        ## Country -> Continent
        for _, (country, continent) in self._geo_resources[["country", "continent"]].iterrows():
            self.geo_hierarchy["country"][country.lower()] = set([continent.lower()])
        ## State -> Country, Continent
        for _, (state, country, continent) in self._geo_resources[[
                "state_ascii", "country", "continent"]].iterrows():
            if pd.isnull(state):
                continue
            if state.lower() not in self.geo_hierarchy["state"]:
                self.geo_hierarchy["state"][state.lower()] = set()
            self.geo_hierarchy["state"][state.lower()].add(country.lower())
            self.geo_hierarchy["state"][state.lower()].add(continent.lower())
        ## County -> State, Country, Continent
        for _, (county, state, country, continent) in self._geo_resources[[
                "county_ascii", "state_ascii", "country", "continent"]].iterrows():
            if pd.isnull(county):
                continue
            if county.lower() not in self.geo_hierarchy["county"]:
                self.geo_hierarchy["county"][county.lower()] = set()
            if not pd.isnull(state):
                self.geo_hierarchy["county"][county.lower()].add(state.lower())
            self.geo_hierarchy["county"][county.lower()].add(country.lower())
            self.geo_hierarchy["county"][county.lower()].add(continent.lower())
        ## City -> County, State, Country, Continent
        for _, (city, county, state, country, continent) in self._geo_resources[[
                "city_ascii", "county_ascii", "state_ascii", "country", "continent"]].iterrows():
            ca = city.lower()
            if ca not in self.geo_hierarchy["city"]:
                self.geo_hierarchy["city"][ca] = set()
            if not pd.isnull(county):
                self.geo_hierarchy["city"][ca].add(county.lower())
            if not pd.isnull(state):
                self.geo_hierarchy["city"][ca].add(state.lower())
            self.geo_hierarchy["city"][ca].add(country.lower())
            self.geo_hierarchy["city"][ca].add(continent.lower())

    def _filter_out_substrings(self, strings):
        """
        Filter out strings in a list which are substrings of another item
        in the list

        Args:
            strings (list): List of strings to filter

        Returns:
            filtered_strings (list): List of strings without substrings
        """
        strings = sorted(set(strings), key=lambda x: len(x))
        filtered_strings = []
        n = len(strings)
        for i in range(n):
            matches_ahead = False
            str_i = strings[i]
            for j in range(i + 1, n):
                if str_i in strings[j]:
                    matches_ahead = True
                    break
            if not matches_ahead:
                filtered_strings.append(str_i)
        return filtered_strings

    def _look_for_exact_match(self, tokens):
        """
        Create n-grams of a token list and find all that match to our
        gazetteer.

        Args:
            tokens (list): List of tokens

        Returns:
            matches_filtered (list): List of string matches to the gazetteer
        """
        ## Get Lowercase N-Grams
        ngrams = get_ngrams([t.lower() for t in tokens], 1, 4)
        ngrams = list(map(lambda n: " ".join(list(n)), ngrams))
        ## Identify Exact Matches
        matches = []
        for n in ngrams:
            if n in self.gazeteer:
                matches.append(n)
        ## Remove Substrings
        matches_filtered = self._filter_out_substrings(matches)
        return matches_filtered

    def _combine_syntax_matches(self, matches):
        """
        Combine syntax-based location matches that occur next to each other

        Args:
            matches (list): List of syntax-based location matches

        Returns:
            combined_syntax_matches (list): Combined location strings
        """
        n_matches = len(matches)
        if n_matches == 1:
            return matches
        combined_syntax_matches = []
        j = 0
        while j < n_matches - 1:
            match_j = matches[j].split(", ")
            k = j + 1
            while k < n_matches and matches[k].split(", ")[0] == match_j[-1]:
                match_j.append(matches[k].split(", ")[1])
                k += 1
            j = k
            combined_syntax_matches.append(", ".join(match_j))
        return combined_syntax_matches

    def _look_for_syntax_match(self, tokens, window=4):
        """
        Look for span of tokens that reflects a standard syntax location
        reference (e.g. City, State, Country)

        Args:
            tokens (list): List of tokens in a sentence
            window (int): How many tokens to allow to left and right when
                          checking for start/end of match

        Returns:
            syntax_matches (list): List of possible syntax matches
        """
        n = len(tokens)
        j = 1
        syntax_matches = []
        while j < n - 1:
            if tokens[j] != ",":
                j += 1
            else:
                before_window = []
                after_window = []
                of_cache = ""
                ## Walk backwards from the comma to find the start of the candidate span
                for b in tokens[max(0, j - window):j][::-1]:
                    if (b.istitle() and not b.startswith(":")) or b.lower() == "of":
                        if b.lower() not in self._common_words:
                            before_window.append(b + of_cache)
                            of_cache = ""
                        elif b.lower() in self._geo_affixes["suffix"] and len(before_window) == 0:
                            before_window.append(b + of_cache)
                            of_cache = ""
                        elif b.lower() in self._geo_affixes["prefix"] and len(before_window) > 0:
                            before_window.append(b + of_cache)
                            of_cache = ""
                        elif b.lower() == "of":
                            of_cache += " of"
                        else:
                            of_cache = ""
                            break
                    else:
                        of_cache = ""
                        break
                before_window = before_window[::-1]
                if all(b.lower() in self._common_words or
                       b.lower() in self._geo_affixes["suffix"] or
                       b.lower() in self._geo_affixes["prefix"] for b in before_window):
                    before_window = []
                of_cache = ""
                ## Walk forwards from the comma to find the end of the candidate span
                for a in tokens[j + 1:min(j + 1 + window, n)]:
                    if (a.istitle() and not a.startswith(":")) or (a.lower() == "of"):
                        if a.lower() not in self._common_words:
                            after_window.append(of_cache + a)
                            of_cache = ""
                        elif a.lower() in self._geo_affixes["prefix"] and len(after_window) == 0:
                            after_window.append(of_cache + a)
                            of_cache = ""
                        elif a.lower() in self._geo_affixes["suffix"] and len(after_window) > 0:
                            after_window.append(of_cache + a)
                            of_cache = ""
                        elif a.lower() == "of":
                            of_cache += "of "
                        else:
                            of_cache = ""
                            break
                    else:
                        of_cache = ""
                        break
                if all(a.lower() in self._common_words or
                       a.lower() in self._geo_affixes["suffix"] or
                       a.lower() in self._geo_affixes["prefix"] for a in after_window):
                    after_window = []
                if len(before_window) == 0 or len(after_window) == 0:
                    j += 1
                else:
                    same_level_match = False
                    for level in ["state", "country"]:
                        if any(b.lower() in self.geo_hierarchy[level] for b in before_window) and \
                           any(a.lower() in self.geo_hierarchy[level] for a in after_window):
                            same_level_match = True
                    if not same_level_match:
                        before_window_comb = " ".join(before_window)
                        after_window_comb = " ".join(after_window)
                        if before_window_comb.lower() in self.gazeteer or \
                           after_window_comb.lower() in self.gazeteer:
                            syntax_matches.append(before_window_comb + ", " + after_window_comb)
                    j += len(after_window)
        ## Normalize Case
        syntax_matches = [s.lower() for s in syntax_matches]
        ## Combine Syntax Matches
        syntax_matches = self._combine_syntax_matches(syntax_matches)
        return syntax_matches

    def _expand_abbreviations(self, tokens):
        """
        Transform abbreviations into their full name

        Args:
            tokens (list): List of tokens in a sentence

        Returns:
            expanded_tokens (list): Original list of tokens with any matched
                                    abbreviations expanded to their full name
        """
        expanded_tokens = [tokens[0]]
        for i in range(1, len(tokens)):
            if tokens[i - 1] == "," and tokens[i].replace(".", "") in self.abbr_map:
                expanded_tokens.append(self.abbr_map[tokens[i].replace(".", "")])
            else:
                expanded_tokens.append(tokens[i])
        expanded_tokens = flatten(i.split(" ") for i in expanded_tokens)
        return expanded_tokens

    def _find_sub_list(self, sl, l):
        """
        Find lists within lists

        Args:
            sl (list): List to look for
            l (list): Large list of items to search within

        Returns:
            results (list): Start, end index spans of matches
        """
        results = []
        sll = len(sl)
        for ind in (i for i, e in enumerate(l) if e == sl[0]):
            if l[ind:ind + sll] == sl:
                results.append((ind, ind + sll - 1))
        return results

    def _append_affixes(self, tokens, matches):
        """
        For any location string matches, look for modifiers before and after
        in the affix dictionary

        Args:
            tokens (list): List of tokens in a sentence
            matches (list): List of existing location-matches

        Returns:
            affixed_matches (list): List of matches with any affixes
                                    identified in the token list
        """
        n = len(tokens)
        tokens_lower = list(map(lambda i: i.lower(), tokens))
        tokens_lower = flatten([i.split(" ") for i in tokens_lower])
        affixed_matches = []
        for m in matches:
            ## Re-Tokenize Match
            m_toks = self.tokenizer.tokenize(m)
            ## Look For Span Matches
            m_spans = self._find_sub_list(m_toks, tokens_lower)
            ## Check Spans for Affixes
            for span_start, span_end in m_spans:
                affixed_span = m_toks.copy()
                if span_start > 0:
                    if tokens[span_start - 1].istitle() and \
                       tokens[span_start - 1].lower() in self._geo_affixes["prefix"]:
                        affixed_span = tokens_lower[span_start - 1:span_start] + affixed_span
                if span_end < n - 2:
                    if tokens[span_end + 1].istitle() and \
                       tokens[span_end + 1].lower() in self._geo_affixes["suffix"]:
                        affixed_span = affixed_span + tokens_lower[span_end + 1:span_end + 2]
                affixed_matches.append(" ".join(affixed_span).replace(" ,", ","))
        return affixed_matches

    def _find_locations(self, sent):
        """
        Find location mentions within a sentence

        Args:
            sent (str): Free-form text sentence

        Returns:
            affixed_matches (list): List of identified location matches
        """
        ## Tokenize Sentence
        tokens = self.tokenizer.tokenize(sent)
        if len(tokens) == 0:
            return []
        ## Expand Abbreviations
        tokens = self._expand_abbreviations(tokens)
        ## Look for Exact Matches
        exact_matches = self._look_for_exact_match(tokens)
        ## Look for Syntax Matches
        syntax_matches = self._look_for_syntax_match(tokens)
        ## Consolidate Matches
        combined_matches = self._filter_out_substrings(syntax_matches + exact_matches)
        ## Append Affixes
        affixed_matches = self._append_affixes(tokens, combined_matches)
        return affixed_matches

    def _merge_overlap(self, matches):
        """
        Combine location matches that share a string
        (e.g. "los angeles, california" and "california, US")

        Args:
            matches (list): List of matched location strings

        Returns:
            merged_matches (list): Combined match list
        """
        merged_matches = []
        seen = set()
        for m, match in enumerate(matches):
            match_split = match.split(", ")
            match_set = [match]
            seen.add(m)
            for m2, match2 in enumerate(matches):
                if m == m2 or m2 in seen:
                    continue
                match2_split = match2.split(", ")
                if set(match_split) & set(match2_split) != set():
                    match_set.append(match2)
                    seen.add(m2)
            match_set = self._combine_syntax_matches(match_set)
            merged_matches.extend(match_set)
        return self._filter_out_substrings(merged_matches)

    def _combine_using_hierarchy(self, matches):
        """
        Combine string location matches based on the gazetteer geographic
        hierarchy (e.g. city, state, country, continent)

        Args:
            matches (list): List of proposed string location matches

        Returns:
            reg_matches (list): Matches, combining any that fall in the same
                                location hierarchy
        """
        combined_matches = []
        reg_matches = []
        for m, match in enumerate(matches):
            match_split = match.split(", ")
            match_level = None
            proposed_match = None
            for m2, match2 in enumerate(matches):
                if m == m2:
                    continue
                match2_split = match2.split(", ")
                levels = ["city", "county", "state", "country"]
                for tl, topy_level in enumerate(levels):
                    if match_split[-1] in self.geo_hierarchy[topy_level]:
                        if match2_split[0] in self.geo_hierarchy[topy_level][match_split[-1]]:
                            if match_level is None or topy_level in levels[:match_level]:
                                proposed_match = (m, m2)
                                match_level = tl
            if proposed_match is None:
                reg_matches.append(match)
            else:
                combined_matches.append((proposed_match, match_level))
        combined_matches = sorted(combined_matches, key=lambda x: x[1])
        for (ml, mr), _ in combined_matches:
            reg_matches.append(matches[ml] + ", " + matches[mr])
        ## Filter Duplicates
        reg_matches = self._filter_out_substrings(reg_matches)
        ## Merge Overlap
        reg_matches = self._merge_overlap(reg_matches)
        return reg_matches

    def _is_all_commons(self, match):
        """
        Helper noting if all tokens in a matched hierarchy are made of
        common words

        Args:
            match (str): Proposed location match

        Returns:
            is_match (bool): Whether all tokens in the match are common words
        """
        if all(m in self._common_words for m in match.replace(", ", "").split()):
            return True
        return False

    def find_locations(self, text):
        """
        Find locations within a string of text

        Args:
            text (str): Input free-form text (potentially a paragraph)

        Returns:
            locations (list): List of recognized location strings
        """
        ## Translate to Ascii
        text = unidecode(text)
        ## Split Up Sentences
        sentences = sent_tokenize(text)
        ## Look for Locations
        locations = []
        for sent in sentences:
            sent_locs = self._find_locations(sent)
            locations.extend(sent_locs)
        ## Combine Using Hierarchy
        locations = self._combine_using_hierarchy(locations)
        ## Filter Commons
        locations = [l for l in locations if not self._is_all_commons(l)]
        return locations