Exemplo n.º 1
0
    def generate_tokens(self):
        """Build the token list derived from ``self.city_data['cities']``.

        Every city name is lower-cased and split on single spaces into words.
        Each distinct word gets a relative frequency (occurrences divided by
        the total number of words). Words whose frequency is above the
        threshold become ``TokenType.GRAMMATICAL`` tokens; every other word is
        broken into characters, and each character becomes a
        ``TokenType.VOWEL`` token when ``VOWEL_REGEX`` matches it, otherwise a
        ``TokenType.CONSONANT`` token.

        Returns:
            A list of ``Token`` objects in first-seen order. A token is only
            appended when ``token not in tokens``, so de-duplication relies on
            how ``Token`` implements equality. The list is empty when there
            are no city names to process.
        """
        token_frequency = {}
        count = 0

        for city in self.city_data['cities']:
            # Splitting on a literal ' ' keeps empty strings produced by
            # consecutive spaces; they are counted like any other word.
            city_split = city.lower().split(' ')
            count += len(city_split)

            for word in city_split:
                token_frequency[word] = token_frequency.get(word, 0) + 1

        # With no cities there are no words; bail out before the divisions
        # below, which would otherwise raise ZeroDivisionError.
        if count == 0:
            return []

        # Turn raw counts into relative frequencies and accumulate their sum.
        average_freq = 0
        for token in token_frequency:
            token_frequency[token] /= count
            average_freq += token_frequency[token]

        # NOTE(review): the sum is divided by the total word count, not by the
        # number of distinct words, so this is not the arithmetic mean of the
        # frequencies. Kept as-is to preserve behaviour -- confirm the intent.
        average_freq /= count

        tokens = []

        for token, frequency in token_frequency.items():
            if frequency > average_freq:
                potential_token = Token(contents=token,
                                        token_type=TokenType.GRAMMATICAL)

                if potential_token not in tokens:
                    tokens.append(potential_token)
                continue

            # Words at or below the threshold contribute their characters.
            for char in token:
                potential_token = Token(contents=char,
                                        token_type=TokenType.NONETYPE)
                potential_token.token_type = (
                    TokenType.VOWEL
                    if re.search(VOWEL_REGEX, char)
                    else TokenType.CONSONANT
                )

                if potential_token not in tokens:
                    tokens.append(potential_token)

        return tokens