Example No. 1
    def __init__(self, connection):
        super(GreekTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.vowels = 'αειηουωΑΕΙΗΟΥΩ'
        self.grave = '\u0300'
        self.acute = '\u0301'
        self.sigma = r'σ\b'  # raw string: \b is a regex word boundary, not a backspace
        self.sigma_alt = 'ς'
        # diacriticals should not be considered part of ``word_characters`` so
        # that extraneous diacritical marks unattended by a proper word
        # character to bind to do not appear as proper words during
        # tokenization of display tokens (see BaseTokenizer.tokenize);
        # also ignore the middle dot character, which is a punctuation mark
        self.word_regex = re.compile('[ΆΈ-ώ' + self.sigma_alt + ']+',
                                     flags=re.UNICODE)

        self.diacrit_sub1 = \
            r'[\s.,;?!]([' + self.diacriticals + ']+)([' + self.vowels + ']{2,})'
        self.diacrit_sub2 = \
            r'[\s.,;?!]([' + self.diacriticals + ']+)([' + self.vowels + ']{1})'

        self.split_pattern = ''.join([
            '( / )|([\\s]+)|([^\\w\\d', self.diacriticals, self.sigma_alt,
            r"])"
        ])

        self.lemmatizer = Lemmata('lemmata', 'greek')
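
A minimal sketch of how a split pattern of this shape behaves on Greek text. The diacriticals attribute is inherited from BaseTokenizer and is not shown in these snippets, so the combining-accent string below is only an assumption for illustration:

import re

diacriticals = '\u0313\u0314\u0300\u0301\u0342\u0308\u0345'  # assumed; the real value lives on BaseTokenizer
sigma_alt = 'ς'
split_pattern = ''.join([
    '( / )|([\\s]+)|([^\\w\\d', diacriticals, sigma_alt, r"])"
])

# ' / ' line separators, whitespace runs, and stray punctuation all act as split
# points; only fragments containing at least one word character are kept.
parts = re.split(split_pattern, 'λόγος καὶ ἔργον / μῆνιν ἄειδε', flags=re.UNICODE)
tokens = [t for t in parts if t and re.search(r'[\w]+', t)]
print(tokens)  # ['λόγος', 'καὶ', 'ἔργον', 'μῆνιν', 'ἄειδε']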
Example No. 2
    def __init__(self, connection):
        super(LatinTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.jv_replacer = JVReplacer()
        self.lemmatizer = Lemmata('lemmata', 'lat')

        self.split_pattern = \
            '( / )|([\\s]+)|([^\\w' + self.diacriticals + ']+)'
Example No. 3
    def __init__(self, connection):
        super(LatinTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.jv_replacer = JVReplacer()
        self.lemmatizer = Lemmata('lemmata', 'latin')

        self.split_pattern = \
            r'[<].+[>][\s]| / | \. \. \.|\.~\.~\.|[^\w' + self.diacriticals + ']'
Example No. 4
 def __init__(self):
     self.lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     self.jv = JVReplacer()
     self.word_tokenizer = WordTokenizer('latin')
     self.count_dictionary = dict()
     self.punctuation_list = [
         '!', ';', ':', '?', '-', '–', '&', '*', '(', ')', '[', ']', ',',
         '"', '\''
     ]
Example No. 5
 def test_latin_lemmata(self):
     """Test Lemmata class lookup() method"""
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     test_str = 'Ceterum antequam destinata componam'
     target = [('ceterum', [('ceterus', 1.0)]), ('antequam', [('antequam', 1.0)]), ('destinata', [('destinatus', 0.25), ('destinatum', 0.25), ('destinata', 0.25), ('destino', 0.25)]), ('componam', [('compono', 1.0)])]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     self.assertEqual(lemmas, target)
Example No. 6
 def test_latin_lemmata(self):
     """Test Lemmata class lookup() method"""
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     test_str = 'Ceterum antequam destinata componam'
     target = [('ceterum', [('ceterus', 1.0)]), ('antequam', [('antequam', 1.0)]), ('destinata', [('destinatus', 0.25), ('destinatum', 0.25), ('destinata', 0.25), ('destino', 0.25)]), ('componam', [('compono', 1.0)])]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     self.assertEqual(lemmas, target)
Example No. 7
 def test_latin_translations(self):
     """Test Synonym class lookup() function and Lemmata class isolate() method"""
     #first build the lemma list as in test_latin_lemmata()
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     test_str = 'Ceterum antequam destinata componam'
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     #now isolate the list of lemmas
     lemmas = lemmatizer.isolate(lemmas)
     translations = Synonyms(dictionary='translations', language='latin')
     translations = translations.lookup_synonyms(lemmas)
     target = [('destino', [('σκοπός', 1.0)]), ('compono', [('συντίθημι', 1.0)])]
     self.assertEqual(translations, target)
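
For reference, the intermediate isolate() step: lookup() returns (token, weighted-lemma-list) pairs as in the lemmata test above, and isolate() reduces that structure to a flat list of lemma strings before the translation/synonym lookup. A hedged illustration reusing the values already shown (the isolate() output is inferred from how it is consumed, not printed from the library):

lemmas = lemmatizer.lookup(['destinata'])
# -> [('destinata', [('destinatus', 0.25), ('destinatum', 0.25), ('destinata', 0.25), ('destino', 0.25)])]
lemmas = lemmatizer.isolate(lemmas)
# presumably -> ['destinatus', 'destinatum', 'destinata', 'destino']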
Example No. 8
 def test_latin_synonyms(self):
     """Test Synonym class lookup() function and Lemmata class isolate() method"""
     #first build the lemma list as in test_latin_lemmata()
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     test_str = 'Ceterum antequam destinata componam'
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     #now isolate the list of lemmas
     lemmas = lemmatizer.isolate(lemmas)
     synonyms = Synonyms(dictionary='synonyms', language='latin')
     syns = synonyms.lookup_synonyms(lemmas)
     target = [('ceterus', [('ceteroqui', 0.5), ('perquiesco', 0.5)]), ('compono', [('struo', 0.5), ('condo', 0.5)])]
     self.assertEqual(syns, target)
Example No. 9
 def test_latin_translations(self):
     """Test Synonym class lookup() function and Lemmata class isolate() method"""
     #first build the lemma list as in test_latin_lemmata()
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     test_str = 'Ceterum antequam destinata componam'
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     #now isolate the list of lemmas
     lemmas = lemmatizer.isolate(lemmas)
     translations = Synonyms(dictionary='translations', language='latin')
     translations = translations.lookup_synonyms(lemmas)
     target = [('destino', [('σκοπός', 1.0)]),
               ('compono', [('συντίθημι', 1.0)])]
     self.assertEqual(translations, target)
Example No. 10
 def test_latin_synonyms(self):
     """Test Synonym class lookup() function and Lemmata class isolate() method"""
     #first build the lemma list as in test_latin_lemmata()
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     test_str = 'Ceterum antequam destinata componam'
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     #now isolate the list of lemmas
     lemmas = lemmatizer.isolate(lemmas)
     synonyms = Synonyms(dictionary='synonyms', language='latin')
     syns = synonyms.lookup_synonyms(lemmas)
     target = [('ceterus', [('ceteroqui', 0.5), ('perquiesco', 0.5)]),
               ('compono', [('struo', 0.5), ('condo', 0.5)])]
     self.assertEqual(syns, target)
Example No. 11
    def __init__(self, connection):
        super(GreekTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.vowels = 'αειηουωΑΕΙΗΟΥΩ'
        self.grave = '\u0300'
        self.acute = '\u0301'
        self.sigma = r'σ\b'  # raw string: \b is a regex word boundary, not a backspace
        self.sigma_alt = 'ς'
        self.word_characters = 'Ά-ώ' + self.sigma_alt + self.diacriticals

        self.diacrit_sub1 = \
            r'([\s])([' + self.diacriticals + ']+)([' + self.vowels + ']{2,})'
        self.diacrit_sub2 = \
            r'([\s])([' + self.diacriticals + ']+)([' + self.vowels + ']{1})'

        self.split_pattern = \
            r'[<].+[>][\s]| / |[^\w' + self.diacriticals + self.sigma_alt + "']"

        self.lemmatizer = Lemmata('lemmata', 'greek')
Example No. 12
class LatinTokenizer(BaseTokenizer):
    def __init__(self, connection):
        super(LatinTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.jv_replacer = JVReplacer()
        self.lemmatizer = Lemmata('lemmata', 'lat')

        self.split_pattern = \
            '( / )|([\\s]+)|([^\\w' + self.diacriticals + ']+)'

    def normalize(self, raw, split=True):
        """Normalize a Latin word.

        Parameters
        ----------
        raw : str or list of str
            The string(s) to normalize.

        Returns
        -------
        normalized : str or list of str
            The normalized string(s).

        Notes
        -----
        This function should be applied to Latin words prior to generating
        other features (e.g., lemmata).
        """
        # Apply the global normalizer
        normalized, tags = super(LatinTokenizer, self).normalize(raw)

        # Replace j/v with i/u, respectively
        normalized = self.jv_replacer.replace(normalized)

        if split:
            normalized = re.split(self.split_pattern,
                                  normalized,
                                  flags=re.UNICODE)
            normalized = [
                t for t in normalized if t and re.search(r'[\w]+', t)
            ]

        return normalized, tags

    def featurize(self, tokens):
        """Lemmatize a Latin token.

        Parameters
        ----------
        tokens : list of str
            The tokens to featurize.

        Returns
        -------
        features : dict
            The features for the token.

        Notes
        -----
        Input should be sanitized with `LatinTokenizer.normalize` prior to
        using this method.
        """
        if not isinstance(tokens, list):
            tokens = [tokens]
        lemmata = self.lemmatizer.lookup(tokens)
        #        print("Latin lemmata:", lemmata)
        fixed_lemmata = []
        for lem in lemmata:
            lem_lemmata = [l[0] for l in lem[1]]
            fixed_lemmata.append(lem_lemmata)
#        print("fixed lemmata:", fixed_lemmata)
        grams = trigrammify(tokens)
        features = {'lemmata': fixed_lemmata, 'sound': grams}
        #        print('features', features)
        # for i, l in enumerate(lemmata):
        #     features.append({'lemmata': [lem[0] for lem in l[1]]})
        return features
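
A hypothetical end-to-end use of this tokenizer. It is only a sketch: the connection object, the BaseTokenizer behaviour, and the trigrammify() helper referenced in featurize() are all defined outside these snippets:

# connection is assumed to be whatever BaseTokenizer expects (not shown here).
tokenizer = LatinTokenizer(connection)

# normalize() applies the base normalization, rewrites j/v to i/u, and (with the
# default split=True) splits on ' / ', whitespace, and non-word characters.
tokens, tags = tokenizer.normalize('arma virumque cano / Troiae qui primus ab oris')

# featurize() looks up candidate lemmata per token and adds sound trigrams.
features = tokenizer.featurize(tokens)
# features['lemmata'][i] lists the candidate lemmata for tokens[i];
# features['sound'] holds the trigrams produced by trigrammify(tokens).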
Example No. 13
class GreekTokenizer(BaseTokenizer):
    def __init__(self, connection):
        super(GreekTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.vowels = 'αειηουωΑΕΙΗΟΥΩ'
        self.grave = '\u0300'
        self.acute = '\u0301'
        self.sigma = 'σ\b'
        self.sigma_alt = 'ς'
        # diacriticals should not be considered part of ``word_characters`` so
        # that extraneous diacritical marks unattended by a proper word
        # character to bind to do not appear as proper words during
        # tokenization of display tokens (see BaseTokenizer.tokenize);
        # also ignore the middle dot character, which is a punctuation mark
        self.word_regex = re.compile('[ΆΈ-ώ' + self.sigma_alt + ']+',
                                     flags=re.UNICODE)

        self.diacrit_sub1 = \
            r'[\s.,;?!]([' + self.diacriticals + ']+)([' + self.vowels + ']{2,})'
        self.diacrit_sub2 = \
            r'[\s.,;?!]([' + self.diacriticals + ']+)([' + self.vowels + ']{1})'

        self.split_pattern = ''.join([
            '( / )|([\\s]+)|([^\\w\\d', self.diacriticals, self.sigma_alt,
            r"])"
        ])

        self.lemmatizer = Lemmata('lemmata', 'greek')

    def normalize(self, raw, split=True):
        """Normalize a single Greek word.

        Parameters
        ----------
        raw : str or list of str
            The word to normalize.

        Returns
        -------
        normalized : str
            The normalized string.
        """
        # Perform the global normalization
        normalized, tags = super(GreekTokenizer, self).normalize(raw)

        # Convert grave accent to acute
        normalized = re.sub(self.grave,
                            self.acute,
                            normalized,
                            flags=re.UNICODE)

        # Remove diacriticals from vowels
        normalized = re.sub(self.diacrit_sub1,
                            r' \2',
                            normalized,
                            flags=re.UNICODE)
        normalized = re.sub(self.diacrit_sub2,
                            r' \2\1',
                            normalized,
                            flags=re.UNICODE)

        # Substitute sigmas
        normalized = re.sub(self.sigma,
                            self.sigma_alt,
                            normalized,
                            flags=re.UNICODE)

        # Remove digits and single-quotes from the normalized output
        normalized = re.sub(r"['\d]+", r' ', normalized, flags=re.UNICODE)

        # Split the output into a list of normalized tokens if requested
        if split:
            normalized = re.split(self.split_pattern,
                                  normalized,
                                  flags=re.UNICODE)
            normalized = [
                t for t in normalized if t and re.search(r'[\w]+', t)
            ]

        return normalized, tags

    def featurize(self, tokens):
        """Get the features for a single Greek token.

        Parameters
        ----------
        tokens : list of str
            The tokens to featurize.

        Returns
        -------
        features : dict
            The features for the token.

        Notes
        -----
        Input should be sanitized with `GreekTokenizer.normalize` prior to using this
        method.
        """
        lemmata = self.lemmatizer.lookup(tokens)
        fixed_lemmata = []
        for lem in lemmata:
            lem_lemmata = [l[0] for l in lem[1]]
            fixed_lemmata.append(lem_lemmata)
        features = {'lemmata': fixed_lemmata}
        return features
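
Two of the substitutions above can be illustrated in isolation. The sigma pattern relies on the regex word boundary, so only word-final σ is rewritten to ς, and the grave accent is folded into the acute before lookup. A standalone sketch assuming decomposed (combining) accents in the input:

import re

grave, acute = '\u0300', '\u0301'
sigma, sigma_alt = r'σ\b', 'ς'

word = 'το\u0300ν λο\u0301γον εἶπεσ'  # hypothetical input with a combining grave accent
word = re.sub(grave, acute, word, flags=re.UNICODE)      # grave -> acute
word = re.sub(sigma, sigma_alt, word, flags=re.UNICODE)  # word-final σ -> ς
print(word)  # 'τόν λόγον εἶπες' (accents remain combining characters)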
Example No. 14
class LatinTokenizer(BaseTokenizer):
    def __init__(self, connection):
        super(LatinTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.jv_replacer = JVReplacer()
        self.lemmatizer = Lemmata('lemmata', 'latin')

        self.split_pattern = \
            r'[<].+[>][\s]| / | \. \. \.|\.~\.~\.|[^\w' + self.diacriticals + ']'

    # def tokenize(self, raw, record=True, text=None):
    #     normalized = unicodedata.normalize('NFKD', raw).lower()
    #     normalized = self.jv_replacer.replace(normalized)
    #     normalized = re.split(self.split_pattern, normalized, flags=re.UNICODE)
    #     display = re.split(self.split_pattern, raw, flags=re.UNICODE)
    #     featurized = self.featurize(normalized)
    #
    #     tokens = []
    #     frequencies = collections.Counter(
    #         [n for i, n in enumerate(normalized) if
    #          re.search('[\w]+', normalized[i], flags=re.UNICODE)])
    #     frequency_list = []
    #
    #     try:
    #         text_id = text.path
    #     except AttributeError:
    #         text_id = None
    #
    #     base = len(self.tokens)
    #
    #     for i, d in enumerate(display):
    #         idx = i + base
    #         if re.search('[\w]', d, flags=re.UNICODE):
    #             n = normalized[i]
    #             f = featurized[i]
    #             t = Token(text=text_id, index=idx, display=d, form=n, **f)
    #         else:
    #             t = Token(text=text_id, index=idx, display=d)
    #         tokens.append(t)
    #
    #     # Update the internal record if necessary
    #     if record:
    #         self.tokens.extend([t for t in tokens])
    #         self.frequencies.update(frequencies)
    #         frequencies = self.frequencies
    #         if '' in self.frequencies:
    #             del self.frequencies['']
    #
    #     print(frequencies)
    #     print(self.frequencies)
    #
    #     # Prep the frequency objects
    #     for k, v in frequencies.items():
    #         f = Frequency(text=text_id, form=k, frequency=v)
    #         frequency_list.append(f)
    #
    #     return tokens, frequency_list

    def normalize(self, raw):
        """Normalize a Latin word.

        Parameters
        ----------
        raw : str or list of str
            The string(s) to normalize.

        Returns
        -------
        normalized : str or list of str
            The normalized string(s).

        Notes
        -----
        This function should be applied to Latin words prior to generating
        other features (e.g., lemmata).
        """
        # Apply the global normalizer
        normalized = super(LatinTokenizer, self).normalize(raw)

        # Replace j/v with i/u, respectively
        normalized = self.jv_replacer.replace(normalized)

        return normalized

    def featurize(self, tokens):
        """Lemmatize a Latin token.

        Parameters
        ----------
        tokens : list of str
            The token to featurize.

        Returns
        -------
        features : list of dict
            The features for each token.

        Notes
        -----
        Input should be sanitized with `LatinTokenizer.normalize` prior to
        using this method.
        """
        if not isinstance(tokens, list):
            tokens = [tokens]
        lemmata = self.lemmatizer.lookup(tokens)
        features = []
        for i, l in enumerate(lemmata):
            features.append({'lemmata': [lem[0] for lem in l[1]]})
        return features
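
The JVReplacer step is the only Latin-specific rewrite in this version of normalize(): j becomes i and v becomes u, as the comment above states. A quick illustration (the expected output is inferred from that description, not from running CLTK):

from cltk.stem.latin.j_v import JVReplacer

jv = JVReplacer()
print(jv.replace('juvenes vocat'))  # expected: 'iuuenes uocat'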
Example No. 15
    ----------
    filepath: a file in .tess format
    '''
    tessobj = TessFile(filepath)
    tokengenerator = iter(tessobj.read_tokens())
    stop = 0
    while stop != 1:
        try: 
            rawtoken = next(tokengenerator)
            cleantoken_list = token_cleanup(rawtoken)
            count_lemma(cleantoken_list[0])
        except StopIteration:
            stop = 1


lemmatizer = Lemmata(dictionary='lemmata', language='latin')
def count_lemma(targettoken):
     '''Builds a count of occurrences for each lemma type in the corpus.
     param targettoken: the token in question
     global COUNT_LIBRARY: a dictionary whose keys are lemmata and whose values
     are incremented counts.
     '''
    global COUNT_LIBRARY
    lemmas = lemmatizer.lookup([targettoken])
    lemmas = lemmatizer.isolate(lemmas)
    for lemma in lemmas:
        if lemma not in COUNT_LIBRARY:
            COUNT_LIBRARY[lemma] = 0
Example No. 16
from cltk.semantics.latin.lookup import Lemmata

_LEM_MAPPER = {
    'latin': Lemmata('lemmata', 'lat'),
    'greek': Lemmata('lemmata', 'grc')
}


def get_lemmatizer(language):
    return _LEM_MAPPER[language]
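
Usage is then a plain dictionary lookup; note that this mapper instantiates both lemmatizers at import time, uses the 'lat'/'grc' language codes rather than the 'latin'/'greek' names seen in other snippets, and raises KeyError for an unsupported language. A brief sketch:

lemmatizer = get_lemmatizer('latin')
lemmas = lemmatizer.lookup(['antequam'])
# expected shape: [(token, [(lemma, probability), ...])], as in the lemmata tests above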
Example No. 17
class FrequencyModel:
    '''Generate unsupervised count of lemma frequencies in the Tesserae Latin text corpus.'''
    def __init__(self):
        self.lemmatizer = Lemmata(dictionary='lemmata', language='latin')
        self.jv = JVReplacer()
        self.word_tokenizer = WordTokenizer('latin')
        self.count_dictionary = dict()
        self.punctuation_list = [
            '!', ';', ':', '?', '-', '–', '&', '*', '(', ')', '[', ']', ',',
            '"', '\''
        ]

    def read_files(self, filepath):
        '''Reads the corpus and builds the self.count_dictionary dictionary object by calling
        the countgram() method on individual tokens.
        Dependencies
        ------------
        TessFile class from tesserae.utils
        Lemmata class from cltk.semantics.latin.lookup
        JVReplacer class from cltk.stem.latin.j_v
        WordTokenizer class from cltk.tokenize.word
        Parameters
        ----------
        filepath: a file in .tess format
        Results
        -------
        Updates self.count_dictionary
        Returns
        -------
        none'''
        tessobj = TessFile(filepath)
        tokengenerator = iter(tessobj.read_tokens())
        stop = 0
        while stop != 1:
            try:
                rawtoken = next(tokengenerator)
                cleantoken_list = self.token_cleanup(rawtoken)
                token = cleantoken_list[0]
                self.countgram(token)
            except StopIteration:
                stop = 1

    def countgram(self, targettoken):
        '''Update the frequency model with a new token from the corpus.'''
        lemmas = self.lemmatizer.lookup([targettoken])
        lemmas = self.lemmatizer.isolate(lemmas)
        for lem in lemmas:
            try:
                test_presence = self.count_dictionary[lem]
            except KeyError:
                self.count_dictionary[lem] = 0
            self.count_dictionary[lem] += 1

    def lemmatize(self, target):
        '''Use the unsupervised count of lemma frequencies generated by read_files()
        to assign probabilities in the case of an ambiguous lemmatization.
        parameters
        ----------
        target: a token to be lemmatized
        results
        -------
        a list of tuples of the form [(lemma, probability)]
        '''
        if target in self.punctuation_list:
            lemmalist = [('punc', 1)]
            return lemmalist
        if target == 'ne':
            lemmalist = [('ne', 1)]
            return lemmalist
        lemmalist = self.lemmatizer.lookup([target])
        lemmas = self.lemmatizer.isolate(lemmalist)
        if len(lemmas) > 1:
            all_lemmas_total = sum([self.count_dictionary[l] for l in lemmas])
            try:
                lemmalist = [(l, (self.count_dictionary[l] / all_lemmas_total))
                             for l in lemmas]
            except ZeroDivisionError:
                print([(self.count_dictionary[l], l) for l in lemmas])
            return lemmalist
        lemmalist = []
        lemmaobj = (lemmas[0], 1)
        lemmalist.append(lemmaobj)
        return lemmalist

    def token_cleanup(self, rawtoken):
        '''Standardize tokens by replacing j with i and v with u, and
        split into multiple tokens as needed with tokenize() method of word_tokenizer class
        parameters
        ----------
        rawtoken: the token as drawn from the text
        return
        ------
        tokenlist: a list of possible word or punctuation tokens
        '''
        rawtoken = self.jv.replace(rawtoken)
        rawtoken = rawtoken.lower()
        tokenlist = self.word_tokenizer.tokenize(rawtoken)
        #sometimes words are split into enclitics and punctuation.
        return tokenlist

    def save_pickle(self, filename):
        '''Saves the self.count_dictionary object for later reuse.
        dependencies
        ------------
        os package
        parameters
        ----------
        filename: name for the pickle file'''
        relativepath = join('~', 'cltk_data', 'latin', 'model',
                            'latin_models_cltk', 'frequency')
        path = expanduser(relativepath)
        pickle_file = join(path, filename)
        if not os.path.isdir(path):
            os.makedirs(path)
        pickle.dump(self.count_dictionary, open(pickle_file, "wb"))

    def load_pickle(self, filename):
        '''Load the self.count_dictionary object saved by save_pickle.
        dependencies
        ------------
        os package
        parameters
        ----------
        filename: name of the pickle file'''
        relativepath = join('~', 'cltk_data', 'latin', 'model',
                            'latin_models_cltk', 'frequency')
        path = expanduser(relativepath)
        pickle_file = join(path, filename)
        pickle_file = open(pickle_file, 'rb')
        self.count_dictionary = pickle.load(pickle_file)

    def train_model(self):
        '''open all the tesserae files and call read_files() on each to build freq model'''
        relativepath = join('~', 'cltk_data', 'latin', 'text',
                            'latin_text_tesserae_collection', 'la')
        path = expanduser(relativepath)
        onlyfiles = [f for f in listdir(path) if isfile(join(path, f)) and 'augustine' not in f and 'ambrose' not in f and 'jerome' not in f and 'tertullian' not in f and 'eugippius' not in f and 'hilary' not in f]  # pylint: disable=line-too-long
        onlyfiles = [join(path, f) for f in onlyfiles]
        for filename in onlyfiles:
            if '.tess' in filename:
                print(filename)
                self.read_files(filename)

    def test_count_dictionary(self, token_list, lemma_list):
        '''Test the ability of lemmatize(), (which uses the self.count_dictionary dictionary,
        to predict the most likely lemmatization in ambiguous cases. Punctuation is
        automatically counted as correct, because the 'punc' lemmatization usage is inconsistent
        in the test corpus.
        dependencies
        ------------
        itemgetter class from operator package
        parameters
        ----------
        token_list: a list of tokens
        lemma_list: a list of corresponding 'correct' lemmatizations
        results
        -------
        prints four numbers: the number of correctly assigned lemmas in ambiguous cases;
        the number of ambiguous cases in total; the number of tokens analyzed; and a
        decimal between 0 and 1 representing the proportion of correct lemmatizations.
        return
        ------
        a list object containing all incorrect lemmatizations for analysis. Format:
        [(token, answer_given, correct_answer), (token...)]

        NOTE: Initial tests show roughly 91% accuracy, identification of punctuation included.
        '''
        trials = 0
        correct = 0
        errors = []
        for position in range(0, (len(token_list) - 1)):
            lemmalist = self.lemmatizer.lookup(token_list[position])
            lemmalist = lemmalist[1]
            lemma = max(lemmalist, key=itemgetter(1))
            if len(lemmalist) > 1:
                trials = trials + 1
                if lemma[0] == lemma_list[position] or lemma[0] == 'punc':
                    correct = correct + 1
                else:
                    errors.append(
                        (token_list[position], lemma[0], lemma_list[position]))
        print(correct)
        print(trials)
        print(len(lemma_list))
        rate = (len(lemma_list) - trials + correct) / len(lemma_list)
        print(rate)
        return errors
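
A hedged sketch of how FrequencyModel is presumably driven end to end. The corpus path baked into train_model() must exist under ~/cltk_data, and the pickle file name below is purely hypothetical:

model = FrequencyModel()
model.train_model()                            # reads the whole Tesserae Latin corpus; slow
model.save_pickle('latin_frequencies.pickle')  # hypothetical file name

# Later sessions can skip training:
# model.load_pickle('latin_frequencies.pickle')

# Ambiguous forms are resolved by the relative corpus frequency of each candidate
# lemma; unambiguous forms come back as [(lemma, 1)].
print(model.lemmatize('amor'))  # e.g. [('amo', 0.3), ('amor', 0.7)] for an ambiguous form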
Example No. 18
class GreekTokenizer(BaseTokenizer):
    def __init__(self, connection):
        super(GreekTokenizer, self).__init__(connection)

        # Set up patterns that will be reused
        self.vowels = 'αειηουωΑΕΙΗΟΥΩ'
        self.grave = '\u0300'
        self.acute = '\u0301'
        self.sigma = r'σ\b'  # raw string: \b is a regex word boundary, not a backspace
        self.sigma_alt = 'ς'
        self.word_characters = 'Ά-ώ' + self.sigma_alt + self.diacriticals

        self.diacrit_sub1 = \
            r'([\s])([' + self.diacriticals + ']+)([' + self.vowels + ']{2,})'
        self.diacrit_sub2 = \
            r'([\s])([' + self.diacriticals + ']+)([' + self.vowels + ']{1})'

        self.split_pattern = \
            r'[<].+[>][\s]| / |[^\w' + self.diacriticals + self.sigma_alt + "']"

        self.lemmatizer = Lemmata('lemmata', 'greek')

    def normalize(self, raw):
        """Normalize a single Greek word.

        Parameters
        ----------
        raw : str or list of str
            The word to normalize.

        Returns
        -------
        normalized : str
            The normalized string.
        """
        # Perform the global normalization
        normalized = super(GreekTokenizer, self).normalize(raw)

        # Convert grave accent to acute
        normalized = re.sub(self.grave,
                            self.acute,
                            normalized,
                            flags=re.UNICODE)

        # Remove diacriticals from vowels
        normalized = re.sub(self.diacrit_sub1,
                            r'\1\3',
                            normalized,
                            flags=re.UNICODE)
        normalized = re.sub(self.diacrit_sub2,
                            r'\1\3\2',
                            normalized,
                            flags=re.UNICODE)

        # Substitute sigmas
        normalized = re.sub(self.sigma,
                            self.sigma_alt,
                            normalized,
                            flags=re.UNICODE)

        normalized = re.sub(r'\'', '', normalized, flags=re.UNICODE)

        normalized = re.sub(r'[\'0-9]+', '', normalized, flags=re.UNICODE)

        return normalized

    def featurize(self, tokens):
        """Get the features for a single Greek token.

        Parameters
        ----------
        tokens : list of str
            The tokens to featurize.

        Returns
        -------
        features : dict
            The features for the token.

        Notes
        -----
        Input should be sanitized with `GreekTokenizer.normalize` prior to using this
        method.
        """
        features = []
        lemmata = self.lemmatizer.lookup(tokens)
        for i, l in enumerate(lemmata):
            features.append({'lemmata': [lem[0] for lem in l[1]]})
        return features