Example #1
 def test_latin_translations(self):
     """Test Synonym class lookup() function and Lemmata class isolate() method"""
     #first build the lemma list as in test_latin_lemmata()
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     test_str = 'Ceterum antequam destinata componam'
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     #now isolate the list of lemmas
     lemmas = lemmatizer.isolate(lemmas)
     translations = Synonyms(dictionary='translations', language='latin')
     translations = translations.lookup_synonyms(lemmas)
     target = [('destino', [('σκοπός', 1.0)]), ('compono', [('συντίθημι', 1.0)])]
     self.assertEqual(translations, target)
Example #2
 def test_latin_synonyms(self):
     """Test Synonym class lookup() function and Lemmata class isolate() method"""
     #first build the lemma list as in test_latin_lemmata()
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     test_str = 'Ceterum antequam destinata componam'
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     #now isolate the list of lemmas
     lemmas = lemmatizer.isolate(lemmas)
     synonyms = Synonyms(dictionary='synonyms', language='latin')
     syns = synonyms.lookup_synonyms(lemmas)
     target = [('ceterus', [('ceteroqui', 0.5), ('perquiesco', 0.5)]),
               ('compono', [('struo', 0.5), ('condo', 0.5)])]
     self.assertEqual(syns, target)
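Both snippets assume that the CLTK classes are imported at module level, which is not shown here. Below is a minimal standalone sketch of the same pipeline; the import path for Synonyms is an assumption (taken to live alongside Lemmata), while the other paths are the ones named in the FrequencyModel docstrings further down:

    from cltk.semantics.latin.lookup import Lemmata, Synonyms  # Synonyms path is an assumption
    from cltk.stem.latin.j_v import JVReplacer
    from cltk.tokenize.word import WordTokenizer

    text = 'Ceterum antequam destinata componam'
    tokens = WordTokenizer('latin').tokenize(JVReplacer().replace(text.lower()))
    lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    lemmas = lemmatizer.isolate(lemmatizer.lookup(tokens))
    # the 'translations' dictionary yields Greek glosses; 'synonyms' yields Latin synonyms
    print(Synonyms(dictionary='translations', language='latin').lookup_synonyms(lemmas))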
Example #3
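The class is listed without its module preamble. Based on the dependency notes in the read_files() docstring, it assumes roughly the following imports; the CLTK and Tesserae paths are the ones the docstring names, the rest are standard library, and this block is a reconstruction rather than the original header:

import os
import pickle
from operator import itemgetter
from os import listdir
from os.path import expanduser, isfile, join
from cltk.semantics.latin.lookup import Lemmata
from cltk.stem.latin.j_v import JVReplacer
from cltk.tokenize.word import WordTokenizer
from tesserae.utils import TessFile
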
class FrequencyModel:
    '''Generate unsupervised count of lemma frequencies in the Tesserae Latin text corpus.'''
    def __init__(self):
        self.lemmatizer = Lemmata(dictionary='lemmata', language='latin')
        self.jv = JVReplacer()
        self.word_tokenizer = WordTokenizer('latin')
        self.count_dictionary = dict()
        self.punctuation_list = [
            '!', ';', ':', '?', '-', '–', '&', '*', '(', ')', '[', ']', ',',
            '"', '\''
        ]

    def read_files(self, filepath):
        '''Reads the corpus and builds the self.count_dictionary dictionary object by calling
        the countgram() method on individual tokens.
        Dependencies
        ------------
        TessFile class from tesserae.utils
        Lemmata class from cltk.semantics.latin.lookup
        JVReplacer class from cltk.stem.latin.j_v
        WordTokenizer class from cltk.tokenize.word
        Parameters
        ----------
        filepath: a file in .tess format
        Results
        -------
        Updates self.count_dictionary
        Returns
        -------
        none'''
        tessobj = TessFile(filepath)
        for rawtoken in tessobj.read_tokens():
            cleantoken_list = self.token_cleanup(rawtoken)
            # token_cleanup() may split off enclitics or punctuation;
            # only the first resulting token is counted
            token = cleantoken_list[0]
            self.countgram(token)

    def countgram(self, targettoken):
        '''Update the frequency model with a new token from the corpus.'''
        lemmas = self.lemmatizer.lookup([targettoken])
        lemmas = self.lemmatizer.isolate(lemmas)
        for lem in lemmas:
            # start unseen lemmas at zero, then increment the running count
            self.count_dictionary[lem] = self.count_dictionary.get(lem, 0) + 1

    def lemmatize(self, target):
        '''Use the unsupervised count of lemma frequencies generated by read_files()
        to assign probabilities in the case of an ambiguous lemmatization.
        Parameters
        ----------
        target: a token to be lemmatized
        Returns
        -------
        a list of tuples of the form [(lemma, probability)]
        '''
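        # Worked example with hypothetical counts: a token that resolves to two
        # lemmas seen 30 and 10 times in training gets probabilities 0.75 and 0.25.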
        if target in self.punctuation_list:
            lemmalist = [('punc', 1)]
            return lemmalist
        if target == 'ne':
            lemmalist = [('ne', 1)]
            return lemmalist
        lemmalist = self.lemmatizer.lookup([target])
        lemmas = self.lemmatizer.isolate(lemmalist)
        if len(lemmas) > 1:
            # weight each candidate by its relative frequency in the training corpus;
            # lemmas never seen during training count as zero
            all_lemmas_total = sum(self.count_dictionary.get(l, 0) for l in lemmas)
            if all_lemmas_total == 0:
                # no candidate was observed in training: fall back to an even split
                # rather than dividing by zero
                return [(l, 1 / len(lemmas)) for l in lemmas]
            return [(l, self.count_dictionary.get(l, 0) / all_lemmas_total)
                    for l in lemmas]
        return [(lemmas[0], 1)]

    def token_cleanup(self, rawtoken):
        '''Standardize a token by replacing j with i and v with u, and
        split it into multiple tokens as needed with the tokenize() method of the word tokenizer.
        Parameters
        ----------
        rawtoken: the token as drawn from the text
        Returns
        -------
        tokenlist: a list of possible word or punctuation tokens
        '''
        rawtoken = self.jv.replace(rawtoken)
        rawtoken = rawtoken.lower()
        tokenlist = self.word_tokenizer.tokenize(rawtoken)
        #sometimes words are split into enclitics and punctuation.
        return tokenlist

    def save_pickle(self, filename):
        '''Saves the self.count_dictionary object for later reuse.
        Dependencies
        ------------
        os and pickle packages
        Parameters
        ----------
        filename: name for the pickle file'''
        relativepath = join('~', 'cltk_data', 'latin', 'model',
                            'latin_models_cltk', 'frequency')
        path = expanduser(relativepath)
        pickle_file = join(path, filename)
        if not os.path.isdir(path):
            os.makedirs(path)
        with open(pickle_file, 'wb') as outfile:
            pickle.dump(self.count_dictionary, outfile)

    def load_pickle(self, filename):
        '''Load the self.count_dictionary object saved by save_pickle().
        Dependencies
        ------------
        os and pickle packages
        Parameters
        ----------
        filename: name of the pickle file'''
        relativepath = join('~', 'cltk_data', 'latin', 'model',
                            'latin_models_cltk', 'frequency')
        path = expanduser(relativepath)
        pickle_file = join(path, filename)
        with open(pickle_file, 'rb') as infile:
            self.count_dictionary = pickle.load(infile)

    def train_model(self):
        '''Open the Tesserae corpus files and call read_files() on each to build the
        frequency model; files for the authors in the exclusion list below are skipped.'''
        relativepath = join('~', 'cltk_data', 'latin', 'text',
                            'latin_text_tesserae_collection', 'la')
        path = expanduser(relativepath)
        excluded = ('augustine', 'ambrose', 'jerome', 'tertullian', 'eugippius', 'hilary')
        onlyfiles = [f for f in listdir(path) if isfile(join(path, f))
                     and not any(author in f for author in excluded)]
        onlyfiles = [join(path, f) for f in onlyfiles]
        for filename in onlyfiles:
            if '.tess' in filename:
                print(filename)
                self.read_files(filename)

    def test_count_dictionary(self, token_list, lemma_list):
        '''Test the ability of lemmatize() (which uses the self.count_dictionary dictionary)
        to predict the most likely lemmatization in ambiguous cases. Punctuation is
        automatically counted as correct, because the 'punc' lemmatization is used
        inconsistently in the test corpus.
        Dependencies
        ------------
        itemgetter class from operator package
        Parameters
        ----------
        token_list: a list of tokens
        lemma_list: a list of corresponding 'correct' lemmatizations
        Results
        -------
        prints four numbers: the number of correctly assigned lemmas in ambiguous cases;
        the number of ambiguous cases in total; the number of tokens analyzed; and a
        decimal between 0 and 1 representing the proportion of correct lemmatizations.
        Returns
        -------
        a list object containing all incorrect lemmatizations for analysis. Format:
        [(token, answer_given, correct_answer), (token...)]

        NOTE: Initial tests show roughly 91% accuracy, identification of punctuation included.
        '''
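        # Worked example with hypothetical numbers: for 100 tokens with 20 ambiguous
        # cases, 15 of them resolved correctly, rate = (100 - 20 + 15) / 100 = 0.95.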
        trials = 0
        correct = 0
        errors = []
        for position in range(0, (len(token_list) - 1)):
            lemmalist = self.lemmatizer.lookup(token_list[position])
            lemmalist = lemmalist[1]
            lemma = max(lemmalist, key=itemgetter(1))
            if len(lemmalist) > 1:
                trials = trials + 1
                if lemma[0] == lemma_list[position] or lemma[0] == 'punc':
                    correct = correct + 1
                else:
                    errors.append(
                        (token_list[position], lemma[0], lemma_list[position]))
        print(correct)
        print(trials)
        print(len(lemma_list))
        rate = (len(lemma_list) - trials + correct) / len(lemma_list)
        print(rate)
        return errors
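A minimal usage sketch, assuming the Tesserae Latin corpus is already installed under the hard-coded ~/cltk_data path used by train_model(), and using a hypothetical pickle file name:

    model = FrequencyModel()
    model.train_model()                                   # walk the corpus, count lemma frequencies
    model.save_pickle('latin_lemma_frequencies.pickle')   # hypothetical file name

    # later: reload the counts and resolve a token's lemma probabilities
    model = FrequencyModel()
    model.load_pickle('latin_lemma_frequencies.pickle')
    print(model.lemmatize('cum'))                         # a list of (lemma, probability) tuples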