示例#1
0
 def test_json_corpus_reader(self):
     """Test the docs, paras, sents and words methods of the JSON corpus readers.

     Three Perseus files are checked: two Latin texts and one Greek text.
     The dedicated comparison assertions report the actual counts on failure.
     """
     latin_simple = get_corpus_reader(language='latin',
                                      corpus_name='latin_text_perseus')
     # This text has simple sections.
     latin_simple._fileids = ['cicero__on-behalf-of-aulus-caecina__latin.json']
     self.assertGreaterEqual(len(list(latin_simple.paras())), 1)
     self.assertGreater(len(list(latin_simple.sents())), 400)
     self.assertGreater(len(list(latin_simple.words())), 12200)

     latin_nested = get_corpus_reader(language='latin',
                                      corpus_name='latin_text_perseus')
     # This example has subsections.
     latin_nested._fileids = [
         'ausonius-decimus-magnus__eclogarum-liber__latin.json'
     ]
     self.assertEqual(len(list(latin_nested.docs())), 1)
     self.assertGreaterEqual(len(list(latin_nested.paras())), 1)
     self.assertGreater(len(list(latin_nested.sents())), 50)
     self.assertGreater(len(list(latin_nested.words())), 2750)

     greek = get_corpus_reader(corpus_name='greek_text_perseus',
                               language='greek')
     greek._fileids = ['plato__apology__grc.json']
     self.assertEqual(len(list(greek.docs())), 1)
     self.assertGreater(len(list(greek.paras())), 1)
     self.assertGreater(len(list(greek.sents())), 260)
     self.assertGreater(len(list(greek.words())), 9800)
示例#2
0
 def setUpClass(cls):
     """Import the Latin Library corpus and create the readers used by the tests.

     Raises:
         Exception: if importing the test corpus fails; the underlying error
             is chained as the cause.
     """
     try:
         corpus_importer = CorpusImporter('latin')
         corpus_importer.import_corpus('latin_text_latin_library')
     except Exception as err:
         # Chain the cause so the traceback shows why the import failed;
         # KeyboardInterrupt/SystemExit are no longer intercepted.
         raise Exception('Failure to download test corpus') from err
     cls.reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
     cls.reader._fileids = ['pervig.txt']
     # Additional instances are needed because the tests change reader internals. TODO: fix
     cls.reader_2 = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
     cls.reader_3 = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
     cls.reader_4 = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
示例#3
0
 def test_json_corpus_reader(self):
     """Test the docs, paras, sents and words methods of the JSON corpus reader.

     Two Latin Perseus files are checked. The dedicated comparison assertions
     report the actual counts on failure.
     """
     simple_reader = get_corpus_reader(language="latin", corpus_name="latin_text_perseus")
     # This text has simple sections.
     simple_reader._fileids = ["cicero__on-behalf-of-aulus-caecina__latin.json"]
     self.assertGreaterEqual(len(list(simple_reader.paras())), 1)
     self.assertGreater(len(list(simple_reader.sents())), 400)
     self.assertGreater(len(list(simple_reader.words())), 12000)

     nested_reader = get_corpus_reader(language="latin", corpus_name="latin_text_perseus")
     # This example has subsections.
     nested_reader._fileids = ["ausonius-decimus-magnus__eclogarum-liber__latin.json"]
     self.assertEqual(len(list(nested_reader.docs())), 1)
     self.assertGreaterEqual(len(list(nested_reader.paras())), 1)
     self.assertGreater(len(list(nested_reader.sents())), 50)
     self.assertGreater(len(list(nested_reader.words())), 2750)
示例#4
0
 def test_json_corpus_reader(self):
     """Test the docs, paras, sents and words methods of the JSON corpus reader.

     Two Latin Perseus files are checked. The dedicated comparison assertions
     report the actual counts on failure.
     """
     simple_reader = get_corpus_reader(language='latin', corpus_name='latin_text_perseus')
     # This text has simple sections.
     simple_reader._fileids = ['cicero__on-behalf-of-aulus-caecina__latin.json']
     self.assertGreaterEqual(len(list(simple_reader.paras())), 1)
     self.assertGreater(len(list(simple_reader.sents())), 400)
     self.assertGreater(len(list(simple_reader.words())), 12200)

     nested_reader = get_corpus_reader(language='latin', corpus_name='latin_text_perseus')
     # This example has subsections.
     nested_reader._fileids = ['ausonius-decimus-magnus__eclogarum-liber__latin.json']
     self.assertEqual(len(list(nested_reader.docs())), 1)
     self.assertGreaterEqual(len(list(nested_reader.paras())), 1)
     self.assertGreater(len(list(nested_reader.sents())), 50)
     self.assertGreater(len(list(nested_reader.words())), 2750)
示例#5
0
 def setUpClass(cls):
     """Import the Latin Library corpus and create the readers used by the tests.

     Raises:
         Exception: if importing the test corpus fails; the underlying error
             is chained as the cause.
     """
     try:
         corpus_importer = CorpusImporter('latin')
         corpus_importer.import_corpus('latin_text_latin_library')
     except Exception as err:
         # Chain the cause so the traceback shows why the import failed;
         # KeyboardInterrupt/SystemExit are no longer intercepted.
         raise Exception('Failure to download test corpus') from err
     cls.reader = get_corpus_reader(language='latin',
                                    corpus_name='latin_text_latin_library')
     cls.reader._fileids = ['pervig.txt']
     # Additional instances are needed because the tests change reader internals. TODO: fix
     cls.reader_2 = get_corpus_reader(
         language='latin', corpus_name='latin_text_latin_library')
     cls.reader_3 = get_corpus_reader(
         language='latin', corpus_name='latin_text_latin_library')
     cls.reader_4 = get_corpus_reader(
         language='latin', corpus_name='latin_text_latin_library')
 def __init__(self):
     """Create the Greek tokenizers, corpus reader, lemmatizer and vectorizer."""
     # Tokenizers: one for sentences, one for Greek words.
     self.sent_tokenizer = SentenceTokenizer()
     self.word_tokenizer = WordTokenizer('greek')
     # Reader over the Perseus Greek texts.
     perseus_reader = get_corpus_reader(language='greek',
                                        corpus_name='greek_text_perseus')
     self.corpus_reader = perseus_reader
     self.lemmatizer = LemmaReplacer('greek')
     # The vectorizer is configured to receive file names as input.
     self.tfidf_vectorizer = TfidfVectorizer(input="filename")
示例#7
0
 def test_import_latin_library_corpus_filter_by_file_and_dir(self):
     """Test the Latin Library corpus reader filter by files and directories."""
     reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
     # Only the filtered reader is inspected; the other two results are unused here.
     filtered_reader, _files_found, _dirs_found = assemble_corpus(
         reader, ['old'], corpus_directories_by_type, corpus_texts_by_type)
     self.assertGreater(len(list(filtered_reader.fileids())), 0)
示例#8
0
 def test_import_latin_library_corpus_reader(self):
     """Test the Latin Library corpus reader."""
     corpus_importer = CorpusImporter('latin')
     corpus_importer.import_corpus('latin_text_latin_library')
     reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
     all_file_ids = list(reader.fileids())
     # assertGreater reports the actual file count if the check fails.
     self.assertGreater(len(all_file_ids), 2100)
示例#9
0
 def test_import_latin_library_corpus_reader(self):
     """Test the Latin Library corpus reader."""
     corpus_importer = CorpusImporter('latin')
     corpus_importer.import_corpus('latin_text_latin_library')
     reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
     all_file_ids = list(reader.fileids())
     # assertGreater reports the actual file count if the check fails.
     self.assertGreater(len(all_file_ids), 2100)
示例#10
0
 def setUpClass(self):
     """Import the Latin models and Latin Library corpus, then create the test readers.

     Raises:
         Exception: if importing either corpus fails; the underlying error
             is chained as the cause.
     """
     try:
         corpus_importer = CorpusImporter("latin")
         corpus_importer.import_corpus("latin_models_cltk")
         corpus_importer.import_corpus("latin_text_latin_library")
     except Exception as err:
         # Chain the cause so the traceback shows why the import failed;
         # KeyboardInterrupt/SystemExit are no longer intercepted.
         raise Exception("Failure to download test corpus") from err
     self.reader = get_corpus_reader(language="latin",
                                     corpus_name="latin_text_latin_library")
     self.reader._fileids = ["pervig.txt"]
     # Additional instances are needed because the tests change reader internals. TODO: fix
     self.reader_2 = get_corpus_reader(
         language="latin", corpus_name="latin_text_latin_library")
     self.reader_3 = get_corpus_reader(
         language="latin", corpus_name="latin_text_latin_library")
     self.reader_4 = get_corpus_reader(
         language="latin", corpus_name="latin_text_latin_library")
示例#11
0
 def test_import_latin_library_corpus_filter_by_file(self):
     """Test the Latin Library corpus reader filter by files."""
     reader = get_corpus_reader(language='latin',
                                corpus_name='latin_text_latin_library')
     filtered_reader = assemble_corpus(reader,
                                       types_requested=['old'],
                                       type_files=corpus_texts_by_type)
     # assertGreater reports the actual file count if the check fails.
     self.assertGreater(len(list(filtered_reader.fileids())), 0)
示例#12
0
def main():
    """Search the Iliad and the Odyssey for every occurrence of a lemma.

    Reads one inflected word from the command line, lemmatizes it, and scans
    books 1-24 of both works in the ``greek_text_tesserae`` corpus. Each hit
    is printed with its work, book, sentence and word position, a
    part-of-speech description and a few words of context. A count per matched
    surface form is printed at the end.

    When no word is supplied, a usage message is printed and the program
    exits via ``sys.exit()``.
    """
    if len(sys.argv) < 2:
        print(
            "Please supply an inflected word on the command line. Example: search_by_lemma.py κύνεσσιν\n"
        )
        sys.exit()
    infl = sys.argv[1]
    lem = lemmatize(infl)[0]  # lemma of the word given on the command line
    print("searching for " + lem + " <- " + infl)
    index = {}  # matched surface form -> number of occurrences
    for work in ["iliad", "odyssey"]:
        for book in range(1, 24 + 1):  # books are numbered 1 to 24
            filename = 'texts/homer.' + work + '.part.' + str(book) + '.tess'
            reader = get_corpus_reader(corpus_name='greek_text_tesserae',
                                       language='greek')
            reader._fileids = [filename]
            sentences = [cltk_normalize(s) for s in reader.sents([filename])]
            for count_sentences, s in enumerate(sentences, start=1):
                # Remove punctuation, which the lemmatizer treats as independent words.
                no_punct = re.sub(r"[,;:\.']", '', s)
                words = re.split(r"\s+", no_punct)
                for count_words, word in enumerate(lemmatize(no_punct),
                                                   start=1):
                    if lem != word:
                        continue
                    i = count_words - 1
                    w = words[i]
                    # Up to three words on either side of the match. The slice
                    # end is exclusive and clamped by Python, so the final word
                    # of the sentence is kept.
                    context = " ".join(words[max(i - 3, 0):i + 4])
                    # Tag the words in the sentence with parts of speech, see
                    # https://github.com/cltk/tutorials/blob/master/8%20Part-of-speech%20tagging.ipynb
                    # For what the POS tags mean, see
                    # https://linguistics.stackexchange.com/questions/12803/what-do-the-labels-mean-in-this-latin-pos-tagging
                    pos_tagged = tagger.tag_tnt(no_punct)
                    describe = w
                    for t in pos_tagged:
                        if t[0] == w:
                            describe = t[0] + " " + pos_tag_to_description(
                                t[1])
                            break
                    print(work + " " + str(book) + ", sentence " +
                          str(count_sentences) + ", word " +
                          str(count_words) + ": " + describe + "    " +
                          context)
                    index[w] = index.get(w, 0) + 1
    for w in sorted(index):
        print(str(index[w]) + " " + w)
示例#13
0
def getWordList(selectedWork):
    """Return the words of one work from the Perseus Greek corpus.

    Args:
        selectedWork: file id of the work within ``greek_text_perseus``.

    Returns:
        list: the items produced by the reader's ``words()`` for that file.
    """
    reader = get_corpus_reader(corpus_name='greek_text_perseus',
                               language='greek')

    # Set the file id before reading. The previous version first materialized
    # every document of the corpus into a list that was never used.
    reader._fileids = [selectedWork]

    return list(reader.words())
示例#14
0
 def test_filtered_corpus_reader_docs(self):
     """Test filtered corpus docs method."""
     reader = get_corpus_reader(language='latin', corpus_name='latin_text_latin_library')
     reader._fileids = ['catullus.txt']
     docs = list(reader.docs())
     words = distinct_words(docs)
     # Neither of these words may appear in the reader's output.
     self.assertNotIn('Latin', words, 'Filtered word present!')
     self.assertNotIn('Library', words, 'Filtered word present!')
     self.assertGreater(len(docs), 0)
示例#15
0
 def test_filtered_corpus_reader_paras(self):
     """Test filtered corpus paras method."""
     reader = get_corpus_reader(language='latin',
                                corpus_name='latin_text_latin_library')
     reader._fileids = ['catullus.txt']
     paras = list(reader.paras())
     # Flatten the paragraphs into one list of sentences.
     sents = [sent for para in paras for sent in para]
     uniq_words = distinct_words(sents)
     # Neither of these words may appear in the reader's output.
     self.assertNotIn('Latin', uniq_words, 'Filtered word present!')
     self.assertNotIn('Library', uniq_words, 'Filtered word present!')
     self.assertGreater(len(paras), 0)
示例#16
0
 def test_tesserae_corpus_reader(self):
     """Test Tesserae corpus methods on the first file of the Greek corpus."""
     # Update when the corpus is added to CLTK.
     reader = get_corpus_reader(language="greek", corpus_name="greek_text_tesserae")
     sample = reader.fileids()[0]
     # Every accessor must yield at least one item for the sample file.
     self.assertGreaterEqual(len(list(reader.docs(sample))), 1)
     self.assertGreaterEqual(len(list(reader.texts(sample))), 1)
     self.assertGreaterEqual(len(list(reader.paras(sample))), 1)
     self.assertGreaterEqual(len(list(reader.sents(sample))), 1)
     self.assertGreaterEqual(len(list(reader.words(sample))), 1)
     self.assertGreaterEqual(len(list(reader.lines(sample))), 1)
     self.assertTrue(reader.describe())
     self.assertGreaterEqual(len(list(reader.pos_tokenize(sample))), 1)
示例#17
0
 def test_tesserae_corpus_reader(self):
     """Test Tesserae corpus methods on the first file of the Greek corpus."""
     # Update when the corpus is added to CLTK.
     reader = get_corpus_reader(language='greek', corpus_name='greek_text_tesserae')
     sample = reader.fileids()[0]
     # Every accessor must yield at least one item for the sample file.
     self.assertGreaterEqual(len(list(reader.docs(sample))), 1)
     self.assertGreaterEqual(len(list(reader.texts(sample))), 1)
     self.assertGreaterEqual(len(list(reader.paras(sample))), 1)
     self.assertGreaterEqual(len(list(reader.sents(sample))), 1)
     self.assertGreaterEqual(len(list(reader.words(sample))), 1)
     self.assertGreaterEqual(len(list(reader.lines(sample))), 1)
     self.assertTrue(reader.describe())
     self.assertGreaterEqual(len(list(reader.pos_tokenize(sample))), 1)
示例#18
0
    def choose_corpus(
        self,
        corpus_name: str = ''
    ) -> None:
        """Select a Latin corpus and record its file ids as the catalog.

        Stores a corpus reader for ``corpus_name`` on ``self.reader``, the list
        of that reader's file ids on ``self.catalog``, and the name itself on
        ``self.corpus_name``. Nothing is returned.

        :Param corpus_name: either 'latin_text_latin_library' or 'latin_text_perseus'.
        These are listed in the corpus_names attribute.
        """

        self.reader = get_corpus_reader(language='latin',
                                        corpus_name=corpus_name)
        self.catalog = list(self.reader.fileids())
        self.corpus_name = corpus_name
示例#19
0
def extract(name):
    """Collect cleaned sentences from one of the supported Latin corpora.

    Args:
        name: corpus name; one of "latin_text_perseus", "latin_text_tesserae"
            or "latin_text_latin_library".

    Returns:
        list: cleaned sentences that have at least five whitespace-separated
        tokens and contain no "�" character.

    Raises:
        ValueError: if ``name`` is not one of the supported corpus names.
    """
    reader = get_corpus_reader(language="latin", corpus_name=name)
    if name == "latin_text_perseus":
        sentences = reader.sents()
    elif name == "latin_text_tesserae":
        sentences = reader.sents(fileids=reader.fileids())
    elif name == "latin_text_latin_library":
        # These sentences are sequences of tokens, so join them into strings.
        sentences = (" ".join(sentence) for sentence in reader.sents())
    else:
        # Previously this fell through and failed later on an unbound variable.
        raise ValueError("Unsupported corpus name: " + repr(name))

    lines = []
    for sentence in tqdm(sentences):
        try:
            cleaned_sentence = preprocess(preprocess_like_evalatin(sentence))
            cleaned_sentence = re.sub(r"\s+", " ", cleaned_sentence).strip()
        except Exception:
            # Best effort: skip sentences that cannot be cleaned, but let
            # KeyboardInterrupt/SystemExit through.
            continue
        if len(cleaned_sentence.split()) >= 5 and "�" not in cleaned_sentence:
            lines.append(cleaned_sentence)
    return lines
示例#20
0
 def test_filtered_corpus_reader_docs(self):
     """Test filtered corpus docs method, including a list of problem files."""
     reader = get_corpus_reader(language='latin',
                                corpus_name='latin_text_latin_library')
     reader._fileids = ['catullus.txt']
     docs = list(reader.docs())
     words = distinct_words(docs)
     # Neither of these words may appear in the reader's output.
     self.assertNotIn('Latin', words, 'Filtered word present!')
     self.assertNotIn('Library', words, 'Filtered word present!')
     self.assertGreater(len(docs), 0)
     problem_files = [
         'caesar/bc3.txt', 'hymni.txt', 'varro.frag.txt', 'varro.ll10.txt',
         'varro.ll5.txt', 'varro.ll6.txt', 'varro.ll7.txt', 'varro.ll8.txt',
         'varro.ll9.txt'
     ]
     for filename in problem_files:
         doc = list(reader.docs([filename]))
         # TestCase assertions are used instead of bare ``assert`` so the
         # checks still run under ``python -O`` and name the failing file.
         self.assertTrue(doc, filename)
         self.assertGreater(len(doc[0]), 100, filename)
示例#21
0
from cltk.corpus.utils.importer import CorpusImporter
from cltk.corpus.readers import get_corpus_reader
from cltk.stem.lemma import LemmaReplacer
from cltk.corpus.utils.formatter import cltk_normalize
from cltk.lemmatize.greek.backoff import BackoffGreekLemmatizer
from cltk.phonology.greek.transcription import Transcriber
from cltk.tag.pos import POSTag
from cltk.tag import ner

# Import the CLTK Greek models corpus.
corpus_importer = CorpusImporter('greek')
corpus_importer.import_corpus('greek_models_cltk')

# Import the Perseus Greek text corpus using a second importer instance.
corpus_importer2 = CorpusImporter('greek')
corpus_importer2.import_corpus('greek_text_perseus')

# Reader over the Perseus Greek text corpus.
philippians_reader = get_corpus_reader(corpus_name="greek_text_perseus",
                                       language="greek")

# Set the reader's file id list to the Letter to the Philippians file.
philippians_reader._fileids = [
    'new-testament__letter-to-the-philippians__grc.json'
]

# Debugging aid: print(list(philippians_reader.sents()))

# Take the first sentence, normalize it, and lemmatize it.
sentences = list(philippians_reader.sents())
sentence = cltk_normalize(sentences[0])
lemmatizer = LemmaReplacer('greek')
word_list = lemmatizer.lemmatize(sentence)

# Greek part-of-speech tagger.
tagger = POSTag('greek')

# Tag the same normalized sentence with the tagger's 1-2-3-gram backoff method.
parts_of_speech = tagger.tag_ngram_123_backoff(sentence)
示例#22
0
 def __init__(self, corpus_name):
     """Store the corpus name and catalog, and open a Latin reader for the corpus.

     :param corpus_name: name of the Latin corpus passed to ``get_corpus_reader``.
     """
     self.corpus_name = corpus_name
     # NOTE(review): ``catalog`` is not defined in this method; presumably a
     # module-level name -- confirm.
     self.catalog = catalog
     latin_reader = get_corpus_reader(corpus_name=corpus_name,
                                      language='latin')
     self.reader = latin_reader
示例#23
0
 def test_json_corpus_reader_sizes(self):
     """Test the sizes method of the JSON corpus reader."""
     reader = get_corpus_reader(language='latin',
                                corpus_name='latin_text_perseus')
     # assertGreater reports the actual number of sizes if the check fails.
     self.assertGreater(len(list(reader.sizes())), 290)
示例#24
0
 def test_json_corpus_reader_sizes(self):
     """Test the sizes method of the JSON corpus reader."""
     reader = get_corpus_reader(language='latin', corpus_name='latin_text_perseus')
     # assertGreater reports the actual number of sizes if the check fails.
     self.assertGreater(len(list(reader.sizes())), 290)
示例#25
0
 def test_filtered_corpus_reader_sizes(self):
     """Test filtered corpus sizes method."""
     reader = get_corpus_reader(language='latin',
                                corpus_name='latin_text_latin_library')
     reader._fileids = ['catullus.txt']
     # assertGreater reports the actual number of sizes if the check fails.
     self.assertGreater(len(list(reader.sizes())), 0)