def _install(lang, lst):
    """Import every corpus in ``lst`` whose ``'location'`` is ``'remote'``.

    ``lang`` is passed to ``CorpusImporter``; each entry of ``lst`` is
    indexed by ``'location'`` and ``'name'``. Progress is printed to stdout.
    """
    print("Downloading %s " % lang)
    importer = CorpusImporter(lang)
    remote_names = (
        entry['name'] for entry in lst if entry['location'] == 'remote'
    )
    for corpus_name in remote_names:
        print(" Downloading %s " % corpus_name)
        importer.import_corpus(corpus_name)
def get_src_header_and_transcriber(source: str) -> Tuple[str, G2P_func]:
    """Pick the output CSV column header and the transcription function for ``source``.

    For ``'lat'`` the header is ``'Latin'`` and the function wraps a
    ``Transcriber(dialect="Classical", reconstruction="Allen")``. Any other
    value yields ``'Proto-Germanic'`` together with ``PGmc_ipa_trans``.

    If building the transcriber raises ``FileNotFoundError``, the
    ``latin_models_cltk`` corpus is imported and construction is retried once.
    """
    if source != 'lat':
        return 'Proto-Germanic', PGmc_ipa_trans

    def make_transcriber():
        return Transcriber(dialect="Classical", reconstruction="Allen")

    try:
        latin_transcriber = make_transcriber()
    except FileNotFoundError:
        print(
            "Did not have the corpus `latin_models_cltk`, downloading it now"
        )
        from cltk.corpus.utils.importer import CorpusImporter
        importer = CorpusImporter('latin')
        importer.import_corpus('latin_models_cltk')
        latin_transcriber = make_transcriber()

    @lru_cache(maxsize=None)
    def src_func(token):
        # Fall back to non-syllabified transcription when the default call
        # raises IndexError.
        try:
            transcribed = latin_transcriber.transcribe(token)
        except IndexError:
            transcribed = latin_transcriber.transcribe(token, syllabify=False)
        transcribed = transcribed.strip('[]')
        # Some weird cases of failed macronization: rewrite "<char>_" as
        # "<char>ː".
        return re.sub(r'(.)_', r'\1ː', transcribed)

    return 'Latin', src_func
def setUp(self):
    """Import the Old English models and assert their README is present on disk."""
    importer = CorpusImporter("old_english")
    importer.import_corpus("old_english_models_cltk")
    readme_path = os.path.expanduser(
        '~/cltk_data/old_english/model/old_english_models_cltk/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def test_import_latin_library_corpus_reader(self):
    """Import the Latin Library corpus and check its reader lists more than 2100 file ids."""
    importer = CorpusImporter('latin')
    importer.import_corpus('latin_text_latin_library')
    latin_reader = get_corpus_reader(
        language='latin', corpus_name='latin_text_latin_library')
    file_count = len(list(latin_reader.fileids()))
    self.assertTrue(file_count > 2100)
def _install(lang, lst):
    """Download the remote corpora described by ``lst`` for language ``lang``.

    Entries whose ``'location'`` is not ``'remote'`` are skipped. A line is
    printed for the language and for each corpus that gets imported.
    """
    print("Downloading %s " % lang)
    downloader = CorpusImporter(lang)
    for entry in lst:
        if entry['location'] != 'remote':
            continue
        corpus_name = entry['name']
        print(" Downloading %s " % corpus_name)
        downloader.import_corpus(corpus_name)
def test_import_nonexistant_corpus(self):
    """Test that importing a corpus name unknown for Greek raises ``CorpusImportError``.

    The importer is built outside the ``assertRaises`` block so that only the
    ``import_corpus`` call can satisfy the assertion; an error raised while
    constructing the importer now surfaces as a test error instead of a pass.
    """
    corpus_importer = CorpusImporter('greek')
    with self.assertRaises(CorpusImportError):
        corpus_importer.import_corpus('euclids_book_of_recipes')
def setUpClass(self):
    """Import the Greek and Latin CLTK models and store one sample passage per language.

    The passages are set as ``greek_text`` and ``latin_text`` on the object
    received as ``self``.
    """
    required_models = (
        ('greek', 'greek_models_cltk'),
        ('latin', 'latin_models_cltk'),
    )
    for language, corpus_name in required_models:
        importer = CorpusImporter(language)
        importer.import_corpus(corpus_name)
    self.greek_text = """ὅλως δ’ ἀντεχόμενοί τινες, ὡς οἴονται, δικαίου τινός (ὁ γὰρ νόμος δίκαιόν τἰ τὴν κατὰ πόλεμον δουλείαν τιθέασι δικαίαν, ἅμα δ’ οὔ φασιν· τήν τε γὰρ ἀρχὴν ἐνδέχεται μὴ δικαίαν εἶναι τῶν πολέμων, καὶ τὸν ἀνάξιον δουλεύειν οὐδαμῶς ἂν φαίη τις δοῦλον εἶναι· εἰ δὲ μή, συμβήσεται τοὺς εὐγενεστάτους εἶναι δοκοῦντας δούλους εἶναι καὶ ἐκ δούλων, ἐὰν συμβῇ πραθῆναι ληφθέντας."""  # pylint: disable=line-too-long
    self.latin_text = "O di inmortales! ubinam gentium sumus? in qua urbe vivimus? quam rem publicam habemus? Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent! Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero! Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem."  # pylint: disable=line-too-long
def test_import_latin_library_corpus_reader(self):
    """Check that the Latin Library corpus reader exposes more than 2100 file ids.

    The corpus is imported first so the reader has data to list.
    """
    latin_importer = CorpusImporter('latin')
    latin_importer.import_corpus('latin_text_latin_library')
    reader = get_corpus_reader(language='latin',
                               corpus_name='latin_text_latin_library')
    fileids = list(reader.fileids())
    has_enough_files = len(fileids) > 2100
    self.assertTrue(has_enough_files)
def setUp(self):
    """Import ``french_data_cltk`` and assert its README exists under ``~/cltk_data``."""
    importer = CorpusImporter('french')
    importer.import_corpus('french_data_cltk')
    readme_path = os.path.expanduser(
        '~/cltk_data/french/text/french_data_cltk/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def setUp(self):
    """Import the Old Norse models and assert their README exists in the CLTK data dir.

    The expected path is built from ``get_cltk_data_dir()`` rather than a
    hard-coded ``~/cltk_data`` prefix.
    """
    importer = CorpusImporter("old_norse")
    importer.import_corpus("old_norse_models_cltk")
    readme_path = os.path.expanduser(
        get_cltk_data_dir() + '/old_norse/model/old_norse_models_cltk/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def setUpClass(self):
    """Fetch the Greek and Latin models, then attach sample texts for the tests.

    ``greek_text`` and ``latin_text`` are assigned on the object received as
    ``self``.
    """
    greek_importer = CorpusImporter('greek')
    greek_importer.import_corpus('greek_models_cltk')
    latin_importer = CorpusImporter('latin')
    latin_importer.import_corpus('latin_models_cltk')
    self.greek_text = """ὅλως δ’ ἀντεχόμενοί τινες, ὡς οἴονται, δικαίου τινός (ὁ γὰρ νόμος δίκαιόν τἰ τὴν κατὰ πόλεμον δουλείαν τιθέασι δικαίαν, ἅμα δ’ οὔ φασιν· τήν τε γὰρ ἀρχὴν ἐνδέχεται μὴ δικαίαν εἶναι τῶν πολέμων, καὶ τὸν ἀνάξιον δουλεύειν οὐδαμῶς ἂν φαίη τις δοῦλον εἶναι· εἰ δὲ μή, συμβήσεται τοὺς εὐγενεστάτους εἶναι δοκοῦντας δούλους εἶναι καὶ ἐκ δούλων, ἐὰν συμβῇ πραθῆναι ληφθέντας."""  # pylint: disable=line-too-long
    self.latin_text = "O di inmortales! ubinam gentium sumus? in qua urbe vivimus? quam rem publicam habemus? Hic, hic sunt in nostro numero, patres conscripti, in hoc orbis terrae sanctissimo gravissimoque consilio, qui de nostro omnium interitu, qui de huius urbis atque adeo de orbis terrarum exitio cogitent! Hos ego video consul et de re publica sententiam rogo et, quos ferro trucidari oportebat, eos nondum voce volnero! Fuisti igitur apud Laecam illa nocte, Catilina, distribuisti partes Italiae, statuisti, quo quemque proficisci placeret, delegisti, quos Romae relinqueres, quos tecum educeres, discripsisti urbis partes ad incendia, confirmasti te ipsum iam esse exiturum, dixisti paulum tibi esse etiam nunc morae, quod ego viverem."  # pylint: disable=line-too-long
def _is_cloned_get_make(self):
    """Ensure Lapos is present locally, importing and building it when it is not.

    Returns ``True`` once the Lapos README exists. If it is missing, the
    ``lapos`` corpus is imported on the ``apple`` branch and ``self.make()``
    is run.

    Raises:
        OSError: when ``self.operating_system`` is anything other than ``'mac'``.
        CorpusImportError: when the README is still missing after the import.

    TODO: Add check for Windows and Linux as they are added.
    TODO: This could be 3 functions.
    """
    if self.operating_system != 'mac':
        # branch = 'master'
        raise OSError(
            'Lapos for Linux/Windows not currently available through CLTK. Please file issue if you can fix it.'
        )
    branch = 'apple'

    readme_path = os.path.expanduser(
        '~/cltk_data/multilingual/software/lapos/README.md')
    if os.path.isfile(readme_path):
        return True

    lapos_importer = CorpusImporter('multilingual')
    lapos_importer.import_corpus('lapos', branch=branch)
    if not os.path.isfile(readme_path):
        logger.error(
            "Something went wrong with importing the Lapos tagger on the '{}' branch."
            .format(branch))
        raise CorpusImportError

    print('Cloned Lapos successfully.')
    self.make()
    return True
def test_import_nonexistant_corpus(self):
    """Test that ``import_corpus`` raises ``CorpusImportError`` for an unknown Greek corpus.

    Only the ``import_corpus`` call is placed inside ``assertRaises``. Building
    the importer beforehand means a failure in the constructor cannot be
    mistaken for the expected error.
    """
    greek_importer = CorpusImporter('greek')
    with self.assertRaises(CorpusImportError):
        greek_importer.import_corpus('euclids_book_of_recipes')
def setup(self):
    """Download CLTK packages and trainer corpora.

    Launches the CLTK package download interface. Overridden by the CLTK
    child classes to launch the automated CLTK downloader. Convenience
    method if user has not already downloaded CLTK packages and trainer
    sets.

    The language is read from ``self.options['language']``; ``'ancient
    greek'`` is mapped to ``'greek'`` before it is handed to CLTK. Every
    corpus in the importer's ``list_corpora`` is attempted, and a corpus
    whose import raises is reported on stdout and skipped.

    Returns:
        ``True``, always.

    Example:
        >>> LatinText('').setup()
    """
    import importlib.util
    import subprocess
    import sys

    # Install cltk on demand. ``importlib.find_loader`` was removed in
    # Python 3.12 and ``pip.main`` is not a public pip API, so probe with
    # ``find_spec`` and run pip as a subprocess of this interpreter.
    if importlib.util.find_spec('cltk') is None:
        subprocess.call([sys.executable, '-m', 'pip', 'install', 'cltk'])
        # Let the running interpreter see the freshly installed package.
        importlib.invalidate_caches()

    # include cltk inline
    from cltk.corpus.utils.importer import CorpusImporter

    setup_language = self.options['language']
    # for ancient greek, change to 'greek' for purposes of cltk setup
    if setup_language == 'ancient greek':
        setup_language = 'greek'
    corpus_importer = CorpusImporter(setup_language)

    # loop through, attempt to download, skip any errors
    for cltk_corpus in corpus_importer.list_corpora:
        print('Downloading', cltk_corpus)
        try:
            corpus_importer.import_corpus(cltk_corpus)
        except Exception:
            # Best effort: keep going, but no longer swallow
            # KeyboardInterrupt/SystemExit as the bare ``except`` did.
            print('Problem downloading', cltk_corpus, '(skipping)')
    return True
def test_import_greek_software_tlgu(self):
    """Import ``greek_software_tlgu`` (TLGU) and assert its README exists on disk."""
    importer = CorpusImporter('greek')
    importer.import_corpus('greek_software_tlgu')
    readme_path = os.path.expanduser(
        '~/cltk_data/greek/software/greek_software_tlgu/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def test_import_latin_text_antique_digiliblt(self):
    """Import the digilibLT Antique Latin corpus and assert its README exists on disk."""
    importer = CorpusImporter('latin')
    importer.import_corpus('latin_text_antique_digiliblt')
    readme_path = os.path.expanduser(
        '~/cltk_data/latin/text/latin_text_antique_digiliblt/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def test_import_latin_models_cltk(self):
    """Import the CLTK Latin models and assert their README exists on disk."""
    importer = CorpusImporter('latin')
    importer.import_corpus('latin_models_cltk')
    readme_path = os.path.expanduser(
        '~/cltk_data/latin/model/latin_models_cltk/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def test_import_greek_text_perseus(self):
    """Import the Perseus Greek text corpus and assert its README exists on disk."""
    importer = CorpusImporter('greek')
    importer.import_corpus('greek_text_perseus')
    readme_path = os.path.expanduser(
        '~/cltk_data/greek/text/greek_text_perseus/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def test_import_lat_pos_lemma_cltk(self):
    """Import the CLTK Latin POS lemmata dict and assert its README exists on disk."""
    importer = CorpusImporter('latin')
    importer.import_corpus('latin_pos_lemmata_cltk')
    readme_path = os.path.expanduser(
        '~/cltk_data/latin/lemma/latin_pos_lemmata_cltk/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def test_import_latin_text_antique_digiliblt(self):
    """Check that importing ``latin_text_antique_digiliblt`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/latin/text/latin_text_antique_digiliblt/README.md')
    latin_importer = CorpusImporter('latin')
    latin_importer.import_corpus('latin_text_antique_digiliblt')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_git_import_chinese_cbeta_txt(self):
    """Import the plaintext CBETA corpus and assert its README exists on disk."""
    importer = CorpusImporter('chinese')
    importer.import_corpus('chinese_text_cbeta_txt')
    readme_path = os.path.expanduser(
        '~/cltk_data/chinese/text/chinese_text_cbeta_txt/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def test_git_import_tib_lexica_tdc(self):
    """Import the Tibetan dictionary corpus and assert its README exists on disk."""
    importer = CorpusImporter('tibetan')
    importer.import_corpus('tibetan_lexica_tdc')
    readme_path = os.path.expanduser(
        '~/cltk_data/tibetan/lexicon/tibetan_lexica_tdc/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def test_git_import_tib_pos_tdc(self):
    """Import the Tibetan POS files and assert their README exists on disk."""
    importer = CorpusImporter('tibetan')
    importer.import_corpus('tibetan_pos_tdc')
    readme_path = os.path.expanduser(
        '~/cltk_data/tibetan/pos/tibetan_pos_tdc/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def test_git_import_copt_script(self):
    """Import the Coptic Scriptorium corpus and assert its README exists on disk."""
    importer = CorpusImporter('coptic')
    importer.import_corpus('coptic_text_scriptorium')
    readme_path = os.path.expanduser(
        '~/cltk_data/coptic/text/coptic_text_scriptorium/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def test_import_la_treebank_pers(self):
    """Import the Perseus Latin treebank and assert its README exists on disk."""
    importer = CorpusImporter('latin')
    importer.import_corpus('latin_treebank_perseus')
    readme_path = os.path.expanduser(
        '~/cltk_data/latin/treebank/latin_treebank_perseus/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def setUpClass(self):
    """Import the Sanskrit and Greek CLTK models needed by the tests.

    Raises:
        Exception: with the message ``'Failure to download test corpus'`` if
            any step fails; the underlying error is attached as the cause.
    """
    try:
        corpus_importer = CorpusImporter('sanskrit')
        corpus_importer.import_corpus('sanskrit_models_cltk')
        corpus_importer = CorpusImporter('greek')
        corpus_importer.import_corpus('greek_models_cltk')
    except Exception as exc:
        # ``except Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate; ``from exc`` keeps the real failure visible.
        raise Exception('Failure to download test corpus') from exc
def test_import_la_text_lac_curt(self):
    """Import the Lacus Curtius Latin text corpus and assert its README exists on disk."""
    importer = CorpusImporter('latin')
    importer.import_corpus('latin_text_lacus_curtius')
    readme_path = os.path.expanduser(
        '~/cltk_data/latin/text/latin_text_lacus_curtius/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def setup():
    """Import every corpus listed for Latin except ``phi5`` and ``phi7``.

    The two names are removed from the importer's ``list_corpora`` result
    before the remaining corpora are imported one by one.
    """
    latin_importer = CorpusImporter('latin')
    available = latin_importer.list_corpora
    for excluded in ('phi5', 'phi7'):
        available.remove(excluded)
    for corpus_name in available:
        latin_importer.import_corpus(corpus_name)
def test_import_proper_names_greek(self):
    """Import the Greek proper names corpus and assert its README exists on disk."""
    importer = CorpusImporter('greek')
    importer.import_corpus('greek_proper_names_cltk')
    readme_path = os.path.expanduser(
        '~/cltk_data/greek/lexicon/greek_proper_names_cltk/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def pos_tagger_example_latin():
    """Import the Latin models, tag a sample sentence and print the result.

    Tagging is done with ``POSTag('latin').tag_ngram_123_backoff``.
    """
    importer = CorpusImporter('latin')
    importer.import_corpus('latin_models_cltk')
    latin_tagger = pos.POSTag('latin')
    print(latin_tagger.tag_ngram_123_backoff(
        'Gallia est omnis divisa in partes tres'))
def test_import_lat_text_lat_lib(self):
    """Import the Latin Library text corpus and assert its README exists on disk."""
    importer = CorpusImporter('latin')
    importer.import_corpus('latin_text_latin_library')
    readme_path = os.path.expanduser(
        '~/cltk_data/latin/text/latin_text_latin_library/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def test_import_punjabi_punjabi_text_gurban(self):
    """Check ``punjabi_text_gurban`` is listed for Punjabi, import it, and verify its README.

    The listing is asserted before the import is attempted.
    """
    punjabi_importer = CorpusImporter('punjabi')
    is_listed = 'punjabi_text_gurban' in punjabi_importer.list_corpora
    self.assertTrue(is_listed)
    punjabi_importer.import_corpus('punjabi_text_gurban')
    readme_path = os.path.expanduser(
        '~/cltk_data/punjabi/text/punjabi_text_gurban/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def setUpClass(cls):
    """Import the Sanskrit and Greek CLTK models before the tests run.

    Raises:
        Exception: with the message ``'Failure to download test corpus'`` if
            any import step fails; the original error is chained as the cause.
    """
    try:
        corpus_importer = CorpusImporter('sanskrit')
        corpus_importer.import_corpus('sanskrit_models_cltk')
        corpus_importer = CorpusImporter('greek')
        corpus_importer.import_corpus('greek_models_cltk')
    except Exception as exc:
        # A bare ``except`` would also trap KeyboardInterrupt/SystemExit and
        # hide why the download failed; chain the cause instead.
        raise Exception('Failure to download test corpus') from exc
def setUp(self):
    """Make sure the French CLTK data is imported and its README is on disk."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/french/text/french_data_cltk/README.md')
    french_importer = CorpusImporter('french')
    french_importer.import_corpus('french_data_cltk')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_import_proper_names_latin(self):
    """Import the Latin proper names corpus and assert its README exists on disk."""
    importer = CorpusImporter('latin')
    importer.import_corpus('latin_proper_names_cltk')
    readme_path = os.path.expanduser(
        '~/cltk_data/latin/lexicon/latin_proper_names_cltk/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def test_import_la_treebank_pers(self):
    """Check that importing ``latin_treebank_perseus`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/latin/treebank/latin_treebank_perseus/README.md')
    latin_importer = CorpusImporter('latin')
    latin_importer.import_corpus('latin_treebank_perseus')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_git_import_tib_lexica_tdc(self):
    """Check that importing ``tibetan_lexica_tdc`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/tibetan/lexicon/tibetan_lexica_tdc/README.md')
    tibetan_importer = CorpusImporter('tibetan')
    tibetan_importer.import_corpus('tibetan_lexica_tdc')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_import_latin_models_cltk(self):
    """Check that importing ``latin_models_cltk`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/latin/model/latin_models_cltk/README.md')
    latin_importer = CorpusImporter('latin')
    latin_importer.import_corpus('latin_models_cltk')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_git_import_copt_script(self):
    """Check that importing ``coptic_text_scriptorium`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/coptic/text/coptic_text_scriptorium/README.md')
    coptic_importer = CorpusImporter('coptic')
    coptic_importer.import_corpus('coptic_text_scriptorium')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_git_import_tib_pos_tdc(self):
    """Check that importing ``tibetan_pos_tdc`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/tibetan/pos/tibetan_pos_tdc/README.md')
    tibetan_importer = CorpusImporter('tibetan')
    tibetan_importer.import_corpus('tibetan_pos_tdc')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_import_lat_pos_lemma_cltk(self):
    """Check that importing ``latin_pos_lemmata_cltk`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/latin/lemma/latin_pos_lemmata_cltk/README.md')
    latin_importer = CorpusImporter('latin')
    latin_importer.import_corpus('latin_pos_lemmata_cltk')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_import_lat_text_lat_lib(self):
    """Check that importing ``latin_text_latin_library`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/latin/text/latin_text_latin_library/README.md')
    latin_importer = CorpusImporter('latin')
    latin_importer.import_corpus('latin_text_latin_library')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_git_import_chinese_cbeta_txt(self):
    """Check that importing ``chinese_text_cbeta_txt`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/chinese/text/chinese_text_cbeta_txt/README.md')
    chinese_importer = CorpusImporter('chinese')
    chinese_importer.import_corpus('chinese_text_cbeta_txt')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_import_la_text_lac_curt(self):
    """Check that importing ``latin_text_lacus_curtius`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/latin/text/latin_text_lacus_curtius/README.md')
    latin_importer = CorpusImporter('latin')
    latin_importer.import_corpus('latin_text_lacus_curtius')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_import_proper_names_greek(self):
    """Check that importing ``greek_proper_names_cltk`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/greek/lexicon/greek_proper_names_cltk/README.md')
    greek_importer = CorpusImporter('greek')
    greek_importer.import_corpus('greek_proper_names_cltk')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_import_greek_text_perseus(self):
    """Check that importing ``greek_text_perseus`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/greek/text/greek_text_perseus/README.md')
    greek_importer = CorpusImporter('greek')
    greek_importer.import_corpus('greek_text_perseus')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def test_import_greek_software_tlgu(self):
    """Check that importing ``greek_software_tlgu`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/greek/software/greek_software_tlgu/README.md')
    greek_importer = CorpusImporter('greek')
    greek_importer.import_corpus('greek_software_tlgu')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def __init__(
        self,
        pathDF,
        language='english',
        dataType='pickle',
        dataIndex='multi',
        colname='text',
        maxValues=2500,
        pathMeta=False,
        pathType=False,
        showLogging=False,
        model_params=(4,5,300)
        ):
    """Set up the corpus, a Word2Vec model and language-specific NLP helpers.

    ``pathDF``, ``dataType``, ``dataIndex``, ``colname``, ``maxValues``,
    ``pathMeta`` and ``pathType`` are forwarded unchanged to the parent
    constructor.

    Args:
        language: ``'latin'`` or ``'greek'`` (CLTK tooling; the matching
            ``<language>_models_cltk`` corpus is imported) or ``'english'``
            or ``'german'`` (NLTK tooling; the ``stopwords`` data is
            downloaded).
        showLogging: when truthy, configure INFO-level logging.
        model_params: ``(workers, min_count, size)`` passed to
            ``gensim.models.Word2Vec``.

    Raises:
        ValueError: if ``language`` is none of the four supported values.
    """
    super(CorpusML, self).__init__(
        pathDF,
        dataType,
        dataIndex,
        colname,
        maxValues,
        pathMeta,
        pathType
    )

    if showLogging:
        logging.basicConfig(
            format='%(asctime)s : %(levelname)s : %(message)s',
            level=logging.INFO
        )

    self.model = gensim.models.Word2Vec(
        workers=model_params[0],
        min_count=model_params[1],
        size=model_params[2]
    )
    # self.model.random.seed(42)

    self.language = language

    if self.language in ('latin', 'greek'):
        from cltk.corpus.utils.importer import CorpusImporter
        corpus_importer = CorpusImporter(self.language)
        corpus_importer.import_corpus(
            '{0}_models_cltk'.format(self.language)
        )
        from cltk.stem.lemma import LemmaReplacer
        from cltk.tokenize.word import nltk_tokenize_words as tokenizer
        lemmatizer = LemmaReplacer(self.language)
        if self.language == 'latin':
            from cltk.stem.latin.j_v import JVReplacer
            from cltk.stop.latin.stops import STOPS_LIST as stopwords
            self.jvreplacer = JVReplacer()
        elif self.language == 'greek':
            from cltk.stop.greek.stops import STOPS_LIST as stopwords
    # The original test ``== 'english' or 'german'`` was always truthy, so
    # unsupported languages fell into this branch and the ValueError below
    # could never be raised.
    elif self.language in ('english', 'german'):
        import nltk
        nltk.download('stopwords')
        from nltk.stem import WordNetLemmatizer
        from nltk.tokenize import word_tokenize as tokenizer
        from nltk.corpus import stopwords
        stopwords = stopwords.words(self.language)
        lemmatizer = WordNetLemmatizer()
    else:
        raise ValueError(
            'Could not find lemmatizer, tokenizer, '
            'and stopwords for chosen language.')

    self.lemmatizer = lemmatizer
    self.tokenizer = tokenizer
    self.stopwords = stopwords
def test_import_proper_names_latin(self):
    """Check that importing ``latin_proper_names_cltk`` leaves a README in ``~/cltk_data``."""
    expected_readme = os.path.expanduser(
        '~/cltk_data/latin/lexicon/latin_proper_names_cltk/README.md')
    latin_importer = CorpusImporter('latin')
    latin_importer.import_corpus('latin_proper_names_cltk')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def retrieveLatinModels(self):
    """Import the Latin resources required for the cltk processing.

    Uses ``CorpusImporter('latin')`` to import ``latin_text_latin_library``
    and then ``latin_models_cltk``.
    """
    latin_importer = CorpusImporter('latin')
    for corpus_name in ('latin_text_latin_library', 'latin_models_cltk'):
        latin_importer.import_corpus(corpus_name)
def test_import_greek_models_cltk(self):
    """Import the CLTK Greek models again and assert their README exists on disk.

    Per the original note, the import already ran in ``setUp()``, so this
    is meant to exercise a pull rather than a clone.
    """
    expected_readme = os.path.expanduser(
        '~/cltk_data/greek/model/greek_models_cltk/README.md')
    greek_importer = CorpusImporter('greek')
    greek_importer.import_corpus('greek_models_cltk')
    readme_found = os.path.isfile(expected_readme)
    self.assertTrue(readme_found)
def setUp(self):
    """Ensure the Perseus Latin texts are on disk, then create the API test client.

    The corpus is imported only when its README is missing. The client from
    ``api_json.app.test_client()`` is stored as ``self.app``.
    """
    readme_path = os.path.expanduser(
        '~/cltk_data/latin/text/latin_text_perseus/README.md')
    if not os.path.isfile(readme_path):
        importer = CorpusImporter('latin')
        importer.import_corpus('latin_text_perseus')
    self.assertTrue(os.path.isfile(readme_path))
    self.app = api_json.app.test_client()
def main():
    """Print the Latin corpus list, import the Latin models, and print a lemmatized sample.

    The sample sentence is lower-cased before it is handed to
    ``LemmaReplacer('latin').lemmatize``.
    """
    latin_importer = CorpusImporter('latin')
    print(latin_importer.list_corpora)
    latin_importer.import_corpus('latin_models_cltk')

    sample = 'Aeneadum genetrix, hominum divomque voluptas, alma Venus, caeli subter labentia signa quae mare navigerum, quae terras frugiferentis concelebras, per te quoniam genus omne animantum concipitur visitque exortum lumina solis.'
    lowered = sample.lower()
    latin_lemmatizer = LemmaReplacer('latin')
    print(latin_lemmatizer.lemmatize(lowered))
def setUp(self):
    """Import the Sanskrit models and assert their README exists on disk.

    Per the original note, some CSV files in these models are necessary for
    the Indian language tokenizers, so they are imported first.
    """
    importer = CorpusImporter('sanskrit')
    importer.import_corpus('sanskrit_models_cltk')
    readme_path = os.path.expanduser(
        '~/cltk_data/sanskrit/model/sanskrit_models_cltk/README.md')
    self.assertTrue(os.path.isfile(readme_path))
def _check_latest_data(lang):
    """Make sure the NER data file for ``lang`` is on disk, importing it if not.

    The expected path comes from ``NER_DICT[lang]``; when no file is there,
    the ``<lang>_models_cltk`` corpus is imported.

    Raises:
        AssertionError: if ``lang`` is not a key of ``NER_DICT``.
    """
    # Raise explicitly instead of using ``assert`` so the check still runs
    # under ``python -O``. AssertionError is kept so callers see the same
    # exception type as before.
    if lang not in NER_DICT:
        raise AssertionError(
            'Invalid language. Choose from: {}'.format(', '.join(NER_DICT)))

    ner_file_path = os.path.expanduser(NER_DICT[lang])
    if not os.path.isfile(ner_file_path):
        corpus_importer = CorpusImporter(lang)
        corpus_importer.import_corpus('{}_models_cltk'.format(lang))
def _check_import_source():
    """Check if tlgu imported, if not import it.

    ``tlgu.h`` is looked for under
    ``~/cltk_data/greek/software/greek_software_tlgu``. When it is missing,
    the ``greek_software_tlgu`` corpus is imported; any failure is logged and
    re-raised unchanged.
    """
    path_rel = '~/cltk_data/greek/software/greek_software_tlgu/tlgu.h'
    path = os.path.expanduser(path_rel)
    if not os.path.isfile(path):
        try:
            corpus_importer = CorpusImporter('greek')
            # Corpus name aligned with the directory checked above; the
            # previous value 'tlgu' matched neither that path nor the name
            # used by the TLGU import tests.
            # NOTE(review): confirm against the corpus registry.
            corpus_importer.import_corpus('greek_software_tlgu')
        except Exception as exc:
            logger.error('Failed to import TLGU: %s', exc)
            raise
def setUpClass(cls):
    """Import the Latin Library corpus and create the corpus readers used by the tests.

    ``cls.reader`` has its ``_fileids`` restricted to ``['pervig.txt']``;
    ``cls.reader_2`` to ``cls.reader_4`` are separate reader instances.

    Raises:
        Exception: with the message ``'Failure to download test corpus'`` if
            the import fails; the original error is chained as the cause.
    """
    try:
        corpus_importer = CorpusImporter('latin')
        corpus_importer.import_corpus('latin_text_latin_library')
    except Exception as exc:
        # ``except Exception`` lets KeyboardInterrupt/SystemExit through, and
        # ``from exc`` keeps the underlying download error in the traceback.
        raise Exception('Failure to download test corpus') from exc

    cls.reader = get_corpus_reader(language='latin',
                                   corpus_name='latin_text_latin_library')
    cls.reader._fileids = ['pervig.txt']
    # Need additional instances because tests below change internals
    # TODO: Fix
    cls.reader_2 = get_corpus_reader(language='latin',
                                     corpus_name='latin_text_latin_library')
    cls.reader_3 = get_corpus_reader(language='latin',
                                     corpus_name='latin_text_latin_library')
    cls.reader_4 = get_corpus_reader(language='latin',
                                     corpus_name='latin_text_latin_library')