def onekgreek_tei_xml_to_text():
    """Convert the First 1k Years of Greek TEI XML files to plaintext.

    Globs ``grc/text/grc_text_first1kgreek/data/*/*/*.xml`` below the CLTK
    data directory, skips any path containing ``__cts__``, extracts the text
    of each document's ``<body>`` with BeautifulSoup (``lxml`` parser) and
    writes it to ``grc/text/grc_text_first1kgreek_plaintext/<name>.txt``.

    Raises:
        ImportError: if the ``bs4_installed`` flag is false.
        FileNotFoundError: if the glob matches no XML files.
    """
    if not bs4_installed:
        logger.error("Install `bs4` and `lxml` to parse these TEI files.")
        raise ImportError
    xml_dir = make_cltk_path("grc/text/grc_text_first1kgreek/data/*/*/*.xml")
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error(
            "1K Greek corpus not installed. Use ``FetchCorpus`` to get `First1KGreek`."
        )
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if "__cts__" not in path]

    # Output directory for the plaintext files.
    new_dir = make_cltk_path("grc/text/grc_text_first1kgreek_plaintext/")
    os.makedirs(new_dir, exist_ok=True)

    for xml_path in xml_paths:
        # ``str.rstrip(".xml")`` strips any trailing run of the characters
        # ".", "x", "m", "l" (not the suffix), which mangles names whose stem
        # ends in one of those characters; ``splitext`` removes exactly the
        # extension.
        stem, _ = os.path.splitext(os.path.basename(xml_path))
        new_plaintext_path = os.path.join(new_dir, stem + ".txt")
        # Explicit UTF-8 so Greek text does not depend on the locale's
        # default encoding.
        with open(xml_path, encoding="utf-8") as file_open:
            soup = BeautifulSoup(file_open, "lxml")
        text = soup.body.get_text()
        with open(new_plaintext_path, "w", encoding="utf-8") as file_open:
            file_open.write(text)
def divide_works(self, corpus):
    """Split the author files of a corpus into individual works.

    For ``"tlg"`` and ``"phi5"`` every ``.TXT`` file in the corpus' originals
    directory whose name starts with the corpus prefix (``TLG`` / ``LAT``) is
    passed to ``self.convert`` with ``divide_works=True``. A failure on one
    file is logged and the loop continues with the next one.

    TODO: Maybe incorporate this into ``convert_corpus()``
    TODO: Write test for this

    Raises:
        CLTKException: for ``"phi7"`` (not divisible) or any other value.
    """
    if corpus == "phi7":
        raise CLTKException(
            "``phi7`` cannot be divided into individual works.")
    if corpus != "tlg" and corpus != "phi5":
        raise CLTKException(
            f"Invalid corpus '{corpus}'. This should never happen.")

    if corpus == "phi5":
        source_dir = make_cltk_path("originals/phi5")
        output_dir = make_cltk_path("lat/text/phi5/individual_works")
        prefix = "LAT"
        is_latin = True  # forwarded as the optional TLGU ``lat`` argument
    else:
        source_dir = make_cltk_path("originals/tlg")
        output_dir = make_cltk_path("grc/text/tlg/individual_works")
        prefix = "TLG"
        is_latin = False

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(source_dir):
        if not (filename.endswith(".TXT") and filename.startswith(prefix)):
            continue
        source_path = os.path.join(source_dir, filename)
        output_path = os.path.join(output_dir, filename)
        try:
            self.convert(source_path,
                         output_path,
                         divide_works=True,
                         lat=is_latin)
            logger.info("Writing files at %s to %s.", source_path,
                        output_dir)
        except Exception as err:
            logger.error("Failed to convert files: %s.", err)
def assemble_phi5_works_filepaths():
    """Build the list of file paths for every work listed in the PHI5 works index.

    Each path has the form ``<author_code>.TXT-<work>.txt`` inside
    ``lat/text/phi5/individual_works/`` below the CLTK data directory.
    """
    works_root = make_cltk_path("lat/text/phi5/individual_works/")
    return [
        os.path.join(works_root, author_code + ".TXT" + "-" + work_id + ".txt")
        for author_code in PHI5_WORKS_INDEX
        for work_id in PHI5_WORKS_INDEX[author_code]["works"]
    ]
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext.

    Globs ``grc/text/grc_text_first1kgreek/data/*/*/*.xml`` below the CLTK
    data directory, skips any path containing ``__cts__``, and for each file
    exports every passage at the deepest citation level
    (``len(text.citation)``) as plaintext, excluding ``tei:note`` elements.
    The concatenated passages are written to
    ``grc/text/grc_text_first1kgreek_plaintext/<name>.txt``.

    Raises:
        FileNotFoundError: if the glob matches no XML files.
    """
    xml_dir = make_cltk_path("grc/text/grc_text_first1kgreek/data/*/*/*.xml")
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error(
            "1K Greek corpus not installed. Use ``FetchCorpus`` to get `First1KGreek`."
        )
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if "__cts__" not in path]

    # Output directory for the plaintext files.
    new_dir = make_cltk_path("grc/text/grc_text_first1kgreek_plaintext/")
    os.makedirs(new_dir, exist_ok=True)

    for xml_path in xml_paths:
        # ``str.rstrip(".xml")`` strips any trailing run of the characters
        # ".", "x", "m", "l" (not the suffix), which mangles names whose stem
        # ends in one of those characters; ``splitext`` removes exactly the
        # extension.
        stem, _ = os.path.splitext(os.path.basename(xml_path))
        new_plaintext_path = os.path.join(new_dir, stem + ".txt")
        # Explicit UTF-8 so Greek text does not depend on the locale's
        # default encoding.
        with open(xml_path, encoding="utf-8") as file_open:
            text = CapitainsCtsText(resource=file_open)
            # Collect the passages and join once instead of repeated ``+=``.
            passages = [
                text.getTextualNode(subreference=ref, simple=True).export(
                    Mimetypes.PLAINTEXT, exclude=["tei:note"]
                )
                for ref in text.getReffs(level=len(text.citation))
            ]
        with open(new_plaintext_path, "w", encoding="utf-8") as file_open:
            file_open.write("".join(passages))
def _check_and_download_tlgu_source(self):
    """Make sure the TLGU source is present, fetching it when it is not.

    The presence of ``grc/software/grc_software_tlgu/tlgu.h`` below the CLTK
    data directory is used as the marker. When it is missing, the user is
    asked for confirmation in interactive mode; in non-interactive mode the
    download proceeds without asking.

    Raises:
        CLTKException: if the user declines the download.
    """
    header_path = make_cltk_path("grc/software/grc_software_tlgu/tlgu.h")
    if os.path.isfile(header_path):
        return

    print(
        "This part of the CLTK depends upon TLGU, software written by Dimitri Marinakis `<http://tlgu.carmen.gr/>`_."
    )
    repo_url = "https://github.com/cltk/grc_software_tlgu.git"
    target_dir = os.path.dirname(header_path)

    wants_download = True
    if self.interactive:
        wants_download = query_yes_no(
            question=f"Do you want to download TLGU from '{repo_url}' to '{target_dir}'?"
        )
    if not wants_download:
        raise CLTKException("TLGU software required for this class to work.")

    importer = FetchCorpus(language="grc")
    importer.import_corpus(corpus_name="grc_software_tlgu")
def __init__(self, interactive: bool = True):
    """Load the Lewis lexicon entries, downloading the lexicon if it is missing.

    Args:
        interactive: when true, the user is asked before downloading; when
            false, a missing lexicon is downloaded without asking.

    Raises:
        CLTKException: if the lexicon file is missing and the user declines
            the download.
    """
    self.interactive = interactive
    self.lewis_yaml_fp = make_cltk_path(
        "lat", "lexicon", "cltk_lat_lewis_elementary_lexicon", "lewis.yaml")
    try:
        self.entries = self._load_entries()
    except FileNotFoundError:
        wants_download = True
        if self.interactive:
            print(
                "This part of the CLTK depends upon Lewis's *An Elementary Latin Dictionary* (1890)."
            )
            wants_download = query_yes_no(
                question="Do you want to download this?")
        if not wants_download:
            raise CLTKException(
                f"File '{self.lewis_yaml_fp}' is not found. It is required for this class."
            )
        importer = FetchCorpus(language="lat")
        importer.import_corpus(
            corpus_name="cltk_lat_lewis_elementary_lexicon")
        # Second attempt now that the lexicon has been fetched.
        self.entries = self._load_entries()
def __init__(self, interactive: bool = True):
    """Load the Zoëga dictionary entries, downloading the dictionary if it is missing.

    Args:
        interactive: when true, the user is asked before downloading; when
            false, a missing dictionary is downloaded without asking.

    Raises:
        CLTKException: if the dictionary file is missing and the user
            declines the download.
    """
    self.interactive = interactive
    self.zoega_yaml_fp = make_cltk_path("non", "dictionary",
                                        "cltk_non_zoega_dictionary",
                                        "dictionary.yaml")
    try:
        self.entries = self._load_entries()
    except FileNotFoundError:
        wants_download = True
        if self.interactive:
            print(
                "This part of the CLTK depends upon Zoëga's *A Concise Old Norse Dictionary* (1890)."
            )
            wants_download = query_yes_no(
                question="Do you want to download this?")
        if not wants_download:
            raise CLTKException(
                f"File '{self.zoega_yaml_fp}' is not found. It is required for this class."
            )
        importer = FetchCorpus(language="non")
        importer.import_corpus(corpus_name="cltk_non_zoega_dictionary")
        # Second attempt now that the dictionary has been fetched.
        self.entries = self._load_entries()
def assemble_tlg_author_filepaths():
    """Build one ``<key>.TXT`` path per TLG index entry.

    The paths point into ``grc/text/tlg/plaintext/`` below the CLTK data
    directory and follow the iteration order of ``TLG_INDEX``.
    """
    author_dir = make_cltk_path("grc/text/tlg/plaintext/")
    author_paths = []
    for author_key in TLG_INDEX:
        author_paths.append(os.path.join(author_dir, author_key + ".TXT"))
    return author_paths
def assemble_phi5_author_filepaths():
    """Build one ``<key>.TXT`` path per PHI5 index entry.

    The paths point into ``lat/text/phi5/plaintext/`` below the CLTK data
    directory and follow the iteration order of ``PHI5_INDEX``.
    """
    author_dir = make_cltk_path("lat/text/phi5/plaintext/")
    author_paths = []
    for author_key in PHI5_INDEX:
        author_paths.append(os.path.join(author_dir, author_key + ".TXT"))
    return author_paths
def test_path(self):
    """Test make_cltk_path() with path components as arguments."""
    actual = make_cltk_path('greek', 'perseus_corpus')
    expected = os.path.expanduser(
        os.path.join('~', 'cltk_data', 'greek', 'perseus_corpus'))
    self.assertEqual(actual, expected)
def test_empty_path(self):
    """Test make_cltk_path() without arguments."""
    actual = make_cltk_path()
    expected = os.path.expanduser(os.path.join('~', 'cltk_data'))
    self.assertEqual(actual, expected)
def test_path(self):
    """Test make_cltk_path() with path components as arguments."""
    components = ('greek', 'perseus_corpus')
    joined = make_cltk_path('greek', 'perseus_corpus')
    home_relative = os.path.join('~', 'cltk_data', *components)
    self.assertEqual(joined, os.path.expanduser(home_relative))
def test_empty_path(self):
    """Test that make_cltk_path() without arguments equals get_cltk_data_dir()."""
    bare_path = make_cltk_path()
    data_dir = get_cltk_data_dir()
    self.assertEqual(bare_path, data_dir)
def _check_install(self):
    """Check if tlgu installed, if not install it.

    Looks for ``tlgu`` with ``which``. When it is missing, verifies that
    ``gcc`` is available, then runs ``make install`` inside the TLGU source
    directory; if that exits non-zero, retries with ``sudo make install``.
    In interactive mode the user is asked for confirmation before each
    attempt.

    Returns:
        ``True`` when the plain ``make install`` succeeds; ``None`` when
        ``tlgu`` was already found or when the ``sudo`` retry succeeds.

    Raises:
        CLTKException: if ``gcc`` cannot be found, the user declines an
            installation prompt, running the install command raises a
            subprocess error, or the ``sudo`` retry exits non-zero.
    """
    try:
        subprocess.check_output(["which", "tlgu"])
    except subprocess.SubprocessError as sub_err:
        print("TLGU not installed.")
        logger.info("TLGU not installed: %s", sub_err)
        logger.info("Installing TLGU.")
    else:
        # ``tlgu`` is already on the PATH; nothing to do.
        return None

    # ``check_output`` raises ``CalledProcessError`` when ``which`` exits
    # non-zero, so a missing compiler must be detected via the exception and
    # not via an empty return value.
    try:
        gcc_location = subprocess.check_output(["which", "gcc"])
    except subprocess.SubprocessError:
        gcc_location = b""
    if not gcc_location:
        msg = "GCC seems not to be installed."
        logger.error(msg)
        raise CLTKException(msg)

    tlgu_path = make_cltk_path("grc/software/grc_software_tlgu")
    if self.interactive:
        if not query_yes_no(question="Do you want to install TLGU?"):
            raise CLTKException(
                "TLGU installation required for this class to work."
            )
    else:
        print("Non-interactive installation. Continuing ...")

    # First attempt: install without elevated privileges.
    command = "cd {0} && make install".format(tlgu_path)
    print(f"Going to run command: ``{command}``")
    try:
        p_out = subprocess.call(command, shell=True)
    except subprocess.SubprocessError as sub_err:
        print(
            "Error executing installation. Going to check output of ``subprocess.call()`` ..."
        )
        raise CLTKException(sub_err) from sub_err
    if p_out == 0:
        msg = "TLGU installed."
        print(msg)
        logger.info(msg)
        return True

    msg = "TLGU install without sudo failed. Going to try again with sudo (usually required for Linux) ..."
    print(msg)
    logger.error(msg)

    # Second attempt: same command with ``sudo``.
    command = "cd {0} && sudo make install".format(tlgu_path)
    if self.interactive:
        if not query_yes_no(question="Do you want to install TLGU? with sudo?"):
            raise CLTKException(
                "TLGU installation required for this class to work."
            )
    else:
        print("Going to run command:", command)
    p_out = subprocess.call(command, shell=True)
    if p_out != 0:
        msg = "TLGU install with sudo failed."
        print(msg)
        logger.error(msg)
        raise CLTKException(
            "TLGU installation required for this class to work.")
    msg = "TLGU installed."
    print(msg)
    logger.info(msg)
def test_tlgu_init(self):
    """Test that constructing TLGU non-interactively leaves the TLGU README on disk."""
    TLGU(interactive=False)
    # NOTE(review): other code in this project uses
    # ``grc/software/grc_software_tlgu`` -- confirm this ``greek/...`` path
    # still matches where the TLGU source is downloaded.
    readme_path = make_cltk_path(
        "greek/software/greek_software_tlgu/README.md")
    readme_exists = os.path.isfile(readme_path)
    self.assertTrue(readme_exists)
def convert_corpus(self, corpus, markup=None, lat=None):  # pylint: disable=W0613
    """Look for imported TLG or PHI files and convert them all to
    ``~/cltk_data/grc/text/tlg/<plaintext>``.

    Every file in ``originals/<corpus>`` whose name ends in ``TXT`` is passed
    to ``self.convert``. Output goes to ``<lang>/text/<corpus>/<subdir>``
    below the CLTK data directory, where ``<subdir>`` is ``plaintext`` when
    ``markup`` is ``None`` and ``str(markup)`` otherwise. A failure on one
    file is logged and the loop continues with the next one.

    TODO: Add markup options to input.
    TODO: Add rm_newlines, divide_works, and extra_args

    Args:
        corpus: one of ``"tlg"``, ``"phi5"`` or ``"phi7"``.
        markup: only selects the name of the output subdirectory;
            ``self.convert`` is always called with ``markup=False``.
        lat: honoured only for ``"phi7"``, where ``True`` selects Latin
            output. ``"phi5"`` is always Latin and ``"tlg"`` always Greek.

    Raises:
        AssertionError: if ``corpus`` is not a supported name.
        Exception: whatever ``os.listdir`` raises when the originals
            directory cannot be read (logged, then re-raised).
    """
    # Explicit raise instead of ``assert`` so validation survives ``python -O``
    # while keeping the exception type callers already see.
    if corpus not in ("tlg", "phi5", "phi7"):
        raise AssertionError("Corpus must be 'tlg', 'phi5', or 'phi7'")

    orig_path = os.path.join(make_cltk_path("originals"), corpus)

    if corpus == "phi5":
        language_dir = "lat"
        lat = True
    elif corpus == "phi7" and lat is True:
        # The original condition ``"phi7" and lat is True`` never compared
        # against ``corpus``, so ``tlg`` with ``lat=True`` was also routed to
        # the Latin directory.
        language_dir = "lat"
    else:
        language_dir = "grc"
        lat = None
    target_path = os.path.join(make_cltk_path(), language_dir, "text", corpus)

    try:
        corpus_files = os.listdir(orig_path)
    except Exception as exception:
        logger.error("Failed to find TLG files: %s", exception)
        raise

    # make a list of files to be converted
    txts = [x for x in corpus_files if x.endswith("TXT")]

    # The output directory does not depend on the individual file, so it is
    # computed once; it is only created when there is something to convert.
    subdir = "plaintext" if markup is None else str(markup)
    target_txt_dir = os.path.join(target_path, subdir)
    if txts:
        os.makedirs(target_txt_dir, exist_ok=True)

    # loop through list and convert one at a time
    for txt in txts:
        orig_txt_path = os.path.join(orig_path, txt)
        target_txt_path = os.path.join(target_txt_dir, txt)
        try:
            self.convert(
                orig_txt_path,
                target_txt_path,
                markup=False,
                rm_newlines=False,
                divide_works=False,
                lat=lat,
                extra_args=None,
            )
        except Exception as exception:
            logger.error(
                "Failed to convert file '%s' to '%s': %s",
                orig_txt_path,
                target_txt_path,
                exception,
            )