def _get_user_defined_corpora(self):
    """Check ``CLTK_DATA_DIR + '/distributed_corpora.yaml'`` for any custom,
    distributed corpora that the user wants to load locally.

    :return: List of dicts (one per corpus whose ``language`` matches
        ``self.language``), each with keys ``origin``, ``type``, ``name``
        and ``user_defined``. Empty list when the YAML file is missing,
        unparsable, or empty.
    """
    if self.testing:
        distributed_corpora_fp = os.path.normpath(
            CLTK_DATA_DIR + "/test_distributed_corpora.yaml"
        )
    else:
        distributed_corpora_fp = os.path.normpath(
            CLTK_DATA_DIR + "/distributed_corpora.yaml"
        )
    try:
        with open(distributed_corpora_fp) as file_open:
            corpora_dict = yaml.safe_load(file_open)
    except FileNotFoundError:
        logger.info("``~/cltk_data/distributed_corpora.yaml`` file not found.")
        return []
    except yaml.parser.ParserError as parse_err:
        logger.debug("Yaml parsing error: %s" % parse_err)
        return []
    # ``yaml.safe_load`` returns None for an empty document; without this
    # guard the loop below would raise TypeError on an empty YAML file.
    if not corpora_dict:
        return []
    user_defined_corpora = []
    for corpus_name, about in corpora_dict.items():
        if about["language"].lower() == self.language:
            user_defined_corpus = dict()
            user_defined_corpus["origin"] = about["origin"]
            user_defined_corpus["type"] = about["type"]
            user_defined_corpus["name"] = corpus_name
            user_defined_corpus["user_defined"] = True
            user_defined_corpora.append(user_defined_corpus)
    return user_defined_corpora
def _retrieve_morpheus_entry(self, word):
    """Return Morpheus entry for word.

    Entry format: a list of tuples. NOTE(review): the historical docstring
    described the tuple as ``(head word, tag, macronized form)``, but
    ``_macronize_word`` compares ``entry[0]`` against the POS tag and reads
    ``entry[2]`` as the macronized form — confirm against the data source.

    :param word: unmacronized, lowercased word
    :ptype word: string
    :return: Morpheus entry in tuples, or ``None`` when no entry exists
    :rtype : list
    """
    entry = self.macron_data.get(word)
    if not entry:
        # Covers both a missing key and an empty entry list. Previously an
        # empty list was logged as "not found" yet still returned, which
        # let callers indexing ``entries[0]`` raise IndexError.
        logger.info("No Morpheus entry found for {}.".format(word))
        return None
    return entry
def _git_user_defined_corpus(self, corpus_name, corpus_type, uri: str, branch="master"):
    """Clone or update a git repo defined by user.

    :param corpus_name: Display name of the corpus (used for logging only).
    :param corpus_type: Corpus category (e.g. 'text'); becomes a directory level.
    :param uri: Git clone URI; the repo directory name is derived from it.
    :param branch: Git branch to clone.

    TODO: This code is very redundant with what's in import_corpus(),
    could be refactored.
    """
    type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
    type_dir = os.path.expanduser(type_dir_rel)
    repo_name = uri.split("/")[-1]  # eg, 'latin_corpus_newton_example.git'
    # Strip a trailing '.git' *suffix*. The previous ``rstrip(".git")`` was
    # a bug: rstrip removes any of the characters {'.','g','i','t'} from the
    # end, so e.g. 'tig.git' would have become 't'.
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]
    target_dir = os.path.join(type_dir, repo_name)
    target_file = os.path.join(type_dir, repo_name, "README.md")
    # check if corpus already present; if not, clone
    if not os.path.isfile(target_file):
        if not os.path.isdir(type_dir):
            os.makedirs(type_dir)
        try:
            msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
            logger.info(msg)
            Repo.clone_from(
                uri, target_dir, branch=branch, depth=1, progress=ProgressPrinter()
            )
        except CorpusImportError as corpus_imp_err:
            msg = "Git clone of '{}' failed: '{}'".format(uri, corpus_imp_err)
            logger.error(msg)
    # if corpus is present, pull latest
    else:
        try:
            repo = Repo(target_dir)
            assert not repo.bare  # or: assert repo.exists()
            git_origin = repo.remotes.origin
            msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
            logger.info(msg)
            git_origin.pull()
        except CorpusImportError as corpus_imp_err:
            msg = "Git pull of '{}' failed: '{}'".format(uri, corpus_imp_err)
            logger.error(msg)
def divide_works(self, corpus):
    """Use the work-breaking option.

    TODO: Maybe incorporate this into ``convert_corpus()``
    TODO: Write test for this
    """
    # ``phi7`` has no per-work division.
    if corpus == "phi7":
        raise CLTKException("``phi7`` cannot be divided into individual works.")
    # (originals subdir, individual-works subdir, filename prefix,
    #  value of the optional TLGU ``lat`` argument to convert())
    corpus_settings = {
        "tlg": ("originals/tlg", "grc/text/tlg/individual_works", "TLG", False),
        "phi5": ("originals/phi5", "lat/text/phi5/individual_works", "LAT", True),
    }
    if corpus not in corpus_settings:
        raise CLTKException(f"Invalid corpus '{corpus}'. This should never happen.")
    orig_rel, works_rel, file_prefix, lat = corpus_settings[corpus]
    orig_dir = make_cltk_path(orig_rel)
    works_dir = make_cltk_path(works_rel)
    if not os.path.exists(works_dir):
        os.makedirs(works_dir)
    for file_name in os.listdir(orig_dir):
        # Only the corpus's own uppercase .TXT originals are converted.
        if not (file_name.startswith(file_prefix) and file_name.endswith(".TXT")):
            continue
        orig_file_path = os.path.join(orig_dir, file_name)
        new_file_path = os.path.join(works_dir, file_name)
        try:
            self.convert(orig_file_path, new_file_path, divide_works=True, lat=lat)
            logger.info("Writing files at %s to %s.", orig_file_path, works_dir)
        except Exception as err:
            logger.error("Failed to convert files: %s.", err)
def _copy_dir_recursive(src_rel, dst_rel):
    """Recursively copy the contents of one directory to another.

    The ``dst_rel`` directory must not already exist. If ``src_rel`` turns
    out to be a single file rather than a directory, it is copied on its own.
    Source: http://stackoverflow.com/a/1994840
    TODO: Move this to file_operations.py module.

    :type src_rel: str
    :param src_rel: Directory to be copied.
    :type dst_rel: str
    :param dst_rel: Directory to be created with contents of ``src_rel``.
    """
    src, dst = (os.path.expanduser(path) for path in (src_rel, dst_rel))
    try:
        shutil.copytree(src, dst)
    except OSError as exc:
        # copytree() fails with ENOTDIR when ``src`` is a plain file;
        # anything else is a genuine error and is re-raised.
        if exc.errno != errno.ENOTDIR:
            raise
        shutil.copy(src, dst)
    logger.info("Files copied from %s to %s", src, dst)
def _long_by_position(self, syllable: str, sentence: List[str]) -> bool:
    """Check if syllable is long by position. Returns ``True`` if syllable
    is long by position. Long by position includes contexts when:

    1. Next syllable begins with two consonants, unless those consonants
       are a stop + liquid combination
    2. Next syllable begins with a double consonant
    3. Syllable ends with a consonant and the next syllable begins with
       a consonant

    Args:
        syllable: Current syllable
        sentence: Sentence in which syllable appears

    Returns:
        Whether or not a syllable is long by position

    >>> from cltk.prosody.grc import Scansion
    >>> syllables_sentence = ["μεν", "και", "α", "πει", "ρος"]
    >>> [Scansion()._long_by_position(syllable=syllable, sentence=syllables_sentence) for syllable in syllables_sentence]
    [True, False, False, False, False]
    """
    try:
        # NOTE(review): ``list.index`` finds the *first* occurrence, so a
        # syllable repeated within the sentence may be paired with the
        # wrong successor; passing an explicit position would be safer.
        next_syll = sentence[sentence.index(syllable) + 1]
        # Case 1: next syllable opens with two consonants. The "unless
        # stop + liquid" exception is ``not (stop and liquid)``; the
        # previous form ``(not stop and not liquid)`` was a De Morgan
        # error that also excluded stop+non-liquid and non-stop+liquid
        # onsets, contradicting the documented rule.
        if (
            next_syll[0] in self.sing_cons and next_syll[1] in self.sing_cons
        ) and not (next_syll[0] in self.stops and next_syll[1] in self.liquids):
            return True
        # Case 2: vowel followed by a double consonant.
        if syllable[-1] in self.vowels and next_syll[0] in self.doub_cons:
            return True
        # Case 3: closed syllable followed by a consonant-initial syllable.
        if syllable[-1] in self.sing_cons and (next_syll[0] in self.sing_cons):
            return True
    except IndexError:
        # Last syllable of the sentence has no successor.
        logger.info(
            "IndexError while checking if syllable '%s' is long. Continuing.",
            syllable,
        )
    return False
def _macronize_word(self, word: Tuple[str, str]) -> Tuple[str, str, str]:
    """Return macronized word.

    :param word: (word, tag)
    :return: (word, tag, macronized_form); the tag is lowercased except
        when the tagger produced no tag at all (``None`` is passed through).
    """
    head_word = word[0]
    tag = word[1]
    if tag is None:
        logger.info("Tagger {} could not tag {}.".format(self.tagger, head_word))
        return head_word, tag, head_word
    if tag == "U--------":
        # "Unknown" tag: nothing to look up.
        return head_word, tag.lower(), head_word
    entries = self._retrieve_morpheus_entry(head_word)
    # Treat an empty entry list the same as a missing entry. Previously an
    # empty list fell through to ``entries[0][2]`` below and raised
    # IndexError.
    if not entries:
        return head_word, tag.lower(), head_word
    matched_entry = [entry for entry in entries if entry[0] == tag.lower()]
    if len(matched_entry) == 0:
        logger.info("No matching Morpheus entry found for {}.".format(head_word))
        return head_word, tag.lower(), entries[0][2]
    elif len(matched_entry) == 1:
        return head_word, tag.lower(), matched_entry[0][2].lower()
    else:
        logger.info("Multiple matching entries found for {}.".format(head_word))
        # NOTE(review): the *second* match (index 1) is returned, mirroring
        # the original behavior; confirm this preference is intentional.
        return head_word, tag.lower(), matched_entry[1][2].lower()
def import_corpus(self, corpus_name: str, local_path: str = None, branch: str = "master"):
    """Download a remote or load local corpus into dir ``~/cltk_data``.

    TODO: maybe add ``from git import RemoteProgress``

    :param corpus_name: The name of an available corpus.
    :param local_path: A filepath, required when importing local corpora.
    :param branch: What Git branch to clone.
    :raises CorpusImportError: When the corpus name is unknown or ambiguous,
        or a local import is requested without a usable ``local_path``.
    """
    matching_corpus_list = [
        _dict for _dict in self.all_corpora_for_lang if _dict["name"] == corpus_name
    ]
    if not matching_corpus_list:
        raise CorpusImportError(
            f"No corpus ``{corpus_name}`` for language ``{self.language}``."
        )
    if len(matching_corpus_list) > 1:
        raise CorpusImportError(
            f"Found more than one corpus with the name ``{corpus_name}``."
        )
    matching_corpus = matching_corpus_list[0]
    if matching_corpus.get("user_defined"):
        # eg: {'origin': 'https://github.com/kylepjohnson/latin_corpus_newton_example.git',
        #      'type': 'text', 'name': 'example_distributed_latin_corpus',
        #      'user_defined': True}
        self._git_user_defined_corpus(
            matching_corpus["name"],
            matching_corpus["type"],
            matching_corpus["origin"],
        )
    elif matching_corpus.get("location") == "local":
        self._import_local_corpus(corpus_name, local_path)
    else:
        self._import_git_corpus(corpus_name, matching_corpus, branch)

def _import_local_corpus(self, corpus_name: str, local_path: str):
    """Copy a locally held corpus (phi5/phi7/tlg) into ``originals/``."""
    # eg: {'location': 'local', 'name': 'phi5', 'origin': None, 'type': 'text'}
    msg = "Importing from local path: '{}'".format(local_path)
    logger.info(msg)
    if corpus_name not in ["phi5", "phi7", "tlg"]:
        raise CorpusImportError(f"Unsupported local corpus ``{corpus_name}``.")
    if local_path is None:
        # Previously this crashed with AttributeError on ``None.endswith``.
        raise CorpusImportError(
            f"``local_path`` is required to import local corpus ``{corpus_name}``."
        )
    # normalize path for checking dir
    if local_path.endswith("/"):
        local_path = local_path[:-1]
    # check for right corpus dir
    expected_dir = {"phi5": "PHI5", "phi7": "PHI7", "tlg": "TLG_E"}[corpus_name]
    if os.path.split(local_path)[1] != expected_dir:
        logger.info("Directory must be named '{}'.".format(expected_dir))
    data_dir = os.path.expanduser(CLTK_DATA_DIR)
    originals_dir = os.path.join(data_dir, "originals")
    # check for `originals` dir; if not present mkdir
    if not os.path.isdir(originals_dir):
        os.makedirs(originals_dir)
        msg = "Wrote directory at '{}'.".format(originals_dir)
        logger.info(msg)
    tlg_originals_dir = os.path.join(data_dir, "originals", corpus_name)
    # check for `originals/<corpus_name>`; if pres, delete
    if os.path.isdir(tlg_originals_dir):
        shutil.rmtree(tlg_originals_dir)
        msg = "Removed directory at '{}'.".format(tlg_originals_dir)
        logger.info(msg)
    # copy_dir requires that target does not exist
    if not os.path.isdir(tlg_originals_dir):
        self._copy_dir_recursive(local_path, tlg_originals_dir)

def _import_git_corpus(self, corpus_name: str, matching_corpus: dict, branch: str):
    """Clone a remote git-hosted corpus, or pull if already present."""
    # eg: {'type': 'text', 'name': 'lat_text_perseus',
    #      'origin': 'https://github.com/cltk/lat_text_perseus.git'}
    if (
        not matching_corpus.get("type")
        or not matching_corpus.get("name")
        or not matching_corpus.get("origin")
    ):
        # Fixed: the original used ``and`` here, which only raised when
        # *all three* fields were missing; a record is malformed when
        # *any* field is missing.
        # NOTE(review): ``FetchCorpus`` is raised as in the original —
        # confirm it is actually an Exception subclass.
        raise FetchCorpus(f"Malformed record for ``{corpus_name}``.")
    git_uri = matching_corpus["origin"]
    type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, matching_corpus["type"])
    type_dir = os.path.expanduser(type_dir_rel)
    target_dir = os.path.join(type_dir, corpus_name)
    target_file = os.path.join(type_dir, corpus_name, "README.md")
    # check if corpus already present; if not, clone
    if not os.path.isfile(target_file):
        if not os.path.isdir(type_dir):
            os.makedirs(type_dir)
        try:
            msg = "Cloning '{}' from '{}'".format(corpus_name, git_uri)
            logger.info(msg)
            Repo.clone_from(
                git_uri,
                target_dir,
                branch=branch,
                depth=1,
                progress=ProgressPrinter(),
            )
        except CorpusImportError as corpus_imp_err:
            msg = "Git clone of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
            logger.error(msg)
    # if corpus is present, pull latest
    else:
        try:
            repo = Repo(target_dir)
            assert not repo.bare  # or: assert repo.exists()
            git_origin = repo.remotes.origin
            msg = "Pulling latest '{}' from '{}'.".format(corpus_name, git_uri)
            logger.info(msg)
            git_origin.pull()
        except CorpusImportError as corpus_imp_err:
            msg = "Git pull of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
            logger.error(msg)
def _make_syllables(self, sentences_words: str) -> List[List[List[str]]]:
    """First tokenize, then divide word tokens into a list of syllables.

    Note that a syllable in this instance is defined as a vocalic group
    (i.e., vowel or a diphthong). This means that all syllables which are
    not the last syllable in the word will end with a vowel or diphthong.

    Todo:
        * Determine whether a CLTK syllabifier could replace this.

    Args:
        sentences_words: Text string

    Returns:
        List of list of list of syllables

    >>> from cltk.prosody.grc import Scansion
    >>> text_string = "νέος μὲν καὶ ἄπειρος, δικῶν ἔγωγε ἔτι. μὲν καὶ ἄπειρος."
    >>> Scansion()._make_syllables(text_string)
    [[['νε', 'ος'], ['μεν'], ['και'], ['α', 'πει', 'ρος'], ['δι', 'κων'], ['ε', 'γω', 'γε'], ['ε', 'τι']], [['μεν'], ['και'], ['α', 'πει', 'ρος']]]
    """
    text = self._tokenize(sentences_words)
    all_syllables = list()
    for sentence in text:
        syll_per_sent = list()
        for word in sentence:
            syll_start = 0  # Begins syllable iterator
            syll_per_word = list()
            cur_letter_in = 0  # Begins general iterator
            # First pass: cut a syllable at each vowel or diphthong.
            while cur_letter_in < len(word):
                letter = word[cur_letter_in]
                if (cur_letter_in != len(word) - 1) and (
                    word[cur_letter_in] + word[cur_letter_in + 1]
                ) in self.diphthongs:
                    cur_letter_in += 1
                    # Syllable ends with a diphthong
                    syll_per_word.append(word[syll_start:cur_letter_in + 1])
                    syll_start = cur_letter_in + 1
                elif (letter in self.vowels) or (letter in self.long_vowels):
                    # Syllable ends with a vowel
                    syll_per_word.append(word[syll_start:cur_letter_in + 1])
                    syll_start = cur_letter_in + 1
                cur_letter_in += 1
            # Second pass: attach any trailing consonants to the final
            # syllable, walking backward from the end of the word.
            try:
                last_vowel = syll_per_word[-1][-1]  # Last vowel of a word
                # Modifies general iterator to accomodate consonants after
                # the last syllable in a word
                cur_letter_in = len(word) - 1
                # Contains all of the consonants after the last vowel in a word
                leftovers = ""
                while word[cur_letter_in] != last_vowel:
                    if word[cur_letter_in] != ".":
                        # Adds consonants to leftovers
                        leftovers = word[cur_letter_in] + leftovers
                    cur_letter_in -= 1
                # Adds leftovers to last syllable in a word
                syll_per_word[-1] += leftovers
                syll_per_sent.append(syll_per_word)
            except IndexError:
                # ``syll_per_word`` is empty when the word contains no
                # vowels at all; such a word is silently dropped from the
                # sentence (only logged).
                logger.info(
                    "IndexError while making syllables of '%s'. Continuing.", word
                )
        all_syllables.append(syll_per_sent)
    return all_syllables
def _check_install(self):
    """Check if tlgu installed, if not install it."""
    try:
        # ``which`` exits nonzero when tlgu is absent, making
        # check_output raise CalledProcessError (a SubprocessError).
        subprocess.check_output(["which", "tlgu"])
    except subprocess.SubprocessError as sub_err:
        print("TLGU not installed.")
        logger.info("TLGU not installed: %s", sub_err)
        logger.info("Installing TLGU.")
        # NOTE(review): if gcc is absent, ``check_output`` raises rather
        # than returning a falsy value, so this branch appears unreachable
        # in that case — confirm intended behavior.
        if not subprocess.check_output(["which", "gcc"]):
            logger.error("GCC seems not to be installed.")
        else:
            tlgu_path = make_cltk_path("grc/software/grc_software_tlgu")
            # Ask permission first when running interactively.
            if self.interactive:
                install_question = "Do you want to install TLGU?"
                do_install = query_yes_no(question=install_question)
                if not do_install:
                    raise CLTKException(
                        "TLGU installation required for this class to work."
                    )
            else:
                print("Non-interactive installation. Continuing ...")
            # First attempt: plain ``make install`` (no sudo).
            command = "cd {0} && make install".format(tlgu_path)
            print(f"Going to run command: ``{command}``")
            try:
                p_out = subprocess.call(command, shell=True)
            except subprocess.SubprocessError as sub_err:
                print(
                    "Error executing installation. Going to check output of ``subprocess.call()`` ..."
                )
                raise CLTKException(sub_err)
            if p_out == 0:
                msg = "TLGU installed."
                print(msg)
                logger.info(msg)
                return True
            else:
                # Second attempt: retry with sudo (typically needed on Linux).
                msg = "TLGU install without sudo failed. Going to try again with sudo (usually required for Linux) ..."
                print(msg)
                logger.error(msg)
                command = "cd {0} && sudo make install".format(tlgu_path)
                if self.interactive:
                    install_question = "Do you want to install TLGU? with sudo?"
                    do_install = query_yes_no(question=install_question)
                    if not do_install:
                        raise CLTKException(
                            "TLGU installation required for this class to work."
                        )
                    p_out = subprocess.call(command, shell=True)
                else:
                    print("Going to run command:", command)
                    p_out = subprocess.call(command, shell=True)
                if p_out == 0:
                    msg = "TLGU installed."
                    print(msg)
                    logger.info(msg)
                else:
                    msg = "TLGU install with sudo failed."
                    print(msg)
                    logger.error(msg)
                    raise CLTKException(
                        "TLGU installation required for this class to work."
                    )
def convert(
    input_path=None,
    output_path=None,
    markup=None,
    rm_newlines=False,
    divide_works=False,
    lat=False,
    extra_args=None,
):
    """Do conversion.

    :param input_path: TLG filepath to convert.
    :param output_path: filepath of new converted text.
    :param markup: Specificity of inline markup. Default None removes all
        numerical markup; 'full' gives most detailed, with reference numbers
        included before each text line.
    :param rm_newlines: No spaces; removes line ends and hyphens before an
        ID code; hyphens and spaces before page and column ends are retained.
    :param divide_works: Each work (book) is output as a separate file in
        the form output_file-xxx.txt; if an output file is not specified,
        this option has no effect.
    :param lat: Primarily Latin text (PHI). Some TLG texts, notably
        doccan1.txt and doccan2.txt are mostly roman texts lacking explicit
        language change codes. Setting this option will force a change to
        Latin text after each citation block is encountered.
    :param extra_args: Any other tlgu args to be passed, in list form and
        without dashes, e.g.: ['p', 'b', 'B'].
    :raises FileNotFoundError: When ``input_path`` does not exist.
    """
    # setup file paths
    input_path = os.path.expanduser(input_path)
    output_path = os.path.expanduser(output_path)
    # check input path exists; raise explicitly instead of ``assert`` so
    # the check is not stripped under ``python -O``
    if not os.path.isfile(input_path):
        raise FileNotFoundError("File {0} does not exist.".format(input_path))
    # setup tlgu flags
    tlgu_options = []
    if markup == "full":
        # most detailed inline markup (was a side-effect list comprehension)
        tlgu_options.extend(["v", "w", "x", "y", "z"])
    if rm_newlines:
        tlgu_options.append("N")
    if divide_works:
        tlgu_options.append("W")
    if lat:
        tlgu_options.append("r")
    # setup extra args
    if extra_args is None:
        extra_args = []
    else:
        try:
            extra_args = list(extra_args)
        except Exception as exc:
            logger.error("Argument 'extra_args' must be a list: %s.", exc)
            raise
    tlgu_options = tlgu_options + extra_args
    # assemble all tlgu flags (order is not significant to tlgu)
    tlgu_options = list(set(tlgu_options))
    if tlgu_options:
        tlgu_flags = "-" + " -".join(tlgu_options)
    else:
        tlgu_flags = ""
    # make tlgu call
    # NOTE(review): ``shell=True`` with interpolated paths breaks on paths
    # containing spaces or shell metacharacters; the list form of
    # ``subprocess.call`` would be safer — confirm before changing behavior.
    tlgu_call = "tlgu {0} {1} {2}".format(tlgu_flags, input_path, output_path)
    logger.info(tlgu_call)
    try:
        p_out = subprocess.call(tlgu_call, shell=True)
        if p_out == 1:
            logger.error("Failed to convert %s to %s.", input_path, output_path)
    except Exception as exc:
        logger.error("Failed to convert %s to %s: %s", input_path, output_path, exc)
        raise