def onekgreek_tei_xml_to_text_capitains():
    """Use the MyCapytain library to convert TEI to plaintext."""
    xml_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error('1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.')
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if '__cts__' not in path]
    # new dir
    new_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/')
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)
    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        # strip the '.xml' suffix; ``str.rstrip('.xml')`` strips *characters*,
        # not a suffix, and can eat trailing letters of the stem
        if xml_name.endswith('.xml'):
            xml_name = xml_name[:-len('.xml')]
        xml_name += '.txt'
        plain_text = ''
        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
            for ref in text.getReffs(level=len(text.citation)):
                psg = text.getTextualNode(subreference=ref, simple=True)
                text_line = psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note"])
                plain_text += text_line
        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, 'w') as file_open:
            file_open.write(plain_text)
def onekgreek_tei_xml_to_text():
    """Convert the TEI XML of the First 1k Years of Greek corpus to plaintext."""
    if not bs4_installed:
        logger.error('Install `bs4` and `lxml` to parse these TEI files.')
        raise ImportError
    xml_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error('1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.')
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if '__cts__' not in path]
    # new dir
    new_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/')
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)
    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        # strip the '.xml' suffix (not rstrip, which strips characters)
        if xml_name.endswith('.xml'):
            xml_name = xml_name[:-len('.xml')]
        xml_name += '.txt'
        with open(xml_path) as file_open:
            soup = BeautifulSoup(file_open, 'lxml')
        body = soup.body
        text = body.get_text()
        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, 'w') as file_open:
            file_open.write(text)
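# Usage sketch for the two converters above, assuming the First1KGreek corpus
# has already been fetched; the corpus name below is an assumption based on
# the error messages in these functions:
from cltk.corpus.utils.importer import CorpusImporter

corpus_importer = CorpusImporter('greek')
corpus_importer.import_corpus('greek_text_first1kgreek')
onekgreek_tei_xml_to_text()  # BeautifulSoup/lxml variant
onekgreek_tei_xml_to_text_capitains()  # citation-aware MyCapytain variant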
def find_alliteration(self):
    """
    Find alliterations in the complete verse.
    :return:
    """
    if len(self.phonological_features_text) == 0:
        logger.error("No phonological transcription found")
        raise ValueError
    else:
        first_sounds = []
        for i, line in enumerate(self.phonological_features_text):
            first_sounds.append([])
            for j, short_line in enumerate(line):
                first_sounds[i].append([])
                for viisuord in short_line:
                    first_sounds[i][j].append(viisuord[0])
        verse_alliterations = []
        n_alliterations_lines = []
        for i, first_sound_line in enumerate(first_sounds):
            if isinstance(self.long_lines[i][0], ShortLine) and isinstance(self.long_lines[i][1], ShortLine):
                self.long_lines[i][0].get_first_sounds()
                self.long_lines[i][1].get_first_sounds()
                alli, counter = self.long_lines[i][0].find_alliterations(self.long_lines[i][1])
                verse_alliterations.append(alli)
                n_alliterations_lines.append(counter)
            elif isinstance(self.long_lines[i][0], LongLine):
                self.long_lines[i][0].get_first_sounds()
                alli, counter = self.long_lines[i][0].find_alliterations()
                verse_alliterations.append(alli)
                n_alliterations_lines.append(counter)
        return verse_alliterations, n_alliterations_lines
def syllabify(self):
    """
    Syllables may play a role in verse classification.
    """
    if len(self.long_lines) == 0:
        logger.error("No text was imported")
        self.syllabified_text = []
    else:
        syllabifier = Syllabifier(language="old_norse", break_geminants=True)
        syllabified_text = []
        for i, line in enumerate(self.long_lines):
            syllabified_text.append([])
            for j, viisuordh in enumerate(line):
                syllabified_text[i].append([])
                words = []
                for word in tokenize_old_norse_words(viisuordh):
                    # punctuation is not necessary here
                    for punctuation_mark in ",.;!?-:":
                        word = word.replace(punctuation_mark, "")
                    if word != '':
                        words.append(syllabifier.syllabify(word.lower()))
                syllabified_text[i][j].append(words)
        self.syllabified_text = syllabified_text
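# The syllabifier call underneath, shown standalone; the import path is an
# assumption from the CLTK layout used elsewhere in this repo, and the exact
# output depends on the Old Norse sonority hierarchy shipped with it:
from cltk.phonology.syllabify import Syllabifier

syllabifier = Syllabifier(language="old_norse", break_geminants=True)
print(syllabifier.syllabify("gáttir"))  # expected along the lines of ['gát', 'tir']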
def divide_works(self, corpus):
    """Use the work-breaking option.
    TODO: Maybe incorporate this into ``convert_corpus()``
    TODO: Write test for this
    """
    # guard against unsupported corpora; otherwise the variables below
    # are never bound and the function fails with a NameError
    assert corpus in ('tlg', 'phi5'), "Corpus must be 'tlg' or 'phi5'"
    if corpus == 'tlg':
        orig_dir_rel = '~/cltk_data/originals/tlg'
        works_dir_rel = '~/cltk_data/greek/text/tlg/individual_works'
        file_prefix = 'TLG'
        latin = False
    elif corpus == 'phi5':
        orig_dir_rel = '~/cltk_data/originals/phi5'
        works_dir_rel = '~/cltk_data/latin/text/phi5/individual_works'
        file_prefix = 'LAT'
        latin = True  # this is for the optional TLGU argument to convert()
    orig_dir = os.path.expanduser(orig_dir_rel)
    works_dir = os.path.expanduser(works_dir_rel)
    if not os.path.exists(works_dir):
        os.makedirs(works_dir)
    files = os.listdir(orig_dir)
    texts = [x for x in files if x.endswith('.TXT') and x.startswith(file_prefix)]
    for file in texts:
        orig_file_path = os.path.join(orig_dir, file)
        new_file_path = os.path.join(works_dir, file)
        try:
            self.convert(orig_file_path, new_file_path, divide_works=True, latin=latin)
            logger.info('Writing files at %s to %s.', orig_file_path, works_dir)
        except Exception as err:
            logger.error('Failed to convert files: %s.', err)
def open_pickle(path: str):
    """Open a pickle and return loaded pickle object.
    :type path: str
    :param path: File path to pickle file to be opened.
    :rtype : object
    """
    try:
        with open(path, 'rb') as opened_pickle:
            try:
                return pickle.load(opened_pickle)
            except Exception as pickle_error:
                logger.error(pickle_error)
                raise
    except FileNotFoundError as fnf_error:
        logger.error(fnf_error)
        raise
    except IOError as io_err:
        logger.error(io_err)
        raise
    except EOFError as eof_error:
        logger.error(eof_error)
        raise
    except pickle.UnpicklingError as unp_error:
        logger.error(unp_error)
        raise
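# Round-trip sketch for open_pickle(); the dump step exists only for the demo:
import os
import pickle
import tempfile

pickle_path = os.path.join(tempfile.gettempdir(), 'example.pickle')
with open(pickle_path, 'wb') as file_open:
    pickle.dump({'amaverat': 'amo'}, file_open)
lemmata = open_pickle(pickle_path)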
def list_corpora(self):
    """Show corpora available for the CLTK to download."""
    try:
        corpora = LANGUAGE_CORPORA[self.language]
        corpus_names = [corpus['name'] for corpus in corpora]
        return corpus_names
    except (NameError, KeyError) as error:
        msg = 'Corpus not available for language "{}": {}'.format(self.language, error)
        logger.error(msg)
        raise CorpusImportError(msg)
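# Usage sketch; in the released CLTK class this may be wrapped as a property,
# in which case drop the parentheses:
corpus_importer = CorpusImporter('greek')
print(corpus_importer.list_corpora())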
def list_corpora(self):
    """Show corpora available for the CLTK to download."""
    try:
        corpora = LANGUAGE_CORPORA[self.language]
    except NameError as name_error:
        # pass the logging arguments separately; wrapping them in one tuple
        # leaves the second '%s' unfilled. Re-raise, since `corpora` is
        # unbound past this point.
        logger.error('Corpus not available for language %s: %s', self.language, name_error)
        raise
    corpus_list = []
    for corpus in corpora:
        corpus_list.append(corpus['name'])
    return corpus_list
def _check_import_source():
    """Check if tlgu imported, if not import it."""
    path_rel = '~/cltk_data/greek/software/greek_software_tlgu/tlgu.h'
    path = os.path.expanduser(path_rel)
    if not os.path.isfile(path):
        try:
            corpus_importer = CorpusImporter('greek')
            corpus_importer.import_corpus('tlgu')
        except Exception as exc:
            logger.error('Failed to import TLGU: %s', exc)
            raise
def _check_install(self):
    """Check if tlgu installed, if not install it."""
    try:
        subprocess.check_output(['which', 'tlgu'])
    except Exception as exc:
        logger.info('TLGU not installed: %s', exc)
        logger.info('Installing TLGU.')
        if not subprocess.check_output(['which', 'gcc']):
            logger.error('GCC seems not to be installed.')
        else:
            tlgu_path_rel = '~/cltk_data/greek/software/greek_software_tlgu'
            tlgu_path = os.path.expanduser(tlgu_path_rel)
            if not self.testing:
                print('Do you want to install TLGU? To continue, press Return. To exit, Control-C.')
                input()
            else:
                print('Automated or test build, skipping keyboard input confirmation for installation of TLGU.')
            try:
                p_out = subprocess.call('cd {0} && make install'.format(tlgu_path), shell=True)
            except Exception as exc:
                logger.error('TLGU install failed: %s', exc)
            else:
                if p_out == 0:
                    logger.info('TLGU installed.')
                else:
                    logger.error('TLGU install without sudo failed.')
                    # fall back to a sudo install when plain `make install`
                    # fails; Linux installs to '/usr/local/bin' need root
                    p_out = subprocess.call('cd {0} && sudo make install'.format(tlgu_path), shell=True)
                    if p_out == 0:
                        logger.info('TLGU installed.')
                    else:
                        logger.error('TLGU install with sudo failed.')
def _check_install():
    """Check if tlgu installed, if not install it."""
    try:
        subprocess.check_output(['which', 'tlgu'])
    except Exception as exc:
        logger.info('TLGU not installed: %s', exc)
        logger.info('Installing TLGU.')
        if not subprocess.check_output(['which', 'gcc']):
            logger.error('GCC seems not to be installed.')
        else:
            tlgu_path_rel = '~/cltk_data/greek/software/greek_software_tlgu'
            tlgu_path = os.path.expanduser(tlgu_path_rel)
            try:
                p_out = subprocess.call('cd {0} && make install'.format(tlgu_path), shell=True)
            except Exception as exc:
                logger.error('TLGU install failed: %s', exc)
            else:
                if p_out == 0:
                    logger.info('TLGU installed.')
                else:
                    logger.error('TLGU install without sudo failed.')
                    # fall back to a sudo install when plain `make install`
                    # fails; Linux installs to '/usr/local/bin' need root
                    p_out = subprocess.call('cd {0} && sudo make install'.format(tlgu_path), shell=True)
                    if p_out == 0:
                        logger.info('TLGU installed.')
                    else:
                        logger.error('TLGU install with sudo failed.')
def to_phonetics(self):
    """
    Transcribing words in verse helps find alliteration.
    """
    if len(self.long_lines) == 0:
        logger.error("No text was imported")
        # reset this method's own output on empty input
        self.transcribed_text = []
    else:
        transcriber = Transcriber(DIPHTHONGS_IPA, DIPHTHONGS_IPA_class, IPA_class, old_norse_rules)
        transcribed_text = []
        for i, line in enumerate(self.long_lines):
            transcribed_text.append([])
            for viisuordh in line:
                transcribed_text[i].append(transcriber.main(viisuordh))
        self.transcribed_text = transcribed_text
def convert_corpus(self, corpus, markup=None, break_lines=False,
                   divide_works=False, latin=None, extra_args=None):  # pylint: disable=W0613
    """Look for imported TLG or PHI files and convert them all to
    ``~/cltk_data/greek/text/tlg/<plaintext>``.
    TODO: Should this and/or convert() be static?
    TODO: Add markup options to input.
    TODO: Do something with break_lines, divide_works, and extra_args or rm them
    """
    orig_path_rel = '~/cltk_data/originals'
    orig_path = os.path.expanduser(orig_path_rel)
    target_path_rel = '~/cltk_data'
    target_path = os.path.expanduser(target_path_rel)
    assert corpus in ['tlg', 'phi5', 'phi7'], "Corpus must be 'tlg', 'phi5', or 'phi7'"
    orig_path = os.path.join(orig_path, corpus)
    if corpus in ['tlg', 'phi7']:
        # check the corpus itself; the bare string 'phi7' is always truthy
        if corpus == 'phi7' and latin is True:
            target_path = os.path.join(target_path, 'latin', 'text', corpus)
        else:
            latin = None
            target_path = os.path.join(target_path, 'greek', 'text', corpus)
    else:
        target_path = os.path.join(target_path, 'latin', 'text', corpus)
        latin = True
    try:
        corpus_files = os.listdir(orig_path)
    except Exception as exception:
        logger.error("Failed to find TLG files: %s", exception)
        raise
    # make a list of files to be converted
    txts = [x for x in corpus_files if x.endswith('TXT')]
    # loop through list and convert one at a time
    for txt in txts:
        orig_txt_path = os.path.join(orig_path, txt)
        if markup is None:
            target_txt_dir = os.path.join(target_path, 'plaintext')
        else:
            target_txt_dir = os.path.join(target_path, str(markup))
        if not os.path.isdir(target_txt_dir):
            os.makedirs(target_txt_dir)
        target_txt_path = os.path.join(target_txt_dir, txt)
        try:
            self.convert(orig_txt_path, target_txt_path, markup=None,
                         break_lines=False, divide_works=False, latin=latin,
                         extra_args=None)
        except Exception as exception:
            logger.error("Failed to convert file '%s' to '%s': %s",
                         orig_txt_path, target_txt_path, exception)
def write_concordance_from_string(self, text, name):
    """A reworking of write_concordance_from_file(). Refactor these."""
    list_of_lists = self._build_concordance(text)
    user_data_rel = '~/cltk_data/user_data'
    user_data = os.path.expanduser(user_data_rel)
    if not os.path.isdir(user_data):
        os.makedirs(user_data)
    file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
    concordance_output = ''
    for word_list in list_of_lists:
        for line in word_list:
            concordance_output += line + '\n'
    try:
        with open(file_path, 'w') as open_file:
            open_file.write(concordance_output)
        logger.info("Wrote concordance to '%s'.", file_path)
    except IOError as io_error:
        logger.error("Failed to write concordance to '%s': %s", file_path, io_error)
def ratio(string_a, string_b):
    """At the most basic level, return a Levenshtein distance ratio via
    fuzzywuzzy.
    :param string_a: str
    :param string_b: str
    :return: float
    """
    from cltk.utils.cltk_logger import logger
    try:
        from fuzzywuzzy import fuzz
    except ImportError as imp_err:  # pragma: no cover
        message = "'fuzzywuzzy' library required for this module: %s. Install with `pip install fuzzywuzzy python-Levenshtein`" % imp_err
        logger.error(message)
        print(message)
        raise ImportError(message)
    return fuzz.ratio(string_a, string_b) / 100
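# Usage sketch: fuzz.ratio() returns an int from 0 to 100, so the division
# rescales it to a 0.0-1.0 float:
print(ratio('dibat', 'dixit'))  # 0.6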
def syllabify(self, hierarchy):
    """
    Syllables may play a role in verse classification.
    """
    if len(self.long_lines) == 0:
        logger.error("No text was imported")
        self.syllabified_text = []
    else:
        syllabifier = Syllabifier(language="old_norse", break_geminants=True)
        syllabifier.set_hierarchy(hierarchy)
        syllabified_text = []
        for i, long_line in enumerate(self.long_lines):
            syllabified_text.append([])
            for short_line in long_line:
                assert isinstance(short_line, ShortLine) or isinstance(short_line, LongLine)
                short_line.syllabify(syllabifier)
                syllabified_text[i].append(short_line.syllabified)
        self.syllabified_text = syllabified_text
def _check_corpus_availability(self, corpus_name):
    """Check whether a corpus is available for import.
    :type corpus_name: str
    :param corpus_name: Name of available corpus.
    :rtype : str
    """
    try:
        corpora = LANGUAGE_CORPORA[self.language]
    except NameError as name_error:
        # pass the logging arguments separately, not as one tuple, and
        # re-raise: `corpora` is unbound past this point
        logger.error('Corpus not available for language %s: %s', self.language, name_error)
        raise
    corpus_properties = None
    for corpus in corpora:
        if corpus['name'] == corpus_name:
            corpus_properties = corpus
    if not corpus_properties:
        logger.info("Corpus '%s' not available for the '%s' language.", corpus_name, self.language)
    return corpus_properties
def divide_works(self, corpus):
    """Use the work-breaking option.
    TODO: Maybe incorporate this into ``convert_corpus()``
    TODO: Write test for this
    """
    # guard against unsupported corpora; otherwise the variables below
    # are never bound and the function fails with a NameError
    assert corpus in ('tlg', 'phi5'), "Corpus must be 'tlg' or 'phi5'"
    if corpus == 'tlg':
        orig_dir_rel = get_cltk_data_dir() + '/originals/tlg'
        works_dir_rel = get_cltk_data_dir() + '/greek/text/tlg/individual_works'
        file_prefix = 'TLG'
        latin = False
    elif corpus == 'phi5':
        orig_dir_rel = get_cltk_data_dir() + '/originals/phi5'
        works_dir_rel = get_cltk_data_dir() + '/latin/text/phi5/individual_works'
        file_prefix = 'LAT'
        latin = True  # this is for the optional TLGU argument to convert()
    orig_dir = os.path.expanduser(orig_dir_rel)
    works_dir = os.path.expanduser(works_dir_rel)
    if not os.path.exists(works_dir):
        os.makedirs(works_dir)
    files = os.listdir(orig_dir)
    texts = [x for x in files if x.endswith('.TXT') and x.startswith(file_prefix)]
    for file in texts:
        orig_file_path = os.path.join(orig_dir, file)
        new_file_path = os.path.join(works_dir, file)
        try:
            self.convert(orig_file_path, new_file_path, divide_works=True, latin=latin)
            logger.info('Writing files at %s to %s.', orig_file_path, works_dir)
        except Exception as err:
            logger.error('Failed to convert files: %s.', err)
def from_regular_expression(re_rule, estimated_sound, ipa_class):
    """
    :param re_rule: pattern (first argument of re.sub)
    :param estimated_sound: an IPA character (second argument of re.sub)
    :param ipa_class: dict whose keys are IPA characters and values are
        Vowel or Consonant instances
    :return: corresponding Rule instance
    """
    assert len(re_rule) > 0
    if re_rule[0] == "^":
        place = Rank.first
    elif re_rule[-1] == "$":
        place = Rank.last
    else:
        place = Rank.inner
    before_pattern = r"(?<=\(\?\<\=\[)\w*"
    core_pattern = r"(?<=\))\w(?=\(\?\=)|(?<=\^)\w(?=\(\?\=)|(?<=\))\w(?=\$)"
    after_pattern = r"(?<=\(\?\=\[)\w*"
    before_search = re.search(before_pattern, re_rule)
    core_search = re.search(core_pattern, re_rule)
    after_search = re.search(after_pattern, re_rule)
    if before_search is None:
        before = None
    else:
        before = [ipa_class[ipar].to_abstract() for ipar in before_search.group(0)]
    if core_search is not None:
        core = ipa_class[core_search.group(0)]
    else:
        logger.error("No core")
        raise ValueError
    if after_search is None:
        after = None
    else:
        after = [ipa_class[ipar].to_abstract() for ipar in after_search.group(0)]
    abstract_position = AbstractPosition(place, before, after)
    return Rule(abstract_position, core, ipa_class[estimated_sound])
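# Worked example of the rule shape this parser expects: an re.sub-style
# pattern with lookbehind/lookahead character classes around one core
# character. IPA_class is assumed to be the module's IPA-character table,
# with entries for the vowels, 'b', and 'β'.
intervocalic_b = from_regular_expression(r"(?<=[aeiou])b(?=[aeiou])", "β", IPA_class)
# -> a Rule realizing 'b' between vowels as the given estimated sound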
def _get_corpus_properties(self, corpus_name):
    """Check whether a corpus is available for import.
    :type corpus_name: str
    :param corpus_name: Name of available corpus.
    :rtype : str
    """
    try:
        corpora = LANGUAGE_CORPORA[self.language]
    except NameError as name_error:
        msg = 'Corpus not available for language ' \
              '"%s": %s' % (self.language, name_error)
        logger.error(msg)
        raise CorpusImportError(msg)
    for corpus_properties in corpora:
        if corpus_properties['name'] == corpus_name:
            return corpus_properties
    msg = 'Corpus "%s" not available for the ' \
          '"%s" language.' % (corpus_name, self.language)
    logger.error(msg)
    raise CorpusImportError(msg)
def write_concordance_from_string(text: str, name: str) -> None:
    """A reworking of write_concordance_from_file(). Refactor these."""
    list_of_lists = build_concordance(text)  # type: List[List[str]]
    user_data_rel = '~/cltk_data/user_data'  # type: str
    user_data = os.path.expanduser(user_data_rel)  # type: str
    if not os.path.isdir(user_data):
        os.makedirs(user_data)
    file_path = os.path.join(user_data, 'concordance_' + name + '.txt')  # type: str
    concordance_output = ''  # type: str
    for word_list in list_of_lists:
        for line in word_list:
            concordance_output += line + '\n'
    try:
        with open(file_path, 'w') as open_file:
            open_file.write(concordance_output)
        logger.info("Wrote concordance to '%s'.", file_path)
    except IOError as io_error:
        logger.error("Failed to write concordance to '%s'. Error: %s", file_path, io_error)
def _check_install(self):
    """Check if tlgu installed, if not install it."""
    try:
        subprocess.check_output(['which', 'tlgu'])
    except Exception as exc:
        logger.info('TLGU not installed: %s', exc)
        logger.info('Installing TLGU.')
        if not subprocess.check_output(['which', 'gcc']):
            logger.error('GCC seems not to be installed.')
        else:
            tlgu_path_rel = get_cltk_data_dir() + '/greek/software/greek_software_tlgu'
            tlgu_path = os.path.expanduser(tlgu_path_rel)
            if not self.testing:
                print('Do you want to install TLGU?')
                print('To continue, press Return. To exit, Control-C.')
                input()
            else:
                print('Automated or test build, skipping keyboard input confirmation for installation of TLGU.')
            try:
                command = 'cd {0} && make install'.format(tlgu_path)
                print('Going to run command:', command)
                p_out = subprocess.call(command, shell=True)
            except Exception as exc:
                logger.error('TLGU install failed: %s', exc)
            else:
                if p_out == 0:
                    logger.info('TLGU installed.')
                else:
                    logger.error('TLGU install without sudo failed.')
                    # fall back to a sudo install when plain `make install`
                    # fails; Linux installs to '/usr/local/bin' need root
                    command = 'cd {0} && sudo make install'.format(tlgu_path)
                    if not self.testing:
                        print('Could not install without root access. Do you want to install TLGU with sudo?')
                        print('Going to run command:', command)
                        print('To continue, press Return. To exit, Control-C.')
                        input()
                    p_out = subprocess.call(command, shell=True)
                    if p_out == 0:
                        logger.info('TLGU installed.')
                    else:
                        logger.error('TLGU install with sudo failed.')
def onekgreek_tei_xml_to_text_capitains():
    """Use the MyCapytain library to convert TEI to plaintext."""
    xml_dir = os.path.normpath(
        get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error('1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.')
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if '__cts__' not in path]
    # new dir
    new_dir = os.path.normpath(
        get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek_plaintext/')
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)
    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        # strip the '.xml' suffix (not rstrip, which strips characters)
        if xml_name.endswith('.xml'):
            xml_name = xml_name[:-len('.xml')]
        xml_name += '.txt'
        plain_text = ''
        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
            for ref in text.getReffs(level=len(text.citation)):
                psg = text.getTextualNode(subreference=ref, simple=True)
                text_line = psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note"])
                plain_text += text_line
        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, 'w') as file_open:
            file_open.write(plain_text)
def to_phonetics(self):
    """
    Transcribing words in verse helps find alliteration.
    """
    if len(self.long_lines) == 0:
        logger.error("No text was imported")
        # reset this method's own outputs on empty input
        self.transcribed_text = []
        self.phonological_features_text = []
    else:
        transcriber = Transcriber(DIPHTHONGS_IPA, DIPHTHONGS_IPA_class, IPA_class, old_norse_rules)
        transcribed_text = []
        phonological_features_text = []
        for i, long_line in enumerate(self.long_lines):
            transcribed_text.append([])
            phonological_features_text.append([])
            for short_line in long_line:
                assert isinstance(short_line, ShortLine) or isinstance(short_line, LongLine)
                short_line.to_phonetics(transcriber)
                transcribed_text[i].append(short_line.transcribed)
                phonological_features_text[i].append(short_line.phonological_features_text)
        self.transcribed_text = transcribed_text
        self.phonological_features_text = phonological_features_text
def _git_user_defined_corpus(self, corpus_name, corpus_type, uri: str, branch='master'):
    """Clone or update a git repo defined by user.
    TODO: This code is very redundant with what's in import_corpus(),
    could be refactored.
    """
    # git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
    # self._download_corpus(corpus_type, corpus_name, path)
    type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
    type_dir = os.path.expanduser(type_dir_rel)
    repo_name = uri.split('/')[-1]  # eg, 'latin_corpus_newton_example.git'
    # strip the '.git' suffix; ``str.rstrip('.git')`` strips *characters*
    # and would mangle repo names ending in 'g', 'i', or 't'
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    target_dir = os.path.join(type_dir, repo_name)
    target_file = os.path.join(type_dir, repo_name, 'README.md')
    # check if corpus already present
    # if not, clone
    if not os.path.isfile(target_file):
        if not os.path.isdir(type_dir):
            os.makedirs(type_dir)
        try:
            msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
            logger.info(msg)
            Repo.clone_from(uri, target_dir, branch=branch, depth=1,
                            progress=ProgressPrinter())
        except CorpusImportError as corpus_imp_err:
            msg = "Git clone of '{}' failed: '{}'".format(uri, corpus_imp_err)
            logger.error(msg)
    # if corpus is present, pull latest
    else:
        try:
            repo = Repo(target_dir)
            assert not repo.bare  # or: assert repo.exists()
            git_origin = repo.remotes.origin
            msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
            logger.info(msg)
            git_origin.pull()
        except CorpusImportError as corpus_imp_err:
            msg = "Git pull of '{}' failed: '{}'".format(uri, corpus_imp_err)
            logger.error(msg)
def write_concordance_from_file(filepaths: Union[str, List[str]], name: str) -> None:
    """This calls the modified ConcordanceIndex, taken and modified from the
    NLTK, and writes to disk a file named 'concordance_' + name at
    '~/cltk_data/user_data/'.

    TODO: Add language (here or in class), lowercase option,
    stemming/lemmatization, else?

    :type filepaths: str or list
    :param filepaths: Filepath of text(s) to be used in concordance.
    :rtype : str
    """
    assert isinstance(filepaths, (str, list))
    if isinstance(filepaths, str):
        filepath = filepaths  # type: str
        text = read_file(filepath)  # type: str
    elif isinstance(filepaths, list):
        text = ''
        for filepath in filepaths:
            text += read_file(filepath)
    list_of_lists = build_concordance(text)  # type: List[List[str]]
    user_data_rel = '~/cltk_data/user_data'  # type: str
    user_data = os.path.expanduser(user_data_rel)  # type: str
    if not os.path.isdir(user_data):
        os.makedirs(user_data)
    file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
    concordance_output = ''  # type: str
    for word_list in list_of_lists:
        for line in word_list:
            concordance_output += line + '\n'
    try:
        with open(file_path, 'w') as open_file:  # type: IO
            open_file.write(concordance_output)
        logger.info("Wrote concordance to '%s'.", file_path)
    except IOError as io_error:
        logger.error("Failed to write concordance to '%s'. Error: %s", file_path, io_error)
def lemmatize(self, input_text, return_raw=False, return_string=False):
    """Take incoming string or list of tokens. Lookup done against a
    key-value list of lemmata-headword. If a string, tokenize with
    ``PunktLanguageVars()``. If a final period appears on a token, remove
    it, then re-add once replacement done.
    TODO: rm check for final period, change PunktLanguageVars() to nltk_tokenize_words()
    """
    assert type(input_text) in [list, str], \
        logger.error('Input must be a list or string.')
    if type(input_text) is str:
        punkt = PunktLanguageVars()
        tokens = punkt.word_tokenize(input_text)
    else:
        tokens = input_text
    lemmatized_tokens = []
    for token in tokens:
        # check for final period
        final_period = False
        if token[-1] == '.':
            final_period = True
            token = token[:-1]
        # look for token in lemma dict keys
        if token.lower() in self.lemmata.keys():
            headword = self.lemmata[token.lower()]
            # re-add final period if rm'd
            if final_period:
                headword += '.'
            # append to return list
            if not return_raw:
                lemmatized_tokens.append(headword)
            else:
                lemmatized_tokens.append(token + '/' + headword)
        # if token not found in lemma-headword list
        else:
            # re-add final period if rm'd
            if final_period:
                token += '.'
            if not return_raw:
                lemmatized_tokens.append(token)
            else:
                lemmatized_tokens.append(token + '/' + token)
    if not return_string:
        return lemmatized_tokens
    elif return_string:
        return ' '.join(lemmatized_tokens)
def lemmatize(self, input_text, return_raw=False, return_string=False):
    """Take incoming string or list of tokens. Lookup done against a
    key-value list of lemmata-headword. If a string, tokenize with
    ``PunktLanguageVars()``. If a final period appears on a token, remove
    it, then re-add once replacement done.
    TODO: rm check for final period, change PunktLanguageVars() to nltk_tokenize_words()
    """
    assert type(input_text) in [list, str], \
        logger.error('Input must be a list or string.')
    if type(input_text) is str:
        punkt = PunktLanguageVars()
        tokens = punkt.word_tokenize(input_text)
    else:
        tokens = input_text
    lemmatized_tokens = []
    for token in tokens:
        # check for final period
        final_period = False
        if token[-1] == '.':
            final_period = True
            token = token[:-1]
        # look for token in lemma dict keys; lowercase the membership test
        # to match the lowercase lookup on the next line
        if token.lower() in self.lemmata.keys():
            headword = self.lemmata[token.lower()]
            # re-add final period if rm'd
            if final_period:
                headword += '.'
            # append to return list
            if not return_raw:
                lemmatized_tokens.append(headword)
            else:
                lemmatized_tokens.append(token + '/' + headword)
        # if token not found in lemma-headword list
        else:
            # re-add final period if rm'd
            if final_period:
                token += '.'
            if not return_raw:
                lemmatized_tokens.append(token)
            else:
                lemmatized_tokens.append(token + '/' + token)
    if not return_string:
        return lemmatized_tokens
    elif return_string:
        return ' '.join(lemmatized_tokens)
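# Usage sketch for lemmatize(), via the LemmaReplacer imported elsewhere in
# this repo; output depends entirely on the loaded lemma table:
lemma_replacer = LemmaReplacer('latin')
print(lemma_replacer.lemmatize('Quo usque tandem abutere, Catilina, patientia nostra?'))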
def write_concordance_from_file(self, filepaths, name):
    """This calls my modified ConcordanceIndex, taken and modified from the
    NLTK, and writes to disk a file named 'concordance_' + name at
    '~/cltk_data/user_data/'.

    TODO: Add language (here or in class), lowercase option,
    stemming/lemmatization, else?

    :type filepaths: str or list
    :param filepaths: Filepath of text(s) to be used in concordance.
    :rtype : str
    """
    assert isinstance(filepaths, (str, list))
    if isinstance(filepaths, str):
        filepath = filepaths
        text = self._read_file(filepath)
    elif isinstance(filepaths, list):
        text = ''
        for filepath in filepaths:
            text += self._read_file(filepath)
    list_of_lists = self._build_concordance(text)
    user_data_rel = '~/cltk_data/user_data'
    user_data = os.path.expanduser(user_data_rel)
    if not os.path.isdir(user_data):
        os.makedirs(user_data)
    file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
    concordance_output = ''
    for word_list in list_of_lists:
        for line in word_list:
            concordance_output += line + '\n'
    try:
        with open(file_path, 'w') as open_file:
            open_file.write(concordance_output)
        logger.info("Wrote concordance to '%s'.", file_path)
    except IOError as io_error:
        logger.error("Failed to write concordance to '%s': %s", file_path, io_error)
def __init__(self, place=None, manner=None, voiced=None, ipar=None, geminate=None):
    if isinstance(place, Place) or place is None:
        self.place = place
    else:
        logger.error("Incorrect argument")
        # without this raise, self.place would be silently left unset
        raise ValueError
    if isinstance(manner, Manner) or manner is None:
        self.manner = manner
    else:
        logger.error("Incorrect argument")
        raise ValueError
    if type(voiced) == bool or voiced is None:
        self.voiced = voiced
    else:
        logger.error("Incorrect argument")
        raise TypeError
    if type(geminate) == bool or geminate is None:
        self.geminate = geminate
    else:
        logger.error("Incorrect argument")
        raise TypeError
    self.ipar = ipar
def __init__(self, place=None, manner=None, voiced=None, ipar=None, geminate=None):
    if place in PLACES or place is None:
        self.place = place
    else:
        logger.error("Incorrect argument")
        # without this raise, self.place would be silently left unset
        raise ValueError
    if manner in MANNERS or manner is None:
        self.manner = manner
    else:
        logger.error("Incorrect argument")
        raise ValueError
    if type(voiced) == bool or voiced is None:
        self.voiced = voiced
    else:
        logger.error("Incorrect argument")
        raise TypeError
    if type(geminate) == bool or geminate is None:
        self.geminate = geminate
    else:
        logger.error("Incorrect argument")
        raise TypeError
    self.ipar = ipar
def __init__(self, height=None, backness=None, rounded=None, length=None, ipar=None):
    if isinstance(height, Height) or height is None:
        self.height = height
    else:
        logger.error("Incorrect argument")
        raise ValueError
    if isinstance(backness, Backness) or backness is None:
        self.backness = backness
    else:
        logger.error("Incorrect argument")
        raise ValueError
    if type(rounded) == bool or rounded is None:
        self.rounded = rounded
    else:
        logger.error("Incorrect argument")
        raise TypeError
    if isinstance(length, Length) or length is None:
        self.length = length
    else:
        logger.error("Incorrect argument")
        raise ValueError
    self.ipar = ipar
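# Construction sketch for the Vowel/Consonant classes above; the enum member
# names (Height.open, Place.bilabial, etc.) are assumptions about the
# accompanying Height/Backness/Length/Place/Manner enums:
a = Vowel(Height.open, Backness.front, False, Length.short, 'a')
b = Consonant(Place.bilabial, Manner.stop, True, 'b', False)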
def open_pickle(path: str):
    """Open a pickle and return loaded pickle object.
    :type path: str
    :param path: File path to pickle file to be opened.
    :rtype : object
    """
    try:
        with open(path, 'rb') as opened_pickle:
            try:
                return pickle.load(opened_pickle)
            except Exception as pickle_error:
                logger.error(pickle_error)
                raise
    except FileNotFoundError as fnf_error:
        logger.error(fnf_error)
        raise
    except IOError as io_err:
        logger.error(io_err)
        raise
    except EOFError as eof_error:
        logger.error(eof_error)
        raise
def __init__(self, height=None, backness=None, rounded=None, length=None, ipar=None):
    if height in HEIGHT or height is None:
        self.height = height
    else:
        logger.error("Incorrect argument")
        raise ValueError
    if backness in BACKNESS or backness is None:
        self.backness = backness
    else:
        logger.error("Incorrect argument")
        raise ValueError
    if type(rounded) == bool or rounded is None:
        self.rounded = rounded
    else:
        logger.error("Incorrect argument")
        raise TypeError
    if length in LENGTHS or length is None:
        self.length = length
    else:
        logger.error("Incorrect argument")
        raise ValueError
    self.ipar = ipar
def index_corpus(self):
    """Make a Whoosh index out of a pre-processed corpus, ie TLG, PHI5, or PHI7.

    TLG takes almost 13 min; PHI5 1.5 min.
    To setup index parameters:
    >>> # cltk_index = CLTKIndex('latin', 'phi5')  # 1.5 min, 363 docs
    >>> # cltk_index = CLTKIndex('latin', 'phi5', chunk='work')  # 2 min, 837 docs
    >>> # cltk_index = CLTKIndex('greek', 'tlg')  # 13 min, 1823 docs
    >>> # cltk_index = CLTKIndex('greek', 'tlg', chunk='work')  # 15.5 min, 6625 docs

    And to start indexing:
    >>> # cltk_index.index_corpus()

    TODO: Prevent overwriting. Ask user to rm old dir before re-indexing.
    TODO: Add option for lemmatizing.
    TODO: Add for figure out lower() options.
    TODO: Process TLG through forthcoming normalize().
    TODO: Add name to each index.
    TODO: Turn off any language-specific mods (eg, stemming, case) that Whoosh
    might be doing by default.
    """
    # Setup index dir
    schema = Schema(path=ID(stored=True),
                    author=TEXT(stored=True),
                    content=TEXT)
    try:
        _index = create_in(self.index_path, schema)
    except FileNotFoundError:
        os.makedirs(self.index_path)
        _index = create_in(self.index_path, schema)
    writer = _index.writer()
    # Setup corpus to be indexed
    if self.lang == 'greek' and self.corpus == 'tlg':
        corpus_path = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
        if self.chunk == 'work':
            corpus_path = os.path.expanduser('~/cltk_data/greek/text/tlg/individual_works/')
    elif self.lang == 'latin' and self.corpus == 'phi5':
        corpus_path = os.path.expanduser('~/cltk_data/latin/text/phi5/plaintext/')
        if self.chunk == 'work':
            corpus_path = os.path.expanduser('~/cltk_data/latin/text/phi5/individual_works/')
    assert os.path.isdir(corpus_path), 'Corpus does not exist in the following location: "%s". Use CLTK Corpus Importer and TLGU to create transformed corpus.' % corpus_path  # pylint: disable=line-too-long
    files = os.listdir(corpus_path)
    if self.lang == 'greek' and self.corpus == 'tlg':
        files = [f[:-4] for f in files if f.startswith('TLG')]
        corpus_index = TLG_AUTHOR_MAP
    elif self.lang == 'latin' and self.corpus == 'phi5':
        files = [f[:-4] for f in files if f.startswith('LAT')]
        corpus_index = PHI5_AUTHOR_MAP
    time_0 = time.time()
    logger.info("Commencing indexing of %s documents of '%s' corpus.", len(files), self.corpus)
    logger.info('Index will be written to: "%s".', self.index_path)
    if self.chunk == 'author':
        for count, file in enumerate(files, 1):
            try:
                if self.lang == 'greek' and self.corpus == 'tlg':
                    file = file[3:]
                    author = corpus_index[file]
                    path = os.path.join(corpus_path, 'TLG' + file + '.TXT')
                if self.lang == 'latin' and self.corpus == 'phi5':
                    author = corpus_index[file]
                    path = os.path.join(corpus_path, file + '.TXT')
            except KeyError as key_error:
                if file.startswith('LAT9999'):
                    continue
                logger.error(key_error)
                raise
            with open(path) as file_open:
                content = file_open.read()
            writer.add_document(path=path, author=author, content=content)
            if count % 100 == 0:
                logger.info('Indexed doc %s.', count)
    if self.chunk == 'work':
        for count, file in enumerate(files, 1):
            try:
                if self.lang == 'greek' and self.corpus == 'tlg':
                    path = os.path.join(corpus_path, file + '.TXT')
                    author = corpus_index[file[3:-8]]
                if self.lang == 'latin' and self.corpus == 'phi5':
                    path = os.path.join(corpus_path, file + '.TXT')
                    author = corpus_index[file[:-8]]
            except KeyError as key_error:
                if file.startswith('LAT9999'):
                    continue
                logger.error(key_error)
                raise
            with open(path) as file_open:
                content = file_open.read()
            writer.add_document(path=path, author=author, content=content)
            if count % 100 == 0:
                logger.info('Indexed doc %s.', count)
    logger.info('Commencing to commit changes.')
    writer.commit()
    time_1 = time.time()
    elapsed = time_1 - time_0
    logger.info('Finished indexing all documents in %s seconds (averaging %s docs per sec.)',
                elapsed, len(files) / elapsed)
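# Query sketch against a finished index, using plain Whoosh calls; the index
# directory below is illustrative, and the field names match the Schema above:
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

_index = open_dir(os.path.expanduser('~/cltk_data/latin/index/phi5/author'))
with _index.searcher() as searcher:
    query = QueryParser('content', _index.schema).parse('amicitia')
    for hit in searcher.search(query, limit=5):
        print(hit['author'], hit['path'])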
def import_corpus(self, corpus_name, local_path=None, branch='master'):  # pylint: disable=R0912
    """Download a remote or load local corpus into dir ``~/cltk_data``.

    TODO: maybe add ``from git import RemoteProgress``
    TODO: refactor this, it's getting kinda long

    :type corpus_name: str
    :param corpus_name: The name of an available corpus.
    :param local_path: str
    :param local_path: A filepath, required when importing local corpora.
    :param branch: What Git branch to clone.
    """
    corpus_properties = self._get_corpus_properties(corpus_name)
    try:
        location = corpus_properties['location']
    except KeyError:
        # git_uri = corpus_properties['git_remote']
        # 'name' assumed here: corpus entries carry a 'name' key elsewhere
        # in this module (the original read ``corpus_properties['']``)
        git_name = corpus_properties['name']
        git_uri = corpus_properties['origin']
        git_type = corpus_properties['type']
        # pass this off to a special downloader just for custom urls
        self._git_user_defined_corpus(git_name, git_type, git_uri)
        return
    corpus_type = corpus_properties['type']
    if location == 'remote':
        # git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
        git_uri = corpus_properties['origin']
        type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
        type_dir = os.path.expanduser(type_dir_rel)
        target_dir = os.path.join(type_dir, corpus_name)
        target_file = os.path.join(type_dir, corpus_name, 'README.md')
        # check if corpus already present
        # if not, clone
        if not os.path.isfile(target_file):
            if not os.path.isdir(type_dir):
                os.makedirs(type_dir)
            try:
                msg = "Cloning '{}' from '{}'".format(corpus_name, git_uri)
                logger.info(msg)
                Repo.clone_from(git_uri, target_dir, branch=branch, depth=1,
                                progress=ProgressPrinter())
            except CorpusImportError as corpus_imp_err:
                msg = "Git clone of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                logger.error(msg)
        # if corpus is present, pull latest
        else:
            try:
                repo = Repo(target_dir)
                assert not repo.bare  # or: assert repo.exists()
                git_origin = repo.remotes.origin
                msg = "Pulling latest '{}' from '{}'.".format(corpus_name, git_uri)
                logger.info(msg)
                git_origin.pull()
            except CorpusImportError as corpus_imp_err:
                msg = "Git pull of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                logger.error(msg)
    elif location == 'local':
        msg = "Importing from local path: '{}'".format(local_path)
        logger.info(msg)
        if corpus_name in ('phi5', 'phi7', 'tlg'):
            if corpus_name == 'phi5':
                # normalize path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'PHI5':
                    logger.info("Directory must be named 'PHI5'.")
            if corpus_name == 'phi7':
                # normalize local_path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'PHI7':
                    logger.info("Directory must be named 'PHI7'.")
            if corpus_name == 'tlg':
                # normalize path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'TLG_E':
                    logger.info("Directory must be named 'TLG_E'.")
            # move the dir-checking commands into a function
            data_dir = os.path.expanduser(CLTK_DATA_DIR)
            originals_dir = os.path.join(data_dir, 'originals')
            # check for `originals` dir; if not present mkdir
            if not os.path.isdir(originals_dir):
                os.makedirs(originals_dir)
                msg = "Wrote directory at '{}'.".format(originals_dir)
                logger.info(msg)
            tlg_originals_dir = os.path.join(data_dir, 'originals', corpus_name)
            # check for `originals/<corpus_name>`; if pres, delete
            if os.path.isdir(tlg_originals_dir):
                shutil.rmtree(tlg_originals_dir)
                msg = "Removed directory at '{}'.".format(tlg_originals_dir)
                logger.info(msg)
            # copy_dir requires that target
            if not os.path.isdir(tlg_originals_dir):
                self._copy_dir_recursive(local_path, tlg_originals_dir)
""" import logging import os import sys import time from cltk.utils.cltk_logger import logger # TODO: Fix this # KJ added this to fix failing build on Travis CI. Gensim seems to load boto, which in turn causes an error. try: from gensim.models import Word2Vec except AttributeError: logger.error( 'Command `from gensim.models import Word2Vec` failed with AttributeError.' ) from cltk.corpus.utils.formatter import phi5_plaintext_cleanup from cltk.corpus.utils.formatter import tlg_plaintext_cleanup from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths from cltk.stem.latin.j_v import JVReplacer from cltk.stem.lemma import LemmaReplacer from cltk.stop.latin.stops import STOPS_LIST as latin_stops from cltk.tokenize.word import nltk_tokenize_words from cltk.tokenize.sentence import TokenizeSentence from cltk.tokenize.word import WordTokenizer def gen_docs(corpus, lemmatize, rm_stops):
def convert(self, input_path=None, output_path=None, markup=None,
            break_lines=False, divide_works=False, latin=False,
            extra_args=None):
    """
    :param input_path: TLG filepath to convert.
    :param output_path: filepath of new converted text.
    :param markup: Specificity of inline markup. Default None removes all
        numerical markup; 'full' gives most detailed, with reference numbers
        included before each text line.
    :param break_lines: No spaces; removes line ends and hyphens before an
        ID code; hyphens and spaces before page and column ends are retained.
    :param divide_works: Each work (book) is output as a separate file in
        the form output_file-xxx.txt; if an output file is not specified,
        this option has no effect.
    :param latin: Primarily Latin text (PHI). Some TLG texts, notably
        doccan1.txt and doccan2.txt are mostly roman texts lacking explicit
        language change codes. Setting this option will force a change to
        Latin text after each citation block is encountered.
    :param extra_args: Any other tlgu args to be passed, in list form and
        without dashes, e.g.: ['p', 'b', 'B'].
    """
    # setup file paths
    input_path = os.path.expanduser(input_path)
    output_path = os.path.expanduser(output_path)
    # check input path exists
    assert os.path.isfile(input_path), 'File {0} does not exist.'.format(input_path)
    # setup tlgu flags
    tlgu_options = []
    if markup == 'full':
        full_args = ['v', 'w', 'x', 'y', 'z']
        tlgu_options.extend(full_args)
    if break_lines:
        tlgu_options.append('N')
    if divide_works:
        tlgu_options.append('W')
    if latin:
        tlgu_options.append('r')
    # setup extra args
    if extra_args is None:
        extra_args = []
    else:
        try:
            extra_args = list(extra_args)
        except Exception as exc:
            logger.error("Argument 'extra_args' must be a list: %s.", exc)
            raise
    tlgu_options = tlgu_options + extra_args
    # assemble all tlgu flags
    tlgu_options = list(set(tlgu_options))
    if tlgu_options:
        tlgu_flags = '-' + ' -'.join(tlgu_options)
    else:
        tlgu_flags = ''
    # make tlgu call
    tlgu_call = 'tlgu {0} {1} {2}'.format(tlgu_flags, input_path, output_path)
    logger.info(tlgu_call)
    try:
        p_out = subprocess.call(tlgu_call, shell=True)
        if p_out == 1:
            logger.error('Failed to convert %s to %s.', input_path, output_path)
    except Exception as exc:
        logger.error('Failed to convert %s to %s: %s', input_path, output_path, exc)
        raise
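# Usage sketch for convert(), assuming the TLGU wrapper class around it and
# an installed tlgu binary; the file paths are illustrative:
tlgu = TLGU()
tlgu.convert('~/cltk_data/originals/tlg/TLG0012.TXT',
             '~/cltk_data/greek/text/tlg/plaintext/TLG0012.TXT',
             markup='full')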
def import_corpus(self, corpus_name, local_path=None):  # pylint: disable=R0912
    """Download a remote or load local corpus into dir ``~/cltk_data``.

    TODO: maybe add ``from git import RemoteProgress``
    TODO: refactor this, it's getting kinda long

    :type corpus_name: str
    :param corpus_name: The name of an available corpus.
    :param local_path: str
    :param local_path: A filepath, required when importing local corpora.
    """
    corpus_properties = self._check_corpus_availability(corpus_name)
    location = corpus_properties['location']
    corpus_type = corpus_properties['type']
    if location == 'remote':
        git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
        # self._download_corpus(corpus_type, corpus_name, path)
        type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
        type_dir = os.path.expanduser(type_dir_rel)
        target_dir = os.path.join(type_dir, corpus_name)
        target_file = os.path.join(type_dir, corpus_name, 'README.md')
        # check if corpus already present
        # if not, clone
        if not os.path.isfile(target_file):
            if not os.path.isdir(type_dir):
                os.makedirs(type_dir)
            try:
                logger.info("Cloning '%s' from '%s'", corpus_name, git_uri)
                Repo.clone_from(git_uri, target_dir, depth=1)
            except Exception as e:
                # pass the logging arguments separately, not as one tuple
                logger.error("Git clone of '%s' failed: '%s'", git_uri, e)
        # if corpus is present, pull latest
        else:
            try:
                repo = Repo(target_dir)
                assert not repo.bare  # or: assert repo.exists()
                o = repo.remotes.origin
                logger.info("Pulling latest '%s' from '%s'.", corpus_name, git_uri)
                o.pull()
            except Exception as e:
                logger.error("Git pull of '%s' failed: '%s'", git_uri, e)
    elif location == 'local':
        logger.info("Importing from local path: '%s'", local_path)
        if corpus_name in ('phi5', 'phi7', 'tlg'):
            if corpus_name == 'phi5':
                # normalize path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'PHI5':
                    logger.info("Directory must be named 'PHI5'.")
            if corpus_name == 'phi7':
                # normalize local_path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'PHI7':
                    logger.info("Directory must be named 'PHI7'.")
            if corpus_name == 'tlg':
                # normalize path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'TLG_E':
                    logger.info("Directory must be named 'TLG_E'.")
            # move the dir-checking commands into a function
            data_dir = os.path.expanduser(CLTK_DATA_DIR)
            originals_dir = os.path.join(data_dir, 'originals')
            # check for `originals` dir; if not present mkdir
            if not os.path.isdir(originals_dir):
                os.makedirs(originals_dir)
                logger.info("Wrote directory at '%s'.", originals_dir)
            tlg_originals_dir = os.path.join(data_dir, 'originals', corpus_name)
            # check for `originals/<corpus_name>`; if pres, delete
            if os.path.isdir(tlg_originals_dir):
                shutil.rmtree(tlg_originals_dir)
                logger.info("Removed directory at '%s'.", tlg_originals_dir)
            # copy_dir requires that target
            if not os.path.isdir(tlg_originals_dir):
                self._copy_dir_recursive(local_path, tlg_originals_dir)
""" from cltk.utils.cltk_logger import logger from nltk.tokenize import wordpunct_tokenize import re import unicodedata try: # James Tauber's greek_accentuation package from greek_accentuation import characters as chars except ImportError as import_error: message = 'Missing "greek_accentuation" package. Install with ' \ '`pip install greek-accentuation`.' logger.error(message) logger.error(import_error) raise __author__ = ['Jack Duff <*****@*****.**>'] __license__ = 'MIT License. See LICENSE.' # Dictionaries of phonological reconstructions for use in transcribing. # Probert, Philomen. 2010. Phonology, in E. Bakker, A Companion to the \ # Ancient Greek Language. # (Entries which are commented out are realized through diacritic analysis.) GREEK = { 'Attic': { 'Probert': {
""" from cltk.utils.cltk_logger import logger from nltk.tokenize import wordpunct_tokenize import re import unicodedata try: # James Tauber's greek_accentuation package from greek_accentuation import characters as chars except ImportError as import_error: print('Missing "greek_accentuation" package. Install with ' + '`pip install greek-accentuation`.') logger.error(import_error) raise __author__ = 'Jack Duff <*****@*****.**>' __license__ = 'MIT License. See LICENSE.' # Dictionaries of phonological reconstructions for use in transcribing. # Probert, Philomen. 2010. Phonology, in E. Bakker, A Companion to the \ # Ancient Greek Language. # (Entries which are commented out are realized through diacritic analysis.) GREEK = { 'Attic': { 'Probert': { 'correspondence': { 'α': 'ɑ',
fuzzywuzzy

Good-to-haves: python-Levenshtein
"""

import re, string
import unicodedata

from cltk.tokenize.sentence import TokenizeSentence
from cltk.utils.cltk_logger import logger

try:
    from fuzzywuzzy import fuzz
except ImportError as imp_err:
    logger.error("'fuzzywuzzy' library required for this module: %s" % imp_err)
    raise ImportError

__author__ = 'Luke Hollis <*****@*****.**>'
__license__ = 'MIT License. See LICENSE.'


class Levenshtein:
    """A wrapper class for fuzzywuzzy's Levenshtein distance calculation methods."""

    def __init__(self):
        """Initialize class. Currently empty."""
        return

    @staticmethod
    def ratio(string_a, string_b):
        """At the most basic level, return a Levenshtein distance ratio via
def import_corpus(self, corpus_name, local_path=None, branch='master'):  # pylint: disable=R0912
    """Download a remote or load local corpus into dir ``~/cltk_data``.

    TODO: maybe add ``from git import RemoteProgress``
    TODO: refactor this, it's getting kinda long

    :type corpus_name: str
    :param corpus_name: The name of an available corpus.
    :param local_path: str
    :param local_path: A filepath, required when importing local corpora.
    :param branch: What Git branch to clone.
    """
    corpus_properties = self._get_corpus_properties(corpus_name)
    location = corpus_properties['location']
    corpus_type = corpus_properties['type']
    if location == 'remote':
        git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
        # self._download_corpus(corpus_type, corpus_name, path)
        type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
        type_dir = os.path.expanduser(type_dir_rel)
        target_dir = os.path.join(type_dir, corpus_name)
        target_file = os.path.join(type_dir, corpus_name, 'README.md')
        # check if corpus already present
        # if not, clone
        if not os.path.isfile(target_file):
            if not os.path.isdir(type_dir):
                os.makedirs(type_dir)
            try:
                msg = "Cloning '{}' from '{}'".format(corpus_name, git_uri)
                logger.info(msg)
                Repo.clone_from(git_uri, target_dir, branch=branch, depth=1,
                                progress=ProgressPrinter())
            except CorpusImportError as corpus_imp_err:
                msg = "Git clone of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                logger.error(msg)
        # if corpus is present, pull latest
        else:
            try:
                repo = Repo(target_dir)
                assert not repo.bare  # or: assert repo.exists()
                git_origin = repo.remotes.origin
                msg = "Pulling latest '{}' from '{}'.".format(corpus_name, git_uri)
                logger.info(msg)
                git_origin.pull()
            except CorpusImportError as corpus_imp_err:
                msg = "Git pull of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                logger.error(msg)
    elif location == 'local':
        msg = "Importing from local path: '{}'".format(local_path)
        logger.info(msg)
        if corpus_name in ('phi5', 'phi7', 'tlg'):
            if corpus_name == 'phi5':
                # normalize path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'PHI5':
                    logger.info("Directory must be named 'PHI5'.")
            if corpus_name == 'phi7':
                # normalize local_path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'PHI7':
                    logger.info("Directory must be named 'PHI7'.")
            if corpus_name == 'tlg':
                # normalize path for checking dir
                if local_path.endswith('/'):
                    local_path = local_path[:-1]
                # check for right corpus dir
                if os.path.split(local_path)[1] != 'TLG_E':
                    logger.info("Directory must be named 'TLG_E'.")
            # move the dir-checking commands into a function
            data_dir = os.path.expanduser(CLTK_DATA_DIR)
            originals_dir = os.path.join(data_dir, 'originals')
            # check for `originals` dir; if not present mkdir
            if not os.path.isdir(originals_dir):
                os.makedirs(originals_dir)
                msg = "Wrote directory at '{}'.".format(originals_dir)
                logger.info(msg)
            tlg_originals_dir = os.path.join(data_dir, 'originals', corpus_name)
            # check for `originals/<corpus_name>`; if pres, delete
            if os.path.isdir(tlg_originals_dir):
                shutil.rmtree(tlg_originals_dir)
                msg = "Removed directory at '{}'.".format(tlg_originals_dir)
                logger.info(msg)
            # copy_dir requires that target
            if not os.path.isdir(tlg_originals_dir):
                self._copy_dir_recursive(local_path, tlg_originals_dir)
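# Usage sketch for import_corpus(): remote corpora need only a name, local
# ones a filepath; the corpus name and path below are illustrative:
corpus_importer = CorpusImporter('latin')
corpus_importer.import_corpus('latin_text_latin_library')
corpus_importer.import_corpus('phi5', local_path='~/Documents/corpora/PHI5/')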
def convert(self, input_path=None, output_path=None, markup=None,
            rm_newlines=False, divide_works=False, latin=False,
            extra_args=None):
    """
    :param input_path: TLG filepath to convert.
    :param output_path: filepath of new converted text.
    :param markup: Specificity of inline markup. Default None removes all
        numerical markup; 'full' gives most detailed, with reference numbers
        included before each text line.
    :param rm_newlines: No spaces; removes line ends and hyphens before an
        ID code; hyphens and spaces before page and column ends are retained.
    :param divide_works: Each work (book) is output as a separate file in
        the form output_file-xxx.txt; if an output file is not specified,
        this option has no effect.
    :param latin: Primarily Latin text (PHI). Some TLG texts, notably
        doccan1.txt and doccan2.txt are mostly roman texts lacking explicit
        language change codes. Setting this option will force a change to
        Latin text after each citation block is encountered.
    :param extra_args: Any other tlgu args to be passed, in list form and
        without dashes, e.g.: ['p', 'b', 'B'].
    """
    # setup file paths
    input_path = os.path.expanduser(input_path)
    output_path = os.path.expanduser(output_path)
    # check input path exists
    assert os.path.isfile(input_path), 'File {0} does not exist.'.format(input_path)
    # setup tlgu flags
    tlgu_options = []
    if markup == 'full':
        full_args = ['v', 'w', 'x', 'y', 'z']
        tlgu_options.extend(full_args)
    if rm_newlines:
        tlgu_options.append('N')
    if divide_works:
        tlgu_options.append('W')
    if latin:
        tlgu_options.append('r')
    # setup extra args
    if extra_args is None:
        extra_args = []
    else:
        try:
            extra_args = list(extra_args)
        except Exception as exc:
            logger.error("Argument 'extra_args' must be a list: %s.", exc)
            raise
    tlgu_options = tlgu_options + extra_args
    # assemble all tlgu flags
    tlgu_options = list(set(tlgu_options))
    if tlgu_options:
        tlgu_flags = '-' + ' -'.join(tlgu_options)
    else:
        tlgu_flags = ''
    # make tlgu call
    tlgu_call = 'tlgu {0} {1} {2}'.format(tlgu_flags, input_path, output_path)
    logger.info(tlgu_call)
    try:
        p_out = subprocess.call(tlgu_call, shell=True)
        if p_out == 1:
            logger.error('Failed to convert %s to %s.', input_path, output_path)
    except Exception as exc:
        logger.error('Failed to convert %s to %s: %s', input_path, output_path, exc)
        raise
"""Tools for working with Levenshtein distance algorithm and distance ratio between strings. """ from cltk.utils.cltk_logger import logger try: from fuzzywuzzy import fuzz except ImportError as imp_err: message = "'fuzzywuzzy' library required for this module: %s. Install with `pip install fuzzywuzzy python-Levenshtein`" % imp_err logger.error(message) print(message) raise ImportError __author__ = ['Luke Hollis <*****@*****.**>'] __license__ = 'MIT License. See LICENSE.' class Levenshtein: """A wrapper class for fuzzywuzzy's Levenshtein distance calculation methods.""" def __init__(self): """Initialize class. Currently empty.""" return @staticmethod def ratio(string_a, string_b): """At the most basic level, return a Levenshtein distance ratio via fuzzywuzzy. :param string_a: str :param string_b: str :return: float
def convert_corpus(self, corpus, markup=None, break_lines=False,
                   divide_works=False, latin=None, extra_args=None):  # pylint: disable=W0613
    """Look for imported TLG or PHI files and convert them all to
    ``~/cltk_data/greek/text/tlg/<plaintext>``.

    TODO: Should this and/or convert() be static?
    TODO: Add markup options to input.
    TODO: Do something with break_lines, divide_works, and extra_args, or rm them.
    """
    orig_path_rel = '~/cltk_data/originals'
    orig_path = os.path.expanduser(orig_path_rel)
    target_path_rel = '~/cltk_data'
    target_path = os.path.expanduser(target_path_rel)
    assert corpus in ['tlg', 'phi5', 'phi7'], "Corpus must be 'tlg', 'phi5', or 'phi7'"
    orig_path = os.path.join(orig_path, corpus)
    if corpus in ['tlg', 'phi7']:
        if corpus == 'phi7' and latin is True:
            latin = True
            target_path = os.path.join(target_path, 'latin', 'text', corpus)
        else:
            latin = None
            target_path = os.path.join(target_path, 'greek', 'text', corpus)
    else:  # phi5 is always Latin
        target_path = os.path.join(target_path, 'latin', 'text', corpus)
        latin = True
    try:
        corpus_files = os.listdir(orig_path)
    except Exception as exception:
        logger.error("Failed to find TLG files: %s", exception)
        raise
    # Make a list of files to be converted.
    txts = [x for x in corpus_files if x.endswith('TXT')]
    # Loop through the list and convert one file at a time.
    for txt in txts:
        orig_txt_path = os.path.join(orig_path, txt)
        if markup is None:
            target_txt_dir = os.path.join(target_path, 'plaintext')
        else:
            target_txt_dir = os.path.join(target_path, str(markup))
        if not os.path.isdir(target_txt_dir):
            os.makedirs(target_txt_dir)
        target_txt_path = os.path.join(target_txt_dir, txt)
        try:
            self.convert(orig_txt_path, target_txt_path, markup=None,
                         rm_newlines=False, divide_works=False, latin=latin,
                         extra_args=None)
        except Exception as exception:
            logger.error("Failed to convert file '%s' to '%s': %s",
                         orig_txt_path, target_txt_path, exception)
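A usage sketch (same assumed TLGU class as above): this expects the originals to have been placed under ~/cltk_data/originals/<corpus> by import_corpus():

tlgu = TLGU()
tlgu.convert_corpus('phi5')  # writes plaintext to ~/cltk_data/latin/text/phi5/plaintext/
tlgu.convert_corpus('tlg')   # writes plaintext to ~/cltk_data/greek/text/tlg/plaintext/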
# TODO: Add CLTK logging to this.

import logging
import os
import sys
import time

from cltk.utils.cltk_logger import logger

# TODO: Fix this. KJ added this to fix a failing build on Travis CI. Gensim
# seems to load boto, which in turn causes an error.
try:
    from gensim.models import Word2Vec
except AttributeError:
    logger.error('Command `from gensim.models import Word2Vec` failed with AttributeError.')

from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer  # TODO: Change lemmatizer.
from cltk.stop.latin import STOPS_LIST as latin_stops
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer


def gen_docs(corpus, lemmatize, rm_stops):
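The body of gen_docs() is elided here. As an illustration only, a hypothetical preprocessing helper built from the imports above might look like the following; the function name, flow, and keyword arguments are assumptions, not this module's actual code:

def preprocess_latin_doc(text, lemmatize=False, rm_stops=False):
    """Hypothetical helper: clean and tokenize one PHI5 document, optionally
    lemmatizing and removing stopwords, using the imports above."""
    text = phi5_plaintext_cleanup(text, rm_punctuation=True, rm_periods=False)
    text = JVReplacer().replace(text.lower())  # normalize j/v and i/u spellings
    tokens = WordTokenizer('latin').tokenize(text)
    if rm_stops:
        tokens = [t for t in tokens if t not in latin_stops]
    if lemmatize:
        tokens = LemmaReplacer('latin').lemmatize(tokens)
    return tokens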
def index_corpus(self):
    """Make a Whoosh index out of a pre-processed corpus, i.e., TLG, PHI5, or PHI7.

    TLG takes almost 13 min; PHI5 1.5 min.

    To set up index parameters:
    >>> cltk_index = CLTKIndex('latin', 'phi5')  # 1.5 min, 363 docs
    >>> cltk_index = CLTKIndex('latin', 'phi5', chunk='work')  # 2 min, 837 docs
    >>> cltk_index = CLTKIndex('greek', 'tlg')  # 13 min, 1823 docs
    >>> cltk_index = CLTKIndex('greek', 'tlg', chunk='work')  # 15.5 min, 6625 docs

    And to start indexing:
    >>> cltk_index.index_corpus()

    TODO: Prevent overwriting. Ask user to rm old dir before re-indexing.
    TODO: Add option for lemmatizing.
    TODO: Figure out lower() options.
    TODO: Process TLG through forthcoming normalize().
    TODO: Add name to each index.
    TODO: Turn off any language-specific mods (e.g., stemming, case) that
    Whoosh might be doing by default.
    """
    # Set up the index dir.
    schema = Schema(path=ID(stored=True),
                    author=TEXT(stored=True),
                    content=TEXT)
    try:
        _index = create_in(self.index_path, schema)
    except FileNotFoundError:
        os.makedirs(self.index_path)
        _index = create_in(self.index_path, schema)
    writer = _index.writer()

    # Set up the corpus to be indexed.
    if self.lang == 'greek' and self.corpus == 'tlg':
        corpus_path = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
        if self.chunk == 'work':
            corpus_path = os.path.expanduser('~/cltk_data/greek/text/tlg/individual_works/')
    elif self.lang == 'latin' and self.corpus == 'phi5':
        corpus_path = os.path.expanduser('~/cltk_data/latin/text/phi5/plaintext/')
        if self.chunk == 'work':
            corpus_path = os.path.expanduser('~/cltk_data/latin/text/phi5/individual_works/')
    assert os.path.isdir(corpus_path), 'Corpus does not exist in the following location: "%s". Use CLTK Corpus Importer and TLGU to create transformed corpus.' % corpus_path  # pylint: disable=line-too-long
    files = os.listdir(corpus_path)
    if self.lang == 'greek' and self.corpus == 'tlg':
        files = [f[:-4] for f in files if f.startswith('TLG')]
        corpus_index = TLG_AUTHOR_MAP
    elif self.lang == 'latin' and self.corpus == 'phi5':
        files = [f[:-4] for f in files if f.startswith('LAT')]
        corpus_index = PHI5_AUTHOR_MAP

    time_0 = time.time()
    logger.info("Commencing indexing of %s documents of '%s' corpus." % (len(files), self.corpus))  # pylint: disable=line-too-long
    logger.info('Index will be written to: "%s".' % self.index_path)
    if self.chunk == 'author':
        for count, file in enumerate(files, 1):
            try:
                if self.lang == 'greek' and self.corpus == 'tlg':
                    file = file[3:]
                    author = corpus_index[file]
                    path = os.path.join(corpus_path, 'TLG' + file + '.TXT')
                if self.lang == 'latin' and self.corpus == 'phi5':
                    author = corpus_index[file]
                    path = os.path.join(corpus_path, file + '.TXT')
            except KeyError as key_error:
                if file.startswith('LAT9999'):
                    # Not in the author map; skip it.
                    continue
                logger.error(key_error)
                raise
            with open(path) as file_open:
                content = file_open.read()
            writer.add_document(path=path, author=author, content=content)
            if count % 100 == 0:
                logger.info('Indexed doc %s.' % count)

    if self.chunk == 'work':
        for count, file in enumerate(files, 1):
            try:
                if self.lang == 'greek' and self.corpus == 'tlg':
                    path = os.path.join(corpus_path, file + '.TXT')
                    author = corpus_index[file[3:-8]]
                if self.lang == 'latin' and self.corpus == 'phi5':
                    path = os.path.join(corpus_path, file + '.TXT')
                    author = corpus_index[file[:-8]]
            except KeyError as key_error:
                if file.startswith('LAT9999'):
                    # Not in the author map; skip it.
                    continue
                logger.error(key_error)
                raise
            with open(path) as file_open:
                content = file_open.read()
            writer.add_document(path=path, author=author, content=content)
            if count % 100 == 0:
                logger.info('Indexed doc %s.' % count)

    logger.info('Commencing to commit changes.')
    writer.commit()

    time_1 = time.time()
    elapsed = time_1 - time_0
    logger.info('Finished indexing all documents in %s seconds (averaging %s docs per sec.)' % (elapsed, (len(files) / elapsed)))  # pylint: disable=line-too-long
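Once the index is committed, it can be searched with standard Whoosh calls. A minimal search sketch; the query string 'amor' and the result limit are illustrative:

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir(cltk_index.index_path)  # cltk_index as in the docstring examples above
with ix.searcher() as searcher:
    query = QueryParser('content', schema=ix.schema).parse('amor')
    for hit in searcher.search(query, limit=10):
        print(hit['author'], hit['path'])  # stored fields from the schema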