def test_greek_betacode_to_unicode(self):
    """Exercise ``Replacer.beta_code`` on several Beta Code samples.

    Note: assertEqual appears to not be correctly comparing certain
    characters (``ά`` and ``ί``, at least).
    """
    # Each entry pairs a Beta Code input with the Unicode text expected
    # back from the converter.
    cases = (
        # Generic test
        (r"""O(/PWS OU)=N MH\ TAU)TO\ """, 'ὅπως οὖν μὴ ταὐτὸ '),
        # Iota with diaeresis
        (r"""*XALDAI+KH\N""", 'Χαλδαϊκὴν'),
        # Upsilon with diaeresis
        (r"""PROU+POTETAGME/NWN""", 'προϋποτεταγμένων'),
        # Lowercase Beta Code input
        (r"""proi+sxome/nwn""", 'προϊσχομένων'),
    )
    converter = Replacer()
    for beta, expected in cases:
        self.assertEqual(converter.beta_code(beta), expected)
def test_greek_betacode_to_unicode(self):
    """Exercise ``Replacer.beta_code`` on a single Beta Code sample.

    Note: assertEqual appears to not be correctly comparing certain
    characters (``ά`` and ``ί``, at least).
    """
    expected = 'ὅπως οὖν μὴ ταὐτὸ '
    converted = Replacer().beta_code(r"""O(/PWS OU)=N MH\ TAU)TO\ """)
    self.assertEqual(converted, expected)
def get_tags():
    """Build a ``form/tag`` training file from the Perseus Greek treebank.

    Reads ``greek_treebank_perseus/agdt-1.7.xml``, converts each word's
    ``form`` attribute from Beta Code to Unicode with ``Replacer``, pairs it
    with the first ``_``-separated component of the word's ``relation``
    attribute, and writes the result to ``penn_pos_training_set_reduce.pos``.
    Tokens of one sentence are separated by single spaces; sentences are
    separated by a blank line.

    Raises:
        KeyError: if a ``word`` element has no ``form`` or ``relation``
            attribute.
    """
    r = Replacer()
    entire_treebank = 'greek_treebank_perseus/agdt-1.7.xml'
    # Characters dropped from every converted form (quotes and brackets).
    unwanted_chars = ("'", '᾽', '’', '[', ']')

    # Read raw bytes so the XML parser resolves the document encoding itself
    # instead of depending on the platform's default text encoding.
    with open(entire_treebank, 'rb') as f:
        xml_bytes = f.read()
    root = etree.fromstring(xml_bytes)

    sentences_list = []
    for sentence in root.findall('sentence'):
        sentence_list = []
        # http://ilk.uvt.nl/conll/
        for x in sentence.findall('word'):
            word = x.attrib
            # The Beta Code converter is fed upper-case input.
            form = r.beta_code(word['form'].upper())
            # Convert a trailing medial sigma to final sigma; ``endswith``
            # is simply False for an empty form.
            if form.endswith('σ'):
                form = form[:-1] + 'ς'
            form = form.lower()
            form = ''.join(
                char for char in form if char not in unwanted_chars
            )
            # Coarse-grained part-of-speech tag
            cpostag = word['relation'].split('_')[0]
            sentence_list.append('/'.join([form, cpostag]))
        sentences_list.append(' '.join(sentence_list))

    treebank_training_set = '\n\n'.join(sentences_list)
    # The output holds Greek text, so pin the encoding rather than relying on
    # the locale default.
    with open('penn_pos_training_set_reduce.pos', 'w', encoding='utf-8') as f:
        f.write(treebank_training_set)
def get_tags(path):
    """Return ``form/postag/lemma`` training text for a treebank XML file.

    Every ``word`` element under ``body/sentence`` contributes one token
    built from its ``form`` attribute (run through ``Replacer.beta_code``,
    lower-cased, passed through ``basify`` and ``clean``, and stripped of
    quote/bracket characters), its ``postag`` attribute, and its ``lemma``
    attribute (passed through ``basify`` and ``clean``). Words whose
    normalised form is empty are skipped.

    Args:
        path: location of the treebank XML file.

    Returns:
        A string with the tokens of each sentence joined by single spaces
        and sentences separated by a blank line.

    Raises:
        KeyError: if a ``word`` element has no ``form`` attribute.
        IndexError: if the document has no ``body`` element.
    """
    r = Replacer()
    # Characters dropped from every normalised form.
    unwanted_chars = (' ', "'", '?', '’', '[', ']')

    # Read raw bytes so the XML parser resolves the document encoding itself.
    with open(path, 'rb') as f:
        xml_bytes = f.read()
    root = etree.fromstring(xml_bytes)
    body = root.findall('body')[0]

    sentences_list = []
    for sentence in body.findall('sentence'):
        sentence_list = []
        for x in sentence.findall('word'):
            word = x.attrib
            form = r.beta_code(word['form'].upper())
            # NOTE(review): the Latin 's' and '?' literals look like a lossy
            # re-encoding of Greek sigma / final sigma; they are kept
            # byte-for-byte here -- confirm against the upstream file.
            if form.endswith('s'):
                form = form[:-1] + '?'
            form = clean(basify(form.lower()))
            form = ''.join(
                char for char in form if char not in unwanted_chars
            )
            if not form:
                continue

            # A missing attribute previously fell into a bare ``except`` that
            # assigned an unused name, leaving the joined values undefined or
            # carried over from the previous word. Use explicit per-attribute
            # fallbacks instead: the placeholder tag for ``postag`` and an
            # empty string for ``lemma``.
            postag = word.get('postag', 'x--------')
            raw_lemma = word.get('lemma')
            lemma = clean(basify(raw_lemma)) if raw_lemma is not None else ''

            sentence_list.append('/'.join([form, postag, lemma]))
        sentences_list.append(' '.join(sentence_list))

    return '\n\n'.join(sentences_list)
def beta2uni(text_beta):
    """Convert Beta Code text to Unicode through CLTK's ``Replacer``.

    Thin wrapper around ``cltk.corpus.greek.beta_to_unicode.Replacer``. When
    ``CLTK_NOT_FOUND`` is set, an installation hint is printed and ``None``
    is returned. Otherwise the text is translated with ``LATIN_UPPER_TRANS``,
    upper-cased, and handed to ``Replacer.beta_code``.
    """
    if not CLTK_NOT_FOUND:
        translated = text_beta.translate(LATIN_UPPER_TRANS)
        converter = Replacer()
        return converter.beta_code(translated.upper())

    print(
        'CLTK is not found in this environment. In order to use the beta2uni converter,',
        'install this package with `pip install cltk` or `pip install dh-utils[betacode]`'
    )
    return None
def test_greek_betacode_to_unicode(self):
    """Exercise ``Replacer.beta_code`` on several Beta Code samples.

    Note: assertEqual appears to not be correctly comparing certain
    characters (``ά`` and ``ί``, at least).
    """
    # Each entry pairs a Beta Code input with the Unicode text expected
    # back from the converter.
    cases = (
        # Generic test
        (r"""O(/PWS OU)=N MH\ TAU)TO\ """, 'ὅπως οὖν μὴ ταὐτὸ '),
        # Iota with diaeresis
        (r"""*XALDAI+KH\N""", 'Χαλδαϊκὴν'),
        # Upsilon with diaeresis
        (r"""PROU+POTETAGME/NWN""", 'προϋποτεταγμένων'),
    )
    converter = Replacer()
    for beta, expected in cases:
        self.assertEqual(converter.beta_code(beta), expected)
def get_tags(
    path=('/home/q078011/cltk_data/greek/text/perseus_treebank_dev/v2.1/'
          'Greek/texts/tlg0003.tlg001.perseus-grc1.1.tb.xml'),
    output_path='greek_training_set_2.pos',
):
    """Write a ``form/postag`` training file built from a treebank XML file.

    Every ``word`` element under ``body/sentence`` contributes one token
    built from its ``form`` attribute (run through ``Replacer.beta_code``,
    lower-cased, passed through ``basify``, and stripped of quote/bracket
    characters) and its ``postag`` attribute. Words whose normalised form is
    empty are skipped. Tokens of one sentence are joined by single spaces
    and sentences are separated by a blank line.

    Args:
        path: treebank XML file to read. Defaults to the location that was
            previously hard-coded.
        output_path: file the training text is written to. Defaults to the
            name that was previously hard-coded.

    Raises:
        KeyError: if a ``word`` element has no ``form`` attribute.
        IndexError: if the document has no ``body`` element.
    """
    r = Replacer()
    # Characters dropped from every normalised form.
    unwanted_chars = (' ', "'", '?', '’', '[', ']')

    # Read raw bytes so the XML parser resolves the document encoding itself.
    with open(path, 'rb') as f:
        xml_bytes = f.read()
    root = etree.fromstring(xml_bytes)
    body = root.findall('body')[0]

    sentences_list = []
    for sentence in body.findall('sentence'):
        sentence_list = []
        for x in sentence.findall('word'):
            word = x.attrib
            form = r.beta_code(word['form'].upper())
            # NOTE(review): the Latin 's' and '?' literals look like a lossy
            # re-encoding of Greek sigma / final sigma; they are kept
            # byte-for-byte here -- confirm against the upstream file.
            if form.endswith('s'):
                form = form[:-1] + '?'
            form = basify(form.lower())
            form = ''.join(
                char for char in form if char not in unwanted_chars
            )
            if not form:
                continue
            # Words without a tag get the placeholder tag; this replaces a
            # bare ``except`` that would also have hidden unrelated errors.
            postag = word.get('postag', 'x--------')
            sentence_list.append('/'.join([form, postag]))
        sentences_list.append(' '.join(sentence_list))

    treebank_training_set = '\n\n'.join(sentences_list)
    # The output holds non-ASCII text, so pin the encoding rather than
    # relying on the locale default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(treebank_training_set)
def compile_tlg_txt(self):
    """Reads original Beta Code files and converts to Unicode files.

    For every entry in ``tlg_index`` the file ``<name>.TXT`` under
    ``<orig_files_dir>/tlg`` is read as Latin-1, passed through
    ``remove_non_ascii`` and ``Replacer.beta_code``, and written to
    ``<compiled_files_dir>/tlg/<name>.txt``. Files that cannot be opened or
    written are logged and skipped. The author index is (re)built before the
    loop, and the meta index and author/works index afterwards.
    """
    logging.info('Starting TLG corpus compilation into files.')
    compiled_files_dir_tlg = os.path.join(self.compiled_files_dir, 'tlg')
    if not os.path.isdir(compiled_files_dir_tlg):
        os.mkdir(compiled_files_dir_tlg)

    self.make_tlg_index_file_author()
    self.read_tlg_index_file_author()

    # Loop-invariant: source directory and the converter instance.
    orig_files_dir_tlg = os.path.join(self.orig_files_dir, 'tlg')
    local_replacer = Replacer()

    for file_name in tlg_index:
        abbrev = tlg_index[file_name]
        files_path = os.path.join(orig_files_dir_tlg, file_name + '.TXT')
        try:
            with open(files_path, 'rb') as index_opened:
                txt_read = index_opened.read().decode('latin-1')
        except IOError:
            logging.error('Failed to open TLG file %s of author %s',
                          file_name, abbrev)
            continue

        txt_ascii = remove_non_ascii(txt_read)
        new_uni = local_replacer.beta_code(txt_ascii)
        file_path = os.path.join(compiled_files_dir_tlg, file_name + '.txt')
        try:
            # The converted text is Unicode Greek; pin the encoding instead
            # of relying on the locale default.
            with open(file_path, 'w', encoding='utf-8') as new_file:
                new_file.write(new_uni)
        except IOError:
            logging.error('Failed to write to new file %s of '
                          'author %s', file_name, abbrev)
        else:
            # Only report success when the write actually succeeded.
            logging.info('Finished TLG corpus compilation to %s',
                         file_path)

    self.make_tlg_meta_index()
    self.make_tlg_index_auth_works()
from cltk.corpus.greek.beta_to_unicode import Replacer
from lxml import etree
from greek_accentuation.characters import *
from greek_accentuation.characters import strip_accents
from transliterate import translit
from cltk.corpus.greek.beta_to_unicode import Replacer

# Module-level converter used by ``g_translit``.
r = Replacer()


def g_translit(string):
    """Transliterate ``string`` with ``translit(string, "el")``.

    When the input ends in a Latin ``s``, the last character of the
    transliteration is replaced by ``r.beta_code('s')``. An empty input is
    transliterated as-is instead of raising ``IndexError``.
    """
    tr = translit(string, "el")
    # ``endswith`` is False for an empty string, unlike ``string[-1]``.
    if string.endswith("s"):
        tr = tr[:-1] + r.beta_code('s')
    return tr


def basify(string):
    """Return ``string`` with ``strip_accents`` applied to each character."""
    basic = "".join([strip_accents(x) for x in string])
    return basic


def get_tags():
    # NOTE(review): this definition is cut off in the visible source; the
    # lines below are reproduced unchanged.
    r = Replacer()
    entire_treebank = '/home/q078011/cltk_data/greek/text/perseus_treebank_dev/v2.1/Greek/texts/tlg0003.tlg001.perseus-grc1.1.tb.xml'
    with open(entire_treebank, 'r') as f:
        xml_string = f.read()
    root = etree.fromstring(xml_string)
    body = root.findall('body')[0]
from cltk.corpus.greek.beta_to_unicode import Replacer
import re

r = Replacer()

# Module-level state. In the visible portion of the script only ``postag``,
# ``stemtype`` and ``prim`` are reassigned inside the loop.
fulls = ''
labels = ''
label = ''
workw = ''
lem = ''
raws = ''
postag = ''
stemtype = ''
keys = ''
prim = {}

# NOTE(review): the handle is not closed anywhere in the visible portion.
f = open("homer-lmorph2.txt", "r")
# NOTE(review): block nesting below was reconstructed from a source whose
# indentation was lost -- confirm against the original file.
for l in f:
    print("starts", l)
    # Regex patterns are raw strings so that ``\s`` and ``\+`` are not
    # treated as (invalid) string-literal escapes.
    l = re.sub(r'\s+$', '', l)
    l = re.sub(r'\+/', '/+', l)  # o)i/+omai rather than o)i+/omai
    l = re.sub(r'\+\\', r'\+', l)  # likewise for the grave: \+ rather than +\
    l = re.sub("u'", 'u’', l)
    fds = l.split("\t")
    fds[0] = re.sub(r"\s+$", "", fds[0])
    # First field has no space and the line mentions "indeclform": take the
    # tag from the second field and, when present, the fifth field.
    if (not re.search(" ", fds[0]) and re.search("indeclform", l)):
        postag = fds[1]
        if (len(fds) > 4):
            stemtype = fds[4]
    # First field contains a space: split it and drop the "_" after "w" in
    # its second part.
    if (re.search(" ", fds[0])):
        prim = fds[0].split(" ")
        prim[1] = re.sub("w_", "w", prim[1])
def beta2uni(text_beta):
    """Convert Beta Code text to Unicode through ``Replacer.beta_code``.

    ASCII lowercase letters are mapped to their uppercase counterparts with
    a translation table before conversion; every other character is passed
    to the converter unchanged.
    """
    ascii_upper_table = str.maketrans(
        string.ascii_lowercase, string.ascii_uppercase)
    uppercased = text_beta.translate(ascii_upper_table)
    return Replacer().beta_code(uppercased)
from xml.etree import ElementTree as ET
import pickle
import collections
import pandas as pd
import numpy as np
from difflib import *
from greek_accentuation.characters import strip_accents
from transliterate import translit
from cltk.corpus.greek.beta_to_unicode import Replacer
from model.clean import clean
from Levenshtein import distance

#### Input list of words ####
r = Replacer()
tree = ET.parse(
    '/home/q078011/cltk_data/greek/text/greek_lexica_perseus/greek_english_lexicon.xml'
)
root = tree.getroot()

# Every ``entryFree`` element of the lexicon, in document order.
li = list(root.iter('entryFree'))


def wrd(i):
    """Return the cleaned headword of lexicon entry ``i``.

    Takes the text of the entry's first ``orth`` element, converts it with
    ``r.beta_code`` and passes the result through ``basify`` and ``clean``.
    """
    orth_texts = [x.text for x in li[i].iter('orth')]
    return clean(basify(r.beta_code(orth_texts[0])))


# Collect the headwords; entries whose processing raises AttributeError are
# skipped.
w = []
for i, _entry in enumerate(li):
    try:
        headword = wrd(i)
    except AttributeError:
        continue
    w.append(headword)