def test_greek_betacode_to_unicode(self):
    """Check that ``Replacer.beta_code`` turns Beta Code into Unicode Greek.

    The cases are converted and compared one after another, so the first
    mismatch fails the test.

    Note (kept from the original author): assertEqual appears to not be
    correctly comparing certain characters (``ά`` and ``ί``, at least).
    """
    replacer = Replacer()
    cases = (
        # Generic test
        (r"""O(/PWS OU)=N MH\ TAU)TO\ """, 'ὅπως οὖν μὴ ταὐτὸ '),
        # Test for iota and diaeresis
        (r"""*XALDAI+KH\N""", 'Χαλδαϊκὴν'),
        # Test for upsilon and diaeresis
        (r"""PROU+POTETAGME/NWN""", 'προϋποτεταγμένων'),
        # Test for lowercase
        (r"""proi+sxome/nwn""", 'προϊσχομένων'),
    )
    for beta, expected in cases:
        self.assertEqual(replacer.beta_code(beta), expected)
def test_greek_betacode_to_unicode(self):
    """Check that a Beta Code phrase is converted to the expected Unicode.

    Note (kept from the original author): assertEqual appears to not be
    correctly comparing certain characters (``ά`` and ``ί``, at least).
    """
    source = r"""O(/PWS OU)=N MH\ TAU)TO\ """
    expected = 'ὅπως οὖν μὴ ταὐτὸ '
    converted = Replacer().beta_code(source)
    self.assertEqual(converted, expected)
def beta2uni(text_beta):
    """Convert Beta Code text to Unicode via CLTK's ``Replacer.beta_code``.

    When ``CLTK_NOT_FOUND`` is truthy an installation hint is printed and
    ``None`` is returned. Otherwise the text is passed through
    ``LATIN_UPPER_TRANS`` (presumably a lower-to-upper Latin translation
    table; confirm at its definition), upper-cased, and handed to the
    converter.
    """
    if not CLTK_NOT_FOUND:
        prepared = text_beta.translate(LATIN_UPPER_TRANS)
        return Replacer().beta_code(prepared.upper())

    print(
        'CLTK is not found in this environment. In order to use the beta2uni converter,',
        'install this package with `pip install cltk` or `pip install dh-utils[betacode]`'
    )
    return None
def get_tags(treebank_path='greek_treebank_perseus/agdt-1.7.xml',
             output_path='penn_pos_training_set_reduce.pos'):
    """Write a ``form/TAG`` training file built from a treebank XML file.

    For every ``<word>`` of every ``<sentence>`` directly under the XML
    root, the ``form`` attribute is upper-cased, converted with
    ``Replacer.beta_code``, given a final sigma, lower-cased and stripped of
    quote/bracket characters. The tag is the part of the ``relation``
    attribute before the first underscore. Words of a sentence are joined
    by spaces and sentences by a blank line.

    Args:
        treebank_path: Treebank XML file to read. Defaults to the path that
            used to be hard-coded.
        output_path: File to write. Defaults to the name that used to be
            hard-coded.

    Raises:
        KeyError: If a word lacks a ``form`` or ``relation`` attribute.
    """
    replacer = Replacer()

    # Parse raw bytes so the XML parser decodes the document itself (using
    # its encoding declaration) instead of relying on the locale's default
    # text encoding.
    with open(treebank_path, 'rb') as f:
        root = etree.fromstring(f.read())

    unwanted = {"'", '᾽', '’', '[', ']'}
    sentences_list = []
    for sentence in root.findall('sentence'):
        sentence_list = []
        for x in sentence.findall('word'):
            word = x.attrib
            # The Beta Code converter is fed upper-case input.
            form = replacer.beta_code(word['form'].upper())
            # Convert a trailing medial sigma to a final sigma.
            if form.endswith('σ'):
                form = form[:-1] + 'ς'
            form = form.lower()
            form = ''.join(char for char in form if char not in unwanted)
            # Coarse-grained tag: the relation label up to the first '_'.
            cpostag = word['relation'].split('_')[0]
            sentence_list.append('/'.join([form, cpostag]))
        sentences_list.append(' '.join(sentence_list))

    # The output contains Greek characters; write UTF-8 explicitly so the
    # result does not depend on (or fail under) the locale encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(sentences_list))
def get_tags(path):
    """Return ``form/postag/lemma`` training text built from a treebank file.

    For every ``<word>`` of every ``<sentence>`` inside the first ``<body>``
    element, the ``form`` attribute is upper-cased, converted with
    ``Replacer.beta_code``, lower-cased, passed through ``basify`` and
    ``clean``, and stripped of a few punctuation characters. Words whose
    form ends up empty are skipped. Words of a sentence are joined by
    spaces and sentences by a blank line.

    Args:
        path: Treebank XML file to read.

    Returns:
        The training text as a single string.

    Raises:
        KeyError: If a word lacks a ``form`` attribute.
        IndexError: If the document has no ``<body>`` element.
    """
    replacer = Replacer()

    # Parse raw bytes so the XML parser decodes the document itself instead
    # of relying on the locale's default text encoding.
    with open(path, 'rb') as f:
        root = etree.fromstring(f.read())
    body = root.findall('body')[0]

    unwanted = {' ', "'", '?', '’', '[', ']'}
    sentences_list = []
    for sentence in body.findall('sentence'):
        sentence_list = []
        for x in sentence.findall('word'):
            word = x.attrib
            form = replacer.beta_code(word['form'].upper())
            # NOTE(review): the 's' and '?' literals here and in `unwanted`
            # look like mis-encoded Greek characters (a final-sigma fix-up
            # and an apostrophe-like mark). Left as found; confirm against
            # the original source before changing.
            if form.endswith('s'):
                form = form[:-1] + '?'
            form = clean(basify(form.lower()))
            form = ''.join(char for char in form if char not in unwanted)
            if not form:
                continue

            # Previously a missing attribute fell into a bare `except` that
            # set an unused variable, so the tag/lemma of the *previous*
            # word was reused (or NameError was raised on the first word).
            # Each field now gets its own explicit fallback.
            postag1 = word.get('postag', 'x--------')
            lemma = word.get('lemma')
            # A missing lemma becomes an empty field so the entry keeps its
            # three slash-separated parts.
            postag2 = clean(basify(lemma)) if lemma is not None else ''

            sentence_list.append('/'.join([form, postag1, postag2]))
        sentences_list.append(' '.join(sentence_list))

    return '\n\n'.join(sentences_list)
def get_tags(
    treebank_path='/home/q078011/cltk_data/greek/text/perseus_treebank_dev/v2.1/Greek/texts/tlg0003.tlg001.perseus-grc1.1.tb.xml',
    output_path='greek_training_set_2.pos',
):
    """Write a ``form/postag`` training file built from a treebank XML file.

    For every ``<word>`` of every ``<sentence>`` inside the first ``<body>``
    element, the ``form`` attribute is upper-cased, converted with
    ``Replacer.beta_code``, lower-cased, passed through ``basify`` and
    stripped of a few punctuation characters. Words whose form ends up
    empty are skipped; words without a ``postag`` attribute get the
    placeholder ``x--------``. Words of a sentence are joined by spaces and
    sentences by a blank line.

    Args:
        treebank_path: Treebank XML file to read. Defaults to the path that
            used to be hard-coded.
        output_path: File to write. Defaults to the name that used to be
            hard-coded.

    Raises:
        KeyError: If a word lacks a ``form`` attribute.
        IndexError: If the document has no ``<body>`` element.
    """
    replacer = Replacer()

    # Parse raw bytes so the XML parser decodes the document itself instead
    # of relying on the locale's default text encoding.
    with open(treebank_path, 'rb') as f:
        root = etree.fromstring(f.read())
    body = root.findall('body')[0]

    unwanted = {' ', "'", '?', '’', '[', ']'}
    sentences_list = []
    for sentence in body.findall('sentence'):
        sentence_list = []
        for x in sentence.findall('word'):
            word = x.attrib
            form = replacer.beta_code(word['form'].upper())
            # NOTE(review): the 's' and '?' literals here and in `unwanted`
            # look like mis-encoded Greek characters (a final-sigma fix-up
            # and an apostrophe-like mark). Left as found; confirm against
            # the original source before changing.
            if form.endswith('s'):
                form = form[:-1] + '?'
            form = basify(form.lower())
            form = ''.join(char for char in form if char not in unwanted)
            if not form:
                continue
            # Explicit default instead of a bare `except`, which would also
            # have hidden unrelated errors.
            postag = word.get('postag', 'x--------')
            sentence_list.append('/'.join([form, postag]))
        sentences_list.append(' '.join(sentence_list))

    # The output may contain Greek characters; write UTF-8 explicitly so
    # the result does not depend on the locale encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(sentences_list))
from cltk.corpus.greek.beta_to_unicode import Replacer
from lxml import etree
from greek_accentuation.characters import *
from greek_accentuation.characters import strip_accents
from transliterate import translit

# Shared Beta Code converter used by g_translit.
r = Replacer()


def g_translit(string):
    """Transliterate ``string`` with ``translit(string, "el")``.

    When the input ends with a Latin ``s``, the last character of the
    transliteration is replaced by ``r.beta_code('s')`` (presumably to get
    a word-final sigma; confirm against the converter's output).

    An empty input no longer raises IndexError: the suffix check uses
    ``str.endswith`` instead of indexing the last character.
    """
    tr = translit(string, "el")
    if string.endswith("s"):
        tr = tr[:-1] + r.beta_code('s')
    return tr


def basify(string):
    """Return ``string`` with ``strip_accents`` applied to each character."""
    return "".join(strip_accents(x) for x in string)


def get_tags():
    # NOTE(review): only the opening of this function is visible here; as
    # written it parses the treebank, looks up the first <body> element and
    # returns None. Confirm whether the remainder was lost.
    r = Replacer()
    entire_treebank = '/home/q078011/cltk_data/greek/text/perseus_treebank_dev/v2.1/Greek/texts/tlg0003.tlg001.perseus-grc1.1.tb.xml'
    with open(entire_treebank, 'r') as f:
        xml_string = f.read()
    root = etree.fromstring(xml_string)
    body = root.findall('body')[0]
def beta2uni(text_beta):
    """Convert Beta Code text to Unicode with ``Replacer.beta_code``.

    ASCII lowercase letters are mapped to uppercase before conversion; all
    other characters are passed to the converter unchanged.
    """
    ascii_to_upper = str.maketrans(
        string.ascii_lowercase, string.ascii_uppercase)
    uppercased = text_beta.translate(ascii_to_upper)
    return Replacer().beta_code(uppercased)