def test_other(self):
    """Round-trip Hindi through each structured format (ssf / conll / tnt).

    Converts test data utf -> wx, back wx -> utf, then utf -> wx again and
    checks the two WX strings agree (round-trip stability on the last
    converted unit of each file).
    """
    for fmt in ('ssf', 'conll', 'tnt'):
        to_wx = WXC(order='utf2wx', lang='hin', format_=fmt,
                    ssf_type='intra', rmask=False)
        to_utf = WXC(order='wx2utf', lang='hin', format_=fmt,
                     ssf_type='intra', rmask=False)
        path = '%s/%s/hin.%s' % (self.test_dir, fmt, fmt)
        with io.open(path, encoding='utf-8') as fp:
            if fmt == "ssf":
                # SSF files carry whole <Sentence> blocks; convert each body.
                matches = re.finditer("(<Sentence id=.*?>)(.*?)</Sentence>",
                                      fp.read(), re.S)
                for match in matches:
                    wx = to_wx.convert(match.group(2).strip())
            else:
                for line in fp:
                    wx = to_wx.convert(line)
            # round-trip check on the last converted unit
            utf = to_utf.convert(wx)
            wx_again = to_wx.convert(utf)
            self.assertEqual(wx, wx_again)
def test_raw_text(self):
    """Check utf -> wx -> utf -> wx round-trip stability on plain-text files.

    Every line of each language's plain-text fixture must map to the same WX
    string after a full round trip through the converters.
    """
    for lang in self.languages:
        to_wx = WXC(order='utf2wx', lang=lang)
        to_utf = WXC(order='wx2utf', lang=lang)
        path = '%s/plain_text/%s.txt' % (self.test_dir, lang)
        with io.open(path, encoding='utf-8') as fp:
            for line in fp:
                wx_first = to_wx.convert(line)
                round_utf = to_utf.convert(wx_first)
                wx_second = to_wx.convert(round_utf)
                self.assertEqual(wx_first, wx_second)
def wx_utf_converter(relation_df):
    """Add a 'UTF_hindi' column to *relation_df* holding the WX -> UTF
    (Devanagari) conversion of each entry in its WORD column.

    Mutates and returns the same DataFrame.
    """
    converter = WXC(order='wx2utf', lang='hin')
    relation_df['UTF_hindi'] = [
        converter.convert(relation_df.WORD[idx]) for idx in relation_df.index
    ]
    return relation_df
def convert_encoding(filename, text_type, language, in_enc, out_enc):
    """Convert the character encoding of a text file and return the result.

    Parameters
    ----------
    filename : path of the UTF-8 input file.
    text_type : one of "ssf", "tnt", "text", "conll" -- how the file is parsed.
    language : language code passed to the WXC converter (e.g. 'hin').
    in_enc / out_enc : source/target encodings, combined into the WXC
        ``order`` string (e.g. 'utf' + 'wx' -> 'utf2wx').

    Returns the converted text; raises ``Exception`` on an unknown text_type.
    """
    order = '%s2%s' % (in_enc, out_enc)
    with codecs.open(filename, 'r', encoding='utf-8') as fp:
        logger.info('Loading text_type: %s data' % (text_type))
        if text_type == "ssf":
            # SSF is converted line by line: markup lines pass through,
            # token lines get their word field (column 2) converted.
            converter = WXC(order=order, lang=language)
            output_data = ""
            for line in fp:
                line = line.strip()
                ds = line.split()
                if line == "":
                    output_data += u"\n"
                elif line[0] == "<":
                    # sentence/document markup, copy verbatim
                    output_data += u"%s\n" % (line)
                elif ds[0] == "))":
                    output_data += u"\t%s\n" % (line)
                # BUG FIX: guard len(ds) before touching ds[1]/ds[2] --
                # the original raised IndexError on short token lines.
                elif ds[0] == "0" or (len(ds) > 1 and ds[1] == "(("):
                    output_data += u"%s\n" % (u"\t".join(ds))
                elif len(ds) > 2:
                    word, tag = ds[1], ds[2]
                    word_con = converter.convert(word)
                    output_data += u"%s\t%s\t%s\n" % (ds[0], word_con, tag)
                # anything else is silently skipped (original behaviour)
            return output_data
        # non-SSF formats: convert the whole file in one call
        if text_type == "tnt":
            converter = WXC(order=order, lang=language, format_='tnt')
        elif text_type == "text":
            converter = WXC(order=order, lang=language)
        elif text_type == "conll":
            converter = WXC(order=order, lang=language, format_='conll')
        else:
            raise Exception("Unknown Format %s" % text_type)
        return converter.convert(fp.read())
def wx_utf_converter(relation_df):
    """Attach a 'UTF_hindi' column to *relation_df* with the WX -> UTF
    (Devanagari) conversion of its WORD column, echoing the converted list.

    Mutates and returns the same DataFrame.
    """
    converter = WXC(order='wx2utf', lang='hin')
    utf_words = []
    for idx in relation_df.index:
        utf_words.append(converter.convert(relation_df.WORD[idx]))
    print(utf_words)
    relation_df['UTF_hindi'] = utf_words
    return relation_df
class ThreeStepDecoding(object):
    """Three-step decoder for code-mixed (Hindi/English) text.

    Pipeline: language-ID tag the tokens, build per-token candidate lists
    (transliterations + bilingual dictionary lookups), then pick the best
    monolingual sequence with a KenLM-scored Viterbi decoder.
    """

    def __init__(self, lid, htrans=None, etrans=None, wx=False):
        # enchant dictionary used to accept valid English surface forms as-is
        self.ed = enchant.Dict('en')
        # KenLM binary language models (paths are relative to the CWD --
        # NOTE(review): confirm these fixture paths exist at deploy time)
        self.hblm = kenlm.LanguageModel('lm/hindi-n3-p5-lmplz.blm')
        self.eblm = kenlm.LanguageModel('lm/english-n3-p10-lmplz.blm')
        # second-order Viterbi decoders, one per target language
        self.so_dec_eng = so_viterbi(self.eblm)
        self.so_dec_hin = so_viterbi(self.hblm)
        # bilingual dictionaries: "word cand1|cand2|..." per line
        self.e2h = {kv.split()[0]:kv.split()[1].split('|') for kv in io.open('dicts/ENG2HIN12M.dict')}
        self.h2e = {kv.split()[0]:kv.split()[1].split('|') for kv in io.open('dicts/HIN2ENG12M.dict')}
        self.meta = Meta()
        self.lid = LID(model=lid, etrans=etrans, htrans=htrans)
        # wx=True keeps Hindi output in WX notation; otherwise convert to UTF
        self.wx = wx
        if not self.wx:
            self.wxc = WXC(order='wx2utf', lang='hin')

    def max_likelihood(self, n_sentence, target, k_best=7):
        """Pick the most likely candidate per position from a trellis.

        n_sentence: list of per-token candidate lists; target: 'en' or
        anything else for Hindi. Returns the chosen candidate per token.
        """
        if target == 'en':
            auto_tags = self.so_dec_eng.decode(n_sentence, len(n_sentence), k_best)
        else:
            auto_tags = self.so_dec_hin.decode(n_sentence, len(n_sentence), k_best)
        # beamsearch: auto_tags[i] indexes the winning candidate of token i
        best_sen = [n_sentence[idx][at] for idx, at in enumerate(auto_tags)]
        return best_sen

    def decode(self, words, ltags):
        """Normalize each word given its LID tag.

        words: token list; ltags: parallel language tags ('hi', 'en', 'ne',
        'acro', 'univ', ...). Returns the normalized token list.
        """
        # squash 3+ repeated letters (e.g. "sooo" -> "so") before lookup
        words = [re.sub(r'([a-z])\1\1+', r'\1', w) for w in words]
        # Hindi trellis: up to 5 transliteration candidates, one dictionary
        # translation (English tokens only), and the word itself as fallback
        # (wrapped in underscores -- presumably an OOV marker; verify).
        hi_trellis = [self.lid.htrans.get(wi.lower(), [wi]*5)[:5] + self.e2h.get(wi.lower() if li == 'en' else None, [u'_%s_' %wi])[:1] + [u'_%s_'%wi] for wi,li in zip(words, ltags)]
        # import pdb; pdb.set_trace()
        hi_mono = self.max_likelihood(hi_trellis, 'hi')
        # English trellis mirrors the Hindi one, seeded with the word itself
        en_trellis = [[wi] + self.lid.etrans.get(wi.lower(), [wi]*5)[:5] + self.h2e.get(wh if li == 'hi' else None, [wi])[:1] for wi,wh,li in zip (words, hi_mono, ltags)]
        en_mono = self.max_likelihood(en_trellis, 'en')
        out = hi_mono[:]
        for i, _ in enumerate(hi_mono):
            if ltags[i] in ['univ', 'acro', 'ne']:
                # universals/acronyms/named entities pass through untouched
                out[i] = words[i]
            elif ltags[i] in ['en', 'ne']:
                # keep the original English token when it already matches the
                # decoder choice, is a dictionary word, or is 'a'/'i'
                if words[i].lower() == en_mono[i]:
                    out[i] = words[i]
                elif self.ed.check(words[i].lower()) and len(words[i])>1:
                    out[i] = words[i]
                elif words[i].lower() in ['a', 'i']:
                    out[i] = words[i]
                else:
                    out[i] = en_mono[i]
            elif not self.wx:
                # Hindi token: render in UTF (Devanagari) unless WX requested
                out[i] = self.wxc.convert(out[i])
        return out

    def tag_sent(self, sent, trans=True):
        """Tokenize *sent*, LID-tag it, decode, and zip the three streams.

        Returns an iterator of (token, decoded_form, language_tag) triples.
        """
        sent = sent.split()
        sent, ltags = zip(*self.lid.tag_sent(sent))
        dec = self.decode(sent, ltags)
        return zip(sent, dec, ltags)
# Quick round-trip demo for the wxconv converters: UTF (Devanagari) -> WX
# and WX -> UTF for Hindi.
from wxconv import WXC

utf_to_wx = WXC(order='utf2wx')
wx_to_utf = WXC(order='wx2utf', lang='hin')

print(utf_to_wx.convert('लेस।'))
print(wx_to_utf.convert('esehI'))
def wx_utf_converter_sentence(sentence):
    """Convert one WX-encoded Hindi sentence to UTF (Devanagari) text."""
    converter = WXC(order='wx2utf', lang='hin')
    return converter.convert(sentence)
if line.startswith('<Sentence'): sentence = [] elif line.startswith('</Sentence'): sentences.append(sentence) print(sentences) elif line.startswith('<ENAMEX'): ner_tag = tag_extract(line) print(ner_tag) else: print("bump") line = line.split() if len(line) == 0: continue try: index = float(line[0]) if index != int(index): sentence.append( wxc.convert(line[1]) + ' ' + line[2] + ' ' + '.' + ' ' + ner_tag) except ValueError: pass ner_tag = 'O' random.shuffle(sentences) train = 0.8 * len(sentences) // 1 test_a = 0.15 * len(sentences) // 1 test_b = len(sentences) - train - test_a train, test_a, test_b = sentences[0:train], sentences[ train:train + test_a], sentences[train + test_a:] write_conll(train, 'hin.train') write_conll(test_a, 'hin.test_a') write_conll(test_b, 'hin.test_b')
# Menu option 19: extract one sentence from the UD Hindi HDTB treebank,
# transliterate its word forms from UTF (Devanagari) to WX, blank out
# several CoNLL-U columns, and hand the result to Createdata.py.
# NOTE(review): fragment of a larger if/elif chain -- the opening `if` and
# the body of the trailing `elif` are outside this view.
elif choice == '19':
    sentenceid = input("Enter the sentence id: ")
    # text before the first '-' selects the treebank split file
    # (presumably 'train'/'dev'/'test') -- TODO confirm the id format
    spl = re.split(r'-', sentenceid)[0]
    f = open("./treebanks/UD_Hindi-HDTB/hi_hdtb-ud-" + spl + ".conllu", "r").readlines()
    f1 = open("./conll/conll_" + sentenceid + ".dat", "w")
    n = len(f)
    for i in range(0, n):
        if sentenceid in f[i]:
            # skip two header lines after the matching sent_id line
            # -- TODO confirm this offset against the .conllu layout
            j = i + 2
            # token rows run until the blank line ending the sentence
            while (f[j] != '\n'):
                g = re.split(r'\t', f[j].rstrip())
                con = WXC(order='utf2wx')
                g[1] = con.convert(g[1])  # FORM column -> WX
                # blank columns 2, 4, 5, 8, 9 (lemma/xpos/feats/deps/misc)
                g[2] = "_"
                g[4] = "_"
                g[5] = "_"
                g[8] = "_"
                g[9] = "_"
                for x in g:
                    f1.write(x + '\t')
                f1.write('\n')
                j = j + 1
            break
    f1.close()
    cmd = 'python3 Createdata.py ' + sentenceid
    os.system(cmd)
# next menu branch; its body lies beyond this fragment
elif choice == '20':
def devnagari(self):
    """Return self.text transliterated from WX notation to UTF (Devanagari)."""
    converter = WXC(order='wx2utf')
    return converter.convert(self.text)
"""Convert a UTF (Devanagari) file to WX -- either as raw text or, for SSF
input, split into CoNLL-style train/test files (continued below)."""
from wxconv import WXC
import argparse
import random

parser = argparse.ArgumentParser()
parser.add_argument('--format', type=str,
                    help='format of the file text/ssf', required=True)
parser.add_argument('--input', type=str,
                    help='input file to be converted', required=True)
parser.add_argument('--dist', type=int, nargs='+', default=[1, 0, 0],
                    help='train:test_a:test_b')
args = parser.parse_args()

wxc = WXC(order='utf2wx')

if args.format == 'text':
    # FIX: use context managers so both file handles are closed
    # deterministically (the originals were never closed).
    with open(args.input) as src, open('hin.text', 'w') as dst:
        dst.write(wxc.convert(src.read()))
elif args.format == 'ssf':
    assert len(args.dist) == 3


def tag_extract(string):
    """Return the first NER tag from tag_list found as a substring of
    *string*, or None when nothing matches.

    NOTE(review): the substring test is loose -- the final 'O' tag matches
    any string containing the letter 'O'; confirm this is intended.
    """
    tag_list = ['PERSON', 'ORGANIZATION', 'LOCATION', 'ENTERTAINMENT',
                'FACILITIES', 'ARTIFACT', 'LIVTHINGS', 'LOCOMOTIVE',
                'PLANTS', 'MATERIALS', 'DISEASE', 'O']
    for tag in tag_list:
        if tag in string:
            return tag


def write_conll(sentences, output):
    """Write *sentences* (lists of token lines) to *output* in CoNLL style:
    one token per line, blank line between sentences."""
    # FIX: context manager instead of manual close()
    with open(output, 'w') as f:
        for sentence in sentences:
            for word in sentence:
                f.write(word + '\n')
            f.write('\n')
# Parse an NER-annotated SSF file, convert word forms to WX, and split the
# sentences into train/test/dev CoNLL files.
# NOTE(review): `sentences`, `tag_extract`, `wxc` and `write_conll` come from
# earlier in this script; `sentences` must be initialised to [] before this.
sentence = []
ner_tag = 'O'
# FIX: open the input through a context manager (the original handle leaked).
with open('hindi-annotated-ssf-Form.txt') as ssf_file:
    for line in ssf_file:
        if line.startswith('<Sentence'):
            sentence = []
        elif line.startswith('</Sentence'):
            sentences.append(sentence)
        elif line.startswith('<ENAMEX'):
            ner_tag = tag_extract(line)
            print("tag" + ner_tag)
        else:
            line = line.split()
            if len(line) == 0:
                continue
            try:
                # only sub-token rows (fractional index like 1.1) carry words
                index = float(line[0])
                if index != int(index):
                    sentence.append(wxc.convert(line[1]) + ' ' + line[2]
                                    + ' ' + '.' + ' ' + ner_tag)
            except ValueError:
                pass
            ner_tag = 'O'

random.shuffle(sentences)
# FIX: int(...) already floors, so the trailing `// 1` was a no-op; the
# dead `dev = len(sentences)` assignment (immediately overwritten) is gone.
train = int(0.6 * len(sentences))
test = int(0.2 * len(sentences))
train, test, dev = (sentences[0:train],
                    sentences[train:train + test],
                    sentences[train + test:])
write_conll(train, 'train')
write_conll(test, 'test')
write_conll(dev, 'dev')
# NOTE(review): fragment of a notebook cell -- `data`, `corpus_info`, `con`
# (presumably a utf2wx WXC converter; verify), `sample_num`, `n1`, `n2`, `n3`
# and `catergorized_info` are defined in earlier cells.
p2 = data[sample_num]['copy_precision']['rouge-2']
p3 = data[sample_num]['copy_precision']['rouge-3']
#print(sample_num, n2, n3)
# report novelty scores for this sample
print(
    "Sample_num= %s, novelty_rouge1 = %.2f, novelty_rouge2 = %.2f, novelty_rouge3 = %.2f " % (sample_num, n1, n2, n3))
#corpus_info[ trigrams[0]['article']]
print("article: ")
print(corpus_info[sample_num]['article']['content'])
print("\n")
print("summary: ")
print(corpus_info[sample_num]['summary']['content'])
# same article/summary pair rendered in WX notation
print("\n---------------- WX formate ------------------\n")
print("Article:")
print(con.convert(corpus_info[sample_num]['article']['content']))
print("\n")
print("Summary:")
print(con.convert(corpus_info[sample_num]['summary']['content']))
print("\n=============================================================\n")

# In[7]:
import numpy as np
from scipy import stats

####### Novelty checking:
params = ['novelty', 'copy_precision', 'copy_recall']
# loop body lies beyond this fragment
for params_key in catergorized_info.keys():
#usr/bin/env/python # -*- coding: utf-8 -*- import wxconv from wxconv import WXC import sys fil = sys.argv[1] fem_file = open(fil, 'r').read().splitlines() data_file = fem_file con = WXC(order='utf2wx') data_transcripts = [row.decode('UTF-8') for row in data_file] data_WX = [con.convert(row).encode('UTF-8') for row in data_transcripts] #data_WX_words = [a + '\t' + b for a, b in zip(data_serial, data_WX)] print data_WX with open('./output_WX.txt', 'w') as outfile: for wd in data_WX: outfile.writelines(wd + '\n') outfile.close() ### You would need to manually remove word-final schwa, nukta in the output generated. ######
# output is converted in SSF with the help of second # type of converter and stored in SSF.txt ssf_converter2.func() print(main_format_data) i = 0 # print(len(main_format_data)) for j in range(len(main_format_data)): if main_format_data[j][1] == 'open_bracket_here': # main_format_data[j+1][5] = con1.convert(lexical.convertBhoj(con.convert(main_format_data[j+1][5]), match_list[j//2])) continue main_format_data[j][5] = con1.convert( lexical.convertBhoj(con.convert(main_format_data[j][5]), match_list[(j // 2)])) print(main_format_data[j][5]) i += 1 # print(main_format_data) ssf_converter.out_temp_file.write( '\t\t***Output after Lexical Generator***\n\n') main_format_writer(main_format_data) ssf_converter2.func() print(main_format_data) # for j in range(len(main_format_data)):