def wx_utf_converter(relation_df):
    # Convert each WX-notation token in the WORD column to UTF-8
    # Devanagari and store the results in a new UTF_hindi column.
    a = []
    con = WXC(order='wx2utf', lang='hin')
    for i in relation_df.index:
        a.append(con.convert(relation_df.WORD[i]))
    relation_df['UTF_hindi'] = a
    return relation_df
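# A minimal usage sketch for the helper above (not part of the original
# project): it assumes pandas is available and that WORD holds WX-encoded
# strings such as the ones that appear elsewhere in these examples.
import pandas as pd
from wxconv import WXC

df = pd.DataFrame({'WORD': ['kyoM', 'hE', 'meM']})
df = wx_utf_converter(df)
print(df['UTF_hindi'])  # the Devanagari renderings of the WX tokens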
def test_other(self):
    for ext in ['ssf', 'conll', 'tnt']:
        wx_con = WXC(order='utf2wx', lang='hin', format_=ext,
                     ssf_type='intra', rmask=False)
        utf_con = WXC(order='wx2utf', lang='hin', format_=ext,
                      ssf_type='intra', rmask=False)
        with io.open('%s/%s/hin.%s' % (self.test_dir, ext, ext),
                     encoding='utf-8') as fp:
            if ext == "ssf":
                sentences = re.finditer(
                    "(<Sentence id=.*?>)(.*?)</Sentence>", fp.read(), re.S)
                for sid_sentence in sentences:
                    sentence = sid_sentence.group(2).strip()
                    wx = wx_con.convert(sentence)
            else:
                for line in fp:
                    wx = wx_con.convert(line)
            # Round-trip check: wx -> utf -> wx must be stable.
            utf = utf_con.convert(wx)
            wx_ = wx_con.convert(utf)
            self.assertEqual(wx, wx_)
def __init__(self, model, lang, gpu=False, wx=False):
    self.lang = lang
    self.is_ip_wx = wx
    parser = argparse.ArgumentParser(
        description='transliterate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.translate_opts(parser)
    self.opt = parser.parse_args()
    self.trans_dict = dict()
    self.broken_words = dict()
    file_path = os.path.dirname(os.path.abspath(__file__))
    if self.lang == 'hin':
        # WX -> UTF-8 converter for rendering Hindi output in Devanagari.
        self.to_utf = WXC(order='wx2utf', lang='hin')
        self.non_alpha = re.compile(u'([^a-zA-Z]+)')
        self.alpha_letters = set(string.ascii_letters)
        # Common Romanized abbreviations mapped to WX-notation candidates.
        self.com_abbr = {'b': ['BI', 'be'],
                         'd': ['xI', 'xe'],
                         'g': ['jI'],
                         'k': ['ke', 'ki', 'kI'],
                         'h': ['hE', 'hEM'],
                         'ha': ['hE', 'hEM'],
                         'n': ['ina', 'ne'],
                         'm': ['meM', 'mEM'],
                         'p': ['pe'],
                         'q': ['kyoM'],
                         'r': ['Ora', 'ora'],
                         's': ['isa', 'se'],
                         'y': ['ye']}
    if self.lang == 'eng':
        self.non_alpha = re.compile(u'([^a-z]+)')
        self.alpha_letters = set(string.ascii_letters[:26])
        with open('%s/extras/COMMON_ABBR.eng' % file_path) as fp:
            self.com_abbr = {}
            for line in fp:
                k, v = line.split()
                self.com_abbr[k] = v.split('|')
    dummy_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]
    if gpu:
        self.opt.gpu = 0
    self.opt.cuda = self.opt.gpu > -1
    self.opt.model = model
    self.opt.n_best = 5
    self.opt.lang = lang
    if self.opt.cuda:
        torch.cuda.set_device(self.opt.gpu)
    # Load the model.
    self.fields, self.model, self.model_opt = \
        onmt.ModelConstructor.load_test_model(self.opt, dummy_opt.__dict__)
def convert_encoding(filename, text_type, language, in_enc, out_enc):
    with codecs.open(filename, 'r', encoding='utf-8') as fp:
        logger.info('Loading text_type: %s data' % text_type)
        output_data = ""
        if text_type == "ssf":
            converter = WXC(order='%s2%s' % (in_enc, out_enc), lang=language)
            for line in fp:
                line = line.strip()
                ds = line.split()
                if line == "":
                    output_data += u"\n"
                elif line[0] == "<":
                    output_data += u"%s\n" % line
                elif ds[0] == "))":
                    output_data += u"\t%s\n" % line
                elif ds[0] == "0" or ds[1] == "((":
                    output_data += u"%s\n" % u"\t".join(ds)
                elif ds[1] != "":
                    # Convert only the word column; keep index and tag as-is.
                    word, tag = ds[1], ds[2]
                    word_con = converter.convert(word)
                    output_data += u"%s\t%s\t%s\n" % (ds[0], word_con, tag)
                else:
                    pass
            return output_data
        else:
            if text_type == "tnt":
                converter = WXC(order='%s2%s' % (in_enc, out_enc),
                                lang=language, format_='tnt')
            elif text_type == "text":
                converter = WXC(order='%s2%s' % (in_enc, out_enc),
                                lang=language)
            elif text_type == "conll":
                converter = WXC(order='%s2%s' % (in_enc, out_enc),
                                lang=language, format_='conll')
            else:
                raise Exception("Unknown Format %s" % text_type)
            text = fp.read()
            output_data = converter.convert(text)
            return output_data
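# A hypothetical call sketch for convert_encoding above (the file name is
# an assumption): converting a UTF-8 plain-text Hindi file to WX notation.
wx_text = convert_encoding('hin_corpus.txt', 'text', 'hin', 'utf', 'wx')
print(wx_text[:200])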
class ThreeStepDecoding(object):

    def __init__(self, lid, htrans=None, etrans=None, wx=False):
        self.ed = enchant.Dict('en')
        self.hblm = kenlm.LanguageModel('lm/hindi-n3-p5-lmplz.blm')
        self.eblm = kenlm.LanguageModel('lm/english-n3-p10-lmplz.blm')
        self.so_dec_eng = so_viterbi(self.eblm)
        self.so_dec_hin = so_viterbi(self.hblm)
        self.e2h = {kv.split()[0]: kv.split()[1].split('|')
                    for kv in io.open('dicts/ENG2HIN12M.dict')}
        self.h2e = {kv.split()[0]: kv.split()[1].split('|')
                    for kv in io.open('dicts/HIN2ENG12M.dict')}
        self.meta = Meta()
        self.lid = LID(model=lid, etrans=etrans, htrans=htrans)
        self.wx = wx
        if not self.wx:
            self.wxc = WXC(order='wx2utf', lang='hin')

    def max_likelihood(self, n_sentence, target, k_best=7):
        # Second-order Viterbi decoding over the candidate trellis.
        if target == 'en':
            auto_tags = self.so_dec_eng.decode(n_sentence, len(n_sentence), k_best)
        else:
            auto_tags = self.so_dec_hin.decode(n_sentence, len(n_sentence), k_best)
        best_sen = [n_sentence[idx][at] for idx, at in enumerate(auto_tags)]
        return best_sen

    def decode(self, words, ltags):
        # Squeeze runs of three or more repeated letters ("sooo" -> "so").
        words = [re.sub(r'([a-z])\1\1+', r'\1', w) for w in words]
        hi_trellis = [self.lid.htrans.get(wi.lower(), [wi] * 5)[:5] +
                      self.e2h.get(wi.lower() if li == 'en' else None,
                                   [u'_%s_' % wi])[:1] +
                      [u'_%s_' % wi]
                      for wi, li in zip(words, ltags)]
        hi_mono = self.max_likelihood(hi_trellis, 'hi')
        en_trellis = [[wi] + self.lid.etrans.get(wi.lower(), [wi] * 5)[:5] +
                      self.h2e.get(wh if li == 'hi' else None, [wi])[:1]
                      for wi, wh, li in zip(words, hi_mono, ltags)]
        en_mono = self.max_likelihood(en_trellis, 'en')
        out = hi_mono[:]
        for i, _ in enumerate(hi_mono):
            if ltags[i] in ['univ', 'acro', 'ne']:
                out[i] = words[i]
            elif ltags[i] in ['en', 'ne']:
                if words[i].lower() == en_mono[i]:
                    out[i] = words[i]
                elif self.ed.check(words[i].lower()) and len(words[i]) > 1:
                    out[i] = words[i]
                elif words[i].lower() in ['a', 'i']:
                    out[i] = words[i]
                else:
                    out[i] = en_mono[i]
            elif not self.wx:
                out[i] = self.wxc.convert(out[i])
        return out

    def tag_sent(self, sent, trans=True):
        sent = sent.split()
        sent, ltags = zip(*self.lid.tag_sent(sent))
        dec = self.decode(sent, ltags)
        return zip(sent, dec, ltags)
def test_raw_text(self):
    for lang in self.languages:
        wx_con = WXC(order='utf2wx', lang=lang)
        utf_con = WXC(order='wx2utf', lang=lang)
        with io.open('%s/plain_text/%s.txt' % (self.test_dir, lang),
                     encoding='utf-8') as fp:
            for line in fp:
                wx = wx_con.convert(line)
                utf = utf_con.convert(wx)
                wx_ = wx_con.convert(utf)
                self.assertEqual(wx, wx_)
import argparse
import re
import string
import warnings

from wxconv import WXC

warnings.filterwarnings('ignore')

parser = argparse.ArgumentParser(
    description='translate.py',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
opts.add_md_help_argument(parser)  # `opts` comes from the surrounding OpenNMT-py project
opts.translate_opts(parser)
opt = parser.parse_args()

if opt.lang == 'hin':
    to_wx = WXC(order='utf2wx', lang='hin')
    non_alpha = re.compile(u'([^a-zA-Z]+)')
    alpha_letters = set(string.ascii_letters)

trans_dict = dict()
broken_words = dict()


def addone(text):
    for word in text.split():
        if opt.lang == 'hin' and (word[0] == word[-1] == '_'):
            trans_dict[word] = word[1:-1]
            continue
        if word in trans_dict:
            continue
        words = non_alpha.split(word)
from wxconv import WXC
import random

wxc = WXC(order='utf2wx')


def tag_extract(string):
    tag_list = ['PERSON', 'ORGANIZATION', 'LOCATION', 'ENTERTAINMENT',
                'FACILITIES', 'ARTIFACT', 'LIVTHINGS', 'LOCOMOTIVE',
                'PLANTS', 'MATERIALS', 'DISEASE', 'O']
    for tag in tag_list:
        if tag in string:
            return tag


def write_conll(sentences, output):
    f = open(output, 'w')
    for sentence in sentences:
        for word in sentence:
            f.write(word + '\n')
        f.write('\n')
    f.close()


sentences = []
sentence = []
ner_tag = 'O'
for line in open('hindi-annotated-ssf-Form.txt'):
    f2.close()
elif choice == '19':
    sentenceid = input("Enter the sentence id: ")
    spl = re.split(r'-', sentenceid)[0]
    f = open("./treebanks/UD_Hindi-HDTB/hi_hdtb-ud-" + spl + ".conllu", "r").readlines()
    f1 = open("./conll/conll_" + sentenceid + ".dat", "w")
    n = len(f)
    con = WXC(order='utf2wx')  # reuse one converter rather than one per token
    for i in range(0, n):
        if sentenceid in f[i]:
            j = i + 2
            while f[j] != '\n':
                g = re.split(r'\t', f[j].rstrip())
                # Convert the word form to WX; blank out the unused CoNLL columns.
                g[1] = con.convert(g[1])
                g[2] = "_"
                g[4] = "_"
                g[5] = "_"
                g[8] = "_"
                g[9] = "_"
                for x in g:
                    f1.write(x + '\t')
                f1.write('\n')
                j = j + 1
            break
    f1.close()
    cmd = 'python3 Createdata.py ' + sentenceid
    os.system(cmd)
def devnagari(self):
    con = WXC(order='wx2utf')
    return con.convert(self.text)
import pprint
import string
import pickle

from wxconv import WXC

converter = WXC(order='utf2wx', lang='ben')


def increment_map(dictionary, key, value):
    # dictionary[key] is itself a counter mapping value -> frequency.
    if key in dictionary:
        temp = dictionary[key]
        if value in temp:
            temp[value] += 1
        else:
            temp[value] = 1
    else:
        dictionary[key] = {value: 1}


def get_marker(words):
    # TODO work on
    marker = "-"
    if len(words) > 1:
        marker = words[-1]
        marker = converter.convert(marker)
    return marker


def trim(dictionary, cutoff=3):
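# A quick illustration of increment_map's nested-counter behavior
# (toy keys chosen for this sketch; not from the original project):
counts = {}
increment_map(counts, 'ne', 'PERSON')
increment_map(counts, 'ne', 'PERSON')
increment_map(counts, 'ne', 'LOCATION')
print(counts)  # {'ne': {'PERSON': 2, 'LOCATION': 1}}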
import os

import SSF_converter.SSF_to_Input as input_converter
import SSF_converter.output_to_SSF as ssf_converter
import SSF_converter.output_to_SSF2 as ssf_converter2
import morph_analyser.make_prediction as morph_analyser
import Pos_Tagger.final_predict_model as pos_tagger
import chunking.predict as chunker
import lexical.dictionaryAmit1 as lexical
# import morph_generation.morph_inflection as morph_generator
import torch
from wxconv import WXC

con = WXC(order='utf2wx')
con1 = WXC(order='wx2utf', lang='hin')

BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
BASE_DIR += '/SSF_converter/'
main_file = BASE_DIR + "main_format.txt"
local_add = os.path.dirname(os.path.abspath('__file__'))
pos_tagger_input_file = local_add + '/Pos_Tagger/sentinput.txt'
chunker_input_file = local_add + '/chunking/input.txt'
match_list = []
checklist = []


def main_format_writer(data):
    # Writes the data into main_format.txt.
    out_main_file = open(main_file, 'w', encoding='utf-8')
def wx_utf_converter_sentence(sentence):
    con = WXC(order='wx2utf', lang='hin')
    return con.convert(sentence)
import os
import traceback

import tornado.web
from tornado import gen

from logger import *
from prediction.ner.predict import predict_tags
from prediction.intent_classifier.predict import predict_classifier
from prediction.smalltalk.getResponse import get_response
from wxconv import WXC

cur_dir = os.path.dirname(os.path.abspath(__file__))
con = WXC(order='utf2wx')
con1 = WXC(order='wx2utf', lang='hin')


@gen.coroutine
def process(data):
    module = data['module']
    input_string = data['data']['queries'][0]
    # The query is already a unicode string in Python 3.
    transformed_query = con.convert(input_string)
    try:
        if module == 'ner':
            output = yield predict_tags(input_string)
        if module == 'all':
            output1 = yield predict_classifier(transformed_query)
            if output1 == 'cab_book':
                output = yield predict_tags(input_string)
            else:
                output = yield get_response(transformed_query)
                output = con1.convert(output)
        if module == 'smalltalk':
            output = yield get_response(transformed_query)
from wxconv import WXC
import random

wxc = WXC(order='utf2wx')


def tag_extract(string):
    # Map AnnCorra NER tags onto CoNLL-style I- tags.
    tag_list = ['PERSON', 'ORGANIZATION', 'LOCATION', 'ENTERTAINMENT',
                'FACILITIES', 'ARTIFACT', 'LIVTHINGS', 'LOCOMOTIVE',
                'PLANTS', 'MATERIALS', 'DISEASE', 'O']
    for tag in tag_list:
        if tag in string:
            if tag == 'PERSON':
                return 'I-PER'
            elif tag == 'ORGANIZATION':
                return 'I-ORG'
            elif tag == 'LOCATION':
                return 'I-LOC'
            elif tag == 'O':
                return tag
            else:
                return 'I-MISC'


def write_conll(sentences, output):
    f = open(output, 'w')
    for sentence in sentences:
        for word in sentence:
            f.write(word + '\n')
        f.write('\n')
    f.close()


sentences = []
sentence = []
ner_tag = 'O'
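# Sanity check for tag_extract above, using a made-up annotation string
# (illustrative input, not from the original corpus):
print(tag_extract('<fs name="raama" ne=PERSON>'))  # -> 'I-PER'
print(tag_extract('plain text with no tag'))       # -> None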
trigrams = [sample for sample in list_samples
            if data[sample]['novelty']['rouge-3'] >= 25]
print("Trigrams_Novelty_range >=%d --> #of samples = %d" % (i, len(trigrams)))
print("\n---------------------------------------\n")

n = 5
import random
random.seed(42)
random_samples = random.sample(trigrams, n)

### WX format:
from wxconv import WXC
con = WXC(order='utf2wx', lang='tel')

for sample_num in random_samples:
    n3 = data[sample_num]['novelty']['rouge-3']
    n2 = data[sample_num]['novelty']['rouge-2']
    n1 = data[sample_num]['novelty']['rouge-1']
    p1 = data[sample_num]['copy_precision']['rouge-1']
    p2 = data[sample_num]['copy_precision']['rouge-2']
    p3 = data[sample_num]['copy_precision']['rouge-3']
    print("Sample_num= %s, novelty_rouge1 = %.2f, novelty_rouge2 = %.2f, "
          "novelty_rouge3 = %.2f" % (sample_num, n1, n2, n3))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys

from wxconv import WXC

fil = sys.argv[1]
data_file = open(fil, 'r', encoding='utf-8').read().splitlines()
con = WXC(order='utf2wx')
data_WX = [con.convert(row) for row in data_file]
print(data_WX)
with open('./output_WX.txt', 'w', encoding='utf-8') as outfile:
    for wd in data_WX:
        outfile.write(wd + '\n')
# You would need to manually remove word-final schwa and nukta in the
# generated output.
# coding=utf-8
from keras.models import load_model
from training.ner.get_word_vectors import get_word_vector, get_sentence_vectors
from keras.preprocessing import sequence
import numpy as np
import pickle
import BotServiceConfig as env
from tornado import gen
import common.load_models as models
from wxconv import WXC

con = WXC(order='wx2utf', lang='hin')


@gen.coroutine
def predict_tags(sentence):
    # In Python 3 the incoming sentence is already a unicode str.
    sentence_list = sentence.strip().split()
    sent_len = len(sentence_list)
    # Get padded word vectors.
    x = encode_sentence(sentence)
    tags = models.tagger1.predict(x, batch_size=1)[0]
    # Keep only the positions that correspond to real (non-padding) tokens.
    tags = tags[-sent_len:]
    pred_tags = decode_result(tags)
    entityList = []
    for i in range(len(pred_tags)):
        en = {}
        if 'B-' in pred_tags[i]:
            en[pred_tags[i][2:]] = None
            st = sentence_list[i]
            j = i + 1
from wxconv import WXC
import argparse
import random

parser = argparse.ArgumentParser()
parser.add_argument('--format', type=str, required=True,
                    help='format of the file text/ssf')
parser.add_argument('--input', type=str, required=True,
                    help='input file to be converted')
parser.add_argument('--dist', type=int, nargs='+', default=[1, 0, 0],
                    help='train:test_a:test_b')
args = parser.parse_args()

wxc = WXC(order='utf2wx')
if args.format == 'text':
    open('hin.text', 'w').write(wxc.convert(open(args.input).read()))
elif args.format == 'ssf':
    assert len(args.dist) == 3


def tag_extract(string):
    tag_list = ['PERSON', 'ORGANIZATION', 'LOCATION', 'ENTERTAINMENT',
                'FACILITIES', 'ARTIFACT', 'LIVTHINGS', 'LOCOMOTIVE',
                'PLANTS', 'MATERIALS', 'DISEASE', 'O']
    for tag in tag_list:
        if tag in string:
            return tag


def write_conll(sentences, output):
    f = open(output, 'w')
    for sentence in sentences:
        for word in sentence:
            f.write(word + '\n')
        f.write('\n')
    f.close()
from wxconv import WXC

con = WXC(order='utf2wx')
con1 = WXC(order='wx2utf', lang='hin')
print(con.convert('लेस।'))    # UTF-8 Devanagari -> WX notation
print(con1.convert('esehI'))  # WX notation -> UTF-8 Devanagari
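# Round-trip property that the test snippets above assert, mirrored here
# as a sketch (assumes the same default converter settings):
wx = con.convert('लेस।')
utf = con1.convert(wx)
assert con.convert(utf) == wx  # utf -> wx -> utf -> wx is stable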
import os
import csv
import pprint
import pickle
import difflib

from tqdm import tqdm
from wxconv import WXC

converter = WXC(order='utf2wx')


def get_anncorra(directory_path):
    # Build co-occurrence maps between markers and AnnCorra tags.
    marker_anncorra = {}
    anncorra_marker = {}
    print('Opening directory: ', directory_path)
    for filename in tqdm(os.listdir(directory_path)):
        filepath = os.path.join(directory_path, filename)
        with open(filepath, newline='') as csvfile:
            csvreader = csv.reader(csvfile, delimiter='\t')
            for row in csvreader:
                if row:
                    marker = get_marker(row)
                    tag = get_tag(row)
                    increment_map(marker_anncorra, marker, tag)
                    increment_map(anncorra_marker, tag, marker)
    print('Got AnnCorra')
    return marker_anncorra, anncorra_marker