def test_other(self):
    """Round-trip the annotated Hindi fixtures through WX and back.

    For each of the ``ssf``, ``conll`` and ``tnt`` fixture files the text
    is converted utf -> wx -> utf -> wx, and the first and second WX
    renderings must be equal.  SSF input is checked one ``<Sentence>``
    body at a time; the other formats are checked line by line.  Every
    unit is asserted individually so a failure in any sentence or line
    is reported.
    """
    sentence_re = re.compile("(<Sentence id=.*?>)(.*?)</Sentence>", re.S)
    for ext in ['ssf', 'conll', 'tnt']:
        wx_con = WXC(
            order='utf2wx',
            lang='hin',
            format_=ext,
            ssf_type='intra',
            rmask=False)
        utf_con = WXC(
            order='wx2utf',
            lang='hin',
            format_=ext,
            ssf_type='intra',
            rmask=False)
        with io.open('%s/%s/hin.%s' % (self.test_dir, ext, ext),
                     encoding='utf-8') as fp:
            if ext == "ssf":
                # Group 2 is the text between the opening and closing tags.
                units = [match.group(2).strip()
                         for match in sentence_re.finditer(fp.read())]
            else:
                units = fp
            for unit in units:
                wx = wx_con.convert(unit)
                utf = utf_con.convert(wx)
                wx_ = wx_con.convert(utf)
                self.assertEqual(wx, wx_)
def test_raw_text(self):
    """Check that plain-text samples survive a utf -> wx -> utf -> wx trip.

    For every language in ``self.languages`` the matching file under
    ``<test_dir>/plain_text/`` is read line by line; the WX form of a
    line must equal the WX form obtained after converting back to UTF
    and to WX again.
    """
    for language in self.languages:
        to_wx = WXC(order='utf2wx', lang=language)
        to_utf = WXC(order='wx2utf', lang=language)
        sample_path = '%s/plain_text/%s.txt' % (self.test_dir, language)
        with io.open(sample_path, encoding='utf-8') as handle:
            for raw_line in handle:
                first_pass = to_wx.convert(raw_line)
                restored = to_utf.convert(first_pass)
                second_pass = to_wx.convert(restored)
                self.assertEqual(first_pass, second_pass)
def wx_utf_converter(relation_df):
    """Add a ``UTF_hindi`` column with the wx2utf conversion of ``WORD``.

    Each entry of ``relation_df.WORD`` is looked up by the labels in
    ``relation_df.index`` and converted with a Hindi ``wx2utf`` converter.
    The object passed in is modified in place and also returned.
    """
    to_utf = WXC(order='wx2utf', lang='hin')
    relation_df['UTF_hindi'] = [
        to_utf.convert(relation_df.WORD[label])
        for label in relation_df.index
    ]
    return relation_df
def __init__(self, model, lang, gpu=False, wx=False):
    """Prepare language resources and load the transliteration model.

    Args:
        model: stored on ``self.opt.model`` before the model is loaded.
        lang: language code; ``'hin'`` and ``'eng'`` each select their
            own splitting regex, alphabet and abbreviation table.
        gpu: when true, ``self.opt.gpu`` is forced to ``0``.
        wx: stored as ``self.is_ip_wx``.
    """
    self.lang = lang
    self.is_ip_wx = wx

    # Translation options are parsed from the process command line.
    cli = argparse.ArgumentParser(
        description='transliterate.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(cli)
    opts.translate_opts(cli)
    self.opt = cli.parse_args()

    self.trans_dict = dict()
    self.broken_words = dict()
    here = os.path.dirname(os.path.abspath(__file__))

    if self.lang == 'hin':
        self.to_utf = WXC(order='wx2utf', lang='hin')
        self.non_alpha = re.compile(u'([^a-zA-Z]+)')
        self.alpha_letters = set(string.ascii_letters)
        self.com_abbr = {
            'b': ['BI', 'be'],
            'd': ['xI', 'xe'],
            'g': ['jI'],
            'k': ['ke', 'ki', 'kI'],
            'h': ['hE', 'hEM'],
            'ha': ['hE', 'hEM'],
            'n': ['ina', 'ne'],
            'm': ['meM', 'mEM'],
            'p': ['pe'],
            'q': ['kyoM'],
            'r': ['Ora', 'ora'],
            's': ['isa', 'se'],
            'y': ['ye'],
        }
    elif self.lang == 'eng':
        self.non_alpha = re.compile(u'([^a-z]+)')
        # Lowercase ASCII letters only (the first 26 of ascii_letters).
        self.alpha_letters = set(string.ascii_lowercase)
        with open('%s/extras/COMMON_ABBR.eng' % here) as abbr_file:
            self.com_abbr = {}
            # Each line: abbreviation, whitespace, '|'-separated expansions.
            for entry in abbr_file:
                abbr, expansions = entry.split()
                self.com_abbr[abbr] = expansions.split('|')

    # Default model options, obtained by parsing an empty argument list.
    defaults_parser = argparse.ArgumentParser(description='train.py')
    opts.model_opts(defaults_parser)
    model_defaults = defaults_parser.parse_known_args([])[0]

    if gpu:
        self.opt.gpu = 0
    self.opt.cuda = self.opt.gpu > -1
    self.opt.model = model
    self.opt.n_best = 5
    self.opt.lang = lang
    if self.opt.cuda:
        torch.cuda.set_device(self.opt.gpu)

    # Load the model.
    loaded = onmt.ModelConstructor.load_test_model(
        self.opt, model_defaults.__dict__)
    self.fields, self.model, self.model_opt = loaded
def _convert_ssf_line(line, converter):
    """Return the output text for one SSF line, converting only its word field."""
    line = line.strip()
    if not line:
        return u"\n"
    if line[0] == "<":
        # Markup lines are copied through unchanged.
        return u"%s\n" % line
    fields = line.split()
    if fields[0] == "))":
        return u"\t%s\n" % line
    # Lines with first field "0" or second field "((" carry no word to
    # convert.  Lines with fewer than three fields have no word/tag pair
    # either, so they are passed through instead of raising IndexError.
    if fields[0] == "0" or len(fields) < 3 or fields[1] == "((":
        return u"%s\n" % u"\t".join(fields)
    word, tag = fields[1], fields[2]
    return u"%s\t%s\t%s\n" % (fields[0], converter.convert(word), tag)


def convert_encoding(filename, text_type, language, in_enc, out_enc):
    """Read *filename* as UTF-8 and return its text converted by WXC.

    Args:
        filename: path of the input file.
        text_type: one of ``"ssf"``, ``"tnt"``, ``"text"`` or ``"conll"``.
        language: passed to ``WXC`` as ``lang``.
        in_enc: source encoding name; with ``out_enc`` it forms the
            ``WXC`` ``order`` argument ``"<in_enc>2<out_enc>"``.
        out_enc: target encoding name.

    Returns:
        The converted text.  For ``"ssf"`` the file is handled line by
        line and only the second field of word lines is converted; for
        the other types the whole file content is converted in one call.

    Raises:
        ValueError: if *text_type* is not one of the supported values.
    """
    order = '%s2%s' % (in_enc, out_enc)
    # Extra WXC keyword arguments for the whole-file formats.
    format_kwargs = {
        "tnt": {'format_': 'tnt'},
        "text": {},
        "conll": {'format_': 'conll'},
    }
    with codecs.open(filename, 'r', encoding='utf-8') as fp:
        logger.info('Loading text_type: %s data', text_type)
        if text_type == "ssf":
            converter = WXC(order=order, lang=language)
            return u"".join(_convert_ssf_line(line, converter) for line in fp)
        if text_type not in format_kwargs:
            raise ValueError("Unknown Format %s" % text_type)
        converter = WXC(order=order, lang=language, **format_kwargs[text_type])
        return converter.convert(fp.read())
def wx_utf_converter(relation_df):
    """Add a ``UTF_hindi`` column with the wx2utf conversion of ``WORD``.

    Each entry of ``relation_df.WORD`` is looked up by the labels in
    ``relation_df.index`` and converted with a Hindi ``wx2utf`` converter.
    The converted list is printed, stored as the new column, and the
    object passed in is returned.
    """
    to_utf = WXC(order='wx2utf', lang='hin')
    converted = [
        to_utf.convert(relation_df.WORD[label])
        for label in relation_df.index
    ]
    print(converted)
    relation_df['UTF_hindi'] = converted
    return relation_df
def __init__(self, lid, htrans=None, etrans=None, wx=False):
    """Load the spell checker, language models, dictionaries and LID model.

    Args:
        lid: passed to ``LID`` as ``model``.
        htrans: forwarded to ``LID`` as ``htrans``.
        etrans: forwarded to ``LID`` as ``etrans``.
        wx: stored as ``self.wx``; when false a Hindi ``wx2utf``
            converter is created as ``self.wxc``.
    """

    def load_dict(path):
        # Each line: key, whitespace, '|'-separated alternatives.
        # The file is closed as soon as it has been read.
        table = {}
        with io.open(path) as fp:
            for entry in fp:
                fields = entry.split()
                table[fields[0]] = fields[1].split('|')
        return table

    self.ed = enchant.Dict('en')
    self.hblm = kenlm.LanguageModel('lm/hindi-n3-p5-lmplz.blm')
    self.eblm = kenlm.LanguageModel('lm/english-n3-p10-lmplz.blm')
    self.so_dec_eng = so_viterbi(self.eblm)
    self.so_dec_hin = so_viterbi(self.hblm)
    self.e2h = load_dict('dicts/ENG2HIN12M.dict')
    self.h2e = load_dict('dicts/HIN2ENG12M.dict')
    self.meta = Meta()
    self.lid = LID(model=lid, etrans=etrans, htrans=htrans)
    self.wx = wx
    if not self.wx:
        self.wxc = WXC(order='wx2utf', lang='hin')
from wxconv import WXC import random wxc = WXC(order='utf2wx') def tag_extract(string): tag_list = [ 'PERSON', 'ORGANIZATION', 'LOCATION', 'ENTERTAINMENT', 'FACILITIES', 'ARTIFACT', 'LIVTHINGS', 'LOCOMOTIVE', 'PLANTS', 'MATERIALS', 'DISEASE', 'O' ] for tag in tag_list: if tag in string: return tag def write_conll(sentences, output): f = open(output, 'w') for sentence in sentences: for word in sentence: f.write(word + '\n') f.write('\n') f.close() sentences = [] sentence = [] ner_tag = 'O' for line in open('hindi-annotated-ssf-Form.txt'):
f2.close() elif choice == '19': sentenceid = input("Enter the sentence id: ") spl = re.split(r'-', sentenceid)[0] f = open("./treebanks/UD_Hindi-HDTB/hi_hdtb-ud-" + spl + ".conllu", "r").readlines() f1 = open("./conll/conll_" + sentenceid + ".dat", "w") n = len(f) for i in range(0, n): if sentenceid in f[i]: j = i + 2 while (f[j] != '\n'): g = re.split(r'\t', f[j].rstrip()) con = WXC(order='utf2wx') g[1] = con.convert(g[1]) g[2] = "_" g[4] = "_" g[5] = "_" g[8] = "_" g[9] = "_" for x in g: f1.write(x + '\t') f1.write('\n') j = j + 1 break f1.close() cmd = 'python3 Createdata.py ' + sentenceid os.system(cmd)
trigrams = [ sample for sample in list_samples if (data[sample]['novelty']['rouge-3'] >= 25) ] print("Trigrams_Novelty_range >=%d --> #of samples = %d" % (i, len(trigrams))) print("\n---------------------------------------\n") #print(trigrams[:10]) n = 5 import random random.seed(42) randome_samples = random.sample(trigrams, n) ### wx formate: from wxconv import WXC con = WXC(order='utf2wx', lang='tel') for sample_num in randome_samples: n3 = data[sample_num]['novelty']['rouge-3'] n2 = data[sample_num]['novelty']['rouge-2'] n1 = data[sample_num]['novelty']['rouge-1'] p1 = data[sample_num]['copy_precision']['rouge-1'] p2 = data[sample_num]['copy_precision']['rouge-2'] p3 = data[sample_num]['copy_precision']['rouge-3'] #print(sample_num, n2, n3) print( "Sample_num= %s, novelty_rouge1 = %.2f, novelty_rouge2 = %.2f, novelty_rouge3 = %.2f " % (sample_num, n1, n2, n3))
import pprint import string import pickle from wxconv import WXC converter = WXC(order='utf2wx', lang='ben') def increment_map(dictionary, key, value): if key in dictionary.keys(): temp = dictionary[key] if value in temp.keys(): temp[value] += 1 else: temp[value] = 1 else: dictionary[key] = {} temp = dictionary[key] temp[value] = 1 def get_marker(words): # TODO work on marker = "-" if len(words) > 1: marker = words[-1] marker = converter.convert(marker) return marker def trim(dictionary, cutoff=3):
# coding=utf-8 from keras.models import load_model from training.ner.get_word_vectors import get_word_vector, get_sentence_vectors from keras.preprocessing import sequence import numpy as np, pickle import BotServiceConfig as env from tornado import gen import common.load_models as models from wxconv import WXC con = WXC(order='wx2utf', lang='hin') @gen.coroutine def predict_tags(sentence): sentence = str(sentence.encode('utf-8', 'ignore')) sentence_list = sentence.strip().split() sent_len = len(sentence_list) # Get padded word vectors x = encode_sentence(sentence) tags = models.tagger1.predict(x, batch_size=1)[0] tags = tags[-sent_len:] pred_tags = decode_result(tags) entityList = [] for i in xrange(len(pred_tags)): en = {} if 'B-' in pred_tags[i]: en[pred_tags[i][2:]] = None st = sentence_list[i] j = i + 1
from wxconv import WXC import warnings warnings.filterwarnings('ignore') parser = argparse.ArgumentParser( description='translate.py', formatter_class=argparse.ArgumentDefaultsHelpFormatter) opts.add_md_help_argument(parser) opts.translate_opts(parser) opt = parser.parse_args() if opt.lang == 'hin': to_wx = WXC(order='utf2wx', lang='hin') non_alpha = re.compile(u'([^a-zA-Z]+)') alpha_letters = set(string.ascii_letters) trans_dict = dict() broken_words = dict() def addone(text): for word in text.split(): if opt.lang == 'hin' and (word[0] == word[-1] == '_'): trans_dict[word] = word[1:-1] continue if word in trans_dict: continue words = non_alpha.split(word)
def wx_utf_converter_sentence(sentence):
    """Return *sentence* converted by a Hindi ``wx2utf`` WXC converter.

    A new converter is built on every call.
    """
    return WXC(order='wx2utf', lang='hin').convert(sentence)
def devnagari(self):
    """Return ``self.text`` converted by a ``wx2utf`` WXC converter.

    No ``lang`` argument is passed, so the converter's own default applies.
    """
    to_utf = WXC(order='wx2utf')
    converted = to_utf.convert(self.text)
    return converted
from wxconv import WXC

# One converter per direction; the utf2wx one is built without a ``lang``
# argument, the wx2utf one is built for Hindi.
con = WXC(order='utf2wx')
con1 = WXC(order='wx2utf', lang='hin')

# Print one sample conversion in each direction.
print(con.convert('लेस।'))
print(con1.convert('esehI'))
import os
import csv
from tqdm import tqdm
import pprint
import pickle
import difflib
from wxconv import WXC

converter = WXC(order='utf2wx')


def get_anncorra(directory_path):
    """Build marker/tag count maps from every file in *directory_path*.

    Each file is read as tab-separated rows.  For every non-empty row
    the marker and tag are obtained with ``get_marker`` and ``get_tag``
    and recorded through ``increment_map`` in both directions.

    Returns:
        A ``(marker_anncorra, anncorra_marker)`` tuple of dicts; the
        first is keyed by marker, the second by tag.
    """
    marker_anncorra = {}
    anncorra_marker = {}
    print('Opening directory: ', directory_path)
    for entry in tqdm(os.listdir(directory_path)):
        entry_path = os.path.join(directory_path, entry)
        with open(entry_path, newline='') as tsv_file:
            for fields in csv.reader(tsv_file, delimiter='\t'):
                if len(fields) == 0:
                    continue
                marker = get_marker(fields)
                tag = get_tag(fields)
                increment_map(marker_anncorra, marker, tag)
                increment_map(anncorra_marker, tag, marker)
    print('Got AnnCorra')
    return marker_anncorra, anncorra_marker