Example #1
def wx_utf_converter(relation_df):
	a = []
	con = WXC(order='wx2utf', lang='hin')
	for i in relation_df.index:
		a.append(con.convert(relation_df.WORD[i]))
	relation_df['UTF_hindi'] = a
	return relation_df
Example #2
def wx_utf_converter(relation_df):
	a = []
	con = WXC(order='wx2utf', lang='hin')
	for i in relation_df.index:
		a.append(con.convert(relation_df.WORD[i]))
	print(a)
	relation_df['UTF_hindi'] = a
	#relation_df['UTF_hindi'] = pd.Series(a).values
	return relation_df
Example #3
 def __init__(self, lid, htrans=None, etrans=None, wx=False):
     self.ed = enchant.Dict('en')
     self.hblm = kenlm.LanguageModel('lm/hindi-n3-p5-lmplz.blm')
     self.eblm = kenlm.LanguageModel('lm/english-n3-p10-lmplz.blm')
     self.so_dec_eng = so_viterbi(self.eblm)
     self.so_dec_hin = so_viterbi(self.hblm)
     self.e2h = {kv.split()[0]:kv.split()[1].split('|') for kv in io.open('dicts/ENG2HIN12M.dict')}
     self.h2e = {kv.split()[0]:kv.split()[1].split('|') for kv in io.open('dicts/HIN2ENG12M.dict')}
     self.meta = Meta()
     self.lid = LID(model=lid, etrans=etrans, htrans=htrans)
     self.wx = wx
     if not self.wx:
         self.wxc = WXC(order='wx2utf', lang='hin')
Example #4
 def test_other(self):
     for ext in ['ssf', 'conll', 'tnt']:
         wx_con = WXC(
             order='utf2wx',
             lang='hin',
             format_=ext,
             ssf_type='intra',
             rmask=False)
         utf_con = WXC(
             order='wx2utf',
             lang='hin',
             format_=ext,
             ssf_type='intra',
             rmask=False)
         with io.open('%s/%s/hin.%s' % (self.test_dir, ext, ext),
                      encoding='utf-8') as fp:
             if ext == "ssf":
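                 # Extract each <Sentence> body from the SSF file and run it through the WX converter.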
                 sentences = re.finditer(
                     "(<Sentence id=.*?>)(.*?)</Sentence>", fp.read(), re.S)
                 for sid_sentence in sentences:
                     sentence = sid_sentence.group(2).strip()
                     wx = wx_con.convert(sentence)
             else:
                 for line in fp:
                     wx = wx_con.convert(line)
                     utf = utf_con.convert(wx)
                     wx_ = wx_con.convert(utf)
                     self.assertEqual(wx, wx_)
Example #5
    def __init__(self, model, lang, gpu=False, wx=False):
        self.lang = lang
        self.is_ip_wx = wx
        parser = argparse.ArgumentParser(
            description='transliterate.py',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(parser)
        opts.translate_opts(parser)

        self.opt = parser.parse_args()
        self.trans_dict = dict()
        self.broken_words = dict()
        file_path = os.path.dirname(os.path.abspath(__file__))

        if self.lang == 'hin':
            self.to_utf = WXC(order='wx2utf', lang='hin')
            self.non_alpha = re.compile(u'([^a-zA-Z]+)')
            self.alpha_letters = set(string.ascii_letters)
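            # Single-letter romanized abbreviations mapped to candidate expansions in WX notation.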
            self.com_abbr = {
                'b': ['BI', 'be'],
                'd': ['xI', 'xe'],
                'g': ['jI'],
                'k': ['ke', 'ki', 'kI'],
                'h': ['hE', 'hEM'],
                'ha': ['hE', 'hEM'],
                'n': ['ina', 'ne'],
                'm': ['meM', 'mEM'],
                'p': ['pe'],
                'q': ['kyoM'],
                'r': ['Ora', 'ora'],
                's': ['isa', 'se'],
                'y': ['ye']
            }

        if self.lang == 'eng':
            self.non_alpha = re.compile(u'([^a-z]+)')
            self.alpha_letters = set(string.ascii_letters[:26])
            with open('%s/extras/COMMON_ABBR.eng' % file_path) as fp:
                self.com_abbr = {}
                for line in fp:
                    k, v = line.split()
                    self.com_abbr[k] = v.split('|')

        dummy_parser = argparse.ArgumentParser(description='train.py')
        opts.model_opts(dummy_parser)
        dummy_opt = dummy_parser.parse_known_args([])[0]
        if gpu:
            self.opt.gpu = 0

        self.opt.cuda = self.opt.gpu > -1
        self.opt.model = model
        self.opt.n_best = 5
        self.opt.lang = lang
        if self.opt.cuda:
            torch.cuda.set_device(self.opt.gpu)

        # Load the model.
        self.fields, self.model, self.model_opt = onmt.ModelConstructor.load_test_model(
            self.opt, dummy_opt.__dict__)
Example #6
def convert_encoding(filename, text_type, language, in_enc, out_enc):

    with codecs.open(filename, 'r', encoding='utf-8') as fp:
        logger.info('Loading text_type: %s data' % (text_type))
        output_data = ""
        if text_type == "ssf":
            converter = WXC(order='%s2%s' % (in_enc, out_enc), lang=language)
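            # Walk the SSF table line by line; only the word column of token rows is converted.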
            for line in fp:
                line = line.strip()
                ds = line.split()
                #print("Line:--", ds)
                if line == "":
                    output_data += u"\n"
                elif line[0] == "<":
                    output_data += u"%s\n" % (line)
                elif ds[0] == "))":
                    output_data += u"\t%s\n" % (line)
                elif ds[0] == "0" or ds[1] == "((":
                    output_data += u"%s\n" % (u"\t".join(ds))
                elif ds[1] != "":
                    #print("check", ds)
                    word, tag = ds[1], ds[2]
                    word_con = converter.convert(word)
                    output_data += u"%s\t%s\t%s\n" % (ds[0], word_con, tag)
                else:
                    pass
            return output_data
        else:
            if text_type == "tnt":
                converter = WXC(order='%s2%s' % (in_enc, out_enc),
                                lang=language,
                                format_='tnt')
            elif text_type == "text":
                converter = WXC(order='%s2%s' % (in_enc, out_enc),
                                lang=language)
            elif text_type == "conll":
                converter = WXC(order='%s2%s' % (in_enc, out_enc),
                                lang=language,
                                format_='conll')
            else:
                raise Exception("Unknown Format %s" % text_type)
        text = fp.read()
        output_data = converter.convert(text)
        return output_data
Example #7
class ThreeStepDecoding(object):
    def __init__(self, lid, htrans=None, etrans=None, wx=False):
        self.ed = enchant.Dict('en')
        self.hblm = kenlm.LanguageModel('lm/hindi-n3-p5-lmplz.blm')
        self.eblm = kenlm.LanguageModel('lm/english-n3-p10-lmplz.blm')
        self.so_dec_eng = so_viterbi(self.eblm)
        self.so_dec_hin = so_viterbi(self.hblm)
        self.e2h = {kv.split()[0]:kv.split()[1].split('|') for kv in io.open('dicts/ENG2HIN12M.dict')}
        self.h2e = {kv.split()[0]:kv.split()[1].split('|') for kv in io.open('dicts/HIN2ENG12M.dict')}
        self.meta = Meta()
        self.lid = LID(model=lid, etrans=etrans, htrans=htrans)
        self.wx = wx
        if not self.wx:
            self.wxc = WXC(order='wx2utf', lang='hin')

    def max_likelihood(self, n_sentence, target, k_best=7):
        if target == 'en':
            auto_tags = self.so_dec_eng.decode(n_sentence, len(n_sentence), k_best)
        else:
            auto_tags = self.so_dec_hin.decode(n_sentence, len(n_sentence), k_best)
        #beamsearch
        best_sen = [n_sentence[idx][at] for idx, at in enumerate(auto_tags)]
        return best_sen

    def decode(self, words, ltags):
        words = [re.sub(r'([a-z])\1\1+', r'\1', w) for w in words]
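        # Per token, build a trellis of Hindi candidates: back-transliterations,
        # a dictionary translation for English-tagged tokens, and the word itself as a fallback.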
        hi_trellis = [self.lid.htrans.get(wi.lower(), [wi]*5)[:5] +
                      self.e2h.get(wi.lower() if li == 'en' else None, [u'_%s_' %wi])[:1] +
                      [u'_%s_'%wi] for wi,li in zip(words, ltags)]
        # import pdb; pdb.set_trace()
        hi_mono = self.max_likelihood(hi_trellis, 'hi')
        en_trellis = [[wi] + self.lid.etrans.get(wi.lower(), [wi]*5)[:5] +
                      self.h2e.get(wh if li == 'hi' else None, [wi])[:1]
                      for wi, wh, li in zip(words, hi_mono, ltags)]
        en_mono = self.max_likelihood(en_trellis, 'en')
        out = hi_mono[:]
        for i, _ in enumerate(hi_mono):
            if ltags[i] in ['univ', 'acro', 'ne']:
                out[i] = words[i]
            elif ltags[i] in ['en', 'ne']:
                if words[i].lower() == en_mono[i]:
                    out[i] = words[i]
                elif self.ed.check(words[i].lower()) and len(words[i])>1:
                    out[i] = words[i]
                elif words[i].lower() in ['a', 'i']:
                    out[i] = words[i]
                else:
                    out[i] = en_mono[i]
            elif not self.wx:
                out[i] = self.wxc.convert(out[i])
        return out

    def tag_sent(self, sent, trans=True):
        sent = sent.split()
        sent, ltags = zip(*self.lid.tag_sent(sent))
        dec = self.decode(sent, ltags)
        return zip(sent, dec, ltags)
Example #8
 def test_raw_text(self):
     for lang in self.languages:
         wx_con = WXC(order='utf2wx', lang=lang)
         utf_con = WXC(order='wx2utf', lang=lang)
         with io.open('%s/plain_text/%s.txt' % (self.test_dir, lang),
                      encoding='utf-8') as fp:
             for line in fp:
                 wx = wx_con.convert(line)
                 utf = utf_con.convert(wx)
                 wx_ = wx_con.convert(utf)
                 self.assertEqual(wx, wx_)
Example #9
import argparse
import re
import string
import warnings

from wxconv import WXC
# Note: `opts` is assumed to come from OpenNMT-py (e.g. `from onmt import opts`); adjust to your setup.

warnings.filterwarnings('ignore')

parser = argparse.ArgumentParser(
    description='translate.py',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
opts.add_md_help_argument(parser)
opts.translate_opts(parser)

opt = parser.parse_args()

if opt.lang == 'hin':
    to_wx = WXC(order='utf2wx', lang='hin')
    non_alpha = re.compile(u'([^a-zA-Z]+)')
    alpha_letters = set(string.ascii_letters)

trans_dict = dict()
broken_words = dict()


def addone(text):
    for word in text.split():
        if opt.lang == 'hin' and (word[0] == word[-1] == '_'):
            trans_dict[word] = word[1:-1]
            continue
        if word in trans_dict:
            continue
        words = non_alpha.split(word)
Example #10
from wxconv import WXC
import random

wxc = WXC(order='utf2wx')


def tag_extract(string):
    tag_list = [
        'PERSON', 'ORGANIZATION', 'LOCATION', 'ENTERTAINMENT', 'FACILITIES',
        'ARTIFACT', 'LIVTHINGS', 'LOCOMOTIVE', 'PLANTS', 'MATERIALS',
        'DISEASE', 'O'
    ]
    for tag in tag_list:
        if tag in string:
            return tag


def write_conll(sentences, output):
    f = open(output, 'w')
    for sentence in sentences:
        for word in sentence:
            f.write(word + '\n')
        f.write('\n')
    f.close()


sentences = []
sentence = []
ner_tag = 'O'

for line in open('hindi-annotated-ssf-Form.txt'):
Example #11
        f2.close()
    elif choice == '19':

        sentenceid = input("Enter the sentence id: ")
        spl = re.split(r'-', sentenceid)[0]
        f = open("./treebanks/UD_Hindi-HDTB/hi_hdtb-ud-" + spl + ".conllu",
                 "r").readlines()
        f1 = open("./conll/conll_" + sentenceid + ".dat", "w")
        n = len(f)

        for i in range(0, n):
            if sentenceid in f[i]:
                j = i + 2
                while (f[j] != '\n'):
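                    # Split the CoNLL-U row, convert the word form to WX, and blank out the columns that are not needed.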
                    g = re.split(r'\t', f[j].rstrip())
                    con = WXC(order='utf2wx')
                    g[1] = con.convert(g[1])
                    g[2] = "_"
                    g[4] = "_"
                    g[5] = "_"
                    g[8] = "_"
                    g[9] = "_"
                    for x in g:
                        f1.write(x + '\t')
                    f1.write('\n')
                    j = j + 1
                break

        f1.close()
        cmd = 'python3 Createdata.py ' + sentenceid
        os.system(cmd)
Example #12
	def devnagari(self):
		con = WXC(order='wx2utf')
		return con.convert(self.text)
Example #13
import pprint
import string
import pickle
from wxconv import WXC

converter = WXC(order='utf2wx', lang='ben')


def increment_map(dictionary, key, value):
    if key in dictionary.keys():
        temp = dictionary[key]
        if value in temp.keys():
            temp[value] += 1
        else:
            temp[value] = 1
    else:
        dictionary[key] = {}
        temp = dictionary[key]
        temp[value] = 1


def get_marker(words):
    # TODO work on
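    # When the chunk has more than one word, take the last word as the marker and convert it to WX.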
    marker = "-"
    if len(words) > 1:
        marker = words[-1]
        marker = converter.convert(marker)
    return marker


def trim(dictionary, cutoff=3):
Example #14
import os
import SSF_converter.SSF_to_Input as input_converter
import SSF_converter.output_to_SSF as ssf_converter
import SSF_converter.output_to_SSF2 as ssf_converter2
import morph_analyser.make_prediction as morph_analyser
import Pos_Tagger.final_predict_model as pos_tagger
import chunking.predict as chunker
import lexical.dictionaryAmit1 as lexical
# import morph_generation.morph_inflection as morph_generator
import torch

from wxconv import WXC

con = WXC(order='utf2wx')
con1 = WXC(order='wx2utf', lang='hin')
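# Two converters: con goes from UTF (Devanagari) to WX, con1 from WX back to Devanagari.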

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR += '/SSF_converter/'
main_file = BASE_DIR + "main_format.txt"
local_add = os.path.dirname(os.path.abspath(__file__))
pos_tagger_input_file = local_add + '/Pos_Tagger/sentinput.txt'
chunker_input_file = local_add + '/chunking/input.txt'

match_list = []

checklist = []


def main_format_writer(data):
    # This file writes in main_format.txt.
    out_main_file = open(main_file, 'w', encoding='utf-8')
Example #15
def wx_utf_converter_sentence(sentence):
	con = WXC(order='wx2utf', lang='hin')
	sentence1 = con.convert(sentence)
	return sentence1
Example #16
import os
import tornado.web
from tornado import gen
from logger import *
import traceback
from prediction.ner.predict import predict_tags
from prediction.intent_classifier.predict import predict_classifier
from prediction.smalltalk.getResponse import get_response
from wxconv import WXC

cur_dir = os.path.dirname(os.path.abspath(__file__))
con = WXC(order='utf2wx')
con1 = WXC(order='wx2utf', lang='hin')
@gen.coroutine
def process(data):
    print(data)
    module = data['module']
    input_string = data['data']['queries'][0]
    transformed_query = con.convert(unicode(input_string))
    try:
        if(module == 'ner'):
            output = yield predict_tags(input_string)
        if(module == 'all'):
            output1 = yield predict_classifier(transformed_query)
            if output1 == 'cab_book':
                output = yield predict_tags(input_string)
            else:
                output = yield get_response(transformed_query)
                output = con1.convert(output)
        if(module == 'smalltalk'):
            output = yield get_response(transformed_query)
Example #17
from wxconv import WXC
import random

wxc = WXC(order='utf2wx')

def tag_extract(string):
	tag_list = ['PERSON', 'ORGANIZATION', 'LOCATION', 'ENTERTAINMENT', 'FACILITIES', 'ARTIFACT', 'LIVTHINGS', 'LOCOMOTIVE', 'PLANTS', 'MATERIALS', 'DISEASE', 'O']
	for tag in tag_list:
		if tag in string:
			if tag=='PERSON':
				return 'I-PER'
			elif tag=='ORGANIZATION':
				return 'I-ORG'
			elif tag=='LOCATION':
				return 'I-LOC'
			elif tag=='O':
				return tag
			else :
				return 'I-MISC'

def write_conll(sentences, output):
	f = open(output, 'w')
	for sentence in sentences:
		for word in sentence:
			f.write(word + '\n')
		f.write('\n')
	f.close()

sentences = []
sentence = []
ner_tag = 'O'
Example #18
trigrams = [
    sample for sample in list_samples
    if (data[sample]['novelty']['rouge-3'] >= 25)
]
print("Trigrams_Novelty_range >=%d --> #of samples = %d" % (i, len(trigrams)))
print("\n---------------------------------------\n")
#print(trigrams[:10])

n = 5
import random
random.seed(42)
random_samples = random.sample(trigrams, n)

### wx format:
from wxconv import WXC
con = WXC(order='utf2wx', lang='tel')

for sample_num in random_samples:

    n3 = data[sample_num]['novelty']['rouge-3']
    n2 = data[sample_num]['novelty']['rouge-2']
    n1 = data[sample_num]['novelty']['rouge-1']

    p1 = data[sample_num]['copy_precision']['rouge-1']
    p2 = data[sample_num]['copy_precision']['rouge-2']
    p3 = data[sample_num]['copy_precision']['rouge-3']

    #print(sample_num, n2, n3)
    print(
        "Sample_num= %s, novelty_rouge1 = %.2f, novelty_rouge2 = %.2f, novelty_rouge3 = %.2f  "
        % (sample_num, n1, n2, n3))
Example #19
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from wxconv import WXC
import sys

fil = sys.argv[1]
data_file = open(fil, 'r', encoding='utf-8').read().splitlines()

con = WXC(order='utf2wx')

data_WX = [con.convert(row) for row in data_file]

#data_WX_words = [a + '\t' + b for a, b in zip(data_serial, data_WX)]
print(data_WX)
with open('./output_WX.txt', 'w', encoding='utf-8') as outfile:
    for wd in data_WX:
        outfile.write(wd + '\n')

### You would need to  manually remove word-final schwa, nukta in the output generated. ######
Example #20
# coding=utf-8
from keras.models import load_model
from training.ner.get_word_vectors import get_word_vector, get_sentence_vectors
from keras.preprocessing import sequence
import numpy as np, pickle
import BotServiceConfig as env
from tornado import gen
import common.load_models as models
from wxconv import WXC

con = WXC(order='wx2utf', lang='hin')


@gen.coroutine
def predict_tags(sentence):
    sentence = str(sentence.encode('utf-8', 'ignore'))
    sentence_list = sentence.strip().split()
    sent_len = len(sentence_list)
    # Get padded word vectors
    x = encode_sentence(sentence)
    tags = models.tagger1.predict(x, batch_size=1)[0]
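    # Predictions cover the padded sequence; the slice below keeps only the last sent_len positions, i.e. the real tokens.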

    tags = tags[-sent_len:]
    pred_tags = decode_result(tags)
    entityList = []
    for i in xrange(len(pred_tags)):
        en = {}
        if 'B-' in pred_tags[i]:
            en[pred_tags[i][2:]] = None
            st = sentence_list[i]
            j = i + 1
Example #21
from wxconv import WXC
import argparse
import random

parser = argparse.ArgumentParser()
parser.add_argument('--format', type=str, help='format of the file text/ssf', required=True)
parser.add_argument('--input', type=str, help='input file to be converted', required=True)
parser.add_argument('--dist', type=int, nargs='+', default=[1, 0, 0], help='train:test_a:test_b')
args = parser.parse_args()
wxc = WXC(order='utf2wx')
if args.format == 'text':
    open('hin.text', 'w').write(wxc.convert(open(args.input).read()))
elif args.format == 'ssf':
    assert len(args.dist) == 3


    def tag_extract(string):
        tag_list = ['PERSON', 'ORGANIZATION', 'LOCATION', 'ENTERTAINMENT', 'FACILITIES', 'ARTIFACT', 'LIVTHINGS',
                    'LOCOMOTIVE', 'PLANTS', 'MATERIALS', 'DISEASE', 'O']
        for tag in tag_list:
            if tag in string:
                return tag


    def write_conll(sentences, output):
        f = open(output, 'w')
        for sentence in sentences:
            for word in sentence:
                f.write(word + '\n')
            f.write('\n')
        f.close()
Example #22
from wxconv import WXC

con = WXC(order='utf2wx')
con1 = WXC(order='wx2utf', lang='hin')

print(con.convert('लेस।'))
print(con1.convert('esehI'))
Example #23
import os
import csv
from tqdm import tqdm
import pprint
import pickle
import difflib
from wxconv import WXC

converter = WXC(order='utf2wx')


def get_anncorra(directory_path):
    marker_anncorra = {}
    anncorra_marker = {}

    print('Opening directory: ', directory_path)
    for filename in tqdm(os.listdir(directory_path)):
        filepath = os.path.join(directory_path, filename)

        with open(filepath, newline='') as csvfile:
            csvreader = csv.reader(csvfile, delimiter='\t')

            for row in csvreader:
                if not len(row) == 0:
                    marker = get_marker(row)
                    tag = get_tag(row)
                    increment_map(marker_anncorra, marker, tag)
                    increment_map(anncorra_marker, tag, marker)
    print('Got AnnCorra')

    return marker_anncorra, anncorra_marker