def test_other(self):
    for ext in ['ssf', 'conll', 'tnt']:
        wx_con = WXC(order='utf2wx', lang='hin', format_=ext,
                     ssf_type='intra', rmask=False)
        utf_con = WXC(order='wx2utf', lang='hin', format_=ext,
                      ssf_type='intra', rmask=False)
        with io.open('%s/%s/hin.%s' % (self.test_dir, ext, ext),
                     encoding='utf-8') as fp:
            if ext == "ssf":
                sentences = re.finditer(
                    "(<Sentence id=.*?>)(.*?)</Sentence>", fp.read(), re.S)
                for sid_sentence in sentences:
                    sentence = sid_sentence.group(2).strip()
                    wx = wx_con.convert(sentence)
            else:
                for line in fp:
                    wx = wx_con.convert(line)
                    utf = utf_con.convert(wx)
                    wx_ = wx_con.convert(utf)
                    self.assertEqual(wx, wx_)

def test_raw_text(self):
    for lang in self.languages:
        wx_con = WXC(order='utf2wx', lang=lang)
        utf_con = WXC(order='wx2utf', lang=lang)
        with io.open('%s/plain_text/%s.txt' % (self.test_dir, lang),
                     encoding='utf-8') as fp:
            for line in fp:
                wx = wx_con.convert(line)
                utf = utf_con.convert(wx)
                wx_ = wx_con.convert(utf)
                self.assertEqual(wx, wx_)
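Both tests assert the same stability property: converting WX to UTF and back reproduces the original WX string. A minimal standalone sketch of that invariant (assuming only that the wxconv package is installed):

from wxconv import WXC

u2w = WXC(order='utf2wx', lang='hin')
w2u = WXC(order='wx2utf', lang='hin')

wx = u2w.convert(u'हिन्दी')                 # Devanagari -> WX
assert wx == u2w.convert(w2u.convert(wx))  # WX -> UTF -> WX is stable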
Example #3
def wx_utf_converter(relation_df):
    # Convert each WX-encoded entry in the WORD column to UTF-8 Devanagari.
    con = WXC(order='wx2utf', lang='hin')
    relation_df['UTF_hindi'] = [con.convert(relation_df.WORD[i])
                                for i in relation_df.index]
    return relation_df
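A minimal usage sketch for the function above; the single-column DataFrame layout and the sample WX words are assumptions for illustration:

import pandas as pd

df = pd.DataFrame({'WORD': ['rAma', 'sIwA']})  # WX-encoded Hindi words
df = wx_utf_converter(df)
print(df['UTF_hindi'])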
Example #4
def convert_encoding(filename, text_type, language, in_enc, out_enc):

    with codecs.open(filename, 'r', encoding='utf-8') as fp:
        logger.info('Loading text_type: %s data' % text_type)
        output_data = ""
        if text_type == "ssf":
            converter = WXC(order='%s2%s' % (in_enc, out_enc), lang=language)
            for line in fp:
                line = line.strip()
                ds = line.split()
                if line == "":
                    output_data += u"\n"
                elif line[0] == "<":
                    output_data += u"%s\n" % line
                elif ds[0] == "))":
                    output_data += u"\t%s\n" % line
                elif ds[0] == "0" or ds[1] == "((":
                    output_data += u"%s\n" % u"\t".join(ds)
                elif ds[1] != "":
                    word, tag = ds[1], ds[2]
                    word_con = converter.convert(word)
                    output_data += u"%s\t%s\t%s\n" % (ds[0], word_con, tag)
            return output_data
        else:
            if text_type == "tnt":
                converter = WXC(order='%s2%s' % (in_enc, out_enc),
                                lang=language,
                                format_='tnt')
            elif text_type == "text":
                converter = WXC(order='%s2%s' % (in_enc, out_enc),
                                lang=language)
            elif text_type == "conll":
                converter = WXC(order='%s2%s' % (in_enc, out_enc),
                                lang=language,
                                format_='conll')
            else:
                raise Exception("Unknown format: %s" % text_type)
        text = fp.read()
        output_data = converter.convert(text)
        return output_data
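A hypothetical call to the function above (the file name is a placeholder):

utf_data = convert_encoding('hin.conll', 'conll', 'hin', 'wx', 'utf')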
Example #6
class ThreeStepDecoding(object):
    def __init__(self, lid, htrans=None, etrans=None, wx=False):
        self.ed = enchant.Dict('en')
        self.hblm = kenlm.LanguageModel('lm/hindi-n3-p5-lmplz.blm')
        self.eblm = kenlm.LanguageModel('lm/english-n3-p10-lmplz.blm')
        self.so_dec_eng = so_viterbi(self.eblm)
        self.so_dec_hin = so_viterbi(self.hblm)
        self.e2h = {kv.split()[0]:kv.split()[1].split('|') for kv in io.open('dicts/ENG2HIN12M.dict')}
        self.h2e = {kv.split()[0]:kv.split()[1].split('|') for kv in io.open('dicts/HIN2ENG12M.dict')}
        self.meta = Meta()
        self.lid = LID(model=lid, etrans=etrans, htrans=htrans)
        self.wx = wx
        if not self.wx:
            self.wxc = WXC(order='wx2utf', lang='hin')

    def max_likelihood(self, n_sentence, target, k_best=7):
        if target == 'en':
            auto_tags = self.so_dec_eng.decode(n_sentence, len(n_sentence), k_best)
        else:
            auto_tags = self.so_dec_hin.decode(n_sentence, len(n_sentence), k_best)
        # beam search over the k-best decoded tag sequences
        best_sen = [n_sentence[idx][at] for idx, at in enumerate(auto_tags)]
        return best_sen

    def decode(self, words, ltags):
        words = [re.sub(r'([a-z])\1\1+', r'\1', w) for w in words]
        hi_trellis = [self.lid.htrans.get(wi.lower(), [wi]*5)[:5] +
                      self.e2h.get(wi.lower() if li == 'en' else None, [u'_%s_' %wi])[:1] +
                      [u'_%s_'%wi] for wi,li in zip(words, ltags)]
        hi_mono = self.max_likelihood(hi_trellis, 'hi')
        en_trellis = [[wi] + self.lid.etrans.get(wi.lower(), [wi]*5)[:5] +
                      self.h2e.get(wh if li == 'hi' else None, [wi])[:1]
                      for wi, wh, li in zip(words, hi_mono, ltags)]
        en_mono = self.max_likelihood(en_trellis, 'en')
        out = hi_mono[:]
        for i, _ in enumerate(hi_mono):
            if ltags[i] in ['univ', 'acro', 'ne']:
                out[i] = words[i]
            elif ltags[i] in ['en', 'ne']:
                if words[i].lower() == en_mono[i]:
                    out[i] = words[i]
                elif self.ed.check(words[i].lower()) and len(words[i])>1:
                    out[i] = words[i]
                elif words[i].lower() in ['a', 'i']:
                    out[i] = words[i]
                else:
                    out[i] = en_mono[i]
            elif not self.wx:
                out[i] = self.wxc.convert(out[i])
        return out

    def tag_sent(self, sent, trans=True):
        sent = sent.split()
        sent, ltags = zip(*self.lid.tag_sent(sent))
        dec = self.decode(sent, ltags)
        return zip(sent, dec, ltags)
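A hypothetical driver for the class; every model and transliteration path below is a placeholder, and the class additionally expects the lm/ and dicts/ files hard-coded in __init__ to exist on disk:

tsd = ThreeStepDecoding('models/lid.clf',
                        htrans='models/hindi_transliterations.txt',
                        etrans='models/english_transliterations.txt')
for word, decoded, tag in tsd.tag_sent(u'yeh movie bahut acchi hai'):
    print(word, decoded, tag)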
Example #7
from wxconv import WXC

con = WXC(order='utf2wx')
con1 = WXC(order='wx2utf', lang='hin')

print(con.convert('लेस।'))    # Devanagari -> WX
print(con1.convert('esehI'))  # WX -> Devanagari
Example #8
def wx_utf_converter_sentence(sentence):
    # Convert a whole WX-encoded sentence to UTF-8 Devanagari.
    con = WXC(order='wx2utf', lang='hin')
    return con.convert(sentence)
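For example (the WX input string is illustrative):

print(wx_utf_converter_sentence('rAma Gara gayA'))  # expected: राम घर गया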
Example #9
    if line.startswith('<Sentence'):
        sentence = []
    elif line.startswith('</Sentence'):
        sentences.append(sentence)
    elif line.startswith('<ENAMEX'):
        ner_tag = tag_extract(line)
    else:
        line = line.split()
        if len(line) == 0:
            continue
        try:
            index = float(line[0])
            if index != int(index):
                sentence.append(
                    wxc.convert(line[1]) + ' ' + line[2] + ' ' + '.' + ' ' +
                    ner_tag)
        except ValueError:
            pass
        ner_tag = 'O'

random.shuffle(sentences)
# Slice indices must be ints; 0.8 * len(...) // 1 is still a float.
train_n = int(0.8 * len(sentences))
test_a_n = int(0.15 * len(sentences))
train = sentences[:train_n]
test_a = sentences[train_n:train_n + test_a_n]
test_b = sentences[train_n + test_a_n:]
write_conll(train, 'hin.train')
write_conll(test_a, 'hin.test_a')
write_conll(test_b, 'hin.test_b')
Example #10
    elif choice == '19':

        sentenceid = input("Enter the sentence id: ")
        spl = re.split(r'-', sentenceid)[0]
        f = open("./treebanks/UD_Hindi-HDTB/hi_hdtb-ud-" + spl + ".conllu",
                 "r").readlines()
        f1 = open("./conll/conll_" + sentenceid + ".dat", "w")
        n = len(f)
        con = WXC(order='utf2wx')

        for i in range(0, n):
            if sentenceid in f[i]:
                j = i + 2
                while f[j] != '\n':
                    g = re.split(r'\t', f[j].rstrip())
                    g[1] = con.convert(g[1])  # FORM column: UTF -> WX
                    # Blank out LEMMA, XPOS, FEATS, DEPS and MISC columns.
                    g[2] = g[4] = g[5] = g[8] = g[9] = "_"
                    f1.write('\t'.join(g) + '\n')
                    j += 1
                break

        f1.close()
        cmd = 'python3 Createdata.py ' + sentenceid
        os.system(cmd)
    elif choice == '20':
Example #11
	def devnagari(self):
		con = WXC(order='wx2utf')
		return con.convert(self.text)
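A sketch of the kind of class this method presumably belongs to; the class name and the text attribute are assumptions:

class Token:
    def __init__(self, text):
        self.text = text  # WX-encoded string

    def devnagari(self):
        con = WXC(order='wx2utf')
        return con.convert(self.text)

print(Token('BArawa').devnagari())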
Example #12
from wxconv import WXC
import argparse
import random

parser = argparse.ArgumentParser()
parser.add_argument('--format', type=str, help='format of the file text/ssf', required=True)
parser.add_argument('--input', type=str, help='input file to be converted', required=True)
parser.add_argument('--dist', type=int, nargs='+', default=[1, 0, 0], help='train:test_a:test_b')
args = parser.parse_args()
wxc = WXC(order='utf2wx')
if args.format == 'text':
    open('hin.text', 'w').write(wxc.convert(open(args.input).read()))
elif args.format == 'ssf':
    assert len(args.dist) == 3


    def tag_extract(string):
        tag_list = ['PERSON', 'ORGANIZATION', 'LOCATION', 'ENTERTAINMENT', 'FACILITIES', 'ARTIFACT', 'LIVTHINGS',
                    'LOCOMOTIVE', 'PLANTS', 'MATERIALS', 'DISEASE', 'O']
        for tag in tag_list:
            if tag in string:
                return tag


    def write_conll(sentences, output):
        with open(output, 'w') as f:
            for sentence in sentences:
                for word in sentence:
                    f.write(word + '\n')
                f.write('\n')
Example #13
sentences = []
sentence = []
ner_tag = 'O'

for line in open('hindi-annotated-ssf-Form.txt'):
    if line.startswith('<Sentence'):
        sentence = []
    elif line.startswith('</Sentence'):
        sentences.append(sentence)
    elif line.startswith('<ENAMEX'):
        ner_tag = tag_extract(line)
    else:
        line = line.split()
        if len(line) == 0:
            continue
        try:
            index = float(line[0])
            if index != int(index):
                sentence.append(wxc.convert(line[1]) + ' ' + line[2] + ' ' + '.' + ' ' + ner_tag)
        except ValueError:
            pass
        ner_tag = 'O'

random.shuffle(sentences)
train_n = int(0.6 * len(sentences))
test_n = int(0.2 * len(sentences))
train = sentences[:train_n]
test = sentences[train_n:train_n + test_n]
dev = sentences[train_n + test_n:]
write_conll(train, 'train')
write_conll(test, 'test')
write_conll(dev, 'dev')
Example #14
    p2 = data[sample_num]['copy_precision']['rouge-2']
    p3 = data[sample_num]['copy_precision']['rouge-3']

    print(
        "Sample_num= %s, novelty_rouge1 = %.2f, novelty_rouge2 = %.2f, novelty_rouge3 = %.2f  "
        % (sample_num, n1, n2, n3))
    print("article: ")
    print(corpus_info[sample_num]['article']['content'])
    print("\n")
    print("summary: ")
    print(corpus_info[sample_num]['summary']['content'])
    print("\n---------------- WX format ------------------\n")
    print("Article:")
    print(con.convert(corpus_info[sample_num]['article']['content']))
    print("\n")
    print("Summary:")
    print(con.convert(corpus_info[sample_num]['summary']['content']))
    print("\n=============================================================\n")


import numpy as np
from scipy import stats

####### Novelty checking:

params = ['novelty', 'copy_precision', 'copy_recall']

for params_key in catergorized_info.keys():
Example #15
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from wxconv import WXC
import sys

fil = sys.argv[1]
with open(fil, 'r', encoding='utf-8') as f:
    data_transcripts = f.read().splitlines()

con = WXC(order='utf2wx')
data_WX = [con.convert(row) for row in data_transcripts]

print(data_WX)
with open('./output_WX.txt', 'w', encoding='utf-8') as outfile:
    for wd in data_WX:
        outfile.write(wd + '\n')

### You would need to manually remove word-final schwa and nukta from the generated output. ######
Example #16
    # The output is converted to SSF by the second type of
    # converter and stored in SSF.txt.
    ssf_converter2.func()
    print(main_format_data)

    for j in range(len(main_format_data)):

        if main_format_data[j][1] == 'open_bracket_here':
            continue

        main_format_data[j][5] = con1.convert(
            lexical.convertBhoj(con.convert(main_format_data[j][5]),
                                match_list[(j // 2)]))
        print(main_format_data[j][5])

    ssf_converter.out_temp_file.write(
        '\t\t***Output after Lexical Generator***\n\n')

    main_format_writer(main_format_data)

    ssf_converter2.func()
    print(main_format_data)