Python WXC.WXC примеры, wxconv.WXC.WXC Python примеры использования

Пример #1

0

Показать файл

Файл: test_wxconv.py Проект: shaileshjannu/NamedEntityRecognition

 def test_other(self):
     """Exercise Hindi conversion for the 'ssf', 'conll' and 'tnt' formats.

     For 'conll' and 'tnt' every line of the fixture file is converted
     UTF -> WX -> UTF -> WX and the two WX results must be equal.  For
     'ssf' each ``<Sentence ...>`` body is only run through the UTF -> WX
     converter; no assertion is made on the result.
     """
     shared = dict(lang='hin', ssf_type='intra', rmask=False)
     for fmt in ('ssf', 'conll', 'tnt'):
         to_wx = WXC(order='utf2wx', format_=fmt, **shared)
         to_utf = WXC(order='wx2utf', format_=fmt, **shared)
         fixture = '%s/%s/hin.%s' % (self.test_dir, fmt, fmt)
         with io.open(fixture, encoding='utf-8') as handle:
             if fmt != "ssf":
                 # Line-oriented formats: round-trip each line.
                 for raw in handle:
                     first = to_wx.convert(raw)
                     again = to_wx.convert(to_utf.convert(first))
                     self.assertEqual(first, again)
                 continue
             # SSF: pull out every sentence body and convert it.
             matches = re.finditer(
                 "(<Sentence id=.*?>)(.*?)</Sentence>", handle.read(), re.S)
             for match in matches:
                 to_wx.convert(match.group(2).strip())

Пример #2

0

Показать файл

Файл: test_wxconv.py Проект: shaileshjannu/NamedEntityRecognition

 def test_raw_text(self):
     """Round-trip every plain-text fixture line for each language.

     For each entry of ``self.languages`` the file
     ``<test_dir>/plain_text/<lang>.txt`` is read line by line; a line is
     converted UTF -> WX -> UTF -> WX and both WX results must be equal.
     """
     for code in self.languages:
         to_wx = WXC(order='utf2wx', lang=code)
         to_utf = WXC(order='wx2utf', lang=code)
         fixture = '%s/plain_text/%s.txt' % (self.test_dir, code)
         with io.open(fixture, encoding='utf-8') as handle:
             for raw in handle:
                 first = to_wx.convert(raw)
                 again = to_wx.convert(to_utf.convert(first))
                 self.assertEqual(first, again)

Пример #3

0

Показать файл

Файл: E_Modules.py Проект: PawarKishori/Alignment1

def wx_utf_converter(relation_df):
	"""Add a 'UTF_hindi' column holding the wx2utf conversion of each WORD.

	``relation_df.WORD`` is looked up once per label in ``relation_df.index``
	and converted with a Hindi ``wx2utf`` converter.  The frame is modified
	in place and also returned.
	"""
	to_utf = WXC(order='wx2utf', lang='hin')
	relation_df['UTF_hindi'] = [
		to_utf.convert(relation_df.WORD[label]) for label in relation_df.index
	]
	return relation_df

Пример #4

0

Показать файл

Файл: transliterate.py Проект: irshadbhat/csnli

    def __init__(self, model, lang, gpu=False, wx=False):
        """Set up language-specific resources and load the translation model.

        Args:
            model: stored on ``self.opt.model`` before the model is loaded.
            lang: language code; 'hin' and 'eng' get dedicated setup below.
                For any other value ``non_alpha``, ``alpha_letters`` and
                ``com_abbr`` are not set by this method.
            gpu: when truthy, ``self.opt.gpu`` is forced to 0.
            wx: stored as ``self.is_ip_wx``.
                NOTE(review): presumably "input is already WX" - confirm.
        """
        self.lang = lang
        self.is_ip_wx = wx
        # Build the option parser from the project's ``opts`` helpers.
        parser = argparse.ArgumentParser(
            description='transliterate.py',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        opts.add_md_help_argument(parser)
        opts.translate_opts(parser)

        # parse_args() with no arguments reads the process command line.
        self.opt = parser.parse_args()
        self.trans_dict = dict()
        self.broken_words = dict()
        # Directory of this source file; used to locate bundled resources.
        file_path = os.path.dirname(os.path.abspath(__file__))

        if self.lang == 'hin':
            self.to_utf = WXC(order='wx2utf', lang='hin')
            # Capturing group: re.split keeps the non-letter runs.
            self.non_alpha = re.compile(u'([^a-zA-Z]+)')
            self.alpha_letters = set(string.ascii_letters)
            # Hard-coded short forms mapped to lists of candidate strings.
            # NOTE(review): values look like WX-encoded Hindi - confirm.
            self.com_abbr = {
                'b': ['BI', 'be'],
                'd': ['xI', 'xe'],
                'g': ['jI'],
                'k': ['ke', 'ki', 'kI'],
                'h': ['hE', 'hEM'],
                'ha': ['hE', 'hEM'],
                'n': ['ina', 'ne'],
                'm': ['meM', 'mEM'],
                'p': ['pe'],
                'q': ['kyoM'],
                'r': ['Ora', 'ora'],
                's': ['isa', 'se'],
                'y': ['ye']
            }

        if self.lang == 'eng':
            self.non_alpha = re.compile(u'([^a-z]+)')
            # First 26 entries of ascii_letters are the lowercase letters.
            self.alpha_letters = set(string.ascii_letters[:26])
            # Each line: "<key> <v1>|<v2>|..." (exactly two whitespace-
            # separated fields, otherwise the unpacking below raises).
            with open('%s/extras/COMMON_ABBR.eng' % file_path) as fp:
                self.com_abbr = {}
                for line in fp:
                    k, v = line.split()
                    self.com_abbr[k] = v.split('|')

        # Second parser, parsed against an empty argv, so only the defaults
        # registered by opts.model_opts end up in dummy_opt.
        dummy_parser = argparse.ArgumentParser(description='train.py')
        opts.model_opts(dummy_parser)
        dummy_opt = dummy_parser.parse_known_args([])[0]
        if gpu:
            self.opt.gpu = 0

        # CUDA is enabled for any gpu id greater than -1.
        self.opt.cuda = self.opt.gpu > -1
        self.opt.model = model
        self.opt.n_best = 5
        self.opt.lang = lang
        if self.opt.cuda:
            torch.cuda.set_device(self.opt.gpu)

        # Load the model.
        self.fields, self.model, self.model_opt = onmt.ModelConstructor.load_test_model(
            self.opt, dummy_opt.__dict__)

Пример #5

0

Показать файл

Файл: convert_encoding.py Проект: zarmeen92/indic_tagger

def convert_encoding(filename, text_type, language, in_enc, out_enc):
    """Read ``filename`` (UTF-8) and return its text converted by WXC.

    Args:
        filename: path of the input file.
        text_type: one of "ssf", "tnt", "text" or "conll".
        language: passed to ``WXC`` as ``lang``.
        in_enc: source encoding name; combined with ``out_enc`` into the
            ``WXC`` ``order`` string "<in_enc>2<out_enc>".
        out_enc: target encoding name.

    Returns:
        The converted content as a single string.

    Raises:
        Exception: if ``text_type`` is not one of the supported formats.

    For "ssf" the file is processed line by line and only the word column
    (second field) of token lines is converted.  Lines with fewer than
    three fields that are not otherwise recognised are emitted unchanged
    (tab-joined) instead of raising ``IndexError``.
    """
    order = '%s2%s' % (in_enc, out_enc)

    with codecs.open(filename, 'r', encoding='utf-8') as fp:
        logger.info('Loading text_type: %s data', text_type)

        if text_type == "ssf":
            converter = WXC(order=order, lang=language)
            pieces = []
            for line in fp:
                line = line.strip()
                ds = line.split()
                if not ds:
                    # Blank line: keep the sentence/paragraph separator.
                    pieces.append(u"\n")
                elif line[0] == "<":
                    # Markup line such as <Sentence ...>: copy verbatim.
                    pieces.append(u"%s\n" % line)
                elif ds[0] == "))":
                    # Chunk close marker is re-indented with a tab.
                    pieces.append(u"\t%s\n" % line)
                elif ds[0] == "0" or len(ds) < 3 or ds[1] == "((":
                    # Chunk header, or a line too short to carry both a
                    # word and a tag: normalise separators, convert nothing.
                    pieces.append(u"%s\n" % u"\t".join(ds))
                else:
                    word_con = converter.convert(ds[1])
                    pieces.append(
                        u"%s\t%s\t%s\n" % (ds[0], word_con, ds[2]))
            return u"".join(pieces)

        if text_type == "tnt":
            converter = WXC(order=order, lang=language, format_='tnt')
        elif text_type == "text":
            converter = WXC(order=order, lang=language)
        elif text_type == "conll":
            converter = WXC(order=order, lang=language, format_='conll')
        else:
            raise Exception("Unknown Format %s" % text_type)

        # Non-SSF formats are converted in one call on the whole file.
        return converter.convert(fp.read())

Пример #6

0

Показать файл

Файл: Modules.py Проект: PawarKishori/Alignment1

def wx_utf_converter(relation_df):
	"""Add a 'UTF_hindi' column with the wx2utf conversion of each WORD.

	The converted list is printed before being assigned.  The frame is
	modified in place and also returned.
	"""
	to_utf = WXC(order='wx2utf', lang='hin')
	converted = [
		to_utf.convert(relation_df.WORD[label]) for label in relation_df.index
	]
	print(converted)
	relation_df['UTF_hindi'] = converted
	return relation_df

Пример #7

0

Показать файл

 def __init__(self, lid, htrans=None, etrans=None, wx=False):
     """Load spell-checker, language models, dictionaries and the LID model.

     Args:
         lid: passed to ``LID`` as ``model``.
         htrans: passed through to ``LID``.
         etrans: passed through to ``LID``.
         wx: stored as ``self.wx``; when falsy a ``wx2utf`` Hindi
             converter is created as ``self.wxc`` (the attribute is not
             set otherwise).
     """
     def load_dict(path):
         # Each line is "<key> <v1>|<v2>|..."; only the first two
         # whitespace-separated fields are used.  The file is closed as
         # soon as it has been read.
         table = {}
         with io.open(path) as fp:
             for entry in fp:
                 fields = entry.split()
                 table[fields[0]] = fields[1].split('|')
         return table

     self.ed = enchant.Dict('en')
     self.hblm = kenlm.LanguageModel('lm/hindi-n3-p5-lmplz.blm')
     self.eblm = kenlm.LanguageModel('lm/english-n3-p10-lmplz.blm')
     self.so_dec_eng = so_viterbi(self.eblm)
     self.so_dec_hin = so_viterbi(self.hblm)
     self.e2h = load_dict('dicts/ENG2HIN12M.dict')
     self.h2e = load_dict('dicts/HIN2ENG12M.dict')
     self.meta = Meta()
     self.lid = LID(model=lid, etrans=etrans, htrans=htrans)
     self.wx = wx
     if not self.wx:
         self.wxc = WXC(order='wx2utf', lang='hin')

Пример #8

0

Показать файл

Файл: hindi.py Проект: shaileshjannu/NamedEntityRecognition

from wxconv import WXC
import random

# Module-level converter created with order='utf2wx'; no ``lang`` is
# passed, so whatever default WXC uses applies (TODO confirm the default).
wxc = WXC(order='utf2wx')


def tag_extract(string):
    """Return the first known tag that occurs as a substring of ``string``.

    Tags are tried in the fixed order below, so 'O' is only returned when
    none of the longer tags matched.  Returns ``None`` when no tag occurs
    in ``string``.
    """
    known_tags = (
        'PERSON', 'ORGANIZATION', 'LOCATION', 'ENTERTAINMENT', 'FACILITIES',
        'ARTIFACT', 'LIVTHINGS', 'LOCOMOTIVE', 'PLANTS', 'MATERIALS',
        'DISEASE', 'O',
    )
    return next((tag for tag in known_tags if tag in string), None)


def write_conll(sentences, output):
    """Write ``sentences`` to the file ``output``, one word per line.

    Args:
        sentences: iterable of sentences, each an iterable of strings.
        output: path of the file to create or overwrite.

    Every word is followed by a newline and every sentence by one extra
    blank line.  The file is closed even if a write fails.
    """
    with open(output, 'w') as f:
        for sentence in sentences:
            for word in sentence:
                f.write(word + '\n')
            f.write('\n')


# Module-level state initialised before 'hindi-annotated-ssf-Form.txt' is
# read.  NOTE(review): presumably ``sentences`` collects finished sentences,
# ``sentence`` the one being built and ``ner_tag`` the current tag - the
# loop that uses them is not visible here, so confirm.
sentences = []
sentence = []
ner_tag = 'O'

for line in open('hindi-annotated-ssf-Form.txt'):

Пример #9

0

Показать файл

Файл: query_hindi.py Проект: PawarKishori/QSE

        f2.close()
    elif choice == '19':

        sentenceid = input("Enter the sentence id: ")
        spl = re.split(r'-', sentenceid)[0]
        f = open("./treebanks/UD_Hindi-HDTB/hi_hdtb-ud-" + spl + ".conllu",
                 "r").readlines()
        f1 = open("./conll/conll_" + sentenceid + ".dat", "w")
        n = len(f)

        for i in range(0, n):
            if sentenceid in f[i]:
                j = i + 2
                while (f[j] != '\n'):
                    g = re.split(r'\t', f[j].rstrip())
                    con = WXC(order='utf2wx')
                    g[1] = con.convert(g[1])
                    g[2] = "_"
                    g[4] = "_"
                    g[5] = "_"
                    g[8] = "_"
                    g[9] = "_"
                    for x in g:
                        f1.write(x + '\t')
                    f1.write('\n')
                    j = j + 1
                break

        f1.close()
        cmd = 'python3 Createdata.py ' + sentenceid
        os.system(cmd)

Пример #10

0

Показать файл

# Keep the samples whose 'rouge-3' novelty score is at least 25.
trigrams = [
    sample for sample in list_samples
    if (data[sample]['novelty']['rouge-3'] >= 25)
]
# NOTE(review): the threshold printed here is ``i`` (defined outside this
# fragment) while the filter above uses the literal 25 - confirm they agree.
print("Trigrams_Novelty_range >=%d --> #of samples = %d" % (i, len(trigrams)))
print("\n---------------------------------------\n")
#print(trigrams[:10])

# Draw ``n`` samples; the fixed seed makes the selection repeatable.
n = 5
import random
random.seed(42)
randome_samples = random.sample(trigrams, n)

### WX format: converter created with order='utf2wx' and lang='tel'.
from wxconv import WXC
con = WXC(order='utf2wx', lang='tel')

# Report the novelty scores of every randomly drawn sample.
for sample_num in randome_samples:

    # Novelty scores for the three ROUGE keys stored under 'novelty'.
    n3 = data[sample_num]['novelty']['rouge-3']
    n2 = data[sample_num]['novelty']['rouge-2']
    n1 = data[sample_num]['novelty']['rouge-1']

    # Copy-precision scores; read here but not used in the visible part of
    # the loop.
    p1 = data[sample_num]['copy_precision']['rouge-1']
    p2 = data[sample_num]['copy_precision']['rouge-2']
    p3 = data[sample_num]['copy_precision']['rouge-3']

    #print(sample_num, n2, n3)
    print(
        "Sample_num= %s, novelty_rouge1 = %.2f, novelty_rouge2 = %.2f, novelty_rouge3 = %.2f  "
        % (sample_num, n1, n2, n3))

Пример #11

0

Показать файл

import pprint
import string
import pickle
from wxconv import WXC

# Module-level converter (order='utf2wx', lang='ben') used by get_marker().
converter = WXC(order='utf2wx', lang='ben')


def increment_map(dictionary, key, value):
    """Increment the nested counter ``dictionary[key][value]`` by one.

    Missing outer keys and inner counters are created on demand (a new
    counter starts at 1).  ``dictionary`` is modified in place; nothing is
    returned.
    """
    counts = dictionary.setdefault(key, {})
    counts[value] = counts.get(value, 0) + 1


def get_marker(words):
    """Return the converted last element of ``words``, or "-" if too short.

    When ``words`` has more than one element its last element is passed
    through the module-level ``converter``; otherwise "-" is returned.
    """
    # TODO work on
    if len(words) <= 1:
        return "-"
    return converter.convert(words[-1])


def trim(dictionary, cutoff=3):

Пример #12

0

Показать файл

# coding=utf-8
from keras.models import load_model
from training.ner.get_word_vectors import get_word_vector, get_sentence_vectors
from keras.preprocessing import sequence
import numpy as np, pickle
import BotServiceConfig as env
from tornado import gen
import common.load_models as models
from wxconv import WXC

# Module-level converter created once at import time (order='wx2utf',
# lang='hin').
con = WXC(order='wx2utf', lang='hin')


@gen.coroutine
def predict_tags(sentence):
    sentence = str(sentence.encode('utf-8', 'ignore'))
    sentence_list = sentence.strip().split()
    sent_len = len(sentence_list)
    # Get padded word vectors
    x = encode_sentence(sentence)
    tags = models.tagger1.predict(x, batch_size=1)[0]

    tags = tags[-sent_len:]
    pred_tags = decode_result(tags)
    entityList = []
    for i in xrange(len(pred_tags)):
        en = {}
        if 'B-' in pred_tags[i]:
            en[pred_tags[i][2:]] = None
            st = sentence_list[i]
            j = i + 1

Пример #13

0

Показать файл

from wxconv import WXC

import warnings

# Silence every warning for the rest of the process.
warnings.filterwarnings('ignore')

# Command-line options come from the project's ``opts`` helpers.
parser = argparse.ArgumentParser(
    description='translate.py',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
opts.add_md_help_argument(parser)
opts.translate_opts(parser)

# parse_args() with no arguments reads the process command line.
opt = parser.parse_args()

# Hindi-only setup: ``to_wx``, ``non_alpha`` and ``alpha_letters`` are not
# defined at module level for any other ``opt.lang``.
if opt.lang == 'hin':
    to_wx = WXC(order='utf2wx', lang='hin')
    # Capturing group: re.split keeps the non-letter runs in its result.
    non_alpha = re.compile(u'([^a-zA-Z]+)')
    alpha_letters = set(string.ascii_letters)

# Module-level dictionaries; addone() below fills ``trans_dict``.
trans_dict = dict()
broken_words = dict()


def addone(text):
    for word in text.split():
        if opt.lang == 'hin' and (word[0] == word[-1] == '_'):
            trans_dict[word] = word[1:-1]
            continue
        if word in trans_dict:
            continue
        words = non_alpha.split(word)

Пример #14

0

Показать файл

Файл: Modules.py Проект: PawarKishori/Alignment1

def wx_utf_converter_sentence(sentence):
	"""Return ``sentence`` converted by a Hindi ``wx2utf`` WXC converter.

	A new converter is built on every call.
	"""
	return WXC(order='wx2utf', lang='hin').convert(sentence)

Пример #15

0

Показать файл

	def devnagari(self):
		"""Return ``self.text`` converted by a WXC built with order='wx2utf'."""
		return WXC(order='wx2utf').convert(self.text)

Пример #16

0

Показать файл

from wxconv import WXC

# Two converters, one per direction; only the wx2utf one names a language.
con = WXC(order='utf2wx')
con1 = WXC(order='wx2utf', lang='hin')

# Demo: print one conversion in each direction.
print(con.convert('लेस।'))
print(con1.convert('esehI'))

Пример #17

0

Показать файл

Файл: hindipreprocess.py Проект: zubairabid/Semester6

import os
import csv
from tqdm import tqdm
import pprint
import pickle
import difflib
from wxconv import WXC

# Module-level converter created with order='utf2wx'; no ``lang`` is
# passed, so whatever default WXC uses applies (TODO confirm the default).
converter = WXC(order='utf2wx')


def get_anncorra(directory_path):
    """Build marker/tag co-occurrence counts from the files in a directory.

    Every entry of ``directory_path`` is read as tab-delimited CSV.  For
    each non-empty row, ``get_marker(row)`` and ``get_tag(row)`` are
    computed and counted in both directions via ``increment_map``.

    Returns:
        A pair ``(marker -> tag counts, tag -> marker counts)``.
    """
    tags_by_marker = {}
    markers_by_tag = {}

    print('Opening directory: ', directory_path)
    for entry in tqdm(os.listdir(directory_path)):
        with open(os.path.join(directory_path, entry), newline='') as handle:
            for fields in csv.reader(handle, delimiter='\t'):
                if len(fields) == 0:
                    # Empty rows carry no annotation.
                    continue
                row_marker = get_marker(fields)
                row_tag = get_tag(fields)
                increment_map(tags_by_marker, row_marker, row_tag)
                increment_map(markers_by_tag, row_tag, row_marker)
    print('Got AnnCorra')

    return tags_by_marker, markers_by_tag

Python WXC.WXC примеры использования