def test_kbest(self):
    """Make sure `k-best` decoding works without failure."""
    k_best = range(2, 15)
    r2i = Transliterator(source='eng', target='hin', decode='beamsearch')
    i2r = Transliterator(source='hin', target='eng', decode='beamsearch')
    for k in k_best:
        hin = r2i.transform('indictrans', k_best=k)
        eng = i2r.transform(hin[0], k_best=k)
        self.assertTrue(len(hin) == k)
        self.assertTrue(len(eng) == k)
import random

from indictrans import Transliterator


class IndictransTransliterator:
    def __init__(self):
        self.trn = Transliterator(source='hin', target='eng',
                                  decode='beamsearch', build_lookup=True)
        self.trans_dict = {}

    def transliterate(self, original):
        transliterations = self.get_all(original)
        return random.choice(transliterations)

    def get_all(self, original):
        if original in self.trans_dict:
            return self.trans_dict[original]
        else:
            transliterations = self.trn.transform(original, k_best=5)
            self.trans_dict[original] = transliterations
            return transliterations

    @staticmethod
    def _is_deva(unicode_tok):
        """Return True if `unicode_tok` contains a Devanagari character."""
        for c in unicode_tok:
            if int('0900', 16) <= ord(c) <= int('097f', 16):
                return True
        return False
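# Usage sketch for the class above (illustrative, not from the original code):
# assumes the hin -> eng indictrans model files are available locally and that
# `random` and `Transliterator` are imported as above; the sample token is made up.
translit = IndictransTransliterator()
word = u"नमस्ते"  # sample Devanagari token
if IndictransTransliterator._is_deva(word):
    print(translit.get_all(word))        # 5-best romanizations, cached for repeat calls
    print(translit.transliterate(word))  # one of those candidates, chosen at random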
def predict():
    if request.method == 'POST':
        print(request.form.get('NewYork'))
        try:
            NewYork = request.form['NewYork']
            California = request.form['California']
            Florida = request.form['Florida']
            NewYorkstrip = NewYork.strip()
            NewYorkLower = NewYorkstrip.lower()
            with open("data/Alldata.csv", "r") as f:
                s = f.read()
            dic = ast.literal_eval(s)
            my_list = []
            words = NewYorkLower.split()
            # Build the transliterator once; source and target do not change per word.
            trn = Transliterator(source=California.strip(),
                                 target=Florida.strip(),
                                 build_lookup=True)
            for c in words:
                eng = trn.transform(c.lower())
                # Prefer the dictionary entry if present, otherwise the transliteration.
                my_list.append(dic.get(c, eng))
            listToStr = ' '.join(map(str, my_list))
        except ValueError:
            return "Please check if the values are entered correctly"
        return render_template('home.html', prediction=listToStr)
def transliterate():
    word = request.args.get('word', default="congress", type=str)
    trn = Transliterator(source='eng', target='hin',
                         build_lookup=True, decode='beamsearch')
    best_transliterated_list = trn.transform(word, k_best=5)
    return {"transliteration": best_transliterated_list}
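# Hedged sketch of exercising the route above with Flask's test client.
# The application object name `app` and the route path `/transliterate` are
# assumptions; neither appears in the snippet itself.
with app.test_client() as client:
    resp = client.get('/transliterate', query_string={'word': 'congress'})
    print(resp.get_json())  # {"transliteration": [...k-best Devanagari candidates...]}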
def read_kumaretal_2019_agg_downloads(path, mode, romanize=False):
    st_time = time()
    global FIELDS, MAX_CHAR_LEN
    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')
    n_trimmed, n_romanized = 0, 0
    Example = namedtuple(f"{mode}_example", FIELDS,
                         defaults=(None, ) * len(FIELDS))
    examples = []
    lines = read_csv_file(path, has_header=False)
    for i, line in enumerate(lines):
        uid, txt, label = line[0], line[1], line[2]
        if not txt:
            continue
        if romanize:
            new_txt = trn_hin2eng.transform(txt)
            if txt != new_txt:
                n_romanized += 1
                txt = new_txt
        new_txt = clean_generic(txt)
        if new_txt.strip() == "":
            new_txt = txt
        if len(new_txt) > MAX_CHAR_LEN:
            n_trimmed += 1
            newtokens, currsum = [], 0
            for tkn in new_txt.split():
                # 1 for space
                if currsum + len(tkn) + 1 <= MAX_CHAR_LEN:
                    newtokens.append(tkn)
                    currsum += len(tkn) + 1
                else:
                    break
            new_txt = " ".join(newtokens)
        example = Example(dataset="kumaretal_2019_agg",
                          task="classification",
                          split_type=mode,
                          uid=uid,
                          text=txt,
                          label=label,
                          text_pp=new_txt)
        examples.append(example)
        progress_bar(len(examples), len(lines), ["time"], [time() - st_time])
    if romanize:
        print(f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed} "
              f"and # of romanized instances: {n_romanized}")
    else:
        print(f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}")
    return examples
def test_rtrans(self):
    """Test Indic-to-Indic ML and rule-based models."""
    with io.open('%s/indic-test' % self.test_dir, encoding='utf-8') as fp:
        # first line contains language codes
        lang_codes = fp.readline().split()
        lang2word = dict(zip(lang_codes, [[] for i in range(len(lang_codes))]))
        for line in fp:
            line = line.split()
            for i, word in enumerate(line):
                lang2word[lang_codes[i]].append(word)
    for src in lang_codes:
        for trg in lang_codes:
            if src == trg:
                continue
            s2t_ml = Transliterator(source=src, target=trg, rb=False)
            s2t_rb = Transliterator(source=src, target=trg, rb=True)
            for word in lang2word[src]:
                s2t_ml.transform(word)
                s2t_rb.transform(word)
def test_ru2ind(self):
    """Test [Roman, Urdu]-to-Indic ML models."""
    for lang_pair in self.trg2src:
        src = lang_pair[0]
        trg = lang_pair[1]
        trans = Transliterator(source=src, target=trg)
        with io.open('%s/%s_%s.testpairs' % (self.test_dir, trg, src),
                     encoding='utf-8') as fp:
            for line in fp:
                expected, word = line.split()
                self.assertEqual(trans.transform(word), expected)
def test_ind2ru(self):
    """Test Indic-to-[Roman, Urdu] ML models."""
    for lang_pair in self.src2trg:
        src = lang_pair[0]
        trg = lang_pair[1]
        trans = Transliterator(source=src, target=trg)
        with io.open('%s/%s_%s.testpairs' % (self.test_dir, src, trg),
                     encoding='utf-8') as fp:
            for line in fp:
                word, expected = line.split()
                self.assertEqual(trans.transform(word), expected)
def read_iitp_product_reviews_hi_sa_downloads(path, mode):
    st_time = time()
    global FIELDS, MAX_CHAR_LEN
    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')
    n_trimmed = 0
    Example = namedtuple(f"{mode}_example", FIELDS,
                         defaults=(None, ) * len(FIELDS))
    examples = []
    lines = [line.strip() for line in open(path, "r")]
    for i, line in enumerate(lines):
        line = line.strip()
        if not line:
            continue
        vals = line.split(",")
        label = vals[0]
        txt = ",".join(vals[1:])
        txt = trn_hin2eng.transform(txt)
        new_txt = "".join([char for char in txt])
        if len(new_txt) > MAX_CHAR_LEN:
            n_trimmed += 1
            newtokens, currsum = [], 0
            for tkn in new_txt.split():
                # 1 for space
                if currsum + len(tkn) + 1 <= MAX_CHAR_LEN:
                    newtokens.append(tkn)
                    currsum += len(tkn) + 1
                else:
                    break
            new_txt = " ".join(newtokens)
        example = Example(dataset="iitp_product_reviews_hi_sa",
                          task="classification",
                          split_type=mode,
                          uid=len(examples),
                          text=txt,
                          label=label,
                          text_pp=new_txt)
        examples.append(example)
        progress_bar(len(examples), len(lines), ["time"], [time() - st_time])
    print(f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}")
    return examples
def read_hinglishpedia_downloads(path1, path2, mode, standardizing_tags={}):
    st_time = time()
    global FIELDS, MAX_CHAR_LEN
    n_trimmed = 0
    FIELDS += [fieldname for fieldname in ["tgt", ] if fieldname not in FIELDS]
    Example = namedtuple(f"{mode}_example", FIELDS,
                         defaults=(None, ) * len(FIELDS))
    examples = []
    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')
    txt_lines = [line.strip() for line in open(path1, "r")]
    tag_lines = [line.strip() for line in open(path2, "r")]
    for i, (txt, tags) in tqdm(enumerate(zip(txt_lines, tag_lines))):
        if not txt:
            continue
        txt = trn_hin2eng.transform(txt)
        example = Example(dataset="hinglishpedia",
                          task="classification",
                          split_type=mode,
                          uid=len(examples),
                          text=txt,
                          langids=" ".join([
                              standardizing_tags[lid]
                              if lid in standardizing_tags else "other"
                              for lid in tags.split()
                          ]),
                          text_pp=txt)
        examples.append(example)
        # progress_bar(len(examples), len(txt_lines), ["time"], [time() - st_time])
    print(f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}")
    return examples
def get_transliteration(vocab, headers):
    trans = {}
    if headers is None:
        # No API credentials: fall back to offline indictrans transliteration.
        trn = Transliterator(source='eng', target='hin', build_lookup=True)
        trans = {item: trn.transform(item) for item in vocab}
    else:
        # Microsoft Translator transliterate API: words are packed 50 per
        # request item and sent in batches of 500 words per POST.
        base_url = 'https://api.cognitive.microsofttranslator.com'
        path = '/transliterate?api-version=3.0&language=hi&fromScript=Latn&toScript=Deva'
        count = 0
        body = []
        constructed_url = base_url + path
        query = ''
        while count <= 6500:
            for i in range(count, (count + 500), 50):
                for j in range(i, i + 50):
                    query += vocab[j] + ' '
                body.append({'text': query.strip()})
                query = ''
            response = requests.post(constructed_url, headers=headers, json=body)
            result = response.json()
            for j, i in enumerate(result):
                trans.update({body[j]['text']: i['text']})
            body = []
            count += 500
        # Handle the remaining words past the last full batch.
        for i in range(count, len(vocab), 50):
            for j in range(i, i + 50):
                if j < len(vocab):
                    query += vocab[j] + ' '
            body.append({'text': query.strip()})
            query = ''
        response = requests.post(constructed_url, headers=headers, json=body)
        result = response.json()
        for j, i in enumerate(result):
            trans.update({body[j]['text']: i['text']})
    return trans
from indictrans import Transliterator

vocab = []
with open('all_roman.txt', 'r') as infile:
    con = infile.readlines()
    vocab = [x.strip('\n') for x in con]

trn = Transliterator(source='eng', target='hin')

with open('transliterations.txt', 'w+') as outfile:
    for word in vocab:
        deva = trn.transform(word)
        outfile.write(word + "\t" + deva + "\n")
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract review text, emojis and emoji sentiment.

    Takes a sequence of strings and produces a dict of values. Keys are
    `review`, `emojis`, and `emoji-sentiment`.
    """

    def __init__(self, lang='ta'):
        self.lang = lang
        self.normalizer = BaseNormalizer(lang)
        # This language map was created using Google's googletrans module.
        # Create the file alltextslang.txt by calling detect_lang_and_store
        # in feature_utils.py.
        self.lmap = self.load_language_maps(
            os.path.join(os.path.dirname(sys.path[0]),
                         '../resources/data/alltextslang.txt'))
        self.soundexer = Soundex()
        self.ta_trans = Transliterator(source='eng', target='tam', build_lookup=True)
        self.ml_trans = Transliterator(source='eng', target='mal', build_lookup=True)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.sym_spell.load_dictionary(
            '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
            term_index=0, count_index=1, separator="\t")
        super().__init__()

    def load_language_maps(self, mapfile):
        lmap = {}
        with open(mapfile, 'r') as mapf:
            for line in mapf:
                text, lang, conf = line.rstrip().split('\t')
                lmap[text] = (lang, float(conf))
        return lmap

    def get_language_tag(self, text):
        return self.lmap.get(text, ('unknown', 0.0))

    def fit(self, x, y=None):
        return self

    def transform(self, reviews):
        features = np.recarray(
            shape=(len(reviews), ),
            dtype=[
                ('review', object),
                ('emojis', object),
                ('emoji_sentiment', object),
                ('lang_tag', object),
                ('len_range', object),
                ('soundexes', object),
            ],
        )
        for i, review in enumerate(reviews):
            features['review'][i] = self.normalizer.normalize(text=review)
            emojis, sentiment = get_emojis_from_text(review)
            features['emojis'][i] = ' '.join(emojis)
            features['emoji_sentiment'][i] = sentiment
            lang, conf = self.get_language_tag(review.strip())
            if lang == self.lang or lang == (self.lang + 'en'):
                # google agrees with some confidence
                agreement = 1
            elif conf < 0.5:
                # google says not-tamil, but weakly
                agreement = 0.5
            else:
                # google clearly says not-tamil
                agreement = 0
            features['lang_tag'][i] = {'lang': lang, 'agreement': agreement}
            features['len_range'][i] = get_doc_len_range(review)
            if self.lang == 'ta':
                review_trans = self.ta_trans.transform(review)
                for word in review_trans.split():
                    suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST,
                                                        max_edit_distance=2,
                                                        include_unknown=True)
                    if len(suggestions) > 0 and suggestions[0].distance < 3:
                        print(word, suggestions[0].term)
                    # no match with dictionary; we need a more comprehensive
                    # dictionary plus phonetic similarity
            elif self.lang == 'ml':
                review_trans = self.ml_trans.transform(review)
            else:
                review_trans = review
            # TODO: introduce spell correction here for added normalisation
            # print(lang, review_trans)
            features['soundexes'][i] = ' '.join(
                [self.soundexer.soundex(word) for word in review_trans.split()])
        return features
class Vida:
    """
    Focused only on English-to-Indic transliteration across 15 Indic languages:

    Hindi (hin), Bengali (ben), Gujarati (guj), Punjabi (pun),
    Malayalam (mal), Kannada (kan), Tamil (tam), Telugu (tel),
    Oriya (ori), Marathi (mar), Assamese (ass), Konkani (kon),
    Bodo (bod), Nepali (nep), Urdu (urd), plus English (eng) as source.
    """

    languages = {
        "hin": "Hindi",
        "ben": "Bengali",
        "guj": "Gujarati",
        "pun": "Punjabi",
        "mal": "Malayalam",
        "kan": "Kannada",
        "tam": "Tamil",
        "tel": "Telugu",
        "ori": "Oriya",
        "mar": "Marathi",
        "ass": "Assamese",
        "kon": "Konkani",
        "bod": "Bodo",
        "nep": "Nepali",
        "urd": "Urdu",
        "eng": "English"
    }
    codes = languages.keys()

    @staticmethod
    def is_ascii(s):
        return all(ord(c) < 128 for c in s)

    def __init__(self, text, source, target):
        self.text = text
        self.source = source
        self.target = target
        self.isEngSource = (self.source == 'eng')
        if self.isEngSource:
            self.usdictionary = enchant.Dict("en_US")
            self.gbdictionary = enchant.Dict("en_GB")
        self.validated = self.source in self.codes and self.target in self.codes
        if self.validated:
            self.engine = Transliterator(source=self.source, target=self.target)

    def run(self):
        status = False
        message = "Couldn't transliterate the text."
        content = {}
        output = []
        if not self.validated:
            message = "Please provide languages and their code."
            output = self.text
        else:
            text = self.text.split()
            try:
                for index in xrange(len(text)):
                    word = text[index]
                    if not self.isEngSource:
                        word = word.decode('utf-8')
                        output.insert(
                            index, self.engine.transform(word).encode('utf-8'))
                    else:
                        if not Vida.is_ascii(word):
                            word = word.decode('utf-8')
                        if not self.usdictionary.check(word) and not self.gbdictionary.check(word):
                            output.insert(
                                index, self.engine.transform(word).encode('utf-8'))
                        else:
                            output.insert(index, word)
                status = True
                message = "Successfully transliterated the code."
            except UnicodeDecodeError, e:
                Repo.exception(e)
                message = "Couldn't decode the language properly."
            except IndexError, e:
                Repo.exception(e)
                message = "Couldn't properly frame the sentence."
            output = ' '.join(output)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import csv

from indictrans import Transliterator

trn = Transliterator(source='hin', target='eng', build_lookup=True)

with open('dataSet.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        hin = row[1].decode('utf-8').replace('\n', " ")
        eng = trn.transform(hin)
        print row[0], eng.encode('unicode-escape'), row[2]
fname = os.path.join(ud_dir, hi_tb, f)
print 'Reading ' + fname
ftrn = fname + '.en'
fout = codecs.open(ftrn, 'w', encoding='utf-8')
count = 0
with codecs.open(fname, encoding='utf-8') as fin:
    for line in fin:
        count += 1
        if count % 10000 == 0 and count > 0:
            print(count)
        tok = line.strip()
        if tok.startswith('#') or tok == '':
            # Comment and blank lines are copied through unchanged.
            fout.write(unicode(tok) + '\n')
            continue
        tok = tok.split('\t')
        if len(tok) == 1 or '.' in tok[0] or '-' in tok[0]:
            # Multiword-token and empty-node lines are also copied through
            # (rejoined with tabs rather than written as a Python list).
            fout.write('\t'.join(tok) + '\n')
            continue
        else:
            # `trn` is the Transliterator created earlier in the script.
            tok[1] = trn.transform(tok[1]) or '_'
            tok[2] = trn.transform(tok[2]) or '_'
            entry = '\t'.join(tok) + '\n'
            fout.write(unicode(entry))
fout.close()
def code_transliterate(self):
    trn = Transliterator(source='hin', target='eng', build_lookup=True)
    eng = trn.transform(self)
    return eng
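# Usage sketch (illustrative): despite the `self` parameter name, the function
# above simply transliterates whatever string it is given, so it can be called
# directly with a Devanagari token; the sample word is made up.
print(code_transliterate(u"किताब"))  # prints the romanized form of the input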
tk = Tokenizer(lang=s[:2])
tk_back = Tokenizer(lang=t[:2])

l = u"रज्ज के रुलाया"  # \nरज्ज के हंसाया\n\nमैंने दिल खो' के इश्क़ कमाया\n"
l = l.lower().strip()
lines = l.split("\n")
print(lines)

output = []
if flag == True:
    for l in lines:
        json = {}
        definitive = forward_transl_full.transform(l)
        json["text"] = definitive
        json["tokens"] = []
        tokens = []
        text_precedent = ""
        tokens = tk.tokenize(l)
        back_tokens = tk_back.tokenize(definitive)
        for i, t in enumerate(tokens):
            inner_json = {}
            choosen = back_tokens[i]