Example #1
def test_kbest(self):
    """Make sure `k_best` decoding works without failure."""
    r2i = Transliterator(source='eng', target='hin', decode='beamsearch')
    i2r = Transliterator(source='hin', target='eng', decode='beamsearch')
    for k in range(2, 15):
        hin = r2i.transform('indictrans', k_best=k)
        eng = i2r.transform(hin[0], k_best=k)
        self.assertEqual(len(hin), k)
        self.assertEqual(len(eng), k)
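
# A minimal standalone sketch of the k-best API the test above exercises;
# 'namaste' is an arbitrary sample token, and transform(..., k_best=k)
# returns a list of k candidate transliterations.
from indictrans import Transliterator

r2i = Transliterator(source='eng', target='hin', decode='beamsearch')
candidates = r2i.transform('namaste', k_best=5)
print(candidates)  # the test above treats candidates[0] as the top candidate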
Example #2
import random

from indictrans import Transliterator


class IndictransTransliterator:
    def __init__(self):
        self.trn = Transliterator(source='hin',
                                  target='eng',
                                  decode='beamsearch',
                                  build_lookup=True)
        self.trans_dict = {}

    def transliterate(self, original):
        transliterations = self.get_all(original)
        return random.choice(transliterations)

    def get_all(self, original):
        if original in self.trans_dict:
            return self.trans_dict[original]
        else:
            transliterations = self.trn.transform(original, k_best=5)
            self.trans_dict[original] = transliterations
            return transliterations

    @staticmethod
    def _is_deva(unicode_tok):
        """Returns True if |unicode_tok| contains a Devanagari character"""
        for c in unicode_tok:
            if 0x0900 <= ord(c) <= 0x097F:  # Devanagari block
                return True
        return False
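
# Usage sketch for the wrapper above; the Devanagari sample word is arbitrary
# and the indictrans model files are assumed to be installed.
translit = IndictransTransliterator()
print(translit.get_all(u'मकान'))          # all five cached beam-search candidates
print(translit.transliterate(u'मकान'))    # one candidate picked at random
print(IndictransTransliterator._is_deva(u'मकान'))  # True: contains Devanagari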
Example #3
import ast

from flask import render_template, request
from indictrans import Transliterator


def predict():
    listToStr = ''
    if request.method == 'POST':
        try:
            # The oddly named form fields appear to carry: NewYork = the
            # input text, California = the source language code, and
            # Florida = the target language code.
            NewYork = request.form['NewYork']
            California = request.form['California']
            Florida = request.form['Florida']
            NewYorkLower = NewYork.strip().lower()

            # Alldata.csv holds a dict literal mapping known words to
            # their transliterations.
            with open("data/Alldata.csv", "r") as f:
                dic = ast.literal_eval(f.read())

            # Build the transliterator once, not once per token.
            trn = Transliterator(source=California.strip(),
                                 target=Florida.strip(),
                                 build_lookup=True)
            my_list = []
            for c in NewYorkLower.split():
                eng = trn.transform(c)
                my_list.append(dic.get(c, eng))

            listToStr = ' '.join(map(str, my_list))
        except ValueError:
            return "Please check if the values are entered correctly"
    return render_template('home.html', prediction=listToStr)
Example #4
from flask import request
from indictrans import Transliterator


def transliterate():
    word = request.args.get('word', default="congress", type=str)
    trn = Transliterator(source='eng',
                         target='hin',
                         build_lookup=True,
                         decode='beamsearch')
    best_transliterated_list = trn.transform(word, k_best=5)
    return {"transliteration": best_transliterated_list}
Example #5
def read_kumaretal_2019_agg_downloads(path, mode, romanize=False):
    st_time = time()

    global FIELDS, MAX_CHAR_LEN

    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')

    n_trimmed, n_romanized = 0, 0
    Example = namedtuple(f"{mode}_example",
                         FIELDS,
                         defaults=(None, ) * len(FIELDS))
    examples = []
    lines = read_csv_file(path, has_header=False)
    for i, line in enumerate(lines):
        uid, txt, label = line[0], line[1], line[2]
        if not txt:
            continue
        if romanize:
            new_txt = trn_hin2eng.transform(txt)
            if txt != new_txt:
                n_romanized += 1
            txt = new_txt
        new_txt = clean_generic(txt)
        if new_txt.strip() == "":
            new_txt = txt
        if len(new_txt) > MAX_CHAR_LEN:
            n_trimmed += 1
            newtokens, currsum = [], 0
            for tkn in new_txt.split():  # the +1 below accounts for the joining space
                if currsum + len(tkn) + 1 <= MAX_CHAR_LEN:
                    newtokens.append(tkn)
                    currsum += len(tkn) + 1
                else:
                    break
            new_txt = " ".join(newtokens)
        example = Example(dataset="kumaretal_2019_agg",
                          task="classification",
                          split_type=mode,
                          uid=uid,
                          text=txt,
                          label=label,
                          text_pp=new_txt)
        examples.append(example)
        progress_bar(len(examples), len(lines), ["time"], [time() - st_time])

    if romanize:
        print(
            f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed} "
            f"and # of romanized instances: {n_romanized}")
    else:
        print(
            f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}"
        )

    return examples
Example #6
def test_rtrans(self):
    """Test Indic-to-Indic ML and rule-based models."""
    with io.open('%s/indic-test' % self.test_dir, encoding='utf-8') as fp:
        # The first line contains the language codes.
        lang_codes = fp.readline().split()
        lang2word = dict(
            zip(lang_codes, [[] for _ in range(len(lang_codes))]))
        for line in fp:
            line = line.split()
            for i, word in enumerate(line):
                lang2word[lang_codes[i]].append(word)
    for src in lang_codes:
        for trg in lang_codes:
            if src == trg:
                continue
            s2t_ml = Transliterator(source=src, target=trg, rb=False)
            s2t_rb = Transliterator(source=src, target=trg, rb=True)
            for word in lang2word[src]:
                s2t_ml.transform(word)
                s2t_rb.transform(word)
Example #7
def test_ru2ind(self):
    """Test [Roman, Urdu]-to-Indic ML models."""
    for src, trg in self.trg2src:
        trans = Transliterator(source=src, target=trg)
        with io.open('%s/%s_%s.testpairs' % (self.test_dir, trg, src),
                     encoding='utf-8') as fp:
            for line in fp:
                expected, word = line.split()
                self.assertEqual(trans.transform(word), expected)
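
# Judging from the reader above, each *.testpairs file is assumed to hold
# one whitespace-separated pair per line, expected target form first, e.g.:
#
#     कांग्रेस congress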
Example #8
def test_ind2ru(self):
    """Test Indic-to-[Roman, Urdu] ML models."""
    for src, trg in self.src2trg:
        trans = Transliterator(source=src, target=trg)
        with io.open('%s/%s_%s.testpairs' % (self.test_dir, src, trg),
                     encoding='utf-8') as fp:
            for line in fp:
                word, expected = line.split()
                self.assertEqual(trans.transform(word), expected)
Example #9
def read_iitp_product_reviews_hi_sa_downloads(path, mode):
    st_time = time()

    global FIELDS, MAX_CHAR_LEN

    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')

    n_trimmed = 0
    Example = namedtuple(f"{mode}_example",
                         FIELDS,
                         defaults=(None, ) * len(FIELDS))
    examples = []
    with open(path, "r") as f:
        lines = [line.strip() for line in f]
    for i, line in enumerate(lines):
        line = line.strip()
        if not line:
            continue
        vals = line.split(",")
        label = vals[0]
        txt = ",".join(vals[1:])
        txt = trn_hin2eng.transform(txt)
        new_txt = "".join([char for char in txt])
        if len(new_txt) > MAX_CHAR_LEN:
            n_trimmed += 1
            newtokens, currsum = [], 0
            for tkn in new_txt.split():  # the +1 below accounts for the joining space
                if currsum + len(tkn) + 1 <= MAX_CHAR_LEN:
                    newtokens.append(tkn)
                    currsum += len(tkn) + 1
                else:
                    break
            new_txt = " ".join(newtokens)
        example = Example(dataset="iitp_product_reviews_hi_sa",
                          task="classification",
                          split_type=mode,
                          uid=len(examples),
                          text=txt,
                          label=label,
                          text_pp=new_txt)
        examples.append(example)
        progress_bar(len(examples), len(lines), ["time"], [time() - st_time])
    print(
        f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}"
    )
    return examples
Example #10
def read_hinglishpedia_downloads(path1, path2, mode, standardizing_tags=None):
    st_time = time()

    global FIELDS, MAX_CHAR_LEN

    # Guard against the mutable-default-argument pitfall.
    standardizing_tags = standardizing_tags or {}
    n_trimmed = 0
    FIELDS += [
        fieldname for fieldname in [
            "tgt",
        ] if fieldname not in FIELDS
    ]
    Example = namedtuple(f"{mode}_example",
                         FIELDS,
                         defaults=(None, ) * len(FIELDS))
    examples = []

    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')

    with open(path1, "r") as f1, open(path2, "r") as f2:
        txt_lines = [line.strip() for line in f1]
        tag_lines = [line.strip() for line in f2]

    for i, (txt, tags) in tqdm(enumerate(zip(txt_lines, tag_lines))):
        if not txt:
            continue
        txt = trn_hin2eng.transform(txt)
        example = Example(dataset="hinglishpedia",
                          task="classification",
                          split_type=mode,
                          uid=len(examples),
                          text=txt,
                          langids=" ".join([
                              standardizing_tags[lid]
                              if lid in standardizing_tags else "other"
                              for lid in tags.split()
                          ]),
                          text_pp=txt)
        examples.append(example)
        # progress_bar(len(examples), len(txt_lines), ["time"], [time() - st_time])
    print(
        f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}"
    )
    return examples
Example #11
import requests

from indictrans import Transliterator


def get_transliteration(vocab, headers):
    trans = {}
    if headers is None:
        trn = Transliterator(source='eng', target='hin', build_lookup=True)
        trans = {item: trn.transform(item) for item in vocab}
    else:
        base_url = 'https://api.cognitive.microsofttranslator.com'
        path = '/transliterate?api-version=3.0&language=hi&fromScript=Latn&toScript=Deva'
        constructed_url = base_url + path
        # Batch the vocabulary: 50 words per query string, 10 queries
        # (500 words) per POST request, for any vocabulary size.
        for start in range(0, len(vocab), 500):
            body = []
            for i in range(start, min(start + 500, len(vocab)), 50):
                chunk = vocab[i:i + 50]
                body.append({'text': ' '.join(chunk)})
            response = requests.post(constructed_url,
                                     headers=headers,
                                     json=body)
            result = response.json()
            for j, item in enumerate(result):
                trans.update({body[j]['text']: item['text']})

    return trans
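
# Offline usage sketch: headers=None takes the local indictrans path, so no
# network access is needed (the indictrans models are assumed installed).
print(get_transliteration(['namaste', 'duniya'], headers=None))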
Example #12
from indictrans import Transliterator

with open('all_roman.txt', 'r') as infile:
    vocab = [line.rstrip('\n') for line in infile]

trn = Transliterator(source='eng', target='hin')

with open('transliterations.txt', 'w+') as outfile:
    for word in vocab:
        deva = trn.transform(word)
        outfile.write(word + "\t" + deva + "\n")
Example #13
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract review text, emojis and emoji sentiment.

    Takes a sequence of strings and produces a dict of values.  Keys are
    `review`, `emojis`, and `emoji-sentiment`.
    """
    def __init__(self, lang='ta'):
        self.lang = lang
        self.normalizer = BaseNormalizer(lang)
        # This language map was created with Google's googletrans module;
        # build alltextslang.txt by calling detect_lang_and_store in
        # feature_utils.py.
        self.lmap = self.load_language_maps(
            os.path.join(os.path.dirname(sys.path[0]),
                         '../resources/data/alltextslang.txt'))
        self.soundexer = Soundex()
        self.ta_trans = Transliterator(source='eng',
                                       target='tam',
                                       build_lookup=True)
        self.ml_trans = Transliterator(source='eng',
                                       target='mal',
                                       build_lookup=True)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.sym_spell.load_dictionary(
            '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
            term_index=0,
            count_index=1,
            separator="\t")
        super().__init__()

    def load_language_maps(self, mapfile):
        lmap = {}
        with open(mapfile, 'r') as mapf:
            for line in mapf:
                text, lang, conf = line.rstrip().split('\t')
                lmap[text] = (lang, float(conf))
        return lmap

    def get_language_tag(self, text):
        return self.lmap.get(text, ('unknown', 0.0))

    def fit(self, x, y=None):
        return self

    def transform(self, reviews):
        features = np.recarray(
            shape=(len(reviews), ),
            dtype=[
                ('review', object),
                ('emojis', object),
                ('emoji_sentiment', object),
                ('lang_tag', object),
                ('len_range', object),
                ('soundexes', object),
            ],
        )
        for i, review in enumerate(reviews):
            features['review'][i] = self.normalizer.normalize(text=review)

            emojis, sentiment = get_emojis_from_text(review)
            features['emojis'][i] = ' '.join(emojis)
            features['emoji_sentiment'][i] = sentiment

            lang, conf = self.get_language_tag(review.strip())
            if lang == self.lang or lang == (self.lang + 'en'):
                # Google agrees with some confidence.
                agreement = 1
            elif conf < 0.5:
                # Google detects a different language, but only weakly.
                agreement = 0.5
            else:
                # Google clearly detects a different language.
                agreement = 0
            features['lang_tag'][i] = {'lang': lang, 'agreement': agreement}
            features['len_range'][i] = get_doc_len_range(review)
            if self.lang == 'ta':
                review_trans = self.ta_trans.transform(review)
                for word in review_trans.split():
                    suggestions = self.sym_spell.lookup(word,
                                                        Verbosity.CLOSEST,
                                                        max_edit_distance=2,
                                                        include_unknown=True)
                    if len(suggestions) > 0 and suggestions[0].distance < 3:
                        print(word, suggestions[0].term)
                        # TODO: a more comprehensive dictionary plus phonetic
                        # similarity is needed for words with no close match.
            elif self.lang == 'ml':
                review_trans = self.ml_trans.transform(review)
            else:
                review_trans = review
            # TODO: introduce spell correct here for added normalisation
            # print(lang, review_trans)
            features['soundexes'][i] = ' '.join([
                self.soundexer.soundex(word) for word in review_trans.split()
            ])
        return features
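
# Usage sketch (assumes the resource files referenced in __init__ exist
# locally; the review text is an arbitrary romanized sample):
fx = FeatureExtractor(lang='ta')
feats = fx.transform(['padam semma mass'])
print(feats['lang_tag'][0], feats['soundexes'][0])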
Example #14
class Vida:
    """English-to-Indic transliteration across 15 Indic languages.

    Supported languages: Hindi (hin), Bengali (ben), Gujarati (guj),
    Punjabi (pun), Malayalam (mal), Kannada (kan), Tamil (tam),
    Telugu (tel), Oriya (ori), Marathi (mar), Assamese (ass),
    Konkani (kon), Bodo (bod), Nepali (nep), Urdu (urd), English (eng).
    """

    languages = {
        "hin": "Hindi",
        "ben": "Bengali",
        "guj": "Gujarati",
        "pun": "Punjabi",
        "mal": "Malayalam",
        "kan": "Kannada",
        "tam": "Tamil",
        "tel": "Telugu",
        "ori": "Oriya",
        "mar": "Marathi",
        "ass": "Assamese",
        "kon": "Konkani",
        "bod": "Bodo",
        "nep": "Nepali",
        "urd": "Urdu",
        "eng": "English"
    }

    codes = languages.keys()

    @staticmethod
    def is_ascii(s):
        return all(ord(c) < 128 for c in s)

    def __init__(self, text, source, target):
        self.text = text
        self.source = source
        self.target = target
        self.isEngSource = (self.source == 'eng')
        if self.isEngSource:
            self.usdictionary = enchant.Dict("en_US")
            self.gbdictionary = enchant.Dict("en_GB")
        self.validated = self.source in self.codes and self.target in self.codes
        if self.validated:
            self.engine = Transliterator(source=self.source,
                                         target=self.target)

    def run(self):
        status = False
        message = "Couldn't transliterate the text."
        content = {}
        output = []
        if not self.validated:
            message = "Please provide languages and their code."
            output = self.text
        else:
            text = self.text.split()
            try:
                for index in xrange(len(text)):
                    word = text[index]
                    if not self.isEngSource:
                        word = word.decode('utf-8')
                        output.insert(
                            index,
                            self.engine.transform(word).encode('utf-8'))
                    else:
                        if not Vida.is_ascii(word):
                            word = word.decode('utf-8')
                        if not self.usdictionary.check(
                                word) and not self.gbdictionary.check(word):
                            output.insert(
                                index,
                                self.engine.transform(word).encode('utf-8'))
                        else:
                            output.insert(index, word)
                status = True
                message = "Succesfully transliterated the code."
            except UnicodeDecodeError as e:
                Repo.exception(e)
                message = "Couldn't decode the language properly."
            except IndexError as e:
                Repo.exception(e)
                message = "Couldn't properly frame the sentence."
            output = ' '.join(output)
Example #15
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import csv
from indictrans import Transliterator

trn = Transliterator(source='hin', target='eng', build_lookup=True)

with open('dataSet.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        hin = row[1].decode('utf-8').replace('\n', " ")
        eng = trn.transform(hin)
        print row[0], eng.encode('unicode-escape'), row[2]
Example #16
        fname = os.path.join(ud_dir, hi_tb, f)
        print('Reading ' + fname)

        ftrn = fname + '.en'
        fout = codecs.open(ftrn, 'w', encoding='utf-8')

        count = 0
        with codecs.open(fname, encoding='utf-8') as fin:
            for line in fin:
                count += 1

                if count % 10000 == 0 and count > 0:
                    print(count)
                tok = line.strip()
                if tok.startswith('#') or tok == '':
                    fout.write(unicode(tok) + '\n')
                    continue

                tok = tok.split('\t')
                if len(tok) == 1 or '.' in tok[0] or '-' in tok[0]:
                    # `tok` is a list here; join it back rather than
                    # writing its repr.
                    fout.write(unicode('\t'.join(tok)) + '\n')
                    continue

                tok[1] = trn.transform(tok[1]) or '_'
                tok[2] = trn.transform(tok[2]) or '_'
                entry = '\t'.join(tok) + '\n'
                fout.write(unicode(entry))

        fout.close()
Example #17
from indictrans import Transliterator


def code_transliterate(self):
    """Romanize the Hindi string bound to `self`."""
    trn = Transliterator(source='hin', target='eng', build_lookup=True)
    eng = trn.transform(self)
    return eng
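
# Because the method transliterates `self`, it reads as if patched onto a
# string-like class; as a plain function it can be called directly on a
# Hindi string (the sample word is arbitrary):
print(code_transliterate(u'मकान'))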
Example #18
tk = Tokenizer(lang=s[:2])
tk_back = Tokenizer(lang=t[:2])

l = u"रज्ज के रुलाया"  #\nरज्ज के हंसाया\n\nमैंने दिल खो' के इश्क़ कमाया\n"

l = l.lower().strip()

lines = l.split("\n")
print(lines)

output = []
if flag:
    for l in lines:
        json = {}  # NOTE: this name shadows the json module if it is imported

        definitive = forward_transl_full.transform(l)

        json["text"] = definitive
        json["tokens"] = []

        text_precedent = ""

        tokens = tk.tokenize(l)

        back_tokens = tk_back.tokenize(definitive)

        for i, t in enumerate(tokens):

            inner_json = {}
            choosen = back_tokens[i]