Example #1
def test_kbest(self):
    """Make sure `k-best` works without failure"""
    k_best = range(2, 15)
    r2i = Transliterator(source='eng', target='hin', decode='beamsearch')
    i2r = Transliterator(source='hin', target='eng', decode='beamsearch')
    for k in k_best:
        hin = r2i.transform('indictrans', k_best=k)
        eng = i2r.transform(hin[0], k_best=k)
        self.assertTrue(len(hin) == k)
        self.assertTrue(len(eng) == k)
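For context, a minimal standalone sketch of the k-best API this test exercises (assuming only that the indictrans package is installed): with decode='beamsearch', transform(word, k_best=k) returns a list of k candidate transliterations instead of a single string.

from indictrans import Transliterator

# Sketch: beam-search decoding plus k_best yields a candidate list
# (here, 5 Devanagari renderings of "indictrans").
trn = Transliterator(source='eng', target='hin', decode='beamsearch')
candidates = trn.transform('indictrans', k_best=5)
print(len(candidates))  # 5, as the assertions above rely on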
Example #2
import random

from indictrans import Transliterator


class IndictransTransliterator:
    def __init__(self):
        self.trn = Transliterator(source='hin',
                                  target='eng',
                                  decode='beamsearch',
                                  build_lookup=True)
        self.trans_dict = {}

    def transliterate(self, original):
        transliterations = self.get_all(original)
        return random.choice(transliterations)

    def get_all(self, original):
        if original in self.trans_dict:
            return self.trans_dict[original]
        else:
            transliterations = self.trn.transform(original, k_best=5)
            self.trans_dict[original] = transliterations
            return transliterations

    @staticmethod
    def _is_deva(unicode_tok):
        """Returns True if |unicode_tok| contains a Devanagari character"""
        for c in unicode_tok:
            if int('0900', 16) <= ord(c) <= int('097f', 16):
                return True
        return False
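A hypothetical usage sketch of the wrapper above (the Devanagari input is illustrative):

# get_all() computes and caches the 5-best candidate list per word;
# transliterate() then samples one candidate at random.
t = IndictransTransliterator()
print(t.get_all(u'नमस्ते'))        # list of 5 romanizations, now cached
print(t.transliterate(u'नमस्ते'))  # one of the five, chosen at random
print(IndictransTransliterator._is_deva(u'नमस्ते'))  # True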
Example #3
from flask import render_template, request
import ast

from indictrans import Transliterator


def predict():
    listToStr = ''
    if request.method == 'POST':
        print(request.form.get('NewYork'))
        try:
            NewYork = request.form['NewYork']
            California = request.form['California']
            Florida = request.form['Florida']
            NewYorkLower = NewYork.strip().lower()

            # Alldata.csv stores a dict literal mapping words to
            # known transliterations.
            with open("data/Alldata.csv", "r") as f:
                dic = ast.literal_eval(f.read())

            # Build the transliterator once instead of once per word.
            trn = Transliterator(source=California.strip(),
                                 target=Florida.strip(),
                                 build_lookup=True)
            my_list = []
            for c in NewYorkLower.split():
                # Fall back to the model output when the word is not
                # in the dictionary.
                my_list.append(dic.get(c, trn.transform(c)))

            listToStr = ' '.join(map(str, my_list))

        except ValueError:
            return "Please check if the values are entered correctly"
    return render_template('home.html', prediction=listToStr)
Example #4
def transliterate():
    word = request.args.get('word', default="congress", type=str)
    trn = Transliterator(source='eng',
                         target='hin',
                         build_lookup=True,
                         decode='beamsearch')
    best_transliterated_list = trn.transform(word, k_best=5)
    return {"transliteration": best_transliterated_list}
Example #5
def read_kumaretal_2019_agg_downloads(path, mode, romanize=False):
    st_time = time()

    global FIELDS, MAX_CHAR_LEN

    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')

    n_trimmed, n_romanized = 0, 0
    Example = namedtuple(f"{mode}_example",
                         FIELDS,
                         defaults=(None, ) * len(FIELDS))
    examples = []
    lines = read_csv_file(path, has_header=False)
    for i, line in enumerate(lines):
        uid, txt, label = line[0], line[1], line[2]
        if not txt:
            continue
        if romanize:
            new_txt = trn_hin2eng.transform(txt)
            if txt != new_txt:
                n_romanized += 1
            txt = new_txt
        new_txt = clean_generic(txt)
        if new_txt.strip() == "":
            new_txt = txt
        if len(new_txt) > MAX_CHAR_LEN:
            n_trimmed += 1
            newtokens, currsum = [], 0
            for tkn in new_txt.split():  # 1 for space
                if currsum + len(tkn) + 1 <= MAX_CHAR_LEN:
                    newtokens.append(tkn)
                    currsum += len(tkn) + 1
                else:
                    break
            new_txt = " ".join(newtokens)
        example = Example(dataset="kumaretal_2019_agg",
                          task="classification",
                          split_type=mode,
                          uid=uid,
                          text=txt,
                          label=label,
                          text_pp=new_txt)
        examples.append(example)
        progress_bar(len(examples), len(lines), ["time"], [time() - st_time])

    if romanize:
        print(
            f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed} "
            f"and # of romanized instances: {n_romanized}")
    else:
        print(
            f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}"
        )

    return examples
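The trimming loop above keeps whole tokens while the running length (counting one separating space per token) stays within MAX_CHAR_LEN; the same loop reappears in the reader further below. A self-contained sketch of just that logic:

def trim_to_char_limit(text, max_char_len):
    """Keep leading whole tokens whose joined length fits the budget."""
    kept, currsum = [], 0
    for tkn in text.split():
        if currsum + len(tkn) + 1 <= max_char_len:  # 1 for the space
            kept.append(tkn)
            currsum += len(tkn) + 1
        else:
            break
    return " ".join(kept)

assert trim_to_char_limit("a bb ccc dddd", 9) == "a bb ccc"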
Example #6
def test_rtrans(self):
    """Test Indic-to-Indic ML and Rule-Based models."""
    with io.open('%s/indic-test' % self.test_dir, encoding='utf-8') as fp:
        # first line contains language codes
        lang_codes = fp.readline().split()
        lang2word = dict(
            zip(lang_codes, [[] for i in range(len(lang_codes))]))
        for line in fp:
            line = line.split()
            for i, word in enumerate(line):
                lang2word[lang_codes[i]].append(word)
    for src in lang_codes:
        for trg in lang_codes:
            if src == trg:
                continue
            s2t_ml = Transliterator(source=src, target=trg, rb=False)
            s2t_rb = Transliterator(source=src, target=trg, rb=True)
            for word in lang2word[src]:
                s2t_ml.transform(word)
                s2t_rb.transform(word)
Example #7
def test_ru2ind(self):
    """Test [Roman, Urdu]-to-Indic ML models"""
    for lang_pair in self.trg2src:
        src = lang_pair[0]
        trg = lang_pair[1]
        trans = Transliterator(source=src, target=trg)
        with io.open('%s/%s_%s.testpairs' % (self.test_dir, trg, src),
                     encoding='utf-8') as fp:
            for line in fp:
                expected, word = line.split()
                self.assertEqual(trans.transform(word), expected)
Example #8
def test_ind2ru(self):
    """Test Indic-to-[Roman, Urdu] ML models"""
    for lang_pair in self.src2trg:
        src = lang_pair[0]
        trg = lang_pair[1]
        trans = Transliterator(source=src, target=trg)
        with io.open('%s/%s_%s.testpairs' % (self.test_dir, src, trg),
                     encoding='utf-8') as fp:
            for line in fp:
                word, expected = line.split()
                self.assertEqual(trans.transform(word), expected)
Example #9
def read_iitp_product_reviews_hi_sa_downloads(path, mode):
    st_time = time()

    global FIELDS, MAX_CHAR_LEN

    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')

    n_trimmed = 0
    Example = namedtuple(f"{mode}_example",
                         FIELDS,
                         defaults=(None, ) * len(FIELDS))
    examples = []
    lines = [line.strip() for line in open(path, "r")]
    for i, line in enumerate(lines):
        line = line.strip()
        if not line:
            continue
        vals = line.split(",")
        label = vals[0]
        txt = ",".join(vals[1:])
        txt = trn_hin2eng.transform(txt)
        new_txt = "".join([char for char in txt])
        if len(new_txt) > MAX_CHAR_LEN:
            n_trimmed += 1
            newtokens, currsum = [], 0
            for tkn in new_txt.split():  # 1 for space
                if currsum + len(tkn) + 1 <= MAX_CHAR_LEN:
                    newtokens.append(tkn)
                    currsum += len(tkn) + 1
                else:
                    break
            new_txt = " ".join(newtokens)
        example = Example(dataset="iitp_product_reviews_hi_sa",
                          task="classification",
                          split_type=mode,
                          uid=len(examples),
                          text=txt,
                          label=label,
                          text_pp=new_txt)
        examples.append(example)
        progress_bar(len(examples), len(lines), ["time"], [time() - st_time])
    print(
        f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}"
    )
    return examples
Example #10
def read_hinglishpedia_downloads(path1, path2, mode, standardizing_tags={}):
    st_time = time()

    global FIELDS, MAX_CHAR_LEN

    n_trimmed = 0
    FIELDS += [
        fieldname for fieldname in [
            "tgt",
        ] if fieldname not in FIELDS
    ]
    Example = namedtuple(f"{mode}_example",
                         FIELDS,
                         defaults=(None, ) * len(FIELDS))
    examples = []

    from indictrans import Transliterator
    trn_hin2eng = Transliterator(source='hin', target='eng')

    txt_lines = [line.strip() for line in open(path1, "r")]
    tag_lines = [line.strip() for line in open(path2, "r")]

    for i, (txt, tags) in tqdm(enumerate(zip(txt_lines, tag_lines))):
        if not txt:
            continue
        txt = trn_hin2eng.transform(txt)
        example = Example(dataset="hinglishpedia",
                          task="classification",
                          split_type=mode,
                          uid=len(examples),
                          text=txt,
                          langids=" ".join([
                              standardizing_tags[lid]
                              if lid in standardizing_tags else "other"
                              for lid in tags.split()
                          ]),
                          text_pp=txt)
        examples.append(example)
        # progress_bar(len(examples), len(txt_lines), ["time"], [time() - st_time])
    print(
        f"len of {mode} data: {len(examples)} and # of trimmed instances: {n_trimmed}"
    )
    return examples
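The langids field maps each raw tag through standardizing_tags and falls back to "other"; a minimal sketch of that mapping (tag names hypothetical):

standardizing_tags = {"HI": "hin", "EN": "eng"}
tags = "HI EN XX"
langids = " ".join(standardizing_tags[lid]
                   if lid in standardizing_tags else "other"
                   for lid in tags.split())
assert langids == "hin eng other"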
Example #11
def get_transliteration(vocab, headers):
    trans = {}
    if headers is None:
        trn = Transliterator(source='eng', target='hin', build_lookup=True)
        trans = {item: trn.transform(item) for item in vocab}
    else:
        base_url = 'https://api.cognitive.microsofttranslator.com'
        path = '/transliterate?api-version=3.0&language=hi&fromScript=Latn&toScript=Deva'
        count = 0
        body = []
        constructed_url = base_url + path
        query = ''
        # Batch the first 7000 words: 50 words per text item, ten
        # items (500 words) per POST request.
        while count <= 6500:
            for i in range(count, (count + 500), 50):
                for j in range(i, i + 50):
                    query += vocab[j] + ' '
                body.append({'text': query.strip()})
                query = ''
            response = requests.post(constructed_url,
                                     headers=headers,
                                     json=body)
            result = response.json()
            for j, i in enumerate(result):
                trans.update({body[j]['text']: i['text']})
            body = []
            count += 500

        # Transliterate whatever is left beyond the fixed-size batches,
        # guarding against indexing past the end of the vocabulary.
        for i in range(count, len(vocab), 50):
            for j in range(i, i + 50):
                if j < len(vocab):
                    query += vocab[j] + ' '
            body.append({'text': query.strip()})
            query = ''
        response = requests.post(constructed_url, headers=headers, json=body)
        result = response.json()
        for j, i in enumerate(result):
            trans.update({body[j]['text']: i['text']})

    return trans
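The loops above pack 50 words into each text item and 10 items (500 words) into each POST; the same batching can be expressed with a generic chunking helper (a sketch, names hypothetical):

def chunk(seq, size):
    """Yield successive fixed-size slices of seq."""
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

# One request body per 10 items; each item carries up to 50 words.
bodies = [{'text': ' '.join(words)} for words in chunk(vocab, 50)]
batches = list(chunk(bodies, 10))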
Example #12
from indictrans import Transliterator

with open('all_roman.txt', 'r') as infile:
    vocab = [x.strip('\n') for x in infile]

trn = Transliterator(source='eng', target='hin')

with open('transliterations.txt', 'w+') as outfile:
    for word in vocab:
        deva = trn.transform(word)
        outfile.write(word + "\t" + deva + "\n")
Example #13
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract review text, emojis and emoji sentiment.

    Takes a sequence of strings and produces a record array whose
    fields are `review`, `emojis`, `emoji_sentiment`, `lang_tag`,
    `len_range`, and `soundexes`.
    """
    def __init__(self, lang='ta'):
        self.lang = lang
        self.normalizer = BaseNormalizer(lang)
        # This language map was created using Google's googletrans module. Create the file alltextlang.txt by calling
        # detect_lang_and_store in feature_utils.py
        self.lmap = self.load_language_maps(
            os.path.join(os.path.dirname(sys.path[0]),
                         '../resources/data/alltextslang.txt'))
        self.soundexer = Soundex()
        self.ta_trans = Transliterator(source='eng',
                                       target='tam',
                                       build_lookup=True)
        self.ml_trans = Transliterator(source='eng',
                                       target='mal',
                                       build_lookup=True)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.sym_spell.load_dictionary(
            '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
            term_index=0,
            count_index=1,
            separator="\t")
        super().__init__()

    def load_language_maps(self, mapfile):
        lmap = {}
        with open(mapfile, 'r') as mapf:
            for line in mapf:
                text, lang, conf = line.rstrip().split('\t')
                lmap[text] = (lang, float(conf))
        return lmap

    def get_language_tag(self, text):
        return self.lmap.get(text, ('unknown', 0.0))

    def fit(self, x, y=None):
        return self

    def transform(self, reviews):
        features = np.recarray(
            shape=(len(reviews), ),
            dtype=[
                ('review', object),
                ('emojis', object),
                ('emoji_sentiment', object),
                ('lang_tag', object),
                ('len_range', object),
                ('soundexes', object),
            ],
        )
        for i, review in enumerate(reviews):
            features['review'][i] = self.normalizer.normalize(text=review)

            emojis, sentiment = get_emojis_from_text(review)
            features['emojis'][i] = ' '.join(emojis)
            features['emoji_sentiment'][i] = sentiment

            lang, conf = self.get_language_tag(review.strip())
            if lang == self.lang or lang == (self.lang + 'en'):
                # google agrees with some confidence
                agreement = 1
            elif conf < 0.5:
                # google says not-tamil, but weakly
                agreement = 0.5
            else:
                # google clearly says not-tamil
                agreement = 0
            features['lang_tag'][i] = {'lang': lang, 'agreement': agreement}
            features['len_range'][i] = get_doc_len_range(review)
            if self.lang == 'ta':
                review_trans = self.ta_trans.transform(review)
                for word in review_trans.split():
                    suggestions = self.sym_spell.lookup(word,
                                                        Verbosity.CLOSEST,
                                                        max_edit_distance=2,
                                                        include_unknown=True)
                    if len(suggestions) > 0 and suggestions[0].distance < 3:
                        print(word, suggestions[0].term)
                        # no match with dictionary, we need a more comprehensive dictionary plus phonetic similarity
            elif self.lang == 'ml':
                review_trans = self.ml_trans.transform(review)
            else:
                review_trans = review
            # TODO: introduce spell correct here for added normalisation
            # print(lang, review_trans)
            features['soundexes'][i] = ' '.join([
                self.soundexer.soundex(word) for word in review_trans.split()
            ])
        return features
Example #14
class Vida:
    """English to 15 Indic-language transliteration.

    Supported languages and codes:
    Hindi (hin), Bengali (ben), Gujarati (guj), Punjabi (pun),
    Malayalam (mal), Kannada (kan), Tamil (tam), Telugu (tel),
    Oriya (ori), Marathi (mar), Assamese (ass), Konkani (kon),
    Bodo (bod), Nepali (nep), Urdu (urd), English (eng)
    """

    languages = {
        "hin": "Hindi",
        "ben": "Bengali",
        "guj": "Gujarati",
        "pun": "Punjabi",
        "mal": "Malayalam",
        "kan": "Kannada",
        "tam": "Tamil",
        "tel": "Telugu",
        "ori": "Oriya",
        "mar": "Marathi",
        "ass": "Assamese",
        "kon": "Konkani",
        "bod": "Bodo",
        "nep": "Nepali",
        "urd": "Urdu",
        "eng": "English"
    }

    codes = languages.keys()

    @staticmethod
    def is_ascii(s):
        return all(ord(c) < 128 for c in s)

    def __init__(self, text, source, target):
        self.text = text
        self.source = source
        self.target = target
        self.isEngSource = (self.source == 'eng')
        if self.isEngSource:
            self.usdictionary = enchant.Dict("en_US")
            self.gbdictionary = enchant.Dict("en_GB")
        self.validated = self.source in self.codes and self.target in self.codes
        if self.validated:
            self.engine = Transliterator(source=self.source,
                                         target=self.target)

    def run(self):
        status = False
        message = "Couldn't transliterate the text."
        content = {}
        output = []
        if not self.validated:
            message = "Please provide languages and their code."
            output = self.text
        else:
            text = self.text.split()
            try:
                for index in xrange(len(text)):
                    word = text[index]
                    if not self.isEngSource:
                        word = word.decode('utf-8')
                        output.insert(
                            index,
                            self.engine.transform(word).encode('utf-8'))
                    else:
                        if not Vida.is_ascii(word): word = word.decode('utf-8')
                        if not self.usdictionary.check(
                                word) and not self.gbdictionary.check(word):
                            output.insert(
                                index,
                                self.engine.transform(word).encode('utf-8'))
                        else:
                            output.insert(index, word)
                status = True
                message = "Successfully transliterated the text."
            except UnicodeDecodeError as e:
                Repo.exception(e)
                message = "Couldn't decode the language properly."
            except IndexError as e:
                Repo.exception(e)
                message = "Couldn't properly frame the sentence."
            output = ' '.join(output)
Example #15
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import csv
from indictrans import Transliterator

trn = Transliterator(source='hin', target='eng', build_lookup=True)


with open('dataSet.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        hin = row[1].decode('utf-8').replace('\n', " ")
        eng = trn.transform(hin)
        print row[0], eng.encode('unicode-escape'), row[2]
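For reference, a Python 3 sketch of the same loop (csv yields str directly, so the decode/encode calls drop out); the column layout of dataSet.csv is assumed to be as above:

import csv
from indictrans import Transliterator

trn = Transliterator(source='hin', target='eng', build_lookup=True)

with open('dataSet.csv', newline='', encoding='utf-8') as csvfile:
    for row in csv.reader(csvfile):
        hin = row[1].replace('\n', ' ')
        print(row[0], trn.transform(hin), row[2])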
Example #16
        fname = os.path.join(ud_dir, hi_tb, f)
        print('Reading ' + fname)

        ftrn = fname + '.en'
        fout = codecs.open(ftrn, 'w', encoding='utf-8')

        count = 0
        with codecs.open(fname, encoding='utf-8') as fin:
            for line in fin:
                count += 1

                if count % 10000 == 0 and count > 0:
                    print(count)
                tok = line.strip()
                if tok.startswith('#') or tok == '':
                    fout.write(unicode(tok) + '\n')
                    continue

                tok = tok.split('\t')
                if len(tok) == 1 or '.' in tok[0] or '-' in tok[0]:
                    # pass multiword / empty-node lines through unchanged
                    fout.write('\t'.join(tok) + '\n')
                    continue

                else:
                    # trn is the hin-to-eng Transliterator built earlier
                    # in the script (not shown in this excerpt)
                    tok[1] = trn.transform(tok[1]) or '_'
                    tok[2] = trn.transform(tok[2]) or '_'
                    entry = '\t'.join(tok) + '\n'
                    fout.write(unicode(entry))

        fout.close()
Example #17
def code_transliterate(self):
    trn = Transliterator(source='hin', target='eng', build_lookup=True)
    eng = trn.transform(self)
    return eng
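Here `self` is evidently expected to be a Hindi string, so the same logic reads more naturally as a plain function (a sketch, keeping the hin-to-eng direction of the original):

from indictrans import Transliterator

def code_transliterate(text):
    """Romanize a Hindi string with indictrans."""
    trn = Transliterator(source='hin', target='eng', build_lookup=True)
    return trn.transform(text)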
Example #18
# s and t are source/target language codes defined earlier in the
# script; forward_transl_full is a Transliterator built from them.
tk = Tokenizer(lang=s[:2])
tk_back = Tokenizer(lang=t[:2])

l = u"रज्ज के रुलाया"  #\nरज्ज के हंसाया\n\nमैंने दिल खो' के इश्क़ कमाया\n"

l = l.lower().strip()

lines = l.split("\n")
print(lines)

output = []
if flag:
    for l in lines:
        json = {}

        definitive = forward_transl_full.transform(l)

        json["text"] = definitive
        json["tokens"] = []

        tokens = []
        text_precedent = ""

        tokens = tk.tokenize(l)

        back_tokens = tk_back.tokenize(definitive)

        for i, t in enumerate(tokens):

            inner_json = {}
            choosen = back_tokens[i]