Example #1
def getpos():
    text = request.form['input_text']
    tokenized_text = indic_tokenize.trivial_tokenize(text)
    with open("./postagger.pkl",'rb') as f:
        pos_tagger = pickle.load(f)
    output = pos_tagger.tag(tokenized_text)
    return render_template('result.html',data = output)
def characterCountHindi(text):
    #sentences=sentence_tokenize.sentence_split(text, lang='hi')
    count=0
    for t in indic_tokenize.trivial_tokenize(text):
        for i in t:
            count=count+1
    return count
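Both helpers above assume indic_tokenize has been imported. A minimal standalone sketch of the underlying call (the sample sentence is illustrative, not from the original):

# Hypothetical usage sketch for the helpers above; the sample text is made up.
from indicnlp.tokenize import indic_tokenize

sample = 'यह एक छोटा वाक्य है।'
print(indic_tokenize.trivial_tokenize(sample, lang='hi'))  # token list; punctuation split into its own tokens
print(characterCountHindi(sample))                         # character count summed over tokens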
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--indic-nlp-path",
                        required=True,
                        help="path to Indic NLP Library root")
    parser.add_argument("--language", required=True)
    parser.add_argument("--remove-nuktas", default=False, action="store_true")
    parser.add_argument("input", help="input file; use - for stdin")
    args = parser.parse_args()

    try:
        sys.path.extend([
            args.indic_nlp_path,
            os.path.join(args.indic_nlp_path, "src"),
        ])
        from indicnlp.tokenize import indic_tokenize
        from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
    except ImportError:
        raise Exception(
            "Cannot load Indic NLP Library, make sure --indic-nlp-path is correct"
        )

    # create normalizer
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer(
        args.language,
        remove_nuktas=args.remove_nuktas,
    )

    # normalize and tokenize
    for line in fileinput.input([args.input],
                                openhook=fileinput.hook_compressed):
        # hook_compressed yields bytes for compressed inputs and str otherwise
        if isinstance(line, bytes):
            line = line.decode("utf-8", errors="ignore")
        line = normalizer.normalize(line.rstrip("\n"))
        line = " ".join(indic_tokenize.trivial_tokenize(line, args.language))
        sys.stdout.write(line + "\n")
def longWordCountHindi(text):
    words=indic_tokenize.trivial_tokenize(text)
    count=0
    for i in words:
        if len(i)>6:
            count+=1
    return count
Example #5
def Process(infile, outfile, column=0, max_row=1000000, interval=100000):
    print("infile = ", infile)
    with open(infile, "r", encoding="utf-8") as fin:
        analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('hi')
        result = []
        read_line_num = 0
        lines = fin.readlines()
        tot_line = len(lines)
        for line in lines:
            read_line_num += 1
            if read_line_num % interval == 0:
                print("processed %d lines " % read_line_num)
            line = line.split('\t')
            indic_string = line[column]
            indic_string = indic_string.strip()
            indic_res1 = indic_tokenize.trivial_tokenize(indic_string)
            analyzes_tokens = analyzer.morph_analyze_document(indic_res1)
            result.append(' '.join(analyzes_tokens))
            # if read_line_num % max_row==0:
            #     if os.path.exists(outfile):
            #         fout = open(outfile,"a",encoding="utf-8")
            #         fout.seek(0,2)
            #     else:
            #         fout = open(outfile,"w",encoding="utf-8")
            #     for line in result:
            #         # fout.write(line)
            #         fout.write(line+"\n")
            #     fout.close()
            #     result = []
        print("len_result = ", len(result))
        fout = open(outfile, "w", encoding="utf-8")
        for line in result:
            fout.write(line + "\n")
Example #6
def tokenize(ip_file_path, op_file_path):
    with open(ip_file_path, 'r', encoding='utf-8') as f:
        with open(op_file_path, "w", encoding='utf-8') as text_file:
            for line in f:
                result_arr = indic_tokenize.trivial_tokenize(line)
                tokenized_sentence = ' '.join(result_arr)
                text_file.write(tokenized_sentence + '\n')
def preprocess_line(line, normalizer, lang, transliterate=False):
    if lang == "en":
        return " ".join(
            en_tok.tokenize(en_normalizer.normalize(line.strip()),
                            escape=False))
    elif transliterate:
        # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            " ".join(
                indic_tokenize.trivial_tokenize(
                    normalizer.normalize(line.strip()), lang)),
            lang,
            "hi",
        ).replace(" ् ", "्")
    else:
        # we only need to transliterate for joint training
        return " ".join(
            indic_tokenize.trivial_tokenize(normalizer.normalize(line.strip()),
                                            lang))
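The transliterate branch above relies on UnicodeIndicTransliterator, which maps text between Indic scripts. A hedged sketch of that call in isolation (language codes and sample text are illustrative):

# Hypothetical sketch: transliterating Tamil text to Devanagari, as the transliterate branch does.
from indicnlp.transliterate import unicode_transliterate

ta_text = 'இது ஒரு சோதனை'
print(unicode_transliterate.UnicodeIndicTransliterator.transliterate(ta_text, 'ta', 'hi'))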
Example #8
def update_stats(caption: str, stats: dict) -> None:
    tokens = indic_tokenize.trivial_tokenize(caption)
    n_tokens = len(tokens)

    stats['maxlen'] = max(stats['maxlen'], n_tokens)
    for key in stats.keys():
        if key == 'maxlen':
            continue
        if check_in_range(key, n_tokens):
            stats[key] += 1
Example #9
    def indic_tokenizer(self, text, lang):
        '''
        Tokenizer for indic nlp
        '''

        # Tokenize
        tokens = indic_tokenize.trivial_tokenize(text=text, lang=lang)

        # Normalize
        for i in range(len(tokens)):
            tokens[i] = self.normalizers[lang].normalize(tokens[i])

        return tokens
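The method above reads language-specific normalizers from self.normalizers. A hedged sketch of how such a mapping might be built (the class context and language list are assumptions, not from the original):

# Hypothetical construction of the normalizers mapping used by indic_tokenizer.
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

factory = IndicNormalizerFactory()
normalizers = {lang: factory.get_normalizer(lang) for lang in ('hi', 'mr', 'bn')}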
Example #10
def preprocess_sent(text, lang):
    """
    Pre-process text (normalization and tokenization).

    text: text string to preprocess
    lang: language code (2-letter ISO code)

    returns the list of normalized tokens
    """
    normalizer = normalizer_factory.get_normalizer(lang)

    return indic_tokenize.trivial_tokenize(normalizer.normalize(
                                           text.replace('\n', ' ')), lang)
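preprocess_sent depends on a module-level normalizer_factory. A minimal sketch of that setup plus a sample call (the sentence is illustrative):

# Hypothetical setup for the module-level normalizer_factory used above.
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

normalizer_factory = IndicNormalizerFactory()
print(preprocess_sent('यह एक परीक्षण वाक्य है।', 'hi'))  # list of normalized tokens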
Example #11
def preprocess_line(line, normalizer, lang, transliterate=False):
    if lang == "en":
        # this applies the cleaner for vi text and is important for the en-vi dataset
        # TODO: avoid applying it when cleaning other languages
        # line = fix_contents(line)
        return " ".join(
            en_tok.tokenize(en_normalizer.normalize(line.strip()),
                            escape=False))
    elif lang == "vi":
        line = fix_contents(line)
        sentences = rdrsegmenter.tokenize(line)
        tokenized_sentence = join_tokenized_sentence_list(sentences)
        return tokenized_sentence

    elif lang == "bg":
        sentences = bg_cube(line)
        tokenized_sentence = join_tokenized_sentence_list(sentences,
                                                          cube_tokenized=True)
        return tokenized_sentence
    elif lang == "tr":
        sentences = tr_cube(line)
        tokenized_sentence = join_tokenized_sentence_list(sentences,
                                                          cube_tokenized=True)
        return tokenized_sentence
    elif transliterate:
        # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            " ".join(
                indic_tokenize.trivial_tokenize(
                    normalizer.normalize(line.strip()), lang)),
            lang,
            "hi",
        ).replace(" ् ", "्")
    else:
        # we only need to transliterate for joint training
        return " ".join(
            indic_tokenize.trivial_tokenize(normalizer.normalize(line.strip()),
                                            lang))
Example #12
def pre_process_hindi_sentence(line):
    remove_nuktas = False
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer("hi", remove_nuktas=remove_nuktas)
    line = normalizer.normalize(line)
    line = clean_text(line)
    tokens = list()
    for t in indic_tokenize.trivial_tokenize(line):
        tokens.append(t)
    line = tokens
    line = [word.lower() for word in line]
    line = [word for word in line if not re.search(r'\d', word)]
    line = ' '.join(line)
    return (line)
def preprocess_sent(sent, normalizer, lang):
    if lang == "en":
        return " ".join(
            en_tok.tokenize(en_normalizer.normalize(sent.strip()),
                            escape=False))
    else:
        # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            " ".join(
                indic_tokenize.trivial_tokenize(
                    normalizer.normalize(sent.strip()), lang)),
            lang,
            "hi",
        ).replace(" ् ", "्")
Example #14
def doc2vec(txt, lang, emb):
    """
    A document is represented as the mean of the word vectors
    of its constituent words.
    """
    normalizer = normalizer_factory.get_normalizer(lang)
    normed_txt = normalizer.normalize(txt.replace('\n', ' '))
    words = indic_tokenize.trivial_tokenize(normed_txt, lang)
    word_vecs = [emb[word] for word in words if word in emb]
    if len(word_vecs) > 0:
        doc_vec = np.mean(np.array(word_vecs), axis=0)
    else:
        doc_vec = np.zeros(emb.vector_size)
    return doc_vec
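doc2vec expects emb to behave like a gensim KeyedVectors object (membership test, item lookup, and a vector_size attribute). A hedged usage sketch under that assumption (the vector file path is illustrative):

# Hypothetical usage; assumes emb is a gensim KeyedVectors with Hindi word vectors.
from gensim.models import KeyedVectors

emb = KeyedVectors.load('hi_word_vectors.kv')  # illustrative path
vec = doc2vec('यह एक परीक्षण वाक्य है।', 'hi', emb)
print(vec.shape)  # (emb.vector_size,)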
Example #15
def get_token(filepath: str = ''):
    global tokens

    if not sentences:
        get_sentences(filepath)

    tok_list = list()
    for sentence in sentences:
        sentence = sentence.replace('\u200d', ' ')
        sentence = sentence.replace('\n', ' ')
        toks = ['<sos>'] + indic_tokenize.trivial_tokenize(
            sentence, lang='hi') + ['<eos>']
        tok_list.append(toks)

    tokens = tok_list
    return tokens
def old_preprocess(infname, outfname, lang):
    """
    Preparing each corpus file:
      - Normalization
      - Tokenization
      - Script conversion to Devanagari for Indic scripts
    """
    n = 0
    num_lines = sum(1 for line in open(infname, "r"))
    # reading
    with open(infname, "r",
              encoding="utf-8") as infile, open(outfname,
                                                "w",
                                                encoding="utf-8") as outfile:

        if lang == "en":
            en_tok = MosesTokenizer(lang="en")
            en_normalizer = MosesPunctNormalizer()
            for line in tqdm(infile, total=num_lines):
                outline = " ".join(
                    en_tok.tokenize(en_normalizer.normalize(line.strip()),
                                    escape=False))
                outfile.write(outline + "\n")
                n += 1

        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)
            for line in tqdm(infile, total=num_lines):
                outline = (unicode_transliterate.UnicodeIndicTransliterator.
                           transliterate(
                               " ".join(
                                   indic_tokenize.trivial_tokenize(
                                       normalizer.normalize(line.strip()),
                                       lang)),
                               lang,
                               "hi",
                           ).replace(" ् ", "्"))

                outfile.write(outline + "\n")
                n += 1
    return n
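A hedged call sketch for the function above (file names and language code are illustrative):

# Hypothetical invocation of old_preprocess; paths are made up.
n_lines = old_preprocess('train.ta', 'train.tok.ta', 'ta')
print(n_lines, 'lines written')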
Example #17
def process(filename, maxlen=250):
    lines = open(filename, "r", encoding="utf-8")  #.read().split("\n")

    new = []
    word_counts = []

    # Pre-tokenize replacements
    regs = {"[‘’]": "'", '[“”]': '"', r"[\s]+": " "}
    for line in tqdm(lines):
        line = line.strip()
        for reg in regs:
            line = re.sub(reg, regs[reg], line)
        new.append(line)

    lines.close()

    print(f"finished pre-processing {filename}")

    tokenized = []

    # Add a post-tokenize replacements section if needed
    reg = "[०-९]"
    for line in new:
        tokens = indic_tokenize.trivial_tokenize(line)
        if len(tokens) < maxlen:
            tok = " ".join(tokens)
            tokenized.append(re.sub(reg, "#", tok))
            word_counts.append(len(tokens))
        else:
            tokenized.append("")
            # A quicker way would be to load (title, article) pairs into this function
            # and skip a pair entirely whenever an article's length exceeds maxlen

    print(f"finished tokenizing and post-processing {filename}")

    sort = sorted(word_counts, reverse=True)
    print(f"the 10 longest sequence lengths in {filename} are {sort[:10]}")

    print(f"average length: {sum(word_counts)/len(word_counts)}")

    return tokenized
Example #18
def generate_data_dictionary(paths, destination='data.pickle'):

    # to be saved
    return_dict = {'tokens': []}

    # iterating over each file corresponding to the path
    for path in tqdm(paths):
        if path.endswith('.txt'):

            # reading the file (closed automatically via the context manager)
            with open(path, 'r', encoding='utf-8') as file:
                text = file.read().replace('\n', '').replace('·', '')

            # tokenizing the text
            token_list = indic_tokenize.trivial_tokenize(text, lang='hi')

            # committing the new tokens to the return dict
            return_dict['tokens'].append(token_list)

    # pickling
    with open(destination, 'wb') as pickle_file:
        pickle.dump(return_dict, pickle_file)
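A hedged sketch of calling the function above and reloading the pickled result (paths are illustrative):

# Hypothetical usage of generate_data_dictionary; corpus paths are made up.
import glob
import pickle

generate_data_dictionary(glob.glob('corpus/*.txt'), destination='data.pickle')
with open('data.pickle', 'rb') as f:
    data = pickle.load(f)
print(len(data['tokens']))  # number of files tokenized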
Example #19
def get_vocab(corpus: List) -> Tuple:
    word_to_index = dict()
    index_to_word = dict()

    word_to_index['pad'] = 0
    word_to_index['start'] = 1
    word_to_index['end'] = 2
    word_to_index['unk'] = 3
    index_to_word[0] = 'pad'
    index_to_word[1] = 'start'
    index_to_word[2] = 'end'
    index_to_word[3] = 'unk'
    start_idx = 4

    for img in tqdm(corpus):
        tokens = indic_tokenize.trivial_tokenize(img['caption'])

        for tok in tokens:
            if tok not in word_to_index:
                word_to_index[tok] = start_idx
                index_to_word[start_idx] = tok
                start_idx += 1
    return word_to_index, index_to_word
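get_vocab expects corpus to be a list of records with a 'caption' field. A minimal hedged sketch (the captions are illustrative):

# Hypothetical input for get_vocab; captions are made up.
corpus = [{'caption': 'एक आदमी घोड़े पर सवार है'},
          {'caption': 'बच्चे मैदान में खेल रहे हैं'}]
word_to_index, index_to_word = get_vocab(corpus)
print(len(word_to_index))  # 4 special tokens plus the caption vocabulary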
def main():
    data = ""
    with open("textdata.txt") as fp:
        data = fp.read()
    print("Tokenizing....")
    token = indic_tokenize.trivial_tokenize(data)
    print("Size of token befoer eliminating punction: {0}".format(len(token)))
    remove_punc(token)
    print("Size of token after eliminating punction: {0}".format(len(token)))
    print("#" * 100)

    print("Size of token before filtering stop word : {0}".format(len(token)))
    token = filter_token(token)
    print("Size of token after filtering stop word : {0}".format(len(token)))
    print("Token:\n{0}".format(token))
    print("Trainning data 'hind.pos' from nltk......")

    #Trainning data
    train_data = indian.tagged_sents('hindi.pos')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    POS_TAG = tnt_pos_tagger.tag(token)
    for each_tag in POS_TAG:
        print(each_tag)
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        
        line = pair[0]
        #print (line)
        line = normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        line = line.split()
        line = [word.lower() for word in line]
        line = [word.translate(table) for word in line]
        line = [re_print.sub('', w) for w in line]
        line = [word for word in line if word.isalpha()]
        clean_pair.append(' '.join(line))
        
        line = pair[1]
        #print (line)
        line = factory.normalize(line)
        line = clean_text(line)
        tokens = list()
        for t in indic_tokenize.trivial_tokenize(line): 
            tokens.append(t)
        line = tokens
        line = [word.lower() for word in line]
        line = [word for word in line if not re.search(r'\d', word)]
        clean_pair.append(' '.join(line))
        
        print (clean_pair)
        
        cleaned.append(clean_pair)
    return array(cleaned)
Example #22
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Use: echo {text} | python tokenize_indic.py {language}

import sys

from indicnlp.tokenize.indic_tokenize import trivial_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer(sys.argv[1],
                                    remove_nuktas=False,
                                    nasals_mode='do_nothing')

for line in sys.stdin:
    normalized_line = normalizer.normalize(line.strip())
    tokenized_line = ' '.join(trivial_tokenize(normalized_line, sys.argv[1]))
    print(tokenized_line)
Example #23
import pickle
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

remove_nuktas = False
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("hi", remove_nuktas=remove_nuktas)

hin = open('./model/dataset/en-hi.hi', encoding='utf-8').readlines()
print(hin[:5])
hin = [normalizer.normalize(line.strip()) for line in hin]

hin = [indic_tokenize.trivial_tokenize(line) for line in hin]
print(hin[:5])

with open("hindi_tokens.txt", "wb") as fp:
    pickle.dump(hin, fp)
Example #24
def noramlize_and_tok_text(sent):
    normalized = normalizer.normalize(sent)
    processed = ' '.join(trivial_tokenize(normalized, lang))
    return processed
                    tokens = analyzer.morph_analyze_document(
                        line.decode('utf-8').strip().split(' '))
                    s += ' '.join(tokens).strip() + '\n'
            tam = s
            print("     Morphological analysis complete")

        with open(m + '.TA', 'w+') as f:
            f.write(tam)
    else:
        with open(m + '.TA', 'r') as f:
            tam = f.read()

    sent = []
    twv = []
    for e in tam.split("\n"):
        twv.append(indic_tokenize.trivial_tokenize(e))
    twv = [t for t in twv if t]
    print("     Input ready for " + m + " algorithm, Starting...")
    modeltam = wv.Word2Vec(twv,
                           size=100,
                           window=w,
                           workers=1,
                           batch_words=25,
                           min_count=1)
    modeltam.save("embedding/" + m + str(w))

    print("     " + m + " model created and saved successfully!")

else:
    os.system("eval.py " + str(w))
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

#Initialize the Indic NLP library

from indicnlp import loader
loader.load()

#Tokenization
from indicnlp.tokenize import indic_tokenize

indic_string = 'अनूप,अनूप?।फोन'

print('Input String: {}'.format(indic_string))
print('Tokens: ')
for t in indic_tokenize.trivial_tokenize(indic_string):
    print(t)

from indicnlp.morph import unsupervised_morph
from indicnlp import common

# This step will call the service which is very slow

indic_string = 'आपल्या हिरड्यांच्या आणि दातांच्यामध्ये जीवाणू असतात .' + 'अनूप,अनूप?।फोन'
indic_res1 = indic_tokenize.trivial_tokenize(indic_string)
print(type(indic_res1))

print("indic_res1 = ", indic_res1)
#Word Segmentation
from indicnlp.morph import unsupervised_morph
from indicnlp import common
Example #27

if __name__ == '__main__':

    if len(sys.argv) < 4:
        print(
            "Usage: python unsupervised_morph.py <infile> <outfile> <language> <indic_resources_path> [<add_marker>]"
        )
        sys.exit(1)

    language = sys.argv[3]
    common.INDIC_RESOURCES_PATH = sys.argv[4]

    add_marker = False

    if len(sys.argv) == 6:
        add_marker = True if sys.argv[5] == 'True' else False

    print('Loading morph analyser for ' + language)
    analyzer = UnsupervisedMorphAnalyzer(language, add_marker)
    print('Loaded morph analyser for ' + language)

    with codecs.open(sys.argv[1], 'r', 'utf-8') as ifile:
        with codecs.open(sys.argv[2], 'w', 'utf-8') as ofile:
            for line in ifile.readlines():
                line = line.strip()
                tokens = indic_tokenize.trivial_tokenize(line)
                morph_tokens = analyzer.morph_analyze_document(tokens)
                ofile.write(' '.join(morph_tokens))
                ofile.write('\n')
Example #29
def preprocess_mr(text):
    text = normalizer.normalize(text)
    return ' '.join(indic_tokenize.trivial_tokenize(text))
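preprocess_mr assumes a module-level normalizer. A hedged sketch of that setup for Marathi (the sample sentence is illustrative):

# Hypothetical setup for the normalizer used by preprocess_mr.
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

normalizer = IndicNormalizerFactory().get_normalizer('mr')
print(preprocess_mr('ही एक चाचणी आहे.'))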
Example #30
def process(lang, sent):
    normalizer = IndicNormalizerFactory().get_normalizer(lang)
    normalized = normalizer.normalize(sent)
    processed = ' '.join(trivial_tokenize(normalized, lang))
    return processed
Example #31
def run_tokenize(args):
    for line in args.infile:
        args.outfile.write(' '.join(
            indic_tokenize.trivial_tokenize(line, args.lang)) + '\n')
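run_tokenize expects an args object carrying infile, outfile, and lang. A hedged sketch of argparse wiring that could provide it (argument names beyond those three are assumptions):

# Hypothetical CLI wiring for run_tokenize; defaults and flag names are assumptions.
import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument('--lang', default='hi')
parser.add_argument('infile', nargs='?', type=argparse.FileType('r', encoding='utf-8'),
                    default=sys.stdin)
parser.add_argument('outfile', nargs='?', type=argparse.FileType('w', encoding='utf-8'),
                    default=sys.stdout)
run_tokenize(parser.parse_args())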