import pickle

from flask import render_template, request
from indicnlp.tokenize import indic_tokenize


def getpos():
    # Tokenize the submitted text and run the pickled POS tagger over it
    text = request.form['input_text']
    tokenized_text = indic_tokenize.trivial_tokenize(text)
    with open("./postagger.pkl", 'rb') as f:
        pos_tagger = pickle.load(f)
    output = pos_tagger.tag(tokenized_text)
    return render_template('result.html', data=output)
def characterCountHindi(text):
    # sentences = sentence_tokenize.sentence_split(text, lang='hi')
    count = 0
    for t in indic_tokenize.trivial_tokenize(text):
        for i in t:
            count = count + 1
    return count
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--indic-nlp-path", required=True,
                        help="path to Indic NLP Library root")
    parser.add_argument("--language", required=True)
    parser.add_argument("--remove-nuktas", default=False, action="store_true")
    parser.add_argument("input", help="input file; use - for stdin")
    args = parser.parse_args()

    try:
        sys.path.extend([
            args.indic_nlp_path,
            os.path.join(args.indic_nlp_path, "src"),
        ])
        from indicnlp.tokenize import indic_tokenize
        from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
    except ImportError:
        raise Exception(
            "Cannot load Indic NLP Library, make sure --indic-nlp-path is correct"
        )

    # create normalizer
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer(
        args.language,
        remove_nuktas=args.remove_nuktas,
    )

    # normalize and tokenize
    for line in fileinput.input([args.input],
                                openhook=fileinput.hook_compressed):
        line = normalizer.normalize(line.decode("utf-8", errors="ignore"))
        line = " ".join(indic_tokenize.trivial_tokenize(line, args.language))
        sys.stdout.write(line.encode("utf-8"))
def longWordCountHindi(text):
    words = indic_tokenize.trivial_tokenize(text)
    count = 0
    for i in words:
        if len(i) > 6:
            count += 1
    return count
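A quick sanity check of the two counting helpers above; the sample sentence and the explicit indic_tokenize import are assumptions for illustration, not part of the original snippets.

# Illustrative check only; the sample sentence is arbitrary.
from indicnlp.tokenize import indic_tokenize

sample = 'यह एक छोटा परीक्षण वाक्य है ।'
print(characterCountHindi(sample))  # characters summed over all tokens
print(longWordCountHindi(sample))   # tokens longer than six characters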
def Process(infile, outfile, column=0, max_row=1000000, interval=100000):
    print("infile = ", infile)
    with open(infile, "r", encoding="utf-8") as fin:
        analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('hi')
        result = []
        read_line_num = 0
        lines = fin.readlines()
        tot_line = len(lines)
        for line in lines:
            read_line_num += 1
            if read_line_num % interval == 0:
                print("processed %d lines " % read_line_num)
            line = line.split('\t')
            indic_string = line[column]
            indic_string = indic_string.strip()
            # Tokenize, then run the unsupervised morphological analyzer
            indic_res1 = indic_tokenize.trivial_tokenize(indic_string)
            analyzes_tokens = analyzer.morph_analyze_document(indic_res1)
            result.append(' '.join(analyzes_tokens))
            # if read_line_num % max_row == 0:
            #     if os.path.exists(outfile):
            #         fout = open(outfile, "a", encoding="utf-8")
            #         fout.seek(0, 2)
            #     else:
            #         fout = open(outfile, "w", encoding="utf-8")
            #     for line in result:
            #         fout.write(line + "\n")
            #     fout.close()
            #     result = []

    print("len_result = ", len(result))
    with open(outfile, "w", encoding="utf-8") as fout:
        for line in result:
            fout.write(line + "\n")
def tokenize(ip_file_path, op_file_path):
    with open(ip_file_path, 'r') as f:
        with open(op_file_path, "w") as text_file:
            for line in f:
                result_arr = indic_tokenize.trivial_tokenize(line)
                tokenized_sentence = ' '.join(result_arr)
                text_file.write(tokenized_sentence)
def preprocess_line(line, normalizer, lang, transliterate=False):
    if lang == "en":
        return " ".join(
            en_tok.tokenize(en_normalizer.normalize(line.strip()),
                            escape=False))
    elif transliterate:
        # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            " ".join(
                indic_tokenize.trivial_tokenize(
                    normalizer.normalize(line.strip()), lang)),
            lang,
            "hi",
        ).replace(" ् ", "्")
    else:
        # we only need to transliterate for joint training
        return " ".join(
            indic_tokenize.trivial_tokenize(normalizer.normalize(line.strip()),
                                            lang))
def update_stats(caption: str, stats: dict) -> None:
    tokens = indic_tokenize.trivial_tokenize(caption)
    n_tokens = len(tokens)
    stats['maxlen'] = max(stats['maxlen'], n_tokens)
    for key in stats.keys():
        if key == 'maxlen':
            continue
        if check_in_range(key, n_tokens):
            stats[key] += 1
    return
def indic_tokenizer(self, text, lang):
    '''Tokenizer for indic nlp'''
    # Tokenize
    tokens = indic_tokenize.trivial_tokenize(text=text, lang=lang)
    # Normalize
    for i in range(len(tokens)):
        tokens[i] = self.normalizers[lang].normalize(tokens[i])
    return tokens
def preprocess_sent(text, lang):
    """
    Pre-process text (normalization and tokenization).

    text: text string to preprocess
    lang: language code (2-letter ISO code)

    returns the list of processed tokens
    """
    normalizer = normalizer_factory.get_normalizer(lang)
    return indic_tokenize.trivial_tokenize(
        normalizer.normalize(text.replace('\n', ' ')), lang)
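A minimal usage sketch for preprocess_sent. It assumes the module-level normalizer_factory is an IndicNormalizerFactory instance (the snippet references it but does not show its construction); the sample sentence is arbitrary.

# Assumed setup for the module-level normalizer_factory used above.
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

normalizer_factory = IndicNormalizerFactory()

tokens = preprocess_sent('राजधानी दिल्ली में आज बारिश हुई ।', 'hi')
print(tokens)  # list of normalized tokens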
def preprocess_line(line, normalizer, lang, transliterate=False):
    if lang == "en":
        # this uses the cleaner for vi text and is important for the en-vi
        # dataset
        # TODO: how to not include this for other language cleaning
        # line = fix_contents(line)
        return " ".join(
            en_tok.tokenize(en_normalizer.normalize(line.strip()),
                            escape=False))
    elif lang == "vi":
        line = fix_contents(line)
        sentences = rdrsegmenter.tokenize(line)
        tokenized_sentence = join_tokenized_sentence_list(sentences)
        return tokenized_sentence
    elif lang == "bg":
        sentences = bg_cube(line)
        tokenized_sentence = join_tokenized_sentence_list(sentences,
                                                          cube_tokenized=True)
        return tokenized_sentence
    elif lang == "tr":
        sentences = tr_cube(line)
        tokenized_sentence = join_tokenized_sentence_list(sentences,
                                                          cube_tokenized=True)
        return tokenized_sentence
    elif transliterate:
        # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            " ".join(
                indic_tokenize.trivial_tokenize(
                    normalizer.normalize(line.strip()), lang)),
            lang,
            "hi",
        ).replace(" ् ", "्")
    else:
        # we only need to transliterate for joint training
        return " ".join(
            indic_tokenize.trivial_tokenize(normalizer.normalize(line.strip()),
                                            lang))
def pre_process_hindi_sentence(line):
    remove_nuktas = False
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer("hi", remove_nuktas)
    line = normalizer.normalize(line)
    line = clean_text(line)
    tokens = list()
    for t in indic_tokenize.trivial_tokenize(line):
        tokens.append(t)
    line = tokens
    line = [word.lower() for word in line]
    line = [word for word in line if not re.search(r'\d', word)]
    line = ' '.join(line)
    return line
def preprocess_sent(sent, normalizer, lang):
    if lang == "en":
        return " ".join(
            en_tok.tokenize(en_normalizer.normalize(sent.strip()),
                            escape=False))
    else:
        # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            " ".join(
                indic_tokenize.trivial_tokenize(
                    normalizer.normalize(sent.strip()), lang)),
            lang,
            "hi",
        ).replace(" ् ", "्")
def doc2vec(txt, lang, emb):
    """
    A doc is represented as the mean of the word vectors of its
    constituent words.
    """
    normalizer = normalizer_factory.get_normalizer(lang)
    normed_txt = normalizer.normalize(txt.replace('\n', ' '))
    words = indic_tokenize.trivial_tokenize(normed_txt, lang)
    word_vecs = [emb[word] for word in words if word in emb]
    if len(word_vecs) > 0:
        doc_vec = np.mean(np.array(word_vecs), axis=0)
    else:
        doc_vec = np.zeros(emb.vector_size)
    return doc_vec
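A hedged usage sketch for doc2vec: it assumes a gensim KeyedVectors model loaded from a hypothetical 'hi.vec' file and the same module-level normalizer_factory as above; none of these names come from the original snippet.

# Hypothetical setup; 'hi.vec' is a placeholder path to word2vec-format vectors.
import numpy as np
from gensim.models import KeyedVectors
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

normalizer_factory = IndicNormalizerFactory()
emb = KeyedVectors.load_word2vec_format('hi.vec')

vec = doc2vec('यह एक उदाहरण वाक्य है ।', 'hi', emb)
print(vec.shape)  # (emb.vector_size,)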
def get_token(filepath: str = ''):
    global tokens
    if not sentences:
        get_sentences(filepath)
    tok_list = list()
    for sentence in sentences:
        sentence = sentence.replace('\u200d', ' ')
        sentence = sentence.replace('\n', ' ')
        toks = ['<sos>'] + indic_tokenize.trivial_tokenize(
            sentence, lang='hi') + ['<eos>']
        tok_list.append(toks)
    tokens = tok_list
    return tokens
def old_preprocess(infname, outfname, lang):
    """
    Prepare each corpus file:
      - Normalization
      - Tokenization
      - Script conversion to Devanagari for Indic scripts
    """
    n = 0
    num_lines = sum(1 for line in open(infname, "r"))
    # reading
    with open(infname, "r", encoding="utf-8") as infile, \
         open(outfname, "w", encoding="utf-8") as outfile:

        if lang == "en":
            en_tok = MosesTokenizer(lang="en")
            en_normalizer = MosesPunctNormalizer()
            for line in tqdm(infile, total=num_lines):
                outline = " ".join(
                    en_tok.tokenize(en_normalizer.normalize(line.strip()),
                                    escape=False))
                outfile.write(outline + "\n")
                n += 1
        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)
            for line in tqdm(infile, total=num_lines):
                outline = (unicode_transliterate.UnicodeIndicTransliterator.
                           transliterate(
                               " ".join(
                                   indic_tokenize.trivial_tokenize(
                                       normalizer.normalize(line.strip()),
                                       lang)),
                               lang,
                               "hi",
                           ).replace(" ् ", "्"))
                outfile.write(outline + "\n")
                n += 1
    return n
def process(filename, maxlen=250):
    lines = open(filename, "r", encoding="utf-8")  # .read().split("\n")
    new = []
    word_counts = []

    # Pre-tokenize replacements
    regs = {"[‘’]": "'", '[“”]': '"', r"[\s]+": " "}
    for line in tqdm(lines):
        line = line.strip()
        for reg in regs:
            line = re.sub(reg, regs[reg], line)
        new.append(line)
    lines.close()
    print(f"finished pre-processing {filename}")

    tokenized = []
    # Add a post-tokenize replacements section if needed
    reg = "[०-९]"
    for line in new:
        tokens = indic_tokenize.trivial_tokenize(line)
        if len(tokens) < maxlen:
            tok = " ".join(tokens)
            tokenized.append(re.sub(reg, "#", tok))
            word_counts.append(len(tokens))
        else:
            tokenized.append("")
            # A quicker way would be to load (title, article) pairs into this
            # function and skip a pair entirely whenever an article's length
            # exceeds maxlen
    print(f"finished tokenizing and post-processing {filename}")

    sort = sorted(word_counts, reverse=True)
    print(f"the 10 longest sequence lengths in {filename} are {sort[:10]}")
    print(f"average length: {sum(word_counts)/len(word_counts)}")
    return tokenized
def generate_data_dictionary(paths, destination='data.pickle'):
    # dictionary to be saved
    return_dict = {'tokens': []}

    # iterate over each file corresponding to the paths
    for path in tqdm(paths):
        if path.endswith('.txt'):
            # read the file
            with open(path, 'r', encoding='utf-8') as file:
                text = file.read().replace('\n', '').replace('·', '')
            # tokenize the text
            token_list = indic_tokenize.trivial_tokenize(text, lang='hi')
            # commit the new tokens to the return dict
            return_dict['tokens'].append(token_list)

    # pickle the result
    with open(destination, 'wb') as pickle_file:
        pickle.dump(return_dict, pickle_file)
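An illustrative call and round trip for generate_data_dictionary; the input paths and pickle name are placeholders, and pickle is assumed to be imported as in the function itself.

# Placeholder paths; any UTF-8 Hindi .txt files would do.
generate_data_dictionary(['corpus/part1.txt', 'corpus/part2.txt'],
                         destination='hindi_tokens.pickle')

with open('hindi_tokens.pickle', 'rb') as f:
    data = pickle.load(f)
print(len(data['tokens']))  # one token list per input .txt file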
def get_vocab(corpus: List) -> Tuple:
    word_to_index = dict()
    index_to_word = dict()

    word_to_index['pad'] = 0
    word_to_index['start'] = 1
    word_to_index['end'] = 2
    word_to_index['unk'] = 3

    index_to_word[0] = 'pad'
    index_to_word[1] = 'start'
    index_to_word[2] = 'end'
    index_to_word[3] = 'unk'

    start_idx = 4
    for img in tqdm(corpus):
        tokens = indic_tokenize.trivial_tokenize(img['caption'])
        for tok in tokens:
            if tok not in word_to_index:
                word_to_index[tok] = start_idx
                index_to_word[start_idx] = tok
                start_idx += 1
    return word_to_index, index_to_word
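A short follow-up showing how the returned vocabulary might be used to encode a caption. The single-element corpus and the encode helper are illustrative only, mirroring the {'caption': ...} format the function itself expects.

# Illustrative encoding with the vocabulary built above.
corpus = [{'caption': 'एक आदमी घोड़े पर सवार है ।'}]
word_to_index, index_to_word = get_vocab(corpus)

def encode(caption):
    tokens = indic_tokenize.trivial_tokenize(caption)
    return ([word_to_index['start']]
            + [word_to_index.get(tok, word_to_index['unk']) for tok in tokens]
            + [word_to_index['end']])

print(encode('एक आदमी घोड़े पर सवार है ।'))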
def main():
    data = ""
    with open("textdata.txt") as fp:
        data = fp.read()

    print("Tokenizing....")
    token = indic_tokenize.trivial_tokenize(data)

    print("Size of token before eliminating punctuation: {0}".format(len(token)))
    remove_punc(token)
    print("Size of token after eliminating punctuation: {0}".format(len(token)))
    print("#" * 100)

    print("Size of token before filtering stop words: {0}".format(len(token)))
    token = filter_token(token)
    print("Size of token after filtering stop words: {0}".format(len(token)))
    print("Token:\n{0}".format(token))

    print("Training data 'hindi.pos' from nltk......")
    # Training data
    train_data = indian.tagged_sents('hindi.pos')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)

    POS_TAG = tnt_pos_tagger.tag(token)
    for each_tag in POS_TAG:
        print(each_tag)
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()

        # source (English) side
        line = pair[0]
        # print(line)
        line = normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        line = line.split()
        line = [word.lower() for word in line]
        line = [word.translate(table) for word in line]
        line = [re_print.sub('', w) for w in line]
        line = [word for word in line if word.isalpha()]
        clean_pair.append(' '.join(line))

        # target (Indic) side
        line = pair[1]
        # print(line)
        line = factory.normalize(line)
        line = clean_text(line)
        tokens = list()
        for t in indic_tokenize.trivial_tokenize(line):
            tokens.append(t)
        line = tokens
        line = [word.lower() for word in line]
        line = [word for word in line if not re.search(r'\d', word)]
        clean_pair.append(' '.join(line))

        print(clean_pair)
        cleaned.append(clean_pair)
    return array(cleaned)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Use: echo {text} | python tokenize_indic.py {language}

import sys

from indicnlp.tokenize.indic_tokenize import trivial_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer(
    sys.argv[1], remove_nuktas=False, nasals_mode='do_nothing')

for line in sys.stdin:
    normalized_line = normalizer.normalize(line.strip())
    tokenized_line = ' '.join(trivial_tokenize(normalized_line, sys.argv[1]))
    print(tokenized_line)
import pickle

from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

remove_nuktas = False
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("hi", remove_nuktas)

# read the Hindi side of the parallel corpus as UTF-8 text
hin = open('./model/dataset/en-hi.hi', encoding='utf-8').readlines()
print(hin[:5])

hin = [normalizer.normalize(line.strip()) for line in hin]
hin = [indic_tokenize.trivial_tokenize(line) for line in hin]
print(hin[:5])

with open("hindi_tokens.txt", "wb") as fp:
    pickle.dump(hin, fp)
def noramlize_and_tok_text(sent):
    normalized = normalizer.normalize(sent)
    processed = ' '.join(trivial_tokenize(normalized, lang))
    return processed
            tokens = analyzer.morph_analyze_document(
                line.decode('utf-8').strip().split(' '))
            s += ' '.join(tokens).strip().encode('utf-8') + '\n'
        tam = s
        print(" Morphological analysis complete")
        with open(m + '.TA', 'w+') as f:
            f.write(tam)
    else:
        with open(m + '.TA', 'r') as f:
            tam = f.read()

    sent = []
    twv = []
    for e in unicode(tam, "utf-8").split("\n"):
        twv.append(indic_tokenize.trivial_tokenize(e))
    twv = filter(None, twv)

    print(" Input ready for " + m + " algorithm, Starting...")
    modeltam = wv.Word2Vec(twv, size=100, window=w, workers=1,
                           batch_words=25, min_count=1)
    modeltam.save("embedding/" + m + str(w))
    print(" " + m + " model created and saved successfully!")
else:
    os.system("eval.py " + str(w))
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

# Initialize the Indic NLP library
from indicnlp import loader
loader.load()

# Tokenization
from indicnlp.tokenize import indic_tokenize

indic_string = 'अनूप,अनूप?।फोन'
print('Input String: {}'.format(indic_string))
print('Tokens: ')
for t in indic_tokenize.trivial_tokenize(indic_string):
    print(t)

from indicnlp.morph import unsupervised_morph
from indicnlp import common

# This step will call the service, which is very slow
indic_string = 'आपल्या हिरड्यांच्या आणि दातांच्यामध्ये जीवाणू असतात .' + 'अनूप,अनूप?।फोन'
indic_res1 = indic_tokenize.trivial_tokenize(indic_string)
print(type(indic_res1))
print("indic_res1 = ", indic_res1)

# Word Segmentation
from indicnlp.morph import unsupervised_morph
from indicnlp import common
if __name__ == '__main__':
    if len(sys.argv) < 5:
        print(
            "Usage: python unsupervised_morph.py <infile> <outfile> "
            "<language> <indic_resources_path> [<add_marker>]"
        )
        sys.exit(1)

    language = sys.argv[3]
    common.INDIC_RESOURCES_PATH = sys.argv[4]

    add_marker = False
    if len(sys.argv) == 6:
        add_marker = True if sys.argv[5] == 'True' else False

    print('Loading morph analyser for ' + language)
    analyzer = UnsupervisedMorphAnalyzer(language, add_marker)
    print('Loaded morph analyser for ' + language)

    with codecs.open(sys.argv[1], 'r', 'utf-8') as ifile:
        with codecs.open(sys.argv[2], 'w', 'utf-8') as ofile:
            for line in ifile.readlines():
                line = line.strip()
                tokens = indic_tokenize.trivial_tokenize(line)
                morph_tokens = analyzer.morph_analyze_document(tokens)
                ofile.write(' '.join(morph_tokens))
                ofile.write('\n')
# token = u'{}_E_'.format(token)
# out_tokens.append(token)
# return out_tokens

if __name__ == '__main__':
    if len(sys.argv) < 5:
        print("Usage: python unsupervised_morph.py <infile> <outfile> "
              "<language> <indic_resources_path> [<add_marker>]")
        sys.exit(1)

    language = sys.argv[3]
    common.INDIC_RESOURCES_PATH = sys.argv[4]

    add_marker = False
    if len(sys.argv) == 6:
        add_marker = True if sys.argv[5] == 'True' else False

    analyzer = UnsupervisedMorphAnalyzer(language, add_marker)

    with codecs.open(sys.argv[1], 'r', 'utf-8') as ifile:
        with codecs.open(sys.argv[2], 'w', 'utf-8') as ofile:
            for line in ifile.readlines():
                line = line.strip()
                tokens = indic_tokenize.trivial_tokenize(line)
                morph_tokens = analyzer.morph_analyze_document(tokens)
                ofile.write(' '.join(morph_tokens))
                ofile.write('\n')
def preprocess_mr(text):
    text = normalizer.normalize(text)
    return ' '.join(indic_tokenize.trivial_tokenize(text))
def process(lang, sent):
    normalizer = IndicNormalizerFactory().get_normalizer(lang)
    normalized = normalizer.normalize(sent)
    processed = ' '.join(trivial_tokenize(normalized, lang))
    return processed
def run_tokenize(args):
    for line in args.infile:
        args.outfile.write(' '.join(
            indic_tokenize.trivial_tokenize(line, args.lang)))
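A possible argparse wiring for run_tokenize above; the flag names, positional arguments, and defaults are assumptions for illustration, since the original snippet shows only the handler.

# Hypothetical CLI wiring; argument names are illustrative.
import argparse
import sys

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Tokenize Indic text line by line')
    parser.add_argument('--lang', default='hi')
    parser.add_argument('infile', nargs='?', default=sys.stdin,
                        type=argparse.FileType('r', encoding='utf-8'))
    parser.add_argument('outfile', nargs='?', default=sys.stdout,
                        type=argparse.FileType('w', encoding='utf-8'))
    run_tokenize(parser.parse_args())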