def Process(infile, outfile, column=0, max_row=1000000, interval=100000):
    """Morph-analyze one tab-separated column of *infile* and write results.

    Each input line is split on tabs; field *column* is tokenized with
    ``indic_tokenize.trivial_tokenize`` and run through the Hindi
    UnsupervisedMorphAnalyzer.  One space-joined analysis per input line is
    written to *outfile* (UTF-8, one line per input line).

    :param infile: path of the UTF-8 tab-separated input file.
    :param outfile: path of the UTF-8 output file (overwritten).
    :param column: zero-based index of the field to analyze.
    :param max_row: unused; kept for backward compatibility with existing
        callers (a commented-out chunked-flush mode once used it).
    :param interval: progress is printed every *interval* input lines.
    """
    print("infile = ", infile)
    analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('hi')
    result = []
    with open(infile, "r", encoding="utf-8") as fin:
        # Stream line by line instead of readlines(): same output,
        # no need to hold the whole input file in memory twice.
        for read_line_num, line in enumerate(fin, start=1):
            if read_line_num % interval == 0:
                print("processed %d lines " % read_line_num)
            fields = line.split('\t')
            indic_string = fields[column].strip()
            tokens = indic_tokenize.trivial_tokenize(indic_string)
            analyzed_tokens = analyzer.morph_analyze_document(tokens)
            result.append(' '.join(analyzed_tokens))
    print("len_result = ", len(result))
    # `with` guarantees the output file is flushed and closed; the original
    # opened it without ever closing it.
    with open(outfile, "w", encoding="utf-8") as fout:
        for out_line in result:
            fout.write(out_line + "\n")
def segmentize(ip_file_path, op_file_path, ln):
    """Run unsupervised morph analysis over a space-tokenized file.

    Reads *ip_file_path* line by line, morph-analyzes each line's
    space-separated tokens for language *ln*, and writes the space-joined
    segments to *op_file_path*.
    """
    analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer(ln)
    with open(ip_file_path, 'r') as src, open(op_file_path, "w") as dst:
        for raw_line in src:
            segments = analyzer.morph_analyze_document(raw_line.split(' '))
            dst.write(' '.join(segments))
def run_morph(args):
    """Morph-analyze ``args.infile`` line by line into ``args.outfile``.

    Expects *args* to carry ``lang`` (language code) plus open ``infile``
    and ``outfile`` handles; marker symbols are disabled.
    """
    # add_marker is fixed to False, as in the original.
    analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer(args.lang, False)
    for raw_line in args.infile:
        words = raw_line.strip().split(' ')
        args.outfile.write(' '.join(analyzer.morph_analyze_document(words)) + '\n')
def findMorphenes(self):
    """Run the Kannada morpheme analyzer over the configured word file.

    Reads input/output paths from the module-level ``parser`` config
    (sections ``words`` / ``morphessor``).  For each input line, the
    space-split words are morph-analyzed; the per-token analyses are
    flattened to a space-joined string, padded with ``'0'`` placeholders so
    at least two fields exist, prefixed with the original line, and the
    second and last resulting fields are written to the output file.
    """
    input_file = parser.get('words', 'input_word_file')
    morpheme_output_file = parser.get('morphessor', 'morpheme_output_file')
    print("Executing Morpheme Analyzer")
    morpheme_analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('kn')
    # `with` ensures both handles are closed; the original leaked them.
    with open(input_file, 'r', encoding='utf-8') as input_fptr, \
            open(morpheme_output_file, 'w', encoding='utf-8') as output_fptr:
        for line in input_fptr:
            word = line.strip().split(' ')
            if len(word) != 0:
                morpheme_tokens = morpheme_analyzer.morph_analyze_document(
                    word)
                # Each analysis entry is itself iterable; flatten two levels
                # into one space-separated string.
                concatenated_tokens = ' '.join(' '.join(str(x) for x in w)
                                               for w in morpheme_tokens)
                # Pad with '0' placeholders so tokens[1] and tokens[-1]
                # below are always valid.
                if len(concatenated_tokens.split()) == 0:
                    concatenated_tokens += '0 0'
                if len(concatenated_tokens.split()) == 1:
                    concatenated_tokens += ' 0'
                concatenated_tokens = line.strip() + ' ' + concatenated_tokens
                tokens = concatenated_tokens.split(' ')
                output_fptr.write(tokens[1] + ' ' + tokens[-1])
                output_fptr.write('\n')
    print("Morpheme Analyzer Execution Finished")
# -*- coding: utf-8 -*- from __future__ import print_function from indicnlp.morph import unsupervised_morph from indicnlp import common import sys common.INDIC_RESOURCES_PATH="/opt/indic_nlp_library/indic_nlp_resources" analyzer=unsupervised_morph.UnsupervisedMorphAnalyzer('ta') with open(sys.argv[1], 'r') as f: with open(sys.argv[1]+'_morph', 'w+') as g: for line in f.readlines(): if line: tokens=analyzer.morph_analyze_document(line.decode('utf-8').strip().split(' ')) print(' '.join(tokens).strip().encode('utf-8'), file=g)