Exemplo n.º 1
0
def Process(infile, outfile, column=0, max_row=1000000, interval=100000):
    """Morph-analyze one tab-separated column of ``infile`` into ``outfile``.

    Each input line is split on tabs; the text in ``column`` is tokenized
    with ``indic_tokenize.trivial_tokenize`` and morph-analyzed, and the
    analyzed tokens are written space-joined, one output line per input line.

    Args:
        infile: path to the UTF-8 input file (tab-separated columns).
        outfile: path to the UTF-8 output file (overwritten).
        column: index of the tab-separated column to process.
        max_row: retained for backward compatibility; currently unused
            (the incremental-flush logic that used it was disabled).
        interval: progress is printed every ``interval`` lines.
    """
    print("infile = ", infile)
    # NOTE(review): the language code 'hi' (Hindi) is hard-coded here.
    analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('hi')
    result = []
    with open(infile, "r", encoding="utf-8") as fin:
        # Iterate the file directly instead of readlines(): same lines,
        # but without materializing the whole file in memory first.
        for line_num, line in enumerate(fin, start=1):
            if line_num % interval == 0:
                print("processed %d lines " % line_num)
            indic_string = line.split('\t')[column].strip()
            token_list = indic_tokenize.trivial_tokenize(indic_string)
            analyzed_tokens = analyzer.morph_analyze_document(token_list)
            result.append(' '.join(analyzed_tokens))
    print("len_result = ", len(result))
    # Context manager fixes the original's leaked output handle: the file
    # is now closed (and flushed) even if a write raises.
    with open(outfile, "w", encoding="utf-8") as fout:
        for out_line in result:
            fout.write(out_line + "\n")
Exemplo n.º 2
0
def segmentize(ip_file_path, op_file_path, ln):
    """Write the morph-analyzed form of each line of ``ip_file_path``
    to ``op_file_path``, using an analyzer for language code ``ln``.

    Each input line is split on single spaces, morph-analyzed, and the
    resulting tokens are written space-joined (no extra newline added).
    """
    morph = unsupervised_morph.UnsupervisedMorphAnalyzer(ln)
    with open(ip_file_path, 'r') as src, open(op_file_path, "w") as dst:
        for raw_line in src:
            segments = morph.morph_analyze_document(raw_line.split(' '))
            dst.write(' '.join(segments))
Exemplo n.º 3
0
def run_morph(args):
    """Morph-analyze each line of ``args.infile`` into ``args.outfile``.

    ``args`` must provide ``lang`` (language code), an iterable
    ``infile``, and a writable ``outfile``.  Markers are disabled
    (second analyzer argument is False).
    """
    analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer(args.lang, False)
    for raw in args.infile:
        words = raw.strip().split(' ')
        analyzed = analyzer.morph_analyze_document(words)
        args.outfile.write(' '.join(analyzed) + '\n')
Exemplo n.º 4
0
 def findMorphenes(self):
     """Morph-analyze the configured input word file (Kannada, 'kn').

     Reads file paths from the module-level ``parser`` config
     (``words/input_word_file`` and ``morphessor/morpheme_output_file``).
     For each input line, morpheme tokens are computed, padded with '0'
     placeholders so at least two tokens exist, and the second and last
     tokens of ``<original line> <morphemes...>`` are written out,
     one pair per line.
     """
     input_file = parser.get('words', 'input_word_file')
     morpheme_output_file = parser.get('morphessor', 'morpheme_output_file')
     print("Executing Morpheme Analyzer")
     morpheme_analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('kn')
     # Context managers fix the original's leaked file handles: both
     # files are now closed even if analysis or a write raises.
     with open(input_file, 'r', encoding='utf-8') as input_fptr, \
             open(morpheme_output_file, 'w', encoding='utf-8') as output_fptr:
         for line in input_fptr:
             word = line.strip().split(' ')
             # str.split never yields an empty list, so this guard is
             # always true; kept for safety against future changes.
             if len(word) != 0:
                 morpheme_tokens = morpheme_analyzer.morph_analyze_document(
                     word)
                 concatenated_tokens = ' '.join(
                     ' '.join(str(x) for x in w) for w in morpheme_tokens)
                 # Pad so the output always has at least two tokens.
                 if len(concatenated_tokens.split()) == 0:
                     concatenated_tokens += '0 0'
                 if len(concatenated_tokens.split()) == 1:
                     concatenated_tokens += ' 0'
                 concatenated_tokens = line.strip() + ' ' + concatenated_tokens
                 tokens = concatenated_tokens.split(' ')
                 output_fptr.write(tokens[1] + ' ' + tokens[-1])
                 output_fptr.write('\n')
     print("Morpheme Analyzer Execution Finished")
Exemplo n.º 5
0
# -*- coding: utf-8 -*-
from __future__ import print_function
from indicnlp.morph import unsupervised_morph 
from indicnlp import common
import sys

# Point the library at the locally installed resource files.
common.INDIC_RESOURCES_PATH = "/opt/indic_nlp_library/indic_nlp_resources"

analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('ta')

# BUG FIX: the original called .decode('utf-8')/.encode('utf-8') on data
# from a text-mode file, which raises AttributeError on Python 3 (file
# iteration yields str, not bytes).  Open both files as UTF-8 text and
# work with str throughout; iterate the file lazily instead of readlines().
with open(sys.argv[1], 'r', encoding='utf-8') as f:
    with open(sys.argv[1] + '_morph', 'w+', encoding='utf-8') as g:
        for line in f:
            if line:
                tokens = analyzer.morph_analyze_document(
                    line.strip().split(' '))
                print(' '.join(tokens).strip(), file=g)