def postprocess(infname, outfname, input_size, lang, common_lang="hi", transliterate=False):
    """Parse fairseq-interactive output and write detokenized translations.

    Lines of ``infname`` that start with ``H-`` are treated as hypotheses of the
    form ``H-<id>\\t<score>\\t<text>``. Each hypothesis is stored at the position
    given by ``<id>``; the score column is ignored. The resulting sentences are
    detokenized and written to ``outfname``, one per line.

    Args:
        infname: path of the fairseq log file.
        outfname: path of the output file. Sentences that have no hypothesis
            in the log (or an empty one) are written as empty lines.
        input_size: expected number of output sentences.
        lang: target language code. ``"en"`` is detokenized with
            ``MosesDetokenizer``; any other value goes through
            ``indic_detokenize.trivial_detokenize``.
        common_lang: language code of the script the hypotheses are written
            in; only used when ``transliterate`` is true.
        transliterate: when true (and ``lang`` is not ``"en"``), convert each
            sentence from ``common_lang`` to ``lang`` before detokenizing.

    Raises:
        IndexError: if a hypothesis id is not smaller than ``input_size``.
        ValueError: if a hypothesis id is not an integer.
    """
    # One slot per expected sentence; slots without a hypothesis stay empty.
    hypotheses = [""] * input_size

    with open(infname, "r", encoding="utf-8") as infile:
        for line in infile:
            if not line.startswith("H-"):
                continue
            fields = line.strip().split("\t")
            sid = int(fields[0].split("-")[1])
            # strip() removes the trailing tab of an empty hypothesis, leaving
            # only two columns; treat that as an empty translation instead of
            # failing with IndexError.
            hypotheses[sid] = fields[2] if len(fields) > 2 else ""

    if lang == "en":
        en_detok = MosesDetokenizer(lang="en")
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in hypotheses:
                outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
    else:
        xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in hypotheses:
                if transliterate:
                    sent = xliterator.transliterate(sent, common_lang, lang)
                outfile.write(indic_detokenize.trivial_detokenize(sent, lang) + "\n")
def postprocess(sents, lang, common_lang="hi"):
    """Detokenize translated sentences and strip ``<unk>`` markers.

    Args:
        sents: iterable of translated, tokenized sentences.
        lang: target language code. ``"en"`` is detokenized with
            ``MosesDetokenizer``; any other value is first transliterated
            from ``common_lang`` to ``lang`` and then passed through
            ``indic_detokenize.trivial_detokenize``.
        common_lang: language code of the script the sentences are written in
            (only used for non-English targets).

    Returns:
        A list with one detokenized string per input sentence, with every
        occurrence of ``"<unk>"`` removed.
    """
    if lang == "en":
        detokenizer = MosesDetokenizer(lang="en")
        detokenized = [detokenizer.detokenize(text.split(" ")) for text in sents]
    else:
        xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        detokenized = [
            indic_detokenize.trivial_detokenize(
                xliterator.transliterate(text, common_lang, lang), lang
            )
            for text in sents
        ]

    return [text.replace("<unk>", "") for text in detokenized]
def translate_mr(s):
    """Translate an English sentence to Marathi with the OpenNMT model.

    The sentence is lower-cased and Moses-tokenized, segmented with the
    ``subword-nmt`` BPE codes in ``mr-bpe-codes.en``, translated by
    ``onmt_translate`` using ``mr-model.pt`` on GPU 0, stripped of BPE
    markers and finally detokenized for Marathi.

    The intermediate files ``sentence.txt``, ``bpe-sentence.txt``,
    ``bpe-trans.txt`` and ``trans.txt`` are written to the current working
    directory.

    Args:
        s: English input text.

    Returns:
        The result of ``indic_detokenize.trivial_detokenize`` on the
        de-BPE'd model output (``lang='mr'``).

    Raises:
        subprocess.CalledProcessError: if ``subword-nmt`` or
            ``onmt_translate`` exits with a non-zero status.
        FileNotFoundError: if one of those tools is not installed.
    """
    # Local imports keep the block self-contained; the external tools are run
    # through subprocess instead of IPython ``!`` magics so the function also
    # works outside a notebook.
    import re
    import subprocess

    s = s.lower()

    # Apply tokenization
    tokenize = MosesTokenizer('en')
    s = ' '.join(tokenize(s))
    with open('sentence.txt', 'w', encoding='utf-8') as f:
        f.write(s)

    # Apply BPE (stdin/stdout redirection replaces the shell's `<` and `>`).
    with open('sentence.txt', 'rb') as src, open('bpe-sentence.txt', 'wb') as dst:
        subprocess.run(
            ['subword-nmt', 'apply-bpe', '-c', 'mr-bpe-codes.en'],
            stdin=src,
            stdout=dst,
            check=True,
        )

    # Translate using OpenNMT. check=True prevents silently reading a stale
    # output file left over from an earlier call when translation fails.
    subprocess.run(
        [
            'onmt_translate',
            '-model', 'mr-model.pt',
            '-src', 'bpe-sentence.txt',
            '-output', 'bpe-trans.txt',
            '-replace_unk',
            '-gpu', '0',
        ],
        check=True,
    )

    # De-BPE: same pattern the sed command used, applied per line through
    # MULTILINE so `$` matches at the end of every line.
    with open('bpe-trans.txt', encoding='utf-8') as f:
        out = re.sub(r'(@@ )|(@@ ?$)', '', f.read(), flags=re.MULTILINE)
    with open('trans.txt', 'w', encoding='utf-8') as f:
        f.write(out)

    return indic_detokenize.trivial_detokenize(out, lang='mr')
""" # Commented out IPython magic to ensure Python compatibility. # %%capture # #Translating Test Data # !subword-nmt apply-bpe -c mr-bpe-codes.en < processed-test-pmi.txt > bpe-processed-test-pmi.txt # !onmt_translate -model mr-model.pt -src bpe-processed-test-pmi.txt -output bpe-trans-pmi.txt -replace_unk -gpu 0 # !cat bpe-trans-pmi.txt | sed -E 's/(@@ )|(@@ ?$)//g' > trans-pmi.txt """**Detokenize predicted translations**""" from indicnlp.tokenize import indic_detokenize with open('trans-pmi.txt','r') as a,open('detok-trans-pmi.txt','w') as b: for item in a: b.write("%s" % indic_detokenize.trivial_detokenize(item,lang='mr')) """**Create a function to calculate BLEU score given hypothesis and reference files**""" import sacrebleu #Function to calculate BLEU score def score(r,h): ref = [] with open(r,'r') as lines: for i in lines: ref.append(i) hypothesis = [] with open(h,'r') as lines: for i in lines: hypothesis.append(i)
def run_detokenize(args):
    """Detokenize every line of ``args.infile`` and write it to ``args.outfile``.

    Args:
        args: object exposing ``infile`` (an iterable of text lines),
            ``outfile`` (an object with a ``write`` method) and ``lang``
            (the language code handed to
            ``indic_detokenize.trivial_detokenize``).
    """
    emit = args.outfile.write
    language = args.lang
    for raw_line in args.infile:
        detokenized = indic_detokenize.trivial_detokenize(raw_line, language)
        emit(detokenized)
def detok(sentence, lang):
    """Detokenize ``sentence`` according to the language code ``lang``.

    ``"en"`` splits the sentence on whitespace and joins it with
    ``MosesDetokenizer``; ``"ne"`` uses
    ``indic_detokenize.trivial_detokenize``. Any other language code is not
    handled and yields ``None``.
    """
    if lang == "ne":
        return indic_detokenize.trivial_detokenize(sentence, "ne")
    if lang == "en":
        detokenizer = MosesDetokenizer(lang="en")
        return detokenizer.detokenize(sentence.split())
    # Unsupported language: callers receive None.
    return None
from indicnlp.tokenize import indic_detokenize

# Tokenized Hindi sample sentence used to demonstrate the detokenizer.
indic_string = '" सुनो , कुछ आवाज़ आ रही है . " , उसने कहा । '

# Show the raw input first, then the detokenized form.
print('Input String: {}'.format(indic_string))
detokenized_string = indic_detokenize.trivial_detokenize(indic_string, lang='hi')
print('Detokenized String: {}'.format(detokenized_string))