def testdataiterator():
    """Iterates over the test data; yields (mrl, nl) pairs."""
    with open(testmrlfilename) as mrlfile:
        with open(testnlfilename) as nlfile:
            for index, mrl in enumerate(mrlfile):
                nl = nlfile.readline()
                mrl = MRL_Linearizer.linearizeMRL(mrl)
                nl = MRL_Linearizer.stemNL(nl)
                print(nl)
                yield mrl, nl
def LinearizedFileCreater(txt):
    lst = ["-", ":", "&", "."]
    with open(txt) as myfile:
        txt_lst = myfile.readlines()
    new_txtlst = []
    for a in txt_lst:
        # Protect apostrophes that sit between two letters by replacing them
        # with "§", so the linearizer does not treat them as delimiters.
        sentence = []
        for b in range(len(a)):
            if 0 < b < len(a) - 1 and a[b - 1].isalpha() and a[b] == "'" and a[b + 1].isalpha():
                sentence.append(a[b].replace("'", "§"))
            else:
                sentence.append(a[b])
        new_txtlst.append("".join(sentence).strip())
    with open('MRL_EN_TRAIN_YANG_linearizedTEST1.train.LMRL', 'w') as newfile:
        for a in new_txtlst:
            # Linearize, then restore the protected apostrophes.
            s = MRL_Linearizer.linearizeMRL(a).replace("§", "'")
            # Replace a space with "%" when it follows a letter or one of the
            # characters in lst, unless the character two positions back is "@".
            finish = []
            for b in range(len(s)):
                if b >= 2 and s[b] == " " and (s[b - 1].isalpha() or s[b - 1] in lst) and s[b - 2] != "@":
                    finish.append(s[b].replace(" ", "%"))
                else:
                    finish.append(s[b])
            newfile.write("".join(finish) + "\n")
def stemmedFileCreater(txt):
    with open(txt) as myfile:
        txt_lst = myfile.readlines()
    with open('NL_EN_TEST_stem.txt', 'w') as newfile:
        for a in txt_lst:
            newfile.write(MRL_Linearizer.stemNL(a) + "\n")
# Note: this redefinition shadows the stemmedFileCreater above; only this
# variant (writing to the tune-set path) is in effect once the module is loaded.
def stemmedFileCreater(txt):
    with open(txt) as myfile:
        txt_lst = myfile.readlines()
    with open('Schreibtisch/progTest2/tune.nl', 'w') as newfile:
        for a in txt_lst:
            newfile.write(MRL_Linearizer.stemNL(a) + "\n")
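# Usage sketch (an assumption, not part of the original pipeline): how the
# preprocessing helpers above might be invoked. The input file names are
# hypothetical placeholders; both helpers write to the output paths that are
# hard-coded in their bodies.
def prepare_corpus():
    LinearizedFileCreater("train.mrl")  # hypothetical input path
    stemmedFileCreater("test.nl")       # hypothetical input path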
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.
        mrlf = open(outmrlfilename, "w+")

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.en" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.fr" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        def single_sentence_decoding(sentence):
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            return decode_once(output_logits, rev_fr_vocab)

        if not FLAGS.demo:
            # Decode the whole test corpus.
            for mrl, sentence in testdataiterator():
                print("translating: " + str(sentence))
                # Stem the input sentence.
                sentence = MRL_Linearizer.stemNL(sentence)
                value, counter = single_sentence_decoding(sentence)
                print('Found at iteration: ' + str(counter))
                print(value)
                # Write the translation: iteration ||| linearized MRL ||| delinearized MRL.
                mrlf.write(str(counter) + "|||" + value + "|||"
                           + Delinearizer.delinearizer(value) + "\n")
                mrlf.flush()
            mrlf.close()
            # Create a file containing only the MRLs.
            with open("out.txt") as f:
                with open("nmtout.mrl", "w+") as out:
                    for line in f:
                        tokens = line.split("|||")
                        out.write(tokens[2].replace("$", ""))
        else:
            # Interactive demo: decode sentences read from standard input.
            sys.stdout.write("> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
            while sentence:
                sentence = MRL_Linearizer.stemNL(sentence)
                print("translating: " + str(sentence))
                value, counter = single_sentence_decoding(sentence)
                print('Found at iteration: ' + str(counter))
                print(value)
                print("> ", end="")
                sys.stdout.flush()
                sentence = sys.stdin.readline()
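# Entry-point sketch (an assumption): this module appears to follow the
# structure of the TensorFlow seq2seq translate.py tutorial, where
# tf.app.run() parses the FLAGS and then calls main(). How training is
# dispatched is not shown in this excerpt, so only decode() is called here.
def main(_):
    decode()

if __name__ == "__main__":
    tf.app.run()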