from __future__ import print_function

import os
import sys

import tensorflow as tf
from six.moves import xrange

# Project-local helpers used throughout these snippets. Module-level names
# such as FLAGS, _buckets, create_model, decode_once and the *filename
# globals are defined elsewhere in the original module.
import MRL_Linearizer
import Delinearizer
import data_utils


def testdataiterator():
    """Iterate over the test data, yielding (mrl, nl) pairs."""
    with open(testmrlfilename) as mrlfile, open(testnlfilename) as nlfile:
        # The two files are parallel: line i of the MRL file corresponds to
        # line i of the NL file.
        for mrl in mrlfile:
            nl = nlfile.readline()
            mrl = MRL_Linearizer.linearizeMRL(mrl)
            nl = MRL_Linearizer.stemNL(nl)
            yield mrl, nl
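
# Minimal usage sketch for the iterator above (testmrlfilename and
# testnlfilename are globals from the surrounding module):
#
#   for mrl, nl in testdataiterator():
#       print(mrl + " ||| " + nl)
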
def LinearizedFileCreater(txt):
    lst = ["-", ":", "&", "."]
    with open(txt) as myfile:
        txt_lst = myfile.readlines()
    new_txtlst = []
    for a in txt_lst:
        sentence = []
        for b in range(len(a)):
            # Temporarily escape apostrophes that sit between two letters
            # (e.g. "rock'n'roll") so linearizeMRL does not split on them;
            # the index guard keeps b-1/b+1 from running off the line ends.
            if (0 < b < len(a) - 1 and a[b] == "'"
                    and a[b - 1].isalpha() and a[b + 1].isalpha()):
                sentence.append("§")
            else:
                sentence.append(a[b])
        new_txtlst.append("".join(sentence).strip())

    with open('MRL_EN_TRAIN_YANG_linearizedTEST1.train.LMRL', 'w') as newfile:
        for a in new_txtlst:
            # Linearize, then restore the escaped apostrophes.
            s = MRL_Linearizer.linearizeMRL(a).replace("§", "'")
            finish = []
            for b in range(len(s)):
                # Replace a space with "%" when the preceding character is a
                # letter or one of the symbols in lst, unless the character
                # two positions back is "@"; the index guards avoid the
                # negative-index wrap-around of s[b-1]/s[b-2].
                if (s[b] == " " and b >= 1
                        and (s[b - 1].isalpha() or s[b - 1] in lst)
                        and (b < 2 or s[b - 2] != "@")):
                    finish.append("%")
                else:
                    finish.append(s[b])
            newfile.write("".join(finish) + "\n")
def stemmedFileCreater(txt):
    """Stem every line of the input file and write the result to a new file."""
    with open(txt) as myfile:
        txt_lst = myfile.readlines()

    with open('NL_EN_TEST_stem.txt', 'w') as newfile:
        for a in txt_lst:
            newfile.write(MRL_Linearizer.stemNL(a) + "\n")
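
# Example invocation (hypothetical input path):
#   stemmedFileCreater("NL_EN_TEST.txt")
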
Example #4
def stemmedFileCreater(txt):
    """Variant of the stemming helper above, writing to a tuning-set path."""
    with open(txt) as myfile:
        txt_lst = myfile.readlines()

    with open('Schreibtisch/progTest2/tune.nl', 'w') as newfile:
        for a in txt_lst:
            newfile.write(MRL_Linearizer.stemNL(a) + "\n")
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.en" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.fr" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        def single_sentence_decoding(sentence):

            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)
            # Which bucket does it belong to? Fall back to the largest bucket
            # for sentences longer than any bucket allows, instead of crashing
            # on an empty min().
            matching = [
                b for b in xrange(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ]
            bucket_id = min(matching) if matching else len(_buckets) - 1
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)

            return decode_once(output_logits, rev_fr_vocab)
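
        # decode_once is defined elsewhere in this project. In the underlying
        # TensorFlow seq2seq tutorial, the equivalent greedy readout is roughly
        # (sketch; assumes numpy as np and data_utils.EOS_ID):
        #
        #   outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        #   if data_utils.EOS_ID in outputs:
        #       outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        #   return " ".join(tf.compat.as_str(rev_fr_vocab[o]) for o in outputs)
        #
        # Here it is assumed to also return the iteration counter reported below.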

        # Interactive demo session, or batch-decode the whole test corpus?
        if not FLAGS.demo:
            # Decode the whole test corpus, writing one line per example:
            # "counter|||linearized MRL|||delinearized MRL".
            with open(outmrlfilename, "w+") as mrlf:
                for mrl, sentence in testdataiterator():
                    print("translating: " + str(sentence))

                    # Stem the input sentence.
                    sentence = MRL_Linearizer.stemNL(sentence)

                    value, counter = single_sentence_decoding(sentence)
                    print('Found at iteration: ' + str(counter))
                    print(value)

                    # Append this translation to the output file.
                    mrlf.write(
                        str(counter) + "|||" + value + "|||" +
                        Delinearizer.delinearizer(value) + "\n")
                    mrlf.flush()

            # Create a file containing only the MRLs, once the whole corpus
            # has been decoded; "out.txt" is assumed to hold the
            # counter|||mrl|||delinearized lines written above.
            with open("out.txt") as f:
                with open("nmtout.mrl", "w+") as out:
                    for line in f:
                        tokens = line.split("|||")
                        out.write(tokens[2].replace("$", ""))
        else:
            # Interactive demo: keep reading sentences from stdin until EOF.
            sys.stdout.write("> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
            while sentence:
                sentence = MRL_Linearizer.stemNL(sentence)
                print("translating: " + str(sentence))
                value, counter = single_sentence_decoding(sentence)
                print('Found at iteration: ' + str(counter))
                print(value)
                print("> ", end="")
                sys.stdout.flush()
                sentence = sys.stdin.readline()
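
# Typical entry point for this tutorial-style script (sketch; flag definitions
# and create_model live elsewhere in the original module):
#
#   def main(_):
#       decode()
#
#   if __name__ == "__main__":
#       tf.app.run()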