if cnt % 100 == 0:
            print("{} / {} ...".format(cnt, len(cpy_list)))

        try:
            if sys.argv[1] == "bpe":
                src_ids = src_lookup.encode(src_line, add_bos_eos_tokens=True)
            if sys.argv[1] == "gpt2":
                src_ids = src_lookup.encode(src_line, add_bos_eos_tokens=False)

            tgt_ids = tgt_lookup.encode(tgt_line, add_bos_eos_tokens=True)

            if cnt % 100 == 0:
                print("\n+++++++SRC:")
                print(src_line)
                print(src_ids)
                print(src_lookup.decode(src_ids))
                print(src_lookup.decode(src_ids, skip_bos_eos_tokens=True))
                print("+++++++TGT")
                print(tgt_line)
                print(tgt_ids)
                print(tgt_lookup.decode(tgt_ids))
                print(tgt_lookup.decode(tgt_ids, skip_bos_eos_tokens=True))
                print("+++++++\n")

            if len(src_ids) > max_line_tokens_length or len(
                    tgt_ids) > max_line_tokens_length:
                skipped_len += 1
                continue
        except:
            print()
            print(src_line)
# Save the special tokens of both lookups under output_lookup_folder, using
# the "src" and "tgt" file prefixes respectively.
src_prefix = os.path.join(output_lookup_folder, "src")
src_lookup.save_special_tokens(file_prefix=src_prefix)

# The target side is always built as a "gpt2"-type Lookup here.
tgt_lookup = Lookup(type="gpt2")
tgt_prefix = os.path.join(output_lookup_folder, "tgt")
tgt_lookup.save_special_tokens(file_prefix=tgt_prefix)

print("Done.")

# Sanity check: load a fresh "gpt2" Lookup from the "tgt" file prefix and push
# a sample sentence through encode / decode / tokenize, printing each result.
lookup = Lookup(type="gpt2")
lookup.load(file_prefix=os.path.join(output_lookup_folder, "tgt"))

text = "This is a test."

# Encode/decode round trip with default arguments.
token_ids = lookup.encode(text)
print("Encode: {}".format(token_ids))
recreated_string = lookup.decode(token_ids)
print("Decode: {}".format(recreated_string))

# Token -> id, converted one token at a time.
print("Map w2i:")
tokens = lookup.tokenize(text)
for token in tokens:
    print("\t[{}] = [{}]".format(token,
                                 lookup.convert_tokens_to_ids(token)))

# Id -> token, over the ids produced by the default encode() above.
print("Map i2w:")
for token_id in token_ids:
    print("\t[{}] = [{}]".format(token_id,
                                 lookup.convert_ids_to_tokens(token_id)))

# Same round trip, this time asking encode() to add the bos/eos tokens.
token_ids = lookup.encode(text, add_bos_eos_tokens=True)
print("Encode with bos/eos: {}".format(token_ids))
recreated_string = lookup.decode(token_ids)