print("Done.")

# Sanity check: load the lookup stored under the "tgt" prefix and push a
# sample sentence through it, printing every intermediate result.
lookup = Lookup(type="gpt2")
lookup.load(file_prefix=os.path.join(output_lookup_folder, "tgt"))
text = "This is a test."

# Plain encode -> decode round trip.
token_ids = lookup.encode(text)
print("Encode: {}".format(token_ids))
recreated_string = lookup.decode(token_ids)
print("Decode: {}".format(recreated_string))

# Token -> id mapping, one tokenized piece at a time.
print("Map w2i:")
tokens = lookup.tokenize(text)
for token in tokens:
    print("\t[{}] = [{}]".format(token,
                                 lookup.convert_tokens_to_ids(token)))

# Id -> token mapping, one encoded id at a time.
print("Map i2w:")
for token_id in token_ids:
    print("\t[{}] = [{}]".format(token_id,
                                 lookup.convert_ids_to_tokens(token_id)))


# Same round trip with add_bos_eos_tokens=True at encode time; decode once
# as-is and once with skip_bos_eos_tokens=True to compare the two outputs.
token_ids = lookup.encode(text, add_bos_eos_tokens=True)
print("Encode with bos/eos: {}".format(token_ids))
recreated_string = lookup.decode(token_ids)
print("Decode with bos/eos: {}".format(recreated_string))
recreated_string = lookup.decode(token_ids, skip_bos_eos_tokens=True)
print("Decode w/o  bos/eos: {}".format(recreated_string))



# Write the special tokens of tgt_lookup using the "tgt" file prefix inside
# the output lookup folder.
tgt_lookup.save_special_tokens(
    file_prefix=os.path.join(output_lookup_folder, "tgt"))

print("Done.")

# Sanity check: load a fresh lookup from the same "tgt" prefix and push a
# sample sentence through it, printing every intermediate result.
lookup = Lookup(type="gpt2")
lookup.load(file_prefix=os.path.join(output_lookup_folder, "tgt"))
text = "This is a test."

# Plain encode -> decode round trip.
token_ids = lookup.encode(text)
print("Encode: {}".format(token_ids))
recreated_string = lookup.decode(token_ids)
print("Decode: {}".format(recreated_string))

# Token -> id mapping, one tokenized piece at a time.
print("Map w2i:")
tokens = lookup.tokenize(text)
for token in tokens:
    print("\t[{}] = [{}]".format(token,
                                 lookup.convert_tokens_to_ids(token)))

# Id -> token mapping, one encoded id at a time.
print("Map i2w:")
for token_id in token_ids:
    print("\t[{}] = [{}]".format(token_id,
                                 lookup.convert_ids_to_tokens(token_id)))

# Same round trip with add_bos_eos_tokens=True at encode time; decode once
# as-is and once with skip_bos_eos_tokens=True to compare the two outputs.
token_ids = lookup.encode(text, add_bos_eos_tokens=True)
print("Encode with bos/eos: {}".format(token_ids))
recreated_string = lookup.decode(token_ids)
print("Decode with bos/eos: {}".format(recreated_string))
recreated_string = lookup.decode(token_ids, skip_bos_eos_tokens=True)
print("Decode w/o  bos/eos: {}".format(recreated_string))
# Example #3
# 0
# Load the lookup stored under the "tgt" prefix and print it before use.
lookup = Lookup(type="gpt2")
lookup.load(file_prefix=os.path.join(output_lookup_folder, "tgt"))
print(lookup)

# Sample sentence: the items of y_text[0] joined with single spaces
# (the original inline note here read "X_text[0]").
text = " ".join(y_text[0])
print("Text: {}".format(text))

# Plain encode -> decode round trip.
token_ids = lookup.encode(text)
print("Encode: {}".format(token_ids))
recreated_string = lookup.decode(token_ids)
print("Decode: {}".format(recreated_string))

# Token -> id mapping, one tokenized piece at a time.
print("Map w2i:")
tokens = lookup.tokenize(text)
for token in tokens:
    print("\t[{}] = [{}]".format(token,
                                 lookup.convert_tokens_to_ids(token)))

# Id -> token mapping, one encoded id at a time.
print("Map i2w:")
for token_id in token_ids:
    print("\t[{}] = [{}]".format(token_id,
                                 lookup.convert_ids_to_tokens(token_id)))

# Encode again with add_bos_eos_tokens=True and show the whole id sequence
# converted back to tokens in a single call.
token_ids = lookup.encode(text, add_bos_eos_tokens=True)
print("Encode with bos/eos: {}".format(token_ids))

print("\t to tokens: {}".format(lookup.convert_ids_to_tokens(token_ids)))

# Decode with skip_bos_eos_tokens=True.
recreated_string = lookup.decode(token_ids, skip_bos_eos_tokens=True)
print("Decode w/o  bos/eos: {}".format(recreated_string))

# Print the lookup once more after the checks above.
print(lookup)