print("Done.") # check everything is ok lookup = Lookup(type="gpt2") lookup.load(file_prefix = os.path.join(output_lookup_folder,"tgt")) text = "This is a test." token_ids = lookup.encode(text) print("Encode: {}".format(token_ids)) recreated_string = lookup.decode(token_ids) print("Decode: {}".format(recreated_string)) print("Map w2i:") tokens = lookup.tokenize(text) for i in range(len(tokens)): print("\t[{}] = [{}]".format(tokens[i], lookup.convert_tokens_to_ids(tokens[i]))) print("Map i2w:") for i in range(len(token_ids)): print("\t[{}] = [{}]".format(token_ids[i], lookup.convert_ids_to_tokens(token_ids[i]))) token_ids = lookup.encode(text, add_bos_eos_tokens = True) print("Encode with bos/eos: {}".format(token_ids)) recreated_string = lookup.decode(token_ids) print("Decode with bos/eos: {}".format(recreated_string)) recreated_string = lookup.decode(token_ids, skip_bos_eos_tokens = True) print("Decode w/o bos/eos: {}".format(recreated_string))
# Persist the target-side special tokens, then sanity-check the saved lookup
# by reloading it and round-tripping a sample sentence.
tgt_lookup.save_special_tokens(file_prefix=os.path.join(output_lookup_folder, "tgt"))
print("Done.")

# check everything is ok
lookup = Lookup(type="gpt2")
lookup.load(file_prefix=os.path.join(output_lookup_folder, "tgt"))

text = "This is a test."
token_ids = lookup.encode(text)
print("Encode: {}".format(token_ids))
recreated_string = lookup.decode(token_ids)
print("Decode: {}".format(recreated_string))

# Token -> id mapping, one line per token of the sample sentence.
print("Map w2i:")
tokens = lookup.tokenize(text)
for token in tokens:
    print("\t[{}] = [{}]".format(token, lookup.convert_tokens_to_ids(token)))

# Id -> token mapping, one line per id produced above.
print("Map i2w:")
for token_id in token_ids:
    print("\t[{}] = [{}]".format(token_id, lookup.convert_ids_to_tokens(token_id)))

# Round-trip again with BOS/EOS markers added, decoding both with and
# without skipping those special tokens.
token_ids = lookup.encode(text, add_bos_eos_tokens=True)
print("Encode with bos/eos: {}".format(token_ids))
recreated_string = lookup.decode(token_ids)
print("Decode with bos/eos: {}".format(recreated_string))
recreated_string = lookup.decode(token_ids, skip_bos_eos_tokens=True)
print("Decode w/o bos/eos: {}".format(recreated_string))
lookup = Lookup(type="gpt2") lookup.load(file_prefix=os.path.join(output_lookup_folder, "tgt")) print(lookup) text = " ".join(y_text[0]) # X_text[0] print("Text: {}".format(text)) token_ids = lookup.encode(text) print("Encode: {}".format(token_ids)) recreated_string = lookup.decode(token_ids) print("Decode: {}".format(recreated_string)) print("Map w2i:") tokens = lookup.tokenize(text) for i in range(len(tokens)): print("\t[{}] = [{}]".format(tokens[i], lookup.convert_tokens_to_ids(tokens[i]))) print("Map i2w:") for i in range(len(token_ids)): print("\t[{}] = [{}]".format(token_ids[i], lookup.convert_ids_to_tokens(token_ids[i]))) token_ids = lookup.encode(text, add_bos_eos_tokens=True) print("Encode with bos/eos: {}".format(token_ids)) print("\t to tokens: {}".format(lookup.convert_ids_to_tokens(token_ids))) recreated_string = lookup.decode(token_ids, skip_bos_eos_tokens=True) print("Decode w/o bos/eos: {}".format(recreated_string)) print(lookup)