sys.path.append("../..") import torch from models.util.lookup import Lookup from tqdm import tqdm from itertools import dropwhile import sentencepiece as spm output_lookup_folder = os.path.join("lookup", "gpt2") # create output folder if not os.path.exists(output_lookup_folder): os.makedirs(output_lookup_folder) # CREATE LOOKUPS src_lookup = Lookup(type="gpt2") src_lookup.save_special_tokens( file_prefix=os.path.join(output_lookup_folder, "src")) tgt_lookup = Lookup(type="gpt2") tgt_lookup.save_special_tokens( file_prefix=os.path.join(output_lookup_folder, "tgt")) print("Done.") # check everything is ok lookup = Lookup(type="gpt2") lookup.load(file_prefix=os.path.join(output_lookup_folder, "tgt")) text = "This is a test." token_ids = lookup.encode(text) print("Encode: {}".format(token_ids))
fname = "" else: fname = "-" + MEI.replace(" ", "_") src_lookup_file_prefix = os.path.join("lookup", "bpe", "src" + fname + "-1024") tgt_lookup_file_prefix = os.path.join("lookup", "bpe", "src" + fname + "-1024") if sys.argv[1] == "gpt2": lookup_type = "gpt2" src_lookup_file_prefix = os.path.join("lookup", "gpt2", "src") tgt_lookup_file_prefix = os.path.join("lookup", "gpt2", "tgt") # load lookups try: src_lookup = Lookup(type=lookup_type) src_lookup.load(file_prefix=src_lookup_file_prefix) tgt_lookup = Lookup(type=lookup_type) tgt_lookup.load(file_prefix=tgt_lookup_file_prefix) except: print("ERROR with " + src_lookup_file_prefix) continue data = json.load(open(input_json_file, "r", encoding="utf8")) output_folder = os.path.join("ready", lookup_type) if not os.path.exists(output_folder): os.makedirs(output_folder) # process files import random
min_seq_len_y = min_seq_len_X
max_seq_len_y = max_seq_len_X

#data_folder = os.path.join("..", "..", "data", "cmudict", "ready", "bpe")
#src_lookup_prefix = os.path.join("..", "..", "data", "cmudict", "lookup", "bpe", "src-256")
#tgt_lookup_prefix = os.path.join("..", "..", "data", "cmudict", "lookup", "bpe", "tgt-256")

#data_folder = os.path.join("..", "..", "data", "task2", "ready", "gpt2")
#src_lookup_prefix = os.path.join("..", "..", "data", "task2", "lookup", "gpt2", "src")
#tgt_lookup_prefix = os.path.join("..", "..", "data", "task2", "lookup", "gpt2", "tgt")
#src_lookup = Lookup(type="gpt2")
#tgt_lookup = Lookup(type="gpt2")

data_folder = os.path.join("..", "..", "data", "task2", "ready", "bpe")
src_lookup_prefix = os.path.join("..", "..", "data", "task2", "lookup", "bpe", "src-Business_Ethics-1024")
# the target side reuses the source BPE lookup file
tgt_lookup_prefix = os.path.join("..", "..", "data", "task2", "lookup", "bpe", "src-Business_Ethics-1024")
src_lookup = Lookup(type="bpe")
tgt_lookup = Lookup(type="bpe")
src_lookup.load(src_lookup_prefix)
tgt_lookup.load(tgt_lookup_prefix)

train_loader, valid_loader, test_loader = loader(
    data_folder, batch_size, src_lookup, tgt_lookup,
    min_seq_len_X, max_seq_len_X, min_seq_len_y, max_seq_len_y,
    custom_filename_prefix="Business_Ethics_")

print("Loading done, train instances {}, dev instances {}, test instances {}, vocab size src/tgt {}/{}\n".format(
    len(train_loader.dataset.X), len(valid_loader.dataset.X), len(test_loader.dataset.X),
    len(src_lookup), len(tgt_lookup)))

# ######################################################################
# GPU SELECTION ########################################################
device = select_processing_device(verbose=True)
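# select_processing_device() is project-specific; below is a minimal
# torch-only sketch of the behavior it presumably wraps (pick CUDA when
# available, otherwise fall back to CPU). This is an assumption about the
# helper, not its confirmed implementation.
import torch

def select_processing_device_sketch(verbose=False):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print("Running on {}".format(device))
    return device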
        all_f.write(data[MEI][cpy]["output"] + "\n")

# TRAIN SENTENCEPIECE MODELS & CREATE LOOKUPS
for MEI in data:
    MEI = MEI.replace(" ", "_")
    print("Prep BPE train for: " + MEI)
    try:
        spm.SentencePieceTrainer.Train(
            '--input=' + os.path.join(input_folder, MEI + ".txt") +
            ' --model_prefix=' + os.path.join(output_lookup_folder, "src-" + MEI + "-" + str(input_src_vocab_size)) +
            ' --character_coverage=1.0 --model_type=bpe --num_threads=8'
            ' --split_by_whitespace=true --shuffle_input_sentence=true'
            ' --max_sentence_length=8000 --vocab_size=' + str(input_src_vocab_size))
        print("Done.")
        src_lookup = Lookup(type="bpe")
        src_lookup.save_special_tokens(file_prefix=os.path.join(
            output_lookup_folder, "src-" + MEI + "-" + str(input_src_vocab_size)))
    except Exception:
        print("ERROR, skipping " + MEI)

# also train a single model on the concatenated corpus
spm.SentencePieceTrainer.Train(
    '--input=' + os.path.join(input_folder, "all.txt") +
    ' --model_prefix=' + os.path.join(output_lookup_folder, "src-" + str(input_src_vocab_size)) +
    ' --character_coverage=1.0 --model_type=bpe --num_threads=8'
    ' --split_by_whitespace=true --shuffle_input_sentence=true'
    ' --max_sentence_length=8000 --vocab_size=' + str(input_src_vocab_size))
src_lookup = Lookup(type="bpe")
src_lookup.save_special_tokens(
    file_prefix=os.path.join(output_lookup_folder, "src-" + str(input_src_vocab_size)))
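# A quick way to confirm a trained model is usable: load the generated
# .model file directly with sentencepiece and segment a sample line. A
# minimal sketch; the path mirrors the --model_prefix used above.
sp = spm.SentencePieceProcessor()
sp.Load(os.path.join(output_lookup_folder, "src-" + str(input_src_vocab_size) + ".model"))
print(sp.EncodeAsPieces("This is a test."))
print("vocab size:", sp.GetPieceSize())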
min_seq_len_y = min_seq_len_X
max_seq_len_y = max_seq_len_X

#data_folder = os.path.join("..", "..", "data", "cmudict", "ready", "bpe")
#src_lookup_prefix = os.path.join("..", "..", "data", "cmudict", "lookup", "bpe", "src-256")
#tgt_lookup_prefix = os.path.join("..", "..", "data", "cmudict", "lookup", "bpe", "tgt-256")

#data_folder = os.path.join("..", "..", "data", "cmudict", "ready", "gpt2")
#src_lookup_prefix = os.path.join("..", "..", "data", "cmudict", "lookup", "gpt2", "src")
#tgt_lookup_prefix = os.path.join("..", "..", "data", "cmudict", "lookup", "gpt2", "tgt")

data_folder = os.path.join("..", "..", "data", "task2", "ready", "gpt2")
src_lookup_prefix = os.path.join("..", "..", "data", "task2", "lookup", "gpt2", "src")
tgt_lookup_prefix = os.path.join("..", "..", "data", "task2", "lookup", "gpt2", "tgt")
src_lookup = Lookup(type="gpt2")
src_lookup.load(src_lookup_prefix)
tgt_lookup = Lookup(type="gpt2")
tgt_lookup.load(tgt_lookup_prefix)

train_loader, valid_loader, test_loader = loader(
    data_folder, batch_size, src_lookup, tgt_lookup,
    min_seq_len_X, max_seq_len_X, min_seq_len_y, max_seq_len_y,
    custom_filename_prefix="Business_Ethics_")

print("Loading done, train instances {}, dev instances {}, test instances {}, vocab size src/tgt {}/{}\n".format(
    len(train_loader.dataset.X), len(valid_loader.dataset.X), len(test_loader.dataset.X),
    len(src_lookup), len(tgt_lookup)))
#src_lookup_prefix = os.path.join("..", "..", "data", "cmudict", "lookup", "bpe", "src-256")
#tgt_lookup_prefix = os.path.join("..", "..", "data", "cmudict", "lookup", "bpe", "tgt-256")

#data_folder = os.path.join("..", "..", "data", "cmudict", "ready", "gpt2")
#src_lookup_prefix = os.path.join("..", "..", "data", "cmudict", "lookup", "gpt2", "src")
#tgt_lookup_prefix = os.path.join("..", "..", "data", "cmudict", "lookup", "gpt2", "tgt")

#data_folder = os.path.join("..", "..", "data", "task2", "ready", "gpt2")
#src_lookup_prefix = os.path.join("..", "..", "data", "task2", "lookup", "gpt2", "src")
#tgt_lookup_prefix = os.path.join("..", "..", "data", "task2", "lookup", "gpt2", "tgt")
#src_lookup = Lookup(type="gpt2")
#tgt_lookup = Lookup(type="gpt2")

data_folder = os.path.join("..", "..", "data", "task2", "ready", "bpe")
src_lookup_prefix = os.path.join("..", "..", "data", "task2", "lookup", "bpe", "src-Business_Ethics-1024")
tgt_lookup_prefix = os.path.join("..", "..", "data", "task2", "lookup", "bpe", "src-Business_Ethics-1024")
src_lookup = Lookup(type="bpe")
tgt_lookup = Lookup(type="bpe")
src_lookup.load(src_lookup_prefix)
tgt_lookup.load(tgt_lookup_prefix)

train_loader, valid_loader, test_loader = loader(
    data_folder, batch_size, src_lookup, tgt_lookup,
    min_seq_len_X, max_seq_len_X, min_seq_len_y, max_seq_len_y,
    custom_filename_prefix="Business_Ethics_")

print("Loading done, train instances {}, dev instances {}, test instances {}, vocab size src/tgt {}/{}\n".format(
    len(train_loader.dataset.X), len(valid_loader.dataset.X), len(test_loader.dataset.X),
    len(src_lookup), len(tgt_lookup)))

# ######################################################################
# GPU SELECTION ########################################################
if sys.argv[1] == "gpt2": lookup_type = "gpt2" src_lookup_file_prefix = os.path.join("lookup", "gpt2", "src") tgt_lookup_file_prefix = os.path.join("lookup", "gpt2", "tgt") input_src_file = os.path.join("raw", "JRC-Acquis.en-fr.fr") input_tgt_file = os.path.join("raw", "JRC-Acquis.en-fr.en") output_folder = os.path.join("ready", lookup_type) max_line_tokens_length = 1000 validation_fraction = 0.005 test_fraction = 0.0125 full_data_fraction = 1. # load lookups src_lookup = lookup = Lookup(type=lookup_type) src_lookup.load(file_prefix=src_lookup_file_prefix) tgt_lookup = lookup = Lookup(type=lookup_type) tgt_lookup.load(file_prefix=tgt_lookup_file_prefix) # create output folder if not os.path.exists(output_folder): os.makedirs(output_folder) # process files import random print("Creating train dev and test files ...") train_X = [] train_y = []