def build_text_encoder():
    """Create the text encoder and register every special token in it.

    The encoder is constructed from ``path_encoder`` and ``path_bpe``.
    Each token returned by ``get_special_tokens()`` is then appended to
    the vocabulary: it receives the next free integer id (the current
    size of the token -> id mapping) and is written to both the
    ``encoder`` (token -> id) and ``decoder`` (id -> token) dicts.

    Special tokens are not meant to be produced through ``.encode()``;
    look them up directly in the returned object's ``encoder`` dict.

    Returns:
        The ``TextEncoder`` instance with the special tokens added.
    """
    bpe_encoder = TextEncoder(path_encoder, path_bpe)
    token_to_id = bpe_encoder.encoder
    id_to_token = bpe_encoder.decoder

    for token in get_special_tokens():
        # The next unused id is simply the current vocabulary size.
        next_id = len(token_to_id)
        id_to_token[next_id] = token
        token_to_id[token] = next_id

    return bpe_encoder
def load_data(dataset, opt):
    """Load a dataset's data loader and a text encoder sharing its vocabulary.

    Args:
        dataset: Name of the dataset, either ``"atomic"`` or
            ``"conceptnet"``.
        opt: Options object forwarded unchanged to the dataset-specific
            loading function.

    Returns:
        A ``(data_loader, text_encoder)`` tuple. The text encoder is built
        from the BPE files under ``model/`` and then has its ``encoder``
        and ``decoder`` mappings replaced by the data loader's
        ``vocab_encoder`` and ``vocab_decoder``.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    if dataset == "atomic":
        data_loader = load_atomic_data(opt)
    elif dataset == "conceptnet":
        data_loader = load_conceptnet_data(opt)
    else:
        # Fail early with a clear message instead of reaching the code
        # below with ``data_loader`` never having been bound.
        raise ValueError(
            "Unknown dataset {!r}; expected 'atomic' or 'conceptnet'".format(
                dataset))

    # Initialize TextEncoder
    encoder_path = "model/encoder_bpe_40000.json"
    bpe_path = "model/vocab_40000.bpe"
    text_encoder = TextEncoder(encoder_path, bpe_path)

    # Use the vocabulary stored with the data loader.
    text_encoder.encoder = data_loader.vocab_encoder
    text_encoder.decoder = data_loader.vocab_decoder

    return data_loader, text_encoder
def load_data(dataset, opt, LoaderPath=""):
    """Load a dataset's data loader and a text encoder sharing its vocabulary.

    Args:
        dataset: Name of the dataset, either ``"atomic"`` or
            ``"conceptnet"``.
        opt: Options object forwarded unchanged to the dataset-specific
            loading function.
        LoaderPath: Extra argument passed to ``load_atomic_data`` only;
            it is not used for ``"conceptnet"``.

    Returns:
        A ``(data_loader, text_encoder)`` tuple. The text encoder is built
        from the BPE files under ``model/`` and then has its ``encoder``
        and ``decoder`` mappings replaced by the data loader's
        ``vocab_encoder`` and ``vocab_decoder``.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    if dataset == "atomic":
        data_loader = load_atomic_data(opt, LoaderPath)
    elif dataset == "conceptnet":
        data_loader = load_conceptnet_data(opt)
    else:
        # Fail early with a clear message instead of reaching the code
        # below with ``data_loader`` never having been bound.
        raise ValueError(
            "Unknown dataset {!r}; expected 'atomic' or 'conceptnet'".format(
                dataset))

    # Initialize TextEncoder
    encoder_path = "model/encoder_bpe_40000.json"
    bpe_path = "model/vocab_40000.bpe"
    text_encoder = TextEncoder(encoder_path, bpe_path)

    # Use the vocabulary stored with the data loader.
    text_encoder.encoder = data_loader.vocab_encoder
    text_encoder.decoder = data_loader.vocab_decoder

    return data_loader, text_encoder
def main(num):
    """Train a model on ATOMIC for experiment number ``num``.

    Steps, in order: generate and read the experiment configuration, seed
    the random number generators, load the preprocessed data loader from
    its pickle, build the model, optionally move it to the GPU, and run
    the trainer.

    Args:
        num: Experiment identifier; selects
            ``config/atomic/config_<num>.json``.
    """
    # Generate configuration files depending on experiment being run
    utils.generate_config_files("atomic", num)

    # Loads the correct configuration file
    config_file = "config/atomic/config_{}.json".format(num)

    print(config_file)

    # Read config file to option
    config = cfg.read_config(cfg.load_config(config_file))
    opt, meta = cfg.get_parameters(config)

    # Set the random seeds
    torch.manual_seed(opt.train.static.seed)
    random.seed(opt.train.static.seed)
    if config.gpu_mode:
        torch.cuda.manual_seed_all(opt.train.static.seed)

    opt.train.dynamic.epoch = 0

    print("Loading Data")

    categories = opt.data.categories

    # Where to find the data
    path = "data/atomic/processed/{}/{}.pickle".format(
        opt.exp, utils.make_name_string(opt.data))

    data_loader = data.make_data_loader(opt, categories)
    data_loader.load_data(path)
    print(data_loader.sequences["train"]["total"].size(0))
    data_loader.opt = opt
    data_loader.batch_size = opt.train.dynamic.bs

    print("Done.")

    # Initialize text_encoder
    text_encoder = TextEncoder(config.encoder_path, config.bpe_path)

    # Special tokens: start/end markers, one "<category>" token per
    # category, and the blank token. Only their count is used below.
    special = [data.start_token, data.end_token]
    special += ["<{}>".format(cat) for cat in categories]
    special += [data.blank_token]

    # Use the vocabulary stored with the data loader.
    text_encoder.encoder = data_loader.vocab_encoder
    text_encoder.decoder = data_loader.vocab_decoder

    opt.data.maxe1 = data_loader.max_event
    opt.data.maxe2 = data_loader.max_effect
    opt.data.maxr = data.atomic_data.num_delimiter_tokens["category"]

    n_special = len(special)
    n_ctx = opt.data.maxe1 + opt.data.maxe2
    # The size handed to the model is the token vocabulary plus n_ctx
    # extra entries (presumably position indices -- confirm in models).
    n_vocab = len(text_encoder.encoder) + n_ctx

    print(data_loader.__dict__.keys())
    opt.net.vSize = n_vocab

    print("Building Model")

    model = models.make_model(
        opt, n_vocab, n_ctx, n_special,
        load=(opt.net.init == "pt"))

    print("Done.")

    print("Files will be logged at: {}".format(
        utils.make_name(opt, prefix="results/losses/",
                        is_dir=True, eval_=True)))

    data_loader.reset_offsets("train")

    # Get number of examples
    data.set_max_sizes(data_loader)

    if config.gpu_mode:
        print("Pushing to GPU: {}".format(config.gpu_index))
        cfg.device = config.gpu_index
        cfg.do_gpu = True
        torch.cuda.set_device(cfg.device)
        if config.multigpu:
            model = models.multi_gpu(
                model, config.gpu_indices).cuda()
        else:
            model.cuda(cfg.device)
        print("Done.")

    print("Training")

    optimizer = OpenAIAdam(model.parameters(),
                           lr=opt.train.dynamic.lr,
                           schedule=opt.train.static.lrsched,
                           warmup=opt.train.static.lrwarm,
                           t_total=meta.iterations,
                           b1=opt.train.static.b1,
                           b2=opt.train.static.b2,
                           e=opt.train.static.e,
                           l2=opt.train.static.l2,
                           vector_l2=opt.train.static.vl2,
                           max_grad_norm=opt.train.static.clip)

    trainer = train.make_trainer(
        opt, meta, data_loader, model, optimizer)
    trainer.set_evaluator(opt, model, data_loader)

    trainer.run()
prefix="results/{}/".format("losses"), is_dir=True, eval_=True), split, "pickle") print("Will save {} losses to {}".format(split, results_name)) path = "data/atomic/processed/generation/{}.pickle".format( utils.make_name_string(opt.data).replace( "kr_{}".format(opt.data.get("kr", 1)), "kr_1")) data_loader = data.make_data_loader(opt, opt.data.categories) loaded = data_loader.load_data(path) data_loader.batch_size = opt.train.dynamic.bs print("Done.") text_encoder = TextEncoder(config.encoder_path, config.bpe_path) # Set special tokens formatted_categories = ["<{}>".format(cat) for cat in opt.data.categories] special = [data.start_token, data.end_token] special += formatted_categories special += [data.blank_token] # Load vocab encoder and decoder from pre-initialized data_loader text_encoder.encoder = data_loader.vocab_encoder text_encoder.decoder = data_loader.vocab_decoder # Get component segmentation of sequences # context_size_event = maximum size of an event description # context_size_effect = maximum size of an event effect/intent/etc.
def main(num):
    """Train a model on ConceptNet for experiment number ``num``.

    Steps, in order: generate and read the experiment configuration, seed
    the random number generators, load (or build) the data loader
    tensors, build the model, optionally move it to the GPU, and run the
    trainer.

    Args:
        num: Experiment identifier; selects
            ``config/conceptnet/config_<num>.json``.
    """
    # Generate configuration files depending on experiment being run
    utils.generate_config_files("conceptnet", num)

    # Loads the correct configuration file
    config_file = "config/conceptnet/config_{}.json".format(num)

    print(config_file)

    # Read config file to option
    config = cfg.read_config(cfg.load_config(config_file))
    opt, meta = cfg.get_parameters(config)

    # Set the random seeds
    torch.manual_seed(opt.train.static.seed)
    random.seed(opt.train.static.seed)
    if config.gpu_mode:
        torch.cuda.manual_seed_all(opt.train.static.seed)

    # Load the data
    opt.train.dynamic.epoch = 0

    print("Loading Data")

    # Initialize path to pre-set data loader
    path = "data/conceptnet/processed/{}/{}.pickle".format(
        opt.exp, utils.make_name_string(opt.data))

    # Make data loader
    data_loader = data.make_data_loader(opt)
    loaded = data_loader.load_data(path)
    print(data_loader.sequences["train"]["total"].size(0))
    data_loader.opt = opt
    data_loader.batch_size = opt.train.dynamic.bs

    print("Done.")

    text_encoder = TextEncoder(config.encoder_path, config.bpe_path)

    categories = data.conceptnet_data.conceptnet_relations

    # Special tokens: start/end markers plus one "<relation>" token per
    # ConceptNet relation.
    special = [data.start_token, data.end_token]
    special += ["<{}>".format(cat) for cat in categories]

    if loaded:
        # Use the vocabulary stored with the data loader.
        text_encoder.encoder = data_loader.vocab_encoder
        text_encoder.decoder = data_loader.vocab_decoder
    else:
        # Nothing was loaded: append the special tokens to the
        # vocabulary and build the tensors. Each token takes the next
        # free id, i.e. the current size of the encoder mapping.
        for special_token in special:
            token_id = len(text_encoder.encoder)
            text_encoder.decoder[token_id] = special_token
            text_encoder.encoder[special_token] = token_id
        data_loader.make_tensors(text_encoder, special)

    # Set max size of different parts of relation
    context_size_e1 = data_loader.max_e1
    context_size_e2 = data_loader.max_e2
    context_size_r = data_loader.max_r

    opt.data.maxr = context_size_r

    n_special = len(special)
    n_ctx = context_size_e1 + context_size_r + context_size_e2
    # The size handed to the model is the token vocabulary plus n_ctx
    # extra entries (presumably position indices -- confirm in models).
    n_vocab = len(text_encoder.encoder) + n_ctx

    print(data_loader.__dict__.keys())
    opt.net.vSize = n_vocab

    # Build Model
    print("Building Model")

    model = models.make_model(
        opt, n_vocab, n_ctx, n_special,
        load=(opt.net.init == "pt"))

    print("Done.")

    print("Files will be logged at: {}".format(
        utils.make_name(opt, prefix="results/losses/",
                        is_dir=True, eval_=True)))

    data_loader.reset_offsets("train", keys=["total"])

    data.set_max_sizes(data_loader)

    # Push to GPU
    if config.gpu_mode:
        print("Pushing to GPU: {}".format(config.gpu_index))
        cfg.device = config.gpu_index
        cfg.do_gpu = True
        torch.cuda.set_device(cfg.device)
        if config.multigpu:
            model = models.multi_gpu(
                model, config.gpu_indices).cuda()
        else:
            model.cuda(cfg.device)
        print("Done.")

    print("Training")

    optimizer = OpenAIAdam(model.parameters(),
                           lr=opt.train.dynamic.lr,
                           schedule=opt.train.static.lrsched,
                           warmup=opt.train.static.lrwarm,
                           t_total=meta.iterations,
                           b1=opt.train.static.b1,
                           b2=opt.train.static.b2,
                           e=opt.train.static.e,
                           l2=opt.train.static.l2,
                           vector_l2=opt.train.static.vl2,
                           max_grad_norm=opt.train.static.clip)

    trainer = train.make_trainer(
        opt, meta, data_loader, model, optimizer)
    print(data_loader.sequences["dev"]["total"].max())
    trainer.set_generator(opt, model, data_loader)
    trainer.set_evaluator(opt, model, data_loader)

    trainer.run()
'DefinedAs', 'DesireOf', 'Desires', 'HasA', 'HasFirstSubevent', 'HasLastSubevent', 'HasPainCharacter', 'HasPainIntensity', 'HasPrerequisite', 'HasProperty', 'HasSubevent', 'InheritsFrom', 'InstanceOf', 'IsA', 'LocatedNear', 'LocationOfAction', 'MadeOf', 'MotivatedByGoal', 'NotCapableOf', 'NotDesires', 'NotHasA', 'NotHasProperty', 'NotIsA', 'NotMadeOf', 'PartOf', 'ReceivesAction', 'RelatedTo', 'SymbolOf', 'UsedFor' ] special = [data.start_token, data.end_token] special += ["<{}>".format(relation) for relation in relations] encoder_path = "model/encoder_bpe_40000.json" bpe_path = "model/vocab_40000.bpe" text_encoder = TextEncoder(encoder_path, bpe_path) for special_token in special: text_encoder.decoder[len(text_encoder.encoder)] = special_token text_encoder.encoder[special_token] = len(text_encoder.encoder) data_loader = cdata.GenerationDataLoader(opt) data_loader.load_data("data/conceptnet/") data_loader.make_tensors(text_encoder, special, test=False) opt.data.maxr = data_loader.max_r save_path = "data/conceptnet/processed/generation" save_name = os.path.join(save_path, "{}.pickle".format(utils.make_name_string(opt.data)))
# Append the remaining category names to the list started earlier in
# this script.
categories += ["xEffect"]
categories += ["xIntent"]
categories += ["xNeed"]
categories += ["xReact"]
categories += ["xWant"]

# Minimal options object describing the ATOMIC generation experiment.
opt = DD()
opt.dataset = "atomic"
opt.exp = "generation"
opt.data = DD()
# The options store the categories in sorted order; note that the
# special tokens below are built from the unsorted ``categories`` list.
opt.data.categories = sorted(categories)

encoder_path = "model/encoder_bpe_40000.json"
bpe_path = "model/vocab_40000.bpe"

text_encoder = TextEncoder(encoder_path, bpe_path)

# ``encoder`` is an alias of ``text_encoder.encoder``, so entries added
# through it below are added to the text encoder's own mapping.
encoder = text_encoder.encoder
# text encoder format = word:index
# text decoder format = index:word
# Vocabulary size measured before any special token is added.
n_vocab = len(text_encoder.encoder)

# Special tokens: start/end markers, one "<category>" token per
# category, and the blank token.
special = [data.start_token, data.end_token]
special += ["<{}>".format(cat) for cat in categories]
special += [data.blank_token]

# Add special tokens to text decoder & encoder. Each token takes the
# next free id, i.e. the current size of the encoder mapping; the
# decoder entry is written first so both lines see the same size.
for special_token in special:
    text_encoder.decoder[len(encoder)] = special_token
    encoder[special_token] = len(encoder)
if opt.data.get("maxr", None) is None: if opt.data.rel == "language": opt.data.maxr = 5 else: opt.data.maxr = 1 path = "comet-commonsense/data/conceptnet/processed/generation/{}.pickle".format( utils.make_name_string(opt.data)) data_loader = data.make_data_loader(opt) loaded = data_loader.load_data(path) encoder_path = "comet-commonsense/model/encoder_bpe_40000.json" bpe_path = "comet-commonsense/model/vocab_40000.bpe" text_encoder = TextEncoder(encoder_path, bpe_path) special = [data.start_token, data.end_token] special += ["<{}>".format(cat) for cat in relations] text_encoder.encoder = data_loader.vocab_encoder text_encoder.decoder = data_loader.vocab_decoder context_size_event = data_loader.max_e1 context_size_effect = data_loader.max_e2 n_special = len(special) n_ctx = data_loader.max_e1 + data_loader.max_e2 + data_loader.max_r n_vocab = len(text_encoder.encoder) + n_ctx model = models.make_model(opt,
categories = opt.data.categories

# If a pickled_data argument is given, load that file directly instead
# of deriving the path from the data options.
if args.pickled_data:
    path = args.pickled_data
else:
    path = "data/atomic/processed/generation/{}.pickle".format(
        utils.make_name_string(opt.data))

data_loader = data.make_data_loader(opt, categories)
loaded = data_loader.load_data(path)
data_loader.batch_size = opt.train.dynamic.bs

print("Done.")

text_encoder = TextEncoder(config.encoder_path, config.bpe_path)

# Special tokens: start/end markers, one "<category>" token per
# category, and the blank token.
special = [data.start_token, data.end_token]
special += ["<{}>".format(cat) for cat in categories]
special += [data.blank_token]

if args.model_pickled_data is not None:
    # A separate pickle was given for the model's vocabulary: load it
    # into its own data loader.
    model_data_loader = data.make_data_loader(opt, opt.data.categories)
    _loaded = model_data_loader.load_data(args.model_pickled_data)

    # Load vocab encoder and decoder from pre-initialized data_loader.
    # NOTE(review): kept inside this branch because model_data_loader is
    # only bound here -- confirm against the original indentation.
    text_encoder.encoder = model_data_loader.vocab_encoder
    text_encoder.decoder = model_data_loader.vocab_decoder