def main(args):
    """Run the single-task pipeline: setup, then optional train/save/val/test.

    The function initializes the run, builds the task from
    ``args.task_config_path``, creates the model and loads weights from
    ``args.model_path``, prepares the (optionally subsampled) training
    examples and the matching train schedule, and wires everything into a
    ``SimpleTaskRunner``. Which phases then execute is controlled by the
    ``do_train``, ``do_save``, ``do_val`` and ``do_test`` flags on ``args``.

    Args:
        args: Run configuration object; it is only accessed by attribute.

    NOTE(review): the source had lost its indentation, so the extent of the
    two ``with`` blocks below was reconstructed -- please confirm.
    """
    init_out = initialization.quick_init(args=args, verbose=True)
    log_writer = init_out.log_writer
    device = init_out.device
    n_gpu = init_out.n_gpu

    with log_writer.log_context():
        task = tasks.create_task_from_config_path(
            config_path=args.task_config_path,
            verbose=True,
        )

        # Model construction and weight loading happen inside the project's
        # only_first_process context (presumably to order the work across
        # processes -- confirm against `distributed`).
        with distributed.only_first_process(local_rank=args.local_rank):
            setup_classes = model_resolution.resolve_model_setup_classes(
                model_type=args.model_type,
                task_type=task.TASK_TYPE,
            )
            model_wrapper = model_setup.simple_model_setup(
                model_type=args.model_type,
                model_class_spec=setup_classes,
                config_path=args.model_config_path,
                tokenizer_path=args.model_tokenizer_path,
                task=task,
            )
            model_setup.simple_load_model_path(
                model=model_wrapper.model,
                model_load_mode=args.model_load_mode,
                model_path=args.model_path,
                verbose=True,
            )
            model_wrapper.model.to(device)

        # Training data; the second value returned by the subsampler is not
        # needed in this script.
        all_train_examples = task.get_train_examples()
        train_examples, _ = train_setup.maybe_subsample_train(
            train_examples=all_train_examples,
            train_examples_number=args.train_examples_number,
            train_examples_fraction=args.train_examples_fraction,
        )
        train_schedule = train_setup.get_train_schedule(
            num_train_examples=len(train_examples),
            max_steps=args.max_steps,
            num_train_epochs=args.num_train_epochs,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            per_gpu_train_batch_size=args.train_batch_size,
            n_gpu=n_gpu,
        )
        log_writer.write_entry(
            "text",
            f"t_total: {train_schedule.t_total}",
            do_print=True,
        )

        loss_criterion = train_setup.resolve_loss_function(
            task_type=task.TASK_TYPE,
        )
        optimizer_scheduler = model_setup.create_optimizer(
            model=model_wrapper.model,
            learning_rate=args.learning_rate,
            t_total=train_schedule.t_total,
            warmup_steps=args.warmup_steps,
            warmup_proportion=args.warmup_proportion,
            optimizer_type=args.optimizer_type,
            verbose=True,
        )
        model_setup.special_model_setup(
            model_wrapper=model_wrapper,
            optimizer_scheduler=optimizer_scheduler,
            fp16=args.fp16,
            fp16_opt_level=args.fp16_opt_level,
            n_gpu=n_gpu,
            local_rank=args.local_rank,
        )

        feat_spec = model_resolution.build_featurization_spec(
            model_type=args.model_type,
            max_seq_length=args.max_seq_length,
        )
        rparams = simple_runner.RunnerParameters(
            feat_spec=feat_spec,
            local_rank=args.local_rank,
            n_gpu=n_gpu,
            fp16=args.fp16,
            learning_rate=args.learning_rate,
            eval_batch_size=args.eval_batch_size,
            max_grad_norm=args.max_grad_norm,
        )
        runner = simple_runner.SimpleTaskRunner(
            task=task,
            model_wrapper=model_wrapper,
            optimizer_scheduler=optimizer_scheduler,
            loss_criterion=loss_criterion,
            device=device,
            rparams=rparams,
            train_schedule=train_schedule,
            log_writer=log_writer,
        )

        if args.do_train:
            val_examples = task.get_val_examples()
            # Quick and dirty: in-training evaluation only sees the first
            # `partial_eval_number` validation examples.
            partial_val_examples = val_examples[:args.partial_eval_number]
            meta_runner = metarunner.MetaRunner(
                runner=runner,
                train_examples=train_examples,
                val_examples=partial_val_examples,
                should_save_func=metarunner.get_should_save_func(
                    args.save_every_steps),
                should_eval_func=metarunner.get_should_eval_func(
                    args.eval_every_steps),
                output_dir=args.output_dir,
                verbose=True,
                save_best_model=args.do_save,
                load_best_model=True,
                log_writer=log_writer,
            )
            meta_runner.train_val_save_every()

        if args.do_save:
            model_out_path = os.path.join(args.output_dir, "model.p")
            torch.save(model_wrapper.model.state_dict(), model_out_path)

        if args.do_val:
            # The final validation pass uses the full validation set.
            val_examples = task.get_val_examples()
            val_results = runner.run_val(val_examples)
            evaluate.write_val_results(
                results=val_results,
                output_dir=args.output_dir,
                verbose=True,
            )

        if args.do_test:
            test_examples = task.get_test_examples()
            test_logits = runner.run_test(test_examples)
            evaluate.write_preds(
                logits=test_logits,
                output_path=os.path.join(args.output_dir, "test_preds.csv"),
            )
def main(args):
    """Run the multi-adapter pipeline: setup, then optional train/save/val/test.

    After building the task and loading the base model, adapter weights are
    read from ``args.adapter_weights_path``, filtered by the comma-separated
    ``args.adapter_exclude`` list, and installed into the model through
    ``multi_adapters``. Only the parameters returned by
    ``multi_adapters.get_tunable_parameters`` are handed to the optimizer.
    Which phases then execute is controlled by the ``do_train``, ``do_save``,
    ``do_val`` and ``do_test`` flags on ``args``.

    Args:
        args: Run configuration object; it is only accessed by attribute.

    NOTE(review): the source had lost its indentation, so the extent of the
    ``only_first_process`` block below was reconstructed -- please confirm.
    """
    init_out = initialization.quick_init(args=args, verbose=False)
    device = init_out.device
    n_gpu = init_out.n_gpu
    log_writer = init_out.log_writer

    task = tasks.create_task_from_config_path(
        config_path=args.task_config_path,
        verbose=True,
    )

    with distributed.only_first_process(local_rank=args.local_rank):
        # Base model.
        setup_classes = model_resolution.resolve_model_setup_classes(
            model_type=args.model_type,
            task_type=task.TASK_TYPE,
        )
        model_wrapper = model_setup.simple_model_setup(
            model_type=args.model_type,
            model_class_spec=setup_classes,
            config_path=args.model_config_path,
            tokenizer_path=args.model_tokenizer_path,
            task=task,
        )
        model_setup.simple_load_model_path(
            model=model_wrapper.model,
            model_load_mode=args.model_load_mode,
            model_path=args.model_path,
        )

        # Adapter weights: load, drop the excluded ones, then isolate.
        # The return value of exclude_adapters is ignored, so it presumably
        # filters the dict in place -- confirm against `multi_adapters`.
        loaded_adapter_weights = multi_adapters.load_adapter_weights_dict_path(
            args.adapter_weights_path)
        excluded_names = args.adapter_exclude.split(",")
        multi_adapters.exclude_adapters(
            loaded_adapter_weights,
            exclude_list=excluded_names,
        )
        adapter_weights_dict = multi_adapters.isolate_adapter_weights_dict(
            adapter_weights_dict=loaded_adapter_weights,
            model_type=args.model_type,
        )

        # Install the adapters in the model and fill in their weights.
        modified_layers = multi_adapters.add_multi_adapters(
            model=model_wrapper.model,
            sub_module_name_list=list(adapter_weights_dict.keys()),
            adapter_config=adapters.AdapterConfig(),
            include_base=args.adapter_include_base,
            include_flex=args.adapter_include_flex,
            num_weight_sets=args.adapter_num_weight_sets,
            use_optimized=args.adapter_use_optimized,
        )
        multi_adapters.load_multi_adapter_weights(
            model=model_wrapper.model,
            modified_layers=modified_layers,
            adapter_weights_dict=adapter_weights_dict,
        )
        model_wrapper.model.to(device)

    tunable_parameters = multi_adapters.get_tunable_parameters(
        model=model_wrapper.model,
        modified_layers=modified_layers,
        ft_mode=args.adapter_ft_mode,
    )

    # Training data; the second value returned by the subsampler is not
    # needed in this script.
    all_train_examples = task.get_train_examples()
    train_examples, _ = train_setup.maybe_subsample_train(
        train_examples=all_train_examples,
        train_examples_number=args.train_examples_number,
        train_examples_fraction=args.train_examples_fraction,
    )
    train_schedule = train_setup.get_train_schedule(
        num_train_examples=len(train_examples),
        max_steps=args.max_steps,
        num_train_epochs=args.num_train_epochs,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        per_gpu_train_batch_size=args.train_batch_size,
        n_gpu=n_gpu,
    )

    loss_criterion = train_setup.resolve_loss_function(
        task_type=task.TASK_TYPE,
    )
    # The optimizer is built from the tunable parameters only, not from the
    # whole model.
    optimizer_scheduler = model_setup.create_optimizer_from_params(
        named_parameters=tunable_parameters,
        learning_rate=args.learning_rate,
        t_total=train_schedule.t_total,
        warmup_steps=args.warmup_steps,
        warmup_proportion=args.warmup_proportion,
        optimizer_type=args.optimizer_type,
        verbose=True,
    )
    model_setup.special_model_setup(
        model_wrapper=model_wrapper,
        optimizer_scheduler=optimizer_scheduler,
        fp16=args.fp16,
        fp16_opt_level=args.fp16_opt_level,
        n_gpu=n_gpu,
        local_rank=args.local_rank,
    )

    feat_spec = model_resolution.build_featurization_spec(
        model_type=args.model_type,
        max_seq_length=args.max_seq_length,
    )
    rparams = simple_runner.RunnerParameters(
        feat_spec=feat_spec,
        local_rank=args.local_rank,
        n_gpu=n_gpu,
        fp16=args.fp16,
        learning_rate=args.learning_rate,
        eval_batch_size=args.eval_batch_size,
        max_grad_norm=args.max_grad_norm,
    )
    runner = simple_runner.SimpleTaskRunner(
        task=task,
        model_wrapper=model_wrapper,
        optimizer_scheduler=optimizer_scheduler,
        loss_criterion=loss_criterion,
        device=device,
        rparams=rparams,
        train_schedule=train_schedule,
        log_writer=log_writer,
    )

    if args.do_train:
        val_examples = task.get_val_examples()
        # Quick and dirty: in-training evaluation only sees the first
        # `partial_eval_number` validation examples.
        partial_val_examples = val_examples[:args.partial_eval_number]
        meta_runner = adapters_runner.AdapterMetaRunner(
            runner=runner,
            train_examples=train_examples,
            val_examples=partial_val_examples,
            should_save_func=metarunner.get_should_save_func(
                args.save_every_steps),
            should_eval_func=metarunner.get_should_eval_func(
                args.eval_every_steps),
            output_dir=args.output_dir,
            verbose=True,
            save_best_model=args.do_save,
            load_best_model=True,
            log_writer=log_writer,
            modified_layers=modified_layers,
        )
        meta_runner.train_val_save_every()

    if args.do_save:
        model_out_path = os.path.join(args.output_dir, "model.p")
        torch.save(model_wrapper.model.state_dict(), model_out_path)

    if args.do_val:
        # The final validation pass uses the full validation set.
        val_examples = task.get_val_examples()
        val_results = runner.run_val(val_examples)
        evaluate.write_val_results(
            results=val_results,
            output_dir=args.output_dir,
            verbose=True,
        )

    if args.do_test:
        test_examples = task.get_test_examples()
        test_logits = runner.run_test(test_examples)
        evaluate.write_preds(
            logits=test_logits,
            output_path=os.path.join(args.output_dir, "test_preds.csv"),
        )
def main(args):
    """Run the UDA + LLP pipeline: setup, then optional train/save/val/test.

    The task and its UDA data are loaded from ``args.uda_task_config_path``.
    The training set is the labeled examples (``["sup"]["train"]``) followed
    by the optionally subsampled unlabeled examples (``["unsup"]["orig"]``).
    When subsampling returns indices, they are written to
    ``sampled_indices.json`` in ``args.output_dir``. Which phases then execute
    is controlled by the ``do_train``, ``do_save``, ``do_val`` and ``do_test``
    flags on ``args``.

    The checkpoint at ``args.model_path`` is deserialized onto the CPU
    (``map_location="cpu"``) and the model is moved to the run device right
    afterwards. Without a ``map_location``, ``torch.load`` restores tensors to
    the device they were saved from, which fails on a host lacking that
    device and makes every process allocate on the same GPU.

    Args:
        args: Run configuration object; it is only accessed by attribute.

    NOTE(review): the source had lost its indentation, so the extent of the
    ``with`` blocks below was reconstructed -- please confirm.
    """
    quick_init_out = initialization.quick_init(args=args, verbose=True)
    task, uda_task_data = uda_load_data.load_task_data_from_path(
        args.uda_task_config_path)

    with distributed.only_first_process(local_rank=args.local_rank):
        # load the model
        model_wrapper = llp_model_setup.setup_model(
            model_type=args.model_type,
            task=task,
            llp_embedding_dim=args.llp_embedding_dim,
            config_path=args.model_config_path,
            tokenizer_path=args.model_tokenizer_path,
        )
        llp_model_setup.load_model(
            model=model_wrapper.model,
            # Load to CPU first; the model is moved to the target device below.
            state_dict=torch.load(args.model_path, map_location="cpu"),
            load_mode=args.model_load_mode,
        )
        model_wrapper.model.to(quick_init_out.device)

    # === Train Data Setup [START] === #
    labeled_examples = uda_task_data["sup"]["train"]
    unlabeled_examples, indices = train_setup.maybe_subsample_train(
        train_examples=uda_task_data["unsup"]["orig"],
        train_examples_number=args.unlabeled_train_examples_number,
        train_examples_fraction=args.unlabeled_train_examples_fraction,
    )
    if indices is not None:
        # Persist which unlabeled examples were sampled.
        write_json(
            indices,
            os.path.join(args.output_dir, "sampled_indices.json"),
        )
    # Labeled examples come first; `num_labeled` below relies on their count.
    train_examples = labeled_examples + unlabeled_examples
    num_train_examples = len(train_examples)
    # === Train Data Setup [END] === #

    train_schedule = train_setup.get_train_schedule(
        num_train_examples=num_train_examples,
        max_steps=args.max_steps,
        num_train_epochs=args.num_train_epochs,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        per_gpu_train_batch_size=args.train_batch_size,
        n_gpu=quick_init_out.n_gpu,
    )
    print("t_total", train_schedule.t_total)

    loss_criterion = train_setup.resolve_loss_function(
        task_type=task.TASK_TYPE)
    optimizer_scheduler = shared_model_setup.create_optimizer(
        model=model_wrapper.model,
        learning_rate=args.learning_rate,
        t_total=train_schedule.t_total,
        warmup_steps=args.warmup_steps,
        warmup_proportion=args.warmup_proportion,
        optimizer_type=args.optimizer_type,
        verbose=True,
    )

    # I don't think this works for LLP...
    shared_model_setup.special_model_setup(
        model_wrapper=model_wrapper,
        optimizer_scheduler=optimizer_scheduler,
        fp16=args.fp16,
        fp16_opt_level=args.fp16_opt_level,
        n_gpu=quick_init_out.n_gpu,
        local_rank=args.local_rank,
    )

    rparams = llp_runner.RunnerParameters(
        feat_spec=model_resolution.build_featurization_spec(
            model_type=args.model_type,
            max_seq_length=args.max_seq_length,
        ),
        local_rank=args.local_rank,
        n_gpu=quick_init_out.n_gpu,
        fp16=args.fp16,
        learning_rate=args.learning_rate,
        eval_batch_size=args.eval_batch_size,
        max_grad_norm=args.max_grad_norm,
    )
    llp_params = llp_runner.LlpParameters(
        num_labeled=len(labeled_examples),
        llp_embedding_dim=args.llp_embedding_dim,
        llp_const_k=args.llp_const_k,
        llp_const_t=args.llp_const_t,
        llp_const_tau=args.llp_const_tau,
        llp_prop_chunk_size=args.llp_prop_chunk_size,
        llp_mem_bank_t=args.llp_mem_bank_t,
        llp_rep_global_agg_loss_lambda=args.llp_rep_global_agg_loss_lambda,
        llp_embedding_norm_loss=args.llp_embedding_norm_loss,
        llp_compute_global_agg_loss_mode=args.llp_compute_global_agg_loss_mode,
    )
    llpuda_params = uda_llp_runner.LLPUDAParameters(
        uda_coeff=args.uda_coeff,
        # The unsupervised branch is enabled for any non-zero ratio.
        use_unsup=args.unsup_ratio != 0,
        unsup_ratio=args.unsup_ratio,
    )

    with quick_init_out.log_writer.log_context():
        runner = uda_llp_runner.UDALLPRunner(
            task=task,
            model_wrapper=model_wrapper,
            optimizer_scheduler=optimizer_scheduler,
            loss_criterion=loss_criterion,
            device=quick_init_out.device,
            rparams=rparams,
            llp_params=llp_params,
            llpuda_params=llpuda_params,
            train_schedule=train_schedule,
            log_writer=quick_init_out.log_writer,
        )

        if args.do_train:
            # LLP state is initialized from the training examples before
            # the training run starts.
            runner.init_llp_state(train_examples)
            runner.run_train(train_examples, uda_task_data)

        if args.do_save:
            torch.save(
                model_wrapper.model.state_dict(),
                os.path.join(args.output_dir, "model.p"),
            )

        if args.do_val:
            val_examples = task.get_val_examples()
            results = runner.run_val(val_examples)
            evaluate.write_val_results(
                results=results,
                output_dir=args.output_dir,
                verbose=True,
            )

        if args.do_test:
            test_examples = task.get_test_examples()
            logits = runner.run_test(test_examples)
            evaluate.write_preds(
                logits=logits,
                output_path=os.path.join(args.output_dir, "test_preds.csv"),
            )
def main(args):
    """Run the multi-task pipeline: setup, then optional train/save/val.

    A task dictionary is built from ``args.multitask_config_path`` and
    ``args.task_name_ls``. Weights from ``args.model_path`` are loaded into
    the sub-model registered for the first task in that dictionary. Training
    examples are gathered (and optionally subsampled) per task, and the train
    schedule is sized from their combined count. Which phases then execute is
    controlled by the ``do_train``, ``do_save`` and ``do_val`` flags on
    ``args``.

    Args:
        args: Run configuration object; it is only accessed by attribute.

    Raises:
        NotImplementedError: If ``args.do_test`` is set. The check happens
            last, after any training, saving and validation have already run.

    NOTE(review): the source had lost its indentation, so the extent of the
    two ``with`` blocks below was reconstructed -- please confirm.
    """
    init_out = initialization.quick_init(args=args, verbose=True)
    log_writer = init_out.log_writer
    device = init_out.device
    n_gpu = init_out.n_gpu

    def _load_partial_val_examples():
        # Per-task validation examples, truncated to `partial_eval_number`.
        partial_val = {}
        for name, task_obj in task_dict.items():
            partial_val[name] = \
                task_obj.get_val_examples()[:args.partial_eval_number]
        return partial_val

    with log_writer.log_context():
        task_dict = create_task_dict(
            multitask_config_path=args.multitask_config_path,
            task_name_ls=args.task_name_ls,
        )

        with distributed.only_first_process(local_rank=args.local_rank):
            model_wrapper = multitask_model_setup.setup_multitask_ptt_model(
                model_type=args.model_type,
                config_path=args.model_config_path,
                tokenizer_path=args.model_tokenizer_path,
                task_dict=task_dict,
            )
            # The checkpoint is loaded through the first task's sub-model.
            first_task_name = list(task_dict)[0]
            model_setup.simple_load_model_path(
                model=model_wrapper.model.model_dict[first_task_name],
                model_load_mode=args.model_load_mode,
                model_path=args.model_path,
                verbose=True,
            )
            model_wrapper.model.to(device)

        # Per-task training data; the second value returned by the
        # subsampler is not needed in this script.
        train_examples_dict = {}
        for task_name, task in task_dict.items():
            task_train_examples, _ = train_setup.maybe_subsample_train(
                train_examples=task.get_train_examples(),
                train_examples_number=args.train_examples_number,
                train_examples_fraction=args.train_examples_fraction,
            )
            train_examples_dict[task_name] = task_train_examples

        # TODO: Tweak the schedule
        total_num_train_examples = sum(map(len, train_examples_dict.values()))
        train_schedule = train_setup.get_train_schedule(
            num_train_examples=total_num_train_examples,
            max_steps=args.max_steps,
            num_train_epochs=args.num_train_epochs,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            per_gpu_train_batch_size=args.train_batch_size,
            n_gpu=n_gpu,
        )
        log_writer.write_entry(
            "text",
            f"t_total: {train_schedule.t_total}",
            do_print=True,
        )

        # One loss function per task, chosen by task type.
        loss_criterion_dict = {}
        for task_name, task in task_dict.items():
            loss_criterion_dict[task_name] = train_setup.resolve_loss_function(
                task_type=task.TASK_TYPE)

        optimizer_scheduler = model_setup.create_optimizer(
            model=model_wrapper.model,
            learning_rate=args.learning_rate,
            t_total=train_schedule.t_total,
            warmup_steps=args.warmup_steps,
            warmup_proportion=args.warmup_proportion,
            optimizer_type=args.optimizer_type,
            verbose=True,
        )
        model_setup.special_model_setup(
            model_wrapper=model_wrapper,
            optimizer_scheduler=optimizer_scheduler,
            fp16=args.fp16,
            fp16_opt_level=args.fp16_opt_level,
            n_gpu=n_gpu,
            local_rank=args.local_rank,
        )

        feat_spec = model_resolution.build_featurization_spec(
            model_type=args.model_type,
            max_seq_length=args.max_seq_length,
        )
        rparams = simple_runner.RunnerParameters(
            feat_spec=feat_spec,
            local_rank=args.local_rank,
            n_gpu=n_gpu,
            fp16=args.fp16,
            learning_rate=args.learning_rate,
            eval_batch_size=args.eval_batch_size,
            max_grad_norm=args.max_grad_norm,
        )
        runner = multitask_runner.MultiTaskRunner(
            task_dict=task_dict,
            model_wrapper=model_wrapper,
            optimizer_scheduler=optimizer_scheduler,
            loss_criterion_dict=loss_criterion_dict,
            device=device,
            rparams=rparams,
            train_schedule=train_schedule,
            log_writer=log_writer,
        )

        if args.do_train:
            # Quick and dirty: in-training evaluation uses truncated
            # validation sets.
            val_examples_dict = _load_partial_val_examples()
            meta_runner = metarunner.MetaRunner(
                runner=runner,
                train_examples=train_examples_dict,
                val_examples=val_examples_dict,
                should_save_func=metarunner.get_should_save_func(
                    args.save_every_steps),
                should_eval_func=metarunner.get_should_eval_func(
                    args.eval_every_steps),
                output_dir=args.output_dir,
                verbose=True,
                save_best_model=args.do_save,
                load_best_model=True,
                log_writer=log_writer,
            )
            meta_runner.train_val_save_every()

        if args.do_save:
            model_out_path = os.path.join(args.output_dir, "model.p")
            torch.save(model_wrapper.model.state_dict(), model_out_path)

        if args.do_val:
            # NOTE(review): the final validation pass is also truncated to
            # `partial_eval_number` examples per task -- confirm this is
            # intended.
            val_examples_dict = _load_partial_val_examples()
            val_results = runner.run_val(val_examples_dict)
            evaluate.write_metrics(
                results=val_results,
                output_path=os.path.join(args.output_dir, "val_metrics.json"),
                verbose=True,
            )

        if args.do_test:
            raise NotImplementedError()