def test_shallow(self):
  test_obj = yaml.load("""
                       a: !DummyArgClass
                         arg1: !DummyArgClass2
                           _xnmt_id: id1
                           v: some_val
                         arg2: !Ref { name: id1 }
                       """)
  preloaded = persistence.YamlPreloader.preload_obj(root=test_obj,
                                                    exp_name="exp1",
                                                    exp_dir=self.out_dir)
  initialized = persistence.initialize_if_needed(preloaded)
  persistence.save_to_file(self.model_file, initialized)
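# A minimal sketch (an assumption, not taken from the original test module) of what
# the DummyArgClass/DummyArgClass2 helpers referenced above might look like: plain
# Serializable subclasses whose yaml_tag matches the tags used in the YAML string,
# so that `!Ref { name: id1 }` can be resolved against the `_xnmt_id: id1` object.
from xnmt import persistence

class DummyArgClass2(persistence.Serializable):
  yaml_tag = "!DummyArgClass2"
  @persistence.serializable_init
  def __init__(self, v=None):
    self.v = v  # plain attribute, restored on deserialization

class DummyArgClass(persistence.Serializable):
  yaml_tag = "!DummyArgClass"
  @persistence.serializable_init
  def __init__(self, arg1=None, arg2=None):
    self.arg1 = arg1
    self.arg2 = arg2  # resolved from the !Ref during initialize_if_needed()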
def main(overwrite_args=None):

  with tee.Tee(), tee.Tee(error=True):
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--dynet-mem", type=str)
    argparser.add_argument("--dynet-seed", type=int,
                           help="set random seed for DyNet and XNMT.")
    argparser.add_argument("--dynet-autobatch", type=int)
    argparser.add_argument("--dynet-devices", type=str)
    argparser.add_argument("--dynet-viz", action='store_true',
                           help="use visualization")
    argparser.add_argument("--dynet-gpu", action='store_true',
                           help="use GPU acceleration")
    argparser.add_argument("--dynet-gpu-ids", type=int)
    argparser.add_argument("--dynet-gpus", type=int)
    argparser.add_argument("--dynet-weight-decay", type=float)
    argparser.add_argument("--dynet-profiling", type=int)
    argparser.add_argument("--settings", type=str, default="standard",
                           help="settings (standard, debug, or unittest); "
                                "must be given in '=' syntax, e.g. "
                                "--settings=standard")
    argparser.add_argument("experiments_file")
    argparser.add_argument("experiment_name", nargs='*',
                           help="Run only the specified experiments")
    argparser.set_defaults(generate_doc=False)
    args = argparser.parse_args(overwrite_args)

    if args.dynet_seed:
      random.seed(args.dynet_seed)
      np.random.seed(args.dynet_seed)

    if args.dynet_gpu:
      if settings.CHECK_VALIDITY:
        settings.CHECK_VALIDITY = False
        log_preamble("disabling CHECK_VALIDITY because it is currently not supported on GPU",
                     logging.WARNING)

    config_experiment_names = YamlPreloader.experiment_names_from_file(args.experiments_file)

    results = []

    # Check ahead of time that all experiments exist, to avoid bad surprises
    experiment_names = args.experiment_name or config_experiment_names

    if args.experiment_name:
      nonexistent = set(experiment_names).difference(config_experiment_names)
      if len(nonexistent) != 0:
        raise Exception("Experiments {} do not exist".format(",".join(list(nonexistent))))

    log_preamble(f"running XNMT revision {tee.get_git_revision()} on {socket.gethostname()} "
                 f"on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    for experiment_name in experiment_names:
      ParamManager.init_param_col()

      uninitialized_exp_args = YamlPreloader.preload_experiment_from_file(args.experiments_file,
                                                                          experiment_name)

      logger.info(f"=> Running {experiment_name}")

      glob_args = uninitialized_exp_args.data.exp_global
      log_file = glob_args.log_file

      if os.path.isfile(log_file) and not settings.OVERWRITE_LOG:
        logger.warning(f"log file {log_file} already exists, skipping experiment; "
                       f"please delete the log file by hand if you want to overwrite it "
                       f"(or activate OVERWRITE_LOG, by either setting the environment variable OVERWRITE_LOG=1, "
                       f"or specifying --settings=debug, or changing xnmt.settings.Standard.OVERWRITE_LOG manually)")
        continue

      tee.set_out_file(log_file)

      model_file = glob_args.model_file

      uninitialized_exp_args.data.exp_global.commandline_args = args

      # Create the model
      experiment = initialize_if_needed(uninitialized_exp_args)
      ParamManager.param_col.model_file = experiment.exp_global.model_file
      ParamManager.param_col.save_num_checkpoints = experiment.exp_global.save_num_checkpoints
      ParamManager.populate()

      # Run the experiment
      eval_scores = experiment(save_fct=lambda: save_to_file(model_file, experiment,
                                                             ParamManager.param_col))
      results.append((experiment_name, eval_scores))
      print_results(results)

      tee.unset_out_file()
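# Hedged usage sketch: because parse_args() receives overwrite_args, this entry point
# can be driven programmatically (e.g. from a test) as well as from the command line.
# The YAML filename below is hypothetical.
if __name__ == "__main__":
  main(overwrite_args=["--settings=unittest", "my_experiments.yaml"])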
def run(self):
  seed = 13
  random.seed(seed)
  np.random.seed(seed)

  EXP_DIR = os.path.dirname(__file__)
  EXP = "annot"

  model_file = f"{EXP_DIR}/results/{EXP}.mod"
  log_file = f"{EXP_DIR}/results/{EXP}.log"

  # FIXME: this call doesn't seem to take effect; figure out how to set DyNet memory properly
  xnmt.tee.utils.dy.DynetParams().set_mem(1024)

  xnmt.tee.set_out_file(log_file, exp_name=EXP)

  ParamManager.init_param_col()
  ParamManager.param_col.model_file = model_file

  pre_runner = PreprocRunner(
    tasks=[
      PreprocTokenize(
        in_files=[
          #f'{EXP_DIR}/conala-corpus/conala-trainnodev.snippet',
          #f'{EXP_DIR}/conala-corpus/conala-trainnodev.intent',
          #f'{EXP_DIR}/conala-corpus/conala-dev.intent',
          #f'{EXP_DIR}/conala-corpus/conala-dev.snippet',
          #f'{EXP_DIR}/conala-corpus/conala-test.intent',
          #f'{EXP_DIR}/conala-corpus/conala-test.snippet',
          f'{EXP_DIR}/conala-corpus/attack_code_train.txt',
          f'{EXP_DIR}/conala-corpus/attack_text_train.txt',
          f'{EXP_DIR}/conala-corpus/attack_code_test.txt',
          f'{EXP_DIR}/conala-corpus/attack_text_test.txt'
          #f'{EXP_DIR}/conala-corpus/all.code',
          #f'{EXP_DIR}/conala-corpus/all.anno'
        ],
        out_files=[
          #f'{EXP_DIR}/conala-corpus/conala-trainnodev.tmspm4000.snippet',
          #f'{EXP_DIR}/conala-corpus/conala-trainnodev.tmspm4000.intent',
          #f'{EXP_DIR}/conala-corpus/conala-dev.tmspm4000.intent',
          #f'{EXP_DIR}/conala-corpus/conala-dev.tmspm4000.snippet',
          #f'{EXP_DIR}/conala-corpus/conala-test.tmspm4000.intent',
          #f'{EXP_DIR}/conala-corpus/conala-test.tmspm4000.snippet',
          f'{EXP_DIR}/conala-corpus/attack-train.tmspm4000.snippet',
          f'{EXP_DIR}/conala-corpus/attack-train.tmspm4000.intent',
          f'{EXP_DIR}/conala-corpus/attack-test.tmspm4000.snippet',
          f'{EXP_DIR}/conala-corpus/attack-test.tmspm4000.intent'
          #f'{EXP_DIR}/conala-corpus/django.tmspm4000.snippet',
          #f'{EXP_DIR}/conala-corpus/django.tmspm4000.intent'
        ],
        specs=[{
          'filenum': 'all',
          'tokenizers': [
            SentencepieceTokenizer(
              hard_vocab_limit=False,
              train_files=[
                f'{EXP_DIR}/conala-corpus/attack_text_train.txt',
                f'{EXP_DIR}/conala-corpus/attack_code_train.txt'
              ],
              vocab_size=self.vocab_size,
              model_type=self.model_type,
              model_prefix='conala-corpus/attack-train.tmspm4000.spm')
          ]
        }]),
      PreprocVocab(
        in_files=[
          f'{EXP_DIR}/conala-corpus/attack-train.tmspm4000.intent',
          f'{EXP_DIR}/conala-corpus/attack-train.tmspm4000.snippet'
        ],
        out_files=[
          f'{EXP_DIR}/conala-corpus/attack-train.tmspm4000.intent.vocab',
          f'{EXP_DIR}/conala-corpus/attack-train.tmspm4000.snippet.vocab'
        ],
        specs=[{
          'filenum': 'all',
          'filters': [VocabFiltererFreq(min_freq=self.min_freq)]
        }])
    ],
    overwrite=False)

  src_vocab = Vocab(vocab_file=f"{EXP_DIR}/conala-corpus/attack-train.tmspm4000.intent.vocab")
  trg_vocab = Vocab(vocab_file=f"{EXP_DIR}/conala-corpus/attack-train.tmspm4000.snippet.vocab")

  batcher = Batcher(batch_size=64)

  inference = AutoRegressiveInference(
    search_strategy=BeamSearch(len_norm=PolynomialNormalization(apply_during_search=True),
                               beam_size=5),
    post_process='join-piece')

  layer_dim = self.layer_dim

  model = DefaultTranslator(
    src_reader=PlainTextReader(vocab=src_vocab),
    trg_reader=PlainTextReader(vocab=trg_vocab),
    src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab=src_vocab),
    encoder=BiLSTMSeqTransducer(input_dim=layer_dim,
                                hidden_dim=layer_dim,
                                layers=self.layers),
    attender=MlpAttender(hidden_dim=layer_dim,
                         state_dim=layer_dim,
                         input_dim=layer_dim),
    trg_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab=trg_vocab),
    decoder=AutoRegressiveDecoder(
      input_dim=layer_dim,
      rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                               hidden_dim=layer_dim),
      transform=AuxNonLinear(input_dim=layer_dim,
                             output_dim=layer_dim,
                             aux_input_dim=layer_dim),
      scorer=Softmax(vocab_size=len(trg_vocab), input_dim=layer_dim),
      trg_embed_dim=layer_dim,
      input_feeding=False,
      bridge=CopyBridge(dec_dim=layer_dim)),
    inference=inference)
  #decoder = AutoRegressiveDecoder(bridge=CopyBridge(), inference=inference))

  train = SimpleTrainingRegimen(
    name=f"{EXP}",
    model=model,
    batcher=WordSrcBatcher(avg_batch_size=64),
    trainer=AdamTrainer(alpha=self.alpha),
    patience=3,
    lr_decay=0.5,
    restart_trainer=True,
    run_for_epochs=self.epochs,
    src_file=f"{EXP_DIR}/conala-corpus/attack-train.tmspm4000.intent",
    trg_file=f"{EXP_DIR}/conala-corpus/attack-train.tmspm4000.snippet",
    dev_tasks=[
      LossEvalTask(src_file=f"{EXP_DIR}/conala-corpus/attack-test.tmspm4000.intent",
                   ref_file=f'{EXP_DIR}/conala-corpus/attack-test.tmspm4000.snippet',
                   model=model,
                   batcher=WordSrcBatcher(avg_batch_size=64)),
      AccuracyEvalTask(eval_metrics='bleu',
                       src_file=f'{EXP_DIR}/conala-corpus/attack-test.tmspm4000.intent',
                       ref_file=f'{EXP_DIR}/conala-corpus/attack_text_test.txt',
                       hyp_file=f'results/{EXP}.dev.hyp',
                       model=model)
    ])

  evaluate = [
    AccuracyEvalTask(
      eval_metrics="bleu",
      #src_file=f"{EXP_DIR}/conala-corpus/conala-test.tmspm4000.intent",
      src_file=f"{EXP_DIR}/conala-corpus/attack-test.tmspm4000.intent",
      #ref_file=f"{EXP_DIR}/conala-corpus/all.code",
      #ref_file=f"{EXP_DIR}/conala-corpus/conala-test.snippet",
      ref_file=f"{EXP_DIR}/conala-corpus/attack_text_test.txt",
      hyp_file=f"results/{EXP}.test.hyp",
      inference=inference,
      model=model)
  ]

  standard_experiment = Experiment(
    exp_global=ExpGlobal(default_layer_dim=512,
                         dropout=0.3,
                         log_file=log_file,
                         model_file=model_file),
    name="annot",
    model=model,
    train=train,
    evaluate=evaluate)

  # run experiment
  standard_experiment(save_fct=lambda: save_to_file(model_file, standard_experiment))

  exit()
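# Hedged usage sketch: run() above is a method, so it needs an enclosing class that
# carries the hyperparameters it reads (self.vocab_size, self.model_type, self.min_freq,
# self.layer_dim, self.layers, self.alpha, self.epochs). The class name and default
# values here are hypothetical, not taken from the original script.
class ConalaAttackExperiment:
  def __init__(self, vocab_size=4000, model_type="unigram", min_freq=2,
               layer_dim=512, layers=1, alpha=0.001, epochs=20):
    self.vocab_size, self.model_type, self.min_freq = vocab_size, model_type, min_freq
    self.layer_dim, self.layers, self.alpha, self.epochs = layer_dim, layers, alpha, epochs

  # run = <the method defined above>

# ConalaAttackExperiment().run()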
  model=model,
  batcher=batcher,
  trainer=AdamTrainer(alpha=0.001),
  run_for_epochs=2,
  src_file="examples/data/head.ja",
  trg_file="examples/data/head.en",
  dev_tasks=[
    LossEvalTask(src_file="examples/data/head.ja",
                 ref_file="examples/data/head.en",
                 model=model,
                 batcher=batcher)
  ],
)

evaluate = [
  AccuracyEvalTask(eval_metrics="bleu,wer",
                   src_file="examples/data/head.ja",
                   ref_file="examples/data/head.en",
                   hyp_file=f"examples/output/{EXP}.test_hyp",
                   inference=inference,
                   model=model)
]

standard_experiment = Experiment(model=model, train=train, evaluate=evaluate)

# run experiment
standard_experiment(save_fct=lambda: save_to_file(model_file, standard_experiment,
                                                  ParamManager.param_col))

exit()
  run_for_epochs=2,
  src_file="examples/data/head.ja",
  trg_file="examples/data/head.en",
  dev_tasks=[
    LossEvalTask(src_file="examples/data/head.ja",
                 ref_file="examples/data/head.en",
                 model=model,
                 batcher=batcher)
  ],
)

evaluate = [
  AccuracyEvalTask(eval_metrics="bleu,wer",
                   src_file="examples/data/head.ja",
                   ref_file="examples/data/head.en",
                   hyp_file=f"examples/output/{EXP}.test_hyp",
                   inference=inference,
                   model=model)
]

standard_experiment = Experiment(name="programmatic", model=model, train=train, evaluate=evaluate)

# run experiment
standard_experiment(save_fct=lambda: save_to_file(model_file, standard_experiment))

exit()
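# Hedged context sketch: the two fragments above each continue a training-regimen
# construction whose opening lines are not shown; it presumably begins roughly like
# this (names follow xnmt's programmatic example, but this is an assumption):
# train = SimpleTrainingRegimen(
#   model=model,
#   batcher=batcher,
#   trainer=AdamTrainer(alpha=0.001),
#   ...
# )
# The model saved via save_to_file(model_file, standard_experiment) is what a
# follow-up script can reload with LoadSerialized (see the "programmatic-load"
# example further below).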
def main(overwrite_args: Optional[Sequence[str]] = None) -> None:

  with tee.Tee(), tee.Tee(error=True):
    argparser = argparse.ArgumentParser()
    utils.add_backend_argparse(argparser)
    argparser.add_argument("--settings", type=str, default="standard",
                           help="settings (standard, debug, or unittest); "
                                "must be given in '=' syntax, e.g. "
                                "--settings=standard")
    argparser.add_argument("--resume", action='store_true',
                           help="whether a saved experiment is being resumed, and "
                                "locations of output files should be re-used.")
    argparser.add_argument("--backend", type=str, default="dynet",
                           help="backend (dynet or torch)")
    argparser.add_argument("experiments_file")
    argparser.add_argument("experiment_name", nargs='*',
                           help="Run only the specified experiments")
    argparser.set_defaults(generate_doc=False)
    args = argparser.parse_args(overwrite_args)

    if xnmt.backend_dynet and args.dynet_seed:
      args.seed = args.dynet_seed
    if getattr(args, "seed", None):
      random.seed(args.seed)
      np.random.seed(args.seed)
      if xnmt.backend_torch:
        torch.manual_seed(0)

    if xnmt.backend_dynet and args.dynet_gpu and settings.CHECK_VALIDITY:
      settings.CHECK_VALIDITY = False
      log_preamble("disabling CHECK_VALIDITY because it is not supported in the DyNet/GPU setting",
                   logging.WARNING)

    config_experiment_names = YamlPreloader.experiment_names_from_file(args.experiments_file)

    results = []

    # Check ahead of time that all experiments exist, to avoid bad surprises
    experiment_names = args.experiment_name or config_experiment_names

    if args.experiment_name:
      nonexistent = set(experiment_names).difference(config_experiment_names)
      if len(nonexistent) != 0:
        raise Exception("Experiments {} do not exist".format(",".join(list(nonexistent))))

    log_preamble(f"running XNMT revision {tee.get_git_revision()} on {socket.gethostname()} "
                 f"with {'DyNet' if xnmt.backend_dynet else 'PyTorch'} "
                 f"on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    for experiment_name in experiment_names:
      ParamManager.init_param_col()

      uninitialized_exp_args = YamlPreloader.preload_experiment_from_file(args.experiments_file,
                                                                          experiment_name,
                                                                          resume=args.resume)

      logger.info(f"=> Running {experiment_name}")

      glob_args = uninitialized_exp_args.data.exp_global
      log_file = glob_args.log_file

      if not settings.OVERWRITE_LOG:
        log_files_exist = []
        if os.path.isfile(log_file):
          log_files_exist.append(log_file)
        if os.path.isdir(log_file + ".tb"):
          log_files_exist.append(log_file + ".tb/")
        if log_files_exist:
          logger.warning(f"log file(s) {' '.join(log_files_exist)} already exist, skipping experiment; "
                         f"please delete the log file(s) by hand if you want to overwrite them "
                         f"(or activate OVERWRITE_LOG, by either setting the environment variable OVERWRITE_LOG=1, "
                         f"or specifying --settings=debug, or changing xnmt.settings.Standard.OVERWRITE_LOG manually)")
          continue
      elif settings.OVERWRITE_LOG and os.path.isdir(log_file + ".tb"):
        # remove tensorboard logs from the previous run that is being overwritten
        shutil.rmtree(log_file + ".tb/")

      tee.set_out_file(log_file, exp_name=experiment_name)

      try:
        model_file = glob_args.model_file

        uninitialized_exp_args.data.exp_global.commandline_args = vars(args)

        # Create the model
        experiment = initialize_if_needed(uninitialized_exp_args)
        ParamManager.param_col.model_file = experiment.exp_global.model_file
        ParamManager.param_col.save_num_checkpoints = experiment.exp_global.save_num_checkpoints
        ParamManager.populate()

        # Run the experiment
        eval_scores = experiment(save_fct=lambda: save_to_file(model_file, experiment))
        results.append((experiment_name, eval_scores))
        print_results(results)
      except Exception as e:
        file_logger.error(traceback.format_exc())
        raise e
      finally:
        tee.unset_out_file()
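# Hedged usage sketch: this entry point is typically exposed as a console script or
# run as a module; the module path and flags below follow the argparse setup above
# but are assumptions, not verified against the installed package.
#   python -m xnmt.xnmt_run_experiments --settings=standard my_experiments.yaml exp1 exp2
#   python -m xnmt.xnmt_run_experiments --resume my_experiments.yaml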
import os

import xnmt.tee
from xnmt.param_collection import ParamManager
from xnmt.persistence import initialize_if_needed, YamlPreloader, LoadSerialized, save_to_file

EXP_DIR = os.path.dirname(__file__)
EXP = "programmatic-load"

model_file = f"{EXP_DIR}/models/{EXP}.mod"
log_file = f"{EXP_DIR}/logs/{EXP}.log"

xnmt.tee.set_out_file(log_file)

ParamManager.init_param_col()

load_experiment = LoadSerialized(
  filename=f"{EXP_DIR}/models/programmatic.mod",
  overwrite=[
    {"path": "train", "val": None}
  ]
)

uninitialized_experiment = YamlPreloader.preload_obj(load_experiment, exp_dir=EXP_DIR, exp_name=EXP)
loaded_experiment = initialize_if_needed(uninitialized_experiment)

# if we were to continue training, we would need to set a save-model file like this:
# ParamManager.param_col.model_file = model_file
ParamManager.populate()

# run experiment
loaded_experiment(save_fct=lambda: save_to_file(model_file, loaded_experiment))
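# Hedged variant sketch: `overwrite` accepts a list of {"path", "val"} entries, so
# other components of the saved experiment can be swapped at load time as well. The
# dotted path below is an illustrative assumption about this experiment's structure,
# not something taken from the original script.
# load_experiment = LoadSerialized(
#   filename=f"{EXP_DIR}/models/programmatic.mod",
#   overwrite=[{"path": "train", "val": None},
#              {"path": "evaluate.0.eval_metrics", "val": "bleu"}])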