def main(_params):
    global params
    params = _params

    train_df, test_df, dev_df, labels, num_labels, label_map, data_dir = prepare_data()
    data_args, model_args, config, tokenizer = prepare_config_and_tokenizer(
        data_dir, labels, num_labels, label_map)

    # ## Create Dataset Objects
    train_dataset = NerDataset(data_dir=data_args['data_dir'],
                               tokenizer=tokenizer,
                               labels=labels,
                               model_type=config.model_type,
                               max_seq_length=data_args['max_seq_length'],
                               overwrite_cache=data_args['overwrite_cache'],  # True
                               mode=Split.train,
                               data_size=params["data_size"])
    eval_dataset = NerDataset(data_dir=data_args['data_dir'],
                              tokenizer=tokenizer,
                              labels=labels,
                              model_type=config.model_type,
                              max_seq_length=data_args['max_seq_length'],
                              overwrite_cache=data_args['overwrite_cache'],
                              mode=Split.dev,
                              data_size=params["data_size"])
    print(len(train_dataset), len(eval_dataset))

    # Train top-model using the Trainer API
    trainer, model = run_train(train_dataset, eval_dataset, config, model_args,
                               labels, num_labels, label_map)
    gc.collect()
    torch.cuda.empty_cache()

    # ## Prepare test data, run trainer over test data and print metrics
    # We can pass overwrite_cache as True since we might like to make new
    # predictions by just changing test.txt.
    test_dataset = NerDataset(data_dir=data_args['data_dir'],
                              tokenizer=tokenizer,
                              labels=labels,
                              model_type=config.model_type,
                              max_seq_length=data_args['max_seq_length'],
                              overwrite_cache=True,
                              mode=Split.test,
                              data_size=params["data_size"])
    run_test(trainer, model, train_dataset, train_df, label_map)
    run_test(trainer, model, eval_dataset, dev_df, label_map)
    run_test(trainer, model, test_dataset, test_df, label_map)
def main(_params):
    global params
    params = _params
    '''
    params['seed_value'] = args.seed_value
    params['set_seed'] = args.set_seed
    '''
    wb_run = wandb.init(project="NER", name=params['exp_name'] + "_init")
    if params['set_seed']:
        random_seed_set(params['seed_value'])

    train_df, test_df, dev_df, labels, num_labels, label_map, data_dir, wt = prepare_data()
    data_args, model_args, config, tokenizer = prepare_config_and_tokenizer(
        data_dir, labels, num_labels, label_map)

    # Optionally extend the tokenizer vocabulary with entity tokens from each split
    if 'add_vocab' in params:
        process_entity(tokenizer, train_df)
        process_entity(tokenizer, dev_df)
        process_entity(tokenizer, test_df)

    # ## Create Dataset Objects
    # Collect extra dataset/model options (class weights, top-model choice) into xargs
    xargs = {}
    if params.get('xargs'):
        xargs = params['xargs']
    xargs['wt'] = wt
    print('Got class weights')
    xargs["top_model"] = params.get("top_model")

    train_dataset = NerDataset(data_dir=data_args['data_dir'],
                               tokenizer=tokenizer,
                               labels=labels,
                               model_type=config.model_type,
                               max_seq_length=data_args['max_seq_length'],
                               overwrite_cache=data_args['overwrite_cache'],  # True
                               mode=Split.train,
                               data_size=params["data_size"],
                               xargs=xargs)
    eval_dataset = NerDataset(data_dir=data_args['data_dir'],
                              tokenizer=tokenizer,
                              labels=labels,
                              model_type=config.model_type,
                              max_seq_length=data_args['max_seq_length'],
                              overwrite_cache=data_args['overwrite_cache'],
                              mode=Split.dev,
                              data_size=100)

    # ## Prepare test data, run trainer over test data and print metrics
    # We can pass overwrite_cache as True since we might like to make new
    # predictions by just changing test.txt.
    test_dataset = NerDataset(data_dir=data_args['data_dir'],
                              tokenizer=tokenizer,
                              labels=labels,
                              model_type=config.model_type,
                              max_seq_length=data_args['max_seq_length'],
                              overwrite_cache=True,
                              mode=Split.test,
                              data_size=100)
    print(len(train_dataset), len(eval_dataset), len(test_dataset))
    wb_run.finish()

    # Train top-model using the Trainer API
    if params.get("hyp"):
        # Hyperparameter search only; skip the regular training/evaluation run
        run_hyperp(train_dataset, eval_dataset, config, model_args, labels,
                   num_labels, label_map, tokenizer, xargs)
        return

    trainer, model = run_train(train_dataset, eval_dataset, config, model_args,
                               labels, num_labels, label_map, tokenizer, xargs)
    gc.collect()
    torch.cuda.empty_cache()

    # Log per-split classification reports and run metadata to a summary wandb run
    wb_run = wandb.init(project="NER", name=params['exp_name'] + "summary")
    report = run_test(trainer, model, train_dataset, train_df, label_map)
    wandb.run.summary["train_report"] = report
    report = run_test(trainer, model, eval_dataset, dev_df, label_map)
    wandb.run.summary["val_report"] = report
    report = run_test(trainer, model, test_dataset, test_df, label_map)
    wandb.run.summary["test_report"] = report

    wandb.run.summary["model"] = repr(model)
    wandb.run.summary["data"] = {
        "train": len(train_dataset),
        "val": len(eval_dataset),
        "test": len(test_dataset),
        "wt": wt,
    }
    params["model_type"] = params["model_type"].name
    wandb.run.summary["params"] = params
    wb_run.finish()
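

# A minimal sketch of how main() might be invoked. The keys below are inferred
# from how params is read inside main(); every concrete value (experiment name,
# seed, data-size cap, model type) and the names example_params / ModelKind are
# placeholder assumptions for illustration, not the configuration used in the
# original experiments.
if __name__ == "__main__":
    from enum import Enum

    # The real project defines its own model-type enum; this stand-in only
    # mirrors the ".name" attribute that main() reads at the end.
    ModelKind = Enum("ModelKind", ["BERT"])

    example_params = {
        "exp_name": "ner_baseline",   # hypothetical run name, used to label the wandb runs
        "set_seed": True,             # seed RNGs for reproducibility
        "seed_value": 42,
        "data_size": 1000,            # cap on the number of training examples
        "xargs": {},                  # extra options forwarded to NerDataset
        "top_model": None,            # optional top-model / classification-head choice
        "hyp": False,                 # True would trigger run_hyperp() instead of training
        "model_type": ModelKind.BERT, # purely illustrative placeholder enum member
    }
    main(example_params)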