def experiment_logger(args):
    '''
    :param args: from biencoder_parameters
    :return: dir for experiment log
    '''
    experiment_logdir = args.experiment_logdir  # trailing '/' is included
    timestamp = datetime.now(timezone('Asia/Tokyo'))
    str_timestamp = '{0:%Y%m%d_%H%M%S}'.format(timestamp)[2:]
    dir_for_each_experiment = experiment_logdir + str_timestamp
    if os.path.exists(dir_for_each_experiment):
        dir_for_each_experiment += '_d'  # avoid clobbering an existing run
    dir_for_each_experiment += '/'
    logger_path = dir_for_each_experiment + 'teelog.log'
    os.mkdir(dir_for_each_experiment)
    if not args.debug:
        sys.stdout = TeeLogger(logger_path, sys.stdout, False)  # default: False
        sys.stderr = TeeLogger(logger_path, sys.stderr, False)  # default: False
    return dir_for_each_experiment
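# A hedged usage sketch for experiment_logger above. The argparse-style `args`
# object and its attribute values are hypothetical; the function only needs
# `experiment_logdir` (an existing directory path ending in '/') and `debug`.
# Since TeeLogger swaps the global streams, the originals are stashed so a
# caller can restore plain console output after the run.
import argparse

def run_with_experiment_logger():
    args = argparse.Namespace(experiment_logdir='./experiment_logdir/',
                              debug=False)
    orig_stdout, orig_stderr = sys.stdout, sys.stderr
    try:
        logdir = experiment_logger(args)
        print('experiment output is tee\'d under', logdir)
    finally:
        sys.stdout, sys.stderr = orig_stdout, orig_stderr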
def prepare_global_logging(serialization_dir: str,
                           file_friendly_logging: bool) -> None:
    """
    This function configures 3 global logging attributes - streaming stdout and
    stderr to a file as well as the terminal, setting the formatting for the
    python logging library and setting the interval frequency for the Tqdm
    progress bar.

    Note that this function does not set the logging level, which is set in
    ``allennlp/run.py``.

    Parameters
    ----------
    serialization_dir : ``str``, required.
        The directory to stream logs to.
    file_friendly_logging : ``bool``, required.
        Whether logs should clean the output to prevent carriage returns
        (used to update progress bars on a single terminal line).
    """
    Tqdm.set_slower_interval(file_friendly_logging)
    std_out_file = os.path.join(serialization_dir, "stdout.log")
    sys.stdout = TeeLogger(std_out_file,  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    stdout_handler = logging.FileHandler(std_out_file)
    stdout_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(stdout_handler)
def prepare_global_logging(serialization_dir: str,
                           file_friendly_logging: bool) -> logging.FileHandler:
    """
    This function configures 3 global logging attributes - streaming stdout and
    stderr to a file as well as the terminal, setting the formatting for the
    python logging library and setting the interval frequency for the Tqdm
    progress bar.

    Note that this function does not set the logging level, which is set in
    ``allennlp/run.py``.

    Parameters
    ----------
    serialization_dir : ``str``, required.
        The directory to stream logs to.
    file_friendly_logging : ``bool``, required.
        Whether logs should clean the output to prevent carriage returns
        (used to update progress bars on a single terminal line). This
        option is typically only used if you are running in an environment
        without a terminal.

    Returns
    -------
    ``logging.FileHandler``
        A logging file handler that can later be closed and removed from the
        global logger.
    """
    # If we don't have a terminal as stdout,
    # force tqdm to be nicer.
    if not sys.stdout.isatty():
        file_friendly_logging = True

    Tqdm.set_slower_interval(file_friendly_logging)
    std_out_file = os.path.join(serialization_dir, "stdout.log")
    sys.stdout = TeeLogger(std_out_file,  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    stdout_handler = logging.FileHandler(std_out_file)
    stdout_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(stdout_handler)
    return stdout_handler
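# The docstring above says the returned FileHandler "can later be closed and
# removed from the global logger". A minimal cleanup sketch, assuming
# prepare_global_logging as defined above (the helper name is ours):
def cleanup_global_logging(stdout_handler: logging.FileHandler) -> None:
    # Detach the handler from the root logger and release the log file.
    logging.getLogger().removeHandler(stdout_handler)
    stdout_handler.close()

# Typical pairing (paths are illustrative):
#     handler = prepare_global_logging('runs/exp01', file_friendly_logging=False)
#     ...  # train / evaluate
#     cleanup_global_logging(handler)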
def datasetteelogger(KBspecified_experiment_dir):
    experiment_logdir = KBspecified_experiment_dir + 'experiment_logdir/'
    if not os.path.isdir(experiment_logdir):
        os.mkdir(experiment_logdir)
    timestamp = datetime.now(timezone('Asia/Tokyo'))
    str_timestamp = '{0:%Y%m%d_%H%M%S}'.format(timestamp)[2:]
    dir_for_each_experiment = experiment_logdir + str_timestamp
    dir_for_each_experiment += '/'
    loggerpath = dir_for_each_experiment + 'teelog.log'
    os.mkdir(dir_for_each_experiment)
    dir_for_candidate_dumping = dir_for_each_experiment + 'dumped_candidates'
    os.mkdir(dir_for_candidate_dumping)
    print('\n====== ===== =====\nNOTE: TIMESTAMP for this experiment:',
          dir_for_each_experiment)
    print('====== ===== =====')
    sys.stdout = TeeLogger(loggerpath, sys.stdout, False)  # default: False
    sys.stderr = TeeLogger(loggerpath, sys.stderr, False)  # default: False
    return dir_for_each_experiment
def train_model(db: FeverDocDB,
                params: Union[Params, Dict[str, Any]],
                cuda_device: int,
                serialization_dir: str,
                filtering: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, as these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params : Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : str, required
        The directory in which to save results and logs.
    """
    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    dataset_reader = FEVERReader(
        db,
        sentence_level=ds_params.pop("sentence_level", False),
        wiki_tokenizer=Tokenizer.from_params(ds_params.pop('wiki_tokenizer', {})),
        claim_tokenizer=Tokenizer.from_params(ds_params.pop('claim_tokenizer', {})),
        token_indexers=TokenIndexer.dict_from_params(ds_params.pop('token_indexers', {})),
        filtering=filtering)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        Dataset([instance for dataset in all_datasets
                 for instance in dataset.instances]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
def train_model(params: Union[Params, Dict[str, Any]],
                cuda_device: int,
                serialization_dir: str,
                filtering: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, as these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params : Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : str, required
        The directory in which to save results and logs.
    """
    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    try:
        sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                               sys.stdout, True)  # type: ignore
        sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                               sys.stderr, True)  # type: ignore
    except TypeError:
        # Older TeeLogger versions take only (filename, terminal).
        sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                               sys.stdout)  # type: ignore
        sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                               sys.stderr)  # type: ignore

    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    read_settings = ds_params.pop('read_settings', {})
    dataset_reader = FEVERReader.from_params(ds_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(
        train_data_path,
        include_metadata=True,
        replace_with_gold=read_settings.pop('replace_gold', False),
        pad_with_nearest=read_settings.pop('pad_with_nearest', 0))

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path,
                                              include_metadata=True)
    else:
        validation_data = None

    vocab_params = params.pop("vocabulary", {})
    dataset = None
    # Debug output: the vocabulary must be loaded from disk, not built here.
    print(dict(vocab_params), 'directory_path' not in vocab_params)
    assert 'directory_path' in vocab_params
    vocab = Vocabulary.from_params(vocab_params, dataset)
    print(vocab)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
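# The try/except TypeError above bridges two TeeLogger constructor signatures.
# A small helper sketch that factors out the same compatibility check (the
# helper name is ours, not from the original code):
def make_tee_logger(log_path, terminal, file_friendly=True):
    try:
        # Newer signature: (filename, terminal, file_friendly_terminal_output).
        return TeeLogger(log_path, terminal, file_friendly)
    except TypeError:
        # Older signature takes only (filename, terminal).
        return TeeLogger(log_path, terminal)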
def train_model(params: Params,
                serialization_dir: str,
                cuda_device: int,
                train_data_path: str,
                validation_data_path: str,
                test_data_path: str,
                file_friendly_logging: bool = False) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, as these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params : Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # all_datasets = datasets_from_params(params)
    all_datasets = datasets_from_args(params, train_data_path,
                                      validation_data_path, test_data_path)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    if cuda_device >= 0:
        model = model.cuda(cuda_device)

    # iterator = DataIterator.from_params(params.pop("iterator"))
    # iterator.index_with(vocab)
    train_iterator = DataIterator.from_params(params.pop("train_iterator"))
    val_iterator = DataIterator.from_params(params.pop("val_iterator"))
    train_iterator.index_with(vocab)
    val_iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, train_iterator,
                                  val_iterator, cuda_device, train_data,
                                  validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    # params.assert_empty('base train command')

    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, val_iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
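# A hedged usage sketch for the train_model variant above: load the experiment
# configuration with Params.from_file and pass the dataset paths explicitly.
# Every path and directory name here is illustrative, not from the original.
def example_train_run() -> Model:
    params = Params.from_file('experiment.json')
    return train_model(params,
                       serialization_dir='runs/experiment_01',
                       cuda_device=0,
                       train_data_path='data/train.jsonl',
                       validation_data_path='data/dev.jsonl',
                       test_data_path='data/test.jsonl',
                       file_friendly_logging=True)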
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)
    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(filename=log_filename,
                           terminal=sys.stdout,
                           file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(filename=log_filename,
                           terminal=sys.stderr,
                           file_friendly_terminal_output=False)

    tokenizer = WordTokenizer(
        start_tokens=['<s>'],
        end_tokens=['</s>'],
    )
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    train_labeled_dataset_path = train_params.pop('train_labeled_dataset_path')
    train_unlabeled_dataset_path = train_params.pop(
        'train_unlabeled_dataset_path', None)
    train_labeled_dataset = dataset_reader.read(train_labeled_dataset_path)
    train_labeled_dataset = filter_dataset_by_length(
        dataset=train_labeled_dataset, max_length=30)
    if train_unlabeled_dataset_path is not None:
        train_unlabeled_dataset = dataset_reader.read(
            train_unlabeled_dataset_path)
        train_unlabeled_dataset = filter_dataset_by_length(
            dataset=train_unlabeled_dataset, max_length=30)
    else:
        train_unlabeled_dataset = []
    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)
    if not train_unlabeled_dataset:
        unlabeled_iterator = None

    model = SNLIModel(params=model_params, vocab=vocab)
    optimizer = optim.Adam(params=model.parameters(),
                           lr=train_params.pop_float('lr', 1e-3))
    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    kl_anneal_rate = train_params.pop_float('kl_anneal_rate', None)
    if kl_anneal_rate is None:
        kl_weight_scheduler = None
    else:
        kl_weight_scheduler = (lambda step: min(1.0, kl_anneal_rate * step))
        model.kl_weight = 0.0

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      labeled_iterator=labeled_iterator,
                      unlabeled_iterator=unlabeled_iterator,
                      train_labeled_dataset=train_labeled_dataset,
                      train_unlabeled_dataset=train_unlabeled_dataset,
                      validation_dataset=valid_dataset,
                      summary_writer=summary_writer,
                      serialization_dir=save_dir,
                      num_epochs=train_params.pop('num_epochs', 50),
                      iters_per_epoch=len(train_labeled_dataset) // labeled_batch_size,
                      write_summary_every=100,
                      validate_every=2000,
                      patience=2,
                      clip_grad_max_norm=5,
                      kl_weight_scheduler=kl_weight_scheduler,
                      cuda_device=train_params.pop_int('cuda_device', 0),
                      early_stop=train_params.pop_bool('early_stop', True))
    trainer.train()
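# The kl_weight_scheduler above anneals the KL weight linearly from 0 up to a
# cap of 1.0. A standalone sketch of the same schedule; the rate value is
# illustrative (the real one comes from the 'kl_anneal_rate' param):
def linear_kl_weight(step, kl_anneal_rate=1e-4):
    return min(1.0, kl_anneal_rate * step)

assert linear_kl_weight(0) == 0.0        # training starts with no KL penalty
assert linear_kl_weight(5000) == 0.5     # halfway through the warm-up
assert linear_kl_weight(20000) == 1.0    # saturated after 10k steps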
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, as these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params : Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets: List[Dataset] = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        all_datasets.append(test_data)
        datasets_in_vocab.append("test")
    else:
        test_data = None

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        Dataset([instance for dataset in all_datasets
                 for instance in dataset.instances]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop("evaluate_on_test", False)
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    if test_data and evaluate_on_test:
        test_data.index_instances(vocab)
        evaluate(model, test_data, iterator,
                 cuda_device=trainer._cuda_device)  # pylint: disable=protected-access
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    return model
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)
    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(filename=log_filename,
                           terminal=sys.stdout,
                           file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(filename=log_filename,
                           terminal=sys.stderr,
                           file_friendly_terminal_output=False)

    tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter(),
                              start_tokens=['<s>'],
                              end_tokens=['</s>'])
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = QuoraParaphraseDatasetReader(
        tokenizer=tokenizer, token_indexers={'tokens': token_indexer})

    train_labeled_dataset_path = train_params.pop('train_labeled_dataset_path')
    train_unlabeled_dataset_path = train_params.pop(
        'train_unlabeled_dataset_path', None)
    train_labeled_dataset = dataset_reader.read(train_labeled_dataset_path)
    train_labeled_dataset = filter_dataset_by_length(
        dataset=train_labeled_dataset, max_length=35)
    if train_unlabeled_dataset_path is not None:
        train_unlabeled_dataset = dataset_reader.read(
            train_unlabeled_dataset_path)
        train_unlabeled_dataset = filter_dataset_by_length(
            dataset=train_unlabeled_dataset, max_length=35)
    else:
        train_unlabeled_dataset = []
    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)
    if not train_unlabeled_dataset:
        unlabeled_iterator = None

    model = SeparatedQuoraModel(params=model_params, vocab=vocab)
    optimizer = optim.Adam(params=model.parameters())
    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    trainer = SeparatedLVMTrainer(
        model=model,
        optimizer=optimizer,
        labeled_iterator=labeled_iterator,
        unlabeled_iterator=unlabeled_iterator,
        train_labeled_dataset=train_labeled_dataset,
        train_unlabeled_dataset=train_unlabeled_dataset,
        validation_dataset=valid_dataset,
        summary_writer=summary_writer,
        serialization_dir=save_dir,
        num_epochs=train_params.pop('num_epochs', 50),
        iters_per_epoch=len(train_labeled_dataset) // labeled_batch_size,
        write_summary_every=100,
        validate_every=2000,
        patience=train_params.pop('patience', 2),
        clip_grad_max_norm=5,
        cuda_device=train_params.pop_int('cuda_device', 0))
    trainer.train()
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, as these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params : Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.

    # 1. Primary training data.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    # 2. Auxiliary training data.
    dataset_reader_aux = DatasetReader.from_params(
        params.pop('dataset_reader_aux'))
    train_data_path_aux = params.pop('train_data_path_aux')
    logger.info("Reading auxiliary training data from %s", train_data_path_aux)
    train_data_aux = dataset_reader_aux.read(train_data_path_aux)

    # If only using a fraction of the auxiliary data.
    aux_sample_fraction = params.pop("aux_sample_fraction", 1.0)
    if aux_sample_fraction < 1.0:
        sample_size = int(aux_sample_fraction * len(train_data_aux.instances))
        train_data_aux = Dataset(
            random.sample(train_data_aux.instances, sample_size))

    # Balance the two datasets by inflating the size of the smaller dataset
    # to the size of the larger dataset.
    train_size = len(train_data.instances)
    aux_train_size = len(train_data_aux.instances)
    mixing_ratio = params.pop("mixing_ratio")
    # mixing_ratio = float(train_size) / aux_train_size
    if train_size > aux_train_size:  # case for PB scaffold.
        difference = train_size - aux_train_size
        aux_sample = [random.choice(train_data_aux.instances)
                      for _ in range(difference)]
        train_data_aux = Dataset(train_data_aux.instances + aux_sample)
        logger.info("Inflating auxiliary train data from {} to {} samples".format(
            aux_train_size, len(train_data_aux.instances)))
    # else:  # case for FN scaffold.
    #     difference = aux_train_size - train_size
    #     train_sample = [random.choice(train_data.instances) for _ in range(difference)]
    #     train_data = Dataset(train_data.instances + train_sample)
    #     logger.info("Inflating train data from {} to {} samples".format(
    #         train_size, len(train_data.instances)))

    all_datasets: Dict[str, Dataset] = {"train": train_data}
    all_datasets_aux: Dict[str, Dataset] = {"train_aux": train_data_aux}

    # 3. Primary validation data.
    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets["validation"] = validation_data
    else:
        validation_data = None

    # 4. Auxiliary validation data.
    validation_data_path_aux = params.pop('validation_data_path_aux', None)
    if validation_data_path_aux is not None:
        logger.info("Reading auxiliary validation data from %s",
                    validation_data_path_aux)
        validation_data_aux = dataset_reader_aux.read(validation_data_path_aux)
        all_datasets_aux["validation_aux"] = validation_data_aux
    else:
        validation_data_aux = None

    # 5. Primary test data.
    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        all_datasets["test"] = test_data
    else:
        test_data = None

    # 6. Auxiliary test data.
    test_data_path_aux = params.pop("test_data_path_aux", None)
    if test_data_path_aux is not None:
        logger.info("Reading auxiliary test data from %s", test_data_path_aux)
        test_data_aux = dataset_reader_aux.read(test_data_path_aux)
        all_datasets_aux["test_aux"] = test_data_aux
    else:
        test_data_aux = None

    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))
    datasets_for_vocab_creation_aux = set(
        params.pop("auxillary_datasets_for_vocab_creation", all_datasets_aux))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data. Auxiliary also included.",
                ", ".join(datasets_for_vocab_creation))
    dataset_primary = Dataset([instance
                               for key, dataset in all_datasets.items()
                               for instance in dataset.instances
                               if key in datasets_for_vocab_creation])
    dataset_aux = Dataset([instance
                           for key, dataset in all_datasets_aux.items()
                           for instance in dataset.instances
                           if key in datasets_for_vocab_creation_aux])
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   dataset_primary,
                                   dataset_aux=dataset_aux)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator_aux = DataIterator.from_params(params.pop("iterator_aux"))

    train_data.index_instances(vocab)
    train_data_aux.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)
    if validation_data_aux:
        validation_data_aux.index_instances(vocab)

    cutoff_epoch = params.pop("cutoff_epoch", -1)

    trainer_params = params.pop("trainer")
    trainer = MultiTaskTrainer.from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        iterator_aux=iterator_aux,
        train_dataset=train_data,
        train_dataset_aux=train_data_aux,
        mixing_ratio=mixing_ratio,
        cutoff_epoch=cutoff_epoch,
        validation_dataset=validation_data,
        validation_dataset_aux=validation_data_aux,
        params=trainer_params,
        files_to_archive=params.files_to_archive)

    evaluate_on_test = params.pop("evaluate_on_test", False)
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_data.index_instances(vocab)
        evaluate(model, test_data, iterator,
                 cuda_device=trainer._cuda_device)  # pylint: disable=protected-access
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    if test_data_aux and evaluate_on_test:
        test_data_aux.index_instances(vocab)
        evaluate(model, test_data_aux, iterator_aux,
                 cuda_device=trainer._cuda_device)  # pylint: disable=protected-access
    elif test_data_aux:
        logger.info("To evaluate on the auxiliary test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    return model
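# A standalone sketch of the balancing step in the scaffold train_model above:
# the smaller auxiliary dataset is inflated by sampling instances with
# replacement until it matches the primary dataset's size. Plain lists stand
# in for Dataset.instances here; the helper name is ours.
def inflate_with_replacement(instances, target_size):
    if len(instances) >= target_size:
        return list(instances)
    extra = [random.choice(instances)
             for _ in range(target_size - len(instances))]
    return list(instances) + extra

# e.g. balanced_aux = inflate_with_replacement(train_data_aux.instances,
#                                              len(train_data.instances))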
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)
    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(filename=log_filename,
                           terminal=sys.stdout,
                           file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(filename=log_filename,
                           terminal=sys.stderr,
                           file_friendly_terminal_output=False)

    tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter(),
                              start_tokens=['<s>'],
                              end_tokens=['</s>'])
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = QuoraParaphraseDatasetReader(
        tokenizer=tokenizer, token_indexers={'tokens': token_indexer})

    train_labeled_dataset = dataset_reader.read(
        train_params.pop('train_labeled_dataset_path'))
    train_unlabeled_dataset = dataset_reader.read(
        train_params.pop('train_unlabeled_dataset_path'))
    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))
    train_labeled_dataset = filter_dataset_by_length(
        dataset=train_labeled_dataset, max_length=35)
    train_unlabeled_dataset = filter_dataset_by_length(
        dataset=train_unlabeled_dataset, max_length=35)

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)

    pretrained_checkpoint_path = train_params.pop('pretrained_checkpoint_path',
                                                  None)

    model = QuoraModel(params=model_params, vocab=vocab)
    if pretrained_checkpoint_path:
        model.load_state_dict(
            torch.load(pretrained_checkpoint_path, map_location='cpu'))
    model.add_finetune_parameters(
        con_autoweight=train_params.pop_bool('con_autoweight', False),
        con_y_weight=train_params.pop_float('con_y_weight'),
        con_z_weight=train_params.pop_float('con_z_weight'),
        con_z2_weight=train_params.pop_float('con_z2_weight'))
    main_optimizer = optim.Adam(
        params=model.finetune_main_parameters(
            exclude_generator=train_params.pop_bool('exclude_generator')),
        lr=train_params.pop_float('lr', 1e-3))
    aux_optimizer = optim.Adam(params=model.finetune_aux_parameters(),
                               lr=train_params.pop_float('aux_lr', 1e-4))
    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    kl_anneal_rate = train_params.pop_float('kl_anneal_rate', None)
    if kl_anneal_rate is None:
        kl_weight_scheduler = None
    else:
        kl_weight_scheduler = (lambda step: min(1.0, kl_anneal_rate * step))
        model.kl_weight = 0.0

    gumbel_anneal_rate = train_params.pop_float('gumbel_anneal_rate', None)
    if gumbel_anneal_rate is None:
        gumbel_temperature_scheduler = None
    else:
        gumbel_temperature_scheduler = (
            lambda step: max(0.1, 1.0 - gumbel_anneal_rate * step))
        model.gumbel_temperature = 1.0

    trainer = FineTuningTrainer(
        model=model,
        main_optimizer=main_optimizer,
        aux_optimizer=aux_optimizer,
        labeled_iterator=labeled_iterator,
        unlabeled_iterator=unlabeled_iterator,
        train_labeled_dataset=train_labeled_dataset,
        train_unlabeled_dataset=train_unlabeled_dataset,
        validation_dataset=valid_dataset,
        summary_writer=summary_writer,
        serialization_dir=save_dir,
        num_epochs=train_params.pop_int('num_epochs', 50),
        iters_per_epoch=len(train_labeled_dataset) // labeled_batch_size,
        write_summary_every=100,
        validate_every=1000,
        patience=train_params.pop_int('patience', 5),
        clip_grad_max_norm=train_params.pop_float('grad_max_norm', 5.0),
        kl_weight_scheduler=kl_weight_scheduler,
        gumbel_temperature_scheduler=gumbel_temperature_scheduler,
        cuda_device=train_params.pop_int('cuda_device', 0))
    trainer.train()
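# The two schedulers handed to FineTuningTrainer above move in opposite
# directions: the KL weight ramps up toward 1.0 while the Gumbel-softmax
# temperature decays from 1.0 to a floor of 0.1. A standalone sketch of the
# temperature schedule with an illustrative rate:
def gumbel_temperature(step, gumbel_anneal_rate=1e-4):
    return max(0.1, 1.0 - gumbel_anneal_rate * step)

assert gumbel_temperature(0) == 1.0       # start with soft, high-entropy samples
assert gumbel_temperature(5000) == 0.5    # halfway through the decay
assert gumbel_temperature(100000) == 0.1  # clamped at the floor afterwards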