def train_eval_lightning(
    train_dataset,
    eval_dataset,
    trainer_module,
    num_epochs,
    use_gpu,
    batch_preprocessor=None,
    reader_options: Optional[ReaderOptions] = None,
    checkpoint_path: Optional[str] = None,
) -> pl.Trainer:
    reader_options = reader_options or ReaderOptions()
    datamodule = PetastormLightningDataModule(
        train_dataset, eval_dataset, batch_preprocessor, reader_options
    )
    # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
    trainer = pl.Trainer(
        # max_epochs is deliberately over-provisioned; StoppingEpochCallback
        # ends training once the requested number of epochs has run.
        max_epochs=num_epochs * 1000,
        gpus=int(use_gpu),
        reload_dataloaders_every_epoch=True,
        resume_from_checkpoint=checkpoint_path,
        callbacks=[StoppingEpochCallback(num_epochs)],
    )
    trainer.fit(trainer_module, datamodule=datamodule)
    # TODO: evaluate
    return trainer
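# The real StoppingEpochCallback is defined elsewhere in the repo; below is a
# minimal sketch of the idea, assuming the pre-1.5 Lightning Callback API
# (the on_epoch_end hook and trainer.should_stop). Sketch only, not the actual
# implementation.
import pytorch_lightning as pl


class StoppingEpochCallbackSketch(pl.Callback):
    def __init__(self, num_epochs: int) -> None:
        super().__init__()
        self.num_epochs = num_epochs

    def on_epoch_end(self, trainer, pl_module) -> None:
        # current_epoch is 0-based; request a graceful stop once the desired
        # number of epochs has completed.
        if trainer.current_epoch + 1 >= self.num_epochs:
            trainer.should_stop = True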
def identify_and_train_network(
    input_table_spec: TableSpec,
    model: ModelManager__Union,
    num_epochs: int,
    use_gpu: Optional[bool] = None,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
    validator: Optional[ModelValidator__Union] = None,
    publisher: Optional[ModelPublisher__Union] = None,
) -> RLTrainingOutput:
    if use_gpu is None:
        # pyre-fixme[35]: Target cannot be annotated.
        use_gpu: bool = torch.cuda.is_available()

    reward_options = reward_options or RewardOptions()
    reader_options = reader_options or ReaderOptions()

    manager = model.value

    normalization_data_map = None
    setup_data = None

    data_module = manager.get_data_module(
        input_table_spec=input_table_spec,
        reward_options=reward_options,
        reader_options=reader_options,
        resource_options=resource_options,
    )
    if data_module is not None:
        data_module.prepare_data()
        setup_data = data_module.setup_data
    else:
        normalization_data_map = manager.run_feature_identification(input_table_spec)

    return query_and_train(
        input_table_spec,
        model,
        num_epochs,
        use_gpu=use_gpu,
        setup_data=setup_data,
        normalization_data_map=normalization_data_map,
        reward_options=reward_options,
        reader_options=reader_options,
        resource_options=resource_options,
        warmstart_path=warmstart_path,
        validator=validator,
        publisher=publisher,
    )
def train_workflow(
    self,
    train_dataset: Dataset,
    eval_dataset: Optional[Dataset],
    normalization_data_map: Dict[str, NormalizationData],
    num_epochs: int,
    use_gpu: bool,
    named_model_ids: ModuleNameToEntityId,
    child_workflow_id: int,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
) -> RLTrainingOutput:
    writer = SummaryWriter()
    logger.info("TensorBoard logging location is: {}".format(writer.log_dir))

    warmstart_input_path = warmstart_path or None
    self.initialize_trainer(
        use_gpu=use_gpu,
        # pyre-fixme[6]: Expected `RewardOptions` for 2nd param but got
        #  `Optional[RewardOptions]`.
        reward_options=reward_options,
        normalization_data_map=normalization_data_map,
        warmstart_path=warmstart_input_path,
    )

    if not reader_options:
        reader_options = ReaderOptions()

    with summary_writer_context(writer):
        train_output = self.train(train_dataset, eval_dataset, num_epochs, reader_options)

    output_paths = {}
    for module_name, serving_module in self.build_serving_modules().items():
        # TODO: make this a parameter
        torchscript_output_path = f"model_{round(time.time())}.torchscript"
        # Save the module yielded by build_serving_modules(); rebuilding it
        # here would overwrite the loop variable and save the wrong module.
        torch.jit.save(serving_module, torchscript_output_path)
        logger.info(f"Saved {module_name} to {torchscript_output_path}")
        output_paths[module_name] = torchscript_output_path
    return dataclasses.replace(train_output, output_paths=output_paths)
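# summary_writer_context is a ReAgent helper; a hedged sketch of the pattern it
# implements (a context manager that installs a SummaryWriter for the duration
# of training) follows. Names here are illustrative, not the real internals.
from contextlib import contextmanager

_CURRENT_WRITER = None


@contextmanager
def summary_writer_context_sketch(writer):
    global _CURRENT_WRITER
    # Save and swap the active writer so nested contexts restore correctly.
    prev, _CURRENT_WRITER = _CURRENT_WRITER, writer
    try:
        yield writer
    finally:
        _CURRENT_WRITER = prev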
def __init__(
    self,
    *,
    input_table_spec: Optional[TableSpec] = None,
    reward_options: Optional[RewardOptions] = None,
    setup_data: Optional[Dict[str, bytes]] = None,
    saved_setup_data: Optional[Dict[str, bytes]] = None,
    reader_options: Optional[ReaderOptions] = None,
    model_manager=None,
):
    super().__init__()
    self.input_table_spec = input_table_spec
    self.reward_options = reward_options or RewardOptions()
    self.reader_options = reader_options or ReaderOptions()
    self._model_manager = model_manager
    self.setup_data = setup_data
    self.saved_setup_data = saved_setup_data or {}
    self._setup_done = False
def train_and_evaluate_generic(
    train_dataset: Dataset,
    eval_dataset: Optional[Dataset],
    trainer: RLTrainer,
    num_epochs: int,
    use_gpu: bool,
    batch_preprocessor: BatchPreprocessor,
    reporter: Observer,
    evaluator: Evaluator,
    reader_options: Optional[ReaderOptions] = None,
) -> None:
    reader_options = reader_options or ReaderOptions()
    epoch_iterator = EpochIterator(num_epochs=num_epochs)

    # pyre-fixme[16]: `HiveDataSetClass` has no attribute `parquet_url`.
    train_dataset_size = get_table_row_count(train_dataset.parquet_url)

    # pyre-fixme[16]: `EpochIterator` has no attribute `add_observer`.
    for epoch in epoch_iterator.add_observer(reporter):
        logger.info(f"Starting training epoch {epoch}.")
        dataloader = get_petastorm_dataloader(
            dataset=train_dataset,
            # pyre-fixme[6]: Expected `int` for 2nd param but got `Optional[int]`.
            batch_size=trainer.minibatch_size,
            batch_preprocessor=batch_preprocessor,
            use_gpu=use_gpu,
            reader_options=reader_options,
        )
        dataloader_wrapper = DataLoaderWrapper(
            dataloader=dataloader, dataloader_size=train_dataset_size
        )
        for batch in dataloader_wrapper:
            trainer.train(batch)

        if eval_dataset is not None:
            eval_data = gather_eval_data(
                trainer=trainer,
                eval_dataset=eval_dataset,
                batch_preprocessor=batch_preprocessor,
                use_gpu=use_gpu,
                reader_options=reader_options,
            )
            # evaluator passes cpe_details to reporter via notify_observers
            evaluator.evaluate_post_training(eval_data)
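# EpochIterator and DataLoaderWrapper are ReAgent internals; a rough,
# hypothetical sketch of the observer-aware epoch loop they enable is below.
# The observer hook name (`update`) is an assumption, not the real Observer API.
class EpochIteratorSketch:
    def __init__(self, num_epochs: int) -> None:
        self.num_epochs = num_epochs
        self._observers = []

    def add_observer(self, observer) -> "EpochIteratorSketch":
        # Returning self is what lets the loop above write
        # `for epoch in epoch_iterator.add_observer(reporter)`.
        self._observers.append(observer)
        return self

    def __iter__(self):
        for epoch in range(self.num_epochs):
            yield epoch
            # Notify observers as each epoch completes.
            for observer in self._observers:
                observer.update(epoch)  # assumed hook; see note above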
def __init__(
    self,
    *,
    input_table_spec: Optional[TableSpec] = None,
    reward_options: Optional[RewardOptions] = None,
    setup_data: Optional[Dict[str, bytes]] = None,
    saved_setup_data: Optional[Dict[str, bytes]] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    model_manager=None,
):
    super().__init__()
    self.input_table_spec = input_table_spec
    self.reward_options = reward_options or RewardOptions()
    self.reader_options = reader_options or ReaderOptions()
    self.resource_options = resource_options or ResourceOptions(gpu=0)
    self._model_manager = model_manager
    self.setup_data = setup_data
    self.saved_setup_data = saved_setup_data or {}
    self._setup_done = False
    self._num_train_data_loader_calls = 0
    self._num_val_data_loader_calls = 0
    self._num_test_data_loader_calls = 0
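# A plausible (hypothetical) companion to the __init__ above: _setup_done
# typically guards setup() so repeated Lightning setup calls stay idempotent,
# and the dataloader call counters track how often each loader is requested.
# Sketch only; the real data module's setup and loaders differ.
import pytorch_lightning as pl


class IdempotentDataModuleSketch(pl.LightningDataModule):
    def __init__(self) -> None:
        super().__init__()
        self._setup_done = False
        self._num_train_data_loader_calls = 0

    def setup(self, stage=None) -> None:
        # Guarded so that Lightning calling setup() more than once
        # (e.g. once per stage) performs the expensive work only once.
        if self._setup_done:
            return
        self._setup_done = True

    def train_dataloader(self):
        self._num_train_data_loader_calls += 1
        raise NotImplementedError  # the real class builds a petastorm DataLoader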
def query_and_train(
    input_table_spec: TableSpec,
    model: ModelManager__Union,
    num_epochs: int,
    use_gpu: bool,
    *,
    setup_data: Optional[Dict[str, bytes]] = None,
    saved_setup_data: Optional[Dict[str, bytes]] = None,
    normalization_data_map: Optional[Dict[str, NormalizationData]] = None,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
    validator: Optional[ModelValidator__Union] = None,
    publisher: Optional[ModelPublisher__Union] = None,
    named_model_ids: Optional[ModuleNameToEntityId] = None,
    recurring_period: Optional[RecurringPeriod] = None,
) -> RLTrainingOutput:
    child_workflow_id = get_workflow_id()
    if named_model_ids is None:
        named_model_ids = get_new_named_entity_ids(model.value.serving_module_names())

    logger.info("Starting query")

    reward_options = reward_options or RewardOptions()
    reader_options = reader_options or ReaderOptions()
    resource_options = resource_options or ResourceOptions()
    manager = model.value

    if saved_setup_data is not None:

        def _maybe_get_bytes(v) -> bytes:
            if isinstance(v, bytes):
                return v
            # HACK: FBLearner sometimes packs bytes into a Blob
            return v.data

        saved_setup_data = {k: _maybe_get_bytes(v) for k, v in saved_setup_data.items()}

    if setup_data is None:
        data_module = manager.get_data_module(
            input_table_spec=input_table_spec,
            reward_options=reward_options,
            reader_options=reader_options,
            saved_setup_data=saved_setup_data,
        )
        if data_module is not None:
            setup_data = data_module.prepare_data()
            # Throw away existing normalization data map
            normalization_data_map = None

    # Exactly one of setup_data / normalization_data_map must be provided.
    if sum([int(setup_data is not None), int(normalization_data_map is not None)]) != 1:
        raise ValueError("setup_data and normalization_data_map are mutually exclusive")

    train_dataset = None
    eval_dataset = None
    if normalization_data_map is not None:
        calc_cpe_in_training = manager.should_generate_eval_dataset
        sample_range_output = get_sample_range(input_table_spec, calc_cpe_in_training)
        train_dataset = manager.query_data(
            input_table_spec=input_table_spec,
            sample_range=sample_range_output.train_sample_range,
            reward_options=reward_options,
        )
        eval_dataset = None
        if calc_cpe_in_training:
            eval_dataset = manager.query_data(
                input_table_spec=input_table_spec,
                sample_range=sample_range_output.eval_sample_range,
                reward_options=reward_options,
            )

    logger.info("Starting training")

    results = manager.train_workflow(
        train_dataset,
        eval_dataset,
        num_epochs=num_epochs,
        use_gpu=use_gpu,
        setup_data=setup_data,
        normalization_data_map=normalization_data_map,
        named_model_ids=named_model_ids,
        child_workflow_id=child_workflow_id,
        reward_options=reward_options,
        reader_options=reader_options,
        resource_options=resource_options,
        warmstart_path=warmstart_path,
    )

    if validator is not None:
        results = run_validator(validator, results)

    if publisher is not None:
        results = run_publisher(
            publisher,
            model,
            results,
            named_model_ids,
            child_workflow_id,
            recurring_period,
        )
    return results
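# Self-contained demo of the bytes-vs-Blob normalization performed by
# _maybe_get_bytes above, using a stand-in Blob type (hypothetical; the real
# FBLearner Blob differs, but exposes the same `.data` attribute).
from dataclasses import dataclass


@dataclass
class FakeBlob:
    data: bytes


def demo_maybe_get_bytes() -> None:
    def _maybe_get_bytes(v) -> bytes:
        return v if isinstance(v, bytes) else v.data

    # Raw bytes pass through unchanged; Blob-like wrappers are unwrapped.
    assert _maybe_get_bytes(b"abc") == b"abc"
    assert _maybe_get_bytes(FakeBlob(b"xyz")) == b"xyz"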
def train_workflow(
    model_manager: ModelManager,
    train_dataset: Optional[Dataset],
    eval_dataset: Optional[Dataset],
    *,
    num_epochs: int,
    use_gpu: bool,
    named_model_ids: ModuleNameToEntityId,
    child_workflow_id: int,
    setup_data: Optional[Dict[str, bytes]] = None,
    normalization_data_map: Optional[Dict[str, NormalizationData]] = None,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
) -> RLTrainingOutput:
    writer = SummaryWriter()
    logger.info("TensorBoard logging location is: {}".format(writer.log_dir))

    if setup_data is not None:
        data_module = model_manager.get_data_module(
            setup_data=setup_data,
            reward_options=reward_options,
            reader_options=reader_options,
            resource_options=resource_options,
        )
        assert data_module is not None
        data_module.setup()
    else:
        data_module = None

    if normalization_data_map is None:
        assert data_module is not None
        normalization_data_map = data_module.get_normalization_data_map()

    warmstart_input_path = warmstart_path or None
    trainer_module = model_manager.build_trainer(
        use_gpu=use_gpu,
        reward_options=reward_options,
        normalization_data_map=normalization_data_map,
    )

    if not reader_options:
        reader_options = ReaderOptions()
    if not resource_options:
        resource_options = ResourceOptions()

    with summary_writer_context(writer):
        train_output, lightning_trainer = model_manager.train(
            trainer_module,
            train_dataset,
            eval_dataset,
            None,
            data_module,
            num_epochs,
            reader_options,
            resource_options,
            checkpoint_path=warmstart_input_path,
        )

    output_paths = {}
    for module_name, serving_module in model_manager.build_serving_modules(
        trainer_module, normalization_data_map
    ).items():
        torchscript_output_path = (
            f"{model_manager.__class__.__name__}_{module_name}_"
            f"{round(time.time())}.torchscript"
        )
        torch.jit.save(serving_module, torchscript_output_path)
        logger.info(f"Saved {module_name} to {torchscript_output_path}")
        output_paths[module_name] = torchscript_output_path
    return dataclasses.replace(train_output, output_paths=output_paths)
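# Standalone illustration of the TorchScript serialization step above:
# script a tiny module, save it with torch.jit.save, and load it back.
# TinyServingModule is a placeholder, not a real ReAgent serving module.
import time

import torch


class TinyServingModule(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x)


def demo_torchscript_roundtrip() -> None:
    scripted = torch.jit.script(TinyServingModule())
    path = f"TinyServingModule_demo_{round(time.time())}.torchscript"
    torch.jit.save(scripted, path)
    loaded = torch.jit.load(path)
    # Round-trip check: the loaded module behaves like the original.
    assert torch.equal(loaded(torch.tensor([-1.0, 2.0])), torch.tensor([0.0, 2.0]))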
def query_and_train(
    input_table_spec: TableSpec,
    model: ModelManager__Union,
    normalization_data_map: Dict[str, NormalizationData],
    num_epochs: int,
    use_gpu: bool,
    reward_options: Optional[RewardOptions] = None,
    reader_options: Optional[ReaderOptions] = None,
    resource_options: Optional[ResourceOptions] = None,
    warmstart_path: Optional[str] = None,
    validator: Optional[ModelValidator__Union] = None,
    publisher: Optional[ModelPublisher__Union] = None,
    parent_workflow_id: Optional[int] = None,
    recurring_period: Optional[RecurringPeriod] = None,
) -> RLTrainingOutput:
    child_workflow_id = get_workflow_id()
    if parent_workflow_id is None:
        parent_workflow_id = child_workflow_id

    logger.info("Starting query")

    reward_options = reward_options or RewardOptions()
    reader_options = reader_options or ReaderOptions()
    resource_options = resource_options or ResourceOptions()
    manager = model.value
    calc_cpe_in_training = manager.should_generate_eval_dataset
    sample_range_output = get_sample_range(input_table_spec, calc_cpe_in_training)
    train_dataset = manager.query_data(
        input_table_spec=input_table_spec,
        sample_range=sample_range_output.train_sample_range,
        reward_options=reward_options,
    )
    eval_dataset = None
    if calc_cpe_in_training:
        eval_dataset = manager.query_data(
            input_table_spec=input_table_spec,
            sample_range=sample_range_output.eval_sample_range,
            reward_options=reward_options,
        )

    logger.info("Starting training")
    results = manager.train_workflow(
        train_dataset,
        eval_dataset,
        normalization_data_map,
        num_epochs,
        use_gpu,
        parent_workflow_id=parent_workflow_id,
        child_workflow_id=child_workflow_id,
        reward_options=reward_options,
        reader_options=reader_options,
        resource_options=resource_options,
        warmstart_path=warmstart_path,
    )

    if validator is not None:
        results = run_validator(validator, results)

    if publisher is not None:
        results = run_publisher(
            publisher,
            model,
            results,
            parent_workflow_id,
            child_workflow_id,
            recurring_period,
        )
    return results
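# Hedged sketch of the train/eval split a helper like get_sample_range could
# produce. The field names mirror the usage above; the split logic and
# eval_frac default are illustrative assumptions, not the real implementation.
from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class SampleRangeOutputSketch:
    train_sample_range: Optional[Tuple[float, float]]
    eval_sample_range: Optional[Tuple[float, float]]


def split_sample_range(
    calc_cpe_in_training: bool, eval_frac: float = 0.2
) -> SampleRangeOutputSketch:
    # Without CPE, train on the whole [0, 100) percentile range; with CPE,
    # hold out the last eval_frac of rows for the evaluation dataset.
    if not calc_cpe_in_training:
        return SampleRangeOutputSketch((0.0, 100.0), None)
    cut = 100.0 * (1.0 - eval_frac)
    return SampleRangeOutputSketch((0.0, cut), (cut, 100.0))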
def train_and_evaluate_generic(
    train_dataset: Dataset,
    eval_dataset: Optional[Dataset],
    trainer: RLTrainer,
    num_epochs: int,
    use_gpu: bool,
    batch_preprocessor: BatchPreprocessor,
    train_page_handler: TrainingPageHandler,
    eval_page_handler: EvaluationPageHandler,
    reader_options: Optional[ReaderOptions] = None,
):
    reader_options = reader_options or ReaderOptions()
    train_dataset_num_rows = get_table_row_count(train_dataset.parquet_url)
    eval_dataset_num_rows = None
    if eval_dataset is not None:
        eval_dataset_num_rows = get_table_row_count(eval_dataset.parquet_url)
    logger.info(
        f"train_data_num: {train_dataset_num_rows}, "
        f"eval_data_num: {eval_dataset_num_rows}"
    )

    for epoch in range(num_epochs):
        logger.info(f"Epoch {epoch} start feeding training data")
        data_reader = make_batch_reader(
            train_dataset.parquet_url,
            num_epochs=1,
            reader_pool_type=reader_options.petastorm_reader_pool_type,
        )
        with DataLoader(
            data_reader,
            batch_size=trainer.minibatch_size,
            collate_fn=collate_and_preprocess(batch_preprocessor),
        ) as data_loader:
            feed_pages(
                data_loader,
                train_dataset_num_rows,
                epoch,
                trainer.minibatch_size,
                use_gpu,
                train_page_handler,
            )

        if not eval_dataset:
            continue

        logger.info(f"Epoch {epoch} start feeding evaluation data")
        eval_data_reader = make_batch_reader(
            eval_dataset.parquet_url,
            num_epochs=1,
            reader_pool_type=reader_options.petastorm_reader_pool_type,
        )
        with DataLoader(
            eval_data_reader,
            batch_size=trainer.minibatch_size,
            collate_fn=collate_and_preprocess(batch_preprocessor),
        ) as eval_data_loader:
            feed_pages(
                eval_data_loader,
                eval_dataset_num_rows,
                epoch,
                trainer.minibatch_size,
                use_gpu,
                eval_page_handler,
            )
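# For reference, the underlying petastorm read pattern in isolation: a minimal
# sketch assuming a local parquet dataset at file:///tmp/table (hypothetical
# path). It mirrors the make_batch_reader + petastorm.pytorch.DataLoader usage
# above, without the ReAgent-specific collation and page handlers.
from petastorm import make_batch_reader
from petastorm.pytorch import DataLoader as PetastormDataLoader


def iterate_parquet(parquet_url: str = "file:///tmp/table", batch_size: int = 1024):
    # One pass over the dataset, using a thread-based reader pool.
    reader = make_batch_reader(parquet_url, num_epochs=1, reader_pool_type="thread")
    with PetastormDataLoader(reader, batch_size=batch_size) as loader:
        for batch in loader:
            # Each batch is a collated mapping of column name to torch tensor.
            yield batch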