from typing import List, Tuple
import os
import pickle
import pprint
import random

import mxnet as mx
import numpy as np
from mxnet import autograd, gluon, nd
from sklearn.model_selection import train_test_split  # assumed source of train_test_split
from tqdm import tqdm

# Project-local names used below are assumed to be importable at module level
# (their defining modules are not shown in this excerpt): models, experiments,
# data, logger, start_logging, s3_sync, s3_cp, AsyncDataLoader, evaluate_loss,
# preprocess_task_for_model, test_s3shared_path.


def evaluate_model(seed: int,
                   log_dir: str,
                   gpu_ids: Tuple[int, ...],
                   model_name: str,
                   model_filepath: str,
                   model_params_filepath: str,
                   test_data_directory: str,
                   n_workers: int,
                   n_batch: int,
                   evaluation_metrics: Tuple[str, ...],
                   s3shared_cloud_log_path: str = None):
    """All args should be json serializable."""
    random.seed(seed)
    np.random.seed(seed)
    mx.random.seed(seed)

    logger = start_logging(log_dir)

    # Only request GPUs that actually exist on this machine
    ctx = [mx.gpu(i) for i in gpu_ids if i < mx.context.num_gpus()]

    model = models.__dict__[model_name].load_model(model_filepath)
    model.load_parameters(model_params_filepath, ctx=ctx)
    # model.hybridize()
    logger.info('Loaded Model {} with params:\n{}'.format(
        model, pprint.pformat(model.__dict__)))

    datapoints = os.listdir(test_data_directory)
    try:
        # The pickled DataEncoder is not a datapoint; drop it if present
        datapoints.remove('{}.pkl'.format(model.DataEncoder.__name__))
    except ValueError:
        pass
    else:
        logger.info(
            "There normally shouldn't be a DataEncoder in your test data...but okay"
        )
    datapoints = [os.path.join(test_data_directory, i) for i in datapoints]
    logger.info('Testing on preprocessed data in {} ({} datapoints)'.format(
        test_data_directory, len(datapoints)))
    loader = AsyncDataLoader(datapoints, model.split_and_batchify, n_batch,
                             ctx, n_workers)

    # # Dummy computation to initialize model for hybridize
    # split_batch, batch_length = loader.__next__()
    # [nd.sum(model(batch.data)) + nd.sum(batch.label) for batch in split_batch]

    for metric in evaluation_metrics:
        metric_fxn = experiments.__dict__[metric]
        metric_val = metric_fxn(loader, model)
        logger.info('{}: {}'.format(metric, metric_val))

    if s3shared_cloud_log_path:
        s3_sync(log_dir, s3shared_cloud_log_path)

    # For testing (note: only the last metric's value is returned)
    return metric_val
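# Usage sketch (not wired into the pipeline): the expected call shape of
# evaluate_model when pointing it at already-downloaded artifacts. Every
# literal below -- the paths, 'SomeModel', 'evaluate_accuracy' -- is a
# hypothetical placeholder; real names must exist in the models and
# experiments modules.
def _example_evaluate_model():
    evaluate_model(
        seed=514,
        log_dir='/tmp/eval_logs',                      # hypothetical path
        gpu_ids=(0,),
        model_name='SomeModel',                        # hypothetical class in models
        model_filepath='/tmp/train_logs/model.pkl',    # hypothetical path
        model_params_filepath='/tmp/train_logs/best.params',  # hypothetical path
        test_data_directory='/tmp/SomeModel_default_preprocessed_data',  # hypothetical
        n_workers=4,
        n_batch=256,
        evaluation_metrics=('evaluate_accuracy',))     # hypothetical fxn in experiments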
def evaluate_model_for_experiment(seed: int,
                                  gpu_ids: Tuple[int, ...],
                                  dataset_name: str,
                                  experiment_name: str,
                                  experiment_run_log_id: str,
                                  model_name: str,
                                  model_label: str,
                                  n_workers: int,
                                  n_batch: int,
                                  evaluation_metrics: Tuple[str, ...],
                                  model_params_to_load: str = 'best.params',
                                  skip_s3_sync=False,
                                  test=False):
    # First pass: pull the test data for both repo splits down to local disk.
    # Assumes we've already preprocessed the data for the experiment, and
    # we're pulling it from s3
    for repo_type in ['seen_repos', 'unseen_repos']:
        test_data_dir_suffix = os.path.join(dataset_name, 'experiments',
                                            experiment_name, repo_type,
                                            'test_graphs')
        if test:
            s3shared_local_path = test_s3shared_path
        else:
            from experiments import s3shared_local_path, s3shared_cloud_path
            if not skip_s3_sync:
                s3_sync(
                    os.path.join(s3shared_cloud_path, test_data_dir_suffix),
                    os.path.join(s3shared_local_path, test_data_dir_suffix))

    # Second pass: evaluate the trained model on each repo split
    for repo_type in ['seen_repos', 'unseen_repos']:
        test_data_dir_suffix = os.path.join(dataset_name, 'experiments',
                                            experiment_name, repo_type,
                                            'test_graphs')
        local_test_dir = os.path.join(s3shared_local_path,
                                      test_data_dir_suffix)
        model_train_log_suffix = os.path.join(
            dataset_name, 'experiments', experiment_name, 'seen_repos',
            'train_graphs', 'logs', experiment_run_log_id,
            '_'.join([model_name, model_label]))
        model_filepath = os.path.join(s3shared_local_path,
                                      model_train_log_suffix, 'model.pkl')
        model_params_filepath = os.path.join(s3shared_local_path,
                                             model_train_log_suffix,
                                             model_params_to_load)
        if not test:
            # Fetch the pickled model and its trained parameters from s3
            s3_cp(
                os.path.join(s3shared_cloud_path, model_train_log_suffix,
                             'model.pkl'), model_filepath)
            s3_cp(
                os.path.join(s3shared_cloud_path, model_train_log_suffix,
                             model_params_to_load), model_params_filepath)
        log_dir_suffix = os.path.join('eval_logs', experiment_run_log_id,
                                      '_'.join([model_name, model_label]),
                                      model_params_to_load)
        log_dir = os.path.join(local_test_dir, log_dir_suffix)
        test_data_dir = os.path.join(
            local_test_dir,
            '_'.join([model_name, model_label, 'preprocessed_data']))
        if test:
            s3_cloud_log_path = None
        else:
            s3_cloud_log_path = os.path.join(s3shared_cloud_path,
                                             test_data_dir_suffix,
                                             log_dir_suffix)
        evaluate_model(seed=seed,
                       log_dir=log_dir,
                       gpu_ids=gpu_ids,
                       model_name=model_name,
                       model_filepath=model_filepath,
                       model_params_filepath=model_params_filepath,
                       test_data_directory=test_data_dir,
                       n_workers=n_workers,
                       n_batch=n_batch,
                       evaluation_metrics=evaluation_metrics,
                       s3shared_cloud_log_path=s3_cloud_log_path)
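# For orientation, the directory layout evaluate_model_for_experiment assumes,
# reconstructed from the os.path.join calls above (names in angle brackets are
# the function's arguments; 'my_dataset' etc. are placeholders):
#
#   <s3shared_local_path>/my_dataset/experiments/my_experiment/
#     seen_repos/
#       train_graphs/
#         logs/<experiment_run_log_id>/<model_name>_<model_label>/
#           model.pkl, best.params, model_checkpoint_epoch_*.params
#       test_graphs/
#         <model_name>_<model_label>_preprocessed_data/
#         eval_logs/<experiment_run_log_id>/<model_name>_<model_label>/<model_params_to_load>/
#     unseen_repos/
#       test_graphs/
#         (same substructure as seen_repos/test_graphs)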
def train_model_for_experiment(dataset_name: str,
                               experiment_name: str,
                               experiment_run_log_id: str,
                               seed: int,
                               gpu_ids: Tuple[int, ...],
                               model_name: str,
                               model_label: str,
                               model_kwargs: dict,
                               init_fxn_name: str,
                               init_fxn_kwargs: dict,
                               loss_fxn_name: str,
                               loss_fxn_kwargs: dict,
                               optimizer_name: str,
                               optimizer_kwargs: dict,
                               val_fraction: float,
                               n_workers: int,
                               n_epochs: int,
                               evaluation_metrics: List[str],
                               n_batch: int,
                               debug: bool = False,
                               skip_s3_sync=False,
                               test: bool = False):
    # Assumes we've already preprocessed the data for the experiment, and
    # we're pulling it from s3
    train_data_dir_suffix = os.path.join(dataset_name, 'experiments',
                                         experiment_name, 'seen_repos',
                                         'train_graphs')
    if test:
        s3shared_local_path = test_s3shared_path
    else:
        from experiments import s3shared_local_path, s3shared_cloud_path
        if not skip_s3_sync:
            s3_sync(
                os.path.join(
                    s3shared_cloud_path, train_data_dir_suffix,
                    '_'.join([model_name, model_label, 'preprocessed_data'])),
                os.path.join(
                    s3shared_local_path, train_data_dir_suffix,
                    '_'.join([model_name, model_label, 'preprocessed_data'])))
    local_train_dir = os.path.join(s3shared_local_path, train_data_dir_suffix)
    model_class = models.__dict__[model_name]

    log_dir_suffix = os.path.join('logs', experiment_run_log_id,
                                  '_'.join([model_name, model_label]))
    log_dir = os.path.join(local_train_dir, log_dir_suffix)
    train_data_dir = os.path.join(
        local_train_dir,
        '_'.join([model_name, model_label, 'preprocessed_data']))
    if test:
        s3_cloud_log_path = None
    else:
        s3_cloud_log_path = os.path.join(s3shared_cloud_path,
                                         train_data_dir_suffix,
                                         log_dir_suffix)
    train(seed=seed,
          log_dir=log_dir,
          gpu_ids=gpu_ids,
          model_name=model_name,
          data_encoder_filepath=os.path.join(
              train_data_dir,
              '{}.pkl'.format(model_class.DataEncoder.__name__)),
          model_kwargs=model_kwargs,
          init_fxn_name=init_fxn_name,
          init_fxn_kwargs=init_fxn_kwargs,
          loss_fxn_name=loss_fxn_name,
          loss_fxn_kwargs=loss_fxn_kwargs,
          optimizer_name=optimizer_name,
          optimizer_kwargs=optimizer_kwargs,
          train_data_directory=train_data_dir,
          val_fraction=val_fraction,
          n_workers=n_workers,
          n_epochs=n_epochs,
          evaluation_metrics=evaluation_metrics,
          n_batch=n_batch,
          s3shared_cloud_log_path=s3_cloud_log_path,
          debug=debug)
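# Usage sketch for train_model_for_experiment. The dataset/experiment/model
# names, loss name, metric names, and all kwargs below are hypothetical
# placeholders; init_fxn_name and optimizer_name must be real classes in
# mx.init and mx.optimizer ('Xavier' and 'Adam' are), while model, loss, and
# metric names must exist in the models and experiments modules.
def _example_train_model_for_experiment():
    train_model_for_experiment(
        dataset_name='my_dataset',              # hypothetical
        experiment_name='my_experiment',        # hypothetical
        experiment_run_log_id='2018-01-01_00-00-00',  # hypothetical
        seed=514,
        gpu_ids=(0, 1),
        model_name='SomeModel',                 # hypothetical class in models
        model_label='default',
        model_kwargs=dict(hidden_size=128),     # hypothetical
        init_fxn_name='Xavier',                 # real mx.init initializer
        init_fxn_kwargs=dict(),
        loss_fxn_name='SomeLoss',               # hypothetical class in experiments
        loss_fxn_kwargs=dict(),
        optimizer_name='Adam',                  # real mx.optimizer class
        optimizer_kwargs=dict(learning_rate=1e-3),
        val_fraction=0.15,
        n_workers=4,
        n_epochs=10,
        evaluation_metrics=['evaluate_accuracy'],  # hypothetical
        n_batch=256,
        skip_s3_sync=True,
        test=True)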
def train(
        seed: int,
        log_dir: str,
        gpu_ids: Tuple[int, ...],
        model_name: str,
        data_encoder_filepath: str,
        model_kwargs: dict,
        init_fxn_name: str,
        init_fxn_kwargs: dict,
        loss_fxn_name: str,
        loss_fxn_kwargs: dict,
        optimizer_name: str,
        optimizer_kwargs: dict,
        train_data_directory: str,
        val_fraction: float,
        n_workers: int,
        n_epochs: int,
        evaluation_metrics: Tuple[str, ...],
        n_batch: int,  # n_batch is the total, so each gpu gets n_batch / len(gpu_ids) datapoints
        s3shared_cloud_log_path: str = None,
        debug: bool = False,
        test: bool = False):
    """All args should be json serializable."""
    random.seed(seed)
    np.random.seed(seed)
    mx.random.seed(seed)

    logger = start_logging(log_dir, debug)
    ctx = [mx.gpu(i) for i in gpu_ids]
    logger.info(
        'Starting training with args:\nseed: {}\ngpu_ids: {}\nval_fraction: {}\nn_workers: {}\nn_epochs: {}\nn_batch: {}\n'
        .format(seed, gpu_ids, val_fraction, n_workers, n_epochs, n_batch))

    # Load the DataEncoder fit during preprocessing and build the model around it
    model_kwargs['data_encoder_filepath'] = data_encoder_filepath
    with open(data_encoder_filepath, 'rb') as f:
        model_kwargs['data_encoder'] = pickle.load(f)
    model = models.__dict__[model_name](**model_kwargs)
    # model.hybridize()
    model.save_model(os.path.join(log_dir, 'model.pkl'))
    logger.info(
        'Instantiated Model {} with kwargs:\n{}\nand DataEncoder that processed with kwargs:\n{}'
        .format(
            model_name, pprint.pformat(model_kwargs),
            pprint.pformat(model.data_encoder.instance_to_datapoints_kwargs)))

    initializer = mx.init.__dict__[init_fxn_name](**init_fxn_kwargs)
    model.collect_params().initialize(initializer, ctx=ctx)
    logger.info('Initialized Model using {} with params:\n{}'.format(
        init_fxn_name, pprint.pformat(initializer.__dict__)))

    loss_fxn = experiments.__dict__[loss_fxn_name](**loss_fxn_kwargs)
    logger.info('Instantiated Loss {} with params:\n{}'.format(
        loss_fxn_name, pprint.pformat(loss_fxn.__dict__)))

    optimizer = mx.optimizer.__dict__[optimizer_name](**optimizer_kwargs)
    logger.info('Instantiated optimizer {} with params:\n{}'.format(
        optimizer_name, pprint.pformat(optimizer.__dict__)))
    trainer = gluon.Trainer(model.collect_params(), optimizer)

    datapoints = os.listdir(train_data_directory)
    # Drop the pickled DataEncoder; everything else in the directory is a datapoint
    datapoints.remove('{}.pkl'.format(model.DataEncoder.__name__))
    datapoints = [os.path.join(train_data_directory, i) for i in datapoints]
    logger.info('Training on preprocessed data in {} ({} datapoints)'.format(
        train_data_directory, len(datapoints)))
    train_data, val_data = train_test_split(datapoints,
                                            test_size=val_fraction)
    if test:
        # Smoke-test mode: validate on the train split and train on the same
        # datapoints repeated 100x, so the model should overfit quickly
        val_data = train_data
        train_data = val_data * 100
    logger.info(
        'Train data contains {} datapoints, Val data contains {} datapoints'.
        format(len(train_data), len(val_data)))
    train_loader = AsyncDataLoader(train_data, model.split_and_batchify,
                                   n_batch, ctx, n_workers)
    val_loader = AsyncDataLoader(val_data, model.split_and_batchify, n_batch,
                                 ctx, n_workers)

    lowest_val_loss = np.inf
    for e in range(n_epochs):
        with train_loader as train_loader:
            cumulative_loss = nd.zeros((1,), ctx=ctx[0])
            for split_batch, batch_length in tqdm(
                    train_loader, total=train_loader.total_batches):
                with autograd.record():
                    # One forward pass per GPU shard of the batch
                    losses = [
                        loss_fxn(model(batch.data), batch.label,
                                 model.data_encoder) for batch in split_batch
                    ]
                for loss in losses:
                    loss.backward()
                trainer.step(batch_length)
                loss_sums = nd.concat(
                    *[loss.sum().as_in_context(ctx[0]) for loss in losses],
                    dim=0)
                cumulative_loss += nd.sum(loss_sums)
            cumulative_loss.wait_to_read()
        logger.info('Epoch {}. (Cumulative) Train Loss: {}'.format(
            e, cumulative_loss.asscalar() / len(train_loader)))

        val_loss = evaluate_loss(val_loader, model, loss_fxn)
        logger.info('Epoch {}. Val Loss: {}'.format(e, val_loss))
        if val_loss < lowest_val_loss:
            model.save_parameters(os.path.join(log_dir, 'best.params'))
        lowest_val_loss = np.min((val_loss, lowest_val_loss))

        for metric in evaluation_metrics:
            metric_fxn = experiments.__dict__[metric]
            metric_val = metric_fxn(val_loader, model)
            logger.info('Epoch {}. {}: {}'.format(e, metric, metric_val))

        checkpoint_filename = os.path.join(
            log_dir, 'model_checkpoint_epoch_{}.params'.format(e))
        model.save_parameters(checkpoint_filename)

        # Sync logs and checkpoints to the cloud after each epoch
        if s3shared_cloud_log_path:
            s3_sync(log_dir, s3shared_cloud_log_path)

    logger.info(
        'Training finished. S3 path: {}'.format(s3shared_cloud_log_path))

    # For testing
    return val_data, metric_val
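# The training loop above pins down the contract for pluggable losses: train()
# instantiates experiments.__dict__[loss_fxn_name](**loss_fxn_kwargs), then
# calls the instance per GPU shard as loss_fxn(model_output, labels,
# data_encoder). A minimal hypothetical sketch of a conforming loss follows;
# real losses live in the experiments module, and this one is illustration only.
class _ExampleMSELoss:
    def __init__(self, **kwargs):
        self.kwargs = kwargs  # accept arbitrary json-serializable kwargs

    def __call__(self, output, label, data_encoder):
        # data_encoder is passed in for vocabulary-aware losses; ignored here.
        # Return one loss value per datapoint so that loss.sum() and
        # trainer.step(batch_length) aggregate correctly across devices.
        sq_err = (output - label) ** 2
        return sq_err.reshape((sq_err.shape[0], -1)).mean(axis=1)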
def make_tasks_and_preprocess(
        seed: int,
        dataset_name: str,
        experiment_name: str,
        task_names: List[str],
        n_jobs: int,
        model_names_labels_and_prepro_kwargs: List[Tuple[str, str, frozenset,
                                                         dict, dict]],
        skip_make_tasks=False,
        test=False):
    # Assumes we've already created train and test directories, and we're
    # pulling them from s3
    if test:
        s3shared_local_path = test_s3shared_path
    else:
        from experiments import s3shared_local_path, s3shared_cloud_path
        s3_sync(os.path.join(s3shared_cloud_path, dataset_name, 'seen_repos'),
                os.path.join(s3shared_local_path, dataset_name, 'seen_repos'))
        s3_sync(
            os.path.join(s3shared_cloud_path, dataset_name, 'unseen_repos'),
            os.path.join(s3shared_local_path, dataset_name, 'unseen_repos'))
    dataset_dir = os.path.join(s3shared_local_path, dataset_name)
    experiment_dir = os.path.join(dataset_dir, 'experiments', experiment_name)
    os.makedirs(experiment_dir, exist_ok=True)
    for task_name in task_names:
        logger.info('Starting task {}'.format(task_name))
        task_class = data.__dict__[task_name]
        dataset_types = [
            os.path.join('seen_repos', 'train_graphs'),
            os.path.join('seen_repos', 'test_graphs'),
            os.path.join('unseen_repos', 'test_graphs')
        ]
        for dataset_type in dataset_types:
            gml_dir = os.path.join(dataset_dir, dataset_type)
            output_dir = os.path.join(experiment_dir, dataset_type)
            os.makedirs(output_dir, exist_ok=True)
            gml_files = [
                os.path.abspath(os.path.join(gml_dir, file))
                for file in os.listdir(gml_dir)
            ]
            task_filepath = os.path.join(output_dir,
                                         '{}.pkl'.format(task_name))
            if not skip_make_tasks:
                task = task_class.from_gml_files(gml_files)
                task.save(task_filepath)
            for model_name, model_label, excluded_edge_types, data_encoder_kwargs, instance_to_datapoints_kwargs in model_names_labels_and_prepro_kwargs:
                logger.info('Starting preprocessing for {} on {} {}'.format(
                    model_name, task_name, dataset_type))
                dataset_output_dir_suffix = '_'.join(
                    [model_name, model_label, 'preprocessed_data'])
                dataset_output_dir = os.path.join(output_dir,
                                                  dataset_output_dir_suffix)
                if dataset_type == dataset_types[0]:
                    # Fit a fresh DataEncoder on the training split...
                    data_encoder = 'new'
                else:
                    # ...and reuse the train-split encoder for both test splits
                    model_class = models.__dict__[model_name]
                    data_encoder = os.path.join(
                        experiment_dir, dataset_types[0],
                        dataset_output_dir_suffix,
                        '{}.pkl'.format(model_class.DataEncoder.__name__))
                preprocess_task_for_model(
                    seed=seed,
                    task_class_name=task_name,
                    task_filepath=task_filepath,
                    model_name=model_name,
                    dataset_output_dir=dataset_output_dir,
                    n_jobs=n_jobs,
                    excluded_edge_types=excluded_edge_types,
                    data_encoder=data_encoder,
                    data_encoder_kwargs=data_encoder_kwargs,
                    instance_to_datapoints_kwargs=instance_to_datapoints_kwargs
                )
    if not test:
        s3_sync(
            experiment_dir,
            os.path.join(s3shared_cloud_path, dataset_name, 'experiments',
                         experiment_name))
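# Usage sketch for make_tasks_and_preprocess, mainly to spell out the shape of
# model_names_labels_and_prepro_kwargs: a list of 5-tuples of (model_name,
# model_label, excluded_edge_types, data_encoder_kwargs,
# instance_to_datapoints_kwargs). All concrete names below are hypothetical;
# task and model names must be classes in the data and models modules.
def _example_make_tasks_and_preprocess():
    make_tasks_and_preprocess(
        seed=514,
        dataset_name='my_dataset',        # hypothetical
        experiment_name='my_experiment',  # hypothetical
        task_names=['SomeTask'],          # hypothetical class in data
        n_jobs=8,
        model_names_labels_and_prepro_kwargs=[
            ('SomeModel',   # hypothetical class in models
             'default',     # free-form label used in directory names
             frozenset(),   # edge types to exclude from the graphs
             dict(),        # kwargs for the model's DataEncoder
             dict()),       # kwargs for instance_to_datapoints
        ],
        test=True)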