# Example 1
def evaluate_model(seed: int,
                   log_dir: str,
                   gpu_ids: Tuple[int, ...],
                   model_name: str,
                   model_filepath: str,
                   model_params_filepath: str,
                   test_data_directory: str,
                   n_workers: int,
                   n_batch: int,
                   evaluation_metrics: Tuple[str, ...],
                   s3shared_cloud_log_path: str = None):
    """
    Load a saved model and its parameters, then compute each named
    evaluation metric over the preprocessed datapoints in
    test_data_directory, logging (and optionally s3-syncing) the results.

    All args should be json serializable.

    Returns the value of the last metric computed (None if
    evaluation_metrics is empty) — used by tests.
    """
    random.seed(seed)
    np.random.seed(seed)
    mx.random.seed(seed)

    logger = start_logging(log_dir)

    # Only keep the requested GPU ids that actually exist on this machine.
    ctx = [mx.gpu(i) for i in gpu_ids if i < mx.context.num_gpus()]

    model = models.__dict__[model_name].load_model(model_filepath)
    model.load_parameters(model_params_filepath, ctx=ctx)
    # model.hybridize()
    logger.info('Loaded Model {} with params:\n{}'.format(
        model, pprint.pformat(model.__dict__)))

    datapoints = os.listdir(test_data_directory)
    # A pickled DataEncoder belongs with training data; tolerate (but log)
    # its presence here so it isn't fed to the loader as a datapoint.
    try:
        datapoints.remove('{}.pkl'.format(model.DataEncoder.__name__))
    except ValueError:
        pass
    else:
        logger.info(
            "There normally shouldn't be a DataEncoder in your test data...but okay"
        )
    datapoints = [os.path.join(test_data_directory, i) for i in datapoints]
    logger.info('Testing on preprocessed data in {} ({} datapoints)'.format(
        test_data_directory, len(datapoints)))
    loader = AsyncDataLoader(datapoints, model.split_and_batchify, n_batch,
                             ctx, n_workers)

    # # Dummy computation to initialize model for hybridize
    # split_batch, batch_length = loader.__next__()
    # [nd.sum(model(batch.data)) + nd.sum(batch.label) for batch in split_batch]

    # Fix: pre-bind so the return below can't raise NameError when
    # evaluation_metrics is empty.
    metric_val = None
    for metric in evaluation_metrics:
        metric_fxn = experiments.__dict__[metric]
        metric_val = metric_fxn(loader, model)
        logger.info('{}: {}'.format(metric, metric_val))

        # Sync after each metric so partial results survive interruption.
        if s3shared_cloud_log_path:
            s3_sync(log_dir, s3shared_cloud_log_path)

    # For testing
    return metric_val
def evaluate_model_for_experiment(seed: int,
                                  gpu_ids: Tuple[int, ...],
                                  dataset_name: str,
                                  experiment_name: str,
                                  experiment_run_log_id: str,
                                  model_name: str,
                                  model_label: str,
                                  n_workers: int,
                                  n_batch: int,
                                  evaluation_metrics: Tuple[str, ...],
                                  model_params_to_load: str = 'best.params',
                                  skip_s3_sync=False,
                                  test=False):
    """
    Evaluate a trained model on the test sets of both seen and unseen repos.

    Assumes the data for the experiment was already preprocessed; unless
    test/skip_s3_sync, it (and the trained model files) are pulled from s3
    first, and eval logs are synced back up.
    """
    # Fix: resolve local/cloud paths once — neither branch depends on the
    # loop variable, so the per-iteration re-import was redundant.
    if test:
        s3shared_local_path = test_s3shared_path
    else:
        from experiments import s3shared_local_path, s3shared_cloud_path

    # First pass: pull all preprocessed test data down from s3.
    for repo_type in ['seen_repos', 'unseen_repos']:
        test_data_dir_suffix = os.path.join(dataset_name, 'experiments',
                                            experiment_name, repo_type,
                                            'test_graphs')
        if not test and not skip_s3_sync:
            s3_sync(
                os.path.join(s3shared_cloud_path, test_data_dir_suffix),
                os.path.join(s3shared_local_path, test_data_dir_suffix))

    # Second pass: evaluate on each repo type.
    for repo_type in ['seen_repos', 'unseen_repos']:
        test_data_dir_suffix = os.path.join(dataset_name, 'experiments',
                                            experiment_name, repo_type,
                                            'test_graphs')
        local_test_dir = os.path.join(s3shared_local_path,
                                      test_data_dir_suffix)

        # The model + params were written during training on seen_repos.
        model_train_log_suffix = os.path.join(
            dataset_name, 'experiments', experiment_name, 'seen_repos',
            'train_graphs', 'logs', experiment_run_log_id,
            '_'.join([model_name, model_label]))
        model_filepath = os.path.join(s3shared_local_path,
                                      model_train_log_suffix, 'model.pkl')
        model_params_filepath = os.path.join(s3shared_local_path,
                                             model_train_log_suffix,
                                             model_params_to_load)
        if not test:
            s3_cp(
                os.path.join(s3shared_cloud_path, model_train_log_suffix,
                             'model.pkl'), model_filepath)
            s3_cp(
                os.path.join(s3shared_cloud_path, model_train_log_suffix,
                             model_params_to_load), model_params_filepath)

        log_dir_suffix = os.path.join('eval_logs', experiment_run_log_id,
                                      '_'.join([model_name, model_label]),
                                      model_params_to_load)
        log_dir = os.path.join(local_test_dir, log_dir_suffix)
        test_data_dir = os.path.join(
            local_test_dir,
            '_'.join([model_name, model_label, 'preprocessed_data']))
        if test:
            s3_cloud_log_path = None
        else:
            s3_cloud_log_path = os.path.join(s3shared_cloud_path,
                                             test_data_dir_suffix,
                                             log_dir_suffix)
        evaluate_model(seed=seed,
                       log_dir=log_dir,
                       gpu_ids=gpu_ids,
                       model_name=model_name,
                       model_filepath=model_filepath,
                       model_params_filepath=model_params_filepath,
                       test_data_directory=test_data_dir,
                       n_workers=n_workers,
                       n_batch=n_batch,
                       evaluation_metrics=evaluation_metrics,
                       s3shared_cloud_log_path=s3_cloud_log_path)
def train_model_for_experiment(dataset_name: str,
                               experiment_name: str,
                               experiment_run_log_id: str,
                               seed: int,
                               gpu_ids: Tuple[int, ...],
                               model_name: str,
                               model_label: str,
                               model_kwargs: dict,
                               init_fxn_name: str,
                               init_fxn_kwargs: dict,
                               loss_fxn_name: str,
                               loss_fxn_kwargs: dict,
                               optimizer_name: str,
                               optimizer_kwargs: dict,
                               val_fraction: float,
                               n_workers: int,
                               n_epochs: int,
                               evaluation_metrics: Tuple[str, ...],
                               n_batch: int,
                               debug: bool = False,
                               skip_s3_sync=False,
                               test: bool = False):
    """
    Sync this experiment's preprocessed training data from s3 (unless
    test/skip_s3_sync) and run train() on it, writing logs under
    experiment_run_log_id and syncing them back to the cloud.
    """
    # Assumes we've already preprocessed the data for the experiment, and we're pulling it from s3
    train_data_dir_suffix = os.path.join(dataset_name, 'experiments',
                                         experiment_name, 'seen_repos',
                                         'train_graphs')
    if test:
        # Test mode works entirely off the local fixture path; note that
        # s3shared_cloud_path is never needed in this branch.
        s3shared_local_path = test_s3shared_path
    else:
        # Function-scope import deliberately binds the shared paths as locals.
        from experiments import s3shared_local_path, s3shared_cloud_path
        if not skip_s3_sync:
            s3_sync(
                os.path.join(
                    s3shared_cloud_path, train_data_dir_suffix,
                    '_'.join([model_name, model_label, 'preprocessed_data'])),
                os.path.join(
                    s3shared_local_path, train_data_dir_suffix,
                    '_'.join([model_name, model_label, 'preprocessed_data'])))
    local_train_dir = os.path.join(s3shared_local_path, train_data_dir_suffix)

    model_class = models.__dict__[model_name]

    log_dir_suffix = os.path.join('logs', experiment_run_log_id,
                                  '_'.join([model_name, model_label]))
    log_dir = os.path.join(local_train_dir, log_dir_suffix)
    train_data_dir = os.path.join(
        local_train_dir,
        '_'.join([model_name, model_label, 'preprocessed_data']))
    if test:
        s3_cloud_log_path = None
    else:
        s3_cloud_log_path = os.path.join(s3shared_cloud_path,
                                         train_data_dir_suffix, log_dir_suffix)
    # The DataEncoder pickle is expected to sit alongside the preprocessed
    # training data, named after the model's DataEncoder class.
    train(seed=seed,
          log_dir=log_dir,
          gpu_ids=gpu_ids,
          model_name=model_name,
          data_encoder_filepath=os.path.join(
              train_data_dir,
              '{}.pkl'.format(model_class.DataEncoder.__name__)),
          model_kwargs=model_kwargs,
          init_fxn_name=init_fxn_name,
          init_fxn_kwargs=init_fxn_kwargs,
          loss_fxn_name=loss_fxn_name,
          loss_fxn_kwargs=loss_fxn_kwargs,
          optimizer_name=optimizer_name,
          optimizer_kwargs=optimizer_kwargs,
          train_data_directory=train_data_dir,
          val_fraction=val_fraction,
          n_workers=n_workers,
          n_epochs=n_epochs,
          evaluation_metrics=evaluation_metrics,
          n_batch=n_batch,
          s3shared_cloud_log_path=s3_cloud_log_path,
          debug=debug)
# Example 4
def train(
        seed: int,
        log_dir: str,
        gpu_ids: Tuple[int, ...],
        model_name: str,
        data_encoder_filepath: str,
        model_kwargs: dict,
        init_fxn_name: str,
        init_fxn_kwargs: dict,
        loss_fxn_name: str,
        loss_fxn_kwargs: dict,
        optimizer_name: str,
        optimizer_kwargs: dict,
        train_data_directory: str,
        val_fraction: float,
        n_workers: int,
        n_epochs: int,
        evaluation_metrics: Tuple[str, ...],
        n_batch: int,  # n_batch is the total, so each gpu gets n_batch / len(gpu_ids) datapoints
        s3shared_cloud_log_path: str = None,
        debug: bool = False,
        test: bool = False):
    '''
    Train a model on preprocessed data, checkpointing parameters every
    epoch and saving the lowest-validation-loss parameters to best.params.

    All args should be json serializable.

    Returns (val_data, metric_val) for tests; metric_val is the last metric
    computed (None if n_epochs or evaluation_metrics is empty).
    '''
    random.seed(seed)
    np.random.seed(seed)
    mx.random.seed(seed)

    logger = start_logging(log_dir, debug)

    ctx = [mx.gpu(i) for i in gpu_ids]

    logger.info(
        'Starting training with args:\nseed: {}\ngpu_ids: {}\nval_fraction: {}\nn_workers: {}\nn_epochs: {}\nn_batch: {}\n'
        .format(seed, gpu_ids, val_fraction, n_workers, n_epochs, n_batch))

    # NOTE(review): pickle.load assumes the encoder file is trusted input.
    model_kwargs['data_encoder_filepath'] = data_encoder_filepath
    with open(data_encoder_filepath, 'rb') as f:
        model_kwargs['data_encoder'] = pickle.load(f)
    model = models.__dict__[model_name](**model_kwargs)
    # model.hybridize()
    model.save_model(os.path.join(log_dir, 'model.pkl'))
    logger.info(
        'Instantiated Model {} with kwargs:\n{}\nand DataEncoder that processed with kwargs:\n{}'
        .format(
            model_name, pprint.pformat(model_kwargs),
            pprint.pformat(model.data_encoder.instance_to_datapoints_kwargs)))

    initializer = mx.init.__dict__[init_fxn_name](**init_fxn_kwargs)
    model.collect_params().initialize(initializer, ctx=ctx)
    logger.info('Initialized Model using {} with params:\n{}'.format(
        init_fxn_name, pprint.pformat(initializer.__dict__)))

    loss_fxn = experiments.__dict__[loss_fxn_name](**loss_fxn_kwargs)
    logger.info('Instantiated Loss {} with params:\n{}'.format(
        loss_fxn_name, pprint.pformat(loss_fxn.__dict__)))

    optimizer = mx.optimizer.__dict__[optimizer_name](**optimizer_kwargs)
    logger.info('Instantiated optimizer {} with params:\n{}'.format(
        optimizer_name, pprint.pformat(optimizer.__dict__)))
    trainer = gluon.Trainer(model.collect_params(), optimizer)

    # The DataEncoder pickle lives alongside the datapoints; it's not a
    # training example, so drop it before building the loaders.
    datapoints = os.listdir(train_data_directory)
    datapoints.remove('{}.pkl'.format(model.DataEncoder.__name__))
    datapoints = [os.path.join(train_data_directory, i) for i in datapoints]
    logger.info('Training on preprocessed data in {} ({} datapoints)'.format(
        train_data_directory, len(datapoints)))
    train_data, val_data = train_test_split(datapoints, test_size=val_fraction)
    if test:
        # Test mode: validate on the (small) train split and overfit on
        # 100 copies of it.
        val_data = train_data
        train_data = val_data * 100
    logger.info(
        'Train data contains {} datapoints, Val data contains {} datapoints'.
        format(len(train_data), len(val_data)))
    train_loader = AsyncDataLoader(train_data, model.split_and_batchify,
                                   n_batch, ctx, n_workers)
    val_loader = AsyncDataLoader(val_data, model.split_and_batchify, n_batch,
                                 ctx, n_workers)

    lowest_val_loss = np.inf
    # Fix: pre-bind so the return below can't raise NameError when
    # n_epochs == 0 or evaluation_metrics is empty.
    metric_val = None
    for e in range(n_epochs):
        with train_loader as train_loader:
            cumulative_loss = nd.zeros((1, ), ctx=ctx[0])
            for split_batch, batch_length in tqdm(
                    train_loader, total=train_loader.total_batches):
                with autograd.record():
                    losses = [
                        loss_fxn(model(batch.data), batch.label,
                                 model.data_encoder) for batch in split_batch
                    ]
                for loss in losses:
                    loss.backward()
                trainer.step(batch_length)
                # Gather each device's loss onto ctx[0] before accumulating.
                loss_sums = nd.concat(
                    *[loss.sum().as_in_context(ctx[0]) for loss in losses],
                    dim=0)
                cumulative_loss += nd.sum(loss_sums)
                # Force evaluation so async execution can't queue unboundedly.
                cumulative_loss.wait_to_read()
            logger.info('Epoch {}. (Cumulative) Train Loss: {}'.format(
                e,
                cumulative_loss.asscalar() / len(train_loader)))

        val_loss = evaluate_loss(val_loader, model, loss_fxn)
        logger.info('Epoch {}. Val Loss: {}'.format(e, val_loss))
        if val_loss < lowest_val_loss:
            model.save_parameters(os.path.join(log_dir, 'best.params'))
        lowest_val_loss = np.min((val_loss, lowest_val_loss))

        for metric in evaluation_metrics:
            metric_fxn = experiments.__dict__[metric]
            metric_val = metric_fxn(val_loader, model)
            logger.info('Epoch {}. {}: {}'.format(e, metric, metric_val))

        checkpoint_filename = os.path.join(
            log_dir, 'model_checkpoint_epoch_{}.params'.format(e))
        model.save_parameters(checkpoint_filename)
        if s3shared_cloud_log_path:
            s3_sync(log_dir, s3shared_cloud_log_path)

    # NOTE(review): error level on a normal-completion message looks
    # deliberate (guarantees visibility at any log level) — confirm.
    logger.error(
        'Training finished.  S3 path: {}'.format(s3shared_cloud_log_path))

    # For testing
    return val_data, metric_val
def make_tasks_and_preprocess(
        seed: int,
        dataset_name: str,
        experiment_name: str,
        task_names: List[str],
        n_jobs: int,
        model_names_labels_and_prepro_kwargs: List[Tuple[str, str, frozenset,
                                                         dict, dict]],
        skip_make_tasks=False,
        test=False):
    """
    Build task files from .gml graphs and preprocess them for every model.

    Assumes the train/test graph directories already exist (pulled from s3
    unless running in test mode); results are synced back to s3 afterwards.
    """
    if test:
        s3shared_local_path = test_s3shared_path
    else:
        from experiments import s3shared_local_path, s3shared_cloud_path
        for repos in ('seen_repos', 'unseen_repos'):
            s3_sync(os.path.join(s3shared_cloud_path, dataset_name, repos),
                    os.path.join(s3shared_local_path, dataset_name, repos))

    dataset_dir = os.path.join(s3shared_local_path, dataset_name)
    experiment_dir = os.path.join(dataset_dir, 'experiments', experiment_name)
    os.makedirs(experiment_dir, exist_ok=True)

    # Same three splits for every task; the first (seen train) is where a
    # fresh DataEncoder gets fit.
    dataset_types = [
        os.path.join('seen_repos', 'train_graphs'),
        os.path.join('seen_repos', 'test_graphs'),
        os.path.join('unseen_repos', 'test_graphs')
    ]

    for task_name in task_names:
        logger.info('Starting task {}'.format(task_name))
        task_class = data.__dict__[task_name]
        for dataset_type in dataset_types:
            gml_dir = os.path.join(dataset_dir, dataset_type)
            output_dir = os.path.join(experiment_dir, dataset_type)
            os.makedirs(output_dir, exist_ok=True)

            gml_files = [
                os.path.abspath(os.path.join(gml_dir, fname))
                for fname in os.listdir(gml_dir)
            ]
            task_filepath = os.path.join(output_dir,
                                         '{}.pkl'.format(task_name))

            if not skip_make_tasks:
                task_class.from_gml_files(gml_files).save(task_filepath)

            for (model_name, model_label, excluded_edge_types,
                 data_encoder_kwargs, instance_to_datapoints_kwargs
                 ) in model_names_labels_and_prepro_kwargs:
                logger.info('Starting preprocessing for {} on {} {}'.format(
                    model_name, task_name, dataset_type))
                prepro_suffix = '_'.join(
                    [model_name, model_label, 'preprocessed_data'])
                prepro_dir = os.path.join(output_dir, prepro_suffix)
                if dataset_type == dataset_types[0]:
                    # Training split: fit a brand-new encoder.
                    data_encoder = 'new'
                else:
                    # Test splits reuse the encoder fit on the train split.
                    model_class = models.__dict__[model_name]
                    data_encoder = os.path.join(
                        experiment_dir, dataset_types[0], prepro_suffix,
                        '{}.pkl'.format(model_class.DataEncoder.__name__))
                preprocess_task_for_model(
                    seed=seed,
                    task_class_name=task_name,
                    task_filepath=task_filepath,
                    model_name=model_name,
                    dataset_output_dir=prepro_dir,
                    n_jobs=n_jobs,
                    excluded_edge_types=excluded_edge_types,
                    data_encoder=data_encoder,
                    data_encoder_kwargs=data_encoder_kwargs,
                    instance_to_datapoints_kwargs=instance_to_datapoints_kwargs
                )

    if not test:
        s3_sync(
            experiment_dir,
            os.path.join(s3shared_cloud_path, dataset_name, 'experiments',
                         experiment_name))