Example #1
def predict(model, data_loader, eval=True):
    textlogger = logging.getLogger("openchem.predict")
    model.eval()
    start = time.time()
    prediction = []
    samples = []
    has_module = hasattr(model, 'module')
    if has_module:
        task = model.module.task
        logdir = model.module.logdir
    else:
        task = model.task
        logdir = model.logdir

    for i_batch, sample_batched in enumerate(data_loader):
        if has_module:
            task = model.module.task
            use_cuda = model.module.use_cuda
            batch_input, batch_object = model.module.cast_inputs(
                sample_batched, task, use_cuda, for_prediction=True)
        else:
            task = model.task
            use_cuda = model.use_cuda
            batch_input, batch_object = model.cast_inputs(sample_batched,
                                                          task,
                                                          use_cuda,
                                                          for_prediction=True)
        predicted = model(batch_input, eval=True)
        if hasattr(predicted, 'detach'):
            predicted = predicted.detach().cpu().numpy()
        prediction += list(predicted)
        samples += list(batch_object)

    if task == 'classification':
        prediction = np.argmax(prediction, axis=1)
    assert len(prediction) == len(samples)

    if comm.is_main_process():
        # Only the main process writes predictions and logs timing.
        with open(logdir + "/predictions.txt", "w") as f:
            for i in range(len(prediction)):
                # Decode the sample back from character codes and trim
                # anything after the first space (e.g. padding).
                tmp = ''.join(chr(c) for c in samples[i])
                if " " in tmp:
                    tmp = tmp[:tmp.index(" ")]
                # np.atleast_1d handles both scalar predictions (e.g. after
                # argmax for classification) and per-class vectors.
                to_write = ",".join(
                    str(pred) for pred in np.atleast_1d(prediction[i]))
                f.write(tmp + "," + to_write + "\n")
        textlogger.info('Predictions saved to ' + logdir + "/predictions.txt")
        textlogger.info('PREDICTION: [Time: %s, Number of samples: %d]' %
                        (time_since(start), len(prediction)))
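For context, a minimal usage sketch, assuming a trained OpenChem model that exposes task, logdir, use_cuda and cast_inputs; my_model and my_dataset below are hypothetical placeholders:

# Hypothetical usage sketch for predict(); my_model and my_dataset are
# placeholders, not part of the example above.
from torch.utils.data import DataLoader

loader = DataLoader(my_dataset, batch_size=128, shuffle=False)
predict(my_model, loader)  # writes <logdir>/predictions.txt on the main process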
Example #2
def fit(model,
        scheduler,
        train_loader,
        optimizer,
        criterion,
        params,
        eval=False,
        val_loader=None,
        cur_epoch=0):
    textlogger = logging.getLogger("openchem.fit")
    logdir = params['logdir']
    print_every = params['print_every']
    save_every = params['save_every']
    n_epochs = params['num_epochs']
    # Write TensorBoard summaries into the experiment's log directory.
    writer = SummaryWriter(log_dir=logdir)
    start = time.time()
    loss_total = 0
    n_batches = 0
    schedule_by_iter = scheduler.by_iteration
    scheduler = scheduler.scheduler
    all_losses = []
    val_losses = []
    has_module = hasattr(model, 'module')
    world_size = comm.get_world_size()

    for epoch in tqdm(range(cur_epoch, n_epochs + cur_epoch)):
        model.train()
        for i_batch, sample_batched in enumerate(tqdm(train_loader)):

            if has_module:
                task = model.module.task
                use_cuda = model.module.use_cuda
                batch_input, batch_target = model.module.cast_inputs(
                    sample_batched, task, use_cuda)
            else:
                task = model.task
                use_cuda = model.use_cuda
                batch_input, batch_target = model.cast_inputs(
                    sample_batched, task, use_cuda)
            loss = train_step(model, optimizer, criterion, batch_input,
                              batch_target)
            if schedule_by_iter and scheduler is not None:
                # scheduler steps once per iteration
                scheduler.step()
            if world_size > 1:
                reduced_loss = reduce_tensor(loss, world_size).item()
            else:
                reduced_loss = loss.item()
            loss_total += reduced_loss
            n_batches += 1
        cur_loss = loss_total / n_batches
        all_losses.append(cur_loss)

        if epoch % print_every == 0:
            if comm.is_main_process():
                textlogger.info(
                    'TRAINING: [Time: %s, Epoch: %d, Progress: %d%%, '
                    'Loss: %.4f]' % (time_since(start), epoch,
                                     epoch / n_epochs * 100, cur_loss))
            if eval:
                assert val_loader is not None
                val_loss, metrics = evaluate(model,
                                             val_loader,
                                             criterion,
                                             epoch=epoch)
                val_losses.append(val_loss)
                info = {
                    'Train loss': cur_loss,
                    'Validation loss': val_loss,
                    'Validation metrics': metrics,
                    'LR': optimizer.param_groups[0]['lr']
                }
            else:
                info = {
                    'Train loss': cur_loss,
                    'LR': optimizer.param_groups[0]['lr']
                }

            if comm.is_main_process():
                for tag, value in info.items():
                    writer.add_scalar(tag, value, epoch + 1)

                for tag, value in model.named_parameters():
                    tag = tag.replace('.', '/')
                    if torch.std(value).item() < 1e-3 or \
                            torch.isnan(torch.std(value)).item():
                        textlogger.warning(
                            "Warning: {} has near-zero variance ".format(tag) +
                            "(i.e. nearly constant tensor)")
                    else:
                        log_value = value.detach().cpu().numpy()
                        writer.add_histogram(tag, log_value, epoch + 1)
                        if value.grad is None:
                            print("Warning: {} grad is undefined".format(tag))
                        else:
                            log_value_grad = value.grad.detach().cpu().numpy()
                            writer.add_histogram(tag + "/grad", log_value_grad,
                                                 epoch + 1)

        if epoch % save_every == 0 and comm.is_main_process():
            torch.save(model.state_dict(),
                       logdir + '/checkpoint/epoch_' + str(epoch))

        loss_total = 0
        n_batches = 0
        if scheduler is not None:
            if not schedule_by_iter:
                # steps are in epochs
                scheduler.step()

    return all_losses, val_losses
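As the unwrapping at the top of fit() implies, the scheduler argument is a wrapper exposing .scheduler (the underlying torch LR scheduler, possibly None) and .by_iteration (step per batch vs. per epoch). A hedged usage sketch with placeholder names:

# Hypothetical usage sketch for fit(); my_model, train_loader, val_loader
# and criterion are placeholders, not part of the example above.
from types import SimpleNamespace
import torch

optimizer = torch.optim.Adam(my_model.parameters(), lr=1e-3)
lr_wrapper = SimpleNamespace(
    scheduler=torch.optim.lr_scheduler.StepLR(optimizer, step_size=10),
    by_iteration=False)  # step once per epoch
params = {'logdir': './logs', 'print_every': 1,
          'save_every': 5, 'num_epochs': 50}
all_losses, val_losses = fit(my_model, lr_wrapper, train_loader,
                             optimizer, criterion, params,
                             eval=True, val_loader=val_loader)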
Example #3
def evaluate(model, data_loader, criterion=None, epoch=None):
    textlogger = logging.getLogger("openchem.evaluate")
    model.eval()
    loss_total = 0
    n_batches = 0
    start = time.time()
    prediction = []
    ground_truth = []
    has_module = hasattr(model, 'module')
    if has_module:
        task = model.module.task
        eval_metrics = model.module.eval_metrics
        logdir = model.module.logdir
    else:
        task = model.task
        eval_metrics = model.eval_metrics
        logdir = model.logdir

    for i_batch, sample_batched in enumerate(data_loader):
        if has_module:
            task = model.module.task
            use_cuda = model.module.use_cuda
            batch_input, batch_target = model.module.cast_inputs(
                sample_batched, task, use_cuda)
        else:
            task = model.task
            use_cuda = model.use_cuda
            batch_input, batch_target = model.cast_inputs(
                sample_batched, task, use_cuda)
        predicted = model(batch_input, eval=True)
        try:
            loss = criterion(predicted, batch_target)
        except TypeError:
            # criterion may be None (metrics-only evaluation)
            loss = 0.0
        if hasattr(predicted, 'detach'):
            predicted = predicted.detach().cpu().numpy()
        if hasattr(batch_target, 'cpu'):
            batch_target = batch_target.cpu().numpy()
        if hasattr(loss, 'item'):
            loss = loss.item()
        if isinstance(loss, list):
            loss = 0.0
        prediction += list(predicted)
        ground_truth += list(batch_target)
        loss_total += loss
        n_batches += 1

    cur_loss = loss_total / n_batches
    if task == 'classification':
        prediction = np.argmax(prediction, axis=1)

    # Keep per-sample metrics for the debug dump below; log the mean.
    per_sample_metrics = calculate_metrics(prediction, ground_truth,
                                           eval_metrics)
    metrics = np.mean(per_sample_metrics)

    if task == "graph_generation":
        with open(logdir + "/debug_smiles_epoch_" + str(epoch) + ".smi",
                  "w") as f:
            if isinstance(per_sample_metrics, (list, np.ndarray)) and \
                    len(per_sample_metrics) == len(prediction):
                for i in range(len(prediction)):
                    f.write(str(prediction[i]) + "," +
                            str(per_sample_metrics[i]) + "\n")
            else:
                for i in range(len(prediction)):
                    f.write(str(prediction[i]) + "\n")

    if comm.is_main_process():
        textlogger.info('EVALUATION: [Time: %s, Loss: %.4f, Metrics: %.4f]' %
                        (time_since(start), cur_loss, metrics))

    return cur_loss, metrics
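A minimal sketch of calling evaluate() standalone, assuming a classification model whose eval_metrics accepts ground truth and argmaxed predictions; names are placeholders:

# Hypothetical usage sketch for evaluate().
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
val_loss, val_metric = evaluate(my_model, val_loader, criterion, epoch=0)
print("val loss %.4f, metric %.4f" % (val_loss, val_metric))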
Example #4
File: run.py  Project: zeromtmu/OpenChem
def main():
    parser = argparse.ArgumentParser(description='Experiment parameters')
    parser.add_argument("--use_cuda",
                        default=torch.cuda.is_available(),
                        help="Whether to train on GPU")
    parser.add_argument("--config_file",
                        required=True,
                        help="Path to the configuration file")
    parser.add_argument(
        "--mode",
        default='train',
        help="Could be \"train\", \"eval\", \"train_eval\", \"predict\"")
    parser.add_argument('--continue_learning',
                        dest='continue_learning',
                        action='store_true',
                        help="whether to continue learning")
    parser.add_argument("--force_checkpoint",
                        dest="force_checkpoint",
                        default="",
                        help="Full path to a pretrained snapshot "
                        "(e.g. useful for knowledge transfer or)")
    parser.add_argument('--dist-backend',
                        default='nccl',
                        type=str,
                        help='distributed backend')
    parser.add_argument('--seed',
                        default=None,
                        type=int,
                        help='seed for initializing training. ')
    parser.add_argument('--workers',
                        default=0,
                        type=int,
                        metavar='N',
                        help='number of data loading workers (default: 0)')
    parser.add_argument('--random_seed',
                        default=0,
                        type=int,
                        metavar='N',
                        help='random_seed (default: 0)')
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument("--copy_config_file",
                        action="store_true",
                        help="Copy config file to logdir (useful in training)")

    args, unknown = parser.parse_known_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) \
        if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend=args.dist_backend,
                                init_method='env://')
        print('Distributed process with rank {:d} initialized'.format(
            args.local_rank))

    cudnn.benchmark = True

    if args.mode not in ['train', 'eval', 'train_eval', 'infer', 'predict']:
        raise ValueError("Mode has to be one of "
                         "['train', 'eval', 'train_eval', 'infer', 'predict']")
    config_module = runpy.run_path(args.config_file)

    model_config = config_module.get('model_params', None)
    if model_config is None:
        raise ValueError('model_params dictionary has to be '
                         'defined in the config file')
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    model_config['use_cuda'] = args.use_cuda
    model_object = config_module.get('model', None)
    if model_object is None:
        raise ValueError('model class has to be defined in the config file')

    # After reading the config, try to overwrite some of its properties
    # with command-line arguments that were passed to the script.
    parser_unk = argparse.ArgumentParser()
    for pm, value in flatten_dict(model_config).items():
        if type(value) == int or type(value) == float or \
                isinstance(value, string_types):
            parser_unk.add_argument('--' + pm, default=value, type=type(value))
        elif type(value) == bool:
            parser_unk.add_argument('--' + pm,
                                    default=value,
                                    type=ast.literal_eval)

    config_update = parser_unk.parse_args(unknown)
    nested_update(model_config, nest_dict(vars(config_update)))

    # checking that everything is correct with log directory
    logdir = model_config['logdir']
    ckpt_dir = os.path.join(logdir, 'checkpoint')

    if args.force_checkpoint:
        assert not args.continue_learning, \
            "force_checkpoint and continue_learning are " \
            "mutually exclusive flags"
        checkpoint = args.force_checkpoint
        assert os.path.isfile(checkpoint), "{} is not a file".format(
            checkpoint)
        cur_epoch = 0
    elif args.mode in ['eval', 'infer', 'predict'] or args.continue_learning:
        checkpoint = get_latest_checkpoint(ckpt_dir)
        if checkpoint is None:
            raise IOError("Failed to find model checkpoint under "
                          "{}. Can't load the model".format(ckpt_dir))
        cur_epoch = int(os.path.basename(checkpoint).split("_")[-1]) + 1
    else:
        checkpoint = None
        cur_epoch = 0

    if not os.path.exists(logdir):
        comm.mkdir(logdir)
        print('Directory {} created'.format(logdir))
    elif os.path.isfile(logdir):
        raise IOError("There is a file with the same name as \"logdir\" "
                      "parameter. You should change the log directory path "
                      "or delete the file to continue.")

    if not os.path.exists(ckpt_dir):
        comm.mkdir(ckpt_dir)
        print('Directory {} created'.format(ckpt_dir))
    elif os.path.isdir(ckpt_dir) and os.listdir(ckpt_dir):
        if not args.continue_learning and args.mode not in [
                'eval', 'infer', 'predict'
        ]:
            raise IOError("Log directory is not empty. If you want to "
                          "continue learning, you should provide "
                          "\"--continue_learning\" flag")

    doprint = comm.is_main_process()
    tofile = os.path.join(logdir, "log.txt")
    logger = setup_textlogger("openchem", doprint, tofile)
    msg = "Running with config:\n"
    for k, v in sorted(flatten_dict(model_config).items()):
        msg += ("{}:\t{}\n".format(k, v)).expandtabs(50)
    logger.info("Running on {:d} GPUs".format(comm.get_world_size()))
    logger.info("Logging directory is set to {}".format(logdir))
    logger.info(msg)
    if args.copy_config_file:
        shutil.copy(args.config_file, logdir)

    train_config = copy.deepcopy(model_config)
    eval_config = copy.deepcopy(model_config)

    if args.mode == 'train' or args.mode == 'train_eval':
        if 'train_params' in config_module:
            nested_update(train_config,
                          copy.deepcopy(config_module['train_params']))
    if args.mode in ['eval', 'train_eval', 'infer', 'predict']:
        if 'eval_params' in config_module:
            nested_update(eval_config,
                          copy.deepcopy(config_module['eval_params']))

    if args.mode == "train" or args.mode == "train_eval":
        train_dataset = copy.deepcopy(model_config['train_data_layer'])
        if model_config['task'] == 'classification':
            train_dataset.target = train_dataset.target.reshape(-1)
        if args.distributed:
            train_sampler = DistributedSampler(train_dataset)
        else:
            train_sampler = None
        train_loader = create_loader(train_dataset,
                                     batch_size=model_config['batch_size'],
                                     shuffle=(train_sampler is None),
                                     num_workers=args.workers,
                                     pin_memory=True,
                                     sampler=train_sampler)
    else:
        train_loader = None

    if args.mode == "predict" and (
            'predict_data_layer' not in model_config.keys()
            or model_config['predict_data_layer'] is None):
        raise IOError("When model is run in 'predict' mode, "
                      "prediction data layer must be specified")

    if args.mode == "predict":
        predict_dataset = copy.deepcopy(model_config['predict_data_layer'])
        predict_loader = create_loader(predict_dataset,
                                       batch_size=model_config['batch_size'],
                                       shuffle=False,
                                       num_workers=1,
                                       pin_memory=True)
    else:
        predict_loader = None

    if args.mode in ["eval", "train_eval"
                     ] and ('val_data_layer' not in model_config.keys()
                            or model_config['val_data_layer'] is None):
        raise IOError("When model is run in 'eval' or 'train_eval' modes, "
                      "validation data layer must be specified")

    if args.mode in ["eval", "train_eval"]:
        val_dataset = copy.deepcopy(model_config['val_data_layer'])
        if model_config['task'] == 'classification':
            val_dataset.target = val_dataset.target.reshape(-1)
        val_loader = create_loader(val_dataset,
                                   batch_size=model_config['batch_size'],
                                   shuffle=False,
                                   num_workers=1,
                                   pin_memory=True)
    else:
        val_loader = None

    model_config['train_loader'] = train_loader
    model_config['val_loader'] = val_loader
    model_config['predict_loader'] = predict_loader

    # create model
    model = model_object(params=model_config)

    if args.use_cuda:
        model = model.to('cuda')

    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)
    else:
        model = DataParallel(model)

    if checkpoint is not None:
        logger.info("Loading model from {}".format(checkpoint))
        weights = torch.load(checkpoint, map_location=torch.device("cpu"))
        model.load_state_dict(weights)
    else:
        logger.info("Starting training from scratch")
    if args.mode in ["train", "train_eval"]:
        logger.info("Training is set up from epoch {:d}".format(cur_epoch))

    criterion, optimizer, lr_scheduler = build_training(model, model_config)

    if args.mode == 'train':
        fit(model,
            lr_scheduler,
            train_loader,
            optimizer,
            criterion,
            model_config,
            eval=False,
            cur_epoch=cur_epoch)
    elif args.mode == 'train_eval':
        fit(model,
            lr_scheduler,
            train_loader,
            optimizer,
            criterion,
            model_config,
            eval=True,
            val_loader=val_loader,
            cur_epoch=cur_epoch)
    elif args.mode == "eval":
        evaluate(model, val_loader, criterion)
    elif args.mode == "predict":
        predict(model, predict_loader)
    elif args.mode == "infer":
        comm.synchronize()
        start_time = time.time()

        model.eval()
        smiles = []

        with torch.no_grad():
            # A single sampling round of 1024 molecules; widen the range
            # to draw more batches.
            for i in range(1):
                batch_smiles = model(None, batch_size=1024)
                smiles.extend(batch_smiles)
                print("Iteration {:d}: {:d} smiles".format(
                    i + 1, len(batch_smiles)))

        if comm.get_world_size() > 1:
            path = os.path.join(
                logdir, "debug_smiles_{:d}.txt".format(comm.get_rank()))
            with open(path, "w") as f:
                for s in smiles:
                    f.write(s + "\n")

            comm.synchronize()

            if not comm.is_main_process():
                return

            smiles = []
            for i in range(comm.get_world_size()):
                path = os.path.join(logdir, "debug_smiles_{:d}.txt".format(i))
                with open(path) as f:
                    smiles_local = f.readlines()
                os.remove(path)

                smiles_local = [s.rstrip() for s in smiles_local]
                smiles.extend(smiles_local)

        path = os.path.join(logdir, "debug_smiles.txt")
        with open(path, "w") as f:
            for s in smiles:
                f.write(s + "\n")

        print("Generated {:d} molecules in {:.1f} seconds".format(
            len(smiles),
            time.time() - start_time))

        eval_metrics = model_config['eval_metrics']
        score = eval_metrics(None, smiles)
        qed_score = metrics.qed(smiles)
        logger.info("Eval metrics = {:.2f}".format(score))
        logger.info("QED score = {:.2f}".format(qed_score))

        smiles, idx = sanitize_smiles(smiles, logging="info")
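main() loads the config with runpy.run_path and expects it to define at least a model class (model) and a model_params dict. A minimal hypothetical config sketch, limited to keys the code above reads directly (real OpenChem configs also define the optimizer, criterion, and scheduler settings consumed by build_training):

# Hypothetical minimal config file (my_config.py). Only keys read by the
# code above are shown; data layers are left as None placeholders.
from my_project.models import MyOpenChemModel  # placeholder import

model = MyOpenChemModel

model_params = {
    'task': 'classification',
    'logdir': './logs/my_experiment',
    'batch_size': 128,
    'num_epochs': 50,
    'print_every': 1,
    'save_every': 5,
    'train_data_layer': None,    # dataset object for 'train'/'train_eval'
    'val_data_layer': None,      # dataset object for 'eval'/'train_eval'
    'predict_data_layer': None,  # dataset object for 'predict'
}

Since the script reads WORLD_SIZE from the environment and parses --local_rank, a multi-GPU run is typically launched with python -m torch.distributed.launch --nproc_per_node=<N> run.py --config_file=my_config.py, while a single-process run is simply python run.py --config_file=my_config.py.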