Example #1
def pca_train(n, normed_trainx, trainy, normed_validx, validy, opt, logger,
              layers, opt_lr, opt_epochs, optimizer):
    trainx, validx, var_ex = pca_nd(normed_trainx, normed_validx, n, logger)
    validy_ = validy.copy()  # keep CPU copies for metric evaluation
    trainy_ = trainy.copy()
    if opt.gpu:  # move everything to the GPU up front
        logger.info('Using GPU acceleration')
        device = torch.device("cuda:0")
        trainx = torch.Tensor(trainx).to(device)
        trainy = torch.Tensor(trainy).to(device)
        validx = torch.Tensor(validx).to(device)
        validy = torch.Tensor(validy).to(device)
    model = fitting.TorchMLPRegressor(len(trainx[0]),
                                      len(trainy[0]),
                                      layers,
                                      batch_size=opt.batch,
                                      is_gpu=opt.gpu != 0,
                                      args_opt={
                                          'optimizer': optimizer,
                                          # opt.lr is a comma-separated string;
                                          # use the first stage's numeric rate
                                          'lr': opt_lr[0],
                                          'weight_decay': opt.l2
                                      })
    model.init_session()
    logger.info(model.regressor)
    model.load_data(trainx, trainy)

    header = ('Epoch Loss MeaSquE MeaSigE MeaUnsE MaxRelE '
              'Acc1% Acc2% Acc5% Acc10%').split()
    logger.info('%-8s %8s %8s %8s %8s %8s %8s %8s %8s %8s' % tuple(header))
    total_epoch = 0

    for k, each_epoch in enumerate(opt_epochs):
        # apply the stage-specific learning rate from the schedule
        model.reset_optimizer({
            'optimizer': optimizer,
            'lr': opt_lr[k],
            'weight_decay': opt.l2
        })
        for i_epoch in range(each_epoch):
            total_epoch += 1
            loss = model.fit_epoch(trainx, trainy)
            if (i_epoch + 1) % 20 == 0 or i_epoch + 1 == each_epoch:
                predy = model.predict_batch(validx)
                err_line = '%d/%d %8.3e %8.3e %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f' % (
                    total_epoch, sum(opt_epochs), loss,
                    metrics.mean_squared_error(validy_, predy),
                    metrics.mean_signed_error(validy_, predy) * 100,
                    metrics.mean_unsigned_error(validy_, predy) * 100,
                    metrics.max_relative_error(validy_, predy) * 100,
                    metrics.accuracy(validy_, predy, 0.01) * 100,
                    metrics.accuracy(validy_, predy, 0.02) * 100,
                    metrics.accuracy(validy_, predy, 0.05) * 100,
                    metrics.accuracy(validy_, predy, 0.10) * 100)
                logger.info(err_line)

    # use the CPU copy validy_: validy may be a GPU tensor at this point
    return (var_ex, metrics.accuracy(validy_, predy, 0.02) * 100,
            metrics.mean_squared_error(validy_, predy))
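
A minimal invocation sketch, assuming the surrounding pipeline (pca_nd, the
scaled arrays, and an opt namespace with the attributes used above) is in
place; every concrete value below is an illustrative assumption, not a
recommended setting:

# Hypothetical call: reduce the scaled features to n=10 dimensions (per the
# pca_nd signature above) and train with a two-stage learning-rate schedule.
var_ex, acc2, mse = pca_train(
    n=10,
    normed_trainx=normed_trainx, trainy=trainy,
    normed_validx=normed_validx, validy=validy,
    opt=opt, logger=logger,
    layers=[16, 16],
    opt_lr=[0.01, 0.001],
    opt_epochs=[500, 2000],
    optimizer=torch.optim.Adam)
logger.info('var explained: %s  Acc2%%: %.1f  MSE: %.3e' %
            (var_ex, acc2, mse))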
Example #2
def load_data(opt, logger):
    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    if opt.featrm == 'auto':
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))

    logger.info('loading data...')
    datax, datay, data_names = dataloader.load(filename=opt.input,
                                               target=opt.target,
                                               fps=opt.fp.split(','),
                                               featrm=featrm)

    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        selector.load(opt.part)
    else:
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')

    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()

    logger.info('loading model...')
    scaler = preprocessing.Scaler()
    scaler.load(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)
    model = fitting.TorchMLPRegressor(
        None,
        None,
        [],
        is_gpu=False,
    )
    model.load(opt.output + '/model.pt')
    # if opt.pca != -1:
    #     normed_trainx, normed_validx, _ = pca_nd(
    #         normed_trainx, normed_validx, len(normed_trainx[0]) - opt.pca)
    return normed_validx, validy, model
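
The returned triple plugs straight into evaluation; a short usage sketch
(assuming the same metrics module the other examples use, and an opt/logger
pair supplied by the caller):

# Hedged sketch: score the reloaded model on the normalized validation set.
normed_validx, validy, model = load_data(opt, logger)
predy = model.predict_batch(normed_validx)
logger.info('validation MSE = %.3e, Acc2%% = %.1f' %
            (metrics.mean_squared_error(validy, predy),
             metrics.accuracy(validy, predy, 0.02) * 100))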
Example #3
def main():
    parser = argparse.ArgumentParser(
        description='Alkane property fitting demo')
    parser.add_argument('-i', '--input', type=str, help='Data')
    parser.add_argument('-f', '--fp', type=str, help='Fingerprints')
    parser.add_argument('-o',
                        '--output',
                        default='out',
                        type=str,
                        help='Output directory')
    parser.add_argument('-t',
                        '--target',
                        default='raw_density',
                        type=str,
                        help='Fitting target')
    parser.add_argument('-p',
                        '--part',
                        default='',
                        type=str,
                        help='Partition cache file')
    parser.add_argument('-l',
                        '--layer',
                        default='16,16',
                        type=str,
                        help='Size of hidden layers')
    parser.add_argument('--visual',
                        default=1,
                        type=int,
                        help='Visualization data')
    parser.add_argument('--gpu', default=1, type=int, help='Use GPU')
    parser.add_argument('--epoch',
                        default="500,2000,2500",
                        type=str,
                        help='Number of epochs')
    parser.add_argument('--batch', default=1000, type=int, help='Batch size')
    parser.add_argument('--lr',
                        default="0.01,0.001,0.0001",
                        type=str,
                        help='Initial learning rate')
    parser.add_argument('--l2', default=0.000, type=float, help='L2 Penalty')
    parser.add_argument('--check',
                        default=50,
                        type=int,
                        help='Number of epochs between convergence checks')
    parser.add_argument('--minstop',
                        default=0.2,
                        type=float,
                        help='Minimum fraction of total epochs before '
                             'early stopping is allowed')
    parser.add_argument('--maxconv',
                        default=2,
                        type=int,
                        help='Number of convergence detections required '
                             'to stop')
    parser.add_argument('--featrm',
                        default='',
                        type=str,
                        help='Remove features')
    parser.add_argument('--optim',
                        default='rms',
                        type=str,
                        help='Optimizer: sgd, adam, rms or ada')
    parser.add_argument('--continuation',
                        action='store_true',
                        help='Continue training from a saved checkpoint '
                             '(see the argparse note after this example)')
    parser.add_argument('--pca',
                        default=-1,
                        type=int,
                        help='Number of PCA dimensions to discard')
    parser.add_argument(
        '--sobol',
        default=-1,
        type=int,
        help='dimensions to reduce according to sensitivity analysis')

    opt = parser.parse_args()

    if opt.layer != "":
        layers = list(map(int, opt.layer.split(',')))
    else:
        layers = []

    opt_lr = list(map(float, opt.lr.split(',')))
    opt_epochs = list(map(int, opt.epoch.split(',')))

    if not os.path.exists(opt.output):
        os.mkdir(opt.output)

    logger = logging.getLogger('train')
    logger.setLevel(logging.INFO)
    flog = logging.FileHandler(opt.output + '/log.txt', mode='w')
    flog.setLevel(logging.INFO)
    formatter = logging.Formatter(
        fmt='[%(asctime)s] (%(levelname)s) %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    flog.setFormatter(formatter)
    clog = logging.StreamHandler()
    clog.setFormatter(formatter)
    logger.addHandler(flog)
    logger.addHandler(clog)

    if sys.platform == 'linux':
        logger.info('Using non-interactive Agg backend for matplotlib on Linux')
        matplotlib.use('Agg')

    if opt.featrm == 'auto':
        logger.info('Automatically remove features')
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))
    logger.info('Remove Feature: %s' % featrm)

    logger.info('Reading data...')
    datax, datay, data_names = dataloader.load(filename=opt.input,
                                               target=opt.target,
                                               fps=opt.fp.split(','),
                                               featrm=featrm)

    # Store fingerprint identifier files
    for fp in opt.fp.split(','):
        if os.path.exists(fp + '.idx') and Path(fp).parent.absolute() != Path(
                opt.output).absolute():
            shutil.copy(fp + '.idx', opt.output)

    logger.info('Selecting data...')
    selector = preprocessing.Selector(datax, datay, data_names)
    if opt.part:
        logger.info('Loading partition file %s' % opt.part)
        selector.load(opt.part)
    else:
        logger.warning(
            "Partition file not found. Using auto-partition instead.")
        selector.partition(0.8, 0.1)
        selector.save(opt.output + '/part.txt')
    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()

    logger.info('Training size = %d, Validation size = %d' %
                (len(trainx), len(validx)))
    logger.info('X input example: (size=%d) %s' %
                (len(datax[0]), ','.join(map(str, datax[0]))))
    logger.info('Y input example: (size=%d) %s' %
                (len(datay[0]), ','.join(map(str, datay[0]))))
    logger.info('Normalizing...')
    scaler = preprocessing.Scaler()
    scaler.fit(trainx)
    scaler.save(opt.output + '/scale.txt')
    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)

    if opt.sobol != -1:
        with open(opt.output + '/sobol_idx.pkl', 'rb') as file:
            sobol_idx = pickle.load(file)
        normed_trainx, normed_validx = sobol_reduce(
            normed_trainx, normed_validx,
            len(normed_trainx[0]) - 2 - opt.sobol, sobol_idx)
        logger.info('Sobol SA reduced dimensions: %d' % opt.sobol)

    if opt.pca != -1:
        normed_trainx, normed_validx, _ = pca_nd(
            normed_trainx, normed_validx,
            len(normed_trainx[0]) - opt.pca, logger)
        logger.info('PCA reduced dimensions: %d' % opt.pca)

    logger.info('Final input length: %d' % len(normed_trainx[0]))
    logger.info('Building network...')
    logger.info('Hidden layers = %r' % layers)
    logger.info('optimizer = %s' % (opt.optim))
    logger.info('Learning rate = %s' % opt_lr)
    logger.info('Epochs = %s' % opt_epochs)
    logger.info('L2 penalty = %f' % opt.l2)
    logger.info('Batch size = %d' % opt.batch)

    validy_ = validy.copy()  # keep CPU copies for metric evaluation
    trainy_ = trainy.copy()
    if opt.gpu:  # move everything to the GPU up front
        logger.info('Using GPU acceleration')
        device = torch.device("cuda:0")
        normed_trainx = torch.Tensor(normed_trainx).to(device)
        trainy = torch.Tensor(trainy).to(device)
        normed_validx = torch.Tensor(normed_validx).to(device)
        validy = torch.Tensor(validy).to(device)

    if opt.optim == 'sgd':
        optimizer = torch.optim.SGD
    elif opt.optim == 'adam':
        optimizer = torch.optim.Adam
    elif opt.optim == 'rms':
        optimizer = torch.optim.RMSprop
    elif opt.optim == 'ada':
        optimizer = torch.optim.Adagrad
    else:
        # fail early instead of hitting a NameError below
        raise ValueError('Unknown optimizer: %s' % opt.optim)

    model = fitting.TorchMLPRegressor(len(normed_trainx[0]),
                                      len(trainy[0]),
                                      layers,
                                      batch_size=opt.batch,
                                      is_gpu=opt.gpu != 0,
                                      args_opt={
                                          'optimizer': optimizer,
                                          # opt.lr is the raw comma-separated
                                          # string; use the first stage's rate
                                          'lr': opt_lr[0],
                                          'weight_decay': opt.l2
                                      })

    model.init_session()
    if opt.continuation:
        cpt = opt.output + '/model.pt'
        logger.info('Continue training from checkpoint %s' % (cpt))
        model.load(cpt)

    logger.info('Optimizer = %s' % (optimizer))

    header = ('Step Loss MeaSquE MeaSigE MeaUnsE MaxRelE '
              'Acc2% Acc5% Acc10%').split()
    logger.info('%-8s %8s %8s %8s %8s %8s %8s %8s %8s' % tuple(header))

    mse_history = []
    converge_times = 0
    mse_min = None
    model_saved = False
    converged = False
    all_epoch = sum(opt_epochs)
    total_epoch = 0

    for k, each_epoch in enumerate(opt_epochs):
        # apply the stage-specific learning rate from the schedule
        model.reset_optimizer({
            'optimizer': optimizer,
            'lr': opt_lr[k],
            'weight_decay': opt.l2
        })
        for i in range(each_epoch):
            total_epoch += 1
            loss = model.fit_epoch(normed_trainx, trainy)
            if total_epoch % opt.check == 0:
                predy = model.predict_batch(normed_validx)
                mse = metrics.mean_squared_error(validy_, predy)
                mse_history.append(mse)
                err_line = '%-8i %8.2e %8.2e %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f' % (
                    total_epoch, loss.data.cpu().numpy()
                    if model.is_gpu else loss.data.numpy(), mse,
                    metrics.mean_signed_error(validy_, predy) * 100,
                    metrics.mean_unsigned_error(validy_, predy) * 100,
                    metrics.max_relative_error(validy_, predy) * 100,
                    metrics.accuracy(validy_, predy, 0.02) * 100,
                    metrics.accuracy(validy_, predy, 0.05) * 100,
                    metrics.accuracy(validy_, predy, 0.10) * 100)

                logger.info(err_line)

                if mse_min is None:
                    mse_min = mse
                elif mse < mse_min:
                    model.save(opt.output + '/model.pt')
                    model_saved = True
                    mse_min = mse

                if total_epoch > all_epoch * opt.minstop:
                    conv, cur_conv = validation.is_converge(
                        np.array(mse_history), nskip=25)
                    if conv:
                        logger.info('Model converge detected at epoch %d' %
                                    total_epoch)
                        converge_times += 1

                    if converge_times >= opt.maxconv and cur_conv:
                        logger.info('Model converged at epoch: %d' %
                                    total_epoch)
                        converged = True
                        break

    if not converged:
        logger.warning('Model not converged')

    if not model_saved:
        model.save(opt.output + '/model.pt')

    visualizer = visualize.LinearVisualizer(
        trainy_.reshape(-1),
        model.predict_batch(normed_trainx).reshape(-1), trainname, 'training')
    visualizer.append(validy_.reshape(-1),
                      model.predict_batch(normed_validx).reshape(-1),
                      validname, 'validation')
    visualizer.dump(opt.output + '/fit.txt')
    visualizer.dump_bad_molecules(opt.output + '/error-0.05.txt',
                                  'validation',
                                  threshold=0.05)
    visualizer.dump_bad_molecules(opt.output + '/error-0.10.txt',
                                  'validation',
                                  threshold=0.1)
    visualizer.dump_bad_molecules(opt.output + '/error-0.15.txt',
                                  'validation',
                                  threshold=0.15)
    visualizer.dump_bad_molecules(opt.output + '/error-0.20.txt',
                                  'validation',
                                  threshold=0.2)
    logger.info('Fitting result saved')

    if opt.visual:
        visualizer.scatter_yy(savefig=opt.output + '/error-train.png',
                              annotate_threshold=0,
                              marker='x',
                              lw=0.2,
                              s=5)
        visualizer.hist_error(savefig=opt.output + '/error-hist.png',
                              label='validation',
                              histtype='step',
                              bins=50)
        plt.show()
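
A note on the --continuation flag above: argparse applies type=bool as
bool(str), and any non-empty string (including 'False') is truthy, so such a
flag can never be switched off from the command line; declaring it with
action='store_true' avoids the trap. A self-contained demonstration:

import argparse

p = argparse.ArgumentParser()
p.add_argument('--broken', type=bool, default=False)  # pitfall: bool('False') is True
p.add_argument('--fixed', action='store_true')        # idiomatic on/off flag

assert p.parse_args(['--broken', 'False']).broken is True  # cannot be turned off
assert p.parse_args([]).fixed is False
assert p.parse_args(['--fixed']).fixed is True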
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dir', default='out', help='Model directory')
    parser.add_argument('-i', '--input', help='Data')
    parser.add_argument('-f', '--fp', help='Fingerprints')
    parser.add_argument('-t', '--target', help='Target property')
    parser.add_argument('-p', '--part', help='Partition file')
    parser.add_argument('--gpu', default=1, type=int, help='Using GPU')
    parser.add_argument('--visual',
                        default=1,
                        type=int,
                        help='Visualization data')
    parser.add_argument('--visualx',
                        default='',
                        help='Extra visualization on selected x columns '
                             '(comma-separated indices)')
    parser.add_argument('--dump', default='', help='Output of fitting results')
    parser.add_argument('--featrm',
                        default='',
                        type=str,
                        help='Remove features')

    opt = parser.parse_args()

    model = fitting.TorchMLPRegressor(None, None, [])
    model.is_gpu = opt.gpu == 1
    model.load(opt.dir + '/model.pt')

    scaler = preprocessing.Scaler()
    scaler.load(opt.dir + '/scale.txt')

    if opt.featrm == 'auto':
        featrm = [14, 15, 17, 18, 19, 20, 21, 22]
    elif opt.featrm == '':
        featrm = []
    else:
        featrm = list(map(int, opt.featrm.split(',')))

    datax, datay, data_names = mdlearn.dataloader.load(filename=opt.input,
                                                       target=opt.target,
                                                       fps=opt.fp.split(','),
                                                       featrm=featrm)

    selector = preprocessing.Selector(datax, datay, data_names)
    selector.load(opt.part)

    trainx, trainy, trainname = selector.training_set()
    validx, validy, validname = selector.validation_set()
    testx, testy, testname = selector.test_set()

    normed_trainx = scaler.transform(trainx)
    normed_validx = scaler.transform(validx)
    normed_testx = scaler.transform(testx)

    trainy = trainy.flatten()
    validy = validy.flatten()
    testy = testy.flatten()

    trainy_est = model.predict_batch(normed_trainx).flatten()
    validy_est = model.predict_batch(normed_validx).flatten()
    testy_est = model.predict_batch(normed_testx).flatten()

    def evaluate_model(y, y_est):
        mse = metrics.mean_squared_error(y, y_est)
        ae = np.average(metrics.abs_absolute_error(y, y_est))
        ave_y = np.average(y)
        ave_y_est = np.average(y_est)
        bias = (ave_y_est - ave_y)

        eval_results = OrderedDict()
        eval_results['MSE'] = mse
        eval_results['RMSE'] = np.sqrt(mse)
        eval_results['AE'] = ae
        eval_results['Max AAE'] = metrics.max_absolute_error(y, y_est)
        eval_results['Bias'] = bias

        eval_results['RRMSE'] = np.sqrt(mse) / np.abs(ave_y)
        eval_results['MARE'] = ae / np.abs(ave_y)
        eval_results['Max ARE'] = metrics.max_relative_error(y, y_est)
        eval_results['RBias'] = bias / np.abs(ave_y)

        eval_results['Accuracy1%'] = metrics.accuracy(y, y_est, 0.01)
        eval_results['Accuracy2%'] = metrics.accuracy(y, y_est, 0.02)
        eval_results['Accuracy5%'] = metrics.accuracy(y, y_est, 0.05)
        eval_results['Accuracy10%'] = metrics.accuracy(y, y_est, 0.1)

        return eval_results

    results = []

    results.append(evaluate_model(trainy, trainy_est))
    results.append(evaluate_model(validy, validy_est))
    results.append(evaluate_model(testy, testy_est))
    results.append(
        evaluate_model(np.concatenate((trainy, validy, testy)),
                       np.concatenate((trainy_est, validy_est, testy_est))))

    print('Dataset\t%s' % ('\t'.join(results[0].keys())))

    def fmt(x):
        return '%.3g' % x

    for name, result in zip(['Training', 'Validation', 'Test', 'Overall'],
                            results):
        print('%s\t%s' % (name, '\t'.join([fmt(v) for v in result.values()])))

    visualizer = visualize.LinearVisualizer(trainy, trainy_est, trainname,
                                            'training')
    visualizer.append(validy, validy_est, validname, 'validation')
    visualizer.append(testy, testy_est, testname, 'test')
    if opt.dump:
        visualizer.dump(opt.dump)

    if opt.visual:
        visualizer.scatter_yy(annotate_threshold=0.1,
                              marker='x',
                              lw=0.2,
                              s=5,
                              figure_name='Value')
        visualizer.scatter_error(annotate_threshold=0.1,
                                 marker='x',
                                 lw=0.2,
                                 s=5,
                                 figure_name='Error')
        visualizer.hist_error(label='test',
                              histtype='step',
                              bins=50,
                              figure_name='Error Distribution')

        if opt.visualx:
            for i in map(int, opt.visualx.split(',')):
                visualizer2 = visualize.LinearVisualizer(
                    trainx[:, i], trainy_est - trainy, trainname, 'training')
                visualizer2.append(validx[:, i], validy_est - validy,
                                   validname, 'validation')
                visualizer2.append(testx[:, i], testy_est - testy, testname,
                                   'test')
                visualizer2.scatter_yy(ref=None,
                                       annotate_threshold=-1,
                                       marker='x',
                                       lw=0.2,
                                       s=5,
                                       figure_name=str(i))

        plt.show()
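
For reference, the metric calls inside evaluate_model can be exercised
standalone on toy arrays (a minimal sketch; the mdlearn.metrics import path
is an assumption inferred from the mdlearn.dataloader usage above, and the
arrays are toy data):

import numpy as np
from mdlearn import metrics  # assumed location of the metrics module

y = np.array([1.00, 2.00, 3.00, 4.00])
y_est = np.array([1.01, 1.98, 3.10, 3.90])

print('MSE        =', metrics.mean_squared_error(y, y_est))
print('Max ARE    =', metrics.max_relative_error(y, y_est))
print('Accuracy2% =', metrics.accuracy(y, y_est, 0.02))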