Example no. 1
def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', test=True)

    # Common definitions
    # --------------------------------------------------------------------------
    # -- k-nearest neighbour
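    #    ('#minus' is ROOT TLatex for a minus sign, so this renders as 'D2-kNN')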
    kNN_var = 'D2-k#minusNN'

    def meaningful_digits(number):
        """Format `number` with just enough decimals to resolve its first
        significant digit, e.g. 0.01 -> '0.01', 0.3 -> '0.3', 10. -> '10'."""
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Adversarial neural network (ANN) scan
    lambda_reg = 10.
    lambda_regs = sorted([1., 3., 10.])
    ann_vars = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)

        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass

    ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    # -- uBoost scan
    uboost_eff = 92
    uboost_ur = 0.3
    uboost_urs = sorted([0., 0.01, 0.1, 0.3, 1.0])
    uboost_var = 'uBoost(#alpha={:s})'.format(meaningful_digits(uboost_ur))
    uboost_vars = [
        'uBoost(#alpha={:s})'.format(meaningful_digits(ur))
        for ur in uboost_urs
    ]
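    # Filename pattern for pre-trained uBoost models; the uniform-rate (`ur`)
    # placeholder is filled in when each model is loaded below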
    uboost_pattern = 'uboost_ur_{{:4.2f}}_te_{:.0f}_rel21_fixed'.format(
        uboost_eff)

    # Tagger feature collection
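    # (listed as pairs of baseline and decorrelated variants; 'D2' appears twice
    #  since it is the baseline for both the kNN- and CSS-decorrelated versions)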
    tagger_features = [
        'Tau21', 'Tau21DDT', 'D2', kNN_var, 'D2', 'D2CSS', 'NN', ann_var,
        'Adaboost', uboost_var
    ]

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # Tau21DDT
        from run.ddt.common import add_ddt
        add_ddt(data, path='models/ddt/ddt.pkl.gz')

        # D2-kNN
        from run.knn.common import add_knn, VAR as kNN_basevar, EFF as kNN_eff
        print "k-NN base variable: {} (cp. {})".format(kNN_basevar, kNN_var)
        add_knn(data,
                newfeat=kNN_var,
                path='models/knn/knn_{}_{}.pkl.gz'.format(
                    kNN_basevar, kNN_eff))

        # D2-CSS
        from run.css.common import add_css
        add_css("D2", data)

        # NN
        from run.adversarial.common import add_nn
        with Profile("NN"):
            classifier = load_model(
                'models/adversarial/classifier/full/classifier.h5')
            add_nn(data, classifier, 'NN')
            pass

        # ANN
        with Profile("ANN"):
            from adversarial.utils import DECORRELATION_VARIABLES
            adversary = adversary_model(
                gmm_dimensions=len(DECORRELATION_VARIABLES),
                **cfg['adversary']['model'])

            combined = combined_model(classifier, adversary,
                                      **cfg['combined']['model'])

            for ann_var_, lambda_str_ in zip(ann_vars, lambda_strs):
                print "== Loading model for {}".format(ann_var_)
                combined.load_weights(
                    'models/adversarial/combined/full/combined_lambda{}.h5'.
                    format(lambda_str_))
                add_nn(data, classifier, ann_var_)
                pass
            pass

        # Adaboost/uBoost
        with Profile("Adaboost/uBoost"):
            from run.uboost.common import add_bdt
            for var, ur in zip(uboost_vars, uboost_urs):
                var = ('Adaboost' if ur == 0 else var)
                path = 'models/uboost/' + uboost_pattern.format(ur).replace(
                    '.', 'p') + '.pkl.gz'
                print "== Loading model for {}".format(var)
                add_bdt(data, var, path)
                pass

            # Remove `Adaboost` from scan list
            uboost_vars.pop(0)
            pass

        pass

    # Remove unused variables
    used_variables = set(tagger_features + ann_vars + uboost_vars +
                         ['m', 'pt', 'npv', 'weight_test'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features, ann_vars, uboost_vars)

    return 0

Example no. 2

def main(args):

    # Initialisation
    # --------------------------------------------------------------------------
    with Profile("Initialisation"):

        # Initialising
        # ----------------------------------------------------------------------
        args, cfg = initialise(args)

        # Validate train/optimise flags
        if args.optimise_classifier:

            # Stand-alone classifier optimisation
            args.train_classifier = True
            args.train_adversarial = False
            args.train = False
            cfg['classifier']['fit']['verbose'] = 2

        elif args.optimise_adversarial:

            # Adversarial network optimisation
            args.train_classifier = False
            args.train_adversarial = True
            args.train = False
            cfg['combined']['fit']['verbose'] = 2

            pass

        cfg['classifier']['fit']['verbose'] = 2  # @TEMP
        cfg['combined']['fit']['verbose'] = 2  # @TEMP

        # Initialise Keras backend
        initialise_backend(args)

        import keras
        import keras.backend as K
        from keras.models import load_model
        from keras.callbacks import Callback, TensorBoard, EarlyStopping
        from keras.utils.vis_utils import plot_model

        # Neural network-specific initialisation of the configuration dict
        initialise_config(args, cfg)

        # Setup TensorBoard, if applicable
        tensorboard_dir = initialise_tensorboard(args, cfg)

        # Print the current environment setup
        print_env(args, cfg)
        pass

    # Loading data
    # --------------------------------------------------------------------------
    data, features, features_decorrelation = load_data(args.input + 'data.h5',
                                                       train=True)
    num_features = len(features)

    # Regularisation parameter
    lambda_reg = cfg['combined']['model']['lambda_reg']  # Use same `lambda` as the adversary
    digits = int(np.ceil(max(-np.log10(lambda_reg), 0)))

    # digits = 1

    lambda_str = '{l:.{d:d}f}'.format(d=digits, l=lambda_reg).replace('.', 'p')

    # Get standard-formatted decorrelation inputs
    decorrelation = get_decorrelation_variables(data)
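    # Auxiliary variable(s) passed to the adversary alongside the decorrelation
    # inputs; here log(pT)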
    aux_vars = ['logpt']
    data['logpt'] = pd.Series(np.log(data['pt'].values), index=data.index)

    # Specify common weights
    # -- Classifier
    weight_var = 'weight_adv'  # 'weight_adv' / 'weight_train'
    data['weight_clf'] = pd.Series(data[weight_var].values, index=data.index)

    # -- Adversary: zero out the weight of signal events, so the adversary is
    #    trained on background jets only
    data['weight_adv'] = pd.Series(np.multiply(data['weight_adv'].values,
                                               1. - data['signal'].values),
                                   index=data.index)

    # Classifier-only fit, cross-validation
    # --------------------------------------------------------------------------
    with Profile("Classifier-only fit, cross-validation"):

        # Define variable(s)
        basename = 'crossval_classifier'
        basedir = 'models/adversarial/classifier/crossval/'

        # Get indices for each fold in stratified k-fold training
        # @NOTE: No shuffling is performed -- assuming that's already done.
        skf = StratifiedKFold(n_splits=args.folds).split(
            data[features].values, data['signal'].values)

        # Import module creator methods and optimiser options
        from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

        # Collection of classifiers and their associated training histories
        classifiers = list()
        histories = list()

        # Train or load classifiers
        if args.optimise_classifier:  # args.train or args.train_classifier:
            log.info("Training cross-validation classifiers")

            # Loop `k` folds
            for fold, (train, validation) in enumerate(skf):
                with Profile("Fold {}/{}".format(fold + 1, args.folds)):

                    # Define unique name for current classifier
                    name = '{}__{}of{}'.format(basename, fold + 1, args.folds)

                    # Get classifier
                    classifier = classifier_model(num_features,
                                                  **cfg['classifier']['model'])

                    # Parallelise on GPUs
                    # @NOTE: Store reference to base model to allow for saving.
                    #        Cf. [https://github.com/keras-team/keras/issues/8446#issuecomment-343559454]
                    parallelised = parallelise_model(classifier, args)

                    # Compile model (necessary to save properly)
                    parallelised.compile(**cfg['classifier']['compile'])

                    # Prepare arrays
                    X = data[features].values[train]
                    Y = data['signal'].values[train]
                    W = data['weight_clf'].values[train]
                    validation_data = (data[features].values[validation],
                                       data['signal'].values[validation],
                                       data['weight_clf'].values[validation])

                    # Create callbacks
                    callbacks = []

                    # -- TensorBoard
                    if args.tensorboard:
                        callbacks += [
                            TensorBoard(log_dir=tensorboard_dir +
                                        'classifier/fold{}/'.format(fold))
                        ]
                        pass

                    # Compute initial losses
                    X_val, Y_val, W_val = validation_data
                    eval_opts = dict(
                        batch_size=cfg['classifier']['fit']['batch_size'],
                        verbose=0)
                    initial_losses = [
                        [parallelised.evaluate(X, Y, sample_weight=W,
                                               **eval_opts)],
                        [parallelised.evaluate(X_val, Y_val, sample_weight=W_val,
                                               **eval_opts)],
                    ]

                    # Fit classifier model
                    ret = parallelised.fit(X,
                                           Y,
                                           sample_weight=W,
                                           validation_data=validation_data,
                                           callbacks=callbacks,
                                           **cfg['classifier']['fit'])

                    # Prepend initial losses
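                    # (so the recorded history also contains the untrained, epoch-0 losses)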
                    for metric, loss_train, loss_val in zip(
                            parallelised.metrics_names, *initial_losses):
                        ret.history[metric].insert(0, loss_train)
                        ret.history['val_' + metric].insert(0, loss_val)
                        pass

                    # Add to list of cost histories
                    histories.append(ret.history)

                    # Add to list of classifiers
                    classifiers.append(classifier)

                    # Save classifier model and training history to file, both
                    # in unique output directory and in the directory for pre-
                    # trained classifiers
                    save([args.output, basedir], name, classifier, ret.history)
                    pass
                pass  # end: k-fold cross-validation
            pass
        else:

            # Load pre-trained classifiers
            log.info("Loading cross-validation classifiers from file")
            try:
                for fold in range(args.folds):
                    name = '{}__{}of{}'.format(basename, fold + 1, args.folds)
                    classifier, history = load(basedir, name)
                    classifiers.append(classifier)
                    histories.append(history)
                    pass
            except IOError as err:
                log.error("{}".format(err))
                log.error("Not all files were loaded. Exiting.")
                #return 1  # @TEMP
                pass

            pass  # end: train/load
        pass

    # Early stopping in case of stand-alone classifier optimisation
    # --------------------------------------------------------------------------
    if args.optimise_classifier:

        # Compute average validation loss across folds; return a conservative
        # (mean + one standard deviation) estimate as the optimisation objective
        val_avg = np.mean([hist['val_loss'] for hist in histories], axis=0)
        val_std = np.std([hist['val_loss'] for hist in histories], axis=0)
        return val_avg[-1] + val_std[-1]

    # Classifier-only fit, full
    # --------------------------------------------------------------------------
    with Profile("Classifier-only fit, full"):

        # Define variable(s)
        name = 'classifier'
        basedir = 'models/adversarial/classifier/full/'

        if args.train or args.train_classifier:
            log.info("Training full classifier")

            # Get classifier
            classifier = classifier_model(num_features,
                                          **cfg['classifier']['model'])

            # Save classifier model diagram to file
            plot_model(classifier,
                       to_file=args.output + 'model_{}.png'.format(name),
                       show_shapes=True)

            # Parallelise on GPUs
            parallelised = parallelise_model(classifier, args)

            # Compile model (necessary to save properly)
            parallelised.compile(**cfg['classifier']['compile'])

            # Create callbacks
            callbacks = []

            # -- TensorBoard
            if args.tensorboard:
                callbacks += [
                    TensorBoard(log_dir=tensorboard_dir + name + '/')
                ]
                pass

            # Prepare arrays
            X = data[features].values
            Y = data['signal'].values
            W = data['weight_clf'].values

            # Fit classifier model
            ret = parallelised.fit(X,
                                   Y,
                                   sample_weight=W,
                                   callbacks=callbacks,
                                   **cfg['classifier']['fit'])

            # Save classifier model and training history to file, both in unique
            # output directory and in the directory for pre-trained classifiers.
            save([args.output, basedir], name, classifier, ret.history)

            # Saving classifier in lwtnn-friendly format.
            lwtnn_save(classifier, 'nn')

        else:

            # Load pre-trained classifier
            log.info("Loading full classifier from file")
            classifier, history = load(basedir, name)
            pass  # end: train/load
        pass

    # Definitions for adversarial training
    # --------------------------------------------------------------------------
    # Create custom Kullback-Leibler (KL) divergence cost.
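    # The adversary output is interpreted as the p.d.f. of the decorrelation
    # variable(s) evaluated at their observed values, so minimising -log(p_pred)
    # amounts to maximum-likelihood training of the adversary; `p_true` is unused.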
    def kullback_leibler(p_true, p_pred):
        return -K.log(p_pred)

    cfg['combined']['compile']['loss'][1] = kullback_leibler

    # @TODO: Make `train_{classifier,adversarial}` methods for use with _both_
    #        cross-validation and full training

    # Combined adversarial fit, cross-validation
    # --------------------------------------------------------------------------
    with Profile("Combined adversarial fit, cross-validation"):
        # @TODO:
        # - Checkpointing

        # Define variables
        results = []  # Holding optimisation metrics
        basename = 'combined_lambda{}'.format(lambda_str)
        basedir = 'models/adversarial/combined/crossval/'

        # Get indices for each fold in stratified k-fold training
        # @NOTE: No shuffling is performed -- assuming that's already done above.
        skf = StratifiedKFold(n_splits=args.folds).split(
            data[features].values, data['signal'].values)

        if args.optimise_adversarial:  # args.train or args.train_adversarial:
            log.info("Training combined model cross-validation")

            # Loop `k` folds
            for fold, (train, validation) in enumerate(skf):
                with Profile("Fold {}/{}".format(fold + 1, args.folds)):

                    # Define unique name for current classifier
                    name = '{}__{}of{}'.format(basename, fold + 1, args.folds)

                    # Load pre-trained classifier
                    classifier, _ = load('models/adversarial/classifier/full/',
                                         'classifier')

                    # Set up adversary
                    adversary = adversary_model(
                        gmm_dimensions=len(DECORRELATION_VARIABLES),
                        **cfg['adversary']['model'])

                    # Set up combined, adversarial model
                    combined = combined_model(classifier, adversary,
                                              **cfg['combined']['model'])

                    # Parallelise on GPUs
                    parallelised = parallelise_model(combined, args)

                    # Prepare arrays
                    X = [data[features].values[train],
                         data[aux_vars].values[train],
                         decorrelation[train]]
                    Y = [data['signal'].values[train],
                         np.ones_like(data['signal'].values[train])]
                    W = [data['weight_clf'].values[train],
                         data['weight_adv'].values[train]]

                    validation_data = (
                        [data[features].values[validation],
                         data[aux_vars].values[validation],
                         decorrelation[validation]],
                        [data['signal'].values[validation],
                         np.ones_like(data['signal'].values[validation])],
                        [data['weight_clf'].values[validation],
                         data['weight_adv'].values[validation]])

                    # Compile model for pre-training
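                    # (freeze the classifier weights so that only the adversary is updated)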
                    classifier.trainable = False
                    parallelised.compile(**cfg['combined']['compile'])

                    # Compute initial losses
                    log.info("Computing initial loss")
                    X_val, Y_val, W_val = validation_data
                    eval_opts = dict(
                        batch_size=cfg['combined']['fit']['batch_size'],
                        verbose=0)
                    initial_losses = [
                        parallelised.evaluate(X,
                                              Y,
                                              sample_weight=W,
                                              **eval_opts),
                        parallelised.evaluate(X_val,
                                              Y_val,
                                              sample_weight=W_val,
                                              **eval_opts)
                    ]

                    # Pre-training adversary
                    log.info("Pre-training")
                    pretrain_fit_opts = dict(**cfg['combined']['fit'])
                    pretrain_fit_opts['epochs'] = cfg['combined']['pretrain']
                    ret_pretrain = parallelised.fit(
                        X,
                        Y,
                        sample_weight=W,
                        validation_data=validation_data,
                        **pretrain_fit_opts)

                    # Re-compile combined model for full training
                    classifier.trainable = True
                    parallelised.compile(**cfg['combined']['compile'])

                    # Fit classifier model
                    log.info("Actual training")
                    ret = parallelised.fit(X,
                                           Y,
                                           sample_weight=W,
                                           validation_data=validation_data,
                                           **cfg['combined']['fit'])

                    # Prepend initial losses
                    for metric, loss_train, loss_val in zip(
                            parallelised.metrics_names, *initial_losses):
                        ret_pretrain.history[metric].insert(0, loss_train)
                        ret_pretrain.history['val_' + metric].insert(
                            0, loss_val)
                        pass

                    for metric in parallelised.metrics_names:
                        ret.history[metric] = ret_pretrain.history[
                            metric] + ret.history[metric]
                        ret.history['val_' + metric] = ret_pretrain.history[
                            'val_' + metric] + ret.history['val_' + metric]
                        pass

                    # Save combined model and training history to file, both in unique
                    # output directory and in the directory for pre-trained classifiers.
                    save([args.output, basedir], name, combined, ret.history)

                    # Add `ANN` variable
                    add_nn(data, classifier, 'ANN')

                    # Compute optimisation metric
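                    # `metrics` is assumed to return (signal efficiency, background
                    # rejection, 1/JSD) for the given tagger variable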
                    try:
                        eff, rej, jsd_inv = metrics(data.iloc[validation],
                                                    'ANN')
                        print "Background rejection: {}".format(rej)
                        print "1/JSD:                {}".format(jsd_inv)
                        # N.B.: `x in [...]` uses `==`, which never matches NaN,
                        # so use `np.isfinite` to catch both inf and NaN
                        if not np.all(np.isfinite([rej, jsd_inv])):
                            return 0
                        results.append(rej + lambda_reg * jsd_inv)
                    except ValueError:
                        print "Got a NaN. Returning 0"
                        return 0
                    pass
                pass
            pass
        pass

    # Early stopping in case of adversarial network
    # --------------------------------------------------------------------------
    if args.optimise_adversarial:

        # Return (negative) optimisation metric: mean minus one standard
        # deviation of (rej + lambda/JSD) across folds
        print "rej + 1/jsd: {} ± {}".format(np.mean(results), np.std(results))
        return -(np.mean(results) - np.std(results))

    # Combined adversarial fit, full
    # --------------------------------------------------------------------------
    with Profile("Combined adversarial fit, full"):

        # Define variables
        name = 'combined_lambda{}'.format(lambda_str)
        basedir = 'models/adversarial/combined/full/'

        # Load pre-trained classifier
        classifier, _ = load('models/adversarial/classifier/full/',
                             'classifier')

        # Set up adversary
        adversary = adversary_model(
            gmm_dimensions=len(DECORRELATION_VARIABLES),
            **cfg['adversary']['model'])

        # Save adversarial model diagram
        plot_model(adversary,
                   to_file=args.output + 'model_adversary.png',
                   show_shapes=True)

        # Create callback array
        callbacks = list()

        # (opt.) Add TensorBoard callback
        if args.tensorboard:
            callbacks += [
                TensorBoard(log_dir=tensorboard_dir + 'adversarial/')
            ]
            pass

        # Set up combined, adversarial model
        combined = combined_model(classifier, adversary,
                                  **cfg['combined']['model'])

        # Save combined model diagram
        plot_model(combined,
                   to_file=args.output + 'model_{}.png'.format(name),
                   show_shapes=True)

        if args.train or args.train_adversarial:
            log.info("Training full, combined model")

            # Parallelise on GPUs
            parallelised = parallelise_model(combined, args)

            # Compile model (necessary to save properly)
            parallelised.compile(**cfg['combined']['compile'])

            # Prepare arrays
            X = [data[features].values,
                 data[aux_vars].values,
                 decorrelation]
            Y = [data['signal'].values,
                 np.ones_like(data['signal'].values)]
            W = [data['weight_clf'].values,
                 data['weight_adv'].values]

            # Compile model for pre-training
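            # (freeze the classifier weights so that only the adversary is updated)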
            classifier.trainable = False
            parallelised.compile(**cfg['combined']['compile'])

            # Pre-training adversary
            log.info("Pre-training")
            pretrain_fit_opts = dict(**cfg['combined']['fit'])
            pretrain_fit_opts['epochs'] = cfg['combined']['pretrain']
            ret_pretrain = parallelised.fit(X,
                                            Y,
                                            sample_weight=W,
                                            **pretrain_fit_opts)

            # Re-compile combined model for full training
            classifier.trainable = True
            parallelised.compile(**cfg['combined']['compile'])

            # Fit classifier model
            log.info("Actual training")
            ret = parallelised.fit(X,
                                   Y,
                                   sample_weight=W,
                                   callbacks=callbacks,
                                   **cfg['combined']['fit'])

            # Prepend initial losses
            for metric in parallelised.metrics_names:
                ret.history[metric] = ret_pretrain.history[
                    metric] + ret.history[metric]
                pass

            # Save combined model and training history to file, both in unique
            # output directory and in the directory for pre-trained classifiers.
            adv = lambda s: s.replace('combined', 'adversary')
            save([args.output, basedir], name, combined, ret.history)
            save([args.output, adv(basedir)], adv(name), adversary)

            # Saving adversarially trained classifier in lwtnn-friendly format.
            lwtnn_save(classifier, 'ann')

        else:

            # Load pre-trained combined _weights_ from file, in order to
            # simultaneously load the embedded classifier so as to not have to
            # extract it manually afterwards.
            log.info("Loading full, combined model from file")
            combined, history = load(basedir, name, model=combined)
            pass  # end: train/load

        pass

    return 0

Example no. 3

def main(args):

    # Initialise
    args, cfg = initialise(args)

    # Initialise Keras backend
    initialise_backend(args)

    # Neural network-specific initialisation of the configuration dict
    initialise_config(args, cfg)

    # Keras import(s)
    import keras.backend as K
    from keras.models import load_model

    # Project import(s)
    from adversarial.models import classifier_model, adversary_model, combined_model, decorrelation_model

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', test=True)

    def meaningful_digits(number):
        """Format `number` with just enough decimals to resolve its first
        significant digit, e.g. 0.01 -> '0.01', 0.3 -> '0.3', 10. -> '10'."""
        digits = 0
        if number > 0:
            digits = int(np.ceil(max(-np.log10(number), 0)))
            pass
        return '{l:.{d:d}f}'.format(d=digits, l=number)

    # -- Adversarial neural network (ANN) scan
    lambda_reg = 100.
    lambda_regs = sorted([100.])
    ann_vars = list()
    lambda_strs = list()
    for lambda_reg_ in lambda_regs:
        lambda_str = meaningful_digits(lambda_reg_).replace('.', 'p')
        lambda_strs.append(lambda_str)

        ann_var_ = "ANN(#lambda={:s})".format(lambda_str.replace('p', '.'))
        ann_vars.append(ann_var_)
        pass

    ann_var = ann_vars[lambda_regs.index(lambda_reg)]

    print "ann_var"
    print ann_var

    # Tagger feature collection
    # tagger_features = ['NN', ann_var]
    tagger_features = ['NN', ann_var, 'MV2c10', 'XbbScoreHiggs']
    # tagger_features = ['MV2c10']

    # Add variables
    # --------------------------------------------------------------------------
    with Profile("Add variables"):

        # NN
        from run.adversarial.common import add_nn
        with Profile("NN"):
            classifier = load_model(
                'models/adversarial/classifier/full/classifier.h5')
            add_nn(data, classifier, 'NN')
            pass

        # ANN
        with Profile("ANN"):
            from adversarial.utils import DECORRELATION_VARIABLES
            adversary = adversary_model(
                gmm_dimensions=len(DECORRELATION_VARIABLES),
                **cfg['adversary']['model'])

            combined = combined_model(classifier, adversary,
                                      **cfg['combined']['model'])

            for ann_var_, lambda_str_ in zip(ann_vars, lambda_strs):
                print "== Loading model for {}".format(ann_var_)
                combined.load_weights(
                    'models/adversarial/combined/full/combined_lambda{}.h5'.
                    format(lambda_str_))
                add_nn(data, classifier, ann_var_)
                pass
            pass

        with Profile("MV2c10"):
            data["MV2c10"] = pd.concat(
                [data["MV2c10_discriminant_1"], data["MV2c10_discriminant_2"]],
                axis=1).min(axis=1)

        # Add MV2 and XbbScore here
        # e.g. min(MV2_sj1, MV2_sj2)

    # Remove unused variables
    used_variables = set(tagger_features + ann_vars +
                         ['mass', 'pt', 'npv', 'weight_test'])
    unused_variables = [var for var in list(data) if var not in used_variables]
    data.drop(columns=unused_variables, inplace=True)
    gc.collect()

    # Perform performance studies
    perform_studies(data, args, tagger_features, ann_vars)

    return 0