Example #1
def test_ffnnm_hyperoptimize(setup_ffnnm_dataframe, setup_config):
    """Test that FeedforwardNeuralNetworkModeler.hyperoptimize() returns a
    dictionary of parameters.
    """
    errors_list = []
    cat_features_list = setup_ffnnm_dataframe.select_dtypes(
        include=["object"]).columns.tolist()
    for cat_feature in cat_features_list:
        setup_ffnnm_dataframe[cat_feature] = setup_ffnnm_dataframe[
            cat_feature].astype("category")
    setup_ffnnm_dataframe["FILE_DATE"] = pd.Series(
        pd.factorize(setup_ffnnm_dataframe["FILE_DATE"])[0])
    subset_training_obs = (~setup_ffnnm_dataframe["_validation"]
                           & ~setup_ffnnm_dataframe["_test"]
                           & ~setup_ffnnm_dataframe["_predict_obs"])
    training_obs_lead_lengths = setup_ffnnm_dataframe[subset_training_obs][
        "_duration"].value_counts()
    n_intervals = training_obs_lead_lengths[
        training_obs_lead_lengths >
        setup_config["MIN_SURVIVORS_IN_TRAIN"]].index.max()
    modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
        config=setup_config,
        data=setup_ffnnm_dataframe,
    )
    modeler.n_intervals = n_intervals
    params = modeler.hyperoptimize(2)
    if not isinstance(params, dict):
        errors_list.append(f"Parameter set is not a dict.")
    assert not errors_list, "Errors occurred: \n{}".format(
        "\n".join(errors_list))
Example #2
def test_ffnnm_construct_embedding_network(setup_ffnnm_dataframe, setup_config):
    """Test that FeedforwardNeuralNetworkModeler.construct_embedding_network()
    returns a Keras training model."""
    errors_list = []
    assertions = []
    config = setup_config
    data = setup_ffnnm_dataframe
    subset_training_obs = ~data["_validation"] & ~data["_test"] & ~data["_predict_obs"]
    train_obs_lead_lengths = data[subset_training_obs]["_duration"].value_counts()
    n_intervals = train_obs_lead_lengths[
        train_obs_lead_lengths > config["MIN_SURVIVORS_IN_TRAIN"]
    ].index.max()
    categorical_features = [
        "nonmixed_categorical_var",
        "consistent_mixed_categorical_var",
    ]
    for col in categorical_features:
        data[col] = data[col].astype("category")
    data["FILE_DATE"], _ = pd.factorize(data["FILE_DATE"])
    try:
        if config["INDIVIDUAL_IDENTIFIER"] == "":
            config["INDIVIDUAL_IDENTIFIER"] = data.columns[0]
        if config["TIME_IDENTIFIER"] == "":
            config["TIME_IDENTIFIER"] = data.columns[1]
        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(config=config, data=data)
        modeler.n_intervals = n_intervals
        modeler.model = modeler.construct_embedding_network()
        assertions.append(isinstance(modeler.model, Model))
        if not assertions[-1]:
            errors_list.append("Model not of type tensorflow.keras.Model")
    except Exception as error:
        assertions.append(False)
        errors_list.append(str(error))
    assertion = all(assertions)
    assert assertion, "Errors occurred: \n{}".format("\n".join(errors_list))
Example #3
def test_ffnnm_train(setup_ffnnm_dataframe, setup_config):
    """Test that FeedforwardNeuralNetworkModeler.train() returns a Keras
    training model and that the modeler's weights change during training."""
    errors_list = []
    assertions = []
    config = setup_config
    data = setup_ffnnm_dataframe
    subset_training_obs = ~data["_validation"] & ~data["_test"] & ~data[
        "_predict_obs"]
    train_obs_lead_lengths = data[subset_training_obs][
        "_duration"].value_counts()
    n_intervals = train_obs_lead_lengths[
        train_obs_lead_lengths > config["MIN_SURVIVORS_IN_TRAIN"]].index.max()
    categorical_features = [
        "nonmixed_categorical_var",
        "consistent_mixed_categorical_var",
    ]
    for col in categorical_features:
        data[col] = data[col].astype("category")
    data["FILE_DATE"], _ = pd.factorize(data["FILE_DATE"])
    try:
        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(config=config,
                                                              data=data)
        modeler.n_intervals = n_intervals
        modeler.data[modeler.numeric_features] = modeler.data[
            modeler.numeric_features].fillna(
                modeler.config["NON_CAT_MISSING_VALUE"])
        modeler.model = modeler.construct_embedding_network()
        weights_pretrain = modeler.model.get_weights()
        modeler.model = modeler.train()
        weights_posttrain = modeler.model.get_weights()
        no_change_in_weights_list = []
        for i, weight_pretrain in enumerate(weights_pretrain):
            no_change_in_weights_i = np.equal(weight_pretrain,
                                              weights_posttrain[i])
            no_change_in_weights_list.append(np.all(no_change_in_weights_i))
        weights_are_training = not all(no_change_in_weights_list)
        assertions.append(weights_are_training)
        if not assertions[-1]:
            errors_list.append("Model weights have not changed, suggesting "
                               "failure to train")
        assertions.append(isinstance(modeler.model, Model))
        if not assertions[-1]:
            errors_list.append("Model not of type tensorflow.keras.Model")
    except Exception as error:
        assertions.append(False)
        errors_list.append(str(error))
    assertion = all(assertions)
    assert assertion, "Errors occurred: \n{}".format("\n".join(errors_list))
Example #4
def test_ffnnm_train(setup_ffnnm_dataframe, setup_config):
    """Test that FeedforwardNeuralNetworkModeler.train() returns a Keras
    training model and that the modeler's weights change during training."""
    errors_list = []
    assertions = []
    config = setup_config
    data = setup_ffnnm_dataframe
    subset_training_obs = (~data['_validation'] & ~data['_test']
                           & ~data['_predict_obs'])
    train_obs_lead_lengths = (
        data[subset_training_obs]['_duration'].value_counts())
    n_intervals = train_obs_lead_lengths[
        train_obs_lead_lengths > config['MIN_SURVIVORS_IN_TRAIN']].index.max()
    categorical_features = [
        'nonmixed_categorical_var', 'consistent_mixed_categorical_var'
    ]
    data['FILE_DATE'], _ = pd.factorize(data['FILE_DATE'])
    try:
        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
            config=config,
            data=data,
            categorical_features=categorical_features)
        modeler.n_intervals = n_intervals
        modeler.data = modeler.data.fillna(
            modeler.config['NON_CAT_MISSING_VALUE'])
        modeler.model = modeler.construct_embedding_network()
        weights_pretrain = modeler.model.get_weights()
        modeler.model = modeler.train()
        weights_posttrain = modeler.model.get_weights()
        no_change_in_weights_list = []
        for i, weight_pretrain in enumerate(weights_pretrain):
            no_change_in_weights_i = np.equal(weight_pretrain,
                                              weights_posttrain[i])
            no_change_in_weights_list.append(np.all(no_change_in_weights_i))
        weights_are_training = not all(no_change_in_weights_list)
        assertions.append(weights_are_training)
        if not assertions[-1]:
            errors_list.append('Model weights have not changed, suggesting '
                               'failure to train')
        assertions.append(isinstance(modeler.model, ke.training.Model))
        if not assertions[-1]:
            errors_list.append('Model not of type keras.engine.training.Model')
    except Exception as error:
        assertions.append(False)
        errors_list.append(str(error))
    assertion = all(assertions)
    assert assertion, 'Errors occurred: \n{}'.format('\n'.join(errors_list))
Example #5
def test_ffnnm_init(setup_ffnnm_dataframe, setup_config):
    """Test that FeedforwardNeuralNetworkModeler instantiates properly."""
    errors_list = []
    assertions = []
    config = setup_config
    data = setup_ffnnm_dataframe
    try:
        if config['INDIVIDUAL_IDENTIFIER'] == '':
            config['INDIVIDUAL_IDENTIFIER'] = data.columns[0]
        if config['TIME_IDENTIFIER'] == '':
            config['TIME_IDENTIFIER'] = data.columns[1]
        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
            config=config, data=data, categorical_features=[])
        assertions.append(
            isinstance(modeler, tf_modelers.FeedforwardNeuralNetworkModeler))
        if not assertions[-1]:
            errors_list.append('Modeler did not instantiate properly')
    except Exception as error:
        assertions.append(False)
        errors_list.append(str(error))
    assertion = all(assertions)
    assert assertion, 'Errors occurred: \n{}'.format('\n'.join(errors_list))
Example #6
def test_ffnnm_init(setup_ffnnm_dataframe, setup_config):
    """Test that FeedforwardNeuralNetworkModeler instantiates properly."""
    errors_list = []
    assertions = []
    config = setup_config
    data = setup_ffnnm_dataframe
    try:
        if config["INDIVIDUAL_IDENTIFIER"] == "":
            config["INDIVIDUAL_IDENTIFIER"] = data.columns[0]
        if config["TIME_IDENTIFIER"] == "":
            config["TIME_IDENTIFIER"] = data.columns[1]
        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(config=config,
                                                              data=data)
        assertions.append(
            isinstance(modeler, tf_modelers.FeedforwardNeuralNetworkModeler))
        if not assertions[-1]:
            errors_list.append("Modeler did not instantiate properly")
    except Exception as error:
        assertions.append(False)
        errors_list.append(str(error))
    assertion = all(assertions)
    assert assertion, "Errors occurred: \n{}".format("\n".join(errors_list))
Example #7
def test_ffnnm_construct_embedding_network(setup_ffnnm_dataframe,
                                           setup_config):
    """Test that FeedforwardNeuralNetworkModeler.construct_embedding_network()
    returns a Keras training model."""
    errors_list = []
    assertions = []
    config = setup_config
    data = setup_ffnnm_dataframe
    subset_training_obs = (~data['_validation'] & ~data['_test']
                           & ~data['_predict_obs'])
    train_obs_lead_lengths = (
        data[subset_training_obs]['_duration'].value_counts())
    n_intervals = train_obs_lead_lengths[
        train_obs_lead_lengths > config['MIN_SURVIVORS_IN_TRAIN']].index.max()
    categorical_features = [
        'nonmixed_categorical_var', 'consistent_mixed_categorical_var'
    ]
    data['FILE_DATE'], _ = pd.factorize(data['FILE_DATE'])
    try:
        if config['INDIVIDUAL_IDENTIFIER'] == '':
            config['INDIVIDUAL_IDENTIFIER'] = data.columns[0]
        if config['TIME_IDENTIFIER'] == '':
            config['TIME_IDENTIFIER'] = data.columns[1]
        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
            config=config,
            data=data,
            categorical_features=categorical_features)
        modeler.n_intervals = n_intervals
        modeler.model = modeler.construct_embedding_network()
        assertions.append(isinstance(modeler.model, ke.training.Model))
        if not assertions[-1]:
            errors_list.append('Model not of type keras.engine.training.Model')
    except Exception as error:
        assertions.append(False)
        errors_list.append(str(error))
    assertion = all(assertions)
    assert assertion, 'Errors occurred: \n{}'.format('\n'.join(errors_list))
Example #8
def main():
    """Execute default FIFE pipeline from data to forecasts and metrics."""
    # Set up I/O
    checkpoint_time = time()
    if len(sys.argv) > 1:
        with open(sys.argv[1], 'r') as file:
            config = json.load(file)
    else:
        print('No configuration file specified.')
        candidate_configs = [
            file for file in os.listdir() if file.endswith('.json')
        ]
        assert len(candidate_configs) >= 1, (
            'No json files found in current directory. '
            'Please specify a configuration file in your command, '
            'e.g., "fife example_config.json".')
        assert len(candidate_configs) <= 1, (
            'Multiple json files found in current directory. '
            'Please specify a configuration file in your command, '
            'e.g., "fife example_config.json".')
        print(f'Using {candidate_configs[0]} as configuration file.')
        with open(candidate_configs[0], 'r') as file:
            config = json.load(file)

    utils.make_results_reproducible(config['SEED'])
    utils.redirect_output_to_log(path=config['RESULTS_PATH'])
    print('Produced using FIFE: Finite-Interval Forecasting Engine')
    print('Copyright (c) 2018 - 2020, Institute for Defense Analyses (IDA)')
    print('Please cite using the suggested citation in the LICENSE file.\n')
    utils.print_config(config)

    # Process data
    data = utils.import_data_file(config['DATA_FILE_PATH'])
    if config['INDIVIDUAL_IDENTIFIER'] == '':
        config['INDIVIDUAL_IDENTIFIER'] = data.columns[0]
        print('Individual identifier column name not given; assumed to be '
              f'leftmost column ({config["INDIVIDUAL_IDENTIFIER"]})')
    if config['TIME_IDENTIFIER'] == '':
        config['TIME_IDENTIFIER'] = data.columns[1]
        print('Time identifier column name not given; assumed to be '
              f'second-leftmost column ({config["TIME_IDENTIFIER"]})')
    data_processor = processors.PanelDataProcessor(config, data)
    data_processor.build_processed_data()
    print(f'Data processing time: {time() - checkpoint_time} seconds')
    checkpoint_time = time()

    # Save intermediate files
    utils.save_maps(data_processor.categorical_maps,
                    'Categorical_Maps',
                    path=config['RESULTS_PATH'])
    utils.save_maps(data_processor.numeric_ranges,
                    'Numeric_Ranges',
                    path=config['RESULTS_PATH'])
    utils.save_intermediate_data(data_processor.data,
                                 'Processed_Data',
                                 file_format='pickle',
                                 path=config['RESULTS_PATH'])

    # Train and save model
    utils.ensure_folder_existence(
        f'{config["RESULTS_PATH"]}/Intermediate/Models')
    categorical_features = list(data_processor.categorical_maps.keys())
    if config.get('TREE_MODELS'):
        modeler = lgb_modelers.GradientBoostedTreesModeler(
            config=config, data=data_processor.data,
            categorical_features=categorical_features)
        modeler.build_model()
        for i, lead_specific_model in enumerate(modeler.model):
            lead_path = (f'{config["RESULTS_PATH"]}/Intermediate/Models/'
                         f'{i + 1}-lead_GBT_Model.json')
            with open(lead_path, 'w') as file:
                json.dump(lead_specific_model.dump_model(), file, indent=4)
    elif config.get('PROPORTIONAL_HAZARDS'):
        modeler = tf_modelers.ProportionalHazardsModeler(
            config=config, data=data_processor.data,
            categorical_features=categorical_features)
        modeler.build_model()
        modeler.model.save(
            f'{config["RESULTS_PATH"]}/Intermediate/Models/PH_Model.h5')
    else:
        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
            config=config, data=data_processor.data,
            categorical_features=categorical_features)
        modeler.build_model()
        modeler.model.save(
            f'{config["RESULTS_PATH"]}/Intermediate/Models/FFNN_Model.h5')
    print(f'Model training time: {time() - checkpoint_time} seconds')
    checkpoint_time = time()

    # Save metrics and forecasts
    utils.save_output_table(modeler.evaluate(modeler.data['_validation']
                                             & ~modeler.data['_test']),
                            'Metrics',
                            path=config['RESULTS_PATH'])
    individual_predictions = modeler.forecast()
    utils.save_output_table(individual_predictions,
                            'Survival_Curves',
                            path=config['RESULTS_PATH'])
    utils.save_output_table(
        utils.compute_aggregation_uncertainty(individual_predictions),
        'Aggregate_Survival_Bounds',
        index=False,
        path=config['RESULTS_PATH'])

    # Save and plot retention rates
    lead_periods = config['RETENTION_INTERVAL']
    time_ids = pd.factorize(modeler.data[modeler.config['TIME_IDENTIFIER']],
                            sort=True)[0]
    retention_rates = modeler.tabulate_retention_rates(
        lead_periods=lead_periods, time_ids=time_ids)
    utils.save_output_table(retention_rates,
                            'Retention_Rates',
                            path=config['RESULTS_PATH'])
    axes = retention_rates.plot()
    axes.set_ylabel(f'{lead_periods}-period Retention Rate')
    earliest_period = data_processor.numeric_ranges.loc[
        data_processor.config["TIME_IDENTIFIER"], "Minimum"]
    axes.set_xlabel(f'Periods Since {earliest_period}')
    utils.save_plot('Retention_Rates', path=config['RESULTS_PATH'])

    # Save event counts by quantile
    utils.save_output_table(modeler.tabulate_survival_by_quantile(
        modeler.data['_validation'] & ~modeler.data['_test'],
        n_quantiles=config['QUANTILES']),
                            'Counts_by_Quantile',
                            index=False,
                            path=config['RESULTS_PATH'])

    # Plot SHAP values for a subset of observations in the final period
    if isinstance(modeler, lgb_modelers.GradientBoostedTreesModeler):
        subset = modeler.data.index.isin(data_processor.raw_subset.index)
        shap_values = modeler.compute_shap_values(subset=subset)
        utils.plot_shap_values(
            shap_values,
            data_processor.raw_subset[modeler.categorical_features +
                                      modeler.numeric_features],
            modeler.data[subset][modeler.categorical_features +
                                 modeler.numeric_features],
            config['TIME_IDENTIFIER'],
            path=config['RESULTS_PATH'])

    # Save metrics for interacted fixed effects model
    if set() < set(config['FIXED_EFFECT_FEATURES']) <= set(
            data_processor.data):
        ife_modeler = pd_modelers.InteractedFixedEffectsModeler(
            config=config, data=data_processor.data,
            categorical_features=categorical_features)
        ife_modeler.build_model()
        with open(f'{config["RESULTS_PATH"]}Intermediate/Models/IFE_Model.p',
                  'wb') as file:
            pickle.dump(ife_modeler.model, file)
        subset = ife_modeler.data['_validation'] & ~ife_modeler.data['_test']
        utils.save_output_table(ife_modeler.evaluate(subset),
                                'IFE_Metrics',
                                path=config['RESULTS_PATH'])
        ife_quantiles = ife_modeler.tabulate_survival_by_quantile(
            subset, n_quantiles=config['QUANTILES'])
        utils.save_output_table(ife_quantiles,
                                'IFE_Counts_by_Quantile',
                                index=False,
                                path=config['RESULTS_PATH'])

    print(f'Output production time: {time() - checkpoint_time} seconds')
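Example #8 additionally assumes standard-library imports of json, os, pickle, sys, and time, plus the modules it references (presumably fife.lgb_modelers, fife.pd_modelers, fife.processors, fife.tf_modelers, and fife.utils). The pipeline is driven entirely by the JSON configuration file passed on the command line. As a rough sketch, a candidate configuration covering the keys referenced in these examples could be generated as follows; the values are illustrative placeholders, not the library's documented defaults.

# Illustrative only: a minimal configuration covering the keys main() reads.
import json

example_config = {
    "SEED": 9999,
    "RESULTS_PATH": "FIFE_results",
    "DATA_FILE_PATH": "Input_Data.csv",
    "INDIVIDUAL_IDENTIFIER": "",  # empty -> assumed to be the leftmost column
    "TIME_IDENTIFIER": "",        # empty -> assumed to be the second-leftmost column
    "TREE_MODELS": True,          # False falls through to the PH or FFNN modelers
    "PROPORTIONAL_HAZARDS": False,
    "RETENTION_INTERVAL": 1,
    "QUANTILES": 5,
    "FIXED_EFFECT_FEATURES": [],
    "MIN_SURVIVORS_IN_TRAIN": 64,
    "NON_CAT_MISSING_VALUE": -1,
}

with open("example_config.json", "w") as file:
    json.dump(example_config, file, indent=4)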