def simulate_from_pokec_covariates2(data_dir, setting="A"):
    """mimic TMLE experiment to see what happens"""
    graph_data, profiles = load_data_pokec(data_dir)
    pokec_features = process_pokec_attributes(profiles)

    region = pokec_features['region']
    region = np.searchsorted(np.unique(region), region) - 1.

    old_school = pokec_features['old_school']
    age = pokec_features['scaled_age']
    age_cat = np.where(age < 0, -1., 1.)
    age_cat[np.isnan(age)] = 0

    # simulate
    covs = [old_school, region, age_cat]

    # z = 0.
    # for cov in covs:
    #     z += np.random.uniform(-2, 2)*cov
    z = (2 * (region < 1) - 1)
    t_prob = expit(z)
    t = np.random.binomial(1, t_prob)

    zz = 0.
    for cov in covs:
        zz += np.random.uniform(-1, 3) * cov

    y, y_1, y_0 = simulate_y(zz, t, setting)

    return t, y, y_0, y_1, t_prob, z


def simulate_from_pokec_covariates0(data_dir, setting="A"):
    """Simulate treatments and outcomes from continuous (standardized) Pokec covariates."""
    graph_data, profiles = load_data_pokec(data_dir)
    pokec_features = process_pokec_attributes(profiles)

    # predictable covariates
    covs = ['scaled_registration', 'scaled_age', 'region']
    # 'sign_in_zodiac',
    # 'relation_to_casual_sex']
    clean_age = np.where(np.isnan(pokec_features['scaled_age']),
                         np.zeros_like(pokec_features['scaled_age']),
                         pokec_features['scaled_age'])
    pokec_features['scaled_age'] = clean_age
    region = pokec_features['region']
    pokec_features['region'] = (region - region.mean()) / region.std()

    cov_array = np.zeros([len(covs), pokec_features['region'].shape[0]])
    for idx, cov in enumerate(covs):
        cov_array[idx] = pokec_features[cov]

    coeff = np.random.uniform(-2, 2, [len(covs), 1])

    z = 0.5 + np.sum(coeff * cov_array, 0)
    t_prob = expit(z)
    t = np.random.binomial(1, t_prob)

    y, y_1, y_0 = simulate_y(z, t, setting)

    return t, y, y_0, y_1, t_prob, z


def main():
    session_config = tf.ConfigProto(intra_op_parallelism_threads=0, inter_op_parallelism_threads=4)
    # session_config = tf.ConfigProto()
    tf.enable_eager_execution(config=session_config)

    print("This is the RERM (network) dataset test")

    parser = add_parser_sampling_arguments()
    args = parser.parse_args()

    print("load the data")
    graph_data, profiles = load_data_pokec('../dat/pokec/regional_subset')

    print("number of edges {}".format(graph_data.edge_list.shape[0]))

    pokec_features = process_pokec_attributes(profiles)

    treatments = pokec_features['I_like_books']
    outcomes = pokec_features['relation_to_casual_sex']

    print("make the graph sampler")
    dataset_fn_train = get_dataset_fn(args.sampler, args)

    # dataset = dataset_fn_train(graph_data, args.seed)
    # itr = dataset.make_one_shot_iterator()
    # t0 = time.time()
    # for _ in range(1000):
    #     sample = itr.get_next()
    # t1 = time.time()
    # print(t1-t0)

    print("make the input_fn")
    # make_sample_generator = make_input_fn(graph_data, args, treatments, outcomes, dataset_fn_train)
    make_sample_generator = make_no_graph_input_fn(graph_data, args, treatments, outcomes, filter_test=True)
    sample_generator = make_sample_generator()

    itr = sample_generator.make_one_shot_iterator()

    in_treat_and_test = []
    in_treat = []

    t0 = time.time()
    for _ in range(1000):
        sample = itr.get_next()
        treatment = sample[0]['treatment']
        test = sample[0]['in_test']
        in_treat += [np.mean(treatment)]
        in_treat_and_test += [np.sum(tf.cast(treatment, tf.float32)*test)]
    t1 = time.time()
    print(t1-t0)

    print(np.mean(in_treat))
    print(np.mean(in_treat_and_test))

    print(tf.equal(tf.squeeze(sample[0]['in_test']), 1))

    print(sample[0].keys())
    print(sample[0]['treatment'].shape[0].value)


def simulate_from_pokec_covariate(data_dir,
                                  covariate='region',
                                  beta0=1.0,
                                  beta1=1.0,
                                  gamma=1.0):
    """Simulate treatments and outcomes confounded by a single Pokec covariate
    (region, age, or registration)."""
    graph_data, profiles = load_data_pokec(data_dir)
    pokec_features = process_pokec_attributes(profiles)

    # predictable covariates
    covs = ['scaled_registration', 'scaled_age', 'region']
    # 'sign_in_zodiac',
    # 'relation_to_casual_sex']

    # reindex region to 0, 1, 2
    region = pokec_features['region']
    region = np.searchsorted(np.unique(region), region) - 1.

    age = pokec_features['scaled_age']
    age_cat = np.where(age < 0., -1., 1.)
    age_cat[np.isnan(age)] = 0

    registration = pokec_features['scaled_registration']
    registration_cat = np.where(registration < -0.5, -1., 0.)
    registration_cat[registration > 0.5] = 1.

    if covariate == 'region':
        confounder = region
    elif covariate == 'age':
        confounder = age_cat
    elif covariate == 'registration':
        confounder = registration_cat
    else:
        raise Exception("covariate name not recognized")

    # simulate treatments and outcomes
    propensities = 0.5 + 0.35 * confounder
    treatment = np.random.binomial(1, propensities)
    y, y0, y1 = simulate_y(propensities,
                           treatment,
                           beta0=beta0,
                           beta1=beta1,
                           gamma=gamma)

    t = treatment.astype(np.int32)
    y = y.astype(np.float32)
    y0 = y0.astype(np.float32)
    y1 = y1.astype(np.float32)

    return t, y, y0, y1, propensities
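

def _simulation_sanity_check(data_dir='../dat/pokec/regional_subset'):
    """Hedged usage sketch, not part of the original pipeline: compare the
    oracle ATE implied by the simulated potential outcomes with the naive
    difference in observed means. The default data_dir is an assumption based
    on the paths used elsewhere in this file."""
    t, y, y0, y1, _ = simulate_from_pokec_covariate(data_dir, covariate='region')
    oracle_ate = np.mean(y1 - y0)  # true average treatment effect under the simulation
    naive_ate = y[t == 1].mean() - y[t == 0].mean()  # confounded difference in means
    print("oracle ATE: {:.3f}, naive ATE: {:.3f}".format(oracle_ate, naive_ate))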


def main():
    tf.enable_eager_execution()
    tf.logging.set_verbosity(tf.logging.INFO)

    parser = add_parser_model_arguments()
    args = parser.parse_args()

    print("load the data")
    graph_data, profiles = load_data_pokec(args.data_dir)

    print("Loaded data with {} vertices and {} edges".format(
        graph_data.num_vertices, graph_data.edge_list.shape[0]))

    np.random.seed(42)  # use consistent seed for simulation
    if args.simulated == 'attribute':
        treatments, outcomes, y_0, y_1, t_prob = \
            simulate_from_pokec_covariate(args.data_dir,
                                          covariate=args.covariate,
                                          beta0=1.0,
                                          beta1=args.beta1,
                                          gamma=1.0)
    elif args.simulated == 'propensity':
        output = pd.read_csv(args.base_propensities_path, sep='\t')
        base_propensity_scores = output['treatment_probability'].values

        treatments, outcomes, y_0, y_1, t_prob = \
            simulate_exogeneity_experiment(base_propensity_scores,
                                           exogeneous_con=args.exogeneity,
                                           beta0=1.0,
                                           beta1=args.beta1,
                                           gamma=1.0)

    # but let the seed change for data splitting and initialization
    tf.set_random_seed(args.seed)
    np.random.seed(args.seed + 42)

    os.makedirs(args.output_dir, exist_ok=True)
    np.savez(os.path.join(args.output_dir, 'simulated_data'),
             treatments=treatments,
             outcomes=outcomes,
             y_0=y_0,
             y_1=y_1,
             t_prob=t_prob)

    treatment_cat = True
    outcome_cat = not outcomes.dtype == np.float32

    if not outcome_cat:
        # rescale outcome to reduce the sensitivity of training to optimization parameters
        outcomes = (outcomes - outcomes.mean()) / outcomes.std()

    if not args.do_train and not args.do_eval and not args.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    tf.gfile.MakeDirs(args.output_dir)

    session_config = tf.ConfigProto(intra_op_parallelism_threads=0,
                                    inter_op_parallelism_threads=4)

    if args.use_xla:
        session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    run_config = tf.estimator.RunConfig(
        log_step_count_steps=10,
        model_dir=args.output_dir,
        save_checkpoints_steps=args.save_checkpoints_steps,
        keep_checkpoint_max=args.keep_checkpoints,
        # save_checkpoints_steps=None,
        # save_checkpoints_secs=None,
        save_summary_steps=10,
        session_config=session_config)

    # estimator setup

    num_train_steps = args.num_train_steps
    vertex_embedding_params = {
        'embedding_dim': args.embedding_dim,
        'embedding_trainable': _str2bool(args.embedding_trainable)
    }

    model_fn = treatment_response_model_fn_builder(
        label_task_weight=args.label_task_weight,
        init_checkpoint=args.init_checkpoint,
        label_pred=args.label_pred,
        unsupervised=args.unsupervised,
        global_optimizer=_make_global_optimizer(args),
        embedding_optimizer=_make_local_optimizer(args),
        regularization=None,
        treatment_cat=treatment_cat,
        outcome_cat=outcome_cat,
        polyak_train=True)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        params={
            **vertex_embedding_params, 'num_vertices': graph_data.num_vertices,
            'batch_size': args.batch_size
        },
        model_dir=args.output_dir,
        config=run_config)

    if args.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", args.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        # subsample and process the data
        with tf.name_scope("training_data"):
            dataset_fn_train = get_dataset_fn(args.sampler, args)
            train_input_fn = make_input_fn(graph_data, args, treatments,
                                           outcomes, dataset_fn_train)

        # additional logging
        hooks = [
            tf.train.LoggingTensorHook({'loss': 'loss'}, every_n_iter=100)
        ]
        if args.label_pred:
            hooks += [
                tf.train.LoggingTensorHook(
                    {
                        # 'token_ids': 'token_ids',
                        # 'token_mask': 'token_mask',
                        # 'label_ids': 'label_ids',
                        # 'pred_in': 'summary/in_split/predictions',
                        # 'pred_out': 'summary/out_split/predictions',
                        # 'ra_in': 'summary/in_split/labels/kappa/batch_random_agreement/random_agreement',
                        # 'ra_out': 'summary/out_split/labels/kappa/batch_random_agreement/random_agreement',
                    },
                    every_n_iter=1000)
            ]

        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if args.do_train and (args.do_eval or args.do_predict):
        # reload the model to get rid of unsupervised parts of the model
        trained_model_checkpoint = tf.train.latest_checkpoint(args.output_dir)
        model_fn = treatment_response_model_fn_builder(
            label_task_weight=args.label_task_weight,
            init_checkpoint=trained_model_checkpoint,
            label_pred=True,
            unsupervised=False,
            treatment_cat=treatment_cat,
            outcome_cat=outcome_cat,
            polyak_train=False,
            polyak_restore=False)

        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            params={
                **vertex_embedding_params, 'num_vertices':
                graph_data.num_vertices,
                'batch_size': args.batch_size
            },
            model_dir=args.output_dir,
            config=run_config)

    if args.do_eval:

        tf.logging.info("***** Running evaluation *****")
        # tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", args.batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None

        with tf.name_scope("evaluation_data"):
            eval_input_fn = make_no_graph_input_fn(graph_data,
                                                   args,
                                                   treatments,
                                                   outcomes,
                                                   filter_test=True)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_predict:
        tf.logging.info("***** Running prediction*****")

        if not outcome_cat:
            # undo the normalization of the outputs
            m = outcomes.mean()
            s = outcomes.std()

            def descale(prediction):
                prediction['outcome'] = prediction['outcome'] * s + m
                prediction['expected_outcome_st_treatment'] = prediction[
                    'expected_outcome_st_treatment'] * s + m
                prediction['expected_outcome_st_no_treatment'] = prediction[
                    'expected_outcome_st_no_treatment'] * s + m
                return prediction
        else:
            # categorical Y wasn't rescaled, so no need to do this
            def descale(prediction):
                return prediction

        with tf.name_scope("evaluation_data"):
            predict_input_fn = make_no_graph_input_fn(graph_data, args,
                                                      treatments, outcomes)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(args.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            tf.logging.info("***** Predict results *****")

            attribute_names = [
                'vertex_index', 'in_test', 'treatment_probability',
                'expected_outcome_st_treatment',
                'expected_outcome_st_no_treatment', 'outcome', 'treatment'
            ]

            header = "\t".join(attribute_name
                               for attribute_name in attribute_names) + "\n"
            writer.write(header)
            for prediction in result:
                prediction = descale(prediction)
                output_line = "\t".join(
                    str(prediction[attribute_name])
                    for attribute_name in attribute_names) + "\n"
                writer.write(output_line)
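

def _ate_from_predictions(output_dir):
    """Hedged sketch, not part of the original code: read back the
    test_results.tsv written by main() and form a naive plug-in ATE from the
    predicted potential outcomes. Assumes each prediction was written as a
    plain scalar (no array brackets), so pandas parses the columns as floats."""
    preds = pd.read_csv(os.path.join(output_dir, "test_results.tsv"), sep='\t')
    # difference of predicted outcomes with and without treatment, averaged over vertices
    ate = (preds['expected_outcome_st_treatment'] -
           preds['expected_outcome_st_no_treatment']).mean()
    return ate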


def simulate_from_pokec_covariates(data_dir,
                                   setting="A",
                                   discretize_covariates=True,
                                   easy_t=True):
    """Simulate treatments and outcomes from Pokec covariates, with options to
    discretize the covariates and to use an easier treatment-assignment
    mechanism."""
    graph_data, profiles = load_data_pokec(data_dir)
    pokec_features = process_pokec_attributes(profiles)

    # predictable covariates
    covs = ['scaled_registration', 'scaled_age', 'region']
    # 'sign_in_zodiac',
    # 'relation_to_casual_sex']

    # reindex region to 0, 1, 2
    region = pokec_features['region']
    region = np.searchsorted(np.unique(region), region) - 1.

    if discretize_covariates:
        age = pokec_features['scaled_age']
        age_cat = np.where(age < 0, -1., 1.)
        age_cat[np.isnan(age)] = 0

        old_school = pokec_features[
            'old_school']  # binarized version of registration

        # simulate
        covs = [old_school, region, age_cat]

    else:
        scaled_age = pokec_features['scaled_age']
        scaled_age[np.isnan(scaled_age)] = 0

        scaled_region = (region - region.mean()) / region.std()
        registration = pokec_features['scaled_registration']

        covs = [registration, scaled_region, scaled_age]

    # treatment
    if easy_t:
        z = (2 * (region < 1) - 1)
        t_prob = expit(z)
        t = np.random.binomial(1, t_prob)
    else:
        z = 0.
        for cov in covs:
            z += np.random.uniform(-1, 3) * cov

        z = (z - z.min()) / (z.max() - z.min())  # rescale to [0, 1]
        z = (z - 0.5) * 6.  # rescale to [-3, 3]
        t_prob = expit(
            z
        )  # probabilities between 0.047 and 0.95 (enough to be interesting w/o being pathological)
        t = np.random.binomial(1, t_prob)

    # confounding
    zz = 0.
    for cov in covs:
        zz += np.random.uniform(-1, 3) * cov

    y, y_1, y_0 = simulate_y(zz, t, setting)

    return t, y, y_0, y_1, t_prob, z, zz
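

def _check_t_prob_range(data_dir='../dat/pokec/regional_subset'):
    """Hedged sketch, not part of the original code: check that the hard
    treatment assignment (easy_t=False) yields propensities roughly in
    [expit(-3), expit(3)] ~ [0.047, 0.953], as the comment in
    simulate_from_pokec_covariates claims. The default data_dir is an
    assumption."""
    _, _, _, _, t_prob, _, _ = simulate_from_pokec_covariates(
        data_dir, setting="A", discretize_covariates=True, easy_t=False)
    print("t_prob range: [{:.3f}, {:.3f}]".format(t_prob.min(), t_prob.max()))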


if __name__ == '__main__':
    # main()
    tf.enable_eager_execution()
    data_dir = '../dat/networks/pokec/regional_subset'
    graph_data, profiles = load_data_pokec(data_dir)
    pokec_features = process_pokec_attributes(profiles)

    sbm_embedding = np.loadtxt(
        '../dat/networks/pokec/regional_subset/svinetk128groups.txt')
    sbm_embedding = sbm_embedding[:, 1:]  # drop the first column
    sbm_embedding = sbm_embedding[sbm_embedding[:, 0].argsort()]  # sort rows by the (new) first column
    sbm_embedding = sbm_embedding[:, 1:]  # drop that column as well, keeping only the embedding

    print("Loaded data with {} vertices and {} edges".format(
        graph_data.num_vertices, graph_data.edge_list.shape[0]))

    reps = 25

    for setting in ['A', 'B', 'C', 'D', 'E']:
        for discretize_covariates in [True, False]: