Exemplo n.º 1
0
def convert_remaining_groups_to_rules(data):
    """Convert every remaining partition group in ``data`` into rule variables.

    For each group listed in ``data['partitions']``, the label returned by
    ``get_variable_mode`` (presumably the group's most frequent label — confirm
    against that helper) is used as the baseline, so no rule is generated for it.

    Returns the updated dataset, which must pass ``check_data``.
    """
    for group_name in list(data['partitions']):
        baseline = get_variable_mode(data, group_name)
        data = convert_group_to_rules(data, group_name, baseline_labels = [baseline])

    assert check_data(data)
    return data
Exemplo n.º 2
0
def convert_group_to_rules(data, name, baseline_labels = None):
    """Convert the categorical variable ``name`` into one rule per observed label.

    Parameters
    ----------
    data : dict
        dataset dictionary; ``data['variable_types'][name]`` must be 'categorical'
    name : str
        name of the categorical variable to convert
    baseline_labels : label, list of labels, or None
        label(s) to treat as the baseline; these are removed from the
        conversion so no rule is generated for them

    Returns
    -------
    dict
        updated dataset that passes ``check_data``
    """
    assert data['variable_types'][name] == 'categorical'

    # build one rule per label actually observed in the data column
    idx = data['variable_names'].index(name)
    labels = np.unique(data['X'][:, idx])
    conversion_dict = {k: [k] for k in labels}

    if baseline_labels is not None:

        # accept a single label as well as a list of labels
        # (isinstance rather than a `type(...) is` check, per idiom)
        if not isinstance(baseline_labels, list):
            baseline_labels = [baseline_labels]

        # drop baseline labels; pop() raises KeyError if a baseline label
        # is not among the observed labels (deliberate fail-fast)
        for g in baseline_labels:
            conversion_dict.pop(g)

    data = convert_categorical_to_rules(data, name, conversion_dict, prepend_name = True)
    assert check_data(data)
    return data
Exemplo n.º 3
0
def check_groups(groups, data = None):
    """Assert that a groups dictionary is well-formed and internally consistent.

    Every group must cover the same number of training, validation, and test
    samples. When ``data`` is supplied, those counts must also match the
    dataset matrices, and no group name may collide with a dataset variable,
    ordering, or partition.

    Returns True when all assertions pass (raises AssertionError otherwise).
    """
    assert type(groups) is dict
    if len(groups) == 0:
        return True

    names = list(groups.keys())
    first = groups[names[0]]
    expected_train = len(first['indices'])
    expected_valid = len(first['indices_validation'])
    expected_test = len(first['indices_test'])

    # every group must describe the same samples
    for info in groups.values():
        assert check_group_info(info)
        assert len(info['indices']) == expected_train
        assert len(info['indices_test']) == expected_test
        assert len(info['indices_validation']) == expected_valid

    if data is not None:

        assert check_data(data)
        assert expected_train == data['X'].shape[0]

        if expected_test > 0:
            assert has_test_set(data)
            assert expected_test == data['X_test'].shape[0]

        if expected_valid > 0:
            assert has_validation_set(data)
            assert expected_valid == data['X_validation'].shape[0]

        # group names must not shadow anything already in the dataset
        for name in names:
            assert name not in data['variable_names']
            assert name not in data['variable_types']
            assert name not in data['variable_orderings']
            assert name not in data['partitions']

    return True
Exemplo n.º 4
0
    def __init__(self, data, groups, pooled_model, decoupled_models,
                 groups_to_models):
        """Initialize from a dataset, its group definitions, and trained models.

        Parameters
        ----------
        data : dict
            dataset with 'X', 'Y', 'variable_names'; must pass
            ``check_data(data, ready_for_training=True)``
        groups : dict
            group definitions; must pass ``check_groups(groups, data)``
        pooled_model :
            model trained on the pooled data (stored as-is; opaque here)
        decoupled_models :
            per-group models (stored as-is; opaque here)
        groups_to_models : dict
            maps each training split — a tuple of (group_name, group_value)
            pairs — to the index of the model assigned to that split; must
            cover every split present in the training data, and every model
            index in ``range(len(self))`` must be used at least once

        NOTE(review): ``len(self)`` is used below, so the enclosing class must
        define ``__len__`` elsewhere — presumably the number of models; confirm.
        """

        # check inputs
        assert check_data(data, ready_for_training=True)
        assert check_groups(groups, data)

        # initialize data (defensive copies so callers can't mutate our state)
        self._data = {
            'X': np.array(data['X']),
            'Y': np.array(data['Y']),
            'variable_names': list(data['variable_names'])
        }

        self._groups = deepcopy(groups)
        self._pooled_model = pooled_model
        self._decoupled_models = decoupled_models

        # enumerate the distinct group-value combinations seen in training;
        # each split is a tuple of (group_name, group_value) pairs
        group_names, group_values = groups_to_group_data(groups)
        training_values = np.unique(group_values, axis=0).tolist()
        training_splits = [tuple(zip(group_names, v)) for v in training_values]

        # the mapping must cover exactly the splits observed in training
        assert isinstance(groups_to_models, dict)
        assert set(training_splits) == set(groups_to_models.keys(
        )), 'mapper should include map every group in the training data'
        # every model index in range(len(self)) must be assigned to some split
        assignment_idx = np.array(list(groups_to_models.values()))
        assert np.array_equal(np.unique(assignment_idx), np.arange(
            len(self))), 'every model should cover at least one group'

        # invert the mapping: model index -> list of group-value lists
        models_to_groups = {k: [] for k in range(len(self))}
        for group_tuple, model_index in groups_to_models.items():
            group_value = [s[1] for s in group_tuple]
            assert len(group_value) == len(group_names)
            models_to_groups[model_index].append(group_value)

        self._splits = training_splits
        self.groups_to_models = groups_to_models
        self.models_to_groups = models_to_groups
Exemplo n.º 5
0
def oversample_by_group(data, **kwargs):
    """Oversample the dataset separately within each group-attribute profile.

    Samples are partitioned by their unique combination of group attribute
    values (a "profile"); ``RandomOverSampler`` is applied within each
    profile so class balance is restored per profile rather than globally.
    Profiles missing either the + or - label are kept as-is with a warning.

    Parameters
    ----------
    data : dict
        dataset dictionary; labels in ``data['Y']`` must include both -1 and +1
    **kwargs :
        forwarded to ``RandomOverSampler``

    Returns
    -------
    dict
        dataset with oversampled 'X'/'Y', unit 'sample_weights', and the
        group variables re-attached as categorical partition variables
    """
    data, groups = split_groups_from_data(data)

    # collect group names, per-sample index values, and per-sample labels
    group_names = []
    group_labels = []
    group_values = []
    for n, g in groups.items():
        group_names.append(n)
        group_values.append(g['indices'])
        group_labels.append(g['labels'][g['indices']])
    group_values = np.transpose(np.vstack(group_values))
    group_labels = np.transpose(np.vstack(group_labels))

    # assign each sample a profile id for its combination of group attributes
    _, profile_idx = np.unique(group_values, axis = 0, return_inverse = True)
    profile_labels = range(0, np.max(profile_idx) + 1)

    # oversample labels within each profile
    ros = RandomOverSampler(**kwargs)
    # BUGFIX: imbalanced-learn renamed fit_sample -> fit_resample in 0.4 and
    # removed the old name in 0.8; resolve whichever the installed version has
    resample = getattr(ros, 'fit_resample', None)
    if resample is None:
        resample = ros.fit_sample
    X = np.array(data['X'])
    Y = np.array(data['Y'])
    X_res = []
    Y_res = []
    G_res = []
    assert np.isin((-1, 1), Y).all()

    for i in profile_labels:
        row_idx = np.isin(profile_idx, i)
        # all rows in a profile share group values; take labels from the first
        profile_values = group_labels[row_idx, :][0]
        Xg = X[row_idx, :]
        Yg = Y[row_idx]
        if np.isin((-1, 1), Yg).all():
            Xs, Ys = resample(Xg, Yg)
            X_res.append(Xs)
            Y_res.append(Ys)
            G_res.append(np.tile(profile_values, (len(Ys), 1)))
        else:
            # can't balance a one-class profile; keep it unchanged and warn
            profile_name = ''.join(['%s' % s for s in profile_values])
            warnings.warn('missing + and - labels for group %s' % profile_name)
            X_res.append(Xg)
            Y_res.append(Yg)
            G_res.append(np.tile(profile_values, (len(Yg), 1)))

    G_res = np.vstack(G_res)
    X_res = np.vstack(X_res)
    Y_res = np.concatenate(Y_res)

    data['X'] = X_res
    data['Y'] = Y_res
    # oversampled rows all carry unit weight
    data['sample_weights'] = np.ones_like(data['Y'], dtype = float)

    # re-attach group attributes as categorical partition variables
    for j, name in enumerate(group_names):
        data = add_variable(data,
                            name = name,
                            variable_type = 'categorical',
                            is_partition = True,
                            values = G_res[:, j])

    assert check_data(data)
    return data