Example #1
from random import randint

# create_random_join, learning_cost and candidate_rdc_sum_means are helper
# functions defined elsewhere in the surrounding project.


def generate_candidate_solution(pairwise_max_rdc, table_index_dict, prep,
                                max_budget, schema, max_no_relationships,
                                rdc_threshold):
    # Candidates are stored as (relationships, tables) pairs of frozensets,
    # so duplicates are recognized regardless of element order
    spn_relationships_list = set()
    all_merged_tables = set()
    learning_costs = 0

    # Base ensemble of single or binary SPNs: create a candidate for every
    # relationship whose maximal pairwise RDC value exceeds the threshold
    for relationship_obj in schema.relationships:
        relationship_list = [relationship_obj.identifier]
        merged_tables = [relationship_obj.start, relationship_obj.end]

        if candidate_rdc_sum_means(pairwise_max_rdc, table_index_dict, [
            (relationship_list, merged_tables)
        ]) > rdc_threshold:
            # learning_costs += learning_cost(prep, [relationship_list])
            all_merged_tables.update(merged_tables)
            spn_relationships_list.add(
                (frozenset(relationship_list), frozenset(merged_tables)))

    # Add every table not covered by a relationship as a single-table SPN
    for table in {t.table_name
                  for t in schema.tables}.difference(all_merged_tables):
        # learning_costs += learning_cost(prep, None, single_table=table)
        spn_relationships_list.add((frozenset(), frozenset([table])))

    # In addition, randomly select larger joins; stop once five candidates
    # have been rejected in total
    rejected_candidates = 0
    while rejected_candidates < 5:
        no_joins = randint(2, max_no_relationships)
        relationship_list, merged_tables = create_random_join(schema, no_joins)
        current_costs = learning_cost(prep, [relationship_list])

        # Already in ensemble
        if (frozenset(relationship_list),
                frozenset(merged_tables)) in spn_relationships_list:
            rejected_candidates += 1
            continue

        # Does not offer any benefit: RDC value does not exceed the threshold
        if candidate_rdc_sum_means(pairwise_max_rdc, table_index_dict, [
            (relationship_list, merged_tables)
        ]) <= rdc_threshold:
            rejected_candidates += 1
            continue

        # Cannot be added within the remaining learning budget
        if learning_costs + current_costs > max_budget:
            break

        # Can be added
        all_merged_tables.update(merged_tables)
        learning_costs += current_costs
        spn_relationships_list.add(
            (frozenset(relationship_list), frozenset(merged_tables)))

    return frozenset(spn_relationships_list), learning_costs
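
The candidate set relies on frozensets so that a join discovered twice, possibly with its tables listed in a different order, is deduplicated by the set. A minimal, self-contained sketch of that pattern (hypothetical identifiers, not project code):

candidates = set()
candidates.add((frozenset(['orders_customer']),
                frozenset(['orders', 'customer'])))
# The same join with its tables listed in a different order hashes identically
duplicate = (frozenset(['orders_customer']),
             frozenset(['customer', 'orders']))
print(duplicate in candidates)  # True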
Example #2
import copy
import logging
import pickle

# JoinDataPreparator, find_relationships and create_random_join are helpers
# defined elsewhere in the surrounding project.
logger = logging.getLogger(__name__)


def prepare_sample_hdf(schema, hdf_path, max_table_data, sample_size):
    meta_data_path = hdf_path + '/meta_data.pkl'
    prep = JoinDataPreparator(meta_data_path,
                              schema,
                              max_table_data=max_table_data)
    new_meta_data = copy.deepcopy(prep.table_meta_data)

    def correct_meta_data(table):
        # Point the metadata at the sampled HDF file and reset the cached join
        # information, which no longer matches the sampled data
        new_meta_data[table]['hdf_path'] = new_meta_data[table][
            'hdf_path'].replace(table, table + '_sampled')
        incoming_relationships = find_relationships(schema,
                                                    table,
                                                    incoming=True)
        for relationship_obj in incoming_relationships:
            new_meta_data[table][relationship_obj.identifier] = None
        outgoing_relationships = find_relationships(schema,
                                                    table,
                                                    incoming=False)
        for relationship_obj in outgoing_relationships:
            new_meta_data[table][relationship_obj.identifier]['length'] = 0

    # Find a start table on the maximal join and sample it first
    max_join_relationships, _ = create_random_join(schema,
                                                   len(schema.relationships))
    start_table, _ = prep._find_start_table(max_join_relationships, 1)
    logger.debug(f"Creating sample for {start_table}")
    sampled_tables = {start_table}
    df_sample_cache = dict()
    df_full_samples, _, _, _ = prep.generate_n_samples(
        sample_size, single_table=start_table, drop_redundant_columns=False)
    df_sample_cache[start_table] = df_full_samples
    df_full_samples.to_hdf(f'{hdf_path}/{start_table}_sampled.hdf',
                           key='df',
                           format='table')
    correct_meta_data(start_table)

    # Walk the schema graph: in every pass, pick relationships with exactly
    # one sampled endpoint and sample the other endpoint by joining it against
    # the cached sample of its neighbour
    while len(sampled_tables) < len(schema.tables):
        for relationship_obj in schema.relationships:
            if (relationship_obj.start in sampled_tables) != (
                    relationship_obj.end in sampled_tables):
                if relationship_obj.start in sampled_tables:
                    # Outgoing edge: start is already sampled (e.g. lineorder
                    # sampled, now join date)
                    next_joined_table = relationship_obj.end
                    logger.debug(f"Creating sample for {next_joined_table}")
                    next_table_data = prep._get_table_data(
                        prep.table_meta_data[next_joined_table]['hdf_path'],
                        next_joined_table)
                    left_attribute = relationship_obj.end + '.' + relationship_obj.end_attr
                    right_attribute = relationship_obj.start + '.' + relationship_obj.start_attr

                    df_samples = df_sample_cache[relationship_obj.start]
                    df_samples = df_samples.set_index(right_attribute,
                                                      drop=False)
                    next_table_data = next_table_data.set_index(left_attribute,
                                                                drop=False)
                    next_table_data = df_samples.merge(next_table_data,
                                                       right_index=True,
                                                       left_on=right_attribute)
                    # only keep rows with join partner
                    next_table_data = next_table_data[next_table_data[
                        relationship_obj.end + '.' +
                        relationship_obj.multiplier_attribute_name] > 0]

                else:
                    # Incoming edge: end is already sampled, join the start
                    # table against it
                    next_joined_table = relationship_obj.start
                    logger.debug(f"Creating sample for {next_joined_table}")
                    next_table_data = prep._get_table_data(
                        prep.table_meta_data[next_joined_table]['hdf_path'],
                        next_joined_table)
                    left_attribute = relationship_obj.end + '.' + relationship_obj.end_attr
                    right_attribute = relationship_obj.start + '.' + relationship_obj.start_attr

                    df_samples = df_sample_cache[relationship_obj.end]
                    df_samples = df_samples.set_index(left_attribute,
                                                      drop=False)
                    # df_samples.index.name = None
                    next_table_data = next_table_data.set_index(
                        right_attribute, drop=False)
                    next_table_data = df_samples.merge(next_table_data,
                                                       right_index=True,
                                                       left_on=left_attribute)
                    # only keep rows with join partner
                    next_table_data = next_table_data[next_table_data[
                        relationship_obj.end + '.' +
                        relationship_obj.multiplier_attribute_name] > 0]

                if len(next_table_data) > sample_size:
                    next_table_data = next_table_data.sample(sample_size)
                # Only keep columns of interest
                del_cols = [
                    col for col in next_table_data.columns
                    if col not in prep.table_meta_data[next_joined_table]
                    ['relevant_attributes_full']
                ]
                next_table_data.drop(columns=del_cols, inplace=True)
                df_sample_cache[next_joined_table] = next_table_data
                next_table_data.to_hdf(
                    f'{hdf_path}/{next_joined_table}_sampled.hdf',
                    key='df',
                    format='table')
                correct_meta_data(next_joined_table)
                sampled_tables.add(next_joined_table)

    # Persist the corrected metadata for the sampled tables
    with open(hdf_path + '/meta_data_sampled.pkl', 'wb') as f:
        pickle.dump(new_meta_data, f, pickle.HIGHEST_PROTOCOL)
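
The while loop above is a frontier expansion over the schema graph: each pass samples every table reachable over one relationship from an already sampled table, until all tables are covered. A minimal, self-contained sketch of just the traversal on a toy schema (hypothetical table names, no sampling or joins):

relationships = [('lineorder', 'customer'), ('lineorder', 'date'),
                 ('customer', 'region')]
tables = {t for rel in relationships for t in rel}
sampled = {'lineorder'}
while len(sampled) < len(tables):
    for start, end in relationships:
        # Exactly one endpoint sampled: pull in the other one
        if (start in sampled) != (end in sampled):
            sampled.add(end if start in sampled else start)
print(sorted(sampled))  # ['customer', 'date', 'lineorder', 'region']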