Пример #1
0
def test_e2e(dod, number_jps=5):
    """Run a fixed virtual-schema query and print key conflicts between views.

    Iterates over the ``(view, attrs_to_project)`` pairs yielded by
    ``dod.virtual_schema_iterative_search``. The first view yielded becomes
    the reference: its most likely key is taken from the first entry returned
    by ``mva.most_likely_key``. Every view (the reference included) is then
    compared against the reference with ``mva.inconsistent_value_on_key`` and
    any non-empty ``conflicting_pair`` result is printed.

    Args:
        dod: Object exposing ``virtual_schema_iterative_search(attrs, values,
            debug_enumerate_all_jps=...)``.
        number_jps: Currently unused. It used to cap the number of views
            inspected, but that early exit is disabled; the parameter is kept
            so existing callers keep working.

    Returns:
        None. Results are reported on stdout only.
    """
    attrs = ["Mit Id", "Krb Name", "Hr Org Unit Title"]
    values = ["968548423", "kimball", "Mechanical Engineering"]

    # Alternative query:
    # attrs = ["Last Name", "Building Name", "Bldg Gross Square Footage", "Department Name"]
    # values = ["Madden", "Ray and Maria Stata Center", "", "Dept of Electrical Engineering & Computer Science"]

    first_mjp = None
    most_likely_key = None
    results = dod.virtual_schema_iterative_search(
        attrs, values, debug_enumerate_all_jps=False)
    # enumerate() keeps the printed index in step with the view being
    # processed; the previous hand-rolled counter was never incremented.
    for i, (mjp, _attrs_project) in enumerate(results):
        print("JP: " + str(i))
        if i == 0:
            # The first view is the reference all later views are compared to.
            first_mjp = mjp
            most_likely_keys_info = mva.most_likely_key(first_mjp)
            most_likely_key = most_likely_keys_info[0][0]
        _missing_keys, _non_unique_df1, _non_unique_df2, conflicting_pair = \
            mva.inconsistent_value_on_key(first_mjp, mjp, key=most_likely_key)
        if len(conflicting_pair) > 0:
            print(str(conflicting_pair))
Пример #2
0
def rank_materializable_join_graphs(materializable_join_paths, table_path):
    """Score join paths by how key-like their left-hand join fields are.

    Each join path is an iterable of ``(filter, left, right)`` triples. For
    every triple, the table named by ``left.source_name`` is loaded with
    ``dpu.get_dataframe`` and analysed with ``mva.most_likely_key``; the
    score attached to ``left.field_name`` in that result is the hop score.

    Args:
        materializable_join_paths: Iterable of join paths as described above.
        table_path: Mapping from table name to the directory holding it.

    Returns:
        A list of ``(join_path, average_score, product_score)`` tuples sorted
        by ``average_score`` in descending order.
    """
    likely_keys_by_table = {}

    def likely_keys_for(table_name):
        # Load and analyse each table at most once per call.
        if table_name not in likely_keys_by_table:
            frame = dpu.get_dataframe(
                table_path[table_name] + "/" + table_name)
            likely_keys_by_table[table_name] = mva.most_likely_key(frame)
        return likely_keys_by_table[table_name]

    def lookup_score(ranked_keys, column):
        # Yields None when the column is not listed among the ranked keys.
        return next(
            (score for name, _nunique, score in ranked_keys
             if column == name),
            None)

    scored_paths = []
    for join_path in materializable_join_paths:
        hop_scores = [
            lookup_score(likely_keys_for(left.source_name), left.field_name)
            for _predicate, left, _right in join_path
        ]
        mean_score = np.average(np.asarray(hop_scores))
        product_score = reduce(operator.mul, hop_scores)
        scored_paths.append((join_path, mean_score, product_score))

    # Stable sort, best average first.
    scored_paths.sort(key=operator.itemgetter(1), reverse=True)
    return scored_paths
Пример #3
0
def rank_materializable_join_paths_piece(materializable_join_paths,
                                         candidate_group, table_path, dod):
    """Re-rank join paths position by position using likely-key ranks.

    The joins found at the same position (jump) across all join paths are
    sorted independently by the rank of their left field in the table's
    likely-key list (lower rank first), and the sorted columns are then
    zipped back together: output path ``i`` receives the ``i``-th ranked join
    of every jump. When paths have different lengths, later jumps hold fewer
    joins and therefore only contribute to the first output paths.

    Args:
        materializable_join_paths: Sequence of join paths; each join is
            indexable and its element ``[1]`` exposes ``source_name`` and
            ``field_name``.
        candidate_group: Iterable of table names whose likely keys are ranked.
        table_path: Mapping from table name to its directory. Tables missing
            from it are resolved through ``dod.aurum_api`` and the resolved
            path is stored back into this mapping.
        dod: Object exposing ``aurum_api`` (used only for path resolution).

    Returns:
        A list with ``len(materializable_join_paths)`` lists of joins. An
        empty input yields an empty list.

    Raises:
        KeyError: If a join references a table or field that was not ranked.
    """
    # Rank (position in the likely-key list) of every field, per table.
    table_field_rank = dict()
    for table in candidate_group:
        if table in table_path:
            path = table_path[table]
        else:
            nid = (dod.aurum_api.make_drs(table)).data[0].nid
            path = dod.aurum_api.helper.get_path_nid(nid)
            # Remember the resolved path for later calls by the caller.
            table_path[table] = path
        table_df = dpu.get_dataframe(path + "/" + table)
        likely_keys_sorted = mva.most_likely_key(table_df)
        table_field_rank[table] = {
            payload[0]: i
            for i, payload in enumerate(likely_keys_sorted)
        }

    # 1) Split join paths into their pairs, 2) sort each position
    # individually, 3) assemble again.

    # default=0 keeps an empty input from raising; it simply produces [].
    num_jumps = max((len(x) for x in materializable_join_paths), default=0)
    jump_joins = {i: [] for i in range(num_jumps)}

    # 1) split
    for annotated_jp in materializable_join_paths:
        for i, jp in enumerate(annotated_jp):
            jump_joins[i].append(jp)

    def field_to_rank(table, field):
        return table_field_rank[table][field]

    # 2) sort (in place; stable, so ties keep their input order)
    for joins in jump_joins.values():
        joins.sort(
            key=lambda x: field_to_rank(x[1].source_name, x[1].field_name))

    # 3) assemble
    ranked_materialized_join_paths = [
        [] for _ in range(len(materializable_join_paths))
    ]
    for joins in jump_joins.values():
        for i, join in enumerate(joins):
            ranked_materialized_join_paths[i].append(join)

    return ranked_materialized_join_paths
Пример #4
0
def test_e2e(dod, number_jps=5):
    """Run a fixed virtual-schema query and print key conflicts between views.

    Iterates over the ``(view, attrs_to_project)`` pairs yielded by
    ``dod.virtual_schema_iterative_search``. Each view is projected with
    ``dpu.project``. The first view yielded becomes the reference: its most
    likely key is taken from the first entry returned by
    ``mva.most_likely_key``. Every view (the reference included) is then
    compared against the reference with ``mva.inconsistent_value_on_key`` and
    any non-empty ``conflicting_pair`` result is printed.

    Args:
        dod: Object exposing ``virtual_schema_iterative_search(attrs, values,
            debug_enumerate_all_jps=...)``.
        number_jps: Currently unused. It used to cap the number of views
            inspected, but that early exit is disabled; the parameter is kept
            so existing callers keep working.

    Returns:
        None. Results are reported on stdout only.
    """
    attrs = ["Subject", "Title", "Publisher"]
    values = [
        "", "Man who would be king and other stories",
        "Oxford university press, incorporated"
    ]

    # Alternative queries:
    # attrs = ["Mit Id", "Krb Name", "Hr Org Unit Title"]
    # values = ["968548423", "kimball", "Mechanical Engineering"]

    # attrs = ["Iap Category Name", "Person Name", "Person Email"]
    # # values = ["", "Meghan Kenney", "*****@*****.**"]
    # values = ["Engineering", "", ""]

    # attrs = ["Building Name Long", "Ext Gross Area", "Building Room", "Room Square Footage"]
    # values = ["", "", "", ""]

    # attrs = ["c_name", "c_phone", "n_name", "l_tax"]
    # values = ["Customer#000000001", "25-989-741-2988", "BRAZIL", ""]

    # attrs = ["Last Name", "Building Name", "Bldg Gross Square Footage", "Department Name"]
    # values = ["Madden", "Ray and Maria Stata Center", "", "Dept of Electrical Engineering & Computer Science"]

    first_mjp = None
    most_likely_key = None
    results = dod.virtual_schema_iterative_search(
        attrs, values, debug_enumerate_all_jps=False)
    # enumerate() keeps the printed index in step with the view being
    # processed; the previous hand-rolled counter was never incremented.
    for i, (mjp, attrs_project) in enumerate(results):
        print("JP: " + str(i))

        # NOTE(review): the projected view is not used below; the call is
        # kept in case it has side effects - confirm whether it can go.
        dpu.project(mjp, attrs_project)

        if i == 0:
            # The first view is the reference all later views are compared to.
            first_mjp = mjp
            most_likely_keys_info = mva.most_likely_key(first_mjp)
            most_likely_key = most_likely_keys_info[0][0]
        _missing_keys, _non_unique_df1, _non_unique_df2, conflicting_pair = \
            mva.inconsistent_value_on_key(first_mjp, mjp, key=most_likely_key)
        if len(conflicting_pair) > 0:
            print(str(conflicting_pair))