# Imports assumed by this excerpt (the surrounding module is not shown);
# parse_dataset/pds, bmm, pt, and schm are project-local modules.
import json
import os
import pathlib
import pprint
import random
import time
from operator import itemgetter

import pandas as pd


def gather_statistics(schema_set, datasources_with_tag, stats_output_p,
                      datasets_input_p):
    for source_name in datasources_with_tag:
        all_stats = {}
        file_path = stats_output_p + source_name + '.json'
        my_file = pathlib.Path(file_path)
        if my_file.exists():
            continue
        path = datasets_input_p + source_name + '.csv'
        dataset = pd.read_csv(path, index_col=0, header=0)
        dataset = df_rename_cols(dataset)
        attr_schema = schema_set[source_name]
        df_columns = list(dataset.columns.values)
        attr_schema = [{'name': parse_dataset.clean_name(attr['name'], False, False)}
                       for attr in attr_schema]
        df_columns = [parse_dataset.clean_name(attr, False, False)
                      for attr in df_columns]
        cols_to_delete = find_attrs_to_delete(attr_schema, df_columns)
        dataset = df_delete_cols(dataset, cols_to_delete)
        schema = schema_set[source_name]
        attributes_list = [parse_dataset.clean_name(attr['name'], False, False)
                           for attr in schema]
        attributes_list = [item for item in attributes_list
                           if item not in cols_to_delete]
        print(source_name, attributes_list, cols_to_delete)
        for attr in attributes_list:
            stat, groups, uniques = groupby_unique(attr, dataset)
            all_stats[attr] = stat
        # TODO: more types of stats needed
        with open(file_path, 'w') as fp:
            json.dump(all_stats, fp, sort_keys=True, indent=2)
    return
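# Hedged sketch, not part of the original module: groupby_unique is called
# above but defined elsewhere (bmm). From how its outputs are consumed
# (`stat` is JSON-serialized per attribute; `uniques` is sorted and reused
# as example values), a minimal implementation could look like this; the
# real helper may differ.
def groupby_unique_sketch(attr, dataset):
    """Return (value -> count dict, GroupBy object, unique values) for one column."""
    groups = dataset.groupby(attr)
    stat = {str(val): int(cnt)
            for val, cnt in dataset[attr].value_counts().items()}
    uniques = [str(val) for val in dataset[attr].dropna().unique()]
    return stat, groups, uniques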
def load_prematching_metadata(p, m, pds):
    # Build a topic -> {source_name: context} index over all datasources.
    with open(p.enriched_topics_json_dir, 'r') as topic_contexts_f:
        topic_contexts = json.load(topic_contexts_f)
    all_topics = {}  # TODO: get all available topics
    for source_name in m.datasources_with_tag:
        source_name = pds.clean_name(source_name, False, False)
        for src_top in topic_contexts[source_name]:
            if src_top not in all_topics:
                all_topics[src_top] = {}
            all_topics[src_top][source_name] = topic_contexts[source_name][src_top]  # synsets
    with open(p.enriched_attrs_json_dir, 'r') as attrs_contexts_f:
        attrs_contexts = json.load(attrs_contexts_f)
    return all_topics, attrs_contexts, topic_contexts
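# Shape note (illustrative names and values, not from the data): the returned
# all_topics maps each enriched topic to the datasources that carry it, e.g.
#   all_topics = {'trees': {'street-trees': [...synsets...],
#                           'park-screen-trees': [...synsets...]}}
# while attrs_contexts keeps per-dataset, per-attribute context.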
def load_per_source_metadata(p, m, datasources, source_name, pds, bmm):
    # Attribute names come from the precomputed stats file; the raw CSV is
    # not loaded here.
    with open(p.dataset_stats + source_name + '.json', 'r') as stats_f:
        stats = json.load(stats_f)
    df_columns = list(stats.keys())
    schema = m.schema_set[source_name]
    metadata = m.dataset_metadata_set[source_name]['tags']
    dataset = pd.DataFrame()
    datasources[source_name] = (source_name, dataset, schema, metadata)
    print(source_name)

    # initialization schema matching: use enriched tags instead of raw tags
    tags_list = [tag['display_name'] for tag in metadata]
    with open(p.enriched_topics_json_dir, 'r') as tags_list_enriched_f:
        tags_list_enriched = json.load(tags_list_enriched_f)
    tags_list_enriched_dataset = tags_list_enriched[source_name]
    tags_list_enriched_names = list(tags_list_enriched[source_name].keys())
    # TODO: add non-overlapping homonyms to context
    attributes_list = [
        pds.clean_name(attr['name'], False, False) for attr in schema
    ]
    cols_to_delete = bmm.find_attrs_to_delete(attributes_list, df_columns)
    attributes_list = [
        item for item in attributes_list if item not in cols_to_delete
    ]
    return (tags_list_enriched_dataset, tags_list_enriched_names,
            attributes_list, schema, datasources, stats)
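# Hedged sketch, inferring behavior from the call sites: bmm.find_attrs_to_delete
# is used both to prune schema attributes missing from the stats columns (as
# above) and to drop dataframe columns via df_delete_cols (in gather_statistics),
# which suggests it returns the names present in only one of its two inputs.
# The real implementation may differ; dict entries are assumed to carry 'name'.
def find_attrs_to_delete_sketch(schema_attrs, df_columns):
    names = [a['name'] if isinstance(a, dict) else a for a in schema_attrs]
    return sorted(set(names).symmetric_difference(df_columns))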
def print_metadata_head(source_name, dataset, schema, metadata):
    print('dataset_name:', source_name)
    print('dataset_values.head \n', dataset.head())
    print('dataset_schema[0]',
          parse_dataset.clean_name(schema[0]['name'], False, False))
    print('dataset_schema[1]',
          parse_dataset.clean_name(schema[1]['name'], False, False))
    print('dataset_tags[0]', metadata[0]['display_name'])
def df_rename_cols(dataset):
    # Normalize column names in place via parse_dataset.clean_name.
    dataset_col_names = [parse_dataset.clean_name(x)
                         for x in list(dataset.columns.values)]
    col_rename_dict = {i: j for i, j in zip(list(dataset.columns.values),
                                            dataset_col_names)}
    dataset.rename(columns=col_rename_dict, inplace=True)
    return dataset
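# Usage sketch for df_rename_cols (column names are made up):
#   df = pd.DataFrame(columns=['Tree Species', 'DIAMETER '])
#   df = df_rename_cols(df)
#   # every column name is now parse_dataset.clean_name(original_name)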
datasets_path = './thesis_project_dataset_clean/'
datasources = {}
for source_name in datasources_with_tag:
    dataset = pd.read_csv(datasets_path + source_name + '.csv',
                          index_col=0, header=0)
    schema = schema_set[source_name]
    metadata = dataset_metadata_set[source_name]['tags']
    dataset = df_rename_cols(dataset)
    datasources[source_name] = (source_name, dataset, schema, metadata)
    print_metadata_head(source_name, dataset, schema, metadata)

# initialization schema matching (operates on the schema/metadata left over
# from the last loop iteration)
tags_list = [tag['display_name'] for tag in metadata]
attributes_list = [parse_dataset.clean_name(attr['name'], False, False)
                   for attr in schema]
sim_matrix = build_local_similarity_matrix(tags_list, attributes_list)
sim_frame = pd.DataFrame(data=sim_matrix, columns=attributes_list,
                         index=tags_list)
print(sim_frame.to_string())

# find the attribute that best matches the 'trees' tag
tree_matches = sim_frame.loc['trees']
attrs = list(sim_frame.columns.values)
max_score = 0
arg_max_score = None
arg_i = -1
for attr_i in range(len(attrs)):
    attr = attrs[attr_i]
    score = sim_frame.loc['trees', attr]
    if score > max_score:
        max_score = score
        arg_max_score = attr
        arg_i = attr_i
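# Hedged sketch, not the project's implementation: build_local_similarity_matrix
# is defined elsewhere. Given how it is consumed (rows indexed by tags, columns
# by attributes, scores compared with a max search), a minimal string-similarity
# version could be:
import difflib

def build_local_similarity_matrix_sketch(tags_list, attributes_list):
    return [[difflib.SequenceMatcher(None, tag, attr).ratio()
             for attr in attributes_list]
            for tag in tags_list]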
def perform_matching(p, dataset_metadata_set, schema_set, datasources_with_tag,
                     kb, params):
    comparison_count = 0
    comparison_count_o = [comparison_count]  # boxed so callees can increment it
    sim_matrices = {}
    for source_name in datasources_with_tag:
        t2 = time.time()
        dataset = pd.read_csv(p.datasets_path + source_name + '.csv',
                              index_col=0, header=0)
        dataset = bmm.df_rename_cols(dataset)
        schema = schema_set[source_name]
        metadata = dataset_metadata_set[source_name]['tags']
        schema = [{'name': pds.clean_name(attr['name'], False, False)}
                  for attr in schema]
        schema_attr_names = [attr['name'] for attr in schema]
        schema_attr_names.sort()
        for concept in kb:
            for datasource in kb[concept]['matches']:
                src_attr = kb[concept]['matches'][datasource]['attribute']
                src_vals = kb[concept]['matches'][datasource]['example_values']
                # do not match with self
                if source_name == datasource:
                    continue
                # do not match if no populated values
                if src_vals is None:
                    continue
                src_data = pd.DataFrame({src_attr: src_vals})
                print("[concept:%s, datasource:%s(%s) <=> dataset:%s]" %
                      (concept, datasource, src_attr, source_name))
                # group values for each column, count each unique value, then
                # multiply counts when a comparison succeeds
                tar_schema = list(dataset.columns.values)
                cols_to_delete = bmm.find_attrs_to_delete(schema, tar_schema)
                tar_schema = [item for item in tar_schema
                              if item not in cols_to_delete]
                attrs_stat = {}
                max_len = 0
                for attr in tar_schema:
                    # stats were precomputed and saved by gather_statistics
                    stat, uniques = bmm.get_attr_stats(p.dataset_stats,
                                                       source_name, attr)
                    uniques.sort()
                    # save for later; .get() because the rebuilt schema dicts
                    # only carry 'name' until this point
                    try:
                        arg_i = schema_attr_names.index(attr)
                        if schema[arg_i].get('domain') is None:
                            schema[arg_i]['coded_values'] = uniques
                            schema[arg_i]['domain'] = 'coded_values_groupby'
                    except ValueError:
                        pass
                    attrs_stat[attr] = (stat, uniques)
                    if len(uniques) > max_len:
                        max_len = len(uniques)
                tar_df = pd.DataFrame()
                for attr in tar_schema:
                    uniques = attrs_stat[attr][1]
                    attrs_stat[attr] = attrs_stat[attr][0]
                    attr_vals = uniques + ['None'] * (max_len - len(uniques))
                    tar_df[attr] = attr_vals
                # collect stats first, also compare data types
                src_datatype = kb[concept]['matches'][datasource]['data_type']
                attr_schema = schema_set[datasource]
                cols_to_delete = bmm.compare_datatypes(src_datatype,
                                                       attr_schema, tar_schema)
                tar_df = bmm.df_delete_cols(tar_df, cols_to_delete)
                # TODO: datatypes must match! need to move to a different matcher
                sim_matrix, confidence = bmm.match_table_by_values(
                    src_data, tar_df, params['match_threshold'],
                    comparison_count_o, attrs_stat, params['sample_ratio'],
                    params['sample_min_count'], params['sample_max_count'])
                src_names = list(src_data.columns.values)
                tar_names = list(tar_df.columns.values)
                sim_matrix2 = schm.matcher_name_matrix(src_names, tar_names)
                sim_matrix = schm.combine_scores_matrix(
                    sim_matrix, sim_matrix2, params['proportions'])
                print(sim_matrix.to_string())
                # save similarity matrices
                filename = '%s|%s|%s||%s.csv' % (concept, datasource, src_attr,
                                                 source_name)
                sim_matrices[filename] = sim_matrix
        t3 = time.time()
        print('time %s sec' % (t3 - t2))
        print('-----')
    return kb, sim_matrices
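# Companion sketch (hypothetical helper, not in the original): perform_matching
# keys each similarity matrix by '%s|%s|%s||%s.csv', so the '||' separator
# splits the (concept, datasource, src_attr) triple from the target dataset.
def parse_sim_matrix_filename(filename):
    left, target = filename[:-len('.csv')].split('||')
    concept, datasource, src_attr = left.split('|')
    return concept, datasource, src_attr, target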
def initialize_matching(p, input_topics, dataset_metadata_set, schema_set,
                        datasources_with_tag, reverse_index, kb):
    datasources = {}
    for source_name in datasources_with_tag:
        # attribute names come from the precomputed stats file, not the CSV
        with open(p.dataset_stats + source_name + '.json', 'r') as stats_f:
            stats = json.load(stats_f)
        df_columns = list(stats.keys())
        schema = schema_set[source_name]
        metadata = dataset_metadata_set[source_name]['tags']
        dataset = pd.DataFrame()
        datasources[source_name] = (source_name, dataset, schema, metadata)
        print(source_name)
        if DEBUG_MODE:  # module-level flag, defined outside this excerpt
            bmm.print_metadata_head(source_name, dataset, schema, metadata)

        # initialization schema matching
        tags_list = [tag['display_name'] for tag in metadata]
        attributes_list = [
            pds.clean_name(attr['name'], False, False) for attr in schema
        ]
        cols_to_delete = bmm.find_attrs_to_delete(attributes_list, df_columns)
        attributes_list = [
            item for item in attributes_list if item not in cols_to_delete
        ]
        sim_matrix = bmm.build_local_similarity_matrix(tags_list,
                                                       attributes_list)
        sim_frame = pd.DataFrame(data=sim_matrix, columns=attributes_list,
                                 index=tags_list)
        # TODO: during new-concepts stage, add second-best tag and so on
        attrs = list(sim_frame.columns.values)
        # if the stats file is empty there is nothing to match
        if len(attrs) == 0:
            return kb, datasources_with_tag, schema_set
        for topic in reverse_index[source_name]:
            max_score = 0
            arg_max_score = None
            arg_i = -1
            for attr_i in range(len(attrs)):
                attr = attrs[attr_i]
                score = sim_frame.loc[topic, attr]
                if score > max_score:
                    max_score = score
                    arg_max_score = attr
                    arg_i = attr_i
            arg_max_examples_vals = None
            example_value = None
            if schema[arg_i]['domain'] is not None:
                arg_max_examples_vals = schema[arg_i]['coded_values']
                arg_max_examples_vals.sort()
                if len(arg_max_examples_vals) > 0:
                    example_value = arg_max_examples_vals[0]
            else:
                # loading from stats file
                _, uniques = bmm.get_attr_stats(p.dataset_stats, source_name,
                                                attrs[arg_i])
                uniques.sort()
                schema[arg_i]['coded_values'] = uniques
                arg_max_examples_vals = schema[arg_i]['coded_values']
                if len(arg_max_examples_vals) > 0:
                    print('arg_max_examples_vals', arg_max_examples_vals[0])
                schema[arg_i]['domain'] = 'coded_values_groupby'
            print('best match:', topic, arg_max_score, max_score, example_value)
            kb_match_entry = {
                'concept': topic,
                'datasource': source_name,
                'attribute': arg_max_score,
                'match_score': max_score,
                'example_values': arg_max_examples_vals,
                'data_type': schema[arg_i]['data_type']
            }
            bmm.update_kb_json(kb, kb_match_entry)
            print('-----')
    # done initialization
    return kb, datasources_with_tag, schema_set
def create_attributes_contexts(datasets, m, p, r):
    contexts = {}
    for dataset in datasets:
        contexts[dataset] = {}
        schema = m.schema_set[dataset]
        dataset_existing_tags = m.dataset_metadata_set[dataset]['tags']
        dataset_existing_groups = m.dataset_metadata_set[dataset]['groups']
        dataset_notes = m.dataset_metadata_set[dataset]['notes']
        desc = ''
        for group in dataset_existing_groups:
            desc += ' ' + group['description']
        dataset_existing_tags = [
            tag['display_name'] for tag in dataset_existing_tags
        ]
        dataset_existing_groups = [
            group['display_name'] for group in dataset_existing_groups
        ]
        dataset_notes = [
            word for word in dataset_notes.split() if "http://" not in word
        ]
        notes = ' '.join(dataset_notes)
        with open(p.dataset_stats + dataset + '.json', 'r') as stats_f:
            stats = json.load(stats_f)
        df_columns = list(stats.keys())
        attributes_list = [
            pds.clean_name(attr['name'], False, False) for attr in schema
        ]
        cols_to_delete = bmm.find_attrs_to_delete(attributes_list, df_columns)
        attributes_list = [
            item for item in attributes_list if item not in cols_to_delete
        ]
        for attr in attributes_list:
            other_attrs = []
            attr_values = list(stats[attr].keys())
            # TODO: get average of value length; place attr values in notes
            # only when the values are sentence-like (long)
            length = 0
            if len(attr_values) > 0:
                if len(attr_values) > r.vals_truncate_sample:
                    num_to_select = r.vals_truncate_sample
                    attr_values = random.sample(attr_values, num_to_select)
                length = sum(len(val) for val in attr_values) / len(attr_values)
            if r.sentence_threshold <= length:
                notes = notes + '. ' + '. '.join(attr_values)
            else:
                other_attrs.extend(attr_values)
            pt.enrich_homonyms(dataset, attr, desc, notes, other_attrs)
    m.dataset_attributes_contexts = contexts
    return contexts
def initialize_matching(p, m, r):
    all_topics, attrs_contexts, topic_contexts = load_prematching_metadata(
        p, m, pds)
    wordnet = pt.load_dict()
    pair_dict_all = {}
    datasources = {}
    for source_name in m.datasources_with_tag:
        (tags_list_enriched_dataset, tags_list_enriched_names, attributes_list,
         schema, _, _) = load_per_source_metadata(p, m, datasources,
                                                  source_name, pds, bmm)
        score_names = m.score_names
        sim_matrix1 = build_local_similarity_matrix(tags_list_enriched_dataset,
                                                    attributes_list, r)
        # TODO: build_local_similarity_matrix using context
        if source_name not in attrs_contexts:
            print('ERROR: DATASOURCE NOT FOUND', source_name, '\n', '-----')
            continue
        attribute_contexts = attrs_contexts[source_name]
        # topic_contexts covers all datasets; attribute_contexts is per dataset
        sim_matrix2, sim_matrix3, pair_dict = build_local_context_similarity_matrix(
            topic_contexts, attribute_contexts, source_name, wordnet,
            all_topics)
        sim_frame1 = pd.DataFrame(data=sim_matrix1, columns=attributes_list,
                                  index=tags_list_enriched_names)
        sim_frame2 = pd.DataFrame(data=sim_matrix2, columns=attributes_list,
                                  index=tags_list_enriched_names)
        sim_frame3 = pd.DataFrame(data=sim_matrix3, columns=attributes_list,
                                  index=tags_list_enriched_names)
        # chance of getting external topics
        pair_dict_all.update(pair_dict)
        attrs = list(sim_frame1.columns.values)
        # if the stats file is empty there is nothing to match
        if len(attrs) == 0:
            print('ERROR: empty dataset', source_name, '\n', '-----')
            continue
        # get example values
        for attr_i in range(len(schema)):
            if 'domain' not in schema[attr_i] or schema[attr_i]['domain'] is None:
                attr_name = schema[attr_i]['name']
                attr_name = pds.clean_name(attr_name, False, False)
                # loading from stats file
                _, uniques = bmm.get_attr_stats(p.dataset_stats, source_name,
                                                attr_name)
                if uniques is None:
                    continue
                uniques.sort()
                schema[attr_i]['coded_values'] = uniques
                schema[attr_i]['domain'] = 'coded_values_groupby'
        # init kb
        build_kb_json(tags_list_enriched_names, source_name, m)
        # during new-concepts stage, add second-best tag and so on
        for attr_i in range(len(attrs)):
            scores1 = [[attr_i, attrs[attr_i],
                        sim_frame1.loc[topic, attrs[attr_i]], topic]
                       for topic in tags_list_enriched_names]
            scores2 = [[attr_i, attrs[attr_i],
                        sim_frame2.loc[topic, attrs[attr_i]], topic]
                       for topic in tags_list_enriched_names]
            scores3 = [[attr_i, attrs[attr_i],
                        sim_frame3.loc[topic, attrs[attr_i]], topic]
                       for topic in tags_list_enriched_names]
            score_len = 0
            if len(scores1) != 0:
                score_len = len(scores1[0])
            # keep the better of the name score and the context score; the
            # third matrix acts as a multiplier on the context score
            scores = []
            for i in range(len(scores1)):
                if scores1[i][2] >= scores2[i][2]:
                    scores.append([attr_i, attrs[attr_i], scores1[i][2],
                                   scores1[i][3], score_names[0]])
                else:
                    multiplier = 1.0
                    if scores3[i][2] != 0:
                        multiplier = scores3[i][2]
                    scores.append([attr_i, attrs[attr_i],
                                   min(scores2[i][2] * multiplier, 1.0),
                                   scores2[i][3], score_names[1]])
            scores = sorted(scores, key=lambda tup: tup[2], reverse=True)
            scores_examples = []
            for attr_score in scores:
                # note: schema is indexed by position in attrs here, which
                # assumes the two lists stayed aligned after pruning
                if 'coded_values' not in schema[attr_score[0]]:
                    continue
                arg_max_examples_vals = schema[attr_score[0]]['coded_values']
                arg_max_examples_vals.sort()
                scores_examples.append(
                    attr_score + [schema[attr_score[0]]['coded_values']])
            top = 0
            output = []
            for score in scores_examples:
                if len(score) <= score_len:
                    continue
                if (score[2] > r.topic_to_attr_threshold
                        and top <= r.topic_to_attr_count):
                    output.append(score)
                    top += 1
            for match in output:
                kb_match_entry = {
                    'concept': match[3],
                    'datasource': source_name,
                    'attribute': match[1],
                    'match_score': match[2],
                    'example_values': match[5],
                    'data_type': schema[match[0]]['data_type'],
                    'score_name': match[4]
                }
                update_kb_json(m.kbs[source_name], kb_match_entry)
                # for debugging: truncate the echoed example values
                kb_match_entry['example_values'] = \
                    kb_match_entry['example_values'][:min(
                        len(kb_match_entry['example_values']), 5)]
                pprint.pprint(kb_match_entry)
        print('-----')
    m.pair_dict_all = pair_dict_all
    # done initialization
    return True
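# Shape note (field names from the code above, values illustrative): each call
# to update_kb_json records one concept-to-attribute match, e.g.
#   {'concept': 'trees', 'datasource': 'street-trees', 'attribute': 'species',
#    'match_score': 0.83, 'example_values': ['ACER', 'QUERCUS'],
#    'data_type': 'text', 'score_name': m.score_names[1]}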
def initialize_matching_full(p, m, r):
    all_topics, attrs_contexts, topic_contexts = load_prematching_metadata(
        p, m, pds)
    wordnet = pt.load_dict()
    pair_dict_all = {}
    kb_curr_file = './outputs/kb_file.json'  # TODO: make configurable
    if not os.path.exists(kb_curr_file):
        with open(kb_curr_file, 'w') as fp:
            json.dump({}, fp, sort_keys=True, indent=2)
    with open(kb_curr_file, 'r') as fp:
        m.kbs = json.load(fp)
    len_all_ds = len(m.datasources_with_tag)
    datasources = {}
    for source_name in m.datasources_with_tag:
        if source_name in m.kbs:
            print('--already have local mapping ', source_name)
            continue  # already have local mapping, skip
        _, _, attributes_list, schema, _, _ = load_per_source_metadata(
            p, m, datasources, source_name, pds, bmm)
        attributes_list_orig = [
            pds.clean_name(attr['name'], False, False) for attr in schema
        ]
        score_names = m.score_names
        # 700+ topics
        sim_matrix1 = build_local_similarity_matrix(all_topics,
                                                    attributes_list, r)
        if source_name not in attrs_contexts:
            print('ERROR: DATASOURCE NOT FOUND', source_name, '\n', '---!--')
            continue
        attribute_contexts = attrs_contexts[source_name]
        # topic_contexts covers all datasets; attribute_contexts is per dataset
        sim_matrix2, pair_dict, sim_matrix3 = build_local_context_similarity_matrix_full(
            topic_contexts, attribute_contexts, source_name, wordnet,
            all_topics, p.server_ip)
        pprint.pprint(pair_dict)
        sim_frame1 = pd.DataFrame(data=sim_matrix1, columns=attributes_list,
                                  index=all_topics.keys())
        sim_frame2 = pd.DataFrame(data=sim_matrix2,
                                  columns=list(attribute_contexts.keys()),
                                  index=all_topics.keys())
        sim_frame3 = pd.DataFrame(data=sim_matrix3,
                                  columns=list(attribute_contexts.keys()),
                                  index=all_topics.keys())
        pair_dict_all.update(pair_dict)
        attrs = list(sim_frame1.columns.values)
        if len(attrs) == 0:
            print('ERROR: empty dataset', source_name, '\n', '-----')
            continue
        for attr_i in range(len(schema)):
            if ('domain' not in schema[attr_i]
                    or schema[attr_i]['domain'] is None):
                attr_name = schema[attr_i]['name']
                attr_name = pds.clean_name(attr_name, False, False)
                _, uniques = bmm.get_attr_stats(p.dataset_stats, source_name,
                                                attr_name)
                if uniques is None:
                    continue
                uniques.sort()
                schema[attr_i]['coded_values'] = uniques
                schema[attr_i]['domain'] = 'coded_values_groupby'
        # init kb
        build_kb_json(all_topics, source_name, m)
        for attr_i in range(len(attrs)):
            scores1 = [[attr_i, attrs[attr_i],
                        sim_frame1.loc[topic, attrs[attr_i]], topic, None]
                       for topic in all_topics]
            scores2 = [[attr_i, attrs[attr_i],
                        sim_frame2.loc[topic, attrs[attr_i]], topic,
                        pair_dict[(source_name, pds.clean_name(attrs[attr_i]),
                                   topic)]]
                       for topic in all_topics]
            scores3 = [[attr_i, attrs[attr_i],
                        sim_frame3.loc[topic, attrs[attr_i]], topic, None]
                       for topic in all_topics]
            score_len = 0
            if len(scores1) != 0:
                score_len = len(scores1[0])
            # combine the three scores as a weighted average; score_name
            # records which individual matcher scored highest
            weights = [0.40, 0.60, 0.0]  # TODO: train the weights
            scores = []
            for i in range(len(scores1)):
                scores_tmp = [scores1[i][2], scores2[i][2], scores3[i][2]]
                index, element = max(enumerate(scores_tmp), key=itemgetter(1))
                score_tmp = (weights[0] * scores_tmp[0] +
                             weights[1] * scores_tmp[1] +
                             weights[2] * scores_tmp[2])
                scores.append([attr_i, attrs[attr_i], score_tmp, scores2[i][3],
                               scores2[i][4], score_names[index]])
            scores = sorted(scores, key=lambda tup: tup[2], reverse=True)
            scores_examples = []
            for attr_score in scores:
                ind = attributes_list_orig.index(attr_score[1])
                if 'coded_values' not in schema[ind]:
                    continue
                arg_max_examples_vals = schema[ind]['coded_values']
                arg_max_examples_vals.sort()
                scores_examples.append(attr_score + [arg_max_examples_vals])
            top = 0
            output = []
            for score in scores_examples:
                if len(score) <= score_len:
                    continue
                if (score[2] > r.topic_to_attr_threshold
                        and top <= r.topic_to_attr_count):
                    output.append(score)
                    top += 1
            for match in output:
                # look up data_type by name, since match[0] indexes the pruned
                # attrs list rather than the full schema
                ind = attributes_list_orig.index(match[1])
                kb_match_entry = {
                    'concept': match[3],
                    'datasource': source_name,
                    'attribute': match[1],
                    'match_score': match[2],
                    'example_values': match[6],
                    'topic_source': match[5],
                    'data_type': schema[ind]['data_type'],
                    'score_name': match[4]
                }
                update_kb_json(m.kbs[source_name], kb_match_entry)
                # for debugging: truncate the echoed example values
                kb_match_entry['example_values'] = \
                    kb_match_entry['example_values'][:min(
                        len(kb_match_entry['example_values']), 5)]
                pprint.pprint(kb_match_entry)
        with open(p.schema_p, 'w') as fp:
            json.dump(m.schema_set, fp, sort_keys=True, indent=2)
        with open(kb_curr_file, 'w') as fp:
            json.dump(m.kbs, fp, sort_keys=True, indent=2)
        print('done saving kb_file', source_name)
        print('^^^ PROGRESS', len(m.kbs) / len_all_ds)
    return