def _overlap_coefficient_join_split(ltable_list, rtable_list,
                                    l_columns, r_columns,
                                    l_key_attr, r_key_attr,
                                    l_join_attr, r_join_attr,
                                    tokenizer, threshold, comp_op,
                                    allow_empty,
                                    l_out_attrs, r_out_attrs,
                                    l_out_prefix, r_out_prefix,
                                    out_sim_score, show_progress):
    """Perform overlap coefficient join for a split of ltable and rtable.

    An inverted index is built over the join attribute of ltable; every
    rtable row then probes it, and each candidate whose overlap coefficient
    (overlap / min(number of rtable tokens, cached ltable size)) satisfies
    ``comp_op`` against ``threshold`` is emitted as an output pair.

    Args:
        ltable_list: rows of the left table, each indexable by column
            position.
        rtable_list: rows of the right table, each indexable by column
            position.
        l_columns: column names of ltable rows, in positional order.
        r_columns: column names of rtable rows, in positional order.
        l_key_attr: name of the key column in ltable.
        r_key_attr: name of the key column in rtable.
        l_join_attr: name of the join column in ltable.
        r_join_attr: name of the join column in rtable.
        tokenizer: object whose ``tokenize`` method turns a join value into
            a collection of tokens.
        threshold: value the similarity score is compared against.
        comp_op: key into ``COMP_OP_MAP`` selecting the comparison function.
        allow_empty: if true, rtable rows with no tokens are paired with the
            ltable rows that also have no tokens, using a score of 1.0.
        l_out_attrs: extra ltable columns to output, or None.
        r_out_attrs: extra rtable columns to output, or None.
        l_out_prefix: prefix passed to the output header builder for ltable
            columns.
        r_out_prefix: prefix passed to the output header builder for rtable
            columns.
        out_sim_score: if true, append the score as a ``_sim_score`` column.
        show_progress: if true, display a progress bar over rtable rows.

    Returns:
        A pandas DataFrame with one row per matching (ltable, rtable) pair.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build inverted index over ltable
    inverted_index = InvertedIndex(ltable_list, l_join_attr_index,
                                   tokenizer, cache_size_flag=True)
    # While building the index, we cache the record ids with empty set of
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = inverted_index.build(allow_empty)
    l_empty_records = cached_data['empty_records']

    overlap_filter = OverlapFilter(tokenizer, 1)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]
        r_join_attr_tokens = tokenizer.tokenize(r_string)
        r_num_tokens = len(r_join_attr_tokens)

        if allow_empty and r_num_tokens == 0:
            # The current rtable record has no tokens in the join attribute:
            # pair it with the ltable records that also have no tokens.
            # Their ids were cached in l_empty_records while the inverted
            # index was being built.
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable_list[l_id], r_row,
                        l_key_attr_index, r_key_attr_index,
                        l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable_list[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
        else:
            # probe inverted index and find overlap of candidates
            candidate_overlap = overlap_filter.find_candidates(
                r_join_attr_tokens, inverted_index)

            for cand, overlap in iteritems(candidate_overlap):
                # compute the actual similarity score
                sim_score = (float(overlap) /
                             float(min(r_num_tokens,
                                       inverted_index.size_cache[cand])))

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable_list[cand], r_row,
                            l_key_attr_index, r_key_attr_index,
                            l_out_attrs_indices, r_out_attrs_indices)
                    else:
                        output_row = [ltable_list[cand][l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the overlap
                    # coefficient score to the output record.
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        # Tick once per rtable row, including rows handled by the
        # empty-token branch, so the bar (sized to len(rtable_list)) can
        # actually reach completion.
        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def sample_pairs(ltable, rtable, l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr, sample_size, y_param, seed,
                 l_out_prefix='l_', r_out_prefix='r_', show_progress=True):
    """Sample (ltable key, rtable key) pairs and append the seed pairs.

    ``ceil(sample_size / y_param)`` rtable rows are drawn at random. For
    each one, up to ``ceil(y_param / 2)`` ltable rows with the highest
    whitespace-token overlap are taken, and the set is then topped up with
    randomly chosen ltable rows until it holds ``y_param`` rows (or every
    ltable row, when ltable has fewer than ``y_param`` rows).

    Args:
        ltable: left table; converted to an array of projected rows.
        rtable: right table; converted to an array of projected rows.
        l_key_attr: name of the key column in ltable.
        r_key_attr: name of the key column in rtable.
        l_join_attr: name of the join column in ltable.
        r_join_attr: name of the join column in rtable.
        sample_size: approximate number of sampled pairs to produce.
        y_param: number of ltable rows to pair with each sampled rtable row.
        seed: table of seed pairs exposing ``itertuples``; the first two
            fields of each row are appended to the output as a pair
            (presumably a pandas DataFrame -- confirm against callers).
        l_out_prefix: prefix passed to the output header builder for the
            ltable key column.
        r_out_prefix: prefix passed to the output header builder for the
            rtable key column.
        show_progress: if true, display a progress bar over sampled rtable
            rows.

    Returns:
        A pandas DataFrame of the sampled and seed pairs, with a leading
        ``_id`` column numbering the rows from 0.

    Raises:
        ZeroDivisionError: if ``y_param`` is 0.
        ValueError: if more rtable rows need to be sampled than rtable
            contains (raised by ``random.sample``).
    """
    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(None, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(None, r_key_attr, r_join_attr)

    # convert dataframe to array for faster access
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                              l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                              r_join_attr)

    # find column indices of key attr and join attr in ltable array
    l_key_attr_index = l_proj_attrs.index(l_key_attr)
    l_join_attr_index = l_proj_attrs.index(l_join_attr)

    # find column indices of key attr and join attr in rtable array
    r_key_attr_index = r_proj_attrs.index(r_key_attr)
    r_join_attr_index = r_proj_attrs.index(r_join_attr)

    # create a whitespace tokenizer to tokenize join attributes
    ws_tok = WhitespaceTokenizer(return_set=True)

    # build inverted index on join attribute in ltable
    inverted_index = InvertedIndex(ltable_array, l_join_attr_index, ws_tok)
    inverted_index.build()

    number_of_r_tuples_to_sample = int(
        ceil(float(sample_size) / float(y_param)))
    sample_rtable_indices = random.sample(range(0, len(rtable_array)),
                                          number_of_r_tuples_to_sample)
    cand_pos_ltuples_required = int(ceil(y_param / 2.0))

    # Only ltable_size distinct row indices exist, so asking for more than
    # that would make the random top-up loop below spin forever (and an
    # empty ltable would make random.randint raise). Cap the target.
    ltable_size = len(ltable_array)
    ltuples_per_r_tuple = min(y_param, ltable_size)

    overlap_filter = OverlapFilter(ws_tok, 1)

    output_rows = []

    if show_progress:
        prog_bar = pyprind.ProgBar(number_of_r_tuples_to_sample)

    for r_idx in sample_rtable_indices:
        r_row = rtable_array[r_idx]
        r_id = r_row[r_key_attr_index]
        r_join_attr_tokens = ws_tok.tokenize(r_row[r_join_attr_index])

        # probe inverted index and find ltable candidates
        cand_overlap = overlap_filter.find_candidates(r_join_attr_tokens,
                                                      inverted_index)

        # keep the candidates with the largest overlap first
        sampled_ltuples = set()
        for cand_id, _ in sorted(cand_overlap.items(),
                                 key=operator.itemgetter(1), reverse=True):
            if len(sampled_ltuples) == cand_pos_ltuples_required:
                break
            sampled_ltuples.add(cand_id)

        # top up with random ltable rows; duplicates are absorbed by the set
        while len(sampled_ltuples) < ltuples_per_r_tuple:
            rand_idx = random.randint(0, ltable_size - 1)
            sampled_ltuples.add(rand_idx)

        for l_idx in sampled_ltuples:
            output_rows.append([ltable_array[l_idx][l_key_attr_index], r_id])

        if show_progress:
            prog_bar.update()

    for seed_pair_row in seed.itertuples(index=False):
        output_rows.append([seed_pair_row[0], seed_pair_row[1]])

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  None, None,
                                                  l_out_prefix, r_out_prefix)

    output_table = pd.DataFrame(output_rows, columns=output_header)

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    return output_table
def _overlap_coefficient_join_split(ltable_list, rtable_list,
                                    l_columns, r_columns,
                                    l_key_attr, r_key_attr,
                                    l_join_attr, r_join_attr,
                                    tokenizer, threshold, comp_op,
                                    allow_empty,
                                    l_out_attrs, r_out_attrs,
                                    l_out_prefix, r_out_prefix,
                                    out_sim_score, show_progress):
    """Perform overlap coefficient join for a split of ltable and rtable.

    An inverted index is built over the join attribute of ltable; every
    rtable row then probes it, and each candidate whose overlap coefficient
    (overlap / min(number of rtable tokens, cached ltable size)) satisfies
    ``comp_op`` against ``threshold`` is emitted as an output pair.

    Args:
        ltable_list: rows of the left table, each indexable by column
            position.
        rtable_list: rows of the right table, each indexable by column
            position.
        l_columns: column names of ltable rows, in positional order.
        r_columns: column names of rtable rows, in positional order.
        l_key_attr: name of the key column in ltable.
        r_key_attr: name of the key column in rtable.
        l_join_attr: name of the join column in ltable.
        r_join_attr: name of the join column in rtable.
        tokenizer: object whose ``tokenize`` method turns a join value into
            a collection of tokens.
        threshold: value the similarity score is compared against.
        comp_op: key into ``COMP_OP_MAP`` selecting the comparison function.
        allow_empty: if true, rtable rows with no tokens are paired with the
            ltable rows that also have no tokens, using a score of 1.0.
        l_out_attrs: extra ltable columns to output, or None.
        r_out_attrs: extra rtable columns to output, or None.
        l_out_prefix: prefix passed to the output header builder for ltable
            columns.
        r_out_prefix: prefix passed to the output header builder for rtable
            columns.
        out_sim_score: if true, append the score as a ``_sim_score`` column.
        show_progress: if true, display a progress bar over rtable rows.

    Returns:
        A pandas DataFrame with one row per matching (ltable, rtable) pair.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build inverted index over ltable
    inverted_index = InvertedIndex(ltable_list, l_join_attr_index,
                                   tokenizer, cache_size_flag=True)
    # While building the index, we cache the record ids with empty set of
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = inverted_index.build(allow_empty)
    l_empty_records = cached_data['empty_records']

    overlap_filter = OverlapFilter(tokenizer, 1)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]
        r_join_attr_tokens = tokenizer.tokenize(r_string)
        r_num_tokens = len(r_join_attr_tokens)

        if allow_empty and r_num_tokens == 0:
            # The current rtable record has no tokens in the join attribute:
            # pair it with the ltable records that also have no tokens.
            # Their ids were cached in l_empty_records while the inverted
            # index was being built.
            for l_id in l_empty_records:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable_list[l_id], r_row,
                        l_key_attr_index, r_key_attr_index,
                        l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [ltable_list[l_id][l_key_attr_index],
                                  r_row[r_key_attr_index]]

                if out_sim_score:
                    output_row.append(1.0)
                output_rows.append(output_row)
        else:
            # probe inverted index and find overlap of candidates
            candidate_overlap = overlap_filter.find_candidates(
                r_join_attr_tokens, inverted_index)

            for cand, overlap in iteritems(candidate_overlap):
                # compute the actual similarity score
                sim_score = (float(overlap) /
                             float(min(r_num_tokens,
                                       inverted_index.size_cache[cand])))

                if comp_fn(sim_score, threshold):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable_list[cand], r_row,
                            l_key_attr_index, r_key_attr_index,
                            l_out_attrs_indices, r_out_attrs_indices)
                    else:
                        output_row = [ltable_list[cand][l_key_attr_index],
                                      r_row[r_key_attr_index]]

                    # if out_sim_score flag is set, append the overlap
                    # coefficient score to the output record.
                    if out_sim_score:
                        output_row.append(sim_score)

                    output_rows.append(output_row)

        # Tick once per rtable row, including rows handled by the
        # empty-token branch, so the bar (sized to len(rtable_list)) can
        # actually reach completion.
        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def sample_pairs(ltable, rtable, l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr, sample_size, y_param, seed,
                 l_out_prefix='l_', r_out_prefix='r_', show_progress=True):
    """Sample (ltable key, rtable key) pairs and append the seed pairs.

    ``ceil(sample_size / y_param)`` rtable rows are drawn at random. For
    each one, up to ``ceil(y_param / 2)`` ltable rows with the highest
    whitespace-token overlap are taken, and the set is then topped up with
    randomly chosen ltable rows until it holds ``y_param`` rows (or every
    ltable row, when ltable has fewer than ``y_param`` rows).

    Args:
        ltable: left table; converted to an array of projected rows.
        rtable: right table; converted to an array of projected rows.
        l_key_attr: name of the key column in ltable.
        r_key_attr: name of the key column in rtable.
        l_join_attr: name of the join column in ltable.
        r_join_attr: name of the join column in rtable.
        sample_size: approximate number of sampled pairs to produce.
        y_param: number of ltable rows to pair with each sampled rtable row.
        seed: table of seed pairs exposing ``itertuples``; the first two
            fields of each row are appended to the output as a pair
            (presumably a pandas DataFrame -- confirm against callers).
        l_out_prefix: prefix passed to the output header builder for the
            ltable key column.
        r_out_prefix: prefix passed to the output header builder for the
            rtable key column.
        show_progress: if true, display a progress bar over sampled rtable
            rows.

    Returns:
        A pandas DataFrame of the sampled and seed pairs, with a leading
        ``_id`` column numbering the rows from 0.

    Raises:
        ZeroDivisionError: if ``y_param`` is 0.
        ValueError: if more rtable rows need to be sampled than rtable
            contains (raised by ``random.sample``).
    """
    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(None, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(None, r_key_attr, r_join_attr)

    # convert dataframe to array for faster access
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                              l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                              r_join_attr)

    # find column indices of key attr and join attr in ltable array
    l_key_attr_index = l_proj_attrs.index(l_key_attr)
    l_join_attr_index = l_proj_attrs.index(l_join_attr)

    # find column indices of key attr and join attr in rtable array
    r_key_attr_index = r_proj_attrs.index(r_key_attr)
    r_join_attr_index = r_proj_attrs.index(r_join_attr)

    # create a whitespace tokenizer to tokenize join attributes
    ws_tok = WhitespaceTokenizer(return_set=True)

    # build inverted index on join attribute in ltable
    inverted_index = InvertedIndex(ltable_array, l_join_attr_index, ws_tok)
    inverted_index.build()

    number_of_r_tuples_to_sample = int(
        ceil(float(sample_size) / float(y_param)))
    sample_rtable_indices = random.sample(range(0, len(rtable_array)),
                                          number_of_r_tuples_to_sample)
    cand_pos_ltuples_required = int(ceil(y_param / 2.0))

    # Only ltable_size distinct row indices exist, so asking for more than
    # that would make the random top-up loop below spin forever (and an
    # empty ltable would make random.randint raise). Cap the target.
    ltable_size = len(ltable_array)
    ltuples_per_r_tuple = min(y_param, ltable_size)

    overlap_filter = OverlapFilter(ws_tok, 1)

    output_rows = []

    if show_progress:
        prog_bar = pyprind.ProgBar(number_of_r_tuples_to_sample)

    for r_idx in sample_rtable_indices:
        r_row = rtable_array[r_idx]
        r_id = r_row[r_key_attr_index]
        r_join_attr_tokens = ws_tok.tokenize(r_row[r_join_attr_index])

        # probe inverted index and find ltable candidates
        cand_overlap = overlap_filter.find_candidates(r_join_attr_tokens,
                                                      inverted_index)

        # keep the candidates with the largest overlap first
        sampled_ltuples = set()
        for cand_id, _ in sorted(cand_overlap.items(),
                                 key=operator.itemgetter(1), reverse=True):
            if len(sampled_ltuples) == cand_pos_ltuples_required:
                break
            sampled_ltuples.add(cand_id)

        # top up with random ltable rows; duplicates are absorbed by the set
        while len(sampled_ltuples) < ltuples_per_r_tuple:
            rand_idx = random.randint(0, ltable_size - 1)
            sampled_ltuples.add(rand_idx)

        for l_idx in sampled_ltuples:
            output_rows.append([ltable_array[l_idx][l_key_attr_index], r_id])

        if show_progress:
            prog_bar.update()

    for seed_pair_row in seed.itertuples(index=False):
        output_rows.append([seed_pair_row[0], seed_pair_row[1]])

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  None, None,
                                                  l_out_prefix, r_out_prefix)

    output_table = pd.DataFrame(output_rows, columns=output_header)

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    return output_table