def filter_pair(self, lstring, rstring):
    """Filter two strings with prefix filter.

    Args:
        lstring, rstring : input strings

    Returns:
        result : boolean, True if the tuple pair is dropped.
    """
    # an empty string on either side can never match; drop immediately
    if not lstring or not rstring:
        return True

    l_tokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
    r_tokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

    # impose a global token ordering over both token lists
    ordering = gen_token_ordering_for_lists([l_tokens, r_tokens])
    l_ordered = order_using_token_ordering(l_tokens, ordering)
    r_ordered = order_using_token_ordering(r_tokens, ordering)

    l_prefix_len = get_prefix_length(len(l_ordered), self.sim_measure_type,
                                     self.threshold, self.tokenizer)
    r_prefix_len = get_prefix_length(len(r_ordered), self.sim_measure_type,
                                     self.threshold, self.tokenizer)

    # the pair survives (False) only when the two prefixes share a token
    shared = set(l_ordered[:l_prefix_len]) & set(r_ordered[:r_prefix_len])
    return len(shared) == 0
def filter_pair(self, lstring, rstring):
    """Filter two strings with size filter.

    Args:
        lstring, rstring : input strings

    Returns:
        result : boolean, True if the tuple pair is dropped.
    """
    # a pair with an empty string is always dropped
    if not lstring or not rstring:
        return True

    left_size = len(tokenize(lstring, self.tokenizer, self.sim_measure_type))
    right_size = len(tokenize(rstring, self.tokenizer, self.sim_measure_type))

    # admissible token-count window derived from the left string's size
    lower = get_size_lower_bound(left_size, self.sim_measure_type,
                                 self.threshold)
    upper = get_size_upper_bound(left_size, self.sim_measure_type,
                                 self.threshold)

    # keep the pair only when the right size falls inside [lower, upper]
    return not (lower <= right_size <= upper)
def filter_pair(self, lstring, rstring):
    """Filter two strings with suffix filter.

    Args:
        lstring, rstring : input strings

    Returns:
        result : boolean, True if the tuple pair is dropped.
    """
    # empty strings are dropped outright
    if not lstring or not rstring:
        return True

    l_tokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
    r_tokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

    # order both token lists by a shared global token ordering
    ordering = gen_token_ordering_for_lists([l_tokens, r_tokens])
    l_ordered = order_using_token_ordering(l_tokens, ordering)
    r_ordered = order_using_token_ordering(r_tokens, ordering)

    l_prefix_len = get_prefix_length(len(l_ordered), self.sim_measure_type,
                                     self.threshold, self.tokenizer)
    r_prefix_len = get_prefix_length(len(r_ordered), self.sim_measure_type,
                                     self.threshold, self.tokenizer)

    # delegate the decision to the suffix comparison over the
    # tokens that come after each prefix
    return self._filter_suffix(l_ordered[l_prefix_len:],
                               r_ordered[r_prefix_len:],
                               l_prefix_len, r_prefix_len,
                               len(l_tokens), len(r_tokens))
def filter_pair(self, lstring, rstring):
    """Filter two strings with position filter.

    Args:
        lstring, rstring : input strings

    Returns:
        result : boolean, True if the tuple pair is dropped.
    """
    # check for empty string
    if (not lstring) or (not rstring):
        return True

    ltokens = tokenize(lstring, self.tokenizer, self.sim_measure_type)
    rtokens = tokenize(rstring, self.tokenizer, self.sim_measure_type)

    token_ordering = gen_token_ordering_for_lists([ltokens, rtokens])
    ordered_ltokens = order_using_token_ordering(ltokens, token_ordering)
    ordered_rtokens = order_using_token_ordering(rtokens, token_ordering)

    l_num_tokens = len(ordered_ltokens)
    r_num_tokens = len(ordered_rtokens)

    l_prefix_length = get_prefix_length(l_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)
    r_prefix_length = get_prefix_length(r_num_tokens,
                                        self.sim_measure_type,
                                        self.threshold,
                                        self.tokenizer)

    # Map each token in the left prefix to its position.
    # BUG FIX: the original never incremented the position counter,
    # so every left-prefix token was recorded at position 0, which
    # inflated/corrupted the overlap upper-bound computation below.
    l_prefix_dict = {}
    for l_pos, token in enumerate(ordered_ltokens[0:l_prefix_length]):
        l_prefix_dict[token] = l_pos

    overlap_threshold = get_overlap_threshold(l_num_tokens, r_num_tokens,
                                              self.sim_measure_type,
                                              self.threshold,
                                              self.tokenizer)
    current_overlap = 0
    r_pos = 0
    for token in ordered_rtokens[0:r_prefix_length]:
        l_pos = l_prefix_dict.get(token)
        if l_pos is not None:
            # best-case overlap still achievable from these positions
            overlap_upper_bound = 1 + min(l_num_tokens - l_pos - 1,
                                          r_num_tokens - r_pos - 1)
            # prune the pair if even the best case cannot reach the threshold
            if (current_overlap + overlap_upper_bound) < overlap_threshold:
                return True
            current_overlap += 1
        r_pos += 1

    # survive only if the prefixes overlapped at least once
    if current_overlap > 0:
        return False
    return True
def build(self):
    """Build the prefix index over the rows of the table.

    Indexes every token in each row's prefix together with the token's
    position, and records each row's token count in the size map.
    """
    for row in self.table:
        attr_value = str(row[self.index_attr])
        # skip rows whose index attribute is empty
        if not attr_value:
            continue
        tokens = order_using_token_ordering(
            tokenize(attr_value, self.tokenizer, self.sim_measure_type),
            self.token_ordering)
        token_count = len(tokens)
        prefix_len = get_prefix_length(token_count,
                                       self.sim_measure_type,
                                       self.threshold,
                                       self.tokenizer)
        rid = row[self.key_attr]
        # record (row id, position) for every token in the prefix
        for position, token in enumerate(tokens[:prefix_len]):
            self.index.setdefault(token, []).append((rid, position))
        self.size_map[rid] = token_count
    return True
def filter_pair(self, lstring, rstring):
    """Filter two strings with overlap filter.

    Args:
        lstring, rstring : input strings

    Returns:
        result : boolean, True if the tuple pair is dropped.
    """
    # empty strings never satisfy the overlap requirement
    if not lstring or not rstring:
        return True
    shared = overlap(tokenize(lstring, self.tokenizer),
                     tokenize(rstring, self.tokenizer))
    # drop the pair when the token overlap is below the configured size
    return shared < self.overlap_size
def build(self):
    """Build an inverted index mapping each token to the row ids containing it."""
    for row in self.table:
        attr_value = str(row[self.index_attr])
        # skip rows whose index attribute is empty
        if not attr_value:
            continue
        rid = row[self.key_attr]
        for token in tokenize(attr_value, self.tokenizer):
            self.index.setdefault(token, []).append(rid)
    return True
def build(self):
    """Build a size index mapping token counts to row ids.

    Also tracks the minimum and maximum token counts seen, in
    self.min_length / self.max_length.
    """
    for row in self.table:
        attr_value = str(row[self.index_attr])
        # skip rows whose index attribute is empty
        if not attr_value:
            continue
        token_count = len(tokenize(attr_value, self.tokenizer))
        self.index.setdefault(token_count, []).append(row[self.key_attr])
        # maintain the observed range of token counts
        self.min_length = min(self.min_length, token_count)
        self.max_length = max(self.max_length, token_count)
    return True
def gen_token_ordering_for_tables(table_list, attr_list, tokenizer,
                                  sim_measure_type='OVERLAP'):
    """Generate a global token ordering over the given tables.

    Tokens are ranked 1..n by ascending frequency; ties are broken by
    the token value itself (the frequency sort is stable and runs over
    a token-sorted list).

    Args:
        table_list : list of tables (iterables of rows)
        attr_list : per-table index of the attribute to tokenize
        tokenizer : tokenizer to apply to each attribute value
        sim_measure_type : similarity measure the tokens will be used for

    Returns:
        dict mapping token -> rank.
    """
    freq = {}
    for table_index, table in enumerate(table_list):
        attr_index = attr_list[table_index]
        for row in table:
            tokens = tokenize(str(row[attr_index]), tokenizer,
                              sim_measure_type)
            for token in tokens:
                freq[token] = freq.get(token, 0) + 1
    # sort by token first so the subsequent stable sort by frequency
    # breaks frequency ties deterministically by token value
    by_token = sorted(freq.items(), key=itemgetter(0))
    ordering = {}
    rank = 1
    for token, _count in sorted(by_token, key=itemgetter(1)):
        ordering[token] = rank
        rank += 1
    return ordering
def _filter_tables_split(ltable, rtable, l_key_attr, r_key_attr, l_filter_attr, r_filter_attr, size_filter, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix):
    """Apply the size filter to one split of (ltable, rtable).

    Builds a size index over ltable's filter attribute, then for each
    rtable row probes the index with the admissible token-count window
    and emits one output row per surviving (l, r) candidate pair.
    Returns a pandas DataFrame of the surviving pairs.
    """
    # find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = []
    # NOTE(review): the empty-list assignment above is immediately overwritten
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)
    # find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)
    # build a dictionary on ltable (key attr value -> row)
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)
    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)
    # Build size index over ltable
    size_index = SizeIndex(ltable_dict.values(),
                           l_key_attr_index, l_filter_attr_index,
                           size_filter.tokenizer)
    size_index.build()
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable))
    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_filter_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_num_tokens = len(tokenize(r_string,
                                    size_filter.tokenizer,
                                    size_filter.sim_measure_type))
        # token-count window [lower, upper] implied by the threshold
        size_lower_bound = get_size_lower_bound(r_num_tokens,
                                                size_filter.sim_measure_type,
                                                size_filter.threshold)
        size_upper_bound = get_size_upper_bound(r_num_tokens,
                                                size_filter.sim_measure_type,
                                                size_filter.threshold)
        # clamp the window to the sizes actually present in the index
        size_lower_bound = (size_index.min_length if
                            size_lower_bound < size_index.min_length else
                            size_lower_bound)
        size_upper_bound = (size_index.max_length if
                            size_upper_bound > size_index.max_length else
                            size_upper_bound)
        # probe size index and find candidates
        candidates = _find_candidates(size_lower_bound, size_upper_bound,
                                      size_index)
        for cand in candidates:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                                 ltable_dict[cand], r_row,
                                 cand, r_id,
                                 l_out_attrs_indices, r_out_attrs_indices)
                output_rows.append(output_row)
            else:
                output_rows.append([cand, r_id])
        prog_bar.update()
    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _filter_tables_split(ltable, rtable, l_key_attr, r_key_attr, l_filter_attr, r_filter_attr, prefix_filter, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix):
    """Apply the prefix filter to one split of (ltable, rtable).

    Builds a prefix index over ltable's filter attribute using a global
    token ordering derived from both tables, then probes the index with
    each rtable row's ordered tokens and emits one output row per
    surviving (l, r) candidate pair. Returns a pandas DataFrame.
    """
    # find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = []
    # NOTE(review): the empty-list assignment above is immediately overwritten
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)
    # find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)
    # build a dictionary on ltable (key attr value -> row)
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)
    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)
    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(), rtable_dict.values()],
                         [l_filter_attr_index, r_filter_attr_index],
                         prefix_filter.tokenizer,
                         prefix_filter.sim_measure_type)
    # Build prefix index on l_filter_attr
    prefix_index = PrefixIndex(ltable_dict.values(),
                               l_key_attr_index, l_filter_attr_index,
                               prefix_filter.tokenizer,
                               prefix_filter.sim_measure_type,
                               prefix_filter.threshold, token_ordering)
    prefix_index.build()
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable))
    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_filter_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_filter_attr_tokens = tokenize(r_string,
                                        prefix_filter.tokenizer,
                                        prefix_filter.sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_filter_attr_tokens,
                                                      token_ordering)
        # probe prefix index and find candidates
        candidates = _find_candidates(r_ordered_tokens,
                                      len(r_ordered_tokens),
                                      prefix_filter, prefix_index)
        for cand in candidates:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                                 ltable_dict[cand], r_row,
                                 cand, r_id,
                                 l_out_attrs_indices, r_out_attrs_indices)
                output_rows.append(output_row)
            else:
                output_rows.append([cand, r_id])
        prog_bar.update()
    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _set_sim_join_split(ltable, rtable, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, sim_measure_type, threshold, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score):
    """Perform set similarity join for a split of ltable and rtable.

    Pipeline: position-index ltable's join attribute under a global
    token ordering, probe it with each rtable row (position filter),
    prune survivors with the suffix filter, then verify the actual
    similarity score against the threshold. Returns a pandas DataFrame
    of qualifying pairs, optionally with a _sim_score column.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)
    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)
    # build a dictionary on ltable (key attr value -> row)
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)
    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)
    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(), rtable_dict.values()],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)
    # build a dictionary of tokenized l_join_attr (key -> ordered tokens)
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = order_using_token_ordering(
            tokenize(str(row[l_join_attr_index]), tokenizer,
                     sim_measure_type), token_ordering)
    # Build position index on l_join_attr
    position_index = PositionIndex(ltable_dict.values(),
                                   l_key_attr_index, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    position_index.build()
    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))
    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        r_num_tokens = len(r_ordered_tokens)
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            sim_measure_type,
                                            threshold, tokenizer)
        # probe the position index: candidates surviving the position filter
        candidate_overlap = find_candidates_position_filter(
                                r_ordered_tokens, r_num_tokens,
                                r_prefix_length, pos_filter, position_index)
        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = l_join_attr_dict[cand]
                l_num_tokens = position_index.get_size(cand)
                l_prefix_length = get_prefix_length(l_num_tokens,
                                                    sim_measure_type,
                                                    threshold,
                                                    tokenizer)
                # prune with the suffix filter before the (more expensive)
                # exact similarity computation
                if not suffix_filter._filter_suffix(
                           l_ordered_tokens[l_prefix_length:],
                           r_ordered_tokens[r_prefix_length:],
                           l_prefix_length, r_prefix_length,
                           l_num_tokens, r_num_tokens):
                    sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)
                    if sim_score >= threshold:
                        if has_output_attributes:
                            output_row = get_output_row_from_tables(
                                             ltable_dict[cand], r_row,
                                             cand, r_id,
                                             l_out_attrs_indices,
                                             r_out_attrs_indices)
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
                        else:
                            output_row = [cand, r_id]
                            if out_sim_score:
                                output_row.append(sim_score)
                            output_rows.append(output_row)
        prog_bar.update()
    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")
    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _edit_dist_join_split(ltable, rtable, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score):
    """Perform edit-distance join for a split of ltable and rtable.

    Prefix-indexes ltable's join attribute (EDIT_DISTANCE measure),
    probes with each rtable row, length-filters candidates, then
    verifies the exact edit distance against the threshold. Returns a
    pandas DataFrame of qualifying pairs, optionally with _sim_score.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns,
                                                        l_out_attrs)
    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns,
                                                        r_out_attrs)
    # build a dictionary on ltable (key attr value -> row)
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)
    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)
    sim_measure_type = 'EDIT_DISTANCE'
    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(), rtable_dict.values()],
                         [l_join_attr_index, r_join_attr_index],
                         tokenizer, sim_measure_type)
    # build a dictionary of l_join_attr lengths (key -> string length),
    # used below for the length-difference pruning
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = len(str(
            row[l_join_attr_index]))
    # Build prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_dict.values(),
                               l_key_attr_index, l_join_attr_index,
                               tokenizer, sim_measure_type,
                               threshold, token_ordering)
    prefix_index.build()
    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict.keys()))
    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        r_len = len(r_string)
        # check for empty string
        if not r_string:
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        # probe the prefix index for candidates
        candidates = find_candidates_prefix_filter(
                         r_ordered_tokens, len(r_ordered_tokens),
                         prefix_filter, prefix_index)
        for cand in candidates:
            # length-difference pruning: edit distance is at least the
            # difference in string lengths
            if r_len - threshold <= l_join_attr_dict[cand] <= r_len + threshold:
                edit_dist = sim_fn(str(ltable_dict[cand][l_join_attr_index]),
                                   r_string)
                if edit_dist <= threshold:
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                                         ltable_dict[cand], r_row,
                                         cand, r_id,
                                         l_out_attrs_indices,
                                         r_out_attrs_indices)
                        if out_sim_score:
                            output_row.append(edit_dist)
                        output_rows.append(output_row)
                    else:
                        output_row = [cand, r_id]
                        if out_sim_score:
                            output_row.append(edit_dist)
                        output_rows.append(output_row)
        prog_bar.update()
    output_header = get_output_header_from_tables(
                        l_key_attr, r_key_attr,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")
    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def _filter_tables_split(ltable, rtable, l_key_attr, r_key_attr, l_filter_attr, r_filter_attr, overlap_filter, l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score):
    """Apply the overlap filter to one split of (ltable, rtable).

    Builds an inverted index over ltable's filter attribute, probes it
    with each rtable row's tokens, and keeps pairs whose token overlap
    meets the filter's overlap_size. Returns a pandas DataFrame,
    optionally with the overlap appended as _sim_score.
    """
    # Find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = []
    # NOTE(review): the empty-list assignment above is immediately overwritten
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)
    # Find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)
    # Build a dictionary on ltable (key attr value -> row)
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)
    # Build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)
    # Build inverted index over ltable
    inverted_index = InvertedIndex(ltable_dict.values(),
                                   l_key_attr_index, l_filter_attr_index,
                                   overlap_filter.tokenizer)
    inverted_index.build()
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable))
    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_filter_attr_index])
        # check for empty string
        if not r_string:
            continue
        r_filter_attr_tokens = tokenize(r_string, overlap_filter.tokenizer)
        # probe inverted index and find overlap of candidates
        candidate_overlap = _find_candidates(r_filter_attr_tokens,
                                             inverted_index)
        for cand, overlap in iteritems(candidate_overlap):
            # keep the pair only if the overlap reaches the required size
            if overlap >= overlap_filter.overlap_size:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable_dict[cand], r_row,
                                     cand, r_id,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                    if out_sim_score:
                        output_row.append(overlap)
                    output_rows.append(output_row)
                else:
                    output_row = [cand, r_id]
                    if out_sim_score:
                        output_row.append(overlap)
                    output_rows.append(output_row)
        prog_bar.update()
    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
def test_valid_join(scenario, sim_measure_type, args):
    """End-to-end check of a join function against a brute-force baseline.

    Computes the expected pairs by applying the similarity function over
    the full cartesian product, runs the join under test, and asserts
    that both the output schema and the output pairs match.

    Args:
        scenario : ((ltable_path, l_key_attr, l_join_attr),
                    (rtable_path, r_key_attr, r_join_attr))
        sim_measure_type : key into JOIN_FN_MAP selecting the join function
        args : positional args forwarded to the join function
               (tokenizer, threshold, l_out_attrs, r_out_attrs,
                l_out_prefix, r_out_prefix, out_sim_score)
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]
    join_fn = JOIN_FN_MAP[sim_measure_type]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    # generate cartesian product to be used as candset
    ltable['tmp_join_key'] = 1
    rtable['tmp_join_key'] = 1
    cartprod = pd.merge(ltable[[l_key_attr, l_join_attr, 'tmp_join_key']],
                        rtable[[r_key_attr, r_join_attr, 'tmp_join_key']],
                        on='tmp_join_key').drop('tmp_join_key', 1)
    # BUG FIX: drop() is not in-place — the original discarded the
    # result, leaving the temporary join key column in both tables
    # when they were passed to the join function below.
    ltable.drop('tmp_join_key', axis=1, inplace=True)
    rtable.drop('tmp_join_key', axis=1, inplace=True)

    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(lambda row: sim_func(
        tokenize(str(row[l_join_attr]), args[0], sim_measure_type),
        tokenize(str(row[r_join_attr]), args[0], sim_measure_type)),
        axis=1)

    expected_pairs = set()
    for idx, row in cartprod.iterrows():
        if float(row['sim_score']) >= args[1]:
            expected_pairs.add(','.join((str(row[l_key_attr]),
                                         str(row[r_key_attr]))))

    # use join function to obtain actual output pairs.
    actual_candset = join_fn(ltable, rtable, l_key_attr, r_key_attr,
                             l_join_attr, r_join_attr, *args)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 4:
        l_out_prefix = args[4]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 2:
        if args[2]:
            for attr in args[2]:
                expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_prefix in args.
    if len(args) > 5:
        r_out_prefix = args[5]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for r_out_attrs in args.
    if len(args) > 3:
        if args[3]:
            for attr in args[3]:
                expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args; it defaults to True when absent.
    if len(args) > 6:
        if args[6]:
            expected_output_attrs.append('_sim_score')
    else:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
def filter_tables(self, ltable, rtable, l_key_attr, r_key_attr, l_filter_attr, r_filter_attr, l_out_attrs=None, r_out_attrs=None, l_out_prefix='l_', r_out_prefix='r_'):
    """Filter tables with suffix filter.

    Args:
        ltable, rtable : Pandas data frame
        l_key_attr, r_key_attr : String, key attribute from ltable and rtable
        l_filter_attr, r_filter_attr : String, filter attribute from ltable
            and rtable
        l_out_attrs, r_out_attrs : list of attributes to be included in the
            output table from ltable and rtable
        l_out_prefix, r_out_prefix : String, prefix to be used in the
            attribute names of the output table

    Returns:
        result : Pandas data frame
    """
    # check if the input tables are dataframes
    validate_input_table(ltable, 'left table')
    validate_input_table(rtable, 'right table')
    # check if the key attributes and filter attributes exist
    validate_attr(l_key_attr, ltable.columns,
                  'key attribute', 'left table')
    validate_attr(r_key_attr, rtable.columns,
                  'key attribute', 'right table')
    validate_attr(l_filter_attr, ltable.columns,
                  'filter attribute', 'left table')
    validate_attr(r_filter_attr, rtable.columns,
                  'filter attribute', 'right table')
    # check if the output attributes exist
    validate_output_attrs(l_out_attrs, ltable.columns,
                          r_out_attrs, rtable.columns)
    # check if the key attributes are unique and do not contain
    # missing values
    validate_key_attr(l_key_attr, ltable, 'left table')
    validate_key_attr(r_key_attr, rtable, 'right table')
    # find column indices of key attr, filter attr and
    # output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(
                              l_columns, l_out_attrs)
    # find column indices of key attr, filter attr and
    # output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(
                              r_columns, r_out_attrs)
    # build a dictionary on ltable (key attr value -> row)
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)
    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)
    # generate token ordering using tokens in l_filter_attr
    # and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
                         [ltable_dict.values(), rtable_dict.values()],
                         [l_filter_attr_index, r_filter_attr_index],
                         self.tokenizer, self.sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(ltable))
    # nested-loop comparison: every ltable row against every rtable row,
    # pruned only by the suffix filter
    for l_row in ltable_dict.values():
        l_id = l_row[l_key_attr_index]
        l_string = str(l_row[l_filter_attr_index])
        # check for empty string
        if not l_string:
            continue
        ltokens = tokenize(l_string, self.tokenizer, self.sim_measure_type)
        ordered_ltokens = order_using_token_ordering(ltokens,
                                                     token_ordering)
        l_num_tokens = len(ordered_ltokens)
        l_prefix_length = get_prefix_length(l_num_tokens,
                                            self.sim_measure_type,
                                            self.threshold,
                                            self.tokenizer)
        l_suffix = ordered_ltokens[l_prefix_length:]
        for r_row in rtable_dict.values():
            r_id = r_row[r_key_attr_index]
            r_string = str(r_row[r_filter_attr_index])
            # check for empty string
            if not r_string:
                continue
            rtokens = tokenize(r_string, self.tokenizer,
                               self.sim_measure_type)
            ordered_rtokens = order_using_token_ordering(rtokens,
                                                         token_ordering)
            r_num_tokens = len(ordered_rtokens)
            r_prefix_length = get_prefix_length(r_num_tokens,
                                                self.sim_measure_type,
                                                self.threshold,
                                                self.tokenizer)
            # keep the pair only if the suffix filter does not drop it
            if not self._filter_suffix(
                       l_suffix, ordered_rtokens[r_prefix_length:],
                       l_prefix_length, r_prefix_length,
                       l_num_tokens, r_num_tokens):
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                                     ltable_dict[l_id], r_row,
                                     l_id, r_id,
                                     l_out_attrs_indices,
                                     r_out_attrs_indices)
                    output_rows.append(output_row)
                else:
                    output_rows.append([l_id, r_id])
        prog_bar.update()
    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    output_table.insert(0, '_id', range(0, len(output_table)))
    return output_table