def get_features(sim_measures=None, tokenizers=None):
    features = []
    ws_tok = WhitespaceTokenizer(return_set=True)
    if sim_measures is None:
        sim_measures = ['JACCARD', 'COSINE', 'DICE',
                        # 'LEFT_LENGTH', 'RIGHT_LENGTH', 'LENGTH_SUM', 'LENGTH_DIFF']
                        'OVERLAP_COEFFICIENT', 'EDIT_DISTANCE',
                        'LEFT_LENGTH', 'RIGHT_LENGTH', 'LENGTH_SUM', 'LENGTH_DIFF']
    if tokenizers is None:
        tokenizers = {'alph': AlphabeticTokenizer(return_set=True),
                      'alph_num': AlphanumericTokenizer(return_set=True),
                      'num': NumericTokenizer(return_set=True),
                      'ws': WhitespaceTokenizer(return_set=True),
                      'qg2': QgramTokenizer(qval=2, return_set=True),
                      'qg3': QgramTokenizer(qval=3, return_set=True)}

    for sim_measure_type in sim_measures:
        if sim_measure_type in ['EDIT_DISTANCE', 'LEFT_LENGTH', 'RIGHT_LENGTH',
                                'LENGTH_SUM', 'LENGTH_DIFF']:
            features.append((sim_measure_type.lower(), 'none', sim_measure_type,
                             None, get_sim_function(sim_measure_type)))
            continue
        for tok_name in tokenizers.keys():
            # if sim_measure_type == 'COSINE' and tok_name == 'qg3':
            #     continue
            features.append((sim_measure_type.lower() + '_' + tok_name,
                             tok_name, sim_measure_type,
                             tokenizers[tok_name],
                             get_sim_function(sim_measure_type)))

    feature_table_header = ['feature_name', 'tokenizer_type', 'sim_measure_type',
                            'tokenizer', 'sim_function']
    feature_table = pd.DataFrame(features, columns=feature_table_header)
    feature_table = feature_table.set_index('feature_name')
    return feature_table
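
# Illustrative sketch, not part of the module: how a row of the feature table
# built by get_features() above might be applied to one pair of attribute
# values. It assumes (this is an assumption, not confirmed by the source) that
# set-based measures take two token lists while measures with no tokenizer
# (EDIT_DISTANCE, *_LENGTH, ...) take the raw strings. `apply_feature` is a
# hypothetical helper name.
def apply_feature(feature_row, l_val, r_val):
    tokenizer = feature_row['tokenizer']
    sim_fn = feature_row['sim_function']
    if tokenizer is None:
        # measures registered with tokenizer=None work on the raw strings
        return sim_fn(l_val, r_val)
    # token-based measures compare the two token sets/bags
    return sim_fn(tokenizer.tokenize(l_val), tokenizer.tokenize(r_val))

# Hypothetical usage:
#   feature_table = get_features()
#   score = apply_feature(feature_table.loc['jaccard_ws'],
#                         'acme corp', 'acme corporation')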
def apply_filterable_rule(self, rule_name, l_df, r_df, l_key, r_key,
                          l_output_attrs, r_output_attrs, l_output_prefix,
                          r_output_prefix, verbose, show_progress, n_chunks):
    candset = None
    conjunct_list = self.rule_str[rule_name]

    for conjunct in conjunct_list:
        is_auto_gen, sim_fn, l_attr, r_attr, l_tok, r_tok, op, th = parse_conjunct(
            conjunct, self.rule_ft[rule_name])

        if l_tok == 'dlm_dc0':
            tokenizer = WhitespaceTokenizer(return_set=True)
        elif l_tok == 'qgm_3':
            tokenizer = QgramTokenizer(qval=3, return_set=True)

        if sim_fn == 'jaccard':
            join_fn = ssj.jaccard_join
        elif sim_fn == 'cosine':
            join_fn = ssj.cosine_join
        elif sim_fn == 'dice':
            join_fn = ssj.dice_join
        elif sim_fn == 'overlap_coeff':
            join_fn = ssj.overlap_coefficient_join
        elif sim_fn == 'lev_dist':
            join_fn = ssj.edit_distance_join

        if join_fn == ssj.edit_distance_join:
            comp_op = '<='
            if op == '>=':
                comp_op = '<'
        else:
            comp_op = '>='
            if op == '<=':
                comp_op = '>'

        ssj.dataframe_column_to_str(l_df, l_attr, inplace=True)
        ssj.dataframe_column_to_str(r_df, r_attr, inplace=True)

        if join_fn == ssj.edit_distance_join:
            c_df = join_fn(l_df, r_df, l_key, r_key, l_attr, r_attr, float(th),
                           comp_op, True, l_output_attrs, r_output_attrs,
                           l_output_prefix, r_output_prefix, False, n_chunks,
                           show_progress)
        else:
            c_df = join_fn(l_df, r_df, l_key, r_key, l_attr, r_attr, tokenizer,
                           float(th), comp_op, True, True, l_output_attrs,
                           r_output_attrs, l_output_prefix, r_output_prefix,
                           False, n_chunks, show_progress)

        if candset is not None:
            # union the candset of this conjunct with the existing candset
            candset = pd.concat([candset, c_df]).drop_duplicates(
                [l_output_prefix + l_key,
                 r_output_prefix + r_key]).reset_index(drop=True)
        else:
            # candset from the first conjunct of the rule
            candset = c_df

    return candset
def block_tuples(self, ltuple, rtuple, l_overlap_attr, r_overlap_attr,
                 rem_stop_words=False, q_val=None, word_level=True,
                 overlap_size=1, allow_missing=False):
    """Blocks a tuple pair based on the overlap of token sets of attribute
    values.

    Args:
        ltuple (Series): The input left tuple.
        rtuple (Series): The input right tuple.
        l_overlap_attr (string): The overlap attribute in left tuple.
        r_overlap_attr (string): The overlap attribute in right tuple.
        rem_stop_words (boolean): A flag to indicate whether stop words
         (e.g., a, an, the) should be removed from the token sets of the
         overlap attribute values (defaults to False).
        q_val (int): A value of q to use if the overlap attributes values
         are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
         attributes should be tokenized as words (i.e., using whitespace as
         delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must overlap
         (defaults to 1).
        allow_missing (boolean): A flag to indicate whether a tuple pair
         with missing value in at least one of the blocking attributes
         should be blocked (defaults to False). If this flag is set to True,
         the pair will be kept if either ltuple has missing value in
         l_block_attr or rtuple has missing value in r_block_attr or both.

    Returns:
        A status indicating if the tuple pair is blocked (boolean).

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = em.OverlapBlocker()
        >>> status = ob.block_tuples(A.ix[0], B.ix[0], 'address', 'address')

    """
    # validate data types of input parameters specific to overlap blocker
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val, word_level,
                                     overlap_size)

    # validate word_level and q_val
    self.validate_word_level_qval(word_level, q_val)

    # determine which tokenizer to use
    if word_level == True:
        # # create a whitespace tokenizer
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        # # create a qgram tokenizer
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    # # cleanup the tuples from non-ascii characters, punctuations, and stop words
    l_val = self.cleanup_tuple_val(ltuple[l_overlap_attr], rem_stop_words)
    r_val = self.cleanup_tuple_val(rtuple[r_overlap_attr], rem_stop_words)

    # create a filter for overlap similarity
    overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                   allow_missing=allow_missing)

    return overlap_filter.filter_pair(l_val, r_val)
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                 rem_stop_words=False, q_val=None, word_level=True,
                 overlap_size=1, l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 allow_missing=False, verbose=False, show_progress=True,
                 n_jobs=1):
    """Blocks two tables based on the overlap of token sets of attribute
    values.

    Finds tuple pairs from left and right tables such that the overlap
    between (a) the set of tokens obtained by tokenizing the value of
    attribute l_overlap_attr of a tuple from the left table, and (b) the set
    of tokens obtained by tokenizing the value of attribute r_overlap_attr
    of a tuple from the right table, is above a certain threshold.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
         (e.g., a, an, the) should be removed from the token sets of the
         overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes values
         are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
         attributes should be tokenized as words (i.e., using whitespace as
         delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must overlap
         (defaults to 1).
        l_output_attrs (list): A list of attribute names from the left table
         to be included in the output candidate set (defaults to None).
        r_output_attrs (list): A list of attribute names from the right table
         to be included in the output candidate set (defaults to None).
        l_output_prefix (string): The prefix to be used for the attribute
         names coming from the left table in the output candidate set
         (defaults to 'ltable\_').
        r_output_prefix (string): The prefix to be used for the attribute
         names coming from the right table in the output candidate set
         (defaults to 'rtable\_').
        allow_missing (boolean): A flag to indicate whether tuple pairs with
         missing value in at least one of the blocking attributes should be
         included in the output candidate set (defaults to False). If this
         flag is set to True, a tuple in ltable with missing value in the
         blocking attribute will be matched with every tuple in rtable and
         vice versa.
        verbose (boolean): A flag to indicate whether the debug information
         should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
         be displayed to the user (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for computation
         (defaults to 1). If -1 is given, all CPUs are used. If 0 or 1 is
         given, no parallel computation is used at all, which is useful for
         debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used
         (where n_cpus is the total number of CPUs in the machine). Thus,
         for n_jobs = -2, all CPUs but one are used. If
         (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation
         is used (i.e., equivalent to the default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `l_output_attrs` is not of type of list.
        AssertionError: If `r_output_attrs` is not of type of list.
        AssertionError: If the values in `l_output_attrs` is not of type
            string.
        AssertionError: If the values in `r_output_attrs` is not of type
            string.
        AssertionError: If `l_output_prefix` is not of type string.
        AssertionError: If `r_output_prefix` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.
        AssertionError: If `l_output_attrs` are not in the ltable.
        AssertionError: If `r_output_attrs` are not in the rtable.
        SyntaxError: If `q_val` is set to a valid value and `word_level` is
            set to True.
        SyntaxError: If `q_val` is set to None and `word_level` is set to
            False.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = em.OverlapBlocker()
        # Use word-level tokenizer
        >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1)
        # Use q-gram tokenizer
        >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2)
        # Include all possible missing values
        >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True)
        # Use all the cores in the machine
        >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], n_jobs=-1)

    """
    # validate data types of standard input parameters
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose, n_jobs)

    # validate data types of input parameters specific to overlap blocker
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val, word_level,
                                     overlap_size)

    # validate data type of allow_missing
    self.validate_allow_missing(allow_missing)

    # validate data type of show_progress
    self.validate_show_progress(show_progress)

    # validate overlap attributes
    self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)

    # validate output attributes
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    # get and validate required metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # validate word_level and q_val
    self.validate_word_level_qval(word_level, q_val)

    # do blocking

    # # do projection before merge
    l_proj_attrs = self.get_attrs_to_project(l_key, l_overlap_attr,
                                             l_output_attrs)
    l_df = ltable[l_proj_attrs]
    r_proj_attrs = self.get_attrs_to_project(r_key, r_overlap_attr,
                                             r_output_attrs)
    r_df = rtable[r_proj_attrs]

    # # cast the column to string if required.
    l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
    ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
    ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

    # # cleanup the tables from non-ascii characters, punctuations, and stop words
    l_dummy_overlap_attr = '@#__xx__overlap_ltable__#@'
    r_dummy_overlap_attr = '@#__xx__overlap_rtable__#@'
    l_df[l_dummy_overlap_attr] = l_df[l_overlap_attr]
    r_df[r_dummy_overlap_attr] = r_df[r_overlap_attr]

    if not l_df.empty:
        self.cleanup_table(l_df, l_dummy_overlap_attr, rem_stop_words)
    if not r_df.empty:
        self.cleanup_table(r_df, r_dummy_overlap_attr, rem_stop_words)

    # # determine which tokenizer to use
    if word_level == True:
        # # # create a whitespace tokenizer
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        # # # create a qgram tokenizer
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    # # perform overlap similarity join
    candset = overlap_join(l_df, r_df, l_key, r_key, l_dummy_overlap_attr,
                           r_dummy_overlap_attr, tokenizer, overlap_size, '>=',
                           allow_missing, l_output_attrs, r_output_attrs,
                           l_output_prefix, r_output_prefix, False, n_jobs,
                           show_progress)

    # # retain only the required attributes in the output candidate set
    retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs,
                                           r_output_attrs, l_output_prefix,
                                           r_output_prefix)
    candset = candset[retain_cols]

    # update metadata in the catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return the candidate set
    return candset
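
# Illustrative sketch, not part of the library: the pair-level predicate that
# overlap blocking effectively evaluates, assuming word-level tokenization and
# no stop-word removal. A pair survives block_tables above only if the number
# of shared tokens reaches overlap_size. `token_overlap` is a hypothetical
# helper name used for illustration.
def token_overlap(l_val, r_val):
    # count the tokens shared by the two attribute values
    return len(set(str(l_val).lower().split()) & set(str(r_val).lower().split()))

# Example: this pair shares the tokens '12' and 'main', so it survives
# blocking with overlap_size=1 (and with overlap_size=2).
assert token_overlap('12 main st madison', '12 main street') >= 1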
def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                  rem_stop_words=False, q_val=None, word_level=True,
                  overlap_size=1, allow_missing=False, verbose=False,
                  show_progress=True, n_jobs=1):
    """Blocks an input candidate set of tuple pairs based on the overlap of
    token sets of attribute values.

    Finds tuple pairs from an input candidate set of tuple pairs such that
    the overlap between (a) the set of tokens obtained by tokenizing the
    value of attribute l_overlap_attr of the left tuple in a tuple pair, and
    (b) the set of tokens obtained by tokenizing the value of attribute
    r_overlap_attr of the right tuple in the tuple pair, is above a certain
    threshold.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
         (e.g., a, an, the) should be removed from the token sets of the
         overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes values
         are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
         attributes should be tokenized as words (i.e., using whitespace as
         delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must overlap
         (defaults to 1).
        allow_missing (boolean): A flag to indicate whether tuple pairs with
         missing value in at least one of the blocking attributes should be
         included in the output candidate set (defaults to False). If this
         flag is set to True, a tuple pair with missing value in either
         blocking attribute will be retained in the output candidate set.
        verbose (boolean): A flag to indicate whether the debug information
         should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
         be displayed to the user (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for computation
         (defaults to 1). If -1 is given, all CPUs are used. If 0 or 1 is
         given, no parallel computation is used at all, which is useful for
         debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used
         (where n_cpus is the total number of CPUs in the machine). Thus,
         for n_jobs = -2, all CPUs but one are used. If
         (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation
         is used (i.e., equivalent to the default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.
        SyntaxError: If `q_val` is set to a valid value and `word_level` is
            set to True.
        SyntaxError: If `q_val` is set to None and `word_level` is set to
            False.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = em.OverlapBlocker()
        >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ob.block_candset(C, 'name', 'name')
        # Include all possible tuple pairs with missing values
        >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
        # Execute blocking using multiple cores
        >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1)
        # Use q-gram tokenizer
        >>> D4 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)

    """
    # validate data types of standard input parameters
    self.validate_types_params_candset(candset, verbose, show_progress, n_jobs)

    # validate data types of input parameters specific to overlap blocker
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val, word_level,
                                     overlap_size)

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
        candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate overlap attrs
    self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)

    # validate word_level and q_val
    self.validate_word_level_qval(word_level, q_val)

    # do blocking

    # # do projection before merge
    l_df = ltable[[l_key, l_overlap_attr]]
    r_df = rtable[[r_key, r_overlap_attr]]

    # # cast the overlap attribute to string if required.
    l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
    ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
    ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

    # # cleanup the tables from non-ascii characters, punctuations, and stop words
    self.cleanup_table(l_df, l_overlap_attr, rem_stop_words)
    self.cleanup_table(r_df, r_overlap_attr, rem_stop_words)

    # # determine which tokenizer to use
    if word_level == True:
        # # # create a whitespace tokenizer
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        # # # create a qgram tokenizer
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    # # create a filter for overlap similarity join
    overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                   allow_missing=allow_missing)

    # # perform overlap similarity filtering of the candset
    out_table = overlap_filter.filter_candset(candset, fk_ltable, fk_rtable,
                                              l_df, r_df, l_key, r_key,
                                              l_overlap_attr, r_overlap_attr,
                                              n_jobs,
                                              show_progress=show_progress)

    # update catalog
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return candidate set
    return out_table
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                 rem_stop_words=False, q_val=None, word_level=True,
                 overlap_size=1, l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 allow_missing=False, verbose=False, show_progress=True,
                 n_ltable_chunks=1, n_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks two tables based on the overlap of token sets of attribute values.

    Finds tuple pairs from left and right tables such that the overlap
    between (a) the set of tokens obtained by tokenizing the value of
    attribute l_overlap_attr of a tuple from the left table, and (b) the set
    of tokens obtained by tokenizing the value of attribute r_overlap_attr
    of a tuple from the right table, is above a certain threshold.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
         (e.g., a, an, the) should be removed from the token sets of the
         overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes values
         are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
         attributes should be tokenized as words (i.e., using whitespace as
         delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must overlap
         (defaults to 1).
        l_output_attrs (list): A list of attribute names from the left table
         to be included in the output candidate set (defaults to None).
        r_output_attrs (list): A list of attribute names from the right table
         to be included in the output candidate set (defaults to None).
        l_output_prefix (string): The prefix to be used for the attribute
         names coming from the left table in the output candidate set
         (defaults to 'ltable\_').
        r_output_prefix (string): The prefix to be used for the attribute
         names coming from the right table in the output candidate set
         (defaults to 'rtable\_').
        allow_missing (boolean): A flag to indicate whether tuple pairs with
         missing value in at least one of the blocking attributes should be
         included in the output candidate set (defaults to False). If this
         flag is set to True, a tuple in ltable with missing value in the
         blocking attribute will be matched with every tuple in rtable and
         vice versa.
        verbose (boolean): A flag to indicate whether the debug information
         should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
         be displayed to the user (defaults to True).
        n_ltable_chunks (int): The number of partitions to split the left
         table (defaults to 1). If it is set to -1, then the number of
         partitions is set to the number of cores in the machine.
        n_rtable_chunks (int): The number of partitions to split the right
         table (defaults to 1). If it is set to -1, then the number of
         partitions is set to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `l_output_attrs` is not of type of list.
        AssertionError: If `r_output_attrs` is not of type of list.
        AssertionError: If the values in `l_output_attrs` is not of type
            string.
        AssertionError: If the values in `r_output_attrs` is not of type
            string.
        AssertionError: If `l_output_prefix` is not of type string.
        AssertionError: If `r_output_prefix` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type int.
        AssertionError: If `n_rtable_chunks` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.
        AssertionError: If `l_output_attrs` are not in the ltable.
        AssertionError: If `r_output_attrs` are not in the rtable.
        SyntaxError: If `q_val` is set to a valid value and `word_level` is
            set to True.
        SyntaxError: If `q_val` is set to None and `word_level` is set to
            False.

    Examples:
        >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = DaskOverlapBlocker()
        # Use all cores
        # # Use word-level tokenizer
        >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1, n_ltable_chunks=-1, n_rtable_chunks=-1)
        # # Use q-gram tokenizer
        >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2, n_ltable_chunks=-1, n_rtable_chunks=-1)
        # # Include all possible missing values
        >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True, n_ltable_chunks=-1, n_rtable_chunks=-1)

    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # Input validations
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose,
                                      n_ltable_chunks, n_rtable_chunks)
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val, word_level,
                                     overlap_size)
    self.validate_allow_missing(allow_missing)
    self.validate_show_progress(show_progress)
    self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)
    self.validate_word_level_qval(word_level, q_val)

    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # validate input table chunks
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    if n_ltable_chunks == -1:
        n_ltable_chunks = multiprocessing.cpu_count()

    ltable_chunks = pd.np.array_split(ltable, n_ltable_chunks)

    # preprocess/tokenize ltable
    if word_level == True:
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    preprocessed_tokenized_ltbl = []

    # Construct DAG for preprocessing/tokenizing ltable chunks
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        result = delayed(self.process_tokenize_block_attr)(
            ltable_chunks[i][l_overlap_attr], start_row_id, rem_stop_words,
            tokenizer)
        preprocessed_tokenized_ltbl.append(result)
        start_row_id += len(ltable_chunks[i])
    preprocessed_tokenized_ltbl = delayed(wrap)(preprocessed_tokenized_ltbl)

    # Execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
    else:
        preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute(
            scheduler="processes", num_workers=multiprocessing.cpu_count())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_ltbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_ltbl_vals[i])

    # build inverted index
    inverted_index = self.build_inverted_index(ltable_processed_dict)

    if n_rtable_chunks == -1:
        n_rtable_chunks = multiprocessing.cpu_count()

    rtable_chunks = pd.np.array_split(rtable, n_rtable_chunks)

    # Construct the DAG for probing
    probe_result = []
    start_row_id = 0
    for i in range(len(rtable_chunks)):
        result = delayed(self.probe)(rtable_chunks[i][r_overlap_attr],
                                     inverted_index, start_row_id,
                                     rem_stop_words, tokenizer, overlap_size)
        probe_result.append(result)
        start_row_id += len(rtable_chunks[i])
    probe_result = delayed(wrap)(probe_result)

    # Execute the DAG for probing
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(
            scheduler="processes", num_workers=multiprocessing.cpu_count())

    # construct a minimal dataframe that can be used to add more attributes
    flat_list = [item for sublist in probe_result for item in sublist]
    tmp = pd.DataFrame(flat_list, columns=['fk_ltable_rid', 'fk_rtable_rid'])
    fk_ltable = ltable.iloc[tmp.fk_ltable_rid][l_key].values
    fk_rtable = rtable.iloc[tmp.fk_rtable_rid][r_key].values
    id_vals = list(range(len(flat_list)))

    candset = pd.DataFrame.from_dict(
        {'_id': id_vals,
         l_output_prefix + l_key: fk_ltable,
         r_output_prefix + r_key: fk_rtable})

    # set the properties for the candidate set
    cm.set_key(candset, '_id')
    cm.set_fk_ltable(candset, 'ltable_' + l_key)
    cm.set_fk_rtable(candset, 'rtable_' + r_key)
    cm.set_ltable(candset, ltable)
    cm.set_rtable(candset, rtable)

    ret_candset = gh.add_output_attributes(candset,
                                           l_output_attrs=l_output_attrs,
                                           r_output_attrs=r_output_attrs,
                                           l_output_prefix=l_output_prefix,
                                           r_output_prefix=r_output_prefix,
                                           validate=False)

    # handle missing values
    if allow_missing:
        missing_value_pairs = get_pairs_with_missing_value(
            ltable, rtable, l_key, r_key, l_overlap_attr, r_overlap_attr,
            l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix,
            False, False)
        missing_value_pairs.insert(
            0, '_id',
            range(len(ret_candset),
                  len(ret_candset) + len(missing_value_pairs)))

        if len(missing_value_pairs) > 0:
            ret_candset = pd.concat([ret_candset, missing_value_pairs],
                                    ignore_index=True, sort=False)
            cm.set_key(ret_candset, '_id')
            cm.set_fk_ltable(ret_candset, 'ltable_' + l_key)
            cm.set_fk_rtable(ret_candset, 'rtable_' + r_key)
            cm.set_ltable(ret_candset, ltable)
            cm.set_rtable(ret_candset, rtable)

    # Return the final candidate set to user.
    return ret_candset
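
# Minimal self-contained sketch, not the library's implementation: the
# build-then-probe pattern that the Dask blocker above parallelizes over
# chunks. It assumes whitespace tokens and no stop-word removal; the helper
# names `build_index` and `probe` are hypothetical.
from collections import defaultdict

def build_index(l_values):
    # map each token to the set of ltable row ids containing it
    index = defaultdict(set)
    for row_id, value in enumerate(l_values):
        for token in set(str(value).lower().split()):
            index[token].add(row_id)
    return index

def probe(index, r_values, overlap_size=1):
    # for every rtable value, count shared tokens per ltable row and keep
    # the (l_id, r_id) pairs whose overlap reaches overlap_size
    pairs = []
    for r_id, value in enumerate(r_values):
        counts = defaultdict(int)
        for token in set(str(value).lower().split()):
            for l_id in index.get(token, ()):
                counts[l_id] += 1
        pairs.extend((l_id, r_id) for l_id, c in counts.items()
                     if c >= overlap_size)
    return pairs

# Example: probe(build_index(['12 main st', 'oak ave']), ['main st apt 2'])
# yields [(0, 0)], i.e. the first rtable row pairs with the first ltable row.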
def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                  rem_stop_words=False, q_val=None, word_level=True,
                  overlap_size=1, allow_missing=False, verbose=False,
                  show_progress=True, n_chunks=-1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks an input candidate set of tuple pairs based on the overlap of
    token sets of attribute values.

    Finds tuple pairs from an input candidate set of tuple pairs such that
    the overlap between (a) the set of tokens obtained by tokenizing the
    value of attribute l_overlap_attr of the left tuple in a tuple pair, and
    (b) the set of tokens obtained by tokenizing the value of attribute
    r_overlap_attr of the right tuple in the tuple pair, is above a certain
    threshold.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
         (e.g., a, an, the) should be removed from the token sets of the
         overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes values
         are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
         attributes should be tokenized as words (i.e., using whitespace as
         delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must overlap
         (defaults to 1).
        allow_missing (boolean): A flag to indicate whether tuple pairs with
         missing value in at least one of the blocking attributes should be
         included in the output candidate set (defaults to False). If this
         flag is set to True, a tuple pair with missing value in either
         blocking attribute will be retained in the output candidate set.
        verbose (boolean): A flag to indicate whether the debug information
         should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
         be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set.
         If it is set to -1, the number of partitions will be set to the
         number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.
        SyntaxError: If `q_val` is set to a valid value and `word_level` is
            set to True.
        SyntaxError: If `q_val` is set to None and `word_level` is set to
            False.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = DaskOverlapBlocker()
        >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ob.block_candset(C, 'name', 'name')
        # Include all possible tuple pairs with missing values
        >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
        # Execute blocking using multiple cores
        >>> D3 = ob.block_candset(C, 'name', 'name', n_chunks=-1)
        # Use q-gram tokenizer
        >>> D4 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)

    """
    logger.warning(
        "WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # Validate input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val, word_level,
                                     overlap_size)

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
        candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate overlap attrs
    self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)

    # validate word_level and q_val
    self.validate_word_level_qval(word_level, q_val)

    # validate number of chunks
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    # # do projection before merge
    l_df = ltable[[l_key, l_overlap_attr]]
    r_df = rtable[[r_key, r_overlap_attr]]

    # # set index for convenience
    l_df = l_df.set_index(l_key, drop=False)
    r_df = r_df.set_index(r_key, drop=False)

    # # cast the overlap attribute to string if required.
    l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
    ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
    ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

    # determine which tokenizer to use
    if word_level == True:
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    n_chunks = get_num_partitions(n_chunks, len(candset))
    c_splits = pd.np.array_split(candset, n_chunks)
    valid_splits = []

    # Create DAG
    for i in range(n_chunks):
        result = delayed(self._block_candset_split)(c_splits[i], l_df, r_df,
                                                    l_key, r_key,
                                                    l_overlap_attr,
                                                    r_overlap_attr, fk_ltable,
                                                    fk_rtable, allow_missing,
                                                    rem_stop_words, tokenizer,
                                                    overlap_size)
        valid_splits.append(result)
    valid_splits = delayed(wrap)(valid_splits)

    # Execute the DAG
    if show_progress:
        with ProgressBar():
            valid_splits = valid_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())
    else:
        valid_splits = valid_splits.compute(scheduler="processes",
                                            num_workers=get_num_cores())

    valid = sum(valid_splits, [])

    # construct output table
    if len(candset) > 0:
        out_table = candset[valid]
    else:
        out_table = pd.DataFrame(columns=candset.columns)

    # update the catalog
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return the output table
    return out_table
def tuner_overlap_blocker(ltable, rtable, l_key, r_key, l_overlap_attr,
                          r_overlap_attr, rem_stop_words, q_val, word_level,
                          overlap_size, ob_obj, n_bins=50,
                          sample_proportion=0.1, seed=0, repeat=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Tunes the parameters for the blocking-two-tables command implemented
    using Dask.

    Given the input tables and the parameters for the Dask-based overlap
    blocker command, this command returns the configuration, including
    whether the input tables need to be swapped, the number of left table
    chunks, and the number of right table chunks. It uses a "staged tuning"
    approach: the key idea is to select the configuration for one parameter
    at a time.

    Conceptually, this command performs the following steps. First, it
    samples the left and right tables using stratified sampling. Next, it
    uses the sampled tables to decide if the input tables need to be swapped
    (by running the blocker command in both orders and comparing the
    runtimes). Next, it finds the number of right table partitions using the
    sampled tables (by trying a fixed set of partitions and comparing the
    runtimes); the number of partitions is selected to be the one before
    which the runtime starts increasing. Then it finds the number of left
    table partitions in the same way, while keeping the number of right
    table partitions fixed to the value found in the previous step. Finally,
    it returns the configuration setting to the user as a triplet (x, y, z),
    where x indicates if the tables need to be swapped or not, y indicates
    the number of left table partitions (if the tables need to be swapped,
    then this indicates the number of left table partitions after swapping),
    and z indicates the number of right table partitions.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
         (e.g., a, an, the) should be removed from the token sets of the
         overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes values
         are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
         attributes should be tokenized as words (i.e., using whitespace as
         delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must overlap.
        ob_obj (OverlapBlocker): The object used to call commands to block
         two tables and a candidate set.
        n_bins (int): The number of bins to be used for stratified sampling.
        sample_proportion (float): The proportion used to sample the tables.
         This value is expected to be greater than 0 and less than 1.
        repeat (int): The number of times to execute the blocker command
         while selecting the values for the parameters.

    Returns:
        A tuple containing 3 values. For example, if the tuple is
        represented as (x, y, z), then x indicates if the tables need to be
        swapped or not, y indicates the number of left table partitions (if
        the tables need to be swapped, then this indicates the number of
        left table partitions after swapping), and z indicates the number of
        right table partitions.

    Examples:
        >>> from py_entitymatching.tuner.tuner_overlap_blocker import tuner_overlap_blocker
        >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
        >>> obj = DaskOverlapBlocker()
        >>> (swap_or_not, n_ltable_chunks, n_sample_rtable_chunks) = tuner_overlap_blocker(ltable, rtable, 'id', 'id', "title", "title", rem_stop_words=True, q_val=None, word_level=True, overlap_size=1, ob_obj=obj)

    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # Select the tokenizer
    if word_level:
        tokenizer = WhitespaceTokenizer()
    else:
        tokenizer = QgramTokenizer()

    # Sample the input tables, given in the original order
    sampled_tables_orig_order = get_sampled_tables(
        ltable, rtable, l_key, r_key, l_overlap_attr, r_overlap_attr,
        rem_stop_words, tokenizer, ob_obj, n_bins, sample_proportion, seed)

    # Sample the input tables, given in the swapped order
    sampled_tables_swap_order = get_sampled_tables(
        rtable, ltable, r_key, l_key, r_overlap_attr, l_overlap_attr,
        rem_stop_words, tokenizer, ob_obj, n_bins, sample_proportion, seed)

    # Decide if the tables need to be swapped
    swap_config = should_swap(ob_obj, sampled_tables_orig_order,
                              sampled_tables_swap_order, l_overlap_attr,
                              r_overlap_attr, rem_stop_words, q_val,
                              word_level, overlap_size, repeat)

    # Use the sampled tables
    s_ltable, s_rtable = sampled_tables_orig_order
    if swap_config == True:
        s_ltable, s_rtable = sampled_tables_swap_order

    # Find the number of right table partitions
    n_rtable_chunks = find_rtable_chunks(ob_obj, s_ltable, s_rtable,
                                         l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level,
                                         overlap_size)

    # Find the number of left table partitions
    n_ltable_chunks = find_ltable_chunks(ob_obj, s_ltable, s_rtable,
                                         l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level,
                                         overlap_size, n_rtable_chunks)

    # Return the configuration
    return (swap_config, n_ltable_chunks, n_rtable_chunks)
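
# Illustrative sketch, not the library's implementation: the "staged tuning"
# idea of choosing one knob at a time by timing trial runs on sampled data,
# stopping at the setting before which the runtime starts increasing.
# `run_blocker` is a hypothetical stand-in for executing the blocker on the
# sampled tables with a given number of chunks.
import time

def pick_num_chunks(run_blocker, candidates=(1, 2, 4, 8, 16), repeat=1):
    best_chunks, best_time = None, float('inf')
    for n in candidates:
        start = time.perf_counter()
        for _ in range(repeat):
            run_blocker(n)
        elapsed = (time.perf_counter() - start) / repeat
        if elapsed >= best_time:
            # runtime started increasing; keep the previous (fastest) setting
            break
        best_chunks, best_time = n, elapsed
    return best_chunks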
def _apply_filterable_rule(self, rule_name, ltable, rtable, l_key, r_key):
    candset = None
    conjunct_list = self.rule_str[rule_name]

    for conjunct in conjunct_list:
        is_auto_gen, sim_fn, l_attr, r_attr, l_tok, r_tok, op, \
            th = self._parse_conjunct(conjunct, rule_name)

        if l_tok == 'dlm_dc0':
            tokenizer = WhitespaceTokenizer(return_set=True)
        elif l_tok == 'qgm_3':
            tokenizer = QgramTokenizer(qval=3, return_set=True)

        if sim_fn == 'jaccard':
            join_fn = ssj.jaccard_join
        elif sim_fn == 'cosine':
            join_fn = ssj.cosine_join
        elif sim_fn == 'dice':
            join_fn = ssj.dice_join
        elif sim_fn == 'overlap_coeff':
            join_fn = ssj.overlap_coefficient_join
        elif sim_fn == 'lev_dist':
            join_fn = ssj.edit_distance_join

        if join_fn == ssj.edit_distance_join:
            comp_op = '<='
            if op == '>=':
                comp_op = '<'
        else:
            comp_op = '>='
            if op == '<=':
                comp_op = '>'

        ssj.dataframe_column_to_str(ltable, l_attr, inplace=True)
        ssj.dataframe_column_to_str(rtable, r_attr, inplace=True)

        if join_fn == ssj.edit_distance_join:
            tokenizer = QgramTokenizer(qval=2, return_set=False)
            c_df = join_fn(ltable, rtable, l_key, r_key, l_attr, r_attr,
                           float(th), comp_op,
                           allow_missing=True,  # need to revisit allow_missing
                           out_sim_score=False,
                           l_out_prefix='l_', r_out_prefix='r_',
                           show_progress=False, tokenizer=tokenizer)
        else:
            c_df = join_fn(ltable, rtable, l_key, r_key, l_attr, r_attr,
                           tokenizer, float(th), comp_op,
                           allow_empty=True, allow_missing=True,
                           l_out_prefix='l_', r_out_prefix='r_',
                           out_sim_score=False)
        # c_df.drop('_id', axis=1)

        if candset is not None:
            # union the candset of this conjunct with the existing candset,
            # keyed on the prefixed key columns produced by the joins
            candset = pd.concat([candset, c_df]).drop_duplicates(
                ['l_' + l_key, 'r_' + r_key]).reset_index(drop=True)
        else:
            # candset from the first conjunct of the rule
            candset = c_df

    return candset
def sample_pairs(ltable, rtable, l_key_attr, r_key_attr, l_join_attr,
                 r_join_attr, sample_size, y_param, seed,
                 l_out_prefix='l_', r_out_prefix='r_', show_progress=True):
    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(None, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(None, r_key_attr, r_join_attr)

    # convert dataframe to array for faster access
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs, l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs, r_join_attr)

    # find column indices of key attr and join attr in ltable array
    l_key_attr_index = l_proj_attrs.index(l_key_attr)
    l_join_attr_index = l_proj_attrs.index(l_join_attr)

    # find column indices of key attr and join attr in rtable array
    r_key_attr_index = r_proj_attrs.index(r_key_attr)
    r_join_attr_index = r_proj_attrs.index(r_join_attr)

    # create a whitespace tokenizer to tokenize join attributes
    ws_tok = WhitespaceTokenizer(return_set=True)

    # build inverted index on join attribute in ltable
    inverted_index = InvertedIndex(ltable_array, l_join_attr_index, ws_tok)
    inverted_index.build()

    number_of_r_tuples_to_sample = int(
        ceil(float(sample_size) / float(y_param)))
    sample_rtable_indices = random.sample(range(0, len(rtable_array)),
                                          number_of_r_tuples_to_sample)

    cand_pos_ltuples_required = int(ceil(y_param / 2.0))

    overlap_filter = OverlapFilter(ws_tok, 1)

    output_rows = []

    if show_progress:
        prog_bar = pyprind.ProgBar(number_of_r_tuples_to_sample)

    for r_idx in sample_rtable_indices:
        r_row = rtable_array[r_idx]
        r_id = r_row[r_key_attr_index]
        r_join_attr_tokens = ws_tok.tokenize(r_row[r_join_attr_index])

        # probe inverted index and find ltable candidates
        cand_overlap = overlap_filter.find_candidates(r_join_attr_tokens,
                                                      inverted_index)

        sampled_ltuples = set()
        for cand in sorted(cand_overlap.items(), key=operator.itemgetter(1),
                           reverse=True):
            if len(sampled_ltuples) == cand_pos_ltuples_required:
                break
            sampled_ltuples.add(cand[0])

        ltable_size = len(ltable_array)
        while len(sampled_ltuples) < y_param:
            rand_idx = random.randint(0, ltable_size - 1)
            sampled_ltuples.add(rand_idx)

        for l_idx in sampled_ltuples:
            output_rows.append([ltable_array[l_idx][l_key_attr_index], r_id])

        if show_progress:
            prog_bar.update()

    for seed_pair_row in seed.itertuples(index=False):
        output_rows.append([seed_pair_row[0], seed_pair_row[1]])

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  None, None,
                                                  l_out_prefix, r_out_prefix)
    output_table = pd.DataFrame(output_rows, columns=output_header)

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    return output_table
def get_features(ltable, rtable, l_exclude_attrs=set(), r_exclude_attrs=set()):
    toks_set = {'alph': AlphabeticTokenizer(return_set=True),
                'alph_num': AlphanumericTokenizer(return_set=True),
                'ws': WhitespaceTokenizer(return_set=True),
                'qg2': QgramTokenizer(qval=2, return_set=True),
                'qg3': QgramTokenizer(qval=3, return_set=True)}

    toks_bag = {'alph_bag': AlphabeticTokenizer(return_set=False),
                'alph_num_bag': AlphanumericTokenizer(return_set=False),
                'ws_bag': WhitespaceTokenizer(return_set=False),
                'qg2_bag': QgramTokenizer(qval=2, return_set=False),
                'qg3_bag': QgramTokenizer(qval=3, return_set=False)}

    str_features = {'jaccard': (jaccard, True, False),
                    'cosine': (cosine, True, False),
                    'dice': (dice, True, False),
                    'overlap_coeff': (overlap_coeff, True, False),
                    'monge_elkan': (monge_elkan, True, False),
                    'tfidf': (tfidf, True, True),
                    'soft_tfidf': (soft_tfidf, True, True),
                    'lev_sim': (lev_sim, False),
                    # 'hamming_sim': (hamming_sim, False),
                    'jaro': (jaro, False),
                    'jaro_winkler': (jaro_winkler, False),
                    'needleman_wunsch': (needleman_wunsch, False),
                    'smith_waterman': (smith_waterman, False),
                    'exact_match': (exact_match, False)}

    num_features = {'rel_diff': rel_diff,
                    'abs_norm': abs_norm}

    l_col_names = ltable.columns
    r_col_names = rtable.columns
    l_col_types = ltable.dtypes
    r_col_types = rtable.dtypes

    l_col_map = {}
    i = 0
    for l_col_name in l_col_names:
        if l_col_name in l_exclude_attrs:
            i += 1
            continue
        l_col_map[l_col_name] = (i, l_col_types[i])
        i += 1

    feat_rows = []
    i = 0
    for r_col_name in r_col_names:
        if r_col_name in r_exclude_attrs:
            i += 1
            continue

        l_col = l_col_map.get(r_col_name)
        if l_col is None:
            print('ERROR: Column ' + r_col_name + ' in rtable not found in ltable')
            return
        if l_col[1] != r_col_types[i]:
            print('ERROR: Type mismatch for column ' + r_col_name + '. ' +
                  str(r_col_types[i]) + ' in rtable and ' +
                  str(l_col[1]) + ' in ltable.')

        if l_col[1] == int or l_col[1] == float:
            for k in num_features.keys():
                feat_rows.append((r_col_name + '_' + k, l_col[0], i, None,
                                  num_features[k]))
        else:
            for k in str_features.keys():
                feat_entry = str_features[k]
                if feat_entry[1] == False:
                    feat_rows.append((r_col_name + '_' + k, l_col[0], i, None,
                                      feat_entry[0]))
                else:
                    toks = toks_bag if feat_entry[2] else toks_set
                    for t in toks.keys():
                        feat_rows.append((r_col_name + '_' + k + '_' + t,
                                          l_col[0], i, toks[t].tokenize,
                                          feat_entry[0]))
        i += 1

    feature_table = pd.DataFrame(feat_rows,
                                 columns=['feat_name', 'l_attr', 'r_attr',
                                          'tok', 'sim_fn'])
    return feature_table
class WhitespaceTokenizerTestCases(unittest.TestCase):
    def setUp(self):
        self.ws_tok = WhitespaceTokenizer()
        self.ws_tok_return_set = WhitespaceTokenizer(return_set=True)

    def test_whitespace_tok_valid(self):
        self.assertEqual(self.ws_tok.tokenize('data science'),
                         ['data', 'science'])
        self.assertEqual(self.ws_tok.tokenize('data science'),
                         ['data', 'science'])
        self.assertEqual(self.ws_tok.tokenize('data science'),
                         ['data', 'science'])
        self.assertEqual(self.ws_tok.tokenize('data\tscience'),
                         ['data', 'science'])
        self.assertEqual(self.ws_tok.tokenize('data\nscience'),
                         ['data', 'science'])
        self.assertEqual(self.ws_tok.tokenize('ab cd ab bb cd db'),
                         ['ab', 'cd', 'ab', 'bb', 'cd', 'db'])
        self.assertEqual(self.ws_tok_return_set.tokenize('ab cd ab bb cd db'),
                         ['ab', 'cd', 'bb', 'db'])

    def test_get_return_set(self):
        self.assertEqual(self.ws_tok.get_return_set(), False)
        self.assertEqual(self.ws_tok_return_set.get_return_set(), True)

    def test_set_return_set(self):
        tok = WhitespaceTokenizer()
        self.assertEqual(tok.get_return_set(), False)
        self.assertEqual(tok.tokenize('ab cd ab bb cd db'),
                         ['ab', 'cd', 'ab', 'bb', 'cd', 'db'])
        self.assertEqual(tok.set_return_set(True), True)
        self.assertEqual(tok.get_return_set(), True)
        self.assertEqual(tok.tokenize('ab cd ab bb cd db'),
                         ['ab', 'cd', 'bb', 'db'])
        self.assertEqual(tok.set_return_set(False), True)
        self.assertEqual(tok.get_return_set(), False)
        self.assertEqual(tok.tokenize('ab cd ab bb cd db'),
                         ['ab', 'cd', 'ab', 'bb', 'cd', 'db'])

    def test_get_delim_set(self):
        self.assertSetEqual(self.ws_tok.get_delim_set(), {' ', '\t', '\n'})

    @raises(TypeError)
    def test_whitespace_tok_invalid1(self):
        self.ws_tok.tokenize(None)

    @raises(TypeError)
    def test_whitespace_tok_invalid2(self):
        self.ws_tok.tokenize(99)

    @raises(AttributeError)
    def test_set_delim_set(self):
        self.ws_tok.set_delim_set({'*', '.'})