def _fit_ex_attrs(self, table, exclude_attrs, target_attr):
    """
    Support routine for the fit method. Derives the feature columns from
    the input DataFrame by removing the excluded attributes and the
    target attribute, then delegates the actual fitting to _fit_sklearn.
    """
    # The input table must be a pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    # Normalize exclude_attrs to a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Every excluded attribute must exist in the table.
    if not ch.check_attrs_present(table, exclude_attrs):
        logger.error(
            'The attributes mentioned in exclude_attrs is not present '
            'in the input table')
        raise AssertionError(
            'The attributes mentioned in exclude_attrs is not present '
            'in the input table')

    # The target attribute must exist in the table as well.
    if not ch.check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError(
            'The target_attr is not present in the input table')

    # De-duplicate the exclusions, then make sure the target attribute is
    # never used as a predictor.
    exclude_attrs = gh.list_drop_duplicates(exclude_attrs)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # The predictor columns are everything that was not excluded.
    predictor_attrs = gh.list_diff(list(table.columns), exclude_attrs)

    # Split the table into feature vectors and labels, then fit.
    feature_vectors = table[predictor_attrs]
    labels = table[target_attr]
    self._fit_sklearn(feature_vectors, labels, check_rem=False)
def _predict_ex_attrs(self, table, exclude_attrs, return_prob=False):
    """
    Variant of the predict method where the feature columns are derived
    from the input DataFrame by dropping the excluded attributes.
    """
    # The input table must be a pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    # Normalize exclude_attrs to a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Every excluded attribute must exist in the table.
    if not ch.check_attrs_present(table, exclude_attrs):
        logger.error(
            'The attributes mentioned in exclude_attrs is not present '
            'in the input table')
        raise AssertionError(
            'The attributes mentioned in exclude_attrs is not present '
            'in the input table')

    # The feature columns are everything that was not excluded.
    projection_attrs = gh.list_diff(list(table.columns), exclude_attrs)
    feature_vectors = table[projection_attrs]

    # Delegate the prediction (and, optionally, the probability
    # computation) to the scikit-learn based helper.
    return self._predict_sklearn(feature_vectors, check_rem=False,
                                 return_prob=return_prob)
def _validate_metadata_for_table(table, key, output_string, lgr, verbose):
    """
    Validates metadata for table (DataFrame): the key column must exist,
    be a string attribute name, and qualify as a key for the table.
    """
    # The input object must be a pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type pandas DataFrame')
        raise AssertionError('Input object is not of type pandas DataFrame')

    # The key column must be one of the table's columns.
    if not ch.check_attrs_present(table, key):
        message = 'Input key ( %s ) not in the DataFrame' % key
        logger.error(message)
        raise KeyError(message)

    ch.log_info(lgr, 'Validating ' + output_string + ' key: ' + str(key),
                verbose)

    # The key attribute name itself must be a string.
    if not isinstance(key, six.string_types):
        logger.error('Key attribute must be of type string')
        raise AssertionError('Key attribute must be of type string')

    # The column must satisfy the key properties for this table.
    if not ch.is_key_attribute(table, key, verbose):
        message = ('Attribute %s in the %s table does not '
                   'qualify to be the key' % (str(key), output_string))
        logger.error(message)
        raise AssertionError(message)

    ch.log_info(lgr, '..... Done', verbose)
    return True
def _get_xy_data_ex(table, exclude_attrs, target_attr):
    """
    Project the feature matrix (x) and the target vector (y) out of the
    input table, using every column except the excluded attributes and the
    target attribute as features.

    Args:
        table (DataFrame): The input table holding features and the target.
        exclude_attrs (list or string): Attribute(s) to leave out of the
            feature matrix.
        target_attr (string): The attribute holding the target values.

    Returns:
        A tuple (x, y) of numpy arrays, where x holds the feature values
        and y holds the (flattened) target values.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame, if any
            attribute in `exclude_attrs` is missing from the table, or if
            `target_attr` is missing from the table.
    """
    # Validate the input parameters
    #
    # We expect the input table to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        # FIX: the original raised AssertionError(logger.error(...)), which
        # produced AssertionError(None) because logger.error returns None;
        # raise with the actual message instead.
        raise AssertionError('Input table is not of type DataFrame')

    # We expect exclude attributes to be of type list. If not, convert it
    # into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Check if the exclude attributes are present in the input table
    if not check_attrs_present(table, exclude_attrs):
        logger.error('The attributes mentioned in exclude_attrs '
                     'is not present '
                     'in the input table')
        raise AssertionError('The attributes mentioned in exclude_attrs '
                             'is not present '
                             'in the input table')

    # Check if the target attribute is present in the input table
    if not check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError(
            'The target_attr is not present in the input table')

    # Drop the duplicates from the exclude attributes
    exclude_attrs = list_drop_duplicates(exclude_attrs)

    # Explicitly add the target attribute to the exclude attributes (if it
    # is not already present), so it is never used as a feature.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Project the list of attributes that should be used for scikit-learn's
    # functions.
    attrs_to_project = list_diff(list(table.columns), exclude_attrs)

    # Get the values for x
    x = table[attrs_to_project].values
    # Get the values for y (FIX: comment previously said "x" again)
    y = table[target_attr].values
    y = y.ravel()  # to mute warnings from svm and cross validation

    # Return x and y
    return x, y
def _get_xy_data_ex(table, exclude_attrs, target_attr):
    """
    Split the input table into a feature matrix and a target vector,
    excluding the given attributes (and the target attribute itself)
    from the features.
    """
    # The input table must be a pandas DataFrame.
    validate_object_type(table, pd.DataFrame)

    # Normalize exclude_attrs to a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Every excluded attribute must exist in the table.
    if not check_attrs_present(table, exclude_attrs):
        logger.error('The attributes mentioned in exclude_attrs '
                     'is not present '
                     'in the input table')
        raise AssertionError(
            'The attributes mentioned in exclude_attrs '
            'is not present '
            'in the input table')

    # The target attribute must exist in the table.
    if not check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError(
            'The target_attr is not present in the input table')

    # De-duplicate the exclusions and make sure the target attribute is
    # never part of the feature matrix.
    exclude_attrs = list_drop_duplicates(exclude_attrs)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Feature columns are all remaining columns.
    feature_attrs = list_diff(list(table.columns), exclude_attrs)

    # Materialize the numpy arrays for scikit-learn; ravel() mutes
    # warnings from svm and cross validation.
    feature_matrix = table[feature_attrs].values
    target_vector = table[target_attr].values
    target_vector = target_vector.ravel()
    return feature_matrix, target_vector
def set_fk_rtable(data_frame, foreign_key_rtable):
    """
    Sets the foreign key to rtable for a DataFrame in the catalog.

    Specifically this function is a sugar function that will set the foreign
    key to right table using set_property function. This function
    is typically called on a DataFrame which contains metadata such as
    fk_ltable, fk_rtable, ltable, rtable.

    Args:
        data_frame (DataFrame): The input DataFrame for which the foreign
            key rtable property must be set.
        foreign_key_rtable (string): The attribute that must be set as
            foreign key to rtable in the catalog.

    Returns:
        A Boolean value of True is returned if the foreign key to rtable was
        set successfully.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `foreign_key_rtable` is not of type string.
        KeyError: If `fk_rtable` is not in the input DataFrame.

    See Also:
        :meth:`~py_entitymatching.set_property`
    """
    # Validate the input parameters
    #
    # The input object is expected to be of type pandas DataFrame
    if not isinstance(data_frame, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    # The foreign key attribute is expected to be of type string.
    # FIX: the original messages wrongly said the input "is not of type
    # pandas data frame" (and the logged message referred to the ltable);
    # the check is a string-type check on the rtable foreign key.
    if not isinstance(foreign_key_rtable, six.string_types):
        logger.error('Input (foreign key rtable) is not of type string')
        raise AssertionError(
            'Input (foreign key rtable) is not of type string')

    # Check if the given attribute is present in the DataFrame
    if not ch.check_attrs_present(data_frame, foreign_key_rtable):
        logger.error('Input attr. ( %s ) not in the DataFrame'
                     % foreign_key_rtable)
        raise KeyError('Input attr. ( %s ) not in the DataFrame'
                       % foreign_key_rtable)

    # Finally set the property and relay the result
    return set_property(data_frame, 'fk_rtable', foreign_key_rtable)
def preserve_metadata(df, new_df):
    # Copy catalog metadata from `df` onto `new_df`, provided the columns
    # the metadata refers to are still present in `new_df`; otherwise
    # return `new_df` unchanged with a warning.
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                # df is a candidate set: its metadata references the key
                # and both foreign key columns, all of which must survive.
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                if not ch.check_attrs_present(new_df, [key, fk_ltable,
                                                       fk_rtable]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df
            else:
                # df is a plain table: only the key column is required.
                key = cm.get_key(df)
                if not ch.check_attrs_present(new_df, [key]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df
        # Required columns survived (or df is neither table nor candset);
        # copy the catalog properties over to the new DataFrame.
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    return new_df
def set_fk_ltable(data_frame, fk_ltable):
    """
    Sets the foreign key to ltable for a DataFrame in the catalog.

    This is a sugar function that sets the foreign key to the left table
    via :meth:`py_entitymatching.set_property`. It is typically called on
    a DataFrame carrying metadata such as fk_ltable, fk_rtable, ltable,
    rtable.

    Args:
        data_frame (DataFrame): The input DataFrame for which the foreign
            key ltable property must be set.
        fk_ltable (string): The attribute that must be set as the foreign
            key to the ltable in the catalog.

    Returns:
        A Boolean value of True is returned if the foreign key to ltable
        was set successfully.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `fk_ltable` is not of type string.
        AssertionError: If `fk_ltable` is not in the input DataFrame.

    See Also:
        :meth:`~py_entitymatching.set_property`
    """
    # The input object must be a pandas DataFrame.
    if not isinstance(data_frame, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    # The fk_ltable attribute must be given as a string.
    if not isinstance(fk_ltable, six.string_types):
        logger.error('The input (fk_ltable) is not of type string')
        raise AssertionError('The input (fk_ltable) is not of type string')

    # The attribute must be one of the DataFrame's columns.
    if not ch.check_attrs_present(data_frame, fk_ltable):
        message = 'Input attr. ( %s ) not in the DataFrame' % fk_ltable
        logger.error(message)
        raise KeyError(message)

    # Delegate to set_property and relay its result.
    return set_property(data_frame, 'fk_ltable', fk_ltable)
def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function
    """
    # Validate the input parameters
    #
    # The input table is expected to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type data frame')
        raise AssertionError('Input object is not of type data frame')

    # The label column name is expected to be of type string
    if not isinstance(label_column_name, six.string_types):
        logger.error('Input attr. is not of type string')
        raise AssertionError('Input attr. is not of type string')

    # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error(
            'The label column name (%s) is already present in the '
            'input table', label_column_name)
        # FIX: interpolate the column name into the message; passing it as
        # a separate argument left the '%s' placeholder unformatted in the
        # raised exception.
        raise AssertionError(
            'The label column name (%s) is already present '
            'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame
    #
    # First, display what metadata is required for this function
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
                'fk rtable, ltable, rtable, ltable key, rtable key',
        verbose)

    # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Return True if everything was successful
    return True
def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function
    """
    # Validate the input parameters
    #
    # The input table is expected to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame)

    # The label column name is expected to be of type string
    validate_object_type(label_column_name, six.string_types,
                         error_prefix='Input attr.')

    # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error('The label column name (%s) is already present in the '
                     'input table', label_column_name)
        # FIX: interpolate the column name into the message; passing it as
        # a separate argument left the '%s' placeholder unformatted in the
        # raised exception.
        raise AssertionError('The label column name (%s) is already present '
                             'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame
    #
    # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Return True if everything was successful
    return True
def _vis_debug_dt(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Decision Tree matcher visually.

    Fits `matcher` on `train`, predicts on `test`, computes an evaluation
    summary, and launches the debugger GUI populated with the metric and
    the false positive / false negative DataFrames.
    """
    # PyQt5 is an optional dependency; import lazily and fail with a clear
    # message if it is missing.
    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError(
            'PyQt5 is not installed. Please install PyQt5 to use '
            'GUI related functions in py_entitymatching.')

    # Validate the input parameters
    #
    # We expect the matcher to be of type DTMatcher
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError('Input matcher is not of type '
                             'Decision Tree matcher')

    # We expect the target attribute to be of type string.
    validate_object_type(target_attr, six.string_types,
                         error_prefix='Target attribute')

    # Check whether the exclude attributes are indeed present in the train
    # DataFrame.
    if not ch.check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'train table columns')

    # Check whether the target attribute is indeed present in the train
    # DataFrame.
    if not ch.check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the '
                             'train table columns')

    # Check whether the exclude attributes are indeed present in the test
    # DataFrame.
    if not ch.check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'test table columns')

    # The exclude attributes is expected to be of type list, if not
    # explicitly convert this into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a column name to store the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = eval_matches(predicted, target_attr, predict_attr_name)

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    # Reuse the process-wide QApplication if one exists; create it
    # otherwise (Qt allows only one application instance per process).
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])
    app = em._viewapp

    # Get the main window application
    # NOTE(review): this second assignment is redundant — `app` was already
    # bound to em._viewapp just above.
    app = em._viewapp

    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)

    # If the show window is true, then display the window.
    if show_window:
        m.show()
        app.exec_()
def set_key(data_frame, key_attribute):
    """
    Sets the value of 'key' property for a DataFrame in the catalog with the
    given attribute (i.e column name).

    Specifically, this function sets the key attribute for the DataFrame
    if the given attribute satisfies the following two properties:

        The key attribute should have unique values.

        The key attribute should not have missing values. A missing value
        is represented as np.NaN.

    Args:
        data_frame (DataFrame): The DataFrame for which the key must be set
            in the catalog.
        key_attribute (string): The key attribute (column name) in the
            DataFrame.

    Returns:
        A Boolean value of True is returned, if the given attribute
        satisfies the conditions for a key and the update was successful.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `key_attribute` is not of type string.
        KeyError: If given `key_attribute` is not in the DataFrame columns.

    See Also:
        :meth:`~py_entitymatching.set_property`
    """
    # Validate input parameters
    #
    # We expect the input object (data_frame) to be of type pandas DataFrame
    if not isinstance(data_frame, pd.DataFrame):
        logger.error('Input object is not of type pandas DataFrame')
        raise AssertionError('Input object is not of type pandas DataFrame')

    # We expect input key attribute to be of type string
    if not isinstance(key_attribute, six.string_types):
        logger.error('Input key attribute is not of type string')
        # FIX: the original only logged this error and fell through,
        # continuing with a non-string key; raise as the docstring promises.
        raise AssertionError('Input key attribute is not of type string')

    # Check if the key attribute is present as one of the columns in the
    # DataFrame
    if not ch.check_attrs_present(data_frame, key_attribute):
        logger.error('Input key ( %s ) not in the DataFrame' % key_attribute)
        raise KeyError('Input key ( %s ) not in the DataFrame'
                       % key_attribute)

    # Check if the key attribute satisfies the conditions to be a key. If
    # not, just return False.
    # Note: Currently it is not clear whether we should return False from
    # here or raise an exception. As of now resorting to just returning
    # False, because this function is used by other computation-intensive
    # commands in py_entitymatching and raising an exception might make all
    # the work done in those commands go in vain (or those commands should
    # catch the exception correctly, which may be complicated and require
    # changes to the current code). We need to revisit this later.
    if ch.is_key_attribute(data_frame, key_attribute) is False:
        logger.warning('Attribute (%s ) does not qualify to be a key; Not '
                       'setting/replacing the key' % key_attribute)
        return False
    else:
        # Set the key property for the input DataFrame
        return set_property(data_frame, 'key', key_attribute)
def eval_matches(data_frame, gold_label_attr, predicted_label_attr):
    """
    Evaluates the matches from the matcher.

    Specifically, given a DataFrame containing golden labels and predicted
    labels, this function would evaluate the matches and return the accuracy
    results such as precision, recall and F1.

    Args:
        data_frame (DataFrame): The input pandas DataFrame containing "gold"
            labels and "predicted" labels.
        gold_label_attr (string): An attribute in the input DataFrame
            containing "gold" labels.
        predicted_label_attr (string): An attribute in the input DataFrame
            containing "predicted" labels.

    Returns:
        A Python dictionary containing the accuracy measures such as
        precision, recall, F1.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `gold_label_attr` is not of type string.
        AssertionError: If `predicted_label_attr` is not of type string.
        AssertionError: If the `gold_label_attr` is not in the input
            DataFrame.
        AssertionError: If the `predicted_label_attr` is not in the input
            DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(G, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H,  exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'],  append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
    """
    # Validate input parameters
    #
    # We expect the input object to be of type pandas DataFrame
    validate_object_type(data_frame, pd.DataFrame, 'The input table')

    # We expect the input attribute (gold_label_attr) to be of type string
    validate_object_type(gold_label_attr, six.string_types,
                         'The input gold_label_attr')

    # We expect the input attribute (predicted_label_attr) to be of type
    # string
    validate_object_type(predicted_label_attr, six.string_types,
                         'The input predicted_label_attr')

    # Check whether the gold label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, gold_label_attr):
        logger.error(
            'The gold_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The gold_label_attr is not present in the input DataFrame')

    # Check whether the predicted label attribute is present in the input
    # table
    if not ch.check_attrs_present(data_frame, predicted_label_attr):
        logger.error(
            'The predicted_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The predicted_label_attr is not present in the input DataFrame')

    # Reset the index to get the indices set as 0..len(table); the
    # positional index lists computed below rely on this contiguous range.
    new_data_frame = data_frame.reset_index(drop=False, inplace=False)

    # Project out the gold and predicted label columns.
    gold = new_data_frame[gold_label_attr]
    predicted = new_data_frame[predicted_label_attr]

    # Partition the row indices by gold label value
    # (0 = non-match, 1 = match).
    gold_negative = gold[gold == 0].index.values
    gold_positive = gold[gold == 1].index.values

    # Partition the row indices by predicted label value.
    predicted_negative = predicted[predicted == 0].index.values
    predicted_positive = predicted[predicted == 1].index.values

    # The four confusion-matrix cells are the set intersections of the
    # gold and predicted partitions.
    # get false positive indices
    false_positive_indices = \
        list(set(gold_negative).intersection(predicted_positive))

    # get true positive indices
    true_positive_indices = \
        list(set(gold_positive).intersection(predicted_positive))

    # get false negative indices
    false_negative_indices = \
        list(set(gold_positive).intersection(predicted_negative))

    # get true negative indices
    true_negative_indices = \
        list(set(gold_negative).intersection(predicted_negative))

    # Get the number of TP, FP, FN, TN (as floats so the divisions below
    # behave the same on Python 2 and 3).
    num_true_positives = float(len(true_positive_indices))
    num_false_positives = float(len(false_positive_indices))
    num_false_negatives = float(len(false_negative_indices))
    num_true_negatives = float(len(true_negative_indices))

    # Precision = num_tp / (num_tp + num_fp)
    # Get precision numerator, denominator
    precision_numerator = num_true_positives
    precision_denominiator = num_true_positives + num_false_positives

    # Recall = num_tp / (num_tp + num_fn)
    # Get recall numerator, denominator
    recall_numerator = num_true_positives
    recall_denominator = num_true_positives + num_false_negatives

    # Compute precision, guarding against division by zero.
    if precision_denominiator == 0.0:
        precision = 0.0
    else:
        precision = precision_numerator / precision_denominiator

    # Compute recall, guarding against division by zero.
    if recall_denominator == 0.0:
        recall = 0.0
    else:
        recall = recall_numerator / recall_denominator

    # Compute F1 (harmonic mean of precision and recall); defined as 0
    # when both are 0.
    if precision == 0.0 and recall == 0.0:
        F1 = 0.0
    else:
        F1 = (2.0 * precision * recall) / (precision + recall)

    # Get the fk_ltable and fk_rtable from the catalog
    fk_ltable = cm.get_property(data_frame, 'fk_ltable')
    fk_rtable = cm.get_property(data_frame, 'fk_rtable')

    # Check if the fk_ltable contains any missing values
    if ch.does_contain_missing_vals(data_frame, fk_ltable):
        logger.error('The fk_ltable (%s) contains missing values' %
                     fk_ltable)
        raise AssertionError('The fk_ltable (%s) contains missing values' %
                             fk_ltable)

    # Check if the fk_rtable contains any missing values
    if ch.does_contain_missing_vals(data_frame, fk_rtable):
        logger.error('The fk_rtable (%s) contains missing values' %
                     fk_rtable)
        raise AssertionError('The fk_rtable (%s) contains missing values' %
                             fk_rtable)

    # Re-index by (fk_ltable, fk_rtable) so the false positive/negative
    # lists below are reported as id pairs rather than row positions.
    new_data_frame.set_index([fk_ltable, fk_rtable], drop=False,
                             inplace=True)

    # Get the list of false positives and false negatives as id pairs;
    # iloc still addresses by row position after set_index.
    false_pos_ls = list(
        new_data_frame.iloc[false_positive_indices].index.values)
    false_neg_ls = list(
        new_data_frame.iloc[false_negative_indices].index.values)

    # Store and return the accuracy results.
    # NOTE: 'precision_denominiator' carries a historical variable-name
    # typo, kept as-is; the dictionary key spelling below is correct.
    accuracy_results = collections.OrderedDict()
    accuracy_results['prec_numerator'] = precision_numerator
    accuracy_results['prec_denominator'] = precision_denominiator
    accuracy_results['precision'] = precision
    accuracy_results['recall_numerator'] = recall_numerator
    accuracy_results['recall_denominator'] = recall_denominator
    accuracy_results['recall'] = recall
    accuracy_results['f1'] = F1
    accuracy_results['pred_pos_num'] = num_true_positives + \
        num_false_positives
    accuracy_results['false_pos_num'] = num_false_positives
    accuracy_results['false_pos_ls'] = false_pos_ls
    accuracy_results['pred_neg_num'] = num_false_negatives + \
        num_true_negatives
    accuracy_results['false_neg_num'] = num_false_negatives
    accuracy_results['false_neg_ls'] = false_neg_ls
    return accuracy_results
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature
    table, ltable and rtable (that is present in the `candset`'s
    metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to
            None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to
            None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).

    Returns:
        A pandas DataFrame containing feature vectors. The DataFrame will
        have metadata ltable and rtable, pointing to the same ltable and
        rtable as the input candset. Also, the output DataFrame will have
        three columns: key, foreign key ltable, foreign key rtable copied
        from input candset to the output DataFrame. These three columns
        precede the columns mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that are not
            present in the input candset.
        AssertionError: If `attrs_after` has attributes that are not present
            in the input candset.
        AssertionError: If `feature_table` is set to None.
    """
    # Validate input parameters
    #
    # We expect the input candset to be of type pandas DataFrame.
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # If attrs_before is given, check that they are present in the candset.
    # (FIX: `is not None` replaces the non-idiomatic `!= None` comparison)
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # If attrs_after is given, check that they are present in the candset.
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    #
    # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features
    #
    # Set the tables' indexes to their keys for fast row lookup by id.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(candset))

    # Apply feature functions
    feat_vals = []
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    # Cache looked-up rows so repeated ids hit the dict, not the DataFrame.
    l_dict = {}
    r_dict = {}

    for row in candset.itertuples(index=False):
        if show_progress:
            prog_bar.update()

        fk_ltable_val = row[fk_ltable_idx]
        fk_rtable_val = row[fk_rtable_idx]

        if fk_ltable_val not in l_dict:
            # FIX: use .loc instead of the deprecated .ix indexer, which
            # was removed from pandas.
            l_dict[fk_ltable_val] = l_df.loc[fk_ltable_val]
        l_tuple = l_dict[fk_ltable_val]

        if fk_rtable_val not in r_dict:
            r_dict[fk_rtable_val] = r_df.loc[fk_rtable_val]
        r_tuple = r_dict[fk_rtable_val]

        f = apply_feat_fns(l_tuple, r_tuple, feature_table)
        feat_vals.append(f)

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # Insert attrs_before (reversed, so inserting each at position 0
    # preserves the caller's ordering)
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # Insert keys; the key ends up first, then fk_ltable, then fk_rtable.
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # Insert attrs_after at the end of the table
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def _validate_metadata_for_candset(candset, key, foreign_key_ltable,
                                   foreign_key_rtable,
                                   ltable, rtable, ltable_key, rtable_key,
                                   lgr, verbose):
    """
    Validates metadata for a candidate set.

    Checks that the candset, ltable and rtable are pandas DataFrames, that
    the key / foreign-key columns exist, that the candset is a valid table
    with respect to its key, and that both foreign-key constraints hold.

    Args:
        candset (DataFrame): The candidate set to validate.
        key (string): Key attribute of the candidate set.
        foreign_key_ltable (string): Foreign key attribute referring to ltable.
        foreign_key_rtable (string): Foreign key attribute referring to rtable.
        ltable (DataFrame): Left input table.
        rtable (DataFrame): Right input table.
        ltable_key (string): Key attribute of ltable.
        rtable_key (string): Key attribute of rtable.
        lgr (Logger): Logger used for progress messages.
        verbose (boolean): Flag controlling debug output.

    Returns:
        True, if all validations pass.

    Raises:
        AssertionError: If any of the tables is not a pandas DataFrame, or if
            a foreign key constraint is violated.
        KeyError: If a key or foreign key attribute is missing from its table.
    """
    # Validate input parameters
    # # We expect candset to be of type pandas DataFrame
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input candset is not of type pandas DataFrame')
        raise AssertionError('Input candset is not of type pandas DataFrame')

    # Check if the key column is present in the candset
    if not ch.check_attrs_present(candset, key):
        logger.error('Input key ( %s ) not in the DataFrame' % key)
        raise KeyError('Input key ( %s ) not in the DataFrame' % key)

    # Check if the foreign key ltable column is present in the candset
    if not ch.check_attrs_present(candset, foreign_key_ltable):
        logger.error('Input foreign_key_ltable ( %s ) not in the DataFrame'
                     % foreign_key_ltable)
        raise KeyError('Input foreign_key_ltable ( %s ) not in the DataFrame'
                       % foreign_key_ltable)

    # Check if the foreign key rtable column is present in the candset
    if not ch.check_attrs_present(candset, foreign_key_rtable):
        logger.error('Input fk_rtable ( %s ) not in the DataFrame'
                     % foreign_key_rtable)
        raise KeyError('Input fk_rtable ( %s ) not in the DataFrame'
                       % foreign_key_rtable)

    # We expect the ltable to be of type pandas DataFrame
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input ltable is not of type pandas data frame')
        raise AssertionError('Input ltable is not of type pandas data frame')

    # We expect the rtable to be of type pandas DataFrame
    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input rtable is not of type pandas data frame')
        raise AssertionError('Input rtable is not of type pandas data frame')

    # We expect the ltable key to be present in the ltable
    if not ch.check_attrs_present(ltable, ltable_key):
        logger.error('ltable key ( %s ) not in ltable' % ltable_key)
        raise KeyError('ltable key ( %s ) not in ltable' % ltable_key)

    # We expect the rtable key to be present in the rtable
    if not ch.check_attrs_present(rtable, rtable_key):
        logger.error('rtable key ( %s ) not in rtable' % rtable_key)
        raise KeyError('rtable key ( %s ) not in rtable' % rtable_key)

    # First validate metadata for the candidate set (as a table)
    _validate_metadata_for_table(candset, key, 'candset', lgr, verbose)

    # Second, check both foreign key constraints. Note: the log messages are
    # now interleaved with the checks they describe (previously both checks
    # ran before the right-table message was logged, which was misleading).
    ch.log_info(lgr, 'Validating foreign key constraint for left table',
                verbose)
    if not ch.check_fk_constraint(candset, foreign_key_ltable,
                                  ltable, ltable_key):
        logger.error('Candset does not satisfy foreign key constraint with '
                     'the left table')
        raise AssertionError(
            'Candset does not satisfy foreign key constraint with '
            'the left table')
    ch.log_info(lgr, '..... Done', verbose)

    ch.log_info(lgr, 'Validating foreign key constraint for right table',
                verbose)
    if not ch.check_fk_constraint(candset, foreign_key_rtable,
                                  rtable, rtable_key):
        logger.error('Candset does not satisfy foreign key constraint with '
                     'the right table')
        raise AssertionError(
            'Candset does not satisfy foreign key constraint with '
            'the right table')
    ch.log_info(lgr, '..... Done', verbose)

    return True
def impute_table(table, exclude_attrs=None, missing_val='NaN',
                 strategy='mean', axis=0, val_all_nans=0, verbose=True):
    """
    Impute table containing missing values.

    Args:
        table (DataFrame): DataFrame which values should be imputed.
        exclude_attrs (List) : list of attribute names to be excluded from
            imputing (defaults to None).
        missing_val (string or int):  The placeholder for the missing values.
            All occurrences of `missing_values` will be imputed.
            For missing values encoded as np.nan, use the string value 'NaN'
            (defaults to 'NaN').
        strategy (string): String that specifies on how to impute values. Valid
            strings: 'mean', 'median', 'most_frequent' (defaults to 'mean').
        axis (int):  axis=1 along rows, and axis=0 along columns  (defaults
            to 0).
        val_all_nans (float): Value to fill in if all the values in the column
            are NaN.

    Returns:
        Imputed DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the attributes in `exclude_attrs` are not present
            in the input table.

    Examples:
        >>> import py_entitymatching as em
        >>> # H is the feature vector which should be imputed. Specifically,
        >>> # impute the missing values in each column, with the mean of that
        >>> # column
        >>> H = em.impute_table(H,
                exclude_attrs=['_id', 'ltable_id', 'rtable_id'],
                strategy='mean')
    """
    # Validate input paramaters
    # # We expect the input table to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Validate metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    fv_columns = table.columns

    if exclude_attrs is None:
        # Nothing excluded: impute every column.
        feature_names = fv_columns
    else:
        # Check if the exclude attributes are present in the input table
        if not ch.check_attrs_present(table, exclude_attrs):
            logger.error('The attributes mentioned in exclude_attrs '
                         'is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in exclude_attrs '
                                 'is not present '
                                 'in the input table')
        # We expect exclude attributes to be of type list. If not, convert it
        # into a list.
        if not isinstance(exclude_attrs, list):
            exclude_attrs = [exclude_attrs]

        # Drop the duplicates from the exclude attributes
        exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

        # Keep only the columns that were not excluded.
        cols = [c not in exclude_attrs for c in fv_columns]
        feature_names = fv_columns[cols]

    # Work on a copy so the input table is left untouched.
    table_copy = table.copy()
    projected_table = table_copy[feature_names]

    # Fit the imputer on the projected values, then patch columns that were
    # entirely NaN (their fitted statistic is NaN) with `val_all_nans` so the
    # transform does not drop them.
    projected_table_values = projected_table.values
    imp = Imputer(missing_values=missing_val, strategy=strategy, axis=axis)
    imp.fit(projected_table_values)
    imp.statistics_[pd.np.isnan(imp.statistics_)] = val_all_nans
    projected_table_values = imp.transform(projected_table_values)
    table_copy[feature_names] = projected_table_values

    # Update catalog
    cm.init_properties(table_copy)
    cm.copy_properties(table, table_copy)

    return table_copy
def dask_extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                              attrs_after=None, verbose=False,
                              show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature
    table, ltable and rtable (that is present in the `candset`'s
    metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set. If
            it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attribtues that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.
        AssertionError: If `n_chunks` is not of type int.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = dask_extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
    )

    # Validate input parameters
    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If the attrs_before is given, check if the attrs_before are present in
    # the input candset. (Use identity comparison with None, not `!=`.)
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If the attrs_after is given, check if the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features
    # # Set index on the key columns for O(1) tuple lookup during feature
    # computation; drop=False keeps the key available as a regular column.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    n_chunks = get_num_partitions(n_chunks, len(candset))

    c_splits = np.array_split(candset, n_chunks)

    # The feature table is pickled once and shipped to each dask worker.
    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = []

    for i in range(len(c_splits)):
        partial_result = delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_splits[i], False)
        feat_vals_by_splits.append(partial_result)

    feat_vals_by_splits = delayed(wrap)(feat_vals_by_splits)
    if show_progress:
        with ProgressBar():
            feat_vals_by_splits = feat_vals_by_splits.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        feat_vals_by_splits = feat_vals_by_splits.compute(
            scheduler="processes", num_workers=get_num_cores())

    # Flatten the list of per-split results into one list of feature dicts.
    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before (reversed, so that repeated insert-at-0 preserves
    # the caller's ordering).
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys so the final column order is: key, fk_ltable, fk_rtable.
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def test_check_attrs_invalid_None(self):
    # check_attrs_present must report False when the attribute list is None.
    df = pd.read_csv(path_a)
    result = ch.check_attrs_present(df, None)
    self.assertEqual(result, False)
def test_check_attrs_present_invalid_df(self):
    # Exercise check_attrs_present with None as the DataFrame.
    # NOTE(review): there is no assertion here — presumably the test relies on
    # an expected-exception decorator (e.g. @raises(AssertionError)) that is
    # outside the visible chunk; confirm against the full test file.
    ch.check_attrs_present(None, 'ID')
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1,
                         FeatureExtractor=ParallelFeatureExtractor):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature
    table, ltable and rtable (that is present in the `candset`'s
    metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_jobs (int): The number of parallel jobs used by the feature
            extractor (defaults to 1).
        FeatureExtractor (class): The feature extractor class to instantiate;
            must be a subclass of BaseFeatureExtractor (defaults to
            ParallelFeatureExtractor).

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attribtues that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    # Stage 1: input validation
    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # We expect the FeatureExtractor class to be of type BaseFeatureExtractor
    validate_subclass(FeatureExtractor, BaseFeatureExtractor,
                      error_prefix='Input FeatureExtractor')

    # The two blocks below make sure that attributes which are to be appended
    # to this function's output do in fact exist in the input DataFrame.
    # (Use identity comparison with None, not `!=`.)
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Stage 2: apply feature functions via the (possibly parallel) extractor.
    feature_extractor = FeatureExtractor(
        feature_table,
        n_jobs=n_jobs,
        verbose=verbose,
        show_progress=show_progress
    )
    feat_vals = feature_extractor.extract_from(candset)

    # Stage 3: format the output table.
    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before (reversed, so that repeated insert-at-0 preserves
    # the caller's ordering).
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys so the final column order is: key, fk_ltable, fk_rtable.
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def _vis_debug_rf(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Random Forest matcher visually.

    Fits the given matcher on `train`, predicts on `test`, evaluates the
    predictions against `target_attr`, and (optionally) opens a Qt window
    showing the metrics plus the false-positive / false-negative tuples.
    """
    # PyQt5 is an optional dependency; import lazily and fail with a clear
    # message if it is not installed.
    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError('PyQt5 is not installed. Please install PyQt5 to use '
                          'GUI related functions in py_entitymatching.')

    # Validate the input parameters
    # # We expect the matcher to be of type RfMatcher
    if not isinstance(matcher, RFMatcher):
        logger.error('Input matcher is not of type '
                     'Random Forest matcher')
        raise AssertionError('Input matcher is not of type '
                             'Random Forest matcher')

    # # We expect the target attribute to be of type string.
    validate_object_type(target_attr, six.string_types,
                         error_prefix='Target attribute')

    # # Check whether the exclude attributes are indeed present in the train
    # DataFrame.
    if not check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the train table columns')

    # # Check whether the target attribute is indeed present in the train
    # DataFrame.
    if not check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the train table columns')

    # # Check whether the exclude attributes are indeed present in the test
    # DataFrame.
    if not check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the test table columns')

    # The exclude attributes is expected to be of type list, if not
    # explicitly convert this into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes (it must not be used
    # as a predictor).
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a column name to store the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data; append=True adds the prediction column,
    # inplace=False leaves the caller's test table untouched.
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = em.eval_matches(predicted, target_attr, predict_attr_name)

    # Reuse the existing Qt application instance if one exists; a process may
    # only ever create a single QApplication.
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    # Get the main window application
    app = em._viewapp
    m = MainWindowManager(matcher, "rf", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)

    # If the show window is true, then display the window (blocks until the
    # Qt event loop exits).
    if show_window:
        m.show()
        app.exec_()
def test_check_attrs_present_valid_1(self):
    # A single existing column name must be reported as present.
    df = pd.read_csv(path_a)
    result = ch.check_attrs_present(df, 'ID')
    self.assertEqual(result, True)
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature
    table, ltable and rtable (that is present in the `candset`'s
    metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_jobs (int): The number of parallel processes to use (defaults to 1).

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attribtues that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    # Validate input parameters
    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If the attrs_before is given, check if the attrs_before are present in
    # the input candset. (Use identity comparison with None, not `!=`.)
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If the attrs_after is given, check if the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features
    # # Set index on the key columns for O(1) tuple lookup during feature
    # computation; drop=False keeps the key available as a regular column.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    n_procs = get_num_procs(n_jobs, len(candset))

    # NOTE(review): pd.np is deprecated in newer pandas; migrating to a direct
    # numpy import would require touching this module's import block.
    c_splits = pd.np.array_split(candset, n_procs)

    # The feature table is pickled once and shipped to each worker process.
    pickled_obj = cloudpickle.dumps(feature_table)

    # Only the last split reports progress, to avoid interleaved progress bars
    # from multiple worker processes.
    feat_vals_by_splits = Parallel(n_jobs=n_procs)(
        delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_splits[i], show_progress and i == len(c_splits) - 1)
        for i in range(len(c_splits)))

    # Flatten the list of per-split results into one list of feature dicts.
    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before (reversed, so that repeated insert-at-0 preserves
    # the caller's ordering).
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys so the final column order is: key, fk_ltable, fk_rtable.
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def test_check_attrs_present_valid_3(self):
    # A list containing a non-existent column name must be reported as absent.
    df = pd.read_csv(path_a)
    result = ch.check_attrs_present(df, ['_ID'])
    self.assertEqual(result, False)