def get_false_negatives_as_df(table, eval_summary, verbose=False):
    """
    Select only the false negatives from the input table and return as a
    DataFrame based on the evaluation results.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was used
            for evaluation.
        eval_summary (dictionary): A Python dictionary containing evaluation
            results, typically from 'eval_matches' command.
        verbose (boolean): A flag to indicate whether status messages should
            be logged.

    Returns:
        A pandas DataFrame containing only the false negatives from
        the input table.

        Further, this function sets the output DataFrame's properties same as
        input DataFrame.
    """
    # Validate input parameters
    # # We expect the input candset to be of type pandas DataFrame. Use the
    # shared validation helper (consistent with the other commands in this
    # file); it logs an error and raises AssertionError on failure, matching
    # the inline isinstance check it replaces.
    validate_object_type(table, pd.DataFrame, error_prefix='Input cand.set')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from the catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Select only the rows flagged as false negatives by the evaluation.
    data_frame = _get_dataframe(table, eval_summary['false_neg_ls'])

    # # Update catalog: the output inherits the input table's properties.
    ch.log_info(logger, 'Updating catalog', verbose)
    cm.init_properties(data_frame)
    cm.copy_properties(table, data_frame)

    # # Return the result (the original comment here wrongly said
    # 'Update catalog').
    ch.log_info(logger, 'Returning the dataframe', verbose)
    return data_frame
def _vis_debug_rf(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Random Forest matcher visually.
    """
    # GUI support is optional; fail with a clear message if PyQt5 is absent.
    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError('PyQt5 is not installed. Please install PyQt5 to use '
                          'GUI related functions in py_entitymatching.')

    # --- Input validation ---
    # The matcher handed in must be a Random Forest matcher.
    if not isinstance(matcher, RFMatcher):
        logger.error('Input matcher is not of type Random Forest matcher')
        raise AssertionError('Input matcher is not of type '
                             'Random Forest matcher')

    # The target attribute must be a string.
    validate_object_type(target_attr, six.string_types,
                         error_prefix='Target attribute')

    # The exclude attributes and target attribute must exist in the train
    # table, and the exclude attributes must exist in the test table too.
    if not check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the train table columns')
    if not check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the train table columns')
    if not check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the test table columns')

    # --- Normalize the exclude attributes ---
    # Coerce to a list, drop duplicates, and make sure the target attribute
    # itself is never used as a feature.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]
    exclude_attrs = list_drop_duplicates(exclude_attrs)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # --- Fit and predict ---
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)
    # Pick a fresh column name to hold the predictions.
    prediction_column = get_name_for_predict_column(test.columns)
    predictions = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                  target_attr=prediction_column,
                                  append=True, inplace=False)

    # Summarize how the predictions compare against the gold labels.
    evaluation = em.eval_matches(predictions, target_attr, prediction_column)

    # Reuse an existing Qt application instance if one is around; otherwise
    # create one.
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])

    # Pull out a displayable metric plus the false positives/negatives.
    metric = _get_metric(evaluation)
    false_pos_df = _get_dataframe(predictions, evaluation['false_pos_ls'])
    false_neg_df = _get_dataframe(predictions, evaluation['false_neg_ls'])

    # Build the debugger main window and, if requested, show it and start the
    # Qt event loop.
    qt_app = em._viewapp
    window = MainWindowManager(matcher, "rf", exclude_attrs, metric,
                               predictions, false_pos_df, false_neg_df)
    if show_window:
        window.show()
        qt_app.exec_()
def _vis_debug_dt(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Decision Tree matcher visually.

    (The original docstring wrongly said 'Random Forest matcher'.)

    Args:
        matcher (DTMatcher): The Decision Tree matcher to debug.
        train (DataFrame): The table used to fit the matcher.
        test (DataFrame): The table used to predict and evaluate.
        exclude_attrs (list or string): Attribute(s) to exclude from the
            feature set.
        target_attr (string): The attribute holding the gold labels.
        show_window (boolean): Whether to display the debug GUI window.

    Raises:
        ImportError: If PyQt5 is not installed.
        AssertionError: If input validation fails.
    """
    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError(
            'PyQt5 is not installed. Please install PyQt5 to use '
            'GUI related functions in py_entitymatching.')

    # Validate the input parameters
    # # We expect the matcher to be of type DTMatcher
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError('Input matcher is not of type '
                             'Decision Tree matcher')

    # # We expect the target attribute to be of type string.
    validate_object_type(target_attr, six.string_types,
                         error_prefix='Target attribute')

    # # Check whether the exclude attributes are indeed present in the train
    # DataFrame.
    if not ch.check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'train table columns')

    # # Check whether the target attribute is indeed present in the train
    # DataFrame.
    if not ch.check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the '
                             'train table columns')

    # # Check whether the exclude attributes are indeed present in the test
    # DataFrame.
    if not ch.check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'test table columns')

    # The exclude attributes are expected to be of type list; if not,
    # explicitly convert them into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a column name to store the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = eval_matches(predicted, target_attr, predict_attr_name)

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    # Reuse an existing Qt application instance if available, else create one.
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])

    # Get the main window application. (The original code assigned
    # 'app = em._viewapp' twice; the redundant duplicate is removed.)
    app = em._viewapp
    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)

    # If the show window is true, then display the window.
    if show_window:
        m.show()
        app.exec_()
def get_false_negatives_as_df(table, eval_summary, verbose=False):
    """
    Select only the false negatives from the input table and return as a
    DataFrame based on the evaluation results.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was used for
            evaluation.
        eval_summary (dictionary): A Python dictionary containing evaluation
            results, typically from 'eval_matches' command.

    Returns:
        A pandas DataFrame containing only the false negatives from
        the input table.

        Further, this function sets the output DataFrame's properties same as
        input DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(G, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
        >>> false_neg_df = em.get_false_negatives_as_df(H, eval_summary)
    """
    # The input candidate set must be a pandas DataFrame.
    validate_object_type(table, pd.DataFrame, error_prefix='Input cand.set')

    # Tell the user which catalog metadata this command relies on.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # Fetch the candidate-set metadata from the catalog ...
    ch.log_info(logger, 'Getting metadata from the catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # ... and verify that it is consistent.
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Keep just the rows flagged as false negatives by the evaluation.
    false_negatives = _get_dataframe(table, eval_summary['false_neg_ls'])

    # Propagate the input table's catalog properties to the result.
    ch.log_info(logger, 'Updating catalog', verbose)
    cm.init_properties(false_negatives)
    cm.copy_properties(table, false_negatives)

    ch.log_info(logger, 'Returning the dataframe', verbose)
    return false_negatives