def _debug_decisiontree_matcher(decision_tree, tuple_1, tuple_2,
                                feature_table, table_columns,
                                exclude_attrs, ensemble_flag=False):
    """
    This function is used to print the debug information for the decision
    tree and random forest matchers.
    """
    # Get the classifier from the input object.
    if isinstance(decision_tree, DTMatcher):
        clf = decision_tree.clf
    else:
        clf = decision_tree

    # Based on the exclude attributes, derive the feature names.
    if exclude_attrs is None:
        feature_names = table_columns
    else:
        cols = [c not in exclude_attrs for c in table_columns]
        feature_names = table_columns[cols]

    # Get the Python code based on the classifier, feature names and the
    # boolean results.
    code = _get_code(clf, feature_names, ['False', 'True'])

    # Apply feature functions to get feature vectors.
    feature_vectors = apply_feat_fns(tuple_1, tuple_2, feature_table)

    # Wrap the code in a function.
    code = _get_dbg_fn(code)

    # Initialize a dictionary with the given feature vectors. This is
    # important because the code must be linked with the values in the
    # feature vectors.
    code_dict = {}
    code_dict.update(feature_vectors)
    six.exec_(code, code_dict)
    ret_val = code_dict['debug_fn']()

    # Based on the ensemble flag, indent the output (as in RF, we need to
    # indent it a bit further right).
    if ensemble_flag is True:
        spacer = " "
    else:
        spacer = ""

    # Further, if the ensemble flag is True, then print the probabilities
    # for match and non-match.
    if ensemble_flag is True:
        p = _get_prob(clf, tuple_1, tuple_2, feature_table, feature_names)
        print(spacer + "Prob. for non-match : " + str(p[0]))
        print(spacer + "Prob. for match : " + str(p[1]))
        return p
    else:
        # Else, just print the match status.
        print(spacer + "Match status : " + str(ret_val))
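# A minimal usage sketch for _debug_decisiontree_matcher. The names below
# (A, B, dt, feature_table, feature_vectors, and the exclude_attrs values)
# are hypothetical and assume a fitted DTMatcher plus the module-level
# imports; this is an illustration, not part of the module's API.
#
#   ltuple = A.loc[1]
#   rtuple = B.loc[2]
#   _debug_decisiontree_matcher(dt, ltuple, rtuple, feature_table,
#                               feature_vectors.columns,
#                               exclude_attrs=['_id', 'ltable_id',
#                                              'rtable_id', 'label'])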
def _get_prob(clf, t1, t2, feature_table, feature_names):
    """
    Get the probability of the match status.
    """
    # Get the feature vectors from the feature table and the input tuples.
    feat_values = apply_feat_fns(t1, t2, feature_table)
    feat_values = pd.Series(feat_values)

    # Keep only the features used by the classifier, in the same order.
    feat_values = feat_values[feature_names]
    v = feat_values.values
    v = v.reshape(1, -1)

    # Use the classifier to predict the probability.
    p = clf.predict_proba(v)
    return p[0]
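# A minimal sketch of calling _get_prob directly (hypothetical names; in the
# module it is invoked from _debug_decisiontree_matcher when
# ensemble_flag=True). The returned array follows the classifier's class
# order, assumed here to be [non-match, match]:
#
#   probs = _get_prob(dt.clf, ltuple, rtuple, feature_table, feature_names)
#   print('Prob. for non-match:', probs[0], 'Prob. for match:', probs[1])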