Example #1
def _debug_decisiontree_matcher(decision_tree,
                                tuple_1,
                                tuple_2,
                                feature_table,
                                table_columns,
                                exclude_attrs,
                                ensemble_flag=False):
    """
    This function is used to print the debug information for decision tree
    and random forest matcher.
    """
    # Get the classifier from the input object.
    if isinstance(decision_tree, DTMatcher):
        clf = decision_tree.clf
    else:
        clf = decision_tree

    # Based on the exclude attributes, derive the feature names.
    if exclude_attrs is None:
        feature_names = table_columns
    else:
        cols = [c not in exclude_attrs for c in table_columns]
        feature_names = table_columns[cols]

    # Get the python code based on the classifier, feature names and the
    # boolean results.
    code = _get_code(clf, feature_names, ['False', 'True'])
    # Apply feature functions to get feature vectors.
    feature_vectors = apply_feat_fns(tuple_1, tuple_2, feature_table)

    # Wrap the code in a function.
    code = _get_dbg_fn(code)

    # Initialize a dictionary with the given feature vectors. This is
    # important because the code must be linked with the values in the
    # feature vectors.
    code_dict = {}
    code_dict.update(feature_vectors)
    six.exec_(code, code_dict)
    ret_val = code_dict['debug_fn']()
    # Based on the ensemble flag, indent the output (as in RF, we need to
    # indent it a bit further right).
    if ensemble_flag is True:
        spacer = "    "
    else:
        spacer = ""

    # Further, if the ensemble flag is True, then print the prob. for match
    # and non-matches.
    if ensemble_flag is True:
        p = _get_prob(clf, tuple_1, tuple_2, feature_table, feature_names)
        print(spacer + "Prob. for non-match : " + str(p[0]))
        print(spacer + "Prob for match : " + str(p[1]))
        return p
    else:
        # Else, just print the match status.
        print(spacer + "Match status : " + str(ret_val))
Example #3
def _get_prob(clf, t1, t2, feature_table, feature_names):
    """
    Get the probability of the match status.
    """
    # Get the feature vectors from the feature table and the input tuples.
    feat_values = apply_feat_fns(t1, t2, feature_table)
    feat_values = pd.Series(feat_values)
    feat_values = feat_values[feature_names]
    v = feat_values.values
    v = v.reshape(1, -1)
    # Use the classifier to predict the probability.
    p = clf.predict_proba(v)
    return p[0]
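A short runnable illustration of the predict_proba step above, using a toy scikit-learn tree. The feature names and values are made up for the example; in the real code they come from apply_feat_fns and feature_names.

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

feature_names = ['name_sim', 'addr_sim']

# Toy training data and labels (1 = match, 0 = non-match).
X = np.array([[0.9, 0.7], [0.2, 0.1], [0.8, 0.9], [0.1, 0.3]])
y = np.array([1, 0, 1, 0])
clf = DecisionTreeClassifier(random_state=0).fit(X, y)

# Feature values for one tuple pair, keyed by name as apply_feat_fns would
# return them, then selected/ordered by feature_names and reshaped to the
# single-row 2-D array that predict_proba expects.
feat_values = pd.Series({'addr_sim': 0.6, 'name_sim': 0.85})
v = feat_values[feature_names].values.reshape(1, -1)

p = clf.predict_proba(v)[0]
# Column order follows clf.classes_, so p[0] is the probability of class 0
# (non-match) and p[1] of class 1 (match).
print("Prob. for non-match : " + str(p[0]))
print("Prob. for match : " + str(p[1]))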