def add_output_attributes(candset, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_',
                          r_output_prefix='rtable_', validate=True, copy_props=True,
                          delete_from_catalog=True, verbose=False):

    if not isinstance(candset, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(candset, logger, verbose)
    if validate:
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
                                          logger, verbose)
    index_values = candset.index

    df = _add_output_attributes(candset, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
                                l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix,
                                validate=False)

    df.set_index(index_values, inplace=True)
    if copy_props:
        cm.init_properties(df)
        cm.copy_properties(candset, df)
        if delete_from_catalog:
            cm.del_all_properties(candset)
    return df
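
The init_properties/copy_properties pair above is the general pattern py_entitymatching uses to carry catalog metadata onto a derived DataFrame. A minimal standalone sketch of that pattern (hypothetical CSV path and column names; the catalog manager module is imported under the `cm` alias used throughout these examples):

import py_entitymatching as em
import py_entitymatching.catalog.catalog_manager as cm

# Hypothetical file with an 'ID' key column and a 'name' attribute.
A = em.read_csv_metadata('person.csv', key='ID')

# A plain pandas projection; the catalog knows nothing about it yet.
A_proj = A[['ID', 'name']].copy()

# Register the new DataFrame and copy A's metadata over to it.
cm.init_properties(A_proj)
cm.copy_properties(A, A_proj)

print(cm.get_key(A_proj))   # 'ID', the same key as A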
Example 2
def _post_process_labelled_table(input_table, labeled_table, col_name):
    """
    This function post-processes the labeled table and updates the catalog.
    Specifically, it validates that the label column contains only 0s and 1s,
    and finally copies the properties from the input table to the
    output table.
    """
    # Cast the label values to int, as they are initially strings when they
    # come from the GUI
    labeled_table[col_name] = labeled_table[col_name].astype(int)

    # Check if the table contains only 0s and 1s
    label_value_with_1 = labeled_table[col_name] == 1
    label_value_with_0 = labeled_table[col_name] == 0
    sum_of_labels = sum(label_value_with_1 | label_value_with_0)

    # If they contain column values other than 0 and 1, raise an error
    if not sum_of_labels == len(labeled_table):
        logger.error('The label column contains values other than 0 and 1')
        raise AssertionError(
            'The label column contains values other than 0 and 1')

    # Copy the properties from the input table to the label table.
    # Note: We don't have to check the integrity of 'key' here because the
    # key column is not modified relative to the input table.
    cm.init_properties(labeled_table)
    cm.copy_properties(input_table, labeled_table)

    # Return the label table
    return labeled_table
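
A standalone sketch of the 0/1 validation step on a plain DataFrame, using isin as an equivalent of the mask-and-sum check above (hypothetical column names):

import pandas as pd

labeled = pd.DataFrame({'_id': [0, 1, 2], 'label': ['1', '0', '1']})

# Labels arrive as strings from the GUI, so cast them to int first.
labeled['label'] = labeled['label'].astype(int)

# Every value must be 0 or 1, otherwise refuse the table.
if not labeled['label'].isin([0, 1]).all():
    raise AssertionError('The label column contains values other than 0 and 1')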
Example 5
    def test_eval_matches_valid_3(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
        num_ones = len(C1)
        num_zeros = len(C1) - num_ones
        gold = [0] * num_ones
        # gold.extend([1]*num_zeros)
        predicted = [1] * (num_zeros + num_ones)

        ln = len(C1.columns)
        C1.insert(ln, 'gold', gold)
        C1.insert(ln + 1, 'predicted', predicted)
        D = pd.DataFrame(columns=C1.columns)
        cm.copy_properties(C, D)
        result = eval_matches(D, 'gold', 'predicted')

        self.assertEqual(isinstance(result, dict), True)
        self.assertEqual(result['prec_numerator'], 0)
        self.assertEqual(result['prec_denominator'], 0)
        self.assertAlmostEqual(result['precision'], 0)
        self.assertEqual(result['recall_numerator'], 0)
        self.assertEqual(result['recall_denominator'], 0)
        self.assertEqual(result['recall'], 0)
        self.assertEqual(result['f1'], 0)
        self.assertEqual(result['pred_pos_num'], 0)
        self.assertEqual(result['false_pos_num'], 0.0)
        self.assertEqual(len(result['false_pos_ls']), 0)
        self.assertEqual(result['pred_neg_num'], 0)
        self.assertEqual(result['false_neg_num'], 0.0)
        self.assertEqual(len(result['false_neg_ls']), 0)
Example 6
    def test_eval_matches_valid_2(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
        num_ones = 1
        num_zeros = len(C1) - num_ones
        gold = [0] * num_ones
        gold.extend([1] * num_zeros)
        predicted = [1] * (num_zeros + num_ones)

        ln = len(C1.columns)
        C1.insert(ln, 'gold', gold)
        C1.insert(ln + 1, 'predicted', predicted)
        cm.copy_properties(C, C1)

        result = eval_matches(C1, 'predicted', 'gold')
        self.assertEqual(isinstance(result, dict), True)
        self.assertEqual(result['prec_numerator'], 14)
        self.assertEqual(result['prec_denominator'], 14)
        self.assertAlmostEqual(result['precision'], 1)
        self.assertEqual(result['recall_numerator'], 14)
        self.assertEqual(result['recall_denominator'], 15)
        self.assertEqual(result['recall'], 0.9333333333333333)
        self.assertEqual(result['f1'], 0.9655172413793104)
        self.assertEqual(result['pred_pos_num'], 14)
        self.assertEqual(result['false_pos_num'], 0.0)
        self.assertEqual(len(result['false_pos_ls']), 0)
        self.assertEqual(result['pred_neg_num'], 1)
        self.assertEqual(result['false_neg_num'], 1.0)
        self.assertEqual(len(result['false_neg_ls']), 1)
        t = result['false_neg_ls'][0]
        self.assertEqual(t[0], 'a1')
        self.assertEqual(t[1], 'b1')
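
A self-contained sketch of calling eval_matches outside the test fixtures: build a tiny candidate set in memory, register the metadata that eval_matches looks up, and read a few entries from the summary dictionary. The tables and labels are made up, and the catalog setters (set_key, set_ltable, set_rtable, set_fk_ltable, set_fk_rtable) are assumed to behave as in the library version shown here:

import pandas as pd
import py_entitymatching as em
import py_entitymatching.catalog.catalog_manager as cm

A = pd.DataFrame({'ID': ['a1', 'a2'], 'name': ['x', 'y']})
B = pd.DataFrame({'ID': ['b1', 'b2'], 'name': ['x', 'z']})
cm.set_key(A, 'ID')
cm.set_key(B, 'ID')

C = pd.DataFrame({'_id': [0, 1],
                  'ltable_ID': ['a1', 'a2'],
                  'rtable_ID': ['b1', 'b2'],
                  'gold': [1, 0],
                  'predicted': [1, 1]})
cm.set_key(C, '_id')
cm.set_ltable(C, A)
cm.set_rtable(C, B)
cm.set_fk_ltable(C, 'ltable_ID')
cm.set_fk_rtable(C, 'rtable_ID')

result = em.eval_matches(C, 'gold', 'predicted')
print(result['precision'], result['recall'], result['f1'])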
Example 10
def rename_col(df, old_col_name, new_col_name):
    new_df = df.rename(columns={old_col_name: new_col_name})

    if cm.is_dfinfo_present(df):
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)

        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
                    cm.get_metadata_for_candset(df, logger, False)
                if key == old_col_name:
                    cm.set_key(new_df, new_col_name)
                elif fk_ltable == old_col_name:
                    cm.set_fk_ltable(new_df, new_col_name)
                elif fk_rtable == old_col_name:
                    cm.set_fk_rtable(new_df, new_col_name)
                else:
                    pass
            else:
                key = cm.get_key(df)
                if key == old_col_name:
                    cm.set_key(new_df, new_col_name)

    return new_df
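
rename_col above is essentially pandas rename plus a catalog copy, followed by repointing the key (or foreign key) property when that column was renamed. A minimal sketch of the same sequence for a key column (hypothetical table):

import pandas as pd
import py_entitymatching.catalog.catalog_manager as cm

A = pd.DataFrame({'ID': ['a1', 'a2'], 'name': ['x', 'y']})
cm.set_key(A, 'ID')

A_new = A.rename(columns={'ID': 'person_id'})
cm.init_properties(A_new)
cm.copy_properties(A, A_new)
cm.set_key(A_new, 'person_id')   # repoint the key, as rename_col does

print(cm.get_key(A_new))         # 'person_id'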
Example 11
def get_false_negatives_as_df(table, eval_summary, verbose=False):

    """
    Select only the false negatives from the input table and return as a
    DataFrame based on the evaluation results.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was used for
            evaluation.
        eval_summary (dictionary): A Python dictionary containing evaluation
            results, typically from 'eval_matches' command.

    Returns:
        A pandas DataFrame containing only the false negatives from
        the input table.

        Further, this function sets the output DataFrame's properties to be
        the same as the input DataFrame's.

    """
    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from the catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
        table, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    data_frame = _get_dataframe(table, eval_summary['false_neg_ls'])

    # # Update catalog
    ch.log_info(logger, 'Updating catalog', verbose)

    cm.init_properties(data_frame)
    cm.copy_properties(table, data_frame)

    # # Return the DataFrame
    ch.log_info(logger, 'Returning the dataframe', verbose)

    return data_frame
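
The helper _get_dataframe is not shown on this page; a plain-pandas sketch of the row selection it implies, matching (fk_ltable, fk_rtable) pairs against eval_summary['false_neg_ls'] (hypothetical data):

import pandas as pd

C = pd.DataFrame({'_id': [0, 1, 2],
                  'ltable_ID': ['a1', 'a2', 'a3'],
                  'rtable_ID': ['b1', 'b2', 'b3'],
                  'predicted': [0, 1, 0]})
false_neg_ls = [('a1', 'b1'), ('a3', 'b3')]

pairs = set(false_neg_ls)
mask = [pair in pairs for pair in zip(C['ltable_ID'], C['rtable_ID'])]
false_neg_df = C[mask]
print(false_neg_df)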
Example 12
    def test_copy_properties_update_false_2(self):
        A = read_csv_metadata(path_a)
        A1 = pd.read_csv(path_a)
        cm.copy_properties(A, A1, replace=False)
        p = cm.get_all_properties(A)
        p1 = cm.get_all_properties(A1)
        self.assertEqual(p, p1)
        self.assertEqual(cm.get_key(A1), cm.get_key(A))
Example 13
    def _predict(self,
                 x=None,
                 table=None,
                 exclude_attrs=None,
                 target_attr=None,
                 append=False,
                 return_probs=False,
                 probs_attr=None,
                 inplace=True,
                 copy_props=True):
        """
            Delegated function from predict.
        """
        # If x is not none, call the predict method that mimics sk-learn
        # predict method.
        if x is not None:
            y = self._predict_sklearn(x, return_prob=return_probs)
        # If the input table and the exclude attributes are not None,
        # then call the appropriate predict method.
        elif table is not None and exclude_attrs is not None:
            y = self._predict_ex_attrs(table,
                                       exclude_attrs,
                                       return_prob=return_probs)
            # If the append is True, update the table
            if target_attr is not None and append is True:
                # If inplace is True, then update the input table.
                if inplace:
                    if return_probs:
                        table[target_attr] = y[0]
                        table[probs_attr] = y[1]
                        # Return the updated table
                        return table
                    else:
                        # Return the updated table
                        table[target_attr] = y
                        return table
                else:
                    # else, create a copy and update it.
                    table_copy = table.copy()
                    if return_probs:
                        table_copy[target_attr] = y[0]
                        table_copy[probs_attr] = y[1]
                    else:
                        table_copy[target_attr] = y
                    # copy the properties from the input table to the output
                    # table.
                    if copy_props:
                        cm.copy_properties(table, table_copy)
                    # Return the new table.
                    return table_copy

        else:
            # else, raise a syntax error
            raise SyntaxError('The arguments supplied do not match '
                              'the signatures supported')
        # Return the predictions
        return y
Example 14
    def test_copy_properties_valid_1(self):
        A = read_csv_metadata(path_a)
        A1 = pd.read_csv(path_a)
        cm.copy_properties(A, A1)
        self.assertEqual(cm.is_dfinfo_present(A1), True)
        p = cm.get_all_properties(A)
        p1 = cm.get_all_properties(A1)
        self.assertEqual(p, p1)
        self.assertEqual(cm.get_key(A1), cm.get_key(A))
Example 15
def filter_rows(df, condn):
    new_df = df.query(condn)

    # update metadata
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            cm.init_properties(new_df)
            cm.copy_properties(df, new_df)

    return new_df
Example 17
def mutate_col(df, **kwargs):
    new_df = df.assign(**kwargs)

    if cm.is_dfinfo_present(df):
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)

        # if _is_table_or_candset(df):
        #     key = cm.get_key(df)
        #     if key == new_col_name:
        #         cm.set_key(new_df, new_col_name)

    return new_df
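
filter_rows and mutate_col reduce to DataFrame.query and DataFrame.assign plus the same catalog copy; a small sketch of both (hypothetical table):

import pandas as pd
import py_entitymatching.catalog.catalog_manager as cm

A = pd.DataFrame({'ID': ['a1', 'a2', 'a3'], 'age': [25, 40, 33]})
cm.set_key(A, 'ID')

# filter_rows: query, then carry the metadata over.
adults = A.query('age >= 30')
cm.init_properties(adults)
cm.copy_properties(A, adults)

# mutate_col: assign a derived column, then the same catalog copy.
with_flag = A.assign(is_adult=A['age'] >= 30)
cm.init_properties(with_flag)
cm.copy_properties(A, with_flag)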
Example 20
    def _test_label_table(self, table, col_name, label_values):
        _validate_inputs(table, col_name, verbose=False)
        lbl_table = _init_label_table(table, col_name)
        # mg._viewapp = None
        # from py_entitymatching.gui.table_gui import edit_table
        # edit_table(lbl_table, show_flag=False)
        # mg._viewapp = None

        new_table = lbl_table.copy()
        cm.copy_properties(table, new_table)
        lbl_table = new_table

        lbl_table[col_name] = label_values
        lbl_table = _post_process_labelled_table(table, lbl_table, col_name)
        return lbl_table
Example 22
    def test_copy_properties_valid_2(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b)
        C = read_csv_metadata(path_c, ltable=A, rtable=B)

        C1 = pd.read_csv(path_c)
        cm.copy_properties(C, C1)
        self.assertEqual(cm.is_dfinfo_present(C1), True)
        p = cm.get_all_properties(C)
        p1 = cm.get_all_properties(C1)
        self.assertEqual(p, p1)
        self.assertEqual(cm.get_key(C1), cm.get_key(C))
        self.assertEqual(cm.get_ltable(C1).equals(A), True)
        self.assertEqual(cm.get_rtable(C1).equals(B), True)
        self.assertEqual(cm.get_fk_ltable(C1), cm.get_fk_ltable(C))
        self.assertEqual(cm.get_fk_rtable(C1), cm.get_fk_rtable(C))
Example 24
    def test_eval_matches_predicted_attr_not_in_df(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
        num_ones = 1
        num_zeros = len(C1) - num_ones
        gold = [0] * num_ones
        gold.extend([1] * num_zeros)
        predicted = [1] * (num_zeros + num_ones)

        ln = len(C1.columns)
        C1.insert(ln, 'gold', gold)
        C1.insert(ln + 1, 'predicted', predicted)
        cm.copy_properties(C, C1)

        # 'predicted1' is not a column of C1, so eval_matches is expected to
        # raise an AssertionError here.
        result = eval_matches(C1, 'gold', 'predicted1')
Example 25
def drop_cols(df, col_list):
    if not isinstance(col_list, list):
        col_list = [col_list]
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                col_list = gh.list_diff(col_list, [key, fk_ltable, fk_rtable])
                col_list = gh.list_drop_duplicates(col_list)
            else:
                key = cm.get_key(df)
                col_list = gh.list_diff(col_list, [key])
                col_list = gh.list_drop_duplicates(col_list)
        new_df = df.drop(col_list, axis=1)
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    else:
        new_df = df.drop(col_list, axis=1)

    return new_df
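
drop_cols quietly protects the catalog columns: the key (and, for a candidate set, the foreign keys) are removed from the drop list before dropping. A sketch of that guard with plain list filtering (hypothetical table):

import pandas as pd
import py_entitymatching.catalog.catalog_manager as cm

A = pd.DataFrame({'ID': ['a1', 'a2'], 'name': ['x', 'y'], 'zip': ['53703', '53706']})
cm.set_key(A, 'ID')

to_drop = ['zip', 'ID']                      # the caller also asked to drop the key
key = cm.get_key(A)
to_drop = [c for c in to_drop if c != key]   # equivalent of gh.list_diff(col_list, [key])

A_small = A.drop(to_drop, axis=1)
cm.init_properties(A_small)
cm.copy_properties(A, A_small)
print(list(A_small.columns))                 # ['ID', 'name']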
Example 27
def preserve_metadata(df, new_df):
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                if not ch.check_attrs_present(new_df,
                                              [key, fk_ltable, fk_rtable]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df
            else:
                key = cm.get_key(df)
                if not ch.check_attrs_present(new_df, [key]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df

        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    return new_df
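
preserve_metadata only copies the catalog entries when the columns they refer to survive in the new DataFrame; ch.check_attrs_present reduces to a column-membership test. A sketch of that guard (hypothetical table):

import pandas as pd
import py_entitymatching.catalog.catalog_manager as cm

A = pd.DataFrame({'ID': ['a1', 'a2'], 'name': ['x', 'y']})
cm.set_key(A, 'ID')

new_df = A[['name']]                 # the key column was projected away

required = [cm.get_key(A)]
if all(col in new_df.columns for col in required):
    cm.init_properties(new_df)
    cm.copy_properties(A, new_df)
else:
    print('Not setting the metadata as some attrs are not present')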
Example 29
def project_cols(df, col_list):
    if not isinstance(col_list, list):
        col_list = [col_list]
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                updated_col_list = [key, fk_ltable, fk_rtable]
                updated_col_list.extend(col_list)
                col_list = gh.list_drop_duplicates(updated_col_list)
            else:
                key = cm.get_key(df)
                updated_col_list = [key]
                updated_col_list.extend(col_list)
                col_list = gh.list_drop_duplicates(updated_col_list)
        new_df = df[col_list]
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    else:
        new_df = df[col_list]

    return new_df
Example 31
    def predict(self, x=None, table=None, exclude_attrs=None, target_attr=None,
                append=False, return_probs=False,
                probs_attr=None, inplace=True):
        """
        Predict interface for the matcher.

        Specifically, there are two ways the user can call the predict method.
        First, an interface similar to scikit-learn, where the feature vectors
        are given as a projected DataFrame.
        Second, give the DataFrame and explicitly specify the feature vectors
        (by specifying the attributes to be excluded).

        A point to note is that all the input parameters have a default value
        of None. This is done to support both interfaces in a single function.


        Args:
            x (DataFrame): The input pandas DataFrame containing only feature
                vectors (defaults to None).
            table (DataFrame): The input pandas DataFrame containing feature
                vectors, and possibly other attributes (defaults to None).
            exclude_attrs (list): A list of attributes to be excluded from the
                input table to get the feature vectors (defaults to None).
            target_attr (string): The attribute name where the predictions
                need to be stored in the input table (defaults to None).
            probs_attr (string): The attribute name where the prediction probabilities 
                need to be stored in the input table (defaults to None).
            append (boolean): A flag to indicate whether the predictions need
                to be appended to the input DataFrame (defaults to False).
            return_probs (boolean): A flag to indicate whether the prediction
                probabilities need to be returned (defaults to False). If set
                to True, returns the probability that the pair is a match.
            inplace (boolean): A flag to indicate whether the append needs to be
                done inplace (defaults to True).

        Returns:
            An array of predictions or a DataFrame with predictions updated.

        """
        # If x is not none, call the predict method that mimics sk-learn
        # predict method.
        if x is not None:
            y = self._predict_sklearn(x, return_prob=return_probs)
        # If the input table and the exclude attributes are not None,
        # then call the appropriate predict method.
        elif table is not None and exclude_attrs is not None:
            y = self._predict_ex_attrs(table, exclude_attrs, return_prob=return_probs)
            # If the append is True, update the table
            if target_attr is not None and append is True:
                # If inplace is True, then update the input table.
                if inplace:
                    if return_probs:
                        table[target_attr] = y[0]
                        table[probs_attr] = y[1]
                        # Return the updated table
                        return table
                    else:
                        # Return the updated table
                        table[target_attr] = y
                        return table
                else:
                    # else, create a copy and update it.
                    table_copy = table.copy()
                    if return_probs:
                        table_copy[target_attr] = y[0]
                        table_copy[probs_attr] = y[1]
                    else:
                        table_copy[target_attr] = y
                    # copy the properties from the input table to the output
                    # table.
                    cm.copy_properties(table, table_copy)
                    # Return the new table.
                    return table_copy

        else:
            # else, raise a syntax error
            raise SyntaxError(
                'The arguments supplied do not match '
                'the signatures supported')
        # Return the predictions
        return y
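
A small end-to-end sketch of the second (table/exclude_attrs) interface described in the docstring, using a hand-built feature-vector table and a decision-tree matcher. The column names are made up, and DTMatcher's fit is assumed to accept the same table/exclude_attrs/target_attr arguments as predict:

import pandas as pd
import py_entitymatching as em

H = pd.DataFrame({'_id': [0, 1, 2, 3],
                  'ltable_ID': ['a1', 'a2', 'a3', 'a4'],
                  'rtable_ID': ['b1', 'b2', 'b3', 'b4'],
                  'name_sim': [0.9, 0.2, 0.8, 0.1],
                  'gold': [1, 0, 1, 0]})

dt = em.DTMatcher()
dt.fit(table=H,
       exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
       target_attr='gold')

# With append=True and a target_attr, the predictions come back as a new column.
H = dt.predict(table=H,
               exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
               target_attr='predicted',
               append=True)
print(H[['_id', 'predicted']])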
Example 32
def split_train_test(labeled_data,
                     train_proportion=0.5,
                     random_state=None,
                     verbose=True):
    """
    This function splits the input data into train and test.

    Specifically, this function is just a wrapper of scikit-learn's
    train_test_split function.

    This function also takes care of copying the metadata from the input
    table to train and test splits.

    Args:
        labeled_data (DataFrame): The input pandas DataFrame that needs to be
            split into train and test.
        train_proportion (float): A number between 0 and 1, indicating the
            proportion of tuples that should be included in the train split (
            defaults to 0.5).
        random_state (object): A seed value or a random number generator
            object (as in scikit-learn).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed.

    Returns:

        A Python dictionary containing two keys - train and test.

        The value for the key 'train' is a pandas DataFrame containing tuples
        allocated from the input table based on train_proportion.

        Similarly, the value for the key 'test' is a pandas DataFrame containing
        tuples for evaluation.

        This function sets the output DataFrames (train, test) properties
        same as the input DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data or the feature vectors that should be split
        >>> train_test = em.split_train_test(G, train_proportion=0.5)
        >>> train, test = train_test['train'], train_test['test']


    """
    # Validate input parameters
    # # We expect the labeled data to be of type pandas DataFrame
    if not isinstance(labeled_data, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            labeled_data,
            logger, verbose)

    # # Validate metadata
    cm._validate_metadata_for_candset(labeled_data, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key, logger,
                                      verbose)

    num_rows = len(labeled_data)
    # We expect the train proportion to be between 0 and 1.
    assert train_proportion >= 0 and train_proportion <= 1, \
        " Train proportion is expected to be between 0 and 1"

    # We expect the number of rows in the table to be non-empty
    assert num_rows > 0, 'The input table is empty'

    # Explicitly get the train and test size in terms of tuples (based on the
    #  given proportion)
    train_size = int(math.floor(num_rows * train_proportion))
    test_size = int(num_rows - train_size)

    # Use sk-learn to split the data
    idx_values = labeled_data.index.values
    idx_train, idx_test = ms.train_test_split(idx_values,
                                              test_size=test_size,
                                              train_size=train_size,
                                              random_state=random_state)

    # Construct output tables.
    label_train = labeled_data.loc[idx_train]
    label_test = labeled_data.loc[idx_test]

    # Update catalog
    cm.init_properties(label_train)
    cm.copy_properties(labeled_data, label_train)

    cm.init_properties(label_test)
    cm.copy_properties(labeled_data, label_test)

    # Return output tables
    result = OrderedDict()
    result['train'] = label_train
    result['test'] = label_test

    # Finally, return the dictionary.
    return result
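
split_train_test needs the same candidate-set metadata as eval_matches, so a self-contained call has to register it first. A sketch with a tiny labeled set (made-up data; the catalog setters are assumed to behave as in the versions shown on this page):

import pandas as pd
import py_entitymatching as em
import py_entitymatching.catalog.catalog_manager as cm

A = pd.DataFrame({'ID': ['a1', 'a2', 'a3', 'a4']})
B = pd.DataFrame({'ID': ['b1', 'b2', 'b3', 'b4']})
cm.set_key(A, 'ID')
cm.set_key(B, 'ID')

G = pd.DataFrame({'_id': [0, 1, 2, 3],
                  'ltable_ID': ['a1', 'a2', 'a3', 'a4'],
                  'rtable_ID': ['b1', 'b2', 'b3', 'b4'],
                  'label': [1, 0, 1, 0]})
cm.set_key(G, '_id')
cm.set_ltable(G, A)
cm.set_rtable(G, B)
cm.set_fk_ltable(G, 'ltable_ID')
cm.set_fk_rtable(G, 'rtable_ID')

train_test = em.split_train_test(G, train_proportion=0.5, random_state=0)
train, test = train_test['train'], train_test['test']
print(len(train), len(test), cm.get_key(train))   # both splits keep the catalog key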
Example 33
def impute_table(table,
                 exclude_attrs=None,
                 missing_val='NaN',
                 strategy='mean',
                 axis=0,
                 val_all_nans=0,
                 verbose=True):
    """
    Impute table containing missing values.

    Args:
        table (DataFrame): The DataFrame whose values should be imputed.
        exclude_attrs (List) : list of attribute names to be excluded from
            imputing (defaults to None).
        missing_val (string or int): The placeholder for the missing values.
            All occurrences of `missing_val` will be imputed.
            For missing values encoded as np.nan, use the string value 'NaN'
            (defaults to 'NaN').
        strategy (string): String that specifies on how to impute values. Valid
            strings: 'mean', 'median', 'most_frequent' (defaults to 'mean').
        axis (int):  axis=1 along rows, and axis=0 along columns  (defaults
            to 0).
        val_all_nans (float): Value to fill in if all the values in the column
            are NaN.

    Returns:
        Imputed DataFrame.


    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # H is the feature vector which should be imputed. Specifically, impute the missing values
        >>> # in each column, with the mean of that column
        >>> H = em.impute_table(H, exclude_attrs=['_id', 'ltable_id', 'rtable_id'], strategy='mean')


    """
    # Validate input parameters
    # # We expect the input table to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            table,
            logger, verbose)

    # # Validate metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable,
                                      rtable, l_key, r_key, logger, verbose)

    fv_columns = table.columns

    if exclude_attrs is None:
        feature_names = fv_columns

    else:

        # Check if the exclude attributes are present in the input table
        if not ch.check_attrs_present(table, exclude_attrs):
            logger.error('The attributes mentioned in exclude_attrs '
                         'are not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in exclude_attrs '
                                 'are not present '
                                 'in the input table')
        # We expect exclude attributes to be of type list. If not convert it into
        #  a list.
        if not isinstance(exclude_attrs, list):
            exclude_attrs = [exclude_attrs]

        # Drop the duplicates from the exclude attributes
        exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

        cols = [c not in exclude_attrs for c in fv_columns]
        feature_names = fv_columns[cols]
    # print feature_names
    table_copy = table.copy()
    projected_table = table_copy[feature_names]

    projected_table_values = projected_table.values

    imp = Imputer(missing_values=missing_val, strategy=strategy, axis=axis)
    imp.fit(projected_table_values)
    imp.statistics_[pd.isnull(imp.statistics_)] = val_all_nans
    projected_table_values = imp.transform(projected_table_values)
    table_copy[feature_names] = projected_table_values
    # Update catalog
    cm.init_properties(table_copy)
    cm.copy_properties(table, table_copy)

    return table_copy
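
The Imputer class used above was removed from recent scikit-learn releases in favor of SimpleImputer. A minimal sketch of the equivalent column-wise imputation on a plain feature-vector frame, without the catalog bookkeeping (hypothetical columns):

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

fv = pd.DataFrame({'sim1': [0.9, np.nan, 0.4],
                   'sim2': [np.nan, 0.3, 0.7]})

# Column-wise mean imputation, the same as strategy='mean' with axis=0 above.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
fv[['sim1', 'sim2']] = imp.fit_transform(fv[['sim1', 'sim2']])
print(fv)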
Example 34
def dask_extract_feature_vecs(candset,
                              attrs_before=None,
                              feature_table=None,
                              attrs_after=None,
                              verbose=False,
                              show_progress=True,
                              n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses the feature table, along with the ltable
    and rtable present in the `candset`'s metadata, to extract feature
    vectors.

    Args:
        candset (DataFrame): The input candidate set for which the feature
            vectors should be extracted.
            
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
            
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
            
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
            
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
            
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
            
        n_chunks (int): The number of partitions into which to split the
            candidate set. If it is set to -1, the number of partitions will
            be set to the number of cores in the machine.


    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns copied from the
        input candset: key, foreign key ltable, and foreign key rtable. These
        three columns precede the columns mentioned in `attrs_before`.



    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.
        AssertionError: If `n_chunks` is not of type
                int.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = dask_extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])


    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
    )

    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If the attrs_before is given, Check if the attrs_before are present in
    # the input candset
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')

    # # If the attrs_after is given, Check if the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key, logger,
                                      verbose)

    # Extract features

    # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in
    #            candset.iterrows()]
    # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values]

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    n_chunks = get_num_partitions(n_chunks, len(candset))

    c_splits = np.array_split(candset, n_chunks)

    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = []

    for i in range(len(c_splits)):
        partial_result = delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df, c_splits[i],
            False)
        feat_vals_by_splits.append(partial_result)

    feat_vals_by_splits = delayed(wrap)(feat_vals_by_splits)
    if show_progress:
        with ProgressBar():
            feat_vals_by_splits = feat_vals_by_splits.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        feat_vals_by_splits = feat_vals_by_splits.compute(
            scheduler="processes", num_workers=get_num_cores())

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)
    # print(feature_vectors)
    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Reset the index
    # feature_vectors.reset_index(inplace=True, drop=True)

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
Example 35
    def test_copy_properties_src_df_notin_catalog(self):
        A = pd.read_csv(path_a)
        A1 = pd.read_csv(path_a)
        cm.copy_properties(A, A1)
Example 36
def extract_feature_vecs(candset,
                         attrs_before=None,
                         feature_table=None,
                         attrs_after=None,
                         verbose=False,
                         show_progress=True):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses the feature table, along with the ltable
    and rtable present in the `candset`'s metadata, to extract feature
    vectors.

    Args:
        candset (DataFrame): The input candidate set for which the feature
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).


    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns copied from the
        input candset: key, foreign key ltable, and foreign key rtable. These
        three columns precede the columns mentioned in `attrs_before`.



    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.

    """
    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # # If the attrs_before is given, Check if the attrs_before are present in
    # the input candset
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')

    # # If the attrs_after is given, Check if the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
        candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key, logger,
                                      verbose)

    # Extract features

    # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in
    #            candset.iterrows()]
    # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values]

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(candset))
    # # Apply feature functions
    feat_vals = []
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)
    l_dict = {}
    r_dict = {}

    for row in candset.itertuples(index=False):

        if show_progress:
            prog_bar.update()
        fk_ltable_val = row[fk_ltable_idx]
        fk_rtable_val = row[fk_rtable_idx]

        if fk_ltable_val not in l_dict:
            l_dict[fk_ltable_val] = l_df.loc[fk_ltable_val]
        l_tuple = l_dict[fk_ltable_val]

        if fk_rtable_val not in r_dict:
            r_dict[fk_rtable_val] = r_df.loc[fk_rtable_val]
        r_tuple = r_dict[fk_rtable_val]

        f = apply_feat_fns(l_tuple, r_tuple, feature_table)
        feat_vals.append(f)

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)
    # print(feature_vectors)
    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Reset the index
    # feature_vectors.reset_index(inplace=True, drop=True)

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
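A minimal usage sketch of the call pattern above (illustrative only; A and B, the labeled candidate set G, and the column names 'ltable_title' / 'gold_labels' are placeholders assumed to exist):

# Hedged usage sketch for extract_feature_vecs. A and B are assumed to be
# tables read with read_csv_metadata, and G a labeled candidate set whose
# catalog metadata points to A and B.
import py_entitymatching as em

match_f = em.get_features_for_matching(A, B)
H = em.extract_feature_vecs(G, feature_table=match_f,
                            attrs_before=['ltable_title'],
                            attrs_after=['gold_labels'],
                            show_progress=False)
# H starts with the candset key, fk_ltable and fk_rtable, followed by
# attrs_before, then the feature columns in feature-table order, and
# finally attrs_after.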
Example no. 37
def get_false_negatives_as_df(table, eval_summary, verbose=False):
    """
    Select only the false negatives from the input table and return as a
    DataFrame based on the evaluation results.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was used for
            evaluation.
        eval_summary (dictionary): A Python dictionary containing evaluation
            results, typically from the 'eval_matches' command.

    Returns:
        A pandas DataFrame containing only the false negatives from
        the input table.

        Further,
        this function sets the output DataFrame's properties same as input
        DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H,  exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'],  append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
        >>> false_neg_df = em.get_false_negatives_as_df(H, eval_summary)


    """
    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(table, pd.DataFrame, error_prefix='Input cand.set')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from the catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable,
                                      rtable, l_key, r_key, logger, verbose)

    data_frame = _get_dataframe(table, eval_summary['false_neg_ls'])

    # # Update catalog
    ch.log_info(logger, 'Updating catalog', verbose)

    cm.init_properties(data_frame)
    cm.copy_properties(table, data_frame)

    # # Return the DataFrame
    ch.log_info(logger, 'Returning the dataframe', verbose)

    return data_frame
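The selection step above delegates to _get_dataframe. A rough, illustrative equivalent is sketched below, under the assumption that eval_summary['false_neg_ls'] holds (fk_ltable value, fk_rtable value) pairs; this is not the library's internal helper:

# Illustrative sketch only: select the rows of `table` whose foreign-key pair
# appears in a list of (ltable id, rtable id) pairs such as 'false_neg_ls'.
def _select_rows_by_id_pairs(table, fk_ltable, fk_rtable, id_pairs):
    wanted = set(id_pairs)
    mask = [(l, r) in wanted
            for l, r in zip(table[fk_ltable], table[fk_rtable])]
    return table[mask]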
Example no. 38
    def test_copy_properties_invalid_src_df(self):
        A = read_csv_metadata(path_a)
        cm.copy_properties(None, A)
Example no. 39
    def test_copy_properties_invalid_tar_df(self):
        A = read_csv_metadata(path_a)
        cm.copy_properties(A, None)
Example no. 40
    def predict(self, x=None, table=None, exclude_attrs=None, target_attr=None,
                append=False, return_probs=False, probs_attr=None, inplace=True,
                show_progress=False, n_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

        Predict interface for the matcher.

        Specifically, there are two ways the user can call the predict method.
        First, an interface similar to scikit-learn, where the feature vectors
        are given as a projected DataFrame.
        Second, give the full DataFrame and explicitly specify the feature
        vectors (by specifying the attributes to be excluded).

        A point to note is that all the input parameters have a default value
        of None. This is done to support both interfaces in a single function.

        Currently, the Dask implementation supports only the case where the
        table is not None, inplace is False, and append is True.


        Args:
            x (DataFrame): The input pandas DataFrame containing only feature
                vectors (defaults to None).
            table (DataFrame): The input pandas DataFrame containing feature
                vectors, and may be other attributes (defaults to None).
            exclude_attrs (list): A list of attributes to be excluded from the
                input table to get the feature vectors (defaults to None).
            target_attr (string): The attribute name where the predictions
                need to be stored in the input table (defaults to None).
            probs_attr (string): The attribute name where the prediction probabilities 
                need to be stored in the input table (defaults to None).
            append (boolean): A flag to indicate whether the predictions need
                to be appended in the input DataFrame (defaults to False).
            return_probs (boolean): A flag to indicate whether the prediction
                probabilities need to be returned (defaults to False). If set
                to True, returns the probability that the pair is a match.
            inplace (boolean): A flag to indicate whether the append needs to be
                done inplace (defaults to True).
            show_progress (boolean): A flag to indicate whether progress should
                be displayed (defaults to False).
            n_chunks (int): The number of partitions to split the candidate set. If it 
                is set to -1, the number of partitions will be set to the 
                number of cores in the machine.  


        Returns:
            An array of predictions or a DataFrame with predictions updated.

        """

        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

        if x is not None:
            return self._predict(x, table, exclude_attrs, target_attr, append,
                                 return_probs, probs_attr, inplace)
        else:
            n_chunks = get_num_partitions(n_chunks, len(table))
            if n_chunks == 1 or inplace or not append:
                # When the inplace flag is True, the predictions (and probs) are
                # added in place. If we have to use Dask then we have to modify
                # _predict (specifically _predict_sk_learn).
                # So, to keep things simple, we support Dask only when
                # inplace=False

                # Similarly, when append=False, the return value from _predict
                # will be different for different cases (for example, when
                # return_probs is True or False). If we have to use Dask then we
                # have to be careful in recording the return values for each
                # chunk.
                # So, to keep things simple, we support Dask only when
                # append=True


                result = self._predict(table=table, exclude_attrs=exclude_attrs,
                                     target_attr=target_attr, append=append,
                                     return_probs=return_probs, probs_attr=probs_attr,
                                     inplace=inplace, copy_props=True)

            else:
                predicted_results = []
                splitted_tables = np.array_split(table, n_chunks)
                for i in range(len(splitted_tables)):
                    partial_result = delayed(self._predict)(table=splitted_tables[i],
                                                            exclude_attrs=exclude_attrs, target_attr=target_attr,
                                                            append=append,
                                                            return_probs=return_probs,
                                                            probs_attr=probs_attr,
                                                            inplace=inplace,
                                                            copy_props=False)
                    predicted_results.append(partial_result)
                predicted_results = delayed(wrap)(predicted_results)
                if show_progress:
                    with ProgressBar():
                        predicted_results = predicted_results.compute(
                            scheduler="processes", num_workers=get_num_cores())
                else:
                    predicted_results = predicted_results.compute(
                        scheduler="processes", num_workers=get_num_cores())


                result = pd.concat(predicted_results)
                cm.copy_properties(table, result)
            return result
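A hedged sketch of how this Dask-backed predict interface is typically driven. The matcher class name (DaskDTMatcher) and all column names are assumptions, not confirmed by the listing above; note that the Dask path is only taken when append=True, inplace=False and n_chunks is not 1.

# Hypothetical usage sketch; H is assumed to be a feature-vector table with
# candset metadata in the catalog, and 'gold' its label column.
matcher = DaskDTMatcher()   # assumed Dask-backed matcher class
matcher.fit(table=H,
            exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
            target_attr='gold')
predictions = matcher.predict(table=H,
                              exclude_attrs=['_id', 'ltable_ID',
                                             'rtable_ID', 'gold'],
                              target_attr='predicted',
                              append=True, inplace=False,
                              n_chunks=-1, show_progress=True)
Example no. 41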
def dask_down_sample(ltable, rtable, size, y_param, show_progress=True, verbose=False,
                seed=None, rem_stop_words=True, rem_puncs=True, n_ltable_chunks=1,
                n_sample_rtable_chunks=1):
    """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.
         
        This command down samples two tables A and B into smaller tables A' and
        B' respectively.    
        Specifically, first it randomly selects `size` tuples
        from the table B to be table B'. Next, it builds an inverted index I
        (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
        finds a set P of k/2 tuples from I that match x,
        and a set Q of k/2 tuples randomly selected from A - P.
        The idea is for A' and B' to share some matches yet be
        as representative of A and B as possible.
    
        Args:
            ltable (DataFrame): The left input table, i.e., table A.
            rtable (DataFrame): The right input table, i.e., table B. 
            size (int): The size that table B should be down sampled to.
            y_param (int): The parameter to control the down sample size of table A.
                Specifically, the down sampled size of table A should be close to
                size * y_param.
            show_progress (boolean): A flag to indicate whether a progress bar
                should be displayed (defaults to True).
            verbose (boolean): A flag to indicate whether the debug information
             should be displayed (defaults to False).
            seed (int): The seed for the pseudo random number generator to select
                the tuples from A and B (defaults to None).
            rem_stop_words (boolean): A flag to indicate whether a default set of stop words 
             must be removed.
            rem_puncs (boolean): A flag to indicate whether the punctuations must be 
             removed from the strings.             
            n_ltable_chunks (int): The number of partitions for ltable (defaults to 1). If it 
              is set to -1, the number of partitions will be set to the 
              number of cores in the machine.  
            n_sample_rtable_chunks (int): The number of partitions for the 
              sampled rtable (defaults to 1)
                
    
        Returns:
            Down sampled tables A and B as pandas DataFrames.
    
        Raises:
            AssertionError: If any of the input tables (`ltable`, `rtable`) are
                empty or not a DataFrame.
            AssertionError: If `size` or `y_param` is empty or 0 or not a
                valid integer value.
            AssertionError: If `seed` is not a valid integer
                value.
            AssertionError: If `verbose` is not of type bool.
            AssertionError: If `show_progress` is not of type bool.
            AssertionError: If `n_ltable_chunks` is not of type int.
            AssertionError: If `n_sample_rtable_chunks` is not of type int.            
    
        Examples:
            >>> from py_entitymatching.dask.dask_down_sample import dask_down_sample
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
            # Example with seed = 0. This means the same sample data set will be returned
            # each time this function is run.
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, seed=0, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
            
        """

    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # validation checks
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A (ltable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A (ltable) is not of type pandas DataFrame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B (rtable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table B (rtable) is not of type pandas DataFrame')

    if len(ltable) == 0 or len(rtable) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(rtable) < size:
        logger.warning(
            'Size of table B is less than the size parameter - using entire table B')

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_sample_rtable_chunks, int, 'Parameter n_sample_rtable_chunks')


    rtable_sampled = sample_right_table(rtable, size, seed)

    ltbl_str_cols = _get_str_cols_list(ltable)
    proj_ltable = ltable[ltable.columns[ltbl_str_cols]]


    if n_ltable_chunks == -1:
        n_ltable_chunks = get_num_cores()

    ltable_chunks = np.array_split(proj_ltable, n_ltable_chunks)
    preprocessed_tokenized_tbl = []

    # Use Dask to preprocess and tokenize strings.
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        # start_row_id is used internally by process_tokenize_concat_strings
        # to map each string to its row id in the ltable.
        result = delayed(process_tokenize_concat_strings)(ltable_chunks[i],
                                                             start_row_id,
                                                             rem_puncs, rem_stop_words)
        preprocessed_tokenized_tbl.append(result)

        # update start_row_id
        start_row_id += len(ltable_chunks[i])

    preprocessed_tokenized_tbl = delayed(wrap)(preprocessed_tokenized_tbl)

    # Now execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
            scheduler="processes", num_workers=get_num_cores())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_tbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_tbl_vals[i])

    # Build an inverted index
    inverted_index = build_inverted_index(ltable_processed_dict)


    # Preprocess/tokenize sampled rtable and probe
    rtbl_str_cols = _get_str_cols_list(rtable_sampled)
    proj_rtable_sampled = rtable_sampled[rtable_sampled.columns[rtbl_str_cols]]


    if n_sample_rtable_chunks == -1:
        n_sample_rtable_chunks = get_num_cores()

    rtable_chunks = np.array_split(proj_rtable_sampled, n_sample_rtable_chunks)
    probe_result = []

    # Create the DAG
    for i in range(len(rtable_chunks)):
        result = delayed(probe)(rtable_chunks[i], y_param, len(proj_ltable),
                                inverted_index, rem_puncs, rem_stop_words,
                                seed)
        probe_result.append(result)

    probe_result = delayed(wrap)(probe_result)

    # Execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(scheduler="processes",
                                                num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(scheduler="processes",
                                            num_workers=multiprocessing.cpu_count())

    probe_result = map(list, probe_result)
    l_tbl_indices = set(sum(probe_result, []))

    l_tbl_indices = list(l_tbl_indices)
    ltable_sampled = ltable.iloc[l_tbl_indices]



    # update catalog
    if cm.is_dfinfo_present(ltable):
        cm.copy_properties(ltable, ltable_sampled)

    if cm.is_dfinfo_present(rtable):
        cm.copy_properties(rtable, rtable_sampled)

    return ltable_sampled, rtable_sampled
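The chunk/delayed/compute pattern used twice above can be reduced to a small self-contained sketch (illustrative only; `collect` stands in for the library's wrap helper, which is assumed to simply gather the per-chunk results into a list):

# Minimal sketch of the Dask pattern used above: build one delayed task per
# chunk, gather them with a collector task, then compute the graph at once.
from dask import delayed
from dask.diagnostics import ProgressBar

def collect(results):          # stand-in for the library's wrap() helper
    return results

def work(chunk):               # stand-in for a per-chunk task (e.g. probe)
    return sum(chunk)

chunks = [[1, 2], [3, 4], [5, 6]]
tasks = [delayed(work)(c) for c in chunks]
graph = delayed(collect)(tasks)
with ProgressBar():
    per_chunk_results = graph.compute(scheduler="processes", num_workers=2)
# per_chunk_results == [3, 7, 11]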
Example no. 42
    def predict(self, x=None, table=None, exclude_attrs=None, target_attr=None,
                append=False, return_probs=False, probs_attr=None, inplace=True,
                show_progress=False, n_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

        Predict interface for the matcher.

        Specifically, there are two ways the user can call the predict method.
        First, an interface similar to scikit-learn, where the feature vectors
        are given as a projected DataFrame.
        Second, give the full DataFrame and explicitly specify the feature
        vectors (by specifying the attributes to be excluded).

        A point to note is that all the input parameters have a default value
        of None. This is done to support both interfaces in a single function.

        Currently, the Dask implementation supports only the case where the
        table is not None, inplace is False, and append is True.


        Args:
            x (DataFrame): The input pandas DataFrame containing only feature
                vectors (defaults to None).
            table (DataFrame): The input pandas DataFrame containing feature
                vectors, and may be other attributes (defaults to None).
            exclude_attrs (list): A list of attributes to be excluded from the
                input table to get the feature vectors (defaults to None).
            target_attr (string): The attribute name where the predictions
                need to be stored in the input table (defaults to None).
            probs_attr (string): The attribute name where the prediction probabilities 
                need to be stored in the input table (defaults to None).
            append (boolean): A flag to indicate whether the predictions need
                to be appended in the input DataFrame (defaults to False).
            return_probs (boolean): A flag to indicate whether the prediction
                probabilities need to be returned (defaults to False). If set
                to True, returns the probability that the pair is a match.
            inplace (boolean): A flag to indicate whether the append needs to be
                done inplace (defaults to True).
            show_progress (boolean): A flag to indicate whether progress should
                be displayed (defaults to False).
            n_chunks (int): The number of partitions to split the candidate set. If it 
                is set to -1, the number of partitions will be set to the 
                number of cores in the machine.  


        Returns:
            An array of predictions or a DataFrame with predictions updated.

        """

        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

        if x is not None:
            return self._predict(x, table, exclude_attrs, target_attr, append,
                                 return_probs, probs_attr, inplace)
        else:
            n_chunks = get_num_partitions(n_chunks, len(table))
            if n_chunks == 1 or inplace or not append:
                # When the inplace flag is True, the predictions (and probs) are
                # added in place. If we have to use Dask then we have to modify
                # _predict (specifically _predict_sk_learn).
                # So, to keep things simple, we support Dask only when
                # inplace=False

                # Similarly, when append=False, the return value from _predict
                # will be different for different cases (for example, when
                # return_probs is True or False). If we have to use Dask then we
                # have to be careful in recording the return values for each
                # chunk.
                # So, to keep things simple, we support Dask only when
                # append=True


                result = self._predict(table=table, exclude_attrs=exclude_attrs,
                                     target_attr=target_attr, append=append,
                                     return_probs=return_probs, probs_attr=probs_attr,
                                     inplace=inplace, copy_props=True)

            else:
                predicted_results = []
                splitted_tables = np.array_split(table, n_chunks)
                for i in range(len(splitted_tables)):
                    partial_result = delayed(self._predict)(table=splitted_tables[i],
                                                            exclude_attrs=exclude_attrs, target_attr=target_attr,
                                                            append=append,
                                                            return_probs=return_probs,
                                                            probs_attr=probs_attr,
                                                            inplace=inplace,
                                                            copy_props=False)
                    predicted_results.append(partial_result)
                predicted_results = delayed(wrap)(predicted_results)
                if show_progress:
                    with ProgressBar():
                        predicted_results = predicted_results.compute(
                            scheduler="processes", num_workers=get_num_cores())
                else:
                    predicted_results = predicted_results.compute(
                        scheduler="processes", num_workers=get_num_cores())


                result = pd.concat(predicted_results)
                cm.copy_properties(table, result)
            return result
Example no. 43
    def test_copy_properties_update_false_1(self):
        A = read_csv_metadata(path_a)
        A1 = read_csv_metadata(path_a)
        status = cm.copy_properties(A, A1, replace=False)
        self.assertEqual(status, False)
Example no. 44
def down_sample(table_a, table_b, size, y_param, show_progress=True,
                verbose=False, seed=None):
    """
    This function down samples two tables A and B into smaller tables A' and
    B' respectively.

    Specifically, first it randomly selects `size` tuples
    from the table B to be table B'. Next, it builds an inverted index I
    (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
    finds a set P of k/2 tuples from I that match x,
    and a set Q of k/2 tuples randomly selected from A - P.
    The idea is for A' and B' to share some matches yet be
    as representative of A and B as possible.

    Args:
        table_a,table_b (DataFrame): The input tables A and B.
        size (int): The size that table B should be down sampled to.
        y_param (int): The parameter to control the down sample size of table A.
            Specifically, the down sampled size of table A should be close to
            size * y_param.
        show_progress (boolean): A flag to indicate whether a progress bar
            should be displayed (defaults to True).
        verbose (boolean): A flag to indicate whether the debug information
         should be displayed (defaults to False).
        seed (int): The seed for the pseudo random number generator to select
            the tuples from A and B (defaults to None).

    Returns:
        Down sampled tables A and B as pandas DataFrames.

    Raises:
        AssertionError: If any of the input tables (`table_a`, `table_b`) are
            empty or not a DataFrame.
        AssertionError: If `size` or `y_param` is empty or 0 or not a
            valid integer value.
        AssertionError: If `seed` is not a valid integer
            value.

    Examples:
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1)

        # Example with seed = 0. This means the same sample data set will be returned
        # each time this function is run.
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, seed=0)
    """

    if not isinstance(table_a, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A is not of type pandas DataFrame')

    if not isinstance(table_b, pd.DataFrame):
        logger.error('Input table B is not of type pandas DataFrame')
        raise AssertionError(
            'Input table B is not of type pandas DataFrame')

    if len(table_a) == 0 or len(table_b) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(table_b) < size:
        logger.warning(
            'Size of table B is less than the size parameter - using entire table B')

    # get and validate required metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # # get metadata
    # l_key, r_key = cm.get_keys_for_ltable_rtable(table_a, table_b, logger,
    #                                              verbose)
    #
    # # # validate metadata
    # cm._validate_metadata_for_table(table_a, l_key, 'ltable', logger,
    #                                 verbose)
    # cm._validate_metadata_for_table(table_b, r_key, 'rtable', logger,
    #                                 verbose)

    # Build an inverted index on table A. Probing it with tuples from B'
    # yields the P's and Q's; the central idea is to get good coverage of A
    # in the down sampled A' and B'.
    s_inv_index = _inv_index(table_a)

    # Randomly select `size` tuples from table B to be B'.
    # If a seed value has been given, use a RandomState with that seed.
    b_sample_size = min(math.floor(size), len(table_b))
    if seed is not None:
        rand = RandomState(seed)
    else:
        rand = RandomState()
    b_tbl_indices = list(rand.choice(len(table_b), int(b_sample_size), replace=False))

    # Probe inverted index to find all tuples in A that share tokens with tuples in B'.
    s_tbl_indices = _probe_index(table_b.iloc[b_tbl_indices], y_param,
                                 len(table_a), s_inv_index, show_progress,
                                 seed=seed)
    s_tbl_indices = list(s_tbl_indices)
    l_sampled = table_a.iloc[s_tbl_indices]
    r_sampled = table_b.iloc[b_tbl_indices]

    # update catalog
    if cm.is_dfinfo_present(table_a):
        cm.copy_properties(table_a, l_sampled)
    if cm.is_dfinfo_present(table_b):
        cm.copy_properties(table_b, r_sampled)

    return l_sampled, r_sampled
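A toy, self-contained sketch of the inverted-index idea described in the docstring above (illustrative only; the library's _inv_index and _probe_index helpers do the real work, and preprocessing/stop-word handling is not shown here):

# Toy sketch: build a token -> row-id inverted index over table A, then for a
# B' tuple take up to k/2 matching rows (P) plus k/2 random rows (Q).
import random
from collections import defaultdict

def toy_inv_index(a_rows):
    index = defaultdict(set)
    for row_id, text in enumerate(a_rows):
        for token in text.lower().split():
            index[token].add(row_id)
    return index

def toy_probe(index, b_row, k, num_a_rows, rng):
    matched = set()
    for token in b_row.lower().split():
        matched |= index.get(token, set())
    p = set(sorted(matched)[:k // 2])                      # matching tuples
    remaining = list(set(range(num_a_rows)) - p)
    q = set(rng.sample(remaining, min(k // 2, len(remaining))))  # random tuples
    return p | q

a_rows = ['apple iphone 6', 'samsung galaxy s7', 'google pixel']
rng = random.Random(0)
picked = toy_probe(toy_inv_index(a_rows), 'iphone 6 case', k=2,
                   num_a_rows=len(a_rows), rng=rng)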
Example no. 45
def down_sample(table_a, table_b, size, y_param, show_progress=True,
                verbose=False, seed=None, rem_stop_words=True,
                rem_puncs=True, n_jobs=1):
    """
    This function down samples two tables A and B into smaller tables A' and
    B' respectively.

    Specifically, first it randomly selects `size` tuples
    from the table B to be table B'. Next, it builds an inverted index I
    (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
    finds a set P of k/2 tuples from I that match x,
    and a set Q of k/2 tuples randomly selected from A - P.
    The idea is for A' and B' to share some matches yet be
    as representative of A and B as possible.

    Args:
        table_a,table_b (DataFrame): The input tables A and B.
        size (int): The size that table B should be down sampled to.
        y_param (int): The parameter to control the down sample size of table A.
            Specifically, the down sampled size of table A should be close to
            size * y_param.
        show_progress (boolean): A flag to indicate whether a progress bar
            should be displayed (defaults to True).
        verbose (boolean): A flag to indicate whether the debug information
         should be displayed (defaults to False).
        seed (int): The seed for the pseudo random number generator to select
            the tuples from A and B (defaults to None).
        rem_stop_words (boolean): A flag to indicate whether a default set of stop words 
         must be removed.
        rem_puncs (boolean): A flag to indicate whether the punctuations must be 
         removed from the strings.
        n_jobs (int): The number of parallel jobs to be used for computation
            (defaults to 1). If -1 all CPUs are used. If 0 or 1,
            no parallel computation is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
            used (where n_cpus is the total number of CPUs in the
            machine). Thus, for n_jobs = -2, all CPUs but one are used.
            If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
            computation is used (i.e., equivalent to the default).
            

    Returns:
        Down sampled tables A and B as pandas DataFrames.

    Raises:
        AssertionError: If any of the input tables (`table_a`, `table_b`) are
            empty or not a DataFrame.
        AssertionError: If `size` or `y_param` is empty or 0 or not a
            valid integer value.
        AssertionError: If `seed` is not a valid integer
            value.
        AssertionError: If `verbose` is not of type bool.
        AssertionError: If `show_progress` is not of type bool.
        AssertionError: If `n_jobs` is not of type int.

    Examples:
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, n_jobs=-1)

        # Example with seed = 0. This means the same sample data set will be returned
        # each time this function is run.
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, seed=0, n_jobs=-1)
    """

    if not isinstance(table_a, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A is not of type pandas DataFrame')

    if not isinstance(table_b, pd.DataFrame):
        logger.error('Input table B is not of type pandas DataFrame')
        raise AssertionError(
            'Input table B is not of type pandas DataFrame')

    if len(table_a) == 0 or len(table_b) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(table_b) < size:
        logger.warning(
            'Size of table B is less than the size parameter - using entire table B')

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_jobs, int, 'Parameter n_jobs')

    # get and validate required metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # # get metadata
    # l_key, r_key = cm.get_keys_for_ltable_rtable(table_a, table_b, logger,
    #                                              verbose)
    #
    # # # validate metadata
    # cm._validate_metadata_for_table(table_a, l_key, 'ltable', logger,
    #                                 verbose)
    # cm._validate_metadata_for_table(table_b, r_key, 'rtable', logger,
    #                                 verbose)

    # Build an inverted index on table A. Probing it with tuples from B'
    # yields the P's and Q's; the central idea is to get good coverage of A
    # in the down sampled A' and B'.
    s_inv_index = _inv_index(table_a, rem_stop_words, rem_puncs)

    # Randomly select `size` tuples from table B to be B'.
    # If a seed value has been given, use a RandomState with that seed.
    b_sample_size = min(math.floor(size), len(table_b))
    if seed is not None:
        rand = RandomState(seed)
    else:
        rand = RandomState()
    b_tbl_indices = list(rand.choice(len(table_b), int(b_sample_size), replace=False))

    n_jobs = get_num_procs(n_jobs, len(table_b))

    sample_table_b = table_b.iloc[b_tbl_indices]
    if n_jobs <= 1:
        # Probe inverted index to find all tuples in A that share tokens with tuples in B'.
        s_tbl_indices = _probe_index_split(sample_table_b, y_param,
                                           len(table_a), s_inv_index, show_progress,
                                           seed, rem_stop_words, rem_puncs)
    else:
        sample_table_splits = np.array_split(sample_table_b, n_jobs)
        results = Parallel(n_jobs=n_jobs)(
            delayed(_probe_index_split)(
                sample_table_splits[job_index], y_param, len(table_a),
                s_inv_index, show_progress and (job_index == n_jobs - 1),
                seed, rem_stop_words, rem_puncs)
            for job_index in range(n_jobs)
        )
        results = map(list, results)
        s_tbl_indices = set(sum(results, []))

    s_tbl_indices = list(s_tbl_indices)
    l_sampled = table_a.iloc[s_tbl_indices]
    r_sampled = table_b.iloc[b_tbl_indices]

    # update catalog
    if cm.is_dfinfo_present(table_a):
        cm.copy_properties(table_a, l_sampled)
    if cm.is_dfinfo_present(table_b):
        cm.copy_properties(table_b, r_sampled)

    return l_sampled, r_sampled
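The split/probe/merge pattern used in the parallel branch above can be reduced to a small generic sketch (illustrative only; probe_chunk stands in for _probe_index_split, and the data is synthetic):

# Minimal sketch of the joblib pattern above: split the sampled table into
# chunks, probe each chunk in parallel, then union the per-chunk row-id sets.
import numpy as np
from joblib import Parallel, delayed

def probe_chunk(chunk):
    # stand-in for _probe_index_split: returns a set of ltable row ids
    return set(int(x) for x in chunk)

data = np.arange(20)
chunks = np.array_split(data, 4)
results = Parallel(n_jobs=2)(delayed(probe_chunk)(c) for c in chunks)
row_ids = sorted(set().union(*results))
Example no. 46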
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses the feature table, together with the
    ltable and rtable that are present in the `candset`'s metadata, to
    extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the feature
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for computation
            (defaults to 1). If -1 all CPUs are used. If 0 or 1,
            no parallel computation is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
            used (where n_cpus is the total number of CPUs in the
            machine). Thus, for n_jobs = -2, all CPUs but one are used.
            If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
            computation is used (i.e., equivalent to the default).


    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.



    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.


    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])


    """
    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If attrs_before is given, check if the attributes are present in
    # the input candset
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before are not present '
                'in the input table')

    # # If attrs_after is given, check if the attributes are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after are not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features



    # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in
    #            candset.iterrows()]
    # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values]

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    n_procs = get_num_procs(n_jobs, len(candset))

    c_splits = np.array_split(candset, n_procs)

    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = Parallel(n_jobs=n_procs)(
        delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_splits[i], show_progress and i == len(c_splits) - 1)
        for i in range(len(c_splits)))

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)
    # print(feature_vectors)
    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Reset the index
    # feature_vectors.reset_index(inplace=True, drop=True)

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
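The feature table is shipped through cloudpickle above because feature functions are often generated dynamically and may not survive the standard pickling joblib applies to arguments. A self-contained sketch of that pattern (illustrative only, not the library's helper):

# Illustrative sketch: serialize a dict of (possibly dynamically generated)
# feature functions once with cloudpickle and rebuild it inside each worker.
import cloudpickle
from joblib import Parallel, delayed

feature_fns = {'len_diff': lambda a, b: abs(len(a) - len(b))}
payload = cloudpickle.dumps(feature_fns)

def apply_chunk(pickled_fns, pairs):
    fns = cloudpickle.loads(pickled_fns)
    return [{name: fn(a, b) for name, fn in fns.items()} for a, b in pairs]

pairs = [('apple', 'apples'), ('pear', 'peach')]
chunks = [pairs[:1], pairs[1:]]
rows = sum(Parallel(n_jobs=2)(delayed(apply_chunk)(payload, c)
                              for c in chunks), [])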
Example no. 47
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Samples a candidate set of tuple pairs (for labeling purposes).

    This function samples a DataFrame, typically for labeling
    purposes. It expects the input DataFrame to carry the
    metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable,
    rtable). Specifically, this function creates a copy of the input
    DataFrame, samples it using uniform random sampling (numpy's random
    choice function) and returns the sampled DataFrame. Further, it also
    copies the properties from the input DataFrame to the output
    DataFrame.

    Args:
        table (DataFrame): The input DataFrame to be sampled.
            Specifically,
            a DataFrame containing the metadata of a candidate set (such as
            key, fk_ltable, fk_rtable, ltable, rtable) in the catalog.
        sample_size (int): The number of samples to be picked from the input
            DataFrame.
        replace (boolean): A flag to indicate whether sampling should be
            done with replacement or not (defaults to False).
        verbose (boolean): A flag to indicate whether more detailed information
            about the execution steps should be printed out (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' number of rows.

        Further,
        this function sets the output DataFrame's properties same as input
        DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the size of `table` is 0.
        AssertionError: If the `sample_size` is greater than the input
            DataFrame size.

    Examples:
        >>> import py_entitymatching as em
        >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from.


    Note:
        As mentioned in the above description, the output DataFrame is
        updated (in the catalog) with the properties from the input
        DataFrame. A subtle point to note here is, when the replace flag is
        set to True, then the output  DataFrame can contain duplicate keys.
        In that case, this function  will not set the key and it is up to
        the user to fix it after the function returns.
    """
    # Validate input parameters.

    # # The input DataFrame is expected to be of type pandas DataFrame.
    validate_object_type(table, pd.DataFrame)

    # # There should be at least one row to sample from
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # # The sample size should be less than or equal to the number of rows in
    #  the input DataFrame
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Get the sample set for the output table
    sample_indices = np.random.choice(len(table), sample_size,
                                      replace=replace)
    # Sort the sampled indices so the output preserves the input row order
    sample_indices = sorted(sample_indices)
    sampled_table = table.iloc[sample_indices]

    # Copy the properties
    cm.init_properties(sampled_table)

    # # If the replace is set to True, then we should check for the validity
    # of key before setting it
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                # Check for the validity of key before setting it
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table
    return sampled_table
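A short hedged sketch of the replace=True caveat from the Note above. C is assumed to be a candidate set with catalog metadata and '_id' as its key; the repair shown (fresh ids plus em.set_key) is one possible way to fix the key, not the only one:

# Hedged sketch: with replacement the sample may contain duplicate '_id'
# values, in which case the key is not set and must be repaired by the user.
S = em.sample_table(C, sample_size=450, replace=True)
S.reset_index(drop=True, inplace=True)   # keep the same DataFrame object
S['_id'] = range(len(S))                 # assign fresh, unique ids
em.set_key(S, '_id')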