Exemplos de set_key em Python, exemplos de py_entitymatching.catalog.catalog_manager.set_key em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: pandas_helper.py Projeto: stevemandala/py_entitymatching

def rename_col(df, old_col_name, new_col_name):
    """
    Rename one column and carry the catalog metadata over to the result.

    Args:
        df: The input DataFrame.
        old_col_name: Name of the column to rename.
        new_col_name: Replacement column name.

    Returns:
        A new DataFrame produced by ``df.rename``. If `df` has catalog
        information, its properties are copied to the new DataFrame and the
        key (or, for a candidate set, the key / fk_ltable / fk_rtable) is
        re-pointed at `new_col_name` when it referred to `old_col_name`.
    """
    renamed = df.rename(columns={old_col_name: new_col_name})

    # Nothing to propagate when the catalog knows nothing about the input.
    if not cm.is_dfinfo_present(df):
        return renamed

    cm.init_properties(renamed)
    cm.copy_properties(df, renamed)

    if not _is_table_or_candset(df):
        return renamed

    # Plain table: only the key can refer to the renamed column.
    if _is_table(df):
        if cm.get_key(df) == old_col_name:
            cm.set_key(renamed, new_col_name)
        return renamed

    # Candidate set: at most one of key / fk_ltable / fk_rtable is updated,
    # checked in that order.
    cand_key, fk_left, fk_right, _, _, _, _ = \
        cm.get_metadata_for_candset(df, logger, False)
    if cand_key == old_col_name:
        cm.set_key(renamed, new_col_name)
    elif fk_left == old_col_name:
        cm.set_fk_ltable(renamed, new_col_name)
    elif fk_right == old_col_name:
        cm.set_fk_rtable(renamed, new_col_name)

    return renamed

Exemplo n.º 2

0

Exibir arquivo

Arquivo: pandas_helper.py Projeto: anhaidgroup/py_entitymatching

def rename_col(df, old_col_name, new_col_name):
    """
    Return a copy of `df` with one column renamed, keeping catalog metadata.

    Args:
        df: The input DataFrame.
        old_col_name: Current name of the column.
        new_col_name: Name the column should have in the output.

    Returns:
        The renamed DataFrame. When `df` is registered in the catalog, its
        properties are copied to the output, and a key or foreign-key
        property that pointed at `old_col_name` is updated to
        `new_col_name`.
    """
    result = df.rename(columns={old_col_name: new_col_name})

    if cm.is_dfinfo_present(df):
        cm.init_properties(result)
        cm.copy_properties(df, result)

        if _is_table_or_candset(df):
            if _is_table(df):
                # A plain table only has a key to fix up.
                table_key = cm.get_key(df)
                if table_key == old_col_name:
                    cm.set_key(result, new_col_name)
            else:
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
                    cm.get_metadata_for_candset(df, logger, False)
                # Ordered (current value, setter) pairs; only the first
                # property that matches the old column name is updated.
                candidates = (
                    (key, cm.set_key),
                    (fk_ltable, cm.set_fk_ltable),
                    (fk_rtable, cm.set_fk_rtable),
                )
                for current_name, setter in candidates:
                    if current_name == old_col_name:
                        setter(result, new_col_name)
                        break

    return result

Exemplo n.º 3

0

Exibir arquivo

Arquivo: generic_helper.py Projeto: anhaidgroup/py_entitymatching

def load_dataset(file_name, key=None, **kwargs):
    """
    Read a CSV from the 'datasets' directory under the install path.

    Args:
        file_name: Dataset name without the '.csv' extension.
        key: Optional key attribute to set on the table in the catalog.
        **kwargs: Passed through to ``pd.read_csv``.

    Returns:
        The DataFrame read from the CSV file.

    Raises:
        AssertionError: If `file_name` is not a string.
    """
    if not isinstance(file_name, six.string_types):
        message = 'file name is not a string'
        logger.error(message)
        raise AssertionError(message)

    csv_path = os.sep.join([get_install_path(), 'datasets', file_name + '.csv'])
    dataset = pd.read_csv(csv_path, **kwargs)

    # Only touch the catalog when the caller actually supplied a key.
    if key is not None:
        cm.set_key(dataset, key)
    return dataset

Exemplo n.º 4

0

Exibir arquivo

def load_dataset(file_name, key=None, **kwargs):
    """
    Load a named CSV dataset found under ``<install path>/datasets``.

    Args:
        file_name: Name of the dataset; '.csv' is appended to it.
        key: Key attribute to register for the table (skipped when None).
        **kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        The loaded DataFrame.

    Raises:
        AssertionError: If `file_name` is not a string.
    """
    name_is_string = isinstance(file_name, six.string_types)
    if not name_is_string:
        logger.error('file name is not a string')
        raise AssertionError('file name is not a string')

    install_dir = get_install_path()
    path_parts = [install_dir, 'datasets', file_name + '.csv']
    frame = pd.read_csv(os.sep.join(path_parts), **kwargs)

    if key is None:
        return frame

    cm.set_key(frame, key)
    return frame

Exemplo n.º 5

0

Exibir arquivo

Arquivo: tuner_overlap_blocker.py Projeto: anhaidgroup/py_entitymatching

def get_sampled_tables(ltable, rtable, l_key, r_key, l_overlap_attr, r_overlap_attr,
                       should_rem_stop_words, tokenizer, ob_obj, n_bins,
                       sample_proportion, seed):
    """
    Produce sampled versions of the left and right tables.

    The left table is sampled first and its key is set in the catalog. The
    overlap attribute of the full left table is then tokenized and turned
    into an inverted index, which is handed to the right-table sampler.

    Returns:
        A tuple ``(sampled_ltable, sampled_rtable)``.
    """
    left_sample = sample_ltable(ltable, l_key, l_overlap_attr, should_rem_stop_words,
                                ob_obj, n_bins, sample_proportion, seed)
    cm.set_key(left_sample, l_key)

    # The inverted index is built from the full left table's overlap column.
    left_attr_values = ltable[l_overlap_attr]
    tokenized = ob_obj.process_tokenize_block_attr(left_attr_values, 0,
                                                   should_rem_stop_words, tokenizer)
    token_index = ob_obj.build_inverted_index(tokenized)

    right_sample = sample_rtable(rtable, r_key, r_overlap_attr, tokenizer,
                                 should_rem_stop_words, ob_obj, n_bins, sample_proportion,
                                 token_index, seed)
    cm.set_key(right_sample, r_key)

    return left_sample, right_sample

Exemplo n.º 6

0

Exibir arquivo

Arquivo: tuner_overlap_blocker.py Projeto: anhaidgroup/py_entitymatching

def get_sampled_tables(ltable, rtable, l_key, r_key, l_overlap_attr,
                       r_overlap_attr, should_rem_stop_words, tokenizer,
                       ob_obj, n_bins, sample_proportion, seed):
    """
    Sample both input tables and return the two samples.

    Steps, in order: sample the left table and set its key; tokenize the
    left table's overlap attribute and build an inverted index from it;
    sample the right table using that index and set its key.

    Returns:
        The pair ``(s_ltable, s_rtable)`` of sampled DataFrames.
    """
    # Left side.
    sampled_left = sample_ltable(
        ltable, l_key, l_overlap_attr, should_rem_stop_words, ob_obj,
        n_bins, sample_proportion, seed)
    cm.set_key(sampled_left, l_key)

    # Inverted index over the tokens of the left overlap attribute.
    inverted_index = ob_obj.build_inverted_index(
        ob_obj.process_tokenize_block_attr(
            ltable[l_overlap_attr], 0, should_rem_stop_words, tokenizer))

    # Right side, guided by the inverted index.
    sampled_right = sample_rtable(
        rtable, r_key, r_overlap_attr, tokenizer, should_rem_stop_words,
        ob_obj, n_bins, sample_proportion, inverted_index, seed)
    cm.set_key(sampled_right, r_key)

    sampled_pair = (sampled_left, sampled_right)
    return sampled_pair

Exemplo n.º 7

0

Exibir arquivo

Arquivo: test_catalog.py Projeto: stevemandala/py_entitymatching

 def test_set_key_with_mvals(self):
     """set_key on 'A_mvals.csv' with attribute 'ID' must return False."""
     # NOTE(review): the file name suggests 'ID' has missing values - confirm.
     mvals_path = os.sep.join([catalog_datasets_path, 'A_mvals.csv'])
     table = pd.read_csv(mvals_path)
     self.assertEqual(cm.set_key(table, 'ID'), False)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: test_catalog.py Projeto: stevemandala/py_entitymatching

 def test_set_key_notin_df(self):
     """Call set_key with the attribute name 'ID1' on the table at path_a."""
     # NOTE(review): no assertion is visible here; presumably an expected
     # exception is declared by a decorator outside this snippet - confirm.
     table = pd.read_csv(path_a)
     requested_key = 'ID1'
     cm.set_key(table, requested_key)

Exemplo n.º 9

0

Exibir arquivo

Arquivo: test_catalog.py Projeto: stevemandala/py_entitymatching

 def test_set_key_invalid_df(self):
     """Call set_key with None in place of a DataFrame."""
     # NOTE(review): no assertion is visible here; presumably an expected
     # exception is declared by a decorator outside this snippet - confirm.
     not_a_dataframe = None
     cm.set_key(not_a_dataframe, 'ID')

Exemplo n.º 10

0

Exibir arquivo

Arquivo: test_catalog.py Projeto: stevemandala/py_entitymatching

 def test_set_key_valid(self):
     """After set_key(A, 'ID'), get_key(A) must return 'ID'."""
     table = pd.read_csv(path_a)
     key_attr = 'ID'
     cm.set_key(table, key_attr)
     stored_key = cm.get_key(table)
     self.assertEqual(stored_key, key_attr)

Exemplo n.º 11

0

Exibir arquivo

Arquivo: pickles.py Projeto: anhaidgroup/py_entitymatching

def load_table(file_path, metadata_ext='.pklmetadata'):
    """
    Load a pickled DataFrame and, if available, its catalog metadata.

    The DataFrame is read from `file_path` (pickle format). The function
    then looks for a metadata file with the same base name and the extension
    `metadata_ext`; when that file exists, its properties are written to the
    catalog for the loaded DataFrame.

    Args:
        file_path (string): Path of the pickled DataFrame.
        metadata_ext (string): Extension used to build the metadata file
            name (defaults to '.pklmetadata').

    Returns:
        The pandas DataFrame read from the file. The catalog is updated
        with the metadata read from the metadata file, if it was present.

    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If `metadata_ext` is not of type string.

    Examples:

        >>> A = em.load_table('./A.pkl')

        >>> A = em.load_table('./A.pkl', metadata_ext='.pklmeta')

    See Also:
        :meth:`~py_entitymatching.save_table`

    Note:
        This differs from read_csv_metadata in two ways. First, candidate
        set tables are not currently supported: they carry more metadata
        than just 'key' (such as ltable and rtable), which the user would
        conceptually have to supply when calling this function (support is
        planned). Second, the table is loaded from pickle format.
    """
    # Both arguments must be strings.
    validate_object_type(file_path, six.string_types, error_prefix='Input file path')
    validate_object_type(metadata_ext, six.string_types)

    # The generic object loader is also used for the DataFrame itself.
    loaded_df = load_object(file_path)

    # Without a metadata file there is nothing to register; warn and return.
    if not ps._is_metadata_file_present(file_path, extension=metadata_ext):
        logger.warning('There is no metadata file')
        return loaded_df

    # Metadata file name = data file name with its extension swapped.
    base_name = os.path.splitext(file_path)[0]
    stored_properties = load_object(base_name + metadata_ext)

    for name, value in six.iteritems(stored_properties):
        if name != 'key':
            cm.set_property(loaded_df, name, value)
        else:
            # set_key validates the key before storing it in the catalog.
            cm.set_key(loaded_df, value)

    return loaded_df

Exemplo n.º 12

0

Exibir arquivo

def read_csv_metadata(file_path, **kwargs):
    """
    Reads a CSV (comma-separated values) file into a pandas DataFrame
    and updates the catalog with the metadata. The CSV files typically
    contain data for the input tables or a candidate set.

    Specifically, this function first reads the CSV file from the given file
    path into a pandas DataFrame, by using pandas' built-in 'read_csv'
    method. Then, it updates the catalog with the metadata. There are three
    ways to update the metadata: (1) using a metadata file, (2) using the
    key-value parameters supplied in the function, and (3) using both
    metadata file and key-value parameters.

    To update the metadata in the catalog using the metadata file,
    the function will look for a file in the same directory with the same
    file name but with a specific extension. This extension can be
    optionally given by the user (defaults to '.metadata'). If the metadata
    file is present, the function will read and update the catalog
    appropriately. If the metadata file is not present, the function will
    issue a warning that the metadata file is not present.

    The metadata information can also be given as parameters to the function
    (see description of arguments for more details). If given, the function
    will update the catalog with the given information.

    Further, the metadata can partly reside in the metadata file and partly
    as supplied parameters. The function will take a union of the two and
    update the catalog appropriately.
    If the same metadata is given in both the metadata file
    and the function, then the metadata in the function takes precedence over
    the metadata given in the file.

    Args:
        file_path(string): The CSV file path

        kwargs(dictionary): A Python dictionary containing key-value arguments.
            There are a few key-value pairs that are specific to
            read_csv_metadata and all the other key-value pairs are passed
            to pandas read_csv method. The 'metadata_extn' entry, if given,
            is the metadata file extension; a leading '.' is added when
            it is missing.

    Returns:
        A pandas DataFrame read from the input CSV file.
    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If a file does not exist in the
            given `file_path`.

    Examples:
        *Example 1:* Read from CSV file and set metadata

        >>> A = em.read_csv_metadata('path_to_csv_file', key='id')
        >>> em.get_key(A)
         # 'id'

        *Example 2:*  Read from CSV file (with metadata file in the same
        directory)

         Let the metadata file contain the following contents:

          #key = id

        >>> A = em.read_csv_metadata('path_to_csv_file')
        >>> em.get_key(A)
         # 'id'

    See Also:
        :meth:`~py_entitymatching.to_csv_metadata`
    """
    # Validate the input parameters.

    # # File path is expected to be of type string.
    if not isinstance(file_path, six.string_types):
        logger.error('Input file path is not of type string')
        raise AssertionError('Input file path is not of type string')

    # # Check if the given path is valid.
    if not os.path.exists(file_path):
        logger.error('File does not exist at path %s' % file_path)
        raise AssertionError('File does not exist at path %s' % file_path)

    # Check if the user has specified the metadata file's extension.
    extension = kwargs.pop('metadata_extn', None)

    # If the extension is not specified then set the extension to '.metadata'.
    if extension is None:
        extension = '.metadata'

    # Format the extension to include a '.' in front if the user has not
    # given one.
    if not extension.startswith('.'):
        extension = '.' + extension

    # If the file is present, then update metadata from file.
    if _is_metadata_file_present(file_path, extension=extension):
        file_name, _ = os.path.splitext(file_path)
        file_name = ''.join([file_name, extension])
        metadata, _ = _get_metadata_from_file(file_name)

    # Else issue a warning that the metadata file is not present
    else:
        logger.warning('Metadata file is not present in the given path; '
                       'proceeding to read the csv file.')
        metadata = {}

    # Update the metadata with the key-value pairs given in the command. The
    # function _update_metadata_for_read_cmd takes care of updating the
    # metadata with only the key-value pairs specific to read_csv_metadata
    # method
    metadata, kwargs = _update_metadata_for_read_cmd(metadata, **kwargs)

    # Validate the metadata.
    _check_metadata_for_read_cmd(metadata)

    # Read the csv file using pandas read_csv method.
    data_frame = pd.read_csv(file_path, **kwargs)

    # Get the value for 'key' property and update the catalog.
    key = metadata.pop('key', None)
    if key is not None:
        cm.set_key(data_frame, key)

    fk_ltable = metadata.pop('fk_ltable', None)
    if fk_ltable is not None:
        cm.set_fk_ltable(data_frame, fk_ltable)

    # Each foreign key is guarded by its own value: the previous code tested
    # fk_ltable here, which skipped a supplied fk_rtable (or passed None to
    # set_fk_rtable) whenever only one of the two was given.
    fk_rtable = metadata.pop('fk_rtable', None)
    if fk_rtable is not None:
        cm.set_fk_rtable(data_frame, fk_rtable)

    # Update the catalog with other properties.
    for property_name, property_value in six.iteritems(metadata):
        cm.set_property(data_frame, property_name, property_value)
    # Make sure the DataFrame is registered even if no metadata was set.
    if not cm.is_dfinfo_present(data_frame):
        cm.init_properties(data_frame)

    # Return the DataFrame
    return data_frame

Exemplo n.º 13

0

Exibir arquivo

Arquivo: single_table.py Projeto: anhaidgroup/py_entitymatching

def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Draw a uniform random sample of rows from a candidate set.

    The input DataFrame is expected to carry candidate-set metadata in the
    catalog (key, fk_ltable, fk_rtable, ltable, rtable). Row positions are
    picked with numpy's ``random.choice``, the chosen rows are returned as
    a new DataFrame, and the input's catalog properties are copied to it.
    This is typically used to pick tuple pairs for labeling.

    Args:
        table (DataFrame): The candidate set to sample from; its metadata
            (key, fk_ltable, fk_rtable, ltable, rtable) must be in the
            catalog.
        sample_size (int): Number of rows to pick.
        replace (boolean): Whether to sample with replacement (defaults to
            False).
        verbose (boolean): Whether more detailed information about the
            execution steps should be printed out (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' rows whose catalog properties
        are set from those of the input DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the size of `table` is 0.
        AssertionError: If the `sample_size` is greater than the input
            DataFrame size.

    Examples:
        >>> import py_entitymatching as em
        >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from.

    Note:
        With `replace` set to True the output can contain duplicate keys.
        In that case the key is not set, and it is up to the user to fix
        it after the function returns.
    """
    # --- Input validation -------------------------------------------------
    validate_object_type(table, pd.DataFrame)

    num_rows = len(table)

    # An empty table offers nothing to sample.
    if num_rows == 0:
        empty_msg = 'Size of the input table is 0'
        logger.error(empty_msg)
        raise AssertionError(empty_msg)

    # Cannot ask for more rows than the table holds.
    if num_rows < sample_size:
        too_large_msg = 'Sample size is larger than the input table size'
        logger.error(too_large_msg)
        raise AssertionError(too_large_msg)

    # --- Metadata validation ----------------------------------------------
    # The properties are copied to the output, so they must be valid first.
    required_metadata = ('Required metadata: cand.set key, fk ltable, fk rtable, '
                         'ltable, rtable, ltable key, rtable key')
    ch.log_info(logger, required_metadata, verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable,
                                      rtable, l_key, r_key, logger, verbose)

    # --- Sampling -----------------------------------------------------------
    # Pick row positions, then order them by position.
    chosen_positions = sorted(
        np.random.choice(num_rows, sample_size, replace=replace))
    sampled = table.iloc[chosen_positions]

    # --- Catalog update -----------------------------------------------------
    cm.init_properties(sampled)

    if not replace:
        cm.copy_properties(table, sampled)
        return sampled

    # Sampling with replacement: the key goes through set_key so that its
    # validity is checked; everything else is copied as is.
    for name, value in six.iteritems(cm.get_all_properties(table)):
        if name != 'key':
            cm.set_property(sampled, name, value)
        else:
            cm.set_key(sampled, value)

    return sampled

Exemplo n.º 14

0

Exibir arquivo

Arquivo: single_table.py Projeto: anhaidgroup/py_entitymatching

def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Samples a candidate set of tuple pairs (for labeling purposes).

    This function samples a DataFrame, typically used for labeling
    purposes. This function expects the input DataFrame to contain the
    metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable,
    rtable). Specifically, this function samples row positions using uniform
    random sampling (numpy's ``random.choice``) and returns the selected
    rows as a new DataFrame. It also copies the properties from the input
    DataFrame to the output DataFrame.

    Args:
        table (DataFrame): The input DataFrame to be sampled.
            Specifically,
            a DataFrame containing the metadata of a candidate set (such as
            key, fk_ltable, fk_rtable, ltable, rtable) in the catalog.
        sample_size (int): The number of samples to be picked from the input
            DataFrame.
        replace (boolean): A flag to indicate whether sampling should be
            done with replacement or not (defaults to False).
        verbose (boolean): A flag to indicate whether more detailed information
            about the execution steps should be printed out (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' number of rows.

        Further,
        this function sets the output DataFrame's properties same as input
        DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the size of `table` is 0.
        AssertionError: If the `sample_size` is greater than the input
            DataFrame size.

    Examples:
        >>> import py_entitymatching as em
        >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from.


    Note:
        As mentioned in the above description, the output DataFrame is
        updated (in the catalog) with the properties from the input
        DataFrame. A subtle point to note here is, when the replace flag is
        set to True, then the output DataFrame can contain duplicate keys.
        In that case, this function will not set the key and it is up to
        the user to fix it after the function returns.
    """
    # `pandas.np` was deprecated in pandas 1.0 and removed in 2.0, so numpy
    # is imported directly instead of being reached through pandas.
    import numpy as np

    # Validate input parameters.

    # # The input DataFrame is expected to be of type pandas DataFrame.
    validate_object_type(table, pd.DataFrame)

    # # There should be at least one row to sample from
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # # The sample size should be less than or equal to the number of rows in
    #  the input DataFrame
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Get the sample set for the output table
    sample_indices = np.random.choice(len(table), sample_size,
                                      replace=replace)
    # Sort the indices ordered by index value
    sample_indices = sorted(sample_indices)
    sampled_table = table.iloc[list(sample_indices)]

    # Copy the properties
    cm.init_properties(sampled_table)

    # # If the replace is set to True, then we should check for the validity
    # of key before setting it
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                # Check for the validity of key before setting it
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table
    return sampled_table

Exemplo n.º 15

0

Exibir arquivo

def load_table(file_path, metadata_ext='.pklmetadata'):
    """
    Read a pickled DataFrame from disk together with its metadata.

    After loading the DataFrame from `file_path`, the function checks for a
    metadata file that shares the data file's name but uses the extension
    `metadata_ext`. If it is found, the properties stored in it are applied
    to the DataFrame's entry in the catalog.

    Args:
        file_path (string): The file path to load the file from.
        metadata_ext (string): The metadata file extension (defaults to
            '.pklmetadata') used to generate the metadata file name.

    Returns:
        The pandas DataFrame read from the file. If a metadata file was
        present, the catalog is updated with the metadata read from it.

    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If `metadata_ext` is not of type string.

    Examples:

        >>> A = em.load_table('./A.pkl')

        >>> A = em.load_table('./A.pkl', metadata_ext='.pklmeta')

    See Also:
        :meth:`~py_entitymatching.save_table`

    Note:
        Two things distinguish this function from read_csv_metadata.
        First, reading candidate set tables is not currently supported;
        those carry more metadata than just 'key' (such as ltable and
        rtable), which the user would conceptually need to provide when
        calling this function (this support is planned). Second, the
        table is read from a pickle file.
    """
    # Type-check the arguments.
    validate_object_type(file_path, six.string_types)
    validate_object_type(metadata_ext, six.string_types)

    # load_object is a generic loader; it is used for the DataFrame as well.
    table = load_object(file_path)

    has_metadata = ps._is_metadata_file_present(file_path,
                                                extension=metadata_ext)
    if has_metadata:
        # Replace the data file's extension with the metadata extension.
        path_root, _ = os.path.splitext(file_path)
        metadata_path = path_root + metadata_ext
        catalog_entries = load_object(metadata_path)

        for entry_name, entry_value in six.iteritems(catalog_entries):
            is_key = entry_name == 'key'
            if is_key:
                # Go through set_key so the key's integrity is checked
                # before it is stored in the catalog.
                cm.set_key(table, entry_value)
            else:
                cm.set_property(table, entry_name, entry_value)
    else:
        # No metadata file: warn and return the bare DataFrame.
        logger.warning('There is no metadata file')

    return table

Exemplo n.º 16

0

Exibir arquivo

Arquivo: dask_overlap_blocker.py Projeto: ale3385/py_entitymatching

    def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                     rem_stop_words=False, q_val=None, word_level=True, overlap_size=1,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     allow_missing=False, verbose=False, show_progress=True,
                     n_ltable_chunks=1, n_rtable_chunks=1):

        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

        Blocks two tables based on the overlap of token sets of attribute
        values. Finds tuple pairs from left and right tables such that the overlap
        between (a) the set of tokens obtained by tokenizing the value of
        attribute l_overlap_attr of a tuple from the left table, and (b) the
        set of tokens obtained by tokenizing the value of attribute
        r_overlap_attr of a tuple from the right table, is above a certain
        threshold.

        Args:
            ltable (DataFrame): The left input table.

            rtable (DataFrame): The right input table.

            l_overlap_attr (string): The overlap attribute in left table.

            r_overlap_attr (string): The overlap attribute in right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
             (e.g., a, an, the) should be removed from the token sets of the
             overlap attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attributes
             values are to be tokenized as qgrams (defaults to None).

            word_level (boolean): A flag to indicate whether the overlap
             attributes should be tokenized as words (i.e, using whitespace
             as delimiter) (defaults to True).

            overlap_size (int): The minimum number of tokens that must
             overlap (defaults to 1).
            l_output_attrs (list): A list of attribute names from the left
                table to be included in the output candidate set (defaults
                to None).
            r_output_attrs (list): A list of attribute names from the right
                table to be included in the output candidate set  (defaults
                to None).

            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').
            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').
            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple in ltable with missing value in the
                                     blocking attribute will be matched with
                                     every tuple in rtable and vice versa.

            verbose (boolean): A flag to indicate whether the debug
                information should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
                be displayed to the user (defaults to True).

            n_ltable_chunks (int): The number of partitions to split the left table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.                                      
            n_rtable_chunks (int): The number of partitions to split the right table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.            


        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).
        Raises:
            AssertionError: If `ltable` is not of type pandas
                DataFrame.

            AssertionError: If `rtable` is not of type pandas
                DataFrame.

            AssertionError: If `l_overlap_attr` is not of type string.

            AssertionError: If `r_overlap_attr` is not of type string.

            AssertionError: If `l_output_attrs` is not of type of
             list.

            AssertionError: If `r_output_attrs` is not of type of
             list.

            AssertionError: If the values in `l_output_attrs` is not of type
             string.

            AssertionError: If the values in `r_output_attrs` is not of type
             string.

            AssertionError: If `l_output_prefix` is not of type
             string.

            AssertionError: If `r_output_prefix` is not of type
             string.

            AssertionError: If `q_val` is not of type int.

            AssertionError: If `word_level` is not of type boolean.

            AssertionError: If `overlap_size` is not of type int.

            AssertionError: If `verbose` is not of type
             boolean.

            AssertionError: If `allow_missing` is not of type boolean.

            AssertionError: If `show_progress` is not of type
             boolean.

            AssertionError: If `n_ltable_chunks` is not of type
             int.

            AssertionError: If `n_rtable_chunks` is not of type
             int.

            AssertionError: If `l_overlap_attr` is not in the ltable
             columns.

            AssertionError: If `r_block_attr` is not in the rtable columns.

            AssertionError: If `l_output_attrs` are not in the ltable.

            AssertionError: If `r_output_attrs` are not in the rtable.

            SyntaxError: If `q_val` is set to a valid value and
                `word_level` is set to True.

            SyntaxError: If `q_val` is set to None and
                `word_level` is set to False.

        Examples:
            >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = DaskOverlapBlocker()
            # Use all cores
            # # Use word-level tokenizer
            >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1, n_ltable_chunks=-1, n_rtable_chunks=-1)
            # # Use q-gram tokenizer
            >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2, n_ltable_chunks=-1, n_rtable_chunks=-1)
            # # Include all possible missing values
            >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True, n_ltable_chunks=-1, n_rtable_chunks=-1)
        """
        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
            "RISK.")

        # Input validations
        self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                          r_output_attrs, l_output_prefix,
                                          r_output_prefix, verbose, n_ltable_chunks, n_rtable_chunks)
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level, overlap_size)
        self.validate_allow_missing(allow_missing)
        self.validate_show_progress(show_progress)
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)
        self.validate_word_level_qval(word_level, q_val)

        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

        # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)


        # validate input table chunks
        validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
        validate_object_type(n_rtable_chunks, int,
                             'Parameter n_rtable_chunks')
        validate_chunks(n_ltable_chunks)
        validate_chunks(n_rtable_chunks)

        if n_ltable_chunks == -1:
            n_ltable_chunks = multiprocessing.cpu_count()


        ltable_chunks = pd.np.array_split(ltable, n_ltable_chunks)

        # preprocess/tokenize ltable
        if word_level == True:
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        preprocessed_tokenized_ltbl = []

        # Construct DAG for preprocessing/tokenizing ltable chunks
        start_row_id = 0
        for i in range(len(ltable_chunks)):
            result = delayed(self.process_tokenize_block_attr)(ltable_chunks[i][
                                                                  l_overlap_attr],
                                                              start_row_id,
                                                              rem_stop_words, tokenizer)
            preprocessed_tokenized_ltbl.append(result)
            start_row_id += len(ltable_chunks[i])
        preprocessed_tokenized_ltbl = delayed(wrap)(preprocessed_tokenized_ltbl)

        # Execute the DAG
        if show_progress:
            with ProgressBar():
                logger.info('Preprocessing/tokenizing ltable')
                preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
        else:
            preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())

        ltable_processed_dict = {}
        for i in range(len(preprocessed_tokenized_ltbl_vals)):
            ltable_processed_dict.update(preprocessed_tokenized_ltbl_vals[i])

        # build inverted index
        inverted_index = self.build_inverted_index(ltable_processed_dict)

        if n_rtable_chunks == -1:
            n_rtable_chunks = multiprocessing.cpu_count()

        rtable_chunks = pd.np.array_split(rtable, n_rtable_chunks)

        # Construct the DAG for probing
        probe_result = []
        start_row_id = 0
        for i in range(len(rtable_chunks)):
            result = delayed(self.probe)(rtable_chunks[i][r_overlap_attr],
                                         inverted_index, start_row_id, rem_stop_words,
                                         tokenizer, overlap_size)
            probe_result.append(result)
            start_row_id += len(rtable_chunks[i])
        probe_result = delayed(wrap)(probe_result)

        # Execute the DAG for probing
        if show_progress:
            with ProgressBar():
                logger.info('Probing using rtable')
                probe_result = probe_result.compute(scheduler="processes",
                                            num_workers=multiprocessing.cpu_count())
        else:
            probe_result = probe_result.compute(scheduler="processes",
                                                num_workers=multiprocessing.cpu_count())

        # construct a minimal dataframe that can be used to add more attributes
        flat_list = [item for sublist in probe_result for item in sublist]
        tmp = pd.DataFrame(flat_list, columns=['fk_ltable_rid', 'fk_rtable_rid'])
        fk_ltable = ltable.iloc[tmp.fk_ltable_rid][l_key].values
        fk_rtable = rtable.iloc[tmp.fk_rtable_rid][r_key].values
        id_vals = list(range(len(flat_list)))

        candset = pd.DataFrame.from_dict(
            {'_id': id_vals, l_output_prefix+l_key: fk_ltable, r_output_prefix+r_key: fk_rtable})


        # set the properties for the candidate set
        cm.set_key(candset, '_id')
        cm.set_fk_ltable(candset, 'ltable_'+l_key)
        cm.set_fk_rtable(candset, 'rtable_'+r_key)
        cm.set_ltable(candset, ltable)
        cm.set_rtable(candset, rtable)

        ret_candset = gh.add_output_attributes(candset, l_output_attrs=l_output_attrs,
                                               r_output_attrs=r_output_attrs,
                                               l_output_prefix=l_output_prefix,
                                               r_output_prefix=r_output_prefix,
                                               validate=False)



        # handle missing values
        if allow_missing:
            missing_value_pairs = get_pairs_with_missing_value(ltable, rtable, l_key,
                                                           r_key, l_overlap_attr,
                                                           r_overlap_attr,
                                                           l_output_attrs,
                                                           r_output_attrs,
                                                           l_output_prefix,
                                                           r_output_prefix, False, False)
            missing_value_pairs.insert(0, '_id', range(len(ret_candset),
                                                       len(ret_candset)+len(missing_value_pairs)))

            if len(missing_value_pairs) > 0:
                ret_candset = pd.concat([ret_candset, missing_value_pairs], ignore_index=True, sort=False)
                cm.set_key(ret_candset, '_id')
                cm.set_fk_ltable(ret_candset, 'ltable_' + l_key)
                cm.set_fk_rtable(ret_candset, 'rtable_' + r_key)
                cm.set_ltable(ret_candset, ltable)
                cm.set_rtable(ret_candset, rtable)

        # Return the final candidate set to user.
        return ret_candset

Exemplo n.º 17

0

Exibir arquivo

Arquivo: dask_overlap_blocker.py Projeto: anhaidgroup/py_entitymatching

    def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                     rem_stop_words=False, q_val=None, word_level=True, overlap_size=1,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     allow_missing=False, verbose=False, show_progress=True,
                     n_ltable_chunks=1, n_rtable_chunks=1):
        r"""
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

        Blocks two tables based on the overlap of token sets of attribute
        values. Finds tuple pairs from left and right tables such that the overlap
        between (a) the set of tokens obtained by tokenizing the value of
        attribute l_overlap_attr of a tuple from the left table, and (b) the
        set of tokens obtained by tokenizing the value of attribute
        r_overlap_attr of a tuple from the right table, is above a certain
        threshold.

        The left table is split into chunks that are tokenized in parallel and
        merged into a single inverted index; the right table is then split
        into chunks that are probed against that index in parallel.

        Args:
            ltable (DataFrame): The left input table.

            rtable (DataFrame): The right input table.

            l_overlap_attr (string): The overlap attribute in left table.

            r_overlap_attr (string): The overlap attribute in right table.

            rem_stop_words (boolean): A flag to indicate whether stop words
             (e.g., a, an, the) should be removed from the token sets of the
             overlap attribute values (defaults to False).

            q_val (int): The value of q to use if the overlap attributes
             values are to be tokenized as qgrams (defaults to None).

            word_level (boolean): A flag to indicate whether the overlap
             attributes should be tokenized as words (i.e, using whitespace
             as delimiter) (defaults to True).

            overlap_size (int): The minimum number of tokens that must
             overlap (defaults to 1).

            l_output_attrs (list): A list of attribute names from the left
             table to be included in the output candidate set (defaults
             to None).

            r_output_attrs (list): A list of attribute names from the right
             table to be included in the output candidate set (defaults
             to None).

            l_output_prefix (string): The prefix to be used for the attribute
             names coming from the left table in the output candidate set
             (defaults to 'ltable\_').

            r_output_prefix (string): The prefix to be used for the attribute
             names coming from the right table in the output candidate set
             (defaults to 'rtable\_').

            allow_missing (boolean): A flag to indicate whether tuple pairs
             with missing value in at least one of the blocking attributes
             should be included in the output candidate set (defaults to
             False). If this flag is set to True, a tuple in ltable with
             missing value in the blocking attribute will be matched with
             every tuple in rtable and vice versa.

            verbose (boolean): A flag to indicate whether the debug
             information should be logged (defaults to False).

            show_progress (boolean): A flag to indicate whether progress should
             be displayed to the user (defaults to True).

            n_ltable_chunks (int): The number of partitions to split the left
             table (defaults to 1). If it is set to -1, then the number of
             partitions is set to the number of cores in the machine.

            n_rtable_chunks (int): The number of partitions to split the right
             table (defaults to 1). If it is set to -1, then the number of
             partitions is set to the number of cores in the machine.

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `ltable` is not of type pandas
             DataFrame.

            AssertionError: If `rtable` is not of type pandas
             DataFrame.

            AssertionError: If `l_overlap_attr` is not of type string.

            AssertionError: If `r_overlap_attr` is not of type string.

            AssertionError: If `l_output_attrs` is not of type of
             list.

            AssertionError: If `r_output_attrs` is not of type of
             list.

            AssertionError: If the values in `l_output_attrs` is not of type
             string.

            AssertionError: If the values in `r_output_attrs` is not of type
             string.

            AssertionError: If `l_output_prefix` is not of type
             string.

            AssertionError: If `r_output_prefix` is not of type
             string.

            AssertionError: If `q_val` is not of type int.

            AssertionError: If `word_level` is not of type boolean.

            AssertionError: If `overlap_size` is not of type int.

            AssertionError: If `verbose` is not of type
             boolean.

            AssertionError: If `allow_missing` is not of type boolean.

            AssertionError: If `show_progress` is not of type
             boolean.

            AssertionError: If `n_ltable_chunks` is not of type
             int.

            AssertionError: If `n_rtable_chunks` is not of type
             int.

            AssertionError: If `l_overlap_attr` is not in the ltable
             columns.

            AssertionError: If `r_overlap_attr` is not in the rtable columns.

            AssertionError: If `l_output_attrs` are not in the ltable.

            AssertionError: If `r_output_attrs` are not in the rtable.

            SyntaxError: If `q_val` is set to a valid value and
             `word_level` is set to True.

            SyntaxError: If `q_val` is set to None and
             `word_level` is set to False.

        Examples:
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ob = DaskOverlapBlocker()
            >>> # Use all cores
            >>> # Use word-level tokenizer
            >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1, n_ltable_chunks=-1, n_rtable_chunks=-1)
            >>> # Use q-gram tokenizer
            >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2, n_ltable_chunks=-1, n_rtable_chunks=-1)
            >>> # Include all possible missing values
            >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True, n_ltable_chunks=-1, n_rtable_chunks=-1)
        """
        logger.warning(
            "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
            "RISK.")

        def _split_rows(table, n_chunks):
            # Row-wise split into n_chunks consecutive DataFrames using the
            # same partitioning as numpy.array_split: the first
            # (len(table) % n_chunks) chunks hold one extra row. Done with
            # .iloc because the `pd.np` alias no longer exists in pandas 2.x.
            base_size, n_larger = divmod(len(table), n_chunks)
            chunks = []
            start = 0
            for chunk_no in range(n_chunks):
                stop = start + base_size + (1 if chunk_no < n_larger else 0)
                chunks.append(table.iloc[start:stop])
                start = stop
            return chunks

        # Input validations
        self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                          r_output_attrs, l_output_prefix,
                                          r_output_prefix, verbose, n_ltable_chunks, n_rtable_chunks)
        self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                         rem_stop_words, q_val, word_level, overlap_size)
        self.validate_allow_missing(allow_missing)
        self.validate_show_progress(show_progress)
        self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)
        self.validate_word_level_qval(word_level, q_val)

        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

        # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

        # validate input table chunks
        validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
        validate_object_type(n_rtable_chunks, int,
                             'Parameter n_rtable_chunks')
        validate_chunks(n_ltable_chunks)
        validate_chunks(n_rtable_chunks)

        n_workers = multiprocessing.cpu_count()

        # -1 means "one chunk per core"
        if n_ltable_chunks == -1:
            n_ltable_chunks = n_workers

        ltable_chunks = _split_rows(ltable, n_ltable_chunks)

        # preprocess/tokenize ltable
        if word_level:
            tokenizer = WhitespaceTokenizer(return_set=True)
        else:
            tokenizer = QgramTokenizer(qval=q_val, return_set=True)

        # Construct DAG for preprocessing/tokenizing ltable chunks.
        # start_row_id carries the position of each chunk's first row in the
        # full table, so ids produced per chunk are global row positions.
        preprocessed_tokenized_ltbl = []
        start_row_id = 0
        for l_chunk in ltable_chunks:
            result = delayed(self.process_tokenize_block_attr)(
                l_chunk[l_overlap_attr], start_row_id, rem_stop_words,
                tokenizer)
            preprocessed_tokenized_ltbl.append(result)
            start_row_id += len(l_chunk)
        preprocessed_tokenized_ltbl = delayed(wrap)(preprocessed_tokenized_ltbl)

        # Execute the DAG
        if show_progress:
            with ProgressBar():
                logger.info('Preprocessing/tokenizing ltable')
                preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute(
                    scheduler="processes", num_workers=n_workers)
        else:
            preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute(
                scheduler="processes", num_workers=n_workers)

        # Merge the per-chunk dictionaries into one
        ltable_processed_dict = {}
        for chunk_dict in preprocessed_tokenized_ltbl_vals:
            ltable_processed_dict.update(chunk_dict)

        # build inverted index
        inverted_index = self.build_inverted_index(ltable_processed_dict)

        if n_rtable_chunks == -1:
            n_rtable_chunks = n_workers

        rtable_chunks = _split_rows(rtable, n_rtable_chunks)

        # Construct the DAG for probing
        probe_result = []
        start_row_id = 0
        for r_chunk in rtable_chunks:
            result = delayed(self.probe)(r_chunk[r_overlap_attr],
                                         inverted_index, start_row_id, rem_stop_words,
                                         tokenizer, overlap_size)
            probe_result.append(result)
            start_row_id += len(r_chunk)
        probe_result = delayed(wrap)(probe_result)

        # Execute the DAG for probing
        if show_progress:
            with ProgressBar():
                logger.info('Probing using rtable')
                probe_result = probe_result.compute(scheduler="processes",
                                                    num_workers=n_workers)
        else:
            probe_result = probe_result.compute(scheduler="processes",
                                                num_workers=n_workers)

        # construct a minimal dataframe that can be used to add more attributes
        flat_list = [item for sublist in probe_result for item in sublist]
        tmp = pd.DataFrame(flat_list, columns=['fk_ltable_rid', 'fk_rtable_rid'])
        # An empty result yields object-dtype columns, which .iloc does not
        # accept as a positional indexer; cast the row ids to integers.
        l_row_ids = tmp['fk_ltable_rid'].astype('int64')
        r_row_ids = tmp['fk_rtable_rid'].astype('int64')
        fk_ltable = ltable.iloc[l_row_ids][l_key].values
        fk_rtable = rtable.iloc[r_row_ids][r_key].values
        id_vals = list(range(len(flat_list)))

        # The foreign-key columns are named with the caller-supplied prefixes,
        # so the catalog entries below must use the same names.
        fk_ltable_col = l_output_prefix + l_key
        fk_rtable_col = r_output_prefix + r_key

        candset = pd.DataFrame.from_dict(
            {'_id': id_vals, fk_ltable_col: fk_ltable, fk_rtable_col: fk_rtable})

        # set the properties for the candidate set
        cm.set_key(candset, '_id')
        cm.set_fk_ltable(candset, fk_ltable_col)
        cm.set_fk_rtable(candset, fk_rtable_col)
        cm.set_ltable(candset, ltable)
        cm.set_rtable(candset, rtable)

        ret_candset = gh.add_output_attributes(candset, l_output_attrs=l_output_attrs,
                                               r_output_attrs=r_output_attrs,
                                               l_output_prefix=l_output_prefix,
                                               r_output_prefix=r_output_prefix,
                                               validate=False)

        # handle missing values
        if allow_missing:
            missing_value_pairs = get_pairs_with_missing_value(ltable, rtable, l_key,
                                                               r_key, l_overlap_attr,
                                                               r_overlap_attr,
                                                               l_output_attrs,
                                                               r_output_attrs,
                                                               l_output_prefix,
                                                               r_output_prefix, False, False)
            # Continue the '_id' numbering after the pairs found by blocking
            missing_value_pairs.insert(0, '_id', range(len(ret_candset),
                                                       len(ret_candset)+len(missing_value_pairs)))

            if len(missing_value_pairs) > 0:
                ret_candset = pd.concat([ret_candset, missing_value_pairs], ignore_index=True, sort=False)
                # pd.concat builds a new DataFrame, so the catalog properties
                # are set again on the combined candidate set.
                cm.set_key(ret_candset, '_id')
                cm.set_fk_ltable(ret_candset, fk_ltable_col)
                cm.set_fk_rtable(ret_candset, fk_rtable_col)
                cm.set_ltable(ret_candset, ltable)
                cm.set_rtable(ret_candset, rtable)

        # Return the final candidate set to user.
        return ret_candset