def block_tables(self,
                     ltable,
                     rtable,
                     l_block_attr,
                     r_block_attr,
                     l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='ltable_',
                     r_output_prefix='rtable_',
                     allow_missing=False,
                     verbose=False,
                     n_jobs=1):
        """Blocks two tables based on attribute equivalence.

        Finds tuple pairs from left and right tables such that the value of
        attribute l_block_attr of a tuple from the left table exactly matches
        the value of attribute r_block_attr of a tuple from the right table.
        This is similar to equi-join of two tables.

        Args:
            ltable (DataFrame): The left input table.

            rtable (DataFrame): The right input table.

            l_block_attr (string): The blocking attribute in left table.

            r_block_attr (string): The blocking attribute in right table.

            l_output_attrs (list): A list of attribute names from the left
                                   table to be included in the
                                   output candidate set (defaults to None).

            r_output_attrs (list): A list of attribute names from the right
                                   table to be included in the
                                   output candidate set (defaults to None).

            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').

            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').

            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple in ltable with missing value in the
                                     blocking attribute will be matched with
                                     every tuple in rtable and vice versa.

            verbose (boolean): A flag to indicate whether the debug information
                should be logged (defaults to False).


            n_jobs (int): The number of parallel jobs to be used for computation
                (defaults to 1). If -1 all CPUs are used. If 0 or 1,
                no parallel computation is used at all, which is useful for
                debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
                used (where n_cpus is the total number of CPUs in the
                machine). Thus, for n_jobs = -2, all CPUs but one are used.
                If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
                computation is used (i.e., equivalent to the default).

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `ltable` is not of type pandas
                DataFrame.
            AssertionError: If `rtable` is not of type pandas
                DataFrame.
            AssertionError: If `l_block_attr` is not of type string.
            AssertionError: If `r_block_attr` is not of type string.
            AssertionError: If `l_output_attrs` is not of type of
                list.
            AssertionError: If `r_output_attrs` is not of type of
                list.
            AssertionError: If the values in `l_output_attrs` is not of type
                string.
            AssertionError: If the values in `r_output_attrs` is not of type
                string.
            AssertionError: If `l_output_prefix` is not of type
                string.
            AssertionError: If `r_output_prefix` is not of type
                string.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `n_jobs` is not of type
                int.
            AssertionError: If `l_block_attr` is not in the ltable columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
            AssertionError: If `l_out_attrs` are not in the ltable.
            AssertionError: If `r_out_attrs` are not in the rtable.

        """

        # validate data types of input parameters
        self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                          r_output_attrs, l_output_prefix,
                                          r_output_prefix, verbose, n_jobs)

        # validate data types of input blocking attributes
        self.validate_types_block_attrs(l_block_attr, r_block_attr)

        # validate data type of allow_missing
        self.validate_allow_missing(allow_missing)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # get and validate required metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                        verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                        verbose)

        # do blocking

        # # do projection of required attributes from the tables
        l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr,
                                                 l_output_attrs)
        ltable_proj = ltable[l_proj_attrs]
        r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr,
                                                 r_output_attrs)
        rtable_proj = rtable[r_proj_attrs]

        # # remove records with nans in the blocking attribute
        l_df = rem_nan(ltable_proj, l_block_attr)
        r_df = rem_nan(rtable_proj, r_block_attr)

        # # determine number of processes to launch parallely
        n_procs = self.get_num_procs(n_jobs, len(l_df) * len(r_df))

        if n_procs <= 1:
            # single process
            candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                          l_block_attr, r_block_attr,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix, r_output_prefix,
                                          allow_missing)
        else:
            # multiprocessing
            m, n = self.get_split_params(n_procs, len(l_df), len(r_df))
            l_splits = pd.np.array_split(l_df, m)
            r_splits = pd.np.array_split(r_df, n)
            c_splits = Parallel(n_jobs=m * n)(delayed(_block_tables_split)(
                l, r, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs,
                r_output_attrs, l_output_prefix, r_output_prefix,
                allow_missing) for l in l_splits for r in r_splits)
            candset = pd.concat(c_splits, ignore_index=True)

        # if allow_missing flag is True, then compute
        # all pairs with missing value in left table, and
        # all pairs with missing value in right table
        if allow_missing:
            missing_pairs = self.get_pairs_with_missing_value(
                ltable_proj, rtable_proj, l_key, r_key, l_block_attr,
                r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix,
                r_output_prefix)
            candset = pd.concat([candset, missing_pairs], ignore_index=True)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset
Exemplo n.º 2
0
    def block_tables(self,
                     ltable,
                     rtable,
                     l_block_attr,
                     r_block_attr,
                     l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='ltable_',
                     r_output_prefix='rtable_',
                     allow_missing=False,
                     verbose=False,
                     n_jobs=1):
        """Blocks two tables based on attribute equivalence.

        Conceptually, this will check `l_block_attr=r_block_attr` for each tuple
        pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a
        Pandas dataframe object with tuple pairs that satisfy the equality condition.
        The dataframe will include attributes '_id', key attribute from
        ltable, key attributes from rtable, followed by lists `l_output_attrs` and
        `r_output_attrs` if they are specified. Each of these output and key attributes will be
        prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set
        to `True` then all tuple pairs with missing value in at least one of the tuples will be
        included in the output dataframe.
        Further, this will update the following metadata in the catalog for the output table:
        (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable.

        Args:
            ltable (DataFrame): The left input table.

            rtable (DataFrame): The right input table.

            l_block_attr (string): The blocking attribute in left table.

            r_block_attr (string): The blocking attribute in right table.

            l_output_attrs (list): A list of attribute names from the left
                                   table to be included in the
                                   output candidate set (defaults to None).

            r_output_attrs (list): A list of attribute names from the right
                                   table to be included in the
                                   output candidate set (defaults to None).

            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').

            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').

            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple in ltable with missing value in the
                                     blocking attribute will be matched with
                                     every tuple in rtable and vice versa.

            verbose (boolean): A flag to indicate whether the debug information
                should be logged (defaults to False).


            n_jobs (int): The number of parallel jobs to be used for computation
                (defaults to 1). If -1 all CPUs are used. If 0 or 1,
                no parallel computation is used at all, which is useful for
                debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
                used (where n_cpus is the total number of CPUs in the
                machine). Thus, for n_jobs = -2, all CPUs but one are used.
                If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
                computation is used (i.e., equivalent to the default).

        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).

        Raises:
            AssertionError: If `ltable` is not of type pandas
                DataFrame.
            AssertionError: If `rtable` is not of type pandas
                DataFrame.
            AssertionError: If `l_block_attr` is not of type string.
            AssertionError: If `r_block_attr` is not of type string.
            AssertionError: If `l_output_attrs` is not of type of
                list.
            AssertionError: If `r_output_attrs` is not of type of
                list.
            AssertionError: If the values in `l_output_attrs` is not of type
                string.
            AssertionError: If the values in `r_output_attrs` is not of type
                string.
            AssertionError: If `l_output_prefix` is not of type
                string.
            AssertionError: If `r_output_prefix` is not of type
                string.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `n_jobs` is not of type
                int.
            AssertionError: If `l_block_attr` is not in the ltable columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
            AssertionError: If `l_out_attrs` are not in the ltable.
            AssertionError: If `r_out_attrs` are not in the rtable.

        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> ab = em.AttrEquivalenceBlocker()
            >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
            # Include all possible tuple pairs with missing values
            >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True)


        """

        # validate data types of input parameters
        self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                          r_output_attrs, l_output_prefix,
                                          r_output_prefix, verbose, n_jobs)

        # validate data types of input blocking attributes
        self.validate_types_block_attrs(l_block_attr, r_block_attr)

        # validate data type of allow_missing
        self.validate_allow_missing(allow_missing)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # get and validate required metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                        verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                        verbose)

        # do blocking

        # # do projection of required attributes from the tables
        l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr,
                                                 l_output_attrs)
        ltable_proj = ltable[l_proj_attrs]
        r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr,
                                                 r_output_attrs)
        rtable_proj = rtable[r_proj_attrs]

        # # remove records with nans in the blocking attribute
        l_df = rem_nan(ltable_proj, l_block_attr)
        r_df = rem_nan(rtable_proj, r_block_attr)

        # # determine number of processes to launch parallely
        n_procs = self.get_num_procs(n_jobs, len(l_df) * len(r_df))

        if n_procs <= 1:
            # single process
            candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                          l_block_attr, r_block_attr,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix, r_output_prefix,
                                          allow_missing)
        else:
            # multiprocessing
            m, n = self.get_split_params(n_procs, len(l_df), len(r_df))
            l_splits = np.array_split(l_df, m)
            r_splits = np.array_split(r_df, n)
            c_splits = Parallel(n_jobs=m * n)(delayed(_block_tables_split)(
                l, r, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs,
                r_output_attrs, l_output_prefix, r_output_prefix,
                allow_missing) for l in l_splits for r in r_splits)
            candset = pd.concat(c_splits, ignore_index=True)

        # if allow_missing flag is True, then compute
        # all pairs with missing value in left table, and
        # all pairs with missing value in right table
        if allow_missing:
            missing_pairs = self.get_pairs_with_missing_value(
                ltable_proj, rtable_proj, l_key, r_key, l_block_attr,
                r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix,
                r_output_prefix)
            candset = pd.concat([candset, missing_pairs], ignore_index=True)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset
    def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     allow_missing=False, verbose=False, n_ltable_chunks=1,
                     n_rtable_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

        Blocks two tables based on attribute equivalence.
        Conceptually, this will check `l_block_attr=r_block_attr` for each tuple
        pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a
        Pandas dataframe object with tuple pairs that satisfy the equality condition.
        The dataframe will include attributes '_id', key attribute from
        ltable, key attributes from rtable, followed by lists `l_output_attrs` and
        `r_output_attrs` if they are specified. Each of these output and key attributes will be
        prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set
        to `True` then all tuple pairs with missing value in at least one of the tuples will be
        included in the output dataframe.
        Further, this will update the following metadata in the catalog for the output table:
        (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable.
      
        Args:
            ltable (DataFrame): The left input table.
            rtable (DataFrame): The right input table.
            l_block_attr (string): The blocking attribute in left table.
            r_block_attr (string): The blocking attribute in right table.
            l_output_attrs (list): A list of attribute names from the left
                                   table to be included in the
                                   output candidate set (defaults to None).
            r_output_attrs (list): A list of attribute names from the right
                                   table to be included in the
                                   output candidate set (defaults to None).
            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').
            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').
            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple in ltable with missing value in the
                                     blocking attribute will be matched with
                                     every tuple in rtable and vice versa.
            verbose (boolean): A flag to indicate whether the debug information
                              should be logged (defaults to False).
            
            n_ltable_chunks (int): The number of partitions to split the left table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.                                      
            n_rtable_chunks (int): The number of partitions to split the right table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.            
        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).
            
        Raises:
            AssertionError: If `ltable` is not of type pandas
                DataFrame.
            AssertionError: If `rtable` is not of type pandas
                DataFrame.
            AssertionError: If `l_block_attr` is not of type string.
            AssertionError: If `r_block_attr` is not of type string.
            AssertionError: If `l_output_attrs` is not of type of
                list.
            AssertionError: If `r_output_attrs` is not of type of
                list.
            AssertionError: If the values in `l_output_attrs` is not of type
                string.
            AssertionError: If the values in `r_output_attrs` is not of type
                string.
            AssertionError: If `l_output_prefix` is not of type
                string.
            AssertionError: If `r_output_prefix` is not of type
                string.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `n_ltable_chunks` is not of type
                int.
            AssertionError: If `n_rtable_chunks` is not of type
                int.
            AssertionError: If `l_block_attr` is not in the ltable columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
            AssertionError: If `l_out_attrs` are not in the ltable.
            AssertionError: If `r_out_attrs` are not in the rtable.
       
        Examples:
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker            
            >>> ab = DaskAttrEquivalenceBlocker()
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
            # Include all possible tuple pairs with missing values
            >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True)
        """

        logger.warning("WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR "
                    "OWN RISK.")


        # validate data types of input parameters
        self.validate_types_params_tables(ltable, rtable,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix,
                                          r_output_prefix, verbose, 1) # last arg is
                                         # set to 1 just to reuse the function from the
                                         # old blocker.

        # validate data types of input blocking attributes
        self.validate_types_block_attrs(l_block_attr, r_block_attr)

        # validate data type of allow_missing
        self.validate_allow_missing(allow_missing)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # validate number of ltable and rtable chunks
        validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
        validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')

        validate_chunks(n_ltable_chunks)
        validate_chunks(n_rtable_chunks)

        # get and validate required metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                        verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                        verbose)

        # do blocking

        # # do projection of required attributes from the tables
        l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr,
                                                 l_output_attrs)
        ltable_proj = ltable[l_proj_attrs]
        r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr,
                                                 r_output_attrs)
        rtable_proj = rtable[r_proj_attrs]

        # # remove records with nans in the blocking attribute
        l_df = rem_nan(ltable_proj, l_block_attr)
        r_df = rem_nan(rtable_proj, r_block_attr)

        # # determine the number of chunks
        n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
        n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

        if n_ltable_chunks == 1 and n_rtable_chunks == 1:
            # single process
            candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                          l_block_attr, r_block_attr,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix, r_output_prefix,
                                          allow_missing)
        else:
            l_splits = np.array_split(l_df, n_ltable_chunks)
            r_splits = np.array_split(r_df, n_rtable_chunks)
            c_splits = []

            for l in l_splits:
                for r in r_splits:
                    partial_result = delayed(_block_tables_split)(l, r, l_key, r_key,
                                             l_block_attr, r_block_attr,
                                             l_output_attrs, r_output_attrs,
                                             l_output_prefix, r_output_prefix,
                                             allow_missing)
                    c_splits.append(partial_result)
            c_splits = delayed(wrap)(c_splits)
            c_splits = c_splits.compute(scheduler="processes", n_jobs=get_num_cores())
            candset = pd.concat(c_splits, ignore_index=True)

        # if allow_missing flag is True, then compute
        # all pairs with missing value in left table, and
        # all pairs with missing value in right table
        if allow_missing:
            missing_pairs = self.get_pairs_with_missing_value(ltable_proj,
                                                              rtable_proj,
                                                              l_key, r_key,
                                                              l_block_attr,
                                                              r_block_attr,
                                                              l_output_attrs,
                                                              r_output_attrs,
                                                              l_output_prefix,
                                                              r_output_prefix)
            candset = pd.concat([candset, missing_pairs], ignore_index=True)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset
    def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                     l_output_attrs=None, r_output_attrs=None,
                     l_output_prefix='ltable_', r_output_prefix='rtable_',
                     allow_missing=False, verbose=False, n_ltable_chunks=1,
                     n_rtable_chunks=1):
        """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

        Blocks two tables based on attribute equivalence.
        Conceptually, this will check `l_block_attr=r_block_attr` for each tuple
        pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a
        Pandas dataframe object with tuple pairs that satisfy the equality condition.
        The dataframe will include attributes '_id', key attribute from
        ltable, key attributes from rtable, followed by lists `l_output_attrs` and
        `r_output_attrs` if they are specified. Each of these output and key attributes will be
        prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set
        to `True` then all tuple pairs with missing value in at least one of the tuples will be
        included in the output dataframe.
        Further, this will update the following metadata in the catalog for the output table:
        (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable.
      
        Args:
            ltable (DataFrame): The left input table.
            rtable (DataFrame): The right input table.
            l_block_attr (string): The blocking attribute in left table.
            r_block_attr (string): The blocking attribute in right table.
            l_output_attrs (list): A list of attribute names from the left
                                   table to be included in the
                                   output candidate set (defaults to None).
            r_output_attrs (list): A list of attribute names from the right
                                   table to be included in the
                                   output candidate set (defaults to None).
            l_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the left table in the output
                                   candidate set (defaults to 'ltable\_').
            r_output_prefix (string): The prefix to be used for the attribute names
                                   coming from the right table in the output
                                   candidate set (defaults to 'rtable\_').
            allow_missing (boolean): A flag to indicate whether tuple pairs
                                     with missing value in at least one of the
                                     blocking attributes should be included in
                                     the output candidate set (defaults to
                                     False). If this flag is set to True, a
                                     tuple in ltable with missing value in the
                                     blocking attribute will be matched with
                                     every tuple in rtable and vice versa.
            verbose (boolean): A flag to indicate whether the debug information
                              should be logged (defaults to False).
            
            n_ltable_chunks (int): The number of partitions to split the left table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.                                      
            n_rtable_chunks (int): The number of partitions to split the right table (
                                    defaults to 1). If it is set to -1, then the number of 
                                    partitions is set to the number of cores in the 
                                    machine.            
        Returns:
            A candidate set of tuple pairs that survived blocking (DataFrame).
            
        Raises:
            AssertionError: If `ltable` is not of type pandas
                DataFrame.
            AssertionError: If `rtable` is not of type pandas
                DataFrame.
            AssertionError: If `l_block_attr` is not of type string.
            AssertionError: If `r_block_attr` is not of type string.
            AssertionError: If `l_output_attrs` is not of type of
                list.
            AssertionError: If `r_output_attrs` is not of type of
                list.
            AssertionError: If the values in `l_output_attrs` is not of type
                string.
            AssertionError: If the values in `r_output_attrs` is not of type
                string.
            AssertionError: If `l_output_prefix` is not of type
                string.
            AssertionError: If `r_output_prefix` is not of type
                string.
            AssertionError: If `verbose` is not of type
                boolean.
            AssertionError: If `allow_missing` is not of type boolean.
            AssertionError: If `n_ltable_chunks` is not of type
                int.
            AssertionError: If `n_rtable_chunks` is not of type
                int.
            AssertionError: If `l_block_attr` is not in the ltable columns.
            AssertionError: If `r_block_attr` is not in the rtable columns.
            AssertionError: If `l_out_attrs` are not in the ltable.
            AssertionError: If `r_out_attrs` are not in the rtable.
       
        Examples:
            >>> import py_entitymatching as em
            >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker            
            >>> ab = DaskAttrEquivalenceBlocker()
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
            # Include all possible tuple pairs with missing values
            >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True)
        """

        logger.warning("WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR "
                    "OWN RISK.")


        # validate data types of input parameters
        self.validate_types_params_tables(ltable, rtable,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix,
                                          r_output_prefix, verbose, 1) # last arg is
                                         # set to 1 just to reuse the function from the
                                         # old blocker.

        # validate data types of input blocking attributes
        self.validate_types_block_attrs(l_block_attr, r_block_attr)

        # validate data type of allow_missing
        self.validate_allow_missing(allow_missing)

        # validate input parameters
        self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
        self.validate_output_attrs(ltable, rtable, l_output_attrs,
                                   r_output_attrs)

        # validate number of ltable and rtable chunks
        validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
        validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')

        validate_chunks(n_ltable_chunks)
        validate_chunks(n_rtable_chunks)

        # get and validate required metadata
        log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

        # # get metadata
        l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                     verbose)

        # # validate metadata
        cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                        verbose)
        cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                        verbose)

        # do blocking

        # # do projection of required attributes from the tables
        l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr,
                                                 l_output_attrs)
        ltable_proj = ltable[l_proj_attrs]
        r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr,
                                                 r_output_attrs)
        rtable_proj = rtable[r_proj_attrs]

        # # remove records with nans in the blocking attribute
        l_df = rem_nan(ltable_proj, l_block_attr)
        r_df = rem_nan(rtable_proj, r_block_attr)

        # # determine the number of chunks
        n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
        n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

        if n_ltable_chunks == 1 and n_rtable_chunks == 1:
            # single process
            candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                          l_block_attr, r_block_attr,
                                          l_output_attrs, r_output_attrs,
                                          l_output_prefix, r_output_prefix,
                                          allow_missing)
        else:
            l_splits = pd.np.array_split(l_df, n_ltable_chunks)
            r_splits = pd.np.array_split(r_df, n_rtable_chunks)
            c_splits = []

            for l in l_splits:
                for r in r_splits:
                    partial_result = delayed(_block_tables_split)(l, r, l_key, r_key,
                                             l_block_attr, r_block_attr,
                                             l_output_attrs, r_output_attrs,
                                             l_output_prefix, r_output_prefix,
                                             allow_missing)
                    c_splits.append(partial_result)
            c_splits = delayed(wrap)(c_splits)
            c_splits = c_splits.compute(scheduler="processes", n_jobs=get_num_cores())
            candset = pd.concat(c_splits, ignore_index=True)

        # if allow_missing flag is True, then compute
        # all pairs with missing value in left table, and
        # all pairs with missing value in right table
        if allow_missing:
            missing_pairs = self.get_pairs_with_missing_value(ltable_proj,
                                                              rtable_proj,
                                                              l_key, r_key,
                                                              l_block_attr,
                                                              r_block_attr,
                                                              l_output_attrs,
                                                              r_output_attrs,
                                                              l_output_prefix,
                                                              r_output_prefix)
            candset = pd.concat([candset, missing_pairs], ignore_index=True)

        # update catalog
        key = get_name_for_key(candset.columns)
        candset = add_key_column(candset, key)
        cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                                  r_output_prefix + r_key, ltable, rtable)

        # return candidate set
        return candset