def rename_col(df, old_col_name, new_col_name):
    new_df = df.rename(columns={old_col_name: new_col_name})

    if cm.is_dfinfo_present(df):
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)

        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
                    cm.get_metadata_for_candset(df, logger, False)
                if key == old_col_name:
                    cm.set_key(new_df, new_col_name)
                elif fk_ltable == old_col_name:
                    cm.set_fk_ltable(new_df, new_col_name)
                elif fk_rtable == old_col_name:
                    cm.set_fk_rtable(new_df, new_col_name)
                else:
                    pass
            else:
                key = cm.get_key(df)
                if key == old_col_name:
                    cm.set_key(new_df, new_col_name)

    return new_df
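
A minimal usage sketch for the rename_col helper defined above (the file path and the new 'record_id' column name are hypothetical); the catalog key is expected to follow the renamed column:

import py_entitymatching as em

A = em.read_csv_metadata('table_A.csv', key='ID')   # hypothetical CSV with an 'ID' key column
A_renamed = rename_col(A, 'ID', 'record_id')
print(em.get_key(A_renamed))                        # expected output: 'record_id'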
    def test_validate_and_set_fk_rtable_err_case_2(self):
        C = pd.read_csv(path_c)
        p = os.sep.join([catalog_datasets_path, 'A_inv_fk.csv'])
        A = pd.read_csv(p)
        status = cm.validate_and_set_fk_rtable(C, 'ltable_ID', A, 'ID')
        self.assertEqual(status, False)
        self.assertEqual(cm.is_dfinfo_present(C), False)

    def test_copy_properties_valid_1(self):
        A = read_csv_metadata(path_a)
        A1 = pd.read_csv(path_a)
        cm.copy_properties(A, A1)
        self.assertEqual(cm.is_dfinfo_present(A1), True)
        p = cm.get_all_properties(A)
        p1 = cm.get_all_properties(A1)
        self.assertEqual(p, p1)
        self.assertEqual(cm.get_key(A1), cm.get_key(A))
def filter_rows(df, condn):
    new_df = df.query(condn)

    # update metadata
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            cm.init_properties(new_df)
            cm.copy_properties(df, new_df)

    return new_df
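
A short sketch of calling filter_rows (the 'age' column and file path are assumptions); the condition string goes straight to pandas DataFrame.query, and the catalog properties are copied onto the filtered frame:

import py_entitymatching as em

A = em.read_csv_metadata('table_A.csv', key='ID')   # hypothetical CSV with an 'age' column
A_young = filter_rows(A, 'age < 30')
print(em.get_key(A_young))                          # metadata carried over: 'ID'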
def mutate_col(df, **kwargs):
    new_df = df.assign(**kwargs)

    if cm.is_dfinfo_present(df):
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)

        # if _is_table_or_candset(df):
        #     key = cm.get_key(df)
        #     if key == new_col_name:
        #         cm.set_key(new_df, new_col_name)

    return new_df
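
A sketch of mutate_col (the 'first' and 'last' columns are assumptions); the keyword arguments are passed through to pandas DataFrame.assign, and the catalog properties of the input are copied to the result:

import py_entitymatching as em

A = em.read_csv_metadata('table_A.csv', key='ID')               # hypothetical CSV
A_new = mutate_col(A, full_name=A['first'] + ' ' + A['last'])   # assumes 'first' and 'last' columns
print(em.get_key(A_new))                                        # 'ID' is preserved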
    def test_copy_properties_valid_2(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b)
        C = read_csv_metadata(path_c, ltable=A, rtable=B)

        C1 = pd.read_csv(path_c)
        cm.copy_properties(C, C1)
        self.assertEqual(cm.is_dfinfo_present(C1), True)
        p = cm.get_all_properties(C)
        p1 = cm.get_all_properties(C1)
        self.assertEqual(p, p1)
        self.assertEqual(cm.get_key(C1), cm.get_key(C))
        self.assertEqual(cm.get_ltable(C1).equals(A), True)
        self.assertEqual(cm.get_rtable(C1).equals(B), True)
        self.assertEqual(cm.get_fk_ltable(C1), cm.get_fk_ltable(C))
        self.assertEqual(cm.get_fk_rtable(C1), cm.get_fk_rtable(C))
Example #10
def _write_metadata(data_frame, file_path):
    """
    Write metadata contents to disk.
    """
    # Initialize a metadata dictionary to store the metadata.
    metadata_dict = collections.OrderedDict()

    # Get all the properties for the input data frame
    if cm.is_dfinfo_present(data_frame) is True:
        properties = cm.get_all_properties(data_frame)
    else:
        # If the data_frame is not in the catalog, then return immediately.
        return False

    # If the properties are present in the catalog, then write properties to
    # disk
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            # If the property value is not of type string, then just write it
            #  as 'POINTER'. This will be useful while writing the candidate
            # sets to disk. The candidate set will have properties such as
            # ltable and rtable which are DataFrames. We do not have a simple
            # way to write them to disk and link them back to the candidate set
            # while reading back from disk. So to get around this problem we
            # will use 'POINTER' as the special value to indicate objects
            # other than strings.
            if isinstance(property_value, six.string_types) is False:
                metadata_dict[property_name] = 'POINTER'
            else:
                metadata_dict[property_name] = property_value

        # Write the properties to a file on disk. The file will have one
        # property per line, written with the following syntax:
        # #property_name=property_value
        with open(file_path, 'w') as file_handler:
            for property_name, property_value in six.iteritems(metadata_dict):
                file_handler.write('#%s=%s\n' %
                                   (property_name, property_value))

    return True
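
For reference, a file written by _write_metadata is plain text with one '#property_name=property_value' line per property; below is a minimal reader sketch (the file name is hypothetical and this is not the library's own parser):

metadata = {}
with open('table_A.metadata') as file_handler:           # hypothetical metadata file
    for line in file_handler:
        line = line.strip()
        if line.startswith('#') and '=' in line:
            name, value = line[1:].split('=', 1)
            metadata[name] = value                        # 'POINTER' marks non-string properties
print(metadata)                                           # e.g. {'key': 'ID'}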
def drop_cols(df, col_list):
    if not isinstance(col_list, list):
        col_list = [col_list]
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                col_list = gh.list_diff(col_list, [key, fk_ltable, fk_rtable])
                col_list = gh.list_drop_duplicates(col_list)
            else:
                key = cm.get_key(df)
                col_list = gh.list_diff(col_list, [key])
                col_list = gh.list_drop_duplicates(col_list)
        new_df = df.drop(col_list, axis=1)
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    else:
        new_df = df[col_list]

    return new_df
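
A usage sketch for drop_cols (the 'zipcode' column and file path are assumptions); because the key is removed from the drop list, the catalog metadata on the result stays valid:

import py_entitymatching as em

A = em.read_csv_metadata('table_A.csv', key='ID')   # hypothetical CSV
A_small = drop_cols(A, ['ID', 'zipcode'])           # 'ID' is the key, so only 'zipcode' is dropped
print(em.get_key(A_small))                          # 'ID'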
def preserve_metadata(df, new_df):
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                if not ch.check_attrs_present(new_df,
                                              [key, fk_ltable, fk_rtable]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df
            else:
                key = cm.get_key(df)
                if not ch.check_attrs_present(new_df, [key]):
                    logger.warning('Not setting the metadata as some attrs '
                                   'are not present')
                    return new_df

        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    return new_df
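
preserve_metadata is meant for results of plain pandas operations that do not carry the catalog along; a sketch, assuming C is a candidate set read with read_csv_metadata(..., ltable=A, rtable=B) as in the tests on this page and that head() keeps the key and foreign-key columns:

C_top = C.head(100)                  # plain pandas call; catalog metadata is not carried over
C_top = preserve_metadata(C, C_top)  # re-attaches key, fk_ltable and fk_rtable if the columns survived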
def project_cols(df, col_list):
    if not isinstance(col_list, list):
        col_list = [col_list]
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\
                    = cm.get_metadata_for_candset(df, logger, False)
                updated_col_list = [key, fk_ltable, fk_rtable]
                updated_col_list.extend(col_list)
                col_list = gh.list_drop_duplicates(updated_col_list)
            else:
                key = cm.get_key(df)
                updated_col_list = [key]
                updated_col_list.extend(col_list)
                col_list = gh.list_drop_duplicates(updated_col_list)
        new_df = df[col_list]
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    else:
        new_df = df[col_list]

    return new_df
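
A sketch of project_cols on the same candidate set C (the attribute names are assumptions); the key and foreign-key columns are prepended automatically, so they do not have to be listed:

C_proj = project_cols(C, ['ltable_name', 'rtable_name'])   # hypothetical attribute columns
# C_proj also keeps the candidate set's key and its two foreign-key columns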
    def test_is_dfinfo_present_valid_1(self):
        A = read_csv_metadata(path_a)
        status = cm.is_dfinfo_present(A)
        self.assertEqual(status, True)

    def test_del_all_properties_valid_2(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b)
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        cm.del_all_properties(C)
        self.assertEqual(cm.is_dfinfo_present(C), False)

    def test_del_all_properties_valid_1(self):
        A = read_csv_metadata(path_a)
        cm.del_all_properties(A)
        self.assertEqual(cm.is_dfinfo_present(A), False)
Example #20
def save_table(data_frame, file_path, metadata_ext='.pklmetadata'):
    """
    Saves a DataFrame to disk along with its metadata in a pickle format.

    This function saves a  DataFrame to disk along with its metadata from
    the catalog.

    Specifically, this function saves the DataFrame in the given
    file path, and saves the metadata in the same directory (as the
    file path) but with a different extension. This extension can be
    optionally given by the user (defaults to '.pklmetadata').

    Args:
        data_frame (DataFrame): The DataFrame that should be saved.

        file_path (string): The file path where the DataFrame must be stored.

        metadata_ext (string): The metadata extension that should be used while
            storing the metadata information. The default value is
            '.pklmetadata'.

    Returns:
        A Boolean value of True is returned if the DataFrame is successfully
        saved.

    Raises:
        AssertionError: If `data_frame` is not of type pandas
         DataFrame.
        AssertionError: If `file_path` is not of type string.
        AssertionError: If `metadata_ext` is not of type string.
        AssertionError: If a file cannot be written in the given `file_path`.

    Examples:

        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.save_table(A, './A.pkl') # will store two files ./A.pkl and ./A.pklmetadata

        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.save_table(A, './A.pkl', metadata_ext='.pklmeta') # will store two files ./A.pkl and ./A.pklmeta


    See Also:
        :meth:`~py_entitymatching.load_table`

    Note:
        This function is a bit different from to_csv_metadata, where the
        DataFrame is stored in a CSV file format. The CSV file format can be
        viewed using a text editor. But a DataFrame stored using 'save_table' is
        stored in a special format, which cannot be viewed with a text editor.
        The reason we have save_table is that, for larger DataFrames, it is
        more efficient to pickle the DataFrame to disk than to write it in
        CSV format.
    """
    # Validate the input parameters

    validate_object_type(data_frame, pd.DataFrame)

    validate_object_type(file_path, six.string_types, error_prefix='Input file path')

    validate_object_type(metadata_ext, six.string_types, error_prefix='Input Metadata ext')

    # Get the file_name (without the extension) and the extension from the given
    #  file path. For example if the file_path was /Users/foo/file.csv then
    # the file_name will be /Users/foo/file and the extension will be '.csv'
    file_name, _ = os.path.splitext(file_path)

    # The metadata file name is the same file name but with the extension
    # given by the user
    metadata_filename = file_name + metadata_ext

    # Check if the file exists in the file_path and whether we have
    # sufficient access privileges to write in that path
    can_write, file_exists = ps._check_file_path(file_path)

    if can_write:
        # If the file already exists then issue a warning and overwrite the
        # file
        if file_exists:
            logger.warning(
                'File already exists at %s; Overwriting it', file_path)
            # We open the file_path in binary mode, as we are writing in
            # binary format.
            with open(file_path, 'wb') as file_handler:
                cloudpickle.dump(data_frame, file_handler)
        else:
            # The file does not exist yet; write it out.
            with open(file_path, 'wb') as file_handler:
                cloudpickle.dump(data_frame, file_handler)
    else:
        # Looks like we cannot write the file in the given path. Raise an
        # error in this case.
        logger.error('Cannot write in the file path %s; Exiting', file_path)
        raise AssertionError('Cannot write in the file path %s', file_path)

    # Once we are done with writing the DataFrame, we will write the metadata
    #  now

    # Initialize a metadata dictionary to hold the metadata of DataFrame from
    #  the catalog
    metadata_dict = collections.OrderedDict()

    # get all the properties for the input data frame
    # # Check if the DataFrame information is present in the catalog
    properties = {}
    if cm.is_dfinfo_present(data_frame) is True:
        properties = cm.get_all_properties(data_frame)

    # If the properties are present in the catalog, then write properties to
    # disk
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            if isinstance(property_value, six.string_types) is True:
                metadata_dict[property_name] = property_value

    # try to save metadata
    can_write, file_exists = ps._check_file_path(metadata_filename)
    if can_write:
        # If the file already exists, then issue a warning and overwrite the
        # file
        if file_exists:
            logger.warning(
                'Metadata file already exists at %s. Overwriting it',
                metadata_filename)
            # write metadata contents
            with open(metadata_filename, 'wb') as file_handler:
                cloudpickle.dump(metadata_dict, file_handler)
        else:
            # write metadata contents
            with open(metadata_filename, 'wb') as file_handler:
                cloudpickle.dump(metadata_dict, file_handler)
    else:
        logger.warning(
            'Cannot write metadata at the file path %s. Skip writing metadata '
            'file', metadata_filename)

    return True
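
A round-trip sketch with the companion load_table command mentioned in the See Also section (file paths are hypothetical; it assumes py_entitymatching exposes these commands as em.set_key, em.save_table, em.load_table and em.get_key, as its documentation indicates):

import pandas as pd
import py_entitymatching as em

A = pd.DataFrame({'id': [1, 2], 'colA': ['a', 'b'], 'colB': [10, 20]})
em.set_key(A, 'id')             # register a key so there is metadata to save
em.save_table(A, './A.pkl')     # writes ./A.pkl and ./A.pklmetadata
A2 = em.load_table('./A.pkl')   # restores the DataFrame and its string-valued properties
print(em.get_key(A2))           # 'id'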
def dask_down_sample(ltable, rtable, size, y_param, show_progress=True, verbose=False,
                seed=None, rem_stop_words=True, rem_puncs=True, n_ltable_chunks=1,
                n_sample_rtable_chunks=1):


    """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.
         
        This command down samples two tables A and B into smaller tables A' and
        B' respectively.    
        Specifically, first it randomly selects `size` tuples
        from the table B to be table B'. Next, it builds an inverted index I
        (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
        finds a set P of k/2 tuples from I that match x,
        and a set Q of k/2 tuples randomly selected from A - P.
        The idea is for A' and B' to share some matches yet be
        as representative of A and B as possible.
    
        Args:
            ltable (DataFrame): The left input table, i.e., table A.
            rtable (DataFrame): The right input table, i.e., table B. 
            size (int): The size that table B should be down sampled to.
            y_param (int): The parameter to control the down sample size of table A.
                Specifically, the down sampled size of table A should be close to
                size * y_param.
            show_progress (boolean): A flag to indicate whether a progress bar
                should be displayed (defaults to True).
            verbose (boolean): A flag to indicate whether the debug information
             should be displayed (defaults to False).
            seed (int): The seed for the pseudo random number generator to select
                the tuples from A and B (defaults to None).
            rem_stop_words (boolean): A flag to indicate whether a default set of stop words 
             must be removed.
            rem_puncs (boolean): A flag to indicate whether the punctuations must be 
             removed from the strings.             
            n_ltable_chunks (int): The number of partitions for ltable (defaults to 1). If it 
              is set to -1, the number of partitions will be set to the 
              number of cores in the machine.  
            n_sample_rtable_chunks (int): The number of partitions for the 
              sampled rtable (defaults to 1)
                
    
        Returns:
            Down sampled tables A and B as pandas DataFrames.
    
        Raises:
            AssertionError: If any of the input tables (`ltable`, `rtable`) are
                empty or not a DataFrame.
            AssertionError: If `size` or `y_param` is empty or 0 or not a
                valid integer value.
            AssertionError: If `seed` is not a valid integer
                value.
            AssertionError: If `verbose` is not of type bool.
            AssertionError: If `show_progress` is not of type bool.
            AssertionError: If `n_ltable_chunks` is not of type int.
            AssertionError: If `n_sample_rtable_chunks` is not of type int.            
    
        Examples:
            >>> from py_entitymatching.dask.dask_down_sample import dask_down_sample
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
            # Example with seed = 0. This means the same sample data set will be returned
            # each time this function is run.
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, seed=0, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
            
        """

    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # validation checks
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A (ltable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A (ltable) is not of type pandas DataFrame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B (rtable) is not of type pandas DataFrame')

        raise AssertionError(
            'Input table B (rtable) is not of type pandas DataFrame')

    if len(ltable) == 0 or len(rtable) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(rtable) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B')

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_sample_rtable_chunks, int, 'Parameter n_sample_rtable_chunks')


    rtable_sampled = sample_right_table(rtable, size, seed)

    ltbl_str_cols = _get_str_cols_list(ltable)
    proj_ltable = ltable[ltable.columns[ltbl_str_cols]]


    if n_ltable_chunks == -1:
        n_ltable_chunks = get_num_cores()

    ltable_chunks = np.array_split(proj_ltable, n_ltable_chunks)  # pd.np is removed in recent pandas; assumes numpy is imported as np
    preprocessed_tokenized_tbl = []

    # Use Dask to preprocess and tokenize strings.
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        # start_row_id is internally used by process_tokenize_concat strings to map
        # each to its row id in the ltable.
        result = delayed(process_tokenize_concat_strings)(ltable_chunks[i],
                                                             start_row_id,
                                                             rem_puncs, rem_stop_words)
        preprocessed_tokenized_tbl.append(result)

        # update start_row_id
        start_row_id += len(ltable_chunks[i])

    preprocessed_tokenized_tbl = delayed(wrap)(preprocessed_tokenized_tbl)

    # Now execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
            scheduler="processes", num_workers=get_num_cores())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_tbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_tbl_vals[i])

    # Build an inverted index
    inverted_index = build_inverted_index(ltable_processed_dict)


    # Preprocess/tokenize sampled rtable and probe
    rtbl_str_cols = _get_str_cols_list(rtable_sampled)
    proj_rtable_sampled = rtable_sampled[rtable_sampled.columns[rtbl_str_cols]]


    if n_sample_rtable_chunks == -1:
        n_sample_rtable_chunks = get_num_cores()

    rtable_chunks = np.array_split(proj_rtable_sampled, n_sample_rtable_chunks)  # assumes numpy is imported as np
    probe_result = []

    # Create the DAG
    for i in range(len(rtable_chunks)):
        result = delayed(probe)(rtable_chunks[i], y_param, len(proj_ltable),
                                              inverted_index, rem_puncs,
                                rem_stop_words, seed)
        probe_result.append(result)

    probe_result = delayed(wrap)(probe_result)

    # Execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(scheduler="processes",
                                                num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(scheduler="processes",
                                            num_workers=multiprocessing.cpu_count())

    probe_result = map(list, probe_result)
    l_tbl_indices = set(sum(probe_result, []))

    l_tbl_indices = list(l_tbl_indices)
    ltable_sampled = ltable.iloc[l_tbl_indices]



    # update catalog
    if cm.is_dfinfo_present(ltable):
        cm.copy_properties(ltable, ltable_sampled)

    if cm.is_dfinfo_present(rtable):
        cm.copy_properties(rtable, rtable_sampled)

    return ltable_sampled, rtable_sampled
    def test_valid_path_wi_metadata_unknownprop(self):
        cm.del_catalog()
        p = os.sep.join([io_datasets_path, 'InvalidMetadata1.csv'])
        IM = read_csv_metadata(p)
        self.assertEqual(cm.is_dfinfo_present(IM), True)
        self.assertEqual(cm.get_property(IM, 'key1'), 'ID')
def down_sample(table_a, table_b, size, y_param, show_progress=True,
                verbose=False, seed=None, rem_stop_words=True,
                rem_puncs=True, n_jobs=1):
    """
    This function down samples two tables A and B into smaller tables A' and
    B' respectively.

    Specifically, first it randomly selects `size` tuples
    from the table B to be table B'. Next, it builds an inverted index I
    (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
    finds a set P of k/2 tuples from I that match x,
    and a set Q of k/2 tuples randomly selected from A - P.
    The idea is for A' and B' to share some matches yet be
    as representative of A and B as possible.

    Args:
        table_a,table_b (DataFrame): The input tables A and B.
        size (int): The size that table B should be down sampled to.
        y_param (int): The parameter to control the down sample size of table A.
            Specifically, the down sampled size of table A should be close to
            size * y_param.
        show_progress (boolean): A flag to indicate whether a progress bar
            should be displayed (defaults to True).
        verbose (boolean): A flag to indicate whether the debug information
         should be displayed (defaults to False).
        seed (int): The seed for the pseudo random number generator to select
            the tuples from A and B (defaults to None).
        rem_stop_words (boolean): A flag to indicate whether a default set of stop words 
         must be removed.
        rem_puncs (boolean): A flag to indicate whether the punctuations must be 
         removed from the strings.
        n_jobs (int): The number of parallel jobs to be used for computation
            (defaults to 1). If -1 all CPUs are used. If 0 or 1,
            no parallel computation is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
            used (where n_cpus is the total number of CPUs in the
            machine). Thus, for n_jobs = -2, all CPUs but one are used.
            If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
            computation is used (i.e., equivalent to the default).
            

    Returns:
        Down sampled tables A and B as pandas DataFrames.

    Raises:
        AssertionError: If any of the input tables (`table_a`, `table_b`) are
            empty or not a DataFrame.
        AssertionError: If `size` or `y_param` is empty or 0 or not a
            valid integer value.
        AssertionError: If `seed` is not a valid integer
            value.
        AssertionError: If `verbose` is not of type bool.
        AssertionError: If `show_progress` is not of type bool.
        AssertionError: If `n_jobs` is not of type int.

    Examples:
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, n_jobs=-1)

        # Example with seed = 0. This means the same sample data set will be returned
        # each time this function is run.
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, seed=0, n_jobs=-1)
    """

    if not isinstance(table_a, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A is not of type pandas DataFrame')

    if not isinstance(table_b, pd.DataFrame):
        logger.error('Input table B is not of type pandas DataFrame')
        raise AssertionError(
            'Input table B is not of type pandas DataFrame')

    if len(table_a) == 0 or len(table_b) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(table_b) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B')

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_jobs, int, 'Parameter n_jobs')

    # get and validate required metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # # get metadata
    # l_key, r_key = cm.get_keys_for_ltable_rtable(table_a, table_b, logger,
    #                                              verbose)
    #
    # # # validate metadata
    # cm._validate_metadata_for_table(table_a, l_key, 'ltable', logger,
    #                                 verbose)
    # cm._validate_metadata_for_table(table_b, r_key, 'rtable', logger,
    #                                 verbose)

    # Inverted index built on table A will consist of all tuples in such P's and Q's - central idea is to have
    # good coverage in the down sampled A' and B'.
    s_inv_index = _inv_index(table_a, rem_stop_words, rem_puncs)

    # Randomly select size tuples from table B to be B'
    # If a seed value has been given, use a RandomState with the given seed
    b_sample_size = min(math.floor(size), len(table_b))
    if seed is not None:
        rand = RandomState(seed)
    else:
        rand = RandomState()
    b_tbl_indices = list(rand.choice(len(table_b), int(b_sample_size), replace=False))

    n_jobs = get_num_procs(n_jobs, len(table_b))

    sample_table_b = table_b.loc[b_tbl_indices]
    if n_jobs <= 1:
        # Probe inverted index to find all tuples in A that share tokens with tuples in B'.
        s_tbl_indices = _probe_index_split(sample_table_b, y_param,
                                           len(table_a), s_inv_index, show_progress,
                                           seed, rem_stop_words, rem_puncs)
    else:
        sample_table_splits = np.array_split(sample_table_b, n_jobs)
        results = Parallel(n_jobs=n_jobs)(
            delayed(_probe_index_split)(sample_table_splits[job_index], y_param,
                                       len(table_a), s_inv_index,
                                        (show_progress and (job_index == n_jobs - 1)),
                                       seed, rem_stop_words, rem_puncs)
            for job_index in range(n_jobs)
        )
        results = map(list, results)
        s_tbl_indices = set(sum(results, []))

    s_tbl_indices = list(s_tbl_indices)
    l_sampled = table_a.iloc[list(s_tbl_indices)]
    r_sampled = table_b.iloc[list(b_tbl_indices)]

    # update catalog
    if cm.is_dfinfo_present(table_a):
        cm.copy_properties(table_a, l_sampled)
    if cm.is_dfinfo_present(table_b):
        cm.copy_properties(table_b, r_sampled)

    return l_sampled, r_sampled
def dask_down_sample(ltable,
                     rtable,
                     size,
                     y_param,
                     show_progress=True,
                     verbose=False,
                     seed=None,
                     rem_stop_words=True,
                     rem_puncs=True,
                     n_ltable_chunks=1,
                     n_sample_rtable_chunks=1):
    """
        WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.
         
        This command down samples two tables A and B into smaller tables A' and
        B' respectively.    
        Specifically, first it randomly selects `size` tuples
        from the table B to be table B'. Next, it builds an inverted index I
        (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
        finds a set P of k/2 tuples from I that match x,
        and a set Q of k/2 tuples randomly selected from A - P.
        The idea is for A' and B' to share some matches yet be
        as representative of A and B as possible.
    
        Args:
            ltable (DataFrame): The left input table, i.e., table A.
            rtable (DataFrame): The right input table, i.e., table B. 
            size (int): The size that table B should be down sampled to.
            y_param (int): The parameter to control the down sample size of table A.
                Specifically, the down sampled size of table A should be close to
                size * y_param.
            show_progress (boolean): A flag to indicate whether a progress bar
                should be displayed (defaults to True).
            verbose (boolean): A flag to indicate whether the debug information
             should be displayed (defaults to False).
            seed (int): The seed for the pseudo random number generator to select
                the tuples from A and B (defaults to None).
            rem_stop_words (boolean): A flag to indicate whether a default set of stop words 
             must be removed.
            rem_puncs (boolean): A flag to indicate whether the punctuations must be 
             removed from the strings.             
            n_ltable_chunks (int): The number of partitions for ltable (defaults to 1). If it 
              is set to -1, the number of partitions will be set to the 
              number of cores in the machine.  
            n_sample_rtable_chunks (int): The number of partitions for the 
              sampled rtable (defaults to 1)
                
    
        Returns:
            Down sampled tables A and B as pandas DataFrames.
    
        Raises:
            AssertionError: If any of the input tables (`ltable`, `rtable`) are
                empty or not a DataFrame.
            AssertionError: If `size` or `y_param` is empty or 0 or not a
                valid integer value.
            AssertionError: If `seed` is not a valid integer
                value.
            AssertionError: If `verbose` is not of type bool.
            AssertionError: If `show_progress` is not of type bool.
            AssertionError: If `n_ltable_chunks` is not of type int.
            AssertionError: If `n_sample_rtable_chunks` is not of type int.            
    
        Examples:
            >>> from py_entitymatching.dask.dask_down_sample import dask_down_sample
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
            # Example with seed = 0. This means the same sample data set will be returned
            # each time this function is run.
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, seed=0, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
            
        """

    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # validation checks
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A (ltable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A (ltable) is not of type pandas DataFrame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B (rtable) is not of type pandas DataFrame')

        raise AssertionError(
            'Input table B (rtable) is not of type pandas DataFrame')

    if len(ltable) == 0 or len(rtable) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)'
        )

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(rtable) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B'
        )

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_sample_rtable_chunks, int,
                         'Parameter n_sample_rtable_chunks')

    rtable_sampled = sample_right_table(rtable, size, seed)

    ltbl_str_cols = _get_str_cols_list(ltable)
    proj_ltable = ltable[ltable.columns[ltbl_str_cols]]

    if n_ltable_chunks == -1:
        n_ltable_chunks = get_num_cores()

    ltable_chunks = np.array_split(proj_ltable, n_ltable_chunks)
    preprocessed_tokenized_tbl = []

    # Use Dask to preprocess and tokenize strings.
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        # start_row_id is internally used by process_tokenize_concat strings to map
        # each to its row id in the ltable.
        result = delayed(process_tokenize_concat_strings)(ltable_chunks[i],
                                                          start_row_id,
                                                          rem_puncs,
                                                          rem_stop_words)
        preprocessed_tokenized_tbl.append(result)

        # update start_row_id
        start_row_id += len(ltable_chunks[i])

    preprocessed_tokenized_tbl = delayed(wrap)(preprocessed_tokenized_tbl)

    # Now execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
            scheduler="processes", num_workers=get_num_cores())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_tbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_tbl_vals[i])

    # Build an inverted index
    inverted_index = build_inverted_index(ltable_processed_dict)

    # Preprocess/tokenize sampled rtable and probe
    rtbl_str_cols = _get_str_cols_list(rtable_sampled)
    proj_rtable_sampled = rtable_sampled[rtable_sampled.columns[rtbl_str_cols]]

    if n_sample_rtable_chunks == -1:
        n_sample_rtable_chunks = get_num_cores()

    rtable_chunks = np.array_split(proj_rtable_sampled, n_sample_rtable_chunks)
    probe_result = []

    # Create the DAG
    for i in range(len(rtable_chunks)):
        result = delayed(probe)(rtable_chunks[i], y_param, len(proj_ltable),
                                inverted_index, rem_puncs, rem_stop_words,
                                seed)
        probe_result.append(result)

    probe_result = delayed(wrap)(probe_result)

    # Execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(
            scheduler="processes", num_workers=multiprocessing.cpu_count())

    probe_result = map(list, probe_result)
    l_tbl_indices = set(sum(probe_result, []))

    l_tbl_indices = list(l_tbl_indices)
    ltable_sampled = ltable.iloc[l_tbl_indices]

    # update catalog
    if cm.is_dfinfo_present(ltable):
        cm.copy_properties(ltable, ltable_sampled)

    if cm.is_dfinfo_present(rtable):
        cm.copy_properties(rtable, rtable_sampled)

    return ltable_sampled, rtable_sampled
    def test_valid_path_wi_invalidmetadata_replace_key(self):
        cm.del_catalog()
        p = os.sep.join([io_datasets_path, 'A_key_zipcode.csv'])
        IM = read_csv_metadata(p, key='ID')
        self.assertEqual(cm.is_dfinfo_present(IM), True)
        self.assertEqual(cm.is_property_present_for_df(IM, 'key'), True)
def down_sample(table_a, table_b, size, y_param, show_progress=True,
                verbose=False, seed=None):
    """
    This function down samples two tables A and B into smaller tables A' and
    B' respectively.

    Specifically, first it randomly selects `size` tuples
    from the table B to be table B'. Next, it builds an inverted index I
    (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
    finds a set P of k/2 tuples from I that match x,
    and a set Q of k/2 tuples randomly selected from A - P.
    The idea is for A' and B' to share some matches yet be
    as representative of A and B as possible.

    Args:
        table_a,table_b (DataFrame): The input tables A and B.
        size (int): The size that table B should be down sampled to.
        y_param (int): The parameter to control the down sample size of table A.
            Specifically, the down sampled size of table A should be close to
            size * y_param.
        show_progress (boolean): A flag to indicate whether a progress bar
            should be displayed (defaults to True).
        verbose (boolean): A flag to indicate whether the debug information
         should be displayed (defaults to False).
        seed (int): The seed for the pseudo random number generator to select
            the tuples from A and B (defaults to None).

    Returns:
        Down sampled tables A and B as pandas DataFrames.

    Raises:
        AssertionError: If any of the input tables (`table_a`, `table_b`) are
            empty or not a DataFrame.
        AssertionError: If `size` or `y_param` is empty or 0 or not a
            valid integer value.
        AssertionError: If `seed` is not a valid integer
            value.

    Examples:
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1)

        # Example with seed = 0. This means the same sample data set will be returned
        # each time this function is run.
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, seed=0)
    """

    if not isinstance(table_a, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A is not of type pandas DataFrame')

    if not isinstance(table_b, pd.DataFrame):
        logger.error('Input table B is not of type pandas DataFrame')
        raise AssertionError(
            'Input table B is not of type pandas DataFrame')

    if len(table_a) == 0 or len(table_b) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(table_b) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B')

    # get and validate required metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # # get metadata
    # l_key, r_key = cm.get_keys_for_ltable_rtable(table_a, table_b, logger,
    #                                              verbose)
    #
    # # # validate metadata
    # cm._validate_metadata_for_table(table_a, l_key, 'ltable', logger,
    #                                 verbose)
    # cm._validate_metadata_for_table(table_b, r_key, 'rtable', logger,
    #                                 verbose)

    # Inverted index built on table A will consist of all tuples in such P's and Q's - central idea is to have
    # good coverage in the down sampled A' and B'.
    s_inv_index = _inv_index(table_a)

    # Randomly select size tuples from table B to be B'
    # If a seed value has been given, use a RandomState with the given seed
    b_sample_size = min(math.floor(size), len(table_b))
    if seed is not None:
        rand = RandomState(seed)
    else:
        rand = RandomState()
    b_tbl_indices = list(rand.choice(len(table_b), int(b_sample_size), replace=False))

    # Probe inverted index to find all tuples in A that share tokens with tuples in B'.
    # .ix was removed from pandas; b_tbl_indices are positional, so iloc is used
    s_tbl_indices = _probe_index(table_b.iloc[b_tbl_indices], y_param,
                                 len(table_a), s_inv_index, show_progress, seed=seed)
    s_tbl_indices = list(s_tbl_indices)
    l_sampled = table_a.iloc[list(s_tbl_indices)]
    r_sampled = table_b.iloc[list(b_tbl_indices)]

    # update catalog
    if cm.is_dfinfo_present(table_a):
        cm.copy_properties(table_a, l_sampled)
    if cm.is_dfinfo_present(table_b):
        cm.copy_properties(table_b, r_sampled)

    return l_sampled, r_sampled
    def test_validpath_metadata_set_to_none_1(self):
        cm.del_catalog()
        del_files_in_dir(sndbx_path)
        A = read_csv_metadata(path_a, key=None)
        self.assertEqual(cm.is_dfinfo_present(A), True)
        cm.get_key(A)
    def test_is_dfinfo_present_valid_2(self):
        A = pd.read_csv(path_a)
        status = cm.is_dfinfo_present(A)
        self.assertEqual(status, False)
Example #30
def _down_sample(ltable, rtable, y_param, show_progress=True, verbose=False,
                 seed=None,  rem_puncs=True, rem_stop_words=True, n_ltable_chunks=-1,
                 n_rtable_chunks=-1):
    """
    Down sampling command implementation. The down_sample command is
    reproduced here because, in this variant, the input right table has
    already been down sampled.
    """

    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A (ltable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A (ltable) is not of type pandas DataFrame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B (rtable) is not of type pandas DataFrame')

        raise AssertionError(
            'Input table B (rtable) is not of type pandas DataFrame')

    if len(ltable) == 0 or len(rtable) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if y_param == 0:
        logger.error(
            'y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')


    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')

    # rtable_sampled = sample_right_table(rtable, size)
    rtable_sampled = rtable

    ltbl_str_cols = _get_str_cols_list(ltable)
    proj_ltable = ltable[ltable.columns[ltbl_str_cols]]

    if n_ltable_chunks == -1:
        n_ltable_chunks = multiprocessing.cpu_count()

    ltable_chunks = np.array_split(proj_ltable, n_ltable_chunks)  # pd.np is removed in recent pandas; assumes numpy is imported as np
    preprocessed_tokenized_tbl = []
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        result = delayed(process_tokenize_concat_strings)(ltable_chunks[i],
                                                          start_row_id,
                                                          rem_puncs, rem_stop_words)
        preprocessed_tokenized_tbl.append(result)
        start_row_id += len(ltable_chunks[i])
    preprocessed_tokenized_tbl = delayed(wrap)(preprocessed_tokenized_tbl)
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
    else:
        preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
            scheduler="processes", num_workers=multiprocessing.cpu_count())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_tbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_tbl_vals[i])

    inverted_index = build_inverted_index(ltable_processed_dict)

    rtbl_str_cols = _get_str_cols_list(rtable_sampled)
    proj_rtable_sampled = rtable_sampled[rtable_sampled.columns[rtbl_str_cols]]


    if n_rtable_chunks == -1:
        n_rtable_chunks = multiprocessing.cpu_count()

    rtable_chunks = np.array_split(proj_rtable_sampled, n_rtable_chunks)  # assumes numpy is imported as np
    probe_result = []

    for i in range(len(rtable_chunks)):
        result = delayed(probe)(rtable_chunks[i], y_param, len(proj_ltable),
                                inverted_index, rem_puncs,
                                rem_stop_words, seed)
        probe_result.append(result)

    probe_result = delayed(wrap)(probe_result)
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(scheduler="processes",
                                                num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(scheduler="processes",
                                            num_workers=multiprocessing.cpu_count())

    probe_result = map(list, probe_result)
    l_tbl_indices = set(sum(probe_result, []))

    l_tbl_indices = list(l_tbl_indices)
    ltable_sampled = ltable.iloc[l_tbl_indices]



    # update catalog
    if cm.is_dfinfo_present(ltable):
        cm.copy_properties(ltable, ltable_sampled)

    if cm.is_dfinfo_present(rtable):
        cm.copy_properties(rtable, rtable_sampled)

    return ltable_sampled, rtable_sampled
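
A sketch of how this helper might be invoked (A and B are hypothetical tables read with read_csv_metadata); the caller is responsible for down sampling the right table before the call:

B_sample = B.sample(n=500, random_state=0)   # the right table is sampled by the caller first
A_s, B_s = _down_sample(A, B_sample, y_param=1, show_progress=False,
                        n_ltable_chunks=1, n_rtable_chunks=1)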
    def test_is_dfinfo_present_invalid(self):
        cm.is_dfinfo_present(None)
Example #32
def read_csv_metadata(file_path, **kwargs):
    """
    Reads a CSV (comma-separated values) file into a pandas DataFrame
    and updates the catalog with the metadata. The CSV files typically contain
    data for the input tables or a candidate set.

    Specifically, this function first reads the CSV file from the given file
    path into a pandas DataFrame, by using pandas' in-built 'read_csv'
    method. Then, it updates the catalog with the metadata. There are three
    ways to update the metadata: (1) using a metadata file, (2) using the
    key-value parameters supplied in the function, and (3) using both
    metadata file and key-value parameters.

    To update the metadata in the catalog using the metadata file,
    the function will look for a file in the same directory with the same file
    name but with a specific extension. This extension can be optionally given
    by the user (defaults to '.metadata'). If the metadata file is present,
    the function will read it and update the catalog appropriately. If the
    metadata file is not present, the function will issue a warning that the
    metadata file is not present.

    The metadata information can also be given as parameters to the function
    (see description of arguments for more details). If given, the function
    will update the catalog with the given information.

    Further, the metadata can partly reside in the metadata file and partly be
    supplied as parameters to the function. The function will take a union of
    the two and update the catalog appropriately.
    If the same metadata is given in both the metadata file
    and the function, then the metadata in the function takes precedence over
    the metadata given in the file.

    Args:
        file_path(string): The CSV file path

        kwargs(dictionary): A Python dictionary containing key-value arguments.
            There are a few key-value pairs that are specific to
            read_csv_metadata and  all the other key-value pairs are passed
            to pandas read_csv method

    Returns:
        A pandas DataFrame read from the input CSV file.
    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If a file does not exist in the
            given `file_path`.

    Examples:
        *Example 1:* Read from CSV file and set metadata

        >>> A = em.read_csv_metadata('path_to_csv_file', key='id')
        >>> em.get_key(A)
         # 'id'

        *Example 2:* Read from CSV file (with a metadata file in the same directory)

         Let the metadata file contain the following contents:

          #key = id

        >>> A = em.read_csv_metadata('path_to_csv_file')
        >>> em.get_key(A)
         # 'id'

    See Also:
        :meth:`~py_entitymatching.to_csv_metadata`
    """
    # Validate the input parameters.

    # # File path is expected to be of type string.
    if not isinstance(file_path, six.string_types):
        logger.error('Input file path is not of type string')
        raise AssertionError('Input file path is not of type string')

    # # Check if the given path is valid.
    if not os.path.exists(file_path):
        logger.error('File does not exist at path %s' % file_path)
        raise AssertionError('File does not exist at path %s' % file_path)

    # Check if the user has specified the metadata file's extension.
    extension = kwargs.pop('metadata_extn', None)

    # If the extension is not specified then set the extension to .metadata'.
    if extension is None:
        extension = '.metadata'

    # Format the extension to include a '.' in front if the user has not
    # given one.
    if not extension.startswith('.'):
        extension = '.' + extension

    # If the file is present, then update metadata from file.
    if _is_metadata_file_present(file_path, extension=extension):
        file_name, _ = os.path.splitext(file_path)
        file_name = ''.join([file_name, extension])
        metadata, _ = _get_metadata_from_file(file_name)

    # Else issue a warning that the metadata file is not present
    else:
        logger.warning('Metadata file is not present in the given path; '
                       'proceeding to read the csv file.')
        metadata = {}

    # Update the metadata with the key-value pairs given in the command. The
    # function _update_metadata_for_read_cmd takes care of updating the
    # metadata with only the key-value pairs specific to read_csv_metadata
    # method
    metadata, kwargs = _update_metadata_for_read_cmd(metadata, **kwargs)

    # Validate the metadata.
    _check_metadata_for_read_cmd(metadata)

    # Read the csv file using pandas read_csv method.
    data_frame = pd.read_csv(file_path, **kwargs)

    # Get the value for 'key' property and update the catalog.
    key = metadata.pop('key', None)
    if key is not None:
        cm.set_key(data_frame, key)

    fk_ltable = metadata.pop('fk_ltable', None)
    if fk_ltable is not None:
        cm.set_fk_ltable(data_frame, fk_ltable)

    fk_rtable = metadata.pop('fk_rtable', None)
    if fk_rtable is not None:
        cm.set_fk_rtable(data_frame, fk_rtable)

    # Update the catalog with other properties.
    for property_name, property_value in six.iteritems(metadata):
        cm.set_property(data_frame, property_name, property_value)
    if not cm.is_dfinfo_present(data_frame):
        cm.init_properties(data_frame)

    # Return the DataFrame
    return data_frame
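
For a candidate set, the foreign-key metadata is typically supplied through the keyword arguments handled above. The following is a minimal usage sketch; the file names and column names ('tableA.csv', 'candset.csv', 'ID', 'ltable_ID', 'rtable_ID') are illustrative assumptions, not files shipped with the package.

import py_entitymatching as em

# Read the two input tables; 'ID' is assumed to be the key column in each
# (illustrative file names).
A = em.read_csv_metadata('tableA.csv', key='ID')
B = em.read_csv_metadata('tableB.csv', key='ID')

# Read a candidate set and register its metadata in one call. Keyword
# arguments take precedence over anything found in a '.metadata' file.
C = em.read_csv_metadata('candset.csv', key='_id',
                         ltable=A, rtable=B,
                         fk_ltable='ltable_ID', fk_rtable='rtable_ID')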
 def test_validpath_metadata_set_to_none_1(self):
     cm.del_catalog()
     del_files_in_dir(sndbx_path)
     A = read_csv_metadata(path_a, key=None)
     self.assertEqual(cm.is_dfinfo_present(A), True)
     cm.get_key(A)
 def test_valid_path_wo_metadata(self):
     cm.del_catalog()
     B = read_csv_metadata(path_b)
     pd_B = pd.read_csv(path_b)
     self.assertEqual(B.equals(pd_B), True)
     self.assertEqual(cm.is_dfinfo_present(B), True)
 def test_init_properties_valid(self):
     # cm.del_catalog()
     A = pd.read_csv(path_a)
     cm.init_properties(A)
     self.assertEqual(cm.is_dfinfo_present(A), True)
Example #37
def save_table(data_frame, file_path, metadata_ext='.pklmetadata'):
    """
    Saves a DataFrame to disk along with its metadata in a pickle format.

    This function saves a DataFrame to disk along with its metadata from
    the catalog.

    Specifically, this function saves the DataFrame in the given
    file path, and saves the metadata in the same directory (as the
    file path) but with a different extension. This extension can be
    optionally given by the user (defaults to '.pklmetadata').

    Args:
        data_frame (DataFrame): The DataFrame that should be saved.

        file_path (string): The file path where the DataFrame must be stored.

        metadata_ext (string): The metadata extension that should be used while
            storing the metadata information. The default value is
            '.pklmetadata'.

    Returns:
        A Boolean value of True is returned if the DataFrame is successfully
        saved.

    Raises:
        AssertionError: If `data_frame` is not of type pandas
         DataFrame.
        AssertionError: If `file_path` is not of type string.
        AssertionError: If `metadata_ext` is not of type string.
        AssertionError: If a file cannot be written in the given `file_path`.

    Examples:

        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.save_table(A, './A.pkl') # will store two files ./A.pkl and ./A.pklmetadata

        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.save_table(A, './A.pkl', metadata_ext='.pklmeta') # will store two files ./A.pkl and ./A.pklmeta


    See Also:
        :meth:`~py_entitymatching.load_table`

    Note:
        This function is a bit different from to_csv_metadata, where the
        DataFrame is stored in a CSV file format. The CSV file format can be
        viewed using a text editor. But a DataFrame stored using 'save_table' is
        stored in a special format, which cannot be viewed with a text editor.
        The reason we have save_table is that, for larger DataFrames, it is
        more efficient to pickle the DataFrame to disk than to write it in
        CSV format.
    """
    # Validate the input parameters

    validate_object_type(data_frame, pd.DataFrame)

    validate_object_type(file_path, six.string_types)

    validate_object_type(metadata_ext, six.string_types)

    # Get the file_name (without extension) and the extension from the given
    #  file path. For example if the file_path was /Users/foo/file.csv then
    # the file_name will be /Users/foo/file and the extension will be '.csv'
    file_name, _ = os.path.splitext(file_path)

    # The metadata file name is the same file name but with the extension
    # given by the user
    metadata_filename = file_name + metadata_ext

    # Check if the file exists in the file_path and whether we have
    # sufficient access privileges to write in that path
    can_write, file_exists = ps._check_file_path(file_path)

    if can_write:
        # If the file already exists, issue a warning; it will be overwritten.
        if file_exists:
            logger.warning(
                'File already exists at %s; Overwriting it', file_path)
        # Open the file in binary mode, since we are writing in a binary
        # (pickle) format.
        with open(file_path, 'wb') as file_handler:
            cloudpickle.dump(data_frame, file_handler)
    else:
        # We cannot write the file in the given path. Raise an error.
        logger.error('Cannot write in the file path %s; Exiting', file_path)
        raise AssertionError('Cannot write in the file path %s' % file_path)

    # Once we are done with writing the DataFrame, we will write the metadata
    #  now

    # Initialize a metadata dictionary to hold the metadata of DataFrame from
    #  the catalog
    metadata_dict = collections.OrderedDict()

    # Get all the properties for the input DataFrame, if its information is
    # present in the catalog.
    properties = {}
    if cm.is_dfinfo_present(data_frame):
        properties = cm.get_all_properties(data_frame)

    # If properties are present in the catalog, record the string-valued ones
    # so they can be written to disk.
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            if isinstance(property_value, six.string_types):
                metadata_dict[property_name] = property_value

    # try to save metadata
    can_write, file_exists = ps._check_file_path(metadata_filename)
    if can_write:
        # If the metadata file already exists, issue a warning; it will be
        # overwritten.
        if file_exists:
            logger.warning(
                'Metadata file already exists at %s. Overwriting it',
                metadata_filename)
        # Write the metadata contents.
        with open(metadata_filename, 'wb') as file_handler:
            cloudpickle.dump(metadata_dict, file_handler)
    else:
        logger.warning(
            'Cannot write metadata at the file path %s. Skipping the metadata '
            'file', metadata_filename)

    return True
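
As a quick illustration of the save path described above, here is a hedged round-trip sketch. It assumes local write access and uses load_table (referenced in the See Also section) as the counterpart; the DataFrame and paths are made up for the example.

import pandas as pd
import py_entitymatching as em

A = pd.DataFrame({'id': [1, 2], 'colA': ['a', 'b'], 'colB': [10, 20]})
em.set_key(A, 'id')              # register 'id' as the key in the catalog

em.save_table(A, './A.pkl')      # writes ./A.pkl and ./A.pklmetadata

A2 = em.load_table('./A.pkl')    # restores the DataFrame and its catalog metadata
assert em.get_key(A2) == 'id'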
 def test_valid_path_wi_metadata_unknownprop(self):
     cm.del_catalog()
     p = os.sep.join([io_datasets_path, 'InvalidMetadata1.csv'])
     IM = read_csv_metadata(p)
     self.assertEqual(cm.is_dfinfo_present(IM), True)
     self.assertEqual(cm.get_property(IM, 'key1'), 'ID')