Example #1
 def compare_string_distance(self, col_name: str, args: dict):
     logger.debug(f'Doing a comparison of {col_name} using {args}')
     self.contraster.string(
         col_name,
         col_name,
         label=f'{col_name}_{utils.convert_dict_to_str(args)}_distance',
         **args)
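Here args is forwarded as keyword arguments to the underlying string comparison. A minimal, self-contained sketch of the kind of call this wrapper makes, assuming the contraster is a recordlinkage Compare object (the column name, method, threshold, and label below are illustrative, not taken from the source):

import pandas as pd
import recordlinkage as rl

# Illustrative only: compare near-duplicate names with Jaro-Winkler.
df = pd.DataFrame({'first_name': ['jon', 'john', 'jane']})
pairs = rl.FullIndex().index(df)

compare = rl.Compare()
compare.string('first_name', 'first_name',
               method='jarowinkler', threshold=0.92,
               label='first_name_jarowinkler_distance')
contrasts = compare.compute(pairs, df)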
Example #2
 def compare_swap_month_days(self, col_name: str, args: dict):
     logger.debug(
         f'Checking if the month and day are swapped in {col_name}')
     self.contraster.date(col_name,
                          col_name,
                          label=f'{col_name}_swap_month_days_distance',
                          **args)
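Assuming the contraster wraps recordlinkage's Compare.date, which exposes a swap_month_day weight for day/month transpositions, a plausible config entry for this wrapper could be:

# Hypothetical args for compare_swap_month_days; the value is illustrative.
args = {'swap_month_day': 0.5}
# e.g. self.compare_swap_month_days('dob', args)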
Example #3
 def make_contrast_metadata(self, contrasts):
     contrast_metadata = {}
     for column in contrasts.columns:
         logger.debug(f'Making you some stats about {column}')
         contrast_metadata[column] = utils.summarize_column(
             contrasts[column])
     self.metadata['contrasts'] = contrast_metadata
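utils.summarize_column is not shown in these examples; a minimal stand-in consistent with how it is used here (purely an assumption) could be:

import pandas as pd

def summarize_column(column: pd.Series) -> dict:
    # Hypothetical stand-in for utils.summarize_column: a few descriptive
    # statistics for one contrast column.
    return {
        'mean': column.mean(),
        'median': column.median(),
        'min': column.min(),
        'max': column.max(),
        'n_null': int(column.isnull().sum()),
    }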
Example #4
    def block_and_match(self, df):
        ## We will split-apply-combine
        logger.debug(f'df sent to block-and-match has the following columns: {df.dtypes}')
        logger.info(f'Blocking by {self.blocking_rules}')
        grouped = df.groupby([utils.unpack_blocking_rule(df, column_name, position) for column_name, position in self.blocking_rules.items()])
        logger.info(f'Applying matcher to {len(grouped)} blocks.')
        all_block_metadata = {}

        matches = {}

        for key, group in grouped:
            logger.debug(f"Matching group {key} of size {len(group)}")
            
            if len(group) > 1:
                matches[key], block_metadata = self.match(group, key)
            else:
                block_metadata = {
                    'size': 1,
                    'n_pairs': 0,
                    'contrasts': None,
                    'scores': None
                }
                logger.debug(f"Group {key} only has one record, making a singleton id")
                matches[key] = cluster.generate_singleton_id(group, str(key))

            logger.debug('Wrapping up block')
            all_block_metadata[key] = block_metadata

        logger.debug('All blocks done! Yeehaw!')
        self.metadata['blocks'] = all_block_metadata
        return pd.concat(matches.values())
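utils.unpack_blocking_rule is not shown either; the groupby above suggests it turns each (column, rule) pair from blocking_rules into a grouping key. A minimal sketch, under the assumption that the rule value is the number of leading characters to block on:

import pandas as pd

def unpack_blocking_rule(df: pd.DataFrame, column_name: str, position: int) -> pd.Series:
    # Hypothetical stand-in: block on the first `position` characters of the column.
    return df[column_name].astype(str).str[:position]

# With blocking_rules like {'last_name': 2, 'zip_code': 5} (illustrative),
# groupby would place records that agree on those prefixes in the same block.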
Example #5
 def compare_numeric_distance(self, col_name: str, args: dict):
     logger.debug(f'Doing a numeric distance calculation on {col_name}')
     self.contraster.numeric(
         col_name,
         col_name,
         label=
         f'{col_name}_numeric_{utils.convert_dict_to_str(args)}_distance',
         **args)
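Assuming the contraster wraps recordlinkage's Compare.numeric (which accepts method, offset, and scale keywords), a plausible args dict for this wrapper might be:

# Hypothetical args for compare_numeric_distance; values and column are illustrative.
args = {'method': 'linear', 'offset': 0, 'scale': 10}
# e.g. self.compare_numeric_distance('age', args)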
Example #6
def generate_matched_ids(
    distances:pd.DataFrame,
    DF:pd.DataFrame,
    clustering_params:dict,
    base_data_directory:str,
    match_job_id: str,
    block_name='',
) -> pd.DataFrame:
    
    logger.info('Beginning clustering & id generation.')
    distances = square_distance_matrix(distances)
    ioutils.write_dataframe(distances.reset_index(), filepath=f'{base_data_directory}/match_cache/square_distances/{match_job_id}/{block_name}')

    ids = cluster(
        distances, **clustering_params
    )
    ioutils.write_dataframe(ids.reset_index(), filepath=f'{base_data_directory}/match_cache/raw_cluster_ids/{match_job_id}/{block_name}')
    max_cluster_id = ids.max()
    replacement_ids = pd.Series(range(max_cluster_id + 1, max_cluster_id + len(ids[ids == -1]) + 1), index=ids[ids==-1].index)
    ids[ids == -1] = replacement_ids
    logger.debug(f'IDs: {ids}')
    logger.debug(f'Replaced noisy singleton ids with \n{replacement_ids}')
    
    logger.debug(f'Adding the block name ({block_name}) to the matched_ids.')
    ids = block_name + ids.astype(str)
    logger.debug(f'New IDs: \n{ids}')
    
    df = DF.copy()
    
    df['matched_id'] = ids
    logger.info('Matched ids generated')

    return df
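The clusterer appears to follow the scikit-learn convention of labelling noise points with -1; the block above promotes each of those to its own singleton cluster. A small worked example of that relabelling step:

import pandas as pd

ids = pd.Series([0, 0, 1, -1, -1])               # toy cluster labels, two noise points
max_cluster_id = ids.max()                       # 1
noise_index = ids[ids == -1].index
replacement_ids = pd.Series(
    range(max_cluster_id + 1, max_cluster_id + len(noise_index) + 1),
    index=noise_index)                           # values 2 and 3
ids[ids == -1] = replacement_ids
# ids is now [0, 0, 1, 2, 3]: every noise point becomes its own cluster id.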
Example #7
 def compare_list(self, col_name: str, args: dict):
     if args['method'] == 'any':
         logger.debug(f'Checking if {col_name} shares any value.')
         self.contraster.compare_vectorized(
             lists_share_any_values,
             col_name,
             col_name,
             label=f'{col_name}_any_list_item_distance')
     elif args['method'] == 'all':
         logger.debug(f'Checking if {col_name} shares all values.')
         self.contraster.compare_vectorized(
             lists_share_all_values,
             col_name,
             col_name,
             label=f'{col_name}_all_list_items_distance')
     else:
         raise ValueError(
             f"I don't know how to compare lists with this method ({args['method']}). Please send me 'all' or 'any'."
         )
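lists_share_any_values and lists_share_all_values are not defined in these examples; minimal stand-ins consistent with compare_vectorized (which passes the left and right values of each candidate pair as aligned Series) might look like:

import pandas as pd

def lists_share_any_values(s1: pd.Series, s2: pd.Series) -> pd.Series:
    # Hypothetical stand-in: 1.0 if the two lists share at least one value.
    return pd.Series([float(bool(set(a) & set(b))) for a, b in zip(s1, s2)],
                     index=s1.index)

def lists_share_all_values(s1: pd.Series, s2: pd.Series) -> pd.Series:
    # Hypothetical stand-in: 1.0 if the two lists contain exactly the same values.
    return pd.Series([float(set(a) == set(b)) for a, b in zip(s1, s2)],
                     index=s1.index)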
Example #8
def write_matched_data(matches: pd.DataFrame, base_data_directory: str,
                       person_keys: list, schema_pk_lookup: dict,
                       match_job_id: str) -> dict:
    write_dataframe(
        df=matches.reset_index(),
        filepath=
        f'{base_data_directory}/match_cache/matcher_results/{match_job_id}')
    matched_results_paths = {}
    logger.debug(schema_pk_lookup)
    for event_type, primary_keys in schema_pk_lookup.items():
        logger.info(
            f'Writing matched data for {base_data_directory} {event_type}')
        matched_results_paths[event_type] = write_one_event_type(
            df=matches,
            base_data_directory=base_data_directory,
            event_type=event_type,
            person_keys=person_keys,
            primary_keys=primary_keys,
            match_job_id=match_job_id)

    return matched_results_paths
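The shapes of person_keys and schema_pk_lookup are only implied here: the loop treats schema_pk_lookup as a mapping from event type to its primary-key columns. A hypothetical call, with all names and paths below purely illustrative:

matched_results_paths = write_matched_data(
    matches=matches,
    base_data_directory='s3://bucket/project',        # illustrative path
    person_keys=['first_name', 'last_name', 'dob', 'ssn'],
    schema_pk_lookup={
        'hmis_service_stays': ['internal_person_id', 'internal_event_id'],
        'jail_bookings': ['internal_person_id', 'booking_number'],
    },
    match_job_id='20180101000000',
)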
Example #9
 def compare_exact(self, col_name: str, n_chars: dict = None):
     logger.debug(f'Doing an exact comparison on {col_name}')
     if n_chars is not None:
         logger.debug(
             f'Doing an exact comparison of {n_chars} characters of {col_name}'
         )
         self.contraster.compare_vectorized(
             compare_exact_n_chars,
             col_name,
             col_name,
             n_chars['n_chars'],
             label=f"{col_name}_exact_{n_chars['n_chars']}_distance")
     else:
         logger.debug(
             f'Doing an exact comparison of all characters in {col_name}')
         self.contraster.exact(col_name,
                               col_name,
                               label=f'{col_name}_exact_distance')
Example #10
def preprocess(df:pd.DataFrame, match_job_id:str, base_data_directory:str) -> pd.DataFrame:
    # full_name
    # full name is only given if name parts are not. maybe we should do some preprocessing on full names to create
    # name parts and use only the name parts, especially since it is possible for the jail and HMIS systems to
    # differ on what they use

    # prefix
    # we should preprocess prefixes to remove punctuation and possibly spaces
    if 'prefix' in df.columns:
        logger.debug('Removing punctuation from prefixes')
        df['prefix'] = df['prefix'].str.replace(r'[^\w\s]', '', regex=True)

    # first_name
    # potential preprocessing steps:
    # - remove punctuation
    # - create: full_first_name, first_word_first_name
    # - try using second+ word of first name as middle name if no middle name 

    # middle_name
    # potential preprocessing steps:
    # - remove punctuation
    # - create: full_middle_name, first_word_middle_name, second_word_middle_name

    # last_name
    # potential preprocessing steps:
    # - remove punctuation
    # - create: full_last_name, first_word_last_name, second_word_last_name

    # suffix
    if 'suffix' in df.columns:
        logger.debug('Removing punctuation from suffixes')
        df['suffix'] = df['suffix'].str.replace(r'[^\w\s]', '', regex=True)

    # dob
    # MUST BE CAST TO DATETIME DURING PREPROCESSING
    if 'dob' in df.columns:
        logger.debug('Converting date of birth to datetime')
        df['dob'] = pd.to_datetime(df['dob'])

    # ssn
    # THIS SHOULD BE CONVERTED TO STRING. The SSN consists of 3 parts, and numerical distances are only
    # VAGUELY meaningful (e.g., the first 3 digits increase roughly east to west but not in a rigorous way,
    # and the middle 2 digits are issued in a fixed but non-monotonic order)
    # the first three digits are the "area code" of where the person was registered.
    # most people living in an area will have one of a few local area codes; therefore, the distinctiveness
    # of the area code may be useful for matching. we may want to preprocess ssn to extract the area code
    # to make this comparison.
    if 'ssn' in df.columns:
        logger.debug('Converting social security number to str')
        df['ssn'] = df['ssn'].astype(str)

    # dmv_number
    # THIS SHOULD BE CAST TO STRING. In some jurisdictions, they are strings and in others ints. To ensure
    # that we can generalize here, we need to convert to string for all of them.
    if 'dmv_number' in df.columns:
        logger.debug('Converting dmv number to str')
        df['dmv_number'] = df['dmv_number'].astype(str)

    # race
    # make race into a list
    # eventually, we will want to combine secondary race and race into a single field
    if 'race' in df.columns:
        logger.debug('Converting race to list')
        df['race'] = df['race'].fillna('').str.split(',')
        logger.debug(f"Races observed in preprocessed df: {df['race']}")

    # ethnicity
    # ethnicity encodes only Hispanic/Not Hispanic. for some databases, Hispanic is actually included
    # in the race categories instead of in a separate field. we may want to do some pre-processing
    # to add H to the race list where the ethnicity field contains 'Hispanic'

    logger.info('Preprocessing done!')
    logger.debug(f"The preprocessed dataframe has the following columns: {df.columns}")
    logger.debug(f"The dimensions of the preprocessed dataframe are: {df.shape}")
    ioutils.write_dataframe(df.reset_index(), filepath=f'{base_data_directory}/match_cache/preprocessed_data/{match_job_id}')
    return df
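The ssn notes above suggest extracting the three-digit "area code" as its own field; a minimal sketch of that extra preprocessing step (not implemented in the source) could be:

import pandas as pd

def add_ssn_area_code(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical helper: pull the first three digits of the SSN into a
    # separate column so the area code can be compared on its own.
    if 'ssn' in df.columns:
        df['ssn_area_code'] = df['ssn'].astype(str).str.zfill(9).str[:3]
    return df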
Example #11
def do_match(
    base_data_directory:str,
    schema_pk_lookup:dict,
    upload_id:str=None,
    notify_webapp:bool=True,
    config_path:str='matcher_config.yaml'
):
    with open(config_path) as f:
        config = yaml.safe_load(f)

    # Initializing: let's get started by collecting and logging some job metadata
    metadata = {
        'match_job_start_time': datetime.datetime.now(),
        'match_job_id': utils.unique_match_job_id(),
        'base_data_directory': base_data_directory,
        'config': config
    }
    logger.info("Matching process started with the following configuration:")
    for key, value in config.items():
        logger.info(f"Matcher config {key}: {value}")

    try:
        # Loading: collect matching data (keys) for all available event types & record which event types were found
        logger.info('Loading data for matching.')
        df, event_types_read = ioutils.load_data_for_matching(
            base_data_directory,
            list(schema_pk_lookup.keys()),
            config['keys'],
            metadata['match_job_id']
        )
        metadata['event_types_read'] = list(event_types_read)
        metadata['loaded_data_columns'] = list(df.columns.values)
        metadata['loaded_data_shape'] = list(df.shape)
        metadata['data_loaded_time'] = datetime.datetime.now()

        # Preprocessing: enforce data types and split/combine columns for features
        logger.info('Doing some preprocessing on the columns')
        df = preprocess.preprocess(df, metadata['match_job_id'], base_data_directory)
        metadata['preprocessed_data_columns'] = list(df.columns.values)
        metadata['preprocessed_data_shape'] = list(df.shape)
        metadata['data_preprocessed_time'] = datetime.datetime.now()

        # Matching: block the data, generate pairs and features, and cluster entities
        logger.info(f"Running matcher")
        match_object = matcher.Matcher(
            base_data_directory=base_data_directory,
            match_job_id=metadata['match_job_id'],
            clustering_rules=config['clusterer']['args'],
            contrast_rules=config['contrasts'],
            blocking_rules=config['blocking_rules']
        )
        matches = match_object.block_and_match(df=df)
        metadata['data_matched_time'] = datetime.datetime.now()
        metadata.update(match_object.metadata)
        logger.debug('Matching done!')

        logger.debug(f"Number of matched pairs: {len(matches)}")

        # Writing: Join the matched ids to the source data for each event & write to S3 and postgres
        logger.info('Writing matched results!')
        matched_results_paths = ioutils.write_matched_data(
            matches=matches,
            base_data_directory=base_data_directory,
            person_keys=config['keys'],
            schema_pk_lookup={event_type:schema_pk_lookup[event_type] for event_type in event_types_read},
            match_job_id=metadata['match_job_id']
        )
        metadata['data_written_time'] = datetime.datetime.now()
        ioutils.write_dict_to_yaml(metadata, f"{base_data_directory}/match_cache/metadata/{metadata['match_job_id']}")

        logger.info('Finished')
        match_end_time = datetime.datetime.now()
        match_runtime =  match_end_time - metadata['match_job_start_time']

        match_successful = True
        status_message = 'new matches are available. Yippee!'

    except Exception as e:
        match_end_time = datetime.datetime.now()
        match_runtime = match_end_time - metadata['match_job_start_time']
        match_successful = False
        status_message = 'matching failed. SAD!'
        try:
            matched_results_paths
        except NameError:
            matched_results_paths = None

        try:
            match_end_time
        except NameError:
            match_end_time = datetime.datetime.now()

        try:
            match_runtime
        except NameError:
            match_runtime = match_end_time - metadata['match_job_start_time']

        logger.error(f'Matcher failed with message "{str(e)}"')

    finally:
        if notify_webapp:
            job = q.enqueue_call(
                func='backend.match_finished',
                args=(
                    matched_results_paths,
                    metadata['match_job_id'],
                    metadata['match_job_start_time'],
                    match_end_time,
                    match_successful,
                    match_runtime,
                    upload_id
                ),
                result_ttl=5000,
                timeout=3600
            )
            logger.info(f'Notified the webapp that {status_message}')
        logger.info('Matcher done!!')
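do_match only reads a few top-level config keys (keys, blocking_rules, contrasts, clusterer.args). A hypothetical matcher_config.yaml, shown as the dict yaml.safe_load would return, with all values illustrative:

config = {
    'keys': ['first_name', 'last_name', 'dob', 'ssn'],
    'blocking_rules': {'last_name': 1},            # column -> rule parameter
    'contrasts': {
        'first_name': [{'method': 'compare_string_distance',
                        'args': {'method': 'jarowinkler'}}],
        'dob': [{'method': 'compare_exact'}],
    },
    'clusterer': {'args': {'eps': 0.5, 'min_samples': 1}},   # illustrative clusterer args
}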
Example #12
def load_data_for_matching(base_data_directory: str, event_types: list,
                           keys: list, match_job_id: str) -> tuple:
    # We will frame the record linkage problem as a deduplication problem
    logger.debug(f'Loading data for event types: {event_types}')
    try:
        df = pd.concat([
            load_one_event_type(base_data_directory, event_type, keys,
                                match_job_id) for event_type in event_types
        ])
    except ValueError as e:
        if str(e) != "All objects passed were None":
            raise
        else:
            logger.debug('Found no events data.')
            raise ValueError(
                f'No merged data files found for any event type ({event_types}) in {base_data_directory}.'
            )
    logger.debug(f'Number of deduped events: {len(df)}')

    ## and the match_job_id
    df['match_job_id'] = match_job_id

    # Which event types did we read successfully?
    event_types_read = df.event_type.drop_duplicates().values

    ## TODO: Check the definition of keys
    # Drop duplicates, disregarding event type
    df = df.drop('event_type', axis=1)
    df = df.drop_duplicates(subset=keys)

    logger.debug(
        f"The loaded dataframe has the following columns: {df.columns}")
    logger.debug(f"The dimensions of the loaded dataframe are: {df.shape}")
    logger.debug(f"The indices of the loaded dataframe are {df.index}")
    logger.debug(
        f'The loaded dataframe has {len(df)} rows and {len(df.index.unique())} unique indices'
    )
    logger.debug(
        f'The loaded dataframe has the following duplicate indices: {df[df.index.duplicated()].index.values}'
    )

    # Cache read data
    write_dataframe(
        df=df.reset_index(),
        filepath=f'{base_data_directory}/match_cache/loaded_data/{match_job_id}'
    )

    return df, event_types_read
Example #13
def write_dict_to_yaml(dict_to_write: dict, filepath: str):
    logger.debug(f'Writing some dictionary data to {filepath}! Oooooo!')
    with smart_open.smart_open(filepath, 'wb') as fout:
        fout.write(yaml.dump(dict_to_write).encode())
    logger.info(f'Wrote metadata to {filepath}')
Example #14
def compare_exact_n_chars(s1: pd.Series, s2: pd.Series, n: int) -> pd.Series:
    logger.debug(f'Doing an exact comparison of {n} characters')
    s1 = truncate_string(s1, n)
    s2 = truncate_string(s2, n)
    return (s1 == s2).astype(float)
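truncate_string is not shown in these examples; a minimal stand-in consistent with its use here (an assumption) would be:

import pandas as pd

def truncate_string(s: pd.Series, n: int) -> pd.Series:
    # Hypothetical stand-in: keep only the first n characters of each value.
    return s.astype(str).str[:n]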
Example #15
    def run(self, pairs: pd.MultiIndex, df: pd.DataFrame) -> pd.DataFrame:
        """ Read the config and make the required contrasts.

            The config dictionary keys are column names. The values define
            the contrasts to make for the given column. Each definition is a
            dictionary with a `method` key and (optionally) an `args`
            key containing a dictionary of arguments to pass to the method.

            We will loop over the column names and the contrast definitions and
            call the appropriate method for each.
        """
        self.metadata['contraster_start_time'] = datetime.datetime.now()
        logger.debug(f'Making the following contrasts: \n{self.config}')

        for col_name, contrast_definitions in self.config.items():
            logger.debug(
                f'Found the following contrasts for {col_name}: \n{contrast_definitions}'
            )

            for contrast_definition in contrast_definitions:
                logger.debug(
                    f"Trying out {contrast_definition['method']} on {col_name}."
                )
                contrast_method = getattr(self, contrast_definition['method'])

                if 'args' in contrast_definition.keys():
                    logger.debug(
                        f"Passing {contrast_definition['args']} to {contrast_definition['method']}"
                    )
                    contrast_method(col_name, contrast_definition['args'])

                else:
                    logger.debug(
                        f"Found no arguments for {col_name} {contrast_definition['method']}."
                    )
                    contrast_method(col_name)

        logger.debug('Running all those contrasts!')
        contrasts = self.contraster.compute(pairs, df)

        self.make_contrast_metadata(contrasts)

        self.metadata['contraster_end_time'] = datetime.datetime.now()

        return contrasts
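The docstring above describes the expected config shape; a small illustrative config using the contrast methods shown in these examples (the argument values are hypothetical):

config = {
    'first_name': [
        {'method': 'compare_string_distance',
         'args': {'method': 'jarowinkler', 'threshold': 0.92}},
        {'method': 'compare_exact'},
    ],
    'race': [
        {'method': 'compare_list', 'args': {'method': 'any'}},
    ],
}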
Example #16
    def match(self, df:pd.DataFrame, key='all') -> pd.DataFrame:
        
        metadata = {
            'size': len(df)
        }
        logger.debug('Indexing the data for matching!')
        indexer = rl.FullIndex()
        pairs = indexer.index(df)
        metadata['n_pairs'] = len(pairs)
        logger.debug(f"Number of pairs: {metadata['n_pairs']}")

        logger.debug(f"Initializing contrasting")
        contraster_obj = contraster.Contraster(self.contrast_rules)
        contrasts = contraster_obj.run(pairs, df)
        metadata['contraster_metadata'] = contraster_obj.metadata
        logger.debug(f"Contrasts created")

        contrasts.index.rename(['matcher_index_left', 'matcher_index_right'], inplace=True)
        contrasts = rules.compactify(contrasts, operation='mean')
        logger.debug('Summary distances generated. Making you some stats about them.')
        metadata['scores'] = utils.summarize_column(contrasts.matches)
        logger.debug('Caching those contrasts and distances for you.')
        ioutils.write_dataframe(contrasts.reset_index(), filepath=f'{self.base_data_directory}/match_cache/contrasts/{self.match_job_id}/{key}')

        logger.debug(f"Contrasts dataframe size: {contrasts.shape}")
        logger.debug(f"Contrasts data without duplicated indexes: {contrasts[~contrasts.index.duplicated(keep='first')].shape}")
        logger.debug("Duplicated keys:")
        logger.debug(f"{contrasts[contrasts.index.duplicated(keep=False)]}")

        matches = cluster.generate_matched_ids(
            distances=contrasts,
            DF=df,
            clustering_params=self.clustering_rules,
            base_data_directory=self.base_data_directory, # at some point, we may want to consider making the matcher into a class
            match_job_id=self.match_job_id,       # rather than passing around keys, match_job_ids, base_data_directorys, etc.
            block_name=str(key)
        )

        return matches, metadata