def compare_string_distance(self, col_name: str, args: dict):
    logger.debug(f'Doing a comparison of {col_name} using {args}')
    self.contraster.string(
        col_name,
        col_name,
        label=f'{col_name}_{utils.convert_dict_to_str(args)}_distance',
        **args)
def compare_swap_month_days(self, col_name: str, args: dict):
    logger.debug(f'Checking if the month and day are swapped in {col_name}')
    self.contraster.date(
        col_name,
        col_name,
        label=f'{col_name}_swap_month_days_distance',
        **args)
def make_contrast_metadata(self, contrasts):
    contrast_metadata = {}
    for column in contrasts.columns:
        logger.debug(f'Making you some stats about {column}')
        contrast_metadata[column] = utils.summarize_column(contrasts[column])
    self.metadata['contrasts'] = contrast_metadata
def block_and_match(self, df):
    ## We will split-apply-combine
    logger.debug(f'df sent to block-and-match has the following columns: {df.dtypes}')
    logger.info(f'Blocking by {self.blocking_rules}')
    grouped = df.groupby([
        utils.unpack_blocking_rule(df, column_name, position)
        for column_name, position in self.blocking_rules.items()
    ])
    logger.info(f'Applying matcher to {len(grouped)} blocks.')

    all_block_metadata = {}
    matches = {}
    for key, group in grouped:
        logger.debug(f"Matching group {key} of size {len(group)}")

        if len(group) > 1:
            matches[key], block_metadata = self.match(group, key)
        else:
            block_metadata = {
                'size': 1,
                'n_pairs': 0,
                'contrasts': None,
                'scores': None
            }
            logger.debug(f"Group {key} only has one record, making a singleton id")
            matches[key] = cluster.generate_singleton_id(group, str(key))

        logger.debug('Wrapping up block')
        all_block_metadata[key] = block_metadata

    logger.debug('All blocks done! Yeehaw!')
    self.metadata['blocks'] = all_block_metadata
    return pd.concat(matches.values())
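# Illustrative shape of self.blocking_rules, inferred from the groupby above
# (an assumption, not the project's documented config format): each entry maps
# a column name to a position handed to utils.unpack_blocking_rule, e.g.
#
#   blocking_rules = {'last_name': 1, 'dob': 4}
#
# so records are grouped by the per-row key components unpack_blocking_rule
# returns for each (column_name, position) pair.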
def compare_numeric_distance(self, col_name: str, args: dict):
    logger.debug(f'Doing a numeric distance calculation on {col_name}')
    self.contraster.numeric(
        col_name,
        col_name,
        label=f'{col_name}_numeric_{utils.convert_dict_to_str(args)}_distance',
        **args)
def generate_matched_ids(
        distances: pd.DataFrame,
        DF: pd.DataFrame,
        clustering_params: dict,
        base_data_directory: str,
        match_job_id: str,
        block_name='',
) -> pd.DataFrame:
    logger.info('Beginning clustering & id generation.')

    distances = square_distance_matrix(distances)
    ioutils.write_dataframe(
        distances.reset_index(),
        filepath=f'{base_data_directory}/match_cache/square_distances/{match_job_id}/{block_name}')

    ids = cluster(distances, **clustering_params)
    ioutils.write_dataframe(
        ids.reset_index(),
        filepath=f'{base_data_directory}/match_cache/raw_cluster_ids/{match_job_id}/{block_name}')

    max_cluster_id = ids.max()
    replacement_ids = pd.Series(
        range(max_cluster_id + 1, max_cluster_id + len(ids[ids == -1]) + 1),
        index=ids[ids == -1].index)
    ids[ids == -1] = replacement_ids
    logger.debug(f'IDs: {ids}')
    logger.debug(f'Replaced noisy singleton ids with \n{replacement_ids}')

    logger.debug(f'Adding the block name ({block_name}) to the matched_ids.')
    ids = block_name + ids.astype(str)
    logger.debug(f'New IDs: \n{ids}')

    df = DF.copy()
    df['matched_id'] = ids
    logger.info('Matched ids generated')

    return df
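# The -1 labels replaced above are presumably the "noise" label produced by a
# DBSCAN-style clusterer; each noise point is promoted to its own singleton
# cluster id. A minimal, self-contained sketch of that step using only pandas
# (illustrative values, not project data):
#
#   import pandas as pd
#   ids = pd.Series([0, 1, -1, 1, -1])
#   max_cluster_id = ids.max()                          # 1
#   noise = ids[ids == -1]
#   replacement_ids = pd.Series(
#       range(max_cluster_id + 1, max_cluster_id + len(noise) + 1),
#       index=noise.index)                              # 2, 3
#   ids[ids == -1] = replacement_ids                    # 0, 1, 2, 1, 3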
def compare_list(self, col_name: str, args: dict):
    if args['method'] == 'any':
        logger.debug(f'Checking if {col_name} shares any value.')
        self.contraster.compare_vectorized(
            lists_share_any_values,
            col_name,
            col_name,
            label=f'{col_name}_any_list_item_distance')
    elif args['method'] == 'all':
        logger.debug(f'Checking if {col_name} shares all values.')
        self.contraster.compare_vectorized(
            lists_share_all_values,
            col_name,
            col_name,
            label=f'{col_name}_all_list_items_distance')
    else:
        raise ValueError(
            f"I don't know how to compare lists with this method ({args['method']}). "
            "Please send me 'all' or 'any'."
        )
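# lists_share_any_values and lists_share_all_values are defined elsewhere in
# the codebase; a minimal sketch of the assumed behaviour of the 'any' variant
# (an illustration, not necessarily the project's implementation):
#
#   def lists_share_any_values(s1: pd.Series, s2: pd.Series) -> pd.Series:
#       """1.0 if the two lists share at least one value, else 0.0."""
#       return pd.Series(
#           [float(bool(set(a) & set(b))) for a, b in zip(s1, s2)],
#           index=s1.index)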
def write_matched_data(matches: pd.DataFrame,
                       base_data_directory: str,
                       person_keys: list,
                       schema_pk_lookup: dict,
                       match_job_id: str) -> dict:
    write_dataframe(
        df=matches.reset_index(),
        filepath=f'{base_data_directory}/match_cache/matcher_results/{match_job_id}')

    matched_results_paths = {}
    logger.debug(schema_pk_lookup)
    for event_type, primary_keys in schema_pk_lookup.items():
        logger.info(f'Writing matched data for {base_data_directory} {event_type}')
        matched_results_paths[event_type] = write_one_event_type(
            df=matches,
            base_data_directory=base_data_directory,
            event_type=event_type,
            person_keys=person_keys,
            primary_keys=primary_keys,
            match_job_id=match_job_id)

    return matched_results_paths
def compare_exact(self, col_name: str, n_chars=None):
    logger.debug(f'Doing an exact comparison on {col_name}')
    if n_chars is not None:
        logger.debug(
            f'Doing an exact comparison of {n_chars} characters of {col_name}')
        self.contraster.compare_vectorized(
            compare_exact_n_chars,
            col_name,
            col_name,
            n_chars['n_chars'],
            label=f"{col_name}_exact_{n_chars['n_chars']}_distance")
    else:
        logger.debug(
            f'Doing an exact comparison of all characters in {col_name}')
        self.contraster.exact(
            col_name, col_name, label=f'{col_name}_exact_distance')
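# Despite its name, the n_chars parameter above receives the whole `args`
# dictionary from the contrast config (see Contraster.run), so a call is
# assumed to look like this illustrative example:
#
#   self.compare_exact('ssn', {'n_chars': 4})   # compare only the first 4 characters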
def preprocess(df: pd.DataFrame, match_job_id: str, base_data_directory: str) -> pd.DataFrame:
    # full_name
    # full name is only given if name parts are not. maybe we should do some
    # preprocessing on full names to create name parts and use only the name
    # parts, especially since it is possible for the jail and HMIS systems to
    # differ on what they use

    # prefix
    # we should preprocess prefixes to remove punctuation and possibly spaces
    if 'prefix' in df.columns:
        logger.debug('Removing punctuation from prefixes')
        df['prefix'] = df['prefix'].str.replace(r'[^\w\s]', '', regex=True)

    # first_name
    # potential preprocessing steps:
    # - remove punctuation
    # - create: full_first_name, first_word_first_name
    # - try using second+ word of first name as middle name if no middle name

    # middle_name
    # potential preprocessing steps:
    # - remove punctuation
    # - create: full_middle_name, first_word_middle_name, second_word_middle_name

    # last_name
    # potential preprocessing steps:
    # - remove punctuation
    # - create: full_last_name, first_word_last_name, second_word_last_name

    # suffix
    if 'suffix' in df.columns:
        logger.debug('Removing punctuation from suffixes')
        df['suffix'] = df['suffix'].str.replace(r'[^\w\s]', '', regex=True)

    # dob
    # MUST BE CAST TO DATETIME DURING PREPROCESSING
    if 'dob' in df.columns:
        logger.debug('Converting date of birth to datetime')
        df['dob'] = pd.to_datetime(df['dob'])

    # ssn
    # THIS SHOULD BE CONVERTED TO STRING. The SSN consists of 3 parts, and
    # numerical distances are only VAGUELY meaningful (e.g., the first 3 digits
    # increase roughly east to west but not in a rigorous way, and the second 2
    # digits are given out in a fixed but non-monotonic order).
    # The first three digits are the "area code" of where the person was
    # registered. Most people living in an area will have one of a few local
    # area codes; therefore, the distinctiveness of the area code may be useful
    # for matching. We may want to preprocess ssn to extract the area code to
    # make this comparison.
    if 'ssn' in df.columns:
        logger.debug('Converting social security number to str')
        df['ssn'] = df['ssn'].astype(str)

    # dmv_number
    # THIS SHOULD BE CAST TO STRING. In some jurisdictions, they are strings
    # and in others ints. To ensure that we can generalize here, we need to
    # convert to string for all of them.
    if 'dmv_number' in df.columns:
        logger.debug('Converting dmv number to str')
        df['dmv_number'] = df['dmv_number'].astype(str)

    # race
    # make race into a list
    # eventually, we will want to combine secondary race and race into a single field
    if 'race' in df.columns:
        logger.debug('Converting race to list')
        df['race'] = df['race'].fillna('').str.split(',')
        logger.debug(f"Races observed in preprocessed df: {df['race']}")

    # ethnicity
    # ethnicity encodes only Hispanic/Not Hispanic. for some databases, Hispanic
    # is actually included in the race categories instead of in a separate
    # field. we may want to do some preprocessing to add H to the race list
    # where the ethnicity field contains 'Hispanic'

    logger.info('Preprocessing done!')
    logger.debug(f"The preprocessed dataframe has the following columns: {df.columns}")
    logger.debug(f"The dimensions of the preprocessed dataframe are: {df.shape}")

    ioutils.write_dataframe(
        df.reset_index(),
        filepath=f'{base_data_directory}/match_cache/preprocessed_data/{match_job_id}')

    return df
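# Illustrative effect of the race split above (example value, not project
# data): a comma-delimited string such as 'White,Asian' becomes the list
# ['White', 'Asian'], which is what the list-based contrasts (compare_list)
# expect when checking for shared values.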
def do_match(
        base_data_directory: str,
        schema_pk_lookup: dict,
        upload_id: str = None,
        notify_webapp: bool = True,
        config_path: str = 'matcher_config.yaml'
):
    with open(config_path) as f:
        config = yaml.safe_load(f)

    # Initializing: let's get started by collecting and logging some job metadata
    metadata = {
        'match_job_start_time': datetime.datetime.now(),
        'match_job_id': utils.unique_match_job_id(),
        'base_data_directory': base_data_directory,
        'config': config
    }
    logger.info("Matching process started with the following configuration:")
    for key, value in config.items():
        logger.info(f"Matcher config {key}: {value}")

    try:
        # Loading: collect matching data (keys) for all available event types
        # & record which event types were found
        logger.info('Loading data for matching.')
        df, event_types_read = ioutils.load_data_for_matching(
            base_data_directory,
            list(schema_pk_lookup.keys()),
            config['keys'],
            metadata['match_job_id']
        )
        metadata['event_types_read'] = list(event_types_read)
        metadata['loaded_data_columns'] = list(df.columns.values)
        metadata['loaded_data_shape'] = list(df.shape)
        metadata['data_loaded_time'] = datetime.datetime.now()

        # Preprocessing: enforce data types and split/combine columns for features
        logger.info('Doing some preprocessing on the columns')
        df = preprocess.preprocess(df, metadata['match_job_id'], base_data_directory)
        metadata['preprocessed_data_columns'] = list(df.columns.values)
        metadata['preprocessed_data_shape'] = list(df.shape)
        metadata['data_preprocessed_time'] = datetime.datetime.now()

        # Matching: block the data, generate pairs and features, and cluster entities
        logger.info("Running matcher")
        match_object = matcher.Matcher(
            base_data_directory=base_data_directory,
            match_job_id=metadata['match_job_id'],
            clustering_rules=config['clusterer']['args'],
            contrast_rules=config['contrasts'],
            blocking_rules=config['blocking_rules']
        )
        matches = match_object.block_and_match(df=df)
        metadata['data_matched_time'] = datetime.datetime.now()
        metadata.update(match_object.metadata)
        logger.debug('Matching done!')
        logger.debug(f"Number of matched pairs: {len(matches)}")

        # Writing: join the matched ids to the source data for each event
        # & write to S3 and postgres
        logger.info('Writing matched results!')
        matched_results_paths = ioutils.write_matched_data(
            matches=matches,
            base_data_directory=base_data_directory,
            person_keys=config['keys'],
            schema_pk_lookup={
                event_type: schema_pk_lookup[event_type]
                for event_type in event_types_read
            },
            match_job_id=metadata['match_job_id']
        )
        metadata['data_written_time'] = datetime.datetime.now()

        ioutils.write_dict_to_yaml(
            metadata,
            f"{base_data_directory}/match_cache/metadata/{metadata['match_job_id']}")

        logger.info('Finished')
        match_end_time = datetime.datetime.now()
        match_runtime = match_end_time - metadata['match_job_start_time']
        match_successful = True
        status_message = 'new matches are available. Yippee!'

    except Exception as e:
        match_end_time = datetime.datetime.now()
        match_runtime = match_end_time - metadata['match_job_start_time']
        match_successful = False
        status_message = 'matching failed. SAD!'

        try:
            matched_results_paths
        except NameError:
            matched_results_paths = None
        try:
            match_end_time
        except NameError:
            match_end_time = datetime.datetime.now()
        try:
            match_runtime
        except NameError:
            match_runtime = match_end_time - metadata['match_job_start_time']

        logger.error(f'Matcher failed with message "{str(e)}"')

    finally:
        if notify_webapp:
            job = q.enqueue_call(
                func='backend.match_finished',
                args=(
                    matched_results_paths,
                    metadata['match_job_id'],
                    metadata['match_job_start_time'],
                    match_end_time,
                    match_successful,
                    match_runtime,
                    upload_id
                ),
                result_ttl=5000,
                timeout=3600
            )
            logger.info(f'Notified the webapp that {status_message}')
        logger.info('Matcher done!!')
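# Illustrative shape of matcher_config.yaml, inferred from the keys read in
# do_match (an assumption; the column names, blocking positions, and contrast
# methods below are hypothetical examples, not documented defaults):
#
#   keys:
#     - first_name
#     - last_name
#     - dob
#   blocking_rules:
#     last_name: 1
#   contrasts:
#     first_name:
#       - method: compare_exact
#       - method: compare_string_distance
#         args: {method: jarowinkler}
#   clusterer:
#     args: {}        # keyword arguments forwarded to the clusterer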
def load_data_for_matching(base_data_directory: str,
                           event_types: list,
                           keys: list,
                           match_job_id: str) -> tuple:
    # We will frame the record linkage problem as a deduplication problem
    logger.debug(f'Loading data for event types: {event_types}')
    try:
        df = pd.concat([
            load_one_event_type(base_data_directory, event_type, keys, match_job_id)
            for event_type in event_types
        ])
    except ValueError as e:
        if str(e) != "All objects passed were None":
            raise
        else:
            logger.debug('Found no events data.')
            raise ValueError(
                f'No merged data files found for any event type ({event_types}) in {base_data_directory}.'
            )
    logger.debug(f'Number of deduped events: {len(df)}')

    ## and the match_job_id
    df['match_job_id'] = match_job_id

    # Which event types did we read successfully?
    event_types_read = df.event_type.drop_duplicates().values

    ## TODO: Check the definition of keys
    # Drop duplicates, disregarding event type
    df = df.drop('event_type', axis=1)
    df = df.drop_duplicates(subset=keys)

    logger.debug(f"The loaded dataframe has the following columns: {df.columns}")
    logger.debug(f"The dimensions of the loaded dataframe are: {df.shape}")
    logger.debug(f"The indices of the loaded dataframe are {df.index}")
    logger.debug(
        f'The loaded dataframe has {len(df)} rows and {len(df.index.unique())} unique indices')
    logger.debug(
        f'The loaded dataframe has the following duplicate indices: {df[df.index.duplicated()].index.values}')

    # Cache read data
    write_dataframe(
        df=df.reset_index(),
        filepath=f'{base_data_directory}/match_cache/loaded_data/{match_job_id}')

    return df, event_types_read
def write_dict_to_yaml(dict_to_write: dict, filepath: str):
    logger.debug(f'Writing some dictionary data to {filepath}! Oooooo!')
    with smart_open.smart_open(filepath, 'wb') as fout:
        fout.write(yaml.dump(dict_to_write).encode())
    logger.info(f'Wrote metadata to {filepath}')
def compare_exact_n_chars(s1: pd.Series, s2: pd.Series, n: int) -> pd.Series:
    logger.debug(f'Doing an exact comparison of {n} characters')
    s1 = truncate_string(s1, n)
    s2 = truncate_string(s2, n)
    return (s1 == s2).astype(float)
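# truncate_string is assumed to keep only the first n characters of each value;
# a minimal sketch of that behaviour (an illustration, not necessarily the
# project's implementation):
#
#   def truncate_string(s: pd.Series, n: int) -> pd.Series:
#       return s.astype(str).str[:n]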
def run(self, pairs: pd.MultiIndex, df: pd.DataFrame) -> pd.DataFrame:
    """ Read the config and make the required contrasts.

    The config dictionary keys are column names. The values define the
    contrasts to make for the given column. Each definition is a dictionary
    with a `method` key and (optionally) an `args` key containing a
    dictionary of arguments to pass to the method.

    We will loop over the column names and the contrast definitions and call
    the appropriate method for each.
    """
    self.metadata['contraster_start_time'] = datetime.datetime.now()
    logger.debug(f'Making the following contrasts: \n{self.config}')

    for col_name, contrast_definitions in self.config.items():
        logger.debug(
            f'Found the following contrasts for {col_name}: \n{contrast_definitions}')

        for contrast_definition in contrast_definitions:
            logger.debug(
                f"Trying out {contrast_definition['method']} on {col_name}.")
            contrast_method = getattr(self, contrast_definition['method'])

            if 'args' in contrast_definition.keys():
                logger.debug(
                    f"Passing {contrast_definition['args']} to {contrast_definition['method']}")
                contrast_method(col_name, contrast_definition['args'])
            else:
                logger.debug(
                    f"Found no arguments for {col_name} {contrast_definition['method']}.")
                contrast_method(col_name)

    logger.debug('Running all those contrasts!')
    contrasts = self.contraster.compute(pairs, df)

    self.make_contrast_metadata(contrasts)
    self.metadata['contraster_end_time'] = datetime.datetime.now()

    return contrasts
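# Illustrative shape of self.config, matching the structure described in the
# docstring above (the column names and args values are hypothetical examples):
#
#   {
#       'first_name': [
#           {'method': 'compare_exact'},
#           {'method': 'compare_string_distance',
#            'args': {'method': 'jarowinkler'}},
#       ],
#       'ssn': [
#           {'method': 'compare_exact', 'args': {'n_chars': 4}},
#       ],
#   }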
def match(self, df: pd.DataFrame, key='all') -> tuple:
    metadata = {'size': len(df)}

    logger.debug('Indexing the data for matching!')
    indexer = rl.FullIndex()
    pairs = indexer.index(df)
    metadata['n_pairs'] = len(pairs)
    logger.debug(f"Number of pairs: {metadata['n_pairs']}")

    logger.debug("Initializing contrasting")
    contraster_obj = contraster.Contraster(self.contrast_rules)
    contrasts = contraster_obj.run(pairs, df)
    metadata['contraster_metadata'] = contraster_obj.metadata
    logger.debug("Contrasts created")

    contrasts.index.rename(['matcher_index_left', 'matcher_index_right'], inplace=True)
    contrasts = rules.compactify(contrasts, operation='mean')
    logger.debug('Summary distances generated. Making you some stats about them.')
    metadata['scores'] = utils.summarize_column(contrasts.matches)

    logger.debug('Caching those contrasts and distances for you.')
    ioutils.write_dataframe(
        contrasts.reset_index(),
        filepath=f'{self.base_data_directory}/match_cache/contrasts/{self.match_job_id}/{key}')
    logger.debug(f"Contrasts dataframe size: {contrasts.shape}")
    logger.debug(f"Contrasts data without duplicated indexes: {contrasts[~contrasts.index.duplicated(keep='first')].shape}")
    logger.debug("Duplicated keys:")
    logger.debug(f"{contrasts[contrasts.index.duplicated(keep=False)]}")

    matches = cluster.generate_matched_ids(
        distances=contrasts,
        DF=df,
        clustering_params=self.clustering_rules,
        base_data_directory=self.base_data_directory,  # at some point, we may want to consider making the matcher into a class
        match_job_id=self.match_job_id,                # rather than passing around keys, match_job_ids, base_data_directories, etc.
        block_name=str(key)
    )

    return matches, metadata
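# The full index above generates every within-block candidate pair, so a block
# of n records produces n * (n - 1) / 2 pairs (e.g. 4 records -> 6 pairs);
# this is why the blocking in block_and_match is needed to keep the pairwise
# comparisons tractable.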