def save_output_file(cls, config, outputdir, job_count, frames,
                     investigations):
    """
    Concatenate the per-chunk result frames and write the output file,
    plus the list of hashes that need investigation.
    """
    cls.log.info("Got {} frames. Concatenating...".format(job_count))
    df = pd.concat(frames, ignore_index=True)
    out_file = os.path.join(outputdir, config['OUT_FILE'])
    out_file_investigate = os.path.join(
        outputdir, config['OUT_FILE_INVESTIGATE'])  # noqa
    utils.frame_to_file(df, out_file, delimiter=config['OUT_DELIMITER'])
    cls.log.info("Wrote output file: {} ({} data rows, {})".format(
        out_file, len(df), utils.get_file_size(out_file)))

    with open(out_file_investigate, 'w') as invf:
        for line in investigations:
            invf.write("{}\n".format(line))

    cls.log.info("Wrote hashes that need investigation to {}".format(
        out_file_investigate))
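
# NOTE: `utils.frame_to_file` and `utils.get_file_size` come from the
# project's utils module, which is not part of this excerpt. A minimal
# sketch of what `frame_to_file` is assumed to do (write a delimited text
# file without the pandas row index) is shown below; the real helper may
# behave differently.
import pandas as pd


def frame_to_file(df, path, delimiter=','):
    """Write a DataFrame to a delimited text file without the row index."""
    df.to_csv(path, sep=delimiter, index=False)
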
def generate(cls, config, inputdir, outputdir, partner, ask=True,
             create_tables=True):
    """
    Read the "phi_hashes.csv" file and generate UUID's.
    Optionally save the "hash -> UUID" mapping to "links.csv"

    .. seealso::
        :meth:`_process_frame`
    """
    cls._validate_config(config)
    # engine = db_utils.get_db_engine(config)
    EXPECTED_COLS = config['EXPECTED_COLS']
    SAVE_OUT_FILE = config['SAVE_OUT_FILE']
    in_file = os.path.join(inputdir, config['IN_FILE'])
    cls.log.info("Using [{}] as source folder".format(inputdir))
    cls.log.info("Using [{}] as source file".format(in_file))
    cls.log.info("Connection HOST:DB - {}/{}"
                 .format(config['DB_HOST'], config['DB_NAME']))

    if ask:
        confirmed = utils.ask_yes_no(
            "Continue link procedure to create files in the [{}] folder?"
            .format(outputdir))
        if not confirmed:
            sys.exit("If you say so...")

    reader = None

    try:
        reader = pd.read_csv(in_file,
                             sep=config['IN_DELIMITER'],
                             dtype=object,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             usecols=list(EXPECTED_COLS),
                             chunksize=config['LINES_PER_CHUNK'],
                             iterator=True)
        cls.log.info("Reading data from file: {} ({})"
                     .format(in_file, utils.get_file_size(in_file)))
    except ValueError as exc:
        cls.log.info("Please check if the actual column names"
                     " in [{}] match the expected column"
                     " names: {}.".format(in_file, sorted(EXPECTED_COLS)))
        cls.log.error("Error: {}".format(exc))

    frames = []
    investigations = []
    pool = mp.Pool(processes=NUM_CPUS)
    jobs = []

    for index, df_source in enumerate(reader):
        df_source.fillna('', inplace=True)
        # The magic happens here...
        job = utils.apply_async(pool, cls._process_frame,
                                (config, df_source, partner))
        jobs.append(job)

    job_count = len(jobs)
    cls.log.info("Total multiproc jobs: {}".format(job_count))

    # collect the results
    for index, job in enumerate(jobs):
        try:
            df_temp, to_investigate = job.get()
            frames.append(df_temp)
            investigations.extend(to_investigate)

            if index % 100 == 0:
                cls.log.info("Appended result {} (out of {})"
                             .format(index, job_count))
        except Exception as exc:
            cls.log.error("Job [{}] error: {}".format(index, exc))
            mp.get_logger().error(traceback.format_exc())

    pool.close()
    pool.join()

    if SAVE_OUT_FILE:
        cls.log.info("Got {} frames. Concatenating...".format(job_count))
        df = pd.concat(frames, ignore_index=True)
        out_file = os.path.join(outputdir, config['OUT_FILE'])
        out_file_investigate = os.path.join(
            outputdir, config['OUT_FILE_INVESTIGATE'])  # noqa
        utils.frame_to_file(df, out_file, delimiter=config['OUT_DELIMITER'])
        cls.log.info("Wrote output file: {} ({} data rows, {})".format(
            out_file, len(df), utils.get_file_size(out_file)))

        with open(out_file_investigate, 'w') as invf:
            for line in investigations:
                invf.write("{}\n".format(line))

        cls.log.info("Wrote hashes that need investigation to {}"
                     .format(out_file_investigate))

    return True
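
# NOTE: `utils.apply_async` belongs to the project's utils module and is not
# shown in this excerpt. Judging from how it is called above, it is assumed
# to be a thin wrapper around `multiprocessing.Pool.apply_async`; a minimal
# sketch under that assumption:
def apply_async(pool, func, args):
    """Submit func(*args) to the pool and return the AsyncResult handle."""
    return pool.apply_async(func, args)
# Each call above submits one chunk as a job; `job.get()` in the collection
# loop then returns whatever `_process_frame` returned for that chunk.
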
def generate(cls, config, inputdir, outputdir):
    """
    Read the "phi.csv" file and generate "phi_hashes.csv" containing
    two (or more) sha256 strings for each line in the input file.

    This method is invoked from:

    .. seealso::
        run_hasher.py

    :param inputdir: directory name for the source file
    :param outputdir: directory name for generated file

    :rtype: bool
    :return: True after the frame with hashes of the PHI data is written
        to the output file

    Columns:
        - patid
        - sha_rule_1 (first_last_dob_race)
        - sha_rule_2 (first_last_dob_sex)
    """
    cls._validate_config(config)
    EXPECTED_COLS = config['EXPECTED_COLS']
    ENABLED_RULES = config.get('ENABLED_RULES')

    cls.log.info("Using [{}] as source folder".format(inputdir))
    cls.log.info("Using [{}] as salt".format(config['SALT']))
    cls.log.info("Expecting input file to contain columns: {}".format(
        EXPECTED_COLS))
    cls.log.info("Using [{}] as destination folder".format(outputdir))
    in_file = os.path.join(inputdir, config['IN_FILE'])
    reader = None

    try:
        reader = pd.read_csv(in_file,
                             sep=config['IN_DELIMITER'],
                             dtype=object,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             usecols=list(EXPECTED_COLS),
                             chunksize=config['LINES_PER_CHUNK'],
                             iterator=True)
        cls.log.info("Reading data from file: {} ({})".format(
            in_file, utils.get_file_size(in_file)))
    except ValueError as exc:
        cls.log.info("Please check if the actual column names"
                     " in [{}] match the expected column"
                     " names: {}.".format(in_file, sorted(EXPECTED_COLS)))
        cls.log.error("Error: {}".format(exc))

    frames = []
    pool = mp.Pool(processes=NUM_CPUS)
    jobs = []

    for index, df_source in enumerate(reader):
        cls.log.info("Processing {} lines of frame {}".format(
            config['LINES_PER_CHUNK'], index))
        df_source.fillna('', inplace=True)

        for col in EXPECTED_COLS:
            if col not in sorted(df_source):
                raise Exception(
                    "The input data frame does not have all "
                    "expected columns: {}".format(EXPECTED_COLS))

        # validate the values constrained to a set
        invalid_race = df_source.loc[
            ~df_source['race'].isin(VALID_RACE_VALS)]  # noqa
        invalid_sex = df_source.loc[~df_source['sex'].isin(VALID_SEX_VALS)]

        if len(invalid_race) > 0:
            cls.log.info("Please check race: {}".format(invalid_race))
            raise Exception("The input file contains an invalid value for "
                            "the `race` column. Please review the specs.")

        if len(invalid_sex) > 0:
            cls.log.warning("Please check sex: {}".format(invalid_sex))
            raise Exception("The input file contains an invalid value for "
                            "the `sex` column. Please review the specs.")

        job = utils.apply_async(pool, cls._process_frame,
                                (df_source, config))
        jobs.append(job)

    job_count = len(jobs)
    cls.log.info("Total multiproc jobs: {}".format(job_count))

    # collect the results
    for index, job in enumerate(jobs):
        try:
            frames.append(job.get())

            if index % 10 == 0:
                cls.log.info("Got results for frame {} (out of {})".format(
                    index, job_count))
        except Exception as exc:
            cls.log.error("Job [{}] error: {}".format(index, exc))
            mp.get_logger().error(traceback.format_exc())

    pool.close()
    pool.join()

    cls.log.info("Got all {} frames. Concatenating...".format(job_count))
    df = pd.concat(frames, ignore_index=True)

    # Concatenation can re-order columns so we need to enforce the order
    out_columns = ['PATID']
    out_columns.extend(ENABLED_RULES)

    out_file = os.path.join(outputdir, config['OUT_FILE'])
    utils.frame_to_file(df[out_columns], out_file,
                        delimiter=config['OUT_DELIMITER'])
    cls.log.info("Wrote output file: {} ({} data rows, {})".format(
        out_file, len(df), utils.get_file_size(out_file)))

    # count how many patients did not get a hash generated
    # query_no_hashes = df.query('m == "" & n == ""')
    query_no_hashes = ' & '.join(
        ['{} == ""'.format(rule) for rule in ENABLED_RULES])
    df_no_hashes = df.query(query_no_hashes)
    cls.log.info("The result file contains [{}] patients without hashes."
                 " See some examples below.".format(len(df_no_hashes)))
    cls.log.info(df_no_hashes.head())
    return True
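
# For illustration: the `query_no_hashes` string built above selects the rows
# where every enabled rule column is empty. The rule names and values in this
# small self-contained demo are hypothetical, not the configured ones.
import pandas as pd

_demo = pd.DataFrame({'PATID': ['1', '2'],
                      'F_L_D_R': ['abc123', ''],
                      'F_L_D_S': ['def456', '']})
_rules = ['F_L_D_R', 'F_L_D_S']
_query = ' & '.join(['{} == ""'.format(rule) for rule in _rules])
# _query == 'F_L_D_R == "" & F_L_D_S == ""'
# _demo.query(_query) keeps only PATID '2', the patient without any hashes
print(_demo.query(_query))
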
def generate(cls, config, inputdir, outputdir, partner):
    """
    Read the "phi_hashes.csv" file and generate UUID's.
    Optionally save the "hash -> UUID" mapping to "links.csv"
    """
    cls._validate_config(config)
    engine = db.get_db_engine(config)

    # pass a session object to avoid creating it in the loop
    # TODO: add a method parameter for controlling the `create_tables` flag
    session = db.get_db_session(engine, create_tables=True)

    EXPECTED_COLS = config['EXPECTED_COLS']
    SAVE_OUT_FILE = config['SAVE_OUT_FILE']
    in_file = os.path.join(inputdir, config['IN_FILE'])
    reader = None

    try:
        reader = pd.read_csv(in_file,
                             sep=config['IN_DELIMITER'],
                             dtype=object,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             usecols=list(EXPECTED_COLS),
                             chunksize=config['LINES_PER_CHUNK'],
                             iterator=True)
        cls.log.info("Reading data from file: {} ({})".format(
            in_file, utils.get_file_size(in_file)))
    except ValueError as exc:
        cls.log.info("Please check if the actual column names"
                     " in [{}] match the expected column"
                     " names: {}.".format(in_file, sorted(EXPECTED_COLS)))
        cls.log.error("Error: {}".format(exc))

    frames = []
    investigations = []

    for df_source in reader:
        df_source.fillna('', inplace=True)
        # The magic happens here...
        df, to_investigate = cls._process_frame(config, session, df_source,
                                                partner)

        if SAVE_OUT_FILE:
            frames.append(df)
            investigations.extend(to_investigate)

    if SAVE_OUT_FILE:
        df = pd.concat(frames, ignore_index=True)
        out_file = os.path.join(outputdir, config['OUT_FILE'])
        out_file_investigate = os.path.join(
            outputdir, config['OUT_FILE_INVESTIGATE'])
        utils.frame_to_file(df, out_file, delimiter=config['OUT_DELIMITER'])
        cls.log.info("Wrote output file: {} ({} data rows, {})".format(
            out_file, len(df), utils.get_file_size(out_file)))

        with open(out_file_investigate, 'w') as invf:
            for line in investigations:
                invf.write("{}\n".format(line))

        cls.log.info("Wrote hashes that need investigation to {}".format(
            out_file_investigate))

    return True
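
# A minimal usage sketch for this serial variant. The class name
# `LinkGenerator`, the partner code, and the config values below are
# assumptions made for illustration; only the config keys appear in the
# method above.
#
#   config = {
#       'IN_FILE': 'phi_hashes.csv',
#       'IN_DELIMITER': ',',
#       'EXPECTED_COLS': ['PATID', 'sha_rule_1', 'sha_rule_2'],
#       'LINES_PER_CHUNK': 10000,
#       'SAVE_OUT_FILE': True,
#       'OUT_FILE': 'links.csv',
#       'OUT_FILE_INVESTIGATE': 'links_investigate.csv',
#       'OUT_DELIMITER': ',',
#       # ... plus the DB settings consumed by db.get_db_engine()
#   }
#   LinkGenerator.generate(config, 'data/in', 'data/out', partner='partner_a')
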
def generate(cls, config, inputdir, outputdir, partner, ask=True,
             create_tables=True):
    """
    Read the "phi_hashes.csv" file and generate UUID's.
    Optionally save the "hash -> UUID" mapping to "links.csv"

    .. seealso::
        :meth:`_process_frame`
    """
    cls._validate_config(config)
    in_file = os.path.join(inputdir, config['IN_FILE'])
    cls.log.info("Using {} as input file ({})".format(
        in_file, utils.get_file_size(in_file)))  # noqa
    cls.log.info("Connection HOST:DB - {}/{}".format(
        config['DB_HOST'], config['DB_NAME']))

    if ask:
        confirmed = utils.ask_yes_no(
            "Run [{}] linkage to create files in the [{}] folder?".format(
                partner, outputdir))
        if not confirmed:
            sys.exit("If you say so...")

    df = cls._prepare_frame(config, inputdir, outputdir)

    frames = []
    investigations = []
    pool = mp.Pool(processes=NUM_CPUS)
    jobs = []
    chunksize = config['LINES_PER_CHUNK']

    # split the prepared frame into fixed-size chunks and submit one job
    # per chunk
    for index, group in df.groupby(np.arange(len(df)) // chunksize):
        df_source = pd.DataFrame(group)
        # The magic happens here...
        job = utils.apply_async(pool, cls._process_frame,
                                (config, df_source, partner))
        jobs.append(job)

    job_count = len(jobs)
    cls.log.info("Total multiproc jobs: {}".format(job_count))

    # collect the results
    for index, job in enumerate(jobs):
        try:
            df_temp, to_investigate = job.get()
            frames.append(df_temp)
            investigations.extend(to_investigate)

            if index % 100 == 0:
                cls.log.info("Appended result {} (out of {})".format(
                    index, job_count))
        except Exception as exc:
            cls.log.error("Job [{}] error: {}".format(index, exc))
            cls.log.error(traceback.format_exc())

    pool.close()
    pool.join()

    if config['SAVE_OUT_FILE']:
        cls.save_output_file(config, outputdir, job_count, frames,
                             investigations)

    return True
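
# For illustration: the groupby expression above splits the prepared frame
# into fixed-size chunks by grouping on the integer division of the row
# position. A small self-contained demo (the sizes are arbitrary):
import numpy as np
import pandas as pd

_demo = pd.DataFrame({'PATID': [str(i) for i in range(10)]})
_chunksize = 3
for _index, _group in _demo.groupby(np.arange(len(_demo)) // _chunksize):
    # chunks 0-2 hold three rows each; chunk 3 holds the single leftover row
    print(_index, len(_group))
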