Example #1
    def generate(cls, config, inputdir, outputdir, partner,
                 ask=True, create_tables=True):
        """
        Read the "phi_hashes.csv" file and generate UUIDs.

        Optionally save the "hash -> UUID" mapping to "links.csv"

        .. seealso::
            :meth:`_process_frame`

        """
        cls._validate_config(config)
        # engine = db_utils.get_db_engine(config)
        EXPECTED_COLS = config['EXPECTED_COLS']
        SAVE_OUT_FILE = config['SAVE_OUT_FILE']
        in_file = os.path.join(inputdir, config['IN_FILE'])

        cls.log.info("Using [{}] as source folder".format(inputdir))
        cls.log.info("Using [{}] as source file".format(in_file))
        cls.log.info("Connection HOST:DB - {}/{}"
                     .format(config['DB_HOST'], config['DB_NAME']))

        if ask:
            confirmed = utils.ask_yes_no(
                "Continue link procedure to create files in the [{}] folder?"
                .format(outputdir))

            if not confirmed:
                sys.exit("If you say so...")

        reader = None

        try:
            reader = pd.read_csv(in_file,
                                 sep=config['IN_DELIMITER'],
                                 dtype=object,
                                 skipinitialspace=True,
                                 skip_blank_lines=True,
                                 usecols=list(EXPECTED_COLS),
                                 chunksize=config['LINES_PER_CHUNK'],
                                 iterator=True)
            cls.log.info("Reading data from file: {} ({})"
                         .format(in_file, utils.get_file_size(in_file)))

        except ValueError as exc:
            cls.log.info("Please check if the actual column names"
                         " in [{}] match the expected column"
                         " names: {}.".format(in_file,
                                              sorted(EXPECTED_COLS)))
            cls.log.error("Error: {}".format(exc))
            # Without a valid reader the chunk loop below cannot run
            raise

        frames = []
        investigations = []
        pool = mp.Pool(processes=NUM_CPUS)
        jobs = []

        for index, df_source in enumerate(reader):
            df_source.fillna('', inplace=True)
            # The magic happens here...
            job = utils.apply_async(pool,
                                    cls._process_frame,
                                    (config, df_source, partner))
            jobs.append(job)

        job_count = len(jobs)
        cls.log.info("Total multiproc jobs: {}".format(job_count))

        # collect the results
        for index, job in enumerate(jobs):
            try:
                df_temp, to_investigate = job.get()
                frames.append(df_temp)
                investigations.extend(to_investigate)

                if index % 100 == 0:
                    cls.log.info("Appended result {} (out of {})"
                                 .format(index, job_count))
            except Exception as exc:
                cls.log.error("Job [{}] error: {}".format(index, exc))
                mp.get_logger().error(traceback.format_exc())

        pool.close()
        pool.join()

        if SAVE_OUT_FILE:
            cls.log.info("Got {} frames. Concatenating...".format(job_count))
            df = pd.concat(frames, ignore_index=True)

            out_file = os.path.join(outputdir, config['OUT_FILE'])
            out_file_investigate = os.path.join(outputdir, config['OUT_FILE_INVESTIGATE'])  # noqa
            utils.frame_to_file(df, out_file,
                                delimiter=config['OUT_DELIMITER'])
            cls.log.info("Wrote output file: {} ({} data rows, {})"
                         .format(
                             out_file, len(df), utils.get_file_size(out_file)))

            with open(out_file_investigate, 'w') as invf:

                for line in investigations:
                    invf.write("{}\n".format(line))

            cls.log.info("Wrote hashes that need investigation to {}"
                         .format(out_file_investigate))

        return True
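
The core pattern in Example #1 is: read the source CSV lazily in fixed-size chunks, hand each chunk to a multiprocessing pool, then collect the per-chunk results. A minimal, self-contained sketch of that pattern follows; the file name, the worker function count_rows, and the chunk size are illustrative stand-ins, not the project's _process_frame or its config.

import multiprocessing as mp

import pandas as pd

NUM_CPUS = mp.cpu_count()


def count_rows(df_chunk):
    # Illustrative per-chunk worker; stands in for _process_frame
    return len(df_chunk)


def process_in_chunks(in_file, chunksize=1000):
    # pandas returns an iterator of DataFrames when chunksize is set
    reader = pd.read_csv(in_file, dtype=object, skip_blank_lines=True,
                         chunksize=chunksize, iterator=True)

    pool = mp.Pool(processes=NUM_CPUS)
    jobs = [pool.apply_async(count_rows, (chunk,)) for chunk in reader]

    # job.get() blocks and re-raises any exception from the worker
    results = [job.get() for job in jobs]
    pool.close()
    pool.join()
    return sum(results)


if __name__ == '__main__':
    print(process_in_chunks('phi_hashes.csv'))
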
Example #2
    def generate(cls,
                 config,
                 inputdir,
                 outputdir,
                 partner,
                 ask=True,
                 create_tables=True):
        """
        Read the "phi_hashes.csv" file and generate UUIDs.

        Optionally save the "hash -> UUID" mapping to "links.csv"

        .. seealso::
            :meth:`_process_frame`

        """
        cls._validate_config(config)
        in_file = os.path.join(inputdir, config['IN_FILE'])
        cls.log.info("Using {} as input file ({})".format(
            in_file, utils.get_file_size(in_file)))  # noqa
        cls.log.info("Connection HOST:DB - {}/{}".format(
            config['DB_HOST'], config['DB_NAME']))

        if ask:
            confirmed = utils.ask_yes_no(
                "Run [{}] linkage to create files in the [{}] folder?".format(
                    partner, outputdir))

            if not confirmed:
                sys.exit("If you say so...")

        df = cls._prepare_frame(config, inputdir, outputdir)

        frames = []
        investigations = []
        pool = mp.Pool(processes=NUM_CPUS)
        jobs = []

        chunksize = config['LINES_PER_CHUNK']

        # for index, df_source in enumerate(reader):
        for index, group in df.groupby(np.arange(len(df)) // chunksize):
            # cls.log.error("Frame chunk [{}]".format(index))
            df_source = pd.DataFrame(group)
            # The magic happens here...
            job = utils.apply_async(pool, cls._process_frame,
                                    (config, df_source, partner))
            jobs.append(job)

        job_count = len(jobs)
        cls.log.info("Total multiproc jobs: {}".format(job_count))

        # collect the results
        for index, job in enumerate(jobs):
            try:
                df_temp, to_investigate = job.get()
                frames.append(df_temp)
                investigations.extend(to_investigate)

                if index % 100 == 0:
                    cls.log.info("Appended result {} (out of {})".format(
                        index, job_count))
            except Exception as exc:
                cls.log.error("Job [{}] error: {}".format(index, exc))
                cls.log.error(traceback.format_exc())

        pool.close()
        pool.join()

        if config['SAVE_OUT_FILE']:
            cls.save_output_file(config, outputdir, job_count, frames,
                                 investigations)

        return True
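
Example #2 differs from Example #1 mainly in how the chunks are produced: the whole frame comes back from _prepare_frame already in memory, and df.groupby(np.arange(len(df)) // chunksize) splits it into fixed-size groups by integer-dividing the positional index. A small sketch of that splitting idiom, with made-up data and chunk size:

import numpy as np
import pandas as pd

df = pd.DataFrame({'PATID': ['p{}'.format(i) for i in range(10)]})
chunksize = 4

# Rows 0-3 get key 0, rows 4-7 get key 1, rows 8-9 get key 2
for index, group in df.groupby(np.arange(len(df)) // chunksize):
    print(index, len(group), list(group['PATID']))
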
Example #3
    def generate(cls, config, inputdir, outputdir):
        """
        Read the "phi.csv" file and generate "phi_hashes.csv"
        containing two (or more) sha256 strings for each line
        in the input file.

        This method is invoked from:

        .. seealso::

            run_hasher.py

        :param inputdir: directory name for the source file
        :param outputdir: directory name for generated file

        :rtype: DataFrame
        :return: the frame with hashes of the PHI data

        Columns:
            - patid
            - sha_rule_1 (first_last_dob_race)
            - sha_rule_2 (first_last_dob_sex)

        """
        cls._validate_config(config)
        EXPECTED_COLS = config['EXPECTED_COLS']
        ENABLED_RULES = config.get('ENABLED_RULES')

        cls.log.info("Using [{}] as source folder".format(inputdir))
        cls.log.info("Using [{}] as salt".format(config['SALT']))
        cls.log.info("Expecting input file to contain columns: {}".format(
            EXPECTED_COLS))
        cls.log.info("Using [{}] as destination folder".format(outputdir))

        in_file = os.path.join(inputdir, config['IN_FILE'])
        reader = None

        try:
            reader = pd.read_csv(in_file,
                                 sep=config['IN_DELIMITER'],
                                 dtype=object,
                                 skipinitialspace=True,
                                 skip_blank_lines=True,
                                 usecols=list(EXPECTED_COLS),
                                 chunksize=config['LINES_PER_CHUNK'],
                                 iterator=True)
            cls.log.info("Reading data from file: {} ({})".format(
                in_file, utils.get_file_size(in_file)))

        except ValueError as exc:
            cls.log.info("Please check if the actual column names"
                         " in [{}] match the expected column"
                         " names: {}.".format(in_file, sorted(EXPECTED_COLS)))
            cls.log.error("Error: {}".format(exc))
            # Without a valid reader the chunk loop below cannot run
            raise

        frames = []
        pool = mp.Pool(processes=NUM_CPUS)
        jobs = []

        for index, df_source in enumerate(reader):
            cls.log.info("Processing {} lines of frame {}".format(
                config['LINES_PER_CHUNK'], index))
            df_source.fillna('', inplace=True)

            for col in EXPECTED_COLS:
                if col not in sorted(df_source):
                    raise Exception(
                        "The input data frame does not have all "
                        "expected columns: {}".format(EXPECTED_COLS))

            # validate the values constrained to set
            invalid_race = df_source.loc[
                ~df_source['race'].isin(VALID_RACE_VALS)]  # noqa
            invalid_sex = df_source.loc[~df_source['sex'].isin(VALID_SEX_VALS)]

            if len(invalid_race) > 0:
                cls.log.warning("Please check race: {}".format(invalid_race))
                raise Exception("The input file contains an invalid value in "
                                "the `race` column. Please review the specs.")
            if len(invalid_sex) > 0:
                cls.log.warning("Please check sex: {}".format(invalid_sex))
                raise Exception("The input file contains an invalid value in "
                                "the `sex` column. Please review the specs.")

            job = utils.apply_async(pool, cls._process_frame,
                                    (df_source, config))
            jobs.append(job)

        job_count = len(jobs)
        cls.log.info("Total multiproc jobs: {}".format(job_count))

        # collect the results
        for index, job in enumerate(jobs):
            try:
                frames.append(job.get())
                if index % 10 == 0:
                    cls.log.info("Got results for frame {} (out of {})".format(
                        index, job_count))
            except Exception as exc:
                cls.log.error("Job [{}] error: {}".format(index, exc))
                mp.get_logger().error(traceback.format_exc())

        pool.close()
        pool.join()
        cls.log.info("Got all {} frames. Concatenating...".format(job_count))
        df = pd.concat(frames, ignore_index=True)

        # Concatenation can re-order columns so we need to enforce the order
        out_columns = ['PATID']
        out_columns.extend(ENABLED_RULES)

        out_file = os.path.join(outputdir, config['OUT_FILE'])
        utils.frame_to_file(df[out_columns],
                            out_file,
                            delimiter=config['OUT_DELIMITER'])

        cls.log.info("Wrote output file: {} ({} data rows, {})".format(
            out_file, len(df), utils.get_file_size(out_file)))

        # count how many patients did not get a hash generated
        # query_no_hashes = df.query('m == "" & n == ""')
        query_no_hashes = ' & '.join(
            ['{} == ""'.format(rule) for rule in ENABLED_RULES])
        df_no_hashes = df.query(query_no_hashes)
        cls.log.info("The result file contains [{}] patients without hashes."
                     " See some examples below.".format(len(df_no_hashes)))
        cls.log.info(df_no_hashes.head())
        return True
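
Two smaller techniques in Example #3 are worth isolating: flagging rows whose categorical values fall outside an allowed set with ~df[col].isin(...), and counting patients with no hashes by joining one condition per enabled rule into a single df.query string. A sketch with made-up data and value sets (the real ones come from the config and the partner specs):

import pandas as pd

VALID_SEX_VALS = ['F', 'M']                   # illustrative allowed values
ENABLED_RULES = ['sha_rule_1', 'sha_rule_2']  # illustrative rule columns

df = pd.DataFrame({
    'PATID': ['1', '2', '3'],
    'sex': ['F', 'X', 'M'],
    'sha_rule_1': ['abc', '', ''],
    'sha_rule_2': ['def', '', 'ghi'],
})

# Rows whose `sex` value is not in the allowed set
invalid_sex = df.loc[~df['sex'].isin(VALID_SEX_VALS)]
print(invalid_sex)                            # the PATID '2' row

# Builds: 'sha_rule_1 == "" & sha_rule_2 == ""'
query_no_hashes = ' & '.join(['{} == ""'.format(rule) for rule in ENABLED_RULES])
print(df.query(query_no_hashes))              # the PATID '2' row (no hashes at all)
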
Example #4
    def _process_frame(cls, config, session, df_source, partner_code):
        """
        Map every patient row in `df_source` to a linkage UUID.

        Builds a "hash -> UUID" lookup table from the database, processes
        each row in a multiprocessing pool, and writes the UUID and the
        matched hashes back into the result frame.

        :return: tuple of (result frame with PATID/UUID/hash columns,
            list of hashes that need investigation)
        """
        # Init an empty frame and copy the patid from the source
        df = pd.DataFrame()
        df['PATID'] = df_source['PATID']
        investigations = []
        hash_uuid_lut = cls._populate_hash_uuid_lut(config, session, df_source)
        mapped_hashes = {ahash: link.linkage_uuid
                         for ahash, link in hash_uuid_lut.items()
                         if link is not None}
        rules_cache = RuleEntity.get_rules_cache(session)
        cls.log.debug("Found {} linked hashes in db: {}"
                      .format(len(mapped_hashes), mapped_hashes))

        # the rules are ordered by their importance
        rules = config['ENABLED_RULES'].values()
        patients_with_no_hashes = []

        #    TODO: Insert multiprocessing pool here
        jobs = []
        frames = []
        pool = mp.Pool(processes=NUM_CPUS)

        #    TODO: take for idx,row... loop and create function to call with
        #        utils.apply_async and pass args
        for index, row in df_source.iterrows():
            patid = row['PATID']
            pat_hashes = {rule: row[rule] for rule in rules if row[rule] != ''}

            if len(pat_hashes) < 1:
                patients_with_no_hashes.append(patid)

            cls.log.debug("Parsing row for patient {} with {} hashes"
                          .format(patid, len(pat_hashes)))

            #    Extract results from a pool.apply_aync call using get()
            #    See:
            #    https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.AsyncResult
            job = utils.apply_async(pool,
                                    cls._process_patient_row,
                                    (patid, pat_hashes.copy(), hash_uuid_lut,
                                     rules_cache, config, partner_code))
            jobs.append(job)

        job_count = len(jobs)
        cls.log.info("Total multiproc jobs: {}".format(job_count))

        # collect the results
        for index, job in enumerate(jobs):
            try:
                frames.append(job.get())
                if index % 10 == 0:
                    cls.log.info("Got results for frame {} (out of {})"
                                 .format(index, job_count))
            except Exception as exc:
                cls.log.error("Job [{}] error: {}".format(index, exc))
                mp.get_logger().error(traceback.format_exc())

        pool.close()
        pool.join()
        cls.log.info("Got all {} frames. Concatenating...".format(job_count))

        #    TODO: setup loop over frames to extract links and to_investigate
        #    Kludgy as we iterate again over df_source.iterrows() to extract PATID
        for frame, row in zip(frames, df_source.iterrows()):
            patid = row[1]['PATID']
            links = frame[0]
            to_investigate = frame[1]
            #    !OJO! Printing to console/writing to log can slow processes...
            #        Do we need to know the number of links created for ea patid?
            #        Is it possible to write this to the df and then to SQL table/csv file?
            cls.log.debug("Created {} links for patid: {}".format(len(links), patid))  # noqa

            if len(to_investigate) > 0:
                investigations.append(to_investigate)

            i = 0
            for ahash, link in links.items():
                i += 1
                df.loc[df['PATID'] == patid, 'UUID'] = (link.linkage_uuid
                                                        if link else '')
                df.loc[df['PATID'] == patid, "hash_{}".format(i)] = ahash

        job_count = len(jobs)
        cls.log.info("Total multiproc jobs: {}".format(job_count))

        cls.log.warning("{} out of {} patients are missing both hashes: {}"
                        .format(len(patients_with_no_hashes), len(df),
                                patients_with_no_hashes))
        return df, investigations
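
The final loop of Example #4 writes each patient's results back into the output frame with boolean-mask assignments (df.loc[df['PATID'] == patid, col] = value), adding one hash_<i> column per link. A small sketch of that assignment pattern, using a hypothetical dict of results in place of the real linkage objects:

import pandas as pd

# Hypothetical per-patient results: patid -> {hash: linkage_uuid or None}
results = {
    'p1': {'aaa111': 'uuid-1', 'bbb222': 'uuid-2'},
    'p2': {'ccc333': None},
}

df = pd.DataFrame({'PATID': ['p1', 'p2']})

for patid, links in results.items():
    for i, (ahash, uuid) in enumerate(links.items(), start=1):
        # Only rows matching this PATID are updated; missing columns are created
        df.loc[df['PATID'] == patid, 'UUID'] = uuid if uuid else ''
        df.loc[df['PATID'] == patid, 'hash_{}'.format(i)] = ahash

print(df)
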