示例#1
0
def _get_csv_mrns(args):
    mrns = set()
    if args.path_to_csv_deidentified is not None:

        # Get list of full paths to CSV files
        fpaths = []
        if os.path.isdir(args.path_to_csv):
            for root, dirs, fnames in os.walk(args.path_to_csv):
                for fname in fnames:
                    split = os.path.splitext(fname)
                    if split[-1] != CSV_EXT:
                        continue
                    fpath = os.path.join(root, fname)
                    fpaths.append(fpath)

        # If user gave path to single CSV, instead of a directory, use that path
        else:
            fpaths.append(args.path_to_csv)

        # Iterate over paths to CSV files
        for fpath in fpaths:
            try:
                _mrns = patient_csv_to_set(patient_csv=fpath)
            except ValueError:
                print(
                    f"Could not get MRNs from {fpath}, skipping de-identification"
                )
                global path_of_csv_to_skip
                path_of_csv_to_skip.add(fpath)
                continue
            _mrns = {int(mrn) for mrn in _mrns}
            mrns |= _mrns

    return mrns
示例#2
0
    def test_patient_csv(self, patient_csv: DATA_SPLIT):
        """
        Check that patient_csv_to_set returns exactly the expected patient IDs.

        :param patient_csv: <DATA_SPLIT> Pair unpacked as (path to the CSV
               file, expected patient IDs).
        """
        csv_path, patient_ids = patient_csv
        sample_set = patient_csv_to_set(csv_path)

        # Read the first line inside a context manager so the file handle is
        # closed deterministically instead of being left open.
        with open(csv_path) as csv_file:
            first_line = csv_file.readline()

        # The first line of the file must not be a bare "patient_id" header.
        assert first_line != "patient_id\n"
        # Every expected ID is present, and the set holds nothing extra.
        assert all(patient_id in sample_set for patient_id in patient_ids)
        assert len(patient_ids) == len(sample_set)
示例#3
0
    def test_patient_csv_duplicates(self, patient_csv: DATA_SPLIT):
        """
        Check that patient_csv_to_set de-duplicates a CSV with repeated lines.

        The parsed set must contain exactly the expected patient IDs, and the
        CSV file itself must contain at least one repeated line.

        :param patient_csv: <DATA_SPLIT> Pair unpacked as (path to the CSV
               file, expected patient IDs).
        """
        csv_path, patient_ids = patient_csv
        sample_set = patient_csv_to_set(csv_path)

        # Read the file once inside a context manager so the handle is always
        # closed; the lines serve both the header and the duplicate checks.
        with open(csv_path) as csv_file:
            lines = csv_file.readlines()

        # An empty file yields "" here, matching what readline() would return.
        first_line = lines[0] if lines else ""

        # The first line of the file must not be a bare "patient_id" header.
        assert first_line != "patient_id\n"
        # Every expected ID is present, and the set holds nothing extra.
        assert all(patient_id in sample_set for patient_id in patient_ids)
        assert len(patient_ids) == len(sample_set)

        # A repeated line exists exactly when de-duplicating shrinks the list.
        has_dupe = len(set(lines)) != len(lines)
        assert has_dupe
示例#4
0
    def check_structure(self,
                        patient_csv: str = None,
                        remove_flag: bool = False):
        """
        Checks if edw_dir is structured properly.

        The ADT file is checked first via ``_check_adt``. Then every
        sub-folder of ``edw_dir`` (one per MRN) is inspected: it should hold
        only folders (one per CSN), and each CSN folder should hold exactly
        the files listed in ``EDW_FILES`` (except the ADT file), whose columns
        are validated by ``_check_file_columns``. Problems are reported
        through ``logging``; nothing is returned.

        :param patient_csv: <str> Path to CSV with MRNs to parse; no other MRNs
               will be parsed.
        :param remove_flag: <bool> Flag to remove files with wrong or empty format.
        """
        self._check_adt(remove_flag)

        # Map each EDW file name to the flat set of column names it must have.
        expected_columns = {}
        for element in EDW_FILES:
            columns: Set[str] = set()
            for col in EDW_FILES[element]["columns"]:
                # An entry is either one column name or a list of names.
                # Accumulate with a union: intersecting with the initially
                # empty set would leave it empty forever.
                columns |= set(col if isinstance(col, list) else [col])
            expected_columns[EDW_FILES[element]["name"]] = columns

        # The ADT file is checked separately above, so it is not expected
        # inside the CSN folders.
        expected_files = set(expected_columns.keys())
        expected_files.remove(EDW_FILES["adt_file"]["name"])

        mrns_folders = [
            os.path.join(self.edw_dir, folder)
            for folder in os.listdir(self.edw_dir)
            if os.path.isdir(os.path.join(self.edw_dir, folder))
        ]
        mrns = patient_csv_to_set(patient_csv) if patient_csv else None
        for mrn_folder in mrns_folders:
            # mrn_folder is a full path, so compare only its last component
            # against the requested MRNs.
            # NOTE(review): assumes patient_csv_to_set yields MRNs as strings
            # matching the folder names -- confirm.
            if patient_csv and os.path.basename(mrn_folder) not in mrns:
                continue
            csns_folders = [
                os.path.join(mrn_folder, folder)
                for folder in os.listdir(mrn_folder)
                if os.path.isdir(os.path.join(mrn_folder, folder))
            ]
            unexpected_files = [
                os.path.join(mrn_folder, file_name)
                for file_name in os.listdir(mrn_folder)
                if not os.path.isdir(os.path.join(mrn_folder, file_name))
            ]
            # Check that there is at least one folder inside each mrn folder.
            if len(csns_folders) < 1:
                logging.error(
                    f"Wrong folder format: {mrn_folder} doesn't contain any folder.",
                )
            # Check if there are any unexpected files in mrns folders.
            if len(unexpected_files) > 0:
                logging.warning(
                    f"Unexpected files: {sorted(unexpected_files)}. Just "
                    "folders should be stored inside mrns folders.", )
            for csn_folder in csns_folders:
                files = set(os.listdir(csn_folder))
                missing_files = expected_files.difference(files)
                unexpected = files.difference(expected_files)
                # Check that inside each csn folder are found all the
                # expected .csv.
                if len(missing_files) > 0:
                    logging.error(
                        "Wrong folder format: the files "
                        f"{sorted(missing_files)} were not found in the "
                        f"input folder {csn_folder}.", )
                # Check that all the expected_files have the expected format.
                for file_name in expected_files.intersection(files):
                    full_file_path = os.path.join(csn_folder, file_name)
                    file_expected_columns = expected_columns[file_name]
                    self._check_file_columns(
                        full_file_path,
                        file_expected_columns,
                        remove_flag,
                    )
                # Check if there are any unexpected files in csns folders.
                if len(unexpected) > 0:
                    unexpected_list = [
                        os.path.join(csn_folder, unexpected_file)
                        for unexpected_file in unexpected
                    ]
                    logging.warning(
                        f"Unexpected files: {sorted(unexpected_list)}. Just "
                        "the specific .csv files should be saved in csns folders.",
                    )