Example #1
import os

import pandas
import tensorflow as tf  # the code below uses TF 1.x style ``tf.logging``

# helper imports; the module paths are assumed from the NiftyNet codebase
from niftynet.utilities.filename_matching import KeywordsMatching
from niftynet.utilities.util_csv import match_and_write_filenames_to_csv


def load_and_merge_csv_files(data_param, default_folder=None):
    """
    Converts a list of csv_files in data_param
    in to a joint list of file names (by matching the first column)
    This function returns a <pandas.core.frame.DataFrame> of the
    joint list
    """
    if not data_param:
        tf.logging.fatal('nothing to load, please check reader.names')
        raise ValueError
    _file_list = None
    for modality_name in data_param:
        try:
            csv_file = data_param[modality_name].csv_file
        except AttributeError:
            tf.logging.fatal('unrecognised parameter format')
            raise
        if hasattr(data_param[modality_name], 'path_to_search') and \
                len(data_param[modality_name].path_to_search):
            tf.logging.info('[%s] search file folders, writing csv file %s',
                            modality_name, csv_file)
            section_tuple = data_param[modality_name].__dict__.items()
            matcher = KeywordsMatching.from_tuple(section_tuple,
                                                  default_folder)
            match_and_write_filenames_to_csv([matcher], csv_file)
        else:
            tf.logging.info(
                '[%s] using existing csv file %s, skipped folder search',
                modality_name, csv_file)
        if not os.path.isfile(csv_file):
            tf.logging.fatal("[%s] csv file %s not found.", modality_name,
                             csv_file)
            raise IOError
        csv_list = pandas.read_csv(csv_file,
                                   header=None,
                                   names=['subject_id', modality_name])
        if _file_list is None:
            _file_list = csv_list
            continue

        # merge _file_list based on subject_ids (first column of each csv)
        n_rows = _file_list.shape[0]
        _file_list = pandas.merge(_file_list, csv_list, on='subject_id')
        if _file_list.shape[0] != n_rows:
            tf.logging.warning("rows not matched in %s", csv_file)

    if _file_list is None or _file_list.size == 0:
        tf.logging.fatal(
            "empty filename lists, please check the csv "
            "files. (remove csv_file keyword if it is in the config file "
            "to automatically search folders and generate new csv "
            "files again)\n\n"
            "Please note in the matched file names, each subject name are "
            "matched by removing any keywords listed `filename_contains` "
            "in the config.\n\n")
        raise IOError
    return _file_list
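A minimal usage sketch for the function above, not taken from the source: the section names, folder paths, and the ``SimpleNamespace`` stand-in for parsed config sections are all assumptions.

# hypothetical config: two input sections pointing at folders to search
from types import SimpleNamespace

data_param = {
    'T1': SimpleNamespace(csv_file='T1.csv', path_to_search='./data/T1',
                          filename_contains=('T1',)),
    'T2': SimpleNamespace(csv_file='T2.csv', path_to_search='./data/T2',
                          filename_contains=('T2',)),
}
file_list = load_and_merge_csv_files(data_param, default_folder='./data')
print(list(file_list.columns))  # ['subject_id', 'T1', 'T2']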
Example #2

    def grep_files_by_data_section(self, modality_name):
        """
        list all files by a given input data section::
            if the ``csv_file`` property of the section corresponds to a file,
                read the list from the file;
            otherwise
                write the list to ``csv_file``.

        :return: a table with two columns,
                 named ``(COLUMN_UNIQ_ID, modality_name)``.
        """
        if modality_name not in self.data_param:
            tf.logging.fatal('unknown section name [%s], '
                             'current input section names: %s.',
                             modality_name, list(self.data_param))
            raise ValueError

        # an input data section must have a ``csv_file`` field for loading
        # or writing filename lists
        try:
            csv_file = self.data_param[modality_name].csv_file
            if not os.path.isfile(csv_file):
                # writing to the same folder as data_split_file
                csv_file = os.path.join(os.path.dirname(self.data_split_file),
                                        '{}.csv'.format(modality_name))

        except (AttributeError, TypeError):
            tf.logging.fatal('Missing `csv_file` field in the config file, '
                             'unknown configuration format.')
            raise

        if hasattr(self.data_param[modality_name], 'path_to_search') and \
                self.data_param[modality_name].path_to_search:
            tf.logging.info('[%s] search file folders, writing csv file %s',
                            modality_name, csv_file)
            section_properties = self.data_param[modality_name].__dict__.items()
            # grep files by section properties and write csv
            try:
                matcher = KeywordsMatching.from_tuple(
                    section_properties,
                    self.default_image_file_location)
                match_and_write_filenames_to_csv([matcher], csv_file)
            except (IOError, ValueError) as reading_error:
                tf.logging.warning('Ignoring input section: [%s], '
                                   'due to the following error:',
                                   modality_name)
                tf.logging.warning(repr(reading_error))
                return pandas.DataFrame(
                    columns=[COLUMN_UNIQ_ID, modality_name])
        else:
            tf.logging.info(
                '[%s] using existing csv file %s, skipped filenames search',
                modality_name, csv_file)

        if not os.path.isfile(csv_file):
            tf.logging.fatal(
                '[%s] csv file %s not found.', modality_name, csv_file)
            raise IOError
        try:
            csv_list = pandas.read_csv(
                csv_file,
                header=None,
                dtype=str,  # read both columns as strings
                names=[COLUMN_UNIQ_ID, modality_name],
                skipinitialspace=True)
        except Exception as csv_error:
            tf.logging.fatal(repr(csv_error))
            raise
        return csv_list
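The per-section tables returned above share the ``COLUMN_UNIQ_ID`` column, so they can be joined into a single file list in the same way as Example #1; a hedged sketch (the helper name is made up, and it assumes ``COLUMN_UNIQ_ID`` names the shared subject-id column):

import pandas

def merge_section_tables(tables, id_column):
    # inner-join the per-section tables on the shared id column
    # (pass COLUMN_UNIQ_ID); unmatched subjects are dropped,
    # mirroring the merge in Example #1
    merged = None
    for table in tables:
        merged = table if merged is None else pandas.merge(
            merged, table, on=id_column)
    return merged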
Example #3
    def grep_files_by_data_section(self, modality_name):
        """
        list all files by a given input data section::
            if the ``csv_file`` property of the section corresponds to a file,
                read the list from the file;
            otherwise
                write the list to ``csv_file``.

        :return: a table with two columns,
                 named ``(COLUMN_UNIQ_ID, modality_name)``.
        """
        if modality_name not in self.data_param:
            tf.logging.fatal(
                'unknown section name [%s], '
                'current input section names: %s.', modality_name,
                list(self.data_param))
            raise ValueError

        # an input data section must have a ``csv_file`` field for loading
        # or writing filename lists
        try:
            csv_file = self.data_param[modality_name].csv_file
            if not os.path.isfile(csv_file):
                # writing to the same folder as data_split_file
                csv_file = os.path.join(os.path.dirname(self.data_split_file),
                                        '{}.csv'.format(modality_name))

        except (AttributeError, TypeError):
            tf.logging.fatal('Missing `csv_file` field in the config file, '
                             'unknown configuration format.')
            raise

        if hasattr(self.data_param[modality_name], 'path_to_search') and \
                self.data_param[modality_name].path_to_search:
            tf.logging.info('[%s] search file folders, writing csv file %s',
                            modality_name, csv_file)
            section_properties = self.data_param[modality_name].__dict__.items()
            # grep files by section properties and write csv
            try:
                matcher = KeywordsMatching.from_tuple(
                    section_properties, self.default_image_file_location)
                match_and_write_filenames_to_csv([matcher], csv_file)
            except (IOError, ValueError) as reading_error:
                tf.logging.warning(
                    'Ignoring input section: [%s], '
                    'due to the following error:', modality_name)
                tf.logging.warning(repr(reading_error))
                return pandas.DataFrame(
                    columns=[COLUMN_UNIQ_ID, modality_name])
        else:
            tf.logging.info(
                '[%s] using existing csv file %s, skipped filenames search',
                modality_name, csv_file)

        if not os.path.isfile(csv_file):
            tf.logging.fatal('[%s] csv file %s not found.', modality_name,
                             csv_file)
            raise IOError
        try:
            csv_list = pandas.read_csv(csv_file,
                                       header=None,
                                       dtype=str,  # read both columns as strings
                                       names=[COLUMN_UNIQ_ID, modality_name],
                                       skipinitialspace=True)
        except Exception as csv_error:
            tf.logging.fatal(repr(csv_error))
            raise
        return csv_list
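For reference, the filename lists read and written above are headerless, two-column csv files (subject id, file path); a small sketch of producing one by hand, with a made-up section name, subject ids, and paths:

import pandas

# hypothetical filename list for a 'T1' section: no header row,
# first column is the subject id, second column is the matched file path
pandas.DataFrame(
    [['sub001', '/data/T1/sub001_T1.nii.gz'],
     ['sub002', '/data/T1/sub002_T1.nii.gz']]
).to_csv('T1.csv', header=False, index=False)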