Example #1
    def __init__(self, dataset_dir, split_str, labels_path, transform_fns):
        """
        """
        split_path = path.join(dataset_dir, f'{split_str}.csv')
        self.split_df = pd.read_csv(split_path, index_col=0)
        self.split_str = split_str
        self.labels_df = pd.read_csv(labels_path, index_col=0, header=[0, 1])
        self.exam_ids = list(self.split_df.index.unique())

        self.transform_fns = transform_fns
        self.shuffle_transform = any(f['fn'] == 'shuffle' for f in transform_fns)

        self.instance_transform = None
        for f in transform_fns:
            # only extract instances if asked to do so and specified for split
            if f['fn'] == 'extract_instance' and split_str in f['args']['splits']:
                self.instance_transform = f['args']
                logger.info(f"using instance extraction on {f['args']['splits']} splits")
                break
        if self.instance_transform is not None and self.instance_transform.get('instance_only', False):
            # only access exam_ids with instance level labels
            exam_ids = []
            for exam_id in self.exam_ids:
                rows = self.split_df.loc[exam_id]
                if isinstance(rows, pd.Series):
                    if not np.isnan(rows['label.lv']):
                        exam_ids.append(exam_id)
                else:
                    if not np.isnan(rows.iloc[0]['label.lv']):
                        exam_ids.append(exam_id)
            logger.info(f'using {len(exam_ids)} of {len(self.exam_ids)} exam_ids')
            self.exam_ids = exam_ids
        else:
            logger.info(f'using {len(self.exam_ids)} exam_ids')

        X_dict = {'exam_ids': []}
        Y_dict = {
            'primary':  [],
            'primary_multiclass': [],
            '2normal_binary': []        # labels: control, 1, 2 (normal) | 3, 4 (abnormal)
        }

        for exam_id in self.exam_ids:
            X_dict['exam_ids'].append(exam_id)

            y_dict = self.get_y(exam_id)
            for t, label in y_dict.items():
                Y_dict[t].append(label)

        Y_dict = {k: torch.from_numpy(np.array(v)) for k, v in Y_dict.items()}
        EmmentalDataset.__init__(self, 'cow-tus-dataset', X_dict=X_dict, Y_dict=Y_dict)
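A note on the `isinstance(rows, pd.Series)` branch above: `.loc[exam_id]` returns a DataFrame when the exam id repeats in the index (several loops per exam) and a plain Series when it appears only once, so both shapes have to be handled. Below is a minimal standalone sketch with made-up toy data (column names mirror the excerpt; the values are invented) showing that distinction and the same instance-only filter:

import numpy as np
import pandas as pd

# Toy split table: exam_id is the index and may repeat (one row per loop).
split_df = pd.DataFrame(
    {'label.lv': [1.0, 1.0, np.nan]},
    index=pd.Index(['exam_a', 'exam_a', 'exam_b'], name='exam_id'),
)

# Repeated index value -> DataFrame; unique index value -> Series.
print(type(split_df.loc['exam_a']).__name__)  # DataFrame
print(type(split_df.loc['exam_b']).__name__)  # Series

# Keep only exams whose first row has an instance-level 'label.lv'.
exam_ids = []
for exam_id in split_df.index.unique():
    rows = split_df.loc[exam_id]
    first = rows if isinstance(rows, pd.Series) else rows.iloc[0]
    if not np.isnan(first['label.lv']):
        exam_ids.append(exam_id)
print(exam_ids)  # ['exam_a']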
Example #2
    def __init__(self, dataset_dir, split_str, labels_path, transform_fns):
        """
        """
        split_path = path.join(dataset_dir, f'{split_str}.csv')
        self.split_df = pd.read_csv(split_path, index_col=0)
        self.labels_df = pd.read_csv(labels_path, index_col=0, header=[0, 1])
        self.loop_idxs = range(len(self.split_df))

        # use df.iloc because loop_ids are not unique...
        loop_idxs = []
        for loop_idx in self.loop_idxs:
            row = self.split_df.iloc[loop_idx]
            loop_type = row['exdir.loop_type']
            if f'label.{loop_type}' in row.keys() and not np.isnan(row[f'label.{loop_type}']):
                loop_idxs.append(loop_idx)
        logger.info(f'using {len(loop_idxs)} of {len(self.loop_idxs)} loop_idxs')
        self.loop_idxs = loop_idxs

        self.transform_fns = transform_fns

        X_dict = {'loop_idxs': []}
        Y_dict = {
            'primary':  [],
            'primary_multiclass': [],
            '2normal_binary': []        # labels: control, 1, 2 (normal) | 3, 4 (abnormal)
        }

        for loop_idx in self.loop_idxs:
            X_dict['loop_idxs'].append(loop_idx)

            y_dict = self.get_y(loop_idx)
            for t, label in y_dict.items():
                Y_dict[t].append(label)

        Y_dict = {k: torch.from_numpy(np.array(v)) for k, v in Y_dict.items()}
        EmmentalDataset.__init__(self, 'cow-tus-dataset', X_dict=X_dict, Y_dict=Y_dict)
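In the loop-level variant, each row carries its own loop type, and only the matching `label.<loop_type>` column decides whether the loop is kept. A standalone sketch with invented toy rows (column names follow the excerpt) illustrating that filter:

import numpy as np
import pandas as pd

# Toy loop table: one row per loop; 'exdir.loop_type' selects the label column
# that applies to that loop.
split_df = pd.DataFrame({
    'exdir.loop_type': ['lv', 'rv', 'lv'],
    'label.lv': [1.0, np.nan, np.nan],
    'label.rv': [np.nan, 2.0, np.nan],
})

loop_idxs = []
for loop_idx in range(len(split_df)):
    row = split_df.iloc[loop_idx]
    col = f"label.{row['exdir.loop_type']}"
    if col in row.keys() and not np.isnan(row[col]):
        loop_idxs.append(loop_idx)

print(loop_idxs)  # [0, 1] -- the last loop has no label for its loop type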