Example #1
    def predict_path(self, path):
        """ Predict path with class folders """

        # prediction batch sizes
        batch_size = 256

        logging.info("Initializing generator")
        generator = self.datagen.flow_from_directory(
                path,
                target_size=self.model.input_shape[1:3],
                color_mode=self.color_mode,
                batch_size=batch_size,
                class_mode='sparse',
                seed=self.cfg_model['random_seed'],
                shuffle=False)

        # predict whole set
        logging.info("Predicting images in path")

        # calculate number of iterations to make
        steps_remainder = generator.n % batch_size
        if steps_remainder > 0:
            extra_step = 1
        else:
            extra_step = 0

        preds = self.model.predict_generator(
            generator,
            steps=(generator.n // batch_size) + extra_step,
            workers=1,
            use_multiprocessing=bool(self.cfg_model['multi_processing']))

        logging.debug("Predicted %s of %s images" % (preds.shape[0],
                                                     generator.n,
                                                     ))
        # check size and log critical
        if preds.shape[0] != generator.n:
            logging.critical("Number of Preds %s don't match" +
                             "number of images %s" % (preds.shape[0],
                                                      generator.n,
                                                      ))

        # consolidate output
        logging.info("Creating Result DF")
        res = self._create_result_df(preds, generator.filenames,
                                     generator.classes,
                                     generator.class_indices,
                                     image_links="")

        return res
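
# Note: the remainder handling in predict_path is just a ceiling division.
# A minimal, behavior-equivalent sketch assuming Python 3 division
# semantics (_steps_for is an illustrative name, not part of the original):
import math

def _steps_for(n_samples, batch_size):
    """ Number of predict steps needed to cover n_samples """
    return int(math.ceil(n_samples / batch_size))

assert _steps_for(1000, 256) == 4  # 3 full batches + 1 partial
assert _steps_for(1024, 256) == 4  # exact multiple, no extra step
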
    def _preparePaths(self, tag, clear_old_files):
        """ prepare paths to save training data """

        # create directories
        root_path = cfg_path['images'] + tag

        # delete old files
        if clear_old_files:
            if os.path.exists(root_path):
                logging.debug("Deleting %s" % root_path)
                shutil.rmtree(root_path)

        # create root directory
        if not os.path.exists(root_path):
            logging.debug("Creating %s" % root_path)
            os.mkdir(root_path)

        # delete all non-relevant class directories
        if not clear_old_files:
            all_class_dirs = os.listdir(root_path)
            delete_dirs = set(all_class_dirs) - set(self.classes)
            for d in delete_dirs:
                logging.debug("Removing directory %s" %
                              os.path.join(root_path, d))
                shutil.rmtree(os.path.join(root_path, d))

        # create class directories
        for cl in self.classes:
            class_path = os.path.join(root_path, cl)
            if not os.path.exists(class_path):
                logging.debug("Creating %s" % class_path)
                os.mkdir(class_path)

        return root_path
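
# For illustration: _preparePaths yields a layout like
#   <cfg_path['images']><tag>/<class>/
# with one folder per class under the tag directory. A standalone sketch of
# the same idea (prepare_class_dirs is a hypothetical name; os.makedirs
# with exist_ok makes it idempotent):
import os

def prepare_class_dirs(root, classes):
    """ Create root and one subdirectory per class (idempotent) """
    for cl in classes:
        os.makedirs(os.path.join(root, cl), exist_ok=True)
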
Example #3
        async def download_coroutine(semaphore, session, key, url):
            # with async_timeout.timeout(180):
            async with semaphore:
                async with session.get(url) as response:
                    while True:
                        # read() without a size fetches the whole body, so
                        # this loop runs at most twice
                        chunk = await response.content.read()
                        if not chunk:
                            break
                        try:
                            img = Image.open(BytesIO(chunk))
                            images_dict[key] = img
                        except Exception:
                            logging.warning("Could not access image: %s "
                                            "with id %s" % (url, key))
                            success = False
                            counter = 0
                            # retries stay disabled while n_attempts is 0;
                            # set it > 0 to enable the loop below
                            n_attempts = 0
                            while (not success) and (counter < n_attempts):
                                logging.debug("Trying again")
                                # don't block the event loop while waiting
                                await asyncio.sleep(0.1)
                                try:
                                    chunk = await response.content.read()
                                    img = Image.open(BytesIO(chunk))
                                    images_dict[key] = img
                                    success = True
                                except Exception:
                                    counter += 1
                                    logging.warning("Failed attempt %s / %s" %
                                                    (counter, n_attempts))
                            # add to failures list
                            if not success:
                                failures['urls'].append(url)
                                failures['ids'].append(key)
                                # log failures
                                for u, i in zip(failures['urls'],
                                                failures['ids']):
                                    logging.warning("Failed to access id: %s "
                                                    "on url: %s" % (i, u))
                # the async context manager releases the response, so no
                # explicit response.release() call is needed
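
# A sketch of how download_coroutine is typically driven: a shared semaphore
# bounds concurrency while asyncio.gather awaits all downloads. The names
# run_downloads and max_concurrent are illustrative, and this assumes
# download_coroutine is reachable in the enclosing scope:
import asyncio
import aiohttp

async def run_downloads(url_dict, max_concurrent=20):
    semaphore = asyncio.Semaphore(max_concurrent)
    async with aiohttp.ClientSession() as session:
        tasks = [download_coroutine(semaphore, session, key, url)
                 for key, url in url_dict.items()]
        await asyncio.gather(*tasks)
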
    def saveSubjectSetOnDisk(self, overwrite=True):
        """ Save all Subjects / images in class specific folders """

        # check if subject set is already on disk
        if not overwrite:
            file_path = self.cfg_path['db'] + 'subject_set_used.json'
            if os.path.isfile(file_path):
                logging.debug("Subject set %s found, not overwriting" %
                              file_path)
                return None
        # to retry saving in case of connection errors while fetching urls
        counter = 0
        success = False
        n_trials = 99
        while (not success) and (counter < n_trials):
            try:
                self.subject_set.saveImagesOnDisk(set_name='all',
                                                  cfg=self.cfg,
                                                  cfg_path=self.cfg_path)
                success = True
            except Exception:
                # log exception
                logging.exception("saveSubjectSetOnDisk failed")
                counter += 1
                logging.info("Starting attempt %s / %s" % (counter, n_trials))
        if not success:
            raise IOError("Could not save subject set on disk")
        else:
            # remove unsuccessfully processed subjects
            self.subject_set.removeSubjectsWithoutAllImages()

            # save subject set containing only successfully processed
            # subjects to disk
            self.subject_set.save(path=self.cfg_path['db'] +
                                  'subject_set_used.json')
            logging.info("Saved %s to disk" % (self.cfg_path['db'] +
                         'subject_set_used.json'))

            # print label distribution
            self.subject_set.printLabelDistribution()
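
# The retry pattern in saveSubjectSetOnDisk generalizes to a small helper;
# a sketch under the same assumptions (retry_call and delay are hypothetical
# names, not part of the original code):
import logging
import time

def retry_call(fn, n_trials=99, delay=0.0):
    """ Call fn(); on exception retry up to n_trials times """
    for attempt in range(1, n_trials + 1):
        try:
            return fn()
        except Exception:
            logging.exception("Attempt %s / %s failed", attempt, n_trials)
            time.sleep(delay)
    raise IOError("Could not complete call after %s attempts" % n_trials)
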
    def _classMapper(self, ids, labels):
        """ Map Classes """

        # prepare result lists
        labels_final = list()
        ids_final = list()

        # loop over all labels and ids
        for label, i in zip(labels, ids):
            if self.class_list is not None:
                if label in self.class_list:
                    ids_final.append(i)
                    labels_final.append(label)
            elif self.class_mapper is not None:
                if label in self.class_mapper:
                    new_label = self.class_mapper[label]
                    labels_final.append(new_label)
                    ids_final.append(i)
        logging.debug("ClassMapper contains %s ids and %s labels" %
                      (len(ids_final), len(labels_final)))

        return ids_final, labels_final
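
# Worked example of the two _classMapper modes (values are illustrative):
# given ids = ['1', '2', '3'] and labels = ['cat', 'lion', 'zebra'],
#   class_list = ['cat', 'dog']     -> (['1'], ['cat'])   (filter only)
#   class_mapper = {'lion': 'cat'}  -> (['2'], ['cat'])   (rename and keep)
# labels covered by neither structure are silently dropped.
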
    def createTrainTestSplit(self, save_to_disk=False, split_mode="1_on_1"):
        """ create Test / Train / Validation splits """

        # get random seed
        rand = self.random_state

        # get all subject ids and their labels
        ids, labels = self.subject_set.getAllIDsLabels()

        logging.debug('Key 10127334 in set: %s' % ('10127334' in set(ids)))

        # prepare meta data dictionary for all subjects
        meta_data = dict()
        for i in ids:
            meta_data[i] = self.subject_set.getSubject(i).getMetaData()

        # create splitting id to split subjects on, using original, unmapped
        # labels
        ids_orig, split_ids, split_labels =\
            createSplitIDs(ids, labels, meta_data=meta_data,
                           split_mode=split_mode)

        # map split ids to orig ids
        split_id_mapper = dict()
        for jj in range(0, len(split_ids)):
            if split_ids[jj] not in split_id_mapper:
                split_id_mapper[split_ids[jj]] = [ids_orig[jj]]
            else:
                split_id_mapper[split_ids[jj]].append(ids_orig[jj])

        # map orig ids to split ids
        id_to_split_id_mapper = dict()
        for k, v in split_id_mapper.items():
            for i in v:
                id_to_split_id_mapper[i] = k

        # map labels to classes & keep only relevant ids
        ids, labels = self._classMapper(ids, labels)

        # if equal class sizes, cut larger classes to size of smallest
        if self.equal_class_sizes:
            ids, labels = self._balancedSampling(ids, labels)

        # map split ids to mapped labels
        split_ids, split_labels = self._classMapper(split_ids, split_labels)

        # create id to label mapper
        class_mapper_id = dict()
        for i, l in zip(ids, labels):
            class_mapper_id[i] = l

        # create split id to label mapper
        class_mapper_split_id = dict()
        for i, l in zip(split_ids, split_labels):
            class_mapper_split_id[i] = l

        # get rid of all split ids of ids which have been removed by
        # class mapper and balanced sampling
        split_ids = [id_to_split_id_mapper[i] for i in ids]
        split_labels = [class_mapper_split_id[i] for i in split_ids]

        # deduplicate splitting ids to be used in creating test / train splits
        split_ids_unique_dict = OrderedDict()
        for spl_id in split_ids:
            if spl_id not in split_ids_unique_dict:
                split_ids_unique_dict[spl_id] = 1
        split_ids_unique = list(split_ids_unique_dict.keys())
        split_labels_unique = [class_mapper_split_id[x]
                               for x in split_ids_unique]

        # split id to id mapper after deduplication
        split_id_to_id_mapper = dict()
        for spl, ii in zip(split_ids, ids):
            if spl not in split_id_to_id_mapper:
                split_id_to_id_mapper[spl] = [ii]
            else:
                split_id_to_id_mapper[spl].append(ii)

        # log class sizes
        class_counts = dict()
        logging.debug("Class distribution overall on splitting ids")
        for dd in split_labels_unique:
            if dd not in class_counts:
                class_counts[dd] = 1
            else:
                class_counts[dd] += 1
        for k, v in class_counts.items():
            logging.debug("Class %s has %s Obs" % (k, v))

        # training and test split
        # reverse train/test to guarantee the test set is the same for
        # equal test_size samples on identical ids
        id_test_s, id_train_s = train_test_split(split_ids_unique,
                                                 train_size=self.test_size,
                                                 test_size=self.train_size,
                                                 stratify=split_labels_unique,
                                                 random_state=int(rand))

        # validation split
        labels_s_val = [class_mapper_split_id[x] for x in id_test_s]

        # log class sizes
        class_counts = dict()
        logging.debug("Class distribution test/val on splitting ids")
        for dd in labels_s_val:
            if dd not in class_counts:
                class_counts[dd] = 1
            else:
                class_counts[dd] += 1
        for k, v in class_counts.items():
            logging.debug("Class %s has %s Obs" % (k, v))

        id_test_s, id_val_s = train_test_split(id_test_s,
                                               train_size=0.5,
                                               stratify=labels_s_val,
                                               random_state=int(rand))

        # map split ids to original ids
        id_train = [list(split_id_to_id_mapper[i]) for i in id_train_s]
        id_test = [list(split_id_to_id_mapper[i]) for i in id_test_s]
        id_val = [list(split_id_to_id_mapper[i]) for i in id_val_s]

        # get rid of sublists
        id_train = [item for sublist in id_train for item in sublist]
        id_test = [item for sublist in id_test for item in sublist]
        id_val = [item for sublist in id_val for item in sublist]

        # generate new subject sets
        train_set = SubjectSet(labels=self.classes)
        test_set = SubjectSet(labels=self.classes)
        val_set = SubjectSet(labels=self.classes)

        set_ids = [id_train, id_test, id_val]
        sets = [train_set, test_set, val_set]
        for si, s in zip(set_ids, sets):
            for i in si:
                sub = self.subject_set.getSubject(i)
                # change label
                new_label = class_mapper_id[i]
                sub.setLabels(new_label)
                s.addSubject(sub)

        self.train_set = train_set
        self.test_set = test_set
        self.val_set = val_set

        # save subject sets on disk
        if save_to_disk:
            self.save()

        # print label distribution
        for s, l in zip([self.train_set, self.test_set, self.val_set],
                        ['train', 'test', 'val']):
            logging.info("Label Distribution %s" % l)
            s.printLabelDistribution()
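
# The core splitting idea above: partition on deduplicated split ids
# (groups), so every image of a subject lands in exactly one partition,
# then expand groups back to subject ids. A minimal sketch with sklearn
# (the group ids and labels below are illustrative):
from sklearn.model_selection import train_test_split

group_ids = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
group_labels = ['cat'] * 4 + ['dog'] * 4
train_g, test_g = train_test_split(group_ids, test_size=0.5,
                                   stratify=group_labels,
                                   random_state=123)
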
    def createExpDataSet(self, link_only=True,
                         splits="new",
                         clear_old_files=True,
                         split_mode="none"):
        """ Create Test / Train / Validation Data set, if link_only is True
            only symbolik links are created, no actual data is copied """

        # create new splits
        if splits == 'new':
            self.createTrainTestSplit(save_to_disk=True, split_mode=split_mode)
        elif splits == 'disk':
            # if files are there load them from disk, else create new
            try:
                self.load()
            except Exception:
                self.createTrainTestSplit(save_to_disk=True,
                                          split_mode=split_mode)
        else:
            raise NotImplementedError

        logging.debug("Starting to prepare experiment datasets")
        for tag, sub_set in zip(['train', 'test', 'val'],
                                [self.train_set, self.test_set, self.val_set]):

            # prepare subject set
            logging.debug("Preparing Paths for %s" % tag)
            root_path = self._preparePaths(tag, clear_old_files)

            # get all relevant subject ids
            subject_ids = sub_set.getAllIDs()

            # create ordered dict
            subject_ids_dict = OrderedDict()
            for sid in subject_ids:
                subject_ids_dict[sid] = 1

            # check if some already exist and keep them
            if not clear_old_files:
                # get all files already on disk
                all_classes = os.listdir(root_path)

                # store information of existing files in dictionary
                existing_dict = dict()
                for cl in all_classes:
                    existing_files = os.listdir(os.path.join(root_path, cl))
                    for ex in existing_files:
                        existing_id = ex.split('_')[0]
                        if existing_id not in existing_dict:
                            existing_dict[existing_id] = {'cl': cl,
                                                          'files': list()}
                        existing_dict[existing_id]['files'].append(ex)

                if len(existing_dict.keys()) == 0:
                    logging.debug("No files exist in %s directory" % tag)
                else:
                    logging.debug("%s files already exist in %s directory" %
                                  (len(existing_dict.keys()), tag))
                    # relevant subject ids that are not already on disk
                    subject_ids_relev = subject_ids_dict.keys() - \
                        existing_dict.keys()

                    # existing files that have to be removed
                    to_be_removed = existing_dict.keys() - set(subject_ids)

                    # remove files
                    for r in to_be_removed:
                        files_to_remove = existing_dict[r]['files']
                        class_to_be_removed = existing_dict[r]['cl']
                        for fr in files_to_remove:
                            os.remove(os.path.join(root_path,
                                                   class_to_be_removed, fr))

                    # only keep subject ids that are not already on disk
                    subject_ids = list(subject_ids_relev)

            if link_only:
                logging.info("Creating link only files")

                for c, s_i in enumerate(subject_ids):
                    if (c % 10000) == 0:
                        logging.debug("Link %s / %s created" %
                                      (c, len(subject_ids)))
                    sub = sub_set.getSubject(s_i)
                    imgs = sub.getImages()
                    label = sub.getLabels()

                    for img in imgs.values():
                        # if img.checkFileExistence():
                        img.createSymLink(dest_path=os.path.join(root_path,
                                                                 label))
            else:
                logging.info("Creating hard copy files")
                for s_i in subject_ids:
                    sub = sub_set.getSubject(s_i)
                    imgs = sub.getImages()
                    label = sub.getLabels()

                    for img in imgs.values():
                        img.copyTo(dest_path=os.path.join(root_path, label) +
                                   os.path.sep)
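
# A standalone sketch of the link-vs-copy choice at the end of
# createExpDataSet (place_image is a hypothetical name; os.symlink may
# require elevated permissions on Windows):
import os
import shutil

def place_image(src, dest_dir, link_only=True):
    dest = os.path.join(dest_dir, os.path.basename(src))
    if link_only:
        os.symlink(src, dest)       # cheap: no image data copied
    else:
        shutil.copyfile(src, dest)  # hard copy of the image file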