def predict_path(self, path):
    """ Predict images organized in class folders under path """
    # prediction batch size
    batch_size = 256

    logging.info("Initializing generator")
    generator = self.datagen.flow_from_directory(
        path,
        target_size=self.model.input_shape[1:3],
        color_mode=self.color_mode,
        batch_size=batch_size,
        class_mode='sparse',
        seed=self.cfg_model['random_seed'],
        shuffle=False)

    # predict whole set
    logging.info("Predicting images in path")

    # calculate number of iterations needed to cover all images,
    # adding one extra step if n is not a multiple of batch_size
    steps_remainder = generator.n % batch_size
    if steps_remainder > 0:
        extra_step = 1
    else:
        extra_step = 0

    preds = self.model.predict_generator(
        generator,
        steps=(generator.n // batch_size) + extra_step,
        workers=1,
        use_multiprocessing=bool(self.cfg_model['multi_processing']))

    logging.debug("Predicted %s of %s images" %
                  (preds.shape[0], generator.n))

    # check size and log critical
    if preds.shape[0] != generator.n:
        logging.critical("Number of preds %s doesn't match "
                         "number of images %s" %
                         (preds.shape[0], generator.n))

    # consolidate output
    logging.info("Creating Result DF")
    res = self._create_result_df(preds,
                                 generator.filenames,
                                 generator.classes,
                                 generator.class_indices,
                                 image_links="")
    return res
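# A minimal standalone sketch (not part of the class) of the steps
# calculation above: the generator must be iterated ceil(n / batch_size)
# times so the final partial batch is not dropped. The helper name
# `n_steps` is hypothetical.
def n_steps(n_samples, batch_size):
    """Number of generator steps needed to cover all samples."""
    return (n_samples // batch_size) + (1 if n_samples % batch_size else 0)

assert n_steps(1000, 256) == 4  # 3 full batches plus 1 partial batch
assert n_steps(1024, 256) == 4  # exact multiple, no extra step needed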
def _preparePaths(self, tag, clear_old_files):
    """ Prepare directories in which to save training data """
    # root directory for this data set (e.g. train / test / val)
    root_path = cfg_path['images'] + tag

    # delete old files
    if clear_old_files:
        if os.path.exists(root_path):
            logging.debug("Deleting %s" % root_path)
            shutil.rmtree(root_path)

    # create root directory
    if not os.path.exists(root_path):
        logging.debug("Creating %s" % root_path)
        os.mkdir(root_path)

    # delete all class directories that are no longer relevant
    if not clear_old_files:
        all_class_dirs = os.listdir(root_path)
        delete_dirs = set(all_class_dirs) - set(self.classes)
        for d in delete_dirs:
            logging.debug("Removing directory %s" %
                          (root_path + os.path.sep + d))
            shutil.rmtree(root_path + os.path.sep + d)

    # create one directory per class
    for cl in self.classes:
        if not os.path.exists(root_path + os.path.sep + cl):
            logging.debug("Creating %s" % (root_path + os.path.sep + cl))
            os.mkdir(root_path + os.path.sep + cl)

    return root_path
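# The directory reconciliation above reduces to set arithmetic on folder
# names. A toy sketch with hypothetical class names, using the same
# keep-or-delete semantics:
existing_dirs = {'cat', 'dog', 'bird'}
wanted_classes = {'cat', 'dog', 'elephant'}
stale = existing_dirs - wanted_classes    # {'bird'}: shutil.rmtree these
missing = wanted_classes - existing_dirs  # {'elephant'}: os.mkdir these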
async def download_coroutine(semaphore, session, key, url):
    # NOTE: a 180 second timeout via async_timeout.timeout(180) is
    # currently disabled here
    async with semaphore:
        async with session.get(url) as response:
            # read the full response body
            data = await response.content.read()
            try:
                # images_dict is shared, module-level state
                img = Image.open(BytesIO(data))
                images_dict[key] = img
            except Exception:
                logging.warning("Could not access image: %s with id %s" %
                                (url, key))
                # NOTE: retries are effectively disabled (n_attempts = 0);
                # a response body can only be read once, so a real retry
                # would have to re-issue the request
                success = False
                counter = 0
                n_attempts = 0
                while (not success) and (counter < n_attempts):
                    logging.debug("Trying again")
                    time.sleep(0.1)
                    try:
                        data = await response.content.read()
                        img = Image.open(BytesIO(data))
                        images_dict[key] = img
                        success = True
                    except Exception:
                        counter += 1
                        logging.warning("Failed attempt %s / %s" %
                                        (counter, n_attempts))
                # add to failures list
                if not success:
                    failures['urls'].append(url)
                    failures['ids'].append(key)
                # log all failures so far
                for u, i in zip(failures['urls'], failures['ids']):
                    logging.warning("Failed to access id: %s on url: %s" %
                                    (i, u))
            return await response.release()
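# A response body can only be read once, so the retry loop above cannot
# recover by re-reading the same response. A minimal alternative sketch,
# assuming aiohttp and Pillow are available; `fetch_image` and
# `MAX_ATTEMPTS` are hypothetical names, and each retry re-issues the GET
# with a non-blocking sleep in between:
import asyncio
import logging
from io import BytesIO

import aiohttp
from PIL import Image

MAX_ATTEMPTS = 3

async def fetch_image(session, url):
    """Download an image, retrying the full request on failure."""
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            async with session.get(url) as response:
                data = await response.read()
                return Image.open(BytesIO(data))
        except Exception:
            logging.warning("Attempt %s / %s failed for %s",
                            attempt, MAX_ATTEMPTS, url)
            await asyncio.sleep(0.1)
    return None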
def saveSubjectSetOnDisk(self, overwrite=True):
    """ Save all subjects / images in class-specific folders """
    # check if subject set is already on disk
    if not overwrite:
        file_path = self.cfg_path['db'] + 'subject_set_used.json'
        if os.path.isfile(path=file_path):
            logging.debug("Subject set %s found, not overwriting" %
                          file_path)
            return None

    # retry saving in case of connection errors while fetching urls
    counter = 0
    success = False
    n_trials = 99
    while (not success) and (counter < n_trials):
        try:
            self.subject_set.saveImagesOnDisk(set_name='all',
                                              cfg=self.cfg,
                                              cfg_path=self.cfg_path)
            success = True
        except Exception:
            # log exception
            logging.exception("saveSubjectSetOnDisk failed")
            counter += 1
            logging.info("Starting attempt %s / %s" % (counter, n_trials))

    if not success:
        raise IOError("Could not save subject set on disk")
    else:
        # remove unsuccessfully processed subjects
        self.subject_set.removeSubjectsWithoutAllImages()

        # save subject set containing only successfully processed
        # subjects to disk
        self.subject_set.save(path=self.cfg_path['db'] +
                              'subject_set_used.json')
        logging.info("Saved %s to disk" %
                     (self.cfg_path['db'] + 'subject_set_used.json'))

        # print label distribution
        self.subject_set.printLabelDistribution()
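# The retry-until-success pattern above in generic form, as a sketch;
# the helper name `retry_call` and its signature are hypothetical:
import logging

def retry_call(func, n_trials=3):
    """Call func() until it succeeds or n_trials attempts are exhausted."""
    for attempt in range(1, n_trials + 1):
        try:
            return func()
        except Exception:
            logging.exception("Attempt %s / %s failed", attempt, n_trials)
    raise IOError("Could not complete call in %s attempts" % n_trials)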
def _classMapper(self, ids, labels):
    """ Map labels to classes """
    # prepare result lists
    labels_final = list()
    ids_final = list()
    # loop over all labels and ids
    for label, i in zip(labels, ids):
        if self.class_list is not None:
            # keep only labels on the class whitelist
            if label in self.class_list:
                ids_final.append(i)
                labels_final.append(label)
        elif self.class_mapper is not None:
            # rename labels according to the mapping, dropping
            # anything unmapped
            if label in self.class_mapper:
                new_label = self.class_mapper[label]
                labels_final.append(new_label)
                ids_final.append(i)
    logging.debug("ClassMapper contains %s ids and %s labels" %
                  (len(ids_final), len(labels_final)))
    return ids_final, labels_final
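# A toy example of the class_mapper mode handled above (data is made up):
# labels are renamed via the mapping and unmapped ids are dropped.
ids = [1, 2, 3]
labels = ['lion', 'zebra', 'car']
class_mapper = {'lion': 'animal', 'zebra': 'animal'}
mapped = [(i, class_mapper[l]) for i, l in zip(ids, labels)
          if l in class_mapper]
# mapped == [(1, 'animal'), (2, 'animal')]; id 3 ('car') is dropped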
def createTrainTestSplit(self, save_to_disk=False, split_mode="1_on_1"):
    """ Create test / train / validation splits """
    # get random seed
    rand = self.random_state

    # get all subject ids and their labels
    ids, labels = self.subject_set.getAllIDsLabels()

    # prepare meta data dictionary for all subjects
    meta_data = dict()
    for i in ids:
        meta_data[i] = self.subject_set.getSubject(i).getMetaData()

    # create splitting ids to split subjects on, using original,
    # unmapped labels
    ids_orig, split_ids, split_labels = \
        createSplitIDs(ids, labels, meta_data=meta_data,
                       split_mode=split_mode)

    # map split ids to original ids
    split_id_mapper = dict()
    for jj in range(0, len(split_ids)):
        if split_ids[jj] not in split_id_mapper:
            split_id_mapper[split_ids[jj]] = [ids_orig[jj]]
        else:
            split_id_mapper[split_ids[jj]].append(ids_orig[jj])

    # map original ids to split ids
    id_to_split_id_mapper = dict()
    for k, v in split_id_mapper.items():
        for i in v:
            id_to_split_id_mapper[i] = k

    # map labels to classes & keep only relevant ids
    ids, labels = self._classMapper(ids, labels)

    # if equal class sizes, cut larger classes to size of smallest
    if self.equal_class_sizes:
        ids, labels = self._balancedSampling(ids, labels)

    # map split ids to mapped labels
    split_ids, split_labels = self._classMapper(split_ids, split_labels)

    # create id to label mapper
    class_mapper_id = dict()
    for i, l in zip(ids, labels):
        class_mapper_id[i] = l

    # create split id to label mapper
    class_mapper_split_id = dict()
    for i, l in zip(split_ids, split_labels):
        class_mapper_split_id[i] = l

    # get rid of all split ids of ids which have been removed by
    # class mapper and balanced sampling
    split_ids = [id_to_split_id_mapper[i] for i in ids]
    split_labels = [class_mapper_split_id[i] for i in split_ids]

    # deduplicate splitting ids to be used in creating test / train
    # splits
    split_ids_unique_dict = OrderedDict()
    for spl_id in split_ids:
        if spl_id not in split_ids_unique_dict:
            split_ids_unique_dict[spl_id] = 1
    split_ids_unique = list(split_ids_unique_dict.keys())
    split_labels_unique = [class_mapper_split_id[x]
                           for x in split_ids_unique]

    # split id to id mapper after deduplication
    split_id_to_id_mapper = dict()
    for spl, ii in zip(split_ids, ids):
        if spl not in split_id_to_id_mapper:
            split_id_to_id_mapper[spl] = [ii]
        else:
            split_id_to_id_mapper[spl].append(ii)

    # log size of classes
    logging.debug("Class distribution overall on splitting ids")
    class_counts = dict()
    for dd in split_labels_unique:
        if dd not in class_counts:
            class_counts[dd] = 1
        else:
            class_counts[dd] += 1
    for k, v in class_counts.items():
        logging.debug("Class %s has %s obs" % (k, v))

    # training and test split
    # reverse train/test to guarantee the test set is the same for
    # equal test_size samples on identical ids
    id_test_s, id_train_s = train_test_split(
        split_ids_unique,
        train_size=self.test_size,
        test_size=self.train_size,
        stratify=split_labels_unique,
        random_state=int(rand))

    # validation split
    labels_s_val = [class_mapper_split_id[x] for x in id_test_s]

    # log size of classes
    logging.debug("Class distribution test/val on splitting ids")
    class_counts = dict()
    for dd in labels_s_val:
        if dd not in class_counts:
            class_counts[dd] = 1
        else:
            class_counts[dd] += 1
    for k, v in class_counts.items():
        logging.debug("Class %s has %s obs" % (k, v))

    id_test_s, id_val_s = train_test_split(id_test_s,
                                           train_size=0.5,
                                           stratify=labels_s_val,
                                           random_state=int(rand))

    # map split ids back to original ids
    id_train = [split_id_to_id_mapper[i] for i in id_train_s]
    id_test = [split_id_to_id_mapper[i] for i in id_test_s]
    id_val = [split_id_to_id_mapper[i] for i in id_val_s]

    # flatten the lists of lists
    id_train = [item for sublist in id_train for item in sublist]
    id_test = [item for sublist in id_test for item in sublist]
    id_val = [item for sublist in id_val for item in sublist]

    # generate new subject sets
    train_set = SubjectSet(labels=self.classes)
    test_set = SubjectSet(labels=self.classes)
    val_set = SubjectSet(labels=self.classes)

    set_ids = [id_train, id_test, id_val]
    sets = [train_set, test_set, val_set]
    for si, s in zip(set_ids, sets):
        for i in si:
            sub = self.subject_set.getSubject(i)
            # change label to the mapped class
            new_label = class_mapper_id[i]
            sub.setLabels(new_label)
            s.addSubject(sub)

    self.train_set = train_set
    self.test_set = test_set
    self.val_set = val_set

    # save subject sets on disk
    if save_to_disk:
        self.save()

    # print label distribution
    for s, l in zip([self.train_set, self.test_set, self.val_set],
                    ['train', 'test', 'val']):
        logging.info("Label Distribution %s" % l)
        s.printLabelDistribution()
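# A self-contained sketch of the two-stage stratified split above, on toy
# data (all names hypothetical): first split off the training portion,
# then halve the remainder into test and validation, stratifying on the
# labels both times.
from sklearn.model_selection import train_test_split

toy_ids = list(range(100))
toy_labels = ['a'] * 50 + ['b'] * 50

ids_train, ids_rest, y_train, y_rest = train_test_split(
    toy_ids, toy_labels, train_size=0.8,
    stratify=toy_labels, random_state=123)

ids_test, ids_val = train_test_split(
    ids_rest, train_size=0.5, stratify=y_rest, random_state=123)

# 80 train, 10 test, 10 val, with the a/b ratio preserved in each split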
def createExpDataSet(self, link_only=True, splits="new",
                     clear_old_files=True, split_mode="none"):
    """ Create test / train / validation data sets; if link_only is
        True only symbolic links are created, no actual data is
        copied """
    # create new splits
    if splits == 'new':
        self.createTrainTestSplit(save_to_disk=True,
                                  split_mode=split_mode)
    elif splits == 'disk':
        # if split files are on disk load them, else create new splits
        try:
            self.load()
        except Exception:
            self.createTrainTestSplit(save_to_disk=True,
                                      split_mode=split_mode)
    else:
        raise NotImplementedError

    logging.debug("Starting to prepare experiment datasets")

    for tag, sub_set in zip(['train', 'test', 'val'],
                            [self.train_set, self.test_set,
                             self.val_set]):
        # prepare paths for the subject set
        logging.debug("Preparing Paths for %s" % tag)
        root_path = self._preparePaths(tag, clear_old_files)

        # get all relevant subject ids
        subject_ids = sub_set.getAllIDs()

        # create ordered dict to preserve subject order
        subject_ids_dict = OrderedDict()
        for sid in subject_ids:
            subject_ids_dict[sid] = 1

        # check if some files already exist and keep them
        if not clear_old_files:
            # get all class directories already on disk
            all_classes = os.listdir(root_path)

            # store information of existing files in a dictionary
            existing_dict = dict()
            for cl in all_classes:
                existing_files = os.listdir(os.path.join(root_path, cl))
                for ex in existing_files:
                    existing_id = ex.split('_')[0]
                    if existing_id not in existing_dict:
                        existing_dict[existing_id] = {'cl': cl,
                                                      'files': list()}
                    existing_dict[existing_id]['files'].append(ex)

            if len(existing_dict.keys()) == 0:
                logging.debug("No files exist in %s directory" % tag)
            else:
                logging.debug("%s files already exist in %s directory" %
                              (len(existing_dict.keys()), tag))

            # relevant subject ids that are not already on disk
            subject_ids_relev = subject_ids_dict.keys() - \
                existing_dict.keys()

            # existing files that have to be removed
            to_be_removed = existing_dict.keys() - set(subject_ids)

            # remove obsolete files
            for r in to_be_removed:
                files_to_remove = existing_dict[r]['files']
                class_to_be_removed = existing_dict[r]['cl']
                for fr in files_to_remove:
                    os.remove(os.path.join(root_path,
                                           class_to_be_removed,
                                           fr))

            # only keep subject ids that are not already on disk
            subject_ids = list(subject_ids_relev)

        if link_only:
            logging.info("Creating link only files")
            for c, s_i in enumerate(subject_ids):
                if (c % 10000) == 0:
                    logging.debug("Link %s / %s created" %
                                  (c, len(subject_ids)))
                sub = sub_set.getSubject(s_i)
                imgs = sub.getImages()
                label = sub.getLabels()
                for img in imgs.values():
                    img.createSymLink(dest_path=os.path.join(root_path,
                                                             label))
        else:
            logging.info("Creating hard copy files")
            for s_i in subject_ids:
                sub = sub_set.getSubject(s_i)
                imgs = sub.getImages()
                label = sub.getLabels()
                for img in imgs.values():
                    img.copyTo(dest_path=root_path + os.path.sep +
                               label + os.path.sep)
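# A small sketch of the link-versus-copy trade-off above (paths are
# hypothetical): a symlink is cheap and avoids duplicating image data but
# breaks if the source moves, while shutil.copy produces an independent
# hard copy at the cost of disk space.
import os
import shutil

src = '/data/images/all/cat/12345_0.jpeg'
dst_dir = '/data/images/train/cat'
os.makedirs(dst_dir, exist_ok=True)
dst = os.path.join(dst_dir, os.path.basename(src))

os.symlink(src, dst)  # link only; or: shutil.copy(src, dst) for a copy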