def __init__(self, root_folder, extensions, prefetch=False,
             target_size=None, max_size=None, min_size=None,
             center_crop=None):
    """Initialize from a two-layer storage.

    Input:
        root_folder: the root that contains the data. Under root_folder
            there should be a list of folders, under which there should be
            a list of files. The name of the sub-folder is used as the
            class name of the files it contains.
        extensions: a single extension string, or a list of extensions,
            used to filter the files. Should be like ['png', 'jpg'].
            Matching is case insensitive.
        prefetch: if True, the images are prefetched to avoid disk read.
            If you have a large number of images, prefetch would require
            a lot of memory.
        target_size, max_size, min_size, center_crop: see manipulate() for
            details.

    Raises:
        OSError: if mpi.agree() reports that root_folder does not exist.
    """
    super(TwoLayerDataset, self).__init__()
    if mpi.agree(not os.path.exists(root_folder)):
        raise OSError("The specified folder does not exist.")
    logging.debug('Loading from %s' % (root_folder,))
    if isinstance(extensions, str):
        extensions = [extensions]
    # File names are lower-cased before matching, so the extensions have to
    # be lower-cased too, otherwise e.g. 'JPG' could never match anything.
    suffixes = tuple(set(ext.lower() for ext in extensions))
    if mpi.is_root():
        # get files first, then select those that fit one of the extensions
        files = glob.glob(os.path.join(root_folder, '*', '*'))
        files = [f for f in files if f.lower().endswith(suffixes)]
        logging.debug("A total of %d images." % (len(files)))
        # the raw label of a file is the name of its parent folder
        labels = [os.path.basename(os.path.dirname(f)) for f in files]
        # sort so we get a reasonable class order
        classnames = sorted(set(labels))
        name2val = dict(zip(classnames, range(len(classnames))))
        labels = [name2val[label] for label in labels]
    else:
        files = None
        classnames = None
        labels = None
    mpi.barrier()
    self._rawdata = mpi.distribute_list(files)
    self._data = self._rawdata
    self._prefetch = prefetch
    self._target_size = target_size
    self._max_size = max_size
    self._min_size = min_size
    self._center_crop = center_crop
    if target_size is not None:
        self._dim = tuple(target_size) + (3,)
    else:
        self._dim = False
    self._channels = 3
    if prefetch:
        self._data = [self._read(idx) for idx in range(len(self._data))]
    self._label = mpi.distribute_list(labels)
    self._classnames = mpi.COMM.bcast(classnames)
def testDistributeList(self):
    """Check that distribute_list returns an equal share of the source.

    The source list is mpi.SIZE repetitions of 0..length-1; the result
    is expected to have `length` items equal to 0..length-1.
    """
    for length in range(1, 5):
        # Materialise the range first: only a real list supports repetition
        # with `*` on every Python version.
        source = list(range(length)) * mpi.SIZE
        result = mpi.distribute_list(source)
        self.assertEqual(len(result), length)
        for i in range(length):
            self.assertEqual(result[i], i)
def __init__(self, root, is_training, crop=False, prefetch=False,
             target_size=None):
    """Load the dataset.

    Input:
        root: the root folder of the dataset. It must contain
            'train_list.mat' and 'test_list.mat'.
        is_training: if true, load the training data. Otherwise, load the
            testing data.
        crop: if False, does not crop the bounding box. If a real value,
            crop is the ratio of the bounding box that gets cropped.
            e.g., if crop = 1.5, the resulting image will be 1.5 * the
            bounding box area.
        prefetch: if True, the images are prefetched to avoid disk read.
            If you have a large number of images, prefetch would require
            a lot of memory.
        target_size: if provided, all images are resized to the size
            specified. Should be a list of two integers, like [640,480].

    Note that we will use the python indexing (labels start from 0).
    """
    mat_filename = 'train_list.mat' if is_training else 'test_list.mat'
    if mpi.is_root():
        matfile = io.loadmat(os.path.join(root, mat_filename))
        # Subtract 1 so that labels start from 0. The builtin int is used
        # as dtype because the np.int alias no longer exists in recent
        # NumPy releases.
        labels = np.array(matfile['labels'].flatten() - 1, dtype=int)
        files = [f[0][0] for f in matfile['file_list']]
    else:
        labels = None
        files = None
    self._data = mpi.distribute_list(files)
    self._label = mpi.distribute(labels)
    self._root = root
    self._prefetch = prefetch
    self._crop = crop
    self._target_size = target_size
    if target_size is not None:
        self._dim = tuple(target_size) + (3,)
    else:
        self._dim = False
    self._channels = 3
    if self._prefetch:
        self._data = [self._read(i) for i in range(len(self._data))]
def __init__(self, root, is_training, crop=False, prefetch=False,
             target_size=None):
    """Load the dataset.

    Input:
        root: the root folder of the dataset. It must contain
            'train_list.mat' and 'test_list.mat'.
        is_training: if true, load the training data. Otherwise, load the
            testing data.
        crop: if False, does not crop the bounding box. If a real value,
            crop is the ratio of the bounding box that gets cropped.
            e.g., if crop = 1.5, the resulting image will be 1.5 * the
            bounding box area.
        prefetch: if True, the images are prefetched to avoid disk read.
            If you have a large number of images, prefetch would require
            a lot of memory.
        target_size: if provided, all images are resized to the size
            specified. Should be a list of two integers, like [640,480].

    Note that we will use the python indexing (labels start from 0).
    """
    mat_filename = 'train_list.mat' if is_training else 'test_list.mat'
    if mpi.is_root():
        matfile = io.loadmat(os.path.join(root, mat_filename))
        # Subtract 1 so that labels start from 0. The builtin int is used
        # as dtype because the np.int alias no longer exists in recent
        # NumPy releases.
        labels = np.array(matfile['labels'].flatten() - 1, dtype=int)
        files = [f[0][0] for f in matfile['file_list']]
    else:
        labels = None
        files = None
    self._data = mpi.distribute_list(files)
    self._label = mpi.distribute(labels)
    self._root = root
    self._prefetch = prefetch
    self._crop = crop
    self._target_size = target_size
    if target_size is not None:
        self._dim = tuple(target_size) + (3,)
    else:
        self._dim = False
    self._channels = 3
    if self._prefetch:
        self._data = [self._read(i) for i in range(len(self._data))]
def __init__(self, root, is_training, crop=False, subset=None,
             prefetch=False, target_size=None):
    """Load the dataset.

    Input:
        root: the root folder of the CUB_200_2011 dataset.
        is_training: if true, load the training data. Otherwise, load the
            testing data.
        crop: if False, does not crop the bounding box. If a real value,
            crop is the ratio of the bounding box that gets cropped.
            e.g., if crop = 1.5, the resulting image will be 1.5 * the
            bounding box area.
        subset: if not None, we will only use the subset specified in the
            list. The content of the list should be class subfolder
            names, like ['001.Black_footed_Albatross', ...]. Labels are
            renumbered to the position of the class in this list.
        prefetch: if True, the images are prefetched to avoid disk read.
            If you have a large number of images, prefetch would require
            a lot of memory.
        target_size: if provided, all images are resized to the size
            specified. Should be a list of two integers, like [640,480].

    Raises:
        KeyError: if a name in subset is not listed in classes.txt.

    Note that we will use the python indexing (labels start from 0).
    """
    super(CUBDataset, self).__init__()

    def read_fields(filename):
        """Return the whitespace-split fields of every line of a file
        under root, closing the file once it has been read."""
        with open(os.path.join(root, filename), 'r') as fid:
            return [line.split() for line in fid]

    images = [fields[1] for fields in read_fields('images.txt')]
    boxes = [fields[1:] for fields in read_fields('bounding_boxes.txt')]
    labels = [int(fields[1]) - 1
              for fields in read_fields('image_class_labels.txt')]
    classnames = [fields[1] for fields in read_fields('classes.txt')]
    class2id = dict(zip(classnames, range(len(classnames))))
    split = [int(fields[1])
             for fields in read_fields('train_test_split.txt')]
    # load parts: drop the first two and the last column and regroup the
    # rest as 15 two-value rows per image.
    # NOTE(review): presumably the kept columns are the (x, y) location of
    # each of the 15 parts -- confirm against the dataset README.
    rawparts = np.loadtxt(os.path.join(root, 'parts', 'part_locs.txt'))
    rawparts = rawparts[:, 2:-1].reshape((len(images), 15, 2))
    if subset is not None:
        # create the subset mapping from the old class id to the new one
        old2new = {}
        selected_ids = set()
        for new_id, name in enumerate(subset):
            old_id = class2id[name]
            selected_ids.add(old_id)
            old2new[old_id] = new_id
        # select the subset
        is_selected = [(label in selected_ids) for label in labels]
        images = [image for image, val in zip(images, is_selected) if val]
        boxes = [box for box, val in zip(boxes, is_selected) if val]
        labels = [old2new[label]
                  for label, val in zip(labels, is_selected) if val]
        classnames = subset
        class2id = dict(zip(classnames, range(len(classnames))))
        split = [trte for trte, val in zip(split, is_selected) if val]
        rawparts = rawparts[np.asarray(is_selected, dtype=bool)]
    # now, do training testing split: entries flagged 1 in
    # train_test_split.txt are used for training, entries flagged 0 for
    # testing.
    target = 1 if is_training else 0
    images = [os.path.join(root, 'images', image)
              for image, val in zip(images, split) if val == target]
    boxes = [box for box, val in zip(boxes, split) if val == target]
    labels = [label for label, val in zip(labels, split) if val == target]
    # NOTE(review): the "- 1" here and on the boxes below presumably moves
    # the coordinates to 0-based indexing -- confirm.
    rawparts = rawparts[np.asarray(split) == target] - 1
    # store the necessary values
    self._data = mpi.distribute_list(images)
    # for the boxes, we store them as a numpy array
    self._boxes = np.array(mpi.distribute_list(boxes)).astype(float)
    self._boxes -= 1
    self._parts = mpi.distribute(rawparts)
    self._prefetch = prefetch
    self._target_size = target_size
    self._crop = crop
    if target_size is not None:
        self._dim = tuple(target_size) + (3,)
    else:
        self._dim = False
    self._channels = 3
    # we store the raw dimensions for part location computation
    self._raw_dimension = np.zeros((len(self._data), 2), dtype=int)
    if prefetch:
        self._data = [self._read(i) for i in range(len(self._data))]
    self._label = mpi.distribute_list(labels)
    self._classnames = mpi.COMM.bcast(classnames)
def __init__(self, root_folder, extensions, prefetch=False,
             target_size=None, max_size=None, min_size=None,
             center_crop=None):
    """Initialize from a two-layer storage.

    Input:
        root_folder: the root that contains the data. Under root_folder
            there should be a list of folders, under which there should be
            a list of files. The name of the sub-folder is used as the
            class name of the files it contains.
        extensions: a single extension string, or a list of extensions,
            used to filter the files. Should be like ['png', 'jpg'].
            Matching is case insensitive.
        prefetch: if True, the images are prefetched to avoid disk read.
            If you have a large number of images, prefetch would require
            a lot of memory.
        target_size, max_size, min_size, center_crop: see manipulate() for
            details.

    Raises:
        OSError: if mpi.agree() reports that root_folder does not exist.
    """
    super(TwoLayerDataset, self).__init__()
    if mpi.agree(not os.path.exists(root_folder)):
        raise OSError("The specified folder does not exist.")
    logging.debug('Loading from %s' % (root_folder,))
    if isinstance(extensions, str):
        extensions = [extensions]
    # File names are lower-cased before matching, so the extensions have to
    # be lower-cased too, otherwise e.g. 'JPG' could never match anything.
    suffixes = tuple(set(ext.lower() for ext in extensions))
    if mpi.is_root():
        # get files first, then select those that fit one of the extensions
        files = glob.glob(os.path.join(root_folder, '*', '*'))
        files = [f for f in files if f.lower().endswith(suffixes)]
        logging.debug("A total of %d images." % (len(files)))
        # the raw label of a file is the name of its parent folder
        labels = [os.path.basename(os.path.dirname(f)) for f in files]
        # sort so we get a reasonable class order
        classnames = sorted(set(labels))
        name2val = dict(zip(classnames, range(len(classnames))))
        labels = [name2val[label] for label in labels]
    else:
        files = None
        classnames = None
        labels = None
    mpi.barrier()
    self._rawdata = mpi.distribute_list(files)
    self._data = self._rawdata
    self._prefetch = prefetch
    self._target_size = target_size
    self._max_size = max_size
    self._min_size = min_size
    self._center_crop = center_crop
    if target_size is not None:
        self._dim = tuple(target_size) + (3,)
    else:
        self._dim = False
    self._channels = 3
    if prefetch:
        self._data = [self._read(idx) for idx in range(len(self._data))]
    self._label = mpi.distribute_list(labels)
    self._classnames = mpi.COMM.bcast(classnames)
def __init__(self, list_file, feat_range, posting_file, perc_pos, keep_full_utt=False, posting_sampler=None, min_dur=0.2, min_count=0.0, max_count=10000000.0, reader_type='utterance', pickle_fname=None, list_file_sph=None, kw_feat=None, merge_score_files=None):
    '''Build the keyword dataset from a list file and a posting file.

    The root process collects one entry per usable posting (negatives
    get label 0, positives label 1), filters the entries by how often
    their keyword occurs among the positives, and the result is then
    distributed through mpi.

    Input:
        list_file: passed to the reader. If the name contains 'eval' the
            dataset is flagged as eval and uses FLAGS.T_eval, otherwise
            FLAGS.T_train.
        feat_range: passed to ReadAllUtterances (only used by the
            'utterance' reader).
        posting_file: parsed with PostingParser when posting_sampler is
            not given.
        perc_pos: passed to Sampler.SampleData when posting_sampler is
            not given.
        keep_full_utt: if True, the reader is kept as self.utt_reader.
        posting_sampler: an existing sampler to use as-is instead of
            building one from posting_file.
        min_dur: entries with sys_et - sys_bt below this are skipped.
        min_count, max_count: only entries whose keyword count among the
            positives lies in [min_count, max_count] are kept.
        reader_type: one of 'lattice', 'utterance', 'snr', 'srate' or
            'score'. Anything else prints a message and exits.
        pickle_fname: forwarded to every reader except the lattice one.
        list_file_sph, merge_score_files: forwarded to ScoreReader.
        kw_feat: if not None, the keyword maps are copied from it; if
            that fails for any reason the mapping is loaded from
            FLAGS.hescii_file instead.

    TODO: Read pieces of utterance from the CSV file instead to save
    memory. It would be nice to index these by utt_id (by now I do a map).
    '''
    super(BabelDataset, self).__init__()
    if list_file.find('eval') >= 0:
        self.is_eval = True
        self.T = FLAGS.T_eval
    else:
        self.is_eval = False
        self.T = FLAGS.T_train
    self.beta = FLAGS.beta
    self.reader_type = reader_type
    # pick the reader matching reader_type and load its data
    if reader_type=='lattice':
        self.is_lattice = True
        utt_reader = LatticeReader.LatticeReader(list_file)
        utt_reader.ReadAllLatices()
    elif reader_type=='utterance':
        self.is_lattice = False
        utt_reader = UtteranceReader.UtteranceReader(list_file,pickle_fname=pickle_fname)
        utt_reader.ReadAllUtterances(feat_range)
    elif reader_type=='snr':
        self.is_lattice = False
        utt_reader = SNRReader.SNRReader(list_file,pickle_fname=pickle_fname)
        utt_reader.ReadAllSNR()
    elif reader_type=='srate':
        self.is_lattice = False
        utt_reader = SrateReader.SrateReader(list_file,pickle_fname=pickle_fname)
        utt_reader.ReadAllSrate()
    elif reader_type=='score':
        # no explicit Read* call for this reader
        self.is_lattice = False
        utt_reader = ScoreReader.ScoreReader(list_file,list_file_sph=list_file_sph,pickle_fname=pickle_fname, merge_score_files=merge_score_files)
    else:
        # NOTE(review): exits with status 0 although this is an error path.
        print 'Reader not implemented!'
        exit(0)
    if posting_sampler == None:
        # build and run a sampler from the posting file
        testParser = PostingParser.PostingParser(posting_file)
        self.posting_sampler = Sampler.Sampler(testParser)
        self.posting_sampler.GetPositive()
        self.posting_sampler.GetNegative()
        self.posting_sampler.SampleData(perc_pos)
    else:
        self.posting_sampler = posting_sampler
    self.min_dur = min_dur
    self._data_all = None
    self._dim = False
    self._channels = 1
    self.keep_full_utt = keep_full_utt
    if mpi.is_root():
        # only the root process builds the entry lists
        self._data = []
        self._label = []
        self._features = []
        self._utt_id = []
        self._times = []
        self._keyword = []
        skipped = 0
        # negative examples (label 0); postings whose file is unknown to
        # the reader are silently ignored
        for i in range(len(self.posting_sampler.negative_data)):
            if utt_reader.map_utt_idx.has_key(self.posting_sampler.negative_data[i]['file']):
                if self.posting_sampler.negative_data[i]['sys_bt'] == '':
                    # NOTE(review): exits with status 0 on an error path.
                    print 'We found a negative example that was not produced by the system!'
                    exit(0)
                sys_bt = float(self.posting_sampler.negative_data[i]['sys_bt'])
                sys_et = float(self.posting_sampler.negative_data[i]['sys_et'])
                sys_sc = float(self.posting_sampler.negative_data[i]['sys_score'])
                if(sys_et-sys_bt < self.min_dur):
                    # too short, skip it
                    skipped += 1
                    continue
                self._data.append(utt_reader.GetKeywordData(self.posting_sampler.negative_data[i]['file'], sys_bt, sys_et,kw=self.posting_sampler.negative_data[i]['termid']))
                self._label.append(0)
                self._features.append(sys_sc)
                self._utt_id.append(self.posting_sampler.negative_data[i]['file'])
                self._times.append((sys_bt,sys_et))
                self._keyword.append(self.posting_sampler.negative_data[i]['termid'])
            else:
                pass
        # positive examples (label 1)
        for i in range(len(self.posting_sampler.positive_data)):
            if utt_reader.map_utt_idx.has_key(self.posting_sampler.positive_data[i]['file']):
                if self.posting_sampler.positive_data[i]['sys_bt'] == '':
                    # Positive without system times: the three values set
                    # here are never used because of the continue below,
                    # and the entry is not counted in `skipped`.
                    sys_bt = 0
                    sys_et = None
                    sys_sc = -1.0
                    #print self.posting_sampler.positive_data[i]['alignment']
                    continue #Should just ignore these?
                else:
                    sys_bt = float(self.posting_sampler.positive_data[i]['sys_bt'])
                    sys_et = float(self.posting_sampler.positive_data[i]['sys_et'])
                    sys_sc = float(self.posting_sampler.positive_data[i]['sys_score'])
                if(sys_et-sys_bt < self.min_dur):
                    # too short, skip it
                    skipped += 1
                    continue
                self._data.append(utt_reader.GetKeywordData(self.posting_sampler.positive_data[i]['file'], sys_bt, sys_et,kw=self.posting_sampler.positive_data[i]['termid']))
                self._label.append(1)
                self._features.append(sys_sc)
                self._utt_id.append(self.posting_sampler.positive_data[i]['file'])
                self._times.append((sys_bt,sys_et))
                self._keyword.append(self.posting_sampler.positive_data[i]['termid'])
            else:
                pass
        print 'I skipped ',skipped,' entries out of ',(len(self.posting_sampler.negative_data)+len(self.posting_sampler.positive_data))
        self._label = np.array(self._label)
    else:
        self._data = None
        self._label = None
        self._features = None
        self._utt_id = None
        self._times = None
        self._keyword = None
    #populate true kw freq
    # Counts, per keyword, the positives whose file is known to the reader.
    # This runs on every process and also counts the positives that were
    # skipped above.
    self._map_kw_counts = {}
    for i in range(len(self.posting_sampler.positive_data)):
        if utt_reader.map_utt_idx.has_key(self.posting_sampler.positive_data[i]['file']):
            kw = self.posting_sampler.positive_data[i]['termid']
            if self._map_kw_counts.has_key(kw):
                self._map_kw_counts[kw] += 1
            else:
                self._map_kw_counts[kw] = 1
    #filter dataset depending on count
    if mpi.is_root():
        ind_keep = []
        # kw_zero counts entries whose keyword has no positive at all; it
        # is not used after this loop.
        kw_zero = 0
        for i in range(len(self._keyword)):
            kw = self._keyword[i]
            kw_count = 0
            if self._map_kw_counts.has_key(kw):
                kw_count = self._map_kw_counts[kw]
            else:
                kw_zero += 1
            if kw_count <= max_count and kw_count >= min_count:
                ind_keep.append(i)
        self._data = [self._data[i] for i in ind_keep]
        # note: _label turns from a numpy array back into a plain list here
        self._label = [self._label[i] for i in ind_keep]
        self._features = [self._features[i] for i in ind_keep]
        self._utt_id = [self._utt_id[i] for i in ind_keep]
        self._times = [self._times[i] for i in ind_keep]
        self._keyword = [self._keyword[i] for i in ind_keep]
    # hand the root's entries out to the processes; every process makes
    # these calls in the same order
    self._data = mpi.distribute_list(self._data)
    self._label = mpi.distribute(self._label)
    self._features = mpi.distribute_list(self._features)
    self._utt_id = mpi.distribute_list(self._utt_id)
    self._times = mpi.distribute_list(self._times)
    self._keyword = mpi.distribute_list(self._keyword)
    if self.keep_full_utt == True:
        self.utt_reader = utt_reader
    if kw_feat != None:
        try:
            # has_key only serves as a check that kw_feat is dict-like; the
            # bare except also catches any failure inside CopyKeywordMaps.
            kw_feat.has_key('length')
            self.CopyKeywordMaps(kw_feat)
        except:
            self.LoadMappingHescii(FLAGS.hescii_file)
            # NOTE(review): the nesting of this call is assumed (only run
            # when the maps could not be copied) -- confirm against the
            # original file.
            self.ComputeKeywordMaps()
def __init__(self, root, is_training, crop=False, subset=None,
             prefetch=False, target_size=None, version='2011'):
    """Load the dataset.

    Input:
        root: the root folder of the CUB_200_2011 dataset.
        is_training: if true, load the training data. Otherwise, load the
            testing data.
        crop: if False, does not crop the bounding box. If a real value,
            crop is the ratio of the bounding box that gets cropped.
            e.g., if crop = 1.5, the resulting image will be 1.5 * the
            bounding box area.
        subset: if not None, we will only use the subset specified in the
            list. The content of the list should be class subfolder
            names, like ['001.Black_footed_Albatross', ...]. Labels are
            renumbered to the position of the class in this list.
        prefetch: if True, the images are prefetched to avoid disk read.
            If you have a large number of images, prefetch would require
            a lot of memory.
        target_size: if provided, all images are resized to the size
            specified. Should be a list of two integers, like [640,480].
        version: either '2011' or '2010'. Note that the 2011 version
            contains the parts, while the 2010 version does not.

    Raises:
        ValueError: if version is neither '2011' nor '2010'.
        KeyError: if a name in subset is not a known class name.

    Note that we will use the python indexing (labels start from 0).
    """
    super(CUBDataset, self).__init__()

    def read_lines(*relpath):
        """Return all lines of a file under root, closing the file once it
        has been read."""
        with open(os.path.join(root, *relpath), 'r') as fid:
            return fid.readlines()

    if version == '2011':
        images = [line.split()[1] for line in read_lines('images.txt')]
        boxes = [line.split()[1:]
                 for line in read_lines('bounding_boxes.txt')]
        labels = [int(line.split()[1]) - 1
                  for line in read_lines('image_class_labels.txt')]
        classnames = [line.split()[1]
                      for line in read_lines('classes.txt')]
        class2id = dict(zip(classnames, range(len(classnames))))
        split = [int(line.split()[1])
                 for line in read_lines('train_test_split.txt')]
        # load parts: drop the first two and the last column and regroup
        # the rest as 15 two-value rows per image.
        # NOTE(review): presumably the kept columns are the (x, y) location
        # of each of the 15 parts -- confirm against the dataset README.
        rawparts = np.loadtxt(os.path.join(root, 'parts', 'part_locs.txt'))
        rawparts = rawparts[:, 2:-1].reshape((len(images), 15, 2))
    elif version == '2010':
        # we are using version 2010. We load the data to mimic the 2011
        # version data format
        images = [line.strip() for line in read_lines('lists', 'files.txt')]
        boxes = []
        # unfortunately, we need to load the boxes from matlab annotations
        for filename in images:
            # the annotation has the same relative name as the image, with
            # the last three characters replaced by 'mat'
            matfile = io.loadmat(os.path.join(root, 'annotations-mat',
                                              filename[:-3] + 'mat'))
            left, top, right, bottom = \
                [matfile['bbox'][0][0][i][0][0] for i in range(4)]
            # stored as (left, top, width, height)
            boxes.append([left, top, right - left, bottom - top])
        # a set makes the membership test below O(1) per image
        train_images = set(
            line.strip() for line in read_lines('lists', 'train.txt'))
        # the label is the number in front of the first '.' of the name
        labels = [int(line[:line.find('.')]) - 1 for line in images]
        classnames = [line.strip()
                      for line in read_lines('lists', 'classes.txt')]
        class2id = dict(zip(classnames, range(len(classnames))))
        split = [int(line in train_images) for line in images]
        # we do not have rawparts.
        rawparts = None
    else:
        raise ValueError("Unrecognized version: %s" % version)
    if subset is not None:
        # create the subset mapping from the old class id to the new one
        old2new = {}
        selected_ids = set()
        for new_id, name in enumerate(subset):
            old_id = class2id[name]
            selected_ids.add(old_id)
            old2new[old_id] = new_id
        # select the subset
        is_selected = [(label in selected_ids) for label in labels]
        images = [image for image, val in zip(images, is_selected) if val]
        boxes = [box for box, val in zip(boxes, is_selected) if val]
        labels = [old2new[label]
                  for label, val in zip(labels, is_selected) if val]
        classnames = subset
        class2id = dict(zip(classnames, range(len(classnames))))
        split = [trte for trte, val in zip(split, is_selected) if val]
        if rawparts is not None:
            rawparts = rawparts[np.asarray(is_selected, dtype=bool)]
    # now, do training testing split: entries flagged 1 are used for
    # training, entries flagged 0 for testing.
    target = 1 if is_training else 0
    images = [image for image, val in zip(images, split) if val == target]
    boxes = [box for box, val in zip(boxes, split) if val == target]
    labels = [label for label, val in zip(labels, split) if val == target]
    if rawparts is not None:
        # NOTE(review): the "- 1" here and on the boxes below presumably
        # moves the coordinates to 0-based indexing -- confirm.
        rawparts = rawparts[np.asarray(split) == target] - 1
    # store the necessary values
    self._version = version
    self._root = root
    self._data = mpi.distribute_list(images)
    # keep the image names around, since _data may be replaced by the
    # prefetched images below
    self._raw_name = self._data
    # for the boxes, we store them as a numpy array
    self._boxes = np.array(mpi.distribute_list(boxes)).astype(float)
    self._boxes -= 1
    if rawparts is not None:
        self._parts = mpi.distribute(rawparts)
    else:
        self._parts = None
    self._prefetch = prefetch
    self._target_size = target_size
    self._crop = crop
    if target_size is not None:
        self._dim = tuple(target_size) + (3,)
    else:
        self._dim = False
    self._channels = 3
    # we store the raw dimensions for part location computation
    self._raw_dimension = np.zeros((len(self._data), 2), dtype=int)
    if prefetch:
        self._data = [self._read(i) for i in range(len(self._data))]
    self._label = mpi.distribute_list(labels)
    self._classnames = mpi.COMM.bcast(classnames)
# Check that every image under the train / val / test folders given by the
# flags can be opened, and report how many cannot.
mpi.log_level(logging.ERROR)
mpi.root_log_level(logging.INFO)
files = []
if mpi.is_root():
    # only the root process lists the files; an empty flag skips that set
    if FLAGS.train != "":
        logging.info("Adding training images..")
        # training images sit one folder level deeper than val/test images
        files += glob.glob(os.path.join(FLAGS.train, '*', '*.JPEG'))
    if FLAGS.val != "":
        logging.info("Adding validation images..")
        files += glob.glob(os.path.join(FLAGS.val, '*.JPEG'))
    if FLAGS.test != "":
        logging.info("Adding testing images..")
        files += glob.glob(os.path.join(FLAGS.test, '*.JPEG'))
    logging.info("A total of %d images to check" % (len(files)))
files = mpi.distribute_list(files)
logging.info('Validating...')
errornum = 0
for filename in files:
    try:
        # NOTE(review): if Image is PIL's module, open() presumably only
        # identifies the file without decoding the pixel data -- confirm
        # whether a full decode is wanted here.
        Image.open(filename)
    except Exception:
        # any failure to open counts as a corrupted image; the file name
        # is logged so that it can be inspected afterwards
        logging.error(filename)
        errornum += 1
# sum the per-process error counts
errornum = mpi.COMM.allreduce(errornum)
if errornum == 0:
    logging.info("Done. No corrupted images found.")
else:
    logging.info("Done. %d corrupted images found." % (errornum,))
def __init__(self, root, is_training, crop=False, subset=None,
             prefetch=False, target_size=None, version='2011'):
    """Load the dataset.

    Input:
        root: the root folder of the CUB_200_2011 dataset.
        is_training: if true, load the training data. Otherwise, load the
            testing data.
        crop: if False, does not crop the bounding box. If a real value,
            crop is the ratio of the bounding box that gets cropped.
            e.g., if crop = 1.5, the resulting image will be 1.5 * the
            bounding box area.
        subset: if not None, we will only use the subset specified in the
            list. The content of the list should be class subfolder
            names, like ['001.Black_footed_Albatross', ...]. Labels are
            renumbered to the position of the class in this list.
        prefetch: if True, the images are prefetched to avoid disk read.
            If you have a large number of images, prefetch would require
            a lot of memory.
        target_size: if provided, all images are resized to the size
            specified. Should be a list of two integers, like [640,480].
        version: either '2011' or '2010'. Note that the 2011 version
            contains the parts, while the 2010 version does not.

    Raises:
        ValueError: if version is neither '2011' nor '2010'.
        KeyError: if a name in subset is not a known class name.

    Note that we will use the python indexing (labels start from 0).
    """
    super(CUBDataset, self).__init__()

    def read_lines(*relpath):
        """Return all lines of a file under root, closing the file once it
        has been read."""
        with open(os.path.join(root, *relpath), 'r') as fid:
            return fid.readlines()

    if version == '2011':
        images = [line.split()[1] for line in read_lines('images.txt')]
        boxes = [line.split()[1:]
                 for line in read_lines('bounding_boxes.txt')]
        labels = [int(line.split()[1]) - 1
                  for line in read_lines('image_class_labels.txt')]
        classnames = [line.split()[1]
                      for line in read_lines('classes.txt')]
        class2id = dict(zip(classnames, range(len(classnames))))
        split = [int(line.split()[1])
                 for line in read_lines('train_test_split.txt')]
        # load parts: drop the first two and the last column and regroup
        # the rest as 15 two-value rows per image.
        # NOTE(review): presumably the kept columns are the (x, y) location
        # of each of the 15 parts -- confirm against the dataset README.
        rawparts = np.loadtxt(os.path.join(root, 'parts', 'part_locs.txt'))
        rawparts = rawparts[:, 2:-1].reshape((len(images), 15, 2))
    elif version == '2010':
        # we are using version 2010. We load the data to mimic the 2011
        # version data format
        images = [line.strip() for line in read_lines('lists', 'files.txt')]
        boxes = []
        # unfortunately, we need to load the boxes from matlab annotations
        for filename in images:
            # the annotation has the same relative name as the image, with
            # the last three characters replaced by 'mat'
            matfile = io.loadmat(os.path.join(root, 'annotations-mat',
                                              filename[:-3] + 'mat'))
            left, top, right, bottom = \
                [matfile['bbox'][0][0][i][0][0] for i in range(4)]
            # stored as (left, top, width, height)
            boxes.append([left, top, right - left, bottom - top])
        # a set makes the membership test below O(1) per image
        train_images = set(
            line.strip() for line in read_lines('lists', 'train.txt'))
        # the label is the number in front of the first '.' of the name
        labels = [int(line[:line.find('.')]) - 1 for line in images]
        classnames = [line.strip()
                      for line in read_lines('lists', 'classes.txt')]
        class2id = dict(zip(classnames, range(len(classnames))))
        split = [int(line in train_images) for line in images]
        # we do not have rawparts.
        rawparts = None
    else:
        raise ValueError("Unrecognized version: %s" % version)
    if subset is not None:
        # create the subset mapping from the old class id to the new one
        old2new = {}
        selected_ids = set()
        for new_id, name in enumerate(subset):
            old_id = class2id[name]
            selected_ids.add(old_id)
            old2new[old_id] = new_id
        # select the subset
        is_selected = [(label in selected_ids) for label in labels]
        images = [image for image, val in zip(images, is_selected) if val]
        boxes = [box for box, val in zip(boxes, is_selected) if val]
        labels = [old2new[label]
                  for label, val in zip(labels, is_selected) if val]
        classnames = subset
        class2id = dict(zip(classnames, range(len(classnames))))
        split = [trte for trte, val in zip(split, is_selected) if val]
        if rawparts is not None:
            rawparts = rawparts[np.asarray(is_selected, dtype=bool)]
    # now, do training testing split: entries flagged 1 are used for
    # training, entries flagged 0 for testing.
    target = 1 if is_training else 0
    images = [image for image, val in zip(images, split) if val == target]
    boxes = [box for box, val in zip(boxes, split) if val == target]
    labels = [label for label, val in zip(labels, split) if val == target]
    if rawparts is not None:
        # NOTE(review): the "- 1" here and on the boxes below presumably
        # moves the coordinates to 0-based indexing -- confirm.
        rawparts = rawparts[np.asarray(split) == target] - 1
    # store the necessary values
    self._version = version
    self._root = root
    self._data = mpi.distribute_list(images)
    # keep the image names around, since _data may be replaced by the
    # prefetched images below
    self._raw_name = self._data
    # for the boxes, we store them as a numpy array
    self._boxes = np.array(mpi.distribute_list(boxes)).astype(float)
    self._boxes -= 1
    if rawparts is not None:
        self._parts = mpi.distribute(rawparts)
    else:
        self._parts = None
    self._prefetch = prefetch
    self._target_size = target_size
    self._crop = crop
    if target_size is not None:
        self._dim = tuple(target_size) + (3,)
    else:
        self._dim = False
    self._channels = 3
    # we store the raw dimensions for part location computation
    self._raw_dimension = np.zeros((len(self._data), 2), dtype=int)
    if prefetch:
        self._data = [self._read(i) for i in range(len(self._data))]
    self._label = mpi.distribute_list(labels)
    self._classnames = mpi.COMM.bcast(classnames)
# Check that every image under the train / val / test folders given by the
# flags can be opened, and report how many cannot.
mpi.log_level(logging.ERROR)
mpi.root_log_level(logging.INFO)
files = []
if mpi.is_root():
    # only the root process lists the files; an empty flag skips that set
    if FLAGS.train != "":
        logging.info("Adding training images..")
        # training images sit one folder level deeper than val/test images
        files += glob.glob(os.path.join(FLAGS.train, '*', '*.JPEG'))
    if FLAGS.val != "":
        logging.info("Adding validation images..")
        files += glob.glob(os.path.join(FLAGS.val, '*.JPEG'))
    if FLAGS.test != "":
        logging.info("Adding testing images..")
        files += glob.glob(os.path.join(FLAGS.test, '*.JPEG'))
    logging.info("A total of %d images to check" % (len(files)))
files = mpi.distribute_list(files)
logging.info('Validating...')
errornum = 0
for filename in files:
    try:
        # NOTE(review): if Image is PIL's module, open() presumably only
        # identifies the file without decoding the pixel data -- confirm
        # whether a full decode is wanted here.
        Image.open(filename)
    except Exception:
        # any failure to open counts as a corrupted image; the file name
        # is logged so that it can be inspected afterwards
        logging.error(filename)
        errornum += 1
# sum the per-process error counts
errornum = mpi.COMM.allreduce(errornum)
if errornum == 0:
    logging.info("Done. No corrupted images found.")
else:
    logging.info("Done. %d corrupted images found." % (errornum,))