def _fork(self):
    """Open the ``s``, ``l`` and ``u`` indexed record files and cache their keys.

    For each suffix the index path is taken from ``self.idx_f<suffix>`` and
    the record path from ``self.rec_f<suffix>``.  The read-only reader is
    stored on ``self.rec_<suffix>`` and the list of its index keys on
    ``self.idx_<suffix>``.
    """
    for suffix in ('s', 'l', 'u'):
        reader = recordio.MXIndexedRecordIO(
            getattr(self, 'idx_f' + suffix),
            getattr(self, 'rec_f' + suffix),
            'r',
        )
        setattr(self, 'rec_' + suffix, reader)
        setattr(self, 'idx_' + suffix, list(reader.idx.keys()))
def _fork(self):
    """Open both indexed record files and prepare their lookup structures.

    For each of the two datasets this stores the read-only reader
    (``self.rec1`` / ``self.rec2``), the result of ``self.load_or_gen_dict``
    for that record file (``self.cls_idx_d1`` / ``self.cls_idx_d2``) and the
    list of index keys (``self.idx1`` / ``self.idx2``).
    """
    # First dataset.
    reader1 = recordio.MXIndexedRecordIO(self.idx_f1, self.rec_f1, 'r')
    self.rec1 = reader1
    self.cls_idx_d1 = self.load_or_gen_dict(self.rec_f1, reader1)
    self.idx1 = list(reader1.idx.keys())

    # Second dataset.
    # NOTE(review): the index path comes from ``self.idx_2`` while the first
    # dataset uses ``self.idx_f1`` -- confirm the attribute name is intended.
    reader2 = recordio.MXIndexedRecordIO(self.idx_2, self.rec_f2, 'r')
    self.rec2 = reader2
    self.cls_idx_d2 = self.load_or_gen_dict(self.rec_f2, reader2)
    self.idx2 = list(reader2.idx.keys())
def __init__(self, batch_size, data_shape, path_imgrec=None, shuffle=False,
             data_name='data', gender_label_name='label_gender',
             age_lable_name='label_age', **kwargs):
    """Set up an iterator over an indexed RecordIO file.

    Args:
        batch_size: Number of samples per batch.
        data_shape: Per-sample shape; elements 1 and 2 are used to build
            ``self.image_size``.
        path_imgrec: Path to the ``.rec`` file (required). The index file
            path is derived by replacing the last four characters with
            ``.idx``.
        shuffle: When true, ``self.seq``/``self.oseq`` reference the key
            list; otherwise ``self.seq`` is ``None``.
        data_name: Name used in ``provide_data``.
        gender_label_name: Accepted but not used by this constructor.
        age_lable_name: Name used in ``provide_label``.
        **kwargs: Ignored.
    """
    super(SSR_ITER, self).__init__()
    assert path_imgrec
    # The original re-assigned ``self.batch_size`` to itself here, which is
    # either a no-op or an AttributeError; the real value is set below.
    logging.info('loading recordio %s...', path_imgrec)
    path_imgidx = path_imgrec[0:-4] + ".idx"
    self.imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')
    self.imgidx = list(self.imgrec.keys)
    if shuffle:
        # NOTE(review): seq and oseq alias the same list object.
        self.seq = self.imgidx
        self.oseq = self.imgidx
    else:
        self.seq = None

    self.provide_data = [(data_name, (batch_size,) + data_shape)]
    self.batch_size = batch_size
    self.data_shape = data_shape
    self.shuffle = shuffle
    self.image_size = '%d,%d' % (data_shape[1], data_shape[2])
    # Only the age label is exposed; the gender label is currently disabled.
    self.provide_label = [(age_lable_name, (batch_size,))]
    self.cur = 0
    self.nbatch = 0
    self.is_init = False
def __init__(self, path_imgrec, rand_mirror):
    """Open an indexed RecordIO file and collect its image keys.

    Record 0 is read as a header. When its ``flag`` is positive, its first
    two label values delimit a range of identity records, each of which in
    turn carries an ``[a, b)`` range of image keys; these ranges are stored
    in ``self.id2range`` and concatenated into ``self.imgidx``. Otherwise
    all keys of the record file are used.

    Args:
        path_imgrec: Path to the ``.rec`` file (required); the ``.idx`` path
            is derived by replacing the last four characters.
        rand_mirror: Stored on ``self.rand_mirror``.
    """
    self.rand_mirror = rand_mirror
    assert path_imgrec
    if path_imgrec:
        logging.info('loading recordio %s...', path_imgrec)
        path_imgidx = path_imgrec[0:-4] + ".idx"
        print(path_imgrec, path_imgidx)
        self.imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')
        header, _ = recordio.unpack(self.imgrec.read_idx(0))
        if header.flag > 0:
            print('header0 label', header.label)
            self.header0 = (int(header.label[0]), int(header.label[1]))
            self.imgidx = []
            self.id2range = {}
            self.seq_identity = range(int(header.label[0]), int(header.label[1]))
            for identity in self.seq_identity:
                id_header, _ = recordio.unpack(self.imgrec.read_idx(identity))
                a, b = int(id_header.label[0]), int(id_header.label[1])
                self.id2range[identity] = (a, b)
                self.imgidx += range(a, b)
            print('id2range', len(self.id2range))
        else:
            self.imgidx = list(self.imgrec.keys)
        self.seq = self.imgidx
def __init__(self, batch_size, data_shape, path_imgrec=None, shuffle=False,
             aug_list=None, mean=None, rand_mirror=False, cutoff=0,
             data_name='data', label_name='softmax_label', **kwargs):
    """Set up a face image iterator over an indexed RecordIO file.

    Record 0 is read as a header. When its ``flag`` is positive, image keys
    are ``1 .. label[0]-1`` and the identity records in
    ``[label[0], label[1])`` are read to fill ``self.id2range``; otherwise
    all keys of the record file are used.

    Args:
        batch_size: Number of samples per batch.
        data_shape: Per-sample shape, validated by ``check_data_shape``;
            elements 1 and 2 form ``self.image_size``.
        path_imgrec: Path to the ``.rec`` file (required).
        shuffle: When true, ``self.seq``/``self.oseq`` reference the key
            list; otherwise ``self.seq`` is ``None``.
        aug_list: Accepted but not used by this constructor.
        mean: Optional 3-value mean, stored reshaped to ``(1, 1, 3)``.
        rand_mirror: Stored on ``self.rand_mirror``.
        cutoff: Stored on ``self.cutoff``.
        data_name: Name used in ``provide_data``.
        label_name: Name used in ``provide_label``.
        **kwargs: Ignored.
    """
    super(FaceImageIter, self).__init__()
    assert path_imgrec
    if path_imgrec:
        logging.info('loading recordio %s...', path_imgrec)
        path_imgidx = path_imgrec[0:-4] + ".idx"
        self.imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')  # pylint: disable=redefined-variable-type
        header, _ = recordio.unpack(self.imgrec.read_idx(0))
        if header.flag > 0:
            print('header0 label', header.label)
            self.header0 = (int(header.label[0]), int(header.label[1]))
            # A list, not a bare range: a range is immutable and cannot be
            # shuffled in place once it is exposed as self.seq.
            self.imgidx = list(range(1, int(header.label[0])))
            self.id2range = {}
            self.seq_identity = range(int(header.label[0]), int(header.label[1]))
            for identity in self.seq_identity:
                id_header, _ = recordio.unpack(self.imgrec.read_idx(identity))
                self.id2range[identity] = (int(id_header.label[0]),
                                           int(id_header.label[1]))
            print('id2range', len(self.id2range))
        else:
            self.imgidx = list(self.imgrec.keys)
        if shuffle:
            # NOTE(review): seq and oseq alias the same list object.
            self.seq = self.imgidx
            self.oseq = self.imgidx
            print(len(self.seq))
        else:
            self.seq = None

    self.mean = mean
    self.nd_mean = None
    if self.mean:
        self.mean = np.array(self.mean, dtype=np.float32).reshape(1, 1, 3)
        self.nd_mean = mx.nd.array(self.mean).reshape((1, 1, 3))

    self.check_data_shape(data_shape)
    self.provide_data = [(data_name, (batch_size,) + data_shape)]
    self.batch_size = batch_size
    self.data_shape = data_shape
    self.shuffle = shuffle
    self.image_size = '%d,%d' % (data_shape[1], data_shape[2])
    self.rand_mirror = rand_mirror
    print('rand_mirror', rand_mirror)
    self.cutoff = cutoff
    self.provide_label = [(label_name, (batch_size,))]
    self.cur = 0
    self.nbatch = 0
    self.is_init = False
def __init__(self, rec_path: Path, augs=None):
    """Open a RecordIO dataset and read its label count.

    The ``.idx`` file is expected next to ``rec_path`` (same name, ``.idx``
    suffix). Record 0 is read as a header: with a positive ``flag`` the
    identity records in ``[label[0], label[1])`` are walked and their
    ``[a, b)`` image-key ranges collected into ``self.id2range`` and
    ``self.imgidx``; otherwise every key of the file is used.  The number of
    labels is the first comma-separated field on the first line of the
    sibling ``property`` file.

    Args:
        rec_path: Path to the ``.rec`` file.
        augs: Stored on ``self.augs``.
    """
    self.rec_path = rec_path
    idx_path = rec_path.with_suffix('.idx')
    self.augs = augs
    self.imgrec = recordio.MXIndexedRecordIO(str(idx_path), str(rec_path), 'r')

    root_header, _ = recordio.unpack(self.imgrec.read_idx(0))
    if not root_header.flag > 0:
        self.imgidx = list(self.imgrec.keys)
    else:
        first_id = int(root_header.label[0])
        last_id = int(root_header.label[1])
        self.header0 = (first_id, last_id)
        self.imgidx = []
        self.id2range = {}
        self.seq_identity = range(first_id, last_id)
        for identity in self.seq_identity:
            id_header, _ = recordio.unpack(self.imgrec.read_idx(identity))
            lo, hi = int(id_header.label[0]), int(id_header.label[1])
            self.id2range[identity] = (lo, hi)
            self.imgidx += range(lo, hi)
    self.seq = self.imgidx

    prop_path = rec_path.parent / 'property'
    with open(str(prop_path), 'r') as prop_file:
        first_line = prop_file.readline()
        self.num_labels = int(first_line.split(',')[0].strip())
def _fork(self):
    """Open the source and/or target record files selected by the flags.

    With ``self.use_src`` the source reader and its keys are stored on
    ``self.recs`` / ``self.idxs``; with ``self.use_tgt`` the target reader
    and its keys are stored on ``self.rect`` / ``self.idxt``.  When both are
    enabled, the label of every target record is read and passed to
    ``self.generate_cls_dict``, whose result is stored on ``self.idxt_cls``.
    """
    if self.use_src:
        self.recs = recordio.MXIndexedRecordIO(self.idx_fs, self.rec_fs, 'r')
        self.idxs = list(self.recs.idx.keys())
    if self.use_tgt:
        self.rect = recordio.MXIndexedRecordIO(self.idx_ft, self.rec_ft, 'r')
        self.idxt = list(self.rect.idx.keys())
    # The class dictionary is built from the *target* reader, so it needs
    # use_tgt as well; gating on use_src alone would touch self.rect and
    # self.idxt before they exist.
    if self.use_src and self.use_tgt:
        cls_lst = []
        for key in self.idxt:
            header, _ = recordio.unpack(self.rect.read_idx(key))
            cls_lst.append(header.label)
        self.idxt_cls = self.generate_cls_dict(cls_lst)
def __init__(self, cfg, mode='train'):
    """Load the split metadata and open the matching indexed record file.

    Args:
        cfg: Object whose ``path`` attribute is the dataset directory.
        mode: Split name; selects ``<mode>.meta``, ``<mode>.idx`` and
            ``<mode>.rec`` inside that directory.
    """
    self.prefix = Path(cfg.path)
    self.mode = mode

    # The meta file is JSON with a 'num_sample' entry.
    with (self.prefix / f'{mode}.meta').open('r') as meta_file:
        meta = json.load(meta_file)
    self.num_sample = meta['num_sample']
    print('num samples:', self.num_sample)

    idx_file = str(self.prefix / f'{mode}.idx')
    rec_file = str(self.prefix / f'{mode}.rec')
    self.record_reader = mxrec.MXIndexedRecordIO(idx_file, rec_file, 'r')
def __init__(self, batch_size, data_shape, path_imgrec=None, task='age',
             shuffle=False, aug_list=None, mean=None, rand_mirror=False,
             cutoff=0, data_name='data', label_name='softmax_label', **kwargs):
    """Set up a face image iterator whose label shape depends on ``task``.

    Args:
        batch_size: Number of samples per batch.
        data_shape: Per-sample shape, validated by ``check_data_shape``.
        path_imgrec: Path to the ``.rec`` file (required).
        task: ``'age'`` gives labels of shape ``(batch_size, 100)``; any
            other value gives ``(batch_size,)``.
        shuffle: When true, ``self.seq``/``self.oseq`` reference the key
            list; otherwise ``self.seq`` is ``None``.
        aug_list: Accepted but not used by this constructor.
        mean: Optional 3-value mean, stored reshaped to ``(1, 1, 3)``.
        rand_mirror: Stored on ``self.rand_mirror``.
        cutoff: Stored on ``self.cutoff``.
        data_name: Name used in ``provide_data``.
        label_name: Name used in ``provide_label``.
        **kwargs: Ignored.
    """
    super(FaceImageIter, self).__init__()
    assert path_imgrec
    if path_imgrec:
        logging.info('loading recordio %s...', path_imgrec)
        idx_path = path_imgrec[0:-4] + ".idx"
        self.imgrec = recordio.MXIndexedRecordIO(idx_path, path_imgrec, 'r')  # pylint: disable=redefined-variable-type
        # Record 0 is read and unpacked, but its header is not used further.
        first_record = self.imgrec.read_idx(0)
        recordio.unpack(first_record)
        self.imgidx = list(self.imgrec.keys)
        if not shuffle:
            self.seq = None
        else:
            self.seq = self.imgidx
            self.oseq = self.imgidx
            print(len(self.seq))

    self.mean = mean
    self.nd_mean = None
    if mean:
        self.mean = np.array(mean, dtype=np.float32).reshape(1, 1, 3)
        self.nd_mean = mx.nd.array(self.mean).reshape((1, 1, 3))

    self.check_data_shape(data_shape)
    self.provide_data = [(data_name, (batch_size, ) + data_shape)]
    self.batch_size = batch_size
    self.data_shape = data_shape
    self.shuffle = shuffle
    self.image_size = '%d,%d' % (data_shape[1], data_shape[2])
    self.rand_mirror = rand_mirror
    print('rand_mirror', rand_mirror)
    self.cutoff = cutoff

    label_shape = (batch_size, 100) if task == 'age' else (batch_size, )
    self.provide_label = [(label_name, label_shape)]
    self.cur = 0
    self.nbatch = 0
    self.is_init = False
def __init__(self, batch_size, data_shape, path_imgrec=None, shuffle=False,
             aug_list=None, mean=None, rand_mirror=False, cutoff=0,
             data_name='data', label_name='sigmoid_label', num_class=21,
             **kwargs):
    """Set up a multi-label face image iterator over a RecordIO file.

    Args:
        batch_size: Number of samples per batch.
        data_shape: Per-sample shape, validated by ``check_data_shape``.
        path_imgrec: Path to the ``.rec`` file; when falsy no file is opened.
        shuffle: When true, ``self.seq`` references the key list; otherwise
            it is ``None``.
        aug_list: Accepted but not used by this constructor.
        mean: Optional mean, broadcast to shape ``(1, 1, 3)``.
        rand_mirror: Stored on ``self.rand_mirror``.
        cutoff: Stored on ``self.cutoff``.
        data_name: Name used in ``provide_data``.
        label_name: Name used in ``provide_label``.
        num_class: Width of each label row.
        **kwargs: Ignored.
    """
    super(FaceImageIter, self).__init__()
    if path_imgrec:
        logging.info('loading record %s...', path_imgrec)
        path_imgidx = path_imgrec[0:-4] + ".idx"
        self.imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')
        header, _ = recordio.unpack(self.imgrec.read_idx(0))
        if header.flag == 2:
            self.header0 = (int(header.label[0]), int(header.label[1]))
            # A list, not a bare range: a range is immutable and cannot be
            # shuffled in place once it is exposed as self.seq.
            self.imgidx = list(range(1, int(header.label[0])))
        else:
            # NOTE(review): self.imgidx stays unset on this path, so
            # shuffle=True raises AttributeError just below.
            print("header flag is not 2 for dataset ", header.flag)
        if shuffle:
            self.seq = self.imgidx
        else:
            self.seq = None

    self.mean = mean
    # Always define nd_mean so readers never hit a missing attribute when no
    # mean is given (matches the sibling iterators).
    self.nd_mean = None
    if self.mean:
        self.mean = np.ones([1, 1, 3], dtype=np.float32) * self.mean
        self.nd_mean = mx.nd.array(self.mean).reshape((1, 1, 3))

    self.check_data_shape(data_shape)
    self.provide_data = [(data_name, (batch_size, ) + data_shape)]
    self.batch_size = batch_size
    self.data_shape = data_shape
    self.shuffle = shuffle
    self.image_size = '%d,%d' % (data_shape[1], data_shape[2])
    self.rand_mirror = rand_mirror
    self.cutoff = cutoff
    self.provide_label = [(label_name, (batch_size, num_class))]
    self.cur = 0
    self.nbatch = 0
    self.is_init = False
def extract2Output(prefix, database_output, search_output, samples=5):
    """Dump a RecordIO dataset to JPEGs, split into database and search sets.

    Both output directories are wiped and recreated.  For every class a
    sub-directory named after the class position is created in each output;
    up to ``samples`` randomly chosen images of the class are written to the
    search directory and the remaining ones to the database directory.

    Args:
        prefix: Path prefix of the ``.idx`` / ``.rec`` pair.
        database_output: Directory receiving the non-sampled images.
        search_output: Directory receiving the sampled images.
        samples: Number of images per class to place in the search set;
            clamped to the class size, ``<= 0`` disables sampling.
    """
    import shutil
    from pathlib import Path

    import tqdm

    # Start from empty output directories.
    for out_dir in (database_output, search_output):
        out_path = Path(out_dir)
        if out_path.exists():
            shutil.rmtree(out_dir)
        out_path.mkdir(parents=True)

    reader = io.MXIndexedRecordIO(prefix + '.idx', prefix + '.rec', 'r')

    # Record 0 describes all classes: its label holds the key range of the
    # per-class records.
    header, _ = io.unpack(reader.read_idx(0))
    class_keys = range(int(header.label[0]), int(header.label[1]))

    # Each class record holds the [a, b) key range of that class's images.
    per_class_keys = []
    for class_key in class_keys:
        header, _ = io.unpack(reader.read_idx(int(class_key)))
        per_class_keys.append(range(int(header.label[0]), int(header.label[1])))

    progress = tqdm.tqdm(enumerate(per_class_keys), total=len(per_class_keys))
    for class_no, img_keys in progress:
        sc_path = os.path.join(search_output, str(class_no))
        db_path = os.path.join(database_output, str(class_no))
        os.mkdir(sc_path)
        os.mkdir(db_path)

        # Sampling without replacement raises if more items are requested
        # than exist, so never ask for more than the class contains.
        n_query = min(samples, len(img_keys))
        query_keys = set()
        if n_query > 0:
            chosen = np.random.choice(list(img_keys), n_query, replace=False)
            query_keys = {int(key) for key in chosen}

        for img_key in img_keys:
            _, img = io.unpack_img(reader.read_idx(img_key))
            target_dir = sc_path if img_key in query_keys else db_path
            cv2.imwrite(os.path.join(target_dir, str(img_key) + '.jpg'), img)
def __init__(self, root, flag=1, transform=None):
    """Open ``train.rec`` under ``root`` and read the dataset properties.

    The ``property`` file in ``root`` must contain exactly three
    comma-separated values: the number of classes and the two image size
    values.

    Args:
        root: Dataset directory containing ``train.rec``, ``train.idx`` and
            ``property``.
        flag: Stored on ``self._flag``.
        transform: Stored on ``self._transform``.
    """
    filename = os.path.join(root, 'train.rec')
    self.filename = filename
    self.idx_file = os.path.splitext(filename)[0] + '.idx'
    self._record = recordio.MXIndexedRecordIO(self.idx_file, self.filename, 'r')
    # Use a context manager so the property file handle is closed promptly.
    with open(os.path.join(root, "property"), "r") as prop_file:
        prop = prop_file.read().strip().split(',')
    self._flag = flag
    self._transform = transform
    assert len(prop) == 3
    self.num_classes = int(prop[0])
    self.image_size = [int(prop[1]), int(prop[2])]
def load_train_data(data_dir):
    """Open ``train.rec`` in ``data_dir`` and list its image keys.

    Record 0 is read as a header.  When its ``flag`` is positive, image keys
    are ``1 .. label[0]-1``; otherwise every key of the record file is used.

    Args:
        data_dir: Directory containing ``train.rec`` and ``train.idx``.

    Returns:
        Tuple ``(imgrec, imgidx_list)``: the open reader and a list of image
        keys.
    """
    path_imgrec = os.path.join(data_dir, "train.rec")
    path_imgidx = path_imgrec[0:-4] + ".idx"
    print(
        "Loading recordio {}\nCorresponding record idx is {}".format(
            path_imgrec, path_imgidx
        )
    )
    imgrec = recordio.MXIndexedRecordIO(
        path_imgidx, path_imgrec, "r", key_type=int
    )  # TODO: key_type ??

    header0, _ = recordio.unpack(imgrec.read_idx(0))
    if header0.flag > 0:
        identity_key_start = int(header0.label[0])
        identity_key_end = int(header0.label[1])
        imgidx_list = list(range(1, identity_key_start))
        # Read the image-key range of every identity.
        # NOTE(review): id2range is built but neither returned nor used;
        # consider returning it or dropping this loop.
        id2range = {}
        for identity in range(identity_key_start, identity_key_end):
            header, _ = recordio.unpack(imgrec.read_idx(identity))
            id2range[identity] = (int(header.label[0]), int(header.label[1]))
    else:
        # Materialise the keys so both branches return the same type.
        imgidx_list = list(imgrec.keys)
    return imgrec, imgidx_list
def __init__(
        self,
        path_ms1m=lz.share_path2 + 'faces_ms1m_112x112/',
):
    """Open the MS1M record file and load its pre-computed index info.

    Args:
        path_ms1m: Dataset directory containing ``train.rec``, ``train.idx``
            and ``info.pk``.  ``info.pk`` is loaded with
            ``lz.msgpack_load`` and unpacked into ``imgidx``, ``ids`` and
            ``id2range``.
    """
    self.path_ms1m = path_ms1m
    self.root_path = Path(path_ms1m)

    rec_file = path_ms1m + '/train.rec'
    idx_file = rec_file[0:-4] + ".idx"
    assert os.path.exists(idx_file)
    self.imgrec = recordio.MXIndexedRecordIO(idx_file, rec_file, 'r')

    info = lz.msgpack_load(path_ms1m + '/info.pk')
    self.imgidx, self.ids, self.id2range = info
    self.num_classes = len(self.ids)
    self.cur = 0
    self.lock = mp.Lock()
def _fork(self):
    """Open every record file and build the flat index and landmark tables.

    ``self.records`` holds one read-only reader per entry of
    ``self.filenames`` (the ``.idx`` file shares the record file's stem).
    ``self.orig_idx`` maps a running integer to ``(reader position, key)``
    across all readers in order.  When ``self.with_landmark`` is set, the
    pickled ``.landmark`` file next to each record file is loaded into
    ``self.landmark_dicts`` under the reader position.
    """
    readers = []
    for fname in self.filenames:
        idx_name = os.path.splitext(fname)[0] + '.idx'
        readers.append(recordio.MXIndexedRecordIO(idx_name, fname, 'r'))
    self.records = readers

    self.orig_idx = {}
    pairs = (
        (rec_no, key)
        for rec_no, reader in enumerate(self.records)
        for key in reader.keys
    )
    for flat_idx, pair in enumerate(pairs):
        self.orig_idx[flat_idx] = pair

    self.landmark_dicts = {}
    if not self.with_landmark:
        return
    for rec_no, fname in enumerate(self.filenames):
        landmark_file = os.path.splitext(fname)[0] + '.landmark'
        with open(landmark_file, 'rb') as handle:
            self.landmark_dicts[rec_no] = pickle.load(handle)
def extract_asian_celeb_images(args, num_images=2830146):
    """Export the images of a RecordIO file into one directory per label.

    Records ``1 .. num_images`` are decoded and written as
    ``<write_path>/<label>/<n>.jpg`` where ``n`` counts the images already
    written for that label during this run.

    Args:
        args: Object with ``idx_path``, ``rec_path`` and ``write_path``.
        num_images: Number of image records to export.
    """
    imgrec = recordio.MXIndexedRecordIO(args.idx_path, args.rec_path, 'r')
    # Per-label running counter.  Tracking it per label (instead of resetting
    # a single counter only when a directory is first created) keeps file
    # names sequential even when the output directory already exists or
    # labels are not contiguous in the record file.
    next_index = {}
    for key in range(1, num_images + 1):
        header, payload = recordio.unpack(imgrec.read_idx(key))
        label = int(header.label[0])
        img = mx.image.imdecode(payload).asnumpy()
        dst = os.path.join(args.write_path, str(label))
        os.makedirs(dst, exist_ok=True)
        cnt = next_index.get(label, 0)
        # Swap the first and last channel before handing the array to OpenCV
        # (presumably the decoded image is RGB while imwrite expects BGR).
        cv2.imwrite(os.path.join(dst, f'{cnt}.jpg'),
                    cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        next_index[label] = cnt + 1
def _main(args):
    """Extract the first ``args.image_count`` images of a record file.

    Args:
        args: Object with ``output_dir``, ``path_imgidx``, ``path_imgrec``
            and ``image_count``.
    """
    output_dir = args.output_dir
    # exist_ok avoids the check-then-create race and also creates missing
    # parent directories.
    os.makedirs(output_dir, exist_ok=True)
    imgrec = recordio.MXIndexedRecordIO(args.path_imgidx, args.path_imgrec, 'r')
    # Images are extracted sequentially; a thread-pool variant was tried and
    # is currently disabled.
    for i in tqdm(range(args.image_count)):
        extrat_image(recordio, imgrec, output_dir, i)
def dump_mxrec(data_splits):
    """Pack the ``*.pkl`` files of each split into an indexed record file.

    For every split name ``dsp`` the pickles under ``source_root / dsp`` are
    re-serialised and written, keyed by a running sample number, to
    ``<dsp>.rec`` / ``<dsp>.idx`` under ``output_root``.  The sample count is
    stored as JSON in ``<dsp>.meta``.

    Args:
        data_splits: Iterable of split names.
    """
    for dsp in data_splits:
        num_sample = 0
        source_path = source_root / dsp
        output_meta = output_root / f'{dsp}.meta'
        write_record = mxrec.MXIndexedRecordIO(
            str(output_root / f'{dsp}.idx'),
            str(output_root / f'{dsp}.rec'),
            'w',
        )
        # Close the writer even if a pickle fails to load, so the partially
        # written record/index files are flushed and the handle is released.
        try:
            for pkl_path in tqdm(source_path.glob("*.pkl")):
                with pkl_path.open('rb') as pf:
                    # NOTE: pickle.load executes arbitrary code from the
                    # file; only run this on trusted source data.
                    data = pickle.load(pf)
                write_record.write_idx(num_sample, pickle.dumps(data))
                num_sample += 1
            with output_meta.open('w') as f:
                json.dump({'num_sample': num_sample}, f)
        finally:
            write_record.close()
def __init__(self, batch_size, data_shape, path_imgrec=None, shuffle=False,
             aug_list=None, mean=None, rand_mirror=False, cutoff=0,
             color_jittering=0, data_name='data', label_name='softmax_label',
             **kwargs):
    """Set up a face image iterator with 101-wide labels.

    Args:
        batch_size: Number of samples per batch.
        data_shape: Per-sample shape, validated by ``check_data_shape``.
        path_imgrec: Path to the ``.rec`` file (required).
        shuffle: Stored on ``self.shuffle``.
        aug_list: Accepted but not used by this constructor.
        mean: Optional 3-value mean, stored reshaped to ``(1, 1, 3)``.
        rand_mirror: Stored on ``self.rand_mirror``.
        cutoff: Stored on ``self.cutoff``.
        color_jittering: Stored on ``self.color_jittering``.
        data_name: Name used in ``provide_data``.
        label_name: Name used in ``provide_label``.
        **kwargs: Ignored.
    """
    super(FaceImageIter, self).__init__()
    assert path_imgrec
    logging.info('loading recordio %s...', path_imgrec)
    idx_path = path_imgrec[0:-4] + ".idx"
    self.imgrec = recordio.MXIndexedRecordIO(idx_path, path_imgrec, 'r')
    self.seq = list(self.imgrec.keys)
    # Logs the record path and the number of keys.
    logging.info("%s 数据大小:%d", path_imgrec, len(self.seq))

    self.mean = mean
    self.nd_mean = None
    if mean:
        self.mean = np.array(mean, dtype=np.float32).reshape(1, 1, 3)
        self.nd_mean = mx.nd.array(self.mean).reshape((1, 1, 3))

    self.check_data_shape(data_shape)
    self.provide_data = [(data_name, (batch_size, ) + data_shape)]
    self.batch_size = batch_size
    self.data_shape = data_shape
    self.shuffle = shuffle
    self.image_size = '%d,%d' % (data_shape[1], data_shape[2])
    self.rand_mirror = rand_mirror
    # Logs whether random mirroring is enabled.
    logging.info('是否随机翻转图片:%s', rand_mirror)
    self.cutoff = cutoff
    self.color_jittering = color_jittering
    self.CJA = mx.image.ColorJitterAug(0.125, 0.125, 0.125)
    self.provide_label = [(label_name, (batch_size, 101))]
    self.cur = 0
    self.nbatch = 0
    self.is_init = False
def load_ms1m_info():
    """Load the cached MS1M index info and print summary statistics.

    Opens ``train.rec`` under ``lz.share_path2 + 'faces_ms1m_112x112/'``,
    loads ``info.pk`` with ``lz.msgpack_load`` and prints the lengths of its
    three parts followed by ``stat_np`` of the image keys and of the ids.
    """
    self = edict()
    path_ms1m = lz.share_path2 + 'faces_ms1m_112x112/'
    rec_file = path_ms1m + 'train.rec'
    idx_file = rec_file[0:-4] + ".idx"
    self.imgrec = recordio.MXIndexedRecordIO(idx_file, rec_file, 'r')

    imgidx, ids, id2range = lz.msgpack_load(path_ms1m + '/info.pk')
    print(len(imgidx), len(ids), len(id2range))

    imgidx = np.array(imgidx)
    ids = np.array(ids)
    print(stat_np(imgidx))
    print(stat_np(ids))
def ms1m_gen(batch_size):
    """Yield endless batches of (anchor, positive, negative) image triplets.

    ``ms1m_list.npy`` is loaded as a dict mapping a key to a collection of
    record indices.  On every pass the keys are visited in random order;
    each key with more than one image contributes two of its images (anchor
    and positive) and one image from the key a random cyclic offset away
    (negative).  Images are decoded and divided by 255.

    Args:
        batch_size: Number of triplets per batch.

    Yields:
        Tuple ``(x, y)``: ``x`` stacks all anchors, then all positives, then
        all negatives (``3 * batch_size`` images); ``y`` is a zero-filled
        placeholder of shape ``(3 * batch_size, 1)``.
    """
    path_idx = "./data/faces_emore/train.idx"
    path_rec = "./data/faces_emore/train.rec"
    imgrec = recordio.MXIndexedRecordIO(path_idx, path_rec, 'r')
    ms1m_list = np.load("ms1m_list.npy", allow_pickle=True).item()
    keys = list(ms1m_list.keys())
    sz = len(ms1m_list)
    pp = np.arange(sz)
    k = np.arange(sz)
    random.seed(a=None)

    def _load(record_idx):
        # Decode one record and scale the pixel values by 1/255.
        _, payload = recordio.unpack(imgrec.read_idx(int(record_idx)))
        return mx.image.imdecode(payload).asnumpy() / 255

    anc, pos, neg = [], [], []
    while True:
        # A non-zero cyclic offset pairs every key with a different key.
        t = random.randint(1, sz - 1)
        random.shuffle(k)
        p = (pp + t) % sz
        for h in k:
            same_imgs = ms1m_list[keys[h]]
            if len(same_imgs) <= 1:
                continue
            i_p = random.sample(same_imgs, 2)
            i_n = random.sample(ms1m_list[keys[p[h]]], 1)
            anc.append(_load(i_p[0]))
            pos.append(_load(i_p[1]))
            neg.append(_load(i_n[0]))
            if len(anc) == batch_size:
                x = np.array(anc + pos + neg)
                # Zeros rather than np.ndarray(shape=...): the latter exposes
                # uninitialised memory (possibly NaN/inf) as labels.
                y = np.zeros(shape=(batch_size * 3, 1))
                yield x, y
                anc.clear()
                pos.clear()
                neg.clear()
def __init__(self):
    """Index the record file and renumber its identities from zero.

    Record 0 must have a positive ``flag``; its first two label values give
    the key range of the identity records, each of which holds the ``[a, b)``
    key range of that identity's images.  After construction:

    * ``self.imgidx`` lists every image key,
    * ``self.ids`` is the array of zero-based identity numbers,
    * ``self.ids_map`` maps (identity key - first identity key) to the
      zero-based number,
    * ``self.id2range`` maps the zero-based number to its ``(a, b)`` range,
    * ``self.idx2id`` maps every image key to its zero-based number.

    ``conf.num_clss`` is set to the number of identities.
    """
    from mxnet import recordio

    self.imgrec = recordio.MXIndexedRecordIO(base + "train.idx", rec_files[0], 'r')
    header, _ = recordio.unpack(self.imgrec.read_idx(0))
    assert header.flag > 0, 'ms1m or glint ...'
    logging.info(f'header0 label {header.label}')
    first_key, end_key = int(header.label[0]), int(header.label[1])
    self.header0 = (first_key, end_key)

    self.id2range = {}
    self.idx2id = {}
    self.imgidx = []
    self.ids = []
    ids_shif = first_key
    for identity in range(first_key, end_key):
        id_header, _ = recordio.unpack(self.imgrec.read_idx(identity))
        lo, hi = int(id_header.label[0]), int(id_header.label[1])
        self.id2range[identity] = (lo, hi)
        self.ids.append(identity)
        self.imgidx += list(range(lo, hi))

    self.ids = np.asarray(self.ids)
    self.num_classes = len(self.ids)

    # Position of each identity in self.ids, keyed two ways: by the key
    # shifted to start at zero, and by the raw identity key.
    self.ids_map = {
        identity - ids_shif: pos for pos, identity in enumerate(self.ids)
    }
    raw_to_pos = {identity: pos for pos, identity in enumerate(self.ids)}

    self.ids = np.asarray([raw_to_pos[raw_id] for raw_id in self.ids])
    self.id2range = {
        raw_to_pos[raw_id]: key_range
        for raw_id, key_range in self.id2range.items()
    }
    for new_id, (lo, hi) in self.id2range.items():
        for img_key in range(lo, hi):
            self.idx2id[img_key] = new_id
    conf.num_clss = self.num_classes
def ms1m_gen_batch(batch_size, sample_size):
    """Yield endless batches of images drawn from randomly chosen keys.

    ``ms1m_list.npy`` is loaded as a dict mapping a key to a collection of
    record indices.  Each batch picks ``batch_size`` distinct keys and, for
    each, up to ``sample_size`` of its images (all of them when it has
    fewer).  Images are decoded and divided by 255.

    Args:
        batch_size: Number of keys sampled per batch.
        sample_size: Maximum number of images taken per key.

    Yields:
        Tuple ``(x, y)`` of arrays: the images and, per image, the key it
        was drawn from.
    """
    path_idx = "./data/faces_emore/train.idx"
    path_rec = "./data/faces_emore/train.rec"
    imgrec = recordio.MXIndexedRecordIO(path_idx, path_rec, 'r')
    ms1m_list = np.load("ms1m_list.npy", allow_pickle=True).item()
    keys = list(ms1m_list.keys())
    random.seed(a=None)
    while True:
        images, labels = [], []
        for person in random.sample(keys, batch_size):
            candidates = ms1m_list[person]
            take = min(sample_size, len(candidates))
            for src in random.sample(candidates, take):
                _, payload = recordio.unpack(imgrec.read_idx(int(src)))
                images.append(mx.image.imdecode(payload).asnumpy() / 255)
                labels.append(person)
        yield np.array(images), np.array(labels)
def push_record(self, recname):
    """Append every identity of another record file to ``self.recout``.

    Record 0 of the source gives the key range of its identity records; each
    identity record gives the key range of its images.  Every image is
    re-packed with a fresh header whose label is the running identity number
    ``self.idnum`` and written under the running key ``self.idx``.  For each
    identity a list of the key before its first image followed by the key
    after each written image is appended to ``self.ID_idx``.

    Args:
        recname: Path prefix of the source ``.idx`` / ``.rec`` pair.
    """
    print('Pushing file: %s ...'%recname)
    source = recordio.MXIndexedRecordIO(recname+'.idx', recname+'.rec', 'r')
    root_header, _ = recordio.unpack(source.read_idx(0))
    first_id, end_id = int(root_header.label[0]), int(root_header.label[1])
    for identity_key in tqdm(range(first_id, end_id)):
        id_header, _ = recordio.unpack(source.read_idx(identity_key))
        img_start, img_end = int(id_header.label[0]), int(id_header.label[1])
        boundaries = [self.idx]
        for img_key in range(img_start, img_end):
            # Keep the payload, replace the header with the new label.
            _, payload = recordio.unpack(source.read_idx(img_key))
            new_header = mx.recordio.IRHeader(0, float(self.idnum), 0, 0)
            self.recout.write_idx(self.idx, recordio.pack(new_header, payload))
            self.idx += 1
            boundaries.append(self.idx)
        self.ID_idx.append(boundaries)
        self.idnum += 1
def get_msceleb_images(records_dir):
    """Export the images of ``train.rec`` into per-identity directories.

    Record 0 is read by key; its first label value minus one is taken as the
    number of images.  That many records are then read sequentially, decoded
    and saved as ``images/identity_<label>/image_<i>.jpg`` under
    ``records_dir``.

    Args:
        records_dir: Directory containing ``train.idx`` and ``train.rec``.
    """
    idx_file = os.path.join(records_dir, "train.idx")
    rec_file = os.path.join(records_dir, "train.rec")
    images_dir = os.path.join(records_dir, "images")

    reader = recordio.MXIndexedRecordIO(idx_file, rec_file, 'r')
    root_header, _ = recordio.unpack(reader.read_idx(0))
    tot_images = int(root_header.label[0]) - 1
    print("Total images", tot_images)

    for i in range(tot_images):
        print("Reading ", i)
        # Sequential read: continues from wherever the reader currently is.
        header, payload = recordio.unpack(reader.read())
        pixels = mx.image.imdecode(payload).asnumpy()
        label = int(header.label)
        picture = Image.fromarray(np.uint8(pixels), "RGB")
        images_subdir = os.path.join(images_dir, "identity_%d" % label)
        if not os.path.exists(images_subdir):
            os.makedirs(images_subdir)
        picture.save(os.path.join(images_subdir, "image_%d.jpg" % i))
def push_mxrecord(self, recname):
    """Append the image payloads of an MXNet record file to ``self.ioout``.

    Record 0 of the source gives the key range of its identity records; each
    identity record gives the key range of its images.  Every image payload
    (without its header) is written under the running key ``self.pos``.  For
    each identity a list of the position before its first image followed by
    the position after each written image is appended to
    ``self.class_metas``.

    Args:
        recname: Path prefix of the source ``.idx`` / ``.rec`` pair.
    """
    from mxnet import recordio

    print('Pushing mxrec:', recname)
    source = recordio.MXIndexedRecordIO(recname + '.idx', recname + '.rec', 'r')
    root_header, _ = recordio.unpack(source.read_idx(0))
    first_id, end_id = int(root_header.label[0]), int(root_header.label[1])
    print('datalen', end_id - first_id)

    for identity_key in tqdm(range(first_id, end_id)):
        id_header, _ = recordio.unpack(source.read_idx(identity_key))
        img_start, img_end = int(id_header.label[0]), int(id_header.label[1])
        positions = [self.pos]
        for img_key in range(img_start, img_end):
            _, payload = recordio.unpack(source.read_idx(img_key))
            self.ioout.write_idx(self.pos, payload)
            self.pos += 1
            positions.append(self.pos)
        self.class_metas.append(positions)
def __init__(self, data_shape, path_imgrec, transform=None):
    """Open an indexed RecordIO face dataset.

    Record 0 is read as a header.  When its ``flag`` is positive, image keys
    are ``1 .. label[0]-1``; otherwise every key of the record file is used.

    Args:
        data_shape: Stored on ``self.data_shape``.
        path_imgrec: Path to the ``.rec`` file; the ``.idx`` path is derived
            by replacing the last four characters.
        transform: Transform to store on ``self.transform``.  When ``None``,
            a random left-right flip followed by ``ToTensor`` is used.
    """
    super(FaceDataset, self).__init__()
    logging.info('loading recordio %s...', path_imgrec)
    path_imgidx = path_imgrec[0:-4] + ".idx"
    self.imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')  # pylint: disable=redefined-variable-type
    header, _ = recordio.unpack(self.imgrec.read_idx(0))
    if header.flag > 0:
        print('header0 label', header.label)
        self.header0 = (int(header.label[0]), int(header.label[1]))
        self.imgidx = list(range(1, int(header.label[0])))
    else:
        self.imgidx = list(self.imgrec.keys)
    self.seq = self.imgidx
    self.data_shape = data_shape
    # Honour a caller-supplied transform; previously the argument was
    # accepted but always overwritten by the default pipeline.
    if transform is None:
        transform = transforms.Compose([
            transforms.RandomFlipLeftRight(),
            transforms.ToTensor()
        ])
    self.transform = transform
def __init__(self, batch_size, data_shape, path_imgrec=None, label_width=None,
             shuffle=False, mean=None, rand_mirror=False, cutoff=0,
             color_jittering=0, data_name='data', label_name='softmax_label',
             **kwargs):
    """Set up a face image iterator with a configurable label width.

    Args:
        batch_size: Number of samples per batch.
        data_shape: Per-sample shape, validated by ``check_data_shape``.
        path_imgrec: Path to the ``.rec`` file (required).
        label_width: Width of each label row (required, must be truthy).
        shuffle: Stored on ``self.shuffle``.
        mean: Optional 3-value mean, stored reshaped to ``(1, 1, 3)``.
        rand_mirror: Stored on ``self.rand_mirror``.
        cutoff: Stored on ``self.cutoff``.
        color_jittering: Stored on ``self.color_jittering``.
        data_name: Name used in ``provide_data``.
        label_name: Name used in ``provide_label``.
        **kwargs: Ignored.
    """
    super(FaceImageIter, self).__init__()
    assert path_imgrec
    assert label_width
    logging.info('loading recordio %s...', path_imgrec)
    idx_path = path_imgrec[0:-4]+".idx"
    self.imgrec = recordio.MXIndexedRecordIO(idx_path, path_imgrec, 'r')  # pylint: disable=redefined-variable-type
    # Record 0 is read and unpacked, but its header is not used further.
    first_record = self.imgrec.read_idx(0)
    recordio.unpack(first_record)
    self.imgidx = list(self.imgrec.keys)
    self.seq = self.imgidx
    self.labelWidth = label_width

    self.mean = mean
    self.nd_mean = None
    if mean:
        self.mean = np.array(mean, dtype=np.float32).reshape(1,1,3)
        self.nd_mean = mx.nd.array(self.mean).reshape((1,1,3))

    self.check_data_shape(data_shape)
    self.provide_data = [(data_name, (batch_size,) + data_shape)]
    self.batch_size = batch_size
    self.data_shape = data_shape
    self.shuffle = shuffle
    self.image_size = '%d,%d'%(data_shape[1],data_shape[2])
    self.rand_mirror = rand_mirror
    logging.info('rand_mirror: %d', rand_mirror)
    self.cutoff = cutoff
    self.color_jittering = color_jittering
    self.CJA = mx.image.ColorJitterAug(0.125, 0.125, 0.125)
    self.provide_label = [(label_name, (batch_size, label_width))]
    self.cur = 0
    self.nbatch = 0
    self.is_init = False
def extract_ms1m_info():
    """Scan the MS1M record file and cache its index info in ``info.pk``.

    Record 0 gives the key range of the identity records; each identity
    record gives the ``(a, b)`` key range of its images.  The dump written
    with ``lz.msgpack_dump`` is a list of: the image keys
    ``1 .. label[0]-1``, the identity keys shifted by a fixed offset, and a
    dict from shifted identity key to ``(a, b)``.
    """
    # Fixed shift applied to identity keys.
    # NOTE(review): presumably the first identity key of this dataset
    # (header.label[0]) -- confirm before reusing on another record file.
    id_offset = 3804847

    self = edict()
    path_ms1m = lz.share_path2 + 'faces_ms1m_112x112/'
    path_imgrec = lz.share_path2 + 'faces_ms1m_112x112/train.rec'
    path_imgidx = path_imgrec[0:-4] + ".idx"
    self.imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')
    header, _ = recordio.unpack(self.imgrec.read_idx(0))
    self.header0 = (int(header.label[0]), int(header.label[1]))
    self.imgidx = list(range(1, int(header.label[0])))
    id2range = dict()
    self.seq_identity = list(range(int(header.label[0]), int(header.label[1])))
    for identity in self.seq_identity:
        id_header, _ = recordio.unpack(self.imgrec.read_idx(identity))
        id2range[(identity - id_offset)] = (int(id_header.label[0]),
                                            int(id_header.label[1]))
    self.seq = self.imgidx
    self.seq_identity = [int(t) - id_offset for t in self.seq_identity]
    lz.msgpack_dump([self.imgidx, self.seq_identity, id2range],
                    path_ms1m + '/info.pk')
#pre_process.py import os import pickle import cv2 as cv import mxnet as mx from mxnet import recordio from tqdm import tqdm from config import path_imgidx, path_imgrec, IMG_DIR, pickle_file from utils import ensure_folder if __name__ == "__main__": ensure_folder(IMG_DIR) imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') print(len(imgrec)) samples = [] class_ids = set() # %% 1 ~ 5179510 for i in tqdm(range(5179510)): print(i) try: header, s = recordio.unpack(imgrec.read_idx(i + 1)) img = mx.image.imdecode(s).asnumpy() print(img.shape) img = cv.cvtColor(img, cv.COLOR_RGB2BGR) print(header.label) print(type(header.label))