def gen(self, ham, itr=1000, nbasis=1000, burn=500, thin=1):
    _, state = ground(ham)
    n = ham.lattice.numel()
    try:
        # os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    data = []
    basis = []
    for _ in range(nbasis):
        ops = MBOp(choice([sigmax, sigmay, sigmaz]) for i in range(n))
        _state = ops.invtrans(state)
        sampler = STMetropolis(proposal=lambda x: abs(_state[bin(x)])**2,
                               size=ham.size)
        data.extend(sampler.sample(itr=itr, burn=burn, thin=thin))
        basis.extend(
            torch.FloatTensor(ops.params()).resize_(2 * n)
            for i in range(len(sampler.collector)))
    return data, basis
def print_instances_class_histogram(dataset_dicts, class_names):
    """
    Args:
        dataset_dicts (list[dict]): list of dataset dicts.
        class_names (list[str]): list of class names (zero-indexed).
    """
    num_classes = len(class_names)
    hist_bins = np.arange(num_classes + 1)
    histogram = np.zeros((num_classes,), dtype=int)  # np.int is deprecated; use builtin int
    for entry in dataset_dicts:
        annos = entry["annotations"]
        classes = [x["category_id"] for x in annos if not x.get("iscrowd", 0)]
        histogram += np.histogram(classes, bins=hist_bins)[0]

    N_COLS = min(6, len(class_names) * 2)

    def short_name(x):
        # make long class names shorter. useful for lvis
        if len(x) > 13:
            return x[:11] + ".."
        return x

    data = list(
        itertools.chain(*[[short_name(class_names[i]), int(v)]
                          for i, v in enumerate(histogram)]))
    total_num_instances = sum(data[1::2])
    data.extend([None] * (N_COLS - (len(data) % N_COLS)))
    if num_classes > 1:
        data.extend(["total", total_num_instances])
    # materialize the rows: zip_longest returns an iterator, which pformat
    # would otherwise print as a generator object rather than the table
    data = list(itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]))
    logging.info(
        "Distribution of instances among all {} categories:\n".format(num_classes))
    logging.info(pformat(data))
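# Illustrative sketch (not part of the original source): how the N_COLS reflow
# above turns a flat [name, count, name, count, ...] list into table rows via
# itertools.zip_longest. The names and counts here are made up.
import itertools

flat = ["cat", 3, "dog", 5, "bird", 2]
N_COLS = 4
flat.extend([None] * (N_COLS - (len(flat) % N_COLS)))  # pad to a multiple of N_COLS
rows = list(itertools.zip_longest(*[flat[i::N_COLS] for i in range(N_COLS)]))
# rows == [('cat', 3, 'dog', 5), ('bird', 2, None, None)]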
def parse_annotation(ICDAR_path, id, len_max):
    # print(id)
    texts = list()
    labels = list()
    f_task1 = open(
        os.path.join(ICDAR_path, 'task3-test(347p)/task3-test(347p)/' + id))
    f_img = os.path.join(
        ICDAR_path,
        'task3-test(347p)/task3-test(347p)/' + id.strip('txt') + 'jpg')
    im = Image.open(f_img)
    im_width = im.size[0]
    im_height = im.size[1]
    # In ICDAR case, the first line is our ROI coordinate (xmin, ymin)
    line_txt = f_task1.readline()
    coor = line_txt.split(',')
    ROI_x = int(coor[0].strip('\''))
    ROI_y = int(coor[1].strip('\''))
    final_total = 0
    while line_txt:
        line_txt = f_task1.readline()
        coor = line_txt.split(',')
        # print(coor)
        if coor[0] not in ('"\r\n', '"\n', '', '\r\n'):
            xmin = float(int(coor[0].strip('\'')) - ROI_x) / float(im_width)
            ymin = float(int(coor[1].strip('\'')) - ROI_y) / float(im_height)
            xmax = float(int(coor[4].strip('\'')) - ROI_x) / float(im_width)
            ymax = float(int(coor[5].strip('\'')) - ROI_y) / float(im_height)
            text = coor[8:]
            # 'ori_text' retains special signs which block the following
            # comparison but are useful in encoding
            ori_text = copy.deepcopy(text)
            ori_text = ','.join(ori_text)
            ori_text = list(ori_text)
            text_ascii = [ord(c) for c in ori_text]
            data = [xmin, ymin, xmax, ymax]
            data.extend(text_ascii)
            data_pad = [0] * len_max
            if len(data) <= len_max:
                data_pad[:len(data)] = data
            else:
                data_pad = data[:len_max]
            # print(label)
            texts.append(data_pad)
    return {'texts': texts}, [ROI_x, ROI_y, im_width, im_height]
def print_instances_class_histogram(dataset_dicts, class_names):
    """
    Args:
        dataset_dicts (list[dict]): list of dataset dicts.
        class_names (list[str]): list of class names (zero-indexed).
    """
    logger = logging.getLogger(__name__)
    logger.info("Build instances class histogram")
    num_classes = len(class_names)
    hist_bins = np.arange(num_classes + 1)
    histogram = np.zeros((num_classes, ), dtype=int)  # np.int is deprecated
    for entry in tqdm(dataset_dicts):
        annos = entry["annotations"]
        classes = np.asarray([x["category_id"] for x in annos], dtype=int)
        if len(classes):
            assert classes.min() >= 0, \
                f"Got an invalid category_id={classes.min()}"
            assert classes.max() < num_classes, \
                f"Got an invalid category_id={classes.max()} " \
                f"for a dataset of {num_classes} classes"
        histogram += np.histogram(classes, bins=hist_bins)[0]

    N_COLS = min(6, len(class_names) * 2)

    def short_name(x):
        # make long class names shorter. useful for lvis
        if len(x) > 13:
            return x[:11] + ".."
        return x

    data = list(
        itertools.chain(*[[short_name(class_names[i]), int(v)]
                          for i, v in enumerate(histogram)]))
    total_num_instances = sum(data[1::2])
    data.extend([None] * (N_COLS - (len(data) % N_COLS)))
    if num_classes > 1:
        data.extend(["total", total_num_instances])
    data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
    # TODO: Too many classes expected. Need to summarize histogram
    table = tabulate(
        data,
        headers=["category", "#instances"] * (N_COLS // 2),
        tablefmt="pipe",
        numalign="left",
        stralign="center",
    )
    log_first_n(
        logging.INFO,
        "Distribution of instances among all {} categories:\n".format(num_classes)
        + colored(table, "cyan"),
        key="message",
    )
def build_data(self, textdata: TextData):
    data = []
    for n in range(len(textdata.sentences)):
        indexes = textdata.get_sentence_indexes(n)
        ngram_position_range = range(-self.context_size, len(indexes))
        ngrams = [
            self.get_ngram_tensors_at_position(position, indexes)
            for position in ngram_position_range
        ]
        data.extend(ngrams)
    return data
def print_instances_class_histogram(dataset_dicts, class_names):
    """
    Args:
        dataset_dicts (list[dict]): list of dataset dicts.
        class_names (list[str]): list of class names (zero-indexed).
    """
    num_classes = len(class_names)
    hist_bins = np.arange(num_classes + 1)  # bin edges 0..num_classes
    histogram = np.zeros((num_classes, ), dtype=int)  # np.int is deprecated
    for entry in dataset_dicts:
        annos = entry["annotations"]
        # only annotations with iscrowd == 0 count
        classes = [x["category_id"] for x in annos if not x.get("iscrowd", 0)]
        # np.histogram(a, bins, range=...): a is the array to count; bins is
        # either an int (then range must be given) or a list of bin edges.
        # Returns (hist, bin_edges); only the counts are needed here.
        histogram += np.histogram(classes, bins=hist_bins)[0]

    N_COLS = min(6, len(class_names) * 2)  # number of table columns

    def short_name(x):
        # make long class names shorter. useful for lvis
        if len(x) > 13:
            return x[:11] + ".."
        return x

    # data = [name1, count1, name2, count2, ...]
    data = list(
        itertools.chain(*[[short_name(class_names[i]), int(v)]
                          for i, v in enumerate(histogram)]))
    # data[1::2] slices from index 1 to the end with step 2, i.e. the counts
    total_num_instances = sum(data[1::2])
    data.extend([None] * (N_COLS - (len(data) % N_COLS)))  # pad to a multiple of N_COLS
    if num_classes > 1:
        data.extend(["total", total_num_instances])
    # data[i::N_COLS] yields N_COLS interleaved column slices; zip_longest
    # works like zip but runs to the longest input, filling gaps with None,
    # which rearranges the flat list into table rows
    data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
    table = tabulate(
        data,
        headers=["category", "#instances"] * (N_COLS // 2),
        tablefmt="pipe",
        numalign="left",
        stralign="center",
    )
    log_first_n(
        logging.INFO,
        "Distribution of instances among all {} categories:\n".format(num_classes)
        + colored(table, "cyan"),
        key="message",
    )
def build_vocab(self, min_freq=0, max_freq=sys.maxsize):
    """
    Build the vocabulary (plus <pad>/<eos> special tokens) from the
    training text so sentences can be encoded as token ids.
    """
    with open(os.path.join(self.data_dir, 'train.txt'), 'r') as fn:
        data = fn.readlines()
    if 'lambada' in self.data_dir:
        with open(os.path.join(self.data_dir, 'test.txt'), 'r') as fn:
            data.extend(fn.readlines())
        with open(os.path.join(self.data_dir, 'valid.txt'), 'r') as fn:
            data.extend(fn.readlines())

    print('building vocab ...')
    self.vocab = defaultdict(int)
    self.tok2id = {}
    self.id2tok = []
    for line in tqdm(data):
        line = line.strip().split()
        for tok in line:
            self.vocab[tok] += 1
    self.vocab = {
        a: self.vocab[a]
        for a in self.vocab
        if self.vocab[a] >= min_freq and self.vocab[a] <= max_freq
    }
    # sort vocab by frequency in case adaptive softmax is used
    self.vocab = list(
        sorted(self.vocab.items(), key=lambda a: a[1], reverse=True))
    print(self.vocab[:10])
    if 'lambada' in self.data_dir:
        self.vocab = self.vocab[:60000]
        self.vocab.append(('<unk>', 0))
    self.id2tok = ['<pad>'] + ['<eos>'] + [a[0] for a in self.vocab]
    self.tok2id = {a: i for i, a in enumerate(self.id2tok)}
    self.vocab_size = len(self.id2tok)
    print('end building vocab ...')
    print('vocab size', len(self.tok2id))
    with open(os.path.join(self.data_dir, 'vocab.pkl'), 'wb') as fn:
        pickle.dump(
            {
                'id2tok': self.id2tok,
                'tok2id': self.tok2id,
                'vocab_size': self.vocab_size
            }, fn)
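# Illustrative sketch (not from the original source): reloading the vocab.pkl
# written above and encoding a sentence. '<unk>' only exists for the lambada
# variant, hence the guarded fallback; the directory below is a placeholder.
import os
import pickle

def load_vocab(data_dir):
    with open(os.path.join(data_dir, 'vocab.pkl'), 'rb') as fn:
        v = pickle.load(fn)
    return v['tok2id'], v['id2tok'], v['vocab_size']

# tok2id, id2tok, vocab_size = load_vocab('data/lambada')
# unk = tok2id.get('<unk>')
# ids = [tok2id.get(tok, unk) for tok in 'hello world'.split()] + [tok2id['<eos>']]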
def __getitem__(self, index):
    assert index < self._dataset_size
    # if self._is_for_train:  # random
    #     index = random.randint(0, self._dataset_size - 1)

    # get sample data
    data = self._data[index] / 1000
    data = list(data)
    data_reverse = copy.deepcopy(data)
    data_reverse.reverse()

    filt_rri1 = list(moving_average(RRi(data), order=1))
    filt_rri2 = list(moving_average(RRi(data), order=2))
    filt_rri3 = list(moving_average(RRi(data), order=3))
    filt_rri1_reverse = copy.deepcopy(filt_rri1)
    filt_rri2_reverse = copy.deepcopy(filt_rri2)
    filt_rri3_reverse = copy.deepcopy(filt_rri3)
    filt_rri1_reverse.reverse()
    filt_rri2_reverse.reverse()
    filt_rri3_reverse.reverse()
    order_data = [filt_rri1, filt_rri2, filt_rri3]
    order_data_reverse = [
        filt_rri1_reverse, filt_rri2_reverse, filt_rri3_reverse
    ]

    label = int(self._label[index])
    subject = self._subject[index]
    mean = self._mean_train[index]
    sdnn = self._sdnn_train[index]
    pnn50 = self._pnn50_train[index]
    rmssd = self._rmssd_train[index]
    lnrmssd = self._lnrmssd_train[index]
    vlf = self._vlf_train[index]
    lf = self._lf_train[index]
    hf = self._hf_train[index]
    rlh = self._rlh_train[index]
    features = list(np.stack((mean, sdnn, pnn50, rmssd, lnrmssd,
                              vlf, lf, hf, rlh)))

    # pad or truncate the RR series to a fixed length of 512
    makeup_length = 512 - len(data)
    if len(data) > 512:
        data = data[:512]
    else:
        data.extend(0 for _ in range(makeup_length))
    return data, data_reverse, order_data, order_data_reverse, label, subject, features
def print_instances_class_histogram(dataset_dicts, class_names,
                                    attribute='annotations'):
    """
    Args:
        dataset_dicts (list[dict]): list of data dicts.
        class_names (list[str]): list of class names (zero-indexed).
    """
    num_classes = len(class_names)
    hist_bins = np.arange(num_classes + 1)
    histogram = np.zeros((num_classes, ), dtype=int)  # np.int is deprecated
    for entry in dataset_dicts:
        classes = []
        annos = entry[attribute]
        if attribute == 'annotations':
            classes = [
                x["category_id"] for x in annos if not x.get("iscrowd", 0)
            ]
        elif attribute == 'annotations2':
            classes = [x["category2_id"] for x in annos]
        histogram += np.histogram(classes, bins=hist_bins)[0]

    N_COLS = min(6, len(class_names) * 2)

    def short_name(x):
        # make long class names shorter. useful for lvis
        if len(x) > 13:
            return x[:11] + ".."
        return x

    data = list(
        itertools.chain(*[[short_name(class_names[i]), int(v)]
                          for i, v in enumerate(histogram)]))
    total_num_instances = sum(data[1::2])
    data.extend([None] * (N_COLS - (len(data) % N_COLS)))
    if num_classes > 1:
        data.extend(["total", total_num_instances])
    data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
    table = tabulate(
        data,
        headers=["category", "#instances"] * (N_COLS // 2),
        tablefmt="pipe",
        numalign="left",
        stralign="center",
    )
    log_first_n(
        logging.INFO,
        "Distribution of instances among all {} categories:\n".format(num_classes)
        + colored(table, "cyan"),
        key="message",
    )
def generate(self):
    data = []
    basis = []
    for _ in range(self.nbasis):
        ops = MBOp(choice([sigmax, sigmay, sigmaz]) for i in range(self.n))
        _state = ops.invtrans(self.state)
        sampler = STMetropolis(proposal=lambda x: abs(_state[bin(x)])**2,
                               size=self.size)
        data.extend(
            sampler.sample(itr=self.itr, burn=self.burn, thin=self.thin))
        basis.extend(
            torch.FloatTensor(ops.params()).resize_(2 * self.n)
            for i in range(len(sampler.collector)))
    return data, basis
def load_data(self):
    data = []
    for idx, model_name in enumerate(os.listdir(self.root)):
        model_path = os.path.join(self.root, model_name)
        if 'task_' + str(self.task) in model_path:
            log('\nExtracting weights from %s' % model_path)
            model_weights = torch.load(model_path)['model_state_dict']
            vec = self.kernels_to_vector(model_weights)
            data.extend([vec])
    path = os.path.join(self.root, '../pickles/dataset.pkl')
    torch.save(data, path)
    return data
def __init__(self, configs):
    split = configs['split']
    root = configs['root']
    self.transform = configs['transform']
    self.split = split
    data_dict = torch.load(
        os.path.join(root, 'data', 'cifar10', split + '.pth'))
    labels = []
    data = []
    for label, data_list in data_dict.items():
        n_samples = len(data_list)
        labels.extend([label] * n_samples)
        data.extend(data_list)
    print('Loaded %d data, %d labels' % (len(labels), len(data)))
    self.data = np.concatenate([x.reshape(1, -1) for x in data])
    print('Concatenated shape:', self.data.shape)
    self.data = self.data.reshape((-1, 3, 32, 32))
    self.data = self.data.transpose((0, 2, 3, 1))  # convert to HWC
    self.labels = labels
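# Usage sketch (assumption, not from the original source): the configs dict
# this loader expects. The class name, root path, and transform are
# placeholders.
# configs = {
#     'split': 'train',            # loads <root>/data/cifar10/train.pth
#     'root': '/path/to/dataset',
#     'transform': None,
# }
# dataset = Cifar10Dataset(configs)  # hypothetical class name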
def get_cafe_data():
    '''
    Returns (data, labels) where data is an n x d tensor and labels is a
    length-n tensor (one label per image).
    '''
    images = load_cafe()
    # Build data and label tensors
    data = []
    labels = []
    for i, exprList in enumerate(images):
        data.extend([transform(image).reshape(-1) for image in exprList])
        labels.extend([i] * len(exprList))
    labels = torch.tensor(labels)
    data = torch.stack(data)
    return data, labels
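# Usage sketch (assumption): load_cafe() and transform come from the
# surrounding module; this only shows the shapes the function returns.
# data, labels = get_cafe_data()
# print(data.shape)    # torch.Size([n, d])
# print(labels.shape)  # torch.Size([n])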
def receive(client_socket, size):
    """
    Receive data from a specific client socket.

    Args:
        client_socket: socket object
        size: number of floats to receive (8 bytes each)

    Returns:
        received data, decoded into floats.
    """
    n = 8 * size
    data = bytearray()
    while len(data) < n:
        packet = client_socket.recv(n - len(data))
        if not packet:
            # recv() returning b'' means the peer closed the connection;
            # bail out instead of busy-looping forever
            raise ConnectionError('socket closed after %d of %d bytes'
                                  % (len(data), n))
        data.extend(packet)
    assert n == len(data)
    res = decode_float_array(data, size)
    return res
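# Complementary sketch (assumption, not from the original source): a sender
# matching receive()'s framing of 8 bytes per value, i.e. float64. The
# little-endian '<d' layout is a guess; the real decode_float_array may use a
# different byte order.
import struct

def send_float_array(client_socket, values):
    payload = struct.pack('<%dd' % len(values), *values)
    client_socket.sendall(payload)  # 8 * len(values) bytes on the wire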
def my_collate(batch, frame_size, fixed_frame_num):
    '''
    Break an utterance into multiple frames. The last frame is dropped
    if len(frame) < frame_size.
    '''
    data, target = [], []
    for (mat, label) in batch:
        data_shape = np.shape(mat)
        # print("mfcc mat shape:", data_shape, "label:", label)
        if mat.shape[0] >= frame_size:
            mul_mat, mul_label = breakIntoFrames(mat, label, frame_size,
                                                 fixed_frame_num)
            data.extend(mul_mat)
            target.extend(mul_label)
            # print(" mul_mat:", len(mul_mat), " mul_label:", len(mul_label))
    # print("total data len:", len(data), "total target len:", len(target))
    data = torch.LongTensor(data)
    target = torch.LongTensor(target)
    return [data, target]
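# Usage sketch (assumption): binding the extra arguments with functools.partial
# so the function fits DataLoader's collate_fn signature; the frame sizes are
# illustrative.
from functools import partial
from torch.utils.data import DataLoader

# loader = DataLoader(dataset, batch_size=8,
#                     collate_fn=partial(my_collate, frame_size=100,
#                                        fixed_frame_num=10))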
def load_json(json_path,
              x='feat_length',
              y='token_length',
              x_range=(1, 9999),
              y_range=(1, 999),
              rate=(1, 99)):
    try:
        # json_path is a single file
        with open(json_path) as f:
            data = json.load(f)
    except (OSError, ValueError):
        # json_path is a dir containing *.json files
        data = []
        for dir, _, fs in os.walk(json_path):  # walk every directory under json_path
            for f in fs:
                if f.endswith('.json'):  # keep only ".json" files
                    filename = os.path.join(dir, f)
                    print('loading json file :', filename)
                    with open(filename) as f:
                        add = json.load(f)
                        data.extend(add)
                        print('loaded {} samples'.format(len(add)))

    # filter out samples whose lengths or length ratio fall outside the ranges
    list_to_pop = []
    for i, sample in enumerate(data):
        len_x = sample[x]
        len_y = sample[y]
        if not (x_range[0] <= len_x <= x_range[1]) or \
           not (y_range[0] <= len_y <= y_range[1]) or \
           not (rate[0] <= (len_x / len_y) <= rate[1]):
            list_to_pop.append(i)
    print('filtered {}/{} samples\n'.format(len(list_to_pop), len(data)))
    # pop from the back so earlier indices stay valid
    list_to_pop.reverse()
    for i in list_to_pop:
        data.pop(i)
    return data
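# Usage sketch (assumption): keep samples with 10-2000 feature frames and a
# frame/token ratio between 4 and 50. The path is a placeholder.
# data = load_json('exp/json_dir', x='feat_length', y='token_length',
#                  x_range=(10, 2000), rate=(4, 50))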
def __init__(self,
             json_path,
             reverse=False,
             feat_range=(1, 99999),
             label_range=(1, 100),
             rate_in_out=(4, 999)):
    try:
        # json_path is a single file
        with open(json_path) as f:
            data = json.load(f)
    except (OSError, ValueError):
        # json_path is a dir containing *.json files
        data = []
        for dir, _, fs in os.walk(json_path):  # walk every directory under json_path
            for f in fs:
                if f.endswith('.json'):  # keep only ".json" files
                    filename = os.path.join(dir, f)
                    print('loading json file :', filename)
                    with open(filename) as f:
                        add = json.load(f)
                        data.extend(add)
                        print('loaded {} samples'.format(len(add)))

    # filter
    list_to_pop = []
    for i, sample in enumerate(data):
        len_x = sample['feat_length']
        len_y = sample['token_length']
        if not (feat_range[0] <= len_x <= feat_range[1]) or \
           not (label_range[0] <= len_y <= label_range[1]) or \
           not (rate_in_out[0] <= (len_x / len_y) <= rate_in_out[1]):
            list_to_pop.append(i)
    print('filtered {}/{} samples\n'.format(len(list_to_pop), len(data)))
    # pop from the back so earlier indices stay valid
    list_to_pop.reverse()
    for i in list_to_pop:
        data.pop(i)

    self.data = sorted(data, key=lambda x: float(x["feat_length"]))
    if reverse:
        self.data.reverse()
def __getitem__(self, index):
    data = []
    data0, label0 = self.get_item_once(index, self.video_list0)
    data1, label0 = self.get_item_once(index, self.video_list1)
    data2, label0 = self.get_item_once(index, self.video_list2)
    # data3, label0 = self.get_item_once(index, self.video_list3)
    # data4, label0 = self.get_item_once(index, self.video_list4)
    # data5, label0 = self.get_item_once(index, self.video_list5)
    # data6, label0 = self.get_item_once(index, self.video_list6)
    # data7, label0 = self.get_item_once(index, self.video_list7)
    data.extend(data0)
    data.extend(data1)
    data.extend(data2)
    # data.extend(data3)
    # data.extend(data4)
    # data.extend(data5)
    # data.extend(data6)
    # data.extend(data7)
    process_data = self.transform(data)
    return process_data, label0
def __init__(self, scene, data_path, train, transform=None,
             target_transform=None, real=False, skip_images=False, seed=7,
             undistort=False, vo_lib='stereo', data_dir=None,
             unsupervise=False, config=None):
    """
    :param scene: e.g. 'full' or 'loop'. collection of sequences.
    :param data_path: Root RobotCar data directory.
      Usually '../data/deepslam_data/RobotCar'
    :param train: flag for training / validation
    :param transform: Transform to be applied to images
    :param target_transform: Transform to be applied to poses
    :param real: it determines load ground truth pose or vo pose
    :param skip_images: return None images, only poses
    :param seed: random seed
    :param undistort: whether to undistort images (slow)
    :param vo_lib: Library to use for VO ('stereo' or 'gps')
      (`gps` is a misnomer in this code - it just loads the position
      information from GPS)
    :param data_dir: indicating where to load stats.txt file
      (to normalize image & pose)
    :param unsupervise: load training set as supervise or unsupervise
    """
    np.random.seed(seed)
    self.train = train
    self.transform = transform
    self.target_transform = target_transform
    self.skip_images = skip_images
    self.undistort = undistort
    base_dir = osp.expanduser(osp.join(data_path, scene))
    self.config = config
    # data_dir = osp.join('..', 'data', 'RobotCar', scene)

    # dict.has_key() is Python 2 only; use `in` instead
    if 'new_split' in self.config and self.config.new_split:
        print("use new split dataset")
        if train:
            split_filename = osp.join(base_dir, 'train_split.txt')
        else:
            split_filename = osp.join(base_dir, 'test_split.txt')
        with open(split_filename, 'r') as f:
            seqs = [l.rstrip() for l in f if not l.startswith('#')]

        pose_filename = osp.join(base_dir, "dataset_train.txt")
        pose_dict = {}
        with open(pose_filename, 'r') as f:
            data = f.readlines()[3:]
        pose_filename = osp.join(base_dir, "dataset_test.txt")
        with open(pose_filename, 'r') as f:
            data.extend(f.readlines()[3:])
        imgs = [l.split(' ')[0] for l in data if not l.startswith('#')]
        ps = np.asarray([[float(num) for num in l.split(' ')[1:]]
                         for l in data if not l.startswith('#')],
                        dtype=np.float32)
        poses = np.zeros((ps.shape[0], 6))
        poses[:, :3] = ps[:, :3]
        poses[:, 3:] = np_qlog_t(ps[:, 3:])
        for idx, img_name in enumerate(imgs):
            pose_dict[img_name] = poses[idx, :]

        self.poses = np.empty((0, 6))
        self.imgs = []
        for seq in seqs:
            # seq_dir = osp.join(base_dir, seq)
            img_names = [img for img in imgs if img.startswith(seq)]
            # print(img_names)
            poses = np.asarray([
                pose_dict[img_name] for img_name in img_names
                if img_name in pose_dict
            ])
            self.imgs.extend(
                [osp.join(base_dir, img_name) for img_name in img_names])
            self.poses = np.vstack((self.poses, poses))
    else:
        if train:
            if unsupervise:
                split_filename = osp.join(base_dir,
                                          'unsupervised_train_split.txt')
            else:
                split_filename = osp.join(base_dir, 'dataset_train.txt')
        else:
            split_filename = osp.join(base_dir, 'dataset_test.txt')
        with open(split_filename, 'r') as f:
            data = f.readlines()
        self.imgs = [
            osp.join(base_dir, l.split(' ')[0])
            for l in data[3:] if not l.startswith('#')
        ]
        ps = np.asarray([[float(num) for num in l.split(' ')[1:]]
                         for l in data[3:] if not l.startswith('#')],
                        dtype=np.float32)
        self.poses = np.zeros((ps.shape[0], 6))
        self.poses[:, :3] = ps[:, :3]
        self.poses[:, 3:] = np_qlog_t(ps[:, 3:])

    self.mask_sampling = self.config.mask_sampling
    if self.mask_sampling:
        muimg = read_grayscale_image(
            osp.join(base_dir, self.config.mu_mask_name))
        self.muimg = torch.tensor(
            muimg.transpose(2, 0, 1)).type(torch.FloatTensor)
        self.sigmaimg = self.muimg * (1 - self.muimg)

    pose_stats_filename = osp.join(data_dir, 'pose_stats.txt')
    if train and not real:
        mean_t = np.mean(self.poses[:, :3], axis=0)
        std_t = np.std(self.poses[:, :3], axis=0)
        np.savetxt(pose_stats_filename, np.vstack((mean_t, std_t)),
                   fmt='%8.7f')
        print("Saved")
    else:
        mean_t, std_t = np.loadtxt(pose_stats_filename)

    # convert the pose to translation + log quaternion, align, normalize
    self.poses[:, :3] -= mean_t
    self.poses[:, :3] /= std_t
    self.gt_idx = np.asarray(range(len(self.poses)))

    # camera model and image loader
    self.im_loader = partial(load_image)
def __getitem__(self, index):
    count, id_idx, ii, dset, protein_id, seq_length = \
        self.protein_list[index]
    window_size = self.window_size
    id_idx = int(id_idx)
    win_start = ii - window_size
    win_end = ii + window_size
    seq_length = int(seq_length)
    label_idx = (win_start + win_end) // 2

    # full-sequence one-hot amino-acid features, zero-padded to max_seq_len
    all_seq_features = []
    seq_len = 0
    for idx in self.all_sequences[id_idx][:self.max_seq_len]:
        acid_one_hot = [0 for i in range(20)]
        acid_one_hot[idx] = 1
        all_seq_features.append(acid_one_hot)
        seq_len += 1
    while seq_len < self.max_seq_len:
        acid_one_hot = [0 for i in range(20)]
        all_seq_features.append(acid_one_hot)
        seq_len += 1

    all_pssm_features = self.all_pssm[id_idx][:self.max_seq_len]
    seq_len = len(all_pssm_features)
    while seq_len < self.max_seq_len:
        zero_vector = [0 for i in range(20)]
        all_pssm_features.append(zero_vector)
        seq_len += 1

    all_dssp_features = self.all_dssp[id_idx][:self.max_seq_len]
    seq_len = len(all_dssp_features)
    while seq_len < self.max_seq_len:
        zero_vector = [0 for i in range(9)]
        all_dssp_features.append(zero_vector)
        seq_len += 1

    # local window features: zero padding left of the sequence start,
    # real features inside [0, seq_length), zero padding past the end
    local_features = []
    labels = []
    while win_start < 0:
        data = []
        acid_one_hot = [0 for i in range(20)]
        data.extend(acid_one_hot)
        pssm_zero_vector = [0 for i in range(20)]
        data.extend(pssm_zero_vector)
        dssp_zero_vector = [0 for i in range(9)]
        data.extend(dssp_zero_vector)
        local_features.extend(data)
        win_start += 1
    valid_end = min(win_end, seq_length - 1)
    while win_start <= valid_end:
        data = []
        idx = self.all_sequences[id_idx][win_start]
        acid_one_hot = [0 for i in range(20)]
        acid_one_hot[idx] = 1
        data.extend(acid_one_hot)
        pssm_val = self.all_pssm[id_idx][win_start]
        data.extend(pssm_val)
        try:
            dssp_val = self.all_dssp[id_idx][win_start]
        except IndexError:
            dssp_val = [0 for i in range(9)]
        data.extend(dssp_val)
        local_features.extend(data)
        win_start += 1
    while win_start <= win_end:
        data = []
        acid_one_hot = [0 for i in range(20)]
        data.extend(acid_one_hot)
        pssm_zero_vector = [0 for i in range(20)]
        data.extend(pssm_zero_vector)
        dssp_zero_vector = [0 for i in range(9)]
        data.extend(dssp_zero_vector)
        local_features.extend(data)
        win_start += 1

    label = self.all_label[id_idx][label_idx]
    label = np.array(label, dtype=np.float32)
    all_seq_features = np.stack(all_seq_features)
    all_seq_features = all_seq_features[np.newaxis, :, :]
    all_pssm_features = np.stack(all_pssm_features)
    all_pssm_features = all_pssm_features[np.newaxis, :, :]
    all_dssp_features = np.stack(all_dssp_features)
    all_dssp_features = all_dssp_features[np.newaxis, :, :]
    local_features = np.stack(local_features)
    return (all_seq_features, all_pssm_features, all_dssp_features,
            local_features, label)
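# Sanity-check sketch (assumption, not from the original source): each window
# position above contributes 20 one-hot + 20 PSSM + 9 DSSP = 49 values, so the
# flattened local feature vector should have (2 * window_size + 1) * 49
# entries regardless of how much of the window fell on padding.
# window_size = 7  # placeholder
# assert len(local_features) == (2 * window_size + 1) * 49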
def __init__(self, hp, mode):
    super(Photo2Sketch_Dataset, self).__init__()
    self.hp = hp
    self.mode = mode
    hp.root_dir = '/home/media/On_the_Fly/Code_ALL/Final_Dataset'
    hp.dataset_name = 'ShoeV2'
    hp.seq_len_threshold = 251

    # coordinate_path = os.path.join(hp.root_dir, hp.dataset_name,
    #                                hp.dataset_name + '_Coordinate')
    self.root_dir = os.path.join(hp.root_dir, hp.dataset_name)
    with open('./preprocess/ShoeV2_RDP_3', 'rb') as fp:
        self.Coordinate = pickle.load(fp)

    # keep only sketches with fewer than 81 points
    coordinate_refine = {}
    seq_len = []
    for key in self.Coordinate.keys():
        if len(self.Coordinate[key]) < 81:
            coordinate_refine[key] = self.Coordinate[key]
            seq_len.append(len(self.Coordinate[key]))
    self.Coordinate = coordinate_refine
    hp.max_seq_len = max(seq_len)
    hp.average_seq_len = int(np.round(np.mean(seq_len) + np.std(seq_len)))

    # greater_than_average = 0
    # for seq in seq_len:
    #     if seq > self.hp.average_len:
    #         greater_than_average += 1

    self.Train_Sketch = [
        x for x in self.Coordinate
        if ('train' in x) and (len(self.Coordinate[x]) < 130)
    ]  # separating trains
    self.Test_Sketch = [
        x for x in self.Coordinate
        if ('test' in x) and (len(self.Coordinate[x]) < 130)
    ]  # separating tests

    self.train_transform = get_transform('Train')
    self.test_transform = get_transform('Test')

    # seq_len = []
    # for key in self.Coordinate.keys():
    #     seq_len += [len(self.Coordinate[key])]
    # plt.hist(seq_len)
    # plt.savefig('histogram of number of Coordinate Points.png')
    # plt.close()
    # hp.max_seq_len = max(seq_len)
    # hp.max_seq_len = 130

    """ Preprocess offset coordinates """
    self.Offset_Coordinate = {}
    for key in self.Coordinate.keys():
        self.Offset_Coordinate[key] = to_delXY(self.Coordinate[key])

    # normalize offsets by the std of all delta-x/delta-y values
    data = []
    for sample in self.Offset_Coordinate.values():
        data.extend(sample[:, 0])
        data.extend(sample[:, 1])
    data = np.array(data)
    scale_factor = np.std(data)
    for key in self.Coordinate.keys():
        self.Offset_Coordinate[key][:, :2] /= scale_factor
    """ <<< Preprocess offset coordinates done >>> """
def make_data(root_path, annotation_path, allow_reverse, mirror, balance,
              balance_proportions, subset, sample_duration, required_overlap,
              shuffle):
    # NOTE: the original wrote `assert (cond, msg)`, which asserts a
    # non-empty tuple and can never fail
    assert subset in ['train', 'val', 'test'], \
        'subset "{}" is not "train", "val" or "test"'.format(subset)
    changevalue = {'validation': 'val', 'train': 'train'}
    # map 'validation' -> 'val'; leave 'val'/'test' untouched instead of
    # raising KeyError as the original lookup did
    subset = changevalue.get(subset, subset)
    frame_format = lambda x: str(x).zfill(4) + '.png'
    data = []
    action_list = _load_list(annotation_path)
    local_track_action_dict = defaultdict(list)
    for action in action_list:
        local_track_action_dict[action[0]].append(action[1:])
    for local_track_path, actions in tqdm(local_track_action_dict.items(),
                                          desc='Creating a dataset'):
        # sort the list of actions by the starting frame
        actions.sort(key=lambda a: a[1])
        if subset != actions[0][-2]:
            continue
        frame_dir = os.path.join(root_path, local_track_path)
        files = os.listdir(frame_dir)
        # split the frames into samples
        for i in range(len(files) // sample_duration):
            use_sample = True
            sample_start = i * sample_duration + 1
            sample_end = min((i + 1) * sample_duration, len(files))
            # make sure all the frames in the sample are saved properly;
            # if they're not, ignore the sample
            frame_indices = list(range(sample_start, sample_end + 1))
            for index in frame_indices:
                frame_path = os.path.join(root_path, local_track_path,
                                          frame_format(index))
                if not os.path.exists(frame_path):
                    use_sample = False
                    break
            if not use_sample:
                continue
            # count the number of frames in the sample that are part of the
            # action; `anno_subset` is renamed from `subset` so the unpacking
            # does not clobber the function-level variable
            action_label = 0
            for (label, start, end, frame, anno_subset, hard_data) in actions:
                action_overlap = max(
                    0, end - sample_start + 1
                    if sample_start > start else sample_end - start + 1)
                if action_overlap > sample_duration * required_overlap:
                    action_label = label
                    break
            sample = {
                'path': frame_dir,
                'label': action_label,
                'frame_indices': frame_indices,
                'mirror': False
            }
            data.append(sample)

    reverse_data = []
    if allow_reverse:
        for sample in tqdm(data, desc='Reversing the data'):
            reverse = True if sample['label'] != 0 else random.random() > 0.5
            if reverse:
                reverse_data.append(_reverse_sample(sample))
    data.extend(reverse_data)

    mirror_data = []
    if mirror:
        for sample in tqdm(data, desc='Adding mirror flip'):
            mirror_data.append(_mirror_sample(sample))
    data.extend(mirror_data)

    labels = set([i['label'] for i in data])
    count_labels = {
        label: sum([1 for i in data if i['label'] == label])
        for label in labels
    }
    print('Dataset is generated. Label counts are:', count_labels)

    if balance:
        required = min(count_labels.values())
        if balance_proportions is not None:
            sampling_probabilities = {
                label: (required / value) * balance_proportions[label]
                for label, value in count_labels.items()
            }
        else:
            sampling_probabilities = {
                label: required / value
                for label, value in count_labels.items()
            }
        balanced_data = []
        for sample in tqdm(data, desc='Balancing dataset'):
            if random.random() < sampling_probabilities[sample['label']]:
                balanced_data.append(sample)
        data = balanced_data
        count_labels = {
            label: sum([1 for i in data if i['label'] == label])
            for label in labels
        }
        print('Balanced dataset is generated. Label counts are:',
              count_labels)

    if shuffle:
        random.shuffle(data)
    return data
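# Pitfall note (illustration, not from the original source): the original
# guard was `assert (subset in [...], 'msg')`. A parenthesized
# condition-plus-message is a two-element tuple, which is always truthy, so
# the assert could never fire; recent CPython even emits
# "SyntaxWarning: assertion is always true" for this pattern.
# `assert cond, msg` is the intended form used above.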
def search(dirname, data):
    filenames = os.listdir(dirname)
    for filename in filenames:
        full_filename = os.path.join(dirname, filename, '*.png')
        data.extend(Get_Dataset(full_filename))
        print(full_filename + ' Load!')
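# Usage sketch (assumption): search() mutates the list passed in, pulling
# samples from every '<dirname>/<subdir>/*.png' glob via Get_Dataset.
# The directory name is a placeholder.
# data = []
# search('/path/to/frames', data)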
for string in args.model:
    if '=' in string:
        update = parse_dotted(string)
    else:
        with open(string, 'r') as f:
            update = yaml.safe_load(f)
        # If the yaml file contains an entry with key `model`,
        # use that one instead
        if 'model' in update.keys():
            update = update['model']
    update_rec(model, update)

# Data from --data args
for path in args.data:
    path = Path(path).expanduser().resolve()
    if path.is_dir():
        data.extend(path.glob('*.pt'))
    elif path.is_file() and path.suffix == '.pt':
        data.append(path)
    else:
        raise ValueError(f'Invalid data: {path}')

# Options from --options args
for string in args.options:
    if '=' in string:
        update = parse_dotted(string)
    else:
        with open(string, 'r') as f:
            update = yaml.safe_load(f)
    update_rec(options, update)

# Resolving paths