def test_case_00(add_remove_file):  # only bin data
    data = [{"image1": bytes("image1 bytes abc", encoding='UTF-8'),
             "image2": bytes("image1 bytes def", encoding='UTF-8'),
             "image3": bytes("image1 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image1 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image1 bytes mno", encoding='UTF-8')},
            {"image1": bytes("image2 bytes abc", encoding='UTF-8'),
             "image2": bytes("image2 bytes def", encoding='UTF-8'),
             "image3": bytes("image2 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image2 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image2 bytes mno", encoding='UTF-8')},
            {"image1": bytes("image3 bytes abc", encoding='UTF-8'),
             "image2": bytes("image3 bytes def", encoding='UTF-8'),
             "image3": bytes("image3 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image3 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image3 bytes mno", encoding='UTF-8')},
            {"image1": bytes("image5 bytes abc", encoding='UTF-8'),
             "image2": bytes("image5 bytes def", encoding='UTF-8'),
             "image3": bytes("image5 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image5 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image5 bytes mno", encoding='UTF-8')},
            {"image1": bytes("image6 bytes abc", encoding='UTF-8'),
             "image2": bytes("image6 bytes def", encoding='UTF-8'),
             "image3": bytes("image6 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image6 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image6 bytes mno", encoding='UTF-8')}]
    schema = {"image1": {"type": "bytes"},
              "image2": {"type": "bytes"},
              "image3": {"type": "bytes"},
              "image4": {"type": "bytes"},
              "image5": {"type": "bytes"}}
    writer = FileWriter(TEMP_FILE, FILES_NUM)
    writer.add_schema(schema, "schema")
    writer.write_raw_data(data)
    writer.commit()

    d1 = ds.MindDataset(TEMP_FILE, None, num_readers, shuffle=False)
    d1.save(AUTO_FILE, FILES_NUM)

    data_value_to_list = []
    for item in data:
        new_data = {}
        new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
        new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
        new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8)
        new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8)
        new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8)
        data_value_to_list.append(new_data)

    d2 = ds.MindDataset(dataset_file=AUTO_FILE,
                        num_parallel_workers=num_readers,
                        shuffle=False)
    assert d2.get_dataset_size() == 5
    num_iter = 0
    for item in d2.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 5
        for field in item:
            if isinstance(item[field], np.ndarray):
                assert (item[field] == data_value_to_list[num_iter][field]).all()
            else:
                assert item[field] == data_value_to_list[num_iter][field]
        num_iter += 1
    assert num_iter == 5
def random_split_trans2mindrecord(input_file_path, output_file_path, criteo_stats_dict,
                                  part_rows=2000000, line_per_sample=1000, test_size=0.1, seed=2020):
    """Randomly split the data and save it as MindRecord files."""
    test_size = int(TRAIN_LINE_COUNT * test_size)
    all_indices = [i for i in range(TRAIN_LINE_COUNT)]
    np.random.seed(seed)
    np.random.shuffle(all_indices)
    print("all_indices.size:{}".format(len(all_indices)))
    test_indices_set = set(all_indices[:test_size])
    print("test_indices_set.size:{}".format(len(test_indices_set)))
    print("-----------------------" * 10 + "\n" * 2)

    train_data_list = []
    test_data_list = []
    ids_list = []
    wts_list = []
    label_list = []

    writer_train = FileWriter(os.path.join(output_file_path, "train_input_part.mindrecord"), 21)
    writer_test = FileWriter(os.path.join(output_file_path, "test_input_part.mindrecord"), 3)

    schema = {"label": {"type": "float32", "shape": [-1]},
              "feat_vals": {"type": "float32", "shape": [-1]},
              "feat_ids": {"type": "int32", "shape": [-1]}}
    writer_train.add_schema(schema, "CRITEO_TRAIN")
    writer_test.add_schema(schema, "CRITEO_TEST")

    with open(input_file_path, encoding="utf-8") as file_in:
        items_error_size_lineCount = []
        count = 0
        train_part_number = 0
        test_part_number = 0
        for i, line in enumerate(file_in):
            count += 1
            if count % 1000000 == 0:
                print("Processed {}w lines.".format(count // 10000))
            line = line.strip("\n")
            items = line.split("\t")
            if len(items) != 40:
                items_error_size_lineCount.append(i)
                continue
            label = float(items[0])
            values = items[1:14]
            cats = items[14:]
            assert len(values) == 13, "values.size: {}".format(len(values))
            assert len(cats) == 26, "cats.size: {}".format(len(cats))
            ids, wts = criteo_stats_dict.map_cat2id(values, cats)
            ids_list.extend(ids)
            wts_list.extend(wts)
            label_list.append(label)

            if count % line_per_sample == 0:
                if i not in test_indices_set:
                    train_data_list.append({"feat_ids": np.array(ids_list, dtype=np.int32),
                                            "feat_vals": np.array(wts_list, dtype=np.float32),
                                            "label": np.array(label_list, dtype=np.float32)})
                else:
                    test_data_list.append({"feat_ids": np.array(ids_list, dtype=np.int32),
                                           "feat_vals": np.array(wts_list, dtype=np.float32),
                                           "label": np.array(label_list, dtype=np.float32)})
                if train_data_list and len(train_data_list) % part_rows == 0:
                    writer_train.write_raw_data(train_data_list)
                    train_data_list.clear()
                    train_part_number += 1
                if test_data_list and len(test_data_list) % part_rows == 0:
                    writer_test.write_raw_data(test_data_list)
                    test_data_list.clear()
                    test_part_number += 1
                ids_list.clear()
                wts_list.clear()
                label_list.clear()

    if train_data_list:
        writer_train.write_raw_data(train_data_list)
    if test_data_list:
        writer_test.write_raw_data(test_data_list)
    writer_train.commit()
    writer_test.commit()

    print("-------------" * 10)
    print("items_error_size_lineCount.size(): {}.".format(len(items_error_size_lineCount)))
    print("-------------" * 10)
    np.save("items_error_size_lineCount.npy", items_error_size_lineCount)
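
# A minimal read-back sketch for the shards written above. The column names come from the
# schema; the MindDataset/batch parameters are illustrative assumptions, and train_part_path
# should point at one of the "train_input_part.mindrecord*" shard files.
import mindspore.dataset as ds

def load_criteo_mindrecord(train_part_path, batch_size=1000, num_workers=8):
    """Load the Criteo train MindRecord shards and batch them for training."""
    data_set = ds.MindDataset(train_part_path,
                              columns_list=["feat_ids", "feat_vals", "label"],
                              num_parallel_workers=num_workers,
                              shuffle=True)
    return data_set.batch(batch_size, drop_remainder=True)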
schema = {"input_ids": {"type": "int32", "shape": [-1]}}
writer = FileWriter(file_name=args.output_file,
                    shard_num=args.file_partition)
writer.add_schema(schema, args.dataset_type)
writer.open_and_set_header()

transforms_count = 0
if args.dataset_type == 'wiki':
    for x in tokenize_wiki(args.input_glob):
        transforms_count += 1
        writer.write_raw_data([x])
    print("Transformed {} records.".format(transforms_count))
elif args.dataset_type == 'lambada':
    for x in tokenize_lambada(args.input_glob):
        transforms_count += 1
        writer.write_raw_data([x])
    print("Transformed {} records.".format(transforms_count))
elif args.dataset_type == 'openwebtext':
    file_iter = glob.iglob(args.input_glob)
    with Pool(processes=args.num_process) as pool:
        pool.map(task_unit, package_file(file_iter, args.file_batch_size))
else:
    raise ValueError("Unsupported dataset type: {}".format(args.dataset_type))
writer.commit()
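
# package_file is referenced above but not defined in this excerpt; a plausible sketch
# (an assumption inferred from its call signature) groups the file iterator into lists of
# args.file_batch_size paths so that each pool task handles one batch of files.
def package_file(it, n):
    """Yield successive n-sized lists of file paths from iterator `it`."""
    stop = False
    while not stop:
        batch = []
        for _ in range(n):
            try:
                batch.append(next(it))
            except StopIteration:
                stop = True
                break
        if batch:
            yield batch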
def test_case_02(add_remove_file):  # multi-bytes fields plus scalars and arrays
    data = [{"file_name": "001.jpg", "label": 43,
             "float32_array": np.array([1.2, 2.78, 3.1234, 4.9871, 5.12341], dtype=np.float32),
             "float64_array": np.array([48.1234556789, 49.3251241431, 50.13514312414, 51.8971298471,
                                        123414314.2141243, 87.1212122], dtype=np.float64),
             "float32": 3456.12345,
             "float64": 1987654321.123456785,
             "source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int32),
             "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "image1": bytes("image1 bytes abc", encoding='UTF-8'),
             "image2": bytes("image1 bytes def", encoding='UTF-8'),
             "image3": bytes("image1 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image1 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image1 bytes mno", encoding='UTF-8')},
            {"file_name": "002.jpg", "label": 91,
             "float32_array": np.array([1.2, 2.78, 4.1234, 4.9871, 5.12341], dtype=np.float32),
             "float64_array": np.array([48.1234556789, 49.3251241431, 60.13514312414, 51.8971298471,
                                        123414314.2141243, 87.1212122], dtype=np.float64),
             "float32": 3456.12445,
             "float64": 1987654321.123456786,
             "source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int32),
             "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "image1": bytes("image2 bytes abc", encoding='UTF-8'),
             "image2": bytes("image2 bytes def", encoding='UTF-8'),
             "image3": bytes("image2 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image2 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image2 bytes mno", encoding='UTF-8')},
            {"file_name": "003.jpg", "label": 61,
             "float32_array": np.array([1.2, 2.78, 5.1234, 4.9871, 5.12341], dtype=np.float32),
             "float64_array": np.array([48.1234556789, 49.3251241431, 70.13514312414, 51.8971298471,
                                        123414314.2141243, 87.1212122], dtype=np.float64),
             "float32": 3456.12545,
             "float64": 1987654321.123456787,
             "source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int32),
             "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "image1": bytes("image3 bytes abc", encoding='UTF-8'),
             "image2": bytes("image3 bytes def", encoding='UTF-8'),
             "image3": bytes("image3 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image3 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image3 bytes mno", encoding='UTF-8')},
            {"file_name": "004.jpg", "label": 29,
             "float32_array": np.array([1.2, 2.78, 6.1234, 4.9871, 5.12341], dtype=np.float32),
             "float64_array": np.array([48.1234556789, 49.3251241431, 80.13514312414, 51.8971298471,
                                        123414314.2141243, 87.1212122], dtype=np.float64),
             "float32": 3456.12645,
             "float64": 1987654321.123456788,
             "source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int32),
             "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "image1": bytes("image4 bytes abc", encoding='UTF-8'),
             "image2": bytes("image4 bytes def", encoding='UTF-8'),
             "image3": bytes("image4 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image4 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image4 bytes mno", encoding='UTF-8')},
            {"file_name": "005.jpg", "label": 78,
             "float32_array": np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32),
             "float64_array": np.array([48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471,
                                        123414314.2141243, 87.1212122], dtype=np.float64),
             "float32": 3456.12745,
             "float64": 1987654321.123456789,
             "source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int32),
             "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "image1": bytes("image5 bytes abc", encoding='UTF-8'),
             "image2": bytes("image5 bytes def", encoding='UTF-8'),
             "image3": bytes("image5 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image5 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image5 bytes mno", encoding='UTF-8')},
            {"file_name": "006.jpg", "label": 37,
             "float32_array": np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32),
             "float64_array": np.array([48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471,
                                        123414314.2141243, 87.1212122], dtype=np.float64),
             "float32": 3456.12745,
             "float64": 1987654321.123456789,
             "source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int32),
             "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "image1": bytes("image6 bytes abc", encoding='UTF-8'),
             "image2": bytes("image6 bytes def", encoding='UTF-8'),
             "image3": bytes("image6 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image6 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image6 bytes mno", encoding='UTF-8')}]
    schema = {"file_name": {"type": "string"},
              "float32_array": {"type": "float32", "shape": [-1]},
              "float64_array": {"type": "float64", "shape": [-1]},
              "float32": {"type": "float32"},
              "float64": {"type": "float64"},
              "source_sos_ids": {"type": "int32", "shape": [-1]},
              "source_sos_mask": {"type": "int64", "shape": [-1]},
              "image1": {"type": "bytes"},
              "image2": {"type": "bytes"},
              "image3": {"type": "bytes"},
              "label": {"type": "int32"},
              "image4": {"type": "bytes"},
              "image5": {"type": "bytes"}}
    writer = FileWriter(TEMP_FILE, FILES_NUM)
    writer.add_schema(schema, "schema")
    writer.write_raw_data(data)
    writer.commit()

    d1 = ds.MindDataset(TEMP_FILE, None, num_readers, shuffle=False)
    d1.save(AUTO_FILE, FILES_NUM)

    data_value_to_list = []
    for item in data:
        new_data = {}
        new_data['file_name'] = np.asarray(item["file_name"], dtype='S')
        new_data['float32_array'] = item["float32_array"]
        new_data['float64_array'] = item["float64_array"]
        new_data['float32'] = item["float32"]
        new_data['float64'] = item["float64"]
        new_data['source_sos_ids'] = item["source_sos_ids"]
        new_data['source_sos_mask'] = item["source_sos_mask"]
        new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32)
        new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
        new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
        new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8)
        new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8)
        new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8)
        data_value_to_list.append(new_data)

    d2 = ds.MindDataset(dataset_file=AUTO_FILE,
                        num_parallel_workers=num_readers,
                        shuffle=False)
    assert d2.get_dataset_size() == 6
    num_iter = 0
    for item in d2.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 13
        for field in item:
            if isinstance(item[field], np.ndarray):
                if item[field].dtype == np.float32:
                    assert (item[field] == np.array(data_value_to_list[num_iter][field], np.float32)).all()
                else:
                    assert (item[field] == data_value_to_list[num_iter][field]).all()
            else:
                assert item[field] == data_value_to_list[num_iter][field]
        num_iter += 1
    assert num_iter == 6
os.makedirs(dst_dir)
print('number of samples:', len(lines))
writer = FileWriter(file_name=args.dst_path, shard_num=args.num_shards)
writer.add_schema(seg_schema, "seg_schema")

data_list = []  # samples buffered for the next batched write
cnt = 0
for l in lines:
    img_name = l.strip('\n')
    img_path = 'img/' + str(img_name) + '.jpg'
    label_path = 'cls_png/' + str(img_name) + '.png'
    sample_ = {"file_name": img_path.split('/')[-1]}
    with open(os.path.join(args.data_root, img_path), 'rb') as f:
        sample_['data'] = f.read()
    with open(os.path.join(args.data_root, label_path), 'rb') as f:
        sample_['label'] = f.read()
    data_list.append(sample_)
    cnt += 1
    if cnt % 1000 == 0:
        writer.write_raw_data(data_list)
        print('number of samples written:', cnt)
        data_list = []

if data_list:
    writer.write_raw_data(data_list)
writer.commit()
print('number of samples written:', cnt)
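
# A read-back sketch for the segmentation records written above (assumption: the seg_schema
# exposes "data" and "label" as bytes columns, matching the sample_ dict). The image and label
# were stored as raw JPEG/PNG bytes, so they are decoded here with OpenCV.
import cv2
import mindspore.dataset as ds

def iterate_seg_mindrecord(mindrecord_path):
    """Yield (image, label) pairs decoded from the segmentation MindRecord."""
    seg_ds = ds.MindDataset(mindrecord_path, columns_list=["data", "label"], shuffle=True)
    for sample in seg_ds.create_dict_iterator(output_numpy=True):
        image = cv2.imdecode(sample["data"], cv2.IMREAD_COLOR)       # JPEG bytes -> HWC image
        label = cv2.imdecode(sample["label"], cv2.IMREAD_GRAYSCALE)  # PNG bytes -> HxW class map
        yield image, label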
def test_write_read_process():
    mindrecord_file_name = "test.mindrecord"
    data = [{"file_name": "001.jpg", "label": 43, "score": 0.8,
             "mask": np.array([3, 6, 9], dtype=np.int64),
             "segments": np.array([[5.0, 1.6], [65.2, 8.3]], dtype=np.float32),
             "data": bytes("image bytes abc", encoding='UTF-8')},
            {"file_name": "002.jpg", "label": 91, "score": 5.4,
             "mask": np.array([1, 4, 7], dtype=np.int64),
             "segments": np.array([[5.1, 9.1], [2.0, 65.4]], dtype=np.float32),
             "data": bytes("image bytes def", encoding='UTF-8')},
            {"file_name": "003.jpg", "label": 61, "score": 6.4,
             "mask": np.array([7, 6, 3], dtype=np.int64),
             "segments": np.array([[0.0, 5.6], [3.0, 16.3]], dtype=np.float32),
             "data": bytes("image bytes ghi", encoding='UTF-8')},
            {"file_name": "004.jpg", "label": 29, "score": 8.1,
             "mask": np.array([2, 8, 0], dtype=np.int64),
             "segments": np.array([[5.9, 7.2], [4.0, 89.0]], dtype=np.float32),
             "data": bytes("image bytes jkl", encoding='UTF-8')},
            {"file_name": "005.jpg", "label": 78, "score": 7.7,
             "mask": np.array([3, 1, 2], dtype=np.int64),
             "segments": np.array([[0.6, 8.1], [5.3, 49.3]], dtype=np.float32),
             "data": bytes("image bytes mno", encoding='UTF-8')},
            {"file_name": "006.jpg", "label": 37, "score": 9.4,
             "mask": np.array([7, 6, 7], dtype=np.int64),
             "segments": np.array([[4.2, 6.3], [8.9, 81.8]], dtype=np.float32),
             "data": bytes("image bytes pqr", encoding='UTF-8')}]
    writer = FileWriter(mindrecord_file_name)
    schema = {"file_name": {"type": "string"},
              "label": {"type": "int32"},
              "score": {"type": "float64"},
              "mask": {"type": "int64", "shape": [-1]},
              "segments": {"type": "float32", "shape": [2, 2]},
              "data": {"type": "bytes"}}
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 6
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))
def transfer_coco_to_mindrecord(self, mindrecord_dir, file_name="coco_det.train.mind", shard_num=1):
    """Create a MindRecord file from image_dir and anno_path."""
    if not os.path.isdir(mindrecord_dir):
        os.makedirs(mindrecord_dir)
    if os.path.isdir(self.image_path) and os.path.exists(self.annot_path):
        logger.info("Create MindRecord based on COCO_HP dataset")
    else:
        raise ValueError('data_dir {} or anno_path {} does not exist'.format(
            self.image_path, self.annot_path))

    mindrecord_path = os.path.join(mindrecord_dir, file_name)
    writer = FileWriter(mindrecord_path, shard_num)
    centernet_json = {"img_id": {"type": "int32", "shape": [1]},
                      "image": {"type": "bytes"},
                      "num_objects": {"type": "int32"},
                      "bboxes": {"type": "float32", "shape": [-1, 4]},
                      "category_id": {"type": "int32", "shape": [-1]}}
    writer.add_schema(centernet_json, "centernet_json")

    for img_id in self.images:
        image_info = self.coco.loadImgs([img_id])
        annos = self.coco.loadAnns(self.anns[img_id])
        # get image
        img_name = image_info[0]['file_name']
        img_name = os.path.join(self.image_path, img_name)
        with open(img_name, 'rb') as f:
            image = f.read()
        bboxes = []
        category_id = []
        num_objects = len(annos)
        for anno in annos:
            bbox = self._coco_box_to_bbox(anno['bbox'])
            class_name = self.classs_dict[anno["category_id"]]
            if class_name in self.train_cls:
                x_min, x_max = bbox[0], bbox[2]
                y_min, y_max = bbox[1], bbox[3]
                bboxes.append([x_min, y_min, x_max, y_max])
                category_id.append(self.train_cls_dict[class_name])
        row = {"img_id": np.array([img_id], dtype=np.int32),
               "image": image,
               "num_objects": num_objects,
               "bboxes": np.array(bboxes, np.float32),
               "category_id": np.array(category_id, np.int32)}
        writer.write_raw_data([row])
    writer.commit()
    logger.info("MindRecord created at {}".format(mindrecord_dir))
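
# _coco_box_to_bbox is used above but not shown; a common implementation (an assumption,
# consistent with how bbox[0..3] is unpacked into corners in the loop) converts the COCO
# [x, y, width, height] box into [x1, y1, x2, y2] corner form.
import numpy as np

def _coco_box_to_bbox(self, box):
    """Convert a COCO [x, y, width, height] box to [x1, y1, x2, y2] corners."""
    return np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]], dtype=np.float32)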
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True,
                        help='Input raw text file (or comma-separated list of files).')
    parser.add_argument("--output_file", type=str, required=True,
                        help='Output MindRecord file.')
    parser.add_argument("--num_splits", type=int, default=16,
                        help='Number of partitions the MindRecord output will be split into.')
    parser.add_argument("--vocab_file", type=str, required=True,
                        help='The vocabulary file that the Transformer model was trained on.')
    parser.add_argument("--clip_to_max_len", type=bool, default=False,
                        help='Clip sequences to the maximum sequence length.')
    parser.add_argument("--max_seq_length", type=int, default=128,
                        help='Maximum sequence length.')
    parser.add_argument("--bucket", type=ast.literal_eval, default=[16, 32, 48, 64, 128],
                        help='Bucket boundaries for sequence length.')

    args = parser.parse_args()

    tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)

    input_files = []
    for input_pattern in args.input_file.split(","):
        input_files.append(input_pattern)

    logging.info("*** Read from input files ***")
    for input_file in input_files:
        logging.info("  %s", input_file)

    output_file = args.output_file
    logging.info("*** Write to output files ***")
    logging.info("  %s", output_file)

    total_written = 0
    total_read = 0

    feature_dict = {}
    for i in args.bucket:
        feature_dict[i] = []

    for input_file in input_files:
        logging.info("*** Reading from %s ***", input_file)
        with open(input_file, "r") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break

                total_read += 1
                if total_read % 100000 == 0:
                    logging.info("Read %d ...", total_read)

                source_line, target_line = line.strip().split("\t")
                source_tokens = tokenizer.tokenize(source_line)
                target_tokens = tokenizer.tokenize(target_line)

                if len(source_tokens) >= args.max_seq_length or len(target_tokens) >= args.max_seq_length:
                    logging.info("ignore long sentence!")
                    continue

                instance = create_training_instance(source_tokens, target_tokens, args.max_seq_length,
                                                    clip_to_max_len=args.clip_to_max_len)
                if instance is None:
                    continue

                features, seq_max_bucket_length = get_instance_features(instance, tokenizer,
                                                                         args.max_seq_length, args.bucket)
                for key in feature_dict:
                    if key == seq_max_bucket_length:
                        feature_dict[key].append(features)

                if total_read <= 10:
                    logging.info("*** Example ***")
                    logging.info("source tokens: %s", " ".join(
                        [tokenization.convert_to_printable(x) for x in instance.source_eos_tokens]))
                    logging.info("target tokens: %s", " ".join(
                        [tokenization.convert_to_printable(x) for x in instance.target_sos_tokens]))

                    for feature_name in features.keys():
                        feature = features[feature_name]
                        logging.info("%s: %s", feature_name, feature)

    for i in args.bucket:
        if args.num_splits == 1:
            output_file_name = output_file
        else:
            output_file_name = output_file + '_' + str(i) + '_'
        writer = FileWriter(output_file_name, args.num_splits)
        data_schema = {"source_sos_ids": {"type": "int64", "shape": [-1]},
                       "source_sos_mask": {"type": "int64", "shape": [-1]},
                       "source_eos_ids": {"type": "int64", "shape": [-1]},
                       "source_eos_mask": {"type": "int64", "shape": [-1]},
                       "target_sos_ids": {"type": "int64", "shape": [-1]},
                       "target_sos_mask": {"type": "int64", "shape": [-1]},
                       "target_eos_ids": {"type": "int64", "shape": [-1]},
                       "target_eos_mask": {"type": "int64", "shape": [-1]}}
        writer.add_schema(data_schema, "transformer")
        features_ = feature_dict[i]
        logging.info("Bucket length %d has %d samples, start writing...", i, len(features_))

        for item in features_:
            writer.write_raw_data([item])
            total_written += 1
        writer.commit()

    logging.info("Wrote %d total instances", total_written)
def test_write_read_process_with_multi_bytes_and_array():
    mindrecord_file_name = "test.mindrecord"
    data = [{"file_name": "001.jpg", "label": 4,
             "image1": bytes("image1 bytes abc", encoding='UTF-8'),
             "image2": bytes("image1 bytes def", encoding='UTF-8'),
             "source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "image3": bytes("image1 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image1 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image1 bytes mno", encoding='UTF-8'),
             "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)},
            {"file_name": "002.jpg", "label": 5,
             "image1": bytes("image2 bytes abc", encoding='UTF-8'),
             "image2": bytes("image2 bytes def", encoding='UTF-8'),
             "image3": bytes("image2 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image2 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image2 bytes mno", encoding='UTF-8'),
             "source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)},
            {"file_name": "003.jpg", "label": 6,
             "source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "image1": bytes("image3 bytes abc", encoding='UTF-8'),
             "image2": bytes("image3 bytes def", encoding='UTF-8'),
             "image3": bytes("image3 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image3 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image3 bytes mno", encoding='UTF-8'),
             "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)},
            {"file_name": "004.jpg", "label": 7,
             "source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "image1": bytes("image4 bytes abc", encoding='UTF-8'),
             "image2": bytes("image4 bytes def", encoding='UTF-8'),
             "image3": bytes("image4 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image4 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image4 bytes mno", encoding='UTF-8'),
             "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)},
            {"file_name": "005.jpg", "label": 8,
             "source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64),
             "image1": bytes("image5 bytes abc", encoding='UTF-8'),
             "image2": bytes("image5 bytes def", encoding='UTF-8'),
             "image3": bytes("image5 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image5 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image5 bytes mno", encoding='UTF-8'),
             "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)},
            {"file_name": "006.jpg", "label": 9,
             "source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64),
             "image1": bytes("image6 bytes abc", encoding='UTF-8'),
             "image2": bytes("image6 bytes def", encoding='UTF-8'),
             "image3": bytes("image6 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image6 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image6 bytes mno", encoding='UTF-8'),
             "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)}]

    writer = FileWriter(mindrecord_file_name)
    schema = {"file_name": {"type": "string"},
              "image1": {"type": "bytes"},
              "image2": {"type": "bytes"},
              "source_sos_ids": {"type": "int64", "shape": [-1]},
              "source_sos_mask": {"type": "int64", "shape": [-1]},
              "image3": {"type": "bytes"},
              "image4": {"type": "bytes"},
              "image5": {"type": "bytes"},
              "target_sos_ids": {"type": "int64", "shape": [-1]},
              "target_sos_mask": {"type": "int64", "shape": [-1]},
              "target_eos_ids": {"type": "int64", "shape": [-1]},
              "target_eos_mask": {"type": "int64", "shape": [-1]},
              "label": {"type": "int32"}}
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 13
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name,
                        columns=["source_sos_ids", "source_sos_mask", "target_sos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name,
                        columns=["image2", "source_sos_mask", "image3", "target_sos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 4
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name,
                        columns=["target_sos_ids", "image4", "source_sos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name,
                        columns=["target_sos_ids", "image5", "image4", "image3", "source_sos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 5
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name,
                        columns=["target_eos_mask", "image5", "image2", "source_sos_mask", "label"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 5
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))
def test_write_read_process_with_multi_array():
    mindrecord_file_name = "test.mindrecord"
    data = [{"source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([13, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([19, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([113, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([119, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([213, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([219, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([313, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([319, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([413, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([419, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([513, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([519, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)}]
    writer = FileWriter(mindrecord_file_name)
    schema = {"source_sos_ids": {"type": "int64", "shape": [-1]},
              "source_sos_mask": {"type": "int64", "shape": [-1]},
              "source_eos_ids": {"type": "int64", "shape": [-1]},
              "source_eos_mask": {"type": "int64", "shape": [-1]},
              "target_sos_ids": {"type": "int64", "shape": [-1]},
              "target_sos_mask": {"type": "int64", "shape": [-1]},
              "target_eos_ids": {"type": "int64", "shape": [-1]},
              "target_eos_mask": {"type": "int64", "shape": [-1]}}
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 8
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name,
                        columns=["source_eos_ids", "source_eos_mask", "target_sos_ids",
                                 "target_sos_mask", "target_eos_ids", "target_eos_mask"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 6
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name,
                        columns=["source_sos_ids", "target_sos_ids", "target_eos_mask"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name,
                        columns=["target_eos_mask", "source_eos_mask", "source_sos_mask"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["target_eos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 1
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))
def test_write_read_process_with_multi_bytes():
    mindrecord_file_name = "test.mindrecord"
    data = [{"file_name": "001.jpg", "label": 43,
             "image1": bytes("image1 bytes abc", encoding='UTF-8'),
             "image2": bytes("image1 bytes def", encoding='UTF-8'),
             "image3": bytes("image1 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image1 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image1 bytes mno", encoding='UTF-8')},
            {"file_name": "002.jpg", "label": 91,
             "image1": bytes("image2 bytes abc", encoding='UTF-8'),
             "image2": bytes("image2 bytes def", encoding='UTF-8'),
             "image3": bytes("image2 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image2 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image2 bytes mno", encoding='UTF-8')},
            {"file_name": "003.jpg", "label": 61,
             "image1": bytes("image3 bytes abc", encoding='UTF-8'),
             "image2": bytes("image3 bytes def", encoding='UTF-8'),
             "image3": bytes("image3 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image3 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image3 bytes mno", encoding='UTF-8')},
            {"file_name": "004.jpg", "label": 29,
             "image1": bytes("image4 bytes abc", encoding='UTF-8'),
             "image2": bytes("image4 bytes def", encoding='UTF-8'),
             "image3": bytes("image4 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image4 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image4 bytes mno", encoding='UTF-8')},
            {"file_name": "005.jpg", "label": 78,
             "image1": bytes("image5 bytes abc", encoding='UTF-8'),
             "image2": bytes("image5 bytes def", encoding='UTF-8'),
             "image3": bytes("image5 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image5 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image5 bytes mno", encoding='UTF-8')},
            {"file_name": "006.jpg", "label": 37,
             "image1": bytes("image6 bytes abc", encoding='UTF-8'),
             "image2": bytes("image6 bytes def", encoding='UTF-8'),
             "image3": bytes("image6 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image6 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image6 bytes mno", encoding='UTF-8')}]
    writer = FileWriter(mindrecord_file_name)
    schema = {"file_name": {"type": "string"},
              "image1": {"type": "bytes"},
              "image2": {"type": "bytes"},
              "image3": {"type": "bytes"},
              "label": {"type": "int32"},
              "image4": {"type": "bytes"},
              "image5": {"type": "bytes"}}
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 7
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader2 = FileReader(file_name=mindrecord_file_name,
                         columns=["image1", "image2", "image5"])
    count = 0
    for index, x in enumerate(reader2.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader2.close()

    reader3 = FileReader(file_name=mindrecord_file_name,
                         columns=["image2", "image4"])
    count = 0
    for index, x in enumerate(reader3.get_next()):
        assert len(x) == 2
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader3.close()

    reader4 = FileReader(file_name=mindrecord_file_name,
                         columns=["image5", "image2"])
    count = 0
    for index, x in enumerate(reader4.get_next()):
        assert len(x) == 2
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader4.close()

    reader5 = FileReader(file_name=mindrecord_file_name,
                         columns=["image5", "image2", "label"])
    count = 0
    for index, x in enumerate(reader5.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader5.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))