Example #1
def test_case_00(add_remove_file):  # only binary (bytes) fields
    data = [{
        "image1": bytes("image1 bytes abc", encoding='UTF-8'),
        "image2": bytes("image1 bytes def", encoding='UTF-8'),
        "image3": bytes("image1 bytes ghi", encoding='UTF-8'),
        "image4": bytes("image1 bytes jkl", encoding='UTF-8'),
        "image5": bytes("image1 bytes mno", encoding='UTF-8')
    }, {
        "image1": bytes("image2 bytes abc", encoding='UTF-8'),
        "image2": bytes("image2 bytes def", encoding='UTF-8'),
        "image3": bytes("image2 bytes ghi", encoding='UTF-8'),
        "image4": bytes("image2 bytes jkl", encoding='UTF-8'),
        "image5": bytes("image2 bytes mno", encoding='UTF-8')
    }, {
        "image1": bytes("image3 bytes abc", encoding='UTF-8'),
        "image2": bytes("image3 bytes def", encoding='UTF-8'),
        "image3": bytes("image3 bytes ghi", encoding='UTF-8'),
        "image4": bytes("image3 bytes jkl", encoding='UTF-8'),
        "image5": bytes("image3 bytes mno", encoding='UTF-8')
    }, {
        "image1": bytes("image5 bytes abc", encoding='UTF-8'),
        "image2": bytes("image5 bytes def", encoding='UTF-8'),
        "image3": bytes("image5 bytes ghi", encoding='UTF-8'),
        "image4": bytes("image5 bytes jkl", encoding='UTF-8'),
        "image5": bytes("image5 bytes mno", encoding='UTF-8')
    }, {
        "image1": bytes("image6 bytes abc", encoding='UTF-8'),
        "image2": bytes("image6 bytes def", encoding='UTF-8'),
        "image3": bytes("image6 bytes ghi", encoding='UTF-8'),
        "image4": bytes("image6 bytes jkl", encoding='UTF-8'),
        "image5": bytes("image6 bytes mno", encoding='UTF-8')
    }]
    schema = {
        "image1": {
            "type": "bytes"
        },
        "image2": {
            "type": "bytes"
        },
        "image3": {
            "type": "bytes"
        },
        "image4": {
            "type": "bytes"
        },
        "image5": {
            "type": "bytes"
        }
    }
    writer = FileWriter(TEMP_FILE, FILES_NUM)
    writer.add_schema(schema, "schema")
    writer.write_raw_data(data)
    writer.commit()

    d1 = ds.MindDataset(TEMP_FILE, None, num_readers, shuffle=False)
    d1.save(AUTO_FILE, FILES_NUM)
    data_value_to_list = []

    for item in data:
        new_data = {}
        new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
        new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
        new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8)
        new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8)
        new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8)
        data_value_to_list.append(new_data)

    d2 = ds.MindDataset(dataset_file=AUTO_FILE,
                        num_parallel_workers=num_readers,
                        shuffle=False)
    assert d2.get_dataset_size() == 5
    num_iter = 0
    for item in d2.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 5
        for field in item:
            if isinstance(item[field], np.ndarray):
                assert (
                    item[field] == data_value_to_list[num_iter][field]).all()
            else:
                assert item[field] == data_value_to_list[num_iter][field]
        num_iter += 1
    assert num_iter == 5
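The test above leans on module-level names the snippet does not show. A minimal sketch of the assumed setup; the constant values and fixture body are assumptions inferred from the test, not the original source:

import os

import numpy as np
import pytest
import mindspore.dataset as ds
from mindspore.mindrecord import FileWriter

# Assumed constants; the real module may use different values.
FILES_NUM = 4
TEMP_FILE = "test.mindrecord"
AUTO_FILE = "auto.mindrecord"
num_readers = 4


@pytest.fixture
def add_remove_file():
    """Remove stale MindRecord shards before and after each test (sketch)."""
    def clean():
        for base in (TEMP_FILE, AUTO_FILE):
            for suffix in [str(i) for i in range(FILES_NUM)] + [""]:
                for path in (base + suffix, base + suffix + ".db"):
                    if os.path.exists(path):
                        os.remove(path)
    clean()
    yield
    clean()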
Example #2
def random_split_trans2mindrecord(input_file_path,
                                  output_file_path,
                                  criteo_stats_dict,
                                  part_rows=2000000,
                                  line_per_sample=1000,
                                  test_size=0.1,
                                  seed=2020):
    """Random split data and save mindrecord"""
    test_size = int(TRAIN_LINE_COUNT * test_size)
    all_indices = list(range(TRAIN_LINE_COUNT))
    np.random.seed(seed)
    np.random.shuffle(all_indices)
    print("all_indices.size:{}".format(len(all_indices)))
    test_indices_set = set(all_indices[:test_size])
    print("test_indices_set.size:{}".format(len(test_indices_set)))
    print("-----------------------" * 10 + "\n" * 2)

    train_data_list = []
    test_data_list = []
    ids_list = []
    wts_list = []
    label_list = []

    writer_train = FileWriter(
        os.path.join(output_file_path, "train_input_part.mindrecord"), 21)
    writer_test = FileWriter(
        os.path.join(output_file_path, "test_input_part.mindrecord"), 3)

    schema = {
        "label": {
            "type": "float32",
            "shape": [-1]
        },
        "feat_vals": {
            "type": "float32",
            "shape": [-1]
        },
        "feat_ids": {
            "type": "int32",
            "shape": [-1]
        }
    }
    writer_train.add_schema(schema, "CRITEO_TRAIN")
    writer_test.add_schema(schema, "CRITEO_TEST")

    with open(input_file_path, encoding="utf-8") as file_in:
        items_error_size_lineCount = []
        count = 0
        train_part_number = 0
        test_part_number = 0
        for i, line in enumerate(file_in):
            count += 1
            if count % 1000000 == 0:
                print("Have handle {}w lines.".format(count // 10000))
            line = line.strip("\n")
            items = line.split("\t")
            if len(items) != 40:
                items_error_size_lineCount.append(i)
                continue
            label = float(items[0])
            values = items[1:14]
            cats = items[14:]

            assert len(values) == 13, "values.size: {}".format(len(values))
            assert len(cats) == 26, "cats.size: {}".format(len(cats))

            ids, wts = criteo_stats_dict.map_cat2id(values, cats)

            ids_list.extend(ids)
            wts_list.extend(wts)
            label_list.append(label)

            if count % line_per_sample == 0:
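                # Every `line_per_sample` lines form one sample; whether it
                # goes to train or test is decided by the last line's index.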
                if i not in test_indices_set:
                    train_data_list.append({
                        "feat_ids":
                        np.array(ids_list, dtype=np.int32),
                        "feat_vals":
                        np.array(wts_list, dtype=np.float32),
                        "label":
                        np.array(label_list, dtype=np.float32)
                    })
                else:
                    test_data_list.append({
                        "feat_ids":
                        np.array(ids_list, dtype=np.int32),
                        "feat_vals":
                        np.array(wts_list, dtype=np.float32),
                        "label":
                        np.array(label_list, dtype=np.float32)
                    })
                if train_data_list and len(train_data_list) % part_rows == 0:
                    writer_train.write_raw_data(train_data_list)
                    train_data_list.clear()
                    train_part_number += 1

                if test_data_list and len(test_data_list) % part_rows == 0:
                    writer_test.write_raw_data(test_data_list)
                    test_data_list.clear()
                    test_part_number += 1

                ids_list.clear()
                wts_list.clear()
                label_list.clear()

        if train_data_list:
            writer_train.write_raw_data(train_data_list)
        if test_data_list:
            writer_test.write_raw_data(test_data_list)
    writer_train.commit()
    writer_test.commit()

    print("-------------" * 10)
    print("items_error_size_lineCount.size(): {}.".format(
        len(items_error_size_lineCount)))
    print("-------------" * 10)
    np.save("items_error_size_lineCount.npy", items_error_size_lineCount)
Example #3
    schema = {
        "input_ids": {
            "type": "int32",
            "shape": [-1]
        },
    }
    writer = FileWriter(file_name=args.output_file,
                        shard_num=args.file_partition)
    writer.add_schema(schema, args.dataset_type)
    writer.open_and_set_header()
    transforms_count = 0
    if args.dataset_type == 'wiki':
        for x in tokenize_wiki(args.input_glob):
            transforms_count += 1
            writer.write_raw_data([x])
        print("Transformed {} records.".format(transforms_count))
    elif args.dataset_type == 'lambada':
        for x in tokenize_lambada(args.input_glob):
            transforms_count += 1
            writer.write_raw_data([x])
        print("Transformed {} records.".format(transforms_count))
    elif args.dataset_type == 'openwebtext':
        file_iter = glob.iglob(args.input_glob)
        with Pool(processes=args.num_process) as pool:
            pool.map(task_unit, package_file(file_iter, args.file_batch_size))
    else:
        raise ValueError("Not support dataset type: {}".format(
            args.dataset_type))

    writer.commit()
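The snippet reads several fields from an args namespace built by an argument parser outside the excerpt; a sketch of what it consumes, with placeholder values. (open_and_set_header() writes the file header up front; later MindSpore releases invoke it implicitly from write_raw_data.)

import argparse

# Placeholder values; the original parser and its defaults are not shown above.
args = argparse.Namespace(
    output_file="output.mindrecord",
    file_partition=1,
    dataset_type="wiki",       # one of: wiki, lambada, openwebtext
    input_glob="data/*.txt",
    num_process=8,
    file_batch_size=10,
)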
Example #4
def test_case_02(add_remove_file):  # multi-bytes
    data = [{
        "file_name":
        "001.jpg",
        "label":
        43,
        "float32_array":
        np.array([1.2, 2.78, 3.1234, 4.9871, 5.12341], dtype=np.float32),
        "float64_array":
        np.array([
            48.1234556789, 49.3251241431, 50.13514312414, 51.8971298471,
            123414314.2141243, 87.1212122
        ],
                 dtype=np.float64),
        "float32":
        3456.12345,
        "float64":
        1987654321.123456785,
        "source_sos_ids":
        np.array([1, 2, 3, 4, 5], dtype=np.int32),
        "source_sos_mask":
        np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64),
        "image1":
        bytes("image1 bytes abc", encoding='UTF-8'),
        "image2":
        bytes("image1 bytes def", encoding='UTF-8'),
        "image3":
        bytes("image1 bytes ghi", encoding='UTF-8'),
        "image4":
        bytes("image1 bytes jkl", encoding='UTF-8'),
        "image5":
        bytes("image1 bytes mno", encoding='UTF-8')
    }, {
        "file_name":
        "002.jpg",
        "label":
        91,
        "float32_array":
        np.array([1.2, 2.78, 4.1234, 4.9871, 5.12341], dtype=np.float32),
        "float64_array":
        np.array([
            48.1234556789, 49.3251241431, 60.13514312414, 51.8971298471,
            123414314.2141243, 87.1212122
        ],
                 dtype=np.float64),
        "float32":
        3456.12445,
        "float64":
        1987654321.123456786,
        "source_sos_ids":
        np.array([11, 2, 3, 4, 5], dtype=np.int32),
        "source_sos_mask":
        np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64),
        "image1":
        bytes("image2 bytes abc", encoding='UTF-8'),
        "image2":
        bytes("image2 bytes def", encoding='UTF-8'),
        "image3":
        bytes("image2 bytes ghi", encoding='UTF-8'),
        "image4":
        bytes("image2 bytes jkl", encoding='UTF-8'),
        "image5":
        bytes("image2 bytes mno", encoding='UTF-8')
    }, {
        "file_name":
        "003.jpg",
        "label":
        61,
        "float32_array":
        np.array([1.2, 2.78, 5.1234, 4.9871, 5.12341], dtype=np.float32),
        "float64_array":
        np.array([
            48.1234556789, 49.3251241431, 70.13514312414, 51.8971298471,
            123414314.2141243, 87.1212122
        ],
                 dtype=np.float64),
        "float32":
        3456.12545,
        "float64":
        1987654321.123456787,
        "source_sos_ids":
        np.array([21, 2, 3, 4, 5], dtype=np.int32),
        "source_sos_mask":
        np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64),
        "image1":
        bytes("image3 bytes abc", encoding='UTF-8'),
        "image2":
        bytes("image3 bytes def", encoding='UTF-8'),
        "image3":
        bytes("image3 bytes ghi", encoding='UTF-8'),
        "image4":
        bytes("image3 bytes jkl", encoding='UTF-8'),
        "image5":
        bytes("image3 bytes mno", encoding='UTF-8')
    }, {
        "file_name":
        "004.jpg",
        "label":
        29,
        "float32_array":
        np.array([1.2, 2.78, 6.1234, 4.9871, 5.12341], dtype=np.float32),
        "float64_array":
        np.array([
            48.1234556789, 49.3251241431, 80.13514312414, 51.8971298471,
            123414314.2141243, 87.1212122
        ],
                 dtype=np.float64),
        "float32":
        3456.12645,
        "float64":
        1987654321.123456788,
        "source_sos_ids":
        np.array([31, 2, 3, 4, 5], dtype=np.int32),
        "source_sos_mask":
        np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64),
        "image1":
        bytes("image4 bytes abc", encoding='UTF-8'),
        "image2":
        bytes("image4 bytes def", encoding='UTF-8'),
        "image3":
        bytes("image4 bytes ghi", encoding='UTF-8'),
        "image4":
        bytes("image4 bytes jkl", encoding='UTF-8'),
        "image5":
        bytes("image4 bytes mno", encoding='UTF-8')
    }, {
        "file_name":
        "005.jpg",
        "label":
        78,
        "float32_array":
        np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32),
        "float64_array":
        np.array([
            48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471,
            123414314.2141243, 87.1212122
        ],
                 dtype=np.float64),
        "float32":
        3456.12745,
        "float64":
        1987654321.123456789,
        "source_sos_ids":
        np.array([41, 2, 3, 4, 5], dtype=np.int32),
        "source_sos_mask":
        np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64),
        "image1":
        bytes("image5 bytes abc", encoding='UTF-8'),
        "image2":
        bytes("image5 bytes def", encoding='UTF-8'),
        "image3":
        bytes("image5 bytes ghi", encoding='UTF-8'),
        "image4":
        bytes("image5 bytes jkl", encoding='UTF-8'),
        "image5":
        bytes("image5 bytes mno", encoding='UTF-8')
    }, {
        "file_name":
        "006.jpg",
        "label":
        37,
        "float32_array":
        np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32),
        "float64_array":
        np.array([
            48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471,
            123414314.2141243, 87.1212122
        ],
                 dtype=np.float64),
        "float32":
        3456.12745,
        "float64":
        1987654321.123456789,
        "source_sos_ids":
        np.array([51, 2, 3, 4, 5], dtype=np.int32),
        "source_sos_mask":
        np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64),
        "image1":
        bytes("image6 bytes abc", encoding='UTF-8'),
        "image2":
        bytes("image6 bytes def", encoding='UTF-8'),
        "image3":
        bytes("image6 bytes ghi", encoding='UTF-8'),
        "image4":
        bytes("image6 bytes jkl", encoding='UTF-8'),
        "image5":
        bytes("image6 bytes mno", encoding='UTF-8')
    }]
    schema = {
        "file_name": {
            "type": "string"
        },
        "float32_array": {
            "type": "float32",
            "shape": [-1]
        },
        "float64_array": {
            "type": "float64",
            "shape": [-1]
        },
        "float32": {
            "type": "float32"
        },
        "float64": {
            "type": "float64"
        },
        "source_sos_ids": {
            "type": "int32",
            "shape": [-1]
        },
        "source_sos_mask": {
            "type": "int64",
            "shape": [-1]
        },
        "image1": {
            "type": "bytes"
        },
        "image2": {
            "type": "bytes"
        },
        "image3": {
            "type": "bytes"
        },
        "label": {
            "type": "int32"
        },
        "image4": {
            "type": "bytes"
        },
        "image5": {
            "type": "bytes"
        }
    }
    writer = FileWriter(TEMP_FILE, FILES_NUM)
    writer.add_schema(schema, "schema")
    writer.write_raw_data(data)
    writer.commit()

    d1 = ds.MindDataset(TEMP_FILE, None, num_readers, shuffle=False)
    d1.save(AUTO_FILE, FILES_NUM)
    data_value_to_list = []

    for item in data:
        new_data = {}
        new_data['file_name'] = np.asarray(item["file_name"], dtype='S')
        new_data['float32_array'] = item["float32_array"]
        new_data['float64_array'] = item["float64_array"]
        new_data['float32'] = item["float32"]
        new_data['float64'] = item["float64"]
        new_data['source_sos_ids'] = item["source_sos_ids"]
        new_data['source_sos_mask'] = item["source_sos_mask"]
        new_data['label'] = np.asarray([item["label"]], dtype=np.int32)
        new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
        new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
        new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8)
        new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8)
        new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8)
        data_value_to_list.append(new_data)

    d2 = ds.MindDataset(dataset_file=AUTO_FILE,
                        num_parallel_workers=num_readers,
                        shuffle=False)
    assert d2.get_dataset_size() == 6
    num_iter = 0
    for item in d2.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 13
        for field in item:
            if isinstance(item[field], np.ndarray):
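                # float32 fields were written from Python floats, so recast
                # the expected value to float32 before the exact comparison.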
                if item[field].dtype == np.float32:
                    assert (item[field] == np.array(
                        data_value_to_list[num_iter][field],
                        np.float32)).all()
                else:
                    assert (item[field] == data_value_to_list[num_iter][field]
                            ).all()
            else:
                assert item[field] == data_value_to_list[num_iter][field]
        num_iter += 1
    assert num_iter == 6
Example #5
        os.makedirs(dst_dir)

    print('number of samples:', len(lines))
    writer = FileWriter(file_name=args.dst_path, shard_num=args.num_shards)
    writer.add_schema(seg_schema, "seg_schema")
    data_list = []
    cnt = 0

    for line in lines:
        img_name = line.strip('\n')

        img_path = 'img/' + str(img_name) + '.jpg'
        label_path = 'cls_png/' + str(img_name) + '.png'

        sample_ = {"file_name": img_path.split('/')[-1]}

        with open(os.path.join(args.data_root, img_path), 'rb') as f:
            sample_['data'] = f.read()
        with open(os.path.join(args.data_root, label_path), 'rb') as f:
            sample_['label'] = f.read()
        data_list.append(sample_)
        cnt += 1
        if cnt % 1000 == 0:
            writer.write_raw_data(data_list)
            print('number of samples written:', cnt)
            data_list = []

    if data_list:
        writer.write_raw_data(data_list)
    writer.commit()
    print('number of samples written:', cnt)
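seg_schema is defined outside the excerpt; a plausible definition matching the three fields written above (an assumption):

seg_schema = {
    "file_name": {"type": "string"},
    "label": {"type": "bytes"},
    "data": {"type": "bytes"},
}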
Example #6
def test_write_read_process():
    mindrecord_file_name = "test.mindrecord"
    data = [{
        "file_name": "001.jpg",
        "label": 43,
        "score": 0.8,
        "mask": np.array([3, 6, 9], dtype=np.int64),
        "segments": np.array([[5.0, 1.6], [65.2, 8.3]], dtype=np.float32),
        "data": bytes("image bytes abc", encoding='UTF-8')
    }, {
        "file_name": "002.jpg",
        "label": 91,
        "score": 5.4,
        "mask": np.array([1, 4, 7], dtype=np.int64),
        "segments": np.array([[5.1, 9.1], [2.0, 65.4]], dtype=np.float32),
        "data": bytes("image bytes def", encoding='UTF-8')
    }, {
        "file_name": "003.jpg",
        "label": 61,
        "score": 6.4,
        "mask": np.array([7, 6, 3], dtype=np.int64),
        "segments": np.array([[0.0, 5.6], [3.0, 16.3]], dtype=np.float32),
        "data": bytes("image bytes ghi", encoding='UTF-8')
    }, {
        "file_name": "004.jpg",
        "label": 29,
        "score": 8.1,
        "mask": np.array([2, 8, 0], dtype=np.int64),
        "segments": np.array([[5.9, 7.2], [4.0, 89.0]], dtype=np.float32),
        "data": bytes("image bytes jkl", encoding='UTF-8')
    }, {
        "file_name": "005.jpg",
        "label": 78,
        "score": 7.7,
        "mask": np.array([3, 1, 2], dtype=np.int64),
        "segments": np.array([[0.6, 8.1], [5.3, 49.3]], dtype=np.float32),
        "data": bytes("image bytes mno", encoding='UTF-8')
    }, {
        "file_name": "006.jpg",
        "label": 37,
        "score": 9.4,
        "mask": np.array([7, 6, 7], dtype=np.int64),
        "segments": np.array([[4.2, 6.3], [8.9, 81.8]], dtype=np.float32),
        "data": bytes("image bytes pqr", encoding='UTF-8')
    }]
    writer = FileWriter(mindrecord_file_name)
    schema = {
        "file_name": {
            "type": "string"
        },
        "label": {
            "type": "int32"
        },
        "score": {
            "type": "float64"
        },
        "mask": {
            "type": "int64",
            "shape": [-1]
        },
        "segments": {
            "type": "float32",
            "shape": [2, 2]
        },
        "data": {
            "type": "bytes"
        }
    }
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 6
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))
Example #7
    def transfer_coco_to_mindrecord(self,
                                    mindrecord_dir,
                                    file_name="coco_det.train.mind",
                                    shard_num=1):
        """Create MindRecord file by image_dir and anno_path."""
        if not os.path.isdir(mindrecord_dir):
            os.makedirs(mindrecord_dir)
        if os.path.isdir(self.image_path) and os.path.exists(self.annot_path):
            logger.info("Create MindRecord based on COCO_HP dataset")
        else:
            raise ValueError(
                'data_dir {} or anno_path {} does not exist'.format(
                    self.image_path, self.annot_path))

        mindrecord_path = os.path.join(mindrecord_dir, file_name)
        writer = FileWriter(mindrecord_path, shard_num)

        centernet_json = {
            "img_id": {
                "type": "int32",
                "shape": [1]
            },
            "image": {
                "type": "bytes"
            },
            "num_objects": {
                "type": "int32"
            },
            "bboxes": {
                "type": "float32",
                "shape": [-1, 4]
            },
            "category_id": {
                "type": "int32",
                "shape": [-1]
            },
        }

        writer.add_schema(centernet_json, "centernet_json")

        for img_id in self.images:
            image_info = self.coco.loadImgs([img_id])
            annos = self.coco.loadAnns(self.anns[img_id])
            # get image
            img_name = image_info[0]['file_name']
            img_name = os.path.join(self.image_path, img_name)
            with open(img_name, 'rb') as f:
                image = f.read()

            bboxes = []
            category_id = []
            num_objects = len(annos)
            for anno in annos:
                bbox = self._coco_box_to_bbox(anno['bbox'])
                class_name = self.classs_dict[anno["category_id"]]
                if class_name in self.train_cls:
                    x_min, x_max = bbox[0], bbox[2]
                    y_min, y_max = bbox[1], bbox[3]
                    bboxes.append([x_min, y_min, x_max, y_max])
                    category_id.append(self.train_cls_dict[class_name])

            row = {
                "img_id": np.array([img_id], dtype=np.int32),
                "image": image,
                "num_objects": num_objects,
                "bboxes": np.array(bboxes, np.float32),
                "category_id": np.array(category_id, np.int32)
            }
            writer.write_raw_data([row])

        writer.commit()
        logger.info("Create Mindrecord Done, at {}".format(mindrecord_dir))
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True,
                        help='Input raw text file (or comma-separated list of files).')
    parser.add_argument("--output_file", type=str, required=True, help='Output MindRecord file.')
    parser.add_argument("--num_splits", type=int, default=16,
                        help='The MindRecord file will be split into the number of partition.')
    parser.add_argument("--vocab_file", type=str, required=True,
                        help='The vocabulary file that the Transformer model was trained on.')
    parser.add_argument("--clip_to_max_len", type=bool, default=False,
                        help='clip sequences to maximum sequence length.')
    parser.add_argument("--max_seq_length", type=int, default=128, help='Maximum sequence length.')
    parser.add_argument("--bucket", type=ast.literal_eval, default=[16, 32, 48, 64, 128],
                        help='bucket sequence length')

    args = parser.parse_args()

    tokenizer = tokenization.WhiteSpaceTokenizer(vocab_file=args.vocab_file)

    input_files = []
    for input_pattern in args.input_file.split(","):
        input_files.append(input_pattern)

    logging.info("*** Read from input files ***")
    for input_file in input_files:
        logging.info("  %s", input_file)

    output_file = args.output_file
    logging.info("*** Write to output files ***")
    logging.info("  %s", output_file)

    total_written = 0
    total_read = 0

    feature_dict = {}
    for i in args.bucket:
        feature_dict[i] = []

    for input_file in input_files:
        logging.info("*** Reading from   %s ***", input_file)
        with open(input_file, "r") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break

                total_read += 1
                if total_read % 100000 == 0:
                    logging.info("Read %d ...", total_read)

                source_line, target_line = line.strip().split("\t")
                source_tokens = tokenizer.tokenize(source_line)
                target_tokens = tokenizer.tokenize(target_line)

                if len(source_tokens) >= args.max_seq_length or len(target_tokens) >= args.max_seq_length:
                    logging.info("ignore long sentence!")
                    continue

                instance = create_training_instance(source_tokens, target_tokens, args.max_seq_length,
                                                    clip_to_max_len=args.clip_to_max_len)
                if instance is None:
                    continue

                features, seq_max_bucket_length = get_instance_features(instance, tokenizer, args.max_seq_length,
                                                                        args.bucket)
                if seq_max_bucket_length in feature_dict:
                    feature_dict[seq_max_bucket_length].append(features)

                if total_read <= 10:
                    logging.info("*** Example ***")
                    logging.info("source tokens: %s", " ".join(
                        [tokenization.convert_to_printable(x) for x in instance.source_eos_tokens]))
                    logging.info("target tokens: %s", " ".join(
                        [tokenization.convert_to_printable(x) for x in instance.target_sos_tokens]))

                    for feature_name in features.keys():
                        feature = features[feature_name]
                        logging.info("%s: %s", feature_name, feature)

    for i in args.bucket:
        if args.num_splits == 1:
            output_file_name = output_file
        else:
            output_file_name = output_file + '_' + str(i) + '_'
        writer = FileWriter(output_file_name, args.num_splits)
        data_schema = {"source_sos_ids": {"type": "int64", "shape": [-1]},
                       "source_sos_mask": {"type": "int64", "shape": [-1]},
                       "source_eos_ids": {"type": "int64", "shape": [-1]},
                       "source_eos_mask": {"type": "int64", "shape": [-1]},
                       "target_sos_ids": {"type": "int64", "shape": [-1]},
                       "target_sos_mask": {"type": "int64", "shape": [-1]},
                       "target_eos_ids": {"type": "int64", "shape": [-1]},
                       "target_eos_mask": {"type": "int64", "shape": [-1]}
                       }
        writer.add_schema(data_schema, "transformer")
        features_ = feature_dict[i]
        logging.info("Bucket length %d has %d samples, start writing...", i, len(features_))

        for item in features_:
            writer.write_raw_data([item])
            total_written += 1
        writer.commit()

    logging.info("Wrote %d total instances", total_written)
Example #9
def test_write_read_process_with_multi_bytes_and_array():
    mindrecord_file_name = "test.mindrecord"
    data = [{"file_name": "001.jpg", "label": 4,
             "image1": bytes("image1 bytes abc", encoding='UTF-8'),
             "image2": bytes("image1 bytes def", encoding='UTF-8'),
             "source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "image3": bytes("image1 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image1 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image1 bytes mno", encoding='UTF-8'),
             "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)},
            {"file_name": "002.jpg", "label": 5,
             "image1": bytes("image2 bytes abc", encoding='UTF-8'),
             "image2": bytes("image2 bytes def", encoding='UTF-8'),
             "image3": bytes("image2 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image2 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image2 bytes mno", encoding='UTF-8'),
             "source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)},
            {"file_name": "003.jpg", "label": 6,
             "source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "image1": bytes("image3 bytes abc", encoding='UTF-8'),
             "image2": bytes("image3 bytes def", encoding='UTF-8'),
             "image3": bytes("image3 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image3 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image3 bytes mno", encoding='UTF-8'),
             "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)},
            {"file_name": "004.jpg", "label": 7,
             "source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "image1": bytes("image4 bytes abc", encoding='UTF-8'),
             "image2": bytes("image4 bytes def", encoding='UTF-8'),
             "image3": bytes("image4 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image4 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image4 bytes mno", encoding='UTF-8'),
             "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)},
            {"file_name": "005.jpg", "label": 8,
             "source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64),
             "image1": bytes("image5 bytes abc", encoding='UTF-8'),
             "image2": bytes("image5 bytes def", encoding='UTF-8'),
             "image3": bytes("image5 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image5 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image5 bytes mno", encoding='UTF-8'),
             "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)},
            {"file_name": "006.jpg", "label": 9,
             "source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64),
             "image1": bytes("image6 bytes abc", encoding='UTF-8'),
             "image2": bytes("image6 bytes def", encoding='UTF-8'),
             "image3": bytes("image6 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image6 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image6 bytes mno", encoding='UTF-8'),
             "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)}
            ]

    writer = FileWriter(mindrecord_file_name)
    schema = {"file_name": {"type": "string"},
              "image1": {"type": "bytes"},
              "image2": {"type": "bytes"},
              "source_sos_ids": {"type": "int64", "shape": [-1]},
              "source_sos_mask": {"type": "int64", "shape": [-1]},
              "image3": {"type": "bytes"},
              "image4": {"type": "bytes"},
              "image5": {"type": "bytes"},
              "target_sos_ids": {"type": "int64", "shape": [-1]},
              "target_sos_mask": {"type": "int64", "shape": [-1]},
              "target_eos_ids": {"type": "int64", "shape": [-1]},
              "target_eos_mask": {"type": "int64", "shape": [-1]},
              "label": {"type": "int32"}}
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 13
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["source_sos_ids", "source_sos_mask",
                                                                 "target_sos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["image2", "source_sos_mask",
                                                                 "image3", "target_sos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 4
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["target_sos_ids", "image4",
                                                                 "source_sos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["target_sos_ids", "image5",
                                                                 "image4", "image3", "source_sos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 5
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["target_eos_mask", "image5", "image2",
                                                                 "source_sos_mask", "label"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 5
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))
Example #10
def test_write_read_process_with_multi_array():
    mindrecord_file_name = "test.mindrecord"
    data = [{"source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([13, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([19, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([113, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([119, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([213, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([219, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([313, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([319, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([413, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([419, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)},
            {"source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64),
             "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64),
             "source_eos_ids": np.array([513, 14, 15, 16, 17, 18], dtype=np.int64),
             "source_eos_mask": np.array([519, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64),
             "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64),
             "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64),
             "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64),
             "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)}
            ]
    writer = FileWriter(mindrecord_file_name)
    schema = {"source_sos_ids": {"type": "int64", "shape": [-1]},
              "source_sos_mask": {"type": "int64", "shape": [-1]},
              "source_eos_ids": {"type": "int64", "shape": [-1]},
              "source_eos_mask": {"type": "int64", "shape": [-1]},
              "target_sos_ids": {"type": "int64", "shape": [-1]},
              "target_sos_mask": {"type": "int64", "shape": [-1]},
              "target_eos_ids": {"type": "int64", "shape": [-1]},
              "target_eos_mask": {"type": "int64", "shape": [-1]}}
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 8
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["source_eos_ids", "source_eos_mask",
                                                                 "target_sos_ids", "target_sos_mask",
                                                                 "target_eos_ids", "target_eos_mask"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 6
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["source_sos_ids",
                                                                 "target_sos_ids",
                                                                 "target_eos_mask"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["target_eos_mask",
                                                                 "source_eos_mask",
                                                                 "source_sos_mask"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader = FileReader(file_name=mindrecord_file_name, columns=["target_eos_ids"])
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 1
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))
Example #11
def test_write_read_process_with_multi_bytes():
    mindrecord_file_name = "test.mindrecord"
    data = [{"file_name": "001.jpg", "label": 43,
             "image1": bytes("image1 bytes abc", encoding='UTF-8'),
             "image2": bytes("image1 bytes def", encoding='UTF-8'),
             "image3": bytes("image1 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image1 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image1 bytes mno", encoding='UTF-8')},
            {"file_name": "002.jpg", "label": 91,
             "image1": bytes("image2 bytes abc", encoding='UTF-8'),
             "image2": bytes("image2 bytes def", encoding='UTF-8'),
             "image3": bytes("image2 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image2 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image2 bytes mno", encoding='UTF-8')},
            {"file_name": "003.jpg", "label": 61,
             "image1": bytes("image3 bytes abc", encoding='UTF-8'),
             "image2": bytes("image3 bytes def", encoding='UTF-8'),
             "image3": bytes("image3 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image3 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image3 bytes mno", encoding='UTF-8')},
            {"file_name": "004.jpg", "label": 29,
             "image1": bytes("image4 bytes abc", encoding='UTF-8'),
             "image2": bytes("image4 bytes def", encoding='UTF-8'),
             "image3": bytes("image4 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image4 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image4 bytes mno", encoding='UTF-8')},
            {"file_name": "005.jpg", "label": 78,
             "image1": bytes("image5 bytes abc", encoding='UTF-8'),
             "image2": bytes("image5 bytes def", encoding='UTF-8'),
             "image3": bytes("image5 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image5 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image5 bytes mno", encoding='UTF-8')},
            {"file_name": "006.jpg", "label": 37,
             "image1": bytes("image6 bytes abc", encoding='UTF-8'),
             "image2": bytes("image6 bytes def", encoding='UTF-8'),
             "image3": bytes("image6 bytes ghi", encoding='UTF-8'),
             "image4": bytes("image6 bytes jkl", encoding='UTF-8'),
             "image5": bytes("image6 bytes mno", encoding='UTF-8')}
            ]
    writer = FileWriter(mindrecord_file_name)
    schema = {"file_name": {"type": "string"},
              "image1": {"type": "bytes"},
              "image2": {"type": "bytes"},
              "image3": {"type": "bytes"},
              "label": {"type": "int32"},
              "image4": {"type": "bytes"},
              "image5": {"type": "bytes"}}
    writer.add_schema(schema, "data is so cool")
    writer.write_raw_data(data)
    writer.commit()

    reader = FileReader(mindrecord_file_name)
    count = 0
    for index, x in enumerate(reader.get_next()):
        assert len(x) == 7
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader.close()

    reader2 = FileReader(file_name=mindrecord_file_name, columns=["image1", "image2", "image5"])
    count = 0
    for index, x in enumerate(reader2.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader2.close()

    reader3 = FileReader(file_name=mindrecord_file_name, columns=["image2", "image4"])
    count = 0
    for index, x in enumerate(reader3.get_next()):
        assert len(x) == 2
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader3.close()

    reader4 = FileReader(file_name=mindrecord_file_name, columns=["image5", "image2"])
    count = 0
    for index, x in enumerate(reader4.get_next()):
        assert len(x) == 2
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader4.close()

    reader5 = FileReader(file_name=mindrecord_file_name, columns=["image5", "image2", "label"])
    count = 0
    for index, x in enumerate(reader5.get_next()):
        assert len(x) == 3
        for field in x:
            if isinstance(x[field], np.ndarray):
                assert (x[field] == data[count][field]).all()
            else:
                assert x[field] == data[count][field]
        count = count + 1
        logger.info("#item{}: {}".format(index, x))
    assert count == 6
    reader5.close()

    os.remove("{}".format(mindrecord_file_name))
    os.remove("{}.db".format(mindrecord_file_name))