Example #1
def create_diff_page_size_cv_mindrecord(files_num):
    """tutorial for cv dataset writer."""
    # Clear any artifacts left by a previous run so the writer starts fresh.
    for stale in (CV1_FILE_NAME, "{}.db".format(CV1_FILE_NAME)):
        if os.path.exists(stale):
            os.remove(stale)
    writer = FileWriter(CV1_FILE_NAME, files_num)
    writer.set_page_size(1 << 26)  # 64MB
    # Schema: image file name, integer label, and raw image bytes.
    cv_schema_json = {"file_name": {"type": "string"},
                      "label": {"type": "int32"},
                      "data": {"type": "bytes"}}
    # One sample record matching the schema above.
    data = [{"file_name": "001.jpg",
             "label": 43,
             "data": bytes('0xffsafdafda', encoding='utf-8')}]
    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(data)
    writer.commit()
Example #2
    def init_writer(mr_schema):
        """Build a FileWriter configured from the command-line ``args``.

        Registers *mr_schema*, applies any non-default header/page sizes,
        opens the file, and returns the ready-to-use writer.
        """
        print("Init writer  ...")
        mr_writer = FileWriter(args.mindrecord_file,
                               args.mindrecord_partitions)

        # 24 / 25 are the library defaults (1 << 24 header, 1 << 25 page);
        # only call the setters when the user asked for something else.
        if args.mindrecord_header_size_by_bit != 24:
            mr_writer.set_header_size(1 << args.mindrecord_header_size_by_bit)
        if args.mindrecord_page_size_by_bit != 25:
            mr_writer.set_page_size(1 << args.mindrecord_page_size_by_bit)

        # Register the record layout, then write the file header.
        mr_writer.add_schema(mr_schema, "mindrecord_graph_schema")
        mr_writer.open_and_set_header()

        return mr_writer
Example #3
def add_and_remove_nlp_file():
    """Fixture-style generator: write partitioned NLP mindrecord files,
    yield to the caller, then remove the files on teardown.

    Yields:
        str: the sentinel ``"yield_nlp_data"`` once the files are written.
    """
    paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0'))
             for x in range(FILES_NUM)]
    # Setup: clear leftovers from a previous (possibly failed) run.
    for path in paths:
        for name in (path, "{}.db".format(path)):
            if os.path.exists(name):
                os.remove(name)
    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
    # list(...) instead of a pass-through comprehension (ruff PERF402).
    data = list(get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10))
    # "shape": [-1] marks a variable-length axis in the record layout.
    nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "int32"},
                       "rating": {"type": "float32"},
                       "input_ids": {"type": "int64",
                                     "shape": [-1]},
                       "input_mask": {"type": "int64",
                                      "shape": [1, -1]},
                       "segment_ids": {"type": "int64",
                                       "shape": [2, -1]}
                       }
    writer.set_header_size(1 << 14)
    writer.set_page_size(1 << 15)
    writer.add_schema(nlp_schema_json, "nlp_schema")
    writer.add_index(["id", "rating"])
    writer.write_raw_data(data)
    writer.commit()
    yield "yield_nlp_data"
    # Teardown: remove every partition and its companion .db index file.
    for path in paths:
        os.remove(path)
        os.remove("{}.db".format(path))
Example #4
def test_issue_84():
    """test file reader when db does not match.

    Regression test: write a CV dataset and an NLP dataset, then swap one
    dataset's .db index file under the other's data file and verify that
    ShardReader.open fails with MRMOpenError instead of reading bad data.
    """
    # Write the image dataset (imagenet.mindrecord* + .db index files).
    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
    data = get_data("../data/mindrecord/testImageNetData/")
    cv_schema_json = {"file_name": {"type": "string"},
                      "label": {"type": "number"}, "data": {"type": "bytes"}}
    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(data)
    writer.commit()

    # Write the text dataset (aclImdb.mindrecord* + .db index files).
    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
    data = list(get_nlp_data("../data/mindrecord/testAclImdbData/pos",
                             "../data/mindrecord/testAclImdbData/vocab.txt",
                             10))
    nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "number"},
                       "rating": {"type": "number"},
                       "input_ids": {"type": "array",
                                     "items": {"type": "number"}},
                       "input_mask": {"type": "array",
                                      "items": {"type": "number"}},
                       "segment_ids": {"type": "array",
                                       "items": {"type": "number"}}
                       }
    writer.set_header_size(1 << 14)
    writer.set_page_size(1 << 15)
    writer.add_schema(nlp_schema_json, "nlp_schema")
    writer.add_index(["id", "rating"])
    writer.write_raw_data(data)
    writer.commit()

    reader = ShardReader()
    # Stash the real CV index, then put the NLP index in its place so the
    # data file and .db index no longer match.
    os.rename("imagenet.mindrecord1.db", "imagenet.mindrecord1.db.bk")
    os.rename("aclImdb.mindrecord1.db", "imagenet.mindrecord1.db")
    file_name = os.path.join(os.getcwd(), "imagenet.mindrecord1")
    # Opening with a mismatched index must raise, not succeed silently.
    with pytest.raises(Exception) as e:
        reader.open(file_name)
    assert str(e.value) == "[MRMOpenError]: error_code: 1347690596, " \
                           "error_msg: " \
                           "MindRecord File could not open successfully."

    # Restore the NLP index to its original name, then delete the NLP files.
    os.rename("imagenet.mindrecord1.db", "aclImdb.mindrecord1.db")
    paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0'))
             for x in range(FILES_NUM)]
    for item in paths:
        os.remove("{}".format(item))
        os.remove("{}.db".format(item))

    # Restore the stashed CV index, then delete the CV files.
    os.rename("imagenet.mindrecord1.db.bk", "imagenet.mindrecord1.db")
    paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0'))
             for x in range(FILES_NUM)]
    for item in paths:
        os.remove("{}".format(item))
        os.remove("{}.db".format(item))
Example #5
    # get number of files
    writer = FileWriter(args.mindrecord_file, args.mindrecord_partitions)

    start_time = time.time()

    # set the header size
    try:
        header_size = mr_api.mindrecord_header_size
        writer.set_header_size(header_size)
    except AttributeError:
        print("Default header size: {}".format(1 << 24))

    # set the page size
    try:
        page_size = mr_api.mindrecord_page_size
        writer.set_page_size(page_size)
    except AttributeError:
        print("Default page size: {}".format(1 << 25))

    # get schema
    try:
        mindrecord_schema = mr_api.mindrecord_schema
    except AttributeError:
        raise RuntimeError("mindrecord_schema is not defined in mr_api.py.")

    # create the schema
    writer.add_schema(mindrecord_schema, "mindrecord_schema")

    # add the index
    try:
        index_fields = mr_api.mindrecord_index_fields