def create_diff_page_size_cv_mindrecord(files_num):
    """tutorial for cv dataset writer."""
    # Drop any leftover shard and its index database from a previous run.
    stale_files = (CV1_FILE_NAME, "{}.db".format(CV1_FILE_NAME))
    for stale in stale_files:
        if os.path.exists(stale):
            os.remove(stale)

    writer = FileWriter(CV1_FILE_NAME, files_num)
    writer.set_page_size(1 << 26)  # 64MB

    cv_schema_json = {
        "file_name": {"type": "string"},
        "label": {"type": "int32"},
        "data": {"type": "bytes"},
    }
    # Single synthetic sample; the payload bytes are arbitrary placeholder data.
    data = [{
        "file_name": "001.jpg",
        "label": 43,
        "data": bytes('0xffsafdafda', encoding='utf-8'),
    }]

    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(data)
    writer.commit()
def init_writer(mr_schema):
    """ init writer """
    print("Init writer ...")
    writer = FileWriter(args.mindrecord_file, args.mindrecord_partitions)

    # Only override the header size when the requested bit width differs
    # from the writer's default (1 << 24).
    if args.mindrecord_header_size_by_bit != 24:
        writer.set_header_size(1 << args.mindrecord_header_size_by_bit)

    # Only override the page size when the requested bit width differs
    # from the writer's default (1 << 25).
    if args.mindrecord_page_size_by_bit != 25:
        writer.set_page_size(1 << args.mindrecord_page_size_by_bit)

    # Register the schema, then write the file header up front so raw
    # data can be appended afterwards.
    writer.add_schema(mr_schema, "mindrecord_graph_schema")
    writer.open_and_set_header()
    return writer
def add_and_remove_nlp_file():
    """add/remove nlp file"""
    file_paths = ["{}{}".format(NLP_FILE_NAME, str(idx).rjust(1, '0'))
                  for idx in range(FILES_NUM)]
    # Clear any shard or index database left behind by an earlier run.
    for path in file_paths:
        for candidate in (path, "{}.db".format(path)):
            if os.path.exists(candidate):
                os.remove(candidate)

    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
    data = list(get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10))
    nlp_schema_json = {
        "id": {"type": "string"},
        "label": {"type": "int32"},
        "rating": {"type": "float32"},
        "input_ids": {"type": "int64", "shape": [-1]},
        "input_mask": {"type": "int64", "shape": [1, -1]},
        "segment_ids": {"type": "int64", "shape": [2, -1]},
    }
    writer.set_header_size(1 << 14)
    writer.set_page_size(1 << 15)
    writer.add_schema(nlp_schema_json, "nlp_schema")
    writer.add_index(["id", "rating"])
    writer.write_raw_data(data)
    writer.commit()

    # Hand control to the test body; cleanup below runs on teardown.
    yield "yield_nlp_data"

    for path in file_paths:
        os.remove(path)
        os.remove("{}.db".format(path))
def test_issue_84():
    """test file reader when db does not match."""
    # Write an imagenet-style mindrecord set.
    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
    samples = get_data("../data/mindrecord/testImageNetData/")
    cv_schema_json = {
        "file_name": {"type": "string"},
        "label": {"type": "number"},
        "data": {"type": "bytes"},
    }
    writer.add_schema(cv_schema_json, "img_schema")
    writer.add_index(["file_name", "label"])
    writer.write_raw_data(samples)
    writer.commit()

    # Write an aclImdb-style mindrecord set with a different schema.
    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
    samples = list(get_nlp_data("../data/mindrecord/testAclImdbData/pos",
                                "../data/mindrecord/testAclImdbData/vocab.txt",
                                10))
    nlp_schema_json = {
        "id": {"type": "string"},
        "label": {"type": "number"},
        "rating": {"type": "number"},
        "input_ids": {"type": "array", "items": {"type": "number"}},
        "input_mask": {"type": "array", "items": {"type": "number"}},
        "segment_ids": {"type": "array", "items": {"type": "number"}},
    }
    writer.set_header_size(1 << 14)
    writer.set_page_size(1 << 15)
    writer.add_schema(nlp_schema_json, "nlp_schema")
    writer.add_index(["id", "rating"])
    writer.write_raw_data(samples)
    writer.commit()

    reader = ShardReader()
    # Swap the nlp index database in under the imagenet shard name so the
    # reader sees a .db that does not match its mindrecord file.
    os.rename("imagenet.mindrecord1.db", "imagenet.mindrecord1.db.bk")
    os.rename("aclImdb.mindrecord1.db", "imagenet.mindrecord1.db")
    file_name = os.path.join(os.getcwd(), "imagenet.mindrecord1")
    with pytest.raises(Exception) as e:
        reader.open(file_name)
    assert str(e.value) == "[MRMOpenError]: error_code: 1347690596, " \
                           "error_msg: " \
                           "MindRecord File could not open successfully."

    # Put the nlp index database back, then remove every nlp shard.
    os.rename("imagenet.mindrecord1.db", "aclImdb.mindrecord1.db")
    nlp_paths = ["{}{}".format(NLP_FILE_NAME, str(idx).rjust(1, '0'))
                 for idx in range(FILES_NUM)]
    for item in nlp_paths:
        os.remove(item)
        os.remove("{}.db".format(item))

    # Restore the imagenet index database, then remove every imagenet shard.
    os.rename("imagenet.mindrecord1.db.bk", "imagenet.mindrecord1.db")
    cv_paths = ["{}{}".format(CV_FILE_NAME, str(idx).rjust(1, '0'))
                for idx in range(FILES_NUM)]
    for item in cv_paths:
        os.remove(item)
        os.remove("{}.db".format(item))
# get number of files writer = FileWriter(args.mindrecord_file, args.mindrecord_partitions) start_time = time.time() # set the header size try: header_size = mr_api.mindrecord_header_size writer.set_header_size(header_size) except AttributeError: print("Default header size: {}".format(1 << 24)) # set the page size try: page_size = mr_api.mindrecord_page_size writer.set_page_size(page_size) except AttributeError: print("Default page size: {}".format(1 << 25)) # get schema try: mindrecord_schema = mr_api.mindrecord_schema except AttributeError: raise RuntimeError("mindrecord_schema is not defined in mr_api.py.") # create the schema writer.add_schema(mindrecord_schema, "mindrecord_schema") # add the index try: index_fields = mr_api.mindrecord_index_fields