def _init_reader_schema(self, field_names=None):
    """Restore a reader schema from the DB file.

    If `field_names` is given, restore the schema from it. Otherwise, load
    blobs from the DB file into the workspace and restore the schema from
    those blob names. It is also assumed that:
        1). Each field of the schema has corresponding blobs stored in the
            DB file.
        2). Each blob loaded from the DB file corresponds to a field of the
            schema.
        3). field_names in the original schema are in alphabetic order,
            since blob names loaded into the workspace from the DB file
            will be in alphabetic order.

    Load a set of blobs from a DB file. From the names of these blobs,
    restore the DB file schema using `from_column_list(...)`.

    Returns:
        schema: schema.Struct. Used in Reader.__init__(...).
    """
    if field_names:
        return from_column_list(field_names)

    if self.db_type == "log_file_db":
        assert os.path.exists(self.db_path), \
            'db_path [{db_path}] does not exist'.format(db_path=self.db_path)
    with core.NameScope(self.name):
        # blob_prefix is for avoiding name conflicts in the workspace
        blob_prefix = scope.CurrentNameScope()
    workspace.RunOperatorOnce(
        core.CreateOperator(
            'Load',
            [],
            [],
            absolute_path=True,
            db=self.db_path,
            db_type=self.db_type,
            load_all=True,
            add_prefix=blob_prefix,
        ))
    col_names = [
        blob_name[len(blob_prefix):]
        for blob_name in sorted(workspace.Blobs())
        if blob_name.startswith(blob_prefix)
    ]
    schema = from_column_list(col_names)
    return schema
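# --- Usage sketch (illustrative, not part of the source) ---
# `from_column_list` rebuilds a nested Struct from flattened, colon-joined
# column names, which is what `_init_reader_schema` relies on after the
# `Load` op. The field names below are made up for illustration; note the
# 'lengths'/'values' pair is recovered as a List field, as exercised by the
# testFromColumnList test further down.
from caffe2.python import schema

col_names = ['dense', 'label', 'sparse:lengths', 'sparse:values']
rec = schema.from_column_list(col_names)
# The reconstructed Struct flattens back to the same alphabetically ordered
# names, so a schema saved in alphabetic order round-trips cleanly.
assert rec.field_names() == col_names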
def __init__(self, fields, name=None, capacity=1,
             enforce_unique_name=False, num_threads=1):
    assert isinstance(fields, list) or isinstance(fields, Struct), (
        'fields must be either a Struct or a list of raw field names.')
    if isinstance(fields, list):
        fields = from_column_list(fields)
    self.schema = fields
    self.name = name or 'queue'
    self.num_threads = num_threads
    num_blobs = len(self.schema.field_names())
    init_net = core.Net(self.name + '/init_net')
    self.blobs_queue = init_net.CreateBlobsQueue(
        [], 1,
        capacity=capacity,
        num_blobs=num_blobs,
        enforce_unique_name=enforce_unique_name)
    core.workspace.RunNetOnce(init_net)
    self.writer = _QueueWriter(self.blobs_queue, self.schema)
    reader_name = self.name + '_reader'
    self.reader = _QueueReader(self.blobs_queue, self.schema, reader_name)
    exit_net = core.Net(self.name + '/exit_net')
    exit_net.CloseBlobsQueue(self.blobs_queue, 0)
    self.exit_step = core.execution_step(
        '{}_close_step'.format(str(exit_net)), exit_net)
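# --- Construction sketch (illustrative; the class name `Queue` is an
# assumption, since only the __init__ appears above) ---
# Constructing the wrapper from raw field names builds the schema via
# from_column_list and allocates the backing BlobsQueue immediately.
q = Queue(fields=['dense', 'label'], name='trainer_queue', capacity=8)
assert q.schema.field_names() == ['dense', 'label']
# q.writer / q.reader follow the dataio Writer/Reader interface, and
# q.exit_step can be attached to a plan to close the BlobsQueue on shutdown.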
def testFromColumnList(self):
    st = schema.Struct(
        ('a', schema.Scalar()),
        ('b', schema.List(schema.Scalar())),
        ('c', schema.Map(schema.Scalar(), schema.Scalar()))
    )
    columns = st.field_names()
    # test that recovery works for arbitrary order
    for _ in range(10):
        some_blobs = [core.BlobReference('blob:' + x) for x in columns]
        rec = schema.from_column_list(columns, col_blobs=some_blobs)
        self.assertTrue(rec.has_blobs())
        self.assertEqual(sorted(st.field_names()), sorted(rec.field_names()))
        self.assertEqual(
            [str(blob) for blob in rec.field_blobs()],
            [str('blob:' + name) for name in rec.field_names()])
        random.shuffle(columns)
def shrink_output_schema(net, out_schema):
    if len(out_schema.field_names()) <= 1:
        return out_schema
    exists = [net.BlobIsDefined(blob) for blob in out_schema.field_blobs()]
    return schema.from_column_list(
        [
            col_name for ok, col_name in
            zip(exists, out_schema.field_names()) if ok
        ],
        [
            col_type for ok, col_type in
            zip(exists, out_schema.field_types()) if ok
        ],
        [
            col_blob for ok, col_blob in
            zip(exists, out_schema.field_blobs()) if ok
        ],
        [
            col_meta for ok, col_meta in
            zip(exists, out_schema.field_metadata()) if ok
        ])
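# --- Illustration of the zip-filter pattern above (values are made up) ---
# Each of the four parallel lists (names, types, blobs, metadata) is filtered
# by the same `exists` mask, so the surviving columns stay aligned with one
# another after shrinking.
exists = [True, False, True]
names = ['loss', 'aux_output', 'prediction']
kept = [n for ok, n in zip(exists, names) if ok]
assert kept == ['loss', 'prediction']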
def __init__(self, fields, name=None):
    """Create an un-initialized dataset with schema provided by `fields`.

    Before this dataset can be used, it must be initialized, either by
    `init_empty` or `init_from_dataframe`.

    Args:
        fields: either a schema.Struct or a list of field names in a format
                compatible with the one described in schema.py.
        name: optional name to prepend to blobs that will store the data.
    """
    assert isinstance(fields, list) or isinstance(fields, Struct), (
        'fields must be either a Struct or a list of raw field names.')
    if isinstance(fields, list):
        fields = from_column_list(fields)
    self.schema = fields
    self.fields = fields.field_names()
    self.field_types = fields.field_types()
    self.name = name or 'dataset'
    self.field_blobs = fields.field_blobs() if fields.has_blobs() else None
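# --- Construction sketch (illustrative; assumes the __init__ above is
# caffe2's dataset.Dataset, and field names are made up) ---
# Per the docstring, the dataset must be initialized before use; here we use
# `init_empty` to allocate empty content blobs for each field.
from caffe2.python import core, dataset, workspace

ds = dataset.Dataset(['dense', 'label'], name='train_data')
init_net = core.Net('dataset_init')
ds.init_empty(init_net)
workspace.RunNetOnce(init_net)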
def _init_reader_schema(self):
    """Restore a reader schema from the DB file.

    Here it is assumed that:
        1). Each field of the schema has corresponding blobs stored in the
            DB file.
        2). Each blob loaded from the DB file corresponds to a field of the
            schema.

    Load a set of blobs from a DB file. From the names of these blobs,
    restore the DB file schema using `from_column_list(...)`.

    Returns:
        schema: schema.Struct. Used in Reader.__init__(...).
    """
    assert os.path.exists(self.db_path), \
        'db_path [{db_path}] does not exist'.format(db_path=self.db_path)
    with core.NameScope(self.name):
        # blob_prefix is for avoiding name conflicts in the workspace
        blob_prefix = scope.CurrentNameScope()
    workspace.RunOperatorOnce(
        core.CreateOperator(
            'Load',
            [],
            [],
            absolute_path=True,
            db=self.db_path,
            db_type=self.db_type,
            load_all=True,
            add_prefix=blob_prefix,
        )
    )
    col_names = [
        blob_name[len(blob_prefix):]
        for blob_name in workspace.Blobs()
        if blob_name.startswith(blob_prefix)
    ]
    schema = from_column_list(col_names)
    return schema
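# --- The prefix stripping above in isolation (blob names are made up) ---
# Blobs loaded with `add_prefix` land in the workspace under the reader's
# name scope; stripping that prefix recovers the original column names while
# ignoring unrelated blobs.
blob_prefix = 'my_reader/'
workspace_blobs = ['my_reader/dense', 'my_reader/label', 'other/blob']
col_names = [
    b[len(blob_prefix):] for b in workspace_blobs
    if b.startswith(blob_prefix)
]
assert col_names == ['dense', 'label']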
def testFromEmptyColumnList(self):
    st = schema.Struct()
    columns = st.field_names()
    rec = schema.from_column_list(col_names=columns)
    self.assertEqual(rec, schema.Struct())
def get_predictor_export_meta_and_workspace(
    self, feature_extractor=None, output_transformer=None
):
    """
    ONNX loads blobs into a private workspace. We return that workspace
    here, instead of copying the blobs to the global workspace, in order to
    save memory in the export state. By returning the private workspace, we
    only need memory for the PyTorch model, the ONNX buffer, and the Caffe2
    model. Including optimizer parameters, this means we can train and save
    a model at a quarter of the size of machine memory. We should revisit
    this once PyTorch 1.0 is ready.

    Args:
        feature_extractor: An instance of FeatureExtractorBase
        output_transformer: An optional transformer applied to the model
            output
    """
    # 1. Get Caffe2 model
    c2_model, input_blobs, output_blobs = self.get_caffe2_model()
    ws = c2_model.workspace

    # Initializing constants in the model
    init_net = core.Net(c2_model.init_net)
    ws.CreateNet(init_net)
    ws.RunNet(init_net)

    # Per ONNX code comment, input blobs are not initialized
    model_inputs = c2_model.uninitialized
    assert len(model_inputs) > 0, "Model is expected to have some input"
    parameters = [b for b in ws.Blobs() if b not in model_inputs]
    # Input blobs in order
    model_input_blobs = [b for b in input_blobs if b in model_inputs]

    predict_net = core.Net("predict_net")

    output_blob_names = self.output_blob_names()
    assert len(output_blobs) == len(output_blob_names), (
        "output_blobs and output_blob_names must have the same lengths. "
        "Check that your model doesn't reuse output tensors. "
        "output_blobs: {}; output_blob_names: {}".format(
            output_blobs, output_blob_names))
    blob_remap = {
        onnx_name: explicit_name
        for onnx_name, explicit_name in zip(output_blobs, output_blob_names)
    }

    shapes = {}

    # 2. Create feature extractor net
    if feature_extractor:
        feature_extractor_nets = feature_extractor.create_net()
        # Initializing feature extractor parameters
        ws.CreateNet(feature_extractor_nets.init_net)
        ws.RunNet(feature_extractor_nets.init_net)
        feature_extractor_params = set(
            feature_extractor_nets.init_net.Proto().external_output)
        assert (len(set(parameters) & feature_extractor_params) == 0
                ), "Blob names collide! Please open a bug report"
        parameters += feature_extractor_params
        extracted_blobs = [
            str(b)
            for b in feature_extractor_nets.net.output_record().field_blobs()
        ]
        assert len(model_input_blobs) == len(extracted_blobs), (
            "The lengths of model_input_blobs and extracted_blobs must "
            "match. model_input_blobs: {}; extracted_blobs: {}".format(
                model_input_blobs, extracted_blobs))
        blob_remap.update({
            onnx_name: extracted_name
            for onnx_name, extracted_name in zip(model_input_blobs,
                                                 extracted_blobs)
        })
        predict_net.AppendNet(feature_extractor_nets.net)
        del predict_net.Proto().external_output[:]
        input_blobs = [
            b for b in predict_net.Proto().external_input
            if b not in feature_extractor_params
        ]
        shapes.update({b: [] for b in input_blobs})
    else:
        input_blobs = model_input_blobs

    # 3. Rename the input blobs of the model to match the output of the
    # feature extractor net
    model_net = core.Net(c2_model.predict_net).Clone(
        "remapped_model_net", blob_remap=blob_remap)

    # 4. Join feature extractor net & model net
    predict_net.AppendNet(model_net)

    if output_transformer is not None:
        output_field_names = self.output_field_names()
        original_output = schema.from_column_list(
            col_names=output_field_names,
            col_blobs=[core.BlobReference(b) for b in output_blob_names],
        )
        output_transformer_nets = output_transformer.create_net(
            original_output)
        # Initializing output transformer parameters
        ws.CreateNet(output_transformer_nets.init_net)
        ws.RunNet(output_transformer_nets.init_net)
        output_transformer_params = set(
            output_transformer_nets.init_net.Proto().external_output)
        assert (len(set(parameters) & output_transformer_params) == 0
                ), "Blob names collide! Please open a bug report"
        parameters += output_transformer_params
        del predict_net.Proto().external_output[:]
        predict_net.AppendNet(output_transformer_nets.net)

    # These shapes are not really used but are required, so just pass
    # fake ones
    shapes.update({b: [] for b in predict_net.Proto().external_output})

    return (
        PredictorExportMeta(
            predict_net,
            parameters,
            input_blobs,
            predict_net.Proto().external_output,
            shapes=shapes,
            net_type="async_scheduling",
            num_workers=8,
        ),
        ws,
    )
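# --- Hypothetical call site (all names below are placeholders, not from
# the source) ---
# The function returns a PredictorExportMeta plus the private ONNX
# workspace; the meta can then be serialized while parameter blobs are read
# from `ws` rather than the global workspace.
pem, ws = trainer.get_predictor_export_meta_and_workspace(
    feature_extractor=extractor,  # an instance of FeatureExtractorBase
)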