def test_get_predictor_export_meta_and_workspace_full(self):
    """Export the model with a feature extractor and an output transformer.

    The exported predictor is saved to a temporary minidb file, reloaded,
    run on the model's input prototype, and its
    ``string_weighted_multi_categorical_features`` outputs are compared with
    the ``linear`` output of the in-process model.
    """
    model = Model()

    # Feature ids 1-4 are state features, 5-8 are action features.
    state_normalization_parameters = {
        i: NormalizationParameters(feature_type=CONTINUOUS)
        for i in range(1, 5)
    }
    action_normalization_parameters = {
        i: NormalizationParameters(feature_type=CONTINUOUS)
        for i in range(5, 9)
    }
    extractor = PredictorFeatureExtractor(
        state_normalization_parameters=state_normalization_parameters,
        action_normalization_parameters=action_normalization_parameters,
        normalize=False,
    )
    output_transformer = TestOutputTransformer()

    pem, ws = model.get_predictor_export_meta_and_workspace(
        feature_extractor=extractor, output_transformer=output_transformer
    )
    # model has 2 params + 1 const. extractor has 1 const.
    # output_transformer has 1 const.
    self.assertEqual(5, len(pem.parameters))
    for p in pem.parameters:
        self.assertTrue(ws.HasBlob(p))
    self.assertEqual(3, len(pem.inputs))
    self.assertEqual(5, len(pem.outputs))
    self.assertEqual(
        {
            "output/string_weighted_multi_categorical_features.lengths",
            "output/string_weighted_multi_categorical_features.keys",
            "output/string_weighted_multi_categorical_features.values.lengths",
            "output/string_weighted_multi_categorical_features.values.keys",
            "output/string_weighted_multi_categorical_features.values.values",
        },
        set(pem.outputs),
    )

    input_prototype = model.input_prototype()

    with tempfile.TemporaryDirectory() as tmpdirname:
        db_path = os.path.join(tmpdirname, "model")
        logger.info("DB path: %s", db_path)
        db_type = "minidb"
        with ws._ctx:
            save_to_db(db_type, db_path, pem)

        # Load the model from DB file and run it
        net = prepare_prediction_net(db_path, db_type)

        # Flatten state + action features into the sparse
        # (keys, values, lengths) layout fed to the "input/float_features.*"
        # blobs.
        state_features = input_prototype.state.float_features
        action_features = input_prototype.action.float_features
        float_features_values = (
            torch.cat((state_features, action_features), dim=1)
            .reshape(-1)
            .numpy()
        )
        float_features_keys = np.arange(1, 9)
        float_features_lengths = np.array([8], dtype=np.int32)

        workspace.FeedBlob("input/float_features.keys", float_features_keys)
        workspace.FeedBlob("input/float_features.values", float_features_values)
        workspace.FeedBlob(
            "input/float_features.lengths", float_features_lengths
        )

        workspace.RunNet(net)

        # Only the linear output is compared below.
        _model_sum, _model_mul, _model_plus_one, model_linear = model(
            input_prototype
        )

        lengths = workspace.FetchBlob(
            "output/string_weighted_multi_categorical_features.lengths"
        )
        keys = workspace.FetchBlob(
            "output/string_weighted_multi_categorical_features.keys"
        )
        values_lengths = workspace.FetchBlob(
            "output/string_weighted_multi_categorical_features.values.lengths"
        )
        values_keys = workspace.FetchBlob(
            "output/string_weighted_multi_categorical_features.values.keys"
        )
        values_values = workspace.FetchBlob(
            "output/string_weighted_multi_categorical_features.values.values"
        )

        N = 1
        npt.assert_array_equal(np.ones(N, dtype=np.int32), lengths)
        npt.assert_array_equal(np.zeros(N, dtype=np.int64), keys)
        npt.assert_array_equal([1] * N, values_lengths)
        # The ``np.object`` alias was removed from NumPy; the builtin
        # ``object`` is the same dtype and works on every NumPy version.
        npt.assert_array_equal(
            np.array([b"TestAction"], dtype=object), values_keys
        )
        npt.assert_array_equal(
            model_linear.detach().numpy().reshape(-1), values_values
        )
def testEqualToCudnn(self):
    """Compare the native LSTM with the cuDNN LSTM after three update steps.

    Both models are built on a CUDA device scope with the same input blob
    and zero initial states. The cuDNN parameters are extracted and copied
    into the native model, each net is run three times with a plain
    WeightedSum parameter update, and the outputs, last hidden states and
    losses are required to be ``np.allclose``.
    """
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA)):
        T = 8
        batch_size = 4
        input_dim = 8
        hidden_dim = 31

        workspace.FeedBlob(
            "seq_lengths", np.array([T] * batch_size, dtype=np.int32)
        )
        workspace.FeedBlob(
            "target",
            np.zeros([T, batch_size, hidden_dim], dtype=np.float32),
        )
        workspace.FeedBlob(
            "hidden_init",
            np.zeros([1, batch_size, hidden_dim], dtype=np.float32),
        )
        workspace.FeedBlob(
            "cell_init",
            np.zeros([1, batch_size, hidden_dim], dtype=np.float32),
        )

        own_model = model_helper.ModelHelper(name="own_lstm")

        input_shape = [T, batch_size, input_dim]
        cudnn_model = model_helper.ModelHelper(name="cudnn_lstm")
        input_blob = cudnn_model.param_init_net.UniformFill(
            [], "input", shape=input_shape
        )
        workspace.FeedBlob(
            "CUDNN/hidden_init_cudnn",
            np.zeros([1, batch_size, hidden_dim], dtype=np.float32),
        )
        workspace.FeedBlob(
            "CUDNN/cell_init_cudnn",
            np.zeros([1, batch_size, hidden_dim], dtype=np.float32),
        )

        # The cell state must come from the "cell_init_cudnn" blob fed
        # above; passing "hidden_init_cudnn" twice left that blob unused.
        cudnn_output, cudnn_last_hidden, _, param_extract = rnn_cell.cudnn_LSTM(
            model=cudnn_model,
            input_blob=input_blob,
            initial_states=("hidden_init_cudnn", "cell_init_cudnn"),
            dim_in=input_dim,
            dim_out=hidden_dim,
            scope="CUDNN",
            return_params=True,
        )
        cudnn_loss = cudnn_model.AveragedLoss(
            cudnn_model.SquaredL2Distance(
                [cudnn_output, "target"], "CUDNN/dist"
            ),
            "CUDNN/loss",
        )

        own_output, own_last_hidden, _, last_state, own_params = rnn_cell.LSTM(
            model=own_model,
            input_blob=input_blob,
            seq_lengths="seq_lengths",
            initial_states=("hidden_init", "cell_init"),
            dim_in=input_dim,
            dim_out=hidden_dim,
            scope="OWN",
            return_params=True,
        )
        own_loss = own_model.AveragedLoss(
            own_model.SquaredL2Distance([own_output, "target"], "OWN/dist"),
            "OWN/loss",
        )

        # Add gradients
        cudnn_model.AddGradientOperators([cudnn_loss])
        own_model.AddGradientOperators([own_loss])

        # Add parameter updates
        LR = cudnn_model.param_init_net.ConstantFill(
            [], shape=[1], value=0.01
        )
        ONE = cudnn_model.param_init_net.ConstantFill(
            [], shape=[1], value=1.0
        )
        for param in cudnn_model.GetParams():
            cudnn_model.WeightedSum(
                [param, ONE, cudnn_model.param_to_grad[param], LR], param
            )
        for param in own_model.GetParams():
            own_model.WeightedSum(
                [param, ONE, own_model.param_to_grad[param], LR], param
            )

        workspace.RunNetOnce(cudnn_model.param_init_net)
        workspace.CreateNet(cudnn_model.net)

        ##
        ## CUDNN LSTM MODEL EXECUTION
        ##
        # Get initial values from CuDNN LSTM so we can feed them
        # to our own.
        (param_extract_net, param_extract_mapping) = param_extract
        workspace.RunNetOnce(param_extract_net)
        cudnn_lstm_params = {}
        for input_type, pars in param_extract_mapping.items():
            cudnn_lstm_params[input_type] = {}
            for k, v in pars.items():
                cudnn_lstm_params[input_type][k] = workspace.FetchBlob(v[0])

        # Run the model 3 times, so that some parameter updates are done
        workspace.RunNet(cudnn_model.net.Proto().name, 3)

        ##
        ## OWN LSTM MODEL EXECUTION
        ##
        # Map the cuDNN parameters to our own
        workspace.RunNetOnce(own_model.param_init_net)
        rnn_cell.InitFromLSTMParams(own_params, cudnn_lstm_params)

        # Run the model 3 times, so that some parameter updates are done
        workspace.CreateNet(own_model.net)
        workspace.RunNet(own_model.net.Proto().name, 3)

        ##
        ## COMPARE RESULTS
        ##
        # Then compare that final results after 3 runs are equal
        own_output_data = workspace.FetchBlob(own_output)
        own_last_hidden = workspace.FetchBlob(own_last_hidden)
        own_loss = workspace.FetchBlob(own_loss)

        cudnn_output_data = workspace.FetchBlob(cudnn_output)
        cudnn_last_hidden = workspace.FetchBlob(cudnn_last_hidden)
        cudnn_loss = workspace.FetchBlob(cudnn_loss)

        self.assertTrue(np.allclose(own_output_data, cudnn_output_data))
        self.assertTrue(np.allclose(own_last_hidden, cudnn_last_hidden))
        self.assertTrue(np.allclose(own_loss, cudnn_loss))
def im_detections(model, im, anchors):
    """Generate RetinaNet detections on a single image.

    Args:
        model: model whose ``net`` is run on the image blobs.
        im: input image; passed to ``_get_image_blob`` and its ``shape`` is
            used to clip the predicted boxes.
        anchors: per-level cell anchors, indexed by FPN level.

    Returns:
        Tuple ``(boxes, scores, classes)`` taken from the columns of the
        score-sorted detection array, truncated to
        ``cfg.TEST.DETECTIONS_PER_IM`` rows. All three are empty when no
        level produced a candidate.
    """
    k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
    A = cfg.RETINANET.SCALES_PER_OCTAVE * len(cfg.RETINANET.ASPECT_RATIOS)
    inputs = {}
    inputs['data'], inputs['im_info'] = _get_image_blob(im)
    cls_probs, box_preds = [], []
    for lvl in range(k_min, k_max + 1):
        suffix = 'fpn{}'.format(lvl)
        cls_probs.append(core.ScopedName('retnet_cls_prob_{}'.format(suffix)))
        box_preds.append(core.ScopedName('retnet_bbox_pred_{}'.format(suffix)))
    for k, v in inputs.items():
        workspace.FeedBlob(core.ScopedName(k), v.astype(np.float32, copy=False))

    workspace.RunNet(model.net.Proto().name)
    scale = inputs['im_info'][0, 2]
    cls_probs = workspace.FetchBlobs(cls_probs)
    box_preds = workspace.FetchBlobs(box_preds)

    # here the boxes_all are [x0, y0, x1, y1, score]
    boxes_all = defaultdict(list)

    cnt = 0
    for lvl in range(k_min, k_max + 1):
        # create cell anchors array
        stride = 2. ** lvl
        cell_anchors = anchors[lvl]

        # fetch per level probability
        cls_prob = cls_probs[cnt]
        box_pred = box_preds[cnt]
        # Split the channel axis into (anchor, class); integer division
        # keeps the class count exact instead of going through a float.
        cls_prob = cls_prob.reshape((
            cls_prob.shape[0], A, cls_prob.shape[1] // A,
            cls_prob.shape[2], cls_prob.shape[3]))
        box_pred = box_pred.reshape((
            box_pred.shape[0], A, 4, box_pred.shape[2], box_pred.shape[3]))
        cnt += 1

        if cfg.RETINANET.SOFTMAX:
            cls_prob = cls_prob[:, :, 1::, :, :]

        cls_prob_ravel = cls_prob.ravel()
        # In some cases [especially for very small img sizes], it's possible
        # that candidate_ind is empty if we impose threshold 0.05 at all
        # levels. This will lead to errors since no detections are found for
        # this image. Hence, for lvl 7 which has small spatial resolution, we
        # take the threshold 0.0
        th = cfg.RETINANET.INFERENCE_TH if lvl < k_max else 0.0
        candidate_inds = np.where(cls_prob_ravel > th)[0]
        if len(candidate_inds) == 0:
            continue

        pre_nms_topn = min(cfg.RETINANET.PRE_NMS_TOP_N, len(candidate_inds))
        inds = np.argpartition(
            cls_prob_ravel[candidate_inds], -pre_nms_topn)[-pre_nms_topn:]
        inds = candidate_inds[inds]

        inds_5d = np.array(np.unravel_index(inds, cls_prob.shape)).transpose()
        classes = inds_5d[:, 2]
        anchor_ids, y, x = inds_5d[:, 1], inds_5d[:, 3], inds_5d[:, 4]
        scores = cls_prob[:, anchor_ids, classes, y, x]

        boxes = np.column_stack((x, y, x, y)).astype(dtype=np.float32)
        boxes *= stride
        boxes += cell_anchors[anchor_ids, :]

        if not cfg.RETINANET.CLASS_SPECIFIC_BBOX:
            box_deltas = box_pred[0, anchor_ids, :, y, x]
        else:
            # NOTE(review): box_pred is 5-D after the reshape above while
            # this branch indexes it with four indices - confirm the
            # class-specific path against a model that enables it.
            box_cls_inds = classes * 4
            box_deltas = np.vstack([
                box_pred[0, ind:ind + 4, yi, xi]
                for ind, yi, xi in zip(box_cls_inds, y, x)
            ])
        pred_boxes = (
            box_utils.bbox_transform(boxes, box_deltas)
            if cfg.TEST.BBOX_REG else boxes)
        pred_boxes /= scale
        pred_boxes = box_utils.clip_tiled_boxes(pred_boxes, im.shape)
        box_scores = np.zeros((pred_boxes.shape[0], 5))
        box_scores[:, 0:4] = pred_boxes
        box_scores[:, 4] = scores

        for cls in range(1, cfg.MODEL.NUM_CLASSES):
            inds = np.where(classes == cls - 1)[0]
            if len(inds) > 0:
                boxes_all[cls].extend(box_scores[inds, :])

    # Combine predictions across all levels and retain the top scoring by class
    detections = []
    for cls, boxes in boxes_all.items():
        cls_dets = np.vstack(boxes).astype(dtype=np.float32)
        # do class specific nms here
        keep = box_utils.nms(cls_dets, cfg.TEST.NMS)
        cls_dets = cls_dets[keep, :]
        out = np.zeros((len(keep), 6))
        out[:, 0:5] = cls_dets
        out[:, 5].fill(cls)
        detections.append(out)

    # np.vstack raises on an empty list; fall back to an empty (0, 6) array
    # so an image without any candidate yields empty results.
    detections = np.vstack(detections) if detections else np.zeros((0, 6))
    # sort all again
    inds = np.argsort(-detections[:, 4])
    detections = detections[inds[0:cfg.TEST.DETECTIONS_PER_IM], :]
    boxes = detections[:, 0:4]
    scores = detections[:, 4]
    classes = detections[:, 5]
    return boxes, scores, classes
def test_slws_fused_8bit_rowwise_acc32_nnpi(self, seed, num_rows,
                                            embedding_dim, batch_size,
                                            max_weight):
    """Compare the onnxified SLWS fused 8-bit rowwise op with its reference.

    Random data is quantized with ``FloatToFused8BitRowwiseQuantized`` and
    run through ``SparseLengthsWeightedSumFused8BitRowwise`` (lowered via
    ``onnxifi_caffe2_net``) and through the ``...FakeFP32NNPI`` reference
    op. Any non-zero relative difference fails the test after dumping the
    inputs and outputs with ``print_test_debug_info``.
    """
    workspace.GlobalInit([
        "caffe2",
        "--glow_global_fp16=0",
        "--glow_global_fused_scale_offset_fp16=0",
        "--glow_global_force_sls_fp16_accum=0",
    ])

    workspace.ResetWorkspace()
    np.random.seed(seed)
    data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
    lengths = np.random.choice(
        np.arange(1, num_rows), batch_size).astype(np.int32)

    indices = []
    for length in lengths:
        indices.extend(np.random.choice(np.arange(1, num_rows), length))
    indices = np.asarray(indices).astype(np.int64)

    weights = np.random.uniform(
        low=0, high=max_weight, size=[len(indices)]).astype(np.float32)

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwise",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    ref_net.external_output.append("Y")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    workspace.FeedBlob("data", data)
    workspace.RunOperatorOnce(
        core.CreateOperator("FloatToFused8BitRowwiseQuantized", ["data"],
                            ["quantized_data"]))

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=batch_size,
        max_seq_size=batch_size * np.max(lengths),
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    # Both nets write the blob "Y", so fetch it right after each run.
    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    # The 1e-8 term keeps the relative error finite where Y_ref is zero.
    diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    max_err = np.max(diff, axis=1)
    num_offenders = (max_err > 0).sum()
    if num_offenders > 0:
        print_test_debug_info(
            "test_slws_fused_8bit_rowwise_acc32_nnpi",
            {
                "seed": seed,
                "num_rows": num_rows,
                "embedding_dim": embedding_dim,
                "batch_size": batch_size,
                "indices": indices,
                "data": data.shape,
                "lengths": lengths,
                "weights": weights,
                "Y_glow": Y_glow,
                "Y_ref": Y_ref,
                "diff": diff,
                "rowwise_diff": max_err,
            },
        )
        # Raise explicitly: a bare ``assert 0`` is removed under ``-O`` and
        # carries no message.
        raise AssertionError(
            "{} output row(s) differ between the onnxified and reference "
            "nets".format(num_offenders))
def train_with_eval(
    self,
    num_epoch=1,
    report_interval=0,
    eval_during_training=False,
):
    '''
    Train the stored 'train_net', optionally collecting reports, then save
    the prediction net.

    Fastest mode: report_interval = 0
    Medium mode: report_interval > 0, eval_during_training=False
    Slowest mode: report_interval > 0, eval_during_training=True

    In the reporting modes the loss and the l1 / scaled-l1 metrics are
    appended to ``self.reports`` after every chunk of training iterations,
    and ``self.save_loss_trend`` is called at the end. Evaluation is only
    run when 'eval_net' is present in ``self.net_store``.
    '''
    def _fetch_scalar(record):
        # ndarray.item() is what np.asscalar delegated to; np.asscalar has
        # been removed from NumPy.
        return schema.FetchRecord(record).get().item()

    num_batch_per_epoch = int(
        self.input_data_store['train'][1] / self.batch_size)
    if not self.input_data_store['train'][1] % self.batch_size == 0:
        # Round up so the trailing partial batch is still covered.
        num_batch_per_epoch += 1
        print('[Warning]: batch_size cannot be divided. ' +
              'Run on {} example instead of {}'.format(
                  num_batch_per_epoch * self.batch_size,
                  self.input_data_store['train'][1]))
    print('<<< Run {} iteration'.format(num_epoch * num_batch_per_epoch))

    train_net = self.net_store['train_net']
    if report_interval > 0:
        print('>>> Training with Reports')
        # NOTE: num_eval is 0 when report_interval > num_epoch, which makes
        # the division below raise ZeroDivisionError.
        num_eval = int(num_epoch / report_interval)
        num_unit_iter = int((num_batch_per_epoch * num_epoch) / num_eval)
        run_eval = eval_during_training and 'eval_net' in self.net_store
        if run_eval:
            print('>>> Training with Eval Reports (Slowest mode)')
            eval_net = self.net_store['eval_net']
        for i in range(num_eval):
            workspace.RunNet(
                train_net.Proto().name,
                num_iter=num_unit_iter,
            )
            self.reports['epoch'].append((i + 1) * report_interval)
            train_loss = _fetch_scalar(self.loss)
            self.reports['train_loss'].append(train_loss)
            # Add metrics
            train_l1_metric = _fetch_scalar(
                self.model.metrics_schema.l1_metric)
            self.reports['train_l1_metric'].append(train_l1_metric)
            train_scaled_l1_metric = _fetch_scalar(
                self.model.metrics_schema.scaled_l1_metric)
            self.reports['train_scaled_l1_metric'].append(
                train_scaled_l1_metric)

            if run_eval:
                workspace.RunNet(
                    eval_net.Proto().name,
                    num_iter=num_unit_iter,
                )
                eval_loss = _fetch_scalar(self.loss)
                # Add metrics
                self.reports['eval_loss'].append(eval_loss)
                eval_l1_metric = _fetch_scalar(
                    self.model.metrics_schema.l1_metric)
                self.reports['eval_l1_metric'].append(eval_l1_metric)
                eval_scaled_l1_metric = _fetch_scalar(
                    self.model.metrics_schema.scaled_l1_metric)
                self.reports['eval_scaled_l1_metric'].append(
                    eval_scaled_l1_metric)
    else:
        print('>>> Training without Reports (Fastest mode)')
        workspace.RunNet(
            train_net,
            num_iter=num_epoch * num_batch_per_epoch,
        )

    print('>>> Saving test model')
    # Save Net
    exporter.save_net(
        self.net_store['pred_net'],
        self.model,
        self.model_name + '_init',
        self.model_name + '_predict',
    )

    # Save Loss Trend
    if report_interval > 0:
        self.save_loss_trend(self.model_name)
def forward(self, niters):
    """Run ``self.net`` through ``workspace.RunNet`` for ``niters`` iterations."""
    # Third positional RunNet argument, passed as False exactly as before.
    allow_fail = False
    workspace.RunNet(self.net, niters, allow_fail)
pad=1) conv3 = brew.relu(model, conv3, conv3) fc3 = brew.fc(model, conv3, 'fc3', dim_in=256 * 28 * 28, dim_out=512) fc3 = brew.relu(model, fc3, fc3) pred = brew.fc(model, fc3, 'pred', 512, 10) softmax = brew.softmax(model, pred, 'softmax') return softmax core.GlobalInit(['caffe2', '--caffe2_log_level=0']) root_folder, data_folder = DownloadMNIST() workspace.ResetWorkspace(root_folder) arg_scope = {"order": "NCHW"} test_model = model_helper.ModelHelper(name="mnist_test", arg_scope=arg_scope, init_params=True) data, label = AddInput(test_model, batch_size=1, db=os.path.join(data_folder, 'mnist-test-nchw-lmdb'), db_type='lmdb') softmax = AddLeNetModel(test_model, data) # run a test pass on the test net workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net, overwrite=True) test_accuracy = np.zeros(10000) for i in tqdm.tqdm(range(10000)): workspace.RunNet(test_model.net.Proto().name)
db_type="lmdb", load_all=1, keep_device=1, absolute_path=0) workspace.RunNetOnce(load.net) workspace.FeedBlob('iter', [iter_val]) save_trained_model(deploy) if load_trained: load_crunk(load_trained, device_opts) loss = np.zeros(train_iters) start = time.time() name = train.net.Proto().name numstraight = 0 i = 0 while i < train_iters: workspace.RunNet(name) if i == 0: realstart = time.time() loss[i] = workspace.FetchBlob('avgloss') if i % 200 == 0: LR = workspace.FetchBlob('LR') stop = time.time() j = i if j == 0: j = 1 st = workspace.FetchBlob('output') steer = st[0, 0] lb = workspace.FetchBlob('label') label = lb[0, 0] outputs = [] for q in st: outputs.append(q[0])
workspace.FeedBlob("data", image) workspace.FeedBlob("label", label) break workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net, overwrite=True) num_correct = 0 total = 0 # Cycle through the test dictionary once, with batch size = 1, meaning we only consider one stack at a time for stack, label in test_dataset.read(batch_size=1): # Run the stack through the predictor and get the result array workspace.FeedBlob("data", stack, device_option=device_opts) workspace.FeedBlob("label", label, device_option=device_opts) workspace.RunNet(test_model.net) results = workspace.FetchBlob('softmax')[0] print results # Get the top-1 prediction max_index, max_value = max(enumerate(results), key=operator.itemgetter(1)) print "Prediction: ", max_index print "Confidence: ", max_value # Update confusion matrix cmat[label, max_index] += 1 if max_index == label: num_correct += 1
def RunValidation(model, i):
    """Run ``model.net`` once and print the statistics for iteration ``i``."""
    workspace.RunNet(model.net)
    # Parenthesised single-argument print behaves the same on Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print('after val:')
    PrintStatistics(i)
from caffe2.python import cnn, workspace, core
from caffe2.proto import caffe2_pb2
import numpy as np
import time

# Stand-alone check of the SmoothL1LossGradient operator on CUDA device 0:
# inputs are loaded from data1.npy / data2.npy in the working directory and
# the resulting "loss" blob is printed.

#device_opts = caffe2_pb2.DeviceOption()
#device_opts.device_type = caffe2_pb2.CUDA
#device_opts.cuda_gpu_id = 0
device_opts = core.DeviceOption(caffe2_pb2.CUDA, 0)

net = core.Net("smoothL1Loss_test")
net.SmoothL1LossGradient(["data1", "data2", "avg_loss"], "loss",
                         device_option=device_opts)
# print() call form: valid on Python 2 and 3 and consistent with the final
# print below.
print(net.Proto())

data1 = np.load('data1.npy')
data2 = np.load('data2.npy')
avg_loss = np.ones(1, dtype=np.float32)

workspace.FeedBlob("data1", data1, device_option=device_opts)
workspace.FeedBlob("data2", data2, device_option=device_opts)
workspace.FeedBlob("avg_loss", avg_loss, device_option=device_opts)

# The net is created from its proto and then run once by name.
workspace.CreateNet(net.Proto())
workspace.RunNet("smoothL1Loss_test", 1)

caffe2_out = workspace.FetchBlob('loss')
print(caffe2_out)
def run_conv_or_fc(test_case, init_net, net, X, W, b, op_type, engine, order,
                   gc, outputs, scale=None, zero_point=None):
    """Run a Conv/FC net and append its ``Y`` result(s) to ``outputs``.

    The net is first run through ``test_case.ws`` (once when ``engine`` is
    the empty string, otherwise twice). For a non-empty ``engine`` it is
    then also created in the global workspace and run twice, and a second
    result is appended.

    Each appended item is an ``Output`` namedtuple with fields ``Y``,
    ``op_type``, ``engine`` and, when ``order`` is truthy (Conv),
    ``order``.
    """
    field_names = ["Y", "op_type", "engine"]
    if order:
        # Conv results also carry the storage order; FC results do not.
        field_names.append("order")
    Output = collections.namedtuple("Output", field_names)

    def record(result):
        entry = {"Y": result, "op_type": op_type, "engine": engine}
        if order:
            entry["order"] = order
        outputs.append(Output(**entry))

    # Blobs in feeding order; quantization parameters are only fed when both
    # are supplied.
    feeds = [("X", X), ("W", W), ("b", b)]
    if scale is not None and zero_point is not None:
        feeds.extend([("scale", scale), ("zero_point", zero_point)])

    # We run DNNLOWP ops multiple times to test their first runs that
    # do caching so exercises different code paths from the subsequent
    # runs

    # self.ws.run re-creates operator every time so this test covers
    # cases when we have multiple nets sharing the same workspace
    for blob_name, blob_value in feeds:
        test_case.ws.create_blob(blob_name).feed(blob_value, device_option=gc)
    if init_net:
        test_case.ws.run(init_net)
    repeat = 1 if engine == "" else 2
    for _ in range(repeat):
        test_case.ws.run(net)
    record(test_case.ws.blobs["Y"].fetch())

    # workspace.CreateNet + workspace.RunNet reuses the same operator
    if engine != "":
        for blob_name, blob_value in feeds:
            workspace.FeedBlob(blob_name, blob_value)
        if init_net:
            workspace.RunNetOnce(init_net)
        workspace.CreateNet(net)
        for _ in range(2):
            workspace.RunNet(net)
        record(workspace.FetchBlob("Y"))
# Weighted sum train_net.WeightedSum([W, ONE, gradient_map[W], LR], W) train_net.WeightedSum([B, ONE, gradient_map[B], LR], B) # Let's show the graph again. graph = net_drawer.GetPydotGraph(train_net.Proto().op, "train", rankdir="LR") graph.write_svg('Sixth.svg') workspace.RunNetOnce(init_net) workspace.CreateNet(train_net) # ------------------------------------------------------------------------------------ print("Before training, W is: {}".format(workspace.FetchBlob("W"))) print("Before training, B is: {}".format(workspace.FetchBlob("B"))) for i in range(100): workspace.RunNet(train_net.Proto().name) print("After training, W is: {}".format(workspace.FetchBlob("W"))) print("After training, B is: {}".format(workspace.FetchBlob("B"))) print("Ground truth W is: {}".format(workspace.FetchBlob("W_gt"))) print("Ground truth B is: {}".format(workspace.FetchBlob("B_gt"))) # ------------------------------------------------------------------------------------ workspace.RunNetOnce(init_net) w_history = [] b_history = [] for i in range(50): workspace.RunNet(train_net.Proto().name) w_history.append(workspace.FetchBlob("W")) b_history.append(workspace.FetchBlob("B"))
def test_get_predictor_export_meta_and_workspace_with_feature_extractor(
        self):
    """Export the model with a feature extractor and compare all outputs.

    The exported predictor is saved to a temporary minidb file, reloaded,
    run on the model's input prototype, and its ``sum``, ``mul``,
    ``plus_one`` and ``linear`` blobs are compared with the outputs of the
    in-process model.
    """
    model = Model()

    # Feature ids 1-4 are state features, 5-8 are action features.
    state_normalization_parameters = {
        i: NormalizationParameters(feature_type=CONTINUOUS)
        for i in range(1, 5)
    }
    action_normalization_parameters = {
        i: NormalizationParameters(feature_type=CONTINUOUS)
        for i in range(5, 9)
    }
    extractor = PredictorFeatureExtractor(
        state_normalization_parameters=state_normalization_parameters,
        action_normalization_parameters=action_normalization_parameters,
        normalize=False,
    )

    pem, ws = model.get_predictor_export_meta_and_workspace(
        feature_extractor=extractor
    )
    # model has 2 params + 1 const. extractor has 1 const.
    self.assertEqual(4, len(pem.parameters))
    for p in pem.parameters:
        self.assertTrue(ws.HasBlob(p))
    self.assertEqual(3, len(pem.inputs))
    self.assertEqual(4, len(pem.outputs))

    input_prototype = model.input_prototype()

    with tempfile.TemporaryDirectory() as tmpdirname:
        db_path = os.path.join(tmpdirname, "model")
        # The path has to go through a %s placeholder; passing it as an
        # extra argument without one makes the logging call fail to format.
        logger.info("DB path: %s", db_path)
        db_type = "minidb"
        with ws._ctx:
            save_to_db(db_type, db_path, pem)

        # Load the model from DB file and run it
        net = prepare_prediction_net(db_path, db_type)

        # Flatten state + action features into the sparse
        # (keys, values, lengths) layout fed to the "input/float_features.*"
        # blobs.
        state_features = input_prototype.state.float_features
        action_features = input_prototype.action.float_features
        float_features_values = (
            torch.cat((state_features, action_features), dim=1)
            .reshape(-1)
            .numpy()
        )
        float_features_keys = np.arange(1, 9)
        float_features_lengths = np.array([8], dtype=np.int32)

        workspace.FeedBlob("input/float_features.keys", float_features_keys)
        workspace.FeedBlob("input/float_features.values", float_features_values)
        workspace.FeedBlob(
            "input/float_features.lengths", float_features_lengths
        )

        workspace.RunNet(net)

        net_sum = workspace.FetchBlob("sum")
        net_mul = workspace.FetchBlob("mul")
        net_plus_one = workspace.FetchBlob("plus_one")
        net_linear = workspace.FetchBlob("linear")

        model_sum, model_mul, model_plus_one, model_linear = model(
            input_prototype
        )

        npt.assert_array_equal(model_sum.numpy(), net_sum)
        npt.assert_array_equal(model_mul.numpy(), net_mul)
        npt.assert_array_equal(model_plus_one.numpy(), net_plus_one)
        # The linear output is only required to match within a tolerance.
        npt.assert_allclose(
            model_linear.detach().numpy(), net_linear, rtol=1e-4
        )
def main():
    """Build the scalogram image dataset, train a CNN on it and export it.

    Steps, in order:
      1. Convert raw sub-signals of each fault type into scalogram images
         (at most 50 per class) saved under the data directory.
      2. Create image-to-class mapping files and LMDB databases.
      3. Define the training, validation and deploy models.
      4. Train for ``n_iters`` iterations, validating every
         ``validation_interval`` iterations.
      5. Plot loss / validation accuracy to ``loss_and_accuracy.png`` and
         write the exported init and predict nets into a timestamped
         checkpoint directory.
    """
    # NOTE(review): machine-specific absolute path - every other path below
    # is derived from it.
    root_path = '/home/osboxes/zementis/scalogram/fault_diagnosis'
    data_path = os.path.join(root_path, 'data')
    labels_path = os.path.join(data_path, 'labels.txt')
    labels_to_classes_map = get_labels_to_classes_map(labels_path)
    # Fault type name -> directory holding its raw signals.
    fault_types_path = {
        'baseLine': os.path.join(data_path, 'raw_signals', 'baseLine'),
        'rollingDefect': os.path.join(data_path, 'raw_signals',
                                      'rollingDefect'),
        'innerRace': os.path.join(data_path, 'raw_signals', 'innerRace'),
        'outerRace': os.path.join(data_path, 'raw_signals', 'outerRace')
    }
    sub_signal_len = 400
    # The sample rate is 12 kHz and the approximate motor speed is 1797 RPM.
    # Therefore, there are approximately 401 sample points per revolution
    # (12000 / (1797 / 60)).

    for fault_type, fault_dir_path in fault_types_path.items():
        sub_signals = get_sub_signals(fault_dir_path, sub_signal_len)
        sub_signal_idx = 0
        for sub_signal in sub_signals:
            sub_signal_idx += 1
            scalo = get_scalogram(sub_signal)
            scaled_scalo = get_scaled_data(
                scalo)  # scales an array to have values between 0.0 and 1.0
            img_obj = PIL.Image.fromarray(scaled_scalo)
            # File name pattern: "<1-based index>_<fault type>.tiff".
            img_f_name = str(sub_signal_idx) + '_' + fault_type + '.tiff'
            img_obj.save(os.path.join(data_path, img_f_name))
            if sub_signal_idx == 50:
                break  # stop after creating 50 images in each class

    # Create txt files mapping image names to classes
    img_to_class_paths = create_img_to_class_files(data_path,
                                                   labels_to_classes_map)
    # Create lmdb files
    lmdb_paths = write_lmdb_files(data_path, img_to_class_paths)

    model_files_path = os.path.join(root_path, 'model_files')
    if not os.path.isdir(model_files_path):
        os.makedirs(model_files_path)
    workspace.ResetWorkspace(model_files_path)

    # One checkpoint directory per run, named after the start time.
    unique_timestamp = str(
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    checkpoint_dir = os.path.join(model_files_path, unique_timestamp)
    os.makedirs(checkpoint_dir)
    print("Checkpoint output location: ", checkpoint_dir)

    # Dataset specific params
    image_width = sub_signal_len
    image_height = sub_signal_len
    image_channels = 1
    num_classes = 4
    init_net_out_fname = 'init_net.pb'
    predict_net_out_fname = 'predict_net.pb'

    # Training params
    n_iters = 600  # total training iterations
    batch_size = 10  # batch size for training
    n_val_images = 30  # total number of validation images
    validation_interval = 50  # validate every <validation_interval> training iterations
    n_checkpoint_iters = 200  # output checkpoint db every <checkpoint_iters> iterations

    # TRAINING MODEL
    train_model = model_helper.ModelHelper(name="train_net")
    data, label = add_input(train_model,
                            batch_size=batch_size,
                            db=lmdb_paths['train'],
                            db_type='lmdb')
    softmax = add_cnn_model_1(train_model, data, num_classes, image_height,
                              image_width, image_channels)
    add_optmzer_lossfunc(train_model, softmax, label)
    add_check_points(train_model,
                     unique_timestamp,
                     n_checkpoint_iters,
                     db_type="lmdb")

    # VALIDATION MODEL
    # Initialize with ModelHelper class without re-initializing params
    val_model = model_helper.ModelHelper(name="val_net", init_params=False)
    # The whole validation set is read as a single batch.
    data, label = add_input(val_model,
                            batch_size=n_val_images,
                            db=lmdb_paths['val'],
                            db_type='lmdb')
    softmax = add_cnn_model_1(val_model, data, num_classes, image_height,
                              image_width, image_channels)
    add_accuracy(val_model, softmax, label)

    # DEPLOY MODEL
    # Initialize with ModelHelper class without re-initializing params
    deploy_model = model_helper.ModelHelper(name="deploy_net",
                                            init_params=False)
    # Add model definition, expect input blob called "data"
    add_cnn_model_1(deploy_model, "data", num_classes, image_height,
                    image_width, image_channels)
    print("Training, Validation, and Deploy models all defined!")

    # Initialize and create the training network
    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net, overwrite=True)
    # Initialize and create validation network
    workspace.RunNetOnce(val_model.param_init_net)
    workspace.CreateNet(val_model.net, overwrite=True)

    # Placeholder to track loss and validation accuracy
    # One slot per validation point (iterations 0, validation_interval, ...).
    training_loss = np.zeros(int(math.ceil(n_iters / validation_interval)))
    val_accuracy = np.zeros(int(math.ceil(n_iters / validation_interval)))
    val_count = 0
    val_iter_list = np.zeros(int(math.ceil(n_iters / validation_interval)))

    # run the network (forward & backward pass)
    for i in range(n_iters):
        workspace.RunNet(train_model.net)
        # Validate every <validation_interval> training iterations
        if (i % validation_interval) == 0:
            print("Training iter: ", i)
            # The training loss is read before the validation net runs.
            training_loss[val_count] = workspace.FetchBlob('loss')
            workspace.RunNet(val_model.net)
            val_accuracy[val_count] = workspace.FetchBlob('accuracy')
            print("Loss: ", str(training_loss[val_count]))
            print("Validation accuracy: ", str(val_accuracy[val_count]) + "\n")
            val_iter_list[val_count] = i
            val_count += 1

    # Plot both curves against the iteration at which they were sampled.
    fig = pyplot.figure()
    fig.add_subplot(111)
    pyplot.title("Training Loss and Validation Accuracy")
    pyplot.plot(val_iter_list, training_loss, 'b')
    pyplot.plot(val_iter_list, val_accuracy, 'r')
    pyplot.xlabel("Training iteration")
    pyplot.legend(('Training Loss', 'Validation Accuracy'), loc='upper right')
    pyplot.savefig("loss_and_accuracy.png")
    pyplot.close()

    # Save trained model
    workspace.RunNetOnce(deploy_model.param_init_net)
    workspace.CreateNet(deploy_model.net, overwrite=True)
    init_net, predict_net = mobile_exporter.Export(workspace, deploy_model.net,
                                                   deploy_model.params)
    init_net_out_path = os.path.join(checkpoint_dir, init_net_out_fname)
    predict_net_out_path = os.path.join(checkpoint_dir, predict_net_out_fname)
    # Both nets are written as serialized protobuf bytes.
    with open(init_net_out_path, 'wb') as f:
        f.write(init_net.SerializeToString())
    with open(predict_net_out_path, 'wb') as f:
        f.write(predict_net.SerializeToString())
    print("Model saved as " + init_net_out_path + " and " +
          predict_net_out_path)
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer.
    TODO: add checkpointing here.

    Runs the training net ``epoch_iters`` times, logging throughput, loss
    and accuracy of the first device after every iteration. When
    ``test_model`` is given, the test net is run ``test_epoch_iters`` times
    and top-1 / top-5 accuracy are averaged over all devices and
    iterations; otherwise both are reported as -1. The epoch summary is
    written to ``explog``.

    Returns:
        ``epoch + 1``.

    Raises:
        AssertionError: if the final training loss is not below 40.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    test_epoch_iters = int(
        args.test_epoch_size / total_batch_size / num_shards)
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
        prefix = "{}_{}".format(train_model._device_prefix,
                                train_model._devices[0])
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    prefix = "{}_{}".format(train_model._device_prefix,
                            train_model._devices[0])
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0])

    # -1 marks "no test accuracy available".
    test_accuracy = -1
    test_accuracy_top5 = -1
    if test_model is not None:
        # Run test_epoch_iters iterations of testing
        ntests = 0
        accuracy_sum = 0
        accuracy_top5_sum = 0
        for _ in range(test_epoch_iters):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                device_prefix = "{}_{}".format(test_model._device_prefix, g)
                # ndarray.item() replaces np.asscalar, which has been
                # removed from NumPy.
                accuracy_sum += workspace.FetchBlob(
                    device_prefix + '/accuracy').item()
                accuracy_top5_sum += workspace.FetchBlob(
                    device_prefix + '/accuracy_top5').item()
                ntests += 1
        # Without any test iteration there is nothing to average; keep the
        # -1 sentinel instead of dividing by zero.
        if ntests > 0:
            test_accuracy = accuracy_sum / ntests
            test_accuracy_top5 = accuracy_top5_sum / ntests

    explog.log(input_count=num_images,
               batch_count=(i + epoch * epoch_iters),
               additional_values={
                   'accuracy': accuracy,
                   'loss': loss,
                   'learning_rate': learning_rate,
                   'epoch': epoch,
                   'top1_test_accuracy': test_accuracy,
                   'top5_test_accuracy': test_accuracy_top5,
               })
    # Explicit raise so the check also runs under ``python -O``; the
    # ``not loss < 40`` form still fails on NaN like the assert did.
    if not loss < 40:
        raise AssertionError("Exploded gradients :(")

    # TODO: add checkpointing
    return epoch + 1
######################################################################## # Run training procedure ######################################################################## # The parameter initialization network only needs to be run once. workspace.RunNetOnce(train_model.param_init_net) # creating the network workspace.CreateNet(train_model.net, overwrite=True) # initialize and create validation network workspace.RunNetOnce(val_model.param_init_net) workspace.CreateNet(val_model.net, overwrite=True) # variables to track the accuracy & loss accuracy = np.zeros(training_iters) loss = np.zeros(training_iters) # Now, we will manually run the network for 200 iterations. for i in range(training_iters): workspace.RunNet(train_model.net) accuracy[i] = workspace.FetchBlob('accuracy') loss[i] = workspace.FetchBlob('loss') if (i % validation_interval == 0): print("Training iter: ", i) #run validation workspace.RunNet(val_model.net.Proto().name) val_accuracy = workspace.FetchBlob('accuracy') print("Validation accuracy: ", str(val_accuracy)) # After the execution is done, let's plot the values. pyplot.plot(loss, 'b') pyplot.plot(accuracy, 'r') pyplot.legend(('Loss', 'Accuracy'), loc='upper right') pyplot.show()
def InferTensorRunAndCompare(self, model):
    '''
    Check inferred blob shapes and types against an actual run of ``model``.

    Shape/type inference is performed on the param-init net and the main
    net; both nets are then executed and, for every blob in the workspace,
    the real shape and type are compared with the inferred ones.
    '''
    (shapes, types) = workspace.InferShapesAndTypes(
        [model.param_init_net, model.net],
    )

    # Execute the model once so that the blobs hold real data.
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net, True)
    workspace.RunNet(model.Proto().name)

    # numpy dtype -> TensorProto data type. Entries are probed in order
    # with ``==``; BYTE and STRING have no entry.
    tensor_proto = caffe2_pb2.TensorProto
    known_dtypes = (
        (np.dtype('float32'), tensor_proto.FLOAT),
        (np.dtype('int32'), tensor_proto.INT32),
        (np.dtype('bool'), tensor_proto.BOOL),
        (np.dtype('uint8'), tensor_proto.UINT8),
        (np.dtype('int8'), tensor_proto.INT8),
        (np.dtype('uint16'), tensor_proto.UINT16),
        (np.dtype('int16'), tensor_proto.INT16),
        (np.dtype('int64'), tensor_proto.INT64),
        (np.dtype('float16'), tensor_proto.FLOAT16),
        (np.dtype('float64'), tensor_proto.DOUBLE),
    )

    # Collect the shapes and types observed after the run.
    correct_shapes = {}
    correct_types = {}
    for blob_name in workspace.Blobs():
        fetched = workspace.FetchBlob(blob_name)
        correct_shapes[blob_name] = fetched.shape
        if type(fetched) is not np.ndarray:
            correct_types[blob_name] = str(type(fetched))
            continue
        correct_types[blob_name] = next(
            (proto for dtype, proto in known_dtypes
             if fetched.dtype == dtype),
            "unknown {}".format(fetched.dtype),
        )

    # Compare the observations with the inference results, blob by blob.
    for blob_name in correct_shapes:
        inferred_shape = shapes[blob_name]
        actual_shape = correct_shapes[blob_name]
        shapes_agree = np.array_equal(
            np.array(inferred_shape).astype(np.int32),
            np.array(actual_shape).astype(np.int32),
        )
        self.assertTrue(
            shapes_agree,
            "Shape {} mismatch: {} vs. {}".format(
                blob_name, inferred_shape, actual_shape
            ),
        )

        type_missing = blob_name not in types and blob_name in correct_types
        self.assertFalse(
            type_missing,
            "Type for {} not defined".format(blob_name),
        )

        inferred_type = types[blob_name]
        actual_type = correct_types[blob_name]
        self.assertEqual(
            inferred_type,
            actual_type,
            "Type {} mismatch: {} vs. {}".format(
                blob_name, inferred_type, actual_type,
            ),
        )
# Add the backward pass for `loss` and attach an SGD optimizer (base learning
# rate 0.1) to every parameter reported by the model.
my_model.AddGradientOperators([loss])
opt = optimizer.build_sgd(my_model, base_learning_rate=0.1)
for param in my_model.GetOptimizationParamInfo():
    opt(my_model.net, my_model.param_init_net, param)

##################################################################################
# Run the training

workspace.RunNetOnce(my_model.param_init_net)
workspace.CreateNet(my_model.net, overwrite=True)

total_iters = train_iters
# Per-iteration history. Note that `loss` is rebound here from the blob that
# was passed to AddGradientOperators above to a numpy array of loss values.
accuracy = np.zeros(total_iters)
loss = np.zeros(total_iters)

for i in range(total_iters):
    workspace.RunNet(my_model.net)
    accuracy[i] = workspace.FetchBlob('accuracy')
    loss[i] = workspace.FetchBlob('loss')
    # Single-argument print calls behave the same on Python 2 and Python 3
    # and emit exactly what the former print statements did (label, the
    # separator space, then the value).
    print("accuracy:  " + str(accuracy[i]))
    print("loss:  " + str(loss[i]))

plt.plot(loss, 'b', label="loss")
plt.plot(accuracy, 'r', label="accuracy")
plt.legend(loc="upper right")
plt.show()

# NOTE(review): this exit() ends the script here, so the deploy-model code
# below never runs. Confirm whether that is intentional.
exit()

##################################################################################
# Save the newly finetuned model

deploy_model = model_helper.ModelHelper(
    "finetuned_squeezenet_ucf11_deploy",
    arg_scope=arg_scope,
    init_params=False)
def run_model():
    """Run ``model.net`` for ``ITERATIONS`` iterations (a single one for MLP)."""
    # avoid numeric instability with MLP gradients
    iterations = 1 if model_name == "MLP" else ITERATIONS
    workspace.RunNet(model.net, iterations)
def test_small_sls_acc32(self, seed):
    """Compare a small 8-bit rowwise weighted SLS net against its reference.

    A 2x3 random table is quantized with FloatToFused8BitRowwiseQuantized,
    then ``SparseLengthsWeightedSumFused8BitRowwise`` (lowered through
    ``onnxifi_caffe2_net``) and
    ``SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI`` are run on the
    same inputs. Any non-zero relative difference dumps debug information
    and fails the test.

    Args:
        seed: seed for numpy's global random generator.
    """
    workspace.GlobalInit([
        "caffe2",
        "--glow_global_fp16=0",
        "--glow_global_fused_scale_offset_fp16=0",
        "--glow_global_force_sls_fp16_accum=0",
    ])
    np.random.seed(seed)
    workspace.ResetWorkspace()

    n = 2
    DIM = 3
    data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)
    lengths = np.array([n], dtype=np.int32)
    indices = np.array(range(n), dtype=np.int64)
    weights = np.random.uniform(low=0.01, high=0.5,
                                size=[n]).astype(np.float32)

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwise",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    ref_net.external_output.append("Y")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    workspace.FeedBlob("data", data)
    workspace.RunOperatorOnce(
        core.CreateOperator("FloatToFused8BitRowwiseQuantized", ["data"],
                            ["quantized_data"]))
    quantized_data = workspace.FetchBlob("quantized_data")

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=1,
        max_seq_size=n,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    # Both nets write to the same "Y" blob, so fetch after each run.
    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    # Relative error; the epsilon avoids dividing by an exact zero.
    diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    max_err = np.max(diff, axis=1)
    num_offenders = (max_err > 0).sum()
    if num_offenders > 0:
        np.set_printoptions(precision=12)
        print(
            "ref",
            Y_ref.astype(np.float16).astype(np.float32),
            "glow",
            Y_glow.astype(np.float16).astype(np.float32),
        )
        print_test_debug_info(
            "test_small_sls_acc32",
            {
                "seed": seed,
                # Report the sizes actually used by this test; the names
                # num_rows / embedding_dim / batch_size are not defined in
                # this function.
                "num_rows": n,
                "embedding_dim": DIM,
                "batch_size": len(lengths),
                "indices": indices,
                "data": data,
                "quantized_data": quantized_data,
                "lengths": lengths,
                "weights": weights,
                "Y_glow": Y_glow,
                "Y_ref": Y_ref,
                "diff": diff,
                "rowwise_diff": max_err,
            },
        )
        assert 0
core.DeviceOption(train_model._device_type, g)): workspace.FeedBlob( "{}_{}/data".format(train_model._device_prefix, g), data_device) workspace.FeedBlob( "{}_{}/label".format(train_model._device_prefix, g), labels_device) if i == 0 and e == 0: workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net, overwrite=True) workspace.RunNetOnce(deploy_model.param_init_net) workspace.CreateNet(deploy_model.net, overwrite=True) workspace.RunNet(train_model.net.Proto().name) loss_sum += workspace.FetchBlob("gpu_0/loss") correct += workspace.FetchBlob("gpu_0/accuracy") time_ep = time.time() - time_ep lr = workspace.FetchBlob( data_parallel_model.GetLearningRateBlobNames(train_model)[0]) values = [ e + 1, lr, loss_sum / batch_num, correct / batch_num, test_res['loss'], test_res['accuracy'], time_ep,
def run_training_net(self):
    """Run the training net once under a 2000-second timeout guard."""
    with timeout_guard.CompleteInTimeOrDie(2000.0):
        train_net = self.train_model.net
        workspace.RunNet(train_net.Proto().name)
def test_slws_fused_4bit_rowwise(self, seed, num_rows, embedding_dim,
                                 batch_size, max_weight):
    """Compare 4-bit rowwise weighted SLS between the onnxified and ref nets.

    Random data, lengths, indices and weights are generated from ``seed``;
    the data is quantized with FloatToFused4BitRowwiseQuantized and both
    nets are run on it. If the outputs are not ``np.allclose``, debug
    information is dumped and the test fails.
    """
    workspace.ResetWorkspace()
    np.random.seed(seed)

    # The order of the np.random calls below determines the generated inputs.
    data = np.random.rand(num_rows, embedding_dim).astype(np.float32) * 1e-3
    row_pool = np.arange(1, num_rows)
    lengths = np.random.choice(row_pool, batch_size).astype(np.int32)
    indices = np.asarray([
        row
        for length in lengths
        for row in np.random.choice(row_pool, length)
    ]).astype(np.int64)
    # Uniform weights in [0, max_weight), shifted by -max_weight / 2.
    weights = np.random.uniform(
        low=0, high=max_weight, size=[len(indices)]
    ).astype(np.float32) - max_weight / 2.0

    net_inputs = ["quantized_data", "weights", "indices", "lengths"]

    def make_net(net_name, op_type):
        # Single-operator net reading `net_inputs` and producing "Y".
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(list(net_inputs))
        net.external_output.append("Y")
        net.op.add().CopyFrom(
            core.CreateOperator(op_type, list(net_inputs), ["Y"]))
        return net

    pred_net = make_net("pred", "SparseLengthsWeightedSumFused4BitRowwise")
    ref_net = make_net(
        "ref", "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI")

    workspace.FeedBlob("data", data)
    quantize_op = core.CreateOperator(
        "FloatToFused4BitRowwiseQuantized", ["data"], ["quantized_data"])
    workspace.RunOperatorOnce(quantize_op)

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=batch_size,
        max_seq_size=np.max(lengths),
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    # Exactly one Onnxifi op is expected in the lowered net.
    num_onnxified_ops = sum(
        op.type == "Onnxifi" for op in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    for blob_name, blob_value in (("indices", indices),
                                  ("lengths", lengths),
                                  ("weights", weights)):
        workspace.FeedBlob(blob_name, blob_value)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(ref_net)

    # Both nets write to the same "Y" blob, so fetch after each run.
    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob('Y')

    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob('Y')

    if np.allclose(Y_c2, Y_glow):
        return

    delta = Y_glow - Y_c2
    print_test_debug_info(
        "slws_fused_4bit_rowwise",
        {
            "seed": seed,
            "indices": indices,
            "data": data.shape,
            "lengths": lengths,
            "weights": weights,
            "Y_c2": Y_c2.shape,
            "Y_glow": Y_glow.shape,
            "diff": delta,
            "rowwise_diff": delta[:, 0]
        })
    assert 0
total_iterations = 501
Snapshot_interval = 10
total_iterations = total_iterations * 64

# Parenthesized so the statement is valid on both Python 2 and Python 3.
print(workspace.Blobs())

# Per-step history of the fetched metrics.
accuracy = []
val_accuracy = []
loss = []
lr = []
start = 0
# `start` is a row offset into `train`; it advances one batch per pass.
while start < total_iterations:
    # Column 0 holds the labels for the batch ...
    l = train[start:start + Batch_Size, 0].astype(np.int32)
    # ... and the remaining columns are the pixel values of each sample,
    # reshaped to 28x28.
    d = train[start:start + Batch_Size, 1:].reshape(l.shape[0], 28, 28)
    # Insert a new axis after the batch dimension and convert to float32.
    d = d[:, np.newaxis, ...].astype(np.float32)
    # Scaling the pixel values for faster computation
    d = d * float(1. / 256)

    workspace.FeedBlob("data", d, device_option)
    workspace.FeedBlob("label", l, device_option)
    workspace.RunNet(training_model.net, num_iter=1)

    accuracy.append(workspace.FetchBlob('accuracy'))
    loss.append(workspace.FetchBlob('loss'))
    lr.append(workspace.FetchBlob('SgdOptimizer_0_lr_gpu0'))
    # lr.append(workspace.FetchBlob('conv1_b_lr'))

    # Snapshot and validate whenever the offset is a multiple of the interval.
    if start % Snapshot_interval == 0:
        save_snapshot(training_model, start)
        val_accuracy.append(check_val())
    start += Batch_Size
def test_slws_fused_4bit_rowwise_all_same(self, seed):
    """Run 4-bit rowwise weighted SLS where every lookup hits the same row.

    The table has a single row, every index is 0 and every weight is 1; only
    the number of segments and their lengths are random. The onnxified net
    and the reference net must produce ``np.allclose`` outputs, otherwise
    debug information is dumped and the test fails.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()

    n = 1
    m = 2
    data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1

    max_segments = 5
    max_segment_length = 100
    # number of segments to run
    num_lengths = np.random.randint(1, max_segments + 1)
    lengths = np.random.randint(
        0, max_segment_length + 1, size=num_lengths).astype(np.int32)
    indices = np.zeros(np.sum(lengths), dtype=np.int64)

    # This random draw is immediately superseded by the all-ones weights
    # below; it is kept so the global RNG is advanced exactly as before.
    np.random.uniform(low=-0.5, high=0.5, size=[len(indices)])
    weights = np.ones(len(indices)).astype(np.float32)

    net_inputs = ["quantized_data", "weights", "indices", "lengths"]

    def make_net(net_name, op_type):
        # Single-operator net reading `net_inputs` and producing "Y".
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(list(net_inputs))
        net.external_output.append("Y")
        net.op.add().CopyFrom(
            core.CreateOperator(op_type, list(net_inputs), ["Y"]))
        return net

    pred_net = make_net("pred", "SparseLengthsWeightedSumFused4BitRowwise")
    ref_net = make_net(
        "ref", "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI")

    workspace.FeedBlob("data", data)
    quantize_op = core.CreateOperator(
        "FloatToFused4BitRowwiseQuantized", ['data'], ['quantized_data'])
    workspace.RunOperatorOnce(quantize_op)
    print("quantized", workspace.FetchBlob("quantized_data"))

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=max_segments,
        max_seq_size=max_segment_length,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    # Exactly one Onnxifi op is expected in the lowered net.
    num_onnxified_ops = sum(
        op.type == "Onnxifi" for op in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    for blob_name, blob_value in (("indices", indices),
                                  ("lengths", lengths),
                                  ("weights", weights)):
        workspace.FeedBlob(blob_name, blob_value)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(ref_net)

    # Both nets write to the same "Y" blob, so fetch after each run.
    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob('Y')

    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob('Y')

    if np.allclose(Y_c2, Y_glow):
        return

    delta = Y_glow - Y_c2
    print_test_debug_info(
        "slws_fused_4bit_rowwise",
        {
            "seed": seed,
            "indices": indices,
            "data": data,
            "lengths": lengths,
            "weights": weights,
            "Y_c2": Y_c2,
            "Y_glow": Y_glow,
            "diff": delta,
            "rowwise_diff": delta[:, 0]
        })
    assert 0
def RunNet(model, num_iterations):
    """Run every entry of ``model._data_parallel_model_nets``.

    A tuple entry is read as ``(net, iterations)``: that net is run by its
    proto name for its own iteration count. Any other entry is passed to
    ``workspace.RunNet`` as-is together with ``num_iterations``.
    """
    for entry in model._data_parallel_model_nets:
        if not isinstance(entry, tuple):
            workspace.RunNet(entry, num_iterations)
            continue
        net_name = entry[0].Proto().name
        workspace.RunNet(net_name, entry[1])
def Caffe2LSTM(args):
    """Build the model from ``args``, benchmark it and return the elapsed time.

    The net is run once as a warm-up, then in chunks of at most
    ``args.iters_to_report`` iterations, logging the throughput of each
    chunk. GPU memory statistics are logged when ``args.gpu`` is set.

    Returns:
        Seconds elapsed since the benchmark (excluding warm-up) started.
    """
    T = args.data_size // args.batch_size
    num_iters = T // args.seq_length

    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    queue, label_queue, entry_counts = generate_data(
        T // args.seq_length,
        input_blob_shape,
        args.hidden_dim,
        args.fixed_shape,
    )

    seq_lengths = np.array(
        [args.seq_length] * args.batch_size, dtype=np.int32)
    workspace.FeedBlob("seq_lengths", seq_lengths)

    model, output = create_model(args, queue, label_queue, input_blob_shape)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    start_time = time.time()
    total_iters = 0

    def log_gpu_memory():
        # Log the peak GPU memory in MB and hand the raw stats back.
        log.info("Memory stats:")
        gpu_stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(
            gpu_stats['max_total'] / 1024 / 1024))
        return gpu_stats

    # Run the Benchmark
    log.info("------ Warming up ------")
    workspace.RunNet(model.net.Proto().name)

    if args.gpu:
        log_gpu_memory()

    log.info("------ Starting benchmark ------")
    # The clock is restarted here, so the warm-up run is not timed.
    start_time = time.time()
    last_time = time.time()
    for iteration in range(1, num_iters, args.iters_to_report):
        # The final chunk may be shorter than iters_to_report.
        iters_once = min(args.iters_to_report, num_iters - iteration)
        total_iters += iters_once
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        chunk_entries = np.sum(
            entry_counts[iteration:iteration + iters_once])
        log.info("Iter: {} / {}. Entries Per Second: {}k.".format(
            iteration,
            num_iters,
            chunk_entries / (new_time - last_time) // 100 / 10,
        ))
        last_time = new_time

    executor_note = " (with RNN executor)" if args.rnn_executor else ""
    log.info("Done. Total EPS excluding 1st iteration: {}k {}".format(
        np.sum(entry_counts[1:]) / (time.time() - start_time) // 100 / 10,
        executor_note,
    ))

    if args.gpu:
        stats = log_gpu_memory()
        if stats['max_total'] != stats['total']:
            log.warning(
                "Max usage differs from current total usage: {} > {}".format(
                    stats['max_total'], stats['total']))
            log.warning("This means that costly deallocations occurred.")

    return time.time() - start_time
def test_int8_small_input(self, n, rand_seed):
    """Check Int8 FC on a small ``n`` x ``n`` input against the reference ops.

    The reference net (Int8QuantizeNNPI -> Int8FCFakeAcc32NNPI ->
    Int8DequantizeNNPI) is run first. Its op types are then rewritten to
    Int8Quantize / Int8FC / Int8Dequantize, the net is lowered with
    ``onnxifi_caffe2_net`` and run again. Outputs that are not
    ``np.allclose`` dump debug information and fail the test.
    """
    print("n={}, rand_seed={}".format(n, rand_seed))
    np.random.seed(rand_seed)
    workspace.ResetWorkspace()

    # Small random activations, identity weights and a zero bias.
    X_fp32 = np.random.uniform(0.01, 0.03, size=(n, n)).astype(np.float32)
    W_fp32 = np.identity(n, dtype=np.float32)
    b_fp32 = np.zeros((n,), dtype=np.float32)

    X_scale, X_zero_point = self._get_scale_zp(X_fp32)

    for blob_name, blob_value in (("X", X_fp32),
                                  ("W", W_fp32),
                                  ("b", b_fp32)):
        workspace.FeedBlob(blob_name, blob_value)

    pack_weight_op = core.CreateOperator(
        "Int8FCPackWeight",
        ["W"],
        ["W_int8"],
        engine="DNNLOWP",
        save_unpacked_weights=True,
        in_scale=X_scale,
    )
    workspace.RunOperatorOnce(pack_weight_op)

    # The output reuses the input's quantization parameters.
    quant_params = {"Y_scale": X_scale, "Y_zero_point": X_zero_point}
    ref_net = core.Net("net")
    ref_net.Int8QuantizeNNPI(["X"], ["X_int8"], **quant_params)
    ref_net.Int8FCFakeAcc32NNPI(
        ["X_int8", "W_int8", "b"], ["Y_int8"], **quant_params)
    ref_net.Int8DequantizeNNPI(["Y_int8"], ["Y"])
    ref_net.Proto().external_output.append("Y")

    # run ref_net
    workspace.RunNetOnce(ref_net)
    Y_fbgemm = workspace.FetchBlob("Y")

    # run onnxifi net: same graph, with the three op types swapped in place
    replacement_types = ("Int8Quantize", "Int8FC", "Int8Dequantize")
    for position, op_type in enumerate(replacement_types):
        ref_net.Proto().op[position].type = op_type

    net_onnxified = onnxifi_caffe2_net(
        ref_net.Proto(),
        {},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
        weight_names=["W_int8", "b"],
    )

    # Exactly one Onnxifi op is expected in the lowered net.
    num_onnxified_ops = sum(op.type == "Onnxifi" for op in net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.CreateNet(net_onnxified)
    workspace.RunNet(net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")

    if np.allclose(Y_glow, Y_fbgemm):
        return

    diff_Y = np.abs(Y_glow - Y_fbgemm)
    print_test_debug_info(
        "int8_fc",
        {
            "seed": rand_seed,
            "n": n,
            "X": X_fp32,
            "W": W_fp32,
            "b": b_fp32,
            "Y_fbgemm": Y_fbgemm,
            "Y_glow": Y_glow,
            "diff": diff_Y,
            "maxdiff": diff_Y.max(axis=1),
        },
    )
    assert 0
t2 = time.time() print('Finish loading model in %.4fs' % (t2 - t1)) t1 = time.time() data_list = [ np.random.uniform( -1, 1, (args.batch_size, 3, im_size, im_size)).astype(np.float32) for i in range(int(np.ceil(1.0 * args.n_sample / args.batch_size))) ] t2 = time.time() print('Generate %d random images in %.4fs!' % (args.n_sample, t2 - t1)) # dry run for i in range(5): workspace.FeedBlob('data', data_list[i], device_opts) workspace.RunNet(net_def.name, 1) print('Finish dry run(5 times)') t_list = [] t_start = time.time() for i in range(args.n_epoch): t1 = time.time() for j, batch in enumerate(data_list): workspace.FeedBlob('data', batch, device_opts) workspace.RunNet(net_def.name, 1) t2 = time.time() t_list.append(t2 - t1) if args.verbose: print('Epoch %d, finish %d images in %.4fs, speed = %.4f image/s' %