def _do_broadcast(all_blobs):
    """Broadcast the blob values on GPU 0 to the corresponding blobs on all
    other GPUs."""
    assert len(all_blobs) % cfg.NUM_GPUS == 0, \
        ('Unexpected value for NUM_GPUS. Make sure you are not '
         'running single-GPU inference with NUM_GPUS > 1.')
    blobs_per_gpu = int(len(all_blobs) / cfg.NUM_GPUS)
    for i in range(blobs_per_gpu):
        blobs = [p for p in all_blobs[i::blobs_per_gpu]]
        data = workspace.FetchBlob(blobs[0])
        logger.debug('Broadcasting {} to'.format(str(blobs[0])))
        # Feed the GPU 0 value into the same blob on every other GPU
        for j, p in enumerate(blobs[1:]):
            logger.debug(' |-> {}'.format(str(p)))
            with c2_utils.CudaScope(j + 1):
                workspace.FeedBlob(p, data)
def InitializeLossWeight(self):
    """Seed the learnable loss-weight blobs on every GPU with equal initial
    values."""
    weight_cls1 = np.array([0.5]).astype(np.float32)
    weight_cls2 = np.array([0.5]).astype(np.float32)
    weight_bbox1 = np.array([0.5]).astype(np.float32)
    weight_bbox2 = np.array([0.5]).astype(np.float32)
    for i in range(cfg.NUM_GPUS):
        with c2_utils.CudaScope(i):
            workspace.FeedBlob('gpu_{}/weight_cls1'.format(i), weight_cls1)
            workspace.FeedBlob('gpu_{}/weight_cls2'.format(i), weight_cls2)
            workspace.FeedBlob('gpu_{}/weight_bbox1'.format(i), weight_bbox1)
            workspace.FeedBlob('gpu_{}/weight_bbox2'.format(i), weight_bbox2)
def _add_allreduce_graph(model):
    """Construct the graph that performs Allreduce on the gradients."""
    # Need to all-reduce the per-GPU gradients if training with more than 1 GPU
    all_params = model.TrainableParams()
    assert len(all_params) % cfg.NUM_GPUS == 0
    # The model parameters are replicated on each GPU, get the number of
    # distinct parameter blobs (i.e., the number of parameter blobs on
    # each GPU)
    params_per_gpu = int(len(all_params) / cfg.NUM_GPUS)
    with c2_utils.CudaScope(0):
        # Iterate over distinct parameter blobs
        for i in range(params_per_gpu):
            # Gradients from all GPUs for this parameter blob
            gradients = [
                model.param_to_grad[p] for p in all_params[i::params_per_gpu]
            ]
            if len(gradients) > 0:
                if cfg.USE_NCCL:
                    model.net.NCCLAllreduce(gradients, gradients)
                else:
                    muji.Allreduce(model.net, gradients, reduced_affix='')
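# Minimal numpy sketch (illustrative helper, not used by the training loop):
# after the Allreduce above, every GPU holds the same reduced gradient for a
# given parameter blob, i.e. the elementwise sum of that blob's per-GPU
# gradients. The helper name and list-of-arrays interface are assumptions for
# illustration only.
def _sketch_allreduce(per_gpu_grads):
    """Return what each replica would hold after a sum Allreduce.

    Example: _sketch_allreduce([np.array([0.1, -0.2]), np.array([0.3, 0.4])])
    returns two copies of array([0.4, 0.2]).
    """
    reduced = np.sum(per_gpu_grads, axis=0)
    return [reduced.copy() for _ in per_gpu_grads]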
def _CorrectMomentum(self, correction):
    """The MomentumSGDUpdate op implements the update V as

        V := mu * V + lr * grad,

    where mu is the momentum factor, lr is the learning rate, and grad is
    the stochastic gradient. Since V is not defined independently of the
    learning rate (as it should ideally be), when the learning rate is
    changed we should scale the update history V in order to make it
    compatible in scale with lr * grad.
    """
    logger.info(
        'Scaling update history by {:.6f} (new lr / old lr)'.format(
            correction))
    for i in range(cfg.NUM_GPUS):
        with c2_utils.CudaScope(i):
            for param in self.TrainableParams(gpu_id=i):
                op = core.CreateOperator(
                    'Scale', [param + '_momentum'], [param + '_momentum'],
                    scale=correction)
                workspace.RunOperatorOnce(op)
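# Illustrative numpy check (assumed helper name, never called by the trainer):
# with the update V := mu * V + lr * grad, the stored history V carries a
# factor of lr, so an lr-independent history V' := mu * V' + grad satisfies
# V == lr * V'. Rescaling V by new_lr / old_lr, as _CorrectMomentum does,
# restores V == new_lr * V' after a learning-rate change.
def _sketch_momentum_correction(mu=0.9, old_lr=0.1, new_lr=0.01, steps=5):
    grad = np.array([1.0, -2.0])
    v = np.zeros_like(grad)      # lr-coupled history (what the op stores)
    v_ind = np.zeros_like(grad)  # hypothetical lr-independent history
    for _ in range(steps):
        v = mu * v + old_lr * grad
        v_ind = mu * v_ind + grad
    v *= new_lr / old_lr         # the correction applied above
    return np.allclose(v, new_lr * v_ind)  # True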
def UpdateLossWeight(self):
    scale = 10
    # set bias for the weight > 0 constraint
    bias = 0.5
    lr = workspace.FetchBlob('gpu_0/lr').astype(np.float32)

    weight_cls1 = workspace.FetchBlob('gpu_0/weight_cls1').astype(
        np.float32)
    weight_cls2 = workspace.FetchBlob('gpu_0/weight_cls2').astype(
        np.float32)
    #loss_cls1 = workspace.FetchBlob('gpu_0/loss_cls1')
    #loss_cls2 = workspace.FetchBlob('gpu_0/loss_cls2')
    weight_cls1 -= lr * scale * workspace.FetchBlob(
        'gpu_0/weight_cls1_grad') + bias
    weight_cls2 -= lr * scale * workspace.FetchBlob(
        'gpu_0/weight_cls2_grad') + bias
    # Renormalize jointly so the two weights sum to one; the common sum must
    # be computed before either weight is overwritten
    total_cls = weight_cls1 + weight_cls2
    weight_cls1 = weight_cls1 / total_cls
    weight_cls2 = weight_cls2 / total_cls

    weight_bbox1 = workspace.FetchBlob('gpu_0/weight_bbox1').astype(
        np.float32)
    weight_bbox2 = workspace.FetchBlob('gpu_0/weight_bbox2').astype(
        np.float32)
    #loss_bbox1 = workspace.FetchBlob('gpu_0/loss_bbox1')
    #loss_bbox2 = workspace.FetchBlob('gpu_0/loss_bbox2')
    weight_bbox1 -= lr * scale * workspace.FetchBlob(
        'gpu_0/weight_bbox1_grad') + bias
    weight_bbox2 -= lr * scale * workspace.FetchBlob(
        'gpu_0/weight_bbox2_grad') + bias
    total_bbox = weight_bbox1 + weight_bbox2
    weight_bbox1 = weight_bbox1 / total_bbox
    weight_bbox2 = weight_bbox2 / total_bbox

    for i in range(cfg.NUM_GPUS):
        with c2_utils.CudaScope(i):
            workspace.FeedBlob('gpu_{}/weight_cls1'.format(i), weight_cls1)
            workspace.FeedBlob('gpu_{}/weight_cls2'.format(i), weight_cls2)
            workspace.FeedBlob('gpu_{}/weight_bbox1'.format(i), weight_bbox1)
            workspace.FeedBlob('gpu_{}/weight_bbox2'.format(i), weight_bbox2)
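# Small numpy check (illustrative helper, assumed name) of the renormalization
# in UpdateLossWeight: dividing both weights by their common sum keeps them
# summing to one, whereas normalizing sequentially (reusing the already
# normalized first weight in the second denominator) does not.
def _sketch_weight_renorm(w1=0.7, w2=0.4):
    total = w1 + w2
    joint_sum = w1 / total + w2 / total  # == 1.0

    w1_seq = w1 / (w1 + w2)
    w2_seq = w2 / (w1_seq + w2)          # sequential (incorrect) variant
    seq_sum = w1_seq + w2_seq            # != 1.0 in general

    return joint_sum, seq_sum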
def _SetNewLr(self, cur_lr, new_lr):
    """Do the actual work of updating the model and workspace blobs."""
    for i in range(cfg.NUM_GPUS):
        with c2_utils.CudaScope(i):
            workspace.FeedBlob(
                'gpu_{}/lr'.format(i), np.array([new_lr], dtype=np.float32))
            lr_scale_new_param = cfg.SOLVER.LR_SCALE_NEW_PARAM
            workspace.FeedBlob(
                'gpu_{}/lr_new_param'.format(i),
                np.array([new_lr * lr_scale_new_param], dtype=np.float32))
            lr_scale_new_fc = cfg.SOLVER.LR_SCALE_NEW_FC
            workspace.FeedBlob(
                'gpu_{}/lr_new_fc'.format(i),
                np.array([new_lr * lr_scale_new_fc], dtype=np.float32))
    ratio = _get_lr_change_ratio(cur_lr, new_lr)
    if cfg.SOLVER.SCALE_MOMENTUM and cur_lr > 1e-7 and \
            ratio > cfg.SOLVER.SCALE_MOMENTUM_THRESHOLD:
        self._CorrectMomentum(new_lr / cur_lr)
def GenerateProposals(self, blobs_in, blobs_out, anchors, spatial_scale):
    """Op for generating RPN proposals.

    blobs_in:
      - 'rpn_cls_probs': 4D tensor of shape (N, A, H, W), where N is the
        number of minibatch images, A is the number of anchors per
        location, and (H, W) is the spatial size of the prediction grid.
        Each value represents a "probability of object" rating in [0, 1].
      - 'rpn_bbox_pred': 4D tensor of shape (N, 4 * A, H, W) of predicted
        deltas used to transform anchor boxes into RPN proposals.
      - 'im_info': 2D tensor of shape (N, 3) where the three columns encode
        the input image's [height, width, scale]. Height and width are for
        the input to the network, not the original image; scale is the
        scale factor used to scale the original image to the network input
        size.

    blobs_out:
      - 'rpn_rois': 2D tensor of shape (R, 5), for R RPN proposals where the
        five columns encode [batch ind, x1, y1, x2, y2]. The boxes are
        w.r.t. the network input, which is a *scaled* version of the
        original image; these proposals must be scaled by 1 / scale (where
        scale comes from im_info; see above) to transform them back to the
        original input image coordinate system.
      - 'rpn_roi_probs': 1D tensor of objectness probability scores
        (extracted from rpn_cls_probs; see above).
    """
    cfg_key = 'TRAIN' if self.train else 'TEST'
    if cfg[cfg_key].GENERATE_PROPOSALS_ON_GPU:
        rpn_pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
        rpn_post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
        rpn_nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
        rpn_min_size = float(cfg[cfg_key].RPN_MIN_SIZE)
        input_name = str(blobs_in[0])
        lvl = int(input_name[-1]) if input_name[-1].isdigit() else None
        anchors_name = 'anchors{}'.format(lvl) if lvl else 'anchors'
        for i in range(cfg.NUM_GPUS):
            with c2_utils.CudaScope(i):
                workspace.FeedBlob(
                    'gpu_{}/{}'.format(i, anchors_name),
                    anchors.astype(np.float32))
        self.net.GenerateProposals(
            blobs_in + [anchors_name],
            blobs_out,
            spatial_scale=spatial_scale,
            pre_nms_topN=rpn_pre_nms_topN,
            post_nms_topN=rpn_post_nms_topN,
            nms_thresh=rpn_nms_thresh,
            min_size=rpn_min_size,
        )
    else:
        name = 'GenerateProposalsOp:' + ','.join(
            [str(b) for b in blobs_in])
        # spatial_scale passed to the Python op is only used in
        # convert_pkl_to_pb
        self.net.Python(
            GenerateProposalsOp(anchors, spatial_scale, self.train).forward
        )(blobs_in, blobs_out, name=name, spatial_scale=spatial_scale)
    return blobs_out
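# Illustrative single-box sketch (plain numpy, assumed helper name) of the
# standard R-CNN box transform underlying the proposal generation described in
# the docstring above: a predicted delta (dx, dy, dw, dh) shifts an anchor's
# center and rescales its size. The real op works on whole batched feature
# maps and also applies clipping and NMS; this only shows the delta arithmetic.
def _sketch_apply_delta(anchor, delta):
    x1, y1, x2, y2 = anchor
    w, h = x2 - x1 + 1.0, y2 - y1 + 1.0
    cx, cy = x1 + 0.5 * w, y1 + 0.5 * h
    dx, dy, dw, dh = delta
    pred_cx, pred_cy = cx + dx * w, cy + dy * h
    pred_w, pred_h = w * np.exp(dw), h * np.exp(dh)
    # Back to corner coordinates (same +1 pixel convention as above)
    return np.array([
        pred_cx - 0.5 * pred_w, pred_cy - 0.5 * pred_h,
        pred_cx + 0.5 * pred_w - 1.0, pred_cy + 0.5 * pred_h - 1.0
    ])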
def softmax_surgery(model):
    """Rewrite each per-GPU 'fc8d_t' Softmax so that it runs over the logits
    gathered from all GPUs: the input logits are copied across devices,
    concatenated along the class axis, fed through a single Softmax, and the
    result is split back so every GPU keeps only its own slice.
    """
    print('softmax surgery')
    gpu_prefixs = ['gpu_' + str(i) for i in range(cfg.NUM_GPUS)]
    old_ops = model.net._net.op[:]
    num_op = len(model.net._net.op)
    is_end = False
    del model.net._net.op[:]
    # Index of the last processed op for each GPU stream (-1 = none yet)
    gpu_point = {gpu_prefix: -1 for gpu_prefix in gpu_prefixs}
    while True:
        # Copy ops verbatim up to the next 'fc8d_t' Softmax of each GPU stream
        for gpu_prefix in gpu_prefixs:
            for i, op in enumerate(old_ops):
                if i <= gpu_point[gpu_prefix]:
                    continue
                if op.input[0].split('/')[0] != gpu_prefix:
                    continue
                if op.type == 'Softmax' and 'fc8d_t' in op.input[0]:
                    gpu_point[gpu_prefix] = i
                    # print(op)
                    print('found softmax: ', op.input[0], '\t-->\t',
                          op.output[0])
                    break
                model.net._net.op.extend([op])
                if i == num_op - 1:
                    is_end = True
        # All remaining ops have been copied; no softmax left to rewrite
        if is_end:
            break
        # No softmax found for some GPU stream: nothing (more) to do
        if any(gpu_point[gpu_prefix] == -1 for gpu_prefix in gpu_prefixs):
            break
        # The softmaxes found on the different GPUs must refer to the same blob
        ref_name = old_ops[gpu_point[gpu_prefixs[0]]].input[0].split('/')[1]
        for gpu_prefix in gpu_prefixs[1:]:
            assert old_ops[gpu_point[gpu_prefix]].input[0].split('/')[1] == \
                ref_name
        in_blobs = []
        out_blobs = []
        for gpu_prefix in gpu_prefixs:
            in_blobs.append(old_ops[gpu_point[gpu_prefix]].input[0])
            out_blobs.append(old_ops[gpu_point[gpu_prefix]].output[0])
        in_blob_name = in_blobs[0].split('/')[1]
        out_blob_name = out_blobs[0].split('/')[1]
        for gpu_prefix in gpu_prefixs:
            gpu_id = int(gpu_prefix.split('_')[1])
            with c2_utils.CudaScope(gpu_id):
                # Pull the other GPUs' logits onto this device; no gradient
                # flows back through the copies
                for i in range(cfg.NUM_GPUS):
                    if gpu_id == i:
                        continue
                    model.net.Copy(
                        in_blobs[i],
                        gpu_prefix + '/' + in_blob_name + '_gpu_' + str(i))
                    model.net.StopGradient(
                        gpu_prefix + '/' + in_blob_name + '_gpu_' + str(i),
                        gpu_prefix + '/' + in_blob_name + '_gpu_' + str(i))
                # Concatenate all logit slices along the class axis
                concat_in_blobs = [
                    gpu_prefix + '/' + in_blob_name + '_gpu_' + str(i)
                    for i in range(cfg.NUM_GPUS)
                ]
                concat_in_blobs[gpu_id] = in_blobs[gpu_id]
                model.net.Concat(
                    concat_in_blobs,
                    [gpu_prefix + '/' + in_blob_name + '_cross',
                     gpu_prefix + '/' + in_blob_name + '_cross_split_info'],
                    axis=1)
                # Reuse the original Softmax op on the concatenated logits
                op = old_ops[gpu_point[gpu_prefix]]
                op.input[0] = gpu_prefix + '/' + in_blob_name + '_cross'
                op.output[0] = gpu_prefix + '/' + out_blob_name + '_cross'
                model.net._net.op.extend([op])
                # Split the cross-GPU softmax output and keep only this GPU's
                # slice; the other slices are unused
                split_out_blobs = [
                    gpu_prefix + '/' + str(i) + '_useless'
                    for i in range(len(out_blobs))
                ]
                split_out_blobs[gpu_id] = out_blobs[gpu_id]
                model.net.Split(
                    [gpu_prefix + '/' + out_blob_name + '_cross',
                     gpu_prefix + '/' + in_blob_name + '_cross_split_info'],
                    split_out_blobs,
                    axis=1)
    return
    # Debug-only inspection; unreachable after the return above
    num_op = len(model.net._net.op)
    for i, op in enumerate(model.net._net.op):
        print(op)
    exit(0)
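# Numpy sketch (illustrative helper, assumed name and shapes) of why
# softmax_surgery concatenates the per-GPU logit slices before the Softmax op:
# the softmax denominator must cover every class, so a softmax computed over
# only the local slice of classes is not equivalent to the full softmax.
def _sketch_cross_gpu_softmax(num_gpus=4, num_rois=2, classes_per_gpu=2):
    def softmax(x):
        e = np.exp(x - x.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)

    full_logits = np.random.randn(num_rois, num_gpus * classes_per_gpu)
    chunks = np.split(full_logits, num_gpus, axis=1)  # one class slice per GPU

    cross = softmax(np.concatenate(chunks, axis=1))   # what the surgery builds
    local = np.concatenate(
        [softmax(c) for c in chunks], axis=1)         # naive per-GPU softmax

    return (np.allclose(cross, softmax(full_logits)),   # True
            np.allclose(local, softmax(full_logits)))   # False in general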