# Shared imports for the snippets below. Project-specific helpers (e.g.
# SlowFast, KineticsDataset, parse_config, COCODataset, FLAGS, and the
# PaddleNLP tokenizer/dataset utilities) come from their respective modules
# and are omitted here.
import json
import math
import os
import time
from functools import partial

import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.nn as nn
from paddle.fluid.dygraph import to_variable
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler


def infer_slowfast(args):
    config = parse_config(args.config_file)
    infer_config = merge_configs(config, 'infer', vars(args))
    print_configs(infer_config, "Infer")

    if not os.path.isdir(infer_config.INFER.save_path):
        os.makedirs(infer_config.INFER.save_path)

    if not args.use_gpu:
        place = fluid.CPUPlace()
    elif not args.use_data_parallel:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CUDAPlace(ParallelEnv().dev_id)

    _nranks = ParallelEnv().nranks  # number of GPUs
    bs_single = int(infer_config.INFER.batch_size / _nranks)  # batch size on each GPU

    with fluid.dygraph.guard(place):
        # build model
        slowfast = SlowFast(cfg=infer_config, num_classes=400)
        if args.weights:
            assert os.path.exists(args.weights + '.pdparams'), \
                "Given weight dir {} does not exist.".format(args.weights)
            logger.info('load test weights from {}'.format(args.weights))
            model_dict, _ = fluid.load_dygraph(args.weights)
            slowfast.set_dict(model_dict)

        if args.use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()
            slowfast = fluid.dygraph.parallel.DataParallel(
                slowfast, strategy, find_unused_parameters=False)

        # create reader
        infer_data = KineticsDataset(mode="infer", cfg=infer_config)
        infer_sampler = DistributedBatchSampler(infer_data,
                                                batch_size=bs_single,
                                                shuffle=False,
                                                drop_last=False)
        infer_loader = DataLoader(infer_data,
                                  batch_sampler=infer_sampler,
                                  places=place,
                                  feed_list=None,
                                  num_workers=0,
                                  return_list=True)

        # start infer
        num_ensemble_views = infer_config.INFER.num_ensemble_views
        num_spatial_crops = infer_config.INFER.num_spatial_crops
        num_cls = infer_config.MODEL.num_classes
        num_clips = num_ensemble_views * num_spatial_crops
        num_videos = len(infer_data) // num_clips
        video_preds = np.zeros((num_videos, num_cls))
        clip_count = {}

        video_paths = []
        with open(infer_config.INFER.filelist, "r") as f:
            for path in f.read().splitlines():
                video_paths.append(path)

        print(
            "[INFER] infer start, number of videos {}, number of clips {}, total number of clips {}"
            .format(num_videos, num_clips, num_clips * num_videos))

        slowfast.eval()
        for batch_id, data in enumerate(infer_loader):
            # call net
            model_inputs = [data[0], data[1]]
            preds = slowfast(model_inputs, training=False)
            clip_ids = data[3]

            # gather results from multiple cards; the following processing
            # is then identical on every card.
            if _nranks > 1:
                preds = _all_gather(preds, _nranks)
                clip_ids = _all_gather(clip_ids, _nranks)

            # to numpy
            preds = preds.numpy()
            clip_ids = clip_ids.numpy()

            # preds ensemble
            for ind in range(preds.shape[0]):
                vid_id = int(clip_ids[ind]) // num_clips
                ts_idx = int(clip_ids[ind]) % num_clips
                if vid_id not in clip_count:
                    clip_count[vid_id] = []
                if ts_idx in clip_count[vid_id]:
                    print(
                        "[INFER] Skipped!! read video {} clip index {} / {} repeatedly."
                        .format(vid_id, ts_idx, clip_ids[ind]))
                else:
                    clip_count[vid_id].append(ts_idx)
                    video_preds[vid_id] += preds[ind]  # ensemble method: sum
            if batch_id % args.log_interval == 0:
                print("[INFER] Processing batch {}/{} ...".format(
                    batch_id,
                    len(infer_data) // infer_config.INFER.batch_size))

        # check that every clip index of each video was seen exactly once
        for key in clip_count.keys():
            if len(clip_count[key]) != num_clips or sum(
                    clip_count[key]) != num_clips * (num_clips - 1) / 2:
                print(
                    "[INFER] Warning!! video [{}] clip count [{}] does not match number of clips {}"
                    .format(key, clip_count[key], num_clips))

        res_list = []
        for j in range(video_preds.shape[0]):
            pred = to_variable(video_preds[j] / num_clips)  # mean prob
            video_path = video_paths[j]
            top1_values, top1_indices = fluid.layers.topk(pred, k=1)
            top5_values, top5_indices = fluid.layers.topk(pred, k=5)
            top1_values = top1_values.numpy().astype("float64")[0]
            top1_indices = int(top1_indices.numpy()[0])
            top5_values = list(top5_values.numpy().astype("float64"))
            top5_indices = [
                int(item) for item in top5_indices.numpy()
            ]  # np.int is not JSON serializable
            print("[INFER] video id [{}], top1 value {}, top1 indices {}".
                  format(video_path, top1_values, top1_indices))
            print("[INFER] video id [{}], top5 value {}, top5 indices {}".
                  format(video_path, top5_values, top5_indices))
            save_dict = {
                'video_id': video_path,
                'top1_values': top1_values,
                'top1_indices': top1_indices,
                'top5_values': top5_values,
                'top5_indices': top5_indices
            }
            res_list.append(save_dict)

        with open(os.path.join(infer_config.INFER.save_path, 'result.json'),
                  'w') as f:
            json.dump(res_list, f)
        print('[INFER] infer finished, results saved in {}'.format(
            infer_config.INFER.save_path))
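# The ensembling above relies on a fixed clip layout: every video contributes
# exactly num_clips = num_ensemble_views * num_spatial_crops entries, and the
# global clip id encodes (video index, clip index) as vid_id * num_clips + ts_idx.
# The sanity check sum(clip_count[key]) == num_clips * (num_clips - 1) / 2 holds
# because a complete set of clip indices is {0, ..., num_clips - 1}. A minimal,
# self-contained sketch of this bookkeeping (toy numbers, no model involved):
def _demo_clip_bookkeeping():
    num_clips = 3 * 2  # e.g. 3 ensemble views x 2 spatial crops
    num_videos = 2
    seen = {}
    for clip_id in range(num_videos * num_clips):  # ids as emitted by the reader
        vid_id, ts_idx = clip_id // num_clips, clip_id % num_clips
        seen.setdefault(vid_id, []).append(ts_idx)
    for vid_id, ts_list in seen.items():
        # complete <=> num_clips entries summing to 0 + 1 + ... + (num_clips - 1)
        assert len(ts_list) == num_clips
        assert sum(ts_list) == num_clips * (num_clips - 1) // 2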
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    if args.task_type == "cross-lingual-transfer":
        train_ds = load_dataset("xnli", "en", splits="train")
        train_ds = train_ds.map(trans_func, lazy=True)
    elif args.task_type == "translate-train-all":
        all_train_ds = []
        for language in all_languages:
            train_ds = load_dataset("xnli", language, splits="train")
            all_train_ds.append(train_ds.map(trans_func, lazy=True))
        train_ds = XnliDataset(all_train_ds)
    train_batch_sampler = DistributedBatchSampler(train_ds,
                                                  batch_size=args.batch_size,
                                                  shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # position_ids
        Pad(axis=0, pad_val=0, dtype="int64"),  # attention_mask
        Stack(dtype="int64")  # labels
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    num_classes = 3  # XNLI: entailment / neutral / contradiction
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes, dropout=args.dropout)
    n_layers = model.ernie_m.config['num_hidden_layers']
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    # Map framework parameter names back to layer names, so the optimizer
    # can locate each parameter's layer for layer-wise LR decay.
    name_dict = dict()
    for n, p in model.named_parameters():
        name_dict[p.name] = n

    optimizer = AdamWDL(learning_rate=lr_scheduler,
                        beta1=0.9,
                        beta2=0.999,
                        epsilon=args.adam_epsilon,
                        parameters=model.parameters(),
                        weight_decay=args.weight_decay,
                        n_layers=n_layers,
                        layerwise_decay=args.layerwise_decay,
                        apply_decay_param_fun=lambda x: x in decay_params,
                        name_dict=name_dict)

    loss_fct = nn.CrossEntropyLoss()
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
    metric = Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, position_ids, attention_mask, labels = batch
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["layer_norm", "softmax", "gelu"]):
                logits = model(input_ids, position_ids, attention_mask)
                loss = loss_fct(logits, labels)
            if args.use_amp:
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                scaler.minimize(optimizer, scaled_loss)
            else:
                loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                for language in all_languages:
                    tic_eval = time.time()
                    test_data_loader = get_test_dataloader(
                        args, language, batchify_fn, trans_func)
                    evaluate(model, loss_fct, metric, test_data_loader,
                             language)
                    print("eval done total : %s s" % (time.time() - tic_eval))
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir,
                        "ernie_m_ft_model_%d.pdparams" % (global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need a better way to get the inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                break
        if global_step >= num_training_steps:
            break

    if paddle.distributed.get_rank() == 0:
        output_dir = os.path.join(
            args.output_dir, "ernie_m_final_model_%d.pdparams" % global_step)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Need a better way to get the inner model of DataParallel
        model_to_save = model._layers if isinstance(
            model, paddle.DataParallel) else model
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
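# AdamWDL couples AdamW with layer-wise learning-rate decay: parameters in
# deeper (later) encoder layers keep a larger share of the base LR, while
# embeddings and early layers are damped hardest. A minimal sketch of that
# scaling rule, assumed to mirror PaddleNLP's scheme (the real optimizer
# reads the layer index out of the parameter name via name_dict):
def _demo_layerwise_lr(base_lr=5e-5, layerwise_decay=0.8, n_layers=12):
    # embeddings sit "below" layer 0; encoder layer i gets decay^(n_layers - i)
    lr_embeddings = base_lr * layerwise_decay ** (n_layers + 1)
    lr_per_layer = [
        base_lr * layerwise_decay ** (n_layers - layer)
        for layer in range(n_layers)
    ]
    lr_head = base_lr  # task head / classifier: no extra damping
    return lr_embeddings, lr_per_layer, lr_head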
def main():
    if FLAGS.static:
        paddle.enable_static()
    device = paddle.set_device(FLAGS.device)

    if not FLAGS.eval_only:  # training mode
        train_transform = Compose([
            ColorDistort(),
            RandomExpand(),
            RandomCrop(),
            RandomFlip(),
            NormalizeBox(),
            PadBox(),
            BboxXYXY2XYWH()
        ])
        train_collate_fn = BatchCompose([RandomShape(), NormalizeImage()])
        dataset = COCODataset(
            dataset_dir=FLAGS.data,
            anno_path='annotations/instances_train2017.json',
            image_dir='train2017',
            with_background=False,
            mixup=True,
            transform=train_transform)
        batch_sampler = DistributedBatchSampler(dataset,
                                                batch_size=FLAGS.batch_size,
                                                shuffle=True,
                                                drop_last=True)
        loader = DataLoader(dataset,
                            batch_sampler=batch_sampler,
                            num_workers=FLAGS.num_workers,
                            return_list=True,
                            collate_fn=train_collate_fn)
    else:  # evaluation mode
        eval_transform = Compose([
            ResizeImage(target_size=608),
            NormalizeBox(),
            PadBox(),
            BboxXYXY2XYWH()
        ])
        eval_collate_fn = BatchCompose([NormalizeImage()])
        dataset = COCODataset(
            dataset_dir=FLAGS.data,
            anno_path='annotations/instances_val2017.json',
            image_dir='val2017',
            with_background=False,
            transform=eval_transform)
        # batch_size can only be 1 in evaluation for YOLOv3,
        # since the predicted bbox is a LoDTensor
        batch_sampler = DistributedBatchSampler(dataset,
                                                batch_size=1,
                                                shuffle=False,
                                                drop_last=False)
        loader = DataLoader(dataset,
                            batch_sampler=batch_sampler,
                            num_workers=FLAGS.num_workers,
                            return_list=True,
                            collate_fn=eval_collate_fn)

    pretrained = FLAGS.eval_only and FLAGS.weights is None
    model = yolov3_darknet53(
        num_classes=dataset.num_classes,
        num_max_boxes=NUM_MAX_BOXES,
        model_mode='eval' if FLAGS.eval_only else 'train',
        pretrained=pretrained)

    if FLAGS.pretrain_weights and not FLAGS.eval_only:
        model.load(FLAGS.pretrain_weights,
                   skip_mismatch=True,
                   reset_optimizer=True)

    optim = make_optimizer(len(batch_sampler), parameters=model.parameters())
    model.prepare(optimizer=optim,
                  loss=YoloLoss(num_classes=dataset.num_classes))

    # NOTE: we implement the COCO metric of the YOLOv3 model here, separately
    # from the 'prepare' and 'fit' framework, for the following reasons:
    # 1. The YOLOv3 network structure differs between 'train' and 'eval'
    #    mode; in 'eval' mode the output is the predicted bbox, not the
    #    feature maps used for the YoloLoss calculation.
    # 2. The COCO metric also behaves differently from a standard Metric:
    #    it should not accumulate on each iteration, but only once at the
    #    end of an epoch.
    if FLAGS.eval_only:
        if FLAGS.weights is not None:
            model.load(FLAGS.weights, reset_optimizer=True)
        preds = model.predict(loader, stack_outputs=False)
        _, _, _, img_ids, bboxes = preds

        anno_path = os.path.join(FLAGS.data,
                                 'annotations/instances_val2017.json')
        coco_metric = COCOMetric(anno_path=anno_path, with_background=False)
        for img_id, bbox in zip(img_ids, bboxes):
            coco_metric.update(img_id, bbox)
        coco_metric.accumulate()
        coco_metric.reset()
        return

    if FLAGS.resume is not None:
        model.load(FLAGS.resume)

    save_dir = FLAGS.save_dir or 'yolo_checkpoint'
    model.fit(train_data=loader,
              epochs=FLAGS.epoch - FLAGS.no_mixup_epoch,
              save_dir=os.path.join(save_dir, "mixup"),
              save_freq=10)

    # do not use the image mixup transform in the last
    # FLAGS.no_mixup_epoch epochs
    dataset.mixup = False
    model.fit(train_data=loader,
              epochs=FLAGS.no_mixup_epoch,
              save_dir=os.path.join(save_dir, "no_mixup"),
              save_freq=5)
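# make_optimizer is defined elsewhere in this example. A plausible minimal
# sketch under common YOLOv3 settings (piecewise LR decay with linear warmup,
# SGD with momentum); the milestones, warmup length, and hyper-parameters
# below are illustrative assumptions, not the exact values used here:
def make_optimizer_sketch(step_per_epoch, parameters=None):
    base_lr = 0.001  # assumed base learning rate
    boundaries = [step_per_epoch * e for e in [200, 250]]  # assumed milestones
    values = [base_lr * (0.1 ** i) for i in range(len(boundaries) + 1)]
    lr = paddle.optimizer.lr.PiecewiseDecay(boundaries=boundaries,
                                            values=values)
    lr = paddle.optimizer.lr.LinearWarmup(learning_rate=lr,
                                          warmup_steps=500,
                                          start_lr=0.0,
                                          end_lr=base_lr)
    return paddle.optimizer.Momentum(
        learning_rate=lr,
        momentum=0.9,
        weight_decay=paddle.regularizer.L2Decay(5e-4),
        parameters=parameters)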
def build_dataloader(dataset,
                     batch_size,
                     num_workers,
                     places=None,
                     shuffle=True,
                     drop_last=True,
                     multigrid=False,
                     collate_fn_cfg=None,
                     **kwargs):
    """Build a Paddle Dataloader.

    If no custom 'sampler' is passed via kwargs, a DistributedBatchSampler
    is used so that each card iterates over its own shard of the dataset;
    otherwise the user-defined sampler is wrapped in a plain BatchSampler.

    Args:
        dataset (paddle.io.Dataset): A PaddlePaddle dataset object.
        batch_size (int): Batch size on a single card.
        num_workers (int): Number of subprocesses used for data loading.
        shuffle (bool): Whether to shuffle the data at every epoch.
    """
    if not kwargs.get('sampler'):
        batch_sampler = DistributedBatchSampler(dataset,
                                                batch_size=batch_size,
                                                shuffle=shuffle,
                                                drop_last=drop_last)
    else:
        # pop 'sampler' so it is not forwarded to DataLoader below; a custom
        # sampler already fixes the iteration order, so dataset/shuffle must
        # not be passed to BatchSampler alongside it.
        sampler = build_sampler(kwargs.pop('sampler'))
        batch_sampler = BatchSampler(sampler=sampler,
                                     batch_size=batch_size,
                                     drop_last=drop_last)
    kwargs.update({'batch_sampler': batch_sampler})

    # NOTE(shipping): when mix operators such as mixup or cutmix are switched
    # on, a batch like [[img, label, attribute, ...], [img, label, attribute, ...], ...]
    # is re-collated to [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...],
    # i.e. transposed the way numpy.transpose would do it.
    def mix_collate_fn(batch):
        pipeline = build_batch_pipeline(collate_fn_cfg)
        batch = pipeline(batch)
        slots = []
        for items in batch:
            for i, item in enumerate(items):
                if len(slots) < len(items):
                    slots.append([item])
                else:
                    slots[i].append(item)
        return [np.stack(slot, axis=0) for slot in slots]

    data_loader = DataLoader(
        dataset,
        places=places,
        num_workers=num_workers,
        collate_fn=mix_collate_fn if collate_fn_cfg is not None else None,
        **kwargs)

    return data_loader
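# Example call: batch-level mix ops are enabled purely by passing a
# collate_fn_cfg, which mix_collate_fn turns into a batch pipeline before
# re-collating sample tuples into per-field arrays. The cfg contents below
# are hypothetical placeholders for whatever build_batch_pipeline expects
# in this codebase:
def _demo_build_dataloader(train_dataset):
    loader = build_dataloader(
        train_dataset,
        batch_size=16,  # per-card batch size
        num_workers=4,
        shuffle=True,
        drop_last=True,
        collate_fn_cfg=[dict(name='Mixup', alpha=0.2)],  # hypothetical config
    )
    return loader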