def get_input(self, fact):
    key = (fact[0], fact[1])
    features = [0, 0, 0, 0, 0, 0, 0]
    if key in self.table:
        val_list = [x[0] for x in self.table[key].values()]
        if len(val_list) != 0:
            max_score = self.stat_table[key]['max_score']
            my_score = self.compute_score(fact)[0]
            simi = self.base_model.get_entity_similarity(
                fact[2], self.stat_table[key]['simi_index'])
            rank = utils.get_rank(val_list, my_score)
            conditional_rank = rank * 1.0 / len(val_list)
            mean = self.stat_table[key]['mean']
            std = self.stat_table[key]['std']
            features = [
                my_score, max_score, simi, rank, conditional_rank, mean, std
            ]
    return features
async def user_info(ctx, handle):
    url = f'{CF_USER_INFO}{handle}'  # no leading space, so requests gets a clean URL
    obj = requests.get(url)
    data = json.loads(obj.text)
    if data['status'] == "FAILED":
        await ctx.send(f'{data["comment"]}')
        return
    pic = "https:" + data['result'][0]['titlePhoto']
    rating = data['result'][0]['rating']
    colour, rank = utils.get_rank(rating)
    embed = discord.Embed(
        title=f'{handle}',
        description=f'{data["result"][0]["organization"]} {data["result"][0]["city"]} {data["result"][0]["country"]}',
        colour=colour)
    embed.add_field(name=f'{rating} ({rank})',
                    value=f'Max Rating : {data["result"][0]["maxRating"]}',
                    inline=False)
    embed.set_image(url=pic)
    await ctx.channel.send(embed=embed)
async def handle_sub_url(task_dict, db):
    result_dict = {
        'title': task_dict['title'],
        'date': task_dict['date'],
        'main_text': '',
        'img': '',
        'attachment': '',
        'rank': 0,
        'links_id': task_dict['link_id']
    }
    # use endswith so the 5-character '.docx' suffix also matches
    if task_dict['sub_url'].endswith(('.pdf', '.doc', '.docx', '.txt')):
        result_dict['attachment'] = task_dict['sub_url']
    else:
        request = request_tools.Request(url=task_dict['sub_url'])
        await request.get_page_async()
        content = request.text
        print(task_dict['sub_url'], request.status_info)
        if content:  # connection succeeded
            result_dict['main_text'], result_dict['attachment'], result_dict['img'], result_dict['title'], result_dict['date'] = \
                parse(content, db, task_dict['sub_url'], task_dict['main_text_pattern'],
                      task_dict['date_pattern'], task_dict['source_pattern'],
                      task_dict['title_pattern'], task_dict['link_id'],
                      task_dict['title'], task_dict['date'])
    rank = utils.get_rank(result_dict)
    result_dict['rank'] = rank
    save_data(db, result_dict)
cudnn.benchmark = True

if args.load_features:
    train_features = torch.load(
        os.path.join(args.load_features, "trainfeat.pth"))
    test_features = torch.load(
        os.path.join(args.load_features, "testfeat.pth"))
    train_labels = torch.load(
        os.path.join(args.load_features, "trainlabels.pth"))
    test_labels = torch.load(
        os.path.join(args.load_features, "testlabels.pth"))
else:
    # need to extract features !
    train_features, test_features, train_labels, test_labels = extract_feature_pipeline(
        args)

if utils.get_rank() == 0:
    if args.use_cuda:
        train_features = train_features.cuda()
        test_features = test_features.cuda()
        train_labels = train_labels.cuda()
        test_labels = test_labels.cuda()

    print("Features are ready!\nStart the k-NN classification.")
    for k in args.nb_knn:
        top1, top5 = knn_classifier(train_features, train_labels,
                                    test_features, test_labels, k,
                                    args.temperature)
        print(f"{k}-NN classifier result: Top1: {top1}, Top5: {top5}")
dist.barrier()
def extract_feature_pipeline(args):
    # ============ preparing data ... ============
    transform = pth_transforms.Compose([
        pth_transforms.Resize(256, interpolation=3),
        pth_transforms.CenterCrop(224),
        pth_transforms.ToTensor(),
        pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])
    dataset_train = ReturnIndexDataset(os.path.join(args.data_path, "train"),
                                       transform=transform)
    dataset_val = ReturnIndexDataset(os.path.join(args.data_path, "val"),
                                     transform=transform)
    sampler = torch.utils.data.DistributedSampler(dataset_train, shuffle=False)
    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler,
        batch_size=args.batch_size_per_gpu,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=False,
    )
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=args.batch_size_per_gpu,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=False,
    )
    print(
        f"Data loaded with {len(dataset_train)} train and {len(dataset_val)} val imgs."
    )

    # ============ building network ... ============
    model = vits.__dict__[args.arch](patch_size=args.patch_size, num_classes=0)
    print(f"Model {args.arch} {args.patch_size}x{args.patch_size} built.")
    model.cuda()
    utils.load_pretrained_weights(model, args.pretrained_weights,
                                  args.checkpoint_key, args.arch,
                                  args.patch_size)
    model.eval()

    # ============ extract features ... ============
    print("Extracting features for train set...")
    train_features = extract_features(model, data_loader_train)
    print("Extracting features for val set...")
    test_features = extract_features(model, data_loader_val)

    if utils.get_rank() == 0:
        train_features = nn.functional.normalize(train_features, dim=1, p=2)
        test_features = nn.functional.normalize(test_features, dim=1, p=2)

    train_labels = torch.tensor([s[-1] for s in dataset_train.samples]).long()
    test_labels = torch.tensor([s[-1] for s in dataset_val.samples]).long()

    # save features and labels
    if args.dump_features and dist.get_rank() == 0:
        torch.save(train_features.cpu(),
                   os.path.join(args.dump_features, "trainfeat.pth"))
        torch.save(test_features.cpu(),
                   os.path.join(args.dump_features, "testfeat.pth"))
        torch.save(train_labels.cpu(),
                   os.path.join(args.dump_features, "trainlabels.pth"))
        torch.save(test_labels.cpu(),
                   os.path.join(args.dump_features, "testlabels.pth"))
    return train_features, test_features, train_labels, test_labels
def training(self):
    self.net.train()
    save_to_disk = ptutil.get_rank() == 0
    start_training_time = time.time()
    trained_time = 0
    tic = time.time()
    end = time.time()
    iteration, max_iter = 0, self.args.max_iter
    save_iter, eval_iter = self.args.per_iter * self.args.save_epoch, self.args.per_iter * self.args.eval_epoch
    # save_iter, eval_iter = self.args.per_iter * self.args.save_epoch, 10

    logger.info("Start training, total epochs {:3d} = total iteration: {:6d}".format(self.args.epochs, max_iter))

    # TODO: add mixup
    for i, batch in enumerate(self.train_loader):
        iteration += 1
        self.scheduler.step()
        image = batch[0].to(self.device)
        fixed_targets = [batch[it].to(self.device) for it in range(1, 6)]
        gt_boxes = batch[6].to(self.device)
        self.optimizer.zero_grad()
        loss_dict = self.net(image, gt_boxes, *fixed_targets)

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = ptutil.reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss = sum(loss for loss in loss_dict.values())
        loss.backward()
        self.optimizer.step()

        trained_time += time.time() - end
        end = time.time()
        if iteration % self.args.log_step == 0:
            eta_seconds = int((trained_time / iteration) * (max_iter - iteration))
            log_str = ["Iteration {:06d}, Lr: {:.5f}, Cost: {:.2f}s, Eta: {}"
                       .format(iteration, self.optimizer.param_groups[0]['lr'], time.time() - tic,
                               str(datetime.timedelta(seconds=eta_seconds))),
                       "total_loss: {:.3f}".format(losses_reduced.item())]
            for loss_name, loss_item in loss_dict_reduced.items():
                log_str.append("{}: {:.3f}".format(loss_name, loss_item.item()))
            log_str = ', '.join(log_str)
            logger.info(log_str)
            tic = time.time()

        if save_to_disk and iteration % save_iter == 0:
            model_path = os.path.join(self.args.save_dir,
                                      "{}_iter_{:06d}.pth".format(self.save_prefix, iteration))
            self.save_model(model_path)

        # Run eval during training to trace mAP changes and see whether performance improves
        if self.args.eval_epoch > 0 and iteration % eval_iter == 0 and not iteration == max_iter:
            metrics = self.validate()
            ptutil.synchronize()
            names, values = ptutil.accumulate_metric(metrics)
            if names is not None:
                log_str = ['{}: {:.5f}'.format(k, v) for k, v in zip(names, values)]
                log_str = '\n'.join(log_str)
                logger.info(log_str)
            self.net.train()

    if save_to_disk:
        model_path = os.path.join(self.args.save_dir,
                                  "{}_iter_{:06d}.pth".format(self.save_prefix, max_iter))
        self.save_model(model_path)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(total_time_str, total_training_time / max_iter))
def main(args, ds_init): utils.init_distributed_mode(args) if ds_init is not None: utils.create_ds_config(args) print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) # random.seed(seed) cudnn.benchmark = True dataset_train, args.nb_classes = build_dataset(is_train=True, args=args) if args.disable_eval_during_finetuning: dataset_val = None else: dataset_val, _ = build_dataset(is_train=False, args=args) if True: # args.distributed: num_tasks = utils.get_world_size() global_rank = utils.get_rank() sampler_train = torch.utils.data.DistributedSampler( dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True) print("Sampler_train = %s" % str(sampler_train)) if args.dist_eval: if len(dataset_val) % num_tasks != 0: print( 'Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. ' 'This will slightly alter validation results as extra duplicate entries are added to achieve ' 'equal num of samples per-process.') sampler_val = torch.utils.data.DistributedSampler( dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=False) else: sampler_val = torch.utils.data.SequentialSampler(dataset_val) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) if global_rank == 0 and args.log_dir is not None: os.makedirs(args.log_dir, exist_ok=True) log_writer = utils.TensorboardLogger(log_dir=args.log_dir) else: log_writer = None data_loader_train = torch.utils.data.DataLoader( dataset_train, sampler=sampler_train, batch_size=args.batch_size, num_workers=args.num_workers, pin_memory=args.pin_mem, drop_last=True, ) if dataset_val is not None: data_loader_val = torch.utils.data.DataLoader( dataset_val, sampler=sampler_val, batch_size=int(1.5 * args.batch_size), num_workers=args.num_workers, pin_memory=args.pin_mem, drop_last=False) else: data_loader_val = None mixup_fn = None mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None if mixup_active: print("Mixup is activated!") mixup_fn = Mixup(mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, label_smoothing=args.smoothing, num_classes=args.nb_classes) model = create_model( args.model, pretrained=False, num_classes=args.nb_classes, drop_rate=args.drop, drop_path_rate=args.drop_path, attn_drop_rate=args.attn_drop_rate, drop_block_rate=None, use_mean_pooling=args.use_mean_pooling, init_scale=args.init_scale, use_rel_pos_bias=args.rel_pos_bias, use_abs_pos_emb=args.abs_pos_emb, init_values=args.layer_scale_init_value, ) patch_size = model.patch_embed.patch_size print("Patch size = %s" % str(patch_size)) args.window_size = (args.input_size // patch_size[0], args.input_size // patch_size[1]) args.patch_size = patch_size if args.finetune: if args.finetune.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.finetune, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.finetune, map_location='cpu') print("Load ckpt from %s" % args.finetune) checkpoint_model = None for model_key in args.model_key.split('|'): if model_key in checkpoint: checkpoint_model = checkpoint[model_key] print("Load state_dict by model_key = %s" % model_key) break if checkpoint_model is None: checkpoint_model = checkpoint state_dict = model.state_dict() for k in ['head.weight', 'head.bias']: if k in checkpoint_model and checkpoint_model[ k].shape != state_dict[k].shape: print(f"Removing key {k} from pretrained checkpoint") del checkpoint_model[k] if model.use_rel_pos_bias and "rel_pos_bias.relative_position_bias_table" in checkpoint_model: print( "Expand the shared relative position embedding to each transformer block. 
" ) num_layers = model.get_num_layers() rel_pos_bias = checkpoint_model[ "rel_pos_bias.relative_position_bias_table"] for i in range(num_layers): checkpoint_model["blocks.%d.attn.relative_position_bias_table" % i] = rel_pos_bias.clone() checkpoint_model.pop("rel_pos_bias.relative_position_bias_table") all_keys = list(checkpoint_model.keys()) for key in all_keys: if "relative_position_index" in key: checkpoint_model.pop(key) if "relative_position_bias_table" in key: rel_pos_bias = checkpoint_model[key] src_num_pos, num_attn_heads = rel_pos_bias.size() dst_num_pos, _ = model.state_dict()[key].size() dst_patch_shape = model.patch_embed.patch_shape if dst_patch_shape[0] != dst_patch_shape[1]: raise NotImplementedError() num_extra_tokens = dst_num_pos - ( dst_patch_shape[0] * 2 - 1) * (dst_patch_shape[1] * 2 - 1) src_size = int((src_num_pos - num_extra_tokens)**0.5) dst_size = int((dst_num_pos - num_extra_tokens)**0.5) if src_size != dst_size: print("Position interpolate for %s from %dx%d to %dx%d" % (key, src_size, src_size, dst_size, dst_size)) extra_tokens = rel_pos_bias[-num_extra_tokens:, :] rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] def geometric_progression(a, r, n): return a * (1.0 - r**n) / (1.0 - r) left, right = 1.01, 1.5 while right - left > 1e-6: q = (left + right) / 2.0 gp = geometric_progression(1, q, src_size // 2) if gp > dst_size // 2: right = q else: left = q # if q > 1.090307: # q = 1.090307 dis = [] cur = 1 for i in range(src_size // 2): dis.append(cur) cur += q**(i + 1) r_ids = [-_ for _ in reversed(dis)] x = r_ids + [0] + dis y = r_ids + [0] + dis t = dst_size // 2.0 dx = np.arange(-t, t + 0.1, 1.0) dy = np.arange(-t, t + 0.1, 1.0) print("Original positions = %s" % str(x)) print("Target positions = %s" % str(dx)) all_rel_pos_bias = [] for i in range(num_attn_heads): z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy() f = interpolate.interp2d(x, y, z, kind='cubic') all_rel_pos_bias.append( torch.Tensor(f(dx, dy)).contiguous().view( -1, 1).to(rel_pos_bias.device)) rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0) checkpoint_model[key] = new_rel_pos_bias # interpolate position embedding if 'pos_embed' in checkpoint_model: pos_embed_checkpoint = checkpoint_model['pos_embed'] embedding_size = pos_embed_checkpoint.shape[-1] num_patches = model.patch_embed.num_patches num_extra_tokens = model.pos_embed.shape[-2] - num_patches # height (== width) for the checkpoint position embedding orig_size = int( (pos_embed_checkpoint.shape[-2] - num_extra_tokens)**0.5) # height (== width) for the new position embedding new_size = int(num_patches**0.5) # class_token and dist_token are kept unchanged if orig_size != new_size: print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] # only the position tokens are interpolated pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute( 0, 3, 1, 2) pos_tokens = torch.nn.functional.interpolate( pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) checkpoint_model['pos_embed'] = new_pos_embed utils.load_state_dict(model, checkpoint_model, prefix=args.model_prefix) # model.load_state_dict(checkpoint_model, strict=False) model.to(device) model_ema = None 
if args.model_ema: # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper model_ema = ModelEma(model, decay=args.model_ema_decay, device='cpu' if args.model_ema_force_cpu else '', resume='') print("Using EMA with decay = %.8f" % args.model_ema_decay) model_without_ddp = model n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print("Model = %s" % str(model_without_ddp)) print('number of params:', n_parameters) total_batch_size = args.batch_size * args.update_freq * utils.get_world_size( ) num_training_steps_per_epoch = len(dataset_train) // total_batch_size print("LR = %.8f" % args.lr) print("Batch size = %d" % total_batch_size) print("Update frequent = %d" % args.update_freq) print("Number of training examples = %d" % len(dataset_train)) print("Number of training training per epoch = %d" % num_training_steps_per_epoch) num_layers = model_without_ddp.get_num_layers() if args.layer_decay < 1.0: assigner = LayerDecayValueAssigner( list(args.layer_decay**(num_layers + 1 - i) for i in range(num_layers + 2))) else: assigner = None if assigner is not None: print("Assigned values = %s" % str(assigner.values)) skip_weight_decay_list = model.no_weight_decay() if args.disable_weight_decay_on_rel_pos_bias: for i in range(num_layers): skip_weight_decay_list.add( "blocks.%d.attn.relative_position_bias_table" % i) if args.enable_deepspeed: loss_scaler = None optimizer_params = get_parameter_groups( model, args.weight_decay, skip_weight_decay_list, assigner.get_layer_id if assigner is not None else None, assigner.get_scale if assigner is not None else None) model, optimizer, _, _ = ds_init( args=args, model=model, model_parameters=optimizer_params, dist_init_required=not args.distributed, ) print("model.gradient_accumulation_steps() = %d" % model.gradient_accumulation_steps()) assert model.gradient_accumulation_steps() == args.update_freq else: if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu], find_unused_parameters=True) model_without_ddp = model.module optimizer = create_optimizer(args, model_without_ddp, skip_list=skip_weight_decay_list, get_num_layer=assigner.get_layer_id if assigner is not None else None, get_layer_scale=assigner.get_scale if assigner is not None else None) loss_scaler = NativeScaler() print("Use step level LR scheduler!") lr_schedule_values = utils.cosine_scheduler( args.lr, args.min_lr, args.epochs, num_training_steps_per_epoch, warmup_epochs=args.warmup_epochs, warmup_steps=args.warmup_steps, ) if args.weight_decay_end is None: args.weight_decay_end = args.weight_decay wd_schedule_values = utils.cosine_scheduler(args.weight_decay, args.weight_decay_end, args.epochs, num_training_steps_per_epoch) print("Max WD = %.7f, Min WD = %.7f" % (max(wd_schedule_values), min(wd_schedule_values))) if mixup_fn is not None: # smoothing is handled with mixup label transform criterion = SoftTargetCrossEntropy() elif args.smoothing > 0.: criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing) else: criterion = torch.nn.CrossEntropyLoss() print("criterion = %s" % str(criterion)) utils.auto_load_model(args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler, model_ema=model_ema) if args.eval: test_stats = evaluate(data_loader_val, model, device) print( f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%" ) exit(0) print(f"Start training for {args.epochs} epochs") start_time = 
time.time() max_accuracy = 0.0 for epoch in range(args.start_epoch, args.epochs): if args.distributed: data_loader_train.sampler.set_epoch(epoch) if log_writer is not None: log_writer.set_step(epoch * num_training_steps_per_epoch * args.update_freq) train_stats = train_one_epoch( model, criterion, data_loader_train, optimizer, device, epoch, loss_scaler, args.clip_grad, model_ema, mixup_fn, log_writer=log_writer, start_steps=epoch * num_training_steps_per_epoch, lr_schedule_values=lr_schedule_values, wd_schedule_values=wd_schedule_values, num_training_steps_per_epoch=num_training_steps_per_epoch, update_freq=args.update_freq, ) if args.output_dir and args.save_ckpt: if (epoch + 1) % args.save_ckpt_freq == 0 or epoch + 1 == args.epochs: utils.save_model(args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler, epoch=epoch, model_ema=model_ema) if data_loader_val is not None: test_stats = evaluate(data_loader_val, model, device) print( f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%" ) if max_accuracy < test_stats["acc1"]: max_accuracy = test_stats["acc1"] if args.output_dir and args.save_ckpt: utils.save_model(args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler, epoch="best", model_ema=model_ema) print(f'Max accuracy: {max_accuracy:.2f}%') if log_writer is not None: log_writer.update(test_acc1=test_stats['acc1'], head="perf", step=epoch) log_writer.update(test_acc5=test_stats['acc5'], head="perf", step=epoch) log_writer.update(test_loss=test_stats['loss'], head="perf", step=epoch) log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } else: log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, # **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } if args.output_dir and utils.is_main_process(): if log_writer is not None: log_writer.flush() with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: f.write(json.dumps(log_stats) + "\n") total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
def main(): global timeout_sent args = parse_arguments() random.seed(args.seed + args.local_rank) np.random.seed(args.seed + args.local_rank) torch.manual_seed(args.seed + args.local_rank) torch.cuda.manual_seed(args.seed + args.local_rank) worker_init = WorkerInitObj(args.seed + args.local_rank) device, args = setup_training(args) dllogger.log(step="PARAMETER", data={"Config": [str(args)]}) # Prepare optimizer model, optimizer, lr_scheduler, checkpoint, global_step, criterion = prepare_model_and_optimizer(args, device) if args.disable_weight_tying: # Sanity Check that new param is in optimizer print ("SANITY CHECK OPTIMIZER: ", id(model.module.cls.predictions.decoder.weight) in [id(g) for g in optimizer.param_groups[0]['params']]) assert id(model.module.cls.predictions.decoder.weight) in [id(g) for g in optimizer.param_groups[0]['params']] print (f"SAVING EVERY {args.num_steps_per_checkpoint} STEPS!") if is_main_process(): dllogger.log(step="PARAMETER", data={"SEED": args.seed}) raw_train_start = None if args.do_train: if is_main_process(): dllogger.log(step="PARAMETER", data={"train_start": True}) dllogger.log(step="PARAMETER", data={"batch_size_per_gpu": args.train_batch_size}) dllogger.log(step="PARAMETER", data={"learning_rate": args.learning_rate}) model.train() most_recent_ckpts_paths = [] average_loss = 0.0 # averaged loss every args.log_freq steps epoch = 0 training_steps = 0 pool = ProcessPoolExecutor(1) # Note: We loop infinitely over epochs, termination is handled via iteration count while True: thread = None restored_data_loader = None if not args.resume_from_checkpoint or epoch > 0 or (args.phase2 and global_step < 1) or args.init_checkpoint: files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and ('training' in f or 'train' in f)] files.sort() num_files = len(files) random.Random(args.seed + epoch).shuffle(files) f_start_id = 0 else: f_start_id = checkpoint['files'][0] files = checkpoint['files'][1:] args.resume_from_checkpoint = False num_files = len(files) # may not exist in all checkpoints epoch = checkpoint.get('epoch', 0) restored_data_loader = checkpoint.get('data_loader', None) shared_file_list = {} if torch.distributed.is_initialized() and get_world_size() > num_files: remainder = get_world_size() % num_files data_file = files[(f_start_id*get_world_size()+get_rank() + remainder*f_start_id)%num_files] else: data_file = files[(f_start_id*get_world_size()+get_rank())%num_files] previous_file = data_file if restored_data_loader is None: train_data = pretraining_dataset(data_file, args.max_predictions_per_seq) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * args.n_gpu, num_workers=4, worker_init_fn=worker_init, pin_memory=True) # shared_file_list["0"] = (train_dataloader, data_file) else: train_dataloader = restored_data_loader restored_data_loader = None overflow_buf = None if args.allreduce_post_accumulation: overflow_buf = torch.cuda.IntTensor([0]) for f_id in range(f_start_id + 1 , len(files)): if get_world_size() > num_files: data_file = files[(f_id*get_world_size()+get_rank() + remainder*f_id)%num_files] else: data_file = files[(f_id*get_world_size()+get_rank())%num_files] previous_file = data_file dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init) train_iter = tqdm(train_dataloader, desc="Iteration", 
disable=args.disable_progress_bar) if is_main_process() else train_dataloader if raw_train_start is None: raw_train_start = time.time() for step, batch in enumerate(train_iter): training_steps += 1 batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch prediction_scores, seq_relationship_score = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) loss, mlm_loss, ns_loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. mlm_loss = mlm_loss.detach().mean() ns_loss = ns_loss.detach().mean() divisor = args.gradient_accumulation_steps if args.gradient_accumulation_steps > 1: if not args.allreduce_post_accumulation: # this division was merged into predivision loss = loss / args.gradient_accumulation_steps mlm_loss = mlm_loss.detach() / args.gradient_accumulation_steps ns_loss = ns_loss.detach() / args.gradient_accumulation_steps divisor = 1.0 if args.fp16: with amp.scale_loss(loss, optimizer, delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss: scaled_loss.backward() else: loss.backward() average_loss += loss.item() if training_steps % args.gradient_accumulation_steps == 0: lr_scheduler.step() # learning rate warmup global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step) if global_step >= args.steps_this_run or timeout_sent: train_time_raw = time.time() - raw_train_start last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda() average_loss = average_loss / (last_num_steps * divisor) if (torch.distributed.is_initialized()): average_loss /= get_world_size() torch.distributed.all_reduce(average_loss) final_loss = average_loss.item() if is_main_process(): dllogger.log(step=(epoch, global_step, ), data={"final_loss": final_loss}) elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0: if is_main_process(): dllogger.log(step=(epoch, global_step, ), data={"average_loss": average_loss / (args.log_freq * divisor), "step_loss": loss.item() * args.gradient_accumulation_steps / divisor, "learning_rate": optimizer.param_groups[0]['lr'], "mlm_loss" : mlm_loss.item(), "ns_loss" : ns_loss.item()}) average_loss = 0 if global_step >= args.steps_this_run or training_steps % ( args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0 or timeout_sent: if is_main_process() and not args.skip_checkpoint: # Save a trained model dllogger.log(step="PARAMETER", data={"checkpoint_step": global_step}) model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self if args.resume_step < 0 or not args.phase2: output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)) else: output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step)) if args.do_train: torch.save({'model': model_to_save.state_dict(), 'optimizer': optimizer.state_dict(), 'master params': list(amp.master_params(optimizer)), 'files': [f_id] + files, 'epoch': epoch, 'data_loader': None if global_step >= args.max_steps else train_dataloader}, output_save_file) most_recent_ckpts_paths.append(output_save_file) # Exiting the training due to hitting max steps, or being sent a # timeout from the cluster 
scheduler if global_step >= args.steps_this_run or timeout_sent: del train_dataloader # thread.join() return args, final_loss, train_time_raw, global_step del train_dataloader # thread.join() # Make sure pool has finished and switch train_dataloader # NOTE: Will block until complete train_dataloader, data_file = dataset_future.result(timeout=None) epoch += 1
def main(): args = parse_args() hvd.init() set_affinity(hvd.local_rank()) if is_main_process(): log("Running total processes: {}".format(get_world_size())) log("Starting process: {}".format(get_rank())) if is_main_process(): dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.json_summary), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)]) else: dllogger.init(backends=[]) tf.random.set_seed(args.seed) dllogger.log(step="PARAMETER", data={"SEED": args.seed}) # script parameters BATCH_SIZE = args.train_batch_size EVAL_BATCH_SIZE = args.predict_batch_size USE_XLA = args.xla USE_AMP = args.amp EPOCHS = args.num_train_epochs if not args.do_train: EPOCHS = args.num_train_epochs = 1 log("Since running inference only, setting args.num_train_epochs to 1") if not os.path.exists(args.output_dir) and is_main_process(): os.makedirs(args.output_dir) # TensorFlow configuration gpus = tf.config.experimental.list_physical_devices('GPU') if gpus: for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') tf.config.optimizer.set_jit(USE_XLA) #tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP}) if args.amp: policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16", loss_scale="dynamic") tf.keras.mixed_precision.experimental.set_policy(policy) print('Compute dtype: %s' % policy.compute_dtype) # Compute dtype: float16 print('Variable dtype: %s' % policy.variable_dtype) # Variable dtype: float32 if is_main_process(): log("***** Loading tokenizer and model *****") # Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression) electra_model = args.electra_model config = ElectraConfig.from_pretrained(electra_model, cache_dir=args.cache_dir) config.update({"amp": args.amp}) if args.vocab_file is None: tokenizer = ElectraTokenizer.from_pretrained(electra_model, cache_dir=args.cache_dir) else: tokenizer = ElectraTokenizer( vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) model = TFElectraForQuestionAnswering.from_pretrained(electra_model, config=config, cache_dir=args.cache_dir, args=args) if is_main_process(): log("***** Loading dataset *****") # Load data processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() train_examples = processor.get_train_examples(args.data_dir) if args.do_train else None dev_examples = processor.get_dev_examples(args.data_dir) if args.do_predict else None if is_main_process(): log("***** Loading features *****") # Load cached features squad_version = '2.0' if args.version_2_with_negative else '1.1' if args.cache_dir is None: args.cache_dir = args.data_dir cached_train_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_train-v{4}.json_{1}_{2}_{3}'.format( electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length), squad_version) cached_dev_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_dev-v{4}.json_{1}_{2}_{3}'.format( electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length), squad_version) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) if args.do_train else [] with open(cached_dev_features_file, "rb") as reader: dev_features = pickle.load(reader) if args.do_predict else [] except: train_features = ( # TODO: 
(yy) do on rank 0? squad_convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True, return_dataset="", ) if args.do_train else [] ) dev_features = ( squad_convert_examples_to_features( examples=dev_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False, return_dataset="", ) if args.do_predict else [] ) # Dump Cached features if not args.skip_cache and is_main_process(): if args.do_train: log("***** Building Cache Files: {} *****".format(cached_train_features_file)) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) if args.do_predict: log("***** Building Cache Files: {} *****".format(cached_dev_features_file)) with open(cached_dev_features_file, "wb") as writer: pickle.dump(dev_features, writer) len_train_features = len(train_features) total_train_steps = int((len_train_features * EPOCHS / BATCH_SIZE) / get_world_size()) + 1 train_steps_per_epoch = int((len_train_features / BATCH_SIZE) / get_world_size()) + 1 len_dev_features = len(dev_features) total_dev_steps = int((len_dev_features / EVAL_BATCH_SIZE)) + 1 train_dataset = get_dataset_from_features(train_features, BATCH_SIZE, v2=args.version_2_with_negative) if args.do_train else [] dev_dataset = get_dataset_from_features(dev_features, EVAL_BATCH_SIZE, drop_remainder=False, ngpu=1, mode="dev", v2=args.version_2_with_negative) if args.do_predict else [] opt = create_optimizer(init_lr=args.learning_rate, num_train_steps=total_train_steps, num_warmup_steps=int(args.warmup_proportion * total_train_steps), weight_decay_rate=args.weight_decay_rate, layerwise_lr_decay=args.layerwise_lr_decay, n_transformer_layers=model.num_hidden_layers) if USE_AMP: # loss scaling is currently required when using mixed precision opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic") # Define loss function loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) loss_class = tf.keras.losses.BinaryCrossentropy( from_logits=True, name='binary_crossentropy' ) metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") model.compile(optimizer=opt, loss=loss, metrics=[metric]) train_loss_results = [] if args.do_train and is_main_process(): log("***** Running training *****") log(" Num examples = ", len_train_features) log(" Num Epochs = ", args.num_train_epochs) log(" Instantaneous batch size per GPU = ", args.train_batch_size) log( " Total train batch size (w. parallel, distributed & accumulation) = ", args.train_batch_size * get_world_size(), ) log(" Total optimization steps =", total_train_steps) total_train_time = 0 latency = [] for epoch in range(EPOCHS): if args.do_train: epoch_loss_avg = tf.keras.metrics.Mean() epoch_perf_avg = tf.keras.metrics.Mean() epoch_start = time.time() epoch_iterator = tqdm(train_dataset, total=train_steps_per_epoch, desc="Iteration", mininterval=5, disable=not is_main_process()) for iter, inputs in enumerate(epoch_iterator): # breaking criterion if max_steps if > 1 if args.max_steps > 0 and (epoch * train_steps_per_epoch + iter) > args.max_steps: break iter_start = time.time() # Optimize the model loss_value = train_step(model, inputs, loss, USE_AMP, opt, (iter == 0 and epoch == 0), v2=args.version_2_with_negative, loss_class=loss_class, fp16=USE_AMP) epoch_perf_avg.update_state(1. 
* BATCH_SIZE / (time.time() - iter_start)) if iter % args.log_freq == 0: if is_main_process(): log("\nEpoch: {:03d}, Step:{:6d}, Loss:{:12.8f}, Perf:{:5.0f}, loss_scale:{}, opt_step:{}".format(epoch, iter, loss_value, epoch_perf_avg.result() * get_world_size(), opt.loss_scale if config.amp else 1, int(opt.iterations))) dllogger.log(step=(epoch, iter,), data={"step_loss": float(loss_value.numpy()), "train_perf": float( epoch_perf_avg.result().numpy() * get_world_size())}) # Track progress epoch_loss_avg.update_state(loss_value) # Add current batch loss # End epoch train_loss_results.append(epoch_loss_avg.result()) total_train_time += float(time.time() - epoch_start) # Summarize and save checkpoint at the end of each epoch if is_main_process(): dllogger.log(step=tuple(), data={"e2e_train_time": total_train_time, "training_sequences_per_second": float( epoch_perf_avg.result().numpy() * get_world_size()), "final_loss": float(epoch_loss_avg.result().numpy())}) if not args.skip_checkpoint: if args.ci: checkpoint_name = "{}/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.output_dir, args.version_2_with_negative, epoch + 1) else: checkpoint_name = "checkpoints/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.version_2_with_negative, epoch + 1) if is_main_process(): model.save_weights(checkpoint_name) if args.do_predict and (args.evaluate_during_training or epoch == args.num_train_epochs - 1): if not args.do_train: log("***** Loading checkpoint: {} *****".format(args.init_checkpoint)) model.load_weights(args.init_checkpoint).expect_partial() current_feature_id = 0 all_results = [] if is_main_process(): log("***** Running evaluation *****") log(" Num Batches = ", total_dev_steps) log(" Batch size = ", args.predict_batch_size) raw_infer_start = time.time() if is_main_process(): infer_perf_avg = tf.keras.metrics.Mean() dev_iterator = tqdm(dev_dataset, total=total_dev_steps, desc="Iteration", mininterval=5, disable=not is_main_process()) for input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, is_impossible in dev_iterator: # training=False is needed only if there are layers with different # behavior during training versus inference (e.g. Dropout). iter_start = time.time() if not args.joint_head: batch_start_logits, batch_end_logits = infer_step(model, input_ids, attention_mask=input_mask, token_type_ids=segment_ids, )[:2] #Synchronize with GPU to compute time _ = batch_start_logits.numpy() else: outputs = infer_step(model, input_ids, attention_mask=input_mask, token_type_ids=segment_ids, cls_index=cls_index, p_mask=p_mask, ) #Synchronize with GPU to compute time _ = outputs[0].numpy() infer_time = (time.time() - iter_start) infer_perf_avg.update_state(1. 
* EVAL_BATCH_SIZE / infer_time) latency.append(infer_time) for iter_ in range(input_ids.shape[0]): if not args.joint_head: start_logits = batch_start_logits[iter_].numpy().tolist() end_logits = batch_end_logits[iter_].numpy().tolist() dev_feature = dev_features[current_feature_id] current_feature_id += 1 unique_id = int(dev_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) else: dev_feature = dev_features[current_feature_id] current_feature_id += 1 unique_id = int(dev_feature.unique_id) output = [output[iter_].numpy().tolist() for output in outputs] start_logits = output[0] start_top_index = output[1] end_logits = output[2] end_top_index = output[3] cls_logits = output[4] result = SquadResult( unique_id, start_logits, end_logits, start_top_index=start_top_index, end_top_index=end_top_index, cls_logits=cls_logits, ) all_results.append(result) # Compute and save predictions answers, nbest_answers = get_answers(dev_examples, dev_features, all_results, args) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") e2e_infer_time = time.time() - raw_infer_start # if args.version_2_with_negative: # output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") # else: # output_null_log_odds_file = None with open(output_prediction_file, "w") as f: f.write(json.dumps(answers, indent=4) + "\n") with open(output_nbest_file, "w") as f: f.write(json.dumps(nbest_answers, indent=4) + "\n") if args.do_eval: if args.version_2_with_negative: dev_file = "dev-v2.0.json" else: dev_file = "dev-v1.1.json" eval_out = subprocess.check_output([sys.executable, args.eval_script, args.data_dir + "/" + dev_file, output_prediction_file]) log(eval_out.decode('UTF-8')) scores = str(eval_out).strip() exact_match = float(scores.split(":")[1].split(",")[0]) if args.version_2_with_negative: f1 = float(scores.split(":")[2].split(",")[0]) else: f1 = float(scores.split(":")[2].split("}")[0]) log("Epoch: {:03d} Results: {}".format(epoch, eval_out.decode('UTF-8'))) log("**EVAL SUMMARY** - Epoch: {:03d}, EM: {:6.3f}, F1: {:6.3f}, Infer_Perf: {:4.0f} seq/s" .format(epoch, exact_match, f1, infer_perf_avg.result())) latency_all = sorted(latency)[:-2] log( "**LATENCY SUMMARY** - Epoch: {:03d}, Ave: {:6.3f} ms, 90%: {:6.3f} ms, 95%: {:6.3f} ms, 99%: {:6.3f} ms" .format(epoch, sum(latency_all) / len(latency_all) * 1000, sum(latency_all[:int(len(latency_all) * 0.9)]) / int(len(latency_all) * 0.9) * 1000, sum(latency_all[:int(len(latency_all) * 0.95)]) / int(len(latency_all) * 0.95) * 1000, sum(latency_all[:int(len(latency_all) * 0.99)]) / int(len(latency_all) * 0.99) * 1000, )) dllogger.log(step=tuple(), data={"inference_sequences_per_second": float(infer_perf_avg.result().numpy()), "e2e_inference_time": e2e_infer_time}) if is_main_process() and args.do_train and args.do_eval: log( "**RESULTS SUMMARY** - EM: {:6.3f}, F1: {:6.3f}, Train_Time: {:4.0f} s, Train_Perf: {:4.0f} seq/s, Infer_Perf: {:4.0f} seq/s" .format(exact_match, f1, total_train_time, epoch_perf_avg.result() * get_world_size(), infer_perf_avg.result())) dllogger.log(step=tuple(), data={"exact_match": exact_match, "F1": f1})
parser.add_argument(
    "--config-file",
    default="",
    metavar="FILE",
    help="path to config file",
    type=str,
)
parser.add_argument('--local_rank', default=0, help='set GPU id', type=int)
args = parser.parse_args()

num_gpus = int(
    os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1

cfg.merge_from_file(args.config_file)
cfg.freeze()

if get_rank() == 0:
    if not os.path.isdir(cfg.OUTPUT_DIR):
        try:
            os.mkdir(cfg.OUTPUT_DIR)
        except FileExistsError:
            print('%s already exists' % cfg.OUTPUT_DIR)

is_distributed = (num_gpus > 1)
if is_distributed:
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend="nccl", init_method="env://")
    synchronize()

if cfg.MODEL.DEVICE == 'cuda':
    cudnn.benchmark = True
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)
    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)

    if True:  # args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            sampler_train = RASampler(dataset_train,
                                      num_replicas=num_tasks,
                                      rank=global_rank,
                                      shuffle=True)
        else:
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=True)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )

    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  batch_size=int(1.5 * args.batch_size),
                                                  shuffle=False,
                                                  num_workers=args.num_workers,
                                                  pin_memory=args.pin_mem,
                                                  drop_last=False)

    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print(f"Creating model: {args.model}")
    model = create_model(
        args.model,
        pretrained=False,
        num_classes=args.nb_classes,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        drop_block_rate=args.drop_block,
    )
    # TODO: finetuning

    model.to(device)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume='')

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size() / 512.0
    args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model)
    loss_scaler = NativeScaler()
    lr_scheduler, _ = create_scheduler(args, optimizer)

    criterion = LabelSmoothingCrossEntropy()

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if args.model_ema:
                utils._load_checkpoint_for_ema(model_ema, checkpoint['model_ema'])

    if args.eval:
        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        return

    print("Start training")
    start_time = time.time()
    max_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch, loss_scaler,
                                      args.clip_grad, model_ema, mixup_fn)

        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'model_ema': get_state_dict(model_ema),
                        'args': args,
                    }, checkpoint_path)

        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        max_accuracy = max(max_accuracy, test_stats["acc1"])
        print(f'Max accuracy: {max_accuracy:.2f}%')

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    utils.init_distributed_mode(args)

    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)
    cudnn.benchmark = True

    model = get_model(args)
    patch_size = model.patch_embed.patch_size
    print("Patch size = %s" % str(patch_size))
    args.window_size = (args.input_size // patch_size[0],
                        args.input_size // patch_size[1])
    args.patch_size = patch_size

    # get dataset
    dataset_train = build_beit_pretraining_dataset(args)

    # prepare discrete vae
    d_vae = utils.create_d_vae(
        weight_path=args.discrete_vae_weight_path,
        d_vae_type=args.discrete_vae_type,
        device=device,
        image_size=args.second_input_size)

    if True:  # args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        sampler_rank = global_rank
        num_training_steps_per_epoch = len(dataset_train) // args.batch_size // num_tasks

        sampler_train = torch.utils.data.DistributedSampler(
            dataset_train, num_replicas=num_tasks, rank=sampler_rank, shuffle=True
        )
        print("Sampler_train = %s" % str(sampler_train))
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    if global_rank == 0 and args.log_dir is not None:
        os.makedirs(args.log_dir, exist_ok=True)
        log_writer = utils.TensorboardLogger(log_dir=args.log_dir)
    else:
        log_writer = None

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )

    model.to(device)
    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print("Model = %s" % str(model_without_ddp))
    print('number of params:', n_parameters)

    total_batch_size = args.batch_size * utils.get_world_size()
    print("LR = %.8f" % args.lr)
    print("Batch size = %d" % total_batch_size)
    print("Number of training steps = %d" % num_training_steps_per_epoch)
    print("Number of training examples per epoch = %d" % (total_batch_size * num_training_steps_per_epoch))

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()

    print("Use step level LR & WD scheduler!")
    lr_schedule_values = utils.cosine_scheduler(
        args.lr, args.min_lr, args.epochs, num_training_steps_per_epoch,
        warmup_epochs=args.warmup_epochs, warmup_steps=args.warmup_steps,
    )
    if args.weight_decay_end is None:
        args.weight_decay_end = args.weight_decay
    wd_schedule_values = utils.cosine_scheduler(
        args.weight_decay, args.weight_decay_end, args.epochs, num_training_steps_per_epoch)
    print("Max WD = %.7f, Min WD = %.7f" % (max(wd_schedule_values), min(wd_schedule_values)))

    utils.auto_load_model(
        args=args, model=model, model_without_ddp=model_without_ddp,
        optimizer=optimizer, loss_scaler=loss_scaler)

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)
        if log_writer is not None:
            log_writer.set_step(epoch * num_training_steps_per_epoch)
        train_stats = train_one_epoch(
            model, d_vae, data_loader_train,
            optimizer, device, epoch, loss_scaler,
            args.clip_grad, log_writer=log_writer,
            start_steps=epoch * num_training_steps_per_epoch,
            lr_schedule_values=lr_schedule_values,
            wd_schedule_values=wd_schedule_values,
        )
        if args.output_dir:
            if (epoch + 1) % args.save_ckpt_freq == 0 or epoch + 1 == args.epochs:
                utils.save_model(
                    args=args, model=model, model_without_ddp=model_without_ddp,
                    optimizer=optimizer, loss_scaler=loss_scaler, epoch=epoch)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch, 'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            if log_writer is not None:
                log_writer.flush()
            with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
out_file = os.path.join(prediction_dir, prediction_file_base + '.readable.txt')
with open(out_file, 'w', encoding=ENCODING) as f:
    for data_line, prediction_line in zip(
            open(answer_file, encoding=ENCODING),
            open(prediction_file, encoding=ENCODING)):
        data = json.loads(data_line)
        question = data['question']
        answer = [normalize(a) for a in data['answer']]
        prediction = json.loads(prediction_line)
        doc_predictions = sorted(prediction,
                                 key=lambda k: k['doc_score'],
                                 reverse=True)
        doc_rank = get_rank(doc_predictions, answer, match_func)
        ans_rank = get_rank(
            sorted(prediction, key=lambda k: k['span_score'], reverse=True),
            answer, match_func)
        qa_str = 'q_{}: {}\n'.format(question_count, question)
        qa_str += 'ans_rank: {}, doc_rank: {}, answer: {}\n'.format(
            ans_rank, doc_rank, '; '.join(answer))
        for d_no, ans_prediction in enumerate(doc_predictions, 1):
            qa_str += '\tdoc_{:3s}: {:12s}, d_score: {:.4f}, a_score: {:.4f}, ans: {:20s}, s: {}, e: {}\n'.format(
                str(d_no), ans_prediction['doc_id'],
                ans_prediction['doc_score'], ans_prediction['span_score'],
                ans_prediction['span'], ans_prediction['start'],
                ans_prediction['end'])
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.mask_model != "none":
        args.masks = True
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessor = build_model(args)
    postprocessor.rescale_to_orig_size = True  # for evaluation
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = builtins.sum(p.numel() for p in model.parameters()
                                if p.requires_grad)
    print("number of params:", n_parameters)

    # optimizer = torch.optim.Adam(model.parameters())
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n
            ],
            "lr": args.lr_backbone,
        },
    ]
    if args.optimizer == "sgd":
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    elif args.optimizer in ["adam", "adamw"]:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    else:
        raise RuntimeError(f"Unsupported optimizer {args.optimizer}")

    if args.schedule == "step":
        lr_scheduler = StepLR(optimizer, args.lr_drop)
    elif args.schedule == "multistep":
        milestones = list(range(args.lr_drop, args.epochs, 50))
        lr_scheduler = MultiStepLR(optimizer, gamma=0.5, milestones=milestones)

    dataset_train = build_dataset(image_set="trainval", args=args)
    dataset_val = build_dataset(image_set="test", args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(
        dataset_train,
        batch_sampler=batch_sampler_train,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )
    data_loader_val = DataLoader(
        dataset_val,
        args.batch_size,
        sampler=sampler_val,
        drop_last=False,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = to_coco_api.get_coco_api_from_dataset(coco_val)
    else:
        base_ds = None  # to_coco_api.get_coco_api_from_dataset(dataset_val)

    output_dir = Path(args.output_dir)
    if args.resume:
        checkpoint = torch.load(args.resume, map_location="cpu")
        model_without_ddp.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        args.start_epoch = checkpoint["epoch"] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(
            model,
            criterion,
            postprocessor,
            data_loader_val,
            base_ds,
            device,
            eval_bbox=True,
            eval_masks=args.masks,
        )
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / "checkpoint.pth"]
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f"checkpoint{epoch:04}.pth")
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        "model": model_without_ddp.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "lr_scheduler": lr_scheduler.state_dict(),
                        "epoch": epoch,
                        "args": args,
                    },
                    checkpoint_path,
                )

        # if epoch % args.eval_skip == 0:
        #     test_stats, coco_evaluator = evaluate(
        #         model, criterion, postprocessor, data_loader_val, base_ds, device, eval_bbox=True, eval_masks=args.masks
        #     )
        # else:
        #     test_stats, coco_evaluator = {}, None
        test_stats, coco_evaluator = {}, None

        log_stats = {
            **{f"train_{k}": v for k, v in train_stats.items()},
            **{f"test_{k}": v for k, v in test_stats.items()},
            "n_parameters": n_parameters,
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                os.makedirs(os.path.join(args.output_dir, "eval"), exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ["latest.pth"]
                    if epoch % 50 == 0:
                        filenames.append(f"{epoch:03}.pth")
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

            with (output_dir / "log_tb.txt").open("a") as f:
                f.write(f"TORCHBOARD_METRICS[epoch] = {epoch}\n")
                for k, v in vars(args).items():
                    # write one metric per line (newline added so entries do not run together)
                    f.write(f"TORCHBOARD_METRICS[{k}] = {v}\n")
                for key in log_stats:
                    v = log_stats[key]
                    if isinstance(v, list):
                        for i, vi in enumerate(v):
                            f.write(f"TORCHBOARD_METRICS[{key}_{i}] = {vi}\n")
                    else:
                        f.write(f"TORCHBOARD_METRICS[{key}] = {v}\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("Training time {}".format(total_time_str))
def main(): global timeout_sent args = parse_arguments() random.seed(args.seed + args.local_rank) np.random.seed(args.seed + args.local_rank) torch.manual_seed(args.seed + args.local_rank) torch.cuda.manual_seed(args.seed + args.local_rank) worker_init = WorkerInitObj(args.seed + args.local_rank) device, args = setup_training(args) if is_main_process(): dllogger.log(step="PARAMETER", data={"Config": [str(args)]}) # Prepare optimizer model, optimizer, lr_scheduler, checkpoint, global_step, criterion = prepare_model_and_optimizer(args, device) if is_main_process(): dllogger.log(step="PARAMETER", data={"SEED": args.seed}) raw_train_start = None if is_main_process(): dllogger.log(step="PARAMETER", data={"train_start": True}) dllogger.log(step="PARAMETER", data={"batch_size_per_gpu": args.train_batch_size}) dllogger.log(step="PARAMETER", data={"learning_rate": args.learning_rate}) most_recent_ckpts_paths = [] average_loss = 0.0 # averaged loss every args.log_freq steps epoch = 0 training_steps = 0 test_losses = [] pool = ProcessPoolExecutor(1) # Note: We loop infinitely over epochs, termination is handled via iteration count while True: thread = None restored_data_loader = None if not args.resume_from_checkpoint or epoch > 0 or (args.phase2 and global_step < 1) or args.init_checkpoint: files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and 'training' in f] files.sort() num_files = len(files) random.Random(args.seed + epoch).shuffle(files) f_start_id = 0 else: f_start_id = checkpoint['files'][0] files = checkpoint['files'][1:] args.resume_from_checkpoint = False num_files = len(files) # may not exist in all checkpoints epoch = checkpoint.get('epoch', 0) restored_dataloader = checkpoint.get('data_loader', None) shared_file_list = {} if torch.distributed.is_initialized() and get_world_size() > num_files: remainder = get_world_size() % num_files data_file = files[(f_start_id*get_world_size()+get_rank() + remainder*f_start_id)%num_files] else: data_file = files[(f_start_id*get_world_size()+get_rank())%num_files] previous_file = data_file if restored_data_loader is None: train_data = pretraining_dataset(data_file, args.max_predictions_per_seq) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * args.n_gpu, num_workers=4, worker_init_fn=worker_init, pin_memory=True) # shared_file_list["0"] = (train_dataloader, data_file) else: train_dataloader = restored_data_loader restored_data_loader = None overflow_buf = None if args.allreduce_post_accumulation: overflow_buf = torch.cuda.IntTensor([0]) for f_id in range(f_start_id + 1 , len(files)): if get_world_size() > num_files: data_file = files[(f_id*get_world_size()+get_rank() + remainder*f_id)%num_files] else: data_file = files[(f_id*get_world_size()+get_rank())%num_files] previous_file = data_file dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init) train_iter = tqdm(train_dataloader, desc="Iteration", disable=args.disable_progress_bar) if is_main_process() else train_dataloader if raw_train_start is None: raw_train_start = time.time() for step, batch in enumerate(train_iter): training_steps += 1 batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch if args.do_train: from smdistributed.modelparallel.test.torch.utils import verify, dump_model 
model.train() if args.smp > 0: loss_mbs = smp_step(args, device, input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels, model, optimizer, criterion, step) loss = loss_mbs.reduce_mean() if smp.rank() == 0: print("Loss:", loss.item()) else: loss = train_step(args, device, input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels, model, optimizer, criterion, step) divisor=1 average_loss += loss.item() if training_steps % args.gradient_accumulation_steps == 0: lr_scheduler.step() # learning rate warmup global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step) if global_step >= args.steps_this_run or timeout_sent: train_time_raw = time.time() - raw_train_start last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda() average_loss = average_loss / (last_num_steps * divisor) if (torch.distributed.is_initialized()): average_loss /= get_world_size() torch.distributed.all_reduce(average_loss) final_loss = loss.item() if is_main_process(): dllogger.log(step=(epoch, global_step, ), data={"final_loss": final_loss}) elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0: if is_main_process(): dllogger.log(step=(epoch, global_step, ), data={"average_loss": average_loss / (args.log_freq * divisor), "step_loss": loss.item() * args.gradient_accumulation_steps / divisor, "learning_rate": optimizer.param_groups[0]['lr']}) average_loss = 0 if global_step >= args.steps_this_run or training_steps % ( args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0 or timeout_sent: if smp.dp_rank() == 0 and not args.skip_checkpoint: # Save a trained model dllogger.log(step="PARAMETER", data={"checkpoint_step": global_step}) # model_to_save = model.module if hasattr(model, # 'module') else model # Only save the model it-self if args.resume_step < 0 or not args.phase2: output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)) else: output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step)) if args.do_train: save_dict = { 'model': model.local_state_dict(), 'optimizer': optimizer.local_state_dict(), 'files': [f_id] + files, 'epoch': epoch, 'data_loader': None if global_step >= args.steps_this_run else train_dataloader} if args.fp16: save_dict['master params'] = list(amp.master_params(optimizer)) # SMP: Checkpoint mp_rank specific state smp.save(save_dict, output_save_file, partial=True) most_recent_ckpts_paths.append(output_save_file) if len(most_recent_ckpts_paths) > 3 and (args.smp == 0 or smp.dp_rank() == 0): ckpt_to_be_removed = most_recent_ckpts_paths.pop(0) os.remove(ckpt_to_be_removed+f"_{smp.mp_rank()}") # Exiting the training due to hitting max steps, or being sent a # timeout from the cluster scheduler if global_step >= args.steps_this_run or timeout_sent: del train_dataloader # thread.join() if smp.dp_rank() == 0 and args.save_full: output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)) save_dict = { 'model': model.local_state_dict(), 'optimizer': optimizer.local_state_dict(), 'files': [f_id] + files, 'epoch': epoch, 'data_loader': None if global_step >= args.steps_this_run else train_dataloader} if args.fp16: save_dict['master params'] = list(amp.master_params(optimizer)) # SMP: Save a single checkpoint containing entire model parameters 
smp.save(save_dict, output_save_file, partial=False) smp.barrier() return args, final_loss, train_time_raw, global_step else: model.eval() with torch.no_grad(): loss = test_step(args, device, input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels, model, criterion, step) print(f"global_step {global_step} Test Loss:", loss) test_losses.append(loss) global_step += 1 if global_step >= args.steps_this_run: return sum(test_losses) / len(test_losses) del train_dataloader # thread.join() # Make sure pool has finished and switch train_dataloader # NOTE: Will block until complete train_dataloader, data_file = dataset_future.result(timeout=None) epoch += 1
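# --- Added sketch (not part of the original script) ---
# Round-robin shard assignment as used in the loop above: each distributed rank
# picks one pretraining file per outer-loop index so ranks read different shards.
# world_size/rank are plain ints here instead of torch.distributed calls, and the
# file names are assumed for illustration.
def shard_for_rank(files, f_id, rank, world_size):
    """Return the file this rank should read for outer-loop index f_id."""
    num_files = len(files)
    if world_size > num_files:
        # more ranks than files: spread the wrap-around with the remainder offset
        remainder = world_size % num_files
        return files[(f_id * world_size + rank + remainder * f_id) % num_files]
    return files[(f_id * world_size + rank) % num_files]

files = [f"training_shard_{i:03d}.hdf5" for i in range(4)]   # assumed file names
for rank in range(2):
    print(rank, [shard_for_rank(files, f_id, rank, world_size=2) for f_id in range(3)])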
parser = argparse.ArgumentParser(description="train") parser.add_argument("--local_rank", type=int, default=0) parser.add_argument("opts", default=None, nargs=argparse.REMAINDER) args = parser.parse_args() if config.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group("nccl", init_method="env://") synchronize() config.merge_from_list(args.opts) config.freeze() save_dir = os.path.join(config.save_dir, 'train') mkdir(save_dir) logger = setup_logger("train", save_dir, get_rank()) logger.info("Running with config:\n{}".format(config)) arguments = {'iteration': 0} device = torch.device(config.device) bert_config = BertConfig(type_vocab_size=len(config.boundaries) + 2) generator = Generator(bert_config) generator = generator.to(device) optimizer = AdamW( params=generator.parameters(), lr=config.solver.lr, weight_decay=config.solver.weight_decay, betas=config.solver.betas )
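# --- Added sketch (not part of the original script) ---
# Hedged sketch of the distributed bootstrap pattern used here: bind this process
# to its local GPU, then join the NCCL process group via the env:// variables
# (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) that a launcher such as torchrun
# sets. This is a generic helper, not the project's synchronize()/setup code.
import os
import torch
import torch.distributed as dist

def init_distributed(local_rank: int) -> bool:
    """Return True if a process group was initialized, False for single-process runs."""
    if "WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2:
        return False                      # plain single-GPU/CPU run, nothing to do
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl", init_method="env://")
    dist.barrier()                        # rough equivalent of the synchronize() helper above
    return True

# Typical launch (assumed): torchrun --nproc_per_node=NUM_GPUS train.py --local_rank ...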
def main(args): utils.init_distributed_mode(args) # disable any harsh augmentation in case of Self-supervise training if args.training_mode == 'SSL': print("NOTE: Smoothing, Mixup, CutMix, and AutoAugment will be disabled in case of Self-supervise training") args.smoothing = args.reprob = args.reprob = args.recount = args.mixup = args.cutmix = 0.0 args.aa = '' if args.SiT_LinearEvaluation == 1: print("Warning: Linear Evaluation should be set to 0 during SSL training - changing SiT_LinearEvaluation to 0") args.SiT_LinearEvaluation = 0 utils.print_args(args) device = torch.device(args.device) seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) cudnn.benchmark = True print("Loading dataset ....") dataset_train, args.nb_classes = build_dataset(is_train=True, args=args) dataset_val, _ = build_dataset(is_train=False, args=args) num_tasks = utils.get_world_size() global_rank = utils.get_rank() if args.repeated_aug: sampler_train = RASampler(dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True) else: sampler_train = torch.utils.data.DistributedSampler(dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True) sampler_val = torch.utils.data.SequentialSampler(dataset_val) data_loader_train = torch.utils.data.DataLoader(dataset_train, sampler=sampler_train, batch_size=args.batch_size, num_workers=args.num_workers, pin_memory=args.pin_mem, drop_last=True, collate_fn=collate_fn) data_loader_val = torch.utils.data.DataLoader(dataset_val, sampler=sampler_val, batch_size=int(1.5 * args.batch_size), num_workers=args.num_workers, pin_memory=args.pin_mem, drop_last=False, collate_fn=collate_fn) mixup_fn = None mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None if mixup_active: mixup_fn = Mixup( mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, label_smoothing=args.smoothing, num_classes=args.nb_classes) print(f"Creating model: {args.model}") model = create_model( args.model, pretrained=False, num_classes=args.nb_classes, drop_rate=args.drop, drop_path_rate=args.drop_path, representation_size=args.representation_size, drop_block_rate=None, training_mode=args.training_mode) if args.finetune: checkpoint = torch.load(args.finetune, map_location='cpu') checkpoint_model = checkpoint['model'] state_dict = model.state_dict() for k in ['rot_head.weight', 'rot_head.bias', 'contrastive_head.weight', 'contrastive_head.bias']: if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: print(f"Removing key {k} from pretrained checkpoint") del checkpoint_model[k] # interpolate position embedding pos_embed_checkpoint = checkpoint_model['pos_embed'] embedding_size = pos_embed_checkpoint.shape[-1] num_patches = model.patch_embed.num_patches num_extra_tokens = model.pos_embed.shape[-2] - num_patches orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) new_size = int(num_patches ** 0.5) extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) pos_tokens = torch.nn.functional.interpolate( pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) checkpoint_model['pos_embed'] = new_pos_embed 
model.load_state_dict(checkpoint_model, strict=False) model.to(device) # Freeze the backbone in case of linear evaluation if args.SiT_LinearEvaluation == 1: requires_grad(model, False) model.rot_head.weight.requires_grad = True model.rot_head.bias.requires_grad = True model.contrastive_head.weight.requires_grad = True model.contrastive_head.bias.requires_grad = True if args.representation_size is not None: model.pre_logits_rot.fc.weight.requires_grad = True model.pre_logits_rot.fc.bias.requires_grad = True model.pre_logits_contrastive.fc.weight.requires_grad = True model.pre_logits_contrastive.fc.bias.requires_grad = True model_ema = None if args.model_ema: model_ema = ModelEma(model, decay=args.model_ema_decay, device='cpu' if args.model_ema_force_cpu else '', resume='') model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size() / 512.0 args.lr = linear_scaled_lr optimizer = create_optimizer(args, model_without_ddp) loss_scaler = NativeScaler() lr_scheduler, _ = create_scheduler(args, optimizer) if args.training_mode == 'SSL': criterion = MTL_loss(args.device, args.batch_size) elif args.training_mode == 'finetune' and args.mixup > 0.: criterion = SoftTargetCrossEntropy() else: criterion = torch.nn.CrossEntropyLoss() output_dir = Path(args.output_dir) if args.resume: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if args.model_ema: utils._load_checkpoint_for_ema(model_ema, checkpoint['model_ema']) if 'scaler' in checkpoint: loss_scaler.load_state_dict(checkpoint['scaler']) if args.eval: test_stats = evaluate_SSL(data_loader_val, model, device) print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") return print(f"Start training for {args.epochs} epochs") start_time = time.time() max_accuracy = 0.0 for epoch in range(args.start_epoch, args.epochs): if args.distributed: data_loader_train.sampler.set_epoch(epoch) if args.training_mode == 'SSL': train_stats = train_SSL( model, criterion, data_loader_train, optimizer, device, epoch, loss_scaler, args.clip_grad, model_ema, mixup_fn) else: train_stats = train_finetune( model, criterion, data_loader_train, optimizer, device, epoch, loss_scaler, args.clip_grad, model_ema, mixup_fn) lr_scheduler.step(epoch) if epoch%args.validate_every == 0: if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pth'] for checkpoint_path in checkpoint_paths: utils.save_on_master({ 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'model_ema': get_state_dict(model_ema), 'scaler': loss_scaler.state_dict(), 'args': args, }, checkpoint_path) if args.training_mode == 'SSL': test_stats = evaluate_SSL(data_loader_val, model, device, epoch, args.output_dir) else: test_stats = evaluate_finetune(data_loader_val, model, device) print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") max_accuracy = 
max(max_accuracy, test_stats["acc1"]) print(f'Max accuracy: {max_accuracy:.2f}%') log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters} if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
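# --- Added sketch (not part of the original script) ---
# Self-contained illustration of the position-embedding interpolation performed in
# the finetune branch above: the grid part of a ViT pos_embed is resized bicubically
# to the new patch-grid size while the extra (class/contrastive) tokens are kept
# unchanged. The shapes below are small illustrative assumptions.
import torch

def interpolate_pos_embed(pos_embed, new_grid, num_extra_tokens=1):
    # pos_embed: (1, num_extra_tokens + old_grid**2, dim)
    dim = pos_embed.shape[-1]
    extra = pos_embed[:, :num_extra_tokens]
    grid = pos_embed[:, num_extra_tokens:]
    old_grid = int(grid.shape[1] ** 0.5)
    grid = grid.reshape(1, old_grid, old_grid, dim).permute(0, 3, 1, 2)
    grid = torch.nn.functional.interpolate(
        grid, size=(new_grid, new_grid), mode="bicubic", align_corners=False)
    grid = grid.permute(0, 2, 3, 1).flatten(1, 2)
    return torch.cat((extra, grid), dim=1)

old = torch.randn(1, 1 + 14 * 14, 768)          # pretrained at 224/16 -> 14x14 grid
new = interpolate_pos_embed(old, new_grid=24)   # finetuned at 384/16 -> 24x24 grid
print(new.shape)                                # torch.Size([1, 577, 768])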
def process_record(data_line_, prediction_line_, neg_gap_, feature_dir_, record_dir_, match_fn, all_doc_scores, all_ans_scores, z_scores): missing_count_ = 0 total_count_ = 0 stop_count_ = 0 data = json.loads(data_line_) question = data['question'] q_id = slugify(question) q_path = os.path.join(feature_dir_, '%s.json' % q_id) n_q = [0 for _ in Tokenizer.FEAT] if os.path.exists(q_path): q_data = open(q_path, encoding=ENCODING).read() record = json.loads(q_data) q_ner = record['ner'] q_pos = record['pos'] for feat in q_ner + q_pos: n_q[Tokenizer.FEAT_DICT[feat]] += 1 else: print('question feature file %s not exist!' % q_path) sys.stdout.flush() missing_count_ += 1 return missing_count_, total_count_, stop_count_ answer = [normalize(a) for a in data['answer']] prediction = json.loads(prediction_line_) # MAKE SURE REVERSE IS TRUE ranked_prediction = sorted(prediction, key=lambda k: k['doc_score'], reverse=True) correct_rank = get_rank(prediction, answer, match_fn) if correct_rank > 150: # if correct_rank < 50 or correct_rank > 150: return missing_count_, total_count_, stop_count_ all_corr_rank.append(correct_rank - 1) all_n_p = [] all_n_a = [] all_p_scores = [] all_a_scores = [] all_probs = [] all_spans = [] repeats = 0 for i, entry in enumerate(ranked_prediction): doc_id = entry['doc_id'] start = int(entry['start']) end = int(entry['end']) doc_score = entry['doc_score'] ans_score = entry['span_score'] prob = entry['prob'] span = entry['span'] # RESTRICT TO MAX 1000000000 # print("Threshold 1000000") # ans_score=min(ans_score, 1000000) #restrict to max of million if span in all_spans: repeats += 1 all_spans.append(span) ################Calculate sample z score (t statistic) for answer score if all_a_scores == [] or len( all_a_scores ) == 1: # dont use a_zscore feature at the beginning or if we only have 1 a_zscore = 0 else: # Take the sample mean of the previous ones, take zscore of the current with respect to that # sample_mean = np.mean(all_a_scores + [ans_score]) sample_mean = np.mean(all_a_scores) # sample_std = np.std(all_a_scores + [ans_score]) sample_std = np.std(all_a_scores) # if sample_std != 0: a_zscore = (ans_score - sample_mean) / sample_std # else: # a_zscore = 0 z_scores.append(a_zscore) # THESE ARE FOR STATISTISTICS OVER ENTIRE DATA SET, IGNORE all_doc_scores.append(doc_score) all_ans_scores.append(ans_score) corr_doc_score = (doc_score - DOC_MEAN) / DOC_STD corr_ans_mean_score = (np.mean(all_a_scores + [ans_score]) - ANS_MEAN) / ANS_STD all_probs.append(prob) ############### p_pos = dict() p_ner = dict() feat_file = os.path.join(feature_dir_, '%s.json' % doc_id) if os.path.exists(feat_file): record = json.load(open(feat_file)) p_ner[doc_id] = record['ner'] p_pos[doc_id] = record['pos'] n_p = [0 for _ in Tokenizer.FEAT] n_a = [0 for _ in Tokenizer.FEAT] for feat in p_ner[doc_id] + p_pos[doc_id]: n_p[Tokenizer.FEAT_DICT[feat]] += 1 for feat in p_ner[doc_id][start:end + 1] + p_pos[doc_id][start:end + 1]: n_a[Tokenizer.FEAT_DICT[feat]] += 1 all_n_p.append(n_p) all_n_a.append(n_a) all_p_scores.append(doc_score) all_a_scores.append(ans_score) f_np = aggregate(all_n_p) f_na = aggregate(all_n_a) f_sp = aggregate(all_p_scores) f_sa = aggregate_ans(all_a_scores) record = OrderedDict() # sp, nq, np, na, ha record['sp'] = f_sp record['nq'] = list(map(float, n_q)) record['np'] = f_np record['na'] = f_na record['sa'] = f_sa record['a_zscore'] = a_zscore record['corr_doc_score'] = corr_doc_score record['i'] = i record['prob_avg'] = sum(all_probs) / len(all_probs) record['prob'] = prob 
record['repeats'] = repeats record['ans_avg'] = corr_ans_mean_score if i + 1 == correct_rank: # if i + 1 >= correct_rank: record['stop'] = 1 stop_count_ += 1 write_record = True # if i % neg_gap_ ==0: # write_record = True # else: # write_record = False should_return = True # if i + 1 - correct_rank > 30: # should_return = True # else: # should_return = False else: should_return = False if i % neg_gap_ == 0: record['stop'] = 0 write_record = True else: write_record = False if write_record: record_path = os.path.join(record_dir_, '%s_%s.pkl' % (q_id, doc_id)) with open(record_path, 'wb') as f: pk.dump(record, f) total_count_ += 1 if should_return: return missing_count_, total_count_, stop_count_ return missing_count_, total_count_, stop_count_
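# --- Added sketch (not part of the original script) ---
# Small illustration of the running z-score feature computed above: the current
# answer score is standardized against the scores of the previously ranked
# candidates, and no z-score is emitted for the first two candidates, mirroring
# the guard in the original code. A std == 0 guard is added here only to keep
# this sketch safe; the original leaves that branch commented out.
import numpy as np

def answer_zscore(previous_scores, current_score):
    if len(previous_scores) <= 1:
        return 0.0                      # not enough history for a sample mean/std
    mean = np.mean(previous_scores)
    std = np.std(previous_scores)
    return float((current_score - mean) / std) if std != 0 else 0.0

history = []
for score in [3.1, 2.8, 7.9, 2.5]:      # assumed span scores, in ranked order
    print(answer_zscore(history, score))
    history.append(score)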
def main(): global timeout_sent args = parse_arguments() random.seed(args.seed + args.local_rank) np.random.seed(args.seed + args.local_rank) torch.manual_seed(args.seed + args.local_rank) torch.cuda.manual_seed(args.seed + args.local_rank) worker_init = WorkerInitObj(args.seed + args.local_rank) device, args = setup_training(args) dllogger.log(step="PARAMETER", data={"Config": [str(args)]}) # Prepare optimizer model, optimizer, lr_scheduler, checkpoint, global_step, criterion = prepare_model_and_optimizer( args, device) gradient_accumulation_steps = torch.tensor( args.gradient_accumulation_steps, dtype=torch.float32).to(device) world_size = torch.tensor(get_world_size(), dtype=torch.float32).to(device) if is_main_process(): dllogger.log(step="PARAMETER", data={"SEED": args.seed}) raw_train_start = None if args.do_train: if is_main_process(): dllogger.log(step="PARAMETER", data={"train_start": True}) dllogger.log(step="PARAMETER", data={"batch_size_per_pu": args.train_batch_size}) dllogger.log(step="PARAMETER", data={"learning_rate": args.learning_rate}) model.train() most_recent_ckpts_paths = [] average_loss = 0.0 # averaged loss every args.log_freq steps epoch = 0 training_steps = 0 model_traced = False if device.type == 'cuda': pool = ProcessPoolExecutor(1) # Note: We loop infinitely over epochs, termination is handled via iteration count while True: thread = None restored_data_loader = None if not args.resume_from_checkpoint or epoch > 0 or ( args.phase2 and global_step < 1) or args.init_checkpoint: files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and 'training' in f ] files.sort() num_files = len(files) random.Random(args.seed + epoch).shuffle(files) f_start_id = 0 else: f_start_id = checkpoint['files'][0] files = checkpoint['files'][1:] args.resume_from_checkpoint = False num_files = len(files) # may not exist in all checkpoints epoch = checkpoint.get('epoch', 0) restored_data_loader = checkpoint.get('data_loader', None) shared_file_list = {} if torch.distributed.is_initialized( ) and get_world_size() > num_files: remainder = get_world_size() % num_files data_file = files[(f_start_id * get_world_size() + get_rank() + remainder * f_start_id) % num_files] else: data_file = files[(f_start_id * get_world_size() + get_rank()) % num_files] previous_file = data_file if restored_data_loader is None: use_pin_memory = False if args.no_cuda or args.use_habana else True num_workers = 0 if args.use_habana else 4 train_data = pretraining_dataset(data_file, args.max_predictions_per_seq) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader( train_data, sampler=train_sampler, batch_size=args.train_batch_size * args.n_pu, num_workers=num_workers, worker_init_fn=worker_init, pin_memory=use_pin_memory, drop_last=True) # shared_file_list["0"] = (train_dataloader, data_file) else: train_dataloader = restored_data_loader restored_data_loader = None overflow_buf = None if args.allreduce_post_accumulation: overflow_buf = torch.cuda.IntTensor([0]) for f_id in range(f_start_id + 1, len(files)): if get_world_size() > num_files: data_file = files[(f_id * get_world_size() + get_rank() + remainder * f_id) % num_files] else: data_file = files[(f_id * get_world_size() + get_rank()) % num_files] previous_file = data_file if device.type == 'cuda': dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init) train_iter = tqdm(train_dataloader, 
desc="Iteration", disable=args.disable_progress_bar ) if is_main_process() else train_dataloader if raw_train_start is None: raw_train_start = time.time() for step, batch in enumerate(train_iter): training_steps += 1 position_ids = compute_position_ids(batch[0]) if torch.distributed.is_initialized(): torch.distributed.barrier() if args.use_habana: batch = [t.to(dtype=torch.int32) for t in batch] position_ids = position_ids.to(dtype=torch.int32) position_ids = position_ids.to(device) batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch if args.use_jit_trace: if model_traced == False: model = torch.jit.trace(model, (input_ids, segment_ids, input_mask, position_ids), check_trace=False) model_traced = True if args.local_rank != -1 and not args.allreduce_post_accumulation: if args.use_habana: model = DDP(model) else: model = DDP(model, message_size=250000000, gradient_predivide_factor= get_world_size()) if args.local_rank != -1 and not args.allreduce_post_accumulation \ and (training_steps % args.gradient_accumulation_steps != 0): with model.no_sync(): prediction_scores, seq_relationship_score = model( input_ids, segment_ids, input_mask, position_ids) else: prediction_scores, seq_relationship_score = model( input_ids, segment_ids, input_mask, position_ids) else: if args.local_rank != -1 and not args.allreduce_post_accumulation \ and (training_steps % args.gradient_accumulation_steps != 0): with model.no_sync(): prediction_scores, seq_relationship_score = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, position_ids=position_ids) else: prediction_scores, seq_relationship_score = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, position_ids=position_ids) loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels) if args.n_pu > 1: loss = loss.mean() # mean() to average on multi-pu. divisor = args.gradient_accumulation_steps if args.gradient_accumulation_steps > 1: if not args.allreduce_post_accumulation: # this division was merged into predivision loss = loss / gradient_accumulation_steps divisor = 1.0 if args.fp16: with amp.scale_loss( loss, optimizer, delay_overflow_check=args. 
allreduce_post_accumulation) as scaled_loss: scaled_loss.backward() else: loss.backward() average_loss += loss.item() if training_steps % args.gradient_accumulation_steps == 0: lr_scheduler.step() # learning rate warmup global_step = take_optimizer_step( args, optimizer, model, overflow_buf, global_step) if global_step >= args.steps_this_run or timeout_sent: train_time_raw = time.time() - raw_train_start last_num_steps = int( training_steps / args.gradient_accumulation_steps) % args.log_freq last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps average_loss = average_loss / (last_num_steps * divisor) average_loss = torch.tensor( average_loss, dtype=torch.float32).to(device) if (torch.distributed.is_initialized()): average_loss /= world_size torch.distributed.all_reduce(average_loss) final_loss = average_loss.item() if is_main_process(): dllogger.log(step=( epoch, global_step, ), data={"final_loss": final_loss}) elif training_steps % ( args.log_freq * args.gradient_accumulation_steps) == 0: if is_main_process(): dllogger.log( step=( epoch, global_step, ), data={ "average_loss": average_loss / (args.log_freq * divisor), "step_loss": loss.item() * args.gradient_accumulation_steps / divisor, "learning_rate": optimizer.param_groups[0]['lr'] }) average_loss = 0 if global_step >= args.steps_this_run or training_steps % ( args.num_steps_per_checkpoint * args. gradient_accumulation_steps) == 0 or timeout_sent: if is_main_process() and not args.skip_checkpoint: # Save a trained model dllogger.log(step="PARAMETER", data={"checkpoint_step": global_step}) model_to_save = model.module if hasattr( model, 'module' ) else model # Only save the model it-self if args.resume_step < 0 or not args.phase2: output_save_file = os.path.join( args.output_dir, "ckpt_{}.pt".format(global_step)) else: output_save_file = os.path.join( args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step)) checkpoint_dict = {} if args.do_train: if args.use_habana: config = modeling.BertConfig.from_json_file( args.config_file) # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - ( config.vocab_size % 8) model_copy = modeling.BertForPreTraining( config) model_copy.load_state_dict( model_to_save.state_dict()) param_groups_copy = optimizer.state_dict( )['param_groups'] state_dict_copy = {} for st_key, st_val in optimizer.state_dict( )['state'].items(): st_val_copy = {} for k, v in st_val.items(): if isinstance(v, torch.Tensor): st_val_copy[k] = v.to('cpu') else: st_val_copy[k] = v state_dict_copy[ st_key] = st_val_copy optim_dict = {} optim_dict['state'] = state_dict_copy optim_dict[ 'param_groups'] = param_groups_copy checkpoint_dict = { 'model': model_copy.state_dict(), 'optimizer': optim_dict, 'files': [f_id] + files, 'epoch': epoch, 'data_loader': None if global_step >= args.max_steps else train_dataloader } elif no_cuda: checkpoint_dict = { 'model': model_to_save.state_dict(), 'optimizer': optimizer.state_dict(), 'files': [f_id] + files, 'epoch': epoch, 'data_loader': None if global_step >= args.max_steps else train_dataloader } else: checkpoint_dict = { 'model': model_to_save.state_dict(), 'optimizer': optimizer.state_dict(), 'master params': list(amp.master_params(optimizer)), 'files': [f_id] + files, 'epoch': epoch, 'data_loader': None if global_step >= args.max_steps else train_dataloader } torch.save(checkpoint_dict, output_save_file) most_recent_ckpts_paths.append( output_save_file) if len(most_recent_ckpts_paths) > 3: ckpt_to_be_removed = 
most_recent_ckpts_paths.pop( 0) os.remove(ckpt_to_be_removed) # Exiting the training due to hitting max steps, or being sent a # timeout from the cluster scheduler if global_step >= args.steps_this_run or timeout_sent: del train_dataloader # thread.join() return args, final_loss, train_time_raw, global_step del train_dataloader # thread.join() # Make sure pool has finished and switch train_dataloader # NOTE: Will block until complete if device.type == 'cuda': train_dataloader, data_file = dataset_future.result( timeout=None) else: train_dataloader, data_file = create_pretraining_dataset( data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init) epoch += 1
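# --- Added sketch (not part of the original script) ---
# Simplified, hedged sketch of the gradient-accumulation pattern in the loop above:
# gradient allreduce is skipped (model.no_sync()) on intermediate micro-batches and
# only runs on the step that calls optimizer.step(). The original also has
# fp16/allreduce_post_accumulation branches that are omitted here; model, batch,
# and loss_fn are placeholders.
import contextlib
import torch

def accumulation_step(model, loss_fn, batch, step, accum_steps, optimizer):
    sync_now = (step + 1) % accum_steps == 0
    # no_sync() exists on torch.nn.parallel.DistributedDataParallel; fall back to a
    # null context when the model is not DDP-wrapped (single-process runs).
    ctx = model.no_sync() if (not sync_now and hasattr(model, "no_sync")) else contextlib.nullcontext()
    with ctx:
        loss = loss_fn(model(batch)) / accum_steps   # pre-divide so gradients average
        loss.backward()
    if sync_now:
        optimizer.step()
        optimizer.zero_grad()
    return loss.detach()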
def main(): global timeout_sent args = parse_arguments() random.seed(args.seed + args.local_rank) np.random.seed(args.seed + args.local_rank) torch.manual_seed(args.seed + args.local_rank) torch.cuda.manual_seed(args.seed + args.local_rank) worker_init = WorkerInitObj(args.seed + args.local_rank) device, args = setup_training(args) dllogger.log(step="PARAMETER", data={"Config": [str(args)]}) # Prepare optimizer model, optimizer, lr_scheduler, checkpoint, global_step, criterion = prepare_model_and_optimizer(args, device) if is_main_process(): dllogger.log(step="PARAMETER", data={"SEED": args.seed}) raw_train_start = None if args.do_train: if is_main_process(): dllogger.log(step="PARAMETER", data={"train_start": True}) dllogger.log(step="PARAMETER", data={"batch_size_per_gpu": args.train_batch_size}) dllogger.log(step="PARAMETER", data={"learning_rate": args.learning_rate}) model.train() most_recent_ckpts_paths = [] average_loss = 0.0 # averaged loss every args.log_freq steps ave_mask_acc=0.0 ave_cgp_acc=0.0 epoch = 0 training_steps = 0 pool = ProcessPoolExecutor(1) # Note: We loop infinitely over epochs, termination is handled via iteration count while True: thread = None restored_data_loader = None if not args.resume_from_checkpoint or epoch > 0 or (args.phase2 and global_step < 1) or args.init_checkpoint: files = [i for i in range(256)] files.sort() num_files = len(files) random.Random(args.seed + epoch).shuffle(files) f_start_id = 0 else: f_start_id = checkpoint['files'][0] files = checkpoint['files'][1:] args.resume_from_checkpoint = False num_files = len(files) # may not exist in all checkpoints epoch = checkpoint.get('epoch', 0) restored_dataloader = checkpoint.get('data_loader', None) shared_file_list = {} if torch.distributed.is_initialized() and get_world_size() > num_files: remainder = get_world_size() % num_files data_file = files[(f_start_id*get_world_size()+get_rank() + remainder*f_start_id)%num_files] else: data_file = files[(f_start_id*get_world_size()+get_rank())%num_files] previous_file = data_file if restored_data_loader is None: train_data = PretrainDataset("data/" + args.dataset,rank=data_file) train_sampler = RandomSampler(train_data) train_dataloader = DataLoaderMasking(train_data, sampler=train_sampler, batch_size=args.train_batch_size * args.n_gpu, num_workers=4, worker_init_fn=worker_init, pin_memory=True) # shared_file_list["0"] = (train_dataloader, data_file) else: train_dataloader = restored_data_loader restored_data_loader = None overflow_buf = None if args.allreduce_post_accumulation: overflow_buf = torch.cuda.IntTensor([0]) for f_id in range(f_start_id + 1 , len(files)): if get_world_size() > num_files: data_file = files[(f_id*get_world_size()+get_rank() + remainder*f_id)%num_files] else: data_file = files[(f_id*get_world_size()+get_rank())%num_files] previous_file = data_file dataset_future = pool.submit(create_pretraining_dataset, data_file, args, worker_init) train_iter = tqdm(train_dataloader, desc="Iteration", disable=args.disable_progress_bar) if is_main_process() else train_dataloader if raw_train_start is None: raw_train_start = time.time() for step, batch in enumerate(train_iter): training_steps += 1 batch = batch.to(device) pred_node, pred_graph = model(batch) loss = criterion(pred_node, pred_graph,batch.mask_node_label,batch.ngp_y) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
divisor = args.gradient_accumulation_steps if args.gradient_accumulation_steps > 1: if not args.allreduce_post_accumulation: # this division was merged into predivision loss = loss / args.gradient_accumulation_steps divisor = 1.0 if args.fp16: with amp.scale_loss(loss, optimizer, delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss: scaled_loss.backward() else: loss.backward() average_loss += loss.item() acc_node = compute_accuracy(pred_node, batch.mask_node_label) acc_cgp = compute_accuracy(pred_graph, batch.ngp_y) ave_mask_acc+=acc_node ave_cgp_acc+=acc_cgp if training_steps % args.gradient_accumulation_steps == 0: lr_scheduler.step() # learning rate warmup global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step) if global_step >= args.steps_this_run or timeout_sent: train_time_raw = time.time() - raw_train_start last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda() average_loss = average_loss / (last_num_steps * divisor) ave_mask_acc = torch.tensor(ave_mask_acc, dtype=torch.float32).cuda() ave_mask_acc = ave_mask_acc / (last_num_steps * divisor) ave_cgp_acc = torch.tensor(ave_cgp_acc, dtype=torch.float32).cuda() ave_cgp_acc = ave_cgp_acc / (last_num_steps * divisor) if (torch.distributed.is_initialized()): average_loss /= get_world_size() torch.distributed.all_reduce(average_loss) ave_mask_acc/=get_world_size() torch.distributed.all_reduce(ave_mask_acc) ave_cgp_acc /= get_world_size() torch.distributed.all_reduce(ave_cgp_acc) final_loss = average_loss.item() final_maskacc = ave_mask_acc.item() final_cgpacc = ave_mask_acc.item() if is_main_process(): dllogger.log(step=(epoch, global_step, ), data={"final_loss": final_loss,"final_maskacc":final_maskacc, "final_cgpacc":final_cgpacc}) elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0: if is_main_process(): dllogger.log(step=(epoch, global_step, ), data={"average_loss": average_loss / (args.log_freq * divisor), "average_mask_acc":ave_mask_acc / (args.log_freq * divisor), "average_cgp_acc": ave_cgp_acc / (args.log_freq * divisor), "step_loss": loss.item() * args.gradient_accumulation_steps / divisor, "learning_rate": optimizer.param_groups[0]['lr']}) average_loss = 0 ave_mask_acc = 0 ave_cgp_acc = 0 if global_step >= args.steps_this_run or training_steps % ( args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0 or timeout_sent: if is_main_process() and not args.skip_checkpoint: # Save a trained model dllogger.log(step="PARAMETER", data={"checkpoint_step": global_step}) model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self if args.resume_step < 0 or not args.phase2: output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)) else: output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step)) if args.do_train: torch.save({'model': model_to_save.state_dict(), 'gnn':model_to_save.gnn.state_dict(), 'linear_atom':model_to_save.linear_pred_atoms.state_dict(), 'optimizer': optimizer.state_dict(), 'master params': list(amp.master_params(optimizer)), 'files': [f_id] + files, 'epoch': epoch, 'data_loader': None if global_step >= args.max_steps else train_dataloader }, output_save_file) most_recent_ckpts_paths.append(output_save_file) if len(most_recent_ckpts_paths) > 3: 
ckpt_to_be_removed = most_recent_ckpts_paths.pop(0) os.remove(ckpt_to_be_removed) # Exiting the training due to hitting max steps, or being sent a # timeout from the cluster scheduler if global_step >= args.steps_this_run or timeout_sent: del train_dataloader # thread.join() return args, final_loss, final_maskacc, final_cgpacc, train_time_raw, global_step del train_dataloader # thread.join() # Make sure pool has finished and switch train_dataloader # NOTE: Will block until complete train_dataloader, data_file = dataset_future.result(timeout=None) epoch += 1
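# --- Added sketch (not part of the original script) ---
# Rolling checkpoint retention as used above: keep only the N most recent
# checkpoint files on disk, deleting the oldest once the window is full. The
# state dict and paths in the usage note are illustrative.
import os
import torch

def save_with_retention(state, path, recent_paths, keep=3):
    torch.save(state, path)
    recent_paths.append(path)
    if len(recent_paths) > keep:
        oldest = recent_paths.pop(0)
        if os.path.exists(oldest):
            os.remove(oldest)

# usage (illustrative):
# recent = []
# save_with_retention({"step": global_step}, f"ckpt_{global_step}.pt", recent)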
def training(self): self.net.train() save_to_disk = ptutil.get_rank() == 0 start_training_time = time.time() trained_time = 0 mIoU = 0 best_miou = 0 tic = time.time() end = time.time() iteration, max_iter = 0, self.max_iter save_iter, eval_iter = self.per_iter * self.config.TRAIN.SAVE_EPOCH, self.per_iter * self.config.TRAIN.EVAL_EPOCHS self.logger.info("Start training, total epochs {:3d} = total iteration: {:6d}".format(self.config.TRAIN.EPOCHS, max_iter)) for i, (image, target) in enumerate(self.train_loader): iteration += 1 self.scheduler.step() self.optimizer.zero_grad() image, target = image.to(self.device,dtype=self.dtype), target.to(self.device) if self.config.DATASET.IMG_TRANSFORM == False: image = image.permute(0,3,1,2) outputs = self.net(image) loss_dict = self.criterion(outputs, target) loss_dict_reduced = ptutil.reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) loss = sum(loss for loss in loss_dict.values()) if self.config.TRAIN.MIXED_PRECISION: with amp.scale_loss(loss,self.optimizer) as scale_loss: scale_loss.backward() else: loss.backward() self.optimizer.step() trained_time += time.time() - end end = time.time() if iteration % self.config.TRAIN.LOG_STEP == 0: eta_seconds = int((trained_time / iteration) * (max_iter - iteration)) log_str = ["Iteration {:06d} , Lr: {:.5f}, Cost: {:.2f}s, Eta: {}" .format(iteration, self.optimizer.param_groups[0]['lr'], time.time() - tic, str(datetime.timedelta(seconds=eta_seconds))), "total_loss: {:.3f}".format(losses_reduced.item())] log_str = ', '.join(log_str) self.logger.info(log_str) tic = time.time() if save_to_disk and iteration % save_iter == 0: model_path = os.path.join(self.config.TRAIN.SAVE_DIR, "{}_{}_{}_iter_{:06d}.pth" .format(self.config.MODEL.NAME, self.config.TRAIN.SEG_LOSS, self.config.DATASET.NAME, iteration)) ptutil.save_model(self.net,model_path,self.logger) if self.config.TRAIN.EVAL_EPOCHS > 0 and iteration % eval_iter == 0 and not iteration == max_iter: metrics = ptutil.validate(self.net,self.valid_loader,self.metric,self.device,self.config) ptutil.synchronize() pixAcc, mIoU = ptutil.accumulate_metric(metrics) if mIoU !=None and mIoU >= best_miou: best_miou = mIoU model_path = os.path.join(self.config.TRAIN.SAVE_DIR, "{}_{}_{}_best.pth" .format(self.config.MODEL.NAME, self.config.TRAIN.SEG_LOSS, self.config.DATASET.NAME)) ptutil.save_model(self.net,model_path,self.logger) if pixAcc is not None: self.logger.info('pixAcc: {:.4f}, mIoU: {:.4f}'.format(pixAcc, mIoU)) self.net.train() if save_to_disk: model_path = os.path.join(self.config.TRAIN.SAVE_DIR, "{}_{}_{}_iter_{:06d}.pth" .format(self.config.MODEL.NAME, self.config.TRAIN.SEG_LOSS, self.config.DATASET.NAME, max_iter)) ptutil.save_model(self.net,model_path,self.logger) total_training_time = int(time.time() - start_training_time) total_time_str = str(datetime.timedelta(seconds=total_training_time)) self.logger.info("Total training time: {} ({:.4f} s / it)".format(total_time_str, total_training_time / max_iter)) # eval after training if not self.config.TRAIN.SKIP_EVAL: metrics = ptutil.validate(self.net,self.valid_loader,self.metric,self.device,self.config) ptutil.synchronize() pixAcc, mIoU = ptutil.accumulate_metric(metrics) if pixAcc is not None: self.logger.info('After training, pixAcc: {:.4f}, mIoU: {:.4f}'.format(pixAcc, mIoU))
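# --- Added sketch (not part of the original script) ---
# Hedged sketch of what a reduce_loss_dict helper commonly does; the actual
# ptutil.reduce_loss_dict used above is not shown here and may differ. Each loss
# term is averaged across ranks so the logged total_loss is comparable no matter
# which rank prints it.
import torch
import torch.distributed as dist

def reduce_loss_dict(loss_dict):
    if not (dist.is_available() and dist.is_initialized()):
        return {k: v.detach() for k, v in loss_dict.items()}
    world_size = dist.get_world_size()
    with torch.no_grad():
        keys = sorted(loss_dict.keys())          # keep ordering consistent across ranks
        stacked = torch.stack([loss_dict[k] for k in keys])
        dist.all_reduce(stacked)                 # sum over ranks, then divide
        return {k: v / world_size for k, v in zip(keys, stacked)}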
default=None, nargs=argparse.REMAINDER) # the parser args = parser.parse_args() return args if __name__ == "__main__": num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args = parse_args() assert args.framework in ( 'ace', 'base', 'distillation', 'gan'), 'unsupported framework: {}'.format(args.framework) config = get_config(args.framework) update_config(config, args) assert num_gpus == len(config.GPUS), 'number of GPUs must match config.GPUS' os.environ['CUDA_VISIBLE_DEVICES'] = str(config.GPUS) log_dir = os.path.join(config.TRAIN.SAVE_DIR, 'log') ptutil.mkdir(log_dir) logger = ptutil.setup_logger('TRAIN', log_dir, ptutil.get_rank(), 'log_{}.txt'.format(args.framework), 'w') logger.info('Using {} GPUs'.format(len(config.GPUS))) logger.info(args) logger.info(config) trainer = get_framework(args.framework, config=config, args=args, logger=logger) trainer.training() torch.cuda.empty_cache()
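# --- Added sketch (not part of the original script) ---
# Hedged sketch of a rank-aware logger in the spirit of ptutil.setup_logger; the
# real helper is not shown here and may differ. Every rank gets a logger object,
# but only rank 0 attaches handlers, so multi-process runs do not interleave
# duplicate log lines.
import logging
import os
import sys

def setup_logger(name, save_dir, rank, filename="log.txt"):
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    if rank != 0:
        return logger                            # non-zero ranks stay silent
    fmt = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")
    stream = logging.StreamHandler(stream=sys.stdout)
    stream.setFormatter(fmt)
    logger.addHandler(stream)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        fh = logging.FileHandler(os.path.join(save_dir, filename), mode="w")
        fh.setFormatter(fmt)
        logger.addHandler(fh)
    return logger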
def main(args): utils.init_distributed_mode(args) update_config_from_file(args.cfg) print(args) args_text = yaml.safe_dump(args.__dict__, default_flow_style=False) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) # random.seed(seed) cudnn.benchmark = True dataset_train, args.nb_classes = build_dataset(is_train=True, args=args) dataset_val, _ = build_dataset(is_train=False, args=args) if args.distributed: num_tasks = utils.get_world_size() global_rank = utils.get_rank() if args.repeated_aug: sampler_train = RASampler(dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True) else: sampler_train = torch.utils.data.DistributedSampler( dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True) if args.dist_eval: if len(dataset_val) % num_tasks != 0: print( 'Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. ' 'This will slightly alter validation results as extra duplicate entries are added to achieve ' 'equal num of samples per-process.') sampler_val = torch.utils.data.DistributedSampler( dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=False) else: sampler_val = torch.utils.data.SequentialSampler(dataset_val) else: sampler_val = torch.utils.data.SequentialSampler(dataset_val) sampler_train = torch.utils.data.RandomSampler(dataset_train) data_loader_train = torch.utils.data.DataLoader( dataset_train, sampler=sampler_train, batch_size=args.batch_size, num_workers=args.num_workers, pin_memory=args.pin_mem, drop_last=True, ) data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size=int( 2 * args.batch_size), sampler=sampler_val, num_workers=args.num_workers, pin_memory=args.pin_mem, drop_last=False) mixup_fn = None mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None if mixup_active: mixup_fn = Mixup(mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, label_smoothing=args.smoothing, num_classes=args.nb_classes) print(f"Creating SuperVisionTransformer") print(cfg) model = Vision_TransformerSuper( img_size=args.input_size, patch_size=args.patch_size, embed_dim=cfg.SUPERNET.EMBED_DIM, depth=cfg.SUPERNET.DEPTH, num_heads=cfg.SUPERNET.NUM_HEADS, mlp_ratio=cfg.SUPERNET.MLP_RATIO, qkv_bias=True, drop_rate=args.drop, drop_path_rate=args.drop_path, gp=args.gp, num_classes=args.nb_classes, max_relative_position=args.max_relative_position, relative_position=args.relative_position, change_qkv=args.change_qkv, abs_pos=not args.no_abs_pos) choices = { 'num_heads': cfg.SEARCH_SPACE.NUM_HEADS, 'mlp_ratio': cfg.SEARCH_SPACE.MLP_RATIO, 'embed_dim': cfg.SEARCH_SPACE.EMBED_DIM, 'depth': cfg.SEARCH_SPACE.DEPTH } model.to(device) if args.teacher_model: teacher_model = create_model( args.teacher_model, pretrained=True, num_classes=args.nb_classes, ) teacher_model.to(device) teacher_loss = LabelSmoothingCrossEntropy(smoothing=args.smoothing) else: teacher_model = None teacher_loss = None model_ema = None model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu], find_unused_parameters=True) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size( ) / 512.0 args.lr = linear_scaled_lr optimizer = create_optimizer(args, model_without_ddp) loss_scaler = NativeScaler() lr_scheduler, _ = create_scheduler(args, optimizer) # criterion = LabelSmoothingCrossEntropy() if args.mixup > 0.: # smoothing is handled with mixup label transform criterion = SoftTargetCrossEntropy() elif args.smoothing: criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing) else: criterion = torch.nn.CrossEntropyLoss() output_dir = Path(args.output_dir) if not output_dir.exists(): output_dir.mkdir(parents=True) # save config for later experiments with open(output_dir / "config.yaml", 'w') as f: f.write(args_text) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if 'scaler' in checkpoint: loss_scaler.load_state_dict(checkpoint['scaler']) if args.model_ema: utils._load_checkpoint_for_ema(model_ema, checkpoint['model_ema']) retrain_config = None if args.mode == 'retrain' and "RETRAIN" in cfg: retrain_config = { 'layer_num': cfg.RETRAIN.DEPTH, 'embed_dim': [cfg.RETRAIN.EMBED_DIM] * cfg.RETRAIN.DEPTH, 'num_heads': cfg.RETRAIN.NUM_HEADS, 'mlp_ratio': cfg.RETRAIN.MLP_RATIO } if args.eval: print(retrain_config) test_stats = evaluate(data_loader_val, model, device, mode=args.mode, retrain_config=retrain_config) print( f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%" ) return print("Start training") start_time = time.time() 
max_accuracy = 0.0 for epoch in range(args.start_epoch, args.epochs): if args.distributed: data_loader_train.sampler.set_epoch(epoch) train_stats = train_one_epoch( model, criterion, data_loader_train, optimizer, device, epoch, loss_scaler, args.clip_grad, model_ema, mixup_fn, amp=args.amp, teacher_model=teacher_model, teach_loss=teacher_loss, choices=choices, mode=args.mode, retrain_config=retrain_config, ) lr_scheduler.step(epoch) if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pth'] for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, # 'model_ema': get_state_dict(model_ema), 'scaler': loss_scaler.state_dict(), 'args': args, }, checkpoint_path) test_stats = evaluate(data_loader_val, model, device, amp=args.amp, choices=choices, mode=args.mode, retrain_config=retrain_config) print( f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%" ) max_accuracy = max(max_accuracy, test_stats["acc1"]) print(f'Max accuracy: {max_accuracy:.2f}%') log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
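# --- Added sketch (not part of the original script) ---
# Tiny illustration of the linear learning-rate scaling rule applied above: the
# base lr is scaled by the global batch size relative to a reference batch of 512.
# The example values are assumptions.
def scale_lr(base_lr, batch_size_per_gpu, world_size, reference_batch=512):
    return base_lr * batch_size_per_gpu * world_size / reference_batch

print(scale_lr(5e-4, 128, 8))   # 8 GPUs x 128 per GPU -> global batch 1024 -> lr 1e-3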
def training(self): self.net.train() save_to_disk = ptutil.get_rank() == 0 start_training_time = time.time() trained_time = 0 tic = time.time() end = time.time() iteration, max_iter = 0, self.args.max_iter save_iter, eval_iter = self.args.per_iter * self.args.save_epoch, self.args.per_iter * self.args.eval_epochs # save_iter, eval_iter = 10, 10 logger.info( "Start training, total epochs {:3d} = total iteration: {:6d}". format(self.args.epochs, max_iter)) for i, (image, target) in enumerate(self.train_loader): iteration += 1 self.scheduler.step() self.optimizer.zero_grad() image, target = image.to(self.device), target.to(self.device) outputs = self.net(image) loss_dict = self.criterion(outputs, target) # reduce losses over all GPUs for logging purposes loss_dict_reduced = ptutil.reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) loss = sum(loss for loss in loss_dict.values()) loss.backward() self.optimizer.step() trained_time += time.time() - end end = time.time() if iteration % args.log_step == 0: eta_seconds = int( (trained_time / iteration) * (max_iter - iteration)) log_str = [ "Iteration {:06d} , Lr: {:.5f}, Cost: {:.2f}s, Eta: {}". format(iteration, self.optimizer.param_groups[0]['lr'], time.time() - tic, str(datetime.timedelta(seconds=eta_seconds))), "total_loss: {:.3f}".format(losses_reduced.item()) ] log_str = ', '.join(log_str) logger.info(log_str) tic = time.time() if save_to_disk and iteration % save_iter == 0: model_path = os.path.join( self.args.save_dir, "{}_iter_{:06d}.pth".format('LEDNet', iteration)) self.save_model(model_path) # Do eval when training, to trace the mAP changes and see performance improved whether or nor if args.eval_epochs > 0 and iteration % eval_iter == 0 and not iteration == max_iter: metrics = self.validate() ptutil.synchronize() pixAcc, mIoU = ptutil.accumulate_metric(metrics) if pixAcc is not None: logger.info('pixAcc: {:.4f}, mIoU: {:.4f}'.format( pixAcc, mIoU)) self.net.train() if save_to_disk: model_path = os.path.join( self.args.save_dir, "{}_iter_{:06d}.pth".format('LEDNet', max_iter)) self.save_model(model_path) # compute training time total_training_time = int(time.time() - start_training_time) total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / max_iter)) # eval after training if not self.args.skip_eval: metrics = self.validate() ptutil.synchronize() pixAcc, mIoU = ptutil.accumulate_metric(metrics) if pixAcc is not None: logger.info( 'After training, pixAcc: {:.4f}, mIoU: {:.4f}'.format( pixAcc, mIoU))
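# --- Added sketch (not part of the original script) ---
# Illustration of the ETA estimate printed in the logging branch above: average
# time per finished iteration multiplied by the iterations that remain. The
# numbers in the usage line are assumptions.
import datetime

def eta_string(trained_time, iteration, max_iter):
    eta_seconds = int((trained_time / iteration) * (max_iter - iteration))
    return str(datetime.timedelta(seconds=eta_seconds))

print(eta_string(trained_time=120.0, iteration=100, max_iter=10000))  # '3:18:00'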
model = self.net torch.save(model.state_dict(), model_path) logger.info("Saved checkpoint to {}".format(model_path)) if __name__ == '__main__': args = parse_args() # device setting num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = num_gpus > 1 args.num_gpus = num_gpus if not args.no_cuda and torch.cuda.is_available(): torch.backends.cudnn.benchmark = True args.device = "cuda" else: args.distributed = False args.device = "cpu" if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method=args.init_method) args.lr = args.lr * args.num_gpus # scale by num gpus logger = ptutil.setup_logger('SSD', args.save_dir, ptutil.get_rank(), 'log_yolo3.txt', 'w') logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) trainer = Trainer(args) trainer.training() torch.cuda.empty_cache()
# init config num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method=args.init_method) ptutil.synchronize() cfg.merge_from_file(args.config_file) cfg.freeze() # logging logger = ptutil.setup_logger("RetinaNet", cfg.CONFIG.save_dir, ptutil.get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) model = get_model(cfg.CONFIG.model, pretrained=cfg.TEST.pretrained) model.to(cfg.MODEL.device) output_dir = cfg.CONFIG.output_dir if output_dir: output_folder = os.path.join(output_dir, "inference", cfg.DATA.dataset) ptutil.mkdir(output_folder) # dataset data_loader = build_dataloader(cfg, False, distributed) inference(model, data_loader,
if __name__ == "__main__": args = parse_args() # device setting num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = num_gpus > 1 args.num_gpus = num_gpus if not args.no_cuda and torch.cuda.is_available(): torch.backends.cudnn.benchmark = True args.device = "cuda" else: args.distributed = False args.device = "cpu" if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method=args.init_method) args.lr = args.lr * args.num_gpus # scale by num gpus logger = ptutil.setup_logger('Segmentation', args.save_dir, ptutil.get_rank(), 'log_seg.txt', 'w') logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) trainer = Trainer(args) trainer.training() torch.cuda.empty_cache()
def batch_predict_test(data_line_, prediction_line_, model, feature_dir_,
                       match_fn_, stop_at=-1):
    """Replay ranked predictions for one question and decide where to stop.

    Earlier experiments also fed NER/POS count features and aggregate
    document/answer score statistics into the stopping classifier; the current
    model only uses the normalized doc score and the answer-score z-score.
    (`feature_dir_` was only needed by those features and is kept for
    signature compatibility.)
    """
    data = json.loads(data_line_)
    answer = [normalize(a) for a in data['answer']]
    prediction = json.loads(prediction_line_)
    ranked_prediction = sorted(prediction, key=lambda k: k['doc_score'],
                               reverse=True)
    correct_rank = get_rank(ranked_prediction, answer, match_fn_)

    total_count_ = 0
    correct_count_ = 0
    if correct_rank > 150:
        print("BAD")
        return 0, 0, 0, ranked_prediction

    all_p_scores = []
    all_a_scores = []
    all_probs = []
    diff = 0
    repeats = 0
    all_spans = []
    es_preds = []
    stop_loc = 0
    for i, entry in enumerate(ranked_prediction):
        es_preds.append(entry)
        doc_score = entry['doc_score']
        ans_score = entry['span_score']
        span_prob = entry['prob']
        span = entry['span']
        if span in all_spans:
            repeats += 1
        all_spans.append(span)
        all_probs.append(span_prob)

        # Sample z-score (t statistic) of the current answer score against the
        # scores seen so far; skip the feature until at least two scores exist.
        if len(all_a_scores) <= 1:
            a_zscore = 0
        else:
            sample_mean = numpy.mean(all_a_scores)
            sample_std = numpy.std(all_a_scores)
            a_zscore = (ans_score - sample_mean) / sample_std

        corr_doc_score = (doc_score - DOC_MEAN) / DOC_STD
        a_zscore_t = torch.FloatTensor([a_zscore])
        corr_doc_score_t = torch.FloatTensor([corr_doc_score])

        all_p_scores.append(doc_score)
        all_a_scores.append(ans_score)

        inputs = torch.cat([corr_doc_score_t, a_zscore_t])
        stop_prob = model.predict(inputs, prob=True)

        if stop_at <= 0:
            print("Prob of STOP = {}, Correct Rank = {}, i = {}, "
                  "answer_score = {}, REPEATS = {}".format(
                      stop_prob, correct_rank, i, ans_score, repeats))
            if stop_prob > 0.95:
                if i + 1 >= correct_rank:
                    correct_count_ += 1
                    diff = i + 1 - correct_rank
                    print("stop_at <= 0, stop_prob > 0.95: CORRECT")
                print("AVG ANS SCORE {}".format(numpy.mean(all_probs)))
                print("STD ANS SCORE {}".format(numpy.std(all_probs)))
                stop_loc = i + 1
                break
            elif i + 1 >= 40:
                print("AVG ANS SCORE {}".format(numpy.mean(all_probs)))
                print("STD ANS SCORE {}".format(numpy.std(all_probs)))
                if i + 1 >= correct_rank:
                    correct_count_ += 1
                    print("stop_at <= 0, stop_prob <= 0.95: CORRECT")
                    diff = i + 1 - correct_rank
                stop_loc = i + 1
                break
        else:
            if i + 1 == stop_at:
                if i + 1 >= correct_rank:
                    correct_count_ += 1
                    diff = i + 1 - correct_rank
                    print("stop_at > 0: CORRECT")
                stop_loc = i + 1
                break

    print("stop at:", stop_loc)
    assert stop_loc == len(es_preds)
    total_count_ += 1
    return correct_count_, total_count_, diff, es_preds
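# The stopping classifier above consumes two features per candidate: the
# normalized document score and a running z-score of the answer score. A small
# self-contained sketch of that z-score computation (names are illustrative,
# and it adds the zero-std guard that batch_predict_test leaves out):
import numpy as np


def running_zscore(history, new_score):
    """Z-score of new_score against previously observed scores.

    Returns 0 until at least two scores have been seen, mirroring the guard in
    batch_predict_test above.
    """
    if len(history) <= 1:
        return 0.0
    mean = np.mean(history)
    std = np.std(history)
    return (new_score - mean) / std if std != 0 else 0.0


# Example: scores arrive one at a time while walking down the ranked list.
seen = []
for s in [12.3, 11.8, 25.0]:
    print(running_zscore(seen, s))
    seen.append(s)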
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    if args.distillation_type != 'none' and args.finetune and not args.eval:
        raise NotImplementedError("Finetuning with distillation not yet supported")

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)

    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)

    if True:  # args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            sampler_train = RASampler(dataset_train,
                                      num_replicas=num_tasks,
                                      rank=global_rank,
                                      shuffle=True)
        else:
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=True)
        if args.dist_eval:
            if len(dataset_val) % num_tasks != 0:
                print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                      'This will slightly alter validation results as extra duplicate entries are added to achieve '
                      'equal num of samples per-process.')
            sampler_val = torch.utils.data.DistributedSampler(
                dataset_val,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        sampler=sampler_val,
        batch_size=int(1.5 * args.batch_size),
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=False,
    )

    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print(f"Creating model: {args.model}")
    model = create_model(
        args.model,
        pretrained=False,
        num_classes=args.nb_classes,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        drop_block_rate=None,
    )

    if args.finetune:
        if args.finetune.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.finetune,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.finetune, map_location='cpu')

        checkpoint_model = checkpoint['model']
        state_dict = model.state_dict()
        for k in ['head.weight', 'head.bias', 'head_dist.weight', 'head_dist.bias']:
            if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape:
                print(f"Removing key {k} from pretrained checkpoint")
                del checkpoint_model[k]

        # interpolate position embedding
        pos_embed_checkpoint = checkpoint_model['pos_embed']
        embedding_size = pos_embed_checkpoint.shape[-1]
        num_patches = model.patch_embed.num_patches
        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # height (== width) for the checkpoint position embedding
        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
        # height (== width) for the new position embedding
        new_size = int(num_patches ** 0.5)
        # class_token and dist_token are kept unchanged
        extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
        # only the position tokens are interpolated
        pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
        pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size,
                                        embedding_size).permute(0, 3, 1, 2)
        pos_tokens = torch.nn.functional.interpolate(pos_tokens,
                                                     size=(new_size, new_size),
                                                     mode='bicubic',
                                                     align_corners=False)
        pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
        new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
        checkpoint_model['pos_embed'] = new_pos_embed

        model.load_state_dict(checkpoint_model, strict=False)

    model.to(device)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but
        # before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume='')

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size() / 512.0
    args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()

    lr_scheduler, _ = create_scheduler(args, optimizer)

    criterion = LabelSmoothingCrossEntropy()
    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    teacher_model = None
    if args.distillation_type != 'none':
        assert args.teacher_path, 'need to specify teacher-path when using distillation'
        print(f"Creating teacher model: {args.teacher_model}")
        teacher_model = create_model(
            args.teacher_model,
            pretrained=False,
            num_classes=args.nb_classes,
            global_pool='avg',
        )
        if args.teacher_path.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.teacher_path,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.teacher_path, map_location='cpu')
        teacher_model.load_state_dict(checkpoint['model'])
        teacher_model.to(device)
        teacher_model.eval()

    # wrap the criterion in our custom DistillationLoss, which just dispatches
    # to the original criterion if args.distillation_type == 'none'
    criterion = DistillationLoss(criterion, teacher_model, args.distillation_type,
                                 args.distillation_alpha, args.distillation_tau)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if args.model_ema:
                utils._load_checkpoint_for_ema(model_ema, checkpoint['model_ema'])
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])

    if args.eval:
        test_stats = evaluate(data_loader_val, model, device)
        print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%")
        return

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    max_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            loss_scaler, args.clip_grad, model_ema, mixup_fn,
            set_training_mode=args.finetune == ''  # keep in eval mode during finetuning
        )

        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / ('checkpoint_%04d.pth' % epoch)]
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'model_ema': get_state_dict(model_ema),
                        'scaler': loss_scaler.state_dict(),
                        'args': args,
                    }, checkpoint_path)

        if not args.train_without_eval:
            test_stats = evaluate(data_loader_val, model, device)
            print(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%")
            max_accuracy = max(max_accuracy, test_stats["acc1"])
            print(f'Max accuracy: {max_accuracy:.2f}%')

            log_stats = {
                **{f'train_{k}': v for k, v in train_stats.items()},
                **{f'test_{k}': v for k, v in test_stats.items()},
                'epoch': epoch,
                'n_parameters': n_parameters
            }
        else:
            log_stats = {
                **{f'train_{k}': v for k, v in train_stats.items()},
                'epoch': epoch,
                'n_parameters': n_parameters
            }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
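# main() above rescales the base learning rate linearly with the effective
# batch size (per-GPU batch * world size) relative to a reference batch of 512,
# i.e. the usual linear scaling rule. A standalone sketch of that arithmetic
# (the function name is illustrative, not part of the training script):
def linear_scaled_lr(base_lr: float, batch_size_per_gpu: int,
                     world_size: int, reference_batch: int = 512) -> float:
    """Scale base_lr proportionally to the global batch size."""
    return base_lr * batch_size_per_gpu * world_size / reference_batch


# e.g. base_lr=5e-4 with 128 images/GPU on 8 GPUs -> global batch 1024 -> lr 1e-3
assert abs(linear_scaled_lr(5e-4, 128, 8) - 1e-3) < 1e-12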
def main():
    global timeout_sent

    args = parse_arguments()

    random.seed(args.seed + args.local_rank)
    np.random.seed(args.seed + args.local_rank)
    torch.manual_seed(args.seed + args.local_rank)
    torch.cuda.manual_seed(args.seed + args.local_rank)
    worker_init = WorkerInitObj(args.seed + args.local_rank)

    device, args = setup_training(args)

    # Prepare optimizer
    (
        model,
        optimizer,
        lr_scheduler,
        checkpoint,
        global_step,
        criterion,
    ) = prepare_model_and_optimizer(args, device)

    raw_train_start = None
    most_recent_ckpts_paths = []
    average_loss = 0.0  # averaged loss every args.log_freq steps
    epoch = 0
    training_steps = 0
    test_losses = []

    pool = ProcessPoolExecutor(1)

    # Note: We loop infinitely over epochs, termination is handled via iteration count
    while True:
        thread = None
        restored_data_loader = None
        if (not args.resume_from_checkpoint or epoch > 0
                or (args.phase2 and global_step < 1) or args.init_checkpoint):
            files = [
                os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
                if os.path.isfile(os.path.join(args.input_dir, f)) and "training" in f
            ]
            files.sort()
            num_files = len(files)
            random.Random(args.seed + epoch).shuffle(files)
            f_start_id = 0
        else:
            f_start_id = checkpoint["files"][0]
            files = checkpoint["files"][1:]
            args.resume_from_checkpoint = False
            num_files = len(files)
            # may not exist in all checkpoints
            epoch = checkpoint.get("epoch", 0)
            restored_data_loader = checkpoint.get("data_loader", None)

        shared_file_list = {}

        if smp.is_initialized():
            dpsize = smp.dp_size()
            dprank = smp.dp_rank()
        elif torch.distributed.is_initialized():
            dpsize = get_world_size()
            dprank = get_rank()
        else:
            dpsize = 1
            dprank = 0
        dparallel = dpsize > 1
        if dparallel and dpsize > num_files:
            remainder = dpsize % num_files
            data_file = files[(f_start_id * dpsize + dprank + remainder * f_start_id) % num_files]
        else:
            data_file = files[(f_start_id * dpsize + dprank) % num_files]

        previous_file = data_file

        if restored_data_loader is None:
            train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(
                train_data,
                sampler=train_sampler,
                batch_size=args.train_batch_size * args.n_gpu,
                num_workers=4,
                worker_init_fn=worker_init,
                pin_memory=True,
                drop_last=True,
            )
            # shared_file_list["0"] = (train_dataloader, data_file)
        else:
            train_dataloader = restored_data_loader
            restored_data_loader = None

        overflow_buf = None
        if args.allreduce_post_accumulation:
            overflow_buf = torch.cuda.IntTensor([0])

        for f_id in range(f_start_id + 1, len(files)):
            if get_world_size() > num_files:
                data_file = files[(f_id * get_world_size() + get_rank() + remainder * f_id) % num_files]
            else:
                data_file = files[(f_id * get_world_size() + get_rank()) % num_files]

            previous_file = data_file
            dataset_future = pool.submit(
                create_pretraining_dataset,
                data_file,
                args.max_predictions_per_seq,
                shared_file_list,
                args,
                worker_init,
            )

            train_iter = (tqdm(train_dataloader, desc="Iteration",
                               disable=args.disable_progress_bar)
                          if is_main_process() else train_dataloader)

            if raw_train_start is None:
                raw_train_start = time.time()

            for step, batch in enumerate(train_iter):
                training_steps += 1
                batch = [t.to(device) for t in batch]
                input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch

                if args.do_train:
                    from smdistributed.modelparallel.test.torch.utils import dump_model, verify

                    model.train()
                    if args.smp > 0:
                        loss_mbs = smp_step(
                            args, device, input_ids, segment_ids, input_mask,
                            masked_lm_labels, next_sentence_labels, model,
                            optimizer, criterion, step,
                        )
                        loss = loss_mbs.reduce_mean()
                        if smp.rank() == 0:
                            print("Loss:", loss.item())
                    else:
                        loss = train_step(
                            args, device, input_ids, segment_ids, input_mask,
                            masked_lm_labels, next_sentence_labels, model,
                            optimizer, criterion, step,
                        )
                    divisor = 1
                    average_loss += loss.item()

                    if training_steps % args.gradient_accumulation_steps == 0:
                        lr_scheduler.step()  # learning rate warmup
                        global_step = take_optimizer_step(args, optimizer, model,
                                                          overflow_buf, global_step)

                    if global_step >= args.steps_this_run or timeout_sent:
                        train_time_raw = time.time() - raw_train_start
                        last_num_steps = (int(training_steps / args.gradient_accumulation_steps)
                                          % args.log_freq)
                        last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
                        average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
                        average_loss = average_loss / (last_num_steps * divisor)
                        if torch.distributed.is_initialized():
                            average_loss /= get_world_size()
                            torch.distributed.all_reduce(average_loss)
                        final_loss = loss.item()
                    elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
                        average_loss = 0

                    if (global_step >= args.steps_this_run
                            or training_steps % (args.num_steps_per_checkpoint
                                                 * args.gradient_accumulation_steps) == 0
                            or timeout_sent):
                        if smp.dp_rank() == 0 and not args.skip_checkpoint:
                            if args.resume_step < 0 or not args.phase2:
                                output_save_file = os.path.join(
                                    args.output_dir, "ckpt_{}.pt".format(global_step))
                            else:
                                output_save_file = os.path.join(
                                    args.output_dir,
                                    "ckpt_{}.pt".format(global_step + args.phase1_end_step),
                                )
                            if args.do_train:
                                save_dict = {
                                    "model": model.local_state_dict(),
                                    "optimizer": optimizer.local_state_dict(),
                                    "files": [f_id] + files,
                                    "epoch": epoch,
                                    "data_loader": None
                                    if global_step >= args.steps_this_run else train_dataloader,
                                }
                                if args.fp16:
                                    save_dict["master params"] = list(amp.master_params(optimizer))
                                # SMP: Checkpoint mp_rank specific state
                                smp.save(save_dict, output_save_file, partial=True)

                                most_recent_ckpts_paths.append(output_save_file)
                                if len(most_recent_ckpts_paths) > 3 and (
                                        args.smp == 0 or smp.dp_rank() == 0):
                                    ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
                                    os.remove(ckpt_to_be_removed + f"_{smp.mp_rank()}")

                        # Exiting the training due to hitting max steps, or being sent a
                        # timeout from the cluster scheduler
                        if global_step >= args.steps_this_run or timeout_sent:
                            del train_dataloader
                            # thread.join()
                            if smp.dp_rank() == 0 and args.save_full:
                                output_save_file = os.path.join(
                                    args.output_dir, "ckpt_{}.pt".format(global_step))
                                save_dict = {
                                    "model": model.local_state_dict(),
                                    "optimizer": optimizer.local_state_dict(),
                                    "files": [f_id] + files,
                                    "epoch": epoch,
                                    "data_loader": None
                                    if global_step >= args.steps_this_run else train_dataloader,
                                }
                                if args.fp16:
                                    save_dict["master params"] = list(amp.master_params(optimizer))
                                # SMP: Save a single checkpoint containing entire model parameters
                                smp.save(save_dict, output_save_file, partial=False)
                            smp.barrier()
                            if smp.local_rank() == 0:
                                print("Start syncing model checkpoints to s3")
                                base_s3_path = os.path.dirname(
                                    os.path.dirname(os.getenv("SM_MODULE_DIR", "")))
                                curr_host = os.getenv("SM_CURRENT_HOST")
                                full_s3_path = f"{base_s3_path}/checkpoints/{curr_host}/"
                                sync_local_checkpoints_to_s3(local_path=args.output_dir,
                                                             s3_path=full_s3_path)
                                print("Finished syncing model checkpoints to s3")
                            return args, final_loss, train_time_raw, global_step
                else:
                    model.eval()
                    with torch.no_grad():
                        loss = test_step(
                            args, device, input_ids, segment_ids, input_mask,
                            masked_lm_labels, next_sentence_labels, model,
                            criterion, step,
                        )
                        print(f"global_step {global_step} Test Loss:", loss)
                        test_losses.append(loss)
                    global_step += 1
                    if global_step >= args.steps_this_run:
                        return sum(test_losses) / len(test_losses)

            del train_dataloader
            # thread.join()
            # Make sure pool has finished and switch train_dataloader
            # NOTE: Will block until complete
            train_dataloader, data_file = dataset_future.result(timeout=None)

        epoch += 1