def all_gather_list(data, group=None, max_size=16384):
    """Gathers arbitrary data from all nodes into a list.

    Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python
    data. Note that *data* must be picklable and any CUDA tensors will be moved
    to CPU and returned on CPU as well.

    Args:
        data (Any): data from the local worker to be gathered on other workers
        group: group of the collective
        max_size (int, optional): maximum size of the data to be gathered
            across workers
    """
    from fairseq import utils

    if group is None:
        group = get_global_group()
    rank = get_rank(group=group)
    world_size = get_world_size(group=group)

    buffer_size = max_size * world_size
    if (
        not hasattr(all_gather_list, "_buffer")
        or all_gather_list._buffer.numel() < buffer_size
    ):
        all_gather_list._buffer = torch.cuda.ByteTensor(buffer_size)
        all_gather_list._cpu_buffer = torch.ByteTensor(max_size).pin_memory()
    buffer = all_gather_list._buffer
    buffer.zero_()
    cpu_buffer = all_gather_list._cpu_buffer

    data = utils.move_to_cpu(data)
    enc = pickle.dumps(data)
    enc_size = len(enc)
    header_size = 4  # size of header that contains the length of the encoded data
    size = header_size + enc_size
    if size > max_size:
        raise ValueError(
            "encoded data size ({}) exceeds max_size ({})".format(size, max_size)
        )

    header = struct.pack(">I", enc_size)
    cpu_buffer[:size] = torch.ByteTensor(list(header + enc))
    start = rank * max_size
    buffer[start : start + size].copy_(cpu_buffer[:size])

    all_reduce(buffer, group=group)

    buffer = buffer.cpu()
    try:
        result = []
        for i in range(world_size):
            out_buffer = buffer[i * max_size : (i + 1) * max_size]
            (enc_size,) = struct.unpack(">I", bytes(out_buffer[:header_size].tolist()))
            if enc_size > 0:
                result.append(
                    pickle.loads(
                        bytes(out_buffer[header_size : header_size + enc_size].tolist())
                    )
                )
        return result
    except pickle.UnpicklingError:
        raise Exception(
            "Unable to unpickle data from other workers. all_gather_list requires all "
            "workers to enter the function together, so this error usually indicates "
            "that the workers have fallen out of sync somehow. Workers can fall out of "
            "sync if one of them runs out of memory, or if there are other conditions "
            "in your training script that can cause one worker to finish an epoch "
            "while other workers are still iterating over their portions of the data. "
            "Try rerunning with --ddp-backend=legacy_ddp and see if that helps."
        )
def all_gather_list(data, group=None, max_size=16384):
    """Gathers arbitrary data from all nodes into a list.

    Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python
    data. Note that *data* must be picklable.

    Args:
        data (Any): data from the local worker to be gathered on other workers
        group (optional): group of the collective
        max_size (int, optional): maximum size of the data to be gathered
            across workers
    """
    rank = get_rank()
    world_size = get_world_size()

    buffer_size = max_size * world_size
    if (
        not hasattr(all_gather_list, '_buffer')
        or all_gather_list._buffer.numel() < buffer_size
    ):
        all_gather_list._buffer = torch.cuda.ByteTensor(buffer_size)
        all_gather_list._cpu_buffer = torch.ByteTensor(max_size).pin_memory()
    buffer = all_gather_list._buffer
    buffer.zero_()
    cpu_buffer = all_gather_list._cpu_buffer

    data = utils.move_to_cpu(data)
    enc = pickle.dumps(data)
    enc_size = len(enc)
    if enc_size + 2 > max_size:
        raise ValueError(
            'encoded data size ({}) exceeds max_size ({})'.format(enc_size + 2, max_size)
        )
    assert max_size < 255 * 256
    # 2-byte header storing the length of the pickled payload;
    # this encoding works for max_size < 65k
    cpu_buffer[0] = enc_size // 255
    cpu_buffer[1] = enc_size % 255
    cpu_buffer[2:enc_size + 2] = torch.ByteTensor(list(enc))
    start = rank * max_size
    size = enc_size + 2
    buffer[start:start + size].copy_(cpu_buffer[:size])

    all_reduce(buffer, group=group)

    try:
        result = []
        for i in range(world_size):
            out_buffer = buffer[i * max_size:(i + 1) * max_size]
            size = (255 * utils.item(out_buffer[0])) + utils.item(out_buffer[1])
            if size > 0:
                result.append(
                    pickle.loads(bytes(out_buffer[2:size + 2].tolist()))
                )
        return result
    except pickle.UnpicklingError:
        raise Exception(
            'Unable to unpickle data from other workers. all_gather_list requires all '
            'workers to enter the function together, so this error usually indicates '
            'that the workers have fallen out of sync somehow. Workers can fall out of '
            'sync if one of them runs out of memory, or if there are other conditions '
            'in your training script that can cause one worker to finish an epoch '
            'while other workers are still iterating over their portions of the data.'
        )
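# Usage sketch (not part of fairseq): how all_gather_list is typically called to
# aggregate per-worker logging statistics. It assumes torch.distributed has
# already been initialized on every worker and that all workers enter the call
# together; the `stats` dict and the `aggregate_stats` helper below are
# hypothetical illustrations, not fairseq APIs.
def aggregate_stats(local_ntokens, local_loss, group=None):
    # Each worker contributes a small picklable payload.
    stats = {"ntokens": local_ntokens, "loss": local_loss}
    # Collective call: returns a list with one entry per rank, in rank order.
    gathered = all_gather_list(stats, group=group, max_size=16384)
    total_tokens = sum(s["ntokens"] for s in gathered)
    avg_loss = sum(s["loss"] * s["ntokens"] for s in gathered) / total_tokens
    return total_tokens, avg_loss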
def save_state(
    filename,
    cfg: FairseqConfig,
    model_state_dict,
    criterion,
    optimizer,
    lr_scheduler,
    num_updates,
    optim_history=None,
    extra_state=None,
    **kwargs,
):
    from fairseq import utils

    if optim_history is None:
        optim_history = []
    if extra_state is None:
        extra_state = {}
    state_dict = {
        "cfg": cfg,
        "args": kwargs.get("args", None),
        "model": model_state_dict or {},
        "optimizer_history": optim_history
        + [
            {
                "criterion_name": criterion.__class__.__name__,
                "optimizer_name": optimizer.__class__.__name__,
                "lr_scheduler_state": lr_scheduler.state_dict(),
                "num_updates": num_updates,
            }
        ],
        "extra_state": extra_state,
    }
    if utils.has_parameters(criterion):
        state_dict["criterion"] = criterion.state_dict()

    if cfg is None:
        cfg = state_dict["args"]
        assert cfg is not None, "must provide cfg or args"

    if isinstance(cfg, DictConfig):
        no_save_optimizer_state = cfg.checkpoint.no_save_optimizer_state
    else:
        no_save_optimizer_state = cfg.no_save_optimizer_state
    if not no_save_optimizer_state:
        state_dict["last_optimizer_state"] = optimizer.state_dict()

    # keep everything on CPU
    state_dict = utils.move_to_cpu(state_dict)

    if PathManager.supports_rename(filename):
        # do atomic save
        with PathManager.open(filename + ".tmp", "wb") as f:
            torch_persistent_save(state_dict, f)
        PathManager.rename(filename + ".tmp", filename)
    else:
        # fallback to non-atomic save
        with PathManager.open(filename, "wb") as f:
            torch_persistent_save(state_dict, f)
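# Usage sketch (assumption, not fairseq code): one way save_state might be
# invoked from a training loop. The trainer-like object and the
# save_checkpoint_sketch name are hypothetical stand-ins; in fairseq this call
# is normally made through checkpoint_utils.save_checkpoint / the Trainer.
def save_checkpoint_sketch(filename, cfg, trainer, epoch):
    save_state(
        filename,
        cfg,
        trainer.model.state_dict(),   # model weights to store under "model"
        trainer.criterion,
        trainer.optimizer,
        trainer.lr_scheduler,
        trainer.get_num_updates(),
        optim_history=None,
        # arbitrary picklable metadata ends up under "extra_state"
        extra_state={"epoch": epoch},
    )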
def main(args):
    state = checkpoint_utils.load_checkpoint_to_cpu(args.checkpoint)
    ns = state["args"]
    model = state["model"]

    ns.arch = "transformer_modular"
    if (
        args.encoder_attention_heads_active is None
        and args.decoder_attention_heads_active is None
    ):
        raise ValueError(
            'Either --encoder-attention-heads-active or '
            '--decoder-attention-heads-active option must be set.'
        )
    if args.encoder_attention_heads_active is None:
        args.encoder_attention_heads_active = args.decoder_attention_heads_active

    if args.encoder_modular_layer_indices is not None:
        ns.encoder_modular_layer_indices = "({})".format(
            args.encoder_modular_layer_indices
        )
        model = convert_model(model, ns, coder="encoder", att_type="self_attn")
    if args.decoder_modular_layer_indices is not None:
        ns.decoder_modular_layer_indices = "({})".format(
            args.decoder_modular_layer_indices
        )
        model = convert_model(model, ns, coder="decoder", att_type="self_attn")
        model = convert_model(model, ns, coder="decoder", att_type="encoder_attn")

    ctrl_enc = ModularCtrl(
        ns.encoder_embed_dim,
        ns.encoder_attention_heads,
        args.encoder_attention_heads_active,
        hidden_depth=args.ctrl_hidden_depth,
        hidden_dim=args.ctrl_hidden_dim,
        ctrl_type=args.ctrl_type,
    )
    ns.module_ctrl_hidden_depth = args.ctrl_hidden_depth
    ns.module_ctrl_hidden_dim = args.ctrl_hidden_dim
    ns.module_ctrl_type = args.ctrl_type
    for k, v in ctrl_enc.state_dict().items():
        model["encoder.module_ctrl.{}".format(k)] = v

    if not args.share_encoder_ctrl:
        if args.decoder_attention_heads_active is None:
            raise ValueError(
                "Missing ``decoder-attention-heads-active'' "
                "when ``share-encoder-ctrl'' is disabled."
            )
        ns.share_encoder_ctrl = False
        ctrl_dec = ModularCtrl(
            ns.decoder_embed_dim,
            ns.decoder_attention_heads,
            args.decoder_attention_heads_active,
            hidden_depth=args.ctrl_hidden_depth,
            hidden_dim=args.ctrl_hidden_dim,
            ctrl_type=args.ctrl_type,
        )
        for k, v in ctrl_dec.state_dict().items():
            model["decoder.module_ctrl.{}".format(k)] = v
    else:
        ns.share_encoder_ctrl = True

    ns.arch = "transformer_modular"
    ns.criterion = "label_smoothed_cross_entropy_modular"
    ns.task = "translation_modular"
    ns.encoder_attention_heads_active = args.encoder_attention_heads_active

    state["args"] = ns
    state["model"] = model

    for i, _ in enumerate(state["optimizer_history"]):
        state["optimizer_history"][i][
            "criterion_name"
        ] = 'LabelSmoothedCrossEntropyModularCriterion'

    state = utils.move_to_cpu(state)
    with PathManager.open(args.save_as, "wb") as f:
        checkpoint_utils.torch_persistent_save(state, f)
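# Usage sketch (assumption): driving the conversion entry point directly with an
# argparse-style namespace instead of the command line. The attribute names
# mirror the fields read inside main() above; the checkpoint paths and the
# hyperparameter values (heads, ctrl_type, layer indices) are hypothetical
# examples, not values taken from the original script.
def example_convert():
    from argparse import Namespace

    example_args = Namespace(
        checkpoint="checkpoints/baseline/checkpoint_best.pt",   # hypothetical path
        save_as="checkpoints/modular/checkpoint_converted.pt",  # hypothetical path
        encoder_attention_heads_active=4,
        decoder_attention_heads_active=None,
        encoder_modular_layer_indices="0,1,2,3,4,5",
        decoder_modular_layer_indices=None,
        ctrl_hidden_depth=1,
        ctrl_hidden_dim=256,
        ctrl_type="joint",
        share_encoder_ctrl=True,
    )
    main(example_args)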