class Miner():

    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = GPT2Synapse(self.config)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(GPT2Synapse, AdamW)

        # ---- Optimizer ----
        self.optimizer = self.configure_optimizers()
        self.lr = self.config.miner.learning_rate
        self.training_loss = math.inf
        self.best_train_loss = math.inf

        # ---- Dataset ----
        # The Genesis Dataset:
        # The dataset used to train Adam and his first 100 children.
        # Here block size = sequence length.
        self.dataset = AdamCorpus(self.model.get_block_size())
        self.tokens = 0

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir=self.config.miner.full_path)
        if self.config.miner.record_log:
            filepath = self.config.miner.full_path + "/{}_{}.log".format(self.config.miner.name, self.config.miner.trial_uid)
            logger.add(filepath, format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", rotation="250 MB", retention="10 days")

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--miner.learning_rate', default=3e-2, type=float, help='Training initial learning rate.')
        parser.add_argument('--miner.weight_decay', default=0.25, type=float, help='Model parameter weight decay.')
        parser.add_argument('--miner.lr_decay', default=True, help='Learning rate decay params: linear warmup followed by cosine decay to 10%% of original.')
        parser.add_argument('--miner.warmup_tokens', default=375e6, type=float, help='A linear LR warmup over the first miner.warmup_tokens tokens (default is 375 million).')
        parser.add_argument('--miner.final_tokens', default=260e9, type=float, help='At what point we reach 10%% of the original LR.')
        parser.add_argument('--miner.num_workers', default=1, type=int, help='Number of workers for the data loader.')
        parser.add_argument('--miner.clip_gradients', default=1.0, type=float, help='Implement gradient clipping to avoid exploding loss on smaller architectures.')
        parser.add_argument('--miner.n_epochs', default=int(sys.maxsize), type=int, help='Number of training epochs.')
        parser.add_argument('--miner.epoch_length', default=500, type=int, help='Iterations of training per epoch.')
        parser.add_argument('--miner.batch_size_train', default=2, type=int, help='Training batch size.')
        parser.add_argument('--miner.root_dir', default='~/.bittensor/miners/', type=str, help='Root path to load and save data associated with each miner.')
        parser.add_argument('--miner.name', default='gpt2-genesis', type=str, help='Trials for this miner go in miner.root / miner.name.')
        parser.add_argument('--miner.trial_uid', default=str(time.time()).split('.')[0], type=str, help='Saved models go in miner.root_dir / miner.name / miner.uid.')
        parser.add_argument('--miner.record_log', default=False, help='Record all logs when running this miner.')
        parser.add_argument('--miner.custom_dataset', default="~/.bittensor/bittensor/miners/TEXT/gpt2_genesis/genesis_dataset/", type=str, help='Custom datasets to train on.')
        parser.add_argument('--miner.config_file', type=str, help='Config file to run this neuron, if not using command line arguments.')
        GPT2Synapse.add_args(parser)
        bittensor.neuron.Neuron.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        assert config.miner.batch_size_train > 0, "batch_size_train must be a positive value"
        assert config.miner.learning_rate > 0, "learning_rate must be a positive value."
        config.miner.custom_dataset = os.path.expanduser(config.miner.custom_dataset)
        full_path = '{}/{}/{}'.format(config.miner.root_dir, config.miner.name, config.miner.trial_uid)
        config.miner.full_path = os.path.expanduser(full_path)
        if not os.path.exists(config.miner.full_path):
            os.makedirs(config.miner.full_path)

    def configure_optimizers(self):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.
        """
        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding, torch.nn.Tanh)
        for mn, m in self.model.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # special case the position embedding parameter in the root GPT module as not decayed
        no_decay.add('pos_emb')

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.model.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
            % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": self.config.miner.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=self.config.miner.learning_rate, betas=(0.9, 0.95))
        return optimizer

    # --- Main loop ----
    def run(self):

        # ---- Subscribe ----
        with self.neuron:

            # ---- Weights ----
            self.row = self.neuron.metagraph.row.to(self.model.device)

            # --- Run state ---
            self.global_step = 0

            # --- Loop for epochs ---
            for self.epoch in range(self.config.miner.n_epochs):

                # ---- Serve ----
                self.neuron.axon.serve(self.model)

                # ---- Train Model ----
                self.train()

                # If the model has borked for some reason, we need to make sure it doesn't emit weights.
                # Instead, reload the previous version of the model.
                if torch.any(torch.isnan(torch.cat([param.view(-1) for param in self.model.parameters()]))):
                    self.model, self.optimizer = self.model_toolbox.load_model(self.config)
                    continue

                # ---- Emitting weights ----
                self.neuron.metagraph.set_weights(self.row, wait_for_inclusion=True)  # Sets my row-weights on the chain.

                # ---- Sync metagraph ----
                self.neuron.metagraph.sync()  # Pulls the latest metagraph state (with my update.)
                self.row = self.neuron.metagraph.row.to(self.model.device)

                # ---- Update Tensorboard ----
                self.neuron.dendrite.__to_tensorboard__(self.tensorboard, self.global_step)
                self.neuron.metagraph.__to_tensorboard__(self.tensorboard, self.global_step)
                self.neuron.axon.__to_tensorboard__(self.tensorboard, self.global_step)

                # ---- Save best loss and model ----
                if self.training_loss < self.best_train_loss:
                    self.best_train_loss = self.training_loss  # update best train loss
                    self.model_toolbox.save_model(
                        self.config.miner.full_path,
                        {
                            'epoch': self.epoch,
                            'model_state_dict': self.model.state_dict(),
                            'loss': self.best_train_loss,
                            'optimizer_state_dict': self.optimizer.state_dict(),
                        }
                    )
                    self.tensorboard.add_scalar('Neuron/Train_loss', self.training_loss, self.global_step)

                logger.info("This epoch's training loss: {}...Current best training loss: {}".format(self.training_loss, self.best_train_loss))

    def decay_learning_rate(self, batch):
        """Decay the learning rate based on the progress thus far.
        Adjusts self.lr (and the optimizer's learning rate) according to the number of tokens processed so far.

        Args:
            batch (torch.Tensor): the current training batch, used to count processed tokens.
        """
        if self.config.miner.lr_decay:
            # number of tokens processed this step
            self.tokens += (batch >= 0).sum()
            if self.tokens < self.config.miner.warmup_tokens:
                # linear warmup
                lr_mult = float(self.tokens) / float(max(1, self.config.miner.warmup_tokens))
            else:
                # cosine learning rate decay
                progress = float(self.tokens - self.config.miner.warmup_tokens) / float(max(1, self.config.miner.final_tokens - self.config.miner.warmup_tokens))
                lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))

            self.lr = self.config.miner.learning_rate * lr_mult

            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.lr
        else:
            self.lr = self.config.miner.learning_rate

    def shuffle_dataset_epoch_length(self):
        """Shuffles the miner's dataset and returns a randomized subset of length miner.epoch_length.

        Returns:
            [list]: shuffled dataset of length miner.epoch_length
        """
        shuffled_dataset = []
        loader = DataLoader(self.dataset, shuffle=True, batch_size=self.config.miner.batch_size_train, num_workers=self.config.miner.num_workers)
        for it, batch in enumerate(loader):
            if it >= self.config.miner.epoch_length:
                break
            shuffled_dataset.append(batch)
        return shuffled_dataset

    def get_lr(self):
        for param_group in self.optimizer.param_groups:
            return param_group['lr']

    # ---- Train Epoch ----
    def train(self):

        def run_epoch():
            self.model.train(True)
            losses = []

            # Re-create the dataloader every time we call train.
            # This way, since epoch_length < len(dataset), we can make sure that the
            # dataset is randomly shuffled each time we train for an epoch.
            logger.info("Preparing dataset batch...")
            dataset = self.shuffle_dataset_epoch_length()
            pbar = qqdm(enumerate(dataset), total=len(dataset), desc=format_str('blue', 'Epoch Progress'))
            for it, (batch) in pbar:
                batch = batch.to(self.model.device)
                output = self.model.remote_forward(self.neuron, batch, training=True)
                loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
                loss.backward()
                clip_grad_norm_(self.model.parameters(), self.config.miner.clip_gradients)
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.decay_learning_rate(batch)
                losses.append(loss.item())

                # ---- Train row weights ----
                batch_weights = torch.mean(output.router.weights, axis=0).to(self.model.device)  # Average over batch.
                self.row = (1 - 0.03) * self.row + 0.03 * batch_weights  # Moving avg update.
                self.row = F.normalize(self.row, p=1, dim=0)  # Ensure normalization.

                pbar.set_infos({
                    'GS': colored('{}'.format(self.global_step), 'red'),
                    'LS': colored('{}'.format(it), 'blue'),
                    'Epoch': colored('{}'.format(self.epoch + 1), 'green'),
                    'Local loss': colored('{:.5f}'.format(output.local_target_loss.item()), 'red'),
                    'Remote loss': colored('{:.5f}'.format(output.remote_target_loss.item()), 'blue'),
                    'Distillation loss': colored('{:.5f}'.format(output.distillation_loss.item()), 'green'),
                    'Learning Rate': colored('{:e}'.format(self.lr), 'white'),
                    'Axon': self.neuron.axon.__str__(),
                    'Dendrite': self.neuron.dendrite.__str__(),
                })
                self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step)
                self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step)
                self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step)
                self.global_step += 1

            avg_loss = sum(losses) / len(losses)
            self.training_loss = avg_loss

        run_epoch()
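
# A minimal usage sketch (an assumption, not part of the original file): with the imports
# above resolved, the miner is built from its argparse defaults and the training loop started.
if __name__ == "__main__":
    # Builds the default config via Miner.default_config(), then runs the main loop.
    miner = Miner()
    miner.run()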
class Miner(): """ Initializes, trains, and tests models created inside of 'bittensor/synapses'. During instantiation, this class takes a config as a [Munch](https://github.com/Infinidat/munch) object. """ def __init__(self, config: Munch = None, **kwargs): if config == None: config = Miner.default_config() bittensor.config.Config.update_with_kwargs(config.miner, kwargs) Miner.check_config(config) self.config = config # ---- Neuron ---- self.neuron = bittensor.neuron.Neuron(self.config) # ---- Model ---- self.model = XLMSynapse(self.config) # ---- Optimizer ---- self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.config.miner.learning_rate, momentum=self.config.miner.momentum) self.scheduler = WarmupCosineWithHardRestartsSchedule( self.optimizer, 50, 300) # ---- Model Load/Save tools ---- self.model_toolbox = ModelToolbox(XLMSynapse, torch.optim.SGD) # ---- Dataset ---- # Dataset: 74 million sentences pulled from books. self.dataset = load_dataset('amazon_reviews_multi', 'en')['train'] self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") if self.config.synapse.device: self.device = torch.device(self.config.synapse.device) # ---- Logging ---- self.tensorboard = SummaryWriter(log_dir=self.config.miner.full_path) if self.config.miner.record_log == True: filepath = f"{self.config.miner.full_path}/{self.config.miner.name}_ {self.config.miner.trial_uid}.log" logger.add( filepath, format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", rotation="250 MB", retention="10 days") @staticmethod def default_config() -> Munch: parser = argparse.ArgumentParser() Miner.add_args(parser) config = bittensor.config.Config.to_config(parser) return config @staticmethod def add_args(parser: argparse.ArgumentParser): parser.add_argument('--miner.learning_rate', default=0.01, type=float, help='Training initial learning rate.') parser.add_argument('--miner.momentum', default=0.98, type=float, help='Training initial momentum for SGD.') parser.add_argument('--miner.n_epochs', default=int(sys.maxsize), type=int, help='Number of training epochs.') parser.add_argument('--miner.epoch_length', default=500, type=int, help='Iterations of training per epoch') parser.add_argument('--miner.batch_size_train', default=1, type=int, help='Training batch size.') parser.add_argument( '--miner.sync_interval', default=100, type=int, help='Batches before we sync with chain and emit new weights.') parser.add_argument('--miner.log_interval', default=10, type=int, help='Batches before we log miner info.') parser.add_argument( '--miner.accumulation_interval', default=1, type=int, help='Batches before we apply acummulated gradients.') parser.add_argument( '--miner.apply_remote_gradients', default=False, type=bool, help= 'If true, neuron applies gradients which accumulate from remotes calls.' 
) parser.add_argument( '--miner.root_dir', default='~/.bittensor/miners/', type=str, help='Root path to load and save data associated with each miner') parser.add_argument( '--miner.name', default='xlm_wiki', type=str, help='Trials for this miner go in miner.root / miner.name') parser.add_argument( '--miner.trial_uid', default=str(time.time()).split('.')[0], type=str, help='Saved models go in miner.root_dir / miner.name / miner.uid') parser.add_argument('--miner.record_log', default=False, help='Record all logs when running this miner') parser.add_argument( '--miner.config_file', type=str, help= 'config file to run this neuron, if not using cmd line arguments.') parser.add_argument('--debug', dest='debug', action='store_true', help='''Turn on bittensor debugging information''') parser.set_defaults(debug=False) XLMSynapse.add_args(parser) bittensor.neuron.Neuron.add_args(parser) @staticmethod def check_config(config: Munch): if config.debug: bittensor.__log_level__ = 'TRACE' logger.debug('DEBUG is ON') else: logger.info('DEBUG is OFF') assert config.miner.momentum > 0 and config.miner.momentum < 1, "momentum must be a value between 0 and 1" assert config.miner.batch_size_train > 0, "batch_size_train must be a positive value" assert config.miner.learning_rate > 0, "learning_rate must be a positive value." full_path = '{}/{}/{}'.format(config.miner.root_dir, config.miner.name, config.miner.trial_uid) config.miner.full_path = os.path.expanduser(full_path) if not os.path.exists(config.miner.full_path): os.makedirs(config.miner.full_path) # --- Main loop ---- def run(self): # ---- Subscribe ---- with self.neuron: # ---- Weights ---- self.row = self.neuron.metagraph.row.to(self.model.device) # --- Run state --- self.global_step = 0 self.best_train_loss = math.inf # --- Loop for epochs --- for self.epoch in range(self.config.miner.n_epochs): try: # ---- Serve ---- self.neuron.axon.serve(self.model) # ---- Train Model ---- self.train() self.scheduler.step() # If model has borked for some reason, we need to make sure it doesn't emit weights # Instead, reload into previous version of model if torch.any( torch.isnan( torch.cat([ param.view(-1) for param in self.model.parameters() ]))): self.model, self.optimizer = self.model_toolbox.load_model( self.config) continue # ---- Emitting weights ---- self.neuron.metagraph.set_weights( self.row, wait_for_inclusion=True ) # Sets my row-weights on the chain. # ---- Sync metagraph ---- self.neuron.metagraph.sync( ) # Pulls the latest metagraph state (with my update.) 
self.row = self.neuron.metagraph.row.to(self.model.device) # --- Epoch logs ---- print(self.neuron.axon.__full_str__()) print(self.neuron.dendrite.__full_str__()) print(self.neuron.metagraph) # ---- Update Tensorboard ---- self.neuron.dendrite.__to_tensorboard__( self.tensorboard, self.global_step) self.neuron.metagraph.__to_tensorboard__( self.tensorboard, self.global_step) self.neuron.axon.__to_tensorboard__( self.tensorboard, self.global_step) # ---- Save best loss and model ---- if self.training_loss and self.epoch % 10 == 0 and self.training_loss < self.best_train_loss: self.best_train_loss = self.training_loss / 10 # update best train loss self.model_toolbox.save_model( self.config.miner.full_path, { 'epoch': self.epoch, 'model_state_dict': self.model.state_dict(), 'loss': self.best_train_loss, 'optimizer_state_dict': self.optimizer.state_dict(), }) self.tensorboard.add_scalar('Neuron/Train_loss', self.training_loss, self.global_step) # --- Catch Errors ---- except Exception as e: logger.error( 'Exception in training script with error: {}, {}', e, traceback.format_exc()) logger.info('Continuing to train.') # ---- Train Epoch ---- def train(self): self.training_loss = 0.0 for local_step in range(self.config.miner.epoch_length): # ---- Forward pass ---- inputs = nextbatch(self.dataset, self.config.miner.batch_size_train, bittensor.__tokenizer__()) output = self.model.remote_forward( self.neuron, inputs.to(self.model.device), training=True, ) # ---- Backward pass ---- loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss loss.backward() # Accumulates gradients on the model. self.optimizer.step() # Applies accumulated gradients. self.optimizer.zero_grad( ) # Zeros out gradients for next accummulation # ---- Train row weights ---- batch_weights = torch.mean(output.router.weights, axis=0).to( self.model.device) # Average over batch. self.row = ( 1 - 0.03) * self.row + 0.03 * batch_weights # Moving avg update. self.row = F.normalize(self.row, p=1, dim=0) # Ensure normalization. # ---- Step logs ---- logger.info( 'GS: {} LS: {} Epoch: {}\tLocal Target Loss: {}\tRemote Target Loss: {}\tDistillation Loss: {}\tAxon: {}\tDendrite: {}', colored('{}'.format(self.global_step), 'red'), colored('{}'.format(local_step), 'blue'), colored('{}'.format(self.epoch), 'green'), colored('{:.4f}'.format(output.local_target_loss.item()), 'green'), colored('{:.4f}'.format(output.remote_target_loss.item()), 'blue'), colored('{:.4f}'.format(output.distillation_loss.item()), 'red'), self.neuron.axon, self.neuron.dendrite) logger.info('Codes: {}', output.router.return_codes.tolist()) self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step) self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step) self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step) # ---- Step increments ---- self.global_step += 1 self.training_loss += output.local_target_loss.item() # --- Memory clean up ---- torch.cuda.empty_cache() del output
class Miner():

    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = FFNNSynapse(config)  # Feedforward neural network with PKMRouter.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)  # Set model to device.

        # ---- Optimizer ----
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.config.miner.learning_rate, momentum=self.config.miner.momentum)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=10, gamma=0.1)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(FFNNSynapse, optim.SGD)

        # ---- Dataset ----
        self.train_data = torchvision.datasets.MNIST(root=self.config.miner.root_dir + "datasets/", train=True, download=True, transform=transforms.ToTensor())
        self.trainloader = torch.utils.data.DataLoader(self.train_data, batch_size=self.config.miner.batch_size_train, shuffle=True, num_workers=2)
        self.test_data = torchvision.datasets.MNIST(root=self.config.miner.root_dir + "datasets/", train=False, download=True, transform=transforms.ToTensor())
        self.testloader = torch.utils.data.DataLoader(self.test_data, batch_size=self.config.miner.batch_size_test, shuffle=False, num_workers=2)

        # ---- Tensorboard ----
        self.global_step = 0
        self.tensorboard = SummaryWriter(log_dir=self.config.miner.full_path)
        if self.config.miner.record_log:
            logger.add(self.config.miner.full_path + "/{}_{}.log".format(self.config.miner.name, self.config.miner.trial_uid), format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--miner.learning_rate', default=0.01, type=float, help='Training initial learning rate.')
        parser.add_argument('--miner.momentum', default=0.9, type=float, help='Training initial momentum for SGD.')
        parser.add_argument('--miner.n_epochs', default=int(sys.maxsize), type=int, help='Number of training epochs.')
        parser.add_argument('--miner.epoch_length', default=int(sys.maxsize), type=int, help='Iterations of training per epoch (or dataset EOF).')
        parser.add_argument('--miner.batch_size_train', default=64, type=int, help='Training batch size.')
        parser.add_argument('--miner.batch_size_test', default=64, type=int, help='Testing batch size.')
        parser.add_argument('--miner.log_interval', default=150, type=int, help='Batches until miner prints log statements.')
        parser.add_argument('--miner.sync_interval', default=10, type=int, help='Batches before we sync with chain and emit new weights.')
        parser.add_argument('--miner.root_dir', default='~/.bittensor/miners/', type=str, help='Root path to load and save data associated with each miner.')
        parser.add_argument('--miner.name', default='mnist', type=str, help='Trials for this miner go in miner.root / miner.name.')
        parser.add_argument('--miner.trial_uid', default=str(time.time()).split('.')[0], type=str, help='Saved models go in miner.root_dir / miner.name / miner.uid.')
        parser.add_argument('--miner.record_log', default=False, help='Record all logs when running this miner.')
        parser.add_argument('--miner.config_file', type=str, help='Config file to run this neuron, if not using command line arguments.')
        bittensor.neuron.Neuron.add_args(parser)
        FFNNSynapse.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        assert config.miner.log_interval > 0, "log_interval dimension must be positive"
        assert config.miner.momentum > 0 and config.miner.momentum < 1, "momentum must be a value between 0 and 1"
        assert config.miner.batch_size_train > 0, "batch_size_train must be a positive value"
        assert config.miner.batch_size_test > 0, "batch_size_test must be a positive value"
        assert config.miner.learning_rate > 0, "learning_rate must be a positive value."
        full_path = '{}/{}/{}/'.format(config.miner.root_dir, config.miner.name, config.miner.trial_uid)
        config.miner.full_path = os.path.expanduser(full_path)
        if not os.path.exists(config.miner.full_path):
            os.makedirs(config.miner.full_path)

    # --- Main loop ----
    def run(self):

        # ---- Subscribe neuron ----
        with self.neuron:

            # ---- Weights ----
            self.row = self.neuron.metagraph.row.to(self.model.device)

            # --- Loop for epochs ---
            self.best_test_loss = math.inf
            self.global_step = 0
            for self.epoch in range(self.config.miner.n_epochs):

                # ---- Serve ----
                self.neuron.axon.serve(self.model)

                # ---- Train ----
                self.train()
                self.scheduler.step()

                # If the model has borked for some reason, we need to make sure it doesn't emit weights.
                # Instead, reload the previous version of the model.
                if torch.any(torch.isnan(torch.cat([param.view(-1) for param in self.model.parameters()]))):
                    self.model, self.optimizer = self.model_toolbox.load_model(self.config)
                    continue

                # ---- Test ----
                test_loss, test_accuracy = self.test()

                # ---- Emit ----
                self.neuron.metagraph.set_weights(self.row, wait_for_inclusion=True)  # Sets my row-weights on the chain.

                # ---- Sync ----
                self.neuron.metagraph.sync()  # Pulls the latest metagraph state (with my update.)
                self.row = self.neuron.metagraph.row.to(self.device)

                # --- Display Epoch ----
                print(self.neuron.axon.__full_str__())
                print(self.neuron.dendrite.__full_str__())
                print(self.neuron.metagraph)

                # ---- Update Tensorboard ----
                self.neuron.dendrite.__to_tensorboard__(self.tensorboard, self.global_step)
                self.neuron.metagraph.__to_tensorboard__(self.tensorboard, self.global_step)
                self.neuron.axon.__to_tensorboard__(self.tensorboard, self.global_step)

                # ---- Save ----
                if test_loss < self.best_test_loss:
                    self.best_test_loss = test_loss  # Update best loss.
                    self.model_toolbox.save_model(
                        self.config.miner.full_path,
                        {
                            'epoch': self.epoch,
                            'model_state_dict': self.model.state_dict(),
                            'loss': self.best_test_loss,
                            'optimizer_state_dict': self.optimizer.state_dict(),
                        })
                    self.tensorboard.add_scalar('Test loss', test_loss, self.global_step)

    # ---- Train epoch ----
    def train(self):
        # ---- Init training state ----
        self.model.train()  # Turn on dropout etc.
        for batch_idx, (images, targets) in enumerate(self.trainloader):
            if batch_idx >= self.config.miner.epoch_length:
                break
            self.global_step += 1

            # ---- Remote Forward pass ----
            output = self.model.remote_forward(
                neuron=self.neuron,
                images=images.to(self.device),
                targets=torch.LongTensor(targets).to(self.device),
            )

            # ---- Remote Backward pass ----
            loss = output.remote_target_loss + output.local_target_loss + output.distillation_loss
            loss.backward()  # Accumulates gradients on the model.
            self.optimizer.step()  # Applies accumulated gradients.
            self.optimizer.zero_grad()  # Zeros out gradients for the next accumulation.

            # ---- Train weights ----
            batch_weights = torch.mean(output.router.weights, axis=0).to(self.model.device)  # Average over batch.
            self.row = (1 - 0.03) * self.row + 0.03 * batch_weights  # Moving avg update.
            self.row = F.normalize(self.row, p=1, dim=0)  # Ensure normalization.

            # ---- Step Logs + Tensorboard ----
            processed = ((batch_idx + 1) * self.config.miner.batch_size_train)
            progress = (100. * processed) / len(self.train_data)
            logger.info(
                'GS: {}\t Epoch: {} [{}/{} ({})]\tLoss: {}\tAcc: {}\tAxon: {}\tDendrite: {}',
                colored('{}'.format(self.global_step), 'blue'),
                colored('{}'.format(self.epoch), 'blue'),
                colored('{}'.format(processed), 'green'),
                colored('{}'.format(len(self.train_data)), 'red'),
                colored('{:.2f}%'.format(progress), 'green'),
                colored('{:.4f}'.format(output.local_target_loss.item()), 'green'),
                colored('{:.4f}'.format(output.local_accuracy.item()), 'green'),
                self.neuron.axon, self.neuron.dendrite)
            self.tensorboard.add_scalar('Rloss', output.remote_target_loss.item(), self.global_step)
            self.tensorboard.add_scalar('Lloss', output.local_target_loss.item(), self.global_step)
            self.tensorboard.add_scalar('Dloss', output.distillation_loss.item(), self.global_step)

    # --- Test epoch ----
    def test(self):
        with torch.no_grad():  # Turns off gradient computation for inference speed up.
            self.model.eval()  # Turns off dropout layers, batch norm, etc.
            loss = 0.0
            accuracy = 0.0
            for _, (images, labels) in enumerate(self.testloader):
                # ---- Local Forward pass ----
                outputs = self.model.local_forward(
                    images=images.to(self.device),
                    targets=torch.LongTensor(labels).to(self.device),
                )
                loss += outputs.local_target_loss.item()
                accuracy += outputs.local_accuracy.item()
            return loss / len(self.testloader), accuracy / len(self.testloader)
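
# A hedged example (assumption: Config.update_with_kwargs merges these keywords into
# config.miner, as in the constructor above) of overriding config values through **kwargs
# instead of editing a config file:
if __name__ == "__main__":
    # e.g. train with a larger batch and a bounded number of iterations per epoch.
    miner = Miner(batch_size_train=128, epoch_length=1000)
    miner.run()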
class Miner(bittensor.miner.Miner):

    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Model ----
        self.model = BertMLMSynapse(self.config)

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.config.miner.learning_rate, momentum=self.config.miner.momentum)
        self.scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, 50, 300)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(BertMLMSynapse, torch.optim.SGD)

        # ---- Dataset ----
        # Dataset: AG News sentences, loaded via Hugging Face datasets.
        self.dataset = load_dataset('ag_news')['train']
        # The collator accepts a list of dicts (e.g. {'input_ids': ...}) as produced by the tokenizer.
        self.data_collator = DataCollatorForLanguageModeling(tokenizer=bittensor.__tokenizer__(), mlm=True, mlm_probability=0.15)
        super(Miner, self).__init__(self.config, **kwargs)

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def check_config(config: Munch):
        assert config.miner.momentum > 0 and config.miner.momentum < 1, "momentum must be a value between 0 and 1"
        assert config.miner.batch_size_train > 0, "batch_size_train must be a positive value"
        assert config.miner.learning_rate > 0, "learning_rate must be a positive value."
        BertMLMSynapse.check_config(config)
        bittensor.miner.Miner.check_config(config)

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--miner.learning_rate', default=0.01, type=float, help='Training initial learning rate.')
        parser.add_argument('--miner.momentum', default=0.98, type=float, help='Training initial momentum for SGD.')
        parser.add_argument('--miner.clip_gradients', default=0.8, type=float, help='Implement gradient clipping to avoid exploding loss on smaller architectures.')
        parser.add_argument('--miner.n_epochs', default=int(sys.maxsize), type=int, help='Number of training epochs.')
        parser.add_argument('--miner.epoch_length', default=500, type=int, help='Iterations of training per epoch.')
        parser.add_argument('--miner.batch_size_train', default=1, type=int, help='Training batch size.')
        parser.add_argument('--miner.name', default='bert_mlm', type=str, help='Trials for this miner go in miner.root / (wallet_cold - wallet_hot) / miner.name.')
        BertMLMSynapse.add_args(parser)
        bittensor.miner.Miner.add_args(parser)

    # --- Main loop ----
    def run(self):

        # ---- Subscribe ----
        with self:

            # ---- Weights ----
            self.row = self.metagraph.row

            # --- Run state ---
            self.global_step = 0
            self.best_train_loss = math.inf

            # --- Loop for epochs ---
            for self.epoch in range(self.config.miner.n_epochs):
                try:
                    # ---- Serve ----
                    self.axon.serve(self.model)

                    # ---- Train Model ----
                    self.train()
                    self.scheduler.step()

                    # If the model has borked for some reason, we need to make sure it doesn't emit weights.
                    # Instead, reload the previous version of the model.
                    if torch.any(torch.isnan(torch.cat([param.view(-1) for param in self.model.parameters()]))):
                        self.model, self.optimizer = self.model_toolbox.load_model(self.config)
                        continue

                    # ---- Emitting weights ----
                    self.metagraph.set_weights(self.row, wait_for_inclusion=True)  # Sets my row-weights on the chain.

                    # ---- Sync metagraph ----
                    self.metagraph.sync()  # Pulls the latest metagraph state (with my update.)
                    self.row = self.metagraph.row
                    logger.info(self.metagraph)

                    # ---- Update Tensorboard ----
                    self.dendrite.__to_tensorboard__(self.tensorboard, self.global_step)
                    self.metagraph.__to_tensorboard__(self.tensorboard, self.global_step)
                    self.axon.__to_tensorboard__(self.tensorboard, self.global_step)

                    # ---- Save best loss and model ----
                    if self.training_loss and self.epoch % 10 == 0:
                        if self.training_loss < self.best_train_loss:
                            self.best_train_loss = self.training_loss  # update best train loss
                            self.model_toolbox.save_model(
                                self.config.miner.full_path,
                                {
                                    'epoch': self.epoch,
                                    'model_state_dict': self.model.state_dict(),
                                    'loss': self.best_train_loss,
                                    'optimizer_state_dict': self.optimizer.state_dict(),
                                }
                            )
                            self.tensorboard.add_scalar('Neuron/Train_loss', self.training_loss, self.global_step)

                # --- Catch Errors ----
                except Exception as e:
                    logger.error('Exception in training script with error: {}', e)
                    logger.info(traceback.format_exc())
                    logger.info('Continuing to train.')
                    time.sleep(1)

    # ---- Train Epoch ----
    def train(self):
        self.training_loss = 0.0
        for local_step in range(self.config.miner.epoch_length):
            # ---- Forward pass ----
            inputs, targets = mlm_batch(self.dataset, self.config.miner.batch_size_train, bittensor.__tokenizer__(), self.data_collator)
            output = self.model.remote_forward(self, inputs=inputs.to(self.model.device), targets=targets.to(self.model.device))

            # ---- Backward pass ----
            loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
            loss.backward()  # Accumulates gradients on the model.
            clip_grad_norm_(self.model.parameters(), self.config.miner.clip_gradients)  # Clip model gradients.
            self.optimizer.step()  # Applies accumulated gradients.
            self.optimizer.zero_grad()  # Zeros out gradients for the next accumulation.

            # ---- Train row weights ----
            batch_weights = torch.mean(output.router.weights, axis=0)  # Average over batch.
            self.row = (1 - 0.03) * self.row + 0.03 * batch_weights  # Moving avg update.
            self.row = F.normalize(self.row, p=1, dim=0)  # Ensure normalization.

            # ---- Step logs ----
            logger.info(
                'GS: {} LS: {} Epoch: {}\tLocal Target Loss: {}\tRemote Target Loss: {}\tDistillation Loss: {}\tAxon: {}\tDendrite: {}',
                colored('{}'.format(self.global_step), 'red'),
                colored('{}'.format(local_step), 'blue'),
                colored('{}'.format(self.epoch), 'green'),
                colored('{:.4f}'.format(output.local_target_loss.item()), 'green'),
                colored('{:.4f}'.format(output.remote_target_loss.item()), 'blue'),
                colored('{:.4f}'.format(output.distillation_loss.item()), 'red'),
                self.axon, self.dendrite)
            logger.info('Codes: {}', output.router.return_codes.tolist())

            self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step)
            self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step)
            self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step)

            # ---- Step increments ----
            self.global_step += 1
            self.training_loss += output.local_target_loss.item()

            # --- Memory clean up ----
            torch.cuda.empty_cache()
            del output
class Miner():

    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Build Neuron ----
        self.neuron = bittensor.neuron.Neuron(config)

        # ---- Build FFNN Model ----
        self.model = FFNNSynapse(self.config)
        self.model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        self.neuron.axon.serve(self.model)

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.config.miner.learning_rate, momentum=self.config.miner.momentum)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(FFNNSynapse, torch.optim.SGD)

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir=self.config.miner.full_path)
        if self.config.miner.record_log:
            logger.add(self.config.miner.full_path + "/{}_{}.log".format(self.config.miner.name, self.config.miner.trial_uid), format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--miner.learning_rate', default=0.01, type=float, help='Training initial learning rate.')
        parser.add_argument('--miner.momentum', default=0.9, type=float, help='Training initial momentum for SGD.')
        parser.add_argument('--miner.n_epochs', default=int(sys.maxsize), type=int, help='Number of training epochs.')
        parser.add_argument('--miner.sync_interval', default=150, type=int, help='Batches before we sync with chain and emit new weights.')
        parser.add_argument('--miner.root_dir', default='~/.bittensor/miners/', type=str, help='Root path to load and save data associated with each miner.')
        parser.add_argument('--miner.name', default='ffnn-grunt', type=str, help='Trials for this miner go in miner.root / miner.name.')
        parser.add_argument('--miner.trial_uid', default=str(time.time()).split('.')[0], type=str, help='Saved models go in miner.root_dir / miner.name / miner.uid.')
        parser.add_argument('--miner.record_log', default=False, help='Record all logs when running this miner.')
        parser.add_argument('--miner.config_file', type=str, help='Config file to run this neuron, if not using command line arguments.')
        bittensor.neuron.Neuron.add_args(parser)
        FFNNSynapse.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        assert config.miner.momentum > 0 and config.miner.momentum < 1, "momentum must be a value between 0 and 1"
        assert config.miner.learning_rate > 0, "learning_rate must be a positive value."
        full_path = '{}/{}/{}/'.format(config.miner.root_dir, config.miner.name, config.miner.trial_uid)
        config.miner.full_path = os.path.expanduser(full_path)
        if not os.path.exists(config.miner.full_path):
            os.makedirs(config.miner.full_path)

    # ---- Main loop ----
    def run(self):

        # --- Subscribe / Update neuron ---
        with self.neuron:

            # ---- Loop for epochs ----
            self.model.train()
            for self.epoch in range(self.config.miner.n_epochs):

                # ---- Poll until gradients ----
                public_key, inputs_x, grads_dy, modality_x = self.neuron.axon.gradients.get(block=True)

                # ---- Backward Gradients ----
                # TODO (const): batch normalization over the gradients for consistency.
                grads_dy = torch.where(torch.isnan(grads_dy), torch.zeros_like(grads_dy), grads_dy)
                self.model.backward(inputs_x, grads_dy, modality_x)

                # ---- Apply Gradients ----
                self.optimizer.step()  # Apply accumulated gradients.
                self.optimizer.zero_grad()  # Clear any lingering gradients.

                # If the model has borked for some reason, we need to make sure it doesn't emit weights.
                # Instead, reload the previous version of the model.
                if torch.any(torch.isnan(torch.cat([param.view(-1) for param in self.model.parameters()]))):
                    self.model, self.optimizer = self.model_toolbox.load_model(self.config)

                # ---- Serve latest model ----
                self.neuron.axon.serve(self.model)  # Serve the newest model.
                logger.info('Step: {} \t Key: {} \t sum(W[:,0]): {}', self.epoch, public_key, torch.sum(self.neuron.metagraph.col).item())

                # ---- Sync State ----
                if (self.epoch + 1) % self.config.miner.sync_interval == 0:

                    # --- Display Epoch ----
                    print(self.neuron.axon.__full_str__())
                    print(self.neuron.dendrite.__full_str__())
                    print(self.neuron.metagraph)

                    # ---- Sync metagraph from chain ----
                    self.neuron.metagraph.sync()  # Sync with the chain.

                    # --- Save Model ----
                    self.model_toolbox.save_model(
                        self.config.miner.full_path,
                        {
                            'epoch': self.epoch,
                            'model_state_dict': self.model.state_dict(),
                            'optimizer_state_dict': self.optimizer.state_dict(),
                        })
class Miner(bittensor.miner.Miner):

    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Model ----
        self.model = GPT2Synapse(self.config)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(GPT2Synapse, AdamW)

        # ---- Optimizer ----
        self.optimizer = self.configure_optimizers()
        self.lr = self.config.miner.learning_rate
        self.training_loss = math.inf
        self.best_train_loss = math.inf
        self.rloss = math.inf
        self.lloss = math.inf
        self.dloss = math.inf

        # ---- Dataset ----
        # The Genesis Dataset:
        # The dataset used to train Adam and his first 100 children.
        self.dataset = GenesisTextDataloader(self.config.miner.batch_size_train, self.model.get_block_size())
        self.tokens = 0
        super(Miner, self).__init__(self.config, **kwargs)

    @staticmethod
    def default_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        return config

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--miner.learning_rate', default=3e-2, type=float, help='Training initial learning rate.')
        parser.add_argument('--miner.weight_decay', default=0.25, type=float, help='Model parameter weight decay.')
        parser.add_argument('--miner.lr_decay', default=True, type=bool, help='Learning rate decay params: linear warmup followed by cosine decay to 10%% of original.')
        parser.add_argument('--miner.warmup_tokens', default=375e6, type=float, help='A linear LR warmup over the first miner.warmup_tokens tokens (default is 375 million).')
        parser.add_argument('--miner.final_tokens', default=260e9, type=float, help='At what point we reach 10%% of the original LR.')
        parser.add_argument('--miner.clip_gradients', default=1.0, type=float, help='Implement gradient clipping to avoid exploding loss on smaller architectures.')
        parser.add_argument('--miner.n_epochs', default=int(sys.maxsize), type=int, help='Number of training epochs.')
        parser.add_argument('--miner.epoch_length', default=500, type=int, help='Iterations of training per epoch.')
        parser.add_argument('--miner.batch_size_train', default=2, type=int, help='Training batch size.')
        parser.add_argument('--miner.name', default='gpt2_genesis', type=str, help='Trials for this miner go in miner.root / (wallet_cold - wallet_hot) / miner.name.')
        GPT2Synapse.add_args(parser)
        bittensor.miner.Miner.add_args(parser)
        GenesisTextDataloader.add_args(parser)

    @staticmethod
    def check_config(config: Munch):
        assert config.miner.batch_size_train > 0, "batch_size_train must be a positive value"
        assert config.miner.learning_rate > 0, "learning_rate must be a positive value."
        bittensor.miner.Miner.check_config(config)
        GenesisTextDataloader.check_config(config)

    def configure_optimizers(self):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.
        """
        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding, torch.nn.Tanh)
        for mn, m in self.model.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # special case the position embedding parameter in the root GPT module as not decayed
        no_decay.add('pos_emb')

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.model.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
            % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": self.config.miner.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=self.config.miner.learning_rate, betas=(0.9, 0.95))
        return optimizer

    # --- Main loop ----
    def run(self):

        # ---- Subscribe ----
        with self:

            # ---- Weights ----
            self.row = self.metagraph.row.to(self.model.device)

            # --- Run state ---
            self.global_step = 0

            # --- Loop for epochs ---
            for self.epoch in range(self.config.miner.n_epochs):

                # ---- Serve ----
                self.axon.serve(self.model)

                # ---- Train Model ----
                self.train()

                # If the model has borked for some reason, we need to make sure it doesn't emit weights.
                # Instead, reload the previous version of the model.
                if torch.any(torch.isnan(torch.cat([param.view(-1) for param in self.model.parameters()]))):
                    self.model, self.optimizer = self.model_toolbox.load_model(self.config)
                    continue

                # ---- Emitting weights ----
                try:
                    self.metagraph.set_weights(self.row, wait_for_inclusion=True)  # Sets my row-weights on the chain.
                except Exception as e:
                    logger.error("Failed to set weights")
                    raise e

                try:
                    # ---- Sync metagraph ----
                    self.metagraph.sync()  # Pulls the latest metagraph state (with my update.)
                except Exception as e:
                    logger.error("Failed to sync metagraph")
                    raise e
                self.row = self.metagraph.row.to(self.model.device)

                # ---- Update Tensorboard ----
                self.dendrite.__to_tensorboard__(self.tensorboard, self.global_step)
                self.metagraph.__to_tensorboard__(self.tensorboard, self.global_step)
                self.axon.__to_tensorboard__(self.tensorboard, self.global_step)

                # ---- Save best loss and model ----
                if self.training_loss < self.best_train_loss:
                    self.best_train_loss = self.training_loss  # update best train loss
                    self.model_toolbox.save_model(
                        self.config.miner.full_path,
                        {
                            'epoch': self.epoch,
                            'model_state_dict': self.model.state_dict(),
                            'loss': self.best_train_loss / 3,
                            'optimizer_state_dict': self.optimizer.state_dict(),
                            'rloss': self.rloss,
                            'lloss': self.lloss,
                            'dloss': self.dloss,
                        }
                    )
                    self.tensorboard.add_scalar('Neuron/Train_loss', self.training_loss, self.global_step)

                logger.info("This epoch's training losses: L-Loss: {:.2f} | R-Loss: {:.2f} | D-Loss: {:.2f} | avg: {:.2f} ... Current best average training loss: {:.2f}".format(
                    self.lloss, self.rloss, self.dloss, self.training_loss / 3, self.best_train_loss / 3))

    def decay_learning_rate(self, batch):
        """Decay the learning rate based on the progress thus far.
        Adjusts self.lr (and the optimizer's learning rate) according to the number of tokens processed so far.

        Args:
            batch (torch.Tensor): the current training batch, used to count processed tokens.
        """
        if self.config.miner.lr_decay:
            # number of tokens processed this step
            self.tokens += (batch >= 0).sum()
            if self.tokens < self.config.miner.warmup_tokens:
                # linear warmup
                lr_mult = float(self.tokens) / float(max(1, self.config.miner.warmup_tokens))
            else:
                # cosine learning rate decay
                progress = float(self.tokens - self.config.miner.warmup_tokens) / float(max(1, self.config.miner.final_tokens - self.config.miner.warmup_tokens))
                lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))

            self.lr = self.config.miner.learning_rate * lr_mult

            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.lr
        else:
            self.lr = self.config.miner.learning_rate

    def get_lr(self):
        for param_group in self.optimizer.param_groups:
            return param_group['lr']

    # ---- Train Epoch ----
    def train(self):

        def run_epoch():
            self.model.train(True)
            losses = []
            rlosses = []
            llosses = []
            dlosses = []

            # Re-create the dataloader each epoch so the data we train on is freshly shuffled.
            logger.info("Preparing dataset batch...")
            dataloader = self.dataset.dataloader(self.config.miner.epoch_length)
            pbar = qqdm(enumerate(dataloader), total=len(dataloader), desc=format_str('blue', 'Epoch Progress'))
            for it, (batch) in pbar:
                # ---- Forward pass ----
                batch = batch.to(self.model.device)
                output = self.model.remote_forward(self, batch, training=True)

                # ---- Backward pass ----
                loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
                loss.backward()

                # ---- Gradient Step ----
                clip_grad_norm_(self.model.parameters(), self.config.miner.clip_gradients)
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.decay_learning_rate(batch)

                # Add losses up
                losses.append(loss.item())
                llosses.append(output.local_target_loss.item())
                rlosses.append(output.remote_target_loss.item())
                dlosses.append(output.distillation_loss.item())

                # ---- Train row weights ----
                batch_weights = torch.mean(output.router.weights, axis=0).to(self.model.device)  # Average over batch.
                self.row = (1 - 0.03) * self.row + 0.03 * batch_weights  # Moving avg update.
                self.row = F.normalize(self.row, p=1, dim=0)  # Ensure normalization.

                # ---- Logging ----
                index = self.metagraph.state.index_for_uid[self.metagraph.uid]
                pbar.set_infos({
                    'GS': colored('{}'.format(self.global_step), 'red'),
                    'LS': colored('{}'.format(it), 'blue'),
                    'Epoch': colored('{}'.format(self.epoch + 1), 'green'),
                    'L-loss': colored('{:.5f}'.format(output.local_target_loss.item()), 'red'),
                    'R-loss': colored('{:.5f}'.format(output.remote_target_loss.item()), 'blue'),
                    'D-loss': colored('{:.5f}'.format(output.distillation_loss.item()), 'green'),
                    'lr': colored('{:e}'.format(self.lr), 'white'),
                    'nPeers': self.metagraph.n,
                    'Stake(\u03C4)': float(self.metagraph.S[index]),
                    'Rank(\u03C4)': float(self.metagraph.R[index]),
                    'Incentive(\u03C4/block)': float(self.metagraph.I[index]),
                    'Axon': self.axon.__str__(),
                    'Dendrite': self.dendrite.__str__(),
                })
                self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step)
                self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step)
                self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step)
                self.global_step += 1

            avg_loss = sum(losses) / len(losses)
            self.rloss = sum(rlosses) / len(rlosses)
            self.lloss = sum(llosses) / len(llosses)
            self.dloss = sum(dlosses) / len(dlosses)
            self.training_loss = avg_loss

        run_epoch()
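
# A sketch (an assumption, not part of the original file) of constructing this miner from an
# explicitly edited config rather than **kwargs: default_config() returns a Munch that can be
# mutated in place before it is validated inside __init__.
if __name__ == "__main__":
    config = Miner.default_config()
    config.miner.learning_rate = 1e-3   # e.g. a smaller LR than the 3e-2 default
    config.miner.epoch_length = 100     # shorter epochs for a quick run
    miner = Miner(config=config)
    miner.run()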
class Miner():

    def __init__(self, config: Munch = None):
        if config is None:
            config = Miner.build_config()
            logger.info(bittensor.config.Config.toString(config))
        self.config = config

        # ---- Neuron ----
        self.neuron = bittensor.neuron.Neuron(self.config)

        # ---- Model ----
        self.model = BertMLMSynapse(self.config)

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.config.miner.learning_rate, momentum=self.config.miner.momentum)
        self.scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, 50, 300)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(BertMLMSynapse, torch.optim.SGD)

        # ---- Dataset ----
        # Dataset: AG News sentences, loaded via Hugging Face datasets.
        self.dataset = load_dataset('ag_news')['train']
        # The collator accepts a list of dicts (e.g. {'input_ids': ...}) as produced by the tokenizer.
        self.data_collator = DataCollatorForLanguageModeling(tokenizer=bittensor.__tokenizer__(), mlm=True, mlm_probability=0.15)

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir=self.config.miner.full_path)
        if self.config.miner.record_log:
            logger.add(self.config.miner.full_path + "/{}_{}.log".format(self.config.miner.name, self.config.miner.trial_uid), format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")

    @staticmethod
    def build_config() -> Munch:
        parser = argparse.ArgumentParser()
        Miner.add_args(parser)
        config = bittensor.config.Config.to_config(parser)
        Miner.check_config(config)
        return config

    @staticmethod
    def check_config(config: Munch):
        assert config.miner.momentum > 0 and config.miner.momentum < 1, "momentum must be a value between 0 and 1"
        assert config.miner.batch_size_train > 0, "batch_size_train must be a positive value"
        assert config.miner.learning_rate > 0, "learning_rate must be a positive value."
        full_path = '{}/{}/{}'.format(config.miner.root_dir, config.miner.name, config.miner.trial_uid)
        config.miner.full_path = os.path.expanduser(full_path)
        if not os.path.exists(config.miner.full_path):
            os.makedirs(config.miner.full_path)
        BertMLMSynapse.check_config(config)
        bittensor.neuron.Neuron.check_config(config)

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        parser.add_argument('--miner.learning_rate', default=0.01, type=float, help='Training initial learning rate.')
        parser.add_argument('--miner.momentum', default=0.98, type=float, help='Training initial momentum for SGD.')
        parser.add_argument('--miner.n_epochs', default=int(sys.maxsize), type=int, help='Number of training epochs.')
        parser.add_argument('--miner.epoch_length', default=500, type=int, help='Iterations of training per epoch.')
        parser.add_argument('--miner.batch_size_train', default=1, type=int, help='Training batch size.')
        parser.add_argument('--miner.sync_interval', default=100, type=int, help='Batches before we sync with chain and emit new weights.')
        parser.add_argument('--miner.log_interval', default=10, type=int, help='Batches before we log miner info.')
        parser.add_argument('--miner.accumulation_interval', default=1, type=int, help='Batches before we apply accumulated gradients.')
        parser.add_argument('--miner.apply_remote_gradients', default=False, type=bool, help='If true, neuron applies gradients which accumulate from remote calls.')
        parser.add_argument('--miner.root_dir', default='~/.bittensor/miners/', type=str, help='Root path to load and save data associated with each miner.')
        parser.add_argument('--miner.name', default='bert-nsp', type=str, help='Trials for this miner go in miner.root / miner.name.')
        parser.add_argument('--miner.trial_uid', default=str(time.time()).split('.')[0], type=str, help='Saved models go in miner.root_dir / miner.name / miner.uid.')
        parser.add_argument('--miner.record_log', default=True, help='Record all logs when running this miner.')
        parser.add_argument('--miner.config_file', type=str, help='Config file to run this neuron, if not using command line arguments.')
        BertMLMSynapse.add_args(parser)
        bittensor.neuron.Neuron.add_args(parser)

    # --- Main loop ----
    def run(self):

        # ---- Subscribe ----
        with self.neuron:

            # ---- Weights ----
            self.row = self.neuron.metagraph.row

            # --- Run state ---
            self.global_step = 0
            self.best_train_loss = math.inf

            # --- Loop for epochs ---
            for self.epoch in range(self.config.miner.n_epochs):
                try:
                    # ---- Serve ----
                    self.neuron.axon.serve(self.model)

                    # ---- Train Model ----
                    self.train()
                    self.scheduler.step()

                    # If the model has borked for some reason, we need to make sure it doesn't emit weights.
                    # Instead, reload the previous version of the model.
                    if torch.any(torch.isnan(torch.cat([param.view(-1) for param in self.model.parameters()]))):
                        self.model, self.optimizer = self.model_toolbox.load_model(self.config)
                        continue

                    # ---- Emitting weights ----
                    self.neuron.metagraph.set_weights(self.row, wait_for_inclusion=True)  # Sets my row-weights on the chain.

                    # ---- Sync metagraph ----
                    self.neuron.metagraph.sync()  # Pulls the latest metagraph state (with my update.)
                    self.row = self.neuron.metagraph.row

                    # --- Epoch logs ----
                    print(self.neuron.axon.__full_str__())
                    print(self.neuron.dendrite.__full_str__())
                    print(self.neuron.metagraph)

                    # ---- Update Tensorboard ----
                    self.neuron.dendrite.__to_tensorboard__(self.tensorboard, self.global_step)
                    self.neuron.metagraph.__to_tensorboard__(self.tensorboard, self.global_step)
                    self.neuron.axon.__to_tensorboard__(self.tensorboard, self.global_step)

                    # ---- Save best loss and model ----
                    if self.training_loss and self.epoch % 10 == 0:
                        if self.training_loss < self.best_train_loss:
                            self.best_train_loss = self.training_loss  # update best train loss
                            self.model_toolbox.save_model(
                                self.config.miner.full_path,
                                {
                                    'epoch': self.epoch,
                                    'model_state_dict': self.model.state_dict(),
                                    'loss': self.best_train_loss,
                                    'optimizer_state_dict': self.optimizer.state_dict(),
                                }
                            )
                            self.tensorboard.add_scalar('Neuron/Train_loss', self.training_loss, self.global_step)

                # --- Catch Errors ----
                except Exception as e:
                    logger.error('Exception in training script with error: {}', e)
                    logger.info(traceback.format_exc())
                    logger.info('Continuing to train.')
                    time.sleep(1)

    # ---- Train Epoch ----
    def train(self):
        self.training_loss = 0.0
        for local_step in range(self.config.miner.epoch_length):
            # ---- Forward pass ----
            inputs, targets = mlm_batch(self.dataset, self.config.miner.batch_size_train, bittensor.__tokenizer__(), self.data_collator)
            output = self.model.remote_forward(self.neuron, inputs=inputs.to(self.model.device), targets=targets.to(self.model.device))

            # ---- Backward pass ----
            loss = output.local_target_loss + output.distillation_loss + output.remote_target_loss
            loss.backward()  # Accumulates gradients on the model.
            self.optimizer.step()  # Applies accumulated gradients.
            self.optimizer.zero_grad()  # Zeros out gradients for the next accumulation.

            # ---- Train row weights ----
            batch_weights = torch.mean(output.router.weights, axis=0)  # Average over batch.
            self.row = (1 - 0.03) * self.row + 0.03 * batch_weights  # Moving avg update.
            self.row = F.normalize(self.row, p=1, dim=0)  # Ensure normalization.

            # ---- Step logs ----
            logger.info(
                'GS: {} LS: {} Epoch: {}\tLocal Target Loss: {}\tRemote Target Loss: {}\tDistillation Loss: {}\tAxon: {}\tDendrite: {}',
                colored('{}'.format(self.global_step), 'red'),
                colored('{}'.format(local_step), 'blue'),
                colored('{}'.format(self.epoch), 'green'),
                colored('{:.4f}'.format(output.local_target_loss.item()), 'green'),
                colored('{:.4f}'.format(output.remote_target_loss.item()), 'blue'),
                colored('{:.4f}'.format(output.distillation_loss.item()), 'red'),
                self.neuron.axon, self.neuron.dendrite)
            logger.info('Codes: {}', output.router.return_codes.tolist())

            self.tensorboard.add_scalar('Neuron/Rloss', output.remote_target_loss.item(), self.global_step)
            self.tensorboard.add_scalar('Neuron/Lloss', output.local_target_loss.item(), self.global_step)
            self.tensorboard.add_scalar('Neuron/Dloss', output.distillation_loss.item(), self.global_step)

            # ---- Step increments ----
            self.global_step += 1
            self.training_loss += output.local_target_loss.item()

            # --- Memory clean up ----
            torch.cuda.empty_cache()
            del output