def optimize_global(self, niter=200, npop=50, population=None, label='Global optimisation', leave=False):
    if self.de is None:
        self.de = DiffEvol(self.lnposterior, clip(self.ps.bounds, -1, 1), npop,
                           maximize=True, vectorize=True)
        if population is None:
            self.de._population[:, :] = self.create_pv_population(npop)
        else:
            self.de._population[:, :] = population
    for _ in tqdm(self.de(niter), total=niter, desc=label, leave=leave):
        pass
def sample_mcmc(self, niter=500, thin=5, label='MCMC sampling', reset=False, leave=True):
    if self.sampler is None:
        self.sampler = EnsembleSampler(self.de.n_pop, self.de.n_par, self.lnposterior, vectorize=True)
        pop0 = self.de.population
    else:
        pop0 = self.sampler.chain[:, -1, :].copy()
    if reset:
        self.sampler.reset()
    for _ in tqdm(self.sampler.sample(pop0, iterations=niter, thin=thin),
                  total=niter, desc=label, leave=leave):  # honour the `leave` argument (was hard-coded False)
        pass
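# Hedged, self-contained sketch of the same two-stage pattern used above:
# global optimisation to seed the walkers, then MCMC refinement. The toy
# log-posterior and the use of scipy's differential_evolution (in place of
# PyTransit's DiffEvol) are assumptions for illustration only.
import numpy as np
from scipy.optimize import differential_evolution
from emcee import EnsembleSampler
from tqdm.auto import tqdm

def lnposterior(pv):
    return -0.5 * np.sum((pv - 1.0) ** 2, axis=-1)  # unit Gaussian centred at pv = 1

bounds = [(-5, 5), (-5, 5)]
best = differential_evolution(lambda pv: -lnposterior(pv), bounds).x
pop0 = best + 1e-3 * np.random.randn(50, 2)  # cloud of walkers around the optimum

sampler = EnsembleSampler(50, 2, lnposterior, vectorize=True)
for _ in tqdm(sampler.sample(pop0, iterations=500), total=500, desc='MCMC sampling'):
    pass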
def bestThresshold(y_train, train_preds):
    """Grid-search the decision threshold in [0.1, 0.5] that maximises F1 on the training set."""
    best_threshold, best_f1 = 0.0, 0.0
    for threshold in tqdm(np.arange(0.1, 0.501, 0.01)):
        f1 = f1_score(y_train, np.array(train_preds) > threshold)
        if f1 > best_f1:
            best_threshold, best_f1 = threshold, f1
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(best_threshold, best_f1))
    return best_threshold
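# Hedged usage sketch: synthetic labels/probabilities to exercise the search
# (the data below is an assumption for illustration, not from the original).
import numpy as np
from sklearn.metrics import f1_score
from tqdm.auto import tqdm

rng = np.random.default_rng(0)
y_train = rng.integers(0, 2, size=1000)
train_preds = np.clip(y_train * 0.6 + rng.normal(0.2, 0.2, size=1000), 0, 1)

threshold = bestThresshold(y_train, train_preds)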
def display_progress(iterator, total=None, **kwargs):
    """ displays a progress bar when iterating """
    if tqdm is not None:
        return tqdm(iterator, total=total, **kwargs)
    else:
        if display_progress._show_warning:
            # use .warning(); .warn() is a deprecated alias
            logging.getLogger(__name__).warning('Module `tqdm` is not available '
                                                'and progress cannot be displayed')
            display_progress._show_warning = False
        return iterator

# initialise the function attribute so the missing-tqdm warning is emitted only once
display_progress._show_warning = True
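# Minimal sketch of the guarded-import pattern this helper assumes: `tqdm` is
# bound to None when the package is missing, so the helper can fall back to
# the bare iterator.
import logging
try:
    from tqdm.auto import tqdm
except ImportError:
    tqdm = None

for _ in display_progress(range(100), total=100):
    pass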
def transcribe(
    self,
    paths2audio_files: List[str],
    batch_size: int = 4,
    return_hypotheses: bool = False,
    partial_hypothesis: Optional[List['Hypothesis']] = None,
    num_workers: int = 0,
) -> (List[str], Optional[List['Hypothesis']]):
    """
    Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.

    Args:
        paths2audio_files: (a list) of paths to audio files. \
            Recommended length per file is between 5 and 25 seconds. \
            But it is possible to pass a few hours long file if enough GPU memory is available.
        batch_size: (int) batch size to use during inference. \
            Bigger will result in better throughput performance but would use more memory.
        return_hypotheses: (bool) Either return hypotheses or text.
            With hypotheses can do some postprocessing like getting timestamp or rescoring.
        num_workers: (int) number of workers for DataLoader

    Returns:
        A list of transcriptions in the same order as paths2audio_files. Will also return
        the full list of hypotheses per file.
    """
    if paths2audio_files is None or len(paths2audio_files) == 0:
        return {}
    # We will store transcriptions here
    hypotheses = []
    all_hypotheses = []
    # Model's mode and device
    mode = self.training
    device = next(self.parameters()).device
    dither_value = self.preprocessor.featurizer.dither
    pad_to_value = self.preprocessor.featurizer.pad_to

    if num_workers is None:
        num_workers = min(batch_size, os.cpu_count() - 1)

    try:
        self.preprocessor.featurizer.dither = 0.0
        self.preprocessor.featurizer.pad_to = 0

        # Switch model to evaluation mode
        self.eval()
        # Freeze the encoder and decoder modules
        self.encoder.freeze()
        self.decoder.freeze()
        self.joint.freeze()
        logging_level = logging.get_verbosity()
        logging.set_verbosity(logging.WARNING)
        # Work in tmp directory - will store manifest file there
        with tempfile.TemporaryDirectory() as tmpdir:
            with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp:
                for audio_file in paths2audio_files:
                    entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': 'nothing'}
                    fp.write(json.dumps(entry) + '\n')

            config = {
                'paths2audio_files': paths2audio_files,
                'batch_size': batch_size,
                'temp_dir': tmpdir,
                'num_workers': num_workers,
            }

            temporary_datalayer = self._setup_transcribe_dataloader(config)
            for test_batch in tqdm(temporary_datalayer, desc="Transcribing"):
                encoded, encoded_len = self.forward(
                    input_signal=test_batch[0].to(device),
                    input_signal_length=test_batch[1].to(device))
                best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor(
                    encoded,
                    encoded_len,
                    return_hypotheses=return_hypotheses,
                    partial_hypotheses=partial_hypothesis,
                )

                hypotheses += best_hyp
                if all_hyp is not None:
                    all_hypotheses += all_hyp
                else:
                    all_hypotheses += best_hyp

                del encoded
                del test_batch
    finally:
        # set mode back to its original value
        self.train(mode=mode)
        self.preprocessor.featurizer.dither = dither_value
        self.preprocessor.featurizer.pad_to = pad_to_value

        logging.set_verbosity(logging_level)
        if mode is True:
            self.encoder.unfreeze()
            self.decoder.unfreeze()
            self.joint.unfreeze()
    return hypotheses, all_hypotheses
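# Self-contained sketch of the manifest format the loop above writes: one JSON
# object per line, as expected by NeMo-style dataloaders (file names assumed).
import json, tempfile, os

with tempfile.TemporaryDirectory() as tmpdir:
    manifest = os.path.join(tmpdir, 'manifest.json')
    with open(manifest, 'w', encoding='utf-8') as fp:
        for audio_file in ['a.wav', 'b.wav']:
            fp.write(json.dumps({'audio_filepath': audio_file,
                                 'duration': 100000, 'text': 'nothing'}) + '\n')
    print(open(manifest).read())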
def trainGAN(self):
    gen = self.generator().to(self.device)
    gen_opt = torch.optim.Adam(gen.parameters(), lr=self.lr, betas=(self.beta1, self.beta2))
    critic = self.critic().to(self.device)
    critic_opt = torch.optim.Adam(critic.parameters(), lr=self.lr, betas=(self.beta1, self.beta2))

    cur_step = 0
    loadAndAgumentMasks = makeMasks.MaskClass(self.config, rand_seed=None)

    # Maybe nn.Conv2d weight init does not work when we use PartialConv2d; in that
    # case add `or isinstance(m, PartialConv2d)` and keep PartialConv2d somewhere accessible.
    def weights_init(m):
        if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d) or isinstance(m, PartialConv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            torch.nn.init.constant_(m.weight, 1)
            torch.nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            torch.nn.init.normal_(m.weight, 0.0, 0.02)
            torch.nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, 0.0, 0.02)

    gen = gen.apply(weights_init)
    critic = critic.apply(weights_init)

    print("Setup loss function...")
    loss_func = CalculateLoss(config=self.config).to(self.device)

    for epoch in range(self.epochs):
        for real, SAR in tqdm(self.dataloader, position=0, leave=True,
                              disable=True):  # self.config.run_polyaxon
            masks = loadAndAgumentMasks.returnTensorMasks(self.batchSize)
            masks = torch.from_numpy(masks)
            masks = masks.type(torch.cuda.FloatTensor)
            masks = 1 - masks
            masks = masks.to(self.device)  # .to() is not in-place; assign the result

            real = real.to(self.device)

            # ---------------------
            #  Train critic
            # ---------------------
            critic.zero_grad()
            # Real images
            real_validity = critic(real)
            d_real = real_validity.mean()
            # Generate a batch of images with mask
            # Masked_fake_img = torch.mul(real_vv_vh, masks4)
            Masked_fake_img = torch.mul(real, masks)
            fake_imgs = gen(Masked_fake_img, masks)
            # Fake images
            fake_validity = critic(fake_imgs)  # Detach or not?
            d_fake = fake_validity.mean()

            gradient_penalty = self.calc_gradient_penalty(critic, real.data, fake_imgs.data)
            d_loss = d_fake - d_real + gradient_penalty
            d_loss.backward()
            critic_opt.step()

            # Values for txt / logging
            critic_cost = d_fake - d_real + gradient_penalty
            wasserstein_d = d_real - d_fake
            critic_score = real_validity.mean().item()
            gen_score = fake_validity.mean().item()

            # Train the generator every n_critic steps
            if cur_step % self.n_critic == 0:
                # -----------------
                #  Train Generator
                # -----------------
                gen.zero_grad()
                # Generate a batch of images
                fake_noise = torch.mul(real, masks)
                fake_imgs = gen(fake_noise, masks)
                # Loss measures generator's ability to fool the critic
                # Train on fake images
                fake_validity1 = critic(fake_imgs)

                loss_dict = loss_func(real, masks, fake_imgs, real)
                loss = 0.0
                # sums up each loss value
                for key, value in loss_dict.items():
                    loss += value
                loss.backward(retain_graph=True)

                g_loss = fake_validity1.mean()
                # g_lossMSE = criterionMSE(real, fake_imgs)
                # g_lossMSE.backward(retain_graph=True)
                g_loss = -g_loss
                g_loss.backward()  # mone
                gen_opt.step()
                gen_cost = g_loss
            cur_step += 1

        if self.config.run_polyaxon and epoch % 5 == 0:
            metrics = {}
            for key, value in loss_dict.items():
                modelHelper.saveMetricsNewPolyaxon(metrics, key, value.item(), epoch, self.config)
            modelHelper.saveMetricsNewPolyaxon(metrics, 'critic cost', critic_cost.item(), epoch, self.config)
            modelHelper.saveMetricsNewPolyaxon(metrics, 'Wasserstein distance', wasserstein_d.item(), epoch, self.config)
            modelHelper.saveMetricsNewPolyaxon(metrics, 'Gen cost', gen_cost.item(), epoch, self.config)

        if epoch % self.save_model_step == 0 and self.trainMode:
            name = str(self.modelName) + '_' + str(epoch)
            model_path = modelHelper.saveModel(name, self.modelOutputPath, gen, self.modelName)
            if self.config.nir_data:
                modelHelper.save_tensor_batch_NIR(
                    real, Masked_fake_img, fake_imgs, self.batchSize,
                    Path.joinpath(self.ImageOutputPath, 'epoch_' + str(epoch)))
            else:
                # Changed the else branch to handle the new generator taking SAR
                modelHelper.save_tensor_batch_NIR(
                    real, Masked_fake_img, fake_imgs, self.batchSize,
                    Path.joinpath(self.ImageOutputPath, 'epoch_' + str(epoch)))
                # else:
                #     modelHelper.save_tensor_batch(real, Masked_fake_img, fake_imgs, self.batchSize,
                #                                   Path.joinpath(self.ImageOutputPath, 'epoch_' + str(epoch)))

        # Save loss from generator and critic to a file
        # filename = Path.joinpath(self.modelOutputPath, self.modelName + '_' + str(self.batchSize) + 'Errors.txt')
        # saveString = ('wasserStein Number: ' + str(wasserstein_d) + ' Generator loss: ' + str(g_loss.item()) + '\n'
        #               + 'critic loss: ' + str(d_loss.item()) + '\n'
        #               + 'critic guess on reals: ' + str(critic_score) + ' critic guess on fakes: ' + str(gen_score)
        #               + ' Updated critic guess on fake: ' + str(gen_cost) + '\n')
        # modelHelper.saveToTxt(filename, saveString)

    if self.trainWithFreeze:
        # trainFrozenModel = trainFrozenGan(self.dataloader, gen, critic, gen_opt, critic_opt, self.config)
        # trainFrozenGan.trainGAN()
        # Freeze BN in the encoder parts of the network.
        # Use affine? Or set weight and bias via module.eval().
        for name, module in gen.named_modules():
            if isinstance(module, nn.BatchNorm2d) and 'down' in name:
                module.eval()

        for epoch in range(self.epochsFrozen):
            for real, SAR in tqdm(self.dataloader, position=0, leave=True,
                                  disable=self.config.run_polyaxon):
                masks = loadAndAgumentMasks.returnTensorMasks(self.batchSize)
                masks = torch.from_numpy(masks)
                masks = masks.type(torch.cuda.FloatTensor)
                masks = 1 - masks
                masks = masks.to(self.device)  # .to() is not in-place; assign the result

                real = real.to(self.device)

                # ---------------------
                #  Train critic
                # ---------------------
                critic.zero_grad()
                # Real images
                real_validity = critic(real)
                d_real = real_validity.mean()
                # Generate a batch of images with mask
                Masked_fake_img = torch.mul(real, masks)
                fake_imgs = gen(Masked_fake_img, masks)
                # Fake images
                fake_validity = critic(fake_imgs)  # Detach or not?
                d_fake = fake_validity.mean()

                gradient_penalty = self.calc_gradient_penalty(critic, real.data, fake_imgs.data)
                d_loss = d_fake - d_real + gradient_penalty
                d_loss.backward()
                critic_opt.step()

                # Values for txt / logging
                critic_cost = d_fake - d_real + gradient_penalty
                wasserstein_d = d_real - d_fake
                critic_score = real_validity.mean().item()
                gen_score = fake_validity.mean().item()

                # Train the generator every n_critic steps
                if cur_step % self.n_critic == 0:
                    # -----------------
                    #  Train Generator
                    # -----------------
                    gen.zero_grad()
                    # Generate a batch of images
                    fake_noise = torch.mul(real, masks)
                    fake_imgs = gen(fake_noise, masks)
                    # Loss measures generator's ability to fool the critic
                    # Train on fake images
                    fake_validity1 = critic(fake_imgs)

                    loss_dict = loss_func(fake_noise, masks, fake_imgs, real)
                    loss = 0.0
                    # sums up each loss value
                    for key, value in loss_dict.items():
                        loss += value
                    loss.backward(retain_graph=True)

                    g_loss = fake_validity1.mean()
                    # g_lossMSE = criterionMSE(real, fake_imgs)
                    # g_lossMSE.backward(retain_graph=True)
                    g_loss = -g_loss
                    g_loss.backward()  # mone
                    gen_opt.step()
                    gen_cost = g_loss
                cur_step += 1

            if self.config.run_polyaxon and epoch % 5 == 0:
                metrics = {}
                for key, value in loss_dict.items():  # duplicated inner loop removed
                    modelHelper.saveMetricsNewPolyaxon(metrics, key, value.item(), epoch, self.config)
                modelHelper.saveMetricsNewPolyaxon(metrics, 'critic cost', critic_cost.item(), epoch, self.config)
                modelHelper.saveMetricsNewPolyaxon(metrics, 'Wasserstein distance', wasserstein_d.item(), epoch, self.config)
                modelHelper.saveMetricsNewPolyaxon(metrics, 'Gen cost', gen_cost.item(), epoch, self.config)

            if epoch % self.save_model_step == 0 and self.trainMode:
                name = str(self.modelName) + '_' + str(epoch + self.epochs)
                model_path = modelHelper.saveModel(name, self.modelOutputPath, gen, self.modelName)
                if self.config.nir_data:
                    modelHelper.save_tensor_batch_NIR(
                        real, Masked_fake_img, fake_imgs, self.batchSize,
                        Path.joinpath(self.ImageOutputPath, 'epoch_' + str(epoch)))
                else:
                    modelHelper.save_tensor_batch(
                        real, Masked_fake_img, fake_imgs, self.batchSize,
                        Path.joinpath(self.ImageOutputPath, 'epoch_' + str(epoch + self.epochs)))

            # Save loss from generator and critic to a file
            filename = Path.joinpath(self.modelOutputPath,
                                     self.modelName + '_' + str(self.batchSize) + 'Errors.txt')
            saveString = ('wasserStein Number: ' + str(wasserstein_d) +
                          ' Generator loss: ' + str(g_loss.item()) + '\n' +
                          'critic loss: ' + str(d_loss.item()) + '\n' +
                          'critic guess on reals: ' + str(critic_score) +
                          ' critic guess on fakes: ' + str(gen_score) +
                          ' Updated critic guess on fake: ' + str(gen_cost) + '\n')
            modelHelper.saveToTxt(filename, saveString)

    # NOTE: model_path is only bound once a checkpoint has been saved
    return model_path
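# Hedged sketch of the WGAN-GP gradient penalty referenced above as
# `self.calc_gradient_penalty`; this interpolation-based form is the standard
# Gulrajani et al. recipe for NCHW image batches, assumed (not taken from
# this codebase).
import torch

def calc_gradient_penalty(critic, real_data, fake_data, lambda_gp=10.0):
    batch_size = real_data.size(0)
    # Random convex combination of real and fake samples
    alpha = torch.rand(batch_size, 1, 1, 1, device=real_data.device)
    interpolates = (alpha * real_data + (1 - alpha) * fake_data).requires_grad_(True)
    critic_interpolates = critic(interpolates)
    gradients = torch.autograd.grad(
        outputs=critic_interpolates,
        inputs=interpolates,
        grad_outputs=torch.ones_like(critic_interpolates),
        create_graph=True,
        retain_graph=True,
    )[0]
    gradients = gradients.view(batch_size, -1)
    # Penalise deviation of the gradient norm from 1
    return lambda_gp * ((gradients.norm(2, dim=1) - 1) ** 2).mean()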
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # Trim a number of training examples
    if args.debug:
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    label_column_name = "label" if "label" in column_names else "labels"

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)  # was args.model_name_or_path: bug fixed
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForMultipleChoice.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMultipleChoice.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    padding = "max_length" if args.pad_to_max_length else False

    def preprocess_function(examples):
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [
            [f"{header} {examples[end][i]}" for end in ending_names]
            for i, header in enumerate(question_headers)
        ]
        labels = examples[label_column_name]

        # Flatten out
        first_sentences = list(chain(*first_sentences))
        second_sentences = list(chain(*second_sentences))

        # Tokenize
        tokenized_examples = tokenizer(
            first_sentences,
            second_sentences,
            max_length=args.max_length,
            padding=padding,
            truncation=True,
        )
        # Un-flatten: regroup the 4 choices of each example
        tokenized_inputs = {k: [v[i:i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    with accelerator.main_process_first():
        processed_datasets = raw_datasets.map(
            preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
        )

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiples
        # of 8, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorForMultipleChoice(
            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
        )

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Use the device given by the `accelerator` object.
    device = accelerator.device
    model.to(device)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Note -> the training dataloader needs to be prepared before we grab its length below
    # (because its length will be shorter in a multiprocess setting).

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Metrics
    metric = load_metric("accuracy")

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )

        eval_metric = metric.compute()
        accelerator.print(f"epoch {epoch}: {eval_metric}")

        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
                )

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
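# Self-contained sketch of the gradient-accumulation pattern used in the loop
# above: the loss is scaled by the accumulation factor and the optimizer only
# steps every `accum` micro-batches. The toy model and data are assumptions.
import torch

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
accum = 4

for step in range(16):
    x, y = torch.randn(8, 10), torch.randn(8, 1)
    loss = torch.nn.functional.mse_loss(model(x), y) / accum
    loss.backward()                      # gradients accumulate across micro-batches
    if (step + 1) % accum == 0:
        optimizer.step()
        optimizer.zero_grad()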
def predict(self, to_predict):
    """
    Performs predictions on a list of text.

    Args:
        to_predict: A python list of text (str) to be sent to the model for prediction.

    Returns:
        preds: A Python list of lists with dicts containing each word mapped to its NER tag.
        model_outputs: A python list of the raw model outputs for each text.
    """
    device = self.device
    model = self.model
    args = self.args
    pad_token_label_id = self.pad_token_label_id

    self._move_model_to_device()

    predict_examples = [
        InputExample(i, sentence.split(), ["O" for word in sentence.split()])
        for i, sentence in enumerate(to_predict)
    ]

    eval_dataset = self.load_and_cache_examples(None, to_predict=predict_examples)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"])

    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()

    for batch in tqdm(eval_dataloader, disable=args["silent"]):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
            }
            # XLM and RoBERTa don't use segment_ids
            if args["model_type"] in ["bert", "xlnet"]:
                inputs["token_type_ids"] = batch[2]
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()

        nb_eval_steps += 1

        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    model_outputs = preds
    preds = np.argmax(preds, axis=2)

    label_map = {i: label for i, label in enumerate(self.labels)}

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    preds = [
        [{word: preds_list[i][j]} for j, word in enumerate(sentence.split()[:len(preds_list[i])])]
        for i, sentence in enumerate(to_predict)
    ]

    return preds, model_outputs
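# Self-contained sketch of the pad-filtering step above: positions whose label
# id equals the pad id are dropped before mapping ids back to tags. The toy
# ids and tag set below are assumptions for illustration.
import numpy as np

pad = -100
out_label_ids = np.array([[0, 1, pad], [2, pad, pad]])
preds = np.array([[0, 2, 1], [2, 0, 0]])
label_map = {0: "O", 1: "B-PER", 2: "B-LOC"}
preds_list = [
    [label_map[p] for p, l in zip(pr, la) if l != pad]
    for pr, la in zip(preds, out_label_ids)
]
# -> [['O', 'B-LOC'], ['B-LOC']]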
def coverage_table(
    peakfile,
    datafiles,
    window,
    log_transform=True,
    normalization="none",
    top=0,
    topmethod="var",
    rmdup=True,
    rmrepeats=True,
    ncpus=12,
):
    for x in datafiles:
        if not os.path.isfile(x):
            print("ERROR: Data file '{0}' does not exist".format(x))
            sys.exit(1)
    for x in datafiles:
        if ".bam" in x and not os.path.isfile("{0}.bai".format(x)):
            print("Data file '{0}' does not have an index file."
                  " Creating an index file for {0}.".format(x))
            pysam.index(x)

    logger.info("Loading data")
    data = {}
    try:
        # Load data in parallel
        pool = multiprocessing.Pool(processes=ncpus)
        jobs = []
        for datafile in datafiles:
            jobs.append(
                pool.apply_async(
                    load_heatmap_data,
                    args=(peakfile, datafile, 1, window // 2, window // 2,
                          rmdup, False, rmrepeats, None, False, None),
                ))
        for job in tqdm(jobs):
            track, regions, profile, guard = job.get()
            data[os.path.splitext(track)[0]] = profile[:, 0]
    except Exception as e:
        sys.stderr.write("Error loading data in parallel, trying serial\n")
        sys.stderr.write("Error: {}\n".format(e))
        for datafile in tqdm(datafiles):
            track, regions, profile, guard = load_heatmap_data(
                peakfile, datafile, 1, window // 2, window // 2,
                rmdup, False, rmrepeats, None, False, None)
            data[os.path.splitext(track)[0]] = profile[:, 0]

    # Create DataFrame with regions as index
    regions = ["{}:{}-{}".format(*region[:3]) for region in regions]
    df = pd.DataFrame(data, index=regions)

    if log_transform:
        logger.info("Log transform")
        df = np.log1p(df)
    if normalization == "scale":
        logger.info("Normalization by scaling")
        df[:] = scale(df, axis=0)
    elif normalization == "quantile":  # was a plain `if`, which made "scale" also log "No normalization"
        logger.info("Normalization by quantile normalization")
        df = qnorm.quantile_normalize(df)
    else:
        logger.info("No normalization")

    if top > 0:
        if topmethod == "var":
            idx = df.var(1).sort_values().tail(top).index
        elif topmethod == "std":
            idx = df.std(1).sort_values().tail(top).index
        elif topmethod == "mean":
            idx = df.mean(1).sort_values().tail(top).index
        elif topmethod == "random":
            idx = df.sample(top).index
        else:
            raise ValueError("unknown method {} for selecting regions".format(topmethod))
        df = df.loc[idx]
    return df
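# Self-contained sketch of the transform/normalisation steps above on toy
# count data (numpy/pandas/sklearn only; the toy frame is an assumption).
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale

df = pd.DataFrame(np.random.poisson(5, size=(6, 3)),
                  columns=["a.bam", "b.bam", "c.bam"])
df = np.log1p(df)                             # log transform counts
df[:] = scale(df, axis=0)                     # column-wise standardisation ("scale")
print(df.var(1).sort_values().tail(2).index)  # top-2 most variable regions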
def render_dataset(dataset: np.ndarray, names: np.ndarray, args):
    '''Renders a list of tex equations

    Args:
        dataset (numpy.ndarray): List of equations
        names (numpy.ndarray): List of integers of size `dataset` that give the name of the saved image
        args (Union[Namespace, Munch]): additional arguments:
            mode (equation or inline), out (output directory), divable (common divisor for image dimensions),
            batchsize (how many samples to render at once), dpi, font (math font),
            preprocess (crop, alpha off), shuffle (bool)

    Returns:
        list: equation indices that could not be rendered.
    '''
    assert len(names) == len(dataset), 'names and dataset must be of equal size'
    math_mode = '$$' if args.mode == 'equation' else '$'
    os.makedirs(args.out, exist_ok=True)
    indices = np.array([
        int(os.path.basename(img).split('.')[0])
        for img in glob.glob(os.path.join(args.out, '*.png'))
    ])
    # skip equations that have already been rendered
    valid = [i for i, j in enumerate(names) if j not in indices]
    dataset = dataset[valid]
    names = names[valid]
    order = np.random.permutation(len(dataset)) if args.shuffle else np.arange(len(dataset))
    faulty = []
    for i in tqdm(range(0, len(dataset), args.batchsize)):
        batch = dataset[order[i:i + args.batchsize]]
        # batch = [x for j, x in enumerate(batch) if order[i+j] not in indices]
        if len(batch) == 0:
            continue
        math = [math_mode + x + math_mode for x in batch if x != '']
        # print('\n', i, len(math), '\n'.join(math))
        if len(args.font) > 1:
            font = np.random.choice(args.font)
        else:
            font = args.font[0]
        if len(args.dpi) > 1:
            dpi = np.random.choice(np.arange(min(args.dpi), max(args.dpi)))
        else:
            dpi = args.dpi[0]
        if len(math) > 0:
            try:
                if args.preprocess:
                    pngs = tex2pil(math, dpi=dpi, font=font)
                else:
                    pngs = Latex(math, dpi=dpi, font=font).write(return_bytes=False)
            except Exception as e:
                # print(e)
                # print(math)
                # raise e
                faulty.extend(list(names[order[i:i + args.batchsize]]))
                continue

            for j, k in enumerate(range(i, i + len(pngs))):
                outpath = os.path.join(args.out, '%07d.png' % names[order[k]])
                if args.preprocess:
                    try:
                        data = np.asarray(pngs[j])
                        # print(data.shape)
                        gray = 255 * (data[..., 0] < 128).astype(np.uint8)  # invert the text to white
                        coords = cv2.findNonZero(gray)  # find all non-zero points (text)
                        a, b, w, h = cv2.boundingRect(coords)  # find minimum spanning bounding box
                        rect = data[b:b + h, a:a + w]
                        im = Image.fromarray((255 - rect[..., -1]).astype(np.uint8)).convert('L')
                        # pad the image dimensions up to the nearest multiple of `divable`
                        dims = []
                        for x in [w, h]:
                            div, mod = divmod(x, args.divable)
                            dims.append(args.divable * (div + (1 if mod > 0 else 0)))
                        padded = Image.new('L', dims, 255)
                        padded.paste(im, im.getbbox())
                        padded.save(outpath)
                    except Exception as e:
                        print(e)
                        pass
                else:
                    shutil.move(pngs[j], outpath)
    return np.array(faulty)
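# Self-contained sketch of the crop-to-content step above: threshold, find
# non-zero pixels, and take their bounding rectangle (toy image assumed).
import numpy as np
import cv2

img = np.full((64, 64), 255, np.uint8)
img[20:30, 10:50] = 0                      # dark "text" block
gray = 255 * (img < 128).astype(np.uint8)  # text becomes white on black
x, y, w, h = cv2.boundingRect(cv2.findNonZero(gray))
crop = img[y:y + h, x:x + w]               # 10x40 region containing the text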
def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None, verbose=True):
    """
    Trains the model on train_dataset.

    Utility function to be used by the train_model() method. Not intended to be used directly.
    """
    device = self.device
    model = self.model
    args = self.args

    tb_writer = SummaryWriter(logdir=args["tensorboard_dir"])
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"])

    t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"]

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args["weight_decay"],
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    warmup_steps = math.ceil(t_total * args["warmup_ratio"])
    args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=args["learning_rate"],
        eps=args["adam_epsilon"],
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total)

    if args["fp16"]:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"])

    if args["n_gpu"] > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"])
    epoch_number = 0
    best_eval_metric = None
    early_stopping_counter = 0

    if args["evaluate_during_training"]:
        training_progress_scores = self._create_training_progress_scores()
    if args["wandb_project"]:
        wandb.init(project=args["wandb_project"], config={**args}, **args["wandb_kwargs"])
        wandb.watch(self.model)

    model.train()
    for _ in train_iterator:
        # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(
                tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])):
            batch = tuple(t.to(device) for t in batch)

            inputs = self._get_inputs_dict(batch)
            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see doc)
            loss = outputs[0]

            if args["n_gpu"] > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training

            current_loss = loss.item()

            if show_running_loss:
                print("\rRunning loss: %f" % loss, end="")

            if args["gradient_accumulation_steps"] > 1:
                loss = loss / args["gradient_accumulation_steps"]

            if args["fp16"]:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                # torch.nn.utils.clip_grad_norm_(
                #     amp.master_params(optimizer), args["max_grad_norm"]
                # )
            else:
                loss.backward()
                # torch.nn.utils.clip_grad_norm_(
                #     model.parameters(), args["max_grad_norm"]
                # )

            tr_loss += loss.item()
            if (step + 1) % args["gradient_accumulation_steps"] == 0:
                if args["fp16"]:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"])
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"])

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                    # Log metrics
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar(
                        "loss",
                        (tr_loss - logging_loss) / args["logging_steps"],
                        global_step,
                    )
                    logging_loss = tr_loss
                    if args["wandb_project"]:
                        wandb.log({
                            "Training loss": current_loss,
                            "lr": scheduler.get_lr()[0],
                            "global_step": global_step,
                        })

                if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                    # Save model checkpoint
                    output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))
                    self._save_model(output_dir_current, model=model)

                if args["evaluate_during_training"] and (
                        args["evaluate_during_training_steps"] > 0
                        and global_step % args["evaluate_during_training_steps"] == 0):
                    # Only evaluate when single GPU otherwise metrics may not average well
                    results, _, _ = self.eval_model(eval_df, verbose=True)
                    for key, value in results.items():
                        tb_writer.add_scalar("eval_{}".format(key), value, global_step)

                    output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                    os.makedirs(output_dir_current, exist_ok=True)

                    if args["save_eval_checkpoints"]:
                        self._save_model(output_dir_current, model=model, results=results)

                    training_progress_scores["global_step"].append(global_step)
                    training_progress_scores["train_loss"].append(current_loss)
                    for key in results:
                        training_progress_scores[key].append(results[key])
                    report = pd.DataFrame(training_progress_scores)
                    report.to_csv(
                        os.path.join(args["output_dir"], "training_progress_scores.csv"),
                        index=False,
                    )

                    if args["wandb_project"]:
                        wandb.log(self._get_last_metrics(training_progress_scores))

                    if not best_eval_metric:
                        best_eval_metric = results[args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"], model=model, results=results)
                    if best_eval_metric and args["early_stopping_metric_minimize"]:
                        if results[args["early_stopping_metric"]] - best_eval_metric < args["early_stopping_delta"]:
                            best_eval_metric = results[args["early_stopping_metric"]]
                            self._save_model(args["best_model_dir"], model=model, results=results)
                            early_stopping_counter = 0
                        else:
                            if args["use_early_stopping"]:
                                if early_stopping_counter < args["early_stopping_patience"]:
                                    early_stopping_counter += 1
                                    if verbose:
                                        logger.info(f" No improvement in {args['early_stopping_metric']}")
                                        logger.info(f" Current step: {early_stopping_counter}")
                                        logger.info(f" Early stopping patience: {args['early_stopping_patience']}")
                                else:
                                    if verbose:
                                        logger.info(f" Patience of {args['early_stopping_patience']} steps reached")
                                        logger.info(" Training terminated.")
                                    train_iterator.close()
                                    return global_step, tr_loss / global_step
                    else:
                        if results[args["early_stopping_metric"]] - best_eval_metric > args["early_stopping_delta"]:
                            best_eval_metric = results[args["early_stopping_metric"]]
                            self._save_model(args["best_model_dir"], model=model, results=results)
                            early_stopping_counter = 0
                        else:
                            if args["use_early_stopping"]:
                                if early_stopping_counter < args["early_stopping_patience"]:
                                    early_stopping_counter += 1
                                    if verbose:
                                        logger.info(f" No improvement in {args['early_stopping_metric']}")
                                        logger.info(f" Current step: {early_stopping_counter}")
                                        logger.info(f" Early stopping patience: {args['early_stopping_patience']}")
                                else:
                                    if verbose:
                                        logger.info(f" Patience of {args['early_stopping_patience']} steps reached")
                                        logger.info(" Training terminated.")
                                    train_iterator.close()
                                    return global_step, tr_loss / global_step

        epoch_number += 1
        output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

        if args["save_model_every_epoch"] or args["evaluate_during_training"]:
            os.makedirs(output_dir_current, exist_ok=True)

        if args["save_model_every_epoch"]:
            self._save_model(output_dir_current, model=model)

        if args["evaluate_during_training"]:
            results, _, _ = self.eval_model(eval_df, verbose=True)

            self._save_model(output_dir_current, results=results)

            training_progress_scores["global_step"].append(global_step)
            training_progress_scores["train_loss"].append(current_loss)
            for key in results:
                training_progress_scores[key].append(results[key])
            report = pd.DataFrame(training_progress_scores)
            report.to_csv(os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False)

            if not best_eval_metric:
                best_eval_metric = results[args["early_stopping_metric"]]
                self._save_model(args["best_model_dir"], model=model, results=results)
            if best_eval_metric and args["early_stopping_metric_minimize"]:
                if results[args["early_stopping_metric"]] - best_eval_metric < args["early_stopping_delta"]:
                    best_eval_metric = results[args["early_stopping_metric"]]
                    self._save_model(args["best_model_dir"], model=model, results=results)
                    early_stopping_counter = 0
            else:
                if results[args["early_stopping_metric"]] - best_eval_metric > args["early_stopping_delta"]:
                    best_eval_metric = results[args["early_stopping_metric"]]
                    self._save_model(args["best_model_dir"], model=model, results=results)
                    early_stopping_counter = 0

    return global_step, tr_loss / global_step
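# Self-contained sketch of the early-stopping bookkeeping duplicated above:
# track the best metric, reset the counter on improvement, stop at patience.
# The class and its names are assumptions for illustration.
class EarlyStopping:
    def __init__(self, patience=3, delta=0.0, minimize=True):
        self.patience, self.delta, self.minimize = patience, delta, minimize
        self.best, self.counter = None, 0

    def step(self, metric):
        """Return True when training should stop."""
        improved = (self.best is None
                    or (self.minimize and metric < self.best - self.delta)
                    or (not self.minimize and metric > self.best + self.delta))
        if improved:
            self.best, self.counter = metric, 0
            return False
        self.counter += 1
        return self.counter >= self.patience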
def _prediction_loop(
    self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

    Works both with or without labels.
    """

    prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

    model = self.model
    # multi-gpu eval
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    else:
        model = self.model
    # Note: in torch.distributed mode, there's no point in wrapping the model
    # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

    batch_size = dataloader.batch_size
    logger.info("***** Running %s *****", description)
    logger.info("  Num examples = %d", self.num_examples(dataloader))
    logger.info("  Batch size = %d", batch_size)
    eval_losses: List[float] = []
    preds: torch.Tensor = None
    label_ids: torch.Tensor = None
    model.eval()

    if is_torch_tpu_available():
        dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device)

    if self.args.past_index >= 0:
        past = None

    for inputs in tqdm(dataloader, desc=description):
        has_labels = any(inputs.get(k) is not None for k in ["labels", "lm_labels", "masked_lm_labels"])

        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(self.args.device)
        if self.args.past_index >= 0:
            inputs["mems"] = past

        with torch.no_grad():
            outputs = model(**inputs)
            if has_labels:
                step_eval_loss, logits = outputs[:2]
                eval_losses += [step_eval_loss.mean().item()]
            else:
                logits = outputs[0]
            if self.args.past_index >= 0:
                past = outputs[self.args.past_index if has_labels else self.args.past_index - 1]

        if not prediction_loss_only:
            if preds is None:
                preds = logits.detach()
            else:
                preds = torch.cat((preds, logits.detach()), dim=0)
            if inputs.get("labels") is not None:
                if label_ids is None:
                    label_ids = inputs["labels"].detach()
                else:
                    label_ids = torch.cat((label_ids, inputs["labels"].detach()), dim=0)

    if self.args.local_rank != -1:
        # In distributed mode, concatenate all results from all nodes:
        if preds is not None:
            preds = self.distributed_concat(preds, num_total_examples=self.num_examples(dataloader))
        if label_ids is not None:
            label_ids = self.distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader))
    elif is_torch_tpu_available():
        # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
        if preds is not None:
            preds = xm.mesh_reduce("eval_preds", preds, torch.cat)
        if label_ids is not None:
            label_ids = xm.mesh_reduce("eval_label_ids", label_ids, torch.cat)

    # Finally, turn the aggregated tensors into numpy arrays.
    if preds is not None:
        preds = preds.cpu().numpy()
    if label_ids is not None:
        label_ids = label_ids.cpu().numpy()

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}
    if len(eval_losses) > 0:
        metrics["eval_loss"] = np.mean(eval_losses)

    # Prefix all keys with eval_
    for key in list(metrics.keys()):
        if not key.startswith("eval_"):
            metrics[f"eval_{key}"] = metrics.pop(key)

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
def compute_IDFS(output_folder, cut):
    config = wandb.config
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    cpus = max(1, config.number_of_cpus)
    logging.info("Computing IDF with %i cpus" % cpus)
    excess_lines = config.corpus_size % cpus
    number_of_chunks = cpus
    if excess_lines > 0:
        number_of_chunks = cpus - 1
        excess_lines = config.corpus_size % number_of_chunks
    lines_per_chunk = config.corpus_size // number_of_chunks
    logging.info("{} lines per chunk".format(lines_per_chunk))
    logging.info("{} lines for last chunk".format(excess_lines))
    assert (number_of_chunks * lines_per_chunk + excess_lines) == config.corpus_size

    if cut == 'cut':
        docs_path = os.path.join(config.data_home, "docs/msmarco-docs.tokenized.cut.tsv")
    else:
        docs_path = os.path.join(config.data_home, "docs/msmarco-docs.tokenized.tsv")

    block_offset = dict()
    if cpus < 2:
        block_offset[0] = 0
    else:
        # Compute offset for documents for each chunk to be processed
        output_file = os.path.join(output_folder, "blocks_offset_{}-cpus".format(cpus))
        if not os.path.isfile(output_file):
            pbar = tqdm(total=config.corpus_size + 1, desc="Computing chunks for each processor")
            with open(docs_path) as f:
                current_chunk = 0
                counter = 0
                line = True
                while line:
                    if counter % lines_per_chunk == 0:
                        block_offset[current_chunk] = f.tell()
                        current_chunk += 1
                    line = f.readline()
                    pbar.update()
                    counter += 1
            pbar.close()
            pickle.dump(block_offset, open(output_file, 'wb'))
        else:
            block_offset = pickle.load(open(output_file, 'rb'))

    if cpus < 2:
        # Single CPU, compute directly.
        process_chunk(0, block_offset, docs_path, lines_per_chunk, output_folder)
    else:
        pbar = tqdm(total=cpus, position=0)

        def update(*a):
            # Update progress bar
            pbar.update()

        pool = mp.Pool(cpus)
        jobs = []
        for i in range(len(block_offset)):
            jobs.append(
                pool.apply_async(process_chunk,
                                 args=(i, block_offset, docs_path, lines_per_chunk, output_folder),
                                 callback=update))
        for job in jobs:
            job.get()
        pool.close()
        pbar.close()

    # merge the per-chunk counters into a single counter
    full_IDFS = Counter()
    for i in range(len(block_offset)):
        _idf = pickle.load(open(os.path.join(output_folder, "IDFS-{}".format(i)), 'rb'))
        for k in _idf:
            full_IDFS[k] += _idf[k]
        os.remove(os.path.join(output_folder, "IDFS-{}".format(i)))
    pickle.dump(full_IDFS, open(os.path.join(output_folder, "IDFS-FULL-{}".format(cut)), 'wb'))
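# Hedged sketch of turning the merged document-frequency counter into IDF
# scores; the standard log(N/df) form is an assumption (the actual formula
# lives in process_chunk, which is not shown here). Toy counts assumed.
import math
from collections import Counter

df_counts = Counter({"the": 900, "marco": 120, "ranking": 45})
N = 1000  # corpus size
idf = {term: math.log(N / df) for term, df in df_counts.items()}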
bboxs = np.stack(df['bbox'].apply(lambda x: np.fromstring(x[1:-1], sep=',')))
for i, column in enumerate(['x', 'y', 'w', 'h']):
    df[column] = bboxs[:, i]
df.drop(columns=['bbox'], inplace=True)
df['x_center'] = df['x'] + df['w'] / 2
df['y_center'] = df['y'] + df['h'] / 2
df['classes'] = 0

from tqdm.auto import tqdm
import shutil as sh

df = df[['image_id', 'x', 'y', 'w', 'h', 'x_center', 'y_center', 'classes']]

source = 'train'
if True:
    for fold in [0]:
        # `index` is assumed to hold the unique image ids, defined earlier in the notebook
        val_index = index[len(index) * fold // 5:len(index) * (fold + 1) // 5]
        for name, mini in tqdm(df.groupby('image_id')):
            if name in val_index:
                path2save = 'val2017/'
            else:
                path2save = 'train2017/'
            if not os.path.exists('convertor/fold{}/labels/'.format(fold) + path2save):
                os.makedirs('convertor/fold{}/labels/'.format(fold) + path2save)
            with open('convertor/fold{}/labels/'.format(fold) + path2save + name + ".txt", 'w+') as f:
                row = mini[['classes', 'x_center', 'y_center', 'w', 'h']].astype(float).values
                row = row / 1024  # normalise to [0, 1] (images are 1024x1024)
                row = row.astype(str)
                # write one "class x_center y_center w h" line per box
                # (this write loop was truncated in the original snippet)
                for j in range(len(row)):
                    f.write(' '.join(row[j]) + '\n')
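# Self-contained sketch of the COCO-style [x, y, w, h] -> YOLO
# [class, x_center, y_center, w, h] conversion done above (1024px images;
# the sample box is an assumption for illustration).
import numpy as np

x, y, w, h = 834.0, 222.0, 56.0, 36.0   # top-left corner + size, in pixels
x_c, y_c = x + w / 2, y + h / 2
label = np.array([0, x_c, y_c, w, h], dtype=float)
label[1:] /= 1024                        # normalise coordinates to [0, 1]
print(' '.join(label.astype(str)))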
def train(self, model_path: Optional[str] = None):
    """
    Main training entry point.

    Args:
        model_path (:obj:`str`, `optional`):
            Local path to the model if the model to train has been instantiated from a local path. If present,
            training will resume from the optimizer/scheduler states loaded here.
    """
    train_dataloader = self.get_train_dataloader()
    if self.args.max_steps > 0:
        t_total = self.args.max_steps
        num_train_epochs = (
            self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
        )
    else:
        t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs)
        num_train_epochs = self.args.num_train_epochs

    optimizer, scheduler = self.get_optimizers(num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (
        model_path is not None
        and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(model_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device)
        )
        scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))

    model = self.model
    if self.args.fp16:
        if not is_apex_available():
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if self.args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[self.args.local_rank],
            output_device=self.args.local_rank,
            find_unused_parameters=True,
        )

    if self.tb_writer is not None:
        self.tb_writer.add_text("args", self.args.to_json_string())
        self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={})

    # Train!
    if is_torch_tpu_available():
        total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size()
    else:
        total_train_batch_size = (
            self.args.train_batch_size
            * self.args.gradient_accumulation_steps
            * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1)
        )
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", self.num_examples(train_dataloader))
    logger.info("  Num Epochs = %d", num_train_epochs)
    logger.info("  Instantaneous batch size per device = %d", self.args.per_device_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    self.global_step = 0
    self.epoch = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if model_path is not None:
        # set global_step to global_step of last saved checkpoint from model path
        try:
            self.global_step = int(model_path.split("-")[-1].split("/")[0])
            epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = self.global_step % (
                len(train_dataloader) // self.args.gradient_accumulation_steps
            )

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", self.global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            self.global_step = 0
            logger.info("  Starting fine-tuning.")

    tr_loss = 0.0
    logging_loss = 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(num_train_epochs), desc="Epoch", disable=not self.is_local_master()
    )
    for epoch in train_iterator:
        if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
            train_dataloader.sampler.set_epoch(epoch)

        if is_torch_tpu_available():
            parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader(
                self.args.device
            )
            epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=not self.is_local_master())
        else:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not self.is_local_master())

        # Reset the past mems state at the beginning of each epoch if necessary.
        if self.args.past_index >= 0:
            self._past = None

        for step, inputs in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            tr_loss += self._training_step(model, inputs, optimizer)

            if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                # last step in epoch but step is always smaller than gradient_accumulation_steps
                len(epoch_iterator) <= self.args.gradient_accumulation_steps
                and (step + 1) == len(epoch_iterator)
            ):
                if self.args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

                if is_torch_tpu_available():
                    xm.optimizer_step(optimizer)
                else:
                    optimizer.step()

                scheduler.step()
                model.zero_grad()
                self.global_step += 1
                self.epoch = epoch + (step + 1) / len(epoch_iterator)

                if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or (
                    self.global_step == 1 and self.args.logging_first_step
                ):
                    logs: Dict[str, float] = {}
                    logs["loss"] = (tr_loss - logging_loss) / self.args.logging_steps
                    # backward compatibility for pytorch schedulers
                    logs["learning_rate"] = (
                        scheduler.get_last_lr()[0]
                        if version.parse(torch.__version__) >= version.parse("1.4")
                        else scheduler.get_lr()[0]
                    )
                    logging_loss = tr_loss

                    self._log(logs)

                    if self.args.evaluate_during_training and self.global_step % self.args.eval_steps == 0:
                        self.evaluate()

                if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0:
                    # In all cases (even distributed/parallel), self.model is always a reference
                    # to the model we want to save.
                    if hasattr(model, "module"):
                        assert model.module is self.model
                    else:
                        assert model is self.model
                    # Save model checkpoint
                    output_dir = os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}")

                    self.save_model(output_dir)

                    if self.is_world_master():
                        self._rotate_checkpoints()

                    if is_torch_tpu_available():
                        xm.rendezvous("saving_optimizer_states")
                        xm.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        xm.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    elif self.is_world_master():
                        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

            if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
                epoch_iterator.close()
                break
        if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
            train_iterator.close()
            break
        if self.args.tpu_metrics_debug or self.args.debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())

    if self.tb_writer:
        self.tb_writer.close()
    if self.args.past_index and hasattr(self, "_past"):
        # Clean the state at the end of training
        delattr(self, "_past")

    logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
    return TrainOutput(self.global_step, tr_loss / self.global_step)
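# Self-contained sketch of the checkpoint-step parsing used above: a path like
# ".../checkpoint-500" encodes the saved global step (sample values assumed).
model_path = "output/checkpoint-500"
global_step = int(model_path.split("-")[-1].split("/")[0])  # -> 500
steps_per_epoch = 120  # len(dataloader) // gradient_accumulation_steps
epochs_trained = global_step // steps_per_epoch              # -> 4
steps_trained_in_current_epoch = global_step % steps_per_epoch  # -> 20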
# NOTE: the opening of this snippet was truncated; `commute_data` is read above
# with pd.read_csv(..., dtype={'OFIPS': str, 'DFIPS': str}) so the FIPS codes
# keep their leading zeros. `filterUSPS` (state codes to keep) is also assumed
# to be defined earlier.
census_tracts = pd.read_csv('united-states-commutes/census_tracts_2010.csv', dtype={'GEOID': str})
census_tracts = census_tracts[census_tracts['USPS'].isin(filterUSPS)]

commute_data = commute_data[commute_data['OFIPS'].isin(census_tracts['GEOID'].unique())]
commute_data = commute_data[commute_data['DFIPS'].isin(census_tracts['GEOID'].unique())]

census_tracts = census_tracts.sort_values('POP10', ascending=True)
census_tracts.reset_index(inplace=True)
commute_data.reset_index(inplace=True)

attribution = np.empty(len(census_tracts))
attribution[:] = np.nan
# a tract's county FIPS code is the first five digits of its GEOID
for index, row in tqdm(census_tracts.iterrows(), total=census_tracts.shape[0]):
    attribution[index] = int(str(row['GEOID'])[:5])
census_tracts['county'] = attribution

counties_names = np.unique(attribution)
n_counties = len(counties_names)
pop = np.zeros(n_counties)
state_USPS = []
for i, ct in enumerate(counties_names):
    pop[i] = census_tracts[census_tracts['county'] == ct]['POP10'].sum()
    state_USPS.append(census_tracts[census_tracts['county'] == ct].USPS.iloc[0])

groups = {'geoid': counties_names, 'pop2010': pop, 'stateUSPS': state_USPS}
geodata = pd.DataFrame.from_dict(groups)
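# Self-contained sketch of the tract-to-county roll-up above: the county FIPS
# is the first 5 characters of an 11-digit tract GEOID (toy data assumed).
import pandas as pd

tracts = pd.DataFrame({'GEOID': ['36061000100', '36061000200', '06037101110'],
                       'POP10': [2000, 1500, 3000]})
tracts['county'] = tracts['GEOID'].str[:5]
county_pop = tracts.groupby('county')['POP10'].sum()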
def predict(model, ds_test, batch_size, device='cpu', scaler=None):
    """
    Gather all predictions into an xarray.

    When we generate predictions with a sequence-to-sequence model, we start at a time
    and then predict N steps into the future. So we have 2 dimensions: source time, target time.

    But we also care about how far into the future we were predicting, so we have
    3 dimensions: source time, target time, time ahead.

    It's hard to use pandas for data with virtual dimensions, so we will use xarray.
    Xarray has an interface similar to pandas but also allows coordinates, which are
    virtual dimensions.
    """
    load_test = torch.utils.data.dataloader.DataLoader(ds_test, batch_size=batch_size)
    freq = ds_test.df.index.freq
    xrs = []
    for i, batch in enumerate(tqdm(load_test, desc='predict', leave=False)):
        model.eval()
        with torch.no_grad():
            x_past, y_past, x_future, y_future = [d.to(device) for d in batch]
            y_dist, extra = model(x_past, y_past, x_future)
            nll = -y_dist.log_prob(y_future)

            # Convert to numpy
            mean = to_numpy(y_dist.loc.squeeze(-1))
            std = to_numpy(y_dist.scale.squeeze(-1))
            nll = to_numpy(nll.squeeze(-1))
            y_future = to_numpy(y_future.squeeze(-1))
            y_past = to_numpy(y_past.squeeze(-1))

        # Make an xarray.Dataset for the data
        bs = y_future.shape[0]
        wp = ds_test.window_past
        t_source = ds_test.df.index[wp + i * bs - 1:wp + i * bs + bs - 1].values
        t_ahead = pd.timedelta_range(1, periods=ds_test.window_future, freq=freq).values
        t_behind = pd.timedelta_range(end=0, periods=ds_test.window_past, freq=freq)
        xr_out = xr.Dataset(
            {
                # Format> name: ([dimensions,...], array),
                "y_past": (["t_source", "t_behind"], y_past),
                "nll": (["t_source", "t_ahead"], nll),
                "y_pred": (["t_source", "t_ahead"], mean),
                "y_pred_std": (["t_source", "t_ahead"], std),
                "y_true": (["t_source", "t_ahead"], y_future),
            },
            coords={"t_source": t_source, "t_ahead": t_ahead, "t_behind": t_behind},
            attrs={
                'freq': str(ds_test.freq),
                "model": str(type(model)),
                "targets": ds_test.columns_target,
            })
        xrs.append(xr_out)

    # Join all batches
    ds_preds = xr.concat(xrs, dim="t_source")

    # undo scaling on y
    if scaler:
        ds_preds['y_pred_std'].values = ds_preds.y_pred_std * scaler.scale_
        ds_preds['y_past'].values = scaler.inverse_transform(ds_preds.y_past)
        ds_preds['y_pred'].values = scaler.inverse_transform(ds_preds.y_pred)
        ds_preds['y_true'].values = scaler.inverse_transform(ds_preds.y_true)

    # Add some derived coordinates (they will be the ones not shown in bold in a notebook).
    # The target time is a function of the source time and how far ahead we predict.
    ds_preds = ds_preds.assign_coords(t_target=ds_preds.t_source + ds_preds.t_ahead)
    ds_preds = ds_preds.assign_coords(t_past=ds_preds.t_source + ds_preds.t_behind)

    # Some plots don't like timedeltas, so let's make a coordinate for time ahead in hours
    ds_preds = ds_preds.assign_coords(
        t_ahead_hours=(ds_preds.t_ahead * 1.0e-9 / 60 / 60).astype(float))
    return ds_preds
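# Self-contained sketch of the coordinate trick above: a derived coordinate
# (t_target) computed from two real dimensions (toy data assumed).
import numpy as np
import pandas as pd
import xarray as xr

ds = xr.Dataset(
    {"y_pred": (["t_source", "t_ahead"], np.random.rand(3, 4))},
    coords={"t_source": pd.date_range("2021-01-01", periods=3, freq="H"),
            "t_ahead": pd.timedelta_range("1H", periods=4, freq="H")})
ds = ds.assign_coords(t_target=ds.t_source + ds.t_ahead)  # virtual dimension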
def import_from_context(context, num_traces, log, parameters=None):
    """
    Import a XES log from an iterparse context

    Parameters
    --------------
    context
        Iterparse context
    num_traces
        Number of traces of the XES log
    log
        Event log (empty)
    parameters
        Parameters of the algorithm

    Returns
    --------------
    log
        Event log (filled with the contents of the XES log)
    """
    if parameters is None:
        parameters = {}

    max_no_traces_to_import = exec_utils.get_param_value(Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT, parameters, False)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters,
                                               xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT, parameters, False)
    show_progress_bar = exec_utils.get_param_value(Parameters.SHOW_PROGRESS_BAR, parameters, True)

    date_parser = dt_parser.get()
    progress = None
    if pkgutil.find_loader("tqdm") and show_progress_bar:
        from tqdm.auto import tqdm
        progress = tqdm(total=num_traces, desc="parsing log, completed traces :: ")

    trace = None
    event = None
    tree = {}
    compression_dictio = {}

    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree, compression_dictio)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), dt, tree,
                                             compression_dictio)
                except (TypeError, ValueError):
                    logging.info("failed to parse date: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError('file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError('file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree,
                                                 compression_dictio)
                    except ValueError:
                        logging.info("failed to parse float: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree,
                                                 compression_dictio)
                    except ValueError:
                        logging.info("failed to parse int: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), val, tree,
                                                 compression_dictio)
                    except ValueError:
                        logging.info("failed to parse boolean: " + str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY), None, tree,
                                             compression_dictio)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent, elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE), tree, compression_dictio)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX: elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI: elem.get(xes_constants.KEY_URI)
                    }
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [
                            x for x in classifier_value.split("'") if x.strip()
                        ]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)
                if progress is not None:
                    progress.update()
                trace = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if timestamp_sort:
        log = sorting.sort_timestamp(log, timestamp_key=timestamp_key, reverse_sort=reverse_sort)

    return log
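The start/end event pattern above is the standard way to stream a large XML file without holding it in memory. A minimal, generic sketch of the same pattern with `lxml` (the file name and the body of each branch are illustrative, not pm4py internals):

from lxml import etree

# Stream a large XML file, reacting to both opening and closing tags.
for tree_event, elem in etree.iterparse("log.xes", events=("start", "end")):
    if tree_event == "start":
        pass  # open a container (e.g. a new trace) when its tag starts
    else:  # "end"
        # the element is complete here; consume it, then free its memory
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]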
raise "No_token" session = vk.AuthSession(access_token=token_to_use) api = vk.API(session, v='5.95', lang='ru', timeout=10) def get_friends_ids(uid): ids = api.friends.get(user_id=uid) return ids['items'] try: with open("whole_leo_subscribers_list.json", "r") as f: whole_leo_subscribers_list = json.load(f) except: whole_leo_subscribers_list = [] for offset_curr in tqdm(range(0, 390556, 1000)): subscribers_ids_leo = api.groups.getMembers( group_id='15787787', fields="""sex, bdate, city, country, photo_max_orig, lists, domain, contacts, connections, education, universities, schools, status, relation, relatives""", offset=offset_curr) whole_leo_subscribers_list.extend(subscribers_ids_leo['items']) time.sleep(0.1) whole_leo_opened_subscribers_list = [] for friend in whole_leo_subscribers_list: if 'deactivated' in friend or friend['is_closed'] == True: pass
def main():
    logging.basicConfig(level=logging.INFO)

    opts = Settings()
    opts = update_settings(opts)

    pool_states = [{} for _ in range(opts.num_workers)]
    for train in [False, True]:
        name = 'objectron-train' if train else 'objectron-test'
        logging.info(F'Processing {name}')
        max_bytes = opts.max_train_bytes if train else opts.max_test_bytes

        # TODO(ycho): Consider fancier (e.g. class-equalizing) shard samplers.
        shards = ObjectronDetection(ObjectronDetection.Settings(local=False), train).shards

        out_dir = (Path(opts.cache_dir).expanduser() / name)
        out_dir.mkdir(parents=True, exist_ok=True)

        if opts.use_pool:
            # NOTE(ycho): The initial approach, based on mp.Pool(). It turned
            # out that a graceful exit cannot be guaranteed this way.
            _download = functools.partial(download_shard, out_dir=out_dir)
            with mp.Pool(opts.num_workers, init_worker) as p:
                with tqdm(total=max_bytes) as pbar:
                    total_bytes = 0
                    for shard_bytes in p.imap_unordered(_download, shards):
                        pbar.update(shard_bytes)
                        # Accumulate and check for termination.
                        total_bytes += shard_bytes
                        if total_bytes >= max_bytes:
                            logging.info(F'Done: {total_bytes} > {max_bytes}')
                            # NOTE(ycho): Due to a bug in mp.Pool(), imap_unordered()
                            # with close()/join() does NOT work; thus we implicitly
                            # call terminate() via the context manager, which may
                            # result in incomplete shards. This condition must be
                            # checked.
                            break
        else:
            init_bytes = sum(f.stat().st_size for f in out_dir.rglob('*') if f.is_file())
            logging.info(F'Starting from {init_bytes}/{max_bytes} ...')

            ctx = mp.get_context('fork')
            stop = ctx.Value('b', (init_bytes >= max_bytes))
            queue = ctx.Queue()
            workers = [
                ctx.Process(target=download_shards,
                            args=(shards[i::opts.num_workers], out_dir, stop, queue))
                for i in range(opts.num_workers)
            ]

            # Start!
            for p in workers:
                p.start()

            # Progress logging ...
            try:
                with tqdm(initial=init_bytes, total=max_bytes) as pbar:
                    # Periodically check progress...
                    total_bytes = init_bytes
                    while True:
                        shard_bytes = queue.get()
                        pbar.update(shard_bytes)
                        total_bytes += shard_bytes
                        if total_bytes >= max_bytes:
                            break
            except KeyboardInterrupt:
                logging.info('Cancelling download, trying to clean up ...')
            finally:
                # Stop.
                with stop.get_lock():
                    stop.value = True
                # Join.
                logging.info('Download completed, joining the rest of the processes...')
                for p in workers:
                    p.join()
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs.
    :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of
              shape (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting
              adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking"
              effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
    :param mask: An array with a mask broadcastable to input `x` defining where to apply adversarial
                 perturbations. Shape needs to be broadcastable to the shape of x and can also be of the same
                 shape as `x`. Any features for which the mask is zero will not be adversarially perturbed.
    :type mask: `np.ndarray`
    :return: An array holding the adversarial examples.
    """
    import torch  # lgtm [py/repeated-import]

    mask = self._get_mask(x, **kwargs)

    # Ensure eps is broadcastable
    self._check_compatibility_input_and_eps(x=x)

    # Check whether random eps is enabled
    self._random_eps()

    # Set up targets
    targets = self._set_targets(x, y)

    # Create dataset
    if mask is not None:
        # Here we need to make a distinction: if the masks are different for each input, we need to index
        # those for the current batch. Otherwise (i.e. mask is meant to be broadcast), keep it as it is.
        if len(mask.shape) == len(x.shape):
            dataset = torch.utils.data.TensorDataset(
                torch.from_numpy(x.astype(ART_NUMPY_DTYPE)),
                torch.from_numpy(targets.astype(ART_NUMPY_DTYPE)),
                torch.from_numpy(mask.astype(ART_NUMPY_DTYPE)),
            )
        else:
            dataset = torch.utils.data.TensorDataset(
                torch.from_numpy(x.astype(ART_NUMPY_DTYPE)),
                torch.from_numpy(targets.astype(ART_NUMPY_DTYPE)),
                torch.from_numpy(np.array([mask.astype(ART_NUMPY_DTYPE)] * x.shape[0])),
            )
    else:
        dataset = torch.utils.data.TensorDataset(
            torch.from_numpy(x.astype(ART_NUMPY_DTYPE)),
            torch.from_numpy(targets.astype(ART_NUMPY_DTYPE)),
        )

    data_loader = torch.utils.data.DataLoader(
        dataset=dataset, batch_size=self.batch_size, shuffle=False, drop_last=False
    )

    # Start to compute adversarial examples
    adv_x = x.astype(ART_NUMPY_DTYPE)

    # Compute perturbation with batching
    for (batch_id, batch_all) in enumerate(
        tqdm(data_loader, desc="PGD - Batches", leave=False, disable=not self.verbose)
    ):
        self._batch_id = batch_id

        if mask is not None:
            (batch, batch_labels, mask_batch) = batch_all[0], batch_all[1], batch_all[2]
        else:
            (batch, batch_labels, mask_batch) = batch_all[0], batch_all[1], None

        batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size

        # Compute batch_eps and batch_eps_step
        if isinstance(self.eps, np.ndarray) and isinstance(self.eps_step, np.ndarray):
            if len(self.eps.shape) == len(x.shape) and self.eps.shape[0] == x.shape[0]:
                batch_eps = self.eps[batch_index_1:batch_index_2]
                batch_eps_step = self.eps_step[batch_index_1:batch_index_2]
            else:
                batch_eps = self.eps
                batch_eps_step = self.eps_step
        else:
            batch_eps = self.eps
            batch_eps_step = self.eps_step

        for rand_init_num in range(max(1, self.num_random_init)):
            if rand_init_num == 0:
                # first iteration: use the adversarial examples as they are the only ones we have now
                adv_x[batch_index_1:batch_index_2] = self._generate_batch(
                    x=batch, targets=batch_labels, mask=mask_batch, eps=batch_eps, eps_step=batch_eps_step
                )
            else:
                adversarial_batch = self._generate_batch(
                    x=batch, targets=batch_labels, mask=mask_batch, eps=batch_eps, eps_step=batch_eps_step
                )

                # retain the successful adversarial examples
                attack_success = compute_success_array(
                    self.estimator,
                    batch,
                    batch_labels,
                    adversarial_batch,
                    self.targeted,
                    batch_size=self.batch_size,
                )
                adv_x[batch_index_1:batch_index_2][attack_success] = adversarial_batch[attack_success]

    logger.info(
        "Success rate of attack: %.2f%%",
        100 * compute_success(self.estimator, x, targets, adv_x, self.targeted, batch_size=self.batch_size),
    )

    if self.summary_writer is not None:
        self.summary_writer.reset()

    return adv_x
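A minimal usage sketch, assuming `classifier` is an already-constructed ART estimator (the data and mask here are placeholders):

import numpy as np
from art.attacks.evasion import ProjectedGradientDescent

# `classifier` is assumed to exist; e.g. a PyTorchClassifier wrapping a trained model.
attack = ProjectedGradientDescent(estimator=classifier, eps=0.3, eps_step=0.05, max_iter=40)

x_test = np.random.rand(8, 1, 28, 28).astype(np.float32)
mask = np.zeros_like(x_test)
mask[..., 14:, :] = 1.0  # only perturb the lower half of each image

x_adv = attack.generate(x=x_test, mask=mask)  # mask is forwarded via **kwargs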
for epoch in range(args.epoch):
    good_smiles = sorted(set(good_smiles))
    random.shuffle(good_smiles)
    dataset = hgraph.MoleculeDataset(good_smiles, args.vocab, args.atom_vocab, args.batch_size)

    print(f'Epoch {epoch} training...')
    for _ in range(args.inner_epoch):
        meters = np.zeros(6)
        dataloader = DataLoader(dataset, batch_size=1, collate_fn=lambda x: x[0],
                                shuffle=True, num_workers=16)
        for batch in tqdm(dataloader):
            model.zero_grad()
            loss, kl_div, wacc, iacc, tacc, sacc = model(*batch, beta=beta)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.clip_norm)
            optimizer.step()
            meters = meters + np.array([kl_div, loss.item(), wacc * 100, iacc * 100,
                                        tacc * 100, sacc * 100])

        meters /= len(dataset)
        print("Beta: %.3f, KL: %.2f, loss: %.3f, Word: %.2f, %.2f, Topo: %.2f, Assm: %.2f, PNorm: %.2f, GNorm: %.2f"
              % (beta, meters[0], meters[1], meters[2], meters[3], meters[4], meters[5],
                 param_norm(model), grad_norm(model)))
def train(model, train_loader, train_subval_loader, epochs, controller=None,
          gamma_start=0.0, gamma_end=0.0, lr=3e-3, prefix_name=''):
    if isinstance(model, SuperNet):
        gamma_scheduler = GammaScheduler(model, total_steps=epochs * len(train_loader),
                                         gamma_start=gamma_start, gamma_end=gamma_end)
    else:
        gamma_scheduler = None

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    history = []
    # Track the best validation loss across epochs; initialising this inside
    # the epoch loop would make every epoch look like a new best.
    best_val_loss = float('inf')

    for epoch in range(epochs):
        train_loss = 0
        val_loss = 0

        # Train on 50k/60k training data
        model.train()
        tq = tqdm(train_loader, leave=False)
        for x, cls in tq:
            if isinstance(model, SuperNet):
                out, choice = model(x.to(device))
            else:
                out = model(x.to(device))
            loss = criterion(out, cls.to(device))
            optimizer.zero_grad()
            loss.backward()
            norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
            optimizer.step()
            train_loss += loss.item() / len(train_loader)
            if gamma_scheduler:
                gamma_scheduler.step()
            if controller:
                controller.update_score(choice, loss.item())
            tq.set_postfix(loss=f'{loss.item():.4f}', norm=f'{norm:.4f}')

        # Evaluate on 10k/60k training data
        model.eval()
        tq = tqdm(train_subval_loader, leave=False)
        for x, cls in tq:
            with torch.no_grad():
                if isinstance(model, SuperNet):
                    out, choice = model(x.to(device))
                else:
                    out = model(x.to(device))
                loss = criterion(out, cls.to(device))
            val_loss += loss.item() / len(train_subval_loader)
            if controller:
                controller.update_score(choice, loss.item())
            tq.set_postfix(loss=f'{loss.item():.4f}')

        history.append([train_loss, val_loss])
        state_dict = {
            'model': model.state_dict(),
            'controller': controller,
            'gamma_scheduler': gamma_scheduler,
            'history': history
        }
        torch.save(state_dict, f'{prefix_name}_last.pt')
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(state_dict, f'{prefix_name}_best.pt')
        print(f'{epoch+1:>2} / {epochs:>2}, loss = {train_loss:.4f}, val_loss = {val_loss:.4f}')

    return history
def main():
    args = parse_args()

    # Initialize the accelerator. We let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging: we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at
    # https://huggingface.co/datasets/ (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[:{args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[{args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame,
    # etc) at https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)

    if args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples[text_column_name] = [
                line for line in examples[text_column_name] if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not args.overwrite_cache,
            desc="Running tokenizer on dataset line_by_line",
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not args.overwrite_cache,
            desc="Running tokenizer on every text in dataset",
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could add padding instead if the model supported it, and you can
            # customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here, but a higher
        # value might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more
        # information: https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            load_from_cache_file=not args.overwrite_cache,
            desc=f"Grouping texts in chunks of {max_seq_length}",
        )

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=args.mlm_probability)

    # DataLoaders creation:
    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Note: the training dataloader needs to be prepared before we grab its length below
    # (its length will be shorter when running on multiple processes).

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        losses = []
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)

            loss = outputs.loss
            losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size)))

        losses = torch.cat(losses)
        losses = losses[: len(eval_dataset)]
        try:
            perplexity = math.exp(torch.mean(losses))
        except OverflowError:
            perplexity = float("inf")

        logger.info(f"epoch {epoch}: perplexity: {perplexity}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
def evaluate(self, eval_dataset, output_dir):
    """
    Evaluates the model on eval_dataset.

    Utility function to be used by the eval_model() method. Not intended to be used directly.
    """
    device = self.device
    model = self.model
    args = self.args
    pad_token_label_id = self.pad_token_label_id
    eval_output_dir = output_dir

    results = {}

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"])

    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()

    for batch in tqdm(eval_dataloader, disable=args["silent"]):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
            }
            # XLM and RoBERTa don't use segment_ids
            if args["model_type"] in ["bert", "xlnet"]:
                inputs["token_type_ids"] = batch[2]
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.mean().item()

        nb_eval_steps += 1

        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    model_outputs = preds
    preds = np.argmax(preds, axis=2)

    label_map = {i: label for i, label in enumerate(self.labels)}

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    # Drop padding positions and map label ids back to label strings.
    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    result = {
        "eval_loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1_score": f1_score(out_label_list, preds_list),
    }

    results.update(result)

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        if args["classification_report"]:
            cls_report = classification_report(out_label_list, preds_list)
            writer.write("{}\n".format(cls_report))
        for key in sorted(result.keys()):
            writer.write("{} = {}\n".format(key, str(result[key])))

    # TODO: make a table from out_label_list and preds_list
    return results, model_outputs, preds_list
def proc_preds(
    examples,
    features,
    predictions,
    version_2_with_negative=False,
    n_best_size=20,
    max_answer_length=30,
    start_n_top=5,
    end_n_top=5,
    out_dir=None,
    prefix=None,
    log_level=logging.WARNING,
):
    if len(predictions) != 5:
        raise ValueError("`predictions` should be a tuple with five elements.")
    start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions

    if len(predictions[0]) != len(features):
        raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.")

    # Map each example to the indices of its features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict() if version_2_with_negative else None

    log.setLevel(log_level)
    log.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]

        min_null_score = None
        prelim_predictions = []

        for feature_index in feature_indices:
            start_log_prob = start_top_log_probs[feature_index]
            start_indexes = start_top_index[feature_index]
            end_log_prob = end_top_log_probs[feature_index]
            end_indexes = end_top_index[feature_index]
            feature_null_score = cls_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]
            token_is_max_context = features[feature_index].get("token_is_max_context", None)

            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            for i in range(start_n_top):
                for j in range(end_n_top):
                    start_index = int(start_indexes[i])
                    j_index = i * end_n_top + j
                    end_index = int(end_indexes[j_index])

                    # Skip answers that fall outside the context or have no offsets.
                    if (start_index >= len(offset_mapping)
                            or end_index >= len(offset_mapping)
                            or offset_mapping[start_index] is None
                            or offset_mapping[end_index] is None):
                        continue
                    # Skip answers with negative or overly long spans.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
                        continue
                    prelim_predictions.append({
                        "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
                        "score": start_log_prob[i] + end_log_prob[j_index],
                        "start_log_prob": start_log_prob[i],
                        "end_log_prob": end_log_prob[j_index],
                    })

        predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]

        # Use the offsets to gather the answer text in the original context.
        context = example["context"]
        for pred in predictions:
            offsets = pred.pop("offsets")
            pred["text"] = context[offsets[0]:offsets[1]]

        if len(predictions) == 0:
            predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6})

        # Softmax over the n-best scores to get a probability per candidate.
        scores = np.array([pred.pop("score") for pred in predictions])
        exp_scores = np.exp(scores - np.max(scores))
        probs = exp_scores / exp_scores.sum()

        for prob, pred in zip(probs, predictions):
            pred["probability"] = prob

        all_predictions[example["id"]] = predictions[0]["text"]
        if version_2_with_negative:
            scores_diff_json[example["id"]] = float(min_null_score)

        all_nbest_json[example["id"]] = [{
            k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v)
            for k, v in pred.items()
        } for pred in predictions]

    if out_dir is not None:
        if not os.path.isdir(out_dir):
            raise EnvironmentError(f"{out_dir} is not a directory.")

        prediction_file = os.path.join(
            out_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json")
        nbest_file = os.path.join(
            out_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json")
        if version_2_with_negative:
            null_odds_file = os.path.join(
                out_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json")

        log.info(f"Saving predictions to {prediction_file}.")
        with open(prediction_file, "w") as writer:
            writer.write(json.dumps(all_predictions, indent=4) + "\n")
        log.info(f"Saving nbest_preds to {nbest_file}.")
        with open(nbest_file, "w") as writer:
            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
        if version_2_with_negative:
            log.info(f"Saving null_odds to {null_odds_file}.")
            with open(null_odds_file, "w") as writer:
                writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    return all_predictions, scores_diff_json
def _generate_cost_matrices(
    self,
    adata: AnnData,
    cost_matrices: Optional[Union[str, Mapping[Tuple[float, float], np.ndarray]]] = None,
) -> Tuple[Mapping[Tuple[float, float], Optional[np.ndarray]], str]:
    timepoints = self.experimental_time.cat.categories
    timepoints = list(zip(timepoints[:-1], timepoints[1:]))

    if cost_matrices is None:
        logg.info("Using default cost matrices")
        return {tpair: None for tpair in timepoints}, "default"

    if isinstance(cost_matrices, dict):
        logg.info("Using precomputed cost matrices")

        cmats = {}
        for tpair in timepoints:
            if tpair not in cost_matrices:
                logg.warning(f"Unable to find cost matrix for pair `{tpair}`. Using default")
            cmats[tpair] = cmat = cost_matrices.get(tpair, None)

            if cmat is not None:
                n_start = len(np.where(self.experimental_time == tpair[0])[0])
                n_end = len(np.where(self.experimental_time == tpair[1])[0])
                try:
                    if cmat.shape != (n_start, n_end):
                        raise ValueError(
                            f"Expected cost matrix for time pair `{tpair}` to be "
                            f"of shape `{(n_start, n_end)}`, found `{cmat.shape}`."
                        )
                except AttributeError:
                    logg.warning(
                        f"Unable to verify whether supplied cost matrix for time pair `{tpair}` "
                        f"has the correct shape `{(n_start, n_end)}`"
                    )

        # prevent equality comparison when comparing with cache
        return cmats, nstr("precomputed")

    if isinstance(cost_matrices, str):
        logg.info(f"Computing cost matrices using `{cost_matrices!r}` key")
        if cost_matrices == "X":
            cost_matrices = None
        try:
            features = adata._get_X(layer=cost_matrices)
            modifier = "layer"
        except KeyError:
            try:
                features = adata.obsm[cost_matrices]
                modifier = "obsm"
            except KeyError:
                raise KeyError(
                    f"Unable to find key `{cost_matrices!r}` in `adata.layers` or `adata.obsm`."
                ) from None

        cmats = {}
        for tpair in tqdm(timepoints, unit="cost matrix"):
            start_ixs = np.where(self.experimental_time == tpair[0])[0]
            end_ixs = np.where(self.experimental_time == tpair[1])[0]
            # being sparse is handled in WOT's function below
            cmats[tpair] = wot.ot.OTModel.compute_default_cost_matrix(
                features[start_ixs], features[end_ixs]
            )

        return cmats, f"{modifier}:{cost_matrices}"

    raise NotImplementedError(
        f"Specifying cost matrices as `{type(cost_matrices).__name__}` is not yet implemented."
    )
def test_eval(model, log_dir, mini_batch, lstm_layer, lstm_dim, max_sen_len,
              gpu, cuda, reverse, unk, trunc, epoch, id_to_de):
    mini_batch = int(mini_batch / 4)

    # check dir, make dir
    test_dir = os.path.join(log_dir, 'test')
    if not os.path.exists(test_dir):
        os.mkdir(test_dir)

    # load test data
    print("Load the preprocessed test data..")
    if unk:
        if reverse:
            with open(f'datasets/preprocessed/test/test{trunc}_source_reverse_unk.pkl', 'rb') as fr:
                test_source_input, test_source_len = pickle.load(fr)
        else:
            with open(f'datasets/preprocessed/test/test{trunc}_source_unk.pkl', 'rb') as fr:
                test_source_input, test_source_len = pickle.load(fr)
        with open(f'datasets/preprocessed/test/test{trunc}_label_unk.pkl', 'rb') as fr:
            test_target_output = pickle.load(fr)
    else:
        if reverse:
            with open(f'datasets/preprocessed/test/test{trunc}_source_reverse.pkl', 'rb') as fr:
                test_source_input, test_source_len = pickle.load(fr)
        else:
            with open(f'datasets/preprocessed/test/test{trunc}_source.pkl', 'rb') as fr:
                test_source_input, test_source_len = pickle.load(fr)
        with open(f'datasets/preprocessed/test/test{trunc}_label.pkl', 'rb') as fr:
            test_target_output = pickle.load(fr)
    print("Complete.")

    print("Split the data into mini_batch..")
    test_src_input = make_batch(test_source_input, mini_batch)
    test_src_len = make_batch(test_source_len, mini_batch)
    test_tgt_output = make_batch(test_target_output, mini_batch)
    print("Complete.")

    test_src_input = torch.from_numpy(test_src_input)
    test_src_len = torch.from_numpy(test_src_len)
    test_tgt_output = torch.from_numpy(test_tgt_output)

    test_src_input = test_src_input.to(torch.int64)
    test_src_len = test_src_len.to(torch.int64)
    test_tgt_output = test_tgt_output.to(torch.int64)

    # test start
    cur = 0
    output = torch.zeros_like(test_src_input)  # output = (40, 64, 51)
    for batch_src_input, batch_src_len in tqdm(zip(test_src_input, test_src_len),
                                               total=len(test_src_input),
                                               bar_format='{l_bar}{bar:30}{r_bar}'):
        # init hidden state
        h_0 = torch.zeros(lstm_layer, mini_batch, lstm_dim)  # (4, 128, 1000)
        c_0 = torch.zeros(lstm_layer, mini_batch, lstm_dim)
        hidden = (h_0, c_0)
        # hidden = [state.detach() for state in hidden]

        tgt = torch.ones(mini_batch, 1)  # tgt = (mini_batch, 1) ==> SOS tokens
        tgt = tgt.to(torch.int64)

        if gpu:
            device = torch.device(f"cuda:{cuda}" if torch.cuda.is_available() else "cpu")
            batch_src_input = batch_src_input.to(device)
            batch_src_len = batch_src_len.to(device)
            tgt = tgt.to(device)
            hidden = [state.to(device) for state in hidden]

        # first decoder (past) output
        hht = torch.zeros(mini_batch, 1, lstm_dim)  # first time-step prev decoder context
        if gpu:
            device = torch.device(f"cuda:{cuda}" if torch.cuda.is_available() else "cpu")
            hht = hht.to(device)

        for i in range(max_sen_len):
            out = model(batch_src_input, tgt, hidden, hht, batch_src_len)
            if gpu:
                device = torch.device(f"cuda:{cuda}" if torch.cuda.is_available() else "cpu")
                out = out.to(device)
            # out = (mini_batch, seq_len, tgt_vocab)
            pred = torch.max(out, dim=-1)[1]  # pred = (mini_batch, seq_len)
            tgt = torch.cat((tgt, pred[:, i].unsqueeze(1)), dim=1)

        output[cur] = tgt[:, 1:]  # output[cur] = (mini_batch, seq_len)
        cur += 1

    # make prediction.txt
    output = output.view(-1, max_sen_len)
    test_pred_output = []
    for line in output:
        sentence = ' '.join([id_to_de[int(idx)] for idx in line])
        sentence = sentence.replace('</s>', '').strip() + ' \n'
        test_pred_output.append(sentence)

    # make label.txt
    test_tgt_output = test_tgt_output.view(-1, max_sen_len)
    test_label = []
    for line in test_tgt_output:
        sentence = ' '.join([id_to_de[int(idx)] for idx in line])
        sentence = sentence.replace('</s>', '').strip() + ' \n'
        test_label.append(sentence)

    # save the prediction and label text.
    test_dir = os.path.join(log_dir, 'test')
    if not os.path.exists(test_dir):
        os.mkdir(test_dir)

    with open(os.path.join(test_dir, f'output_{epoch+1}.txt'), 'w', encoding='utf8') as fw:
        fw.writelines(test_pred_output)
    with open(os.path.join(test_dir, f'label_{epoch+1}.txt'), 'w', encoding='utf8') as fw:
        fw.writelines(test_label)
    print("Saved the prediction and label text files!")
    print('\n')
def ppo(env_name, total_steps, model, act_var_schedule=[0.7], epoch_batch_size=2048,
        gamma=0.99, lam=0.99, eps=0.2, seed=0, pol_batch_size=1024, val_batch_size=1024,
        pol_lr=1e-4, val_lr=1e-4, pol_epochs=10, val_epochs=10, target_kl=.01,
        use_gpu=False, reward_stop=None, normalize_return=True, env_config={}):
    """
    Implements proximal policy optimization with clipping.

    Args:
        env_name: name of the openAI gym environment to solve
        total_steps: number of timesteps to run the PPO for
        model: model from seagul.rl.models. Contains policy and value fn
        act_var_schedule: schedule to set the variance of the policy. Will linearly interpolate values
        epoch_batch_size: number of environment steps to take per batch, total steps will be num_epochs*epoch_batch_size
        seed: seed for all the rngs
        gamma: discount applied to future rewards, usually close to 1
        lam: lambda for the advantage estimation, usually close to 1
        eps: epsilon for the clipping, usually .1 or .2
        pol_batch_size: batch size for policy updates
        val_batch_size: batch size for value function updates
        pol_lr: learning rate for the policy optimizer
        val_lr: learning rate for the value function optimizer
        pol_epochs: how many epochs to use for each policy update
        val_epochs: how many epochs to use for each value update
        target_kl: max KL before breaking
        use_gpu: want to use the GPU? set to True
        reward_stop: reward value to stop at if we achieve it
        normalize_return: should we normalize the return?
        env_config: dictionary containing kwargs to pass to the environment

    Returns:
        model: trained model
        avg_reward_hist: list with the average reward per episode at each epoch
        var_dict: dictionary with all locals, for logging/debugging purposes

    Example:
        from seagul.rl.algos import ppo
        from seagul.nn import MLP
        from seagul.rl.models import PPOModel
        import torch

        input_size = 3
        output_size = 1
        layer_size = 64
        num_layers = 2

        policy = MLP(input_size, output_size, num_layers, layer_size)
        value_fn = MLP(input_size, 1, num_layers, layer_size)
        model = PPOModel(policy, value_fn)

        model, rews, var_dict = ppo("Pendulum-v0", 10000, model)
    """

    # init everything
    # ==============================================================================
    torch.set_num_threads(1)

    env = gym.make(env_name, **env_config)
    if isinstance(env.action_space, gym.spaces.Box):
        act_size = env.action_space.shape[0]
        act_dtype = torch.double
    else:
        raise NotImplementedError("trying to use unsupported action space", env.action_space)

    actvar_lookup = make_variance_schedule(act_var_schedule, model, total_steps)
    model.action_var = actvar_lookup(0)

    obs_size = env.observation_space.shape[0]
    obs_mean = torch.zeros(obs_size)
    obs_var = torch.ones(obs_size)
    adv_mean = torch.zeros(1)
    adv_var = torch.ones(1)
    rew_mean = torch.zeros(1)
    rew_var = torch.ones(1)

    # copy.deepcopy broke for me with an older version of torch. Using pickle
    # for this is weird but works fine.
    old_model = pickle.loads(pickle.dumps(model))
    pol_opt = torch.optim.Adam(model.policy.parameters(), lr=pol_lr)
    val_opt = torch.optim.Adam(model.value_fn.parameters(), lr=val_lr)

    # seed all our RNGs
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set defaults, and decide if we are using a GPU or not
    use_cuda = torch.cuda.is_available() and use_gpu
    device = torch.device("cuda:0" if use_cuda else "cpu")

    # init logging stuff
    raw_rew_hist = []
    val_loss_hist = []
    pol_loss_hist = []
    progress_bar = tqdm.tqdm(total=total_steps)
    cur_total_steps = 0
    progress_bar.update(0)
    early_stop = False

    # Train until we hit our total steps or reach our reward threshold
    # ==============================================================================
    while cur_total_steps < total_steps:
        batch_obs = torch.empty(0)
        batch_act = torch.empty(0)
        batch_adv = torch.empty(0)
        batch_discrew = torch.empty(0)
        cur_batch_steps = 0

        # Bail out if we have met our reward threshold
        if len(raw_rew_hist) > 2 and reward_stop:
            if raw_rew_hist[-1] >= reward_stop and raw_rew_hist[-2] >= reward_stop:
                early_stop = True
                break

        # construct batch data from rollouts
        # ==============================================================================
        while cur_batch_steps < epoch_batch_size:
            ep_obs, ep_act, ep_rew, ep_steps = do_rollout(env, model)
            raw_rew_hist.append(sum(ep_rew))
            ep_rew = (ep_rew - ep_rew.mean()) / (ep_rew.std() + 1e-6)

            batch_obs = torch.cat((batch_obs, ep_obs[:-1]))
            batch_act = torch.cat((batch_act, ep_act[:-1]))

            # [:-1] because we appended the value function to the end as an extra reward
            ep_discrew = discount_cumsum(ep_rew, gamma)
            batch_discrew = torch.cat((batch_discrew, ep_discrew[:-1]))

            if normalize_return:
                rew_mean = update_mean(batch_discrew, rew_mean, cur_total_steps)
                rew_var = update_std(batch_discrew, rew_var, cur_total_steps)
                batch_discrew = (batch_discrew - rew_mean) / (rew_var + 1e-6)

            # calculate this episode's advantages
            last_val = model.value_fn(ep_obs[-1]).reshape(-1, 1)
            ep_val = model.value_fn(ep_obs)
            ep_val[-1] = last_val

            deltas = ep_rew[:-1] + gamma * ep_val[1:] - ep_val[:-1]
            ep_adv = discount_cumsum(deltas.detach(), gamma * lam)
            batch_adv = torch.cat((batch_adv, ep_adv))

            cur_batch_steps += ep_steps
            cur_total_steps += ep_steps

        # make sure our advantages are zero mean and unit variance
        adv_mean = update_mean(batch_adv, adv_mean, cur_total_steps)
        adv_var = update_std(batch_adv, adv_var, cur_total_steps)
        batch_adv = (batch_adv - adv_mean) / (adv_var + 1e-6)

        # policy update
        # ========================================================================
        num_mbatch = int(batch_obs.shape[0] / pol_batch_size)

        # Update the policy using the PPO loss
        for pol_epoch in range(pol_epochs):
            for i in range(num_mbatch):
                cur_sample = i * pol_batch_size

                logp = model.get_logp(batch_obs[cur_sample:cur_sample + pol_batch_size],
                                      batch_act[cur_sample:cur_sample + pol_batch_size]).reshape(-1, act_size)
                old_logp = old_model.get_logp(batch_obs[cur_sample:cur_sample + pol_batch_size],
                                              batch_act[cur_sample:cur_sample + pol_batch_size]).reshape(-1, act_size)

                r = torch.exp(logp - old_logp)
                clip_r = torch.clamp(r, 1 - eps, 1 + eps)
                pol_loss = -torch.min(r * batch_adv[cur_sample:cur_sample + pol_batch_size],
                                      clip_r * batch_adv[cur_sample:cur_sample + pol_batch_size]).mean()

                approx_kl = (logp - old_logp).mean()
                if approx_kl > target_kl:
                    break

                pol_opt.zero_grad()
                pol_loss.backward()
                pol_opt.step()

        # value_fn update
        # ========================================================================
        num_mbatch = int(batch_obs.shape[0] / val_batch_size)

        # Update value function with the standard L2 loss
        for val_epoch in range(val_epochs):
            for i in range(num_mbatch):
                # step through the batch in val_batch_size chunks, matching num_mbatch above
                cur_sample = i * val_batch_size

                # predict and calculate loss for the batch
                val_preds = model.value_fn(batch_obs[cur_sample:cur_sample + val_batch_size])
                val_loss = ((val_preds - batch_discrew[cur_sample:cur_sample + val_batch_size]) ** 2).mean()

                # do the normal pytorch update
                val_opt.zero_grad()
                val_loss.backward()
                val_opt.step()

        # update observation mean and variance
        obs_mean = update_mean(batch_obs, obs_mean, cur_total_steps)
        obs_var = update_std(batch_obs, obs_var, cur_total_steps)
        model.policy.state_means = obs_mean
        model.value_fn.state_means = obs_mean
        model.policy.state_std = obs_var
        model.value_fn.state_std = obs_var
        model.action_var = actvar_lookup(cur_total_steps)
        old_model = pickle.loads(pickle.dumps(model))

        val_loss_hist.append(val_loss)
        pol_loss_hist.append(pol_loss)

        progress_bar.update(cur_batch_steps)

    progress_bar.close()
    return model, raw_rew_hist, locals()
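`discount_cumsum` is assumed to come from seagul's utilities; the standard trick for computing discounted returns in one pass is a reversed linear filter. A minimal numpy sketch of that assumption:

import numpy as np
import scipy.signal

def discount_cumsum(x, discount):
    """Discounted cumulative sum: out[t] = sum_{k>=t} discount**(k-t) * x[k]."""
    return scipy.signal.lfilter([1], [1, -discount], x[::-1], axis=0)[::-1]

print(discount_cumsum(np.array([1.0, 1.0, 1.0]), 0.99))  # [2.9701, 1.99, 1.0]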
def prepare_wenet_speech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "all",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests, which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: Which parts of the dataset to prepare; "all" for all the parts.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: Number of workers used to extract manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    subsets = WETNET_SPEECH_PARTS if "all" in dataset_parts else dataset_parts

    manifests = defaultdict(dict)
    for sub in subsets:
        if sub not in WETNET_SPEECH_PARTS:
            raise ValueError(f"No such part of dataset in WenetSpeech : {sub}")
        manifests[sub] = {"recordings": [], "supervisions": []}

    raw_manifests_path = corpus_dir / "WenetSpeech.json"
    assert raw_manifests_path.is_file(), f"No such file : {raw_manifests_path}"
    logging.info(f"Loading raw manifests from : {raw_manifests_path}")
    raw_manifests = json.load(open(raw_manifests_path, "r", encoding="utf8"))

    with ProcessPoolExecutor(num_jobs) as ex:
        for recording, segments in tqdm(
            ex.map(
                parse_utterance,
                raw_manifests["audios"],
                repeat(corpus_dir),
                repeat(subsets),
            ),
            desc="Processing WenetSpeech JSON entries",
        ):
            for part in segments:
                manifests[part]["recordings"].append(recording)
                manifests[part]["supervisions"].extend(segments[part])

    for sub in subsets:
        recordings, supervisions = fix_manifests(
            recordings=RecordingSet.from_recordings(manifests[sub]["recordings"]),
            supervisions=SupervisionSet.from_segments(manifests[sub]["supervisions"]),
        )
        validate_recordings_and_supervisions(recordings=recordings, supervisions=supervisions)

        if output_dir is not None:
            supervisions.to_file(output_dir / f"wenetspeech_supervisions_{sub}.jsonl.gz")
            recordings.to_file(output_dir / f"wenetspeech_recordings_{sub}.jsonl.gz")

        manifests[sub] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
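A minimal usage sketch; the paths are placeholders and assume the WenetSpeech download (including `WenetSpeech.json`) already sits under the corpus directory:

# Prepare only the small "S" subset with 8 parallel workers (paths are illustrative).
manifests = prepare_wenet_speech(
    corpus_dir="/data/WenetSpeech",
    dataset_parts=["S"],
    output_dir="/data/manifests",
    num_jobs=8,
)
recordings = manifests["S"]["recordings"]
supervisions = manifests["S"]["supervisions"]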
def progress_bar(self, config, log):
    _, throughput, _ = log.compute_throughput()
    exp = config["basedir"]
    model = config["model"].lower()
    self._connect()

    date_filename = exp.split("/")[-1] + "_" + model + ".date"
    remote_command = ("cd " + config["basedir"] + "/scripts/; cat " + date_filename +
                      " | awk '{ print $1 }'")
    stdin, stdout, stderr = self.ssh.exec_command(remote_command)
    # stdout is now something like 19500101. Assume that we get something like
    # Y*YMMDD, so cut off the last 4 digits (note that we don't know how many
    # places the year has, so we need to cut from the end).
    current_date = int(stdout.readlines()[0][:-5])

    remote_command = ("cd " + config["basedir"] + "/scripts/; cat " + date_filename +
                      " | awk '{ print $2 }'")
    stdin, stdout, stderr = self.ssh.exec_command(remote_command)
    current_run = int(stdout.readlines()[0])

    runscript_file = config.get("runscript", config["basedir"] + "/scripts/*run")
    # POTENTIAL BUG: These things are all very dependent on the runscript's way
    # of defining time control. It might be better to do this somehow differently.
    start_year = self.ssh.exec_command("grep INITIAL_DATE_" + model + " " +
                                       runscript_file)[1].readlines()[0]
    final_year = self.ssh.exec_command("grep FINAL_DATE_" + model + " " +
                                       runscript_file)[1].readlines()[0]
    # POTENTIAL BUG: What about people who run on a monthly basis?
    run_size = self.ssh.exec_command("grep NYEAR_" + model + " " +
                                     runscript_file)[1].readlines()[0]

    # Reformat to get just the years and run sizes
    start_year = int(start_year.split("=")[1].split("-")[0])
    final_year = int(final_year.split("=")[1].split("-")[0])
    run_size = int(stripComments(" ".join(run_size.split("=")[1].split())))

    total_number_of_runs = int((final_year - start_year) / run_size)
    years_per_day = throughput
    years_left = final_year - current_date
    days_left = years_left / years_per_day
    finishing_date = datetime.datetime.now() + datetime.timedelta(days=days_left)

    using_tqdm = False
    using_html = True
    if using_tqdm:
        r_bar = (" " + str(current_run) + "/" + str(total_number_of_runs) +
                 ", Throughput ~" + str(np.round(years_per_day, 2)) + " runs/day")
        pbar = tqdm(
            total=total_number_of_runs,
            desc="Done on: " + finishing_date.strftime("%d %b, %Y"),
            bar_format="{n}/|/{l_bar} " + r_bar,
        )
        pbar.update(current_run)
        return pbar
    if using_html:
        DOC = ("""
            <style>
            #myProgress {
              width: 100%;
              background-color: #ddd;
            }
            """ + """
            #myBar {
              width: """ + str(100 * current_run / total_number_of_runs) + "%" + """;
              height: 30px;
              background-color: #4CAF50;
              text-align: center;
              line-height: 30px;
              color: black;
            }
            </style>
            <body>
            """ + """Based on current throughput, this run may be done on: """ +
               finishing_date.strftime("%d %b, %Y") + """
            <div id="myProgress">
              <div id="myBar">""" + str(100 * current_run / total_number_of_runs) + """%</div>
            </div>
            """ + str(current_run) + "/" + str(total_number_of_runs) +
               " Throughput ~" + str(np.round(years_per_day, 2)) + """ runs/day</body>""")
        return DOC
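The finishing-date estimate above is plain rate arithmetic. A standalone worked example with assumed numbers (a throughput of 25 simulated years per wall-clock day, illustrative start and end dates):

import datetime

years_per_day = 25.0              # assumed throughput (simulated years / day)
years_left = 2100 - 1975          # FINAL_DATE minus the current model date
days_left = years_left / years_per_day             # 5.0 wall-clock days
eta = datetime.datetime.now() + datetime.timedelta(days=days_left)
print(eta.strftime("%d %b, %Y"))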
# img = Image.fromarray(img)
# img.save(filename)

# local path setting
path = '/home/ubuntu/context/data'
print(path)
path_cropped = '/home/ubuntu/context/context_encoder_pytorch-master_ver_1/dataset/train/annals'
path_cropped2 = '/home/ubuntu/context/context_encoder_pytorch-master_ver_1/dataset/val/annals'

# original data path
os.chdir(path)
file_list = os.listdir(os.getcwd())
cnt = 0
for l in tqdm(file_list):  # a, b, ...
    path_img = os.path.join(path, l)
    os.chdir(path_img)
    image_list = glob.glob('*.jpg')
    print(len(image_list))
    for i, img in tqdm(enumerate(image_list)):
        try:
            image = load_image(os.path.join(path_img, img))
            chunk = ImageChunker(256, 256, overlap=0)
            results = chunk.dimension_preprocess(image)
            print("=======cropped========>", img)
            cnt += 1
            if cnt < 20000:
                save_image(img, results, count=True)
        except Exception:
            # assumed handler: the original snippet breaks off inside the try
            # block, so skip images that fail to load or crop
            continue
phases = ['train', 'val']
for f in ['images', 'labels']:
    if not os.path.isdir(f):
        os.mkdir(f)
    for p in phases:
        if not os.path.isdir(os.path.join(f, p)):
            os.mkdir(os.path.join(f, p))

image_folder = 'images'
df = pd.read_csv('train.csv', index_col=0)
box_count = df.groupby(level=0).count().width
#all_id = list(box_count.index)
all_id = list(set([i[:-4] for i in os.listdir('all_images')]))
idsets = train_test_split(all_id, test_size=0.1, random_state=7)

for p, ids in zip(phases, idsets):
    i = 0
    for iid in tqdm(ids):
        imagefile1 = os.path.join('all_images', iid + '.jpg')
        imagefile2 = os.path.join('images', p, iid + '.jpg')
        copyfile(imagefile1, imagefile2)
        fn = os.path.join('labels', p, iid + '.txt')
        if iid not in df.index:
            # images without boxes get an empty label file
            open(fn, 'w').close()
            continue
        idf = df.loc[iid]
        if isinstance(idf, pd.Series):
            idf = df.loc[[iid]]
        with open(fn, 'w') as fw:
            for _, row in idf.iterrows():
                wi, hi, bbox = row.width, row.height, row.bbox
                xb, yb, wb, hb = ast.literal_eval(bbox)
                # YOLO format: class x_center y_center width height, all relative
                yolo_row = [
                    0,
                    (xb + wb / 2) / wi,
                    (yb + hb / 2) / hi,
                    wb / wi,
                    hb / hi,
                ]
                # assumed write-out: the original snippet is truncated after the list
                fw.write(' '.join(str(v) for v in yolo_row) + '\n')
def on_epoch_end(self, learner):
    with torch.no_grad():
        model = learner.model
        model.eval()

        # Encode every holdout batch into context / candidate representations.
        h_context = []
        h_candidate = []
        pb_h = tqdm(self.dl_holdout, total=len(self.dl_holdout), desc='MRR-Score: Calculate H')
        for batch in pb_h:
            data, targets = learner.to_device(batch[0], self.device), learner.to_device(batch[1], self.device)
            T_context, X_context, T_candidate, X_candidate = data
            h_context_batch = model.get_h_context(X_context, T_context).detach().cpu().numpy()
            h_candidate_batch = model.get_h_candidate(X_candidate, T_candidate).detach().cpu().numpy()
            h_context.append(h_context_batch)
            h_candidate.append(h_candidate_batch)

        h_context = flatten(h_context)
        h_candidate = np.vstack(h_candidate)

        dl_h_candidate = DataLoader(dataset=TensorDataset(torch.FloatTensor(h_candidate)),
                                    batch_size=self.dl_holdout.batch_size,
                                    shuffle=False)

        # Score every context against every candidate.
        all_contexts_logits = []
        pb_h_context = tqdm(h_context, total=len(h_context), desc='Scoring')
        for h_ctx in pb_h_context:  # one context vector at a time
            context_logits = []
            h_context_batch = torch.FloatTensor([h_ctx]).to(self.device)
            for h_candidate_batch in dl_h_candidate:
                h_candidate_batch = h_candidate_batch[0].to(self.device)
                context_logits_ = model.get_logits(h_context_batch, h_candidate_batch).detach().cpu().numpy()
                context_logits.extend(context_logits_)
            all_contexts_logits.append(context_logits)

        all_contexts_logits = np.array(all_contexts_logits)
        # rankdata ranks ascending; flip it so the highest logit gets rank 1.
        ranks = np.abs(np.apply_along_axis(rankdata, 1, all_contexts_logits) -
                       all_contexts_logits.shape[1]) + 1
        # the true candidate for context i sits on the diagonal
        mrr = np.mean(1 / ranks.diagonal())
        self.tb_writer.add_scalar('MRRScore/valid', mrr, learner.cur_epoch)
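To make the rank arithmetic concrete, a small self-contained check of the descending-rank conversion and the resulting MRR (illustrative numbers only):

import numpy as np
from scipy.stats import rankdata

# 2 contexts x 3 candidates; the true candidate for context i is column i.
logits = np.array([[0.9, 0.2, 0.1],    # true candidate scored highest -> rank 1
                   [0.8, 0.4, 0.3]])   # true candidate scored 2nd -> rank 2

ranks = np.abs(np.apply_along_axis(rankdata, 1, logits) - logits.shape[1]) + 1
print(ranks.diagonal())               # [1. 2.]
print(np.mean(1 / ranks.diagonal()))  # MRR = (1/1 + 1/2) / 2 = 0.75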