def predict(self, data, pred=None, buckets=8, batch_size=5000, prob=False, **kwargs):
    """Run the model over ``data`` and return the dataset annotated with predictions.

    Args:
        data: Input used to build the ``Dataset`` (path or raw sentences).
        pred: Optional path; when given, the predicted results are saved there.
        buckets: Number of length buckets used for batching.
        batch_size: Token budget per batch.
        prob: When true, a ``probs`` field is appended to the transform so
            probabilities are carried through to the output.
        **kwargs: Extra options merged into ``self.args``.

    Returns:
        The built ``Dataset`` with one attribute set per predicted field.
    """
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.eval()
    if args.prob:
        self.transform.append(Field('probs'))

    logger.info("Load the data")
    dataset = Dataset(self.transform, data)
    dataset.build(args.batch_size, args.buckets, shuffle=False)
    logger.info(f"\n{dataset}")

    logger.info("Make predictions on the dataset")
    tic = datetime.now()
    predictions = self._predict(dataset.loader)
    elapsed = datetime.now() - tic

    # Attach every predicted field to the dataset object.
    for field_name, field_value in predictions.items():
        setattr(dataset, field_name, field_value)

    if pred is not None:
        logger.info(f"Save predicted results to {pred}")
        self.transform.save(pred, dataset.sentences)

    logger.info(
        f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s"
    )
    return dataset
def predict(self, data, pred=None, buckets=8, batch_size=5000, prob=False, **kwargs):
    """Run the model over ``data`` and pickle the raw predictions.

    Args:
        data: Input used to build the ``Dataset`` (path or raw sentences).
        pred: Accepted for interface compatibility but ignored by this
            variant — output goes to ``args.predict_output_path`` instead.
        buckets: Ignored here; the dataset is built with a single bucket.
        batch_size: Token budget per batch.
        prob: Accepted for interface compatibility; unused in this variant.
        **kwargs: Extra options merged into ``self.args``
            (``predict_output_path`` must be among them or in ``self.args``).

    Returns:
        The built ``Dataset`` (predictions are NOT attached to it; they are
        only written to ``args.predict_output_path``).
    """
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)
    self.transform.eval()

    logger.info("Load the data")
    dataset = Dataset(self.transform, data)
    # n_buckets=1 with shuffle=False — presumably to keep the output in
    # input order; confirm against the consumer of the pickle file.
    dataset.build(args.batch_size, n_buckets=1, shuffle=False)
    logger.info(f"\n{dataset}")
    logger.info(vars(dataset))

    logger.info("Make predictions on the dataset")
    preds = self._predict(dataset.loader)

    # Persist the raw prediction dict. The import stays local so the
    # file-level import list is untouched.
    import pickle
    with open(args.predict_output_path, 'wb') as f:
        pickle.dump(preds, f)
    return dataset
def train_abs_multi(args):
    """Spawn one training process per GPU and wait for all of them to finish.

    Errors raised in children are funneled through a shared queue that the
    ``ErrorHandler`` listens on.
    """
    init_logger()

    ctx = torch.multiprocessing.get_context('spawn')
    # Queue + handler that surface exceptions from the child processes.
    error_queue = ctx.SimpleQueue()
    error_handler = ErrorHandler(error_queue)

    # Launch one daemon worker per GPU rank.
    workers = []
    for device_id in range(args.world_size):
        proc = ctx.Process(target=run,
                           args=(args, device_id, error_queue,),
                           daemon=True)
        workers.append(proc)
        proc.start()
        logger.info(" Starting process pid: %d " % proc.pid)
        error_handler.add_child(proc.pid)

    for proc in workers:
        proc.join()
def evaluate(self, data, buckets=8, batch_size=5000, **kwargs):
    """Evaluate the model on ``data`` and return ``(loss, metric)``.

    Args:
        data: Input used to build the evaluation ``Dataset``.
        buckets: Number of length buckets used for batching.
        batch_size: Token budget per batch.
        **kwargs: Extra options merged into ``self.args``.

    Returns:
        A ``(loss, metric)`` pair from ``self._evaluate``.
    """
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.train()
    logger.info("Load the data")
    dataset = Dataset(self.transform, data)
    dataset.build(args.batch_size, args.buckets)
    logger.info(f"\n{dataset}")

    logger.info("Evaluate the dataset")
    begin = datetime.now()
    loss, metric = self._evaluate(dataset.loader)
    wall = datetime.now() - begin

    logger.info(f"loss: {loss:.4f} - {metric}")
    logger.info(
        f"{wall}s elapsed, {len(dataset)/wall.total_seconds():.2f} Sents/s"
    )
    return loss, metric
def main(model_id, folds, debug):
    """Train one model per cross-validation fold and write fold/test predictions.

    Args:
        model_id: Suffix used to build the model/module name (``model_L1A_<id>``).
        folds: Optional collection of fold ids to restrict training to;
            ``None`` trains on every fold.
        debug: When true, shrinks the training set to the validation size,
            trains a single epoch and prefixes outputs with ``debug_``.
    """
    MODEL_ID = model_id

    # Script parameters
    MODEL_NAME = f"model_L1A_{MODEL_ID}"
    DATA_DIR = "data"
    PATH_TO_LOG = ".log"

    print('-' * 80)

    # Init logger: stdout/stderr are teed into a per-run log file.
    if folds is None:
        log_filename = os.path.join(PATH_TO_ROOT, PATH_TO_LOG, MODEL_NAME + '.log')
    else:
        log_filename = os.path.join(PATH_TO_ROOT, PATH_TO_LOG, MODEL_NAME + f'_{folds}.log')
    orig_stdout, orig_stderr, sys.stdout, sys.stderr = init_logger(
        sys, log_filename, timestamp=True, verbose=True)
    print(f'Logged to file: {log_filename}')

    # Read the model module (defines dataset loading, loaders and learner).
    MM = exe.load_model_module(MODEL_NAME + "_module",
                               os.path.join(PATH_TO_ROOT, MODEL_MODULE_PATH))

    # Print information
    print('Executed with arguments:')
    print(MM.ARGS)
    print('-' * 80)

    # Read dataset
    path_to_data = os.path.join(PATH_TO_ROOT, DATA_DIR)
    dset_df, annot_dict = MM.get_dset(path_to_data)

    # Add fold assignments to the dataset frame.
    dset_df = exe.load_kfolds(
        dset_df, os.path.join(PATH_TO_ROOT, DATA_DIR, KFOLDS_FILE))

    # Get sorted list of fold ids present in the dataset.
    fold_ids = dset_df.fold.dropna().unique().tolist()
    fold_ids.sort()

    # Folds to train (optionally restricted by the `folds` argument).
    train_folds = fold_ids
    if folds is not None:
        train_folds = [s for s in train_folds if s in folds]

    print('-' * 80)
    print(subprocess.run(['nvidia-smi'],
                         stdout=subprocess.PIPE).stdout.decode('utf-8'))

    # Iterate folds
    for fold_id in train_folds:
        fold_id = [fold_id, ]  # wrap so `isin`/`join` below work uniformly
        print('-' * 40)
        offold_ids = [s for s in fold_ids if s not in fold_id]
        print(
            f"TRAINING FOLDS: '{','.join(offold_ids)}' TO PREDICT FOLD '{','.join(fold_id)}'"
        )

        # Generate datasets: train on everything outside the fold,
        # validate/predict on the held-out fold.
        datasets = {
            'train': dset_df[dset_df.train & ~dset_df.fold.isin(fold_id)],
            'valid': dset_df[dset_df.fold.isin(fold_id)],
            'fold': dset_df[dset_df.fold.isin(fold_id)],
            'test': dset_df[dset_df.test],
        }
        if debug:
            datasets['train'] = datasets['train'][0:len(datasets['valid'])]
        print(f"Training: {len(datasets['train']):,} | "
              f"Validation: {len(datasets['valid']):,} | "
              f"OOT-Fold: {len(datasets['fold']):,} | "
              f"Test: {len(datasets['test']):,}")

        # Get data loaders
        data_loaders = MM.get_dataloaders(path_to_data, datasets)

        # Get learner
        learner = MM.get_learner(annot_dict['nb_classes'])

        # Warn when the training labels differ from the original ones,
        # i.e. pseudo-labels are in use. (Dropped the redundant `True and`
        # from the original condition.)
        if 'original_class_id' in datasets['train'].columns:
            train_original = datasets['train'].groupby(
                'class_id').count().iloc[:, 0].values
            train_actual = datasets['train'].groupby(
                'original_class_id').count().iloc[:, 0].values
            if any(s1 != s2 for s1, s2 in zip(train_original, train_actual)):
                print("Using not original labels for training!")

        # Train
        print('-' * 40 + " Training")
        epochs = MM.args.max_train_epochs if not debug else 1
        learner.train_loader(data_loaders, epochs=epochs)

        # Output name ("ouput" spelling kept — it matches the module attribute).
        ouput_name = MM.args.ouput_name if not debug else 'debug_' + MM.args.ouput_name

        # Predict Fold
        print('-' * 40 + " Predicting Fold")
        valid_preds = learner.predict_loader(data_loaders['valid'])
        valid_preds_df = pd.DataFrame(valid_preds, index=datasets['valid'].id)
        valid_preds_df.columns = [
            annot_dict['classId_to_name'][s1] for s1 in valid_preds_df.columns
        ]
        print(f"Dataset shape: {valid_preds_df.shape}")
        print(valid_preds_df.head(2))
        filepath = os.path.join(
            PATH_TO_ROOT, OUTPUTS_PATH,
            ouput_name + f"_fold_{','.join(fold_id)}.csv.gz")
        print(f"Saving FOLD predictions: {filepath}")
        valid_preds_df.to_csv(filepath, index=True)

        # Predict Test
        print('-' * 40 + " Predicting Test")
        test_preds = learner.predict_loader(data_loaders['test'])
        test_preds_df = pd.DataFrame(test_preds, index=datasets['test'].id)
        test_preds_df.columns = [
            annot_dict['classId_to_name'][s1] for s1 in test_preds_df.columns
        ]
        print(f"Dataset shape: {test_preds_df.shape}")
        # Fix: the original evaluated `head(2)` without printing it (a no-op);
        # the fold branch above prints it, so mirror that here.
        print(test_preds_df.head(2))
        filepath = os.path.join(
            PATH_TO_ROOT, OUTPUTS_PATH,
            ouput_name + f"_test_{','.join(fold_id)}.csv.gz")
        print(f"Saving TEST predictions: {filepath}")
        test_preds_df.to_csv(filepath, index=True)

        # Free memory before the next fold.
        learner.clean_memory()
        del learner
def main(model_id, folds, debug, only_test):
    """Load each fold's trained model and regenerate fold/test predictions.

    Args:
        model_id: Suffix used to build the model/module name (``model_L1A_<id>``).
        folds: Optional collection of fold ids to restrict prediction to;
            ``None`` predicts every fold.
        debug: When true, shrinks the training set and prefixes outputs/logs
            with ``debug_``.
        only_test: When true, skips the per-fold predictions and only
            produces test-set predictions.
    """
    MODEL_ID = model_id

    # Script parameters
    MODEL_NAME = f"model_L1A_{MODEL_ID}"
    DATA_DIR = "data"
    PATH_TO_LOG = ".log"

    print('*' * 80)
    prefix = 'debug_' if debug else ''

    # Init logger: stdout/stderr are teed into a per-run log file.
    if folds is None:
        log_filename = os.path.join(PATH_TO_ROOT, PATH_TO_LOG,
                                    prefix + MODEL_NAME + '_predict.log')
    else:
        log_filename = os.path.join(
            PATH_TO_ROOT, PATH_TO_LOG,
            prefix + MODEL_NAME + f'_{folds}_predict.log')
    orig_stdout, orig_stderr, sys.stdout, sys.stderr = init_logger(
        sys, log_filename, timestamp=True, verbose=True)
    print(f'Logged to file: {log_filename}')

    # Read the model module (defines dataset loading, loaders and model I/O).
    MM = exe.load_model_module(MODEL_NAME + "_module",
                               os.path.join(PATH_TO_ROOT, MODEL_MODULE_PATH))

    # Print information
    print('Executed with arguments:')
    print(MM.ARGS)
    print('-' * 80)

    # Read dataset
    path_to_data = os.path.join(PATH_TO_ROOT, DATA_DIR)
    dset_df = MM.get_dset(path_to_data)

    # Add fold assignments to the dataset frame.
    dset_df = exe.load_kfolds(
        dset_df, os.path.join(PATH_TO_ROOT, DATA_DIR, KFOLDS_FILE))

    # Get sorted list of fold ids present in the dataset.
    fold_ids = dset_df.fold.dropna().unique().tolist()
    fold_ids.sort()

    print('-' * 80)
    print(subprocess.run(['nvidia-smi'],
                         stdout=subprocess.PIPE).stdout.decode('utf-8'))

    # Folds to predict (optionally restricted by the `folds` argument).
    train_folds = fold_ids
    if folds is not None:
        train_folds = [s for s in train_folds if s in folds]

    # Iterate folds
    for fold_id in train_folds:
        fold_id = [fold_id, ]  # wrap so `isin`/`join` below work uniformly
        print('-' * 40)
        # (Removed unused `offold_ids` local from the original.)
        print(f"PREDICT FOLD '{','.join(fold_id)}'")

        # Generate datasets
        datasets = {
            'train': dset_df[dset_df.train & ~dset_df.fold.isin(fold_id)],
            'valid': dset_df[dset_df.train & dset_df.fold.isin(fold_id)
                             & dset_df.for_validation],
            'fold': dset_df[dset_df.train & dset_df.fold.isin(fold_id)],
            'test': dset_df[dset_df.test],
        }
        if debug:
            datasets['train'] = datasets['train'][0:len(datasets['valid'])]
        print(f"Training: {len(datasets['train']):,} | "
              f"Validation: {len(datasets['valid']):,} | "
              f"OOT-Fold: {len(datasets['fold']):,} | "
              f"Test: {len(datasets['test']):,}")

        # Get data loaders
        data_loaders = MM.get_dataloaders(path_to_data, datasets)

        # Output name ("ouput" spelling kept — it matches the module attribute).
        ouput_name = MM.args.ouput_name if not debug else 'debug_' + MM.args.ouput_name

        # Load the fold's trained model.
        filename = os.path.join(PATH_TO_ROOT, MODELS_PATH,
                                ouput_name + f"_model_{','.join(fold_id)}.tar")
        learner = MM.load_model(filename)

        if not only_test:
            # Predict Fold
            print('-' * 40 + " Predicting Fold")
            valid_preds = learner.predict_loader(data_loaders['fold'])
            valid_preds_df = pd.DataFrame(valid_preds,
                                          index=datasets['fold'].image_id)
            valid_preds_df.columns = ['wind_speed']
            print(f"Dataset shape: {valid_preds_df.shape}")
            print(valid_preds_df.head(2))
            filepath = os.path.join(
                PATH_TO_ROOT, OUTPUTS_PATH,
                ouput_name + f"_fold_{','.join(fold_id)}.csv.gz")
            print(f"Saving FOLD predictions: {filepath}")
            valid_preds_df.to_csv(filepath, index=True)

        # Predict Test
        print('-' * 40 + " Predicting Test")
        test_preds = learner.predict_loader(data_loaders['test'])
        test_preds_df = pd.DataFrame(test_preds,
                                     index=datasets['test'].image_id)
        test_preds_df.columns = ['wind_speed']
        print(f"Dataset shape: {test_preds_df.shape}")
        # Fix: the original evaluated `head(2)` without printing it (a no-op);
        # the fold branch above prints it, so mirror that here.
        print(test_preds_df.head(2))
        filepath = os.path.join(
            PATH_TO_ROOT, OUTPUTS_PATH,
            ouput_name + f"_test_{','.join(fold_id)}.csv.gz")
        print(f"Saving TEST predictions: {filepath}")
        test_preds_df.to_csv(filepath, index=True)

        # Free memory before the next fold.
        learner.clean_memory()
        del learner
# NOTE(review): these assignments read like a hand-inlined argparse
# configuration — the stray "type=int)" remnant below supports that; confirm
# against the original command-line script these defaults came from.
test_all = False
test_start_from = -1  # type=int)  (leftover from the original argparse option)
train_from = ''
report_rouge = True
block_trigram = True

# `parser` is presumably the parsed-argument namespace built elsewhere in this
# file — TODO confirm; everything below mutates it in place.
args = parser
# One GPU rank per entry in the comma-separated --visible_gpus list.
args.gpu_ranks = [int(i) for i in range(len(args.visible_gpus.split(',')))]
args.world_size = len(args.gpu_ranks)
os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpus

init_logger(args.log_file)
# Fall back to CPU when no GPU is requested ('-1' sentinel).
device = "cpu" if args.visible_gpus == '-1' else "cuda"
device_id = 0 if device == "cuda" else -1


def summarize_pdf(pdf_file, sent_percentage):
    """Summarize the text content of ``pdf_file``.

    NOTE(review): only the opening of this function is visible in this chunk —
    the body is truncated below (the ``body`` accumulator and
    ``sent_percentage`` are not yet used at the point where the view ends).
    The file handle opened here is not closed in the visible portion.
    """
    pdf_file_obj = open(pdf_file, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
    # Prefix the summary heading with the document title when one is present.
    title = pdf_reader.getDocumentInfo().title
    summary_title = "Summary"
    if title is not None:
        summary_title = title + ' - ' + summary_title
    num_of_pages = pdf_reader.numPages
    body = ''
    for i in range(num_of_pages):
        pageobj = pdf_reader.getPage(i)
def train(self,
          train,
          dev,
          test,
          buckets=32,
          batch_size=5000,
          lr=8e-4,
          mu=.9,
          nu=.9,
          epsilon=1e-12,
          clip=5.0,
          decay=.75,
          decay_steps=5000,
          step_decay_factor=0.5,
          step_decay_patience=15,
          epochs=5000,
          patience=100,
          verbose=True,
          **kwargs):
    """Train the model, evaluating on dev/test every epoch and keeping the
    best-dev checkpoint.

    Args:
        train, dev, test: inputs used to build the three datasets.
        buckets: number of length buckets for batching.
        batch_size: token budget per batch (split evenly across ranks
            under distributed training).
        lr, mu, nu, epsilon: Adam hyper-parameters (lr, betas, eps).
        clip: gradient-clip threshold (not read directly in this body;
            presumably consumed via ``args`` inside ``_train`` — confirm).
        decay, decay_steps: parameters of the 'Exponential' LR schedule.
        step_decay_factor, step_decay_patience: parameters of the
            'Plateau' LR schedule.
        epochs: maximum number of training epochs.
        patience: early-stopping patience — the stopping check below is
            currently commented out, so training always runs all epochs.
        verbose: logging verbosity.
    """
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.train()
    # Each distributed rank processes an equal share of the global batch.
    if dist.is_initialized():
        args.batch_size = args.batch_size // dist.get_world_size()
    logger.info("Load the data")
    train = Dataset(self.transform, args.train, **args)
    dev = Dataset(self.transform, args.dev)
    test = Dataset(self.transform, args.test)
    # Only the training set is shuffled and (when DDP is up) sharded.
    train.build(args.batch_size, args.buckets, True, dist.is_initialized())
    dev.build(args.batch_size, args.buckets)
    test.build(args.batch_size, args.buckets)
    logger.info(
        f"\n{'train:':6} {train}\n{'dev:':6} {dev}\n{'test:':6} {test}\n")

    logger.info(f"{self.model}\n")
    if dist.is_initialized():
        self.model = DDP(self.model,
                         device_ids=[dist.get_rank()],
                         find_unused_parameters=True)

    self.optimizer = Adam(self.model.parameters(), args.lr,
                          (args.mu, args.nu), args.epsilon)
    # Two mutually exclusive LR schedules selected by configuration.
    if self.args.learning_rate_schedule == 'Exponential':
        self.scheduler = ExponentialLR(self.optimizer,
                                       args.decay**(1 / args.decay_steps))
    elif self.args.learning_rate_schedule == 'Plateau':
        self.scheduler = ReduceLROnPlateau(
            self.optimizer,
            'max',
            factor=args.step_decay_factor,
            patience=args.step_decay_patience,
            verbose=True)

    elapsed = timedelta()
    best_e, best_metric = 1, Metric()
    best_metric_test = Metric()  # NOTE(review): assigned but never read below

    for epoch in range(1, args.epochs + 1):
        start = datetime.now()

        logger.info(f"Epoch {epoch} / {args.epochs}:")
        loss = self._train(train.loader)
        logger.info(f"{'train:':6} - loss: {loss:.4f}")
        loss, dev_metric = self._evaluate(dev.loader)
        logger.info(f"{'dev:':6} - loss: {loss:.4f} - {dev_metric}")
        loss, test_metric = self._evaluate(test.loader)
        logger.info(f"{'test:':6} - loss: {loss:.4f} - {test_metric}")

        t = datetime.now() - start
        # save the model if it is the best so far
        # NOTE(review): the nesting of the two statements after the
        # is_master() guard is reconstructed from flattened source —
        # confirm keep_last_n_checkpoint should run on every rank.
        if dev_metric > best_metric:
            best_e, best_metric = epoch, dev_metric
            dev_metric_name = '_dev_LP_{:.2f}_LR_{:.2f}_LF_{:.2f}.pt'.format(
                100 * best_metric.lp, 100 * best_metric.lr,
                100 * best_metric.lf)
            # Only the master rank writes checkpoints.
            if is_master():
                self.save(args.path + dev_metric_name)
            logger.info(f"{t}s elapsed (saved)\n")
            # Prune old dev checkpoints, keeping the 5 most recent.
            keep_last_n_checkpoint(args.path + '_dev_', n=5)
        else:
            logger.info(f"{t}s elapsed\n")
        elapsed += t
        # The plateau scheduler steps on the best dev score seen so far.
        if self.args.learning_rate_schedule == 'Plateau':
            self.scheduler.step(best_metric.score)
        # Early stopping is currently disabled:
        # if epoch - best_e >= args.patience:
        #     break

    # Reload the best checkpoint and report its test performance.
    loss, metric = self.load(args.path)._evaluate(test.loader)

    logger.info(f"Epoch {best_e} saved")
    logger.info(f"{'dev:':6} - {best_metric}")
    logger.info(f"{'test:':6} - {metric}")
    logger.info(f"{elapsed}s elapsed, {elapsed / epoch}s/epoch")
def train_abs_single(args, device_id):
    """Train the abstractive summarizer on a single device.

    Args:
        args: Parsed command-line namespace (seed, checkpoints, optimizer
            and loader options).
        device_id: CUDA device index, or -1 for CPU.
    """
    init_logger(args.log_file)
    logger.info(str(args))

    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    # Seed all RNGs for reproducibility. (The original repeated this exact
    # block a second time further down; the duplicate has been removed.)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    # Optionally resume from a checkpoint, restoring any whitelisted
    # model flags into `args`.
    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k, v in opt.items():
            if k in model_flags:
                setattr(args, k, v)
    else:
        checkpoint = None

    # Optionally warm-start BERT weights from a trained extractive model.
    if args.load_from_extractive != '':
        logger.info('Loading bert from extractive model %s' %
                    args.load_from_extractive)
        bert_from_extractive = torch.load(
            args.load_from_extractive,
            map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None

    def train_iter_fct():
        # Fresh shuffled training iterator for each pass.
        return data_loader.Dataloader(args,
                                      load_dataset(args, 'train', shuffle=True),
                                      args.batch_size,
                                      device,
                                      shuffle=True,
                                      is_test=False)

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)

    # Either separate optimizers for the BERT encoder and the decoder,
    # or a single shared one.
    if args.sep_optim:
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]
    logger.info(model)

    # Special token ids taken from the BERT vocabulary's unused slots.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }

    train_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          device,
                          train=True,
                          label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)
    trainer.train(train_iter_fct, args.train_steps)