def sequentialSampler(self, batch_size):
    return DataLoader(self, sampler=SequentialSampler(self),
                      batch_size=batch_size, num_workers=8)
def __iter__(self):
    self.sampler = SequentialSampler(range(self.size))
    self.iter_sampler = iter(self.sampler)
    return self
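# Note (assumption, not from the original source): because __iter__ returns
# self, the owning dataset acts as its own iterator, so a matching __next__
# that draws positions from iter_sampler presumably lives alongside it.
# A minimal sketch of such a companion method:
def __next__(self):
    # next() raises StopIteration once the SequentialSampler is exhausted,
    # which cleanly terminates the for-loop driving this iterator.
    idx = next(self.iter_sampler)
    return self[idx]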
def main(args):
    # with torch.cuda.device(args.gpu):
    layers_map = {'relu4_2': '22', 'relu2_2': '8', 'relu3_2': '13', 'relu1_2': '4'}

    vis = visdom.Visdom(port=args.display_port)
    loss_graph = {
        "g": [], "gd": [], "gf": [], "gpl": [], "gpab": [],
        "gs": [], "d": [], "gdl": [], "dl": [],
    }

    # For RGB, the change is to feed 3 channels to D instead of just 1, and to
    # feed 3 channels to VGG. The pixel loss can stay separate between R and GB
    # for now; assume the user uses the same weights.
    transforms = get_transforms(args)

    if args.color_space == 'rgb':
        args.pixel_weight_ab = args.pixel_weight_rgb
        args.pixel_weight_l = args.pixel_weight_rgb

    rgbify = custom_transforms.toRGB()

    train_dataset = ImageFolder('train', args.data_path, transforms)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size, shuffle=True)

    val_dataset = ImageFolder('val', args.data_path, transforms)
    indices = torch.randperm(len(val_dataset))
    val_display_size = args.batch_size
    val_display_sampler = SequentialSampler(indices[:val_display_size])
    val_loader = DataLoader(dataset=val_dataset, batch_size=val_display_size,
                            sampler=val_display_sampler)

    # renormalize = transforms.Normalize(mean=[+0.5+0.485, +0.5+0.456, +0.5+0.406],
    #                                    std=[0.229, 0.224, 0.225])
    feat_model = models.vgg19(pretrained=True)
    netG, netD, netD_local = get_models(args)
    (criterion_gan, criterion_pixel_l, criterion_pixel_ab, criterion_style,
     criterion_feat, criterion_texturegan) = get_criterions(args)

    real_label = 1
    fake_label = 0

    optimizerD = optim.Adam(netD.parameters(), lr=args.learning_rate_D,
                            betas=(0.5, 0.999))
    optimizerG = optim.Adam(netG.parameters(), lr=args.learning_rate,
                            betas=(0.5, 0.999))
    optimizerD_local = optim.Adam(netD_local.parameters(),
                                  lr=args.learning_rate_D_local,
                                  betas=(0.5, 0.999))

    with torch.cuda.device(args.gpu):
        netG.cuda()
        netD.cuda()
        netD_local.cuda()
        feat_model.cuda()
        criterion_gan.cuda()
        criterion_pixel_l.cuda()
        criterion_pixel_ab.cuda()
        criterion_feat.cuda()
        criterion_texturegan.cuda()

        input_stack = torch.FloatTensor().cuda()
        target_img = torch.FloatTensor().cuda()
        target_texture = torch.FloatTensor().cuda()
        segment = torch.FloatTensor().cuda()
        label = torch.FloatTensor(args.batch_size).cuda()
        label_local = torch.FloatTensor(args.batch_size).cuda()

        extract_content = FeatureExtractor(feat_model.features,
                                           [layers_map[args.content_layers]])
        extract_style = FeatureExtractor(
            feat_model.features,
            [layers_map[x.strip()] for x in args.style_layers.split(',')])

        model = {
            "netG": netG,
            "netD": netD,
            "netD_local": netD_local,
            "criterion_gan": criterion_gan,
            "criterion_pixel_l": criterion_pixel_l,
            "criterion_pixel_ab": criterion_pixel_ab,
            "criterion_feat": criterion_feat,
            "criterion_style": criterion_style,
            "criterion_texturegan": criterion_texturegan,
            "real_label": real_label,
            "fake_label": fake_label,
            "optimizerD": optimizerD,
            "optimizerD_local": optimizerD_local,
            "optimizerG": optimizerG
        }

        for epoch in range(args.load_epoch, args.num_epoch):
            train(model, train_loader, val_loader, input_stack, target_img,
                  target_texture, segment, label, label_local,
                  extract_content, extract_style, loss_graph, vis, epoch, args)
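# Aside (plain-PyTorch behavior worth noting, not part of the original code):
# SequentialSampler(indices[:val_display_size]) yields the positions
# 0..val_display_size-1, not the permuted values stored in `indices`, so the
# torch.randperm above has no effect on which validation images are displayed.
# A hypothetical sketch that would actually follow the permuted indices:
from torch.utils.data.sampler import SubsetRandomSampler

val_display_sampler = SubsetRandomSampler(indices[:val_display_size].tolist())
val_loader = DataLoader(dataset=val_dataset, batch_size=val_display_size,
                        sampler=val_display_sampler)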
def get_loader(dataset, dataset_root, split, transform, batch_size, shuffle,
               num_workers, include_eos, drop_last=False, shuffle_labels=False,
               seed=1234, checkpoint=None):
    # Read the file with the sample ids to use for this split
    perm_file = os.path.join('../data/splits/', dataset, split + '.txt')
    with open(perm_file, 'r') as f:
        perm = np.array([int(line.rstrip('\n')) for line in f])

    if dataset == 'coco':
        if split == 'train' or split == 'val':
            annFile = os.path.join(dataset_root, 'annotations',
                                   'instances_train2014.json')
            impath = os.path.join(dataset_root, 'train2014')
        else:
            annFile = os.path.join(dataset_root, 'annotations',
                                   'instances_val2014.json')
            impath = os.path.join(dataset_root, 'val2014')
        dataset = COCO(root=impath, annFile=annFile, transform=transform,
                       shuffle=shuffle_labels, perm=perm,
                       include_eos=include_eos)
    elif dataset == 'voc':
        dataset = VOC(root=dataset_root, year='2007', image_set=split,
                      download=False, transform=transform,
                      shuffle=shuffle_labels, perm=perm,
                      include_eos=include_eos)
    elif dataset == 'nuswide':
        dataset = NUSWIDE(dataset_root, split, transform=transform,
                          shuffle=shuffle_labels, perm=perm,
                          include_eos=include_eos)
    elif dataset == 'ade20k':
        dataset = ADE20K(dataset_root, split, transform=transform,
                         shuffle=shuffle_labels, perm=perm,
                         include_eos=include_eos)
    elif dataset == 'recipe1m':
        dataset = Recipe1M(dataset_root, split, maxnumims=5,
                           shuffle=shuffle_labels, transform=transform,
                           use_lmdb=False, suff='final_', perm=perm,
                           include_eos=include_eos)

    def worker_init_fn(worker_id):
        # Note: this seeds every worker with the same value; mixing in
        # worker_id would give each worker a distinct random stream.
        np.random.seed(seed)

    if shuffle:
        # for training
        sampler = RandomSamplerWithState(dataset, batch_size, seed)
        if checkpoint is not None:
            sampler.set_state(checkpoint['args'].current_epoch,
                              checkpoint['current_step'])
    else:
        # for validation and test
        sampler = SequentialSampler(dataset)

    data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=num_workers,
                                              drop_last=drop_last,
                                              pin_memory=True,
                                              collate_fn=collate_fn,
                                              worker_init_fn=worker_init_fn,
                                              sampler=sampler)
    return data_loader, dataset
)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=RandomSampler(train_dataset),
    pin_memory=False,
    drop_last=True,
    num_workers=6,
    collate_fn=collate_fn,
)
val_loader = torch.utils.data.DataLoader(
    validation_dataset,
    batch_size=BATCH_SIZE,
    num_workers=6,
    shuffle=False,
    sampler=SequentialSampler(validation_dataset),
    pin_memory=False,
    collate_fn=collate_fn,
)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Cuda is available: {}'.format(torch.cuda.is_available()))
cpu_device = torch.device("cpu")

num_classes = 2
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
    pretrained=True, pretrained_backbone=True)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.to(device)
def get_reactions(self, sents, products=None):
    """ """
    if products is None:
        tokenized_sents, products = self.get_products(sents)
    assert len(products) == len(tokenized_sents)

    # Create the dataset: for each sentence, create one instance per product.
    examples = []
    num_rxns_per_sent = []
    for guid, (sent, prod_labels) in enumerate(zip(tokenized_sents, products)):
        assert len(sent) == len(prod_labels)
        prods = get_entities(prod_labels)
        num_rxns_per_sent.append(len(prods))
        for i, (etype, ss, se) in enumerate(prods):
            assert etype == "Prod"
            labels = ["O"] * len(sent)
            labels[ss] = "B-Prod"
            labels[ss + 1:se + 1] = ["I-Prod"] * (se - ss)
            examples.append(InputExample(guid=guid, words=sent, labels=labels))

    features = cre.data.role.convert_examples_to_features(
        examples,
        self.role_labels,
        self.role_max_seq_len,
        self.role_tokenizer,
        pad_token=self.role_tokenizer.pad_token_id,
        pad_token_label_id=self.pad_token_label_id)
    dataset = RxnDataset(features)
    data_loader = DataLoader(dataset,
                             sampler=SequentialSampler(dataset),
                             batch_size=self.batch_size,
                             collate_fn=default_data_collator)

    all_preds = []
    for batch in data_loader:
        with torch.no_grad():
            for k, v in batch.items():
                if isinstance(v, torch.Tensor):
                    batch[k] = v.to(self.device)
            outputs = self.role_extractor(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                prod_start_mask=batch['prod_start_mask'],
                prod_end_mask=batch['prod_end_mask'],
                prod_mask=batch['prod_mask'],
                token_type_ids=batch['token_type_ids'])
            logits = outputs[0]
            preds = self.role_extractor.decode(
                logits, batch['decoder_mask'].bool().to(self.device))
        preds = [[self.role_labels[x] for x in seq] for seq in preds]
        all_preds += preds

    # Align predictions with inputs
    example_id = 0
    results = []
    for guid, sent in enumerate(tokenized_sents):
        rxns = {"tokens": sent, "reactions": []}
        for k in range(num_rxns_per_sent[guid]):
            # Merge the predictions with the product labels
            rxn_labels = []
            ex = examples[example_id]
            for j, label in enumerate(ex.labels):
                if label in ["B-Prod", "I-Prod"]:
                    rxn_labels.append(label)
                else:
                    rxn_labels.append(all_preds[example_id].pop(0))
            rxn = {}
            for role, ss, se in get_entities(rxn_labels):
                if role == "Prod":
                    rxn["Product"] = (" ".join(sent[ss:se + 1]), ss, se)
                else:
                    if role not in rxn:
                        rxn[role] = []  # e.g., multiple reactants
                    rxn[role].append((" ".join(sent[ss:se + 1]), ss, se))
            rxns["reactions"].append(rxn)
            example_id += 1
        results.append(rxns)
    return results
def fit(self, train_dataset, validation_dataset, epochs, train_batch_size,
        validation_batch_size, results_base_dir_path, epoch_handler=None,
        validation_split=None, shuffle_dataset=True):
    dataset_size = None
    train_dataset_size = None
    validation_dataset_size = None

    if validation_split is not None:
        dataset_size = len(train_dataset)
        indices = list(range(dataset_size))
        split = int(numpy.floor(validation_split * dataset_size))
        train_indices, validation_indices = indices[split:], indices[:split]
        actual_train_dataset = train_dataset
        actual_validation_dataset = train_dataset
    else:
        train_dataset_size = len(train_dataset)
        validation_dataset_size = len(validation_dataset)
        train_indices = list(range(train_dataset_size))
        validation_indices = list(range(validation_dataset_size))
        actual_train_dataset = train_dataset
        actual_validation_dataset = validation_dataset

    if shuffle_dataset is True:
        train_sampler = SubsetRandomSampler(train_indices)
        validation_sampler = SubsetRandomSampler(validation_indices)
    else:
        train_sampler = SequentialSampler(train_indices)
        validation_sampler = SequentialSampler(validation_indices)

    train_data_loader = DataLoader(actual_train_dataset,
                                   batch_size=train_batch_size,
                                   sampler=train_sampler,
                                   drop_last=False,
                                   num_workers=0)
    validation_data_loader = DataLoader(actual_validation_dataset,
                                        batch_size=validation_batch_size,
                                        sampler=validation_sampler,
                                        drop_last=False,
                                        num_workers=0)

    epochs_text = epochs if epochs is not None else 'infinite'
    ModelTrainer._print_training_configuration('Epochs', epochs_text)
    ModelTrainer._print_training_configuration('Train Batch size', train_batch_size)
    ModelTrainer._print_training_configuration('Validation Batch size', validation_batch_size)
    ModelTrainer._print_training_configuration('Training dataset length', len(train_indices))
    ModelTrainer._print_training_configuration(
        'Training batches per epoch',
        int(numpy.ceil(len(train_indices) / train_batch_size)))
    ModelTrainer._print_training_configuration('Validation dataset length', len(validation_indices))
    ModelTrainer._print_training_configuration(
        'Validation batches per epoch',
        int(numpy.ceil(len(validation_indices) / validation_batch_size)))

    results_dir_path = os.path.normpath(
        os.path.join(results_base_dir_path,
                     datetime.now().strftime('%Y-%m-%d-%H-%M-%S')))
    model_file_path = os.path.normpath(
        os.path.join(results_dir_path, 'model.pt'))
    results_file_path = os.path.normpath(
        os.path.join(results_dir_path, 'results.npy'))
    model_architecture_file_path = os.path.normpath(
        os.path.join(results_dir_path, 'model_arch.txt'))
    loss_functions_file_path = os.path.normpath(
        os.path.join(results_dir_path, 'loss_functions.txt'))
    optimizer_file_path = os.path.normpath(
        os.path.join(results_dir_path, 'optimizer.txt'))
    trainer_data_file_path = os.path.normpath(
        os.path.join(results_dir_path, 'trainer_data.txt'))

    Path(results_dir_path).mkdir(parents=True, exist_ok=True)

    with open(model_architecture_file_path, "w") as text_file:
        text_file.write(str(self._model))

    with open(loss_functions_file_path, "w") as text_file:
        for loss_function in self._loss_functions:
            text_file.write(str(loss_function))
            print('\n')

    with open(optimizer_file_path, "w") as text_file:
        text_file.write(str(self._optimizer))

    with open(trainer_data_file_path, "w") as text_file:
        text_file.write(f'train_batch_size: {train_batch_size}\n')
        text_file.write(f'validation_batch_size: {validation_batch_size}\n')
        text_file.write(f'epochs: {epochs_text}\n')
        text_file.write(f'results_dir_path: {results_dir_path}\n')
        if validation_split is not None:
            text_file.write(f'validation_split: {validation_split}\n')
            text_file.write(f'dataset_size: {dataset_size}\n')
        else:
            text_file.write(f'train_dataset_size: {train_dataset_size}\n')
            text_file.write(f'validation_dataset_size: {validation_dataset_size}\n')

    print(f' - Start Training:')
    results = None
    best_validation_average_loss = None
    train_loss_array = numpy.array([])
    validation_loss_array = numpy.array([])
    for epoch_index in itertools.count():
        print(f' - Training Epoch #{epoch_index+1}:')
        train_loss = self._train_epoch(epoch_index=epoch_index,
                                       data_loader=train_data_loader)
        train_loss_array = numpy.append(train_loss_array,
                                        [numpy.mean(train_loss)])

        print(f' - Validation Epoch #{epoch_index+1}:')
        validation_loss = self._validation_epoch(
            epoch_index=epoch_index, data_loader=validation_data_loader)
        validation_loss_array = numpy.append(validation_loss_array,
                                             [numpy.mean(validation_loss)])

        if best_validation_average_loss is None:
            torch.save(self._model.state_dict(), model_file_path)
            best_validation_average_loss = numpy.mean(validation_loss)
        else:
            validation_average_loss = numpy.mean(validation_loss)
            if validation_average_loss < best_validation_average_loss:
                torch.save(self._model.state_dict(), model_file_path)
                best_validation_average_loss = validation_average_loss

        latest_model_path = os.path.normpath(
            os.path.join(results_dir_path, f'model_{epoch_index}.pt'))
        torch.save(self._model.state_dict(), latest_model_path)

        if epoch_handler is not None:
            epoch_handler(epoch_index)

        results = {
            'train_loss_array': train_loss_array,
            'validation_loss_array': validation_loss_array,
            'epochs': epochs_text,
            'train_batch_size': train_batch_size,
            'validation_batch_size': validation_batch_size,
            'model_file_path': model_file_path,
            'results_file_path': results_file_path
        }
        numpy.save(file=results_file_path, arr=results, allow_pickle=True)

        if (epochs is not None) and (epoch_index + 1 == epochs):
            break

    return results
    transform=transform,
)

num_train = len(train_dataset)
indices = list(range(num_train))
for i in range(100):
    np.random.shuffle(indices)

split1 = int(np.floor(0.14 * num_train))  # first 14% of the shuffled data -> test
split2 = int(np.floor(0.23 * num_train))  # next 9% (14%..23%) -> validation, rest -> train
train_indices, valid_indices, test_indices = (
    indices[split2:], indices[split1:split2], indices[:split1])

train_sampler = SubsetRandomSampler(train_indices)
# NB: SequentialSampler(seq) yields the positions 0..len(seq)-1, not the
# values in seq, so these two samplers do not actually follow the shuffled
# hold-out indices; SubsetRandomSampler (or a custom sampler) would.
val_sampler = SequentialSampler(valid_indices)
test_sampler = SequentialSampler(test_indices)

kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           sampler=train_sampler,
                                           batch_size=args.batch_size,
                                           shuffle=False,
                                           pin_memory=True)
test_loader = torch.utils.data.DataLoader(train_dataset,
                                          sampler=val_sampler,
                                          batch_size=args.test_batch_size,
                                          shuffle=False,
                                          pin_memory=True)
test_loader2 = torch.utils.data.DataLoader(train_dataset,
def test_auto_dataloader_warning(distributed_context_single_node_gloo):
    with pytest.warns(UserWarning, match=r"Found batch_sampler in provided kwargs"):
        auto_dataloader(
            DummyDS(),
            batch_sampler=BatchSampler(SequentialSampler(range(10)),
                                       batch_size=3, drop_last=False))
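# Background (plain-PyTorch behavior, not part of the test itself): in
# torch.utils.data.DataLoader, batch_sampler is mutually exclusive with
# batch_size, shuffle, sampler, and drop_last, which is why distributed
# helpers such as auto_dataloader warn before re-batching. A minimal
# stand-alone sketch of the same construction:
from torch.utils.data import BatchSampler, DataLoader, SequentialSampler

loader = DataLoader(list(range(10)),
                    batch_sampler=BatchSampler(SequentialSampler(range(10)),
                                               batch_size=3, drop_last=False))
# Iterating yields tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7, 8]), tensor([9]).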
def update_stage1_oof_preds(df, cv_df):
    res_file_name = STAGE1_CFGS_TAG + "-train.csv"
    new_feats = get_stage1_columns()
    for f in new_feats:
        df[f] = 0

    if os.path.isfile(res_file_name):
        df = pd.read_csv(res_file_name)
        print('img acc:', ((df[new_feats[0]] > 0) == df[CFG['image_target_cols'][0]]).mean())
        return df

    for fold, (train_fold, valid_fold) in enumerate(zip(CFG['train_folds'], CFG['valid_folds'])):
        if fold < 0:
            continue

        valid_patients = cv_df.loc[cv_df.fold.isin(valid_fold), 'StudyInstanceUID'].unique()
        filt = df.StudyInstanceUID.isin(valid_patients)
        valid_ = df.loc[filt, :].reset_index(drop=True)

        image_preds_all_list = []
        for cfg in STAGE1_CFGS:
            valid_ds = cfg['dataset_constructor'](valid_, 0.0, CFG['train_img_path'],
                                                  image_subsampling=False,
                                                  transforms=get_valid_transforms(),
                                                  output_label=True)
            val_loader = torch.utils.data.DataLoader(
                valid_ds,
                batch_size=256,
                num_workers=CFG['num_workers'],
                shuffle=False,
                pin_memory=False,
                sampler=SequentialSampler(valid_ds))

            device = torch.device(CFG['device'])
            model = cfg['model_constructor']().to(device)
            model.load_state_dict(torch.load('{}/model_fold_{}_{}'.format(
                CFG['model_path'], fold, cfg['tag'])))
            model.eval()

            image_preds_all = []
            correct_count = 0
            count = 0
            for step, (imgs, target) in enumerate(val_loader):
                imgs = imgs.to(device).float()
                target = target.to(device).float()

                image_preds = model(imgs)  # output = model(input)
                # print(image_preds[:,0], image_preds[:,0].shape)
                # print(target, target.shape)
                if len(image_preds.shape) == 1:
                    image_preds = image_preds.view(-1, 1)

                correct_count += ((image_preds[:, 0] > 0) == target[:, 0]).sum().detach().item()
                count += imgs.shape[0]
                image_preds_all += [image_preds.cpu().detach().numpy()]
                print('acc: {:.4f}, {}, {}, {}/{}'.format(
                    correct_count / count, correct_count, count,
                    step + 1, len(val_loader)), end='\r')
            print()

            image_preds_all = np.concatenate(image_preds_all, axis=0)
            image_preds_all_list += [image_preds_all]
            del model, val_loader
            torch.cuda.empty_cache()

        image_preds_all_list = np.concatenate(image_preds_all_list, axis=1)
        df.loc[filt, new_feats] = image_preds_all_list

    df.to_csv(res_file_name, index=False)
    return df
def main():
    args = parser.parse_args()

    log_out_dir = opj(RESULT_DIR, 'logs', args.out_dir, f'fold{args.fold}')
    if not ope(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(opj(log_out_dir, 'log.submit.txt'), mode='a')

    if args.ema:
        network_path = opj(RESULT_DIR, 'models', args.out_dir,
                           f'fold{args.fold}', f'{args.predict_epoch}_ema.pth')
    else:
        network_path = opj(RESULT_DIR, 'models', args.out_dir,
                           f'fold{args.fold}', f'{args.predict_epoch}.pth')

    submit_out_dir = opj(RESULT_DIR, 'submissions', args.out_dir,
                         f'fold{args.fold}', f'epoch_{args.predict_epoch}')
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(submit_out_dir))
    if not ope(submit_out_dir):
        os.makedirs(submit_out_dir)

    # setting up the visible GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id

    args.augment = args.augment.split(',')
    for augment in args.augment:
        if augment not in augment_list:
            raise ValueError('Unsupported or unknown test augmentation: {}!'.format(augment))

    model_params = {}
    model_params['architecture'] = args.arch
    model = init_network(model_params)

    log.write(">> Loading network:\n>>>> '{}'\n".format(network_path))
    checkpoint = torch.load(network_path)
    model.load_state_dict(checkpoint['state_dict'])
    log.write(">>>> loaded network:\n>>>> epoch {}\n".format(checkpoint['epoch']))

    # moving network to gpu and eval mode
    model = DataParallel(model)
    model.cuda()
    model.eval()

    # Data loading code
    dataset = args.dataset
    if dataset == 'test':
        steel_test_df = pd.read_csv(opj('..', 'input', 'sample_submission.csv'))
    elif dataset == 'val':
        steel_test_df = pd.read_csv(
            opj(DATA_DIR, args.split_type, args.split_name,
                f'random_valid_cv{args.fold}.csv'))
    else:
        raise ValueError('Unsupported or unknown dataset: {}!'.format(dataset))

    steel_test_df['ImageId'], steel_test_df['ClassId'] = zip(
        *steel_test_df['ImageId_ClassId'].apply(lambda x: x.split('_')))
    imageId = pd.DataFrame(steel_test_df['ImageId'].unique(), columns=['ImageId'])

    test_dataset = SteelDataset(
        imageId,
        img_size=args.img_size,
        mask_size=args.img_size,
        transform=None,
        return_label=False,
        dataset=args.dataset,
    )
    test_loader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=args.batch_size,
        drop_last=False,
        num_workers=args.workers,
        pin_memory=True,
    )

    for augment in args.augment:
        test_loader.dataset.transform = eval('augment_%s' % augment)
        unaugment_func = eval('unaugment_%s' % augment)
        sub_submit_out_dir = opj(submit_out_dir, augment)
        if not ope(sub_submit_out_dir):
            os.makedirs(sub_submit_out_dir)
        with torch.no_grad():
            predict(test_loader, model, sub_submit_out_dir, dataset, args,
                    unaugment_func=unaugment_func)
def __init__(self, data, mode, batch_size, vocabs, topology, bucket_by,
             max_len=None, bucket_order=None, **kwargs):
    self.datasets = {}
    self.mode = mode
    self.vocabs = vocabs
    self.batch_size = batch_size
    self.topology = topology
    self.bucket_by = bucket_by
    # Disable filtering if not training
    self.max_len = max_len if self.mode == 'train' else None
    self.bucket_order = bucket_order if self.mode == 'train' else None

    # For old models to work, set it to the first source
    if self.bucket_by is None:
        if len(self.topology.get_src_langs()) > 0:
            self.bucket_by = self.topology.get_src_langs()[0]
        elif self.mode != 'beam' and len(self.topology.get_trg_langs()) > 0:
            self.bucket_by = self.topology.get_trg_langs()[0]

    for key, ds in self.topology.all.items():
        if self.mode == 'beam' and ds.trg:
            # Skip target streams
            continue

        if key == self.bucket_by:
            self.bucket_by = ds

        if ds._type == "Text":
            # Prepend <bos> if datasource is on target side
            self.datasets[ds] = TextDataset(data[key], vocabs[key], bos=ds.trg)
        elif ds._type == "OneHot":
            self.datasets[ds] = OneHotDataset(data[key], vocabs[key])
        elif ds._type == "ImageFolder":
            self.datasets[ds] = ImageFolderDataset(data[key], **kwargs)
        elif ds._type == "Numpy":
            self.datasets[ds] = NumpyDataset(data[key])
        elif ds._type == "Shelve":
            self.datasets[ds] = ShelveDataset(data[key], **kwargs)
        elif ds._type == "Kaldi":
            self.datasets[ds] = KaldiDataset(data[key])
        elif ds._type == "NumpySequence":
            self.datasets[ds] = NumpySequenceDataset(data[key], **kwargs)
        else:
            raise ValueError("Unknown dataset type: {}.".format(ds))

    # Detect dataset sizes
    sizes = set()
    for dataset in self.datasets.values():
        sizes.add(len(dataset))
    assert len(sizes) == 1, "Non-parallel datasets are not supported."

    # Set dataset size
    self.size = list(sizes)[0]

    # Set list of available datasets
    self.keys = list(self.datasets.keys())
    self.n_sources = len([k for k in self.keys if k.src])
    self.n_targets = len([k for k in self.keys if k.trg])

    self.collate_fn = get_collate(self.keys)

    if self.bucket_by is not None:
        self.sort_lens = self.datasets[self.bucket_by].lengths
        self.sampler = BucketBatchSampler(
            batch_size=self.batch_size,
            sort_lens=self.sort_lens,
            max_len=self.max_len,
            store_indices=self.mode == 'beam',
            order=self.bucket_order)
    else:
        # No modality to sort batches with; return sequential data.
        # Used for beam search in image->text tasks.
        self.sampler = BatchSampler(SequentialSampler(self),
                                    batch_size=self.batch_size,
                                    drop_last=False)
def run_train(args):
    out_dir = args.out_dir + '/' + args.model_name
    use_gridmask = args.use_gridmask
    initial_checkpoint = args.initial_checkpoint

    if args.scheduler_name == 'null':
        scheduler = NullScheduler(lr=0.001)
    else:
        scheduler = CyclicScheduler0(min_lr=0.00001, max_lr=0.00005,
                                     period=750, ratio=1)

    iter_accum = 1
    batch_size = args.batch_size

    # set up directories
    for f in ['checkpoint']:
        os.makedirs(out_dir + '/' + f, exist_ok=True)

    log = Logger()
    log.open(out_dir + '/log.train.txt', mode='a')
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\n')
    log.write('\tSEED     = %u\n' % SEED)
    log.write('\t__file__ = %s\n' % __file__)
    log.write('\tout_dir  = %s\n' % out_dir)
    log.write('\n')

    ## dataset ----------------------------------------
    log.write('** dataset setting **\n')
    files_train = [f'train_image_data_{fid}.feather' for fid in range(4)]
    data = read_data(args.data_dir, files_train)
    df = pd.read_csv(args.df_path)
    train_split = np.load(args.data_dir + '/train_b_fold1_184855.npy').tolist()
    valid_split = np.load(args.data_dir + '/valid_b_fold1_15985.npy').tolist()
    train_df = df[df['image_id'].isin(train_split)]
    valid_df = df[df['image_id'].isin(valid_split)]

    train_dataset = KaggleDataset(
        df=df,
        data=data,
        idx=train_df.index.values,
        augment=train_augment if use_gridmask else valid_augment,
    )
    train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
        drop_last=True,
        num_workers=4,
        pin_memory=True,
        collate_fn=null_collate)

    valid_dataset = KaggleDataset(
        df=df,
        data=data,
        idx=valid_df.index.values,
        augment=valid_augment,
    )
    valid_loader = DataLoader(
        valid_dataset,
        sampler=SequentialSampler(valid_dataset),
        batch_size=batch_size,
        drop_last=False,
        num_workers=4,
        pin_memory=True,
        collate_fn=null_collate)

    assert (len(train_dataset) >= batch_size)
    log.write('batch_size = %d\n' % (batch_size))
    log.write('\n')

    ## net ----------------------------------------
    log.write('** net setting **\n')
    if args.model_name == 'serex50':
        net = Serex50_Net().cuda()
    elif args.model_name == 'effnetb3':
        net = EfficientNet_3().cuda()
    else:
        raise NotImplementedError

    log.write('\tinitial_checkpoint = %s\n' % initial_checkpoint)
    if initial_checkpoint is not None:
        state_dict = torch.load(initial_checkpoint,
                                map_location=lambda storage, loc: storage)
        net.load_state_dict(state_dict, strict=True)
    else:
        if args.model_name == 'serex50':
            net.load_pretrain(is_print=False)

    log.write('net=%s\n' % (type(net)))
    log.write('\n')

    if args.optimizer_name == 'AdamW':
        optimizer = torch.optim.AdamW(
            filter(lambda p: p.requires_grad, net.parameters()),
            lr=scheduler(0), weight_decay=1e-4)
    else:
        optimizer = torch.optim.SGD(
            filter(lambda p: p.requires_grad, net.parameters()),
            lr=scheduler(0), momentum=0.0, weight_decay=1e-4)

    num_iters = 3000 * 1000
    iter_smooth = 50
    iter_log = 250
    iter_valid = 500
    iter_save = [0, num_iters - 1] \
        + list(range(0, num_iters, 1000))  # 1*1000

    start_iter = 0
    start_epoch = 0
    rate = 0
    if initial_checkpoint is not None:
        initial_optimizer = initial_checkpoint.replace('_model.pth', '_optimizer.pth')
        if os.path.exists(initial_optimizer):
            checkpoint = torch.load(initial_optimizer)
            start_iter = checkpoint['iter']
            start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])

    log.write('optimizer\n  %s\n' % (optimizer))
    log.write('scheduler\n  %s\n' % (scheduler))
    log.write('\n')

    ## start training here! ##############################################
    log.write('** start training here! **\n')
    log.write('   batch_size=%d, iter_accum=%d\n' % (batch_size, iter_accum))
    log.write('   experiment = %s\n' % str(__file__.split('/')[-2:]))
    log.write('                     |----------------------- VALID------------------------------------|------- TRAIN/BATCH -----------\n')
    log.write('rate     iter  epoch | kaggle                 | loss    acc    | loss           | time          \n')
    log.write('----------------------------------------------------------------------------------------------------------------------\n')

    def message(rate, iter, epoch, kaggle, valid_loss, train_loss, batch_loss, mode='print'):
        if mode == ('print'):
            asterisk = ' '
            loss = batch_loss
        if mode == ('log'):
            asterisk = '*' if iter in iter_save else ' '
            loss = train_loss
        text = \
            '%0.5f %5.1f%s %4.1f | ' % (rate, iter / 1000, asterisk, epoch,) + \
            '%0.4f : %0.4f %0.4f %0.4f | ' % (kaggle[1], *kaggle[0]) + \
            '%4.4f, %4.4f, %4.4f : %4.4f, %4.4f, %4.4f | ' % (*valid_loss,) + \
            '%4.4f, %4.4f, %4.4f |' % (*loss,) + \
            '%s' % (time_to_str((timer() - start_timer), 'min'))
        return text

    kaggle = (0, 0, 0, 0)
    valid_loss = np.zeros(6, np.float32)
    train_loss = np.zeros(3, np.float32)
    batch_loss = np.zeros_like(train_loss)
    iter = 0
    i = 0

    start_timer = timer()
    while iter < num_iters:
        sum_train_loss = np.zeros_like(train_loss)
        sum_train = np.zeros_like(train_loss)

        optimizer.zero_grad()
        for t, (input, truth, infor) in enumerate(train_loader):
            input, truth, shuffled_truth, lam = cutmix(input, truth, alpha=0.3)

            batch_size = len(infor)
            iter = i + start_iter
            epoch = (iter - start_iter) * batch_size / len(train_dataset) + start_epoch

            if (iter % iter_valid == 0):
                valid_loss, kaggle = do_valid(net, valid_loader, out_dir)

            if (iter % iter_log == 0):
                print('\r', end='', flush=True)
                log.write(message(rate, iter, epoch, kaggle, valid_loss,
                                  train_loss, batch_loss, mode='log'))
                log.write('\n')

            if iter in iter_save:
                torch.save({
                    'optimizer': optimizer.state_dict(),
                    'iter': iter,
                    'epoch': epoch,
                }, out_dir + '/checkpoint/%08d_optimizer.pth' % (iter))
                if iter != start_iter:
                    torch.save(net.state_dict(),
                               out_dir + '/checkpoint/%08d_model.pth' % (iter))

            # learning rate scheduler -------------
            lr = scheduler(iter)
            if lr < 0:
                break
            adjust_learning_rate(optimizer, lr)
            rate = get_learning_rate(optimizer)

            net.train()
            input = input.cuda()
            truth = [t.cuda() for t in truth]
            shuffled_truth = [t.cuda() for t in shuffled_truth]

            logit = net(input)
            probability = logit_to_probability(logit)
            loss = cutmix_criterion(logit, truth, shuffled_truth, lam)

            ((loss[0] + loss[1] + loss[2]) / iter_accum).backward()
            if (iter % iter_accum) == 0:
                optimizer.step()
                optimizer.zero_grad()

            loss = [l.item() for l in loss]
            l = np.array([*loss, ]) * batch_size
            n = np.array([1, 1, 1]) * batch_size
            batch_loss = l / (n + 1e-8)
            sum_train_loss += l
            sum_train += n
            if iter % iter_smooth == 0:
                train_loss = sum_train_loss / (sum_train + 1e-12)
                sum_train_loss[...] = 0
                sum_train[...] = 0

            print('\r', end='', flush=True)
            print(message(rate, iter, epoch, kaggle, valid_loss, train_loss,
                          batch_loss, mode='print'), end='', flush=True)
            i = i + 1
        # -- end of one data loader --
    # -- end of all iterations --
    log.write('\n')
def train(args):
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        'sst': SstProcessor,
        'aspect': AspectProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which takes care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r",
                device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".format(
                args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
                args.max_seq_length, bert_config.max_position_embeddings))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size * args.num_train_epochs)

    model = BertForMultiLabelClassification(bert_config, len(label_list))
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Match parameter names by substring: names such as 'encoder.layer.0.bias'
    # contain 'bias' but never equal it, so an exact membership test would put
    # every parameter into the weight-decay group.
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [{
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        min_loss = 100000000
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, (input_ids, input_mask, segment_ids, label_ids) in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    global_step += 1

            if args.do_eval:
                eval_examples = processor.get_dev_examples(args.data_dir)
                eval_features = convert_examples_to_features(
                    eval_examples, label_list, args.max_seq_length, tokenizer)

                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
                all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
                all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label_ids)
                if args.local_rank == -1:
                    eval_sampler = SequentialSampler(eval_data)
                else:
                    eval_sampler = DistributedSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=args.eval_batch_size)

                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                logit_list = []
                labels_eval_list = []
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits = model(input_ids, segment_ids,
                                                      input_mask, label_ids)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    logit_list.extend(logits.tolist())
                    labels_eval_list.extend(label_ids.tolist())
                    tmp_eval_accuracy = accuracy(logits, label_ids)
                    # _ = accuracy2(logits, label_ids)
                    # _ = accuracy3(logits, label_ids)
                    # _ = accuracy4(logits, label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

                print(epoch)
                _ = accuracy2(logit_list, labels_eval_list)
                _ = accuracy3(logit_list, labels_eval_list)
                _ = accuracy3_2(logit_list, labels_eval_list)
                _ = accuracy4(logit_list, labels_eval_list)
                _ = accuracy5(logit_list, labels_eval_list)
                _ = accuracy7(logit_list, labels_eval_list)

                eval_loss = eval_loss / nb_eval_steps  # len(eval_dataloader)
                eval_accuracy = eval_accuracy / nb_eval_examples  # len(eval_dataloader)
                print("eval_loss", eval_loss)

                result = {
                    'eval_loss': eval_loss,
                    'eval_accuracy': eval_accuracy,
                    'global_step': global_step,
                    'loss': tr_loss / nb_tr_steps
                }  # 'loss': loss.item()}

                output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    logger.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))

                if eval_loss < min_loss:
                    np.save(os.path.join(args.data_dir, 'oof_train'),
                            np.asarray(logit_list))
                    np.save(os.path.join(args.data_dir, 'oof_train_y'),
                            np.asarray(labels_eval_list))

                    eval_examples = processor.get_test_examples(args.data_dir)
                    eval_features = convert_examples_to_features(
                        eval_examples, label_list, args.max_seq_length, tokenizer)

                    logger.info("***** Running test *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
                    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
                    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
                    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

                    eval_data = TensorDataset(all_input_ids, all_input_mask,
                                              all_segment_ids, all_label_ids)
                    if args.local_rank == -1:
                        eval_sampler = SequentialSampler(eval_data)
                    else:
                        eval_sampler = DistributedSampler(eval_data)
                    eval_dataloader = DataLoader(eval_data,
                                                 sampler=eval_sampler,
                                                 batch_size=args.eval_batch_size)

                    model.eval()
                    logit_test = []
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            _, logits = model(input_ids, segment_ids,
                                              input_mask, label_ids)

                        logits = logits.detach().cpu().numpy()
                        # label_ids = label_ids.to('cpu').numpy()
                        logit_test.extend(logits.tolist())
                        # labels_eval_list.extend(label_ids.tolist())

                    np.save(os.path.join(args.data_dir, 'oof_test'),
                            np.asarray(logit_test))
                    min_loss = eval_loss
if isinstance(data, np.ndarray):
    data = np.reshape(data, (n_data_total, -1, data.shape[-1]))
elif isinstance(data, list):
    data = [np.reshape(data_slice, (-1, data_slice.shape[-1]))
            for data_slice in data]
else:
    raise ValueError("Invalid type of data given")

# take out only a fraction of the test data
data = data[idx_start:idx_end]
labels = labels[idx_start:idx_end]
n_data_total = len(data)
input_size = data[0].shape[0]
output_size = meta['output_size']

dataset = nu.TimeSeriesDataset(data, labels, transform=nu.ToTensor())
dataloader = DataLoader(dataset,
                        sampler=SequentialSampler(range(n_data_total)),
                        batch_size=batch_size,
                        collate_fn=nu.collate_fn,
                        num_workers=0)

if device == 'cpu':
    rnn = nu.RNN(input_size=meta['input_size'],
                 hidden_size=meta['hidden_size'],
                 output_size=meta['output_size'],
                 n_layers=meta['n_layers'],
                 bidirectional=meta['bidirectional'])
else:
    rnn = nu.RNN(input_size=meta['input_size'],
                 hidden_size=meta['hidden_size'],
                 output_size=meta['output_size'],
                 n_layers=meta['n_layers'],
                 bidirectional=meta['bidirectional']).cuda()
rnn.load_state_dict(meta['model'][idx_min_loss_epoch])
del meta

# criterion = nn.CrossEntropyLoss(reduction='sum') if classifier else nn.MSELoss(reduction='sum')
# metric = 'cross_entropy_mean' if classifier else 'rmse'
loss_sum = {}
loss_metric = {}
loss_sum = 0.
def main():
    parser = MyArgumentParser((InferenceArguments, ))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a
        # json file, let's parse it to get our arguments.
        (args, ) = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        (args, ) = parser.parse_args_into_dataclasses()

    params = dict(
        pretrained_model_name_or_path=args.model_name_or_path,
        cache_dir=args.cache_dir,
    )
    config = AutoConfig.from_pretrained(**params)
    tokenizer = AutoTokenizer.from_pretrained(**params)
    model = AutoModelForSeq2SeqLM.from_pretrained(config=config, **params)

    if args.model_parameters:
        print("====== MODEL PARAMETER LOADING... ======\n"
              f"  {args.model_parameters}")
        model.load_state_dict(torch.load(args.model_parameters))

    max_length = args.test_max_target_length
    # set num_beams for evaluation
    num_beams = args.num_beams if args.num_beams else model.config.num_beams

    test_dataset = Seq2SeqDataset(
        tokenizer=tokenizer,
        type_path='test',
        data_dir=args.data_dir,
        max_target_length=args.test_max_target_length,
        max_source_length=args.max_source_length,
    )
    test_sampler = SequentialSampler(test_dataset)
    data_collator = Seq2SeqDataCollator(tokenizer, args)
    test_dataloader = DataLoader(
        test_dataset,
        sampler=test_sampler,
        batch_size=args.per_device_test_batch_size,
        collate_fn=data_collator,
        drop_last=False,
    )

    # prediction_loop
    description = "Prediction"
    batch_size = test_dataloader.batch_size
    num_examples = len(test_dataloader.dataset)
    print(f"***** Running {description} *****")
    print(f"  Num examples = {num_examples}")
    print(f"  Batch size = {batch_size}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    res = []
    for step, inputs in enumerate(test_dataloader):
        # prediction_step, generation-based
        has_labels = "labels" in inputs  # False

        # _prepare_inputs:
        # 1. move tensors to the device
        # 2. load _past into memory
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(device)

        gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
        generated_tokens = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            **gen_kwargs,
        )
        # In case the batch is shorter than max length, the output should be padded
        if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
            # If the PAD token is not defined, at least the EOS token has to be defined
            padded_tensor = tokenizer.pad_token_id * torch.ones(
                (generated_tokens.shape[0], gen_kwargs["max_length"]),
                dtype=generated_tokens.dtype,
                device=generated_tokens.device,
            )
            padded_tensor[:, :generated_tokens.shape[-1]] = generated_tokens
            generated_tokens = padded_tensor

        loss = None
        labels = None
        res.extend(list(generated_tokens))

    submit(args, tokenizer, res)
    print("Finished!")
def training(model_name, model_type, optimizer_name, lr_scheduler_name, lr,
             batch_size, valid_batch_size, num_epoch, start_epoch,
             accumulation_steps, train_data_folder, checkpoint_folder,
             train_split, val_split, fold, load_pretrain):
    COMMON_STRING = '@%s:  \n' % os.path.basename(__file__)
    COMMON_STRING += '\tset random seed\n'
    COMMON_STRING += '\t\tSEED = %d\n' % SEED

    torch.backends.cudnn.benchmark = False  # uses the inbuilt cudnn auto-tuner to find the fastest convolution algorithms
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = True

    COMMON_STRING += '\tset cuda environment\n'
    COMMON_STRING += '\t\ttorch.__version__              = %s\n' % torch.__version__
    COMMON_STRING += '\t\ttorch.version.cuda             = %s\n' % torch.version.cuda
    COMMON_STRING += '\t\ttorch.backends.cudnn.version() = %s\n' % torch.backends.cudnn.version()
    try:
        COMMON_STRING += '\t\tos[\'CUDA_VISIBLE_DEVICES\']     = %s\n' % os.environ['CUDA_VISIBLE_DEVICES']
        NUM_CUDA_DEVICES = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))
    except Exception:
        COMMON_STRING += '\t\tos[\'CUDA_VISIBLE_DEVICES\']     = None\n'
        NUM_CUDA_DEVICES = 1
    COMMON_STRING += '\t\ttorch.cuda.device_count()      = %d\n' % torch.cuda.device_count()
    COMMON_STRING += '\n'

    os.makedirs(checkpoint_folder + '/' + model_type + '/' + model_name,
                exist_ok=True)
    log = Logger()
    log.open(checkpoint_folder + '/' + model_type + '/' + model_name + '/' +
             model_name + '_fold_' + str(fold) + '_log_train.txt', mode='a+')
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\n')
    log.write('\tSEED         = %u\n' % SEED)
    log.write('\tPROJECT_PATH = %s\n' % train_data_folder)
    log.write('\t__file__     = %s\n' % __file__)
    log.write('\tout_dir      = %s\n' % checkpoint_folder)
    log.write('\n')

    ## dataset ----------------------------------------
    log.write('** dataset setting **\n')

    train_dataset = URESDataset(
        data_dir=train_data_folder,
        mode='train',
        csv=['train.csv', ],
        split=train_split,
        augment=transform_train,
        size=(1024, 1024),
    )
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size,
                                  drop_last=True,
                                  num_workers=4,
                                  pin_memory=True,
                                  collate_fn=null_collate)

    valid_dataset = URESDataset(
        data_dir=train_data_folder,
        mode='train',
        csv=['train.csv', ],
        split=val_split,
        augment=transform_valid,
        size=(1024, 1024),
    )
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=SequentialSampler(valid_dataset),
                                  batch_size=valid_batch_size,
                                  drop_last=False,
                                  num_workers=4,
                                  pin_memory=True,
                                  collate_fn=null_collate)

    log.write('train_dataset : \n%s\n' % (train_dataset))
    log.write('valid_dataset : \n%s\n' % (valid_dataset))
    log.write('\n')

    ############################################## define unet model with backbone
    def load(model, pretrain_file, skip=[]):
        pretrain_state_dict = torch.load(pretrain_file)
        state_dict = model.state_dict()
        keys = list(state_dict.keys())
        for key in keys:
            if any(s in key for s in skip):
                continue
            try:
                state_dict[key] = pretrain_state_dict[key]
            except:
                print(key)
        model.load_state_dict(state_dict)
        return model

    def get_deeplab_model(model_name="deep_se101", in_channel=3, num_classes=1,
                          criterion=SoftDiceLoss_binary(),
                          load_pretrain=False, checkpoint_filepath=None):
        if model_name == 'deep_se50':
            model = DeepSRNX50V3PlusD_m1(in_channel=in_channel,
                                         num_classes=num_classes,
                                         criterion=criterion)
        elif model_name == 'deep_se101':
            model = DeepSRNX101V3PlusD_m1(in_channel=in_channel,
                                          num_classes=num_classes,
                                          criterion=criterion)
        elif model_name == 'WideResnet38':
            model = DeepWR38V3PlusD_m1(in_channel=in_channel,
                                       num_classes=num_classes,
                                       criterion=criterion)
        elif model_name == 'unet_ef3':
            model = EfficientNet_3_unet()
        elif model_name == 'unet_ef5':
            model = EfficientNet_5_unet()
        else:
            print('No model name in it')
            model = None
        if load_pretrain:
            model = load(model, checkpoint_filepath)
        return model

    def get_unet_model(model_name="efficientnet-b3", IN_CHANNEL=3, NUM_CLASSES=1,
                       WIDTH=MASK_WIDTH, HEIGHT=MASK_HEIGHT,
                       load_pretrain=False, checkpoint_filepath=None):
        model = model_iMet(model_name, IN_CHANNEL, NUM_CLASSES, WIDTH, HEIGHT)
        if load_pretrain:
            model.load_pretrain(checkpoint_filepath)
        return model

    def get_aspp_model(model_name="efficientnet-b3", NUM_CLASSES=1,
                       load_pretrain=False, checkpoint_filepath=None):
        model = Net(model_name, IN_CHANNEL, NUM_CLASSES, WIDTH, HEIGHT)
        if load_pretrain:
            state_dict = torch.load(checkpoint_filepath,
                                    map_location=lambda storage, loc: storage)
            model.load_state_dict(state_dict, strict=True)
        return model

    ############################################## training parameters
    checkpoint_filename = (model_type + '/' + model_name + '/' + model_name +
                           "_" + model_type + '_fold_' + str(fold) +
                           "_checkpoint.pth")
    checkpoint_filepath = os.path.join(checkpoint_folder, checkpoint_filename)

    ############################################## model and optimizer
    if model_type == 'unet':
        model = get_unet_model(model_name=model_name, IN_CHANNEL=3,
                               NUM_CLASSES=NUM_CLASS, WIDTH=MASK_WIDTH,
                               HEIGHT=MASK_HEIGHT, load_pretrain=load_pretrain,
                               checkpoint_filepath=checkpoint_filepath)
    elif model_type == 'deeplab':
        model = get_deeplab_model(model_name=model_name, in_channel=3,
                                  num_classes=NUM_CLASS,
                                  criterion=BCEDiceLoss(),
                                  load_pretrain=load_pretrain,
                                  checkpoint_filepath=checkpoint_filepath)
    elif model_type == 'aspp':
        model = get_aspp_model(model_name=model_name, NUM_CLASSES=NUM_CLASS,
                               load_pretrain=load_pretrain,
                               checkpoint_filepath=checkpoint_filepath)
    model = model.cuda()

    if optimizer_name == "Adam":
        if model_type != 'deeplab':
            optimizer = torch.optim.Adam([
                {'params': model.decoder.parameters(), 'lr': lr, 'weight_decay': 0.01},
                {'params': model.encoder.parameters(), 'lr': lr * 0.05},
            ])
        else:
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    elif optimizer_name == "adamonecycle":
        flatten_model = lambda m: sum(map(flatten_model, m.children()), []) if num_children(m) else [m]
        get_layer_groups = lambda m: [nn.Sequential(*flatten_model(m))]
        optimizer_func = partial(optim.Adam, betas=(0.9, 0.99))
        optimizer = OptimWrapper.create(optimizer_func,
                                        3e-3,
                                        get_layer_groups(model),
                                        wd=1e-4,
                                        true_wd=True,
                                        bn_wd=True)
    elif optimizer_name == "Ranger":
        if model_type != 'deeplab':
            optimizer = Ranger([
                {'params': model.decoder.parameters(), 'lr': lr, 'weight_decay': 0.01},
                {'params': model.encoder.parameters(), 'lr': lr * 0.05},
            ])
        else:
            optimizer = Ranger(filter(lambda p: p.requires_grad, model.parameters()),
                               lr, weight_decay=1e-5)
    else:
        raise NotImplementedError

    if lr_scheduler_name == "adamonecycle":
        scheduler = lsf.OneCycle(optimizer, len(train_dataset) * num_epoch, lr,
                                 [0.95, 0.85], 10.0, 0.4)
        lr_scheduler_each_iter = True
    elif lr_scheduler_name == "CosineAnealing":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, num_epoch, eta_min=0, last_epoch=-1)
        lr_scheduler_each_iter = False
    elif lr_scheduler_name == "WarmRestart":
        scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-6)
        lr_scheduler_each_iter = False
    else:
        raise NotImplementedError

    log.write('net\n  %s\n' % (model_name))
    log.write('optimizer\n  %s\n' % (optimizer_name))
    log.write('scheduler\n  %s\n' % (lr_scheduler_name))
    log.write('\n')

    # mixed precision
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    ############################################## training
    log.write('** start training here! **\n')
    log.write('   batch_size=%d, accumulation_steps=%d\n' %
              (batch_size, accumulation_steps))
    log.write('   experiment = %s\n' % str(__file__.split('/')[-2:]))

    valid_loss = np.zeros(3, np.float32)
    train_loss = np.zeros(3, np.float32)
    valid_metric_optimal = np.inf
    eval_step = len(train_dataloader)  # or len(train_dataloader)
    log_step = 10
    eval_count = 0

    # define tensorboard writer and timer
    writer = SummaryWriter()
    start_timer = timer()

    # define criterion
    criterion = BCEDiceLoss()
    metric = FscoreMetric(activation=None)

    for epoch in range(1, num_epoch + 1):
        torch.cuda.empty_cache()

        # update lr and start from start_epoch
        # if (not lr_scheduler_each_iter):
        #     if epoch < 600:
        #         if epoch != 0:
        #             scheduler.step()
        #             scheduler = warm_restart(scheduler, T_mult=2)
        #     elif epoch > 600 and epoch < 800:
        #         optimizer.param_groups[0]['lr'] = 1e-5
        #     else:
        #         optimizer.param_groups[0]['lr'] = 5e-6

        affect_rate = CosineAnnealingWarmUpRestarts(
            epoch,
            T_0=num_epoch,
            T_warmup=15,
            gamma=0.8,
        )
        optimizer.param_groups[0]['lr'] = affect_rate * lr
        if epoch < 100:
            optimizer.param_groups[0]['lr'] = affect_rate * lr
        elif epoch < 150:
            lr = 4e-4
            optimizer.param_groups[0]['lr'] = affect_rate * lr
        else:
            lr = 1e-4
            # optimizer.param_groups[0]['lr'] = rate * lr
            # optimizer.param_groups[1]['lr'] = rate * lr * 0.01

        if (epoch < start_epoch):
            continue

        log.write("Epoch%s\n" % epoch)
        log.write('\n')

        for param_group in optimizer.param_groups:
            rate = param_group['lr']

        sum_train_loss = np.zeros_like(train_loss)
        sum_train = np.zeros_like(train_loss)

        seed_everything(SEED + epoch)
        torch.cuda.empty_cache()
        optimizer.zero_grad()

        for tr_batch_i, (X, truth_mask) in enumerate(train_dataloader):
            if (lr_scheduler_each_iter):
                scheduler.step(tr_batch_i)

            model.train()
            X = X.cuda().float()
            truth_mask = truth_mask.cuda()
            prediction = model(X)  # [N, C, H, W]
            # loss = criterion_mask(prediction, truth_mask, weight=None)
            loss = criterion(prediction, truth_mask)

            with amp.scale_loss(loss / accumulation_steps, optimizer) as scaled_loss:
                scaled_loss.backward()
            # loss.backward()

            if ((tr_batch_i + 1) % accumulation_steps == 0):
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_norm=5.0, norm_type=2)
                optimizer.step()
                optimizer.zero_grad()
                writer.add_scalar(
                    'train_loss_' + str(fold), loss.item(),
                    (epoch - 1) * len(train_dataloader) * batch_size +
                    tr_batch_i * batch_size)

            # print statistics --------
            # probability_mask = prediction
            probability_mask = torch.sigmoid(prediction)
            mask_positive = torch.where(truth_mask > 0.5,
                                        torch.ones_like(truth_mask),
                                        truth_mask)
            mask_negative = 1 - mask_positive
            fscore_positive = metric(probability_mask, mask_positive)
            fscore_negative = metric(1 - probability_mask, mask_negative)

            # probability_mask = torch.sigmoid(prediction)
            # mask_positive = np.where(truth_mask.clone().detach().cpu().numpy().flatten() > 0, 1, 0)
            # mask_negative = 1 - mask_positive
            # mask_pred_positive = np.where(probability_mask.detach().clone().cpu().numpy().flatten() > 0.5, 1, 0)
            # mask_pred_negative = 1 - mask_pred_positive
            # fscore_positive = f1_score(mask_positive, mask_pred_positive)
            # fscore_negative = f1_score(mask_negative, mask_pred_negative)

            l = np.array([loss.item() * batch_size, fscore_positive, fscore_negative])
            n = np.array([batch_size])
            sum_train_loss = sum_train_loss + l
            sum_train = sum_train + n

            # log for training
            if (tr_batch_i + 1) % log_step == 0:
                train_loss = sum_train_loss / (sum_train + 1e-12)
                sum_train_loss[...] = 0
                sum_train[...] = 0
                log.write('lr: %f train loss: %f fscore_positive: %f fscore_negative: %f\n' %
                          (rate, train_loss[0], train_loss[1], train_loss[2]))

            if (tr_batch_i + 1) % eval_step == 0:
                eval_count += 1
                valid_loss = np.zeros(3, np.float32)
                valid_num = np.zeros_like(valid_loss)
                valid_metric = []

                with torch.no_grad():
                    torch.cuda.empty_cache()
                    for val_batch_i, (X, truth_mask) in enumerate(valid_dataloader):
                        model.eval()
                        X = X.cuda().float()
                        truth_mask = truth_mask.cuda()
                        prediction = model(X)  # [N, C, H, W]
                        # loss = criterion_mask(prediction, truth_mask, weight=None)
                        loss = criterion(prediction, truth_mask)
                        writer.add_scalar(
                            'val_loss_' + str(fold), loss.item(),
                            (eval_count - 1) * len(valid_dataloader) * valid_batch_size +
                            val_batch_i * valid_batch_size)

                        # print statistics --------
                        # probability_mask = prediction
                        probability_mask = torch.sigmoid(prediction)
                        mask_positive = torch.where(truth_mask > 0.5,
                                                    torch.ones_like(truth_mask),
                                                    truth_mask)
                        mask_negative = 1 - mask_positive
                        fscore_positive = metric(probability_mask, mask_positive)
                        fscore_negative = metric(1 - probability_mask, mask_negative)

                        # if (epoch == 1) and (val_batch_i == 0):
                        #     predict = probability_mask[0, :, :].detach().squeeze().cpu().numpy()
                        #     predict = predict > 0.5  # Threshold
                        #     predict = (1 - predict) * 255
                        #     cv2.imwrite('result/0_0.tiff', predict.astype(np.uint8))

                        l = np.array([loss.item() * valid_batch_size,
                                      fscore_positive, fscore_negative])
                        n = np.array([valid_batch_size])
                        valid_loss = valid_loss + l
                        valid_num = valid_num + n

                    valid_loss = valid_loss / valid_num
                    log.write('validation loss: %f fscore_positive: %f fscore_negative: %f\n' %
                              (valid_loss[0], valid_loss[1], valid_loss[2]))

        val_metric_epoch = valid_loss[0]
        if (val_metric_epoch <= valid_metric_optimal):
            log.write('Validation metric improved ({:.6f} --> {:.6f}).  Saving model ...'.format(
                valid_metric_optimal, val_metric_epoch))
            valid_metric_optimal = val_metric_epoch
            torch.save(model.state_dict(), checkpoint_filepath)
def get_dataloaders(checkpoint_dir,
                    rsyncing,
                    selective_sampling=False,
                    warmup_trainer=None,
                    batch_size=16,
                    num_workers=os.cpu_count() - 1,
                    data_aug_vec=[0.5, 0.25, 0.5, 0.5],
                    toy=False,
                    notebook=False,
                    cat=False):
    """
    :param checkpoint_dir:
    :param rsyncing:
    :param selective_sampling:
    :param warmup_trainer:
    :param batch_size:
    :param num_workers:
    :param data_aug_vec: probabilities for rnd flip, rnd gamma, rnd translation and rnd scale
    :param toy:
    :param notebook:
    :param cat:
    :return:
    """
    # if torch.cuda.is_available():
    #     mp.set_start_method('spawn')
    multiprocessing = False
    num_workers = 0
    sampler_size = 3000
    if rsyncing:
        print('Rsynced data! (prepare feat)', flush=True)
    else:
        print('Using symbolic links! (prepare feat)', flush=True)
    print('Getting path ready..', flush=True)
    anno_path_train, anno_path_val, png_path = get_paths(rsyncing, toy, notebook)
    # TODO
    # png_path = os.path.join('/Users/lisa/Documents/Uni/ThesisDS/thesis_ds/one_img_dataset', 'png')
    # anno_path_train = os.path.join('/Users/lisa/Documents/Uni/ThesisDS/thesis_ds/one_img_dataset',
    #                                'annotations/mscoco_train_full.json')
    # anno_path_val = os.path.join('/Users/lisa/Documents/Uni/ThesisDS/thesis_ds/one_img_dataset',
    #                              'annotations/mscoco_train_full.json')
    print('Creating Coco Datasets..', flush=True)
    # t.ToTensor()
    if not cat:
        trans_img = torchvision.transforms.Compose([
            t.Normalize(),
            t.BboxCrop(targetsize=224),
            t.RandomFlipImg(prob=data_aug_vec[0]),
            t.RandomGammaImg(prob=data_aug_vec[1], use_normal_distribution=True)
        ])
        trans_bb = torchvision.transforms.Compose([
            t.GetFiveBBs(),
            t.RandomTranslateBB(prob=data_aug_vec[2], pixel_range=10),
            t.RandomScaleBB(prob=data_aug_vec[3], max_percentage=0.1)
        ])
    else:
        trans_img = torchvision.transforms.Compose([
            t.Normalize(),
            t.BboxCropMult(targetsize=224),
            t.RandomFlipImg(prob=data_aug_vec[0]),
            t.RandomGammaImg(prob=data_aug_vec[1], use_normal_distribution=True)
        ])
        trans_bb = torchvision.transforms.Compose([
            t.GetBBsMult(),
            t.RandomTranslateBB(prob=data_aug_vec[2], pixel_range=10, cat=True),
            t.RandomScaleBB(prob=data_aug_vec[3], max_percentage=0.1, cat=True)
        ])
    trainset = u.dataset_coco(png_path,
                              anno_path_train,
                              transform=trans_img,
                              bbox_transform=trans_bb,
                              for_feature=True,
                              cat=cat)
    print('Training set has', len(trainset), 'images', flush=True)
    if not cat:
        valset = u.dataset_coco(
            png_path,
            anno_path_val,
            transform=torchvision.transforms.Compose(
                [t.Normalize(), t.BboxCrop(targetsize=224)]),
            bbox_transform=torchvision.transforms.Compose([t.GetFiveBBs()]),
            for_feature=True,
            cat=cat)
    else:
        valset = u.dataset_coco(
            png_path,
            anno_path_val,
            transform=torchvision.transforms.Compose(
                [t.Normalize(), t.BboxCropMult(targetsize=224)]),
            bbox_transform=torchvision.transforms.Compose([t.GetBBsMult()]),
            for_feature=True,
            cat=cat)
    print('Validation set has', len(valset), 'images', flush=True)

    if selective_sampling:
        if not warmup_trainer:
            print('Cannot calculate weights for selective sampling: no model given. '
                  'Using normal sampling instead', flush=True)
            trainloader = torch.utils.data.DataLoader(
                trainset,
                batch_size=batch_size,
                sampler=RandomSampler(trainset),
                num_workers=num_workers,
                collate_fn=u.mammo_collate,
                pin_memory=multiprocessing)
        else:
            print('Getting weights for sampling..', flush=True)
            trainloader = torch.utils.data.DataLoader(
                trainset,
                batch_size=batch_size,
                sampler=SequentialSampler(trainset),
                num_workers=num_workers,
                collate_fn=u.mammo_collate,
                pin_memory=multiprocessing)
            weights = warmup_trainer.predict_dataset(trainloader)
            pkl.dump(
                weights,
                open(os.path.join(checkpoint_dir, 'weights_selective_train.pkl'),
                     'wb'))
            trainloader = torch.utils.data.DataLoader(
                trainset,
                batch_size=batch_size,
                sampler=WeightedRandomSampler(weights.double(),
                                              sampler_size,
                                              replacement=False),
                num_workers=num_workers,
                collate_fn=u.mammo_collate,
                pin_memory=multiprocessing)
    else:
        trainloader = torch.utils.data.DataLoader(
            trainset,
            batch_size=batch_size,
            sampler=RandomSampler(trainset),
            num_workers=num_workers,
            collate_fn=u.mammo_collate,
            pin_memory=multiprocessing)
    valloader = torch.utils.data.DataLoader(valset,
                                            batch_size=batch_size,
                                            sampler=SequentialSampler(valset),
                                            num_workers=num_workers,
                                            collate_fn=u.mammo_collate,
                                            pin_memory=multiprocessing)
    print('Training loader has', len(trainloader), 'batches', flush=True)
    print('Validation loader has', len(valloader), 'batches', flush=True)
    return trainloader, valloader
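# Hypothetical call of the factory above (paths, flags and batch size are
# illustrative, not taken from the source project):
trainloader, valloader = get_dataloaders(checkpoint_dir='checkpoints/run0',
                                         rsyncing=False,
                                         selective_sampling=False,
                                         batch_size=16,
                                         toy=True)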
type=str, default= '../k_logs/2020-09-24T15-18-33-duration_extractor/2020-09-24_checkpoint_step15000.pth', help="Path to checkpoint of convolutional_cacotron model") parser.add_argument( "--data_folder", type=str, default='../code/datasets/data/kss', help="Where the data live and where to save durations.") parser.add_argument("--durations_filename", default='durations.txt', type=str, help="Name of the final durations file.") parser.add_argument("--batch_size", default=256, type=int, help="Batch size") args = parser.parse_args() # Load pretrained checkpoint and extract alignments to data_folder m = DurationExtractor().load(args.checkpoint) dataset = K_AudioDataset(root=args.data_folder, durations=False) dataloader = DataLoader(dataset, batch_size=args.batch_size, collate_fn=Collate(m.device), shuffle=False, sampler=SequentialSampler(dataset)) save_alignments_as_fertilities(m, dataloader, args.data_folder, args.durations_filename)
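# Note: shuffle=False together with an explicit SequentialSampler is redundant;
# a DataLoader already iterates sequentially when no sampler is given and
# shuffle=False. An equivalent, slightly simpler construction:
dataloader = DataLoader(dataset,
                        batch_size=args.batch_size,
                        collate_fn=Collate(m.device),
                        shuffle=False)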
def get_sequential_trainloader(toy, rsyncing, batch_size=16, num_workers=os.cpu_count() - 1, data_aug_vec=[0.5, 0.25, 0.5, 0.5], notebook=False): """ :param toy: :param rsyncing: :param batch_size: :param num_workers: :param data_aug_vec: :param notebook: :return: """ num_workers = 0 if rsyncing: print('Rsynced data! (prepare feat)', flush=True) else: print('Using symbolic links! (prepare feat)', flush=True) print('Getting path ready..', flush=True) anno_path_train, _, png_path = get_paths(rsyncing, toy, notebook) # TODO # png_path = os.path.join('/Users/lisa/Documents/Uni/ThesisDS/thesis_ds/one_img_dataset', 'png') # anno_path_train = os.path.join('/Users/lisa/Documents/Uni/ThesisDS/thesis_ds/one_img_dataset', # 'annotations/mscoco_train_full.json') trans_img = torchvision.transforms.Compose([ t.Normalize(), t.BboxCrop(targetsize=224), t.RandomFlipImg(prob=data_aug_vec[0]), t.RandomGammaImg(prob=data_aug_vec[1], use_normal_distribution=True) ]) trans_bb = torchvision.transforms.Compose([ t.GetFiveBBs(), t.RandomTranslateBB(prob=data_aug_vec[2], pixel_range=10), t.RandomScaleBB(prob=data_aug_vec[3], max_percentage=0.1) ]) start_time = time.time() print('Creating Coco Dataset..', flush=True) trainset = u.dataset_coco(png_path, anno_path_train, transform=trans_img, bbox_transform=trans_bb, for_feature=True) print('Training set has', len(trainset), 'images', flush=True) trainloader = torch.utils.data.DataLoader( trainset, batch_size=batch_size, sampler=SequentialSampler(trainset), num_workers=num_workers, collate_fn=u.mammo_collate) print('Training loader has', len(trainloader), 'batches', flush=True) total_time = time.time() - start_time print('Creating Datasets took {:.0f} seconds.'.format(total_time), flush=True) return trainloader
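# Hypothetical use of the sequential loader above, e.g. for caching features in
# a stable dataset order (arguments are illustrative):
trainloader = get_sequential_trainloader(toy=True, rsyncing=False, batch_size=8)
for i, batch in enumerate(trainloader):
    pass  # deterministic order, so features can be cached by batch index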
def main(): # define parser and arguments args = get_train_test_args() util.set_seed(args.seed) model = DistilBertForQuestionAnswering.from_pretrained( "distilbert-base-uncased") if args.do_finetune: checkpoint_path = os.path.join(args.save_dir, 'checkpoint') model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path) for name, param in model.named_parameters(): if name.startswith("distilbert.embeddings."): param.requires_grad = False for i in range(args.freeze_layer): if name.startswith("distilbert.transformer.layer.%s." % i): param.requires_grad = False return tokenizer = DistilBertTokenizerFast.from_pretrained( 'distilbert-base-uncased') if args.do_train: if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) args.save_dir = util.get_save_dir(args.save_dir, args.run_name) log = util.get_logger(args.save_dir, 'log_train') log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}') log.info("Preparing Training Data...") args.device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') trainer = Trainer(args, log) train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train') log.info("Preparing Validation Data...") val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val') train_loader = DataLoader(train_dataset, batch_size=args.batch_size, sampler=RandomSampler(train_dataset)) val_loader = DataLoader(val_dataset, batch_size=args.batch_size, sampler=SequentialSampler(val_dataset)) best_scores = trainer.train(model, train_loader, val_loader, val_dict) if args.do_eval: args.device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') split_name = 'test' if 'test' in args.eval_dir else 'validation' log = util.get_logger(args.save_dir, f'log_{split_name}') trainer = Trainer(args, log) checkpoint_path = os.path.join(args.save_dir, 'checkpoint') model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path) model.to(args.device) eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name) eval_loader = DataLoader(eval_dataset, batch_size=args.batch_size, sampler=SequentialSampler(eval_dataset)) eval_preds, eval_scores = trainer.evaluate(model, eval_loader, eval_dict, return_preds=True, split=split_name) results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items()) log.info(f'Eval {results_str}') # Write submission file sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(eval_preds): csv_writer.writerow([uuid, eval_preds[uuid]])
def main():
    # parse command line options
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "experiment", nargs='?', default="",
        help="Experiment name (sub-folder for this particular run). Default: test")
    parser.add_argument("-data-dir", default='data/maze/',
                        help="Directory where maze data is located")
    parser.add_argument(
        "-output-dir", default='data/mapnet',
        help="Output directory where results will be stored (point OverBoard to this location)")
    parser.add_argument("-device", default="cuda:0", help="Device, cpu or cuda")
    parser.add_argument("-data-loaders", default=8, type=int,
                        help="Number of asynchronous worker threads for data loading")
    parser.add_argument("-epochs", default=40, type=int, help="Number of training epochs")
    parser.add_argument("-bs", default=100, type=int, help="Batch size")
    parser.add_argument("-lr", default=1e-3, type=float, help="Learning rate")
    parser.add_argument("--no-bn", dest="bn", action="store_false",
                        help="Disable batch normalization")
    parser.add_argument("-seq-length", default=5, type=int,
                        help="Sequence length for unrolled RNN (longer creates more long-term maps)")
    parser.add_argument("-map-size", default=15, type=int,
                        help="Spatial size of map memory (always square)")
    parser.add_argument("-embedding", default=16, type=int,
                        help="Size of map embedding (vector stored in each map cell)")
    parser.add_argument(
        "--no-improved-padding", dest="improved_padding", action="store_false",
        help="Disable improved padding, which ensures softmax is only over valid locations and not edges")
    parser.add_argument("-lstm-forget-bias", default=1.0, type=float,
                        help="Initial value for LSTM forget gate")
    parser.add_argument(
        "-max-speed", default=0, type=int,
        help="If non-zero, only samples trajectories with this maximum spatial distance between steps")
    parser.add_argument(
        "--spawn", action="store_true",
        help="Use spawn multiprocessing method, to work around problem with some debuggers (e.g. VSCode)")
    parser.set_defaults(bn=True, improved_padding=True)
    args = parser.parse_args()

    if not t.cuda.is_available():
        args.device = 'cpu'

    if args.spawn:  # workaround for vscode debugging
        import torch.multiprocessing as multiprocessing
        multiprocessing.set_start_method('spawn', True)

    if not args.experiment:
        args.experiment = 'test'

    # complete directory with experiment name
    args.output_dir = (args.output_dir + '/' + args.experiment)
    if os.path.isdir(args.output_dir):
        input('Directory already exists. Press Enter to overwrite or Ctrl+C to cancel.')

    # repeatable random sequences
    random.seed(0)
    t.manual_seed(0)

    # initialize dataset
    env_size = (21, 21)
    full_set = Mazes(args.data_dir + '/mazes-10-10-100000.txt',
                     env_size,
                     seq_length=args.seq_length,
                     max_speed=args.max_speed)
    (train_set, val_set) = t.utils.data.random_split(full_set,
                                                     (len(full_set) - 5000, 5000))
    val_loader = DataLoader(val_set,
                            batch_size=10 * args.bs,
                            shuffle=False,
                            num_workers=args.data_loaders)

    # create base CNN and MapNet
    cnn = get_two_layers_cnn(args)
    mapnet = MapNet(cnn=cnn,
                    embedding_size=args.embedding,
                    map_size=args.map_size,
                    lstm_forget_bias=args.lstm_forget_bias,
                    improved_padding=args.improved_padding,
                    orientations=4)

    # use GPU if needed
    device = t.device(args.device)
    mapnet.to(device)

    # create optimizer
    optimizer = t.optim.Adam(mapnet.parameters(), lr=args.lr)

    with Logger(args.output_dir, meta=args) as logger:
        for epoch in range(args.epochs):
            # refresh subset of mazes every epoch
            train_sampler = BatchSampler(RandomSampler(SequentialSampler(range(95000)),
                                                       num_samples=10000,
                                                       replacement=True),
                                         batch_size=args.bs,
                                         drop_last=True)
            train_loader = DataLoader(train_set,
                                      batch_sampler=train_sampler,
                                      num_workers=args.data_loaders)

            # training phase
            mapnet.train()
            for inputs in train_loader:
                # with t.autograd.detect_anomaly():
                optimizer.zero_grad()
                loss = batch_forward(inputs, mapnet, 'train', device, args, logger)
                loss.backward()
                optimizer.step()
            logger.print(prefix='train', line_prefix=f"ep {epoch+1} ")

            # validation phase
            mapnet.eval()
            with t.no_grad():
                for inputs in val_loader:
                    loss = batch_forward(inputs, mapnet, 'val', device, args, logger)
            logger.print(prefix='val', line_prefix=f"ep {epoch+1} ")
            logger.append()

            # save state
            state = {
                'epoch': epoch,
                'state_dict': mapnet.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            try:
                os.replace(args.output_dir + "/state.pt",
                           args.output_dir + "/prev_state.pt")
            except OSError:  # no previous state to rotate
                pass
            t.save(state, args.output_dir + "/state.pt")
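# RandomSampler only requires a data source with __len__, so wrapping
# range(95000) in a SequentialSampler above is equivalent to sampling from the
# range directly (sketch with the default batch size of 100):
train_sampler = BatchSampler(RandomSampler(range(95000),
                                           num_samples=10000,
                                           replacement=True),
                             batch_size=100,
                             drop_last=True)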
def load_data(args):
    """Load data from here and return.

    Note:
        Compose composes several transforms together. If augmentation is chosen,
        an additional set of transforms is composed and applied to the train data
        via the DataTransformer class, which returns the dataset used by the data
        loader. The data loader takes this dataset together with a batch size and
        a sampler. The sampler defines the strategy for drawing samples from the
        dataset: random sampling for training, sequential for validation. A custom
        sampler class can also be written if needed.

    :param args:
        main_dir (string): path to the main directory from the args.
        image_size (int): size of the image to be resized.
        transform_prob (float): probability to apply transformations on the data.
        batch_size (int): batch size to be used in the data loader.
    :return: the train loader and validation loader to be used for training and validating.
    """
    # get data set file path
    data_path = os.path.join(args.main_dir, 'data', 'train-volume.tif')
    labels_path = os.path.join(args.main_dir, 'data', 'train-labels.tif')

    # compose the transforms for the train set
    train_data = Compose([Resize(args.image_size), ToTensor()])

    # choose between augmentations for train data
    if args.augment:
        train_augment = augmentations(args)
        train_transform = DataTransformer(data_path,
                                          labels_path,
                                          image_transform=train_data,
                                          image_augmentation=train_augment)
    else:
        # transforming the train data and returning a 4D tensor
        train_transform = DataTransformer(data_path,
                                          labels_path,
                                          image_transform=train_data,
                                          image_augmentation=None)

    # transform for validation data
    val_data = Compose([Resize(args.image_size), ToTensor()])
    val_transform = DataTransformer(data_path,
                                    labels_path,
                                    image_transform=val_data,
                                    image_augmentation=None)

    # split the train and validation indices
    train_indices, validation_indices = train_test_split(range(len(train_transform)),
                                                         test_size=0.15)

    # wrap each split in a Subset so the samplers draw from the split itself;
    # RandomSampler(train_indices) / SequentialSampler(validation_indices) would
    # sample positions 0..len(split)-1 of the full dataset, not the split
    # (assumes `from torch.utils.data import Subset`)
    train_subset = Subset(train_transform, train_indices)
    validation_subset = Subset(val_transform, validation_indices)

    # call the sampler for the train and validation data
    train_samples = RandomSampler(train_subset)
    validation_samples = SequentialSampler(validation_subset)

    # load train and validation data
    train_loader = DataLoader(train_subset,
                              batch_size=args.batch_size,
                              sampler=train_samples)
    val_loader = DataLoader(validation_subset,
                            batch_size=args.batch_size,
                            sampler=validation_samples)
    return train_loader, val_loader
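# Hypothetical invocation, assuming an args namespace with the fields listed in
# the docstring (values are illustrative):
from argparse import Namespace

train_loader, val_loader = load_data(Namespace(main_dir='.',
                                               image_size=64,
                                               batch_size=4,
                                               augment=False))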
def mnist_classifier_crossentropyloss():
    # paths
    path = dict()
    path['project'] = os.path.dirname(os.path.abspath(__file__))
    path['state'] = os.path.join(path['project'], 'epoch')
    path['dataset'] = os.path.join(path['project'], 'dataset')
    path['graph'] = os.path.join(path['project'], 'graph')
    path['array'] = os.path.join(path['project'], 'array')
    for key, value in path.items():
        if not os.path.exists(path[key]):
            os.mkdir(path[key])

    # parameters
    batch_size = 1000
    number_of_epochs = 20
    learning_rate = 1e-3
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    mean = 0.1307
    std = 0.3081
    loss = nn.CrossEntropyLoss()
    info_per_batch = 6
    validation_ratio = 0.1

    # transform
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=(mean, ), std=(std, ))
    ])

    # dataset
    train_dataset = torchvision.datasets.MNIST(root=path['dataset'],
                                               train=True,
                                               transform=transform,
                                               download=True)
    test_dataset = torchvision.datasets.MNIST(root=path['dataset'],
                                              train=False,
                                              transform=transform,
                                              download=True)

    # validation dataset
    validation_limit = int((1 - validation_ratio) * len(train_dataset))
    index_list = list(range(len(train_dataset)))
    train_indexes, validation_indexes = (index_list[:validation_limit],
                                         index_list[validation_limit:])
    train_sampler = SubsetRandomSampler(train_indexes)
    # SequentialSampler(validation_indexes) would iterate positions
    # 0..len(validation_indexes)-1 of the full training set, overlapping the
    # train split; iterate the held-out subset itself instead
    validation_set = torch.utils.data.Subset(train_dataset, validation_indexes)
    validation_sampler = SequentialSampler(validation_set)

    # dataset loaders
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset=validation_set,
                                                    batch_size=batch_size,
                                                    sampler=validation_sampler)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size)

    # model
    model = MnistClassifierCrossEntropyLoss().to(device)

    # optimizer
    optimizer = optim.SGD(params=model.parameters(), lr=learning_rate)

    epochs = np.arange(start=1, stop=(number_of_epochs + 1), step=1, dtype=int)

    print('Mnist Classifier CrossEntropyLoss')
    train_losses = []
    train_accuracies = []
    validation_losses = []
    validation_accuracies = []
    test_losses = []
    test_accuracies = []
    for epoch in epochs:
        info = 'Epoch {epoch_index}/{number_of_epochs}'
        print(info.format(epoch_index=epoch, number_of_epochs=number_of_epochs))

        # train
        train_loss, train_accuracy = train(model=model,
                                           device=device,
                                           loader=train_loader,
                                           optimizer=optimizer,
                                           loss=loss,
                                           info_per_batch=info_per_batch)
        info = 'Train: Average Loss: {train_loss:.5f}, Accuracy: % {train_accuracy:.2f}'
        print(info.format(train_loss=train_loss,
                          train_accuracy=(100 * train_accuracy)))
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        # validation
        validation_loss, validation_accuracy = test(model=model,
                                                    loader=validation_loader,
                                                    device=device,
                                                    loss=loss,
                                                    info_per_batch=info_per_batch,
                                                    info_name='Validation')
        info = 'Validation: Average Loss: {validation_loss:.5f}, Accuracy: % {validation_accuracy:.2f}'
        print(info.format(validation_loss=validation_loss,
                          validation_accuracy=(100 * validation_accuracy)))
        validation_losses.append(validation_loss)
        validation_accuracies.append(validation_accuracy)

        # test
        test_loss, test_accuracy = test(model=model,
                                        loader=test_loader,
                                        device=device,
                                        loss=loss,
                                        info_per_batch=info_per_batch,
                                        info_name='Test')
        info = 'Test: Average Loss: {test_loss:.5f}, Accuracy: % {test_accuracy:.2f}'
        print(info.format(test_loss=test_loss,
                          test_accuracy=(100 * test_accuracy)))
        test_losses.append(test_loss)
        test_accuracies.append(test_accuracy)

        # epoch state
        state_file_name = 'mnist_classifier_crossentropyloss_epoch_{epoch_index}.pkl'.format(
            epoch_index=epoch)
        save_state(model=model,
                   directory=path['state'],
                   file_name=state_file_name)

    # save the array and line graph for each tracked metric
    metric_series = [
        ('Train', 'Loss', train_losses),
        ('Train', 'Accuracy', train_accuracies),
        ('Validation', 'Loss', validation_losses),
        ('Validation', 'Accuracy', validation_accuracies),
        ('Test', 'Loss', test_losses),
        ('Test', 'Accuracy', test_accuracies),
    ]
    for split, kind, series in metric_series:
        stem = 'mnist_classifier_crossentropyloss_{}_{}'.format(split.lower(), kind.lower())
        save_data(array=series, directory=path['array'], file_name=stem + '.npy')
        draw_line_graph(x=epochs,
                        y=series,
                        x_label='Epoch',
                        y_label=kind,
                        title='Mnist Classifier CrossEntropyLoss {} {}'.format(split, kind),
                        directory=path['graph'],
                        file_name=stem + '.png')

    # loss
    draw_multi_lines_graph(
        lines=[
            dict(label='Train', data=dict(x=epochs, y=train_losses)),
            dict(label='Validation', data=dict(x=epochs, y=validation_losses)),
            dict(label='Test', data=dict(x=epochs, y=test_losses))
        ],
        x_label='Epoch',
        y_label='Loss',
        title='Mnist Classifier CrossEntropyLoss Loss',
        directory=path['graph'],
        file_name='mnist_classifier_crossentropyloss_loss.png')

    # accuracy
    draw_multi_lines_graph(
        lines=[
            dict(label='Train', data=dict(x=epochs, y=train_accuracies)),
            dict(label='Validation', data=dict(x=epochs, y=validation_accuracies)),
            dict(label='Test', data=dict(x=epochs, y=test_accuracies))
        ],
        x_label='Epoch',
        y_label='Accuracy',
        title='Mnist Classifier CrossEntropyLoss Accuracy',
        directory=path['graph'],
        file_name='mnist_classifier_crossentropyloss_accuracy.png')
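# The routine above is self-contained apart from its helpers; a typical entry
# point for running it as a script:
if __name__ == '__main__':
    mnist_classifier_crossentropyloss()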
def main(): args = parser.parse_args() log_out_dir = os.path.join(RESULT_DIR, 'logs', args.out_dir, 'fold%d' % args.fold) if not os.path.exists(log_out_dir): os.makedirs(log_out_dir) log = Logger() log.open(os.path.join(log_out_dir, 'log.train.txt'), mode='a') model_out_dir = os.path.join(RESULT_DIR, 'models', args.out_dir, 'fold%d' % args.fold) log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format( model_out_dir)) if not os.path.exists(model_out_dir): os.makedirs(model_out_dir) # set cuda visible device if not args.all_gpus: os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id cudnn.benchmark = True # cudnn.enabled = False # set random seeds torch.manual_seed(0) torch.cuda.manual_seed_all(0) np.random.seed(0) model_params = {} model_params['architecture'] = args.arch model_params['num_classes'] = 1 model_params['in_channels'] = args.in_channels if 'efficientnet' in args.arch: model_params['image_size'] = args.img_size model_params['encoder'] = args.effnet_encoder model = init_network(model_params) if args.load_state_dict_path is not None: if args.load_state_dict_path == 'use-img-level-densenet-ckpt': model_dir = '../output/models/densenet121_1024_all_data__obvious_neg__gradaccum_20__start_lr_3e6' pretrained_ckpt_path = os.path.join(f'{model_dir}', f'fold{args.fold}', 'final.pth') else: pretrained_ckpt_path = args.load_state_dict_path init_pretrained = torch.load(pretrained_ckpt_path) if args.load_as_is: model.load_state_dict(init_pretrained['state_dict']) else: model.load_state_dict({ key: (val if key not in {'logit.weight', 'logit.bias'} else torch.rand([1, 1024] if key == 'logit.weight' else [1])) for key, val in init_pretrained['state_dict'].items() }) torch.nn.init.xavier_uniform(model.logit.weight) if args.all_gpus: model = DataParallel(model) model.cuda() # define loss function (criterion) try: criterion = eval(args.loss)().cuda() except: raise (RuntimeError("Loss {} not available!".format(args.loss))) start_epoch = 0 best_loss = 1e5 best_epoch = 0 best_val_pr_auc_score = 0 # define scheduler try: scheduler = eval(args.scheduler)( scheduler_lr_multiplier=args.scheduler_lr_multiplier, scheduler_epoch_offset=args.scheduler_epoch_offset) except: raise (RuntimeError("Scheduler {} not available!".format( args.scheduler))) optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0] # Data loading code train_transform = train_multi_augment2 with open('../input/imagelevel_folds_obvious_staining_5.pkl', 'rb') as f: folds = pickle.load(f) fold = args.fold trn_img_paths, val_img_paths = folds[fold] train_df = get_train_df_ohe(clean_from_duplicates=True) basepath_2_ohe_vector = { img: vec for img, vec in zip(train_df['img_base_path'], train_df.iloc[:, 2:].values) } public_hpa_df_17 = get_public_df_ohe(clean_from_duplicates=True) public_basepath_2_ohe_vector = { img_path: vec for img_path, vec in zip(public_hpa_df_17['img_base_path'], public_hpa_df_17.iloc[:, 2:].values) } basepath_2_ohe_vector.update(public_basepath_2_ohe_vector) available_paths = set( np.concatenate((train_df['img_base_path'].values, public_hpa_df_17['img_base_path'].values))) trn_img_paths = [path for path in trn_img_paths if path in available_paths] val_img_paths = [path for path in val_img_paths if path in available_paths] labels_df = pd.read_hdf(args.cell_level_labels_path) # modifying minor class labels cherrypicked_mitotic_spindle = pd.read_csv( '../input/mitotic_cells_selection.csv') cherrypicked_mitotic_spindle_img_cell = set( cherrypicked_mitotic_spindle[['ID', 'cell_i']].apply(tuple, 
axis=1).values)
    cherrypicked_mitotic_spindle_img_cell = {
        (img, cell_i - 1)
        for img, cell_i in cherrypicked_mitotic_spindle_img_cell
    }
    class_names = get_class_names()
    mitotic_spindle_class_i = class_names.index('Mitotic spindle')

    cherrypicked_mitotic_spindle_based_on_nn = pd.read_csv(
        '../input/mitotic_pos_nn_added.csv')
    cherrypicked_mitotic_spindle_img_cell.update(
        set(cherrypicked_mitotic_spindle_based_on_nn[['ID', 'cell_i']].apply(
            tuple, axis=1).values))

    mitotic_bool_idx = labels_df.index.isin(cherrypicked_mitotic_spindle_img_cell)
    negative_img_ids_cell = labels_df.index[np.logical_not(mitotic_bool_idx)].values

    dfs = []
    for fold in range(5):
        dfs.append(pd.read_csv(f'../output/mitotic_pred_fold_{fold}.csv'))
    pred_df = pd.concat(dfs)
    pred_df.set_index(['ID', 'cell_i'], inplace=True)
    positive_img_ids_cell = pred_df.index[pred_df['pred'] < 0.6].values

    if args.ignore_negative:
        raise NotImplementedError

    train_dataset = ProteinMitoticDatasetCellSeparateLoading(
        trn_img_paths,
        positive_img_ids_cell,
        negative_img_ids_cell,
        in_channels=args.in_channels,
        transform=train_transform,
        target_raw_img_size=args.target_raw_img_size)

    train_loader = DataLoader(
        train_dataset,
        sampler=MitoticBalancingSubSampler(train_dataset.img_ids_cell,
                                           train_dataset.id_cell_2_y),
        batch_size=args.batch_size,
        drop_last=False,
        num_workers=args.workers,
        pin_memory=True,
    )

    valid_dataset = ProteinMitoticDatasetCellSeparateLoading(
        val_img_paths,
        positive_img_ids_cell,
        sample(list(negative_img_ids_cell), 10000),
        img_size=args.img_size,
        in_channels=args.in_channels,
        target_raw_img_size=args.target_raw_img_size)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=args.batch_size,
                              drop_last=False,
                              num_workers=args.workers,
                              pin_memory=True)

    log.write('** start training here! **\n')
    log.write('\n')
    log.write('epoch iter rate | train_loss/acc | valid_loss/acc/pr_auc/--- |best_epoch/best_pr_auc| min \n')
    log.write('-----------------------------------------------------------------------------------------------------------------\n')

    start_epoch += 1
    if args.eval_at_start:
        with torch.no_grad():
            valid_loss, valid_acc, val_pr_auc_score = validate(
                valid_loader, model, criterion, -1, log)
        print('\r', end='', flush=True)
        log.write('%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.1f | %6.4f %6.4f | %3.1f min \n' % \
            (-1, -1, -1, -1, -1, valid_loss, valid_acc, val_pr_auc_score, -1, best_epoch, -1, -1))

    for epoch in range(start_epoch, args.epochs + 1):
        end = time.time()

        # set manual seeds per epoch
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)

        # adjust learning rate for each epoch
        lr_list = scheduler.step(model, epoch, args.epochs)
        lr = lr_list[0]

        # train for one epoch on train set
        iter, train_loss, train_acc = train(
            train_loader, model, criterion, optimizer, epoch,
            clipnorm=args.clipnorm,
            lr=lr,
            agg_steps=args.gradient_accumulation_steps)

        with torch.no_grad():
            valid_loss, valid_acc, val_pr_auc_score = validate(
                valid_loader, model, criterion, epoch, log)

        # remember best loss and save checkpoint
        is_best = val_pr_auc_score > best_val_pr_auc_score
        best_loss = min(valid_loss, best_loss)
        best_epoch = epoch if is_best else best_epoch
        best_val_pr_auc_score = val_pr_auc_score if is_best else best_val_pr_auc_score

        print('\r', end='', flush=True)
        log.write('%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.1f | %6.4f %6.4f | %3.1f min \n' % \
            (epoch, iter + 1, lr, train_loss, train_acc, valid_loss, valid_acc,
             val_pr_auc_score, -1, best_epoch, best_val_pr_auc_score,
             (time.time() - end) / 60))

        save_model(model,
                   is_best,
                   model_out_dir,
                   optimizer=optimizer,
                   epoch=epoch,
                   best_epoch=best_epoch,
                   best_map=best_val_pr_auc_score)
def learn_step(self, updater, network, next_obs, next_internals): r = self.exp_cache.read() device = r.rewards[0].device rollout_len = self.exp_cache.rollout_len # estimate value of next state with torch.no_grad(): pred, _, _ = network(next_obs, next_internals) last_values = pred['critic'].squeeze(-1).data # calc nsteps gae = 0. next_values = last_values gae_returns = [] for i in reversed(range(rollout_len)): rewards = r.rewards[i] terminal_mask = 1. - r.terminals[i].float() current_values = r.values[i].squeeze(-1) # generalized advantage estimation delta_t = rewards + self.discount * next_values.data * terminal_mask - current_values gae = gae * self.discount * self.gae_discount * terminal_mask + delta_t gae_returns.append(gae + current_values) next_values = current_values.data gae_returns = torch.stack(list(reversed(gae_returns))).data # Convert to torch tensors of [seq, num_env] old_values = torch.stack(r.values).squeeze(-1) adv_targets_batch = (gae_returns - old_values).data old_log_probs_batch = torch.stack(r.log_probs).data # keep a copy of terminals on the cpu it's faster rollout_terminals = torch.stack(r.terminals).cpu().numpy() # Normalize advantage if self.normalize_advantage: adv_targets_batch = (adv_targets_batch - adv_targets_batch.mean()) / \ (adv_targets_batch.std() + 1e-5) for e in range(self.nb_rollout_epoch): # setup minibatch iterator minibatch_inds = list( BatchSampler(SequentialSampler(range(rollout_len)), self.rollout_minibatch_len, drop_last=False)) # randomize sequences to sample NOTE: in-place operation np.random.shuffle(minibatch_inds) for i in minibatch_inds: # TODO: detach internals, no_grad in compute_action_exp takes care of this starting_internals = { k: ts[i[0]].unbind(0) for k, ts in r.internals.items() } gae_return = gae_returns[i] old_log_probs = old_log_probs_batch[i] sampled_actions = [r.actions[x] for x in i] batch_obs = [r.observations[x] for x in i] # needs to be seq, batch, broadcast dim adv_targets = adv_targets_batch[i].unsqueeze(-1) terminals_batch = rollout_terminals[i] # forward pass cur_log_probs, cur_values, entropies = self.act_batch( network, batch_obs, terminals_batch, sampled_actions, starting_internals, device) value_loss = 0.5 * torch.mean((cur_values - gae_return).pow(2)) # calculate surrogate loss surrogate_ratio = torch.exp(cur_log_probs - old_log_probs) surrogate_loss = surrogate_ratio * adv_targets surrogate_loss_clipped = torch.clamp( surrogate_ratio, min=1 - self.policy_clipping, max=1 + self.policy_clipping) * adv_targets policy_loss = torch.mean( -torch.min(surrogate_loss, surrogate_loss_clipped)) entropy_loss = torch.mean(-self.entropy_weight * entropies) losses = { 'value_loss': value_loss, 'policy_loss': policy_loss, 'entropy_loss': entropy_loss } total_loss = torch.sum( torch.stack(tuple(loss for loss in losses.values()))) updater.step(total_loss) # TODO: metrics: average loss, policy % change, loss over epochs?, value change metrics = {'advantage': torch.mean(adv_targets_batch)} return losses, metrics
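# The minibatching idiom above, in isolation: SequentialSampler keeps every
# minibatch a contiguous slice of the rollout (so recurrent state lines up),
# while shuffling the list of slices randomizes the update order. Self-contained
# illustration with assumed sizes:
import numpy as np
from torch.utils.data import BatchSampler, SequentialSampler

rollout_len, minibatch_len = 128, 32
minibatch_inds = list(
    BatchSampler(SequentialSampler(range(rollout_len)), minibatch_len, drop_last=False))
np.random.shuffle(minibatch_inds)  # in-place; slices stay contiguous, order is random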
def main():
    args = parser.parse_args()

    log_out_dir = os.path.join(RESULT_DIR, 'logs', args.out_dir, 'fold%d' % args.fold)
    if not os.path.exists(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(os.path.join(log_out_dir, 'log.train.txt'), mode='a')

    model_out_dir = os.path.join(RESULT_DIR, 'models', args.out_dir, 'fold%d' % args.fold)
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(model_out_dir))
    if not os.path.exists(model_out_dir):
        os.makedirs(model_out_dir)

    # set cuda visible device
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True
    # cudnn.enabled = False

    # set random seeds
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)

    model_params = {}
    model_params['architecture'] = args.arch
    model_params['num_classes'] = args.num_classes
    model_params['in_channels'] = args.in_channels
    if 'efficientnet' in args.arch:
        model_params['image_size'] = args.img_size
        model_params['encoder'] = args.effnet_encoder
    model = init_network(model_params)

    if args.load_state_dict_path is not None:
        init_pretrained = torch.load(args.load_state_dict_path)
        model.load_state_dict(init_pretrained['state_dict'])

    # state_dict = model.state_dict()
    # torch.save({
    #     'state_dict': state_dict
    # }, '../output/densenet121_bestfitting_converted_classes.h5')
    # sys.exit(0)

    # move network to gpu
    # model = DataParallel(model)
    if args.clip_and_replace_grad_explosures:

        def clip_and_replace_explosures(grad):
            # zero out NaN/Inf entries, then clamp the gradient
            grad[torch.logical_or(torch.isnan(grad),
                                  torch.isinf(grad))] = torch.tensor(0.0).cuda()
            grad = torch.clamp(grad, -0.5, 0.5)
            return grad

        for param in model.parameters():
            if param.requires_grad:
                param.register_hook(clip_and_replace_explosures)

    model.cuda()

    # define loss function (criterion)
    try:
        criterion = eval(args.loss)().cuda()
    except Exception:
        raise (RuntimeError("Loss {} not available!".format(args.loss)))

    start_epoch = 0
    best_loss = 1e5
    best_epoch = 0
    best_map = 0

    # define scheduler
    try:
        scheduler = eval(args.scheduler)(
            scheduler_lr_multiplier=args.scheduler_lr_multiplier,
            scheduler_epoch_offset=args.scheduler_epoch_offset)
    except Exception:
        raise (RuntimeError("Scheduler {} not available!".format(args.scheduler)))
    optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0]

    # optionally resume from a checkpoint
    if args.resume:
        # args.resume = os.path.join(model_out_dir, args.resume)
        if os.path.isfile(args.resume):
            # load checkpoint weights and update model and optimizer
            log.write(">> Loading checkpoint:\n>> '{}'\n".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            best_epoch = checkpoint['best_epoch']
            best_map = checkpoint['best_score']
            model.module.load_state_dict(checkpoint['state_dict'])

            optimizer_fpath = args.resume.replace('.pth', '_optim.pth')
            if os.path.exists(optimizer_fpath):
                log.write(">> Loading checkpoint:\n>> '{}'\n".format(optimizer_fpath))
                optimizer.load_state_dict(torch.load(optimizer_fpath)['optimizer'])
            log.write(">>>> loaded checkpoint:\n>>>> '{}' (epoch {})\n".format(
                args.resume, checkpoint['epoch']))
        else:
            log.write(">> No checkpoint found at '{}'\n".format(args.resume))

    # Data loading code
    train_transform = train_multi_augment2
    with open('../input/imagelevel_folds_obvious_staining_5.pkl', 'rb') as f:
        folds = pickle.load(f)
    fold = args.fold
    trn_img_paths, val_img_paths = folds[fold]

    train_df = get_train_df_ohe(clean_from_duplicates=args.clean_duplicates,
                                clean_mitotic=args.clean_mitotic_samples,
                                clean_aggresome=args.clean_aggresome)
    if args.ignore_negs:
        train_df['Negative'] = 0
    train_paths_set = set(train_df['img_base_path'])
    basepath_2_ohe_vector = {
        img: vec
        for img, vec in zip(train_df['img_base_path'], train_df.iloc[:, 2:].values)
    }

    if not args.without_public_data:
        public_hpa_df_17 = get_public_df_ohe(
            clean_from_duplicates=args.clean_duplicates,
            clean_mitotic=args.clean_mitotic_samples,
            clean_aggresome=args.clean_aggresome)
        if args.ignore_negs:
            public_hpa_df_17['Negative'] = 0
        public_basepath_2_ohe_vector = {
            img_path: vec
            for img_path, vec in zip(public_hpa_df_17['img_base_path'],
                                     public_hpa_df_17.iloc[:, 2:].values)
        }
        basepath_2_ohe_vector.update(public_basepath_2_ohe_vector)
    else:
        trn_img_paths = [path for path in trn_img_paths if path in train_paths_set]

    if not args.without_public_data:
        available_paths = set(
            np.concatenate((train_df['img_base_path'].values,
                            public_hpa_df_17['img_base_path'].values)))
    else:
        available_paths = set(train_df['img_base_path'].values)
    trn_img_paths = [path for path in trn_img_paths if path in available_paths]
    val_img_paths = [path for path in val_img_paths if path in available_paths]

    if args.copy_paste_augment_mitotic_aggresome:
        train_ids = {os.path.basename(x) for x in trn_img_paths}
        id_2_ohe_vector = {
            os.path.basename(path): ohe
            for path, ohe in basepath_2_ohe_vector.items()
        }
        cherrypicked_mitotic_spindle = pd.read_csv(
            '../input/mitotic_cells_selection.csv')
        cherrypicked_mitotic_spindle = cherrypicked_mitotic_spindle[
            cherrypicked_mitotic_spindle['ID'].isin(train_ids)]

        cherrypicked_aggresome = pd.read_csv(
            '../input/aggressome_cells_selection.csv')
        cherrypicked_aggresome = cherrypicked_aggresome[
            cherrypicked_aggresome['ID'].isin(train_ids)]

        cherrypicked_mitotic_spindle['ohe'] = cherrypicked_mitotic_spindle['ID'].map(
            id_2_ohe_vector)
        cherrypicked_aggresome['ohe'] = cherrypicked_aggresome['ID'].map(
            id_2_ohe_vector)

        mitotic_idx = [idx for idx, colname in enumerate(train_df.columns)
                       if colname == 'Mitotic spindle'][0]
        aggresome_idx = [idx for idx, colname in enumerate(train_df.columns)
                         if colname == 'Aggresome'][0]
        mitotic_ohe = np.zeros_like(cherrypicked_aggresome['ohe'].values[0])
        mitotic_ohe[mitotic_idx] = 1
        aggresome_ohe = np.zeros_like(cherrypicked_aggresome['ohe'].values[0])
        aggresome_ohe[aggresome_idx] = 1

        cherrypicked_mitotic_spindle.loc[
            cherrypicked_mitotic_spindle['is_pure'] == 1, 'ohe'] = pd.Series(
                [mitotic_ohe for _ in range(
                    sum(cherrypicked_mitotic_spindle['is_pure'] == 1))],
                index=cherrypicked_mitotic_spindle.index[
                    cherrypicked_mitotic_spindle['is_pure'] == 1])
        cherrypicked_aggresome.loc[
            cherrypicked_aggresome['is_pure'] == 1, 'ohe'] = pd.Series(
                [aggresome_ohe for _ in range(
                    sum(cherrypicked_aggresome['is_pure'] == 1))],
                index=cherrypicked_aggresome.index[
                    cherrypicked_aggresome['is_pure'] == 1])

        class_purity_2_weight = {1: 4, 0: 1}
        cherrypicked_mitotic_spindle['sampling_weight'] = \
            cherrypicked_mitotic_spindle['is_pure'].map(class_purity_2_weight)
        cherrypicked_aggresome['sampling_weight'] = \
            cherrypicked_aggresome['is_pure'].map(class_purity_2_weight)
    else:
        cherrypicked_mitotic_spindle = None
        cherrypicked_aggresome = None

    train_dataset = ProteinDatasetImageLevel(
        trn_img_paths,
        basepath_2_ohe=basepath_2_ohe_vector,
        img_size=args.img_size,
        is_trainset=True,
        return_label=True,
        in_channels=args.in_channels,
        transform=train_transform,
        cherrypicked_mitotic_spindle_df=cherrypicked_mitotic_spindle,
        cherrypicked_aggresome_df=cherrypicked_aggresome)
    class_names = get_class_names()
    if args.balance_classes:
        sampler = BalancingSubSampler(trn_img_paths, basepath_2_ohe_vector,
class_names, required_class_count=1500) else: sampler = RandomSampler(train_dataset) train_loader = DataLoader( train_dataset, sampler=sampler, batch_size=args.batch_size, drop_last=True, num_workers=args.workers, pin_memory=True, ) # val_img_paths = [path for path in val_img_paths if path in train_paths_set] valid_dataset = ProteinDatasetImageLevel( val_img_paths, basepath_2_ohe=basepath_2_ohe_vector, img_size=args.img_size, is_trainset=True, return_label=True, in_channels=args.in_channels, transform=train_transform) valid_loader = DataLoader(valid_dataset, sampler=SequentialSampler(valid_dataset), batch_size=args.batch_size, drop_last=False, num_workers=args.workers, pin_memory=True) focal_loss = FocalLoss().cuda() log.write('** start training here! **\n') log.write('\n') log.write( 'epoch iter rate | train_loss/acc | valid_loss/acc/focal/map |best_epoch/best_map| min \n' ) log.write( '-----------------------------------------------------------------------------------------------------------------\n' ) start_epoch += 1 if args.eval_at_start: with torch.no_grad(): valid_loss, valid_acc, valid_focal_loss, valid_map = validate( valid_loader, model, criterion, -1, focal_loss, log) print('\r', end='', flush=True) log.write( '%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.4f | %6.1f %6.4f | %3.1f min \n' % \ (-1, -1, -1, -1, -1, valid_loss, valid_acc, valid_focal_loss, valid_map, best_epoch, best_map, -1)) for epoch in range(start_epoch, args.epochs + 1): end = time.time() # set manual seeds per epoch np.random.seed(epoch) torch.manual_seed(epoch) torch.cuda.manual_seed_all(epoch) # adjust learning rate for each epoch lr_list = scheduler.step(model, epoch, args.epochs) lr = lr_list[0] # train for one epoch on train set iter, train_loss, train_acc = train( train_loader, model, criterion, optimizer, epoch, clipnorm=args.clipnorm, lr=lr, agg_steps=args.gradient_accumulation_steps) if np.isnan(train_loss): print('@@@@@NAN!') else: print('norm') with torch.no_grad(): valid_loss, valid_acc, valid_focal_loss, valid_map = validate( valid_loader, model, criterion, epoch, focal_loss, log) # remember best loss and save checkpoint is_best = valid_map > best_map best_loss = min(valid_focal_loss, best_loss) best_epoch = epoch if is_best else best_epoch best_map = valid_map if is_best else best_map print('\r', end='', flush=True) log.write( '%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.4f | %6.1f %6.4f | %3.1f min \n' % \ (epoch, iter + 1, lr, train_loss, train_acc, valid_loss, valid_acc, valid_focal_loss, valid_map, best_epoch, best_map, (time.time() - end) / 60)) save_model(model, is_best, model_out_dir, optimizer=optimizer, epoch=epoch, best_epoch=best_epoch, best_map=best_map)
def ablation(args, subset, model, checkpoint, dataset):
    logger.info('Beginning ablation study for subset {} on model checkpoint {}'.format(
        subset, checkpoint))

    # for speeding up computation, load in a mapping dict for good edges;
    # keys are sender nodes, values are all receiver nodes
    good_edge_connections = model.good_edge_connections
    sender_good = {}
    for kg in good_edge_connections.keys():
        id1, id2 = kg
        if id1 not in sender_good:
            sender_good[id1] = [id2]
        else:
            sender_good[id1].append(id2)
    all_edge_connections = model.all_edge_connections

    # create the ablation files in their own directory
    ablation_dir = os.path.join(args.output_dir, 'ablation_{}'.format(subset))
    if not os.path.exists(ablation_dir):
        os.makedirs(ablation_dir)
    ablation_filename = os.path.join(ablation_dir, 'checkpoint_{}.txt'.format(checkpoint))

    # load in tokenizer for the myind_to_word dict to help translate the ids back to words
    my_tokenizer = MyTokenizer.load_tokenizer(args, evaluating=True)
    myind_to_word = my_tokenizer.myind_to_word

    # load in lines one at a time
    train_sampler = SequentialSampler(dataset)
    train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=1)

    with open(ablation_filename, 'w') as af:
        # for each question in the subset
        for batch_ind, batch in enumerate(train_dataloader):
            model.eval()

            # get batch
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'input_mask': batch[1],
                'labels': batch[2],
            }

            _, softmaxed_scores = model(training=False, **inputs)
            inputs = {k: v.squeeze() for k, v in inputs.items()}

            # calculate prediction
            prediction = torch.argmax(softmaxed_scores, dim=1)
            input_ids = inputs['input_ids']
            input_masks = inputs['input_mask']

            # print out the question with the prediction, prediction score, and the correct label
            # get up until the index they are all the same
            change_index_list = [
                input_ids[1, i] == input_ids[2, i] == input_ids[3, i] == input_ids[0, i]
                for i in range(input_ids.shape[1])
            ]
            if False not in change_index_list:
                continue
            change_index = change_index_list.index(False)

            # get first token which is padding to separate the answers
            pad_indices = [
                input_masks[i, :].tolist().index(0)
                for i in range(input_masks.shape[0])
            ]
            if not all([pi > change_index for pi in pad_indices]):
                continue

            question_ids = input_ids[1, :change_index]
            answers_ids = [
                input_ids[i, change_index:pad_ind]
                for i, pad_ind in zip(range(input_ids.shape[0]), pad_indices)
            ]

            question_text = ' '.join([myind_to_word[qi.item()] for qi in question_ids])

            # get all answer features to display
            answers_text = [
                ' '.join([myind_to_word[ai.item()] for ai in answer_id])
                for answer_id in answers_ids
            ]
            answer_choice_text = ['A.', 'B.', 'C.', 'D.']
            correct_label = [' ' if lab == 0 else '*' for lab in inputs['labels']]
            predicted_label = [' '] * 4
            predicted_label[prediction.item()] = '#'
            softmaxed_scores = [
                round(ss, 3) for ss in softmaxed_scores.squeeze().tolist()
            ]

            assert len(correct_label) == len(predicted_label) == len(
                softmaxed_scores) == len(answer_choice_text) == len(answers_text)

            answer_features = list(
                map(tuple,
                    zip(correct_label, predicted_label, softmaxed_scores,
                        answer_choice_text, answers_text)))

            # print out the batch_ind then the question text then newline
            af.write('{}. {}\n'.format(batch_ind + 1, question_text))
            # print out a * for the correct answer, # for the prediction, the
            # rounded softmaxed score, and then the answer text for each of four options
            for (cl, pl, ss, act, at) in answer_features:
                af.write('{} {} {} {}{}\n'.format(cl, pl, ss, act, at))
            af.write('\n')

            ## print out the best connections for the subsetted graph
            # get all unique ids
            unique_ids = torch.unique(input_ids)

            # get all first neighbor connections within good and all
            relevant_best_connections = {}
            for ui in unique_ids:
                ui = ui.item()
                if ui in sender_good and sender_good[ui]:
                    for id2 in sender_good[ui]:
                        if id2 == ui:
                            continue
                        assert (ui, id2) in good_edge_connections
                        assert (ui, id2) in all_edge_connections
                        relevant_best_connections[(ui, id2)] = \
                            good_edge_connections[(ui, id2)] / float(all_edge_connections[(ui, id2)])

            # print out top n=10 in percentage
            best_connections = [(k, v) for k, v in relevant_best_connections.items()]
            best_connections.sort(key=lambda t: t[1], reverse=True)

            num_to_print = min(10, len(best_connections))
            for i in range(num_to_print):
                id1, id2 = best_connections[i][0]
                val = best_connections[i][1]
                word1 = myind_to_word[id1]
                word2 = myind_to_word[id2]
                af.write('The connection between "{}" and "{}" has value {}.\n'.format(
                    word1, word2, round(val, 3)))
            af.write('\n\n')

    return -1
def predict(model_args, predict_args): # Setup logging logging.basicConfig( format="%(asctime)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) # logger = create_logger(name="predict_prod", save_dir=train_args.output_dir) logger.info("Predict parameters %s", predict_args) # Prepare prod-ext task labels = get_labels(predict_args.labels) label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)} num_labels = len(labels) # Load pretrained model and tokenizer config = AutoConfig.from_pretrained( model_args.model_name_or_path, num_labels=num_labels, id2label=label_map, label2id={label: i for i, label in enumerate(labels)}, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast, ) if model_args.use_crf: model = BertCRFForTagging.from_pretrained( model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, tagging_schema="BIO" ) else: model = BertForTagging.from_pretrained( model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir ) device = torch.device( "cuda" if (not predict_args.no_cuda and torch.cuda.is_available()) else "cpu" ) model = model.to(device) # load test dataset test_dataset = ProdDataset( data_file=predict_args.input_file, tokenizer=tokenizer, labels=labels, model_type=config.model_type, max_seq_length=predict_args.max_seq_length, overwrite_cache=predict_args.overwrite_cache, ) sampler = SequentialSampler(test_dataset) data_loader = DataLoader( test_dataset, sampler=sampler, batch_size=predict_args.batch_size, collate_fn=default_data_collator ) logger.info("***** Running Prediction *****") logger.info(" Num examples = {}".format(len(data_loader.dataset))) logger.info(" Batch size = {}".format(predict_args.batch_size)) model.eval() with open(predict_args.input_file, "r") as f: all_preds = [] for inputs in tqdm(data_loader, desc="Predicting"): for k, v in inputs.items(): if isinstance(v, torch.Tensor): inputs[k] = v.to(device) with torch.no_grad(): outputs = model( input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], token_type_ids=inputs['token_type_ids'] ) logits = outputs[0] preds = model.decode(logits, inputs['decoder_mask'].bool()) preds_list = [[label_map[x] for x in seq] for seq in preds] all_preds += preds_list write_predictions( predict_args.input_file, predict_args.output_file, all_preds )
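# SequentialSampler is what lets predict() zip all_preds back onto the lines of
# the input file: batches arrive in dataset order. A self-contained check of
# that order guarantee (illustrative):
from torch.utils.data import DataLoader, SequentialSampler

order = [i for batch in DataLoader(list(range(10)), batch_size=4,
                                   sampler=SequentialSampler(range(10)))
         for i in batch.tolist()]
assert order == list(range(10))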
def main(): # define parser and arguments args = get_train_test_args() util.set_seed(args.seed) #### Change Made By Xuran Wang: Comment out original lines ####### # model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased") # tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased') #### Change End ####### #### Change Made By Xuran Wang: Add custom lines ####### tokenizer = DistilBertTokenizerFast.from_pretrained( 'distilbert-base-uncased') finetuned_model_path = 'save/baseline-01/' #### Change End ####### if args.do_train: if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) args.save_dir = util.get_save_dir(args.save_dir, args.run_name) log = util.get_logger(args.save_dir, 'log_train') log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}') log.info("Preparing Training Data...") #### Change Made By Xuran Wang: Add custom lines ####### checkpoint_path = os.path.join(finetuned_model_path, 'checkpoint') model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path) #### Change End ####### '''###''' # if args.reinit_pooler: # encoder_temp = getattr(model, "distilbert") # Equivalent to model.distilbert # encoder_temp.pooler.dense.weight.data.normal_(mean=0.0, std=encoder_temp.config.initializer_range) # encoder_temp.pooler.dense.bias.data.zero_() # The change of encoder_temp would affect the model # for p in encoder_temp.pooler.parameters(): # p.requires_grad = True if args.reinit_layers > 0: import torch.nn as nn from transformers.models.distilbert.modeling_distilbert import MultiHeadSelfAttention, FFN # model_distilbert = getattr(model, "distilbert") # model.distilbert; change of model_distilbert affects model! # Reinitialization for the last few layers for layer in model.distilbert.transformer.layer[-args. 
reinit_layers:]: for module in layer.modules(): # print(module) model.distilbert._init_weights( module) # It's the line equivalent to below approach # if isinstance(module, nn.modules.linear.Linear): # Original form for nn.Linear # # model.config.initializer_range == model.distilbert.config.initializer_range => True # module.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range) # if module.bias is not None: # module.bias.data.zero_() # elif isinstance(module, nn.modules.normalization.LayerNorm): # module.weight.data.fill_(1.0) # module.bias.data.zero_() # elif isinstance(module, FFN): # for param in [module.lin1, module.lin2]: # param.weight.data.normal_(mean=0.0, std=model.distilbert.config.initializer_range) # if param.bias is not None: # param.bias.data.zero_() # elif isinstance(module, MultiHeadSelfAttention): # for param in [module.q_lin, module.k_lin, module.v_lin, module.out_lin]: # param.data.weight.normal_(mean=0.0, std=model.distilbert.config.initializer_range) # if param.bias is not None: # param.bias.data.zero_() args.device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') model.to(args.device) trainer = Trainer(args, log) #### Change Made By Xuran Wang: Add custom lines, comment out original line ####### # train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train') train_dataset, _ = get_dataset_eda_revised(args, args.train_datasets, args.train_dir, tokenizer, 'train', train_fraction) #### Change End ####### log.info("Preparing Validation Data...") val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val') train_loader = DataLoader(train_dataset, batch_size=args.batch_size, sampler=RandomSampler(train_dataset)) val_loader = DataLoader(val_dataset, batch_size=args.batch_size, sampler=SequentialSampler(val_dataset)) best_scores = trainer.train(model, train_loader, val_loader, val_dict) if args.do_eval: args.device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') split_name = 'test' if 'test' in args.eval_dir else 'validation' log = util.get_logger(args.save_dir, f'log_{split_name}') trainer = Trainer(args, log) checkpoint_path = os.path.join(args.save_dir, 'checkpoint') model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path) model.to(args.device) eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name) eval_loader = DataLoader(eval_dataset, batch_size=args.batch_size, sampler=SequentialSampler(eval_dataset)) eval_preds, eval_scores = trainer.evaluate(model, eval_loader, eval_dict, return_preds=True, split=split_name) results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items()) log.info(f'Eval {results_str}') # Write submission file sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(eval_preds): csv_writer.writerow([uuid, eval_preds[uuid]])