def main():
    seed_everything(0)

    # Load Data #####################################
    # From csv
    since = time.time()
    print('Data Loading...')

    # From Original #################
    # data_dir = '../data/input'
    # df = load_data(nrows=None, merge=True, data_dir=data_dir)

    # From Pickle ###################
    # with open('../data/input/data.pkl', 'rb') as f:
    #     df = pickle.load(f)
    # Preprocessing
    # df = prep_dict[args.preprocessing](df)
    # df = reduce_mem_usage(df)

    # From Feather #################
    target_features = [
        'Snap', 'SellPrice', 'Lag', 'Lag_RollMean_28', 'TimeFeatures',
        'Lag_SellPrice', 'Lag_SellPrice_diff', 'Ids', 'Event'
    ]
    target_path = [f'../features/{name}.ftr' for name in target_features]
    df = load_from_feather(target_path)

    # Model Training #####################################
    lgbm = LGBMModel_group(df, **config)
    res = lgbm.train()

    # WRMSSE ##################################################
    print('Reading files...')
    calendar = pd.read_csv('../data/input/calendar.csv')
    sell_prices = pd.read_csv('../data/input/sell_prices.csv')
    sales_train_validation = pd.read_csv('../data/input/sales_train_validation.csv')
    train_fold_df = sales_train_validation.iloc[:, :-28]
    valid_fold_df = sales_train_validation.iloc[:, -28:]
    del sales_train_validation

    wrmsse = lgbm.get_wrmsse(train_fold_df, valid_fold_df, calendar, sell_prices)
    print(f'WRMSSE: {wrmsse:.3f}')
    del calendar, sell_prices, train_fold_df, valid_fold_df
    gc.collect()

    # Evaluate #####################################
    sub_name = f"{config['exp_name']}_wrmsse_{wrmsse:.3f}.csv"
    res.to_csv(f'../data/output/{sub_name}', index=False)
    del df
    gc.collect()

    # Feature Importance #####################################
    lgbm.visualize_feature_importance()

    # Time Counting ##################################################
    elapsed_time = time.time() - since
    s = datetime.timedelta(seconds=elapsed_time)
    print(f'Total time: {s}')
def main():
    parser = get_argparse()
    parser.add_argument("--fine_tuning_model",
                        type=str,
                        required=True,
                        help="fine-tuned model path")
    args = parser.parse_args()
    print(json.dumps(vars(args),
                     sort_keys=True,
                     indent=4,
                     separators=(', ', ': '),
                     ensure_ascii=False))
    init_logger(log_file="./log/{}.log".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    seed_everything(args.seed)

    # save path
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # device
    args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(args.model_name_or_path)

    # Dataset & Dataloader
    test_dataset = MrcDataset(args,
                              json_path="./data/test1.json",
                              tokenizer=tokenizer)
    test_iter = DataLoader(test_dataset,
                           shuffle=False,
                           batch_size=args.per_gpu_eval_batch_size,
                           collate_fn=collate_fn,
                           num_workers=24)
    logger.info("The number of test_dataset examples is {}".format(
        len(test_dataset.examples)))
    logger.info("The number of test_dataset features is {}".format(
        len(test_dataset)))

    # model
    model = MRC_model(args.model_name_or_path)
    model.to(args.device)
    model.load_state_dict(torch.load(args.fine_tuning_model))

    # predict test
    model.eval()
    evaluate(args, test_iter, model, prefix="test")
def train(self, data, seed):
    train_data = data['trainloader']
    valid_data = data['validloader']
    tgt_vocab = data['tgt_vocab']
    label2id = data['label2id']
    seed_everything(seed)
    if self.config.classifier == 'BertSGM' or self.config.classifier == 'SGM':
        with codecs.open(self.config.sgm.label_dict_file, 'r', 'utf-8') as f:
            label_dict = json.load(f)

    # ***************************************************************
    best = 0
    for epoch in range(self.start_epoch, self.start_epoch + self.epochs):
        print(f"Epoch {epoch}/{self.epochs}")
        if self.config.classifier == 'BertCNN' or self.config.classifier == 'BertRCNN':
            train_log = self.train_epoch(train_data)
            valid_log = self.valid_epoch(valid_data)
            logs = dict(train_log, **valid_log)
            show_info = f'\nEpoch: {epoch} - ' + "-".join(
                [f' {key}: {value:.4f} ' for key, value in logs.items()])
            print(show_info)
        if self.config.classifier == 'BertSGM' or self.config.classifier == 'SGM':
            self.train_bertsgm_epoch(train_data, epoch)
            logs = self.valid_bertsgm_epoch(valid_data, tgt_vocab, label_dict)
            print(logs)
        if self.config.classifier == 'BertSeq2Set':
            self.train_seq2set_epoch(train_data, epoch)
            logs = self.valid_seq2set_epoch(valid_data, tgt_vocab, label2id)
            print(logs)

        # Save the model with the best F1 score
        if logs['valid_f1'] > best:
            best = logs['valid_f1']
            torch.save(self.model,
                       self.config.model_save_path + str(self.config.classifier) + '_bestmodel.pth')
            print('Epoch:%d best f1:%s' % (epoch, str(best)))

        # early_stopping
        if self.early_stopping:
            self.early_stopping.epoch_step(epoch=epoch,
                                           current=logs[self.early_stopping.monitor])
            if self.early_stopping.stop_training:
                break
def train(self):
    self.logger.info(" rate     step  epoch | loss   val_loss | time")
    self.logger.info("-" * 68)
    min_loss = np.Inf
    start_time = timer()
    for epoch in range(self.start_epoch, self.num_epochs):
        seed_everything(epoch * 1000 + epoch)
        train_log = self._train_epoch(start_time)
        valid_log = self._valid_epoch()
        logs = dict(train_log, **valid_log)

        rate = self.optimizer.get_lr()
        now_epoch = (self.global_step * self.batch_size /
                     len(self.train_loader.dataset))
        asterisk = " "
        if logs["val_loss"] < min_loss:
            min_loss = logs["val_loss"]
            asterisk = "*"

        self.logger.info(f"{rate[0]:.7f} "
                         f"{self.global_step / 1000:5.2f} "
                         f"{now_epoch:6.2f} | "
                         f'{logs["loss"]:.4f} '
                         f'{logs["val_loss"]:.4f} {asterisk}| '
                         f'{time_to_str((timer() - start_time), "sec")} '
                         f"{torch.cuda.memory_allocated() // 1024 ** 2}")

        valid_probs = logs["val_probs"]
        correct = evaluate(valid_probs)
        self.logger.info(f"min: {np.min(valid_probs):.4f} "
                         f"max: {np.max(valid_probs):.4f} "
                         f"avg: {np.average(valid_probs):.4f} "
                         f"acc: {correct}, {float(correct / len(valid_probs)):.4f}")

        if self.model_checkpoint:
            state = self._save_info(epoch, val_loss=logs["val_loss"])
            self.model_checkpoint.step(state=state)
def run(args, log):
    df = pd.read_csv(args.df_path)
    df_train = df[df['Fold'] != args.fold]
    df_valid = df[df['Fold'] == args.fold]
    dfs = {}
    dfs['train'] = df_train
    dfs['val'] = df_valid

    model = get_model(args).cuda()

    if args.mode == 'segmentation':
        # train the encoder/decoder, freeze the classification head
        for param in model.model.encoder.parameters():
            param.requires_grad = True
        for param in model.model.decoder.parameters():
            param.requires_grad = True
        for param in model.model.classification_head.parameters():
            param.requires_grad = False
    elif args.mode == 'classification':
        # freeze the encoder/decoder, train only the classification head
        for param in model.model.encoder.parameters():
            param.requires_grad = False
        for param in model.model.decoder.parameters():
            param.requires_grad = False
        for param in model.model.classification_head.parameters():
            param.requires_grad = True

    criterion = get_loss(args)
    optimizer = get_optimizer(args, model)

    if args.initial_ckpt is not None:
        last_epoch, step = checkpoint.load_checkpoint(args, model,
                                                      checkpoint=args.initial_ckpt)
        log.write(f'Resume training from {args.initial_ckpt} @ {last_epoch}\n')
    else:
        last_epoch, step = -1, -1

    dataloaders = {mode: get_dataloader(args.data_dir, dfs[mode], mode,
                                        args.pretrain, args.batch_size)
                   for mode in ['train', 'val']}
    seed_everything(seed=123)
    clr = CLR(optimizer, len(dataloaders['train']))
    train(args, model, dataloaders['train'], criterion, optimizer, clr)
def train(self, train_data, valid_data, seed):
    seed_everything(seed)
    print("model summary info: ")
    for step, (input_ids, input_mask, segment_ids, label_ids) in enumerate(train_data):
        input_ids = input_ids.to(self.device)
        input_mask = input_mask.to(self.device)
        segment_ids = segment_ids.to(self.device)
        summary(self.model, *(input_ids, input_mask, segment_ids), show_input=True)
        break

    # ***************************************************************
    for epoch in range(self.start_epoch, self.start_epoch + self.epochs):
        self.logger.info(f"Epoch {epoch}/{self.epochs}")
        train_log = self.train_epoch(train_data)
        valid_log = self.valid_epoch(valid_data)
        logs = dict(train_log, **valid_log)
        show_info = f'\nEpoch: {epoch} - ' + "-".join(
            [f' {key}: {value:.4f} ' for key, value in logs.items()])
        self.logger.info(show_info)

        # save training logs
        if self.training_monitor:
            self.training_monitor.epoch_step(logs)

        # save model
        if self.model_checkpoint:
            state = self.save_info(epoch, best=logs['valid_loss'])
            self.model_checkpoint.bert_epoch_step(
                current=logs[self.model_checkpoint.monitor], state=state)

        # early_stopping
        if self.early_stopping:
            self.early_stopping.epoch_step(epoch=epoch,
                                           current=logs[self.early_stopping.monitor])
            if self.early_stopping.stop_training:
                break
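# Several of these loops share an EarlyStopping object exposing epoch_step()
# and a stop_training flag. A minimal sketch consistent with that interface
# (an assumed implementation, not the repo's actual class):
class EarlyStopping:
    def __init__(self, monitor='valid_loss', mode='min', patience=5):
        self.monitor = monitor
        self.mode = mode
        self.patience = patience
        self.best = float('inf') if mode == 'min' else float('-inf')
        self.bad_epochs = 0
        self.stop_training = False

    def epoch_step(self, epoch, current):
        # Reset the counter on improvement; otherwise count a bad epoch and
        # raise the stop flag once patience is exhausted.
        improved = (current < self.best) if self.mode == 'min' else (current > self.best)
        if improved:
            self.best = current
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
            if self.bad_epochs >= self.patience:
                print(f'Early stopping triggered at epoch {epoch}')
                self.stop_training = True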
def submit(args, log):
    df = pd.read_csv(args.df_path)
    df['Image'] = df.Image_Label.map(lambda v: v[:v.find('_')])
    print(df.head())

    model = get_model(args).cuda()
    last_epoch, step = checkpoint.load_checkpoint(args, model,
                                                  checkpoint=args.initial_ckpt)
    log.write(f'Loaded checkpoint from {args.initial_ckpt} @ {last_epoch}\n')

    dataloader = get_dataloader(args.data_dir, df, 'test', args.pretrain,
                                args.batch_size)
    seed_everything()

    # inference
    test_ids, mask_predictions = inference_submit(model, dataloader,
                                                  args.tta_augment)
    assert len(test_ids) == mask_predictions.shape[0]

    ids = []
    rles = []
    for i, image_id in tqdm.tqdm(enumerate(test_ids), total=len(test_ids)):
        predictions = mask_predictions[i]
        for cls_idx in range(4):
            prediction = predictions[cls_idx, :, :]
            H, W = prediction.shape
            assert H == 350 and W == 525
            rle_encoded = mask2rle(prediction)
            assert np.all(rle2mask(H, W, rle_encoded) == prediction)
            ids.append(f'{image_id}_{LABEL_LIST[cls_idx]}')
            rles.append(rle_encoded)

    df_submission = pd.DataFrame({'Image_Label': ids, 'EncodedPixels': rles})
    df_submission.to_csv(args.sub_name, index=False)
    print(df_submission.head())
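# The mask2rle / rle2mask helpers used above are not shown in this snippet.
# A minimal sketch of the usual Kaggle-style run-length encoding they imply
# (column-major pixel order, 1-indexed run starts) -- an assumed
# implementation, not necessarily the repo's own:
import numpy as np

def mask2rle(mask):
    """Encode a binary mask as 'start length start length ...' (column-major)."""
    pixels = mask.T.flatten()                    # column-major (Fortran) order
    pixels = np.concatenate([[0], pixels, [0]])  # pad so edge runs terminate
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]                      # turn run ends into run lengths
    return ' '.join(str(x) for x in runs)

def rle2mask(height, width, rle):
    """Decode an RLE string back into a (height, width) binary mask."""
    mask = np.zeros(height * width, dtype=np.uint8)
    tokens = [int(x) for x in rle.split()]
    for start, length in zip(tokens[0::2], tokens[1::2]):
        mask[start - 1:start - 1 + length] = 1
    return mask.reshape(width, height).T         # undo column-major flattening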
def train(args, logger=None):
    from utils.utils import create_loaders, seed_everything, CIFAR_NORMALIZATION
    import utils.config as cf
    import os
    import torch.backends.cudnn as cudnn
    import time

    seed_everything(args.seed)

    normalize = None
    if args.normalize == "meanstd":
        from torchvision import transforms
        normalize = transforms.Normalize(cf.mean["cifar10"], cf.std["cifar10"])
    elif args.normalize == "default":
        normalize = CIFAR_NORMALIZATION

    # Hyper Parameter settings
    use_cuda = torch.cuda.is_available()
    best_acc = 0
    start_epoch, num_epochs = cf.start_epoch, cf.num_epochs

    # Data Upload
    trainloader, testloader = create_loaders(args,
                                             augment=not args.no_augment,
                                             normalize=normalize)

    # Model
    print('\n[Phase 2] : Model setup')
    net = Wide_ResNet(**vars(args))
    file_name = os.path.join(args.output,
                             "%s/%s/model_%i.pt" % (args.dataset, "wide_resnet", args.seed))
    net.apply(conv_init)

    if use_cuda:
        net.cuda()
        net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
        cudnn.benchmark = True

    criterion = nn.CrossEntropyLoss()

    if args.optimizer == "adam":
        from torch.optim import Adam
        optimizer = Adam(net.parameters(), lr=args.lr)
    elif args.optimizer == "sgd":
        from torch.optim import SGD
        optimizer = None  # SGD is re-created each epoch inside train() with a scheduled LR
    elif args.optimizer == "sls":
        from utils.sls import Sls
        n_batches_per_epoch = len(trainloader)
        print(n_batches_per_epoch)
        optimizer = Sls(net.parameters(), n_batches_per_epoch=n_batches_per_epoch)
    else:
        raise ValueError("Only supports adam, sgd, or sls for optimizer.")

    # Training
    def train(epoch, optimizer=None):
        net.train()
        net.training = True
        train_loss = 0
        correct = 0
        total = 0
        if args.optimizer == "sgd":
            optimizer = SGD(net.parameters(),
                            lr=cf.learning_rate(args.lr, epoch),
                            momentum=0.9,
                            weight_decay=5e-4)

        print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, cf.learning_rate(args.lr, epoch)))
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            if use_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()  # GPU settings
            optimizer.zero_grad()
            inputs, targets = Variable(inputs), Variable(targets)
            outputs = net(inputs)               # Forward Propagation
            loss = criterion(outputs, targets)  # Loss
            if args.optimizer == "sls":
                def closure():
                    output = net(inputs)
                    loss = criterion(output, targets)
                    return loss
                optimizer.step(closure)
            else:
                loss.backward()
                optimizer.step()

            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += predicted.eq(targets.data).cpu().sum()

            sys.stdout.write('\r')
            sys.stdout.write('| Epoch [%3d/%3d] Iter[%3d/%3d]\t\tLoss: %.4f Acc@1: %.3f%%'
                             % (epoch, num_epochs, batch_idx + 1, len(trainloader),
                                loss.item(), 100. * correct / total))
            sys.stdout.flush()
        if logger is not None:
            logger.write(dict(train_accuracy=100. * correct / total,
                              loss=loss.item()), epoch)

    def test(epoch, best_acc=0):
        net.eval()
        net.training = False
        test_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(testloader):
                if use_cuda:
                    inputs, targets = inputs.cuda(), targets.cuda()
                inputs, targets = Variable(inputs), Variable(targets)
                outputs = net(inputs)
                loss = criterion(outputs, targets)

                test_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += targets.size(0)
                correct += predicted.eq(targets.data).cpu().sum()

        # Save checkpoint when best model
        acc = 100. * correct / total
        if logger is None:
            print("\n| Validation Epoch #%d\t\t\tLoss: %.4f Acc@1: %.2f%%"
                  % (epoch, loss.item(), acc))
        else:
            logger.write(dict(test_loss=loss.item(), test_accuracy=acc), epoch)
        if acc > best_acc:
            print('| Saving Best model...\t\t\tTop1 = %.2f%%' % (acc))
            state = {
                'net': net.module if use_cuda else net,
                'acc': acc,
                'epoch': epoch,
            }
            dirname = os.path.dirname(file_name)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            torch.save(net.state_dict(), file_name)
            best_acc = acc
        return best_acc

    print('\n[Phase 3] : Training model')
    print('| Training Epochs = ' + str(num_epochs))
    print('| Initial Learning Rate = ' + str(args.lr))

    elapsed_time = 0
    for epoch in range(start_epoch, start_epoch + num_epochs):
        start_time = time.time()
        train(epoch, optimizer)
        best_acc = test(epoch, best_acc)
        epoch_time = time.time() - start_time
        elapsed_time += epoch_time
        print('| Elapsed time : %d:%02d:%02d' % (cf.get_hms(elapsed_time)))

    print('\n[Phase 4] : Testing model')
    print('* Test results : Acc@1 = %.2f%%' % (best_acc))
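# Note on the closure passed to optimizer.step() above: line-search optimizers
# like Sls re-evaluate the loss at trial points, so they accept a closure
# rather than a precomputed gradient. PyTorch's built-in torch.optim.LBFGS
# uses the same API; a minimal, self-contained sketch of the pattern
# (illustrative only, not this repo's code):
import torch
import torch.nn as nn

model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = torch.optim.LBFGS(model.parameters(), lr=0.1)
x, y = torch.randn(32, 10), torch.randn(32, 1)

def closure():
    # LBFGS may call this several times per step, so it must redo the
    # full zero_grad / forward / backward pass on each call.
    optimizer.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    return loss

optimizer.step(closure)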
import glob, pickle, time, datetime, argparse, gc

import pandas as pd
from sklearn.model_selection import KFold, TimeSeriesSplit

from utils.utils import load_data, load_from_feather, reduce_mem_usage, seed_everything
from model.Model import LGBMModel_group

seed_everything(0)

# Parser ################################################################
parser = argparse.ArgumentParser()
parser.add_argument('-exp', '--expname')
parser.add_argument('-obj', '--objective', default='regression',
                    choices=['regression', 'poisson', 'tweedie'])
parser.add_argument('-lr', '--learningrate', type=float, default=0.01)
parser.add_argument('-subs', '--subsample', type=float, default=1.0)
parser.add_argument('-featfrac', '--featurefraction', type=float, default=1.0)
parser.add_argument('-cv', '--crossval', default='kfold',
                    choices=['kfold', 'time', 'none'])
parser.add_argument('-nsplit', '--nsplit', type=int, default=4)
parser.add_argument('-num', '--num_boost_round', type=int, default=1000)
parser.add_argument('-early', '--early_stopping_rounds', type=int, default=10)
parser.add_argument('-drate', '--data_rate', type=float, default=0.1)
parser.add_argument('-grp', '--group', default='store',
                    choices=['store', 'cat', 'state'])
parser.add_argument('-prep', '--preprocess', action='store_true')
parser.add_argument('-post', '--postprocess', action='store_true')
args = parser.parse_args()

# Parameter #############################################################
params = {
    'boosting_type': 'gbdt',
    'objective': args.objective,
def main(cfg: DictConfig):
    print('Nishika Second-hand Apartment Price Training')
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    data_dir = './input'
    seed_everything(cfg.data.seed)

    experiment = Experiment(api_key=cfg.exp.api_key,
                            project_name=cfg.exp.project_name,
                            auto_output_logging='simple',
                            auto_metric_logging=False)
    experiment.log_parameters(dict(cfg.data))

    # Config ####################################################################################
    del_tar_col = ['取引時点']
    id_col = 'ID'
    tar_col = '取引価格(総額)_log'
    g_col = 'year'
    criterion = MAE
    cv = KFold(n_splits=cfg.data.n_splits, shuffle=True, random_state=cfg.data.seed)
    # cv = GroupKFold(n_splits=5)

    # Load Data ####################################################################################
    if cfg.exp.use_pickle:
        # Load from pickle
        df = unpickle('./input/data.pkl')
    else:
        df = load_data(data_dir, sampling=cfg.data.sampling, seed=cfg.data.seed,
                       id_col=id_col, target_col=tar_col)
        # Preprocessing
        print('Preprocessing')
        df = preprocessing(df, cfg)
        # Save in pickle format
        to_pickle('./input/data.pkl', df)
        try:
            experiment.log_asset(file_data='./input/data.pkl', file_name='data.pkl')
        except Exception:
            pass

    features = [c for c in df.columns if c not in del_tar_col]

    # Model ####################################################################################
    model = None
    if cfg.exp.model == 'lgb':
        model = LGBMModel(dict(cfg.lgb))
    elif cfg.exp.model == 'cat':
        model = CatBoostModel(dict(cfg.cat))

    # Train & Predict ##############################################################################
    trainer = Trainer(model, id_col, tar_col, g_col, features, cv, criterion, experiment)
    trainer.fit(df)
    trainer.predict(df)
    trainer.get_feature_importance()
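# main(cfg: DictConfig) above is presumably wired up with Hydra's entry-point
# decorator, which this excerpt does not show. A minimal sketch of such a
# wiring (the config path and name here are assumptions):
import hydra
from omegaconf import DictConfig


@hydra.main(config_path='config', config_name='config')
def main(cfg: DictConfig) -> None:
    # Hydra changes the working directory per run; the script above undoes
    # that with hydra.utils.get_original_cwd() + os.chdir().
    ...


if __name__ == '__main__':
    main()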
ARCH = "bert"
SEED = 2323
FOLD_ID = 2

TEST_PATH = "/input/input.txt"
OUTPUT_PATH = "/output/output.txt"
# TEST_PATH = "datasets/input.txt"
# TEST_PATH = "datasets/SCM_5k.json"
TEST_PATH = f"datasets/bigfolds/fold{FOLD_ID}_valid.txt"
# OUTPUT_PATH = "output/output.txt"
LOG_DIR = "output/logs"

MAX_SEQ_LENGTH = 445
BATCH_SIZE = 16

seed_everything(SEED)
logger = init_logger(log_name=ARCH, log_dir=LOG_DIR)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print("---------- Bert Eval ... ----------")
start_time = timer()

# bert_config.json, pytorch_model.bin and vocab.txt live in the ckpts dir
BERT_MODEL_PATH = "output/ckpts6920"
BERT_VOCAB_PATH = "output/ckpts6920/vocab.txt"

test_dataset = CAILDataset(data_path=TEST_PATH,
                           max_seq_len=MAX_SEQ_LENGTH,
                           vocab_path=BERT_VOCAB_PATH,
                           seed=SEED,
from utils.utils import seed_everything
seed_everything()

import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
import torchvision
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from albumentations.pytorch import ToTensor
from catalyst.data.sampler import BalanceClassSampler
from torch.utils.data.sampler import SequentialSampler
import torch.nn.functional as F

from data_loader.alaska import Alaska
from data_loader.generator import Alaska2Dataset
from model.network import Net, AttentionNet
from utils.metrics import alaska_weighted_auc
from utils.data_augmentation import get_transforms
from data_loader.dataset_retriever import DatasetRetriever
from trainer.fitter import Fitter
import json

config_json = "./config/baseline.json"
with open(config_json) as f:
    config = json.load(f)
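# seed_everything is imported from utils.utils throughout these scripts, but
# its body is not shown. A minimal sketch of what such a helper usually does
# (an assumed implementation, not the repo's actual code):
import os
import random

import numpy as np
import torch


def seed_everything(seed: int = 42) -> None:
    """Seed Python, NumPy and PyTorch RNGs for reproducible runs."""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN kernels trade speed for reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False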
def main():
    seed_everything(3)
    print("done")
def main():
    logger = logger_factory(log_name=config['model']['arch'],
                            log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    n_gpu = torch.cuda.device_count()
    logger.info(f"Cuda device count:{n_gpu}")
    device = (f"cuda:{config['train']['n_gpu'][0]}"
              if len(config['train']['n_gpu']) else "cpu")
    seed_everything(seed=config['train']['seed'], device=device)
    logger.info('starting to load data from disk')
    torch.cuda.empty_cache()

    model_state_dict = None
    processor = MultiLabelTextProcessor(config['data']['data_path'])
    label_list, num_labels = load_labels(processor)
    logger.info(f"Labels loaded. Count: {num_labels}")
    print(label_list)

    tokenizer = BertTokenizer.from_pretrained(
        config['bert']['path'], do_lower_case=config['train']['do_lower_case'])

    train_examples = None
    num_train_steps = None
    if config['train']['do_train']:
        train_examples = processor.get_train_examples(
            config['data']['data_path'],
            logger=logger,
            size=config['train']['train_size'])
        num_train_steps = int(
            len(train_examples) / config['train']['train_batch_size'] /
            config['train']['gradient_accumulation_steps'] *
            config['train']['num_train_epochs'])
        logger.info(f"Training examples:{len(train_examples)}")
        logger.info(f"Training steps:{num_train_steps}")

    model = get_model(model_state_dict, num_labels)
    logger.info(f"fp16: {config['train']['fp16']}")
    if config['train']['fp16']:
        model.half()
    model.to(device)
    logger.info(f"Model loaded: {config['bert']['path']}")

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    t_total = num_train_steps
    if config['train']['fp16']:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=config['train']['learning_rate'],
                              bias_correction=False,
                              max_grad_norm=1.0)
        if config['train']['loss_scale'] == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(
                optimizer, static_loss_scale=config['train']['loss_scale'])
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=config['train']['learning_rate'],
                             warmup=config['train']['warmup_proportion'],
                             t_total=t_total)

    scheduler = CyclicLR(optimizer, base_lr=2e-5, max_lr=5e-5,
                         step_size=2500, last_batch_iteration=0)

    eval_examples = processor.get_dev_examples(
        config['data']['data_path'],
        filename='training.csv',
        size=config['train']['val_size'])
    logger.info(f"Evaluation data loaded. Len: {len(eval_examples)}")

    train_features = convert_examples_to_features(
        train_examples, label_list, config['train']['max_seq_length'],
        tokenizer, logger)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", config['train']['train_batch_size'])
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in train_features],
                                 dtype=torch.float)
    train_data = TensorDataset(all_input_ids, all_input_mask,
                               all_segment_ids, all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=config['train']['train_batch_size'])

    # Freeze BERT layers for 1 epoch
    # model.module.freeze_bert_encoder()
    # fit(1)
    model.unfreeze_bert_encoder()
    fit(model, device, n_gpu, optimizer, train_dataloader, logger, t_total,
        eval_examples, label_list, num_labels, tokenizer)

    # Save a trained model
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(config['bert']['cache'],
                                     "finetuned_pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)
    logger.info(f"Model saved! Location: {output_model_file}")

    if False:  # disabled: reload the fine-tuned weights before evaluating
        # Load a trained model that you have fine-tuned
        model_state_dict = torch.load(output_model_file)
        model = BertForMultiLabelSequenceClassification.from_pretrained(
            config['bert']['path'],
            num_labels=num_labels,
            state_dict=model_state_dict)
        model.to(device)

    eval(model, device, logger, eval_examples, label_list, num_labels,
         config['train']['max_seq_length'], tokenizer)
    result = predict(model, device, config['data']['data_path'], logger,
                     label_list, tokenizer)
    print(result.shape)
    result.to_csv(config['data']['data_path'] / 'prediction.csv', index=None)
def main():
    parser = ArgumentParser()
    parser.add_argument("--pretrain", default="bert", type=str)
    parser.add_argument("--do_data", action="store_true")
    parser.add_argument("--do_train", action="store_true")
    parser.add_argument("--do_test", action="store_true")
    parser.add_argument("--save_best", action="store_true")
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--data_name", default="law", type=str)
    parser.add_argument("--train_data_num", default=0, type=int)
    parser.add_argument("--test_data_num", default=0, type=int)
    parser.add_argument("--epochs", default=5, type=int)
    parser.add_argument("--resume_path", default="", type=str)
    parser.add_argument("--mode", default="min", type=str)
    parser.add_argument("--monitor", default="valid_loss", type=str)
    parser.add_argument("--valid_size", default=0.2, type=float)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--sorted", default=1, type=int, help="1: True, 0: False")
    parser.add_argument("--n_gpu", type=str, default="0", help='"0,1,.." or "0" or ""')
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument("--eval_batch_size", default=8, type=int)
    parser.add_argument("--train_max_seq_len", default=256, type=int)
    parser.add_argument("--eval_max_seq_len", default=256, type=int)
    parser.add_argument("--loss_scale", type=float, default=0)
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--grad_clip", default=1.0, type=float)
    parser.add_argument("--learning_rate", default=2e-5, type=float)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--fp16_opt_level", type=str, default="O1")
    args = parser.parse_args()

    try:
        pipeline = piop.read_yml("pipeline.yml")
        pl = AttrDict(pipeline["pipeline"])
        config["preprocessor"] = pl.preprocessor
        config["pretrain"] = pl.pretrain
        config["postprocessor"] = pl.postprocessor
        config["classifier"] = pl.classifier
    except Exception as e:
        raise PipelineReadError from e

    config["checkpoint_dir"] = config["checkpoint_dir"] / config["classifier"]
    config["checkpoint_dir"].mkdir(exist_ok=True)

    torch.save(args, config["checkpoint_dir"] / "training_args.bin")
    seed_everything(args.seed)
    init_logger(log_file=config["log_dir"] / "{}.log".format(config["classifier"]))
    logger.info("Training/evaluation parameters %s", args)

    if args.do_data:
        from dataio.task_data import TaskData
        data = TaskData(args.train_data_num)
        labels, sents = data.read_data(
            raw_data_path=config["raw_data_path"],
            data_dir=config["data_dir"],
            preprocessor=Preprocessor(config["preprocessor"])(
                stopwords_path=config["stopwords_path"],
                userdict_path=config["userdict_path"]),
            is_train=True)
        data.train_val_split(X=sents, y=labels,
                             valid_size=args.valid_size,
                             data_dir=config["data_dir"],
                             data_name=args.data_name)
        if config["pretrain"] == "Nopretrain":
            data.build_vocab(config["nopretrain_vocab_path"], sents, min_count=5)

    if args.do_train:
        train(args)

    if args.do_test:
        test(args)
from utils.utils import seed_everything
from utils.prep_utils import mag_normalize
import numpy as np
import random
import torch
# from SpecAug.sparse_image_warp_pytorch import sparse_image_warp

seed_everything(42)


def mix_db(x, y, db):
    """Energy-weighted mix of x and y, with `db` controlling the balance."""
    E_x = np.mean(x ** 2)
    E_y = np.mean(y ** 2)
    a = E_x / (E_y * (10 ** (db / 10)))
    lam = 1 / (1 + a)
    return lam * x + (1 - lam) * y


# def time_warp(spec, W=5):
#     num_rows = spec.shape[1]  # F
#     spec_len = spec.shape[2]  # T
#     y = num_rows // 2
#     horizontal_line_at_ctr = spec[0][y]
#     # assert len(horizontal_line_at_ctr) == spec_len
#     point_to_warp = horizontal_line_at_ctr[random.randrange(W, spec_len - W)]
#     # assert isinstance(point_to_warp, torch.Tensor)
#     # Uniform distribution from (0,W) with chance to be up to W negative
#     dist_to_warp = random.randrange(-W, W)
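# A quick usage sketch for mix_db: blend a clean waveform with noise, where
# larger `db` values weight the first argument more heavily (the arrays here
# are hypothetical; mix_db itself is defined above).
sr = 16000
clean = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # 1 s of a 440 Hz tone
noise = np.random.randn(sr)                           # 1 s of white noise
noisy = mix_db(clean, noise, db=10.0)                 # the tone dominates the mix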
parser.add_argument('--debug', default=False, action='store_true',
                    help='Debug')
parser.add_argument('--opus', default=False, action='store_true',
                    help='Change AMINER file path for Opus')
parser.add_argument('--debug_name', type=str, default="one_maml_graph",
                    help='where to save/load')
parser.add_argument('--namestr', type=str, default='Meta-Graph',
                    help='additional info in output filename to describe experiments')
parser.add_argument('--study_uid', type=str, default='')
parser.add_argument('--gating', type=str, default=None,
                    choices=[None, 'signature', 'weights', 'signature_cond', 'weights_cond'])
parser.add_argument('--layer_norm', default=False, action='store_true',
                    help='use layer norm')
args = parser.parse_args()

''' Fix Random Seed '''
seed_everything(args.seed)

# Check for a settings file
if os.path.isfile("settings.json"):
    with open('settings.json') as f:
        data = json.load(f)
    args.comet_apikey = data["apikey"]
    args.comet_username = data["username"]
    args.wandb_apikey = data["wandbapikey"]

args.dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if args.dataset == 'PPI':
    project_name = 'meta-graph-ppi'
elif args.dataset == 'REDDIT-MULTI-12K':
    project_name = "meta-graph-reddit"
elif args.dataset == 'FIRSTMM_DB':
    project_name = "meta-graph-firstmmdb"
def main():
    args = get_argparse().parse_args()
    print(json.dumps(vars(args),
                     sort_keys=True,
                     indent=4,
                     separators=(', ', ': '),
                     ensure_ascii=False))
    init_logger(log_file="./log/{}.log".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    seed_everything(args.seed)

    # Create the output directory
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # device
    args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(args.model_name_or_path)

    # Dataset & Dataloader
    train_dataset = MrcDataset(args, json_path="./data/train.json", tokenizer=tokenizer)
    eval_dataset = MrcDataset(args, json_path="./data/dev.json", tokenizer=tokenizer)
    # eval_dataset, test_dataset = random_split(
    #     eval_dataset,
    #     [round(0.5 * len(eval_dataset)),
    #      len(eval_dataset) - round(0.5 * len(eval_dataset))],
    #     generator=torch.Generator().manual_seed(42))

    train_iter = DataLoader(train_dataset,
                            shuffle=True,
                            batch_size=args.per_gpu_train_batch_size,
                            collate_fn=collate_fn,
                            num_workers=10)
    eval_iter = DataLoader(eval_dataset,
                           shuffle=False,
                           batch_size=args.per_gpu_eval_batch_size,
                           collate_fn=collate_fn,
                           num_workers=10)
    # test_iter = DataLoader(test_dataset,
    #                        shuffle=False,
    #                        batch_size=args.per_gpu_eval_batch_size,
    #                        collate_fn=collate_fn,
    #                        num_workers=10)
    logger.info("The number of train_dataset examples is {}".format(
        len(train_dataset.examples)))
    logger.info("The number of train_dataset features is {}".format(
        len(train_dataset)))
    logger.info("The number of eval_dataset examples is {}".format(
        len(eval_dataset.examples)))
    logger.info("The number of eval_dataset features is {}".format(
        len(eval_dataset)))

    # model
    model = MRC_model(args.model_name_or_path)
    model.to(args.device)

    # Training loop
    best_f1 = 0
    early_stop = 0
    for epoch, _ in enumerate(range(int(args.num_train_epochs))):
        model.train()
        train(args, train_iter, model)

        # Score on the dev set after every epoch
        eval_f1, eval_EM = evaluate(args, eval_iter, model, prefix="eval")
        logger.info("The F1-score is {}, The EM-score is {}".format(eval_f1, eval_EM))
        if eval_f1 > best_f1:
            early_stop = 0
            best_f1 = eval_f1
            logger.info("the best eval f1 is {:.4f}, saving model !!".format(best_f1))
            best_model = copy.deepcopy(model.module if hasattr(model, "module") else model)
            torch.save(best_model.state_dict(),
                       os.path.join(args.output_dir, "best_model.pkl"))
        else:
            early_stop += 1
            if early_stop == args.early_stop:
                logger.info("Early stop in {} epoch!".format(epoch))
                break