def getModel(args):
    """Build the model selected by args.model and move it to GPU if available."""
    if args.model == "rnn":
        model = RNN(input_dim=args.input_dims,
                    nclasses=args.nclasses,
                    hidden_dims=args.hidden_dims,
                    num_rnn_layers=args.num_layers,
                    dropout=args.dropout,
                    bidirectional=True)
    elif args.model == "msresnet":
        model = MSResNet(input_channel=args.input_dims,
                         layers=[1, 1, 1, 1],
                         num_classes=args.nclasses,
                         hidden_dims=args.hidden_dims)
    elif args.model == "tempcnn":
        model = TempCNN(input_dim=args.input_dims,
                        nclasses=args.nclasses,
                        sequence_length=args.samplet,
                        hidden_dims=args.hidden_dims,
                        kernel_size=args.kernel_size)
    elif args.model == "transformer":
        hidden_dims = args.hidden_dims  # 256
        n_heads = args.n_heads          # 8
        n_layers = args.n_layers        # 6
        len_max_seq = args.samplet
        dropout = args.dropout
        d_inner = hidden_dims * 4

        model = TransformerEncoder(in_channels=args.input_dims,
                                   len_max_seq=len_max_seq,
                                   d_word_vec=hidden_dims,
                                   d_model=hidden_dims,
                                   d_inner=d_inner,
                                   n_layers=n_layers,
                                   n_head=n_heads,
                                   d_k=hidden_dims // n_heads,
                                   d_v=hidden_dims // n_heads,
                                   dropout=dropout,
                                   nclasses=args.nclasses)
    else:
        raise ValueError("unknown model type: {}".format(args.model))

    if torch.cuda.is_available():
        model = model.cuda()

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("initialized {} model ({} parameters)".format(
        args.model, pytorch_total_params))

    return model
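# Hypothetical usage sketch (not part of the original code): getModel() only
# reads attributes from an argparse-style namespace, so it can be smoke-tested
# with a hand-built Namespace. The helper name and all concrete values below
# are illustrative assumptions, not settings used in this repository.
def _example_getModel_call():
    from argparse import Namespace
    args = Namespace(model="transformer", input_dims=13, nclasses=10,
                     hidden_dims=256, num_layers=6, n_layers=6, n_heads=8,
                     samplet=70, kernel_size=5, dropout=0.2)
    # Builds a TransformerEncoder with d_model=256, 8 heads, 6 layers.
    return getModel(args)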
def main():
    ####################################################################
    ## Data
    ####################################################################
    all_datasets = []
    for dataroot in args.dataroot:
        curr_dataset = BinaryDataset(root_dir=dataroot,
                                     binary_format='elf',
                                     targets=args.targets,
                                     mode='random-chunks',
                                     chunk_length=args.sequence_len)
        all_datasets.append(curr_dataset)
    train_data = torch.utils.data.ConcatDataset(all_datasets)
    logging.info("Train dataset len() = {0}".format(len(train_data)))
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=2)

    val_datasets = []
    for dataroot in args.val_dataroot:
        curr_dataset = BinaryDataset(root_dir=dataroot,
                                     binary_format='elf',
                                     targets=args.targets,
                                     mode='random-chunks',
                                     chunk_length=args.sequence_len)
        val_datasets.append(curr_dataset)
    val_data = torch.utils.data.ConcatDataset(val_datasets)
    logging.info("Validation dataset len() = {0}".format(len(val_data)))
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=True,
                                             num_workers=2)

    ####################################################################
    ## Model
    ####################################################################
    if args.targets == 'start' or args.targets == 'end':
        num_classes = 2
    elif args.targets == 'both':
        # TODO: Confirm whether this is really 4 classes or only 3 in practice.
        num_classes = 4
    else:
        raise NotImplementedError()

    # Define model
    # For now, embedding dimension = hidden dimension
    if args.arch == 'gru':
        gru = torch.nn.GRU(input_size=args.hidden_size,
                           hidden_size=args.hidden_size,
                           num_layers=args.num_layers,
                           bias=True,
                           batch_first=True,
                           bidirectional=True)
        embedder = torch.nn.Embedding(num_embeddings=256,
                                      embedding_dim=args.hidden_size)
        model = RNN(rnn=gru, embedder=embedder, output_size=num_classes).cuda()
    elif args.arch == 'bert':
        config = BertConfig(
            vocab_size=256,
            hidden_size=args.hidden_size,
            num_hidden_layers=args.num_layers,
            num_attention_heads=args.num_attn_heads,
            intermediate_size=args.hidden_size * 4,  # BERT originally uses 4x the hidden size here, so we copy that.
            hidden_act='gelu',
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=args.sequence_len,  # Maximum sequence length
            type_vocab_size=1,
            initializer_range=0.02,
            layer_norm_eps=1e-12,
            pad_token_id=0,
            gradient_checkpointing=False,
            num_labels=num_classes)
        model = BertForTokenClassification(config=config).cuda()
    else:
        raise NotImplementedError()

    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    elif args.optimizer == 'rmsprop':
        optimizer = torch.optim.RMSprop(model.parameters(),
                                        lr=args.lr,
                                        alpha=0.99,
                                        eps=1e-08,
                                        weight_decay=0,
                                        momentum=0,
                                        centered=False)
    else:
        raise NotImplementedError()

    if args.lr_scheduler == 'cosine':
        def cosine_annealing(step, total_steps, lr_max, lr_min):
            return lr_min + (lr_max - lr_min) * 0.5 * (
                1 + np.cos(step / total_steps * np.pi))

        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lr_lambda=lambda step: cosine_annealing(
                step,
                args.epochs * len(train_loader),
                1,  # since lr_lambda computes a multiplicative factor
                1e-6 / (args.lr * args.batch_size / 256.)))
    elif args.lr_scheduler == 'none':
        scheduler = None
    else:
        raise NotImplementedError()

    with open(os.path.join(args.savedir, 'training_log.csv'), 'w') as f:
        f.write('epoch,train_loss,train_f1_average,val_loss,val_f1_average\n')

    logging.info("Beginning training")
    for epoch in range(args.epochs):
        train_loss_avg, train_f1_avg = train(model, optimizer, scheduler,
                                             train_loader, epoch, num_classes)
        val_loss_avg, val_f1_avg = validate(model, val_loader, num_classes)

        # torch.save(
        #     model.state_dict(),
        #     os.path.join(save_dir, "model.pth")
        # )
        # TODO: Save results and model
        with open(os.path.join(args.savedir, 'training_log.csv'), 'a') as f:
            f.write('%03d,%0.5f,%0.5f,%0.5f,%0.5f\n' %
                    ((epoch + 1), train_loss_avg, train_f1_avg,
                     val_loss_avg, val_f1_avg))
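# Illustrative sketch (not part of the original script): the LambdaLR factor
# above starts at 1 (i.e. the full args.lr) and follows a half cosine down to
# 1e-6 / (args.lr * args.batch_size / 256.), so the final effective learning
# rate is 1e-6 * (256 / args.batch_size). The helper name and the lr,
# batch_size, and total_steps values below are illustrative assumptions.
def _example_cosine_factors(lr=1e-3, batch_size=256, total_steps=1000):
    import numpy as np
    lr_min_factor = 1e-6 / (lr * batch_size / 256.)
    factors = []
    for step in (0, total_steps // 2, total_steps):
        factor = lr_min_factor + (1 - lr_min_factor) * 0.5 * (
            1 + np.cos(step / total_steps * np.pi))
        factors.append((step, factor, lr * factor))  # (step, multiplier, effective lr)
    return factors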
        dropout_rate=args.dropout,
    )
elif MODEL_TYPE == "electra":
    net = Electra(
        output_size=len(label_to_ix),
        device=DEVICE,
    )

print('model: {}'.format(net))

if USE_PRETRAIN:
    net = load_model(net, args.pretrained_model, DEVICE)

if torch.cuda.device_count() > 1:
    net.to(DEVICE)
    print("Using", torch.cuda.device_count(), "GPUs")
    net = nn.DataParallel(net)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), args.lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=8) \
    if args.use_scheduler else None

# Create save directory
time_stamp = time.strftime("%m-%d-%H-%M", time.localtime())
save_dir = os.path.join(EXPERIMENT_DIR, time_stamp)
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Save configs
model_desc_output = [
    ": ".join([str(k), str(v)]) for k, v in vars(args).items()
]
with open(os.path.join(save_dir, 'configs.txt'), 'w') as file:
    file.writelines("\n".join(model_desc_output))
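# Illustrative sketch (not part of the original script): configs.txt is written
# above as one "key: value" line per argument, so a saved run can be read back
# into a plain dict as below. The helper name and the split on the first ": "
# are assumptions for illustration; values are left as strings.
def _example_load_configs(path):
    configs = {}
    with open(path) as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            key, _, value = line.partition(": ")
            configs[key] = value
    return configs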