def __init__(self, yaml_path): config_file = yaml_path config = yaml.load(open(config_file), Loader=yaml.FullLoader) args = config["training"] SEED = args["seed"] DATASET = args["dataset"] # Multi30k or ISWLT MODEL = args["model"] # gru**2, gru_attn**2, transformer, gcn_gru, gcngru_gru, gcngruattn_gru, gcnattn_gru REVERSE = args["reverse"] BATCH_SIZE = args["batch_size"] ENC_EMB_DIM = args["encoder_embed_dim"] DEC_EMB_DIM = args["decoder_embed_dim"] ENC_HID_DIM = args["encoder_hidden_dim"] DEC_HID_DIM = args["decoder_hidden_dim"] ENC_DROPOUT = args["encoder_dropout"] DEC_DROPOUT = args["decoder_dropout"] NLAYERS = args["num_layers"] N_EPOCHS = args["num_epochs"] CLIP = args["grad_clip"] LR = args["lr"] LR_DECAY_RATIO = args["lr_decay_ratio"] ID = args["id"] PATIENCE = args["patience"] DIR = 'checkpoints/{}-{}-{}/'.format(DATASET, MODEL, ID) MODEL_PATH = DIR LOG_PATH = '{}test-log.log'.format(DIR) set_seed(SEED) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.config = args self.device = device if 'transformer' in MODEL: ENC_HEADS = args["encoder_heads"] DEC_HEADS = args["decoder_heads"] ENC_PF_DIM = args["encoder_pf_dim"] DEC_PF_DIM = args["decoder_pf_dim"] MAX_LEN = args["max_len"] SRC = Field(tokenize = lambda text: tokenize_de(text, REVERSE), init_token = '<sos>', eos_token = '<eos>', lower = True) TGT = Field(tokenize = tokenize_en, init_token = '<sos>', eos_token = '<eos>', lower = True) GRH = RawField(postprocessing=batch_graph) data_fields = [('src', SRC), ('trg', TGT), ('grh', GRH)] train_data = Dataset(torch.load("data/Multi30k/train_data.pt"), data_fields) valid_data = Dataset(torch.load("data/Multi30k/valid_data.pt"), data_fields) test_data = Dataset(torch.load("data/Multi30k/test_data.pt"), data_fields) self.train_data, self.valid_data, self.test_data = train_data, valid_data, test_data train_iterator, valid_iterator, test_iterator = BucketIterator.splits( (train_data, valid_data, test_data), batch_size = BATCH_SIZE, sort_key = lambda x: len(x.src), sort_within_batch=False, device = device) self.train_iterator, self.valid_iterator, self.test_iterator = train_iterator, valid_iterator, test_iterator SRC.build_vocab(train_data, min_freq = 2) TGT.build_vocab(train_data, min_freq = 2) self.SRC, self.TGT, self.GRH = SRC, TGT, GRH print(f"Number of training examples: {len(train_data.examples)}") print(f"Number of validation examples: {len(valid_data.examples)}") print(f"Number of testing examples: {len(test_data.examples)}") print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}") print(f"Unique tokens in target (en) vocabulary: {len(TGT.vocab)}") src_c, tgt_c = get_sentence_lengths(train_data) src_lengths = counter2array(src_c) tgt_lengths = counter2array(tgt_c) print("maximum src, tgt sent lengths: ") np.quantile(src_lengths, 1), np.quantile(tgt_lengths, 1) # Get models and corresponding training scripts INPUT_DIM = len(SRC.vocab) OUTPUT_DIM = len(TGT.vocab) SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token] TGT_PAD_IDX = TGT.vocab.stoi[TGT.pad_token] self.SRC_PAD_IDX = SRC_PAD_IDX self.TGT_PAD_IDX = TGT_PAD_IDX if MODEL == "gru**2": # gru**2, gru_attn**2, transformer, gcn_gru from models.gru_seq2seq import GRUEncoder, GRUDecoder, Seq2Seq enc = GRUEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, ENC_DROPOUT) dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT) model = Seq2Seq(enc, dec, device).to(device) from src.train import train_epoch_gru, evaluate_gru, epoch_time train_epoch = train_epoch_gru 
evaluate = evaluate_gru self.enc, self.dec, self.model, self.train_epoch, self.evaluate = enc, dec, model, train_epoch, evaluate elif MODEL == "gru_attn**2": from models.gru_attn import GRUEncoder, GRUDecoder, Seq2Seq, Attention attn = Attention(ENC_HID_DIM, DEC_HID_DIM) enc = GRUEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, ENC_DROPOUT) dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT, attn) model = Seq2Seq(enc, dec, device).to(device) from src.train import train_epoch_gru_attn, evaluate_gru_attn, epoch_time train_epoch = train_epoch_gru_attn evaluate = evaluate_gru_attn self.enc, self.dec, self.model, self.train_epoch, self.evaluate, self.attn = enc, dec, model, train_epoch, evaluate, attn elif MODEL == "transformer": from models.transformer import Encoder, Decoder, Seq2Seq enc = Encoder(INPUT_DIM, ENC_HID_DIM, NLAYERS, ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT, device, MAX_LEN) dec = Decoder(OUTPUT_DIM, DEC_HID_DIM, NLAYERS, DEC_HEADS, DEC_PF_DIM, DEC_DROPOUT, device, MAX_LEN) model = Seq2Seq(enc, dec, SRC_PAD_IDX, TGT_PAD_IDX, device).to(device) from src.train import train_epoch_tfmr, evaluate_tfmr, epoch_time train_epoch = train_epoch_tfmr evaluate = evaluate_tfmr self.enc, self.dec, self.model, self.train_epoch, self.evaluate = enc, dec, model, train_epoch, evaluate elif MODEL == "gcn_gru": from models.gru_seq2seq import GCNEncoder, GRUDecoder, GCN2Seq enc = GCNEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, NLAYERS, ENC_DROPOUT) dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT) model = GCN2Seq(enc, dec, device).to(device) from src.train import train_epoch_gcn_gru, evaluate_gcn_gru, epoch_time train_epoch = train_epoch_gcn_gru evaluate = evaluate_gcn_gru self.enc, self.dec, self.model, self.train_epoch, self.evaluate = enc, dec, model, train_epoch, evaluate elif MODEL == "gcngru_gru": from models.gru_seq2seq import GCNGRUEncoder, GRUDecoder, GCN2Seq enc = GCNGRUEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, ENC_DROPOUT, device) dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT) model = GCN2Seq(enc, dec, device).to(device) from src.train import train_epoch_gcn_gru, evaluate_gcn_gru, epoch_time train_epoch = train_epoch_gcn_gru evaluate = evaluate_gcn_gru self.enc, self.dec, self.model, self.train_epoch, self.evaluate = enc, dec, model, train_epoch, evaluate elif MODEL == "gcnattn_gru": from models.gru_attn import GCNEncoder, GRUDecoder, GCN2Seq, Attention attn = Attention(ENC_HID_DIM, DEC_HID_DIM) enc = GCNEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, ENC_DROPOUT) dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT, attn) model = GCN2Seq(enc, dec, device).to(device) from src.train import train_epoch_gcnattn_gru, evaluate_gcnattn_gru, epoch_time train_epoch = train_epoch_gcnattn_gru evaluate = evaluate_gcnattn_gru self.enc, self.dec, self.model, self.train_epoch, self.evaluate, self.attn = enc, dec, model, train_epoch, evaluate, attn elif MODEL == "gcngruattn_gru": from models.gru_attn import GCNGRUEncoder, GRUDecoder, GCN2Seq, Attention attn = Attention(ENC_HID_DIM, DEC_HID_DIM) enc = GCNGRUEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, ENC_DROPOUT, device) dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT, attn) model = GCN2Seq(enc, dec, device).to(device) from src.train import train_epoch_gcnattn_gru, 
evaluate_gcnattn_gru, epoch_time train_epoch = train_epoch_gcnattn_gru evaluate = evaluate_gcnattn_gru self.enc, self.dec, self.model, self.train_epoch, self.evaluate, self.attn = enc, dec, model, train_epoch, evaluate, attn else: raise ValueError("Wrong model choice") if 'gcn' in MODEL: from src.utils import init_weights_uniform as init_weights else: from src.utils import init_weights_xavier as init_weights model.apply(init_weights) n_params = count_parameters(model) print("Model initialized...{} params".format(n_params)) self.criterion = nn.CrossEntropyLoss(ignore_index=TGT_PAD_IDX) print(os.path.join(MODEL_PATH, "checkpoint.pt")) # try: # state_dict = torch.load(os.path.join(MODEL_PATH, "checkpoint.pt"), map_location=device)['model_state_dict'] # except: # state_dict = torch.load(os.path.join(MODEL_PATH, "checkpoint.pt"), map_location=device) state_dict = torch.load(os.path.join(MODEL_PATH, "checkpoint.pt"), map_location=device) if 'model_state_dict' in state_dict: state_dict = state_dict['model_state_dict'] model.load_state_dict(state_dict) self.model = model
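# The constructor above calls two small helpers, set_seed and count_parameters, that are
# imported from the project's utilities and not shown in this excerpt. A minimal sketch of
# what they typically look like (the project's own versions may differ):
import random
import numpy as np
import torch


def set_seed(seed):
    # Seed every RNG involved in data loading and weight initialization.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def count_parameters(model):
    # Count only trainable parameters.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)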
def run_model(args): device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') use_equiv = args.decoder == 'equiv' # Collect data and schema schema, data_original, dl = load_data(args.dataset, use_edge_data=args.use_edge_data, use_other_edges=args.use_other_edges, use_node_attrs=args.use_node_attrs, node_val=args.node_val) data, in_dims = select_features(data_original, schema, args.feats_type) data = data.to(device) # Precompute data indices indices_identity, indices_transpose = data.calculate_indices() # Get target relations and create data structure for embeddings target_rel_ids = dl.links_test['data'].keys() target_rels = [schema.relations[rel_id] for rel_id in target_rel_ids] target_ents = schema.entities # Get relations used by decoder if use_equiv: output_rels = schema.relations else: output_rels = {rel.id: rel for rel in target_rels} data_embedding = SparseMatrixData.make_entity_embeddings( target_ents, args.embedding_dim) data_embedding.to(device) # Get training and validation positive samples now train_pos_heads, train_pos_tails = dict(), dict() val_pos_heads, val_pos_tails = dict(), dict() for target_rel_id in target_rel_ids: train_val_pos = get_train_valid_pos(dl, target_rel_id) train_pos_heads[target_rel_id], train_pos_tails[target_rel_id], \ val_pos_heads[target_rel_id], val_pos_tails[target_rel_id] = train_val_pos # Get additional indices to be used when making predictions pred_idx_matrices = {} for target_rel in target_rels: if args.pred_indices == 'train': train_neg_head, train_neg_tail = get_train_neg( dl, target_rel.id, tail_weighted=args.tail_weighted) pred_idx_matrices[target_rel.id] = make_target_matrix( target_rel, train_pos_heads[target_rel.id], train_pos_tails[target_rel.id], train_neg_head, train_neg_tail, device) elif args.pred_indices == 'train_neg': # Get negative samples twice train_neg_head1, train_neg_tail1 = get_train_neg( dl, target_rel.id, tail_weighted=args.tail_weighted) train_neg_head2, train_neg_tail2 = get_train_neg( dl, target_rel.id, tail_weighted=args.tail_weighted) pred_idx_matrices[target_rel.id] = make_target_matrix( target_rel, train_neg_head1, train_neg_tail1, train_neg_head2, train_neg_tail2, device) elif args.pred_indices == 'none': pred_idx_matrices[target_rel.id] = None # Create network and optimizer net = EquivLinkPredictor(schema, in_dims, layers=args.layers, embedding_dim=args.embedding_dim, embedding_entities=target_ents, output_rels=output_rels, activation=eval('nn.%s()' % args.act_fn), final_activation=nn.Identity(), dropout=args.dropout, pool_op=args.pool_op, norm_affine=args.norm_affine, norm_embed=args.norm_embed, in_fc_layer=args.in_fc_layer, decode=args.decoder) net.to(device) optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) # Set up logging and checkpointing if args.wandb_log_run: wandb.init(config=args, settings=wandb.Settings(start_method='fork'), project="EquivariantHGN_LP", entity='danieltlevy') wandb.watch(net, log='all', log_freq=args.wandb_log_param_freq) print(args) print("Number of parameters: {}".format(count_parameters(net))) run_name = args.dataset + '_' + str(args.run) if args.wandb_log_run and wandb.run.name is not None: run_name = run_name + '_' + str(wandb.run.name) if args.checkpoint_path != '': checkpoint_path = args.checkpoint_path else: checkpoint_path = f"checkpoint/checkpoint_{run_name}.pt" print("Checkpoint Path: " + checkpoint_path) val_metric_best = -1e10 # training loss_func = nn.BCELoss() progress = tqdm(range(args.epoch), desc="Epoch 0", 
position=0, leave=True) for epoch in progress: net.train() # Make target matrix and labels to train on if use_equiv: # Target is same as input target_schema = schema data_target = data.clone() else: # Target is just target relation target_schema = DataSchema(schema.entities, target_rels) data_target = SparseMatrixData(target_schema) labels_train = torch.Tensor([]).to(device) for target_rel in target_rels: train_neg_head, train_neg_tail = get_train_neg( dl, target_rel.id, tail_weighted=args.tail_weighted) train_matrix = make_target_matrix(target_rel, train_pos_heads[target_rel.id], train_pos_tails[target_rel.id], train_neg_head, train_neg_tail, device) data_target[target_rel.id] = train_matrix labels_train_rel = train_matrix.values.squeeze() labels_train = torch.cat([labels_train, labels_train_rel]) # Make prediction if use_equiv: idx_id_tgt, idx_trans_tgt = data_target.calculate_indices() output_data = net(data, indices_identity, indices_transpose, data_embedding, data_target, idx_id_tgt, idx_trans_tgt) else: output_data = net(data, indices_identity, indices_transpose, data_embedding, data_target) logits_combined = torch.Tensor([]).to(device) for target_rel in target_rels: logits_rel = output_data[target_rel.id].values.squeeze() logits_combined = torch.cat([logits_combined, logits_rel]) logp = torch.sigmoid(logits_combined) train_loss = loss_func(logp, labels_train) # autograd optimizer.zero_grad() train_loss.backward() optimizer.step() # Update logging progress.set_description(f"Epoch {epoch}") progress.set_postfix(loss=train_loss.item()) wandb_log = {'Train Loss': train_loss.item(), 'epoch': epoch} # Evaluate on validation set net.eval() if epoch % args.val_every == 0: with torch.no_grad(): net.eval() left = torch.Tensor([]).to(device) right = torch.Tensor([]).to(device) labels_val = torch.Tensor([]).to(device) valid_masks = {} for target_rel in target_rels: if args.val_neg == '2hop': valid_neg_head, valid_neg_tail = get_valid_neg_2hop( dl, target_rel.id) elif args.val_neg == 'randomtw': valid_neg_head, valid_neg_tail = get_valid_neg( dl, target_rel.id, tail_weighted=True) else: valid_neg_head, valid_neg_tail = get_valid_neg( dl, target_rel.id) valid_matrix_full = make_target_matrix( target_rel, val_pos_heads[target_rel.id], val_pos_tails[target_rel.id], valid_neg_head, valid_neg_tail, device) valid_matrix, left_rel, right_rel, labels_val_rel = coalesce_matrix( valid_matrix_full) left = torch.cat([left, left_rel]) right = torch.cat([right, right_rel]) labels_val = torch.cat([labels_val, labels_val_rel]) if use_equiv: # Add in additional prediction indices pred_idx_matrix = pred_idx_matrices[target_rel.id] if pred_idx_matrix is None: valid_combined_matrix = valid_matrix valid_mask = torch.arange( valid_matrix.nnz()).to(device) else: valid_combined_matrix, valid_mask = combine_matrices( valid_matrix, pred_idx_matrix) valid_masks[target_rel.id] = valid_mask data_target[target_rel.id] = valid_combined_matrix else: data_target[target_rel.id] = valid_matrix if use_equiv: data_target.zero_() idx_id_val, idx_trans_val = data_target.calculate_indices() output_data = net(data, indices_identity, indices_transpose, data_embedding, data_target, idx_id_val, idx_trans_val) else: output_data = net(data, indices_identity, indices_transpose, data_embedding, data_target) logits_combined = torch.Tensor([]).to(device) for target_rel in target_rels: logits_rel_full = output_data[ target_rel.id].values.squeeze() if use_equiv: logits_rel = logits_rel_full[valid_masks[ target_rel.id]] else: logits_rel = 
logits_rel_full logits_combined = torch.cat([logits_combined, logits_rel]) logp = torch.sigmoid(logits_combined) val_loss = loss_func(logp, labels_val).item() wandb_log.update({'val_loss': val_loss}) left = left.cpu().numpy() right = right.cpu().numpy() edge_list = np.concatenate( [left.reshape((1, -1)), right.reshape((1, -1))], axis=0) res = dl.evaluate(edge_list, logp.cpu().numpy(), labels_val.cpu().numpy()) val_roc_auc = res['roc_auc'] val_mrr = res['MRR'] wandb_log.update(res) print("\nVal Loss: {:.3f} Val ROC AUC: {:.3f} Val MRR: {:.3f}". format(val_loss, val_roc_auc, val_mrr)) if args.val_metric == 'loss': val_metric = -val_loss elif args.val_metric == 'roc_auc': val_metric = val_roc_auc elif args.val_metric == 'mrr': val_metric = val_mrr if val_metric > val_metric_best: val_metric_best = val_metric print("New best, saving") torch.save( { 'epoch': epoch, 'net_state_dict': net.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'train_loss': train_loss.item(), 'val_loss': val_loss, 'val_roc_auc': val_roc_auc, 'val_mrr': val_mrr }, checkpoint_path) if args.wandb_log_run: wandb.summary["val_roc_auc_best"] = val_roc_auc wandb.summary["val_mrr_best"] = val_mrr wandb.summary["val_loss_best"] = val_loss wandb.summary["epoch_best"] = epoch wandb.summary["train_loss_best"] = train_loss.item() wandb.save(checkpoint_path) if args.wandb_log_run: wandb.log(wandb_log) # Evaluate on test set if args.evaluate: for target_rel in target_rels: print("Evaluating Target Rel " + str(target_rel.id)) checkpoint = torch.load(checkpoint_path, map_location=device) net.load_state_dict(checkpoint['net_state_dict']) net.eval() # Target is same as input data_target = data.clone() with torch.no_grad(): left_full, right_full, test_labels_full = get_test_neigh_from_file( dl, args.dataset, target_rel.id) test_matrix_full = make_target_matrix_test( target_rel, left_full, right_full, test_labels_full, device) test_matrix, left, right, test_labels = coalesce_matrix( test_matrix_full) if use_equiv: test_combined_matrix, test_mask = combine_matrices( test_matrix, train_matrix) data_target[target_rel.id] = test_combined_matrix data_target.zero_() idx_id_tst, idx_trans_tst = data_target.calculate_indices() data_out = net(data, indices_identity, indices_transpose, data_embedding, data_target, idx_id_tst, idx_trans_tst) logits_full = data_out[target_rel.id].values.squeeze() logits = logits_full[test_mask] else: data_target[target_rel.id] = test_matrix data_out = net(data, indices_identity, indices_transpose, data_embedding, data_target) logits_full = data_out[target_rel.id].values.squeeze() logits = logits_full pred = torch.sigmoid(logits).cpu().numpy() left = left.cpu().numpy() right = right.cpu().numpy() edge_list = np.vstack((left, right)) edge_list_full = np.vstack((left_full, right_full)) file_path = f"test_out/{run_name}.txt" gen_file_for_evaluate(dl, edge_list_full, edge_list, pred, target_rel.id, file_path=file_path)
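# Note on the loss formulation used above: the script applies torch.sigmoid to the raw
# scores and then nn.BCELoss. For reference, the numerically safer single-step form is
# BCEWithLogitsLoss; the two agree, as this small self-contained check shows
# (illustrative only, not part of the original script).
import torch
import torch.nn as nn

_logits = torch.randn(8)
_labels = torch.randint(0, 2, (8,)).float()
loss_two_step = nn.BCELoss()(torch.sigmoid(_logits), _labels)
loss_one_step = nn.BCEWithLogitsLoss()(_logits, _labels)
assert torch.allclose(loss_two_step, loss_one_step, atol=1e-6)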
model = GCN2Seq(enc, dec, device).to(device) from src.train import train_epoch_gcnattn_gru, evaluate_gcnattn_gru, epoch_time train_epoch = train_epoch_gcnattn_gru evaluate = evaluate_gcnattn_gru else: raise ValueError("Wrong model choice") if 'gcn' in MODEL: from src.utils import init_weights_uniform as init_weights else: from src.utils import init_weights_xavier as init_weights model.apply(init_weights) n_params = count_parameters(model) print(f'The model has {n_params:,} trainable parameters') # training optimizer = optim.Adam(model.parameters(), lr=LR) criterion = nn.CrossEntropyLoss(ignore_index=TGT_PAD_IDX) best_valid_loss = float('inf') early_stopper = EarlyStopping(MODEL_PATH, patience=PATIENCE) logger = Logger(LOG_PATH, append_time=False) logger.write(f'The model has {n_params:,} trainable parameters') for epoch in range(N_EPOCHS): start_time = time.time() train_loss = train_epoch(model, train_iterator, optimizer, criterion, CLIP) # payload contains all info needed for interpretation and viz valid_loss, payload = evaluate(model, valid_iterator, criterion)
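# epoch_time is imported from src.train alongside each train/evaluate pair, but its body is
# not shown in this excerpt; the conventional implementation is sketched below (the
# repository's version may differ).
def epoch_time(start_time, end_time):
    # Convert an elapsed wall-clock interval into whole minutes and remaining seconds.
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs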
    def forward(self, x, train=False):
        # NOTE: the flag guarding this branch is not visible in the original fragment;
        # a boolean `train` argument is assumed here.
        if not train:
            # switch to eval mode
            self.eval()
        else:
            self.train()
        out = self.conv1(x)
        out = self.block1(out)
        out = self.block2(out)
        out = self.block3(out)
        out = self.relu(self.bn1(out))
        out = F.avg_pool2d(out, 8)
        out = out.view(-1, self.nChannels)
        # switch back to train mode before returning
        self.train()
        return self.fc(out)


if __name__ == '__main__':
    i = torch.FloatTensor(4, 3, 32, 32)
    n = WideResNet(depth=34, num_classes=10, widen_factor=10, dropRate=0.0)
    i = i.cuda()
    n = n.cuda()
    print(n(i).size())
    print(count_parameters(n))
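# The smoke test above requires a GPU (both the input and the network are moved with
# .cuda()). A CPU-only variant of the same check is sketched below for machines without
# CUDA (cpu_smoke_test is an illustrative helper, not part of the original file).
import torch


def cpu_smoke_test():
    x = torch.randn(4, 3, 32, 32)
    net = WideResNet(depth=34, num_classes=10, widen_factor=10, dropRate=0.0)
    with torch.no_grad():
        out = net(x)
    print(out.size())               # expected: torch.Size([4, 10])
    print(count_parameters(net))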
def run_code():
    # assess which device will be used, CPU or GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    # Load Data #
    if args.data == 'PPD':
        dataset = ProtoPlanetaryDisks(machine=args.machine, transform=True,
                                      img_norm=True, subsample=False)
    elif args.data == 'MNIST':
        dataset = MNIST(args.machine)
    else:
        print('Error: Wrong dataset (MNIST, Proto Planetary Disk)...')
        raise
    if len(dataset) == 0:
        print('No items in training set...')
        print('Exiting!')
        sys.exit()
    print('Dataset size: ', len(dataset))

    # data loaders for training and testing
    train_loader, val_loader = dataset.get_dataloader(
        batch_size=args.batch_size, shuffle=True,
        val_split=.2, random_seed=rnd_seed)

    if args.data == 'PPD' and args.cond == 'T':
        wandb.config.physics_dim = len(dataset.meta_names)
    else:
        wandb.config.physics_dim = 0
    print('Physics dimension: ', wandb.config.physics_dim)

    # Define AE model, Ops, and Train #
    # To use other AE models change the following lines;
    # the different types of AE models are stored in src/ae_model.py
    if args.model_name == 'ConvUpSamp_AE':
        model = ConvUpSamp_AE(latent_dim=args.latent_dim, img_dim=dataset.img_dim,
                              in_ch=dataset.img_channels)
    elif args.model_name == 'ResNet_UpSamp_AE':
        model = ResNet_UpSamp_AE(latent_dim=args.latent_dim, img_dim=dataset.img_dim,
                                 in_ch=dataset.img_channels)
    elif args.model_name == 'ResNet_Linear_AE':
        model = ResNet_Linear_AE(latent_dim=args.latent_dim, img_dim=dataset.img_dim,
                                 in_ch=dataset.img_channels)
    elif args.model_name == 'ResNet_Tconv_AE':
        model = ResNet_Tconv_AE(latent_dim=args.latent_dim, img_dim=dataset.img_dim,
                                in_ch=dataset.img_channels)
    elif args.model_name == 'ConvLin_AE':
        model = ConvLin_AE(latent_dim=args.latent_dim, img_dim=dataset.img_dim,
                           in_ch=dataset.img_channels)
    elif args.model_name == 'TranConv_AE':
        model = TranConv_AE(latent_dim=args.latent_dim, img_dim=dataset.img_dim,
                            in_ch=dataset.img_channels)
    elif args.model_name == 'Linear_AE':
        model = Linear_AE(latent_dim=args.latent_dim, img_dim=dataset.img_dim,
                          in_ch=dataset.img_channels)
    elif args.model_name == 'ConvLinUp_AE':
        model = ConvLinUp_AE(latent_dim=args.latent_dim, img_dim=dataset.img_dim,
                             in_ch=dataset.img_channels, kernel=args.kernel_size,
                             n_conv_blocks=args.conv_blocks)
    elif args.model_name == 'ConvLinTrans_AE':
        model = ConvLinTrans_AE(latent_dim=args.latent_dim, img_dim=dataset.img_dim,
                                in_ch=dataset.img_channels, kernel=args.kernel_size,
                                n_conv_blocks=args.conv_blocks)
    elif args.model_name == 'ResLinTrans_AE':
        model = ResLinTrans_AE(latent_dim=args.latent_dim, img_dim=dataset.img_dim,
                               in_ch=dataset.img_channels)

    # log model architecture and gradients to wandb
    wandb.watch(model, log='gradients')
    wandb.config.n_train_params = count_parameters(model)
    print('Summary:')
    print(model)
    print('Num of trainable params: ', wandb.config.n_train_params)
    print('\n')

    # Initialize optimizers
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-6)

    # Learning Rate scheduler
    if args.lr_sch == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.5)
    elif args.lr_sch == 'exp':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.985)
    elif args.lr_sch == 'cos':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-5)
    elif args.lr_sch == 'plateau':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                         factor=.5, verbose=True)
    else:
        scheduler = None
    print('Optimizer :', optimizer)
    print('LR Scheduler :', scheduler.__class__.__name__)
    print('########################################')
    print('######## Running in %4s #########' % (device))
    print('########################################')

    # initialize trainer
    trainer = Trainer(model, optimizer, args.batch_size, wandb,
                      scheduler=scheduler, print_every=500, device=device)

    if args.dry_run:
        print('******** DRY RUN ******** ')
        return

    # run training/testing iterations
    trainer.train(train_loader, val_loader, args.num_epochs,
                  save=True, early_stop=args.early_stop)
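# The scheduler selected above is handed to Trainer. One detail worth noting:
# ReduceLROnPlateau must be stepped with the monitored validation metric, while the
# step/exp/cosine schedulers are stepped once per epoch. A sketch of the dispatch a
# training loop typically performs (step_scheduler is an illustrative helper, not part of
# the original code; the project's Trainer may handle this differently):
from torch import optim


def step_scheduler(scheduler, val_loss):
    if scheduler is None:
        return
    if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
        scheduler.step(val_loss)   # plateau scheduler consumes the monitored metric
    else:
        scheduler.step()           # StepLR / ExponentialLR / CosineAnnealingLR step per epoch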
assert os.path.isdir(BERT_PRETRAINED_FOLDER), \
    'BERT_PRETRAINED_FOLDER init fail: {}'.format(BERT_PRETRAINED_FOLDER)
config = BertConfig(os.path.join(BERT_PRETRAINED_FOLDER, 'bert_config.json'))

bert_encoder = dict()
for l in range(1, 13):
    bert_encoder[l] = BertForMultipleChoiceEncoder(
        config, output_all_encoded_layers=args.output_all_encoded_layers,
        num_hidden_layers=l)

params_count = list()
for l in range(1, 13):
    n_param_encoder = count_parameters(bert_encoder[l].bert.encoder)
    n_param_pooler = count_parameters(bert_encoder[l].bert.pooler)
    n_param_embedding = count_parameters(bert_encoder[l].bert.embeddings)
    n_params_total = count_parameters(bert_encoder[l])
    n_params_total_debug = n_param_encoder + n_param_pooler + n_param_embedding
    assert n_params_total == n_params_total_debug, 'total num params error'
    params_count.append([
        l, n_param_embedding, n_param_pooler, n_param_encoder, n_params_total
    ])

params_count_df = pd.DataFrame(
    params_count,
    columns=['n_layers', '#embedding', '#pooler', '#encoder', '#total'])
print('num params per encoder = %d' % count_parameters(bert_encoder[1].bert.encoder))
print(params_count_df)
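# As a sanity check on the table printed above, the per-layer encoder size of BERT-base can
# be derived directly from its dimensions (hidden size 768, feed-forward size 3072). This
# back-of-the-envelope count is an illustration, not output from the script.
hidden, ffn = 768, 3072
attention = 4 * (hidden * hidden + hidden)                      # Q, K, V and output projections with biases
feed_forward = (hidden * ffn + ffn) + (ffn * hidden + hidden)   # intermediate + output dense layers
layer_norms = 2 * (2 * hidden)                                  # two LayerNorms, each with weight and bias
per_layer = attention + feed_forward + layer_norms
print(per_layer)  # 7,087,872 parameters per encoder layer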
def run_model(args): device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') if args.lgnn: load_data_fn = load_data_flat else: load_data_fn = load_data schema, schema_out, data, data_target, labels, \ train_val_test_idx, dl = load_data_fn(args.dataset, use_edge_data=args.use_edge_data, use_node_attrs=args.use_node_attr, feats_type=args.feats_type) target_entity_id = 0 # True for all current NC datasets data, in_dims = select_features(data, schema, args.feats_type, target_entity_id) if args.multi_label: labels = torch.FloatTensor(labels).to(device) else: labels = torch.LongTensor(labels).to(device) train_idx = train_val_test_idx['train_idx'] train_idx = np.sort(train_idx) val_idx = train_val_test_idx['val_idx'] val_idx = np.sort(val_idx) test_idx = train_val_test_idx['test_idx'] test_idx = np.sort(test_idx) data = data.to(device) data_embedding = SparseMatrixData.make_entity_embeddings( schema.entities, args.embedding_dim) data_embedding.to(device) indices_identity, indices_transpose = data.calculate_indices() data_target = data_target.to(device) num_classes = dl.labels_train['num_classes'] net = AlternatingHGN(schema, in_dims, width=args.width, depth=args.depth, embedding_dim=args.embedding_dim, activation=eval('nn.%s()' % args.act_fn), final_activation=nn.Identity(), dropout=args.dropout, output_dim=num_classes, norm=args.norm, pool_op=args.pool_op, norm_affine=args.norm_affine, norm_out=args.norm_out) net.to(device) optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay) if args.wandb_log_run: wandb.init(config=args, settings=wandb.Settings(start_method='fork'), project="EquivariantHGN_NC", entity='danieltlevy') wandb.watch(net, log='all', log_freq=args.wandb_log_param_freq) print(args) print("Number of parameters: {}".format(count_parameters(net))) run_name = args.dataset + '_' + str(args.run) if args.wandb_log_run and wandb.run.name is not None: run_name = run_name + '_' + str(wandb.run.name) if args.checkpoint_path != '': checkpoint_path = args.checkpoint_path else: checkpoint_path = f"checkpoint/checkpoint_{run_name}.pt" print("Checkpoint Path: " + checkpoint_path) progress = tqdm(range(args.epoch), desc="Epoch 0", position=0, leave=True) # training loop net.train() val_micro_best = 0 for epoch in progress: # training net.train() optimizer.zero_grad() logits = net(data, data_embedding).squeeze() logp = regr_fcn(logits, args.multi_label) train_loss = loss_fcn(logp[train_idx], labels[train_idx], args.multi_label) train_loss.backward() optimizer.step() if args.multi_label: train_micro, train_macro = f1_scores_multi( logits[train_idx], dl.labels_train['data'][train_idx]) else: train_micro, train_macro = f1_scores(logits[train_idx], labels[train_idx]) with torch.no_grad(): progress.set_description(f"Epoch {epoch}") progress.set_postfix(loss=train_loss.item(), micr=train_micro) wandb_log = { 'Train Loss': train_loss.item(), 'Train Micro': train_micro, 'Train Macro': train_macro } if epoch % args.val_every == 0: # validation net.eval() logits = net(data, data_embedding).squeeze() logp = regr_fcn(logits, args.multi_label) val_loss = loss_fcn(logp[val_idx], labels[val_idx], args.multi_label) if args.multi_label: val_micro, val_macro = f1_scores_multi( logits[val_idx], dl.labels_train['data'][val_idx]) else: val_micro, val_macro = f1_scores(logits[val_idx], labels[val_idx]) print("\nVal Loss: {:.3f} Val Micro-F1: {:.3f} \ Val Macro-F1: {:.3f}".format(val_loss, val_micro, val_macro)) wandb_log.update({ 'Val Loss': val_loss.item(), 'Val 
Micro-F1': val_micro, 'Val Macro-F1': val_macro }) if val_micro > val_micro_best: val_micro_best = val_micro print("New best, saving") torch.save( { 'epoch': epoch, 'net_state_dict': net.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'train_loss': train_loss.item(), 'train_micro': train_micro, 'train_macro': train_macro, 'val_loss': val_loss.item(), 'val_micro': val_micro, 'val_macro': val_macro }, checkpoint_path) if args.wandb_log_run: wandb.run.summary["val_micro_best"] = val_micro wandb.run.summary["val_macro_best"] = val_macro wandb.run.summary["val_loss_best"] = val_loss.item() wandb.run.summary["epoch_best"] = epoch wandb.run.summary["train_loss_best"] = train_loss.item( ) wandb.run.summary['train_micro_best'] = train_micro wandb.run.summary['train_macro_best'] = train_macro wandb.save(checkpoint_path) if epoch % args.wandb_log_loss_freq == 0: if args.wandb_log_run: wandb.log(wandb_log, step=epoch) # testing with evaluate_results_nc if args.evaluate: checkpoint = torch.load(checkpoint_path) net.load_state_dict(checkpoint['net_state_dict']) net.eval() test_logits = [] with torch.no_grad(): logits = net(data, data_embedding).squeeze() test_logits = logits[test_idx] if args.multi_label: pred = (test_logits.cpu().numpy() > 0).astype(int) else: pred = test_logits.cpu().numpy().argmax(axis=1) onehot = np.eye(num_classes, dtype=np.int32) file_path = f"test_out/{run_name}.txt" dl.gen_file_for_evaluate(test_idx=test_idx, label=pred, file_path=file_path, multi_label=args.multi_label) if not args.multi_label: pred = onehot[pred] print(dl.evaluate(pred))
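# regr_fcn, loss_fcn and the F1 helpers used in the loop above are imported from elsewhere in
# the repository; a plausible sketch of their single-label vs. multi-label split is given below
# (assumed implementations, the real ones may differ).
import torch
import torch.nn.functional as F


def regr_fcn(logits, multi_label):
    # Multi-label targets get independent sigmoids; single-label targets get log-softmax.
    return torch.sigmoid(logits) if multi_label else F.log_softmax(logits, dim=1)


def loss_fcn(logp, labels, multi_label):
    # BCE over per-class probabilities vs. NLL over log-probabilities.
    return F.binary_cross_entropy(logp, labels) if multi_label else F.nll_loss(logp, labels)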
def __init__( self, image_shape, output_size, n_atoms, dueling, jumps, spr, augmentation, target_augmentation, eval_augmentation, dynamics_blocks, norm_type, noisy_nets, aug_prob, classifier, imagesize, time_offset, local_spr, global_spr, momentum_encoder, shared_encoder, distributional, dqn_hidden_size, momentum_tau, renormalize, renormalize_type, q_l1_type, dropout, final_classifier, model_rl, noisy_nets_std, residual_tm, pred_hidden_ratio, encoder_type, transition_type, conv_proj_channel, proj_hidden_size, gru_input_size, gru_proj_size, ln_ratio, use_maxpool=False, channels=None, # None uses default. kernel_sizes=None, strides=None, paddings=None, framestack=4, ): """Instantiates the neural network according to arguments; network defaults stored within this method.""" super().__init__() self.noisy = noisy_nets self.time_offset = time_offset self.aug_prob = aug_prob self.classifier_type = classifier self.distributional = distributional n_atoms = 1 if not self.distributional else n_atoms self.dqn_hidden_size = dqn_hidden_size self.transforms = [] self.eval_transforms = [] self.uses_augmentation = False for aug in augmentation: if aug == "affine": transformation = RandomAffine(5, (.14, .14), (.9, 1.1), (-5, 5)) eval_transformation = nn.Identity() self.uses_augmentation = True elif aug == "crop": transformation = RandomCrop((84, 84)) # Crashes if aug-prob not 1: use CenterCrop((84, 84)) or Resize((84, 84)) in that case. eval_transformation = CenterCrop((84, 84)) self.uses_augmentation = True imagesize = 84 elif aug == "rrc": transformation = RandomResizedCrop((100, 100), (0.8, 1)) eval_transformation = nn.Identity() self.uses_augmentation = True elif aug == "blur": transformation = GaussianBlur2d((5, 5), (1.5, 1.5)) eval_transformation = nn.Identity() self.uses_augmentation = True elif aug == "shift": transformation = nn.Sequential(nn.ReplicationPad2d(4), RandomCrop((84, 84))) eval_transformation = nn.Identity() elif aug == "intensity": transformation = Intensity(scale=0.05) eval_transformation = nn.Identity() elif aug == "none": transformation = eval_transformation = nn.Identity() else: raise NotImplementedError() self.transforms.append(transformation) self.eval_transforms.append(eval_transformation) self.dueling = dueling f, c = image_shape[:2] in_channels = np.prod(image_shape[:2]) if encoder_type == 'conv2d': self.conv = Conv2dModel( in_channels=in_channels, channels=[32, 64, 64], kernel_sizes=[8, 4, 3], strides=[4, 2, 1], paddings=[0, 0, 0], use_maxpool=False, dropout=dropout, conv_proj_channel=conv_proj_channel, ) elif encoder_type == 'resnet18': self.conv = resnet18() else: raise NotImplementedError fake_input = torch.zeros(1, f*c, imagesize, imagesize) fake_output = self.conv(fake_input) self.hidden_size = fake_output.shape[1] self.pixels = fake_output.shape[-1]*fake_output.shape[-2] print("Spatial latent size is {}".format(fake_output.shape[1:])) if proj_hidden_size: self.conv_proj = nn.Sequential( nn.Flatten(1, -1), nn.Linear(self.hidden_size * self.pixels, proj_hidden_size), nn.LayerNorm(proj_hidden_size), nn.ReLU(), nn.Dropout(dropout), ) else: self.conv_proj = nn.Identity() self.jumps = jumps self.model_rl = model_rl self.use_spr = spr self.target_augmentation = target_augmentation self.eval_augmentation = eval_augmentation self.num_actions = output_size self.transition_type = transition_type if dueling: self.head = DQNDistributionalDuelingHeadModel(self.hidden_size, output_size, hidden_size=self.dqn_hidden_size, pixels=self.pixels, noisy=self.noisy, n_atoms=n_atoms, 
std_init=noisy_nets_std, proj_hidden_size=proj_hidden_size) else: self.head = DQNDistributionalHeadModel(self.hidden_size, output_size, hidden_size=self.dqn_hidden_size, pixels=self.pixels, noisy=self.noisy, n_atoms=n_atoms, std_init=noisy_nets_std) if self.jumps > 0: repr_size = proj_hidden_size if proj_hidden_size else (self.pixels * self.hidden_size) if transition_type == 'gru': self.dynamics_model = GRUModel( input_size = gru_input_size, repr_size = repr_size, proj_size = gru_proj_size, num_layers = 1, num_actions = self.num_actions, renormalize=renormalize, renormalize_type=renormalize_type, dropout=dropout ) else: self.dynamics_model = TransitionModel(channels=self.hidden_size, num_actions=output_size, pixels=self.pixels, hidden_size=self.hidden_size, limit=1, blocks=dynamics_blocks, norm_type=norm_type, renormalize=renormalize, residual=residual_tm) else: self.dynamics_model = nn.Identity() self.renormalize = renormalize self.renormalize_type = renormalize_type self.ln_ratio = ln_ratio if renormalize_type == 'train_ln': self.renormalize_ln = nn.LayerNorm(repr_size) else: self.renormalize_ln = nn.Identity() if self.use_spr: self.local_spr = local_spr self.global_spr = global_spr self.momentum_encoder = momentum_encoder self.momentum_tau = momentum_tau self.shared_encoder = shared_encoder assert not (self.shared_encoder and self.momentum_encoder) # in case someone tries something silly like --local-spr 2 self.num_sprs = int(bool(self.local_spr)) + \ int(bool(self.global_spr)) if self.local_spr: self.local_final_classifier = nn.Identity() if self.classifier_type == "mlp": self.local_classifier = nn.Sequential(nn.Linear(self.hidden_size, self.hidden_size), nn.BatchNorm1d(self.hidden_size), nn.ReLU(), nn.Linear(self.hidden_size, self.hidden_size)) elif self.classifier_type == "bilinear": self.local_classifier = nn.Linear(self.hidden_size, self.hidden_size) elif self.classifier_type == "none": self.local_classifier = nn.Identity() if final_classifier == "mlp": self.local_final_classifier = nn.Sequential(nn.Linear(self.hidden_size, 2*self.hidden_size), nn.BatchNorm1d(2*self.hidden_size), nn.ReLU(), nn.Linear(2*self.hidden_size, self.hidden_size)) elif final_classifier == "linear": self.local_final_classifier = nn.Linear(self.hidden_size, self.hidden_size) else: self.local_final_classifier = nn.Identity() self.local_target_classifier = self.local_classifier else: self.local_classifier = self.local_target_classifier = nn.Identity() if self.global_spr: self.global_final_classifier = nn.Identity() if self.classifier_type == "mlp": self.global_classifier = nn.Sequential( nn.Flatten(-3, -1), nn.Linear(self.pixels*self.hidden_size, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Linear(512, 256) ) self.global_target_classifier = self.global_classifier global_spr_size = 256 elif self.classifier_type == "q_l1": self.global_classifier = QL1Head(self.head, dueling=dueling, type=q_l1_type) global_spr_size = self.global_classifier.out_features self.global_target_classifier = self.global_classifier elif self.classifier_type == "q_l2": self.global_classifier = nn.Sequential(self.head, nn.Flatten(-2, -1)) self.global_target_classifier = self.global_classifier global_spr_size = 256 elif self.classifier_type == "bilinear": self.global_classifier = nn.Sequential(nn.Flatten(-3, -1), nn.Linear(self.hidden_size*self.pixels, self.hidden_size*self.pixels)) self.global_target_classifier = nn.Flatten(-3, -1) elif self.classifier_type == "none": self.global_classifier = nn.Flatten(-3, -1) self.global_target_classifier = 
nn.Flatten(-3, -1) global_spr_size = self.hidden_size*self.pixels if final_classifier == "mlp": global_final_hidden_size = int(global_spr_size * pred_hidden_ratio) self.global_final_classifier = nn.Sequential( nn.Linear(global_spr_size, global_final_hidden_size), nn.BatchNorm1d(global_final_hidden_size), nn.ReLU(), nn.Linear(global_final_hidden_size, global_spr_size) ) elif final_classifier == "linear": self.global_final_classifier = nn.Sequential( nn.Linear(global_spr_size, global_spr_size), ) elif final_classifier == "none": self.global_final_classifier = nn.Identity() else: self.global_classifier = self.global_target_classifier = nn.Identity() if self.momentum_encoder: self.target_encoder = copy.deepcopy(self.conv) self.target_encoder_proj = copy.deepcopy(self.conv_proj) self.target_renormalize_ln = copy.deepcopy(self.renormalize_ln) self.global_target_classifier = copy.deepcopy(self.global_target_classifier) self.local_target_classifier = copy.deepcopy(self.local_target_classifier) for param in (list(self.target_encoder.parameters()) + list(self.target_encoder_proj.parameters()) + list(self.target_renormalize_ln.parameters()) + list(self.global_target_classifier.parameters()) + list(self.local_target_classifier.parameters())): param.requires_grad = False elif not self.shared_encoder: # Use a separate target encoder on the last frame only. self.global_target_classifier = copy.deepcopy(self.global_target_classifier) self.local_target_classifier = copy.deepcopy(self.local_target_classifier) if self.stack_actions: input_size = c - 1 else: input_size = c self.target_encoder = Conv2dModel(in_channels=input_size, channels=[32, 64, 64], kernel_sizes=[8, 4, 3], strides=[4, 2, 1], paddings=[0, 0, 0], use_maxpool=False, ) elif self.shared_encoder: self.target_encoder = self.conv print("Initialized model with {} parameters".format(count_parameters(self)))
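# momentum_tau is stored for the momentum (EMA) update of the frozen target encoder, which
# happens outside this constructor; a generic sketch of that update is given below (the exact
# interpolation convention used by the original code may differ).
import torch


@torch.no_grad()
def update_target_network(online, target, tau):
    # Slowly move the target parameters toward the online parameters.
    for p_online, p_target in zip(online.parameters(), target.parameters()):
        p_target.data.mul_(1.0 - tau).add_(tau * p_online.data)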
def main(): parser = argparse.ArgumentParser() parser.add_argument("--task_name", default=None, type=str, help="The name of the task for training.") parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") parser.add_argument("--bert_model", default="bert-base-uncased", type=str, help="student bert model configuration folder") parser.add_argument("--encoder_checkpoint", default=None, type=str, help="check point for student encoder") parser.add_argument("--cls_checkpoint", default=None, type=str, help="check point for student classifier") parser.add_argument("--alpha", default=0.95, type=float, help="alpha for distillation") parser.add_argument("--T", default=10., type=float, help="temperature for distillation") parser.add_argument("--beta", default=0.0, type=float, help="weight for AT loss") parser.add_argument("--fc_layer_idx", default=None, type=str, help="layers ids we will put FC layers on") parser.add_argument("--normalize_patience", default=False, help="normalize patience or not") parser.add_argument("--do_train", action='store_true', help="do training or not") parser.add_argument("--do_eval", action='store_true', help="do evaluation during training or not") parser.add_argument("--train_type", default="finetune_teacher", choices=["finetune_teacher","train_student"], help="choose which to train") parser.add_argument("--log_every_step", default=50, type=int, help="output to log every global x training steps, default is 1") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. 
\n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform.") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--logging_steps', type=int, default=1000, help="Log every X updates steps.") parser.add_argument('--student_hidden_layers', type=int, default=12, help="number of transformer layers for student, default is None (use all layers)") parser.add_argument('--teacher_prediction', type=str, default=None, help="teacher prediction file to guild the student's output") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") args = parser.parse_args() args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps logger.info('actual batch size on all GPU = %d' % args.train_batch_size) if args.train_type == 'finetune_teacher': args.student_hidden_layers = 12 if 'base' in args.bert_model else 24 args.alpha = 0.0 # alpha = 0 is equivalent to fine-tuning for KD elif args.train_type == "train_student": args.student_hidden_layers = 6 args.kd_model = "kd.cls" args.alpha = 0.7 args.beta = 500 args.T = 10 args.fc_layer_idx = "1,3,5,7,9" # this for pkd-skip args.normalize_patience = True else: raise ValueError("please pick train_type from finetune_teacher,train_student") if args.encoder_checkpoint is None: args.encoder_checkpoint = os.path.join(args.bert_model, 'pytorch_model.bin') logger.info('encoder checkpoint not provided, use pre-trained at %s instead' % args.encoder_checkpoint) if args.do_train: # Create output directory if needed if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( args.output_dir)) args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.n_gpu = torch.cuda.device_count() #args.n_gpu = 1 logger.info("device: {} n_gpu: {}".format(args.device, args.n_gpu)) # set seed set_seed(args) # prepare task args.task_name = args.task_name.lower() if args.task_name not in processors: raise ValueError("Task not found: %s" % (args.task_name)) processor = processors[args.task_name]() args.output_mode = output_modes[args.task_name] label_list = processor.get_labels() args.num_labels = len(label_list) # prepare tokenizer and model config = BertConfig() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True) config.output_hidden_states = True encoder = BertForSequenceClassificationEncoder(config, num_hidden_layers=args.student_hidden_layers) classifier = FCClassifierForSequenceClassification(config, args.num_labels, config.hidden_size, 0) n_student_layer = len(encoder.bert.encoder.layer) encoder = load_model(encoder, args.encoder_checkpoint, args, 'student', verbose=True) logger.info('*' * 77) classifier = load_model(classifier, args.cls_checkpoint, args, 'classifier', verbose=True) n_param_student = count_parameters(encoder) + count_parameters(classifier) logger.info('number of layers in student model = %d' % n_student_layer) logger.info('num parameters in student model are %d' % n_param_student) # Training if args.do_train: read_set = 'train' if args.train_type == "train_student": assert args.teacher_prediction is not None assert args.alpha > 0 logger.info('loading teacher\'s predictoin') teacher_predictions = pickle.load(open(args.teacher_prediction, 'rb'))['train'] if args.teacher_prediction is not None else None logger.info('teacher acc = %.2f, teacher loss = %.5f' % ( teacher_predictions['acc'] * 100, teacher_predictions['loss'])) train_examples, train_dataloader, _ = get_task_dataloader(args, read_set, tokenizer, SequentialSampler, batch_size=args.train_batch_size, knowledge=teacher_predictions['pred_logit'], extra_knowledge=teacher_predictions[ 'feature_maps']) else: assert args.alpha == 0 logger.info("runing teacher fine-tuning") train_examples, train_dataloader, _ = get_task_dataloader(args, read_set, tokenizer, SequentialSampler, batch_size=args.train_batch_size) global_step, tr_loss = train(args, train_dataloader, encoder, classifier, tokenizer) ################# # information of teacher model (like [CLS]) ################# if args.train_type == "finetune_teacher": all_res = {'train': None} encoder_file = os.path.join(args.output_dir,f'{args.train_type}_epoch{args.num_train_epochs-1}.encoder.pkl') cls_file = os.path.join(args.output_dir,f'{args.train_type}_epoch{args.num_train_epochs-1}.cls.pkl') print("encoder_file") encoder = BertForSequenceClassificationEncoder(config, num_hidden_layers=args.student_hidden_layers) classifier = FCClassifierForSequenceClassification(config, args.num_labels, config.hidden_size, 0) encoder = load_model(encoder, encoder_file, args, 'exact', verbose=True) classifier = load_model(classifier, cls_file, args, 'exact', verbose=True) train_res = eval_model_dataloader(encoder, classifier, train_dataloader, args.device, detailed=True, verbose=False) all_res['train'] = train_res logger.info('saving teacher results') fname = os.path.join(args.output_dir, args.task_name + f'_teacher_{args.student_hidden_layers}layer_information.pkl') with open(fname, 'wb') as fp: pickle.dump(all_res, fp) logger.info(" global_step = %s, average loss = %s", 
global_step, tr_loss) # Evaluation if args.do_eval: test_examples, test_dataloader, test_label_ids = get_task_dataloader(args, 'dev', tokenizer, SequentialSampler, batch_size=args.eval_batch_size) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) result = evaluate(args, test_label_ids, encoder,classifier,test_dataloader) output_test_file = os.path.join(args.output_dir, "test_results_" + '.txt') with open(output_test_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) return
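# The --alpha, --T and --beta arguments parameterize the distillation objective computed inside
# train(). A simplified sketch of the standard soft-label KD term that alpha and T typically
# control is given below; the patience/attention term weighted by beta is omitted, and this is
# an assumption about the loss form rather than a copy of the project's implementation.
import torch.nn.functional as F


def kd_loss(student_logits, teacher_logits, labels, alpha, T):
    # Hard-label cross-entropy against the ground truth.
    ce = F.cross_entropy(student_logits, labels)
    # Soft-label KL divergence between temperature-scaled teacher and student distributions.
    kd = F.kl_div(F.log_softmax(student_logits / T, dim=-1),
                  F.softmax(teacher_logits / T, dim=-1),
                  reduction='batchmean') * (T * T)
    return (1.0 - alpha) * ce + alpha * kd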
def main(): parser = argparse.ArgumentParser() parser.add_argument('-config', type=str, default='config/aishell.yaml') parser.add_argument('-log', type=str, default='train.log') parser.add_argument('-mode', type=str, default='retrain') opt = parser.parse_args() configfile = open(opt.config) config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader)) exp_name = os.path.join('egs', config.data.name, 'exp', config.model.type, config.training.save_model) if not os.path.isdir(exp_name): os.makedirs(exp_name) logger = init_logger(os.path.join(exp_name, opt.log)) if opt.mode != 'continue': shutil.copyfile(opt.config, os.path.join(exp_name, 'config.yaml')) logger.info('Save config info.') os.environ["CUDA_VISIBLE_DEVICES"] = config.training.gpus config.training.num_gpu = num_gpus(config.training.gpus) num_workers = 6 * (config.training.num_gpu if config.training.num_gpu > 0 else 1) batch_size = config.data.batch_size * config.training.num_gpu if config.training.num_gpu > 0 else config.data.batch_size logger.info('batch_size from:' + str(batch_size) + " to =>:" + str(batch_size * config.training.accumulation_steps)) train_dataset = AudioDataset(config.data, 'train') train_sampler = Batch_RandomSampler(len(train_dataset), batch_size=batch_size, shuffle=config.data.shuffle) training_data = AudioDataLoader(dataset=train_dataset, num_workers=num_workers, batch_sampler=train_sampler) logger.info('Load Train Set!') dev_dataset = AudioDataset(config.data, 'dev') dev_sampler = Batch_RandomSampler(len(dev_dataset), batch_size=batch_size, shuffle=config.data.shuffle) validate_data = AudioDataLoader(dataset=dev_dataset, num_workers=num_workers, batch_sampler=dev_sampler) logger.info('Load Dev Set!') if config.training.num_gpu > 0: torch.cuda.manual_seed(config.training.seed) torch.backends.cudnn.deterministic = True else: torch.manual_seed(config.training.seed) logger.info('Set random seed: %d' % config.training.seed) if config.model.type == "transducer": model = Transducer(config.model) elif config.model.type == "ctc": model = CTC(config.model) else: raise NotImplementedError if config.training.load_model: if config.training.num_gpu == 0: checkpoint = torch.load(config.training.load_model, map_location='cpu') else: checkpoint = torch.load(config.training.load_model) logger.info("load_checkpoint:" + str(checkpoint.keys())) load_model(model, checkpoint) logger.info('Loaded model from %s' % config.training.new_model) if config.training.load_encoder or config.training.load_decoder: if config.training.load_encoder: checkpoint = torch.load(config.training.load_encoder) logger.info("load_checkpoint:" + str(checkpoint.keys())) model.encoder.load_state_dict(checkpoint['encoder']) logger.info('Loaded encoder from %s' % config.training.load_encoder) if config.training.load_decoder: checkpoint = torch.load(config.training.load_decoder) logger.info("load_checkpoint:" + str(checkpoint.keys())) model.decoder.load_state_dict(checkpoint['decoder']) logger.info('Loaded decoder from %s' % config.training.load_decoder) if config.training.num_gpu > 0: model = model.cuda() if config.training.num_gpu > 1: # dist.init_process_group(backend='nccl', world_size=4, rank=1) device_ids = list(range(config.training.num_gpu)) model = torch.nn.DataParallel(model, device_ids=device_ids) logger.info('Loaded the model to %d GPUs' % config.training.num_gpu) n_params, enc, dec = count_parameters(model) logger.info('# the number of parameters in the whole model: %d' % n_params) logger.info('# the number of parameters in the Encoder: %d' % 
enc) logger.info('# the number of parameters in the Decoder: %d' % dec) logger.info('# the number of parameters in the JointNet: %d' % (n_params - dec - enc)) optimizer = Optimizer(model.parameters(), config.optim) logger.info('Created a %s optimizer.' % config.optim.type) if opt.mode == 'continue': if not config.training.load_model: raise Exception( "if mode is 'continue', need 'config.training.load_model'") optimizer.load_state_dict(checkpoint['optimizer']) start_epoch = checkpoint['epoch'] logger.info('Load Optimizer State!') else: start_epoch = 0 # create a visualizer if config.training.visualization: visualizer = SummaryWriter(os.path.join(exp_name, 'log')) logger.info('Created a visualizer.') # visualizer.add_graph(model) #fix bug else: visualizer = None logger.info(model) for epoch in range(start_epoch, config.training.epochs): train(epoch, config, model, training_data, optimizer, logger, visualizer) save_name = os.path.join( exp_name, '%s.epoch%d.chkpt' % (config.training.save_model, epoch)) save_model(model, optimizer, config, save_name) logger.info('Epoch %d model has been saved.' % epoch) if config.training.eval_or_not: _ = eval(epoch, config, model, validate_data, logger, visualizer) if epoch >= config.optim.begin_to_adjust_lr: optimizer.decay_lr() # early stop if optimizer.lr < 5e-7: logger.info('The learning rate is too low to train.') break logger.info('Epoch %d update learning rate: %.6f' % (epoch, optimizer.lr)) logger.info('The training process is OVER!')
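# Unlike the single-value count_parameters helpers used elsewhere in this collection, the call
# above unpacks a (total, encoder, decoder) triple. A sketch consistent with that usage is given
# below (assumed implementation; the model may be wrapped in DataParallel when it is called).
def count_parameters(model):
    net = model.module if hasattr(model, 'module') else model
    n_total = sum(p.numel() for p in net.parameters())
    n_enc = sum(p.numel() for p in net.encoder.parameters())
    n_dec = sum(p.numel() for p in net.decoder.parameters())
    return n_total, n_enc, n_dec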
lr=args.lr, weight_decay=args.weight_decay) strTitle = args.data + '_' + sStartTime + '_alph{:}_{:}_{:}_{:}_{:}_{:}_m{:}'.format( int(alph[0]), int(alph[1]), int(alph[2]), int(alph[3]), int(alph[4]), int(alph[5]), m) logger.info(net) logger.info("--------------------------------------------------") logger.info(prob) logger.info("--------------------------------------------------") logger.info("DIMENSION={:} m={:} nTh={:} alpha={:}".format( d, m, nTh, alph)) logger.info("nt={:} nt_val={:}".format(nt, nt_val)) logger.info("Number of trainable parameters: {}".format( count_parameters(net))) logger.info("--------------------------------------------------") logger.info(str(optim)) # optimizer info logger.info("data={:} device={:}".format(args.data, device)) logger.info("n_train={:}".format(n_train)) logger.info("maxIters={:} val_freq={:} viz_freq={:}".format( args.niters, args.val_freq, args.viz_freq)) logger.info("saveLocation = {:}".format(args.save)) logger.info(strTitle) logger.info("--------------------------------------------------\n") # show Q and W values, but they're already included inside the L value log_msg = ( '{:5s} {:7s} {:6s} {:9s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:9s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s}' .format('iter', 'lr', ' time', 'loss', 'L', 'G', 'HJt', 'HJfin', 'HJgrad', 'Q', 'W', 'valLoss', 'valL', 'valG', 'valHJt',
def run_code():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    # Load Data #
    if args.data == 'PPD':
        dataset = ProtoPlanetaryDisks()
    elif args.data == 'MNIST':
        dataset = MNIST(args.machine)
    else:
        print('Error: Wrong dataset (MNIST, Proto Planetary Disk)...')
        raise
    if len(dataset) == 0:
        print('No items in training set...')
        print('Exiting!')
        sys.exit()
    print('Dataset size: ', len(dataset))

    # data loaders for training and testing
    train_loader, test_loader = dataset.get_dataloader(
        batch_size=args.batch_size, shuffle=True,
        test_split=.2, random_seed=rnd_seed)
    img_dim = dataset[0][0].shape

    wandb.config.physics_dim = len(dataset.phy_names) if args.data == 'PPD' else 0
    print('Physics dimension: ', wandb.config.physics_dim)

    # Define AE model, Ops, and Train #
    model = ConvLin_AutoEncoder(latent_dim=args.latent_dim, img_dim=img_dim[-1])
    wandb.watch(model, log='gradients')
    wandb.config.n_train_params = count_parameters(model)
    print('Summary:')
    print(model)
    print('Num of trainable params: ', wandb.config.n_train_params)
    print('\n')

    # Initialize optimizers
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Learning Rate scheduler
    if args.lr_sch == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)
    elif args.lr_sch == 'exp':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.985)
    elif args.lr_sch == 'cos':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-5)
    elif args.lr_sch == 'plateau':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                         factor=.5, verbose=True)
    else:
        scheduler = None
    print('Optimizer :', optimizer)
    print('LR Scheduler :', scheduler.__class__.__name__)

    # Train model
    print('########################################')
    print('######## Running in %4s #########' % (device))
    print('########################################')
    trainer = Trainer(model, optimizer, args.batch_size, wandb,
                      scheduler=scheduler, print_every=500,
                      device=device, beta=args.beta)

    if args.dry_run:
        print('******** DRY RUN ******** ')
        return

    trainer.train(train_loader, test_loader, args.num_epochs,
                  save=True, early_stop=args.early_stop)