Example #1
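# This snippet is a method of a larger experiment class and assumes module-level
# imports roughly like the following (a sketch; on torchtext >= 0.9 these classes
# moved under torchtext.legacy.data):
import os
import numpy as np
import torch
import torch.nn as nn
import yaml
from torchtext.data import Field, RawField, Dataset, BucketIterator
# ...plus project helpers: tokenize_de, tokenize_en, batch_graph, set_seed,
# get_sentence_lengths, counter2array, count_parameters.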
    def __init__(self, yaml_path):
        with open(yaml_path) as config_file:
            config = yaml.load(config_file, Loader=yaml.FullLoader)
        args = config["training"]
        SEED = args["seed"]
        DATASET = args["dataset"]  # Multi30k or IWSLT
        MODEL = args["model"]  # gru**2, gru_attn**2, transformer, gcn_gru, gcngru_gru, gcngruattn_gru, gcnattn_gru
        REVERSE = args["reverse"]
        BATCH_SIZE = args["batch_size"]
        ENC_EMB_DIM = args["encoder_embed_dim"]
        DEC_EMB_DIM = args["decoder_embed_dim"]
        ENC_HID_DIM = args["encoder_hidden_dim"]
        DEC_HID_DIM = args["decoder_hidden_dim"]
        ENC_DROPOUT = args["encoder_dropout"]
        DEC_DROPOUT = args["decoder_dropout"]
        NLAYERS = args["num_layers"]
        N_EPOCHS = args["num_epochs"]
        CLIP = args["grad_clip"]
        LR = args["lr"]
        LR_DECAY_RATIO = args["lr_decay_ratio"]
        ID = args["id"]
        PATIENCE = args["patience"]
        DIR = 'checkpoints/{}-{}-{}/'.format(DATASET, MODEL, ID)
        MODEL_PATH = DIR
        LOG_PATH = '{}test-log.log'.format(DIR)
        set_seed(SEED)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.config = args
        self.device = device

        if 'transformer' in MODEL:
            ENC_HEADS = args["encoder_heads"]
            DEC_HEADS = args["decoder_heads"]
            ENC_PF_DIM = args["encoder_pf_dim"]
            DEC_PF_DIM = args["decoder_pf_dim"]
            MAX_LEN = args["max_len"]
            
        SRC = Field(tokenize = lambda text: tokenize_de(text, REVERSE), 
                    init_token = '<sos>', 
                    eos_token = '<eos>', 
                    lower = True)
        TGT = Field(tokenize = tokenize_en, 
                    init_token = '<sos>', 
                    eos_token = '<eos>', 
                    lower = True)
        GRH = RawField(postprocessing=batch_graph)
        data_fields = [('src', SRC), ('trg', TGT), ('grh', GRH)]
        
        # Load the dataset named in the config rather than hardcoding Multi30k.
        train_data = Dataset(torch.load(f"data/{DATASET}/train_data.pt"), data_fields)
        valid_data = Dataset(torch.load(f"data/{DATASET}/valid_data.pt"), data_fields)
        test_data = Dataset(torch.load(f"data/{DATASET}/test_data.pt"), data_fields)
        self.train_data, self.valid_data, self.test_data = train_data, valid_data, test_data
        
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, valid_data, test_data), 
            batch_size = BATCH_SIZE, 
            sort_key = lambda x: len(x.src),
            sort_within_batch=False,
            device = device)
        self.train_iterator, self.valid_iterator, self.test_iterator = train_iterator, valid_iterator, test_iterator
        
        SRC.build_vocab(train_data, min_freq = 2)
        TGT.build_vocab(train_data, min_freq = 2)
        self.SRC, self.TGT, self.GRH = SRC, TGT, GRH

        print(f"Number of training examples: {len(train_data.examples)}")
        print(f"Number of validation examples: {len(valid_data.examples)}")
        print(f"Number of testing examples: {len(test_data.examples)}")
        print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
        print(f"Unique tokens in target (en) vocabulary: {len(TGT.vocab)}")

        src_c, tgt_c = get_sentence_lengths(train_data)
        src_lengths = counter2array(src_c)
        tgt_lengths = counter2array(tgt_c)

        print("maximum src, tgt sent lengths: ")
        np.quantile(src_lengths, 1), np.quantile(tgt_lengths, 1)

        # Get models and corresponding training scripts

        INPUT_DIM = len(SRC.vocab)
        OUTPUT_DIM = len(TGT.vocab)
        SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
        TGT_PAD_IDX = TGT.vocab.stoi[TGT.pad_token]
        self.SRC_PAD_IDX = SRC_PAD_IDX
        self.TGT_PAD_IDX = TGT_PAD_IDX

        if MODEL == "gru**2":  # gru**2, gru_attn**2, transformer, gcn_gru
            from models.gru_seq2seq import GRUEncoder, GRUDecoder, Seq2Seq
            enc = GRUEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, ENC_DROPOUT)
            dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT)
            model = Seq2Seq(enc, dec, device).to(device)

            from src.train import train_epoch_gru, evaluate_gru, epoch_time
            train_epoch = train_epoch_gru
            evaluate = evaluate_gru
            
            self.enc, self.dec, self.model, self.train_epoch, self.evaluate = enc, dec, model, train_epoch, evaluate
            
        elif MODEL == "gru_attn**2":
            from models.gru_attn import GRUEncoder, GRUDecoder, Seq2Seq, Attention
            attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
            enc = GRUEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, ENC_DROPOUT)
            dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT, attn)
            model = Seq2Seq(enc, dec, device).to(device)

            from src.train import train_epoch_gru_attn, evaluate_gru_attn, epoch_time
            train_epoch = train_epoch_gru_attn
            evaluate = evaluate_gru_attn
            
            self.enc, self.dec, self.model, self.train_epoch, self.evaluate, self.attn = enc, dec, model, train_epoch, evaluate, attn

        elif MODEL == "transformer":
            from models.transformer import Encoder, Decoder, Seq2Seq
            enc = Encoder(INPUT_DIM, ENC_HID_DIM, NLAYERS, ENC_HEADS, 
                          ENC_PF_DIM, ENC_DROPOUT, device, MAX_LEN)
            dec = Decoder(OUTPUT_DIM, DEC_HID_DIM, NLAYERS, DEC_HEADS, 
                          DEC_PF_DIM, DEC_DROPOUT, device, MAX_LEN)
            model = Seq2Seq(enc, dec, SRC_PAD_IDX, TGT_PAD_IDX, device).to(device)

            from src.train import train_epoch_tfmr, evaluate_tfmr, epoch_time
            train_epoch = train_epoch_tfmr
            evaluate = evaluate_tfmr

            self.enc, self.dec, self.model, self.train_epoch, self.evaluate = enc, dec, model, train_epoch, evaluate
            
        elif MODEL == "gcn_gru":
            from models.gru_seq2seq import GCNEncoder, GRUDecoder, GCN2Seq
            enc = GCNEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, NLAYERS, ENC_DROPOUT)
            dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT)
            model = GCN2Seq(enc, dec, device).to(device)

            from src.train import train_epoch_gcn_gru, evaluate_gcn_gru, epoch_time
            train_epoch = train_epoch_gcn_gru
            evaluate = evaluate_gcn_gru

            self.enc, self.dec, self.model, self.train_epoch, self.evaluate = enc, dec, model, train_epoch, evaluate
            
        elif MODEL == "gcngru_gru":
            from models.gru_seq2seq import GCNGRUEncoder, GRUDecoder, GCN2Seq
            enc = GCNGRUEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, ENC_DROPOUT, device)
            dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT)
            model = GCN2Seq(enc, dec, device).to(device)

            from src.train import train_epoch_gcn_gru, evaluate_gcn_gru, epoch_time
            train_epoch = train_epoch_gcn_gru
            evaluate = evaluate_gcn_gru

            self.enc, self.dec, self.model, self.train_epoch, self.evaluate = enc, dec, model, train_epoch, evaluate
            
        elif MODEL == "gcnattn_gru":
            from models.gru_attn import GCNEncoder, GRUDecoder, GCN2Seq, Attention
            attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
            enc = GCNEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, ENC_DROPOUT)
            dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT, attn)
            model = GCN2Seq(enc, dec, device).to(device)

            from src.train import train_epoch_gcnattn_gru, evaluate_gcnattn_gru, epoch_time
            train_epoch = train_epoch_gcnattn_gru
            evaluate = evaluate_gcnattn_gru
            
            self.enc, self.dec, self.model, self.train_epoch, self.evaluate, self.attn = enc, dec, model, train_epoch, evaluate, attn

        elif MODEL == "gcngruattn_gru":
            from models.gru_attn import GCNGRUEncoder, GRUDecoder, GCN2Seq, Attention
            attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
            enc = GCNGRUEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, ENC_DROPOUT, device)
            dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT, attn)
            model = GCN2Seq(enc, dec, device).to(device)

            from src.train import train_epoch_gcnattn_gru, evaluate_gcnattn_gru, epoch_time
            train_epoch = train_epoch_gcnattn_gru
            evaluate = evaluate_gcnattn_gru
            
            self.enc, self.dec, self.model, self.train_epoch, self.evaluate, self.attn = enc, dec, model, train_epoch, evaluate, attn

        else:
            raise ValueError("Wrong model choice")

        if 'gcn' in MODEL:
            from src.utils import init_weights_uniform as init_weights
        else: 
            from src.utils import init_weights_xavier as init_weights

        model.apply(init_weights)
        n_params = count_parameters(model)
        print("Model initialized...{} params".format(n_params))
        
        self.criterion = nn.CrossEntropyLoss(ignore_index=TGT_PAD_IDX)
        
        print(os.path.join(MODEL_PATH, "checkpoint.pt"))
        state_dict = torch.load(os.path.join(MODEL_PATH, "checkpoint.pt"), map_location=device)
        if 'model_state_dict' in state_dict:
            state_dict = state_dict['model_state_dict']
        model.load_state_dict(state_dict)
        self.model = model
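All of the examples in this listing call a count_parameters helper. The per-project definitions may vary, but the usual PyTorch idiom is a one-liner (a sketch, not any of these repos' verbatim code):

def count_parameters(model):
    # Sum the element counts of all trainable parameters.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)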
Example #2
def run_model(args):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    use_equiv = args.decoder == 'equiv'

    # Collect data and schema
    schema, data_original, dl = load_data(args.dataset,
                                          use_edge_data=args.use_edge_data,
                                          use_other_edges=args.use_other_edges,
                                          use_node_attrs=args.use_node_attrs,
                                          node_val=args.node_val)
    data, in_dims = select_features(data_original, schema, args.feats_type)
    data = data.to(device)

    # Precompute data indices
    indices_identity, indices_transpose = data.calculate_indices()
    # Get target relations and create data structure for embeddings
    target_rel_ids = dl.links_test['data'].keys()
    target_rels = [schema.relations[rel_id] for rel_id in target_rel_ids]
    target_ents = schema.entities
    # Get relations used by decoder
    if use_equiv:
        output_rels = schema.relations
    else:
        output_rels = {rel.id: rel for rel in target_rels}
    data_embedding = SparseMatrixData.make_entity_embeddings(
        target_ents, args.embedding_dim)
    data_embedding.to(device)

    # Get training and validation positive samples now
    train_pos_heads, train_pos_tails = dict(), dict()
    val_pos_heads, val_pos_tails = dict(), dict()
    for target_rel_id in target_rel_ids:
        train_val_pos = get_train_valid_pos(dl, target_rel_id)
        train_pos_heads[target_rel_id], train_pos_tails[target_rel_id], \
            val_pos_heads[target_rel_id], val_pos_tails[target_rel_id] = train_val_pos

    # Get additional indices to be used when making predictions
    pred_idx_matrices = {}
    for target_rel in target_rels:
        if args.pred_indices == 'train':
            train_neg_head, train_neg_tail = get_train_neg(
                dl, target_rel.id, tail_weighted=args.tail_weighted)
            pred_idx_matrices[target_rel.id] = make_target_matrix(
                target_rel, train_pos_heads[target_rel.id],
                train_pos_tails[target_rel.id], train_neg_head, train_neg_tail,
                device)
        elif args.pred_indices == 'train_neg':
            # Get negative samples twice
            train_neg_head1, train_neg_tail1 = get_train_neg(
                dl, target_rel.id, tail_weighted=args.tail_weighted)
            train_neg_head2, train_neg_tail2 = get_train_neg(
                dl, target_rel.id, tail_weighted=args.tail_weighted)
            pred_idx_matrices[target_rel.id] = make_target_matrix(
                target_rel, train_neg_head1, train_neg_tail1, train_neg_head2,
                train_neg_tail2, device)
        elif args.pred_indices == 'none':
            pred_idx_matrices[target_rel.id] = None

    # Create network and optimizer
    net = EquivLinkPredictor(schema,
                             in_dims,
                             layers=args.layers,
                             embedding_dim=args.embedding_dim,
                             embedding_entities=target_ents,
                             output_rels=output_rels,
                             activation=getattr(nn, args.act_fn)(),  # safer than eval('nn.%s()')
                             final_activation=nn.Identity(),
                             dropout=args.dropout,
                             pool_op=args.pool_op,
                             norm_affine=args.norm_affine,
                             norm_embed=args.norm_embed,
                             in_fc_layer=args.in_fc_layer,
                             decode=args.decoder)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # Set up logging and checkpointing
    if args.wandb_log_run:
        wandb.init(config=args,
                   settings=wandb.Settings(start_method='fork'),
                   project="EquivariantHGN_LP",
                   entity='danieltlevy')
        wandb.watch(net, log='all', log_freq=args.wandb_log_param_freq)
    print(args)
    print("Number of parameters: {}".format(count_parameters(net)))
    run_name = args.dataset + '_' + str(args.run)
    if args.wandb_log_run and wandb.run.name is not None:
        run_name = run_name + '_' + str(wandb.run.name)
    if args.checkpoint_path != '':
        checkpoint_path = args.checkpoint_path
    else:
        checkpoint_path = f"checkpoint/checkpoint_{run_name}.pt"
    print("Checkpoint Path: " + checkpoint_path)
    val_metric_best = -1e10

    # training
    loss_func = nn.BCELoss()
    progress = tqdm(range(args.epoch), desc="Epoch 0", position=0, leave=True)
    for epoch in progress:
        net.train()
        # Make target matrix and labels to train on
        if use_equiv:
            # Target is same as input
            target_schema = schema
            data_target = data.clone()
        else:
            # Target is just target relation
            target_schema = DataSchema(schema.entities, target_rels)
            data_target = SparseMatrixData(target_schema)
        labels_train = torch.Tensor([]).to(device)
        for target_rel in target_rels:
            train_neg_head, train_neg_tail = get_train_neg(
                dl, target_rel.id, tail_weighted=args.tail_weighted)
            train_matrix = make_target_matrix(target_rel,
                                              train_pos_heads[target_rel.id],
                                              train_pos_tails[target_rel.id],
                                              train_neg_head, train_neg_tail,
                                              device)
            data_target[target_rel.id] = train_matrix
            labels_train_rel = train_matrix.values.squeeze()
            labels_train = torch.cat([labels_train, labels_train_rel])

        # Make prediction
        if use_equiv:
            idx_id_tgt, idx_trans_tgt = data_target.calculate_indices()
            output_data = net(data, indices_identity, indices_transpose,
                              data_embedding, data_target, idx_id_tgt,
                              idx_trans_tgt)
        else:
            output_data = net(data, indices_identity, indices_transpose,
                              data_embedding, data_target)
        logits_combined = torch.Tensor([]).to(device)
        for target_rel in target_rels:
            logits_rel = output_data[target_rel.id].values.squeeze()
            logits_combined = torch.cat([logits_combined, logits_rel])

        logp = torch.sigmoid(logits_combined)
        train_loss = loss_func(logp, labels_train)

        # autograd
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Update logging
        progress.set_description(f"Epoch {epoch}")
        progress.set_postfix(loss=train_loss.item())
        wandb_log = {'Train Loss': train_loss.item(), 'epoch': epoch}

        # Evaluate on validation set
        if epoch % args.val_every == 0:
            net.eval()
            with torch.no_grad():
                left = torch.Tensor([]).to(device)
                right = torch.Tensor([]).to(device)
                labels_val = torch.Tensor([]).to(device)
                valid_masks = {}
                for target_rel in target_rels:
                    if args.val_neg == '2hop':
                        valid_neg_head, valid_neg_tail = get_valid_neg_2hop(
                            dl, target_rel.id)
                    elif args.val_neg == 'randomtw':
                        valid_neg_head, valid_neg_tail = get_valid_neg(
                            dl, target_rel.id, tail_weighted=True)
                    else:
                        valid_neg_head, valid_neg_tail = get_valid_neg(
                            dl, target_rel.id)
                    valid_matrix_full = make_target_matrix(
                        target_rel, val_pos_heads[target_rel.id],
                        val_pos_tails[target_rel.id], valid_neg_head,
                        valid_neg_tail, device)
                    valid_matrix, left_rel, right_rel, labels_val_rel = coalesce_matrix(
                        valid_matrix_full)
                    left = torch.cat([left, left_rel])
                    right = torch.cat([right, right_rel])
                    labels_val = torch.cat([labels_val, labels_val_rel])
                    if use_equiv:
                        # Add in additional prediction indices
                        pred_idx_matrix = pred_idx_matrices[target_rel.id]
                        if pred_idx_matrix is None:
                            valid_combined_matrix = valid_matrix
                            valid_mask = torch.arange(
                                valid_matrix.nnz()).to(device)
                        else:
                            valid_combined_matrix, valid_mask = combine_matrices(
                                valid_matrix, pred_idx_matrix)
                        valid_masks[target_rel.id] = valid_mask
                        data_target[target_rel.id] = valid_combined_matrix
                    else:
                        data_target[target_rel.id] = valid_matrix

                if use_equiv:
                    data_target.zero_()
                    idx_id_val, idx_trans_val = data_target.calculate_indices()
                    output_data = net(data, indices_identity,
                                      indices_transpose, data_embedding,
                                      data_target, idx_id_val, idx_trans_val)
                else:
                    output_data = net(data, indices_identity,
                                      indices_transpose, data_embedding,
                                      data_target)
                logits_combined = torch.Tensor([]).to(device)
                for target_rel in target_rels:
                    logits_rel_full = output_data[
                        target_rel.id].values.squeeze()
                    if use_equiv:
                        logits_rel = logits_rel_full[valid_masks[
                            target_rel.id]]
                    else:
                        logits_rel = logits_rel_full
                    logits_combined = torch.cat([logits_combined, logits_rel])

                logp = torch.sigmoid(logits_combined)
                val_loss = loss_func(logp, labels_val).item()

                wandb_log.update({'val_loss': val_loss})
                left = left.cpu().numpy()
                right = right.cpu().numpy()
                edge_list = np.concatenate(
                    [left.reshape((1, -1)),
                     right.reshape((1, -1))], axis=0)
                res = dl.evaluate(edge_list,
                                  logp.cpu().numpy(),
                                  labels_val.cpu().numpy())
                val_roc_auc = res['roc_auc']
                val_mrr = res['MRR']
                wandb_log.update(res)
                print("\nVal Loss: {:.3f} Val ROC AUC: {:.3f} Val MRR: {:.3f}".
                      format(val_loss, val_roc_auc, val_mrr))
                if args.val_metric == 'loss':
                    val_metric = -val_loss
                elif args.val_metric == 'roc_auc':
                    val_metric = val_roc_auc
                elif args.val_metric == 'mrr':
                    val_metric = val_mrr

                if val_metric > val_metric_best:
                    val_metric_best = val_metric
                    print("New best, saving")
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': net.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'train_loss': train_loss.item(),
                            'val_loss': val_loss,
                            'val_roc_auc': val_roc_auc,
                            'val_mrr': val_mrr
                        }, checkpoint_path)
                    if args.wandb_log_run:
                        wandb.summary["val_roc_auc_best"] = val_roc_auc
                        wandb.summary["val_mrr_best"] = val_mrr
                        wandb.summary["val_loss_best"] = val_loss
                        wandb.summary["epoch_best"] = epoch
                        wandb.summary["train_loss_best"] = train_loss.item()
                        wandb.save(checkpoint_path)
        if args.wandb_log_run:
            wandb.log(wandb_log)

    # Evaluate on test set
    if args.evaluate:
        for target_rel in target_rels:
            print("Evaluating Target Rel " + str(target_rel.id))
            checkpoint = torch.load(checkpoint_path, map_location=device)
            net.load_state_dict(checkpoint['net_state_dict'])
            net.eval()

            # Target is same as input
            data_target = data.clone()
            with torch.no_grad():
                left_full, right_full, test_labels_full = get_test_neigh_from_file(
                    dl, args.dataset, target_rel.id)
                test_matrix_full = make_target_matrix_test(
                    target_rel, left_full, right_full, test_labels_full,
                    device)
                test_matrix, left, right, test_labels = coalesce_matrix(
                    test_matrix_full)
                if use_equiv:
                    test_combined_matrix, test_mask = combine_matrices(
                        test_matrix, train_matrix)
                    data_target[target_rel.id] = test_combined_matrix
                    data_target.zero_()
                    idx_id_tst, idx_trans_tst = data_target.calculate_indices()
                    data_out = net(data, indices_identity, indices_transpose,
                                   data_embedding, data_target, idx_id_tst,
                                   idx_trans_tst)
                    logits_full = data_out[target_rel.id].values.squeeze()
                    logits = logits_full[test_mask]
                else:
                    data_target[target_rel.id] = test_matrix
                    data_out = net(data, indices_identity, indices_transpose,
                                   data_embedding, data_target)
                    logits_full = data_out[target_rel.id].values.squeeze()
                    logits = logits_full

                pred = torch.sigmoid(logits).cpu().numpy()
                left = left.cpu().numpy()
                right = right.cpu().numpy()
                edge_list = np.vstack((left, right))
                edge_list_full = np.vstack((left_full, right_full))
                file_path = f"test_out/{run_name}.txt"
                gen_file_for_evaluate(dl,
                                      edge_list_full,
                                      edge_list,
                                      pred,
                                      target_rel.id,
                                      file_path=file_path)
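Example #2 applies torch.sigmoid and then nn.BCELoss (and names the result logp, though it holds probabilities, not log-probabilities). A numerically safer equivalent folds the sigmoid into the loss; a sketch of that swap, not the author's code:

loss_func = nn.BCEWithLogitsLoss()
# ...build logits_combined exactly as above, then:
train_loss = loss_func(logits_combined, labels_train)  # no explicit sigmoid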
Example #3
    model = GCN2Seq(enc, dec, device).to(device)

    from src.train import train_epoch_gcnattn_gru, evaluate_gcnattn_gru, epoch_time
    train_epoch = train_epoch_gcnattn_gru
    evaluate = evaluate_gcnattn_gru

else:
    raise ValueError("Wrong model choice")

if 'gcn' in MODEL:
    from src.utils import init_weights_uniform as init_weights
else:
    from src.utils import init_weights_xavier as init_weights

model.apply(init_weights)
n_params = count_parameters(model)
print(f'The model has {n_params:,} trainable parameters')

# training
optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=TGT_PAD_IDX)
best_valid_loss = float('inf')
early_stopper = EarlyStopping(MODEL_PATH, patience=PATIENCE)
logger = Logger(LOG_PATH, append_time=False)
logger.write(f'The model has {n_params:,} trainable parameters')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train_epoch(model, train_iterator, optimizer, criterion, CLIP)
    # payload contains all info needed for interpretation and viz
    valid_loss, payload = evaluate(model, valid_iterator, criterion)
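    # The example is truncated here; a typical continuation using the imported
    # epoch_time helper and the EarlyStopping/Logger objects created above
    # (a sketch assuming the common tutorial-style APIs, not this repo's code):
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    logger.write(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    logger.write(f'\tTrain Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}')
    early_stopper(valid_loss, model)   # assumed API: saves a checkpoint on improvement
    if early_stopper.early_stop:       # assumed API: patience exhausted
        break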
Example #4
            # switch to eval mode
            self.eval()
        else:
            self.train()

        out = self.conv1(x)
        out = self.block1(out)
        out = self.block2(out)
        out = self.block3(out)
        out = self.relu(self.bn1(out))
        out = F.avg_pool2d(out, 8)
        out = out.view(-1, self.nChannels)

        self.train()

        return self.fc(out)


if __name__ == '__main__':
    i = torch.randn(4, 3, 32, 32)  # random input; FloatTensor(4, 3, 32, 32) was uninitialized memory

    n = WideResNet(depth=34, num_classes=10, widen_factor=10, dropRate=0.0)

    if torch.cuda.is_available():
        i = i.cuda()
        n = n.cuda()

    print(n(i).size())

    print(count_parameters(n))
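The forward pass above toggles eval/train mode around feature extraction, but the unconditional self.train() before the return clobbers whatever mode the caller had set (a problem during evaluation). A safer pattern saves and restores the previous mode; a sketch, where compute_features is a hypothetical stand-in for the conv1/block/pool pipeline:

def forward(self, x, use_eval_mode=False):
    prev_mode = self.training          # remember the caller's mode
    self.train(not use_eval_mode)      # switch as requested
    out = self.compute_features(x)     # hypothetical: conv1 -> blocks -> bn/relu -> pool
    self.train(prev_mode)              # restore whatever the caller had set
    return self.fc(out)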

Example #5
def run_code():
    # assess which device will be used, CPU or GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == 'cuda':
        torch.cuda.empty_cache()
    # Load Data #
    if args.data == 'PPD':
        dataset = ProtoPlanetaryDisks(machine=args.machine,
                                      transform=True,
                                      img_norm=True,
                                      subsample=False)
    elif args.data == 'MNIST':
        dataset = MNIST(args.machine)
    else:
        print('Error: Wrong dataset (MNIST, Proto Planetary Disk)...')
        raise ValueError('unknown dataset: {}'.format(args.data))

    if len(dataset) == 0:
        print('No items in training set...')
        print('Exiting!')
        sys.exit()
    print('Dataset size: ', len(dataset))

    # data loaders for training and testing
    train_loader, val_loader = dataset.get_dataloader(
        batch_size=args.batch_size,
        shuffle=True,
        val_split=.2,
        random_seed=rnd_seed)

    if args.data == 'PPD' and args.cond == 'T':
        wandb.config.physics_dim = len(dataset.meta_names)
    else:
        wandb.config.physics_dim = 0

    print('Physic dimension: ', wandb.config.physics_dim)

    # Define AE model, Ops, and Train #
    # To use other AE models, change the following line;
    # the different types of AE models are stored in src/ae_model.py
    if args.model_name == 'ConvUpSamp_AE':
        model = ConvUpSamp_AE(latent_dim=args.latent_dim,
                              img_dim=dataset.img_dim,
                              in_ch=dataset.img_channels)

    elif args.model_name == 'ResNet_UpSamp_AE':
        model = ResNet_UpSamp_AE(latent_dim=args.latent_dim,
                                 img_dim=dataset.img_dim,
                                 in_ch=dataset.img_channels)

    elif args.model_name == 'ResNet_Linear_AE':
        model = ResNet_Linear_AE(latent_dim=args.latent_dim,
                                 img_dim=dataset.img_dim,
                                 in_ch=dataset.img_channels)
    elif args.model_name == 'ResNet_Tconv_AE':
        model = ResNet_Tconv_AE(latent_dim=args.latent_dim,
                                img_dim=dataset.img_dim,
                                in_ch=dataset.img_channels)

    elif args.model_name == 'ConvLin_AE':
        model = ConvLin_AE(latent_dim=args.latent_dim,
                           img_dim=dataset.img_dim,
                           in_ch=dataset.img_channels)

    elif args.model_name == 'TranConv_AE':
        model = TranConv_AE(latent_dim=args.latent_dim,
                            img_dim=dataset.img_dim,
                            in_ch=dataset.img_channels)

    elif args.model_name == 'Linear_AE':
        model = Linear_AE(latent_dim=args.latent_dim,
                          img_dim=dataset.img_dim,
                          in_ch=dataset.img_channels)

    elif args.model_name == 'ConvLinUp_AE':
        model = ConvLinUp_AE(latent_dim=args.latent_dim,
                             img_dim=dataset.img_dim,
                             in_ch=dataset.img_channels,
                             kernel=args.kernel_size,
                             n_conv_blocks=args.conv_blocks)

    elif args.model_name == 'ConvLinTrans_AE':
        model = ConvLinTrans_AE(latent_dim=args.latent_dim,
                                img_dim=dataset.img_dim,
                                in_ch=dataset.img_channels,
                                kernel=args.kernel_size,
                                n_conv_blocks=args.conv_blocks)

    elif args.model_name == 'ResLinTrans_AE':
        model = ResLinTrans_AE(latent_dim=args.latent_dim,
                               img_dim=dataset.img_dim,
                               in_ch=dataset.img_channels)

    else:
        # Fail fast instead of hitting a NameError on `model` below.
        raise ValueError('Unknown model_name: {}'.format(args.model_name))

    # log model architecture and gradients to wandb
    wandb.watch(model, log='gradients')

    wandb.config.n_train_params = count_parameters(model)
    print('Summary:')
    print(model)
    print('Num of trainable params: ', wandb.config.n_train_params)
    print('\n')

    # Initialize optimizers
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-6)

    # Learning Rate scheduler
    if args.lr_sch == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=25,
                                              gamma=0.5)
    elif args.lr_sch == 'exp':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.985)
    elif args.lr_sch == 'cos':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=50,
                                                         eta_min=1e-5)
    elif args.lr_sch == 'plateau':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                         mode='min',
                                                         factor=.5,
                                                         verbose=True)
    else:
        scheduler = None

    print('Optimizer    :', optimizer)
    print('LR Scheduler :', scheduler.__class__.__name__)

    print('########################################')
    print('########  Running in %4s  #########' % (device))
    print('########################################')

    # initialize trainer
    trainer = Trainer(model,
                      optimizer,
                      args.batch_size,
                      wandb,
                      scheduler=scheduler,
                      print_every=500,
                      device=device)

    if args.dry_run:
        print('******** DRY RUN ******** ')
        return

    # run training/testing iterations
    trainer.train(train_loader,
                  val_loader,
                  args.num_epochs,
                  save=True,
                  early_stop=args.early_stop)
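Example #5 only constructs the scheduler; stepping is presumably done inside Trainer. The usual epoch-level pattern differs for ReduceLROnPlateau, which needs the validation metric; a sketch (train_one_epoch and validate are hypothetical helpers, not the Trainer's actual code):

for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_loader, optimizer)
    val_loss = validate(model, val_loader)
    if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
        scheduler.step(val_loss)        # plateau scheduler steps on the metric
    elif scheduler is not None:
        scheduler.step()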
Example #6
assert os.path.isdir(
    BERT_PRETRAINED_FOLDER), 'BERT_PRETRAINED_FOLDER init fail: {}'.format(
        BERT_PRETRAINED_FOLDER)

config = BertConfig(os.path.join(BERT_PRETRAINED_FOLDER, 'bert_config.json'))

bert_encoder = dict()
for l in range(1, 13):
    bert_encoder[l] = BertForMultipleChoiceEncoder(
        config,
        output_all_encoded_layers=args.output_all_encoded_layers,
        num_hidden_layers=l)

params_count = list()
for l in range(1, 13):
    n_param_encoder = count_parameters(bert_encoder[l].bert.encoder)
    n_param_pooler = count_parameters(bert_encoder[l].bert.pooler)
    n_param_embedding = count_parameters(bert_encoder[l].bert.embeddings)
    n_params_total = count_parameters(bert_encoder[l])
    n_params_total_debug = n_param_encoder + n_param_pooler + n_param_embedding
    assert n_params_total == n_params_total_debug, 'total num params error'

    params_count.append([
        l, n_param_embedding, n_param_pooler, n_param_encoder, n_params_total
    ])
params_count_df = pd.DataFrame(
    params_count,
    columns=['n_layers', '#embedding', '#pooler', '#encoder', '#total'])
print('num params per encoder = %d' %
      count_parameters(bert_encoder[1].bert.encoder))
print(params_count_df)

def run_model(args):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    if args.lgnn:
        load_data_fn = load_data_flat
    else:
        load_data_fn = load_data
    schema, schema_out, data, data_target, labels, \
        train_val_test_idx, dl = load_data_fn(args.dataset,
                                              use_edge_data=args.use_edge_data,
                                              use_node_attrs=args.use_node_attr,
                                              feats_type=args.feats_type)
    target_entity_id = 0  # True for all current NC datasets
    data, in_dims = select_features(data, schema, args.feats_type,
                                    target_entity_id)
    if args.multi_label:
        labels = torch.FloatTensor(labels).to(device)
    else:
        labels = torch.LongTensor(labels).to(device)
    train_idx = train_val_test_idx['train_idx']
    train_idx = np.sort(train_idx)
    val_idx = train_val_test_idx['val_idx']
    val_idx = np.sort(val_idx)
    test_idx = train_val_test_idx['test_idx']
    test_idx = np.sort(test_idx)

    data = data.to(device)

    data_embedding = SparseMatrixData.make_entity_embeddings(
        schema.entities, args.embedding_dim)
    data_embedding.to(device)
    indices_identity, indices_transpose = data.calculate_indices()
    data_target = data_target.to(device)

    num_classes = dl.labels_train['num_classes']

    net = AlternatingHGN(schema,
                         in_dims,
                         width=args.width,
                         depth=args.depth,
                         embedding_dim=args.embedding_dim,
                         activation=getattr(nn, args.act_fn)(),  # safer than eval('nn.%s()')
                         final_activation=nn.Identity(),
                         dropout=args.dropout,
                         output_dim=num_classes,
                         norm=args.norm,
                         pool_op=args.pool_op,
                         norm_affine=args.norm_affine,
                         norm_out=args.norm_out)

    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    if args.wandb_log_run:
        wandb.init(config=args,
                   settings=wandb.Settings(start_method='fork'),
                   project="EquivariantHGN_NC",
                   entity='danieltlevy')
        wandb.watch(net, log='all', log_freq=args.wandb_log_param_freq)
    print(args)
    print("Number of parameters: {}".format(count_parameters(net)))

    run_name = args.dataset + '_' + str(args.run)
    if args.wandb_log_run and wandb.run.name is not None:
        run_name = run_name + '_' + str(wandb.run.name)

    if args.checkpoint_path != '':
        checkpoint_path = args.checkpoint_path
    else:
        checkpoint_path = f"checkpoint/checkpoint_{run_name}.pt"

    print("Checkpoint Path: " + checkpoint_path)
    progress = tqdm(range(args.epoch), desc="Epoch 0", position=0, leave=True)
    # training loop
    net.train()
    val_micro_best = 0
    for epoch in progress:
        # training
        net.train()
        optimizer.zero_grad()
        logits = net(data, data_embedding).squeeze()
        logp = regr_fcn(logits, args.multi_label)
        train_loss = loss_fcn(logp[train_idx], labels[train_idx],
                              args.multi_label)
        train_loss.backward()
        optimizer.step()
        if args.multi_label:
            train_micro, train_macro = f1_scores_multi(
                logits[train_idx], dl.labels_train['data'][train_idx])
        else:
            train_micro, train_macro = f1_scores(logits[train_idx],
                                                 labels[train_idx])
        with torch.no_grad():
            progress.set_description(f"Epoch {epoch}")
            progress.set_postfix(loss=train_loss.item(), micr=train_micro)
            wandb_log = {
                'Train Loss': train_loss.item(),
                'Train Micro': train_micro,
                'Train Macro': train_macro
            }
            if epoch % args.val_every == 0:
                # validation
                net.eval()
                logits = net(data, data_embedding).squeeze()
                logp = regr_fcn(logits, args.multi_label)
                val_loss = loss_fcn(logp[val_idx], labels[val_idx],
                                    args.multi_label)
                if args.multi_label:
                    val_micro, val_macro = f1_scores_multi(
                        logits[val_idx], dl.labels_train['data'][val_idx])
                else:
                    val_micro, val_macro = f1_scores(logits[val_idx],
                                                     labels[val_idx])
                print("\nVal Loss: {:.3f} Val Micro-F1: {:.3f} \
Val Macro-F1: {:.3f}".format(val_loss, val_micro, val_macro))
                wandb_log.update({
                    'Val Loss': val_loss.item(),
                    'Val Micro-F1': val_micro,
                    'Val Macro-F1': val_macro
                })
                if val_micro > val_micro_best:
                    val_micro_best = val_micro
                    print("New best, saving")
                    torch.save(
                        {
                            'epoch': epoch,
                            'net_state_dict': net.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'train_loss': train_loss.item(),
                            'train_micro': train_micro,
                            'train_macro': train_macro,
                            'val_loss': val_loss.item(),
                            'val_micro': val_micro,
                            'val_macro': val_macro
                        }, checkpoint_path)
                    if args.wandb_log_run:
                        wandb.run.summary["val_micro_best"] = val_micro
                        wandb.run.summary["val_macro_best"] = val_macro
                        wandb.run.summary["val_loss_best"] = val_loss.item()
                        wandb.run.summary["epoch_best"] = epoch
                        wandb.run.summary["train_loss_best"] = train_loss.item(
                        )
                        wandb.run.summary['train_micro_best'] = train_micro
                        wandb.run.summary['train_macro_best'] = train_macro
                        wandb.save(checkpoint_path)

            if epoch % args.wandb_log_loss_freq == 0:
                if args.wandb_log_run:
                    wandb.log(wandb_log, step=epoch)

    # testing with evaluate_results_nc
    if args.evaluate:

        checkpoint = torch.load(checkpoint_path)
        net.load_state_dict(checkpoint['net_state_dict'])
        net.eval()
        test_logits = []
        with torch.no_grad():
            logits = net(data, data_embedding).squeeze()
            test_logits = logits[test_idx]
            if args.multi_label:
                pred = (test_logits.cpu().numpy() > 0).astype(int)
            else:
                pred = test_logits.cpu().numpy().argmax(axis=1)
                onehot = np.eye(num_classes, dtype=np.int32)

            file_path = f"test_out/{run_name}.txt"
            dl.gen_file_for_evaluate(test_idx=test_idx,
                                     label=pred,
                                     file_path=file_path,
                                     multi_label=args.multi_label)
            if not args.multi_label:
                pred = onehot[pred]
            print(dl.evaluate(pred))
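regr_fcn and loss_fcn above are project helpers that switch between multi-label and single-label objectives; a plausible sketch of the pair (an assumption, not the repo's code):

import torch.nn.functional as F

def regr_fcn(logits, multi_label):
    # Sigmoid probabilities for multi-label targets, log-softmax otherwise.
    return torch.sigmoid(logits) if multi_label else F.log_softmax(logits, dim=1)

def loss_fcn(logp, labels, multi_label):
    return (F.binary_cross_entropy(logp, labels) if multi_label
            else F.nll_loss(logp, labels))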
Example #8
File: models.py  Project: kevinghst/SPR
    def __init__(
            self,
            image_shape,
            output_size,
            n_atoms,
            dueling,
            jumps,
            spr,
            augmentation,
            target_augmentation,
            eval_augmentation,
            dynamics_blocks,
            norm_type,
            noisy_nets,
            aug_prob,
            classifier,
            imagesize,
            time_offset,
            local_spr,
            global_spr,
            momentum_encoder,
            shared_encoder,
            distributional,
            dqn_hidden_size,
            momentum_tau,
            renormalize,
            renormalize_type,
            q_l1_type,
            dropout,
            final_classifier,
            model_rl,
            noisy_nets_std,
            residual_tm,
            pred_hidden_ratio,
            encoder_type,
            transition_type,
            conv_proj_channel,
            proj_hidden_size,
            gru_input_size,
            gru_proj_size,
            ln_ratio,
            use_maxpool=False,
            channels=None,  # None uses default.
            kernel_sizes=None,
            strides=None,
            paddings=None,
            framestack=4,
    ):
        """Instantiates the neural network according to arguments; network defaults
        stored within this method."""
        super().__init__()
        self.noisy = noisy_nets
        self.time_offset = time_offset
        self.aug_prob = aug_prob
        self.classifier_type = classifier

        self.distributional = distributional
        n_atoms = 1 if not self.distributional else n_atoms
        self.dqn_hidden_size = dqn_hidden_size

        self.transforms = []
        self.eval_transforms = []

        self.uses_augmentation = False
        for aug in augmentation:
            if aug == "affine":
                transformation = RandomAffine(5, (.14, .14), (.9, 1.1), (-5, 5))
                eval_transformation = nn.Identity()
                self.uses_augmentation = True
            elif aug == "crop":
                transformation = RandomCrop((84, 84))
                # Crashes if aug-prob not 1: use CenterCrop((84, 84)) or Resize((84, 84)) in that case.
                eval_transformation = CenterCrop((84, 84))
                self.uses_augmentation = True
                imagesize = 84
            elif aug == "rrc":
                transformation = RandomResizedCrop((100, 100), (0.8, 1))
                eval_transformation = nn.Identity()
                self.uses_augmentation = True
            elif aug == "blur":
                transformation = GaussianBlur2d((5, 5), (1.5, 1.5))
                eval_transformation = nn.Identity()
                self.uses_augmentation = True
            elif aug == "shift":
                transformation = nn.Sequential(nn.ReplicationPad2d(4), RandomCrop((84, 84)))
                eval_transformation = nn.Identity()
            elif aug == "intensity":
                transformation = Intensity(scale=0.05)
                eval_transformation = nn.Identity()
            elif aug == "none":
                transformation = eval_transformation = nn.Identity()
            else:
                raise NotImplementedError()
            self.transforms.append(transformation)
            self.eval_transforms.append(eval_transformation)

        self.dueling = dueling
        f, c = image_shape[:2]
        in_channels = np.prod(image_shape[:2])

        if encoder_type == 'conv2d':
            self.conv = Conv2dModel(
                in_channels=in_channels,
                channels=[32, 64, 64],
                kernel_sizes=[8, 4, 3],
                strides=[4, 2, 1],
                paddings=[0, 0, 0],
                use_maxpool=False,
                dropout=dropout,
                conv_proj_channel=conv_proj_channel,
            )
        elif encoder_type == 'resnet18':
            self.conv = resnet18()
        else:
            raise NotImplementedError

        fake_input = torch.zeros(1, f*c, imagesize, imagesize)
        fake_output = self.conv(fake_input)

        self.hidden_size = fake_output.shape[1]
        self.pixels = fake_output.shape[-1]*fake_output.shape[-2]
        print("Spatial latent size is {}".format(fake_output.shape[1:]))

        if proj_hidden_size:
            self.conv_proj = nn.Sequential(
                nn.Flatten(1, -1),
                nn.Linear(self.hidden_size * self.pixels, proj_hidden_size),
                nn.LayerNorm(proj_hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout),
            )
        else:
            self.conv_proj = nn.Identity()

        self.jumps = jumps
        self.model_rl = model_rl
        self.use_spr = spr
        self.target_augmentation = target_augmentation
        self.eval_augmentation = eval_augmentation
        self.num_actions = output_size
        self.transition_type = transition_type

        if dueling:
            self.head = DQNDistributionalDuelingHeadModel(self.hidden_size,
                                                          output_size,
                                                          hidden_size=self.dqn_hidden_size,
                                                          pixels=self.pixels,
                                                          noisy=self.noisy,
                                                          n_atoms=n_atoms,
                                                          std_init=noisy_nets_std,
                                                          proj_hidden_size=proj_hidden_size)
        else:
            self.head = DQNDistributionalHeadModel(self.hidden_size,
                                                   output_size,
                                                   hidden_size=self.dqn_hidden_size,
                                                   pixels=self.pixels,
                                                   noisy=self.noisy,
                                                   n_atoms=n_atoms,
                                                   std_init=noisy_nets_std)

        # repr_size is also needed by the 'train_ln' branch below, so compute it
        # regardless of whether a dynamics model is built.
        repr_size = proj_hidden_size if proj_hidden_size else (self.pixels * self.hidden_size)

        if self.jumps > 0:

            if transition_type == 'gru':
                self.dynamics_model = GRUModel(
                    input_size = gru_input_size,
                    repr_size = repr_size,
                    proj_size = gru_proj_size,
                    num_layers = 1,
                    num_actions = self.num_actions,
                    renormalize=renormalize,
                    renormalize_type=renormalize_type,
                    dropout=dropout
                )
            else:
                self.dynamics_model = TransitionModel(channels=self.hidden_size,
                                                      num_actions=output_size,
                                                      pixels=self.pixels,
                                                      hidden_size=self.hidden_size,
                                                      limit=1,
                                                      blocks=dynamics_blocks,
                                                      norm_type=norm_type,
                                                      renormalize=renormalize,
                                                      residual=residual_tm)
        else:
            self.dynamics_model = nn.Identity()

        self.renormalize = renormalize
        self.renormalize_type = renormalize_type
        self.ln_ratio = ln_ratio

        if renormalize_type == 'train_ln':
            self.renormalize_ln = nn.LayerNorm(repr_size)
        else:
            self.renormalize_ln = nn.Identity()

        if self.use_spr:
            self.local_spr = local_spr
            self.global_spr = global_spr
            self.momentum_encoder = momentum_encoder
            self.momentum_tau = momentum_tau
            self.shared_encoder = shared_encoder
            assert not (self.shared_encoder and self.momentum_encoder)

            # in case someone tries something silly like --local-spr 2
            self.num_sprs = int(bool(self.local_spr)) + \
                            int(bool(self.global_spr))

            if self.local_spr:
                self.local_final_classifier = nn.Identity()
                if self.classifier_type == "mlp":
                    self.local_classifier = nn.Sequential(nn.Linear(self.hidden_size,
                                                                    self.hidden_size),
                                                          nn.BatchNorm1d(self.hidden_size),
                                                          nn.ReLU(),
                                                          nn.Linear(self.hidden_size,
                                                                    self.hidden_size))
                elif self.classifier_type == "bilinear":
                    self.local_classifier = nn.Linear(self.hidden_size, self.hidden_size)
                elif self.classifier_type == "none":
                    self.local_classifier = nn.Identity()
                if final_classifier == "mlp":
                    self.local_final_classifier = nn.Sequential(nn.Linear(self.hidden_size, 2*self.hidden_size),
                                                                nn.BatchNorm1d(2*self.hidden_size),
                                                                nn.ReLU(),
                                                                nn.Linear(2*self.hidden_size,
                                                                    self.hidden_size))
                elif final_classifier == "linear":
                    self.local_final_classifier = nn.Linear(self.hidden_size, self.hidden_size)
                else:
                    self.local_final_classifier = nn.Identity()

                self.local_target_classifier = self.local_classifier
            else:
                self.local_classifier = self.local_target_classifier = nn.Identity()
            if self.global_spr:
                self.global_final_classifier = nn.Identity()
                if self.classifier_type == "mlp":
                    self.global_classifier = nn.Sequential(
                                                nn.Flatten(-3, -1),
                                                nn.Linear(self.pixels*self.hidden_size, 512),
                                                nn.BatchNorm1d(512),
                                                nn.ReLU(),
                                                nn.Linear(512, 256)
                                                )
                    self.global_target_classifier = self.global_classifier
                    global_spr_size = 256
                elif self.classifier_type == "q_l1":
                    self.global_classifier = QL1Head(self.head, dueling=dueling, type=q_l1_type)
                    global_spr_size = self.global_classifier.out_features
                    self.global_target_classifier = self.global_classifier
                elif self.classifier_type == "q_l2":
                    self.global_classifier = nn.Sequential(self.head, nn.Flatten(-2, -1))
                    self.global_target_classifier = self.global_classifier
                    global_spr_size = 256
                elif self.classifier_type == "bilinear":
                    self.global_classifier = nn.Sequential(nn.Flatten(-3, -1),
                                                           nn.Linear(self.hidden_size*self.pixels,
                                                                     self.hidden_size*self.pixels))
                    self.global_target_classifier = nn.Flatten(-3, -1)
                elif self.classifier_type == "none":
                    self.global_classifier = nn.Flatten(-3, -1)
                    self.global_target_classifier = nn.Flatten(-3, -1)

                    global_spr_size = self.hidden_size*self.pixels
                if final_classifier == "mlp":
                    global_final_hidden_size = int(global_spr_size * pred_hidden_ratio)
                    self.global_final_classifier = nn.Sequential(
                        nn.Linear(global_spr_size, global_final_hidden_size),
                        nn.BatchNorm1d(global_final_hidden_size),
                        nn.ReLU(),
                        nn.Linear(global_final_hidden_size, global_spr_size)
                    )
                elif final_classifier == "linear":
                    self.global_final_classifier = nn.Sequential(
                        nn.Linear(global_spr_size, global_spr_size),
                    )
                elif final_classifier == "none":
                    self.global_final_classifier = nn.Identity()
            else:
                self.global_classifier = self.global_target_classifier = nn.Identity()

            if self.momentum_encoder:
                self.target_encoder = copy.deepcopy(self.conv)
                self.target_encoder_proj = copy.deepcopy(self.conv_proj)
                self.target_renormalize_ln = copy.deepcopy(self.renormalize_ln)
                self.global_target_classifier = copy.deepcopy(self.global_target_classifier)
                self.local_target_classifier = copy.deepcopy(self.local_target_classifier)
                for param in (list(self.target_encoder.parameters())
                            + list(self.target_encoder_proj.parameters())
                            + list(self.target_renormalize_ln.parameters())
                            + list(self.global_target_classifier.parameters())
                            + list(self.local_target_classifier.parameters())):
                    param.requires_grad = False

            elif not self.shared_encoder:
                # Use a separate target encoder on the last frame only.
                self.global_target_classifier = copy.deepcopy(self.global_target_classifier)
                self.local_target_classifier = copy.deepcopy(self.local_target_classifier)
                if self.stack_actions:  # attribute set elsewhere in the full class
                    input_size = c - 1
                else:
                    input_size = c
                self.target_encoder = Conv2dModel(in_channels=input_size,
                                                  channels=[32, 64, 64],
                                                  kernel_sizes=[8, 4, 3],
                                                  strides=[4, 2, 1],
                                                  paddings=[0, 0, 0],
                                                  use_maxpool=False,
                                                  )

            elif self.shared_encoder:
                self.target_encoder = self.conv

        print("Initialized model with {} parameters".format(count_parameters(self)))
Example #9
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        help="The name of the task for training.")
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--bert_model",
                        default="bert-base-uncased",
                        type=str,
                        help="student bert model configuration folder")
    parser.add_argument("--encoder_checkpoint",
                        default=None,
                        type=str,
                        help="check point for student encoder")
    parser.add_argument("--cls_checkpoint",
                        default=None,
                        type=str,
                        help="check point for student classifier")
    parser.add_argument("--alpha",
                        default=0.95,
                        type=float,
                        help="alpha for distillation")
    parser.add_argument("--T",
                        default=10.,
                        type=float,
                        help="temperature for distillation")
    parser.add_argument("--beta",
                        default=0.0,
                        type=float,
                        help="weight for AT loss")
    parser.add_argument("--fc_layer_idx",
                        default=None,
                        type=str,
                        help="layers ids we will put FC layers on")
    parser.add_argument("--normalize_patience",
                        default=False,
                        help="normalize patience or not")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="do training or not")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="do evaluation during training or not")

    parser.add_argument("--train_type", default="finetune_teacher",
                        choices=["finetune_teacher","train_student"],
                        help="choose which to train")
    parser.add_argument("--log_every_step",
                        default=50,
                        type=int,
                        help="output to log every global x training steps, default is 1")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--num_train_epochs",
                        default=3,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=1000,
                        help="Log every X updates steps.")
    parser.add_argument('--student_hidden_layers',
                        type=int,
                        default=12,
                        help="number of transformer layers for student, default is None (use all layers)")
    parser.add_argument('--teacher_prediction',
                        type=str,
                        default=None,
                        help="teacher prediction file to guild the student's output")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    args = parser.parse_args()

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    logger.info('actual batch size on all GPU = %d' % args.train_batch_size)

    if args.train_type == 'finetune_teacher':
        args.student_hidden_layers = 12 if 'base' in args.bert_model else 24
        args.alpha = 0.0   # alpha = 0 is equivalent to fine-tuning for KD
    elif args.train_type == "train_student":
        args.student_hidden_layers = 6
        args.kd_model = "kd.cls"
        args.alpha = 0.7
        args.beta = 500
        args.T = 10
        args.fc_layer_idx = "1,3,5,7,9"   # this for pkd-skip
        args.normalize_patience = True
    else:
        raise ValueError("please pick train_type from finetune_teacher,train_student")

    if args.encoder_checkpoint is None:
        args.encoder_checkpoint = os.path.join(args.bert_model, 'pytorch_model.bin')
        logger.info('encoder checkpoint not provided, use pre-trained at %s instead' % args.encoder_checkpoint)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \
            and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))

    if args.do_train and not os.path.exists(args.output_dir):
        # Create output directory if needed
        os.makedirs(args.output_dir)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}".format(args.device, args.n_gpu))

    # set seed
    set_seed(args)

    # prepare task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    args.num_labels = len(label_list)

    # prepare tokenizer and model
    config = BertConfig()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)

    config.output_hidden_states = True

    encoder = BertForSequenceClassificationEncoder(config, num_hidden_layers=args.student_hidden_layers)
    classifier = FCClassifierForSequenceClassification(config, args.num_labels, config.hidden_size, 0)

    n_student_layer = len(encoder.bert.encoder.layer)
    encoder = load_model(encoder, args.encoder_checkpoint, args, 'student', verbose=True)
    logger.info('*' * 77)
    classifier = load_model(classifier, args.cls_checkpoint, args, 'classifier', verbose=True)


    n_param_student = count_parameters(encoder) + count_parameters(classifier)
    logger.info('number of layers in student model = %d' % n_student_layer)
    logger.info('num parameters in student model are %d' % n_param_student)

    # Training
    if args.do_train:
        read_set = 'train'
        if args.train_type == "train_student":
            assert args.teacher_prediction is not None
            assert args.alpha > 0
            logger.info("loading teacher's prediction")
            with open(args.teacher_prediction, 'rb') as f:
                teacher_predictions = pickle.load(f)['train']
            logger.info('teacher acc = %.2f, teacher loss = %.5f' % (
                teacher_predictions['acc'] * 100, teacher_predictions['loss']))
            train_examples, train_dataloader, _ = get_task_dataloader(args, read_set, tokenizer,
                                                                      SequentialSampler,
                                                                      batch_size=args.train_batch_size,
                                                                      knowledge=teacher_predictions['pred_logit'],
                                                                      extra_knowledge=teacher_predictions[
                                                                          'feature_maps'])
        else:
            assert args.alpha == 0
            logger.info("runing teacher fine-tuning")
            train_examples, train_dataloader, _ = get_task_dataloader(args, read_set, tokenizer,
                                                                      SequentialSampler,
                                                                      batch_size=args.train_batch_size)

        global_step, tr_loss = train(args, train_dataloader, encoder, classifier, tokenizer)
        #################
        # information of teacher model (like [CLS])
        #################
        if args.train_type == "finetune_teacher":
            all_res = {'train': None}

            encoder_file = os.path.join(args.output_dir,f'{args.train_type}_epoch{args.num_train_epochs-1}.encoder.pkl')
            cls_file = os.path.join(args.output_dir,f'{args.train_type}_epoch{args.num_train_epochs-1}.cls.pkl')
            print("encoder_file")

            encoder = BertForSequenceClassificationEncoder(config, num_hidden_layers=args.student_hidden_layers)
            classifier = FCClassifierForSequenceClassification(config, args.num_labels, config.hidden_size, 0)

            encoder = load_model(encoder, encoder_file, args, 'exact', verbose=True)
            classifier = load_model(classifier, cls_file, args, 'exact', verbose=True)
            
            train_res = eval_model_dataloader(encoder, classifier, train_dataloader, args.device, detailed=True,
                                              verbose=False)
            all_res['train'] = train_res

            logger.info('saving teacher results')

            fname = os.path.join(args.output_dir,
                                 args.task_name + f'_teacher_{args.student_hidden_layers}layer_information.pkl')
            with open(fname, 'wb') as fp:
                pickle.dump(all_res, fp)

        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Evaluation
    if args.do_eval:
        test_examples, test_dataloader, test_label_ids = get_task_dataloader(args, 'dev', tokenizer,
                                                                             SequentialSampler,
                                                                             batch_size=args.eval_batch_size)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        result = evaluate(args, test_label_ids, encoder, classifier, test_dataloader)

        output_test_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    return
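
Example #9 exposes --alpha and --T, but the train() routine that consumes them is not shown. A minimal sketch of the standard distillation objective these knobs conventionally control (an assumption about the unseen training loop, not this repository's exact code):

import torch.nn.functional as F

def kd_loss(student_logits, teacher_logits, labels, alpha=0.95, T=10.0):
    # Soft-target term: KL divergence between temperature-softened
    # distributions, scaled by T^2 to keep gradient magnitudes comparable.
    soft = F.kl_div(F.log_softmax(student_logits / T, dim=-1),
                    F.softmax(teacher_logits / T, dim=-1),
                    reduction='batchmean') * (T * T)
    # Hard-target term: ordinary cross-entropy against the gold labels.
    hard = F.cross_entropy(student_logits, labels)
    return alpha * soft + (1.0 - alpha) * hard

With alpha = 0 this reduces to plain fine-tuning, matching the comment on the finetune_teacher branch above.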
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-config', type=str, default='config/aishell.yaml')
    parser.add_argument('-log', type=str, default='train.log')
    parser.add_argument('-mode', type=str, default='retrain')
    opt = parser.parse_args()

    with open(opt.config) as configfile:
        config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))

    exp_name = os.path.join('egs', config.data.name, 'exp', config.model.type,
                            config.training.save_model)
    if not os.path.isdir(exp_name):
        os.makedirs(exp_name)
    logger = init_logger(os.path.join(exp_name, opt.log))
    if opt.mode != 'continue':
        shutil.copyfile(opt.config, os.path.join(exp_name, 'config.yaml'))
        logger.info('Save config info.')

    os.environ["CUDA_VISIBLE_DEVICES"] = config.training.gpus

    # Scale loader workers and the global batch size with the number of
    # visible GPUs; DataParallel later splits each batch across devices.
    config.training.num_gpu = num_gpus(config.training.gpus)
    num_workers = 6 * (config.training.num_gpu
                       if config.training.num_gpu > 0 else 1)
    batch_size = (config.data.batch_size * config.training.num_gpu
                  if config.training.num_gpu > 0 else config.data.batch_size)
    logger.info('effective batch size: %d -> %d (with gradient accumulation)' %
                (batch_size, batch_size * config.training.accumulation_steps))

    train_dataset = AudioDataset(config.data, 'train')
    train_sampler = Batch_RandomSampler(len(train_dataset),
                                        batch_size=batch_size,
                                        shuffle=config.data.shuffle)
    training_data = AudioDataLoader(dataset=train_dataset,
                                    num_workers=num_workers,
                                    batch_sampler=train_sampler)
    logger.info('Load Train Set!')

    dev_dataset = AudioDataset(config.data, 'dev')
    dev_sampler = Batch_RandomSampler(len(dev_dataset),
                                      batch_size=batch_size,
                                      shuffle=config.data.shuffle)
    validate_data = AudioDataLoader(dataset=dev_dataset,
                                    num_workers=num_workers,
                                    batch_sampler=dev_sampler)
    logger.info('Load Dev Set!')

    torch.manual_seed(config.training.seed)
    if config.training.num_gpu > 0:
        torch.cuda.manual_seed_all(config.training.seed)
        torch.backends.cudnn.deterministic = True
    logger.info('Set random seed: %d' % config.training.seed)

    if config.model.type == "transducer":
        model = Transducer(config.model)
    elif config.model.type == "ctc":
        model = CTC(config.model)
    else:
        raise NotImplementedError

    if config.training.load_model:
        if config.training.num_gpu == 0:
            checkpoint = torch.load(config.training.load_model,
                                    map_location='cpu')
        else:
            checkpoint = torch.load(config.training.load_model)
        logger.info("load_checkpoint:" + str(checkpoint.keys()))
        load_model(model, checkpoint)
        logger.info('Loaded model from %s' % config.training.load_model)
    if config.training.load_encoder or config.training.load_decoder:
        if config.training.load_encoder:
            checkpoint = torch.load(config.training.load_encoder)
            logger.info("load_checkpoint:" + str(checkpoint.keys()))
            model.encoder.load_state_dict(checkpoint['encoder'])
            logger.info('Loaded encoder from %s' %
                        config.training.load_encoder)
        if config.training.load_decoder:
            checkpoint = torch.load(config.training.load_decoder)
            logger.info("load_checkpoint:" + str(checkpoint.keys()))
            model.decoder.load_state_dict(checkpoint['decoder'])
            logger.info('Loaded decoder from %s' %
                        config.training.load_decoder)

    if config.training.num_gpu > 0:
        model = model.cuda()
        if config.training.num_gpu > 1:
            # dist.init_process_group(backend='nccl', world_size=4, rank=1)
            device_ids = list(range(config.training.num_gpu))
            model = torch.nn.DataParallel(model, device_ids=device_ids)
        logger.info('Loaded the model to %d GPUs' % config.training.num_gpu)

    n_params, enc, dec = count_parameters(model)
    logger.info('# the number of parameters in the whole model: %d' % n_params)
    logger.info('# the number of parameters in the Encoder: %d' % enc)
    logger.info('# the number of parameters in the Decoder: %d' % dec)
    logger.info('# the number of parameters in the JointNet: %d' %
                (n_params - dec - enc))

    optimizer = Optimizer(model.parameters(), config.optim)
    logger.info('Created a %s optimizer.' % config.optim.type)

    if opt.mode == 'continue':
        if not config.training.load_model:
            raise Exception(
                "mode 'continue' requires config.training.load_model to be set")
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        logger.info('Load Optimizer State!')
    else:
        start_epoch = 0

    # create a visualizer
    if config.training.visualization:
        visualizer = SummaryWriter(os.path.join(exp_name, 'log'))
        logger.info('Created a visualizer.')
        # visualizer.add_graph(model)  # disabled pending a fix
    else:
        visualizer = None

    logger.info(model)
    for epoch in range(start_epoch, config.training.epochs):
        train(epoch, config, model, training_data, optimizer, logger,
              visualizer)

        save_name = os.path.join(
            exp_name, '%s.epoch%d.chkpt' % (config.training.save_model, epoch))
        save_model(model, optimizer, config, save_name)
        logger.info('Epoch %d model has been saved.' % epoch)

        if config.training.eval_or_not:
            _ = eval(epoch, config, model, validate_data, logger, visualizer)

        if epoch >= config.optim.begin_to_adjust_lr:
            optimizer.decay_lr()
            # early stop
            if optimizer.lr < 5e-7:
                logger.info('The learning rate is too low to train.')
                break
            logger.info('Epoch %d update learning rate: %.6f' %
                        (epoch, optimizer.lr))

    logger.info('The training process is OVER!')
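
Here count_parameters returns a 3-tuple, unlike the scalar variant sketched earlier. A minimal variant consistent with this usage (an assumption; it presumes the model exposes .encoder and .decoder submodules, and a DataParallel-wrapped model would need model.module instead):

def count_parameters(model):
    # Total, encoder-only, and decoder-only parameter counts; the JointNet
    # count is recovered above as n_params - enc - dec.
    n_params = sum(p.numel() for p in model.parameters())
    enc = sum(p.numel() for p in model.encoder.parameters())
    dec = sum(p.numel() for p in model.decoder.parameters())
    return n_params, enc, dec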
Example #11
                             lr=args.lr,
                             weight_decay=args.weight_decay)

    strTitle = args.data + '_' + sStartTime + '_alph{:}_{:}_{:}_{:}_{:}_{:}_m{:}'.format(
        int(alph[0]), int(alph[1]), int(alph[2]), int(alph[3]), int(alph[4]),
        int(alph[5]), m)

    logger.info(net)
    logger.info("--------------------------------------------------")
    logger.info(prob)
    logger.info("--------------------------------------------------")
    logger.info("DIMENSION={:}  m={:}  nTh={:}   alpha={:}".format(
        d, m, nTh, alph))
    logger.info("nt={:}   nt_val={:}".format(nt, nt_val))
    logger.info("Number of trainable parameters: {}".format(
        count_parameters(net)))
    logger.info("--------------------------------------------------")
    logger.info(str(optim))  # optimizer info
    logger.info("data={:} device={:}".format(args.data, device))
    logger.info("n_train={:}".format(n_train))
    logger.info("maxIters={:} val_freq={:} viz_freq={:}".format(
        args.niters, args.val_freq, args.viz_freq))
    logger.info("saveLocation = {:}".format(args.save))
    logger.info(strTitle)
    logger.info("--------------------------------------------------\n")

    # log Q and W separately for inspection; they are already folded into the L value
    log_msg = (
        '{:5s} {:7s} {:6s}   {:9s}  {:8s}  {:8s}  {:8s}  {:8s}  {:8s}  {:8s}  {:8s}     {:9s}  {:8s}  {:8s}  {:8s}  {:8s}  {:8s}  {:8s}  {:8s}'
        .format('iter', 'lr', '  time', 'loss', 'L', 'G', 'HJt', 'HJfin',
                'HJgrad', 'Q', 'W', 'valLoss', 'valL', 'valG', 'valHJt',
Example #12
def run_code():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == 'cuda':
        torch.cuda.empty_cache()
    # Load Data #
    if args.data == 'PPD':
        dataset = ProtoPlanetaryDisks()
    elif args.data == 'MNIST':
        dataset = MNIST(args.machine)
    else:
        raise ValueError(
            'Unknown dataset %r; expected MNIST or PPD (Proto Planetary Disks)' % args.data)

    if len(dataset) == 0:
        print('No items in training set...')
        print('Exiting!')
        sys.exit()

    print('Dataset size: ', len(dataset))
    # data loaders for training and testing
    train_loader, test_loader = dataset.get_dataloader(
        batch_size=args.batch_size,
        shuffle=True,
        test_split=.2,
        random_seed=rnd_seed)
    img_dim = dataset[0][0].shape
    wandb.config.physics_dim = len(
        dataset.phy_names) if args.data == 'PPD' else 0
    print('Physic dimension: ', wandb.config.physics_dim)

    # Define AE model, Ops, and Train #
    model = ConvLin_AutoEncoder(latent_dim=args.latent_dim,
                                img_dim=img_dim[-1])
    wandb.watch(model, log='gradients')

    wandb.config.n_train_params = count_parameters(model)
    print('Summary:')
    print(model)
    print('Num of trainable params: ', wandb.config.n_train_params)
    print('\n')

    # Initialize optimizers
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Learning Rate scheduler
    if args.lr_sch == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=20,
                                              gamma=0.5)
    elif args.lr_sch == 'exp':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.985)
    elif args.lr_sch == 'cos':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=50,
                                                         eta_min=1e-5)
    elif args.lr_sch == 'plateau':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                         mode='min',
                                                         factor=.5,
                                                         verbose=True)
    else:
        scheduler = None

    print('Optimizer    :', optimizer)
    print('LR Scheduler :', scheduler.__class__.__name__)

    # Train model
    print('########################################')
    print('########   Running on %s   ########' % device)
    print('########################################')

    trainer = Trainer(model,
                      optimizer,
                      args.batch_size,
                      wandb,
                      scheduler=scheduler,
                      print_every=500,
                      device=device,
                      beta=args.beta)

    if args.dry_run:
        print('******** DRY RUN ******** ')
        return

    trainer.train(train_loader,
                  test_loader,
                  args.num_epochs,
                  save=True,
                  early_stop=args.early_stop)
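
The Trainer consuming these schedulers is not shown. A minimal sketch of the stepping convention they would presumably require (an assumption; only ReduceLROnPlateau is metric-driven):

import torch.optim as optim

def step_scheduler(scheduler, val_loss):
    # Step/exponential/cosine schedules advance once per epoch; plateau
    # scheduling instead reacts to the monitored validation metric.
    if scheduler is None:
        return
    if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
        scheduler.step(val_loss)
    else:
        scheduler.step()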