Example #1
    def forward(self,
                x,
                attention_mask=None,
                labels=None,
                labels_normal=None,
                lm_labels=None,
                labels_sent=None,
                labels_op=None):

        h_ae = self.fc_ae_1(x)  # Eq 4
        h_op = self.fc_op_1(x)  # Eq 5

        # AE and OE auxiliary tasks
        o_ae = self.fc_ae(F.relu(h_ae))
        o_op = self.fc_op(F.relu(h_op))

        p_ae = self.softmax(o_ae)  # Eq 6
        p_op = self.softmax(o_op)  # Eq 7

        # B: 1, I: 2; find the probability of a word being part of an aspect term
        p_ae = p_ae[:, :, 1] + p_ae[:, :, 2]  # (bsz, seq_len)
        p_ae = p_ae.unsqueeze(1)  # (bsz, 1, seq_len)

        # Find probability of a word being part of an opinion term
        p_op = p_op[:, :, 1] + p_op[:, :, 2]  # (bsz, seq_len)
        p_op = p_op.unsqueeze(1)  # (bsz, 1, seq_len)

        seq_len = x.size()[1]  # N
        zero_diag = -1e18 * torch.eye(
            seq_len, seq_len, requires_grad=False).to(self.config.device)

        idxs = torch.arange(0, seq_len,
                            requires_grad=False).to(self.config.device)
        idxs = idxs.unsqueeze(1)  # (seq_len, 1)
        tmp = idxs * torch.ones(seq_len, seq_len, requires_grad=False).to(
            self.config.device)  # (seq_len, seq_len)
        dist_metric = torch.abs(tmp - tmp.transpose(0, 1)) + torch.eye(
            seq_len, seq_len, requires_grad=False).to(
                self.config.device)  # (seq_len, seq_len)
        dist_metric = 1 / dist_metric

        A = h_ae @ self.W @ h_op.transpose(1, 2)  # bsz, seq_len, seq_len
        A = A + zero_diag  # (bsz, seq_len, seq_len)
        # Score matrix Q, Eq 8
        A = A * dist_metric

        op_prime = self.softmax(A * p_op) @ h_op  # Eq 9 + 11
        ae_prime = self.softmax(A.transpose(1, 2) * p_ae) @ h_ae  # Eq 10 + 12

        c = torch.cat([h_ae, ae_prime, h_op, op_prime],
                      dim=2)  # (bsz, seq_len, 4 * h), Eq 13
        o_prime = self.fc(c)  # Eq 14

        # Loss computations
        loss = 0
        active_loss = attention_mask.view(-1) == 1

        # Aspect tag predictions (AE)
        active_logits = o_ae.view(-1,
                                  self.config.num_normal_labels)[active_loss]
        active_labels = labels_normal.view(-1)[active_loss]
        loss += self.loss_weight * nn.MultiMarginLoss(margin=1)(active_logits,
                                                                active_labels)

        # Opinion tag predictions (OE)
        active_logits = o_op.view(-1,
                                  self.config.num_normal_labels)[active_loss]
        active_labels = labels_op.view(-1)[active_loss]
        loss += self.loss_weight * nn.MultiMarginLoss(margin=1)(active_logits,
                                                                active_labels)

        # Unified tag predictions (U)
        active_logits = o_prime.view(-1, self.config.num_labels)[active_loss]
        active_labels = labels.view(-1)[active_loss]
        loss += nn.MultiMarginLoss(margin=3)(active_logits, active_labels)

        return loss, o_prime
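The three MultiMarginLoss calls in this example all follow the same pattern: flatten the sequence dimension, keep only positions where attention_mask is 1, and pass 2-D scores plus 1-D integer tags. A minimal standalone sketch of that masking pattern (shapes and label values here are illustrative, not taken from the model above):

import torch
import torch.nn as nn

# Illustrative shapes: batch of 2 sequences, length 4, 3 tag classes (O/B/I).
logits = torch.randn(2, 4, 3)                  # (bsz, seq_len, num_labels)
labels = torch.randint(0, 3, (2, 4))           # (bsz, seq_len) integer tags
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])  # 1 = real token, 0 = padding

active = attention_mask.view(-1) == 1          # flatten and keep real tokens
active_logits = logits.view(-1, 3)[active]     # (num_active, num_labels)
active_labels = labels.view(-1)[active]        # (num_active,)

# MultiMarginLoss expects (N, C) scores and (N,) class indices.
loss = nn.MultiMarginLoss(margin=1)(active_logits, active_labels)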
Example #2

loss = nn.ModuleDict([
    ['l1', nn.L1Loss()],
    ['nll', nn.NLLLoss()],
    ['kldiv', nn.KLDivLoss()],
    ['mse', nn.MSELoss()],
    ['bce', nn.BCELoss()],
    ['bce_with_logits', nn.BCEWithLogitsLoss()],
    ['cosine_embedding', nn.CosineEmbeddingLoss()],
    ['ctc', nn.CTCLoss()],
    ['hinge_embedding', nn.HingeEmbeddingLoss()],
    ['margin_ranking', nn.MarginRankingLoss()],
    ['multi_label_margin', nn.MultiLabelMarginLoss()],
    ['multi_label_soft_margin', nn.MultiLabelSoftMarginLoss()],
    ['multi_margin', nn.MultiMarginLoss()],
    ['smooth_l1', nn.SmoothL1Loss()],
    ['soft_margin', nn.SoftMarginLoss()],
    ['cross_entropy', nn.CrossEntropyLoss()],
    ['triplet_margin', nn.TripletMarginLoss()],
    ['poisson_nll', nn.PoissonNLLLoss()]
])
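Only the signature and the start of the docstring of _parse survive below; a plausible sketch of how such a helper could resolve a string key against the loss ModuleDict above (this is an assumption about the intent, not the repository's actual implementation):

import torch.nn as nn

def parse_loss_sketch(identifier, dictionary: nn.ModuleDict) -> nn.Module:
    # Accept an nn.Module instance, an nn.Module subclass, or a string key.
    if isinstance(identifier, nn.Module):
        return identifier
    if isinstance(identifier, type) and issubclass(identifier, nn.Module):
        return identifier()
    if isinstance(identifier, str) and identifier in dictionary:
        return dictionary[identifier]
    raise ValueError('Could not parse loss identifier: {!r}'.format(identifier))

# e.g. parse_loss_sketch('multi_margin', loss) returns the nn.MultiMarginLoss() entry.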


def _parse(
    identifier: typing.Union[str, typing.Type[nn.Module], nn.Module],
    dictionary: nn.ModuleDict,
    target: str
) -> nn.Module:
    """
    Parse loss and activation.
    """
Example #3
    def fine_tune(self, X, Y):
        self.log("==============================================================")
        self.log("Supervised learning with input " + str(X.shape))
        self.log("batch_size = " + str(self.batch_size))
        self.log("num_epochs = " + str(self.num_epochs))
        self.log("init_lr = " + str(self.init_lr))
        self.log("l2_regu_weight_decay = " + str(self.l2_regu_weight_decay))
        self.log("lr_schedule_step_size = " + str(self.lr_schedule_step_size))
        self.log("lr_schedule_gamma = " + str(self.lr_schedule_gamma))
        self.log("use_class_weights = " + str(self.use_class_weights))
        self.log("is_regr = " + str(self.is_regr))
        self.log("--------------------------------------------------------------")
        start_time = datetime.now()
        
        # Loss function
        if self.is_regr:
            criterion = nn.MSELoss()
            #criterion = nn.SmoothL1Loss()
            if self.use_class_weights:
                self.log("Regression will ignore class weights")
        else:
            #criterion = nn.CrossEntropyLoss()
            criterion = nn.MultiMarginLoss()
            # Compute the weight of each class (because the dataset is imbalanced)
            if self.use_class_weights:
                class_weights = float(X.shape[0]) / (output_size * np.bincount(Y.squeeze()))
                class_weights = torch.FloatTensor(class_weights)
                if self.use_cuda: class_weights = class_weights.cuda()
                criterion = nn.CrossEntropyLoss(weight=class_weights)

        # Optimizer
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=self.init_lr, weight_decay=self.l2_regu_weight_decay)

        # Learning rate scheduler
        rule = lambda epoch: self.lr_schedule_gamma ** (epoch // self.lr_schedule_step_size)
        scheduler = LambdaLR(optimizer, lr_lambda=[rule])

        # Save original training data
        self.train = {"X": deepcopy(X), "Y": deepcopy(Y)}

        # Break data into batches
        num_of_left_overs = self.batch_size - (X.shape[0] % self.batch_size)
        X = np.append(X, X[0:num_of_left_overs], 0)
        Y = np.append(Y, Y[0:num_of_left_overs], 0)
        num_of_batches = X.shape[0] // self.batch_size
        X = np.split(X, num_of_batches, 0)
        Y = np.split(Y, num_of_batches, 0)
        
        # Train the Model
        for epoch in range(1, self.num_epochs+1):
            X, Y = shuffle(X, Y) # shuffle batches
            loss_all = [] # for saving the loss in each step
            scheduler.step() # adjust learning rate
            # Loop through all batches
            for x, y in zip(X, Y):
                x = torch.FloatTensor(x)
                if self.is_regr:
                    y = torch.FloatTensor(y)
                else:
                    y = torch.LongTensor(y)
                if self.use_cuda:
                    x, y = x.cuda(), y.cuda()
                x, y = Variable(x), Variable(y)
                optimizer.zero_grad() # reset gradient
                outputs = self.model(x) # forward propagation
                loss = criterion(outputs, y) # compute loss
                loss.backward() # backward propagation
                optimizer.step() # optimize
                loss_all.append(loss.data[0]) # save loss for each step
            # Print the result for the entire epoch
            T_tr, P_tr = self.train["Y"], self.predict(self.train["X"])
            m_train = computeMetric(T_tr, P_tr, self.is_regr, flatten=True, simple=True, aggr_axis=True)
            if self.test is not None:
                T_te, P_te = self.test["Y"], self.predict(self.test["X"])
                m_test = computeMetric(T_te, P_te, self.is_regr, flatten=True, simple=True, aggr_axis=True)
            lr_now = optimizer.state_dict()["param_groups"][0]["lr"]
            avg_loss = np.mean(loss_all)
            if self.is_regr:
                if self.test is not None:
                    self.log('[%2d/%d], LR: %.8f, Loss: %.8f, [mse, r2], [%2f, %2f], [%2f, %2f]'
                        %(epoch, self.num_epochs, lr_now, avg_loss, m_train["mse"], m_train["r2"],
                        m_test["mse"], m_test["r2"]))
                else:
                    self.log('[%2d/%d], LR: %.8f, Loss: %.8f, [mse, r2], [%2f, %2f]'
                        %(epoch, self.num_epochs, lr_now, avg_loss, m_train["mse"], m_train["r2"]))
            else:
                cm_names = " ".join(m_train["cm"][0])
                cm_train = " ".join(map(lambda x: '%5d'%(x), m_train["cm"][1]))
                if self.test is not None:
                    cm_test = " ".join(map(lambda x: '%4d'%(x), m_test["cm"][1]))
                    self.log('[%2d/%d], LR: %.8f, Loss: %.8f, [%s], [%s], [%s]'
                        %(epoch, self.num_epochs, lr_now, avg_loss, cm_names, cm_train, cm_test))
                else:
                    self.log('[%2d/%d], LR: %.9f, Loss: %.9f, [%s], [%s]'
                        %(epoch, self.num_epochs, lr_now, avg_loss, cm_names, cm_train))

        self.log("--------------------------------------------------------------")
        self.log("From " + str(start_time) + " to " + str(datetime.now()))
        self.log("--------------------------------------------------------------")
        return self
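The class-weight branch above uses the common inverse-frequency heuristic n_samples / (n_classes * count_c); output_size is assumed to be the number of classes defined elsewhere in the class. A small standalone sketch of the same computation:

import numpy as np
import torch
import torch.nn as nn

Y = np.array([0, 0, 0, 1, 2, 2])                       # imbalanced labels
n_classes = 3                                           # plays the role of output_size
counts = np.bincount(Y, minlength=n_classes)            # [3, 1, 2]
class_weights = float(len(Y)) / (n_classes * counts)    # rarer classes get larger weights

criterion = nn.CrossEntropyLoss(weight=torch.FloatTensor(class_weights))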
Example #4
def train():
    model = VGG11()
    if use_cuda:
        torch.cuda.set_device(gpu_id)
        model = model.cuda()

    data_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        # transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    train_loader, valid_loader = make_train_data_loader(train_data_path, data_transform)
    print('trainset len:', len(train_loader.dataset))
    print('train loader len:', len(train_loader))
    print('valid loader len:', len(valid_loader))
    print('=========================================')
    # optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
    optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001, momentum=0.9)
    SVMloss = nn.MultiMarginLoss()

    train_loss_list = []
    valid_loss_list = []
    train_accuracy_list =[]
    best_accuracy = 0.0
    for epoch in range(num_epochs):
        print(f'Epoch: {epoch + 1}/{num_epochs}')
        print('-' * len(f'Epoch: {epoch + 1}/{num_epochs}'))

        train_loss = 0.0
        valid_loss = 0.0
        training_accuracy = 0.0
        predict_correct = 0
        for data, label in train_loader:
            if use_cuda:
                data, label = data.cuda(), label.cuda()
            optimizer.zero_grad()
            output = model(data)
            _, prediction = torch.max(output.data, 1)
            loss = SVMloss(output, label)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * data.size(0)
            predict_correct += torch.sum(prediction == label.data)

        model.eval()
        for data, label in valid_loader:
            if use_cuda:
                data, label = data.cuda(), label.cuda()
            output = model(data)
            loss = SVMloss(output, label)
            valid_loss += loss.item() * data.size(0)

        train_loss = train_loss / float(np.floor(len(train_loader.dataset) * (1 - valid_size)))
        train_loss_list.append(train_loss)
        valid_loss = valid_loss / float(np.floor(len(valid_loader.dataset) * valid_size))
        valid_loss_list.append(valid_loss)

        training_accuracy = float(predict_correct) / float(len(train_loader.dataset))
        train_accuracy_list.append(training_accuracy)
        print(f'Training loss: {train_loss:.4f}\nValidation loss: {valid_loss:.4f}\nAccuracy: {training_accuracy:.4f}')

        if training_accuracy > best_accuracy:
            best_accuracy = training_accuracy
            # torch.save(model.state_dict(), weight_path)
            best_weight = copy.deepcopy(model.state_dict())
            print(f'Best accuracy update, current best weights saved')
            # print(f'best accuracy update: {best_accuracy:.4f}, current best weights saved')
        print('\n')

    # model.load_state_dict(best_weight)
    torch.save(best_weight, weight_path)
    print(f'best weight saved at {weight_path}')

    x1 = range(0,len(train_accuracy_list))
    x2 = range(0,len(train_loss_list))
    x3 = range(0,len(valid_loss_list))
    y1 = train_accuracy_list
    y2 = train_loss_list
    y3 = valid_loss_list
    plt.subplots_adjust(left = 0.1, bottom = 0.2, right = 0.9, top = 0.9, wspace = 0.1, hspace = 0.9)
    plt.subplot(2, 1, 1)
    plt.plot(x1, y1, 'm', linestyle='-', label='Training accuracy')
    plt.xlabel(u'epochs')
    plt.ylabel(u'Accuracy')
    plt.xlim(0,len(train_accuracy_list))
    plt.title('Train accuracy vs. epochs')
    plt.grid('on')

    plt.subplot(2, 1, 2)
    plt.plot(x2, y2, 'c', linestyle='-', label='train loss')
    plt.plot(x3, y3, 'y', linestyle='-', label='valid loss')
    plt.xlabel(u'epochs')
    plt.ylabel(u'Loss')
    plt.xlim(0,len(train_accuracy_list))
    plt.title('Train loss vs. valid loss')
    plt.grid('on')
    plt.legend(loc=1)

    plt.savefig("accuracy_loss.png")
    plt.show()
Example #5
        h = self.dropout(h)  # (batch_size * hidden_size)
        return h


#%%

hidden_size = 100
learning_rate = 1e-3  #another option is 3e-4
wd = 0
n_filter = 4
n_negative = 20  # number of negative questions
#dropout = 0.1 # two more options 0.2 and 0.3

cnnmodel = CNN(hidden_size, n_filter)

lossfunction = nn.MultiMarginLoss(p=1, margin=0.2)

optimizer = optim.Adam(cnnmodel.parameters(),
                       lr=learning_rate,
                       weight_decay=wd)

#%%
import sys


def cnntrain(query_embedding, positive_embedding, negative_embedding):
    n_batch = len(query_embedding)
    optimizer.zero_grad()
    similarity_matrix = Variable(torch.zeros(n_batch, n_negative + 1))

    #query_vec = cnnmodel(Variable(torch.FloatTensor(query_embedding)))
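cnntrain is cut off above; in retrieval setups like this one the usual convention (assumed here, not visible in the truncated code) is that column 0 of similarity_matrix holds the positive question, so the MultiMarginLoss target is simply a vector of zeros:

import torch
import torch.nn as nn

n_batch, n_negative = 8, 20
# Column 0 = positive candidate, columns 1..n_negative = negatives.
similarity_matrix = torch.randn(n_batch, n_negative + 1, requires_grad=True)

targets = torch.zeros(n_batch, dtype=torch.long)         # true class is always index 0
loss = nn.MultiMarginLoss(p=1, margin=0.2)(similarity_matrix, targets)
loss.backward()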
Example #6
def main():
    global args, best_auc
    args = parser.parse_args()
    cuda_available = torch.cuda.is_available()
    print args

    embedding_file = 'data/glove/glove.pruned.txt.gz'
    embedding_iter = Embedding.iterator(embedding_file)
    embed_size = 300
    embedding = Embedding(embed_size, embedding_iter)
    print 'Embeddings loaded.'

    android_corpus_file = 'data/android/corpus.tsv.gz'
    android_dataset = AndroidDataset(android_corpus_file)
    android_corpus = android_dataset.get_corpus()
    android_ids = embedding.corpus_to_ids(android_corpus)
    print 'Got Android corpus ids.'

    ubuntu_corpus_file = 'data/askubuntu/text_tokenized.txt.gz'
    ubuntu_dataset = UbuntuDataset(ubuntu_corpus_file)
    ubuntu_corpus = ubuntu_dataset.get_corpus()
    ubuntu_ids = embedding.corpus_to_ids(ubuntu_corpus)
    print 'Got AskUbuntu corpus ids.'

    padding_id = embedding.vocab_ids['<padding>']

    ubuntu_train_file = 'data/askubuntu/train_random.txt'
    ubuntu_train_data = ubuntu_dataset.read_annotations(ubuntu_train_file)

    dev_pos_file = 'data/android/dev.pos.txt'
    dev_neg_file = 'data/android/dev.neg.txt'
    android_dev_data = android_dataset.read_annotations(
        dev_pos_file, dev_neg_file)

    android_dev_batches = batch_utils.generate_eval_batches(
        android_ids, android_dev_data, padding_id)

    assert args.model in ['lstm', 'cnn']
    if args.model == 'lstm':
        model_encoder = LSTM(embed_size, args.hidden)
    else:
        model_encoder = CNN(embed_size, args.hidden)
    model_classifier = FFN(args.hidden)
    print model_encoder
    print model_classifier

    optimizer_encoder = torch.optim.Adam(model_encoder.parameters(),
                                         lr=args.elr)
    criterion_encoder = nn.MultiMarginLoss(margin=args.margin)

    optimizer_classifier = torch.optim.Adam(model_classifier.parameters(),
                                            lr=args.clr)
    criterion_classifier = nn.CrossEntropyLoss()

    if cuda_available:
        criterion_encoder = criterion_encoder.cuda()
        criterion_classifier = criterion_classifier.cuda()

    if args.load:
        if os.path.isfile(args.load):
            print 'Loading checkpoint.'
            checkpoint = torch.load(args.load)
            args.start_epoch = checkpoint['epoch']
            best_auc = checkpoint.get('best_auc', -1)
            model_encoder.load_state_dict(checkpoint['encoder_state_dict'])
            model_classifier.load_state_dict(
                checkpoint['classifier_state_dict'])

            print 'Loaded checkpoint at epoch {}.'.format(checkpoint['epoch'])
        else:
            print 'No checkpoint found here.'

    if args.eval:
        test_pos_file = 'data/android/test.pos.txt'
        test_neg_file = 'data/android/test.neg.txt'
        android_test_data = android_dataset.read_annotations(
            test_pos_file, test_neg_file)

        android_test_batches = batch_utils.generate_eval_batches(
            android_ids, android_test_data, padding_id)

        print 'Evaluating on dev set.'
        train_utils.evaluate_auc(args, model_encoder, embedding,
                                 android_dev_batches, padding_id)

        print 'Evaluating on test set.'
        train_utils.evaluate_auc(args, model_encoder, embedding,
                                 android_test_batches, padding_id)
        return

    for epoch in xrange(args.start_epoch, args.epochs):
        encoder_train_batches = batch_utils.generate_train_batches(
            ubuntu_ids, ubuntu_train_data, args.batch_size, padding_id)
        classifier_train_batches = \
            batch_utils.generate_classifier_train_batches(
                ubuntu_ids, android_ids, args.batch_size,
                len(encoder_train_batches), padding_id)

        train_utils.train_encoder_classifer(
            args, model_encoder, model_classifier, embedding,
            optimizer_encoder, optimizer_classifier, criterion_encoder,
            criterion_classifier,
            zip(encoder_train_batches,
                classifier_train_batches), padding_id, epoch, args.lmbda)

        auc = train_utils.evaluate_auc(args, model_encoder, embedding,
                                       android_dev_batches, padding_id)

        is_best = auc > best_auc
        best_auc = max(auc, best_auc)
        save(
            args, {
                'epoch': epoch + 1,
                'arch': 'lstm',
                'encoder_state_dict': model_encoder.state_dict(),
                'classifier_state_dict': model_classifier.state_dict(),
                'best_auc': best_auc,
            }, is_best)
Example #7
def train_model(lambda_val, embedding_size, hidden_size, filter_width,
                max_or_mean, max_num_epochs, batch_size, learning_rate_1,
                learning_rate_2, loss_margin, training_checkpoint,
                dropout_prob, eval_batch_size):
    global load_model_path, train_data_ubuntu_1, train_data_ubuntu_2, train_data_android_2, source_questions
    global dev_pos_data, dev_neg_data, test_pos_data, test_neg_data, target_questions
    global dev_data, dev_label_dict, test_data, test_label_dict, opt_mrr, opt_model_params

    # Generate model
    cnn = CNN(embedding_size, hidden_size, filter_width, max_or_mean,
              dropout_prob)
    optimizer_1 = optim.Adam(cnn.parameters(), lr=learning_rate_1)
    criterion_1 = nn.MultiMarginLoss(margin=loss_margin)
    ffn = FFN(hidden_size)
    optimizer_2 = optim.Adam(ffn.parameters(), lr=learning_rate_2)
    criterion_2 = nn.functional.cross_entropy
    init_epoch = 1

    # Training
    print("***************************************")
    print("Starting run with following parameters:")
    print(" --lambda:           %f" % (lambda_val))
    print(" --embedding size:   %d" % (cnn.input_size))
    print(" --hidden size:      %d" % (cnn.hidden_size))
    print(" --filter width:     %d" % (cnn.n))
    print(" --dropout:          %f" % (cnn.dropout_prob))
    print(" --pooling:          %s" % (cnn.max_or_mean))
    print(" --initial epoch:    %d" % (init_epoch))
    print(" --number of epochs: %d" % (max_num_epochs))
    print(" --batch size:       %d" % (batch_size))
    print(" --learning rate 1:  %f" % (learning_rate_1))
    print(" --learning rate 2:  %f" % (learning_rate_2))
    print(" --loss margin:      %f" % (loss_margin))

    start = time.time()
    current_loss = 0

    for iter in range(init_epoch, max_num_epochs + 1):
        current_loss += train(cnn, ffn, criterion_1, criterion_2, optimizer_1,
                              optimizer_2, train_data_ubuntu_1,
                              (train_data_ubuntu_2, train_data_android_2),
                              (source_questions, target_questions), batch_size,
                              lambda_val)
        if iter % training_checkpoint == 0:
            print("Epoch %d: Average Train Loss: %.5f, Time: %s" %
                  (iter,
                   (current_loss / training_checkpoint), timeSince(start)))
            d_auc = evaluate_auc(cnn, dev_pos_data, dev_neg_data,
                                 target_questions, eval_batch_size)
            t_auc = evaluate_auc(cnn, test_pos_data, test_neg_data,
                                 target_questions, eval_batch_size)
            print("Dev AUC(0.05): %.2f" % (d_auc))
            print("Test AUC(0.05): %.2f" % (t_auc))
            current_loss = 0

    # Compute final results
    print("-------")
    print("FINAL RESULTS:")
    d_auc = evaluate_auc(cnn, dev_pos_data, dev_neg_data, target_questions,
                         eval_batch_size)
    t_auc = evaluate_auc(cnn, test_pos_data, test_neg_data, target_questions,
                         eval_batch_size)
    print("Training time: %s" % (timeSince(start)))
    print("Dev AUC(0.05): %.2f" % (d_auc))
    print("Test AUC(0.05): %.2f" % (t_auc))

    return (d_auc, t_auc)
Example #8
def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top3 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        #print type(target.float())
        if 'L1' in args.arch or args.L1 == 1 or args.labelboost > 1e-6 or args.focal > 0:
            targetTensor = np.zeros((input.size()[0], args.nclass))
            for j in range(input.size()[0]):
                targetTensor[j, target[j]] = 1.0
            #targetTensor = targetTensor[:input.size[0],:input.size[1]]
            targetTensor = torch.FloatTensor(targetTensor)
            targetTensor = targetTensor.cuda(async=True)
            target = target.cuda(async=True)
            target_var = torch.autograd.Variable(targetTensor)

        elif args.labelnocompete > 0:
            targetTensor = np.concatenate([ np.zeros((input.size()[0], args.nclass)) ,\
                                           np.ones((input.size()[0], args.nclass))], \
                                          axis=1)
            for j in range(input.size()[0]):
                targetTensor[j, target[j]] = 1.0
                targetTensor[j, target[j] + args.nclass] = 0.0
            targetTensor = torch.FloatTensor(targetTensor)
            targetTensor = targetTensor.cuda(async=True)
            target = target.cuda(async=True)
            target_var = torch.autograd.Variable(targetTensor)

        elif args.labelsm:
            targetTensor = np.zeros((input.size()[0], args.nclass))
            for j in range(input.size()[0]):
                targetTensor[j, target[j]] = 1.0
            targetTensor = (targetTensor * current_labelsm(epoch) +
                            (1 - current_labelsm(epoch)) / args.nclass)
            targetTensor = torch.FloatTensor(targetTensor)
            targetTensor = targetTensor.cuda(async=True)
            target = target.cuda(async=True)
            target_var = torch.autograd.Variable(targetTensor)
        else:
            target = target.cuda(async=True)
            target_var = torch.autograd.Variable(target)

        input_var = torch.autograd.Variable(input)

        # compute output
        output = model(input_var)

        if args.labelsm:
            #print input.size(), output.size(), target_var.size()
            output = nn.LogSoftmax()(output)
            #print output.data[0]
            loss = torch.mean(torch.sum(torch.mul(-output, target_var), 1))
        elif args.L1:
            output = nn.Softmax()(output)
            loss = nn.SmoothL1Loss()(output * args.nclass,
                                     target_var * args.nclass)
        elif args.MarginP > 0:
            loss = nn.MultiMarginLoss(p=args.MarginP,
                                      margin=args.MarginV)(output, target_var)
        elif abs(args.labelboost) > 1e-6:
            # Boosted CNN Implementation
            outq = nn.LogSoftmax()(output[:, :args.nclass])
            outp = nn.Softmax()(output[:, :args.nclass])
            #print "outp",(outp - outp[target]).data[0]

            # w = outp[target]#**(-1.0/args.nclass)
            # w = outp[target]
            #print outp.size(), target_var.size()
            #print (outp * target_var).data[0]
            w = (1.0 / args.nclass +
                 torch.sum(outp * target_var, 1))**(-1.0 / args.labelboost)
            w = w / torch.sum(w)

            #w = torch.exp(( - output + outp[target]) * (-0.5))
            #print "w",w.data[0]
            #print target_var.size(), (1 - torch.sum(w,1)).expand(input.size()[0], args.nclass).size()
            # w1 = w + torch.mul(target_var , ( - torch.sum(w,1) ).expand(input.size()[0], args.nclass)  )
            #print w1.data[0]
            #print torch.sum( torch.mul( -outq , w ) , 1 ).size()
            #print outq.size()

            #loss = torch.mean( torch.sum( torch.mul( -outq , (target_var + outp*args.labelboost)/(1.0 + args.labelboost) ) , 1 ))
            #loss = torch.mean( torch.sum( w , 1 ) )

            loss = torch.sum(
                torch.mul(w, torch.sum(torch.mul(-outq, target_var), 1)))
        elif args.focal > 0:

            outq = nn.LogSoftmax()(output[:, :args.nclass])
            outp = nn.Softmax()(output[:, :args.nclass])
            OneMinusPToGamma = (1.0 - torch.sum(outp * target_var, 1))**2
            LogP = torch.sum(-outq * target_var, 1)
            loss = torch.mean(torch.mul(OneMinusPToGamma, LogP))

        elif args.labelnocompete > 0:
            '''
            isout =  output[:,:args.nclass]
            notout = output[:,args.nclass:args.nclass*2]
            islabel = target_var[:,:args.nclass]
            notlabel = target_var[:,args.nclass:args.nclass*2]
            outdiv = torch.log(torch.exp(isout)+torch.exp(notout))
            isoutq = outdiv - isout
            notoutq = outdiv - notout
            loss = torch.mean(torch.sum(islabel * isoutq + notlabel * notoutq, 1))
            '''

            outq = nn.LogSoftmax()(torch.cat(
                [output[:, :args.nclass], -output[:, :args.nclass]], 1))
            loss = torch.mean(
                torch.sum((-outq * target_var)[:, :args.nclass], 1))
            """
            outp = nn.Softmax()(output)
            #print outq.size(),outp.size(),target_var.size()
            loss = torch.mean(\
                              torch.sum(\
                                torch.mul(-outq,  target_var * (1.0 + args.labelboost) - outp * (args.labelboost))
                                        ,1)\
                             )
                             """
        else:
            loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec3, prec5 = accuracy(output.data[:, :args.nclass],
                                       target,
                                       topk=(1, 3, 5))
        losses.update(loss.data[0], input.size(0))
        top1.update(prec1[0], input.size(0))
        top3.update(prec3[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print(
                'Epoch: [{0}][{1}/{2}]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                'Prec@3 {top3.val:.3f} ({top3.avg:.3f})\t'
                'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                    epoch,
                    i,
                    len(train_loader),
                    batch_time=batch_time,
                    data_time=data_time,
                    loss=losses,
                    top1=top1,
                    top3=top3,
                    top5=top5))
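Note that the args.MarginP branch above only works because the final else in the target setup leaves target_var as integer class indices; the one-hot targetTensor built for the L1 / labelboost / focal branches is not a valid MultiMarginLoss target. A short sketch of the two formats with made-up shapes:

import torch
import torch.nn as nn

nclass = 5
output = torch.randn(4, nclass, requires_grad=True)

target_indices = torch.tensor([1, 0, 3, 2])                          # what MultiMarginLoss expects
one_hot = torch.zeros(4, nclass).scatter_(1, target_indices.unsqueeze(1), 1.0)

loss = nn.MultiMarginLoss(p=1, margin=1.0)(output, target_indices)   # valid
# nn.MultiMarginLoss()(output, one_hot) is not valid: targets must be 1-D class indices.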
Example #9
def validate(val_loader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top3 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (input, target) in enumerate(val_loader):

        if 'L1' in args.arch or args.L1 == 1 or args.labelboost > 1e-6:
            targetTensor = np.zeros((input.size()[0], args.nclass))
            for j in range(input.size()[0]):
                targetTensor[j, target[j]] = 1.0
            #targetTensor = targetTensor[:input.size[0],:input.size[1]]
            targetTensor = torch.FloatTensor(targetTensor)
            targetTensor = targetTensor.cuda(async=True)
            target = target.cuda(async=True)
            target_var = torch.autograd.Variable(targetTensor)
        else:
            target = target.cuda(async=True)
            target_var = torch.autograd.Variable(target, volatile=True)

        input_var = torch.autograd.Variable(input, volatile=True)

        # compute output
        output = model(input_var)
        if args.L1:
            output = nn.Softmax()(output)
            loss = nn.SmoothL1Loss()(output * args.nclass,
                                     target_var * args.nclass)
        elif args.MarginP > 0:
            loss = nn.MultiMarginLoss(p=args.MarginP,
                                      margin=args.MarginV)(output, target_var)

        elif abs(args.labelboost) > 1e-6:
            outq = nn.LogSoftmax()(output[:, :args.nclass])
            outp = nn.Softmax()(output[:, :args.nclass])
            #print "outp",(outp - outp[target]).data[0]
            #w = torch.exp(( - output + outp[target]) * (-0.5))
            #print "w",w.data[0]
            #print target_var.size(), (1 - torch.sum(w,1)).expand(input.size()[0], args.nclass).size()
            # w1 = w + torch.mul(target_var , ( - torch.sum(w,1) ).expand(input.size()[0], args.nclass)  )
            #print w1.data[0]
            #print torch.sum( torch.mul( -outq , w ) , 1 ).size()

            loss = torch.mean(
                torch.sum(
                    torch.mul(-outq, (target_var + outp * args.labelboost) /
                              (1.0 + args.labelboost)), 1))
            #loss = torch.mean( torch.sum( w , 1 ) )
            """
            outp = nn.Softmax()(output)
            #print outq.size(),outp.size(),target_var.size()
            loss = torch.mean(\
                              torch.sum(\
                                torch.mul(-outq,  target_var * (1.0 + args.labelboost) - outp * (args.labelboost))
                                        ,1)\
                             )
                             """
        else:
            loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec3, prec5 = accuracy(output.data[:, :args.nclass],
                                       target,
                                       topk=(1, 3, 5))
        losses.update(loss.data[0], input.size(0))
        top1.update(prec1[0], input.size(0))
        top3.update(prec3[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print(
                'Test: [{0}/{1}]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                'Prec@3 {top3.val:.3f} ({top3.avg:.3f})\t'
                'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                    i,
                    len(val_loader),
                    batch_time=batch_time,
                    loss=losses,
                    top1=top1,
                    top3=top3,
                    top5=top5))

    print(
        ' * Prec@1 {top1.avg:.3f} Prec@3 {top3.avg:.3f} Prec@5 {top5.avg:.3f}'.
        format(top1=top1, top3=top3, top5=top5))

    return top1.avg
Example #10
def main(args):
    #torch.manual_seed(123)
    EMBEDDING_DIM = 200
    HIDDEN_DIM = 250
    num_epochs = 20
    task = args.task
    granularity = args.granularity
    dict = {}
    dict_char_ngram = {}
    word_freq = {}
    fake_dict = {}
    oov = []
    feature_maps = [50, 100, 150, 200, 200, 200, 200]
    kernels = [1, 2, 3, 4, 5, 6, 7]
    charcnn_embedding_size = 15
    max_word_length = 20
    c2w_mode = False
    character_ngrams = 3
    character_ngrams_2 = None
    character_ngrams_overlap = False
    glove_mode = None
    update_inv_mode = None
    update_oov_mode = None
    combine_mode = None
    lm_mode = None
    word_mode = (glove_mode, update_inv_mode, update_oov_mode)

    if torch.cuda.is_available():
        basepath = expanduser("~") + '/pytorch/DeepPairWiseWord'
    else:
        basepath = expanduser(
            "~") + '/Documents/research/pytorch/DeepPairWiseWord'

    if task == 'url':
        num_class = 2
        trainset = readURLdata(basepath + '/data/url/train/', granularity)
        testset = readURLdata(basepath + '/data/url/test_9324/', granularity)
    elif task == 'quora':
        num_class = 2
        trainset = readURLdata(basepath + '/data/quora/train/', granularity)
        testset = readURLdata(basepath + '/data/quora/test/', granularity)
    elif task == 'msrp':
        num_class = 2
        trainset = readURLdata(basepath + '/data/msrp/train/', granularity)
        testset = readURLdata(basepath + '/data/msrp/test/', granularity)
    elif task == 'sick':
        num_class = 5
        trainset = readSICKdata(basepath + '/data/sick/train/', granularity)
        devset = readSICKdata(basepath + '/data/sick/dev/', granularity)
        testset = readSICKdata(basepath + '/data/sick/test/', granularity)
    elif task == 'pit':
        num_class = 2
        trainset = readPITdata(basepath + '/data/pit/train/', granularity)
        #devset = readPITdata(basepath+'/data/pit/dev/',granularity)
        testset = readPITdata(basepath + '/data/pit/test/', granularity)
    elif task == 'hindi':
        num_class = 2
        trainset = read_Hindi_data(basepath + '/data/hindi/train/',
                                   granularity)
        testset = read_Hindi_data(basepath + '/data/hindi/test/', granularity)
    elif task == 'sts':
        num_class = 6
        trainset = readSTSdata(basepath + '/data/sts/train/', granularity)
        testset = readSTSdata(basepath + '/data/sts/test/', granularity)
    elif task == 'snli':
        num_class = 3
        trainset = readSNLIdata(basepath + '/data/snli/train/', granularity)
        testset = readSNLIdata(basepath + '/data/snli/test/', granularity)
    elif task == 'mnli':
        num_class = 3
        trainset = readMNLIdata(basepath + '/data/mnli/train/', granularity)
        devset_m = readMNLIdata(basepath + '/data/mnli/dev_m/', granularity)
        devset_um = readMNLIdata(basepath + '/data/mnli/dev_um/', granularity)
        testset_m = readMNLIdata(basepath + '/data/mnli/test_m/', granularity)
        testset_um = readMNLIdata(basepath + '/data/mnli/test_um/',
                                  granularity)
    elif task == 'wiki':
        '''
		_name_to_id = {
        'counter-vandalism': 0,
        'fact-update': 1,
        'refactoring': 2,
        'copy-editing': 3,
        'other': 4,
        'wikification': 5,
        'vandalism': 6,
        'simplification': 7,
        'elaboration': 8,
        'verifiability': 9,
        'process': 10,
        'clarification': 11,
        'disambiguation': 12,
        'point-of-view': 13
    }
		'''
        num_class = 14
        data = pickle.load(open(basepath + "/data/wiki/data.cpickle", "rb"))
        left = []
        right = []
        label = []
        id = []
        for i in range(2976):
            id.append(data[i][0])
            label.append([int(item) for item in data[i][3][0]])
            left_sent = [item.encode('utf-8') for item in data[i][1][0]]
            right_sent = [item.encode('utf-8') for item in data[i][2][0]]
            shared = []
            for item in left_sent:
                if item in right_sent:
                    shared.append(item)
            for item in shared:
                if item in left_sent and item in right_sent:
                    left_sent.remove(item)
                    right_sent.remove(item)
            if len(left_sent) == 0:
                left_sent = ['<EMPTY-EDIT>']
            if len(right_sent) == 0:
                right_sent = ['<EMPTY-EDIT>']
            left.append(left_sent)
            right.append(right_sent)
            #print(left_sent)
            #print(right_sent)
            #print(id[0])
            #print('*'*20)
        trainset = (left, right, label)
        #sys.exit()
        left = []
        right = []
        label = []
        for i in range(2376, 2976):
            id.append(data[i][0])
            label.append([int(item) for item in data[i][3][0]])
            left_sent = [item.encode('utf-8') for item in data[i][1][0]]
            right_sent = [item.encode('utf-8') for item in data[i][2][0]]
            shared = []
            for item in left_sent:
                if item in right_sent:
                    shared.append(item)
            for item in shared:
                if item in left_sent and item in right_sent:
                    left_sent.remove(item)
                    right_sent.remove(item)
            if len(left_sent) == 0:
                left_sent = ['<EMPTY-EDIT>']
            if len(right_sent) == 0:
                right_sent = ['<EMPTY-EDIT>']
            left.append(left_sent)
            right.append(right_sent)
        testset = (left, right, label)
    elif task == 'wikiqa':
        num_class = 2
        trainset = readURLdata(basepath + '/data/wikiqa/train/', granularity)
        testset = readURLdata(basepath + '/data/wikiqa/test/', granularity)
    elif task == 'trecqa':
        num_class = 2
        trainset = readURLdata(basepath + '/data/trecqa/train-all/',
                               granularity)
        testset = readURLdata(basepath + '/data/trecqa/raw-test/', granularity)
    else:
        print('wrong input for the first argument!')
        sys.exit()

    if granularity == 'word':
        tokens = []
        count = 0
        num_inv = 0
        num_oov = 0
        glove_mode = True
        update_inv_mode = True
        update_oov_mode = True
        word_mode = (glove_mode, update_inv_mode, update_oov_mode)
        if task == 'sick' or task == 'quora' or task == 'msrp':
            for line in open(basepath + '/data/' + task + '/vocab.txt'):
                tokens.append(line.strip())
            dict = {}
            EMBEDDING_DIM = 300
            #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B', EMBEDDING_DIM)
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/VDPWI-NN-Torch/data/glove', 'glove.840B',
                EMBEDDING_DIM)
            #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/data/paragram/paragram_300_sl999/', 'paragram', EMBEDDING_DIM)
            #wv_dict={}
            #wv_arr={}
            for word in tokens:
                fake_dict[word] = torch.Tensor([
                    random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM)
                ])
                try:
                    dict[word] = wv_arr[wv_dict[word]]
                    num_inv += 1
                except:
                    num_oov += 1
                    #print(word)
                    oov.append(word)
                    dict[word] = torch.Tensor([
                        random.uniform(-0.05, 0.05)
                        for i in range(EMBEDDING_DIM)
                    ])
        elif task == 'sts':
            for line in open(basepath + '/data/' + task + '/vocab.txt'):
                tokens.append(line.strip())
            dict = {}
            #EMBEDDING_DIM = 200
            #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B', EMBEDDING_DIM)
            #EMBEDDING_DIM = 300
            #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/VDPWI-NN-Torch/data/glove', 'glove.840B', EMBEDDING_DIM)
            EMBEDDING_DIM = 300
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/data/paragram/paragram_300_sl999/', 'paragram',
                EMBEDDING_DIM)
            #wv_dict={}
            #wv_arr={}
            #oov = []
            #for line in open(basepath + '/data/' + task + '/oov.txt'):
            #	line = line.strip()
            #	oov.append(line)
            #inv = []
            #for line in open(basepath + '/data/' + task + '/inv_14000.txt'):
            #	line = line.strip()
            #	inv.append(line)
            # count=len(oov)+len(inv)
            #inv = tokens
            num_oov = 0
            num_inv = 0
            for word in tokens:
                fake_dict[word] = torch.Tensor([
                    random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM)
                ])
                try:
                    dict[word] = wv_arr[wv_dict[word]]
                    num_inv += 1
                except:
                    num_oov += 1
                    oov.append(word)
                    dict[word] = torch.Tensor([
                        random.uniform(-0.05, 0.05)
                        for i in range(EMBEDDING_DIM)
                    ])
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/char_dict.p', "rb"))
            word_freq = pickle.load(
                open(basepath + '/data/' + task + '/word_freq.p', "rb"))
        elif task == 'snli' or task == 'wikiqa' or task == 'trecqa' or task == 'mnli':
            for line in open(basepath + '/data/' + task + '/vocab.txt'):
                tokens.append(line.strip())
            dict = {}
            #EMBEDDING_DIM = 200
            #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B', EMBEDDING_DIM)
            EMBEDDING_DIM = 300
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/VDPWI-NN-Torch/data/glove', 'glove.840B',
                EMBEDDING_DIM)
            #EMBEDDING_DIM = 300
            #wv_dict, wv_arr, wv_size = load_word_vectors(basepath+'/data/paragram/paragram_300_sl999/', 'paragram', EMBEDDING_DIM)
            num_oov = 0
            num_inv = 0
            for word in tokens:
                fake_dict[word] = torch.Tensor([
                    random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM)
                ])
                try:
                    dict[word] = wv_arr[wv_dict[word]]
                    num_inv += 1
                except:
                    num_oov += 1
                    oov.append(word)
                    dict[word] = torch.Tensor([
                        random.uniform(-0.05, 0.05)
                        for i in range(EMBEDDING_DIM)
                    ])
            #dict_char_ngram = pickle.load(open(basepath + '/data/' + task + '/char_dict.p', "rb"))
            #word_freq = pickle.load(open(basepath + '/data/' + task + '/word_freq.p', "rb"))
            dict_char_ngram = {}
            word_freq = {}
        elif task == 'hindi':
            #words, embeddings = pickle.load(open(basepath+'/data/hindi/polyglot-hi.pkl', 'rb'))
            #print("Emebddings shape is {}".format(embeddings.shape))
            #print words[777], embeddings[777]
            embeddings_file_bin = basepath + '/data/hindi/hi/hi.bin'
            model_bin = KeyedVectors.load(embeddings_file_bin)
            #print(words[777], model_bin[words[777]])
            #sys.exit()
            for line in open(basepath + '/data/' + task + '/vocab.txt'):
                tokens.append(line.strip().decode('utf-8'))
            dict = {}
            EMBEDDING_DIM = 300
            for word in tokens:
                fake_dict[word] = torch.Tensor([
                    random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM)
                ])
                try:
                    dict[word] = model_bin[word]
                    num_inv += 1
                except:
                    num_oov += 1
                    oov.append(word)
                    dict[word] = torch.Tensor([
                        random.uniform(-0.05, 0.05)
                        for i in range(EMBEDDING_DIM)
                    ])
        elif task == 'url' or task == 'pit':
            for line in open(basepath + '/data/' + task + '/vocab.txt'):
                tokens.append(line.strip())
            # print(len(tokens))
            dict = {}
            EMBEDDING_DIM = 200
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B',
                EMBEDDING_DIM)
            #EMBEDDING_DIM = 300
            #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/data/paragram/paragram_300_sl999/', 'paragram', EMBEDDING_DIM)
            #wv_dict={}
            #wv_arr={}
            # print(len(wv_dict))
            #oov = []
            #for line in open(basepath+'/data/'+task+'/oov.txt'):
            #	line = line.strip()
            #	oov.append(line)
            #inv=[]
            #for line in open(basepath+'/data/'+task+'/inv_4000.txt'):
            #	line = line.strip()
            #	inv.append(line)
            #count=len(oov)+len(inv)
            #inv = tokens
            num_oov = 0
            num_inv = 0
            for word in tokens:
                fake_dict[word] = torch.Tensor([
                    random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM)
                ])
                try:
                    dict[word] = wv_arr[wv_dict[word]]
                    num_inv += 1
                except:
                    num_oov += 1
                    oov.append(word)
                    dict[word] = torch.Tensor([
                        random.uniform(-0.05, 0.05)
                        for i in range(EMBEDDING_DIM)
                    ])
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/char_dict.p', "rb"))
            word_freq = pickle.load(
                open(basepath + '/data/' + task + '/word_freq.p', "rb"))
        print('finished loading word vector, there are ' + str(num_inv) +
              ' INV words and ' + str(num_oov) + ' OOV words.')
        print('current task: ' + task + ', glove mode = ' + str(glove_mode) +
              ', update_inv_mode = ' + str(update_inv_mode) +
              ', update_oov_mode = ' + str(update_oov_mode))
        saved_file = 'current task: ' + task + ', glove mode = ' + str(
            glove_mode) + ', update_inv_mode = ' + str(
                update_inv_mode) + ', update_oov_mode = ' + str(
                    update_oov_mode) + '.txt'
    #subprocess.call(['echo','finished loading word vector, there are ',str(num_inv),' INV words and ',str(len(oov)),' OOV words.'])
    elif granularity == 'char':
        # charcnn parameters
        feature_maps = [50, 100, 150, 200, 200, 200, 200]
        kernels = [1, 2, 3, 4, 5, 6, 7]
        charcnn_embedding_size = 15
        max_word_length = 20

        #c2w parameters
        lm_mode = False
        c2w_mode = False
        character_ngrams = 1
        character_ngrams_overlap = True

        tokens = []
        if task != 'wiki':
            if task == 'hindi':
                for line in open(basepath + '/data/' + task + '/vocab.txt'):
                    tokens.append(line.strip().decode('utf-8'))
                tokens.append('<s>'.decode())
                tokens.append('</s>'.decode())
                tokens.append('oov'.decode())
            else:
                for line in open(basepath + '/data/' + task + '/vocab.txt'):
                    tokens.append(line.strip())
                org_tokens = tokens[:]
                tokens.append('<s>')
                tokens.append('</s>')
                tokens.append('oov')
            word_freq = pickle.load(
                open(basepath + '/data/' + task + '/word_freq.p', "rb"))
        if c2w_mode:
            EMBEDDING_DIM = 200
        else:
            EMBEDDING_DIM = 1100
        if character_ngrams == 1:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/char_dict.p', "rb"))
        elif character_ngrams == 2 and character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/bigram_dict.p', "rb"))
        elif character_ngrams == 2 and not character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/bigram_dict_no_overlap.p',
                     "rb"))
        elif character_ngrams == 3 and character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/trigram_dict.p', "rb"))
        elif character_ngrams == 3 and not character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/trigram_dict_no_overlap.p',
                     "rb"))
        print('current task: ' + task + ', lm mode: ' + str(lm_mode) +
              ', c2w mode: ' + str(c2w_mode) + ', n = ' +
              str(character_ngrams) + ', overlap = ' +
              str(character_ngrams_overlap) + '.')
        saved_file = 'current task: ' + task + ', lm mode: ' + str(
            lm_mode) + ', c2w mode: ' + str(c2w_mode) + ', n = ' + str(
                character_ngrams) + ', overlap = ' + str(
                    character_ngrams_overlap) + '.txt'
    elif granularity == 'mix':
        tokens = []
        num_oov = 0
        num_inv = 0
        for line in open(basepath + '/data/' + task + '/vocab.txt'):
            tokens.append(line.strip())
        tokens.append('<s>')
        tokens.append('</s>')
        tokens.append('oov')
        # print(len(tokens))
        dict = {}
        #oov=[]
        if task == 'sts':
            EMBEDDING_DIM = 300
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/data/paragram/paragram_300_sl999/', 'paragram',
                EMBEDDING_DIM)
            #wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/VDPWI-NN-Torch/data/glove', 'glove.840B', EMBEDDING_DIM)
        else:
            EMBEDDING_DIM = 200
            wv_dict, wv_arr, wv_size = load_word_vectors(
                basepath + '/VDPWI-NN-Torch/data/glove', 'glove.twitter.27B',
                EMBEDDING_DIM)
        '''
		EMBEDDING_DIM = 300
		wv_dict, wv_arr, wv_size = load_word_vectors(basepath + '/data/paragram/paragram_300_sl999/', 'paragram', EMBEDDING_DIM)
		'''
        oov = []
        for word in tokens:
            '''
			if word in oov or word in inv:
				count+=1
				dict[word] = torch.Tensor([0 for i in range(EMBEDDING_DIM)])
			else:
				dict[word] = wv_arr[wv_dict[word]]
				num_inv+=1
			'''
            try:
                dict[word] = wv_arr[wv_dict[word]]
                num_inv += 1
            except:
                num_oov += 1
                oov.append(word)
                # print(word)
                dict[word] = torch.Tensor([
                    random.uniform(-0.05, 0.05) for i in range(EMBEDDING_DIM)
                ])
                #dict[word] = torch.Tensor([0 for i in range(EMBEDDING_DIM)])

        lm_mode = False
        combine_mode = 'g_0.75'  # 'concat', 'g_0.25', 'g_0.50', 'g_0.75', 'adaptive', 'attention', 'backoff'
        # c2w parameters
        c2w_mode = False
        character_ngrams = 1
        #character_ngrams_2 = 3
        character_ngrams_overlap = False
        if character_ngrams == 1:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/char_dict.p', "rb"))
        elif character_ngrams == 2 and character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/bigram_dict.p', "rb"))
        elif character_ngrams == 2 and not character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/bigram_dict_no_overlap.p',
                     "rb"))
        elif character_ngrams == 3 and character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/trigram_dict.p', "rb"))
        elif character_ngrams == 3 and not character_ngrams_overlap:
            dict_char_ngram = pickle.load(
                open(basepath + '/data/' + task + '/trigram_dict_no_overlap.p',
                     "rb"))
        '''
		if character_ngrams_2 == 1:
			dict_char_ngram_2 = pickle.load(open(basepath + '/data/' + task + '/char_dict.p', "rb"))
		elif character_ngrams_2 == 2 and character_ngrams_overlap:
			dict_char_ngram_2 = pickle.load(open(basepath + '/data/' + task + '/bigram_dict.p', "rb"))
		elif character_ngrams_2 == 2 and not character_ngrams_overlap:
			dict_char_ngram_2 = pickle.load(open(basepath + '/data/' + task + '/bigram_dict_no_overlap.p', "rb"))
		elif character_ngrams_2 == 3 and character_ngrams_overlap:
			dict_char_ngram_2 = pickle.load(open(basepath + '/data/' + task + '/trigram_dict.p', "rb"))
		elif character_ngrams_2 == 3 and not character_ngrams_overlap:
			dict_char_ngram_2 = pickle.load(open(basepath + '/data/' + task + '/trigram_dict_no_overlap.p', "rb"))
		'''
        word_freq = pickle.load(
            open(basepath + '/data/' + task + '/word_freq.p', "rb"))
        print('current task: ' + task + ', lm mode: ' + str(lm_mode) +
              ', combination mode: ' + combine_mode + ', c2w mode: ' +
              str(c2w_mode) + ', n = ' + str(character_ngrams) +
              ', overlap = ' + str(character_ngrams_overlap) + '.')
        print('finished loading word & char table, there are ' + str(num_inv) +
              ' INV words and ' + str(num_oov) + ' OOV words.')
    elif granularity == 'cross':
        oov = []
        dict_char = []
        tokens = []
        word_freq = []
        overlap = True
        if overlap:
            dict_ngram = pickle.load(
                open(basepath + '/data/' + task + '/cross_trigram_dict.p',
                     "rb"))
        else:
            dict_ngram = pickle.load(
                open(
                    basepath + '/data/' + task +
                    '/cross_trigram_dict_no_overlap.p', "rb"))
    else:
        print('wrong input for the second argument!')
        sys.exit()

    model = DeepPairWiseWord(EMBEDDING_DIM, HIDDEN_DIM, 1, task, granularity,
                             num_class, dict, fake_dict, dict_char_ngram, oov,
                             tokens, word_freq, feature_maps, kernels,
                             charcnn_embedding_size, max_word_length,
                             character_ngrams, c2w_mode,
                             character_ngrams_overlap, word_mode, combine_mode,
                             lm_mode)  #, corpus)
    #print(get_n_params(model))
    #sys.exit()
    #print(model.lm_train_data)
    #sys.exit()
    #premodel=DeepPairWiseWord(EMBEDDING_DIM,HIDDEN_DIM,1,task,granularity,num_class,dict,dict_char,oov)
    #premodel.load_state_dict(torch.load('model_char_only.pkl'))
    #premodel=torch.load('model_char_only.pkl')
    #model.embedding=premodel.embedding
    #model.lstm_c2w=premodel.lstm_c2w
    #model.df=premodel.df
    #model.db=premodel.db
    #model.bias=premodel.bias
    if torch.cuda.is_available():
        model = model.cuda()
    lsents, rsents, labels = trainset
    #print(len(lsents))
    #threshold=40000
    #lsents = lsents[:threshold]
    #rsents = rsents[:threshold]
    #labels = labels[:threshold]
    # Loss and Optimizer
    if task == 'sick' or task == 'sts' or task == 'snli':
        indices = torch.randperm(len(lsents))
        print('indices:')
        print(indices[:10])
        #for line in open('./data/sick/order.txt'):
        #	indices.append(int(line.strip()) - 1)
        criterion = nn.KLDivLoss()
        if torch.cuda.is_available():
            criterion = criterion.cuda()
    elif task == 'url' or task == 'pit' or task == 'hindi' or task == 'quora' or task == 'msrp' or task == 'wikiqa' or task == 'trecqa' or task == 'mnli':
        '''
		indices = torch.randperm(len(trainset[0]))
		with open('./data/'+task+'/order.txt','w') as f:
			for item in indices:
				f.writelines(str(item)+'\n')
		'''
        #indices = []
        #for line in open('./data/'+task+'/order.txt'):
        #	indices.append(int(line.strip()))
        indices = torch.randperm(len(lsents))
        #print('indices:')
        #print(indices[:10])
        criterion = nn.MultiMarginLoss(p=1,
                                       margin=1.0,
                                       weight=None,
                                       size_average=True)
        if torch.cuda.is_available():
            criterion = criterion.cuda()
    elif task == 'wiki':
        indices = torch.randperm(len(lsents))
        print('indices:')
        print(indices[:10])
        criterion = nn.MultiLabelSoftMarginLoss()
        if torch.cuda.is_available():
            criterion = criterion.cuda()
    optimizer = torch.optim.RMSprop(
        model.parameters(), lr=0.0001
    )  #, momentum=0.1, weight_decay=0.05)#,momentum=0.9,weight_decay=0.95)
    #optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    # Train the Model
    #print(oov)
    print('start training')
    #subprocess.call(['echo','start training'])
    gold = []
    gold_um = []
    if task == 'url':
        for line in open(basepath + '/data/' + task + '/test_9324/sim.txt'):
            gold.append(int(line.strip()))
    elif task == 'snli':
        for line in open(basepath + '/data/' + task + '/test/sim.txt'):
            gold.append(line.strip())
    elif task == 'trecqa':
        for line in open(basepath + '/data/' + task + '/raw-test/sim.txt'):
            gold.append(float(line.strip()))
    elif task == 'mnli':
        pass
        '''
		for line in open(basepath+'/data/' + task + '/dev_m/sim.txt'):
			gold.append(float(['neutral', 'entailment','contradiction'].index(line.strip())))
		for line in open(basepath+'/data/' + task + '/dev_um/sim.txt'):
			gold_um.append(float(['neutral', 'entailment','contradiction'].index(line.strip())))
		'''
    else:
        for line in open(basepath + '/data/' + task + '/test/sim.txt'):
            gold.append(float(line.strip()))
    max_result = -1
    max_result_um = -1
    batch_size = 32
    report_interval = 50000
    for epoch in range(num_epochs):
        print('--' * 20)
        model.train()
        optimizer.zero_grad()
        start_time = time.time()
        data_loss = 0
        indices = torch.randperm(len(lsents))
        train_correct = 0
        #print(len(indices))
        for index, i in enumerate(indices):
            #print(index)
            #start_time = time.time()
            if granularity == 'word':
                sentA = lsents[i]
                sentB = rsents[i]
                '''
				#print(lsents[i])
				try:
					sentA = torch.cat((dict[word].view(1, EMBEDDING_DIM) for word in lsents[i]), 0)
					sentA = Variable(sentA)#.cuda()
					#print(lsents[i])
					#print(sentA)
					#print(rsents[i])
					sentB = torch.cat((dict[word].view(1, EMBEDDING_DIM) for word in rsents[i]), 0)
					sentB = Variable(sentB)#.cuda()
				except:
					print(lsents[i])
					print(rsents[i])
					sys.exit()
				#print(rsents[i])
				#print(sentB)
				#sys.exit()
				if torch.cuda.is_available():
					sentA=sentA.cuda()
					sentB=sentB.cuda()
				sentA = torch.unsqueeze(sentA, 0).view(-1, 1, EMBEDDING_DIM)
				sentB = torch.unsqueeze(sentB, 0).view(-1, 1, EMBEDDING_DIM)
				# label=torch.unsqueeze(label,0)
				'''
            elif granularity == 'char' or granularity == 'mix' or granularity == 'cross':
                #sentA=[]
                #sentB=[]
                #for word in lsents[i]:
                #	sentA.append([dict[char] for char in word])
                #for word in rsents[i]:
                #	sentB.append([dict[char] for char in word])
                #print(i)
                sentA = lsents[i]
                sentB = rsents[i]
            if task == 'sick' or task == 'sts' or task == 'snli' or task == 'wiki':
                label = Variable(torch.Tensor(labels[i]))
            else:
                label = Variable(torch.LongTensor(labels[i]))  #.cuda()
            if torch.cuda.is_available():
                label = label.cuda()
            # Forward + Backward + Optimize
            #elapsed_time = time.time() - start_time
            #print('data preparation time: '+str(timedelta(seconds=elapsed_time)))
            #print(sentA)
            #print(sentB)
            #print(id[i])
            #print('*'*20)
            output, extra_loss = model(sentA, sentB, index)
            #tmp_output = np.exp(output.data[0].cpu().numpy())
            #print index, 'gold: ', labels[i][0], 'predict: ', np.argmax(tmp_output)
            #print(extra_loss)
            loss = criterion(output, label) + extra_loss
            loss.backward()
            data_loss += loss.data[0]
            output = np.exp(output.data[0].cpu().numpy())
            if labels[i][0] == np.argmax(output):
                train_correct += 1
            #print(loss-extra_loss)
            #print('*'*20)
            if (index + 1) % batch_size == 0:
                optimizer.step()
                optimizer.zero_grad()

            if (index + 1) % report_interval == 0:
                msg = '%d completed epochs, %d batches' % (epoch, index + 1)
                msg += '\t train batch loss: %f' % (data_loss / (index + 1))
                train_acc = train_correct / (index + 1)
                msg += '\t train accuracy: %f' % train_acc
                print(msg)

            if (index + 1) % (int(len(lsents) / 2)) == 0:
                #print ('Epoch [%d/%d], Iter [%d/%d] Loss: %.6f'
                #	   % (epoch + 1, num_epochs, index + 1, len(lsents) // 1, data_loss))#loss.data[0]))
                #subprocess.call(['echo','Epoch ',str(epoch+1),'Loss: ',str(data_loss)])
                #break
                #data_loss = 0
                #torch.save(model.state_dict(), 'model.pkl')
                #model.load_state_dict(torch.load('model_char_only.pkl'))

                if task == 'sick' or task == 'sts' or task == 'snli' or task == 'wiki':
                    model.eval()
                    test_lsents, test_rsents, test_labels = testset
                    predicted = []
                    tmp_result = 0
                    #gold=[]
                    #for line in open('./data/sick/test/sim.txt'):
                    #	gold.append(float(line.strip()))
                    for test_i in range(len(test_lsents)):
                        if granularity == 'word':
                            '''
							sentA = torch.cat((dict[word].view(1, EMBEDDING_DIM) for word in test_lsents[test_i]), 0)
							sentA = Variable(sentA)
							# print(sentA)
							sentB = torch.cat((dict[word].view(1, EMBEDDING_DIM) for word in test_rsents[test_i]), 0)
							sentB = Variable(sentB)
							if torch.cuda.is_available():
								sentA = sentA.cuda()
								sentB = sentB.cuda()
							#label = torch.unsqueeze(label, 0)
							sentA = torch.unsqueeze(sentA, 0).view(-1, 1, EMBEDDING_DIM)
							sentB = torch.unsqueeze(sentB, 0).view(-1, 1, EMBEDDING_DIM)
							'''
                            sentA = test_lsents[test_i]
                            sentB = test_rsents[test_i]
                        elif granularity == 'char' or granularity == 'mix':
                            sentA = test_lsents[test_i]
                            sentB = test_rsents[test_i]
                        raw_output, _ = model(sentA, sentB, index)
                        #print(output)
                        if task == 'sick':
                            output = raw_output
                            output = np.exp(output.data[0].cpu().numpy())
                            predicted.append(1 * output[0] + 2 * output[1] +
                                             3 * output[2] + 4 * output[3] +
                                             5 * output[4])
                        elif task == 'snli':
                            output = raw_output
                            output = np.exp(output.data[0].cpu().numpy())
                            output = [output[0], output[1], output[2]]
                            tmp_output = output.index(max(output))
                            predicted.append(tmp_output)
                            if test_labels[test_i].index(
                                    max(test_labels[test_i])) == tmp_output:
                                tmp_result += 1
                        elif task == 'wiki':
                            output = torch.sigmoid(raw_output).data > 0.5
                            output = output.cpu()
                            predicted = list(output.numpy()[0])
                            if predicted == test_labels[test_i]:
                                tmp_result += 1
                        else:
                            output = raw_output
                            output = np.exp(output.data[0].cpu().numpy())
                            predicted.append(0 * output[0] + 1 * output[1] +
                                             2 * output[2] + 3 * output[3] +
                                             4 * output[4] + 5 * output[5])
                    #print(predicted)
                    #print(gold)
                    if task == 'sick':
                        result = pearson(predicted, gold)
                        print('Test Correlation: %.6f' % result)
                        if result > max_result:
                            max_result = result
                    elif task == 'snli' or task == 'wiki':
                        result = tmp_result / len(test_lsents)
                        print('Test Accuracy: %.6f' % result)
                        if result > max_result:
                            max_result = result
                    else:
                        result1 = pearson(predicted[0:450], gold[0:450])
                        result2 = pearson(predicted[450:750], gold[450:750])
                        result3 = pearson(predicted[750:1500], gold[750:1500])
                        result4 = pearson(predicted[1500:2250],
                                          gold[1500:2250])
                        result5 = pearson(predicted[2250:3000],
                                          gold[2250:3000])
                        result6 = pearson(predicted[3000:3750],
                                          gold[3000:3750])
                        print(
                            'deft-forum: %.6f, deft-news: %.6f, headlines: %.6f, images: %.6f, OnWN: %.6f, tweet-news: %.6f'
                            % (result1, result2, result3, result4, result5,
                               result6))
                        wt_mean = 0.12 * result1 + 0.08 * result2 + 0.2 * result3 + 0.2 * result4 + 0.2 * result5 + 0.2 * result6
                        print('weighted mean: %.6f' % wt_mean)
                        if wt_mean > max_result:
                            max_result = wt_mean
                        if task == 'sts':
                            with open(basepath + '/data/sts/sts_PWIM_prob.txt',
                                      'w') as f:
                                for item in predicted:
                                    f.writelines(str(item) + '\n')
                        #else:
                        #	with open('SICK_with_paragram_result.txt', 'w') as f:
                        #		for item in predicted:
                        #			f.writelines(str(item)+'\n')
                else:
                    model.eval()
                    msg = '%d completed epochs, %d batches' % (epoch,
                                                               index + 1)
                    if task == 'mnli':
                        test_lsents, test_rsents, test_labels = devset_m
                    else:
                        test_lsents, test_rsents, test_labels = testset
                    predicted = []
                    correct = 0
                    #gold=gold[:3000]
                    #print(len(gold))
                    for test_i in range(len(test_lsents)):
                        # start_time = time.time()
                        if granularity == 'word':
                            sentA = test_lsents[test_i]
                            sentB = test_rsents[test_i]
                            '''
							sentA = torch.cat((dict[word].view(1, EMBEDDING_DIM) for word in test_lsents[test_i]), 0)
							sentA = Variable(sentA)#.cuda()
							# print(sentA)
							sentB = torch.cat((dict[word].view(1, EMBEDDING_DIM) for word in test_rsents[test_i]), 0)
							sentB = Variable(sentB)#.cuda()
							# print(sentB)
							if torch.cuda.is_available():
								sentA=sentA.cuda()
								sentB=sentB.cuda()
							sentA = torch.unsqueeze(sentA, 0).view(-1, 1, EMBEDDING_DIM)
							sentB = torch.unsqueeze(sentB, 0).view(-1, 1, EMBEDDING_DIM)
						# label=torch.unsqueeze(label,0)
							'''
                        elif granularity == 'char' or granularity == 'mix':
                            sentA = test_lsents[test_i]
                            sentB = test_rsents[test_i]
                        output, _ = model(sentA, sentB, index)
                        #print(output)
                        output = np.exp(output.data[0].cpu().numpy())
                        if test_labels[test_i][0] == np.argmax(output):
                            correct += 1
                        predicted.append(output[1])
                    #result=float(correct)/len(test_lsents)
                    #print('Test Accuracy: %.4f'% result)
                    #result_acc, result_f1=URL_maxF1_eval(predict_result=predicted,test_data_label=gold)
                    result = correct / len(test_lsents)
                    msg += '\t dev m accuracy: %f' % result
                    print(msg)
                    if result > max_result:
                        max_result = result
                        test_lsents, test_rsents, test_labels = testset_m
                        predicted = []
                        for test_i in range(len(test_lsents)):
                            # start_time = time.time()
                            if granularity == 'word':
                                sentA = test_lsents[test_i]
                                sentB = test_rsents[test_i]
                            output, _ = model(sentA, sentB, index)
                            output = np.exp(output.data[0].cpu().numpy())
                            predicted.append(np.argmax(output))
                        with open(basepath + '/sub_m.csv', 'w+') as f:
                            label_dict = [
                                'neutral', 'entailment', 'contradiction'
                            ]
                            f.write("pairID,gold_label\n")
                            for i, k in enumerate(predicted):
                                f.write(
                                    str(i + 9847) + "," + label_dict[k] + "\n")
                        #with open(basepath+'/PWIM_prob_result_'+task, 'w') as f:
                        #	for item in predicted:
                        #		f.writelines(str(item)+'\n')
                    if task == 'mnli':
                        msg = '%d completed epochs, %d batches' % (epoch,
                                                                   index + 1)
                        test_lsents, test_rsents, test_labels = devset_um
                        predicted = []
                        correct = 0
                        for test_i in range(len(test_lsents)):
                            # start_time = time.time()
                            if granularity == 'word':
                                sentA = test_lsents[test_i]
                                sentB = test_rsents[test_i]
                            output, _ = model(sentA, sentB, index)
                            # print(output)
                            output = np.exp(output.data[0].cpu().numpy())
                            if test_labels[test_i][0] == np.argmax(output):
                                correct += 1
                            predicted.append(output[1])
                        #result_acc, result_f1 = URL_maxF1_eval(predict_result=predicted, test_data_label=gold_um)
                        result_acc = correct / len(test_lsents)
                        msg += '\t dev um accuracy: %f' % result_acc
                        print(msg)
                        if result_acc > max_result_um:
                            max_result_um = result_acc
                            test_lsents, test_rsents, test_labels = testset_um
                            predicted = []
                            for test_i in range(len(test_lsents)):
                                # start_time = time.time()
                                if granularity == 'word':
                                    sentA = test_lsents[test_i]
                                    sentB = test_rsents[test_i]
                                output, _ = model(sentA, sentB, index)
                                output = np.exp(output.data[0].cpu().numpy())
                                predicted.append(np.argmax(output))
                            with open(basepath + '/sub_um.csv', 'w+') as f:
                                label_dict = [
                                    'neutral', 'entailment', 'contradiction'
                                ]
                                f.write("pairID,gold_label\n")
                                for i, k in enumerate(predicted):
                                    f.write(
                                        str(i) + "," + label_dict[k] + "\n")
                        #with open('current task: '+task+', lm mode: '+str(lm_mode)+', combination mode: '+combine_mode+', c2w mode: '+str(c2w_mode)+', n = '+str(character_ngrams)+', overlap = '+str(character_ngrams_overlap)+'.txt','w') as f:
                        #	for item in predicted:
                        #		f.writelines(str(item)+'\n')
                        #torch.save(model, 'model_URL_unigram_CNN.pkl')
                        #torch.save(model, 'model_word_inv_18k.pkl')
                        #torch.save(model, 'model_word_inv_3k.pkl')
                        #torch.save(model, 'model_char_only.pkl')
                        #torch.save(model, 'model_word_only_pit.pkl')
                        #torch.save(model, 'model_word_char_backoff.pkl')
                        #torch.save(model, 'model_word_char_g_0.5.pkl')
                        #torch.save(model, 'model_word_char_adaptive.pkl')
                        #torch.save(model, 'model_word_char_attention.pkl')
                        #with open('model_word_inv_0k_result.txt', 'w') as f:
                        #with open('sts_model_word_only_inv_17k_result.txt', 'w') as f:
                        #with open('model_word_inv_3k_result.txt', 'w') as f:
                        #with open('model_char_only_result.txt', 'w') as f:
                        #with open('model_word_only_result_pit.txt', 'w') as f:
                        #with open('model_word_char_g_0.5_result.txt', 'w') as f:
                        #with open('model_word_char_backoff_result.txt', 'w') as f:
                        #with open('model_word_char_adaptive.txt', 'w') as f:
                        #with open('model_word_char_attention_result.txt','w') as f:
                        #	for item in predicted:
                        #		f.writelines(str(item)+'\n')
                        '''
						h = Variable(torch.zeros(2, 1, model.embedding_dim))  # 2 for bidirection
						c = Variable(torch.zeros(2, 1, model.embedding_dim))
						if torch.cuda.is_available():
							h = h.cuda()
							c = c.cuda()
						subword_embedding={}
						for word in org_tokens:
							tmp_indices = model.generate_word_indices(word)
							if not model.c2w_mode:
								if len(tmp_indices) < 20:
									tmp_indices = tmp_indices + [0 for i in range(model.charcnn_max_word_length - len(tmp_indices))]
								else:
									tmp_indices = tmp_indices[0:20]
							if model.c2w_mode:
								output = model.c2w_cell([tmp_indices], h, c)
							else:
								output = model.charCNN_cell([tmp_indices])
							subword_embedding[word]=output.data[0].cpu().numpy()
						pickle.dump(subword_embedding, open('URL_subword_lm_embedding.p', "wb"))
						'''
                elapsed_time = time.time() - start_time
                print('Epoch ' + str(epoch + 1) + ' finished within ' +
                      str(timedelta(seconds=elapsed_time)) +
                      ', and current time:' + str(datetime.now()))
                print('Best result until now: %.6f' % max_result)
                print('Best um result until now: %.6f' % max_result_um)
                #subprocess.call(['echo','Epoch ' , str(epoch + 1) , ' finished within ' , str(timedelta(seconds=elapsed_time)),', and current time:', str(datetime.now())])
                #subprocess.call(['echo','Best result until now: ',str(max_result)])
                model.train()
    i_1 = torch.log(1 / (1 + torch.exp(-inputs[0, 1])))
    i_2 = torch.log(1 / (1 + torch.exp(-inputs[0, 2])))

    loss_h = (i_0 + i_1 + i_2) / -3

    print(loss_h)

# ---------------------------------------------- 14 Multi Margin Loss -----------------------------------------
flag = 0
# flag = 1
if flag:

    x = torch.tensor([[0.1, 0.2, 0.7], [0.2, 0.5, 0.3]])
    y = torch.tensor([1, 2], dtype=torch.long)

    loss_f = nn.MultiMarginLoss(reduction='none')

    loss = loss_f(x, y)

    print("Multi Margin Loss: ", loss)

# --------------------------------- compute by hand
flag = 0
# flag = 1
if flag:

    x = x[0]
    margin = 1

    i_0 = margin - (x[1] - x[0])
    # i_1 = margin - (x[1] - x[1])
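    # (the snippet is cut off here by the next example; a minimal sketch of how it
    #  presumably continues, following the MultiMarginLoss definition: the sum over
    #  i != y of max(0, margin - (x[y] - x[i])), divided by the number of classes)
    i_2 = margin - (x[1] - x[2])  # 1 - (0.2 - 0.7) = 1.5

    loss_0 = (torch.clamp(i_0, min=0) + torch.clamp(i_2, min=0)) / x.shape[0]

    print("Multi Margin Loss computed by hand: ", loss_0)  # tensor(0.8000), matches loss[0] above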
Example #12
            out_params.append(param)
            out_names.append(name)
        else:
            in_params.append(param)
            in_names.append(name)

    in_size, out_size = [x.size() for x in in_params], [x.size() for x in out_params]
    in_sum, out_sum = sum([np.prod(x) for x in in_size]), sum([np.prod(x) for x in out_size])

    print "IN    : {} params".format(in_sum)
    #print print_params(in_names, in_size)
    print "OUT   : {} params".format(out_sum)
    #print print_params(out_names, out_size)
    print "TOTAL : {} params".format(in_sum + out_sum)

    loss_fn = {'xent':nn.CrossEntropyLoss(), 'mse':nn.MSELoss(), 'mrl':nn.MarginRankingLoss(), 'mlml':nn.MultiLabelMarginLoss(), 'mml':nn.MultiMarginLoss()}
    tt = torch
    if not args.cpu:
        loss_fn = {k:v.cuda() for (k,v) in loss_fn.items()}
        tt = torch.cuda

    optimizer = torch.optim.Adam(in_params, lr=args.lr)

    out_data = {'train':{'x':[], 'y':[] }, \
                'valid':{'x':[], 'y':[] }, \
                'bleu':{'x':[], 'y':[] }, \
                'best_valid':{'x':[], 'y':[] } }

    best_epoch = -1
    best_bleu = {"valid":{0:0}, "test":{0:0}}
Example #13
    # Load data
    print("LOADING DATA...")
    embedding = create_embedding_dict(word_embedding_path)
    questions = create_question_dict(question_path, embedding, hidden_size)
    train_data = read_training_data(train_data_path)
    dev_data, dev_label_dict, dev_scores = read_eval_data(dev_data_path)
    test_data, test_label_dict, test_scores = read_eval_data(test_data_path)

    if DEBUG:
        train_data = train_data[:300]  # ONLY FOR DEBUGGING, REMOVE LINE TO RUN ON ALL TRAINING DATA

    # Create model
    rnn = RNN(n_features, hidden_size, n_layers, batch_size=22)
    optimizer = optim.Adam(rnn.parameters(), lr=learning_rate)
    criterion = nn.MultiMarginLoss(margin=0.2)

    # Training
    print(
        "Starting run with batch_size: %d, hidden size: %d, learning rate: %.4f"
        % (outer_batch_size, hidden_size, learning_rate))
    start = time.time()
    current_loss = 0

    for iter in range(1, n_epochs + 1):
        avg_loss = train(rnn,
                         criterion,
                         optimizer,
                         train_data,
                         questions,
                         hidden_size,
Example #14
    #Displays an example of the code to make sure that everything is running correctly.
    dataiter = iter(training_loader)
    images, labels = next(dataiter)
    images.size()

    #Displays a grid of the sample images to make sure that everything is running correctly.
    image_grid = torchvision.utils.make_grid(images, normalize=True)
    plt.imshow(np.transpose(image_grid.numpy(), (1, 2, 0)),
               interpolation='nearest')
    plt.show()

    #Instantiate the ConvolutionalNeuralNetwork()
    cnn = ConvolutionalNeuralNetwork()
    cnn.to(device=DEVICE)
    #Select the loss function
    loss_function = nn.MultiMarginLoss()
    #Select the learning rate
    learning_rate = 0.01
    #Select the optimizer
    optimizer = torch.optim.SGD(cnn.parameters(), lr=learning_rate)

    #Visualizes the network architecture.
    make_dot(cnn(images.to(device=DEVICE)),
             params=dict(cnn.named_parameters()))

    #iterations
    iter = 0
    epochs = 2000

    #There is a training and testing set. If you want to run the testing set, comment out the training set. If you want
    #to run the training set, comment out the testing set. If one of the two is not commented out, it will take much
Example #15
def train():
    data_transforms = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    train_data = PlantSeedlingDataset(root_dir='train',
                                      transform=data_transforms)
    data_loader = DataLoader(train_data, batch_size=32, shuffle=True)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = VGG11(num_classes=train_data.num_classes).to(device)
    model.train()

    criterion = nn.MultiMarginLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    train_loss, train_acc = [], []

    best_model_params = copy.deepcopy(model.state_dict())
    best_acc = 0

    num_epochs = 80

    for epoch in range(num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, num_epochs))
        running_train_loss = 0.0
        running_train_acc = 0
        for i, data in enumerate(data_loader):
            images, labels = data[0].to(device), data[1].to(device)

            optimizer.zero_grad()

            # forward
            outputs = model(images)
            _, preds = torch.max(outputs.data, 1)
            loss = criterion(outputs, labels)

            # backward
            loss.backward()
            optimizer.step()

            running_train_loss += loss.item() * images.size(0)
            running_train_acc += torch.sum(preds == labels)

        print('Training Loss: {:.4f}, Training Accuracy: {:.4f}'.format(
            running_train_loss / len(train_data),
            torch.true_divide(running_train_acc, len(train_data))))

        train_loss.append(running_train_loss / len(train_data))
        train_acc.append(torch.true_divide(running_train_acc, len(train_data)))

        if running_train_acc > best_acc:
            best_acc = running_train_acc
            best_model_params = copy.deepcopy(model.state_dict())

        model.load_state_dict(best_model_params)
        torch.save(model, 'VGG11_model_SVM.pth')

    plt.title("Loss Curve")
    plt.plot(range(num_epochs), train_loss, color='red', label="Training loss")
    plt.xlabel("Loss")
    plt.ylabel("Epochs")
    plt.savefig("loss_curve_SVM.png")
    plt.show()

    plt.title("Accuracy Curve")
    plt.plot(range(num_epochs),
             train_acc,
             color='red',
             label="Training Accuracy")
    plt.xlabel("Accuracy")
    plt.ylabel("Epochs")
    plt.savefig("accuracy_curve_SVM.png")
    plt.show()
        out['r42'] = F.relu(self.conv4_2(out['r41']))
        out['r43'] = F.relu(self.conv4_3(out['r42']))
        out['r44'] = F.relu(self.conv4_4(out['r43']))
        out['p4'] = self.pool4(out['r44'])
        out['r51'] = F.relu(self.conv5_1(out['p4']))
        out['r52'] = F.relu(self.conv5_2(out['r51']))
        out['r53'] = F.relu(self.conv5_3(out['r52']))
        out['r54'] = F.relu(self.conv5_4(out['r53']))
        out['p5'] = self.pool5(out['r54'])
        return [out[key] for key in out_keys]


img_neighbor = np.load('img_neighbors_pairs_train_test.npy')

celoss = nn.CrossEntropyLoss()
marginLoss = nn.MultiMarginLoss(margin=1)


# gram matrix and loss
class GramMatrix(nn.Module):
    def forward(self, input):
        b, c, h, w = input.size()
        F = input.view(b, c, h * w)
        G = torch.bmm(F, F.transpose(1, 2))
        G.div_(h * w)
        return G


class GramMSELoss(nn.Module):
    def forward(self, input, target):
        out = torch.log(nn.MSELoss()(GramMatrix()(input), target))
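        # (the example is cut off here; presumably the forward simply returns this value)
        return out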
Example #17
def transfer_classification(config):
    class_criterion = nn.MultiMarginLoss()
    loss_config = config["loss"]
    transfer_criterion = loss.loss_dict[loss_config["name"]]
    if "params" not in loss_config:
        loss_config["params"] = {}
    class_num = 6
    ## set base networks
    net_config = config["network"]
    base_network_s = network.network_dict[net_config["name_s"]]()
    base_network_t = network.network_dict[net_config["name_t"]]()
    P_network_s = network.network_dict[net_config["name_s"]]()
    P_network_t = network.network_dict[net_config["name_t"]]()
    base_space_reverse = network.network_dict["base_space"]()
    generator_mmd = network.network_dict["generator_mmd"]()
    discriminator_mmd = network.network_dict["MMD_discriminator"]()
    classifier_layer_t = nn.Sequential(
        nn.Linear(generator_mmd.output_num(), class_num), )
    classifier_layer_s = nn.Sequential(
        nn.Linear(generator_mmd.output_num(), class_num), )

    reconstruct_common = nn.Sequential(
        nn.Linear(base_space_reverse.output_num(),
                  base_space_reverse.output_num(),
                  bias=False), )

    reconstruct_s = nn.Sequential(
        nn.Linear(base_space_reverse.output_num(), source_dim, bias=False), )
    reconstruct_t = nn.Sequential(
        nn.Linear(base_space_reverse.output_num(), target_dim, bias=False), )

    use_gpu = torch.cuda.is_available()
    if use_gpu:
        classifier_layer_t = classifier_layer_t.cuda()
        classifier_layer_s = classifier_layer_s.cuda()
        base_space_reverse = base_space_reverse.cuda()
        discriminator_mmd = discriminator_mmd.cuda()
        generator_mmd = generator_mmd.cuda()
        base_network_t = base_network_t.cuda()
        base_network_s = base_network_s.cuda()
        P_network_s = P_network_s.cuda()
        P_network_t = P_network_t.cuda()
        reconstruct_s = reconstruct_s.cuda()
        reconstruct_t = reconstruct_t.cuda()
        reconstruct_common = reconstruct_common.cuda()

    ## collect parameters
    parameter_list = [{
        "params": classifier_layer_s.parameters(),
        "lr": 1
    }, {
        "params": classifier_layer_t.parameters(),
        "lr": 1
    }, {
        "params": base_network_s.parameters(),
        "lr": 1
    }, {
        "params": base_network_t.parameters(),
        "lr": 1
    }, {
        "params": base_space_reverse.parameters(),
        "lr": 1
    }, {
        "params": reconstruct_s.parameters(),
        "lr": 1
    }, {
        "params": reconstruct_t.parameters(),
        "lr": 1
    }, {
        "params": reconstruct_common.parameters(),
        "lr": 1
    }, {
        "params": P_network_s.parameters(),
        "lr": 1
    }, {
        "params": P_network_t.parameters(),
        "lr": 1
    }]
    parameter_mmd_list = [{"params": discriminator_mmd.parameters(), "lr": 1}]
    parameter_mmd_gen_list = [{"params": generator_mmd.parameters(), "lr": 1}]

    assert base_network_s.output_num() == base_network_t.output_num()

    ## set optimizer
    optimizer_config = config["optimizer"]
    optimizer = optim_dict[optimizer_config["type"]](
        parameter_list, **(optimizer_config["optim_params"]))
    optimizer_mmd = optim_dict[optimizer_config["type"]](
        parameter_mmd_list, **(optimizer_config["optim_params"]))
    optimizer_mmd_gen = optim_dict[optimizer_config["type"]](
        parameter_mmd_gen_list, **(optimizer_config["optim_params"]))
    param_lr = []
    for param_group in optimizer.param_groups:
        param_lr.append(param_group["lr"])
    param_lr_mmd = []
    for param_group in optimizer_mmd.param_groups:
        param_lr_mmd.append(param_group["lr"])
    param_lr_mmd_gen = []
    for param_group in optimizer_mmd_gen.param_groups:
        param_lr_mmd_gen.append(param_group["lr"])

    schedule_param = optimizer_config["lr_param"]
    lr_scheduler = lr_schedule.schedule_dict[optimizer_config["lr_type"]]

    ## train
    transfer_loss = classifier_loss_t = classifier_loss_s = 0
    acc_list = []

    for epoch in range(config["num_iterations"]):
        ## test in the train
        if epoch % config["test_interval"] == 0:
            classifier_layer_t.train(False)
            classifier_layer_s.train(False)
            base_space_reverse.train(False)
            discriminator_mmd.train(False)
            generator_mmd.train(False)
            base_network_t.train(False)
            base_network_s.train(False)
            P_network_s.train(False)
            P_network_t.train(False)
            reconstruct_s.train(False)
            reconstruct_t.train(False)
            reconstruct_common.train(False)
            # For visualization purposes
            #visual_Data_t(nn.Sequential(base_network_t, base_space_reverse,generator_mmd), gpu=use_gpu)
            #visual_Data_s(nn.Sequential(base_network_s, base_space_reverse,generator_mmd), gpu=use_gpu)

            acc, valid_acc, test_acc = text_classification_test(nn.Sequential(
                base_network_t, base_space_reverse, generator_mmd,
                classifier_layer_t),
                                                                gpu=use_gpu)
            with open(
                    'results/1_batch30_%s_10_0.001_0.003_%s.txt' %
                (source_lang, str(number)), 'a+') as f:
                f.write(str(acc))
                f.write('\n')
            acc_list.append([acc])
            print(acc)
        for i in range(config["n_batches"]):

            ## train one iter
            optimizer = lr_scheduler(param_lr, optimizer, i, **schedule_param)
            optimizer_mmd = lr_scheduler(param_lr_mmd, optimizer_mmd, i,
                                         **schedule_param)
            optimizer_mmd_gen = lr_scheduler(param_lr_mmd_gen,
                                             optimizer_mmd_gen, i,
                                             **schedule_param)
            optimizer.zero_grad()
            optimizer_mmd.zero_grad()
            optimizer_mmd_gen.zero_grad()

            target_len = target_y.size
            source_len = source_y.size
            n_batches = config["n_batches"]

            local_Xs, local_ys = source_X[
                i * n_batches:(i + 1) *
                n_batches, ], source_y[i * n_batches:(i + 1) * n_batches, ]
            local_Xt, local_yt = target_X[
                i * n_batches:(i + 1) *
                n_batches, ], target_y[i * n_batches:(i + 1) * n_batches, ]

            local_Xs, local_ys = unison_shuffled_copies(local_Xs, local_ys)
            local_Xt, local_yt = unison_shuffled_copies(local_Xt, local_yt)

            if len(local_Xt) < n_batches:
                needed = n_batches - len(local_Xt)
                list_loc_t = [1] * needed + [0] * (len(target_X) - needed)
                shuffle(list_loc_t)
                filters = [x == 1 for x in list_loc_t]

                new_needed_samples_t = target_X[filters]
                new_needed_labels_t = target_y[filters]
                local_Xt = np.concatenate((new_needed_samples_t, local_Xt),
                                          axis=0)
                local_yt = np.concatenate((new_needed_labels_t, local_yt),
                                          axis=0)

            if len(local_Xs) < n_batches:
                needed = n_batches - len(local_Xs)
                list_loc_s = [1] * needed + [0] * (len(source_X) - needed)
                shuffle(list_loc_s)
                filters = [x == 1 for x in list_loc_s]
                new_needed_samples_s = source_X[filters]
                new_needed_labels_s = source_y[filters]
                local_Xs = np.concatenate((new_needed_samples_s, local_Xs),
                                          axis=0)
                local_ys = np.concatenate((new_needed_labels_s, local_ys),
                                          axis=0)

            local_Xs = torch.tensor(local_Xs, dtype=torch.float)
            local_ys = torch.tensor(local_ys, dtype=torch.long)
            local_Xt = torch.tensor(local_Xt, dtype=torch.float)
            local_yt = torch.tensor(local_yt, dtype=torch.long)

            content_target_tensor = torch.tensor(target_X, dtype=torch.float)
            if use_gpu:
                inputs_source, labels_source, inputs_target, labels_target, content_target = Variable(
                    local_Xs).cuda(), Variable(local_ys).cuda(), Variable(
                        target_X_train).cuda(), Variable(target_y_train).cuda(
                        ), Variable(content_target_tensor).cuda()
            else:
                inputs_source, labels_source, inputs_target, labels_target, content_target = Variable(
                    local_Xs), Variable(local_ys), Variable(
                        target_X_train), Variable(target_y_train), Variable(
                            content_target_tensor)

            features_s = base_network_s(inputs_source)
            feature_s_basespace = base_space_reverse(features_s)
            aligned_features_s = generator_mmd(feature_s_basespace)
            outputs_s = classifier_layer_t(aligned_features_s)
            classifier_loss_s = class_criterion(
                outputs_s, labels_source.reshape(n_batches, ))

            features_t = base_network_t(inputs_target)
            feature_t_basespace = base_space_reverse(features_t)

            prejected_source = P_network_s(inputs_source)
            prejected_target = P_network_t(inputs_target)

            reconstructed_s = reconstruct_common(feature_s_basespace)
            reconstructed_t = reconstruct_common(feature_t_basespace)

            reconstruct_loss_s = rec_loss_cal(reconstructed_s,
                                              prejected_source)
            reconstruct_loss_t = rec_loss_cal(reconstructed_t,
                                              prejected_target)

            l1_regularization = torch.norm(
                feature_s_basespace, 1) + torch.norm(feature_t_basespace, 1)

            aligned_features_t = generator_mmd(feature_t_basespace)

            features_t_c = base_network_t(content_target)
            feature_t_c_basespace = base_space_reverse(features_t_c)
            aligned_features_c_t = generator_mmd(feature_t_c_basespace)

            outputs_t = classifier_layer_t(aligned_features_t)

            classifier_loss_t = class_criterion(
                outputs_t, labels_target.reshape(len(labels_target), ))

            feature_s_mmd = discriminator_mmd(aligned_features_s)
            feature_t_mmd = discriminator_mmd(aligned_features_c_t)
            transfer_loss = transfer_criterion(feature_s_mmd, feature_t_mmd,
                                               **loss_config["params"])

            for w_i in base_space_reverse.parameters():
                A_F_2 = torch.norm(torch.transpose(w_i, 0, 1), 2)

            for w_i in base_network_s.parameters():
                temp_value = (torch.norm(
                    (torch.mm(w_i, torch.transpose(w_i, 0, 1)) - eyeData), 2))
                B_s_reg = torch.mul(temp_value, temp_value)

            for w_i in base_network_t.parameters():
                temp_value = (torch.norm(
                    (torch.mm(w_i, torch.transpose(w_i, 0, 1)) - eyeData), 2))
                B_t_reg = torch.mul(temp_value, temp_value)

            for w_i in reconstruct_s.parameters():
                temp_value = (torch.norm(
                    (torch.mm(torch.transpose(w_i, 0, 1), w_i) - eyeData), 2))
                P_s_reg = torch.mul(temp_value, temp_value)
                P_s_F_2 = torch.norm(torch.transpose(w_i, 0, 1), 2)

            for w_i in reconstruct_t.parameters():
                temp_value = (torch.norm(
                    (torch.mm(torch.transpose(w_i, 0, 1), w_i) - eyeData), 2))
                P_t_reg = torch.mul(temp_value, temp_value)
                P_t_F_2 = torch.norm(torch.transpose(w_i, 0, 1), 2)

            for w_i in P_network_s.parameters():
                temp_value = (torch.norm(
                    (torch.mm(w_i, torch.transpose(w_i, 0, 1)) - eyeData), 2))
                Project_s_reg = torch.mul(temp_value, temp_value)

            for w_i in P_network_t.parameters():
                temp_value = (torch.norm(
                    (torch.mm(w_i, torch.transpose(w_i, 0, 1)) - eyeData), 2))
                Project_t_reg = torch.mul(temp_value, temp_value)
            for w_i in reconstruct_common.parameters():
                D_F_2 = torch.norm(torch.transpose(w_i, 0, 1), 2)

            classifier_layer_t.train(True)
            classifier_layer_s.train(True)
            base_space_reverse.train(True)
            discriminator_mmd.train(False)
            generator_mmd.train(True)
            base_network_t.train(True)
            base_network_s.train(True)
            reconstruct_s.train(True)
            reconstruct_t.train(True)
            reconstruct_common.train(True)

            coef = 1e-8
            total_loss = 10*classifier_loss_t+ classifier_loss_s+ coef*reconstruct_loss_t\
                +coef*reconstruct_loss_s+coef*A_F_2+coef*Project_s_reg+coef*Project_t_reg\
                +coef*D_F_2+coef*B_s_reg+coef*B_t_reg

            total_loss.backward(retain_graph=True)
            optimizer.step()

            base_space_reverse.apply(clipper)

            classifier_layer_t.train(False)
            classifier_layer_s.train(False)
            base_space_reverse.train(False)
            discriminator_mmd.train(True)
            generator_mmd.train(False)
            base_network_t.train(False)
            base_network_s.train(False)
            reconstruct_s.train(False)
            reconstruct_t.train(False)
            reconstruct_common.train(True)
            transfer_loss_reverse = -transfer_loss
            transfer_loss_reverse.backward(retain_graph=True)
            optimizer_mmd.step()

            classifier_layer_t.train(False)
            classifier_layer_s.train(False)
            base_space_reverse.train(False)
            discriminator_mmd.train(False)
            generator_mmd.train(True)
            base_network_t.train(False)
            base_network_s.train(False)
            reconstruct_s.train(False)
            reconstruct_t.train(False)
            reconstruct_common.train(False)
            transfer_loss_ = transfer_loss
            transfer_loss_.backward()
            optimizer_mmd_gen.step()
Example #18
def run_epoch(data, is_training, encoder_model_optimizer,
              domain_model_optimizer, args):
    '''
    Train model for one pass of the train data, and return loss, accuracy
    '''
    encoder_model, encoder_optimizer = encoder_model_optimizer
    domain_model, domain_optimizer = domain_model_optimizer

    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=args.num_workers,
                                              drop_last=True)

    losses = []

    if is_training:
        encoder_model.train()
        domain_model.train()
    else:
        encoder_model.eval()

    #nll_loss = nn.NLLLoss()

    #y_true = []
    #y_scores = []

    auc_met = meter.AUCMeter()

    for batch in tqdm(data_loader):

        cosine_similarity = nn.CosineSimilarity(dim=0, eps=1e-6)
        criterion = nn.MultiMarginLoss(margin=0.3)
        #pdb.set_trace()

        if is_training:
            encoder_optimizer.zero_grad()
            domain_optimizer.zero_grad()

        ###source question encoder####
        if is_training:
            samples = batch['samples']
        else:
            samples = batch

        #output - batch of samples, where every sample is 2d tensor of avg hidden states
        hidden_rep = runEncoderOnQuestions(samples, encoder_model, args)

        #Calculate cosine similarities here and construct X_scores
        #expected datastructure of hidden_rep = batchsize x number_of_q x hidden_size
        cs_tensor = autograd.Variable(
            torch.FloatTensor(hidden_rep.size(0),
                              hidden_rep.size(1) - 1))

        if args.cuda:
            cs_tensor = cs_tensor.cuda()

        #calculate cosine similarity for every query vs. neg q pair
        for j in range(1, hidden_rep.size(1)):
            for i in range(hidden_rep.size(0)):
                cs_tensor[i,
                          j - 1] = cosine_similarity(hidden_rep[i, 0, ],
                                                     hidden_rep[i, j, ])
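        # Note: an equivalent vectorized form (a sketch, not part of the original code) would
        # compare the query hidden_rep[:, 0, :] against all candidates in a single call:
        #   cs = F.cosine_similarity(hidden_rep[:, :1, :].expand_as(hidden_rep[:, 1:, :]),
        #                            hidden_rep[:, 1:, :], dim=2)  # (batch, n_candidates)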

        X_scores = torch.stack(cs_tensor, 0)
        y_targets = autograd.Variable(
            torch.zeros(hidden_rep.size(0)).type(torch.LongTensor))

        if args.cuda:
            y_targets = y_targets.cuda()

        if is_training:
            #####domain classifier#####
            cross_d_questions = batch['question']
            avg_hidden_rep = runEncoderOnQuestions(cross_d_questions,
                                                   encoder_model, args)

            predicted_domains = domain_model(avg_hidden_rep)

            true_domains = autograd.Variable(
                cross_d_questions['domain']).squeeze(1)

            if args.cuda:
                true_domains = true_domains.cuda()

            domain_classifier_loss = F.nll_loss(predicted_domains,
                                                true_domains)
            print "Domain loss in batch", domain_classifier_loss.data

            #calculate loss
            encoder_loss = criterion(X_scores, y_targets)
            print "Encoder loss in batch", encoder_loss.data
            '''
            if encoder_loss.cpu().data.numpy().item() == 0:
                new_lambda = -new_lambda
            else:
                new_lambda = args.lambda_d * 10**(int(math.log10(encoder_loss.cpu().data.numpy().item())) - int(math.log10(domain_classifier_loss.cpu().data.numpy().item())))
            print "new lambda is ", new_lambda
            '''

            task_loss = encoder_loss - args.lambda_d * domain_classifier_loss
            print "Task loss in batch", task_loss.data
            print "\n\n"

            task_loss.backward()
            encoder_optimizer.step()
            domain_optimizer.step()

            losses.append(task_loss.cpu().data[0])

        else:

            for i in range(args.batch_size):

                for j in range(20):
                    y_true = 0
                    if j == 0:
                        y_true = 1

                    x = cs_tensor[i, j].data

                    if args.cuda:
                        x = x.cpu().numpy()
                    else:
                        x = x.numpy()

                    auc_met.add(x, y_true)

    # Calculate epoch level scores
    if is_training:
        avg_loss = np.mean(losses)
        print('Average Train loss: {:.6f}'.format(avg_loss))
        print()
    else:
        print "AUC:", auc_met.value(0.05)
    def train(self):
        load_start_time = time.time()
        # get the ubuntu data (labeled ub)
        ub_questions = self.ub_preprocessor.get_question_dict()
        ub_candidate_ids = self.ub_preprocessor.get_candidate_ids()

        ub_ids_batches = self.ub_preprocessor.split_into_batches(
            ub_candidate_ids.keys(), params.batch_size)

        if params.use_dom_ad:
            # get the android data (labeled an)
            an_questions = self.an_preprocessor.get_question_dict()
            an_id_pairs = self.an_preprocessor.get_all_id_pairs()

            # batch the ids
            an_ids_batches = self.an_preprocessor.split_into_batches(
                an_id_pairs)

            # discriminator labels (ubuntu --> 1, android --> 0)
            # when forwarding through the discriminator, the first half is ubuntu, the second half is android
            total_questions_per_batch = self.get_total_questions_per_batch()
            discr_targets = torch.cat([
                torch.ones(total_questions_per_batch // 2),
                torch.zeros(total_questions_per_batch // 2)
            ])
            discr_targets = Variable(torch.FloatTensor(discr_targets),
                                     requires_grad=False)

            # loss for discriminator
            bcel = nn.BCELoss()

            # 2 different optimizers
            optimizer2 = optim.Adam([{
                'params': self.encoder_net.parameters(),
                'lr': params.lambda_reg * params.neg_lr
            }, {
                'params': self.discr_net.parameters()
            }],
                                    lr=params.forward_lr)

        # loss for classifier
        mml = nn.MultiMarginLoss(margin=params.margin)

        # 2 different optimizers
        optimizer1 = optim.Adam(params=self.encoder_net.parameters(),
                                lr=params.forward_lr)

        load_total_minutes = (time.time() - load_start_time) / 60.0
        print('load_total_time = %f' % (load_total_minutes))
        # start looping through batches
        last_time = time.time()
        start_time = time.time()
        total_time = 0.0

        if params.use_dom_ad:
            n_batches = min(len(ub_ids_batches), len(an_ids_batches))
        else:
            n_batches = len(ub_ids_batches)

        for i_batch in range(n_batches):
            ub_ids_batch = ub_ids_batches[i_batch]
            if params.use_dom_ad: an_ids_batch = an_ids_batches[i_batch]

            # get the input sequences
            ub_title_seqs, ub_body_seqs = self.get_ub_title_and_body_seqs(
                ub_questions, ub_candidate_ids, ub_ids_batch)
            if params.use_dom_ad:
                an_title_seqs, an_body_seqs = self.get_an_title_and_body_seqs(
                    an_questions, an_ids_batch)

            # get all the word embedding vectors
            ub_x_titles = [
                self.ub_preprocessor.sequence_to_vec(seq)
                for seq in ub_title_seqs
            ]
            ub_x_bodies = [
                self.ub_preprocessor.sequence_to_vec(seq)
                for seq in ub_body_seqs
            ]
            if params.use_dom_ad:
                an_x_titles = [
                    self.an_preprocessor.sequence_to_vec(seq)
                    for seq in an_title_seqs
                ]
                an_x_bodies = [
                    self.an_preprocessor.sequence_to_vec(seq)
                    for seq in an_body_seqs
                ]

            # get the lengths of all the sequences
            ub_lens_titles = [
                self.ub_preprocessor.get_seq_len(seq) for seq in ub_title_seqs
            ]
            ub_lens_bodies = [
                self.ub_preprocessor.get_seq_len(seq) for seq in ub_body_seqs
            ]
            if params.use_dom_ad:
                an_lens_titles = [
                    self.an_preprocessor.get_seq_len(seq)
                    for seq in an_title_seqs
                ]
                an_lens_bodies = [
                    self.an_preprocessor.get_seq_len(seq)
                    for seq in an_body_seqs
                ]

            # run the ubuntu data forward through the cnn model
            ub_output_titles = self.run_through_encoder(
                ub_x_titles, ub_lens_titles)
            ub_output_bodies = self.run_through_encoder(
                ub_x_bodies, ub_lens_bodies)
            if params.use_dom_ad:
                # run the android data forward through the cnn model
                an_output_titles = self.run_through_encoder(
                    an_x_titles, an_lens_titles)
                an_output_bodies = self.run_through_encoder(
                    an_x_bodies, an_lens_bodies)

            # average the representations
            ub_out_avg = (ub_output_titles + ub_output_bodies).div(2)
            if params.use_dom_ad:
                an_out_avg = (an_output_titles + an_output_bodies).div(2)

            # now we have the internal feature representations
            # these features will go to the classifier (just cosine similarity and loss1)
            # and the features will go through the discriminator network (ending with loss2)

            # do the classification and loss1 for just the ubuntu data
            ub_train_instances = torch.chunk(ub_out_avg, len(ub_ids_batch))
            ub_cos_scores, ub_targets = self.get_cosine_scores_target_data(
                ub_train_instances)

            loss1 = mml(ub_cos_scores, ub_targets)

            # do discrimination and loss2 for both ubuntu and android

            if params.use_dom_ad:
                # concatenate both ub and an
                both_out_avg = torch.cat([ub_out_avg, an_out_avg])

                # run through discriminator
                out_discr = self.run_through_discr(both_out_avg)

                # calculate loss2
                # print(out_discr.size())
                # print(discr_targets.size())
                loss2 = bcel(out_discr, discr_targets)

                # create the total loss
                total_loss = loss1 - params.lambda_reg * loss2

            # now back propagate both optimizers
            optimizer1.zero_grad()
            if params.use_dom_ad:
                optimizer2.zero_grad()

                total_loss.backward()

            optimizer1.step()
            if params.use_dom_ad: optimizer2.step()

            mod_size = 100.0

            i_batch_print = i_batch + 1
            if (i_batch_print % mod_size) == 0:
                print(
                    '---------------------------------------------|------------------|'
                )
                if params.use_dom_ad:
                    print(
                        'batch %d out of %d . . . loss per batch  =|  %s  |' %
                        (i_batch_print, n_batches, list(total_loss.data)[0]))
                else:
                    print(
                        'batch %d out of %d . . . loss per batch  =|  %s  |' %
                        (i_batch_print, n_batches, list(loss1.data)[0]))
                print(
                    '---------------------------------------------|------------------|'
                )
                print('loss1 = %f' % (list(loss1.data)[0]))
                if params.use_dom_ad:
                    print('loss2 = %f' % (list(loss2.data)[0]))

                total_time = time.time() - start_time
                print('training for %f minutes so far' % (total_time / 60.0))
                pred_time = (total_time / i_batch_print) * n_batches / 60.0
                print('training on track to take %f minutes' % (pred_time))

                last_time = time.time()

        torch.save(self.encoder_net, params.save_encoder_path)
        if params.use_dom_ad:
            torch.save(self.discr_net, params.save_discr_path)
        if params.use_dom_ad:
            print('models saved at %s and %s' %
                  (params.save_encoder_path, params.save_discr_path))
        else:
            print('model saved at %s' % (params.save_encoder_path))
            print('no discriminator used because params.use_dom_ad == False')
            print('this means that the android dataset was not used here')
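The helpers referenced above (run_through_encoder, get_cosine_scores_target_data, the mml and bcel criteria, the optimizers) are defined outside this snippet. A rough, self-contained sketch of the core pattern (MultiMarginLoss applied to cosine-similarity scores so that the positive candidate outranks the negatives by a margin); every shape and name below is an assumption, not the original code:

import torch
import torch.nn as nn
import torch.nn.functional as F

# Sketch only: one positive + 20 negative candidates per query, ranked with a
# multi-class hinge loss over cosine similarities (shapes are assumptions).
mml = nn.MultiMarginLoss(margin=0.2)

queries = torch.randn(8, 128, requires_grad=True)         # encoded questions
candidates = torch.randn(8, 21, 128, requires_grad=True)  # index 0 = positive

cos_scores = F.cosine_similarity(
    queries.unsqueeze(1).expand_as(candidates), candidates, dim=2)  # (8, 21)
targets = torch.zeros(8, dtype=torch.long)  # the positive sits at index 0

loss1 = mml(cos_scores, targets)
loss1.backward()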
Example #20
 def __init__(self, weight):
     super(CustomCombinedLoss, self).__init__()
     self._weight = weight
     self._criterion_choice = nn.MultiMarginLoss(size_average=False, margin=0.5)
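The fragment above only shows the constructor of CustomCombinedLoss; the forward pass is not part of the example. A minimal hypothetical completion, assuming the stored weight simply scales the summed (size_average=False) margin loss:

import torch
import torch.nn as nn

class CustomCombinedLoss(nn.Module):
    def __init__(self, weight):
        super(CustomCombinedLoss, self).__init__()
        self._weight = weight
        # size_average=False is the legacy spelling of reduction='sum'
        self._criterion_choice = nn.MultiMarginLoss(size_average=False, margin=0.5)

    def forward(self, logits, target):
        # Hypothetical forward: scale the summed margin loss by the weight.
        return self._weight * self._criterion_choice(logits, target)

# Example call with dummy logits for a 5-class problem.
criterion = CustomCombinedLoss(weight=0.5)
print(criterion(torch.randn(4, 5), torch.tensor([0, 3, 1, 4])))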
Example #21
 def train_(self, test_desc, group_name=None):
     """ Training procedure
     """
     p = progressbar.ProgressBar()
     random.seed(SEED)
     loss_func_tran = nn.MultiMarginLoss()
     loss_func_nucl = nn.MultiMarginLoss()
     loss_func_rel = nn.CrossEntropyLoss()
     constraint_loss = nn.CrossEntropyLoss()
     optimizer = optim.Adam(self.model.parameters(),
                            lr=LEARNING_RATE_spinn,
                            weight_decay=l2_penalty)
     iter_count = batch_count = 0
     loss_tran = loss_nucl = loss_rel = loss_nr = 0.
     p.start(self.skip_steps)
     pro_idx = 1
     for epoch in range(EPOCH_ALL):
         random.shuffle(self.train_trees)
         for tree in self.train_trees:
             self.model.train()
             iter_count += 1
             session_gold = self.session(tree)
             tran_scores, nucl_scores, rel_scores, nr_scores = None, None, None, None
             tran_labels, nucl_labels, rel_labels, nr_labels = [], [], [], []
             for transition_rel in self.oracle(tree):
                 tran, nucl, rel = self.tran_rel_parser(
                     transition_rel)  # parse the transition/nuclearity/relation label
                 # 1. nucl and rel: scores and labels
                 if rel is not None:
                     if TRAIN_NR:
                         tmp_nr_score = self.model.score_nr(session_gold)
                         nr_scores = self.concat_torch(
                             nr_scores, tmp_nr_score)
                         nr_labels.append(nr2ids[nucl + "-" + rel])
                         if TRAIN_NUCL_CONSTRAINT:
                             tmp_nucl_score = self.model.score_nucl(
                                 session_gold)
                             nucl_scores = self.concat_torch(
                                 nucl_scores, tmp_nucl_score)
                             nucl_labels.append(nucl2ids[nucl])
                     else:
                         tmp_nucl_score = self.model.score_nucl(
                             session_gold)
                         nucl_scores = self.concat_torch(
                             nucl_scores, tmp_nucl_score)
                         nucl_labels.append(nucl2ids[nucl])
                         tmp_rel_score = self.model.score_rel(session_gold)
                         rel_scores = self.concat_torch(
                             rel_scores, tmp_rel_score)
                         rel_labels.append(coarse2ids[rel])
                 # 2. tran: scores and labels
                 tmp_tran_score = self.model.score_tran(session_gold)
                 tran_labels.append(action2ids[tran])
                 tran_scores = self.concat_torch(tran_scores,
                                                 tmp_tran_score)
                 session_gold, angle_prop_all = self.model(
                     session_gold, transition_rel)
             loss_tran += loss_func_tran(tran_scores,
                                         torch.Tensor(tran_labels).long())
             if TRAIN_NR:
                 loss_nr = loss_nr + constraint_loss(
                     nr_scores,
                     torch.Tensor(nr_labels).long())
                 if TRAIN_NUCL_CONSTRAINT:
                     loss_nucl += loss_func_nucl(
                         nucl_scores,
                         torch.Tensor(nucl_labels).long())
             else:
                 loss_nucl += loss_func_nucl(
                     nucl_scores,
                     torch.Tensor(nucl_labels).long())
                 loss_rel = loss_rel + loss_func_rel(
                     rel_scores,
                     torch.Tensor(rel_labels).long())
             # batch learn
             if iter_count % BATCH_SIZE_spinn == 0 and iter_count > 0:
                 p.update(pro_idx)
                 pro_idx += 1
                 batch_count += 1
                 optimizer.zero_grad()
                 loss_tran.backward(retain_graph=True)
                 optimizer.step()
                 if TRAIN_NR:
                     if TRAIN_NUCL_CONSTRAINT:
                         loss_nr = loss_nr + CONSTRAINT_LAMBDA * loss_nucl
                     optimizer.zero_grad()
                     loss_nr.backward()
                     optimizer.step()
                 else:
                     optimizer.zero_grad()
                     loss_nucl.backward(retain_graph=True)
                     optimizer.step()
                     optimizer.zero_grad()
                     loss_rel.backward()
                     optimizer.step()
                 loss_tran, loss_nucl, loss_rel, loss_nr = 0., 0., 0., 0.
                 if batch_count % self.skip_steps == 0:
                     p.finish()
                     better = self.evaluate(
                         trees_eval_path=self.dev_trees_path,
                         type_="dev",
                         save_per=True)
                     if better:
                         self.evaluate(trees_eval_path=self.test_trees_path,
                                       type_="test",
                                       save_per=False)
                     self.report(epoch, iter_count, test_desc, group_name)
                     if batch_count > self.skip_boundary and self.skip_steps > SKIP_STEP_min:
                         self.skip_steps -= SKIP_REDUCE_UNIT
                         self.skip_boundary += SKIP_BOUNDARY
                     p.start(self.skip_steps)
                     pro_idx = 1
Example #22
def main(args):

    data_loader, dataset = get_loaderTrain(args.data_dir,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           num_workers=args.num_workers,
                                           drop_last=False,
                                           args=args)
    data_loaderValid, datasetValid = get_loaderValid(
        args.data_dir,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        drop_last=False,
        args=args)

    data_size = dataset.get_data_size()
    num_classes = dataset.get_num_classes()
    instance_size = dataset.get_instance_size()

    # Build the model
    model = fc_model(input_size=instance_size,
                     num_classes=num_classes,
                     dropout=args.dropout)

    # create optimizer
    params = list(model.parameters())
    optimizer = torch.optim.Adam(params,
                                 betas=(0.9, 0.98),
                                 eps=1e-9,
                                 lr=args.learning_rate)

    # multi-class hinge loss
    label_crit = nn.MultiMarginLoss(reduce=True)

    model = model.to(device)
    model.train()

    print("model created & starting training ...\n\n")
    # Training script
    for epoch in range(args.num_epochs):

        total_correct_preds = 0.0
        total = 1e-10
        loss = 0.0

        # step loop
        for step, (image_input, class_idxs) in enumerate(data_loader):

            #print("The size of tensor: ",(image_input.size()))

            # move all data loaded from dataloader to gpu
            class_idxs = class_idxs.to(device)
            image_input = image_input.to(device)

            # feed-forward data in the model
            output = model(image_input)  # 32 * 150528 --> 32 * 11

            # compute losses
            state_loss = label_crit(output, class_idxs)  # --> 32 * 1

            # aggregate loss for logging
            loss += state_loss.item()

            # back-propagate the loss in the model & optimize
            model.zero_grad()
            state_loss.backward()
            optimizer.step()

            # accuracy computation
            _, pred_idx = torch.max(output, dim=1)
            total_correct_preds += torch.sum(pred_idx == class_idxs).item()
            total += output.size(0)

        # epoch accuracy & loss
        accuracy = round(total_correct_preds / total, 2)
        loss = round(loss / total, 2)

        # you can save the model here at specific epochs (ckpt) to load and evaluate the model on the val set

        print('\repoch {}: accuracy: {}, loss: {}'.format(
            epoch, accuracy, loss),
              end="")
        x = validate(model, data_loaderValid, datasetValid)
        save_model(model, epoch, optimizer, loss, x)
    print()
Example #23
 def forward(self, logits, target):
     criterion = nn.MultiMarginLoss(p=2, margin=0,
                                    weight=self.weight, size_average=False)
     return criterion(logits, target)
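On its own, the criterion above is a per-class-weighted squared hinge (p=2) with a zero margin, summed over the batch. A small standalone check with made-up logits, targets, and weights:

import torch
import torch.nn as nn

# Standalone illustration of the criterion above; the weight vector and the
# logits below are made-up values, not taken from the original example.
weight = torch.tensor([1.0, 2.0, 1.0])
criterion = nn.MultiMarginLoss(p=2, margin=0, weight=weight, size_average=False)

logits = torch.randn(4, 3, requires_grad=True)  # (batch, num_classes)
target = torch.tensor([0, 2, 1, 1])
loss = criterion(logits, target)  # summed, not averaged, over the batch
loss.backward()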
Example #24
    ['glu', nn.GLU()],
])

loss = nn.ModuleDict([
    ['l1', nn.L1Loss()],
    ['nll', nn.NLLLoss()],
    ['kldiv', nn.KLDivLoss()],
    ['mse', nn.MSELoss()],
    ['bce', nn.BCELoss()],
    ['bce_with_logits', nn.BCEWithLogitsLoss()],
    ['cosine_embedding', nn.CosineEmbeddingLoss()],
    ['ctc', nn.CTCLoss()],
    ['hinge_embedding', nn.HingeEmbeddingLoss()],
    ['margin_ranking', nn.MarginRankingLoss()],
    ['multi_label_margin', nn.MultiLabelMarginLoss()],
    ['multi_label_soft_margin', nn.MultiLabelSoftMarginLoss()],
    ['multi_margin', nn.MultiMarginLoss()],
    ['smooth_l1', nn.SmoothL1Loss()],
    ['soft_margin', nn.SoftMarginLoss()],
    ['cross_entropy', nn.CrossEntropyLoss()],
    ['triplet_margin', nn.TripletMarginLoss()],
    ['poisson_nll', nn.PoissonNLLLoss()]
])
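A criterion can then be fetched from the ModuleDict by name and called like any other nn.Module; a short usage sketch with arbitrary shapes:

# Usage sketch for the dictionary above: fetch a criterion by key and call it.
import torch

logits = torch.randn(8, 5)              # (batch, num_classes), arbitrary sizes
target = torch.randint(0, 5, (8,))      # integer class labels

criterion = loss['multi_margin']        # an nn.MultiMarginLoss() instance
print(criterion(logits, target))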

optimizer = dict({
    'adadelta': optim.Adadelta,
    'adagrad': optim.Adagrad,
    'adam': optim.Adam,
    'sparse_adam': optim.SparseAdam,
    'adamax': optim.Adamax,
    'asgd': optim.ASGD,
    'lbfgs': optim.LBFGS,
    'rmsprop': optim.RMSprop,
Example #25
 def __init__(self):
     super(loss, self).__init__()
     self.lossrating = tnn.CrossEntropyLoss()
     self.losscategory = tnn.MultiMarginLoss()
Example #26
File: SimpleQA.py Project: o7s8r6/GCNEP
    def __init__(self, args):
        super(SimpleQA, self).__init__()

        if args.word_pretrained is None:
            self.word_embedding = nn.Embedding(args.n_words, args.word_dim,
                                               args.padding_idx)
        else:
            self.word_embedding = nn.Embedding.from_pretrained(
                args.word_pretrained, freeze=args.freeze)

        if args.use_gcn:
            self.gcns = nn.ModuleList()
            for g in args.relation_graphs:
                gcn = RGCN(g,
                           args.n_relations,
                           args.sub_relation_dim,
                           args.sub_relation_dim,
                           args.relation_pretrained,
                           args.num_hidden_layers,
                           args.rgcn_dropout,
                           args.norm_type,
                           use_cuda=True)
                self.gcns.append(gcn)
        else:
            if args.relation_pretrained is None:
                self.relation_embedding = nn.Embedding(args.n_relations,
                                                       args.relation_dim,
                                                       args.padding_idx)
            else:
                self.relation_embedding = nn.Embedding.from_pretrained(
                    args.relation_pretrained, freeze=False)

        self.word_encoder = LSTMEncoder(input_size=args.word_dim,
                                        hidden_size=args.hidden_dim,
                                        num_layers=1,
                                        dropout=0.0,
                                        batch_first=True,
                                        bidirectional=True)

        self.question_encoder = LSTMEncoder(input_size=2 * args.hidden_dim,
                                            hidden_size=args.hidden_dim,
                                            num_layers=1,
                                            dropout=0.0,
                                            batch_first=True,
                                            bidirectional=True)

        self.gate = GateNetwork(2 * args.hidden_dim)

        self.loss_fn = nn.MultiMarginLoss(margin=args.margin)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=args.lr)

        self.ns = args.ns
        self.score_function = nn.CosineSimilarity(dim=2)

        self.all_relation_words = args.all_relation_words

        self.n_relations = args.n_relations
        self.args = args

        global global_step
        global_step = 0
Example #27
def loss_fn(weight):
    criterion = nn.MultiMarginLoss(weight=weight)
    return criterion
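The factory above simply forwards a per-class weight vector; its length has to match the number of classes in the logits. A tiny usage sketch with made-up sizes, assuming the loss_fn definition above (and its nn import) is in scope:

import torch

# Made-up 3-class problem; the weight length must equal the number of classes.
criterion = loss_fn(torch.tensor([1.0, 0.5, 2.0]))
print(criterion(torch.randn(6, 3), torch.randint(0, 3, (6,))))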
Example #28
def train_model(use_lstm=True):
    if use_lstm:
        print_and_write("Training the LSTM model with the GPU:"
                        if USE_GPU else "Training the LSTM model:")
    else:
        print_and_write("Training the CNN model with the GPU:"
                        if USE_GPU else "Training the CNN model")

    get_id_to_text()
    embeddings = get_word_embeddings()
    model = LSTMQA(embeddings) if use_lstm else CNNQA(embeddings)
    if USE_GPU:
        model.cuda(GPU_NUM)
    loss_function = nn.MultiMarginLoss(
        margin=0.2)  # TODO: what about size_average?
    optimizer = optim.Adam(filter(lambda x: x.requires_grad,
                                  model.parameters()),
                           lr=LEARNING_RATE,
                           weight_decay=WEIGHT_DECAY)

    orig_time = time()

    for epoch in range(NUM_EPOCHS):
        samples = get_training_data(
        )  # recalculate this every epoch to get new random selections
        num_samples = len(samples)

        num_batches = int(math.ceil(1. * num_samples / BATCH_SIZE))
        total_loss = 0  # used for debugging
        for i in range(num_batches):
            # Get the samples ready
            batch = samples[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
            # If this is the last batch, then need to pad the batch to get the same shape as expected
            if i == num_batches - 1 and num_samples % BATCH_SIZE != 0:
                batch = np.concatenate(
                    (batch,
                     np.full(((i + 1) * BATCH_SIZE - num_samples, 22), "0")),
                    axis=0)

            # Convert from numpy arrays to tensors
            title_tensor, title_lengths = get_tensor_from_batch(batch,
                                                                use_title=True)
            body_tensor, body_lengths = get_tensor_from_batch(batch,
                                                              use_title=False)

            # Reset the model
            optimizer.zero_grad()

            # Run our forward pass and get the entire sequence of hidden states
            model.hidden = model.init_hidden()
            title_hidden = model(title_tensor)
            title_encoding = get_encodings(title_hidden,
                                           title_lengths,
                                           use_lstm=use_lstm)
            model.hidden = model.init_hidden()
            body_hidden = model(body_tensor)
            body_encoding = get_encodings(body_hidden,
                                          body_lengths,
                                          use_lstm=use_lstm)
            # Compute loss, gradients, update parameters
            # Could potentially do something about the last batch, but prolly won't affect training that much
            X, y = generate_score_matrix(title_encoding, body_encoding)
            loss = loss_function(X, y)
            total_loss += loss.data[0]
            loss.backward()
            optimizer.step()

            # every so while, check the dev accuracy
            # if i % 10 == 0:
            #     print_and_write("For batch number " + str(i) + " it has taken " + str(time() - orig_time) + " seconds and has loss " + str(total_loss))
            # if i > 0 and i % 100 == 0:
            #     evaluate_model(model, use_lstm=use_lstm)
        print_and_write("For epoch number " + str(epoch) + " it has taken " +
                        str(time() - orig_time) + " seconds and has loss " +
                        str(total_loss))
        evaluate_model(model, use_lstm=use_lstm)
        evaluate_model(model, use_test_data=True, use_lstm=use_lstm)
        if SAVE_MODELS:
            save_checkpoint(epoch, model, optimizer, use_lstm)
        #gc.collect()
    return model
Example #29
    def _init_model(self, states=None):
        """Initialize model, override to change model setup."""
        opt = self.opt

        kwargs = opt_to_kwargs(opt)
        self.model = SteroidSeq2seq(len(self.dict),
                                    opt['embeddingsize'],
                                    opt['hiddensize'],
                                    padding_idx=self.NULL_IDX,
                                    start_idx=self.START_IDX,
                                    longest_label=states.get(
                                        'longest_label', 1),
                                    **kwargs)

        if (opt.get('dict_tokenizer') == 'bpe'
                and opt['embedding_type'] != 'random'):
            print('skipping preinitialization of embeddings for bpe')
        elif not states and opt['embedding_type'] != 'random':
            # `not states`: only set up embeddings if not loading model
            self._copy_embeddings(self.model.decoder.lt.weight,
                                  opt['embedding_type'])
            if opt['lookuptable'] in ['unique', 'dec_out']:
                # also set encoder lt, since it's not shared
                self._copy_embeddings(self.model.encoder.lt.weight,
                                      opt['embedding_type'],
                                      log=False)
        if opt['embedding_type'].endswith('fixed'):
            print('Seq2seq: fixing embedding weights.')
            self.model.decoder.lt.weight.requires_grad = False
            self.model.encoder.lt.weight.requires_grad = False
            if opt['lookuptable'] in ['dec_out', 'all']:
                self.model.decoder.e2s.weight.requires_grad = False
        self.id = 'SteroidSeq2seq'
        self.metrics['rank_loss'] = 0.0
        self.metrics['total_batches'] = 0.0
        self.metrics['overlap'] = 0
        self.overlap_count = {
            'predicted': 0,
            'ranked0': 0,
            'ranked1': 0,
            'ranked2': 0,
            'ranked3': 0,
            'ranked4': 0
        }  # for word overlap numerator
        self.num_predicted_count = 0  # for word overlap denominator
        self.injpred_selected_count = 0
        self.pred_count = 0
        self.iter_cand = opt['iter_cand']
        self.count_overlaps = opt['count_overlaps']
        self.howtorank = opt['howtorank']
        self.min_hamming_dist = opt['min_hamming_dist']

        if opt['cand_type'] == 'all':
            self.cand_type = ['current_labels', 'history']
        else:
            self.cand_type = [opt['cand_type']]

        self.model.post_ranker = nn.ModuleList()

        rank_hidden = self.opt.get('rankhiddensize', 512)
        rank_activation = getattr(nn, self.opt['rank_activation'])
        self.model.post_ranker.append(
            nn.Linear(self.opt['hiddensize'], rank_hidden))
        self.model.post_ranker.append(rank_activation())

        for i in range(self.opt.get('ranknl', 2) - 2):
            self.model.post_ranker.append(nn.Linear(rank_hidden, rank_hidden))
            self.model.post_ranker.append(rank_activation())
        self.model.post_ranker.append(nn.Linear(rank_hidden, 1))

        if states:
            # set loaded states if applicable
            self.model.load_state_dict(states['model'],
                                       strict=self.opt['strict_load'])

        if self.use_cuda:
            self.model.cuda()

        if self.opt['rankloss'] == 'margin':
            self.rank_criterion = nn.MultiMarginLoss(
                margin=self.opt['margin'],
                reduction=self.opt['ranklossreduce'])
        elif self.opt['rankloss'] == 'ce':
            self.rank_criterion = nn.CrossEntropyLoss(
                reduction=self.opt['ranklossreduce'])
        self.inject = self.opt['dump_all_preds']

        assert self.opt.get(
            'person_tokens',
            False) is True, 'We extract past labels using person tokens'
Example #30
def mml_class_loss(pred_class, gt_class):
    _loss = nn.MultiMarginLoss(reduce=False)
    labels = gt_class.nonzero()[:, 1]
    l = _loss(pred_class, labels)
    l = l.mean()
    return l
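Since reduce=False returns one hinge value per sample before the explicit .mean(), gt_class is expected to be a one-hot matrix whose nonzero column index is the class label. A small call sketch, assuming the function above is in scope and using made-up shapes:

import torch

# Assumes mml_class_loss from above is already defined/imported.
pred_class = torch.randn(4, 6)                        # (batch, num_classes)
gt_class = torch.eye(6)[torch.tensor([1, 0, 5, 2])]   # one-hot ground truth
print(mml_class_loss(pred_class, gt_class))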