Example #1
 def __init__(self, hidden_size, word_emb_size):
     super(NZSigmoidLoss, self).__init__()
     require_type_lst = utils.get_ontoNotes_train_types()
     self.weight = nn.Parameter(
         torch.zeros(len(require_type_lst),
                     hidden_size * 2 + word_emb_size))
     utils.init_weight(self.weight)
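All of the `utils.init_weight(tensor)` calls in these PyTorch examples rely on a project-specific helper that is not shown on this page. A minimal sketch of what such an in-place initializer could look like, assuming Xavier-uniform initialization (the actual scheme inside `utils` is an assumption):

import torch.nn as nn

def init_weight(weight):
    # hypothetical stand-in for utils.init_weight: fill the given
    # parameter/tensor in place with Xavier-uniform values
    nn.init.xavier_uniform_(weight)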
Example #2
 def __init__(self, hidden_size, word_emb_size):
     super(NZSigmoidLoss, self).__init__()
     require_type_lst = get_type_lst(fg_config['data'])
     self.weight = nn.Parameter(
         torch.zeros(len(require_type_lst),
                     hidden_size * 2 + word_emb_size))
     utils.init_weight(self.weight)
Example #3
    def __init__(self, V, D, K, activation):
        self.D = D
        self.f = activation

        # word embedding
        We = init_weight(V, D)

        # linear terms
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)

        # bias
        bh = np.zeros(D)

        # output layer
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        # make them tensorflow variables
        self.We = tf.Variable(We.astype(np.float32))
        self.W1 = tf.Variable(W1.astype(np.float32))
        self.W2 = tf.Variable(W2.astype(np.float32))
        self.bh = tf.Variable(bh.astype(np.float32))
        self.Wo = tf.Variable(Wo.astype(np.float32))
        self.bo = tf.Variable(bo.astype(np.float32))
        self.params = [self.We, self.W1, self.W2, self.Wo]
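Examples in this NumPy/Theano/TensorFlow style call a free function `init_weight(rows, cols)` that returns a plain array. A plausible definition, assuming a scaled Gaussian (a sketch, not the original helper):

import numpy as np

def init_weight(M1, M2):
    # hypothetical helper: zero-mean Gaussian weights scaled by the layer sizes
    return np.random.randn(M1, M2) / np.sqrt(M1 + M2)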
Example #4
    def __init__(self, block, num_classes=10):
        super(PreActResNet_MR, self).__init__()

        self.register_parameter('conv0', init_weight(64, 3, 3, 3))

        self.layer1_left = self.make_layer(block, 64, 64)
        self.layer1_right = self.make_layer(block, 64, 64)

        self.layer20_left = self.make_layer(block, 64, 128)
        self.layer20_right = self.make_layer(block, 64, 128)
        self.layer21_left = self.make_layer(block, 128, 128)
        self.layer21_right = self.make_layer(block, 128, 128)

        self.layer30_left = self.make_layer(block, 128, 256)
        self.layer30_right = self.make_layer(block, 128, 256)
        self.layer31_left = self.make_layer(block, 256, 256)
        self.layer31_right = self.make_layer(block, 256, 256)

        self.layer40_left = self.make_layer(block, 256, 512)
        self.layer40_right = self.make_layer(block, 256, 512)
        self.layer41_left = self.make_layer(block, 512, 512)
        self.layer41_right = self.make_layer(block, 512, 512)

        self.bn4 = nn.BatchNorm2d(512)
        self.register_parameter('fc', init_weight(num_classes, 512))
        self.bn5 = nn.BatchNorm1d(num_classes)
        self.ls = nn.LogSoftmax(dim=1)
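Here `register_parameter` requires an `nn.Parameter`, so `init_weight` presumably builds one from the given shape. A minimal sketch under the assumption of Kaiming-normal initialization (hypothetical, not the repository's actual helper):

import torch
import torch.nn as nn

def init_weight(*shape):
    # hypothetical: create a parameter of the requested shape and initialize it in place
    w = nn.Parameter(torch.empty(*shape))
    nn.init.kaiming_normal_(w)
    return w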
Example #5
    def __init__(self,
                 input_dim,
                 output_dim,
                 activation=None,
                 learning_rate=0.2):
        self.input_dim = input_dim  # number of neurons in the previous layer
        self.output_dim = output_dim  # number of neurons in this layer
        self.learning_rate = learning_rate

        if activation is None:
            self.activator = IdentityActivator()
        elif activation == "sigmoid":
            self.activator = SigmoidActivator()
        elif activation == "tanh":
            self.activator = TanhActivator()
        elif activation == "relu":
            self.activator = ReluActivator()
        elif activation == "softmax":
            self.activator = SoftmaxActivator()
        else:
            raise Exception('Unsupported activation function')

        # initialize the weight matrix and the bias term
        self.W = init_weight(self.output_dim, self.input_dim)
        self.b = init_weight(self.output_dim, 1)
Example #6
def train(args, local_rank, distributed):
    model = EfficientNet.from_name(args.arch)
    init_weight(model)
    device = torch.device("cuda")
    model.to(device)

    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, list(range(3, args.epochs, 3)), gamma=0.9)

    amp_opt_level = 'O0'
    if args.float16:
        amp_opt_level = 'O1'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # to update the BN parameters, comment out the option below
            # broadcast_buffers=False,
        )

    do_train(
        args,
        model,
        optimizer,
        scheduler,
        device,
    )
    return model
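In this example `init_weight` is applied to the whole model rather than to a single tensor, so it presumably walks the modules and initializes each one. A rough sketch of that pattern (the exact rules used by this repository are an assumption):

import torch.nn as nn

def init_weight(model):
    # hypothetical whole-model initializer: Kaiming for conv layers,
    # constant scale/shift for batch-norm layers
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)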
Example #7
    def rand_init(self):
        # initialize weight
        for p in [self.fioux, self.iouh, self.fh]:
            utils.init_weight(p.weight)

        # initialize forget gate bias
        self.fioux.bias.data.zero_()
        self.fioux.bias.data[:self.mem_dim] = 1
Example #8
 def __init__(self, hidden_size, word_emb_size):
     super(CtxAtt, self).__init__()
     self.hidden_size = hidden_size
     self.word_emb_size = word_emb_size
     self.att_weight = nn.Parameter(
         torch.FloatTensor(hidden_size * 2, word_emb_size))
     utils.init_weight(self.att_weight)
     self.softmax = nn.Softmax()
Example #9
    def rand_init(self):
        # initialize weight
        for p in [self.grzx, self.rzh, self.gh]:
            utils.init_weight(p.weight)

        # initialize forget gate bias
        self.grzx.bias.data.zero_()
        self.grzx.bias.data[self.mem_dim:] = 1  # bias for the z and r gates is initialized to 1
Example #10
 def __init__(self, hidden_size, word_emb_size):
     super(SigmoidLoss, self).__init__()
     self.weight0 = nn.Parameter(
         torch.zeros(hidden_size * 2 + word_emb_size * 2,
                     hidden_size * 2 + word_emb_size * 2))
     self.weight1 = nn.Parameter(
         torch.zeros(hidden_size * 2 + word_emb_size * 2, 1))
     utils.init_weight(self.weight0)
     utils.init_weight(self.weight1)
Example #11
    def rand_init(self):
        """
        Initialize
        """
        # initialize weights
        for p in [self.ioux, self.iouh, self.fx, self.fh]:
            utils.init_weight(p.weight)

        # initialize forget gate bias
        self.ioux.bias.data.zero_()
        self.fx.bias.data[:] = 1
Example #12
 def __init__(self, emb_size):
     super(AttentionFlowLayer, self).__init__()
     self.att_w = nn.Parameter(torch.FloatTensor(3 * emb_size, 1))
     # torch.nn.init.uniform(self.att_w, -config['weight_scale'], config['weight_scale'])
     utils.init_weight(self.att_w)
     self.softmax = nn.Softmax()
     if config['gate']:
         self.gate_weight = nn.Parameter(
             torch.FloatTensor(4 * emb_size, 4 * emb_size))
         torch.nn.init.uniform(self.gate_weight, -config['weight_scale'],
                               config['weight_scale'])
Example #13
    def __init__(self):
        # Output size is 5, because it needs to output the copied 5 bits
        self.size = 128

        # We'll have 1 read head, which produces a single read_vector of size 10.
        # We also need to feed in the input, which is of size 5 (for the five bits),
        # so our total input size is 15
        self.fc_1 = init_weight(15, 128)
        self.fc_2 = init_weight(128, 128)

        # This is our controller output
        self.fc_3 = init_weight(128, 128)
Example #14
    def __init__(self, controller_size,
                 memory_slots, slot_size, batch_size):
        self.controller_size = controller_size
        self.memory_slots = memory_slots
        self.slot_size = slot_size

        self.batch_size = batch_size

        # For now we'll only allow one shift backwards or forwards
        self.weights = {
                            "controller->key"           : init_weight(self.controller_size, self.slot_size),
                            "controller->shift"         : init_weight(self.controller_size, 3),     
                            "controller->sharpen"       : init_weight(self.controller_size, 1),        
                            "controller->strengthen"    : init_weight(self.controller_size, 1),
                            "controller->interpolation" : init_weight(self.controller_size, 1),
                       }
Example #15
 def __init__(self, emb_file):
     with codecs.open(emb_file, mode='rb', encoding='utf-8') as f:
         for i, line in enumerate(f):
             line = line.strip()
             if line:
                 if i == 0:
                     parts = line.split(' ')
                     self.voc_size = int(parts[0]) + 8
                     self.emb_size = int(parts[1])
                     self.embedding_tensor = torch.zeros(
                         self.voc_size, self.emb_size)
                     utils.init_weight(self.embedding_tensor)
                 else:
                     parts = line.split(' ')
                     for j, part in enumerate(parts[1:]):
                         self.embedding_tensor[i + 2, j] = float(part)
Example #16
    def rand_init(self, init_embedding=False):
        """
        random initialization

        args:
            init_embedding: random initialize word embedding or not
        """
        if init_embedding:
            utils.init_embedding(self.word_embeds.weight)
        if self.position:
            utils.init_embedding(self.position_embeds.weight)
        utils.init_lstm(self.lstm)
        utils.init_linear(self.att2out)
        utils.init_weight(self.relation_embeds)
        self.attention.rand_init()
        self.att_weight = None
Example #17
 def __init__(self, M1, M2, an_id):
     self.id = an_id
     self.M1 = M1
     self.M2 = M2
     W = init_weight(M1, M2)
     b = np.zeros(M2)
     self.W = theano.shared(W, "W_%s" % self.id)
     self.b = theano.shared(b, "b_%s" % self.id)
     self.params = [self.W, self.b]
Example #18
    def __init__(self, Mi, Mo, activation):
        self.Mi = Mi
        self.Mo = Mo
        self.f = activation

        Wxh = init_weight(Mi, Mo)
        Whh = init_weight(Mo, Mo)
        bh = np.zeros(Mo)
        h0 = np.zeros(Mo)
        Wxr = init_weight(Mi, Mo)
        Whr = init_weight(Mo, Mo)
        br = np.zeros(Mo)
        Wxz = init_weight(Mi, Mo)
        Whz = init_weight(Mo, Mo)
        bz = np.zeros(Mo)

        self.Wxh = theano.shared(Wxh)
        self.Whh = theano.shared(Whh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wxr = theano.shared(Wxr)
        self.Whr = theano.shared(Whr)
        self.br = theano.shared(br)
        self.Wxz = theano.shared(Wxz)
        self.Whz = theano.shared(Whz)
        self.bz = theano.shared(bz)
        self.params = [
            self.Wxh, self.Whh, self.bh, self.h0, self.Wxr, self.Whr, self.br,
            self.Wxz, self.Whz, self.bz
        ]
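For context, the shared variables above are the parameters of a standard GRU. A sketch of the single-step recurrence they would typically drive, written as a free function over the layer object `p` (hypothetical; the original class defines its own recurrence elsewhere):

import theano.tensor as T

def gru_step(x_t, h_t1, p):
    # hypothetical one-step GRU update built from the weights created above
    r_t = T.nnet.sigmoid(x_t.dot(p.Wxr) + h_t1.dot(p.Whr) + p.br)
    z_t = T.nnet.sigmoid(x_t.dot(p.Wxz) + h_t1.dot(p.Whz) + p.bz)
    h_hat = p.f(x_t.dot(p.Wxh) + (r_t * h_t1).dot(p.Whh) + p.bh)
    return (1 - z_t) * h_t1 + z_t * h_hat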
Example #19
    def __init__(self,
                 controller,
                 output_size,
                 memory_slots=32,
                 slot_size=10,
                 read_heads=1,
                 batch_size=10):
        """
            NTM.__init__(controller, output_size, memory_slots, slot_size, read_heads, batch_size) -> None

            initializes a Neural Turing Machine

            @param controller: controller is another class, which must have a method called `forward` and another called `get_weights`. This controller will process the read_vectors and the external input. This implementation modularizes the NTM, meaning that the controller can be defined however the user wants to.
            @param memory_slots: the value M, the number of memory slots we have available
            @param slot_size: the value N, how much data can be stored in each individual slot
            @param read_heads: the number of read heads we have
            @param batch_size: batch size
        """

        self.controller = controller
        self.output_size = output_size

        # This represents the OUTPUT size of the controller
        self.controller_size = controller.size

        self.memory_slots = memory_slots
        self.slot_size = slot_size

        self.batch_size = batch_size

        self.read_heads = [
            readHead(controller_size=self.controller_size,
                     memory_slots=self.memory_slots,
                     slot_size=self.slot_size,
                     batch_size=self.batch_size) for i in range(read_heads)
        ]
        self.write_head = writeHead(controller_size=self.controller_size,
                                    memory_slots=self.memory_slots,
                                    slot_size=self.slot_size,
                                    batch_size=self.batch_size)

        self.output_weight = init_weight(self.controller_size,
                                         self.output_size)

        self.weights = [self.output_weight]
        #self.weights = []
        for head in self.read_heads:
            self.weights += head.get_weights()

        self.weights += self.write_head.get_weights()
        self.weights += self.controller.get_weights()
Example #20
 def __init__(self,
              hidden_size,
              word_emb_size,
              dropout_p=fg_config['dropout']):
     super(WARPLoss, self).__init__()
     require_type_lst = None
     if fg_config['data'] == 'onto':
         require_type_lst = utils.get_ontoNotes_train_types()
     elif fg_config['data'] == 'wiki':
         require_type_lst = utils.get_wiki_types()
     elif fg_config['data'] == 'bbn':
         require_type_lst = utils.get_bbn_types()
     num_labels = len(require_type_lst)
     self.weight = nn.Parameter(
         torch.zeros(hidden_size * 2 + word_emb_size, word_emb_size))
     utils.init_weight(self.weight)
     self.rank_weights = [1.0 / 1]
     for i in range(1, num_labels):
         self.rank_weights.append(self.rank_weights[i - 1] + 1.0 / (i + 1))
     self.trans = nn.Linear(hidden_size * 2 + word_emb_size, word_emb_size)
     utils.init_linear(self.trans)
     self.activate = nn.ReLU()
     self.dropout = nn.Dropout(dropout_p)
Example #21
    def __init__(self, block, num_classes=10):
        super(PreActResNet_MR, self).__init__()

        self.register_parameter('conv0', init_weight(64, 3, 3, 3))

        self.layer1 = self.make_layer(block, 64, 64)

        self.layer20 = self.make_layer(block, 64, 128)
        self.register_parameter('shortcut1', init_weight(128, 64, 1, 1))
        self.layer21 = self.make_layer(block, 128, 128)

        self.layer30 = self.make_layer(block, 128, 256)
        self.register_parameter('shortcut2', init_weight(256, 128, 1, 1))
        self.layer31 = self.make_layer(block, 256, 256)

        self.layer40 = self.make_layer(block, 256, 512)
        self.register_parameter('shortcut3', init_weight(512, 256, 1, 1))
        self.layer41 = self.make_layer(block, 512, 512)

        self.bn4 = nn.BatchNorm2d(512)
        self.register_parameter('fc', init_weight(num_classes, 512))
        self.bn5 = nn.BatchNorm1d(num_classes)
        self.ls = nn.LogSoftmax(dim=1)
Example #22
    def __init__(self, hidden_size, word_emb_size):
        super(NZCtxAtt, self).__init__()
        self.hidden_size = hidden_size
        self.word_emb_size = word_emb_size
        if fg_config['att'] == 'label_att':
            self.att_weight = nn.Parameter(
                torch.FloatTensor(hidden_size * 2, word_emb_size))
            utils.init_weight(self.att_weight)
        elif fg_config['att'] == 'orig_att':
            self.We = nn.Parameter(
                torch.FloatTensor(hidden_size * 2, fg_config['Da']))
            utils.init_weight(self.We)
            self.Wa = nn.Parameter(torch.FloatTensor(fg_config['Da'], 1))
            utils.init_weight(self.Wa)
        elif fg_config['att'] == 'no':
            self.att_weight = nn.Parameter(
                torch.FloatTensor(hidden_size * 2, word_emb_size))
            utils.init_weight(self.att_weight)

        self.softmax = nn.Softmax()
Example #23
    def __init__(self, units, input_dim):
        self.units = units
        self.input_dim = input_dim
        concat_len = input_dim + units

        self.wg = init_weight(units, concat_len)
        self.wi = init_weight(units, concat_len)
        self.wf = init_weight(units, concat_len)
        self.wo = init_weight(units, concat_len)
        self.bg = init_weight(units)
        self.bi = init_weight(units)
        self.bf = init_weight(units)
        self.bo = init_weight(units)
        # derivative of loss function w.r.t. all parameters
        self.wg_diff = np.zeros((units, concat_len))
        self.wi_diff = np.zeros((units, concat_len))
        self.wf_diff = np.zeros((units, concat_len))
        self.wo_diff = np.zeros((units, concat_len))
        self.bg_diff = np.zeros(units)
        self.bi_diff = np.zeros(units)
        self.bf_diff = np.zeros(units)
        self.bo_diff = np.zeros(units)
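Note that this example calls `init_weight` with either one or two sizes, so the helper must accept a variable number of dimensions. A plausible NumPy version (the scale is an assumption):

import numpy as np

def init_weight(*dims):
    # hypothetical helper matching both the 1-D and 2-D calls above:
    # small uniform random values with the requested shape
    return np.random.uniform(-0.1, 0.1, dims)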
Example #24
def main():
    # Get training options
    opt = get_opt()

    device = torch.device("cuda") if opt.cuda else torch.device("cpu")

    # Define the networks
    # netG_A: used to transfer image from domain A to domain B
    netG_A = networks.Generator(opt.input_nc, opt.output_nc, opt.ngf, opt.n_res, opt.dropout)
    if opt.u_net:
        netG_A = networks.U_net(opt.input_nc, opt.output_nc, opt.ngf)

    # netD_B: used to test whether an image is from domain A
    netD_B = networks.Discriminator(opt.input_nc + opt.output_nc, opt.ndf)

    # Initialize the networks
    if opt.cuda:
        netG_A.cuda()
        netD_B.cuda()
    utils.init_weight(netG_A)
    utils.init_weight(netD_B)

    if opt.pretrained:
        netG_A.load_state_dict(torch.load('pretrained/netG_A.pth'))
        netD_B.load_state_dict(torch.load('pretrained/netD_B.pth'))


    # Define the loss functions
    criterion_GAN = utils.GANLoss()
    if opt.cuda:
        criterion_GAN.cuda()

    criterion_l1 = torch.nn.L1Loss()

    # Define the optimizers
    optimizer_G = torch.optim.Adam(netG_A.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
    optimizer_D_B = torch.optim.Adam(netD_B.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))

    # Create learning rate schedulers
    lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(optimizer_G, lr_lambda = utils.Lambda_rule(opt.epoch, opt.n_epochs, opt.n_epochs_decay).step)
    lr_scheduler_D_B = torch.optim.lr_scheduler.LambdaLR(optimizer_D_B, lr_lambda = utils.Lambda_rule(opt.epoch, opt.n_epochs, opt.n_epochs_decay).step)


    # Define the transform, and load the data
    transform = transforms.Compose([transforms.Resize((opt.sizeh, opt.sizew)),
                transforms.ToTensor(),
                transforms.Normalize((0.5,), (0.5,))])
    dataloader = DataLoader(PairedImage(opt.rootdir, transform = transform, mode = 'train'), batch_size=opt.batch_size, shuffle=True, num_workers=opt.n_cpu)

    # numpy arrays to store the loss of epoch
    loss_G_array = np.zeros(opt.n_epochs + opt.n_epochs_decay)
    loss_D_B_array = np.zeros(opt.n_epochs + opt.n_epochs_decay)

    # Training
    for epoch in range(opt.epoch, opt.n_epochs + opt.n_epochs_decay):
        start = time.strftime("%H:%M:%S")
        print("current epoch :", epoch, " start time :", start)
        # Empty list to store the loss of each mini-batch
        loss_G_list = []
        loss_D_B_list = []

        for i, batch in enumerate(dataloader):
            if i % 20 == 1:
                print("current step: ", i)
                current = time.strftime("%H:%M:%S")
                print("current time :", current)
                print("last loss G_A:", loss_G_list[-1],  "last loss D_B:", loss_D_B_list[-1])

            real_A = batch['A'].to(device)
            real_B = batch['B'].to(device)

            # Train the generator
            utils.set_requires_grad([netG_A], True)
            optimizer_G.zero_grad()

            # Compute fake images and reconstructed images
            fake_B = netG_A(real_A)


            # discriminators require no gradients when optimizing generators
            utils.set_requires_grad([netD_B], False)


            # GAN loss
            prediction_fake_B = netD_B(torch.cat((fake_B, real_A), dim=1))
            loss_gan = criterion_GAN(prediction_fake_B, True)

            #L1 loss
            loss_l1 = criterion_l1(real_B, fake_B) * opt.l1_loss

            # total loss without the identity loss
            loss_G = loss_gan + loss_l1

            loss_G_list.append(loss_G.item())
            loss_G.backward()
            optimizer_G.step()

            # Train the discriminator
            utils.set_requires_grad([netG_A], False)
            utils.set_requires_grad([netD_B], True)

            # Train the discriminator D_B
            optimizer_D_B.zero_grad()
            # real images
            pred_real = netD_B(torch.cat((real_B, real_A), dim=1))
            loss_D_real = criterion_GAN(pred_real, True)

            # fake images
            fake_B = netG_A(real_A)
            pred_fake = netD_B(torch.cat((fake_B, real_A), dim=1))
            loss_D_fake = criterion_GAN(pred_fake, False)

            # total loss
            loss_D_B = (loss_D_real + loss_D_fake) * 0.5
            loss_D_B_list.append(loss_D_B.item())
            loss_D_B.backward()
            optimizer_D_B.step()

        # Update the learning rate
        lr_scheduler_G.step()
        lr_scheduler_D_B.step()

        # Save models checkpoints
        torch.save(netG_A.state_dict(), 'model/netG_A_pix.pth')
        torch.save(netD_B.state_dict(), 'model/netD_B_pix.pth')

        # Save other checkpoint information
        checkpoint = {'epoch': epoch,
                      'optimizer_G': optimizer_G.state_dict(),
                      'optimizer_D_B': optimizer_D_B.state_dict(),
                      'lr_scheduler_G': lr_scheduler_G.state_dict(),
                      'lr_scheduler_D_B': lr_scheduler_D_B.state_dict()}
        torch.save(checkpoint, 'model/checkpoint.pth')



        # Update the numpy arrays that record the loss
        loss_G_array[epoch] = sum(loss_G_list) / len(loss_G_list)
        loss_D_B_array[epoch] = sum(loss_D_B_list) / len(loss_D_B_list)
        np.savetxt('model/loss_G.txt', loss_G_array)
        np.savetxt('model/loss_D_B.txt', loss_D_B_array)

        end = time.strftime("%H:%M:%S")
        print("current epoch :", epoch, " end time :", end)
        print("G loss :", loss_G_array[epoch], "D_B loss :", loss_D_B_array[epoch])
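As in Example #6, `utils.init_weight` here takes a whole network. Image-to-image GAN code commonly uses a zero-mean normal with std 0.02; a sketch under that assumption (not necessarily what this project's `utils` does):

import torch.nn as nn

def init_weight(net):
    # hypothetical DCGAN-style initializer: N(0, 0.02) for conv weights,
    # N(1, 0.02) for batch-norm scale, zeros for batch-norm bias
    def init_fn(m):
        name = m.__class__.__name__
        if name.find('Conv') != -1 and hasattr(m, 'weight'):
            nn.init.normal_(m.weight.data, 0.0, 0.02)
        elif name.find('BatchNorm2d') != -1:
            nn.init.normal_(m.weight.data, 1.0, 0.02)
            nn.init.constant_(m.bias.data, 0.0)
    net.apply(init_fn)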
Example #25
    def fit(self,
            fpath=None,
            data=None,
            n_features=None,
            n_epoch=1,
            r=1,
            c=1,
            **kargs):
        """
        Parameters
        -------------
        fpath: str. file dir for training set
        data: DataFrame. training set, needed only if fpath not given
        n_features: int. # features, needed only if fpath not given
        r: float. learning rate
        c: float. tradeoff between regularizer and loss
        n_epoch: int. max epoch

        Returns
        --------
        self: object
        """
        assert (r <= 1), "Learning rate must be no more than one!"

        if data is None:  # if data not given, read from file
            n_features, n_samples, data_train = get_data(fpath)
        else:  # else get data directly
            data_train = data
            n_samples = data_train.shape[0]
            x_temp = data_train.iloc[0]['X']
            if type(x_temp) == list:
                n_features = len(x_temp)
            else:
                n_features = x_temp.shape[0]

        # 1. initialize weight, r
        w = init_weight(n_features)
        r_0 = r

        # 2. for epoch = 1...T
        for epoch in range(1, n_epoch + 1):
            # shuffle traning set
            data = shuffle_samples(data_train, epoch)
            data.index = range(data.shape[0])
            self.data_train = data
            # for each example, update weight
            r = r_0 / epoch
            for t in range(n_samples):
                y = data.loc[t, 'y']
                x = data.loc[t, 'X']
                if type(x) == list: x = np.array(x)
                assert (w.shape == x.shape), "dim(w) != dim(x)"
                loss = y * np.dot(w, x)
                if loss <= 1:
                    w = (1 - r) * w + r * c * y * x
                else:
                    w = (1 - r) * w
            # print objective
            jw = 0.5 * np.dot(w, w) + c * max(0, 1 - loss)
            print("Epoch = %s   J(w) = %1.4f" % (epoch, jw))

        # 3. return w
        self.weight = w

        return self
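For the linear SVM- and logistic-regression-style trainers in this example and the next, `init_weight(n_features)` only needs to return a 1-D weight vector. A minimal sketch, assuming small zero-centered random values (hypothetical):

import numpy as np

def init_weight(n_features):
    # hypothetical: small random starting weight vector for a linear model
    return np.random.uniform(-0.01, 0.01, n_features)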
Example #26
    def fit(self,
            fpath=None,
            data=None,
            n_features=None,
            n_epoch=1,
            r=1,
            sigma=1,
            **kargs):
        """
        Parameters
        -------------
        fpath: str. file dir for training set
        data: DataFrame. training set, needed only if fpath not given
        n_features: int. # features, needed only if fpath not given
        r: float. learning rate
        sigma: float. Tradeoff
        n_epoch: int. epoch

        Returns
        --------
        self: object
        """

        # 0. get training set
        if data is None:
            n_features, n_samples, data_train = get_data(fpath)
        else:
            data_train = data
            n_samples = data_train.shape[0]

            x_temp = data_train.iloc[0]['X']
            if type(x_temp) == list:
                n_features = len(x_temp)
            else:
                n_features = x_temp.shape[0]

        # 1. initialize weight, r
        w = init_weight(n_features)
        r_0 = r

        # 2. for each epoch:
        for epoch in range(1, n_epoch + 1):
            jw = []
            # (1) shuffle training set
            data_train = shuffle_samples(data_train, epoch)
            # (2) update weight
            #r = r_0 / epoch # diminishing r
            for i in range(n_samples):
                y = data_train.iloc[i, 0]
                x = data_train.iloc[i, 1]
                grad = (2 / sigma**2) * w - (y * x) / (
                    1 + np.exp(y * np.dot(w, x)))
                assert (w.shape == x.shape), "dim(w) != dim(x)"
                assert (grad.shape == w.shape), "dim(w) != dim(gradient)"
                w = w - r * grad
                jw.append((1 / sigma**2) * np.dot(w, w) +
                          np.log(1 + np.exp(-y * np.dot(w, x))))
            # print objective
            jw = np.mean(jw)
            print("Epoch = %s   J(w) = %1.4f" % (epoch, jw))

        # 3. return w
        self.weight = w

        return self
Example #27
    def fit(self,
            X,
            Y,
            learning_rate=10e-3,
            mu=0.99,
            reg=10e-12,
            eps=10e-10,
            epochs=400,
            batch_sz=20,
            print_period=1,
            show_fig=False):
        Y = Y.astype(np.int32)

        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []
        M1 = D
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1

        # for the last layer
        W = init_weight(M1, K)
        b = np.zeros(K)
        self.W = theano.shared(W, "W_logreg")
        self.b = theano.shared(b, "b_logreg")
        self.params = [self.W, self.b]

        for h in self.hidden_layers:
            self.params += h.params

        #adding momentum
        dparams = [
            theano.shared(np.zeros(p.get_value().shape)) for p in self.params
        ]

        thX = T.matrix('X')
        thY = T.ivector('Y')
        pY = self.forward(thX)

        rcost = reg * T.sum([(p * p).sum() for p in self.params])
        cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
        prediction = self.predict(thX)
        grads = T.grad(cost, self.params)

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates,
        )

        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                c, p = train_op(Xbatch, Ybatch)

                if j % print_period == 0:
                    costs.append(c)
                    e = np.mean(Ybatch != p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                          "error_rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #28
    def fit(self,
            X,
            Y,
            learning_rate=10e-1,
            mu=0.99,
            reg=1.0,
            epochs=500,
            show_fig=False,
            activation=T.tanh):
        M = self.M
        V = self.V
        K = len(set(Y))

        X, Y = shuffle(X, Y)
        Nvalid = 10
        Xvalid, Yvalid = X[-Nvalid:], Y[-Nvalid:]
        X, Y = X[:-Nvalid], Y[:-Nvalid]
        N = len(X)

        Wx = init_weight(V, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, K)
        bo = np.zeros(K)
        thX, thY, py_x, prediction = self.set(Wx, Wh, bh, h0, Wo, bo,
                                              activation)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grad = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]
        lr = T.scalar('learning_rate')

        updates = [(p, p + mu * dp - lr * g)
                   for p, dp, g in zip(self.params, dparams, grad)
                   ] + [(dp, mu * dp - lr * g) for dp, g in zip(dparams, grad)]

        self.train_op = theano.function(
            inputs=[thX, thY, lr],
            outputs=[cost, prediction],
            updates=updates,
            allow_input_downcast=True,
        )

        costs = []
        for i in range(epochs):
            cost = 0
            Ncorrect = 0
            X, Y = shuffle(X, Y)
            for j in range(N):
                c, p = self.train_op(X[j], Y[j], learning_rate)
                cost += c
                if p == Y[j]:
                    Ncorrect += 1
            costs.append(cost)
            learning_rate *= 0.9999

            NVcorrect = 0
            for j in range(Nvalid):
                input = Xvalid[j]
                predict = self.predict_op(input)
                if predict == Yvalid[j]:
                    NVcorrect += 1

            print('epoch: %d, cost: %f ,accuracy: %f' %
                  (i, cost, Ncorrect / N))
            print('Validation accuracy: ', NVcorrect / Nvalid)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #29
    def fit(self,
            X,
            learning_rate=10e-1,
            mu=0.99,
            reg=1.0,
            activation=T.tanh,
            epochs=500,
            show_fig=False):
        N = len(X)
        D = self.D
        M = self.M
        V = self.V
        self.f = activation

        We = init_weight(V, D)
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, V)
        bo = np.zeros(V)

        self.We = theano.shared(We)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [
            self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo
        ]

        thX = T.ivector('X')
        Ei = self.We[thX]  # TxD
        thY = T.ivector('Y')

        def recurrence(x_t, h_t1):
            #returns h(t), y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        [h, y], _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0, None],
            sequences=Ei,
            n_steps=Ei.shape[0],
        )

        py_x = y[:, 0, :]
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        self.predict_op = theano.function(
            inputs=[thX],
            outputs=prediction,
        )

        self.train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates,
        )

        costs = []
        n_total = sum((len(sentence) + 1) for sentence in X)
        for i in range(epochs):
            X = shuffle(X)
            cost = 0
            n_correct = 0
            for j in range(N):
                input_sequence = [0] + X[j]
                output_sequence = X[j] + [1]

                c, p = self.train_op(input_sequence, output_sequence)
                cost += c
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
            print('i:', i, 'cost:', cost, 'correct rate:',
                  (float(n_correct) / n_total))
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #30
    def fit(self,
            trees,
            test_trees,
            reg=1e-3,
            epochs=8,
            train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        N = len(trees)

        We = init_weight(V, D)
        W11 = np.random.randn(D, D, D) / np.sqrt(3 * D)
        W22 = np.random.randn(D, D, D) / np.sqrt(3 * D)
        W12 = np.random.randn(D, D, D) / np.sqrt(3 * D)
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = tf.Variable(We.astype(np.float32))
        self.W11 = tf.Variable(W11.astype(np.float32))
        self.W22 = tf.Variable(W22.astype(np.float32))
        self.W12 = tf.Variable(W12.astype(np.float32))
        self.W1 = tf.Variable(W1.astype(np.float32))
        self.W2 = tf.Variable(W2.astype(np.float32))
        self.bh = tf.Variable(bh.astype(np.float32))
        self.Wo = tf.Variable(Wo.astype(np.float32))
        self.bo = tf.Variable(bo.astype(np.float32))
        self.weights = [
            self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.Wo
        ]

        words = tf.placeholder(tf.int32, shape=(None, ), name='words')
        left_children = tf.placeholder(tf.int32,
                                       shape=(None, ),
                                       name='left_children')
        right_children = tf.placeholder(tf.int32,
                                        shape=(None, ),
                                        name='right_children')
        labels = tf.placeholder(tf.int32, shape=(None, ), name='labels')

        # save for later
        self.words = words
        self.left = left_children
        self.right = right_children
        self.labels = labels

        def dot1(a, B):
            return tf.tensordot(a, B, axes=[[0], [1]])

        def dot2(B, a):
            return tf.tensordot(B, a, axes=[[1], [0]])

        def recursive_net_transform(hiddens, n):
            h_left = hiddens.read(left_children[n])
            h_right = hiddens.read(right_children[n])
            return self.f(
                dot1(h_left, dot2(self.W11, h_left)) +
                dot1(h_right, dot2(self.W22, h_right)) +
                dot1(h_left, dot2(self.W12, h_right)) + dot1(h_left, self.W1) +
                dot1(h_right, self.W2) + self.bh)

        def recurrence(hiddens, n):
            w = words[n]
            # any non-word will have index -1

            h_n = tf.cond(w >= 0, lambda: tf.nn.embedding_lookup(self.We, w),
                          lambda: recursive_net_transform(hiddens, n))
            hiddens = hiddens.write(n, h_n)
            n = tf.add(n, 1)
            return hiddens, n

        def condition(hiddens, n):
            # loop should continue while n < len(words)
            return tf.less(n, tf.shape(words)[0])

        hiddens = tf.TensorArray(tf.float32,
                                 size=0,
                                 dynamic_size=True,
                                 clear_after_read=False,
                                 infer_shape=False)

        hiddens, _ = tf.while_loop(condition,
                                   recurrence,
                                   [hiddens, tf.constant(0)],
                                   parallel_iterations=1)
        h = hiddens.stack()
        logits = tf.matmul(h, self.Wo) + self.bo

        prediction_op = tf.argmax(logits, axis=1)
        self.prediction_op = prediction_op

        rcost = reg * sum(tf.nn.l2_loss(p) for p in self.weights)
        if train_inner_nodes:
            # filter out -1s
            labeled_indices = tf.where(labels >= 0)

            cost_op = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=tf.gather(logits, labeled_indices),
                    labels=tf.gather(labels, labeled_indices),
                )) + rcost
        else:
            cost_op = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits[-1],
                    labels=labels[-1],
                )) + rcost

        train_op = tf.train.AdagradOptimizer(
            learning_rate=8e-3).minimize(cost_op)
        # train_op = tf.train.MomentumOptimizer(learning_rate=8e-3, momentum=0.9).minimize(cost_op)

        # NOTE: If you're using GPU, InteractiveSession breaks
        # AdagradOptimizer and some other optimizers
        # change to tf.Session() if so.
        self.session = tf.Session()
        init_op = tf.global_variables_initializer()
        self.session.run(init_op)

        costs = []
        sequence_indexes = list(range(N))
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            n_total = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                words_, left, right, lab = trees[j]
                # print("words_:", words_)
                # print("lab:", lab)
                c, p, _ = self.session.run(
                    (cost_op, prediction_op, train_op),
                    feed_dict={
                        words: words_,
                        left_children: left,
                        right_children: right,
                        labels: lab
                    })
                if np.isnan(c):
                    print("Cost is nan! Let's stop here. \
                        Why don't you try decreasing the learning rate?")
                    for w in self.weights:
                        print(self.session.run(w).sum())
                    exit()
                cost += c
                n_correct += (p[-1] == lab[-1])
                n_total += 1

                it += 1
                if it % 10 == 0:
                    sys.stdout.write(
                        "j/N: %d/%d correct rate so far: %f, cost so far: %f\r"
                        % (it, N, float(n_correct) / n_total, cost))
                    sys.stdout.flush()

            # calculate the test score
            n_test_correct = 0
            n_test_total = 0
            for words_, left, right, lab in test_trees:
                p = self.session.run(prediction_op,
                                     feed_dict={
                                         words: words_,
                                         left_children: left,
                                         right_children: right,
                                         labels: lab
                                     })
                n_test_correct += (p[-1] == lab[-1])
                n_test_total += 1

            print("i:", i, "cost:", cost, "train acc:",
                  float(n_correct) / n_total, "test acc:",
                  float(n_test_correct) / n_test_total, "time for epoch:",
                  (datetime.now() - t0))
            costs.append(cost)

        plt.plot(costs)
        plt.show()