Example #1
class lstmwrapper(nn.Module):
    def __init__(self,
                 input_size=66529,
                 output_size=5952,
                 hidden_size=52,
                 num_layers=16,
                 batch_first=True,
                 dropout=0.1):
        super(lstmwrapper, self).__init__()
        self.lstm = LSTM(input_size=input_size,
                         hidden_size=hidden_size,
                         num_layers=num_layers,
                         batch_first=batch_first,
                         dropout=dropout)
        self.output = nn.Linear(hidden_size, output_size)
        self.bn = nn.BatchNorm1d(input_size)
        self.reset_parameters()

    def reset_parameters(self):
        self.lstm.reset_parameters()
        self.output.reset_parameters()

    def forward(self, input, hx=None):
        # BatchNorm1d normalizes over dim 1, so move the feature dimension there
        # and back (the same permute used in the later wrapper examples)
        input = input.permute(0, 2, 1).contiguous()
        input = self.bn(input)
        input = input.permute(0, 2, 1).contiguous()
        output, statetuple = self.lstm(input, hx)
        return self.output(output)
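A minimal usage sketch for the wrapper above, assuming `LSTM` is `torch.nn.LSTM` and the usual `import torch` / `import torch.nn as nn` imports are in scope. The sizes here are hypothetical, chosen small so the example runs quickly; the defaults in the class are much larger.

import torch

# hypothetical small sizes; (batch, seq_len, input_size) layout because batch_first=True
model = lstmwrapper(input_size=32, output_size=5, hidden_size=16, num_layers=2)
x = torch.randn(8, 20, 32)
logits = model(x)            # (batch, seq_len, output_size)
print(logits.shape)          # torch.Size([8, 20, 5])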
Example #2
class LSTM_vocab(nn.Module):
    def __init__(self,
                 vocab_size=50000,
                 vocab_embed_d=512,
                 output_size=12,
                 hidden_size=256,
                 *args,
                 **kwargs):
        super(LSTM_vocab, self).__init__()
        self.src_word_emb = nn.Embedding(vocab_size,
                                         vocab_embed_d,
                                         padding_idx=0)
        self.lstm = LSTM(input_size=vocab_embed_d,
                         hidden_size=hidden_size,
                         *args,
                         **kwargs)
        self.output = nn.Linear(hidden_size, output_size)
        self.reset_parameters()

    def reset_parameters(self):
        self.lstm.reset_parameters()
        self.output.reset_parameters()

    def forward(self, input, hx=None):
        input = self.src_word_emb(input)
        output, statetuple = self.lstm(input, hx)
        # this is a design decision that can be experimented with
        output = self.output(output)
        # output=torch.max(output,dim=1)[0]
        output = output[:, -1, :]
        return output
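The commented-out `torch.max` line marks the design decision mentioned in the comment above: pool the per-step outputs over time with max instead of taking only the last time step. A small sketch of the two choices on a made-up output tensor of shape (batch, seq_len, output_size):

import torch

out = torch.randn(4, 10, 12)               # hypothetical (batch, seq_len, output_size)

last_step = out[:, -1, :]                  # final time step, as in forward() above
max_pooled = torch.max(out, dim=1)[0]      # max over time, the commented-out alternative

print(last_step.shape, max_pooled.shape)   # both torch.Size([4, 12])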
Example #3
def testbid():
    lstm = LSTM(input_size=100, hidden_size=77, batch_first=True, dropout=0.1)
    lstmbi = LSTM(input_size=100, hidden_size=77, batch_first=True, dropout=0.1, bidirectional=True)
    input = Variable(torch.randn(64, 8, 100))
    output = lstm(input, None)
    outputbi = lstmbi(input, None)
    print(output[0].shape, output[1][0].shape, output[1][1].shape)
    print(outputbi[0].shape, outputbi[1][0].shape, outputbi[1][1].shape)
    print("done")
Example #4
    def __init__(self, x, R, W, h, L, v_t):
        super(Stock_LSTM, self).__init__()

        self.x = x
        self.R = R
        self.W = W
        self.h = h
        self.L = L
        self.v_t = v_t

        self.LSTM = LSTM(input_size=self.x + self.R * self.W, hidden_size=h,
                         num_layers=L, batch_first=True, dropout=0.1)
        self.last = nn.Linear(self.h, self.v_t)
        self.st = None
Example #5
 def __init__(self,
              input_size=66529,
              output_size=5952,
              hidden_size=52,
              num_layers=16,
              batch_first=True,
              dropout=0.1):
     super(lstmwrapper, self).__init__()
     self.lstm = LSTM(input_size=input_size,
                      hidden_size=hidden_size,
                      num_layers=num_layers,
                      batch_first=batch_first,
                      dropout=dropout)
     self.output = nn.Linear(hidden_size, output_size)
     self.reset_parameters()
Example #6
    def __init__(self, hidden_size, num_window, num_mixture_components):
        super(HandwritingGenerator, self).__init__()
        # First LSTM layer, takes as input a tuple (x, y, eol)

        self.lstm1_layer = LSTM(input_size=3,
                                hidden_size=hidden_size,
                                batch_first=True)

        # self.window_layer = Attention(n_inputs=hidden_size,
        #                                    n_mixture_components=num_window)

        self.window_layer = GaussianWindow(
            input_size=hidden_size, num_components=num_mixture_components)

        self.mdn = mdn(n_inputs=hidden_size,
                       n_mixture_components=num_mixture_components)

        # Hidden State Variables

        self.hidden_size = hidden_size

        self.hidden1 = None
        self.prev_kappa = None
        # Initialize parameters
        self.reset_parameters()
Example #7
 def __init__(self,
              input_size,
              embed_size,
              hidden_size,
              aspect_size,
              num_class,
              embedding=None):
     super(ATAELSTM, self).__init__()
     self.embed_size = embed_size
     self.aspect_size = aspect_size
     self.num_class = num_class
     # embedding
     if embedding is not None:
         self.embeding = Embedding.from_pretrained(torch.Tensor(embedding))
         self.embeding.weight.requires_grad = False
     else:
         self.embeding = Embedding(input_size, embed_size, padding_idx=0)
     # (batch size, N, embedding size)
     self.apect_embeding = Embedding(aspect_size, embed_size)
     self.rnn = LSTM(input_size=embed_size,
                     hidden_size=hidden_size,
                     bidirectional=True,
                     batch_first=True,
                     num_layers=1)
     self.att = Attention(hidden_size * 2, aspect_size)
     self.fc = Linear(hidden_size * 2, num_class, bias=True)
Example #8
 def __init__(self,
              vocab_size=50000,
              vocab_embed_d=512,
              output_size=12,
              hidden_size=256,
              *args,
              **kwargs):
     super(LSTM_vocab, self).__init__()
     self.src_word_emb = nn.Embedding(vocab_size,
                                      vocab_embed_d,
                                      padding_idx=0)
     self.lstm = LSTM(input_size=vocab_embed_d,
                      hidden_size=hidden_size,
                      *args,
                      **kwargs)
     self.output = nn.Linear(hidden_size, output_size)
     self.reset_parameters()
Example #9
    def __init__(self,
                 prior,
                 input_size=52686,
                 output_size=2976,
                 hidden_size=128,
                 num_layers=16,
                 batch_first=True,
                 dropout=0.1):
        super(PriorLSTM, self).__init__()
        self.lstm = LSTM(input_size=input_size,
                         hidden_size=hidden_size,
                         num_layers=num_layers,
                         batch_first=batch_first,
                         dropout=dropout)
        self.bn = nn.BatchNorm1d(input_size)
        self.output = nn.Linear(hidden_size, output_size)
        self.reset_parameters()
        self.prior = prior
        '''prior'''
        # this is the prior probability of each label predicting true
        # this is added to the logit
        self.prior = prior
        if isinstance(self.prior, np.ndarray):
            self.prior = torch.from_numpy(self.prior).float()
            self.prior = Variable(self.prior, requires_grad=False)
        elif isinstance(self.prior, torch.Tensor):
            self.prior = Variable(self.prior, requires_grad=False)
        else:
            assert (isinstance(self.prior, Variable))

        # transform to logits
        # because we are using sigmoid, not softmax, self.prior=log(P(y))-log(P(not y))
        # sigmoid_input = z + self.prior
        # z = log(P(x|y)) - log(P(x|not y))
        # sigmoid output is the posterior positive
        self.prior = self.prior.clamp(1e-8, 1 - 1e-8)
        self.prior = torch.log(self.prior) - torch.log(1 - self.prior)
        a = Variable(torch.Tensor([0]))
        self.prior = torch.cat((a, self.prior))
        self.prior = self.prior.cuda()

        for name, param in self.named_parameters():
            print(name, param.data.shape)

        print("Using prior lstm")
Example #10
class LSTMWrapper(nn.Module):
    def __init__(self, output_size=12, hidden_size=256, *args, **kwargs):
        super(LSTMWrapper, self).__init__()
        self.lstm = LSTM(hidden_size=hidden_size, *args, **kwargs)
        self.output = nn.Linear(hidden_size, output_size)
        self.reset_parameters()

    def reset_parameters(self):
        self.lstm.reset_parameters()
        self.output.reset_parameters()

    def forward(self, input, hx=None):
        output, statetuple = self.lstm(input, hx)
        # this is a design decision that can be experimented with
        output = self.output(output)
        # output=torch.max(output,dim=1)[0]
        output = output[:, -1, :]
        return output
Example #11
    def __init__(self, input_size, hidden_size, label_size):
        super(ClassifierLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.label_size = label_size
        self.input_size = input_size

        self.lstm = LSTM(input_size, hidden_size)

        self.linear = nn.Linear(hidden_size, label_size)
Example #12
class lstmwrapperJ(nn.Module):
    def __init__(self,
                 input_size=52686,
                 output_size=2976,
                 hidden_size=128,
                 num_layers=16,
                 batch_first=True,
                 dropout=0.1):
        super(lstmwrapperJ, self).__init__()
        self.lstm = LSTM(input_size=input_size,
                         hidden_size=hidden_size,
                         num_layers=num_layers,
                         batch_first=batch_first,
                         dropout=dropout)
        self.bn = nn.BatchNorm1d(input_size)
        self.output = nn.Linear(hidden_size, output_size)
        self.reset_parameters()

        for name, param in self.named_parameters():
            print(name, param.data.shape)

    def reset_parameters(self):
        self.lstm.reset_parameters()
        self.output.reset_parameters()

    def forward(self, input, hx=None):
        input = input.permute(0, 2, 1).contiguous()
        try:
            bnout = self.bn(input)
            bnout[(bnout != bnout).detach()] = 0
        except ValueError:
            if input.shape[0] == 1:
                print("Somehow the batch size is one for this input")
                bnout = input
            else:
                raise
        input = bnout.permute(0, 2, 1).contiguous()
        output, statetuple = self.lstm(input, hx)
        output = self.output(output)
        # (batch_size, seq_len, target_dim)
        # pdb.set_trace()
        # output=output.sum(1)
        output = output.max(1)[0]
        return output
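The line `bnout[(bnout != bnout).detach()] = 0` above relies on NaN being the only value not equal to itself to zero out NaN entries after batch norm. A minimal sketch of the same idiom; newer code would usually reach for `torch.isnan` or `torch.nan_to_num` instead:

import torch

t = torch.tensor([1.0, float("nan"), 3.0])
t[t != t] = 0.0                    # NaN != NaN, so the mask selects only NaN entries
print(t)                           # tensor([1., 0., 3.])

# equivalent modern form
print(torch.nan_to_num(torch.tensor([1.0, float("nan"), 3.0])))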
Example #13
    def __init__(self,
                 input_size=52686,
                 output_size=2976,
                 hidden_size=128,
                 num_layers=16,
                 batch_first=True,
                 dropout=0.1):
        super(lstmwrapperJ, self).__init__()
        self.lstm = LSTM(input_size=input_size,
                         hidden_size=hidden_size,
                         num_layers=num_layers,
                         batch_first=batch_first,
                         dropout=dropout)
        self.bn = nn.BatchNorm1d(input_size)
        self.output = nn.Linear(hidden_size, output_size)
        self.reset_parameters()

        for name, param in self.named_parameters():
            print(name, param.data.shape)
Example #14
    def __init__(self, alphabet_size, hidden_size, num_window_components,
                 num_mixture_components):
        super(HandwritingGenerator, self).__init__()
        self.alphabet_size = alphabet_size
        self.hidden_size = hidden_size
        self.num_window_components = num_window_components
        self.num_mixture_components = num_mixture_components
        # First LSTM layer, takes as input a tuple (x, y, eol)
        self.lstm1_layer = LSTM(input_size=3,
                                hidden_size=hidden_size,
                                batch_first=True)
        # Gaussian Window layer
        self.window_layer = GaussianWindow(
            input_size=hidden_size, num_components=num_window_components)
        # Second LSTM layer, takes as input the concatenation of the input,
        # the output of the first LSTM layer
        # and the output of the Window layer
        self.lstm2_layer = LSTM(
            input_size=3 + hidden_size + alphabet_size + 1,
            hidden_size=hidden_size,
            batch_first=True,
        )

        # Third LSTM layer, takes as input the concatenation of the output of the first LSTM layer,
        # the output of the second LSTM layer
        # and the output of the Window layer
        self.lstm3_layer = LSTM(input_size=hidden_size,
                                hidden_size=hidden_size,
                                batch_first=True)

        # Mixture Density Network Layer
        self.output_layer = MDN(input_size=hidden_size,
                                num_mixtures=num_mixture_components)

        # Hidden State Variables
        self.prev_kappa = None
        self.hidden1 = None
        self.hidden2 = None
        self.hidden3 = None

        # Initialize parameters
        self.reset_parameters()
Example #15
class Stock_LSTM(nn.Module):
    """
    I prefer using this Stock LSTM for numerical stability.
    """
    def __init__(self, x, R, W, h, L, v_t):
        super(Stock_LSTM, self).__init__()

        self.x = x
        self.R = R
        self.W = W
        self.h = h
        self.L = L
        self.v_t = v_t

        self.LSTM = LSTM(input_size=self.x + self.R * self.W, hidden_size=h,
                         num_layers=L, batch_first=True, dropout=0.1)
        self.last = nn.Linear(self.h, self.v_t)
        self.st = None

    def forward(self, input_x):
        """
        :param input_x: input and memory values
        :return:
        """
        assert (self.st is not None)
        o, st = self.LSTM(input_x, self.st)
        if (st[0] != st[0]).any():
            # pickle needs binary mode; write the model and the offending input to one file
            with open("debug/lstm.pkl", "wb") as f:
                pickle.dump(self, f)
                pickle.dump(input_x, f)
            raise RuntimeError("LSTM produced a NaN, objects dumped.")
        return self.last(o), st

    def reset_parameters(self):
        self.LSTM.reset_parameters()
        self.last.reset_parameters()

    def assign_states_tuple(self, states_tuple):
        self.st = states_tuple
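`forward` asserts that a state tuple has already been assigned, so `assign_states_tuple` must be called before the first forward pass. A hedged usage sketch with made-up sizes and a zero initial state, assuming the imports used by the snippet (torch, nn, LSTM) are in scope:

import torch

# hypothetical sizes: x=10, R=2, W=4, h=32, L=1, v_t=6
model = Stock_LSTM(x=10, R=2, W=4, h=32, L=1, v_t=6)

batch, seq = 3, 5
h0 = torch.zeros(1, batch, 32)               # (num_layers, batch, hidden_size)
c0 = torch.zeros(1, batch, 32)
model.assign_states_tuple((h0, c0))

inp = torch.randn(batch, seq, 10 + 2 * 4)    # input_size = x + R * W
out, st = model(inp)
print(out.shape)                             # torch.Size([3, 5, 6])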
Example #16
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 hidden_size: int = 200,
                 num_layers: int = 2) -> None:
        super(SimpleTagger, self).__init__()

        self.vocab = vocab
        self.text_field_embedder = text_field_embedder
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = self.vocab.get_vocab_size("tags")

        # TODO(Mark): support masking once utility functions are merged.
        self.stacked_encoders = LSTM(self.text_field_embedder.get_output_dim(),
                                     self.hidden_size,
                                     self.num_layers,
                                     batch_first=True)
        self.tag_projection_layer = TimeDistributed(
            Linear(self.hidden_size, self.num_classes))
        self.sequence_loss = torch.nn.CrossEntropyLoss()
Example #17
class PriorLSTM(nn.Module):
    def __init__(self,
                 prior,
                 input_size=52686,
                 output_size=2976,
                 hidden_size=128,
                 num_layers=16,
                 batch_first=True,
                 dropout=0.1):
        super(PriorLSTM, self).__init__()
        self.lstm = LSTM(input_size=input_size,
                         hidden_size=hidden_size,
                         num_layers=num_layers,
                         batch_first=batch_first,
                         dropout=dropout)
        self.bn = nn.BatchNorm1d(input_size)
        self.output = nn.Linear(hidden_size, output_size)
        self.reset_parameters()
        self.prior = prior
        '''prior'''
        # this is the prior probability of each label predicting true
        # this is added to the logit
        self.prior = prior
        if isinstance(self.prior, np.ndarray):
            self.prior = torch.from_numpy(self.prior).float()
            self.prior = Variable(self.prior, requires_grad=False)
        elif isinstance(self.prior, torch.Tensor):
            self.prior = Variable(self.prior, requires_grad=False)
        else:
            assert (isinstance(self.prior, Variable))

        # transform to logits
        # because we are using sigmoid, not softmax, self.prior=log(P(y))-log(P(not y))
        # sigmoid_input = z + self.prior
        # z = log(P(x|y)) - log(P(x|not y))
        # sigmoid output is the posterior positive
        self.prior = self.prior.clamp(1e-8, 1 - 1e-8)
        self.prior = torch.log(self.prior) - torch.log(1 - self.prior)
        a = Variable(torch.Tensor([0]))
        self.prior = torch.cat((a, self.prior))
        self.prior = self.prior.cuda()

        for name, param in self.named_parameters():
            print(name, param.data.shape)

        print("Using prior lstm")

    def reset_parameters(self):
        self.lstm.reset_parameters()
        self.output.reset_parameters()

    def forward(self, input, hx=None):
        input = input.permute(0, 2, 1).contiguous()
        bnout = self.bn(input)
        bnout[(bnout != bnout).detach()] = 0
        input = bnout.permute(0, 2, 1).contiguous()
        output, statetuple = self.lstm(input, hx)
        output = self.output(output)
        # (batch_size, seq_len, target_dim)
        # pdb.set_trace()
        # output=output.sum(1)
        output = output.max(1)[0]
        output = output + self.prior

        return output
Example #18
class HandwritingGenerator(Module):
    def __init__(self, alphabet_size, hidden_size, num_window_components,
                 num_mixture_components):
        super(HandwritingGenerator, self).__init__()
        self.alphabet_size = alphabet_size
        self.hidden_size = hidden_size
        self.num_window_components = num_window_components
        self.num_mixture_components = num_mixture_components
        # print(num_window_components)
        # print(num_mixture_components)
        # print(hidden_size)
        self.input_size = input_size = 3
        n_heads_1 = 2
        n_heads_2 = 10
        query_dimensions = 1
        self.n_pre_layers = 2
        self.n_layers = 4
        # n_heads_2 = 4
        # First LSTM layer, takes as input a tuple (x, y, eol)
        # self.lstm1_layer = LSTM(input_size=3, hidden_size=hidden_size, batch_first=True)
        # [
        #     TransformerEncoderLayer(
        #         AttentionLayer(FullAttention(), 768, 12),
        #         768,
        #         12,
        #         activation="gelu"
        #     ) for l in range(12)
        # ],
        # norm_layer=torch.nn.LayerNorm(768)

        self.lstm1_layer = LSTM(input_size=input_size,
                                hidden_size=hidden_size,
                                batch_first=True)

        # self.transformers1_layers = [
        #     RecurrentTransformerEncoderLayer(
        #         RecurrentAttentionLayer(RecurrentLinearAttention(query_dimensions), input_size, n_heads_1),
        #         input_size,
        #         hidden_size,
        #         activation="gelu"
        #     ) for l in range(self.n_pre_layers)
        # ]
        # self.norm1_layer = torch.nn.Linear(input_size, hidden_size)

        # Gaussian Window layer
        self.window_layer = GaussianWindow(
            input_size=hidden_size, num_components=num_window_components)
        # Second LSTM layer, takes as input the concatenation of the input,
        # the output of the first LSTM layer
        # and the output of the Window layer
        # self.lstm2_layer = LSTM(
        #     input_size=3 + hidden_size + alphabet_size + 1,
        #     hidden_size=hidden_size,
        #     batch_first=True,
        # )
        # kept in a ModuleList so the layers' parameters are registered with the module
        self.transformers2_layers = torch.nn.ModuleList([
            RecurrentTransformerEncoderLayer(
                RecurrentAttentionLayer(
                    RecurrentLinearAttention(query_dimensions),
                    3 + hidden_size + alphabet_size + 1, n_heads_2),
                3 + hidden_size + alphabet_size + 1,
                # RecurrentAttentionLayer(RecurrentLinearAttention(query_dimensions), hidden_size, n_heads_2),
                # hidden_size,
                hidden_size,
                activation="gelu") for l in range(self.n_layers)
        ])

        # Third LSTM layer, takes as input the concatenation of the output of the first LSTM layer,
        # the output of the second LSTM layer
        # and the output of the Window layer
        # self.lstm3_layer = LSTM(
        #     input_size=hidden_size, hidden_size=hidden_size, batch_first=True
        # )
        # print( 3 + hidden_size + alphabet_size + 1)
        # print(hidden_size)
        self.norm2_layer = torch.nn.LayerNorm(3 + hidden_size + alphabet_size +
                                              1)
        # self.norm2_layer = torch.nn.LayerNorm(hidden_size)
        # self.norm2_layer = torch.nn.Linear(hidden_size)

        # Mixture Density Network Layer
        self.output_layer = MDN(
            input_size=3 + hidden_size + alphabet_size + 1,
            num_mixtures=num_mixture_components
            # input_size=hidden_size, num_mixtures=num_mixture_components
        )

        # Hidden State Variables
        self.prev_kappa = None
        # self.hidden1 = None
        self.hidden1 = None
        # self.hidden1 = [None] * self.n_pre_layers
        self.hidden2 = [None] * self.n_layers
        # self.hidden3 = None

        # Initialize parameters
        self.reset_parameters()

    def forward(self, strokes, onehot, bias=None):
        # First LSTM Layer
        # input_ = strokes.reshape(-1,self.input_size)
        input_ = strokes
        # self.lstm1_layer.flatten_parameters()
        # print(input_.shape)
        # print(self.hidden1)

        # for i, l in enumerate(self.transformers1_layers):
        #     input_, self.hidden1[i] = l(input_, self.hidden1[i])
        # # print(output1.shape)
        # output1 = self.norm1_layer(input_)
        # output1 = output1.reshape(-1,1,self.hidden_size)

        self.lstm1_layer.flatten_parameters()
        output1, self.hidden1 = self.lstm1_layer(input_, self.hidden1)

        # print(output1.shape)
        # print(onehot.shape)
        # print(self.prev_kappa)
        # print(output1.shape, self.hidden1.shape)
        # output1, self.hidden1 = self.lstm1_layer(input_, self.hidden1)
        # output1 = []
        # self.hidden1 = []
        # for i in input_:
        #     o = self.lstm1_layer(i, self.hidden1)
        #     print(o)
        #     output1.append(o)
        #     self.hidden1.append(h1)
        # print(output1.shape)
        # Gaussian Window Layer
        window, self.prev_kappa, phi = self.window_layer(
            output1, onehot, self.prev_kappa)
        # print(output1.shape)
        # print(strokes.shape)
        # print(window.shape)
        # print(self.hidden2)
        # Second LSTM Layer
        # torch.squeeze(output1)
        # print(torch.cat((strokes, output1, window), dim=2).shape)
        output2 = torch.cat(
            (strokes, output1, window),
            # dim=2).squeeze()
            dim=2).reshape(
                -1, strokes.shape[-1] + output1.shape[-1] + window.shape[-1])
        for i, l in enumerate(self.transformers2_layers):
            output2, self.hidden2[i] = l(output2, self.hidden2[i])
        # print(output2.shape)
        # print([h.shape for h in self.hidden2])
        # print(self.hidden3.shape)
        # Third LSTM Layer
        output3 = self.norm2_layer(output2)
        # MDN Layer
        eos, pi, mu1, mu2, sigma1, sigma2, rho = self.output_layer(
            output3.reshape(-1, 1, output3.shape[-1]), bias)
        return (eos, pi, mu1, mu2, sigma1, sigma2, rho), (window, phi)

    @staticmethod
    def sample_bivariate_gaussian(pi, mu1, mu2, sigma1, sigma2, rho):
        # Pick distribution from the MDN
        p = pi.data[0, 0, :].numpy()
        idx = np.random.choice(p.shape[0], p=p)
        m1 = mu1.data[0, 0, idx]
        m2 = mu2.data[0, 0, idx]
        s1 = sigma1.data[0, 0, idx]
        s2 = sigma2.data[0, 0, idx]
        r = rho.data[0, 0, idx]
        mean = [m1, m2]
        covariance = [[s1**2, r * s1 * s2], [r * s1 * s2, s2**2]]
        Z = torch.autograd.Variable(
            sigma1.data.new(np.random.multivariate_normal(mean, covariance,
                                                          1))).unsqueeze(0)
        X = Z[:, :, 0:1]
        Y = Z[:, :, 1:2]
        return X, Y

    def reset_state(self):
        self.prev_kappa = None
        self.hidden1 = None
        # self.hidden1 = [None] * self.n_pre_layers
        self.hidden2 = [None] * self.n_layers
        # self.hidden3 = None

    def reset_parameters(self):
        for parameter in self.parameters():
            if len(parameter.size()) == 2:
                torch.nn.init.xavier_uniform_(parameter, gain=1.0)
            else:
                stdv = 1.0 / parameter.size(0)
                torch.nn.init.uniform_(parameter, -stdv, stdv)

    def num_parameters(self):
        num = 0
        for weight in self.parameters():
            num = num + weight.numel()
        return num

    @classmethod
    def load_model(cls, parameters: dict, state_dict: dict):
        model = cls(**parameters)
        model.load_state_dict(state_dict)
        return model

    def __deepcopy__(self, *args, **kwargs):
        model = HandwritingGenerator(
            self.alphabet_size,
            self.hidden_size,
            self.num_window_components,
            self.num_mixture_components,
        )
        return model
Example #19
 def __init__(self, output_size=12, hidden_size=256, *args, **kwargs):
     super(LSTMWrapper, self).__init__()
     self.lstm = LSTM(hidden_size=hidden_size, *args, **kwargs)
     self.output = nn.Linear(hidden_size, output_size)
     self.reset_parameters()
Example #20
    def __init__(self, alphabet_size, hidden_size, num_window_components,
                 num_mixture_components):
        super(HandwritingGenerator, self).__init__()
        self.alphabet_size = alphabet_size
        self.hidden_size = hidden_size
        self.num_window_components = num_window_components
        self.num_mixture_components = num_mixture_components
        # print(num_window_components)
        # print(num_mixture_components)
        # print(hidden_size)
        self.input_size = input_size = 3
        n_heads_1 = 2
        n_heads_2 = 10
        query_dimensions = 1
        self.n_pre_layers = 2
        self.n_layers = 4
        # n_heads_2 = 4
        # First LSTM layer, takes as input a tuple (x, y, eol)
        # self.lstm1_layer = LSTM(input_size=3, hidden_size=hidden_size, batch_first=True)
        # [
        #     TransformerEncoderLayer(
        #         AttentionLayer(FullAttention(), 768, 12),
        #         768,
        #         12,
        #         activation="gelu"
        #     ) for l in range(12)
        # ],
        # norm_layer=torch.nn.LayerNorm(768)

        self.lstm1_layer = LSTM(input_size=input_size,
                                hidden_size=hidden_size,
                                batch_first=True)

        # self.transformers1_layers = [
        #     RecurrentTransformerEncoderLayer(
        #         RecurrentAttentionLayer(RecurrentLinearAttention(query_dimensions), input_size, n_heads_1),
        #         input_size,
        #         hidden_size,
        #         activation="gelu"
        #     ) for l in range(self.n_pre_layers)
        # ]
        # self.norm1_layer = torch.nn.Linear(input_size, hidden_size)

        # Gaussian Window layer
        self.window_layer = GaussianWindow(
            input_size=hidden_size, num_components=num_window_components)
        # Second LSTM layer, takes as input the concatenation of the input,
        # the output of the first LSTM layer
        # and the output of the Window layer
        # self.lstm2_layer = LSTM(
        #     input_size=3 + hidden_size + alphabet_size + 1,
        #     hidden_size=hidden_size,
        #     batch_first=True,
        # )
        # kept in a ModuleList so the layers' parameters are registered with the module
        self.transformers2_layers = torch.nn.ModuleList([
            RecurrentTransformerEncoderLayer(
                RecurrentAttentionLayer(
                    RecurrentLinearAttention(query_dimensions),
                    3 + hidden_size + alphabet_size + 1, n_heads_2),
                3 + hidden_size + alphabet_size + 1,
                # RecurrentAttentionLayer(RecurrentLinearAttention(query_dimensions), hidden_size, n_heads_2),
                # hidden_size,
                hidden_size,
                activation="gelu") for l in range(self.n_layers)
        ])

        # Third LSTM layer, takes as input the concatenation of the output of the first LSTM layer,
        # the output of the second LSTM layer
        # and the output of the Window layer
        # self.lstm3_layer = LSTM(
        #     input_size=hidden_size, hidden_size=hidden_size, batch_first=True
        # )
        # print( 3 + hidden_size + alphabet_size + 1)
        # print(hidden_size)
        self.norm2_layer = torch.nn.LayerNorm(3 + hidden_size + alphabet_size +
                                              1)
        # self.norm2_layer = torch.nn.LayerNorm(hidden_size)
        # self.norm2_layer = torch.nn.Linear(hidden_size)

        # Mixture Density Network Layer
        self.output_layer = MDN(
            input_size=3 + hidden_size + alphabet_size + 1,
            num_mixtures=num_mixture_components
            # input_size=hidden_size, num_mixtures=num_mixture_components
        )

        # Hidden State Variables
        self.prev_kappa = None
        # self.hidden1 = None
        self.hidden1 = None
        # self.hidden1 = [None] * self.n_pre_layers
        self.hidden2 = [None] * self.n_layers
        # self.hidden3 = None

        # Initialize parameters
        self.reset_parameters()
Example #21
class HandwritingGenerator(Module):
    def __init__(self, alphabet_size, hidden_size, num_window_components,
                 num_mixture_components):
        super(HandwritingGenerator, self).__init__()
        self.alphabet_size = alphabet_size
        self.hidden_size = hidden_size
        self.num_window_components = num_window_components
        self.num_mixture_components = num_mixture_components
        # First LSTM layer, takes as input a tuple (x, y, eol)
        self.lstm1_layer = LSTM(input_size=3,
                                hidden_size=hidden_size,
                                batch_first=True)
        # Gaussian Window layer
        self.window_layer = GaussianWindow(
            input_size=hidden_size, num_components=num_window_components)
        # Second LSTM layer, takes as input the concatenation of the input,
        # the output of the first LSTM layer
        # and the output of the Window layer
        self.lstm2_layer = LSTM(
            input_size=3 + hidden_size + alphabet_size + 1,
            hidden_size=hidden_size,
            batch_first=True,
        )

        # Third LSTM layer, takes as input the concatenation of the output of the first LSTM layer,
        # the output of the second LSTM layer
        # and the output of the Window layer
        self.lstm3_layer = LSTM(input_size=hidden_size,
                                hidden_size=hidden_size,
                                batch_first=True)

        # Mixture Density Network Layer
        self.output_layer = MDN(input_size=hidden_size,
                                num_mixtures=num_mixture_components)

        # Hidden State Variables
        self.prev_kappa = None
        self.hidden1 = None
        self.hidden2 = None
        self.hidden3 = None

        # Initialize parameters
        self.reset_parameters()

    def forward(self, strokes, onehot, bias=None):
        # First LSTM Layer
        input_ = strokes
        self.lstm1_layer.flatten_parameters()
        output1, self.hidden1 = self.lstm1_layer(input_, self.hidden1)
        # Gaussian Window Layer
        window, self.prev_kappa, phi = self.window_layer(
            output1, onehot, self.prev_kappa)
        # Second LSTM Layer
        output2, self.hidden2 = self.lstm2_layer(
            torch.cat((strokes, output1, window), dim=2), self.hidden2)
        # Third LSTM Layer
        output3, self.hidden3 = self.lstm3_layer(output2, self.hidden3)
        # MDN Layer
        eos, pi, mu1, mu2, sigma1, sigma2, rho = self.output_layer(
            output3, bias)
        return (eos, pi, mu1, mu2, sigma1, sigma2, rho), (window, phi)

    @staticmethod
    def sample_bivariate_gaussian(pi, mu1, mu2, sigma1, sigma2, rho):
        # Pick distribution from the MDN
        p = pi.data[0, 0, :].numpy()
        idx = np.random.choice(p.shape[0], p=p)
        m1 = mu1.data[0, 0, idx]
        m2 = mu2.data[0, 0, idx]
        s1 = sigma1.data[0, 0, idx]
        s2 = sigma2.data[0, 0, idx]
        r = rho.data[0, 0, idx]
        mean = [m1, m2]
        covariance = [[s1**2, r * s1 * s2], [r * s1 * s2, s2**2]]
        Z = torch.autograd.Variable(
            sigma1.data.new(np.random.multivariate_normal(mean, covariance,
                                                          1))).unsqueeze(0)
        X = Z[:, :, 0:1]
        Y = Z[:, :, 1:2]
        return X, Y

    def reset_state(self):
        self.prev_kappa = None
        self.hidden1 = None
        self.hidden2 = None
        self.hidden3 = None

    def reset_parameters(self):
        for parameter in self.parameters():
            if len(parameter.size()) == 2:
                torch.nn.init.xavier_uniform_(parameter, gain=1.0)
            else:
                stdv = 1.0 / parameter.size(0)
                torch.nn.init.uniform_(parameter, -stdv, stdv)

    def num_parameters(self):
        num = 0
        for weight in self.parameters():
            num = num + weight.numel()
        return num

    @classmethod
    def load_model(cls, parameters: dict, state_dict: dict):
        model = cls(**parameters)
        model.load_state_dict(state_dict)
        return model

    def __deepcopy__(self, *args, **kwargs):
        model = HandwritingGenerator(
            self.alphabet_size,
            self.hidden_size,
            self.num_window_components,
            self.num_mixture_components,
        )
        return model
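`sample_bivariate_gaussian` picks one mixture component according to `pi` and then draws a point from the corresponding 2-D Gaussian. A self-contained sketch of the same sampling step with made-up MDN outputs of shape (1, 1, K), as produced for a single time step:

import numpy as np
import torch

K = 3                                           # hypothetical number of mixture components
pi = torch.softmax(torch.randn(1, 1, K), dim=-1)
mu1, mu2 = torch.randn(1, 1, K), torch.randn(1, 1, K)
sigma1, sigma2 = torch.rand(1, 1, K) + 0.1, torch.rand(1, 1, K) + 0.1
rho = torch.tanh(torch.randn(1, 1, K))

# pick a component, then sample from its bivariate Gaussian
p = pi[0, 0, :].double().numpy()
p = p / p.sum()                                 # guard against float rounding in np.random.choice
idx = np.random.choice(K, p=p)
m = [mu1[0, 0, idx].item(), mu2[0, 0, idx].item()]
s1, s2, r = sigma1[0, 0, idx].item(), sigma2[0, 0, idx].item(), rho[0, 0, idx].item()
cov = [[s1 ** 2, r * s1 * s2], [r * s1 * s2, s2 ** 2]]
x, y = np.random.multivariate_normal(m, cov, 1)[0]
print(x, y)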