Example #1
 def sample(self, model, tokens, vocab, reverse_token_map):
     seqlen = self.seqlen
     vocab_size = len(vocab)
     token_ix = -1
     # Seed the context window entirely with START tokens.
     inpt = ["START" for i in range(self.seqlen)]
     output = ""
     mintokens = 15
     maxtokens = 100
     i = 0
     # Generate until the model re-emits START (used as an end-of-message
     # marker), with a floor of mintokens and a cap of maxtokens.
     while i < maxtokens and (i < mintokens
                              or token_ix != reverse_token_map['START']):
         if self.embedding:
             # Embedding models take raw token indices...
             x = np.zeros((1, seqlen))
             x[0] = [
                 get_ix_from_token(reverse_token_map, token)
                 for token in inpt
             ]
         else:
             # ...otherwise the model expects one-hot vectors.
             x = np.zeros((1, seqlen, vocab_size))
             x[0] = [
                 token_to_oh(get_ix_from_token(reverse_token_map, token),
                             vocab_size) for token in inpt
             ]
         # Read the distribution at the current position of the window.
         preds = model.predict(x, verbose=0)[0][min(i, self.seqlen - 1)]
         token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
         # Resample if <UNK> was drawn; note the retry is unbounded here,
         # unlike the capped retries in the later examples.
         while token_ix == reverse_token_map["<UNK>"]:
             token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
         new_token = vocab[token_ix]
         output += new_token
         if i + 1 < len(inpt):
             # Still filling the initial window in place.
             inpt[i + 1] = new_token
         else:
             # Window is full: slide it left by one token.
             inpt = inpt[1:] + [new_token]
         i += 1
     return output
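
These snippets all lean on a few helpers that are not shown on this page. A minimal sketch consistent with how they are called (an assumption, not the project's actual code):

 import numpy as np

 def get_ix_from_token(reverse_token_map, token):
     # Look up a token's vocabulary index; the <UNK> fallback for
     # out-of-vocabulary tokens is assumed from how sampling retries on it.
     return reverse_token_map.get(token, reverse_token_map.get("<UNK>", 0))

 def token_to_oh(ix, vocab_size):
     # One-hot encode a single vocabulary index.
     oh = np.zeros(vocab_size)
     oh[ix] = 1
     return oh

 def char_padded(seq, pad_token, length):
     # Right-pad (and truncate) a token sequence to a fixed length.
     return list(seq[:length]) + [pad_token] * max(0, length - len(seq))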
Example #2
    def get_input_sequences(self,
                            tokens,
                            reverse_token_map,
                            full=True,
                            sliding_window=True):
        # tokens is a list of (context, target) pairs; the full and
        # sliding_window flags are unused in this seq2seq variant.
        seqs = []
        for context, target in tokens:
            padded_sequence = char_padded(target[:self.seqlen], " ",
                                          self.seqlen)
            decoder_output = [
                get_ix_from_token(reverse_token_map, token)
                for token in padded_sequence
            ]
            # Teacher forcing: the decoder input is the target shifted right,
            # with START prepended.
            decoder_input = ([get_ix_from_token(reverse_token_map, "START")]
                             + decoder_output[:-1])

            encoder_input = char_padded(context[:self.context_len], " ",
                                        self.context_len)
            encoder_input = [
                get_ix_from_token(reverse_token_map, token)
                for token in encoder_input
            ]
            seqs.append((encoder_input, decoder_input, decoder_output))
        return seqs
Example #3
 def get_input_sequences(self, tokens, reverse_token_map):
     seqs = []
     # Y is the same window as X advanced by one token, so each input
     # position is trained to predict the following token.
     for i in range(0, len(tokens) - self.seqlen, self.step):
         X = self.slice_padded_sequence(tokens, i)
         Y = self.slice_padded_sequence(tokens, i + 1)
         Xseq = [get_ix_from_token(reverse_token_map, token) for token in X]
         Yseq = [get_ix_from_token(reverse_token_map, token) for token in Y]
         seqs.append((Xseq, Yseq))
     return seqs
Example #4
 def sliding_window_input_sequences(self, tokens, reverse_token_map):
     seqs = []
     for i in range(0, len(tokens) - self.seqlen, self.step):
         # The first input token sits one position before the window
         # (START at the very start of the stream).
         x0 = "START" if i == 0 else tokens[i - 1]
         last_ix = min(i + self.seqlen, len(tokens) - 1)
         padded_sequence = char_padded(tokens[i:last_ix], " ", self.seqlen)
         Yseq = [
             get_ix_from_token(reverse_token_map, token)
             for token in padded_sequence
         ]
         Xseq = [get_ix_from_token(reverse_token_map, x0)] + Yseq[:-1]
         seqs.append((Xseq, Yseq))
     return seqs
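
The Xseq/Yseq pair built above is one window shifted by a single token (teacher forcing): the input at position t is trained to predict the token at position t + 1 in the stream. A toy illustration with made-up tokens:

 tokens = ["the", "cat", "sat", "on", "the", "mat"]
 i, seqlen = 1, 4
 Yseq = tokens[i:i + seqlen]         # ["cat", "sat", "on", "the"]
 Xseq = [tokens[i - 1]] + Yseq[:-1]  # ["the", "cat", "sat", "on"]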
Example #5
 def sample(self, model, tokens, vocab, reverse_token_map):
     seqlen = self.seqlen
     vocab_size = len(vocab)
     token_ix = -1
     # Seed the window with a random slice of real text.
     i = random.randint(0, len(tokens) - seqlen - 1)
     inpt = tokens[i:i + seqlen]
     # Echo the seed, then "->" to separate prompt from generation in logs.
     output = "".join(inpt) + "->"
     mintokens = 15
     maxtokens = 100
     i = 0
     # Generate until a newline (end of message) is sampled.
     while i < maxtokens and (i < mintokens
                              or token_ix != reverse_token_map['\n']):
         x = np.zeros((1, seqlen, vocab_size))
         x[0] = [
             token_to_oh(get_ix_from_token(reverse_token_map, token),
                         vocab_size) for token in inpt
         ]
         preds = model.predict(x, verbose=0)[0]
         preds = preds[min(i, self.seqlen - 1)]
         token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
         new_token = vocab[token_ix]
         output += new_token
         inpt = inpt[1:] + [new_token]
         i += 1
     logger.info("\n" + output)
     return output
Example #6
File: kattn_lm.py  Project: dlyle32/slackml
 def masked_sample(self, model, tokens, vocab, reverse_token_map):
     seqlen = self.seqlen
     vocab_size = len(vocab)
     token_ix = -1
     inpt = ["<MASK>" for i in range(self.seqlen)]
     inpt[0] = "<START>"
     output = ""
     mintokens = 15
     maxtokens = 100
     i = 1
     while i < maxtokens and (i < mintokens
                              or token_ix != reverse_token_map['<START>']):
         # maskix is the window position currently being filled in.
         maskix = min(i, self.seqlen - 1)
         x = [get_ix_from_token(reverse_token_map, token) for token in inpt]
         x = np.asarray(x).reshape((1, seqlen))
         preds = model.predict(x, verbose=0)[0][maskix]
         token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
         while token_ix == reverse_token_map["<UNK>"]:
             token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
         new_token = vocab[token_ix]
         output += new_token
         inpt[maskix] = new_token
         if maskix == self.seqlen - 1:
             # Window is full: slide left and open a fresh <MASK> slot.
             inpt = inpt[1:] + ["<MASK>"]
         i += 1
     return output
Example #7
 def sample(self, model, tokens, vocab, reverse_token_map, temp=1):
     seqlen = self.seqlen
     vocab_size = len(vocab)
     token_ix = -1
     inpt = [" " for i in range(self.seqlen)]
     inpt[0] = "<START>"
     output = ""
     mintokens = 15
     maxtokens = 100
     i = 0
     while i < maxtokens and (i < mintokens
                              or token_ix != reverse_token_map['<START>']):
         x = [get_ix_from_token(reverse_token_map, token) for token in inpt]
         x = np.asarray(x).reshape((1, seqlen))
         preds = model.predict(x, verbose=0)[0]
         preds = preds[min(i, self.seqlen - 1)]
         token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
         # Cap <UNK> resampling at ten tries to avoid an endless loop.
         retries = 0
         while retries < 10 and token_ix == reverse_token_map["<UNK>"]:
             token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
             retries += 1
         new_token = vocab[token_ix]
         # Word-level model: tokens are joined with spaces.
         output += new_token + " "
         if (i + 1 < len(inpt)):
             inpt[i + 1] = new_token
         else:
             inpt = inpt[1:] + [new_token]
         i += 1
     logger.info(output)
     return output
Example #8
 def tokens_to_sequences(self, tokens):
     if len(tokens) < self.seqlen:
         tokens = char_padded(tokens, "<PAD>", self.seqlen)
     Xseqs = []
     Yseqs = []
     for i in range(0, len(tokens) - self.seqlen + 1, self.step):
         x0 = "<START>" if i == 0 else tokens[i - 1]
         Yseq = [get_ix_from_token(self.reverse_token_map, token)
                 for token in tokens[i:i + self.seqlen]]
         Xseq = [get_ix_from_token(self.reverse_token_map, x0)] + Yseq[:-1]
         Yseqs.append(np.array(Yseq))
         Xseqs.append(np.array(Xseq))
     return Xseqs, Yseqs
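
To feed these windows straight into Keras, one option (a sketch, assuming TensorFlow 2.x; the function name and batch size are arbitrary) is to wrap the arrays in a tf.data.Dataset:

 import numpy as np
 import tensorflow as tf

 def sequences_to_dataset(Xseqs, Yseqs, batch_size=64):
     # Pair each input window with its shifted target window,
     # then shuffle and batch for model.fit.
     ds = tf.data.Dataset.from_tensor_slices(
         (np.array(Xseqs), np.array(Yseqs)))
     return ds.shuffle(len(Xseqs)).batch(batch_size)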
Example #9
 def get_input_sequences(self,
                         tokens,
                         reverse_token_map,
                         full=True,
                         sliding_window=True):
     # Dispatch to the requested windowing strategy; full takes precedence
     # over sliding_window.
     if full:
         return self.get_full_input_sequences(tokens, reverse_token_map)
     if sliding_window:
         return self.sliding_window_input_sequences(tokens,
                                                    reverse_token_map)
     # Fallback: one padded sequence per message line.
     seqs = []
     for line in tokens:
         padded_sequence = char_padded(line[:self.seqlen], " ", self.seqlen)
         Yseq = [
             get_ix_from_token(reverse_token_map, token)
             for token in padded_sequence
         ]
         Xseq = [get_ix_from_token(reverse_token_map, " ")] + Yseq[:-1]
         seqs.append((Xseq, Yseq))
     return seqs
Example #10
 def get_full_input_sequences(self, tokens, reverse_token_map):
     # Pack several messages into each fixed-length training sequence,
     # pairing long messages with short ones to minimize padding.
     tokens = sorted(tokens, key=len)
     left = 0
     right = len(tokens) - 1
     seqs = []
     Xseq = []
     Yseq = []
     # Note: the condition below skips tokens[left] once left == right.
     while left < right:
         if len(Yseq) + len(tokens[right][:self.seqlen]) <= self.seqlen:
             newSeq = tokens[right][:self.seqlen]
             Yseq += newSeq
             Xseq += ["START"] + newSeq[:-1]
             right -= 1
         if len(Yseq) + len(tokens[left][:self.seqlen]) <= self.seqlen:
             newSeq = tokens[left][:self.seqlen]
             Yseq += newSeq
             Xseq += ["START"] + newSeq[:-1]
             left += 1
         else:
             # Neither end fits: emit the packed window and start fresh.
             paddedX = [
                 get_ix_from_token(reverse_token_map, token)
                 for token in char_padded(Xseq, " ", self.seqlen)
             ]
             paddedY = [
                 get_ix_from_token(reverse_token_map, token)
                 for token in char_padded(Yseq, " ", self.seqlen)
             ]
             seqs.append((paddedX, paddedY))
             Yseq = []
             Xseq = []
     # Emit the final, possibly partial, window.
     paddedX = [
         get_ix_from_token(reverse_token_map, token)
         for token in char_padded(Xseq, " ", self.seqlen)
     ]
     paddedY = [
         get_ix_from_token(reverse_token_map, token)
         for token in char_padded(Yseq, " ", self.seqlen)
     ]
     seqs.append((paddedX, paddedY))
     return seqs
Example #11
File: kattn_lm.py  Project: dlyle32/slackml
 def sample(self, model, tokens, vocab, reverse_token_map, temp=1):
     seqlen = self.seqlen
     vocab_size = len(vocab)
     token_ix = -1
     # An alternative seeds the window from the corpus instead:
     # start = np.random.randint(0, len(tokens) - self.seqlen)
     # inpt = tokens[start:start+self.seqlen]
     inpt = [" " for i in range(self.seqlen)]
     inpt[0] = "<START>"
     output = ""
     mintokens = 15
     maxtokens = 100
     i = 0
     while i < maxtokens and (i < mintokens
                              or token_ix != reverse_token_map['<START>']):
         x = [get_ix_from_token(reverse_token_map, token) for token in inpt]
         x = np.asarray(x).reshape((1, seqlen))
         preds = model.predict(x, verbose=0)[0]
         preds = preds[min(i, self.seqlen - 1)]
         # Top-k / temperature sampling was tried and shelved; temp is
         # currently unused:
         # topk = tf.math.top_k(preds, k=50)
         # topk_preds = keras.layers.Softmax()(topk.values/temp)
         # token_ix = np.random.choice(topk.indices, p=topk_preds)
         token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
         retries = 0
         while retries < 10 and token_ix == reverse_token_map["<UNK>"]:
             token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
             retries += 1
         new_token = vocab[token_ix]
         output += new_token + " "
         if (i + 1 < len(inpt)):
             inpt[i + 1] = new_token
         else:
             inpt = inpt[1:] + [new_token]
         i += 1
     logger.info(output)
     return output
Example #12
    def sample(self, model, tokens, vocab, reverse_token_map):
        seqlen = self.seqlen
        vocab_size = len(vocab)
        tf.keras.backend.set_floatx('float64')

        # Build the encoder/decoder inference models on first use.
        if (not hasattr(self, "encoder_model")
                or not hasattr(self, "decoder_model")):
            self.build_sample_model(model)

        # randint's upper bound is inclusive; cap it at len(tokens) - 1 to
        # avoid an IndexError.
        i = random.randint(0, len(tokens) - 1)
        context = tokens[i][0]
        actual_message = tokens[i][1]
        # Encode the context: token indices -> one-hot -> encoder states.
        encoder_input = [
            get_ix_from_token(reverse_token_map, token)
            for token in context[:self.context_len]
        ]
        encoder_input = [token_to_oh(ix, len(vocab)) for ix in encoder_input]
        encoder_input = np.array([encoder_input])
        encoder_state = self.encoder_model.predict(encoder_input)

        inpt = ["START" for i in range(self.seqlen)]
        output = ""
        token_ix = -1
        mintokens = 15
        maxtokens = 100
        i = 0
        while i < maxtokens and (i < mintokens
                                 or token_ix != reverse_token_map['\n']):
            if self.embedding:
                x = np.zeros((1, seqlen))
                x[0] = [
                    get_ix_from_token(reverse_token_map, token)
                    for token in inpt
                ]
            else:
                x = np.zeros((1, seqlen, vocab_size))
                x[0] = [
                    token_to_oh(get_ix_from_token(reverse_token_map, token),
                                vocab_size) for token in inpt
                ]
            # decoder_model returns several outputs; the first [0] selects
            # the token probabilities, the second drops the batch dimension.
            preds = self.decoder_model.predict([x] + encoder_state,
                                               verbose=0)[0]
            preds = preds[0][min(i, self.seqlen - 1)]
            probs = preds.ravel()
            token_ix = np.random.choice(range(vocab_size), p=probs)
            retries = 0
            # Resample on <UNK> (capped at ten tries) or when a space would
            # immediately follow another; output[-1:] avoids an IndexError
            # while output is still empty.
            while (retries < 10 and token_ix == reverse_token_map["<UNK>"]
                   ) or (token_ix == reverse_token_map[" "]
                         and output[-1:] == " "):
                token_ix = np.random.choice(range(vocab_size), p=probs)
                retries += 1
            new_token = vocab[token_ix]
            output += new_token
            if (i + 1 < len(inpt)):
                inpt[i + 1] = new_token
            else:
                inpt = inpt[1:] + [new_token]
            i += 1

        # Log context, generated reply, and the real message side by side.
        print(context)
        print(output)
        print(actual_message)
        print(len(output))
        return output
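
build_sample_model is not shown in these examples, but the calls above (encoder_model.predict returning a state list, decoder_model.predict([x] + encoder_state) returning probabilities first) match the standard Keras pattern of splitting a trained seq2seq model into separate inference models. A rough sketch under that assumption; the layer names and self.latent_dim are hypothetical, not the project's actual code:

    def build_sample_model(self, model):
        from tensorflow import keras
        # Encoder: map the context sequence to the LSTM's final states.
        encoder_inputs = model.input[0]
        _, state_h, state_c = model.get_layer("encoder_lstm").output
        self.encoder_model = keras.Model(encoder_inputs, [state_h, state_c])

        # Decoder: rerun the trained decoder layers with explicit state
        # inputs so generation can be driven step by step.
        decoder_inputs = model.input[1]
        state_h_in = keras.Input(shape=(self.latent_dim,))
        state_c_in = keras.Input(shape=(self.latent_dim,))
        decoder_lstm = model.get_layer("decoder_lstm")
        decoder_out, dec_h, dec_c = decoder_lstm(
            decoder_inputs, initial_state=[state_h_in, state_c_in])
        decoder_out = model.get_layer("decoder_dense")(decoder_out)
        # Returning the states alongside the probabilities matches the
        # [0] indexing in the sampling loop above.
        self.decoder_model = keras.Model(
            [decoder_inputs, state_h_in, state_c_in],
            [decoder_out, dec_h, dec_c])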