Example #1
    def unk_tensor(self, tensor):
        # map any id outside the vocabulary (id >= vocab size) to <UNK>
        unk = self.vocab.w2i['<UNK>']
        mask = (tensor >= self.vocab.count).long()    # 1 where the id is OOV
        ones = torch.ones(mask.size()).long()
        ones = to_cuda(ones, self.iscuda)
        tensor = tensor * (ones - mask) + mask * unk  # keep in-vocab ids, overwrite OOV
        return tensor
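For reference, the same OOV-to-`<UNK>` clipping is a one-liner with `torch.where` on current PyTorch; a minimal sketch (function name hypothetical):

import torch

def unk_clip(tensor, vocab_count, unk_id):
    """Replace any id >= vocab_count with unk_id (same effect as unk_tensor above)."""
    unk = torch.full_like(tensor, unk_id)
    return torch.where(tensor >= vocab_count, unk, tensor)

# e.g. unk_clip(torch.tensor([3, 50002, 7]), 50000, 1) -> tensor([3, 1, 7])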
Example #2
    def forward(self, sources, src_simil, q_simil, context_len):

        bc, in_seq = sources.size()
        b = len(context_len)

        idx = 0
        similarity_list = []
        sources_list = []
        for i, c in enumerate(context_len):
            similarities = F.softmax(
                torch.mm(src_simil[idx:idx + c].squeeze(),
                         q_simil[i].view(-1, 1)).squeeze())
            similarity_list.append(
                similarities)  # distribution of each line, for later softmax
            max_idx = similarities.max(0)[1]
            sources_list.append(sources[idx +
                                        max_idx.data[0]])  # selected source
            idx += c

        sources = torch.stack(sources_list, 0)  # [b x seq]

        similarity_tensor = to_cuda(Variable(torch.zeros(b, 10)), self.iscuda)  # 10 = assumed max context lines
        for i, sim in enumerate(similarity_list):
            length = len(sim)
            similarity_tensor[i, :length] = similarity_tensor[i, :length] + sim

        return sources, similarity_tensor
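A toy standalone run of the same selection logic (pick, per example, the context line whose encoding has the highest softmaxed dot product with the query); shapes are made up:

import torch
import torch.nn.functional as F

# made-up shapes: 2 examples, 3 context lines each, hidden size 4
src_simil = torch.randn(6, 4)   # [batch*context x hidden]
q_simil = torch.randn(2, 4)     # [batch x hidden]
context_len = [3, 3]

idx, picks = 0, []
for i, c in enumerate(context_len):
    scores = F.softmax(src_simil[idx:idx + c] @ q_simil[i], dim=0)  # [c]
    picks.append(idx + scores.argmax().item())  # row index of the selected line
    idx += c
print(picks)  # e.g. [2, 4]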
Example #3
import os
import random

import numpy as np
import torch

# Vocab, Batch, Model and to_cuda are project-local helpers (not shown here)


def main(args):
    # obtain vocabulary
    vocab = Vocab(args.vocab_size)
    vocab.w2i = np.load(args.word2idx).item()
    vocab.i2w = np.load(args.idx2word).item()
    vocab.count = len(vocab.w2i)

    # obtain dataset in batches
    file_list = os.listdir(args.data_dir)
    batch = Batch(file_list, args.max_enc, args.max_dec)

    # load model
    if args.load_model != '':
        model = torch.load(args.load_model)
    else:
        model = Model(args)
    model = to_cuda(model)

    # computation for each epoch
    epoch = 1
    while epoch <= args.epochs:
        random.shuffle(file_list)
        for file in file_list:
            with open(os.path.join(args.data_dir, file)) as f:
                minibatch = f.read()
            stories, summaries = batch.process_minibatch(minibatch, vocab)
            print(stories)    # debug output
            print(summaries)
        epoch += 1
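Note: on NumPy >= 1.16.3, loading a pickled dict with `np.load(...).item()` as above requires `allow_pickle=True`; a minimal sketch (paths are placeholders for args.word2idx / args.idx2word):

import numpy as np

w2i = np.load('word2idx.npy', allow_pickle=True).item()  # dict: word -> id
i2w = np.load('idx2word.npy', allow_pickle=True).item()  # dict: id -> word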
Example #4
    def forward(self, sources, queries, context_len):

        bc, in_seq = sources.size()
        b = len(context_len)

        idx = 0
        similarity_list = []
        sources_list = []
        for i, c in enumerate(context_len):
            query = queries[i].tolist()
            # score every context line against the query with self.lev
            similarities = [self.lev(line.tolist(), query) for line in sources[idx:idx + c]]
            similarities = [x + 1e-4 for x in similarities]  # keep scores strictly positive
            similarities = torch.Tensor(similarities)

            similarity_list.append(similarities)  # distribution of each line, for later softmax
            max_idx = similarities.max(0)[1][0]   # argmax over the context lines
            sources_list.append(sources[idx + max_idx])  # selected source
            idx += c

        sources = torch.stack(sources_list, 0)  # [b x seq]

        similarity_tensor = to_cuda(Variable(torch.zeros(b, 10)), self.iscuda)
        for i, sim in enumerate(similarity_list):
            length = len(sim)
            # wrap the plain Tensor in a Variable before adding it
            sim = to_cuda(Variable(sim), self.iscuda)
            similarity_tensor[i, :length] = similarity_tensor[i, :length] + sim

        return sources, similarity_tensor
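`self.lev` is not shown; presumably it scores two token-id lists by edit distance, and since the snippet takes an argmax it likely returns a similarity (e.g. a negated or normalized distance) rather than the raw distance. A minimal edit-distance sketch under that assumption:

def levenshtein(a, b):
    """Edit distance between two token-id lists (assumed behavior behind self.lev)."""
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        curr = [i]
        for j, y in enumerate(b, 1):
            curr.append(min(prev[j] + 1,               # deletion
                            curr[j - 1] + 1,           # insertion
                            prev[j - 1] + (x != y)))   # substitution
        prev = curr
    return prev[-1]

# levenshtein([1, 2, 3], [1, 3]) -> 1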
Example #5
    def forward(self, sources, queries, lengths, targets):
        """
        sources: [batch*context_lines x seq] OR [batch x seq]
        queries: [batch x seq]
        targets: [batch x seq]
        """
        source_len, query_len, target_len, context_len = lengths

        # use similarity function to get closest lines from source
        if not self.single:
            sources, similarities = self.similarity(sources, queries,
                                                    context_len)
        # merge sources and queries to one matrix
        source_lens = (sources > 0).long().sum(1)
        query_lens = (queries > 0).long().sum(1)
        max_len = (source_lens + query_lens).max()
        new_sources = torch.zeros(sources.size(0), max_len).long()
        new_sources = to_cuda(new_sources, self.iscuda)
        for i in range(sources.size(0)):
            new_sources[i, :source_lens[i]] += sources[i, :source_lens[i]]
            new_sources[i, source_lens[i]:source_lens[i] +
                        query_lens[i]] += queries[i, :query_lens[i]]

        self.inputs = new_sources  # inputs for the copynet model
        # get target outputs using the copynet model
        outputs = self.copynet(new_sources, targets)

        if self.single:
            return outputs  # 'similarities' is only defined in the multi-line case
        return outputs, similarities
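The merge loop above packs each selected source row and its query row back-to-back, dropping the padding in between; a standalone sketch of that step (assuming pad id 0):

import torch

def merge_padded(sources, queries, pad_id=0):
    """Concatenate each source row with its query row, skipping pad (id 0) in between."""
    s_lens = (sources > 0).long().sum(1)
    q_lens = (queries > 0).long().sum(1)
    max_len = int((s_lens + q_lens).max())
    merged = sources.new_zeros(sources.size(0), max_len)
    for i in range(sources.size(0)):
        s, q = int(s_lens[i]), int(q_lens[i])
        merged[i, :s] = sources[i, :s]
        merged[i, s:s + q] = queries[i, :q]
    return merged

src = torch.tensor([[4, 5, 0, 0], [6, 0, 0, 0]])
qry = torch.tensor([[7, 8, 0], [9, 0, 0]])
print(merge_padded(src, qry))  # tensor([[4, 5, 7, 8], [6, 9, 0, 0]])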
Example #6
    def forward(self, sources, queries, context_len):
        """
        sources: LongTensor, [batch*context x src_seq]
        queries: LongTensor, [batch x qry_seq]
        context_len: [batch]
        """
        bc, in_seq = sources.size()
        b, q_seq = queries.size()

        embedded_sources = self.embedding(
            to_cuda(Variable(self.unk_tensor(sources)), self.iscuda))
        embedded_queries = self.embedding(
            to_cuda(Variable(self.unk_tensor(queries)), self.iscuda))

        src_mask = Variable(
            (sources >
             0).float().unsqueeze(2))  # [batch*context x src_seq x 1]
        q_mask = Variable(
            (queries > 0).float().unsqueeze(2))  # [batch x qry_seq x 1]
        q_len = q_mask.squeeze().sum(1).data.long().tolist()  # [batch]
        src_mask = to_cuda(src_mask, self.iscuda)
        q_mask = to_cuda(q_mask, self.iscuda)

        c_idx = 0
        source_list = []
        sim_list = []
        for i in range(b):
            # truncate each source line to the query length, then score it by
            # the summed elementwise product with the query (a dot product)
            tmp1 = embedded_sources[c_idx:c_idx + context_len[i], :q_len[i]]
            tmp2 = embedded_queries[i, :q_len[i]]
            sim = F.softmax((tmp1 * tmp2).sum(2).sum(1))
            sim_list.append(sim)
            top_score = sim.max(0)[1].data[0]  # argmax
            source_list.append(sources[c_idx + top_score])  # add answer
            c_idx += context_len[i]

        # stack per-example distributions (assumes a fixed context size per example)
        similarities = torch.stack(sim_list, 0)
        sources = torch.stack(source_list, 0)

        return sources, similarities
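The per-line score above is just a sum of per-position dot products; a small check that an einsum expresses the same thing (names hypothetical):

import torch

# chunk: [context x seq x embed] embedded source lines (truncated to query length)
# query: [seq x embed] embedded query
chunk = torch.randn(5, 7, 16)
query = torch.randn(7, 16)
scores = torch.einsum('cse,se->c', chunk, query)  # one score per context line
assert torch.allclose(scores, (chunk * query).sum(2).sum(1))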
Example #7
    def forward(self, sources, queries):
        """
        sources: LongTensor, [batch*context x src_seq]
        queries: LongTensor, [batch x qry_seq]
        """
        bc, in_seq = sources.size()
        b, q_seq = queries.size()

        embedded_sources = self.embedding(
            to_cuda(Variable(self.unk_tensor(sources)), self.iscuda))
        embedded_queries = self.embedding(
            to_cuda(Variable(self.unk_tensor(queries)), self.iscuda))

        src_mask = Variable((sources > 0).float().unsqueeze(2))
        q_mask = Variable((queries > 0).float().unsqueeze(2))
        src_mask = to_cuda(src_mask, self.iscuda)
        q_mask = to_cuda(q_mask, self.iscuda)

        sources_out = embedded_sources * to_cuda(
            Variable(self.pos_emb[:in_seq]).unsqueeze(0).expand(
                bc, in_seq, self.embed), self.iscuda)
        queries_out = embedded_queries * to_cuda(
            Variable(self.pos_emb[:q_seq]).unsqueeze(0).expand(
                b, q_seq, self.embed), self.iscuda)

        # get resulting tensors of shape [bc x embed] & [b x embed]
        src_simil = (sources_out * src_mask).sum(1)
        q_simil = (queries_out * q_mask).sum(1)

        return src_simil, q_simil
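A self-contained sketch of the same encoder idea, position-scaled embeddings sum-pooled over non-pad tokens (the class name and initialization are assumptions):

import torch
import torch.nn as nn

class PositionalSumEncoder(nn.Module):
    """Sketch: scale token embeddings by fixed position vectors, sum over non-pad tokens."""
    def __init__(self, vocab_size, embed, max_len=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed, padding_idx=0)
        self.pos_emb = nn.Parameter(torch.randn(max_len, embed) * 0.01)

    def forward(self, tokens):                    # tokens: LongTensor [b x seq]
        mask = (tokens > 0).float().unsqueeze(2)  # [b x seq x 1], mirrors the mask above
        x = self.embedding(tokens) * self.pos_emb[:tokens.size(1)]
        return (x * mask).sum(1)                  # [b x embed]

enc = PositionalSumEncoder(vocab_size=100, embed=16)
vec = enc(torch.randint(0, 100, (2, 7)))  # [2 x 16]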
Example #8
    def forward(self, sources, queries, lengths, targets):
        """
        sources: [batch*context_lines x seq] OR [batch x seq]
        queries: [batch x seq]
        targets: [batch x seq]
        """
        source_len, query_len, target_len, context_len = lengths

        # use similarity function to get closest lines from source
        if not self.single:
            # pick the similarity routine named in args.similarity
            if self.args.similarity == 'levenshtein':
                sources, similarities = self.similarity(
                    sources, queries, context_len)
            elif self.args.similarity == 'position_cosine':
                sources, similarities = self.position_cosine(
                    sources, queries, context_len)
            elif self.args.similarity == 'lstm_cosine':
                sources, similarities = self.lstm_cosine(
                    sources, queries, context_len)
            else:
                src_simil, q_simil = self.encoder(sources, queries)
                sources, similarities = self.similarity(
                    sources, src_simil, q_simil, context_len)
                # here, 'sources' are the selected lines

        # merge sources and queries to one matrix
        source_lens = (sources > 0).long().sum(1)
        query_lens = (queries > 0).long().sum(1)
        max_len = (source_lens + query_lens).max()
        new_sources = torch.zeros(sources.size(0), max_len).long()
        new_sources = to_cuda(new_sources, self.iscuda)
        for i in range(sources.size(0)):
            try:
                new_sources[i, :source_lens[i]] += sources[i, :source_lens[i]]
            except ValueError:
                pass  # silently skip rows whose slice sizes disagree
            new_sources[i, source_lens[i]:(
                source_lens[i] + query_lens[i])] += queries[i, :query_lens[i]]
        # get target outputs using the copynet model
        outputs = self.copynet(new_sources, targets)

        # drop references that are no longer needed
        del sources, targets, source_len, query_len, target_len, context_len, lengths
        del new_sources

        if self.single:
            return outputs
        else:
            return outputs, similarities
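The if/elif chain can also be read as a name-to-method dispatch; a condensed sketch assuming the same attribute names (a fragment meant to sit inside `forward`, not standalone code):

# condensed dispatch, assuming the same bound methods as above
routines = {
    'levenshtein': self.similarity,
    'position_cosine': self.position_cosine,
    'lstm_cosine': self.lstm_cosine,
}
if self.args.similarity in routines:
    sources, similarities = routines[self.args.similarity](sources, queries, context_len)
else:  # default: encode first, then score
    src_simil, q_simil = self.encoder(sources, queries)
    sources, similarities = self.similarity(sources, src_simil, q_simil, context_len)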
Example #9
    def forward(self, sources, queries):
        """
        sources: LongTensor, [batch*context x src_seq]
        queries: LongTensor, [batch x qry_seq]
        """
        bc, in_seq = sources.size()
        b, q_seq = queries.size()

        embedded_sources = self.embedding(
            to_cuda(Variable(self.unk_tensor(sources)), self.iscuda))
        embedded_queries = self.embedding(
            to_cuda(Variable(self.unk_tensor(queries)), self.iscuda))

        encoded_sources, _ = self.lstm(embedded_sources)
        encoded_queries, _ = self.lstm(embedded_queries)

        # here we will use the last hidden state
        source_len = (sources > 0).long().sum(1)
        query_len = (queries > 0).long().sum(1)

        # last state of each source line, truncated to at most the query length
        # (the hard-coded 10 assumes ten context lines per example)
        sources_last = [
            x[min([source_len[i] - 1, query_len[int(i / 10)] - 1])]
            for i, x in enumerate(encoded_sources)
        ]
        queries_last = [
            x[query_len[i] - 1] for i, x in enumerate(encoded_queries)
        ]

        src_simil = torch.stack(sources_last, 0)
        q_simil = torch.stack(queries_last, 0)
        return src_simil, q_simil
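On current PyTorch the per-row "last valid timestep" can be gathered without a Python list comprehension; a minimal sketch:

import torch

def last_states(outputs, lengths):
    """outputs: [b x seq x hidden]; lengths: LongTensor [b]. Returns [b x hidden]."""
    idx = (lengths - 1).clamp(min=0)                       # guard against empty rows
    idx = idx.view(-1, 1, 1).expand(-1, 1, outputs.size(2))
    return outputs.gather(1, idx).squeeze(1)

out = torch.randn(3, 5, 8)
lens = torch.tensor([5, 2, 4])
print(last_states(out, lens).shape)  # torch.Size([3, 8])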
Example #10
    def forward(self, encoded_sources, sources, targets=None):
        """
        embedding: embedding function from above
        encoded_sources: Variable, [batch x seq x hidden]
        sources, targets: LongTensor, [batch x seq]
        """
        vocab_size = self.vocab_size
        hidden_size = self.hidden
        b, seq, _ = encoded_sources.size()

        source_lens = (sources > 0).long().sum(1)

        if targets is not None:
            self.max_out_seq = targets.size(1)
            target_lens = (targets > 0).long().sum(1)

        # 0. set initial states
        last_step = torch.stack(
            [x[source_lens[i] - 1] for i, x in enumerate(encoded_sources)],
            0)  # [batch x hidden*2]
        state = self.Ws(last_step).unsqueeze(0)  # [1 x batch x hidden*2]
        weighted = Variable(torch.Tensor(b, 1, hidden_size *
                                         2).zero_())  # [b x 1 x hidden]
        weighted = to_cuda(weighted, self.iscuda)

        out_list = []
        for i in range(self.max_out_seq):
            # 1. update states
            if self.is_train:  # teacher forcing: embed the gold previous token
                inputs = self.embedding(
                    Variable(self.unk_tensor(targets[:, i])))
            # note: at i == 0 in eval mode, 'inputs' must already be initialized
            # (e.g. with a start-of-sequence embedding) before entering the loop
            gru_input = torch.cat([inputs.unsqueeze(1), weighted],
                                  2)  # [b x 1 x h+h]
            _, state = self.gru(gru_input, state)  # [ 1 x b x hidden]

            # 2. predict next word y_t
            # 2-1) get score_g
            score_g = self.Wo(state.squeeze())  # [b x vocab_size]

            # 2-2) get score_c
            score_c = F.tanh(
                self.Wc(encoded_sources.contiguous().view(-1,
                                                          hidden_size * 2)))
            score_c = score_c.view(b, -1, hidden_size)  # [b x seq x hid]
            score_c = torch.bmm(score_c, state.view(b, -1,
                                                    1)).squeeze()  # [b x seq]
            score_c = F.tanh(score_c)
            encoded_mask = Variable(
                (sources == 0).float() * (-1000))  # causing inplace error
            score_c = score_c + encoded_mask

            # 2-3) get softmax-ed probs
            score = torch.cat([score_g, score_c], 1)  # [b x (vocab+seq)]
            probs = F.softmax(score)
            prob_g = probs[:, :vocab_size]
            prob_c = probs[:, vocab_size:]


            # 2-4) add to prob_g slots for OOVs
            oovs = Variable(torch.Tensor(b, self.max_oovs).zero_()) + 1e-5
            oovs = to_cuda(oovs, self.iscuda)
            prob_g = torch.cat([prob_g, oovs], 1)

            # 2-5) add prob_c to prob_g
            numbers = sources.view(-1).tolist()
            set_numbers = list(set(numbers))  # unique numbers that appear
            c = Counter(numbers)
            dup_list = [k for k in set_numbers if (c[k] > 1)]
            dup_attn_sum = Variable(torch.zeros(b, seq))
            masked_idx_sum = Variable(torch.Tensor(b, seq).zero_())
            encoded_idx_var = Variable(sources)
            if self.iscuda:
                dup_attn_sum = dup_attn_sum.cuda()
                masked_idx_sum = masked_idx_sum.cuda()
                encoded_idx_var = encoded_idx_var.cuda()

            for dup in dup_list:
                mask = (encoded_idx_var == dup).float()
                masked_idx_sum += mask
                attn_mask = torch.mul(mask, prob_c)
                attn_sum = attn_mask.sum(1).unsqueeze(1)
                dup_attn_sum += torch.mul(mask, attn_sum)

            attn = torch.mul(prob_c, (1 - masked_idx_sum)) + dup_attn_sum
            batch_indices = torch.arange(start=0, end=b).long()
            batch_indices = batch_indices.expand(seq, b).transpose(
                1, 0).contiguous().view(-1)
            idx_repeat = torch.arange(start=0, end=seq).repeat(b).long()
            prob_c_to_g = Variable(
                torch.zeros(b, self.vocab_size + self.max_oovs))
            word_indices = sources.view(-1)
            if self.iscuda:
                # move to CPU for the fancy-indexing accumulation below
                attn = attn.cpu()
                word_indices = word_indices.cpu()

            prob_c_to_g[batch_indices, word_indices] += attn[batch_indices,
                                                             idx_repeat]
            if self.iscuda:
                prob_c_to_g = prob_c_to_g.cuda()
                attn = attn.cuda()
            # 2-6) get final output
            out = prob_g + prob_c_to_g + 1e-6

            # 3. get weighted attention to use for predicting next word
            # 3-1) get tensor that shows whether each decoder input has previously appeared in the encoder

            prev_input = (targets[:,
                                  i]).unsqueeze(1).expand(b, sources.size(1))
            idx_from_input = (sources == prev_input).float()
            idx_from_input = Variable(idx_from_input)
            for j in range(b):
                if idx_from_input[j].sum().data[0] > 1:
                    idx_from_input[j] = idx_from_input[j] / idx_from_input[
                        j].sum().data[0]

            # 3-2) multiply with prob_c to get final weighted representation
            weight_attn = prob_c * idx_from_input
            weight_attn = weight_attn.unsqueeze(1)  # [b x 1 x seq]
            weighted = torch.bmm(weight_attn,
                                 encoded_sources)  # weighted: [b x 1 x hidden]

            # 4. get next inputs
            max_vals = self.unk_tensor(out.max(1)[1].data)
            inputs = self.embedding(Variable(max_vals))

            out_list.append(out)  # out_seq @ [batch x vocab+oov]

        # get final outputs
        return torch.stack(out_list, 1)
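The duplicate-token loop in step 2-5 sums copy attention into vocabulary slots; on current PyTorch that whole projection is one `scatter_add_`, which accumulates repeated ids automatically. A toy sketch:

import torch

b, seq, vocab_plus_oov = 2, 5, 10
prob_c = torch.rand(b, seq)                           # copy attention over source positions
sources = torch.randint(0, vocab_plus_oov, (b, seq))  # token id at each position
prob_c_to_g = torch.zeros(b, vocab_plus_oov)
prob_c_to_g.scatter_add_(1, sources, prob_c)          # repeated ids accumulate automatically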
Example #11
    def forward(self, sources, queries, context_len):
        """
        sources: LongTensor, [batch*context x src_seq]
        queries: LongTensor, [batch x qry_seq]
        context_len: LongTensor, [batch]
        """
        bc, in_seq = sources.size()
        b, q_seq = queries.size()

        embedded_sources = self.embedding(
            to_cuda(Variable(self.unk_tensor(sources)), self.iscuda))
        embedded_queries = self.embedding(
            to_cuda(Variable(self.unk_tensor(queries)), self.iscuda))

        src_mask = Variable((sources > 0).float().unsqueeze(2))
        q_mask = Variable((queries > 0).float().unsqueeze(2))
        src_mask = to_cuda(src_mask, self.iscuda)
        q_mask = to_cuda(q_mask, self.iscuda)

        sources_out = embedded_sources * to_cuda(
            Variable(self.pos_emb[:in_seq]).unsqueeze(0).expand(
                bc, in_seq, self.embed), self.iscuda)
        queries_out = embedded_queries * to_cuda(
            Variable(self.pos_emb[:q_seq]).unsqueeze(0).expand(
                b, q_seq, self.embed), self.iscuda)

        # get resulting tensors of shape [bc x embed] & [b x embed]
        src_simil = (sources_out * src_mask).sum(1)
        q_simil = (queries_out * q_mask).sum(1)

        idx = 0
        similarity_list = []
        sources_list = []
        for i, c in enumerate(context_len):
            similarities = F.softmax(
                torch.mm(src_simil[idx:idx + c],
                         q_simil[i].unsqueeze(1)).squeeze())
            similarity_list.append(
                similarities)  # distribution of each line, for later softmax
            max_idx = similarities.max(0)[1]
            sources_list.append(sources[idx +
                                        max_idx.data[0]])  # selected source
            idx += c

        sources = torch.stack(sources_list, 0)  # [b x seq]

        similarity_tensor = to_cuda(Variable(torch.zeros(b, 10)), self.iscuda)
        for i, sim in enumerate(similarity_list):
            length = len(sim)
            similarity_tensor[i, :length] = similarity_tensor[i, :length] + sim

        return sources, similarity_tensor

        # --- the code below is unreachable and appears to be the tail of a
        # separate lstm_cosine routine whose `def` header is missing; it assumes
        # encoded_sources, encoded_queries, query_len, hidden and context_len
        # are in scope ---
        # here we will use the last hidden state
        source_len = (sources > 0).long().sum(1)
        sources_last = [
            x[source_len[i] - 1].unsqueeze(0)
            for i, x in enumerate(encoded_sources)
        ]
        queries_last = [
            x[query_len[i] - 1].unsqueeze(0)
            for i, x in enumerate(encoded_queries)
        ]

        y_list = []
        for i, length in enumerate(context_len):
            y_list.append(queries_last[i].expand(length, hidden))
        x = torch.cat(sources_last, 0)
        y = torch.cat(y_list, 0)  # [batch*context x hidden]
        mul = F.cosine_similarity(x, y)  # [batch*context]

        temp = 0
        idx_list = []
        attn_list = []
        source_list = []
        encoded_list = []
        for i, length in enumerate(context_len):
            attn = F.softmax(mul[temp:temp + length])
            attn_list.append(attn)
            idx = attn.max(0)[1].data[0]
            idx_list.append(idx)
            out = (encoded_sources[temp:temp + length] *
                   attn.unsqueeze(1).unsqueeze(2)).sum(0)
            source_list.append(sources[temp + idx].unsqueeze(0))
            encoded_list.append(out.unsqueeze(0))
            temp += length
        out = torch.cat(encoded_list, 0)
        attns = torch.cat(attn_list, 0)
        sources = torch.stack(source_list, 0)
        return out, sources, attns, idx_list
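A tiny standalone run of the cosine scoring and argmax selection used in the fragment above (shapes made up):

import torch
import torch.nn.functional as F

hidden = 8
x = torch.randn(6, hidden)                     # last states of 6 source lines
y = torch.randn(1, hidden).expand(6, hidden)   # one query state, expanded to match
scores = F.cosine_similarity(x, y)             # [6], one score per line
best = scores.argmax().item()                  # index of the selected line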