Example #1
File: day14.py  Project: groselt/aoc
from collections import defaultdict


def update_occurrences(pair_counts: defaultdict, rules: dict) -> None:
    """Apply one round of insertion rules: each pair AB with rule AB -> C
    contributes its count to the two new pairs AC and CB."""
    new_pair_counts = defaultdict(int)
    for pair, count in pair_counts.items():
        new = rules[pair]
        new_pair_counts[pair[0] + new] += count
        new_pair_counts[new + pair[1]] += count
    # update the caller's dict in place rather than returning a new one
    pair_counts.clear()
    pair_counts.update(new_pair_counts)
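
A minimal usage sketch (the template string and insertion rules below are made up for illustration and are not taken from the original project):

pair_counts = defaultdict(int)
template = 'NNCB'
for a, b in zip(template, template[1:]):
    pair_counts[a + b] += 1                  # {'NN': 1, 'NC': 1, 'CB': 1}
rules = {'NN': 'C', 'NC': 'B', 'CB': 'H'}    # hypothetical pair-insertion rules
update_occurrences(pair_counts, rules)
print(dict(pair_counts))                     # {'NC': 1, 'CN': 1, 'NB': 1, 'BC': 1, 'CH': 1, 'HB': 1}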
Example #2
# Imports needed by this excerpt; batch_to_idx_tensor and str_to_idx_tensor are
# helpers defined elsewhere in the same file. chunked is assumed to come from
# more_itertools and the BERT classes from the transformers package.
import argparse
import logging
import time
from collections import defaultdict as DD

import torch
from more_itertools import chunked
from transformers import BertForMaskedLM, BertModel, BertTokenizer


def main():
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(
        description='''Script to find out which words BERT's
        attention attends to in layers 0, 9, 10, 11.''')
    parser.add_argument('-i', '--src', type=argparse.FileType('r'), metavar='PATH',
        help='''File containing multiple lines of input text; if not specified,
              some predefined text is used.''')
    parser.add_argument('-t', '--top', type=int, default=10, metavar='K',
        help='Find top K words that BERT attends to. Default 10.')
    parser.add_argument('-b', '--batch_size', type=int, default=20, metavar='B',
        help='Specify batch size=B. Default 20.')
    parser.add_argument('-g', '--gpu', action='store_true',
        help='Option to use GPU.')
    parser.add_argument('-a', '--all_layers', action='store_true',
        help='Output the attention of each layer')
    parser.add_argument('-T', '--out_top', type=int, metavar='N',
        help='Output top N words in the final output. If -a is set, the default is 300.')
    parser.add_argument('-m', '--mask', action='store_true',
        help='Mask attended words and compare predictions with original words. Not functional yet.')
    parser.add_argument('-o', '--out', type=argparse.FileType('w'),
        help='File to write results', required=True)
    args = parser.parse_args()

    top_k = args.top
    batch_size = args.batch_size
    use_gpu = args.gpu
    do_mask = args.mask
    all_layers = args.all_layers
    out_top_k = args.out_top

    print('all_layers', all_layers)

    if args.src is not None:
        src = args.src
    else:
        text = 'burma has put five cities on a security alert after religious unrest involving buddhists and moslems in the northern city of mandalay , an informed source said wednesday.'
        text1 = 'police arrested five anti-nuclear protesters friday after they sought to disrupt loading of a french antarctic research and supply vessel , a spokesman for the protesters said .'
        text2 = 'turkmen president gurbanguly berdymukhammedov will begin a two-day visit to russia , his country \'s main energy partner , on monday for trade talks , the kremlin press office said .'
        text3 = 'israel \'s new government barred yasser arafat from flying to the west bank to meet with former prime minister shimon peres on thursday , a move palestinian officials said violated the israel-plo peace accords .'
        src = [text, text1, text2, text3]

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
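    # output_attentions=True makes the model also return per-layer attention weights (used as outputs[2] below)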
    model.eval()

    if do_mask:
        mask_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        mask_model.eval()
        mask_token = tokenizer.mask_token
        mask_id = tokenizer.convert_tokens_to_ids([mask_token])[0]
        ignore_tokens = ['[CLS]', '[SEP]', '.', ',',
        'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
        'said', '#', 'here', '<', 'news', '>', '`',
        'has', 'have', 'will', 'the', 'a', 'is', 'was', 'are',
        'on', 'as',  'after', 'this', 'in', 'with', 'to']
        ignore_ids = tokenizer.convert_tokens_to_ids(ignore_tokens)
        if use_gpu:
            mask_model = mask_model.cuda()

    if use_gpu:
        assert torch.cuda.is_available(), 'GPU unavailable!'
        model = model.cuda()
        print('-- Using GPU --')
    device = torch.device('cuda') if use_gpu else torch.device('cpu')

    total_words_attended = 0
    total_start_time = time.time()

    # if batch_size is None:
    #     count = 0
    #     layers = [0, 9, 10, 11]
    #     for line in src:
    #         if count % 100 == 0:
    #             print('Processed', count, 'lines')
    #         count += 1
    #         line = line.strip()
    #         tokens_tensor = str_to_idx_tensor(tokenizer, line)
    #         if use_gpu:
    #             tokens_tensor = tokens_tensor.cuda()
    #         str_tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist()[0])
    #         outputs = model(tokens_tensor)
    #
    #         cum_attn = []
    #
    #         for l in layers:
    #             layer = outputs[2][l]
    #             summed = layer.sum(dim=2).sum(dim=1).view(-1)
    #             summed = (summed / summed.sum(dim=0))
    #             cum_attn.append(summed)
    #
    #         all_attns = torch.stack(cum_attn).sum(dim=0)
    #         normalized_attn = (all_attns / all_attns.sum(dim=0)).tolist()
    #         sorted_by_attn = sorted(list(zip(normalized_attn, str_tokens)), key=lambda p: p[0], reverse=True)
    #
    #         for p in sorted_by_attn[:top_k]:
    #             attended_word_dict[p[1]] += 1
    #
    #         total_words_attended += len(sorted_by_attn[:top_k])
    #     if args.src is not None:
    #         args.src.close()

    # batch implementation
    if all_layers:
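        # one counter row per layer (12 in bert-base-uncased) over the full vocabulary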
        attended_word_tensor = torch.zeros(12, tokenizer.vocab_size,
            device=device)
    else:
        layers = torch.tensor([0], device=device)
        attended_word_tensor = torch.zeros(tokenizer.vocab_size, device=device)

    batch_iterator = chunked(src, batch_size)

    for batch_num, batch in enumerate(batch_iterator):
        start_time = time.time()
        if batch_num % 10 == 0:
            print('Processed', batch_num, 'batches')

        batch_toks, _, attn_mask, _ = batch_to_idx_tensor(tokenizer, batch)

        if use_gpu:
            batch_toks = batch_toks.cuda()
            attn_mask = attn_mask.cuda()

        with torch.no_grad():
            outputs = model(batch_toks, attention_mask=attn_mask)

        attn = torch.stack(outputs[2])
        # attn has shape [num_layers (12), batch_size, num_heads, max_src_len, max_src_len]
        if all_layers:
            summed = attn.sum(dim=3).sum(dim=2) # [num_layers, batch_size, max_src_len]
            summed = summed / summed.sum(dim=2, keepdim=True) # normalize
            summed.transpose_(0, 1)
            # summed has shape [batch_size, num_layers, max_src_len]
        else:
            attn = attn.index_select(0, layers)
            summed = attn.sum(dim=3).sum(dim=2).sum(dim=0).view(attn.shape[1], attn.shape[4])
            summed = summed / summed.sum(dim=1, keepdim=True) # normalize
            # summed has shape [batch_size, max_src_len]

        _, topk_idxs = summed.topk(top_k, sorted=True)
        # topk_idxs has shape [batch_size, top_k]

        # split by each item in batch
        split_batch_toks = batch_toks.split(1, dim=0)
        split_topk_idxs = topk_idxs.split(1, dim=0)

        for idxs, toks in zip(split_topk_idxs, split_batch_toks):
            attended_toks = toks.squeeze(0)[idxs.squeeze(0)]
            if all_layers:
                # record topk attended tokens for each layer
                for i in range(summed.shape[1]):
                    attended_word_tensor[i, attended_toks[i]] += 1
            else:
                attended_word_tensor[attended_toks] += 1

        # count every attended token in the batch (per layer when -a is set)
        total_words_attended += topk_idxs.numel()

    if args.src is not None:
        args.src.close()

    if all_layers:
        f = args.out
        f.write('Total tokens attended: {}\n'.format(total_words_attended))
        for i in range(attended_word_tensor.shape[0]):
            non_zero_idxs = attended_word_tensor[i].nonzero().view(-1)
            counts = attended_word_tensor[i, non_zero_idxs].tolist()
            toks = tokenizer.convert_ids_to_tokens(non_zero_idxs.tolist())
            attended_word_dict = DD(int)
            attended_word_dict.update(zip(toks, counts))
            top_attn_count = sorted(attended_word_dict.items(), key=lambda p: p[1], reverse=True)

            out_top_k = 300 if not out_top_k else out_top_k
            f.write('\nLAYER {}\n'.format(i))
            for p in top_attn_count[:out_top_k]:
                f.write(p[0] + ' ' + str(p[1]) + '\n')
        f.close()
        print('Finished, total duration = {:.4}'.format(time.time() - total_start_time))
    else:
        non_zero_idxs = attended_word_tensor.nonzero().view(-1)
        counts = attended_word_tensor[non_zero_idxs].tolist()
        toks = tokenizer.convert_ids_to_tokens(non_zero_idxs.tolist())
        attended_word_dict = DD(int)
        attended_word_dict.update(zip(toks, counts))
        # end of batch implementation
        print('Finished, total duration = {:.4}'.format(time.time() - total_start_time))

        top_attn_count = sorted(attended_word_dict.items(), key=lambda p: p[1], reverse=True)

        f = args.out
        f.write('Total tokens attended: {}\n'.format(total_words_attended))
        if out_top_k is not None:
            top_attn_count = top_attn_count[:out_top_k]
        for p in top_attn_count:
            f.write(p[0] + ' ' + str(p[1]) + '\n')
        f.close()
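
To make the attention aggregation inside the batch loop concrete, here is a minimal standalone sketch on a random tensor with the same layout as torch.stack(outputs[2]); all sizes below are made up for illustration:

import torch

num_layers, batch_size, num_heads, src_len = 12, 2, 12, 7
# stand-in for torch.stack(outputs[2]): one attention matrix per layer and head
attn = torch.rand(num_layers, batch_size, num_heads, src_len, src_len)

# total attention each source token receives, summed over query positions and heads,
# then normalized per layer so each row sums to 1
summed = attn.sum(dim=3).sum(dim=2)              # [num_layers, batch_size, src_len]
summed = summed / summed.sum(dim=2, keepdim=True)
summed = summed.transpose(0, 1)                  # [batch_size, num_layers, src_len]

_, topk_idxs = summed.topk(3, sorted=True)       # top-3 token positions per layer
print(topk_idxs.shape)                           # torch.Size([2, 12, 3])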