Example #1
    def __init__(self, embedding_dir, model_name="bert-base-multilingual-cased", layer=-2):
        super(BertEncoder, self).__init__(embedding_dir)

        # Load pre-trained model (weights) and set to evaluation mode (no more training)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()

        # Load word piece tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

        # Layer from which to get the embeddings
        self.layer = layer
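
A minimal usage sketch (hypothetical method, not part of the original class; assumes torch is in scope) showing how embeddings could be read from the configured layer:

    def get_embeddings(self, sentence):
        # Run the model and return vectors from the configured layer
        tokens = self.tokenizer.tokenize("[CLS] " + sentence + " [SEP]")
        ids = torch.tensor([self.tokenizer.convert_tokens_to_ids(tokens)])
        with torch.no_grad():
            encoded_layers, _ = self.model(ids)  # one tensor per BERT layer
        return encoded_layers[self.layer]  # (1, seq_len, hidden_size)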
Example #2
def model_eval_ablation(model_path, filter_value=0.2, top_k_sent=5):
    bert_model_name = 'bert-base-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'

    lazy = False
    forward_size = 32
    do_lower_case = True
    pair_order = 'cq'
    debug_mode = False

    maxout_model = False

    num_class = 3

    tag = 'dev'
    exp = 'no_re_train'
    print("Filter value:", filter_value)
    print("top_k_sent:", top_k_sent)
    train_sent_filtering_prob = 0.2
    dev_sent_filtering_prob = filter_value
    test_sent_filtering_prob = 0.2

    # Load the dataset and upstream sentence-level retrieval results.
    dev_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl")
    # train_sent_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl")
    test_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_test_results.jsonl")

    dev_fitems, dev_list = get_nli_pair('dev', is_training=False,
                                        sent_level_results_list=dev_sent_results_list, debug=debug_mode,
                                        sent_top_k=top_k_sent, sent_filter_value=dev_sent_filtering_prob)
    # train_fitems, train_list = get_nli_pair('train', is_training=True,
    #                                         sent_level_results_list=train_sent_results_list, debug=debug_mode,
    #                                         sent_top_k=5, sent_filter_value=train_sent_filtering_prob)
    test_fitems, test_list = get_nli_pair('test', is_training=False,
                                          sent_level_results_list=test_sent_results_list, debug=debug_mode,
                                          sent_top_k=top_k_sent, sent_filter_value=test_sent_filtering_prob)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    if debug_mode:
        dev_list = dev_list[:100]
        # train_list = train_list[:100]
        test_list = test_list[:100]
        eval_frequency = 2

    # est_datasize = len(train_fitems)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=384, pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    if not maxout_model:
        model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                                act_type='tanh', use_pretrained_pooler=True, use_sigmoid=False)
    else:
        model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class, act_type="gelu", num_of_out_layers=2)

    model.load_state_dict(torch.load(model_path))

    dev_instances = bert_cs_reader.read(dev_fitems)
    # train_instances = bert_cs_reader.read(train_fitems)
    test_instances = bert_cs_reader.read(test_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if tag == 'dev':
        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)
        common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{dev_sent_filtering_prob}_{exp}.jsonl")

        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_dev_list = copy.deepcopy(dev_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                          'id', 'predicted_label')

        common.save_jsonl(copied_dev_list, f"nli_{tag}_cp_results_th{dev_sent_filtering_prob}_{exp}.jsonl")
        mode = {'standard': True}
        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                        mode=mode, max_evidence=5)
        logging_item = {
            'ss': strict_score, 'ac': acc_score,
            'pr': pr, 'rec': rec, 'f1': f1,
        }

        print(logging_item)
        common.save_json(logging_item,
                         f"nli_th{dev_sent_filtering_prob}_{exp}_ss:{strict_score}_ac:{acc_score}_pr:{pr}_rec:{rec}_f1:{f1}.json")

    elif tag == 'test':
        test_iter = biterator(test_instances, num_epochs=1, shuffle=False)

        cur_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, make_int=True,
                                           feed_input_span=maxout_model, show_progress=True)

        common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{test_sent_filtering_prob}.jsonl")

        ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
        copied_test_list = copy.deepcopy(test_list)
        list_dict_data_tool.append_item_from_dict_to_list(copied_test_list, ema_results_dict,
                                                          'id', 'predicted_label')

        common.save_jsonl(copied_test_list, f"nli_{tag}_cp_results_th{test_sent_filtering_prob}.jsonl")
Example #3
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# Load pre-trained model tokenizer (vocabulary)
modelpath = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(modelpath)

text = "dummy. although he had already eaten a large meal, he was still very hungry."
target = "hungry"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = tokenized_text.index(target)
tokenized_text[masked_index] = '[MASK]'

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [1] * len(tokenized_text)
# this is for the dummy first sentence.
segments_ids[0] = 0
segments_ids[1] = 0

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained(modelpath)
model.eval()

# Predict all tokens
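# Completion sketch (the snippet is truncated here); this follows the
# standard pytorch_pretrained_bert masked-LM usage:
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)

# Pick the most likely token for the masked position
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)  # ideally recovers "hungry"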
Example #4
from decouple import config
import tweepy
import basilica

# Pytorch and BERT
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
mallet_path = '/Users/mattkirby/Social-Analysis/tweet-analysis/mallet-2.0.8/bin/mallet'

# Spacy for lemmatization
import spacy
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

# NLTK
import nltk
from nltk.corpus import stopwords


# Preprocess for BERT
def bert_preprocess(list_of_strings):
    sentences = []
    begin_tag = "[CLS] "
    end_tag = " [SEP]"
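    # Plausible completion (assumption; the snippet is truncated here):
    # wrap each string with BERT's [CLS]/[SEP] markers
    for s in list_of_strings:
        sentences.append(begin_tag + s + end_tag)
    return sentences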
Example #5
import json

import spacy
import tqdm

from my_utils.log_wrapper import create_logger
from my_utils.tokenizer import END, build_vocab
from my_utils.utils import set_environment
from my_utils.word2vec_utils import load_emb_vocab, build_embedding
from pytorch_pretrained_bert import BertTokenizer

"""
This script preprocesses the SQuAD dataset.
"""

# Debug switches (disabled by default)
DEBUG_ON = False
DEBUG_SIZE = 2000

NLP = spacy.load('en_core_web_md', disable=['vectors', 'textcat', 'parser'])
BERT_TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased')


def load_data(path, is_train=True, v2_on=False):
    rows = []
    with open(path, encoding="utf8") as f:
        data = json.load(f)['data']
    for article in tqdm.tqdm(data, total=len(data)):
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            if v2_on:
                context = '{} {}'.format(context, END)
            for qa in paragraph['qas']:
                uid, question = qa['id'], qa['question']
                answers = qa.get('answers', [])
                # used for v2.0
Example #6
        print("F1 score: ", f1_score_curr, " at threshold: ", threshold)
        if f1_score_curr > best_f1_score:
            best_f1_score = f1_score_curr
            best_threshold = threshold
    return best_threshold


if __name__ == '__main__':
    # load the dataset
    data = json.load(open('../data/data_with_cuis.json', 'r'))
    # concept to cui mappings
    cui_to_concept, concept_to_cui = concept_cui_mapping(data, aspect)
    # create the vocabulary for the input
    idx_to_cui, cui_to_idx = extract_target_vocab(data, aspect)
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', max_len=512)

    # load label count
    label_cnt = json.load(open('../data/label_counts.json', 'r'))
    p_label_cnt = label_cnt['p_label_cnt']
    i_label_cnt = label_cnt['i_label_cnt']
    o_label_cnt = label_cnt['o_label_cnt']

    # Split train and test data
    train_idx = rd.sample(range(len(data)), int(0.8 * len(data)))
    test_idx = [i for i in range(len(data)) if i not in train_idx]

    train_data = [data[i] for i in train_idx]
    test_data = [data[i] for i in test_idx]

    val_idx = rd.sample(range(len(train_data)), int(0.1 * len(train_data)))
Example #7
def process_mrc_example():
    csv_reader = csv.reader(open(TRAIN_DIR), delimiter='\t')
    rows = [row for row in csv_reader]
    docid_name = rows[0][1]
    question_name = rows[0][2]
    answer_name = rows[0][3]
    json_positive_dirs = join(MRC_DIR, '200_sample')
    if not exists(json_positive_dirs):
        os.makedirs(json_positive_dirs)
        print('Created directory for MRC samples')
    with open(REALATE_DIR, 'rb') as v:
        relation_dict = pickle.load(v)
    sample_rows = rows[:200]
    tmp_dict = {}
    count = 0
    maxlen = 0
    for i, sample_raw in enumerate(sample_rows):
        if (i == 0):
            continue
        else:
            print('start processing {}'.format(i))

            try:
                new_docid = relation_dict[sample_raw[1]]
                tmp_dict['new_docid'] = new_docid
                with open(
                        join(join(MRC_DIR, 'context'),
                             '{}.json'.format(new_docid)), 'rb') as p:
                    context = json.load(p)

            except KeyError:
                print('mrc sample {} - related document not found'.format(i))
                continue

            # tmp_dict[docid_name] = sample_raw[1]
            tokenizer = BertTokenizer.from_pretrained('./MRC_pretrain')

            text = context['text']
            text_tok = tokenizer.tokenize(text)
            text_id = tokenizer.convert_tokens_to_ids(text_tok)
            text_len = len(text_id)

            question = filter_text(sample_raw[2].replace(' ', '').replace(
                ' ', ''))
            ques_tok = tokenizer.tokenize("[CLS] " + question + " [SEP]")
            ques_id = tokenizer.convert_tokens_to_ids(ques_tok)
            question_len = len(ques_id)
            maxlen = question_len if question_len > maxlen else maxlen

            answer = filter_text(sample_raw[3].replace(' ', '').replace(
                ' ', ''))
            ans_tok = tokenizer.tokenize(answer)
            ans_id = tokenizer.convert_tokens_to_ids(ans_tok)
            ans_len = len(ans_id)

            suppose_start = []  # candidate start positions for the answer
            for i in range(text_len):
                if (text_id[i] == ans_id[0]):
                    suppose_start.append(i)

            s = 0
            e = 0
            if (len(suppose_start) <= 0):
                continue

            else:
                for t in range(len(suppose_start)):
                    start = suppose_start[t]
                    end = suppose_start[t]
                    for m in range(ans_len):
                        if (m + start >= text_len):
                            break
                        elif (ans_id[m] == text_id[m + start]):
                            end += 1
                        else:
                            break
                    if (end - start != ans_len):
                        continue
                    else:
                        s = suppose_start[t]
                        e = end
                        break
            if (s == 0 and e == 0):
                continue
            else:
                span_arr = [0] * (s - 0) + [1] * (e - s) + [0] * (text_len - e)
                assert len(span_arr) == text_len

                tmp_dict['question'] = ques_id
                tmp_dict['question_length'] = question_len
                tmp_dict['text'] = text_id
                tmp_dict['text_length'] = text_len
                tmp_dict['answer_span'] = span_arr
                tmp_dict['text_tok'] = text_tok
                tmp_dict['original_text'] = text

                with open(join(json_positive_dirs, '{}.json'.format(count)),
                          'w',
                          encoding='utf-8') as f:
                    json.dump(tmp_dict, f, ensure_ascii=False)
                    count += 1

    # print('sample index larger than 512 is {}'.format(count))
    print('Finished pre-processing {} mrc samples'.format(count))
Example #8
    def create_dataloader(self):
        # Load the inputs and labels
        print("Load data")
        train_comments = self.train_df["comment_text"].astype(str)
        train_label = self.train_df["target"].values
        train_type_labels = self.train_df[self.toxicity_type_list].values

        # New positive/negative (np) auxiliary tasks
        train_np_labels = np.zeros((len(self.train_df), 4))
        train_np_identity_labels = np.zeros(
            (len(self.train_df), len(self.identity_list) * 4))
        train_df_copy = self.train_df[self.identity_list + ["target"]]
        for column in self.identity_list + ["target"]:
            train_df_copy[column] = np.where(train_df_copy[column] > 0.5, True,
                                             False)
        pp_label_bool = train_df_copy["target"] & np.where(
            train_df_copy[self.identity_list].sum(axis=1) > 0, True, False)
        np_label_bool = ~train_df_copy["target"] & np.where(
            train_df_copy[self.identity_list].sum(axis=1) > 0, True, False)
        pn_label_bool = train_df_copy["target"] & np.where(
            (train_df_copy[self.identity_list]).sum(axis=1) == 0, True, False)
        nn_label_bool = ~train_df_copy["target"] & np.where(
            (train_df_copy[self.identity_list]).sum(axis=1) == 0, True, False)
        train_np_labels[:, 0] = np.where(pp_label_bool > 0, 1, 0)
        train_np_labels[:, 1] = np.where(np_label_bool > 0, 1, 0)
        train_np_labels[:, 2] = np.where(pn_label_bool > 0, 1, 0)
        train_np_labels[:, 3] = np.where(nn_label_bool > 0, 1, 0)
        for i, column in enumerate(self.identity_list):
            pp_label_bool = train_df_copy["target"] & train_df_copy[column]
            np_label_bool = ~train_df_copy["target"] & train_df_copy[column]
            pn_label_bool = train_df_copy["target"] & (~train_df_copy[column])
            nn_label_bool = ~train_df_copy["target"] & (~train_df_copy[column])
            train_np_identity_labels[:, i * 4 + 0] = np.where(
                pp_label_bool > 0, 1, 0)
            train_np_identity_labels[:, i * 4 + 1] = np.where(
                np_label_bool > 0, 1, 0)
            train_np_identity_labels[:, i * 4 + 2] = np.where(
                pn_label_bool > 0, 1, 0)
            train_np_identity_labels[:, i * 4 + 3] = np.where(
                nn_label_bool > 0, 1, 0)

        # Raw identity values
        train_identity_values = self.train_df[self.identity_list].fillna(
            0.).values
        # Sum of all raw identity values
        train_identity_sum = train_identity_values.sum(axis=1)
        # Cap the identity sum at 1 (for sigmoid)
        train_identity_sum_label = np.where(train_identity_sum > 1, 1,
                                            train_identity_sum)
        # Binary (0/1) identity values
        train_identity_binary = copy.deepcopy(
            self.train_df[self.identity_list])
        for column in self.identity_list:
            train_identity_binary[column] = np.where(
                train_identity_binary[column] > 0.5, 1, 0)
        # 1 if any of the binary identity values is 1
        train_identity_binary_sum = train_identity_binary.sum(axis=1)
        train_identity_or_binary = np.where(train_identity_binary_sum >= 1, 1,
                                            0)
        # All identity labels
        train_identity_type_labels = train_identity_values
        train_identity_type_binary_lables = train_identity_binary
        train_identity_sum_label = train_identity_sum_label
        train_identity_binary_label = train_identity_or_binary

        # Set up the tokenizer
        print("Init tokenizer")
        bert_tokenizer = BertTokenizer.from_pretrained(self.bert_model_path,
                                                       cache_dir=None,
                                                       do_lower_case=True)
        print("Tokenizing")
        train_bert_tokens = self.convert_lines(
            self.train_df["comment_text"].fillna("DUMMY_VALUE"), self.max_len,
            bert_tokenizer)
        # Split into training and validation sets
        valid_tokens = train_bert_tokens[self.train_len:]
        valid_label = train_label[self.train_len:]
        valid_type_labels = train_type_labels[self.train_len:]
        train_tokens = train_bert_tokens[:int(
            self.train_len * 0.5
        )] if self.half == 1 else train_bert_tokens[int(self.train_len *
                                                        0.5):self.train_len]
        train_label = train_label[:int(
            self.train_len *
            0.5)] if self.half == 1 else train_label[int(self.train_len *
                                                         0.5):self.train_len]
        train_type_labels = train_type_labels[:int(
            self.train_len * 0.5
        )] if self.half == 1 else train_type_labels[int(self.train_len *
                                                        0.5):self.train_len]
        valid_identity_type_labels = train_identity_type_labels[self.
                                                                train_len:]
        train_identity_type_labels = train_identity_type_labels[:int(
            self.train_len *
            0.5)] if self.half == 1 else train_identity_type_labels[
                int(self.train_len * 0.5):self.train_len]
        valid_identity_type_binary_lables = train_identity_type_binary_lables[
            self.train_len:]
        train_identity_type_binary_lables = train_identity_type_binary_lables[:int(
            self.train_len *
            0.5)] if self.half == 1 else train_identity_type_binary_lables[
                int(self.train_len * 0.5):self.train_len]
        valid_identity_sum_label = train_identity_sum_label[self.train_len:]
        train_identity_sum_label = train_identity_sum_label[:int(
            self.train_len *
            0.5)] if self.half == 1 else train_identity_sum_label[
                int(self.train_len * 0.5):self.train_len]
        valid_identity_binary_label = train_identity_binary_label[self.
                                                                  train_len:]
        train_identity_binary_label = train_identity_binary_label[:int(
            self.train_len *
            0.5)] if self.half == 1 else train_identity_binary_label[
                int(self.train_len * 0.5):self.train_len]
        valid_np_labels = train_np_labels[self.train_len:]
        train_np_labels = train_np_labels[:int(
            self.train_len * 0.5
        )] if self.half == 1 else train_np_labels[int(self.train_len *
                                                      0.5):self.train_len]
        valid_np_identity_labels = train_np_identity_labels[self.train_len:]
        train_np_identity_labels = train_np_identity_labels[:int(
            self.train_len *
            0.5)] if self.half == 1 else train_np_identity_labels[
                int(self.train_len * 0.5):self.train_len]

        # Compute sample weights
        target_weight, aux_weight, identity_weight, np_weight, np_identity_weight = self.cal_sample_weights(
        )

        # Convert the tokenized data to tensors
        train_x_tensor = torch.tensor(train_tokens, dtype=torch.long)
        valid_x_tensor = torch.tensor(valid_tokens, dtype=torch.long)
        train_y_tensor = torch.tensor(np.hstack([
            train_label[:, np.newaxis], train_type_labels,
            train_identity_type_labels, train_np_labels
        ]),
                                      dtype=torch.float32)
        valid_y_tensor = torch.tensor(np.hstack([
            valid_label[:, np.newaxis], valid_type_labels,
            valid_identity_type_labels, valid_np_labels
        ]),
                                      dtype=torch.float32)
        target_weight_tensor = torch.tensor(target_weight, dtype=torch.float32)
        aux_weight_tensor = torch.tensor(aux_weight, dtype=torch.float32)
        identity_weight_tensor = torch.tensor(identity_weight,
                                              dtype=torch.float32)
        np_weight_tensor = torch.tensor(np_weight, dtype=torch.float32)
        np_identity_weight_tensor = torch.tensor(np_identity_weight,
                                                 dtype=torch.float32)
        if torch.cuda.is_available():
            train_x_tensor = train_x_tensor.to(self.device)
            valid_x_tensor = valid_x_tensor.to(self.device)
            train_y_tensor = train_y_tensor.to(self.device)
            valid_y_tensor = valid_y_tensor.to(self.device)
            target_weight_tensor = target_weight_tensor.to(self.device)
            aux_weight_tensor = aux_weight_tensor.to(self.device)
            identity_weight_tensor = identity_weight_tensor.to(self.device)
            np_weight_tensor = np_weight_tensor.cuda()
            np_identity_weight_tensor = np_identity_weight_tensor.cuda()
        # Wrap the tensors into a dataset; training data and labels stay aligned, and when loaded with a dataloader, dataset[:-1] is x and dataset[-1] is y
        train_dataset = data.TensorDataset(train_x_tensor, train_y_tensor,
                                           target_weight_tensor,
                                           aux_weight_tensor,
                                           identity_weight_tensor,
                                           np_weight_tensor,
                                           np_identity_weight_tensor)
        valid_dataset = data.TensorDataset(valid_x_tensor, valid_y_tensor)
        # Wrap the datasets in dataloaders
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=self.base_batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(
            valid_dataset, batch_size=self.base_batch_size, shuffle=False)
        # Return the train and validation loaders
        return train_loader, valid_loader
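
A short usage sketch (hypothetical; `loader_factory` stands in for an instance of this class): batches from the returned train_loader unpack in the order the TensorDataset was built above.

train_loader, valid_loader = loader_factory.create_dataloader()
for x, y, *weights in train_loader:
    # x: token ids; y: stacked labels;
    # weights: target/aux/identity/np/np-identity sample weights
    pass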
Example #9
        print(encoded_layers)
        for i in encoded_layers:
            enc, _ = self.rnn(i)
        logits = self.fc(enc)
        if y is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(logits.view(-1, logits.shape[-1]), y.view(-1))
            return loss
        return logits


if __name__ == '__main__':
    input_text = '[CLS] I go to school by bus [SEP]'
    target_text = '我搭公車上學'

    tokenizer = BertTokenizer.from_pretrained('./vocab.txt')
    example_pair = dict()

    # Data preprocessing
    for i in range(0, len(target_text) + 1):
        tokenized_text = tokenizer.tokenize(input_text)  # tokenize the input text
        tokenized_text.extend(target_text[:i])   # each step appends one more decoded character to the input
        tokenized_text.append('[MASK]')
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])

        # positions marked with -1 are excluded from the loss
        loss_ids = [-1] * (len(tokenizer.convert_tokens_to_ids(tokenized_text)) - 1)
        if i == len(target_text):
            loss_ids.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize('[SEP]'))[0])  # finally append a [SEP]
        else:
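            # Plausible completion (assumption; the snippet is truncated here):
            # for intermediate steps, the loss target is the next character of target_text
            loss_ids.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(target_text[i]))[0])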
Example #10
def process_ace34_file(path):
    BERT_ML = ACE34_UNCASE['bert_ml']
    ORI_ML = ACE34_UNCASE['ori_ml']

    pretrained_bert_name = ACE34_UNCASE['pretrained_bert_name']
    tokenizer = BertTokenizer.from_pretrained(pretrained_bert_name)
    fin = open('{}.graph'.format(path), 'rb')
    idx2gragh = pickle.load(fin)
    fin.close()

    raw_data = read_ace34_file(path)
    data = []
    for sample_id, tokens, label, target, anchor_index in raw_data:
        sent_len = len(tokens)
        # On raw text
        tok_bert_indices = []
        for tok in tokens:
            tok = tok.replace('...', '.').replace('...',
                                                  '.').replace('...', '.')
            bert_tokens = tokenizer.tokenize(tok)
            tok_bert_indices.append(
                tokenizer.convert_tokens_to_ids(bert_tokens))
        bert_len = sum([len(x) for x in tok_bert_indices])

        assert bert_len <= BERT_ML, 'Bert length: {}\n{}'.format(
            bert_len, tokens)
        assert sent_len <= ORI_ML, 'Ori length: {}'.format(sent_len)

        transform = zeros(ORI_ML, BERT_ML, 0.0)

        # Create transform to convert tokenized length to original length
        offset = 1
        raw_text_bert_indices = []
        # print(sample_id)
        # print(sum([len(x) for x in tok_bert_indices]))
        # print(' '.join(tokens))
        for i, indices in enumerate(tok_bert_indices):
            l = len(indices)
            raw_text_bert_indices += indices
            for j in range(l):
                assert i <= ORI_ML, '| i={}'.format(i)
                assert offset + j < BERT_ML, '| offset={} j={}, sum={}'.format(
                    offset, j, offset + j)
                transform[i][offset + j] = 1 / l
            # if i == anchor_index:
            #     bert_anchor_index = offset
            offset += l

        assert offset == 1 + len(raw_text_bert_indices), "Wrong offset"

        dep_matrix = idx2gragh[sample_id]

        aspect_indices = tok_bert_indices[anchor_index]

        #   CLS + Sentence + SEP + aspect + SEP
        cls_text_sep_aspect_sep_indices = [101] + raw_text_bert_indices + [
            102
        ] + aspect_indices + [102]

        cls_text_sep_aspect_sep_length = len(cls_text_sep_aspect_sep_indices)
        assert cls_text_sep_aspect_sep_length <= BERT_ML, 'CLS indices length: {}\n{}'.format(
            cls_text_sep_aspect_sep_length, tokens)
        cls_text_sep_aspect_sep_mask = [
            0 for _ in range(cls_text_sep_aspect_sep_length)
        ] + [1 for _ in range(BERT_ML - cls_text_sep_aspect_sep_length)]
        cls_text_sep_aspect_sep_indices += [
            0 for _ in range(BERT_ML - cls_text_sep_aspect_sep_length)
        ]
        cls_text_sep_aspect_sep_segments_ids = [0 for _ in range(BERT_ML)]

        for i in range(bert_len + 2, bert_len + 2 + len(aspect_indices) + 1):
            cls_text_sep_aspect_sep_segments_ids[i] = 1

        cls_text_sep_aspect_sep_aspect_mask = [1 for _ in range(BERT_ML)]
        for i in range(offset + 1, offset + 1 + len(aspect_indices)):
            cls_text_sep_aspect_sep_aspect_mask[i] = 0

        # CLS + sentence + SEP
        cls_text_sep_indices = [101] + raw_text_bert_indices + [102]
        cls_text_sep_length = len(cls_text_sep_indices)
        cls_text_sep_indices += [
            0 for x in range(BERT_ML - cls_text_sep_length)
        ]
        cls_text_sep_segments_ids = [0 for _ in range(BERT_ML)]

        # Original sentence
        mask = [0 for _ in range(sent_len)
                ] + [1 for _ in range(ORI_ML - sent_len)]
        dist = [-1 for i in range(sent_len)]
        dist_to_target = get_dist_to_target(dep_matrix, anchor_index, dist,
                                            sent_len)
        dist_padding = [0] * (ORI_ML - len(dist_to_target))
        pad_dist_to_target = dist_to_target + dist_padding

        # pad_aspect_indices = aspect_indices + [BERT_ML - len(aspect_indices)]

        item = {
            'token': tokens,
            'cls_text_sep_aspect_sep_indices': cls_text_sep_aspect_sep_indices,
            'cls_text_sep_aspect_sep_length': cls_text_sep_aspect_sep_length,
            'cls_text_sep_aspect_sep_segments_ids':
            cls_text_sep_aspect_sep_segments_ids,
            'cls_text_sep_aspect_sep_aspect_mask':
            cls_text_sep_aspect_sep_aspect_mask,
            'cls_text_sep_aspect_sep_mask': cls_text_sep_aspect_sep_mask,
            'cls_text_sep_indices': cls_text_sep_indices,
            'cls_text_sep_length': cls_text_sep_length,
            'cls_text_sep_segments_ids': cls_text_sep_segments_ids,
            'anchor_index': anchor_index,
            'transform': transform,
            'sentence_length': sent_len,
            'bert_length': bert_len,
            'polarity': target,
            'dependency_graph': dep_matrix,
            'mask': mask,
            'dist_to_target': pad_dist_to_target
        }
        # offset += len(aspect_indices)

        data.append(item)
    preprocessed_file = path.replace('.tsv', '.proc')
    with open(preprocessed_file, 'wb') as f:
        pickle.dump(data, f)
    print(preprocessed_file)
Example #11
 def __init__(self, pretrained_model: str):
     self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
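
A minimal usage sketch (hypothetical method, not from the original) showing the tokenize-then-convert flow this wrapper exposes:

 def to_ids(self, text: str):
     # Word-piece tokenize, then map the tokens to vocabulary ids
     tokens = self.tokenizer.tokenize(text)
     return self.tokenizer.convert_tokens_to_ids(tokens)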
Example #12
def process_ace_file(path):
    BERT_ML = ACE_CASE['bert_ml']
    ORI_ML = ACE_CASE['ori_ml']
    MAX_TARGET_VALUE = ACE_CASE['n_class']
    tokenizer = BertTokenizer.from_pretrained(ACE_CASE['pretrained_bert_name'],
                                              do_lower_case=False)

    fin = open('{}.graph'.format(path), 'rb')
    idx2gragh = pickle.load(fin)
    fin.close()

    raw_data = read_ace_file(path)
    data = []
    for sample_id, tokens, labels, targets in raw_data:
        sent_len = len(tokens)
        # On raw text
        tok_bert_indices = []
        for tok in tokens:
            bert_tokens = tokenizer.tokenize(tok)
            tok_bert_indices.append(
                tokenizer.convert_tokens_to_ids(bert_tokens))
        bert_len = sum([len(x) for x in tok_bert_indices])

        transform = zeros(ORI_ML, BERT_ML, 0.0)

        # Create transform to convert tokenized length to original length
        offset = 1
        raw_text_bert_indices = []
        # print(sample_id)
        # print(sum([len(x) for x in tok_bert_indices]))
        # print(' '.join(tokens))
        for i, indices in enumerate(tok_bert_indices):
            l = len(indices)
            raw_text_bert_indices += indices
            for j in range(l):
                transform[i][offset + j] = 1 / l
            offset += l

        dep_matrix = idx2gragh[sample_id]

        offset = 1  # Because of the CLS
        for anchor_index, (aspect_indices,
                           target) in enumerate(zip(tok_bert_indices,
                                                    targets)):

            # Discard the O label (this is not the 'Other' label)
            if target > MAX_TARGET_VALUE:
                continue
            #   CLS + Sentence + SEP + aspect + SEP
            cls_text_sep_aspect_sep_indices = [101] + raw_text_bert_indices + [
                102
            ] + aspect_indices + [102]
            cls_text_sep_aspect_sep_length = len(
                cls_text_sep_aspect_sep_indices)
            cls_text_sep_aspect_sep_mask = [
                0 for _ in range(cls_text_sep_aspect_sep_length)
            ] + [1 for _ in range(BERT_ML - cls_text_sep_aspect_sep_length)]
            cls_text_sep_aspect_sep_indices += [
                0 for _ in range(BERT_ML - cls_text_sep_aspect_sep_length)
            ]
            cls_text_sep_aspect_sep_segments_ids = [0 for _ in range(BERT_ML)]
            for i in range(bert_len + 2,
                           bert_len + 2 + len(aspect_indices) + 1):
                cls_text_sep_aspect_sep_segments_ids[i] = 1

            cls_text_sep_aspect_sep_aspect_mask = [1 for _ in range(BERT_ML)]
            for i in range(offset, offset + len(aspect_indices)):
                cls_text_sep_aspect_sep_aspect_mask[i] = 0

            offset += len(aspect_indices)

            # CLS + sentence + SEP
            cls_text_sep_indices = [101] + raw_text_bert_indices + [102]
            cls_text_sep_length = len(cls_text_sep_indices)
            cls_text_sep_indices += [
                0 for x in range(BERT_ML - cls_text_sep_length)
            ]
            cls_text_sep_segments_ids = [0 for _ in range(BERT_ML)]

            # Original sentence
            mask = [0 for _ in range(sent_len)
                    ] + [1 for _ in range(ORI_ML - sent_len)]
            dist = [-1 for i in range(sent_len)]
            dist_to_target = get_dist_to_target(dep_matrix, anchor_index, dist,
                                                sent_len)
            max_dist = max(dist_to_target) + 1
            dist_padding = [max_dist] * (ORI_ML - len(dist_to_target))
            pad_dist_to_target = dist_to_target + dist_padding

            item = {
                'cls_text_sep_aspect_sep_indices':
                cls_text_sep_aspect_sep_indices,
                'cls_text_sep_aspect_sep_length':
                cls_text_sep_aspect_sep_length,
                'cls_text_sep_aspect_sep_segments_ids':
                cls_text_sep_aspect_sep_segments_ids,
                'cls_text_sep_aspect_sep_aspect_mask':
                cls_text_sep_aspect_sep_aspect_mask,
                'cls_text_sep_aspect_sep_mask': cls_text_sep_aspect_sep_mask,
                'cls_text_sep_indices': cls_text_sep_indices,
                'cls_text_sep_length': cls_text_sep_length,
                'cls_text_sep_segments_ids': cls_text_sep_segments_ids,
                'anchor_index': anchor_index,
                'transform': transform,
                'sentence_length': sent_len,
                'bert_length': bert_len,
                'polarity': target,
                'dependency_graph': dep_matrix,
                'mask': mask,
                'dist_to_target': pad_dist_to_target
            }
            data.append(item)
    preprocessed_file = path.replace('.tsv', '.proc')
    with open(preprocessed_file, 'wb') as f:
        pickle.dump(data, f)
    print(preprocessed_file)
Example #13
class BertWithJumanModel():
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        # Load Juman so Japanese text can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the pretrained BERT model with a MaskedLM head
        self.model = BertForMaskedLM.from_pretrained(bert_path)
        # Load the pretrained BERT model's tokenizer
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        # Flag for whether to use a CUDA GPU
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        # Preprocessing: strip half-width spaces from the text
        return text.replace(" ", "")  # for Juman

    def paraphrase(self, text):
        # Strip half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # Segment the Japanese text into a list of tokens
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with spaces into a single str
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # Text is capped at 128 tokens, so build [CLS] + up to 126 tokens + [SEP]
        # Convert the tokens to ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        generated_token_ids = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            # Move to the GPU if enabled
            generated_token_ids = generated_token_ids.to('cuda')
            self.model.to('cuda')

        # Switch the model to evaluation mode
        self.model.eval()
        with torch.no_grad():
            for i in range(10):
                for j, _ in enumerate(tokens):
                    # Mask one token of the sentence at a time;
                    # start from +1 to skip the [CLS] header
                    masked_index = j + 1

                    pre_token = generated_token_ids[0, masked_index].item()

                    generated_token_ids[
                        0, masked_index] = self.bert_tokenizer.vocab["[MASK]"]

                    outputs = self.model(generated_token_ids)
                    predictions = outputs[0]

                    _, predicted_indexes = torch.topk(
                        predictions[0, masked_index], k=5)
                    predicted_tokens = self.bert_tokenizer.convert_ids_to_tokens(
                        predicted_indexes.tolist())

                    print(predicted_tokens)

                    predict_token = predicted_indexes.tolist()[0]

                    # if pre_token == predict_token:
                    #     predict_token = predicted_indexes.tolist()[1]

                    generated_token_ids[0, masked_index] = predict_token

                    # Convert ids back to strings and join them
                    sampled_sequence = [
                        self.bert_tokenizer.ids_to_tokens[token_id]
                        for token_id in generated_token_ids[0].cpu().numpy()
                    ]
                    sampled_sequence = "".join([
                        token[2:] if token.startswith("##") else token
                        for token in list(
                            filter(lambda x: x != '[PAD]', sampled_sequence))
                    ])

                    logger.info(
                        "sampled sequence: {}".format(sampled_sequence))
model_dir_zh = "model/BERT/newdata_5fold/zh/"

MAX_fold = 5
PATH_list_en = [
    os.path.join(model_dir_en, "{}fold_bert.model".format(fold))
    for fold in range(1, MAX_fold + 1, 1)
]
PATH_list_zh = [
    os.path.join(model_dir_zh, "{}fold_bert.model".format(fold))
    for fold in range(1, MAX_fold + 1, 1)
]

y_dummy = torch.empty(len(test1_en), dtype=torch.long).random_(5)

# tokenizer_en = BertTokenizer.from_pretrained('bert-large-uncased')
tokenizer_en = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer_zh = BertTokenizer.from_pretrained('bert-base-chinese')

test_dataset_en = BERTDataset(test1_en,
                              test2_en,
                              y_dummy,
                              tokenizer_en,
                              seq_length=max_seq_en)
test_dataset_zh = BERTDataset(test1_zh,
                              test2_zh,
                              y_dummy,
                              tokenizer_zh,
                              seq_length=max_seq_zh)

test_loader_en = DataLoader(test_dataset_en, batch_size=batch, shuffle=False)
test_loader_zh = DataLoader(test_dataset_zh, batch_size=batch, shuffle=False)
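
The five fold checkpoints per language are presumably ensembled at inference time; a hedged sketch (`build_model` and the batch layout are assumptions, not from the original):

def ensemble_predict(path_list, loader, build_model, device='cuda'):
    # Average softmax probabilities over the fold checkpoints
    fold_probs = []
    for path in path_list:
        model = build_model().to(device)
        model.load_state_dict(torch.load(path, map_location=device))
        model.eval()
        probs = []
        with torch.no_grad():
            for input_ids, segment_ids, input_mask, _ in loader:  # assumed batch layout
                logits = model(input_ids.to(device), segment_ids.to(device), input_mask.to(device))
                probs.append(torch.softmax(logits, dim=-1).cpu())
        fold_probs.append(torch.cat(probs))
    return torch.stack(fold_probs).mean(dim=0)

# e.g. probs_en = ensemble_predict(PATH_list_en, test_loader_en, build_model)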
Example #15
    def __init__(
            self,
            emb_dim: int = 768,
            dropout_value: float = 0.0,
            aggregation_type: str = "sum",
            bert_type: str = "bert-base-uncased",
            device: Union[torch.device, str] = torch.device("cpu"),
    ):
        """ Bert Embedder that embeds the given instance to BERT embeddings

        Parameters
        ----------
        emb_dim : int
            Embedding dimension
        dropout_value : float
            The amount of dropout to be added after the embedding
        aggregation_type : str
            The kind of aggregation across layers. BERT produces representations from
            different layers; this specifies the strategy for aggregating them.
            One of

            sum
                Sum the representations from all the layers
            average
                Average the representations from all the layers

        bert_type : str
            The kind of BERT embedding to be used

            bert-base-uncased
                12 layer transformer trained on lowercased vocab

            bert-large-uncased
                24 layer transformer trained on lowercased vocab

            bert-base-cased
                12 layer transformer trained on cased vocab

            bert-large-cased
                24 layer transformer trained on cased vocab

            scibert-base-cased
                12 layer transformer trained on scientific documents with a cased general vocab
            scibert-sci-cased
                12 layer transformer trained on scientific documents with a cased scientific vocab

            scibert-base-uncased
                12 layer transformer trained on scientific documents with an uncased general vocab

            scibert-sci-uncased
                12 layer transformer trained on scientific documents with an uncased scientific vocab

        device :  Union[torch.device, str]
            The device on which the model is run.
        """
        super(BertEmbedder, self).__init__()
        self.emb_dim = emb_dim
        self.dropout_value = dropout_value
        self.aggregation_type = aggregation_type
        self.bert_type = bert_type
        if isinstance(device, str):
            self.device = torch.device(device)
        else:
            self.device = device
        self.msg_printer = wasabi.Printer()
        self.allowed_bert_types = [
            "bert-base-uncased",
            "bert-large-uncased",
            "bert-base-cased",
            "bert-large-cased",
            "scibert-base-cased",
            "scibert-sci-cased",
            "scibert-base-uncased",
            "scibert-sci-uncased",
        ]
        self.scibert_foldername_mapping = {
            "scibert-base-cased": "scibert_basevocab_cased",
            "scibert-sci-cased": "scibert_scivocab_cased",
            "scibert-base-uncased": "scibert_basevocab_uncased",
            "scibert-sci-uncased": "scibert_scivocab_uncased",
        }
        self.model_type_or_folder_url = None
        self.vocab_type_or_filename = None

        assert self.bert_type in self.allowed_bert_types

        if "scibert" in self.bert_type:
            foldername = self.scibert_foldername_mapping[self.bert_type]
            self.model_type_or_folder_url = os.path.join(
                EMBEDDING_CACHE_DIR, foldername, "weights.tar.gz")
            self.vocab_type_or_filename = os.path.join(EMBEDDING_CACHE_DIR,
                                                       foldername, "vocab.txt")
        else:
            self.model_type_or_folder_url = self.bert_type
            self.vocab_type_or_filename = self.bert_type

        # load the bert model
        with self.msg_printer.loading(" Loading Bert tokenizer and model. "):
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                self.vocab_type_or_filename)
            self.model = BertModel.from_pretrained(
                self.model_type_or_folder_url)
            self.model.eval()
            self.model.to(self.device)

        self.msg_printer.good(
            f"Finished Loading {self.bert_type} model and tokenizer")
Example #16
import gc

import uvicorn
from fastai.text import *
from fastai.vision import *
from fastapi import FastAPI
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from sklearn.model_selection import train_test_split
from starlette.middleware.cors import CORSMiddleware
from starlette.requests import Request
from starlette.responses import HTMLResponse, JSONResponse
from starlette.responses import Response
from starlette.staticfiles import StaticFiles

gc.collect()
bert_tok = BertTokenizer.from_pretrained("bert-base-uncased")


class FastAiBertTokenizer(BaseTokenizer):
    """Wrapper around BertTokenizer to be compatible with fast.ai"""
    def __init__(self,
                 tokenizer: BertTokenizer,
                 max_seq_len: int = 128,
                 **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        return self

    def tokenizer(self, t: str) -> List[str]:
        """Limits the maximum sequence length"""
Example #17
def report_on_stdin(args):
    """Runs a trained structural probe on sentences piped to stdin.

    Sentences should be space-tokenized.
    A single distance image and depth image will be printed for each line of stdin.

    Args:
    args: the yaml config dictionary
    """

    # Define the BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    model = BertModel.from_pretrained('bert-large-cased')
    LAYER_COUNT = 24
    FEATURE_COUNT = 1024
    model.to(args['device'])
    model.eval()

    # Define the distance probe
    distance_probe = probe.TwoWordPSDProbe(args)
    distance_probe.load_state_dict(torch.load(args['probe']['distance_params_path'], map_location=args['device']))

    # Define the depth probe
    depth_probe = probe.OneWordPSDProbe(args)
    depth_probe.load_state_dict(torch.load(args['probe']['depth_params_path'], map_location=args['device']))

    for index, line in tqdm(enumerate(sys.stdin), desc='[demoing]'):
        # Tokenize the sentence and create tensor inputs to BERT
        untokenized_sent = line.strip().split()
        tokenized_sent = tokenizer.wordpiece_tokenizer.tokenize('[CLS] ' + ' '.join(line.strip().split()) + ' [SEP]')
        untok_tok_mapping = data.SubwordDataset.match_tokenized_to_untokenized(tokenized_sent, untokenized_sent)

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_sent)
        segment_ids = [1 for x in tokenized_sent]

        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segment_ids])

        tokens_tensor = tokens_tensor.to(args['device'])
        segments_tensors = segments_tensors.to(args['device'])


        with torch.no_grad():
            # Run sentence tensor through BERT after averaging subwords for each token
            print('no gradient area')
            print('tokens & segments')
            print(tokens_tensor.shape, segments_tensors.shape)
            encoded_layers, _ = model(tokens_tensor, segments_tensors)
            print('num layers & each layer')
            print(len(encoded_layers), encoded_layers[0].shape)
            single_layer_features = encoded_layers[args['model']['model_layer']]
            representation = torch.stack([torch.mean(single_layer_features[0, untok_tok_mapping[i][0]:untok_tok_mapping[i][-1] + 1, :], dim=0) for i in range(len(untokenized_sent))], dim=0)
            representation = representation.view(1, *representation.size())

            # Run BERT token vectors through the trained probes
            distance_predictions = distance_probe(representation.to(args['device'])).detach().cpu()[0][:len(untokenized_sent), :len(untokenized_sent)].numpy()
            depth_predictions = depth_probe(representation).detach().cpu()[0][:len(untokenized_sent)].numpy()

            print(distance_predictions)
            print(depth_predictions)
            # Print results visualizations
            print_distance_image(args, untokenized_sent, distance_predictions, index)
            print_depth_image(args, untokenized_sent, depth_predictions, index)

            predicted_edges = reporter.prims_matrix_to_edges(distance_predictions, untokenized_sent, untokenized_sent)
Example #18
def main():
    #parse arguments
    config.parse()
    args = config.args
    for k, v in vars(args).items():
        logger.info(f"{k}:{v}")
    #set seeds
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)

    #arguments check
    device, n_gpu = args_check(args)
    os.makedirs(args.output_dir, exist_ok=True)
    forward_batch_size = int(args.train_batch_size /
                             args.gradient_accumulation_steps)
    args.forward_batch_size = forward_batch_size

    #load bert config
    bert_config_S = BertConfig.from_json_file(args.bert_config_file_S)
    assert args.max_seq_length <= bert_config_S.max_position_embeddings

    #Prepare GLUE task
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    #read data
    train_dataset = None
    eval_datasets = None
    num_train_steps = None
    tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                              do_lower_case=args.do_lower_case)
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        if args.aux_task_name:
            aux_train_dataset = load_and_cache_examples(args,
                                                        args.aux_task_name,
                                                        tokenizer,
                                                        evaluate=False,
                                                        is_aux=True)
            train_dataset = torch.utils.data.ConcatDataset(
                [train_dataset, aux_train_dataset])
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size) * args.num_train_epochs
    if args.do_predict:
        eval_datasets = []
        eval_task_names = ("mnli",
                           "mnli-mm") if args.task_name == "mnli" else (
                               args.task_name, )
        for eval_task in eval_task_names:
            eval_datasets.append(
                load_and_cache_examples(args,
                                        eval_task,
                                        tokenizer,
                                        evaluate=True))
    logger.info("Data loaded")

    #Build Model and load checkpoint
    model_S = BertForGLUESimple(bert_config_S,
                                num_labels=num_labels,
                                args=args)
    #Load student
    if args.load_model_type == 'bert':
        assert args.init_checkpoint_S is not None
        state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu')
        if args.only_load_embedding:
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items()
                if k.startswith('bert.embeddings')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            logger.info(f"Missing keys {list(missing_keys)}")
        else:
            state_weight = {
                k[5:]: v
                for k, v in state_dict_S.items() if k.startswith('bert.')
            }
            missing_keys, _ = model_S.bert.load_state_dict(state_weight,
                                                           strict=False)
            assert len(missing_keys) == 0
        logger.info("Model loaded")
    elif args.load_model_type == 'all':
        assert args.tuned_checkpoint_S is not None
        state_dict_S = torch.load(args.tuned_checkpoint_S, map_location='cpu')
        model_S.load_state_dict(state_dict_S)
        logger.info("Model loaded")
    else:
        logger.info("Model is randomly initialized.")
    model_S.to(device)

    if args.local_rank != -1 or n_gpu > 1:
        if args.local_rank != -1:
            raise NotImplementedError
        elif n_gpu > 1:
            model_S = torch.nn.DataParallel(model_S)  #,output_device=n_gpu-1)

    if args.do_train:
        #parameters
        params = list(model_S.named_parameters())
        all_trainable_params = divide_parameters(params, lr=args.learning_rate)
        logger.info("Length of all_trainable_params: %d",
                    len(all_trainable_params))

        optimizer = BERTAdam(all_trainable_params,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps,
                             schedule=args.schedule,
                             s_opt1=args.s_opt1,
                             s_opt2=args.s_opt2,
                             s_opt3=args.s_opt3)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Forward batch size = %d", forward_batch_size)
        logger.info("  Num backward steps = %d", num_train_steps)

        ########### DISTILLATION ###########
        train_config = TrainingConfig(
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            ckpt_frequency=args.ckpt_frequency,
            log_dir=args.output_dir,
            output_dir=args.output_dir,
            device=args.device)

        distiller = BasicTrainer(train_config=train_config,
                                 model=model_S,
                                 adaptor=BertForGLUESimpleAdaptorTraining)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            raise NotImplementedError
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.forward_batch_size,
                                      drop_last=True)
        callback_func = partial(predict,
                                eval_datasets=eval_datasets,
                                args=args)
        with distiller:
            distiller.train(optimizer,
                            scheduler=None,
                            dataloader=train_dataloader,
                            num_epochs=args.num_train_epochs,
                            callback=callback_func)

    if not args.do_train and args.do_predict:
        res = predict(model_S, eval_datasets, step=0, args=args)
        print(res)
Example #19
 def __init__(self, model_name):
     self.tokenizer = BertTokenizer.from_pretrained(model_name)
     self.model = BertModel.from_pretrained(model_name).eval()
     self.model.cuda()
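
A minimal sketch (hypothetical method, not in the original; assumes torch is imported) of encoding a sentence with this wrapper:

 def embed(self, text: str):
     # Mean-pool the final BERT layer as a sentence vector
     tokens = self.tokenizer.tokenize("[CLS] " + text + " [SEP]")
     ids = torch.tensor([self.tokenizer.convert_tokens_to_ids(tokens)]).cuda()
     with torch.no_grad():
         encoded_layers, _ = self.model(ids)
     return encoded_layers[-1].mean(dim=1)  # shape: (1, hidden_size)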
Example #20
"""
@author: peterawest
"""

import random
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
import numpy as np
import math
import os

import time

from word_embeddings import BERT_word_embedding

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')
bert.eval()
bert.to('cuda')
sig = torch.nn.Sigmoid()

from torch import nn
MAX_LENGTH = 100
sm_1 = torch.nn.Softmax(dim=1)
sm = torch.nn.Softmax()

max_posts = 1000


# calculates position embeddings given seq_len and n_dim
def position_embeddings(n_dim, seq_len):
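    # Plausible completion (assumption; the snippet is truncated here):
    # standard sinusoidal position embeddings from "Attention Is All You Need"
    pe = np.zeros((seq_len, n_dim))
    position = np.arange(seq_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, n_dim, 2) * (-math.log(10000.0) / n_dim))
    pe[:, 0::2] = np.sin(position * div_term)
    pe[:, 1::2] = np.cos(position * div_term)  # assumes n_dim is even
    return torch.tensor(pe, dtype=torch.float)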
Example #21
def main(args):
    # process_mrc_example()
    if not exists(TEST_DIR):
        os.makedirs(TEST_DIR)
    if not exists(os.path.join(TEST_DIR, "reference")):
        os.makedirs(os.path.join(TEST_DIR, "reference"))
    if not exists(os.path.join(TEST_DIR, "decoded")):
        os.makedirs(os.path.join(TEST_DIR, "decoded"))

    meta = json.load(open(join(DATA_DIR, 'meta.json')))
    nargs = meta['net_args']
    ckpt = load_best_ckpt(DATA_DIR)
    net = BertReader(**nargs)
    net.load_state_dict(ckpt)
    if args.cuda:
        net = net.to('cuda')
    net.eval()
    tokenizer = BertTokenizer.from_pretrained('./MRC_pretrain')
    count = 0
    bulids = []
    answers = []
    with torch.no_grad():
        for index in range(197):

            with open(
                    join(join(MRC_DIR, '200_sample'),
                         '{}.json'.format(index))) as f:
                js_data = json.load(f)
                print('loading: {}'.format(index))
                question, question_length, text, text_length, answer_span, text_tok, original_text = (
                    js_data['question'], js_data['question_length'],
                    js_data['text'], js_data['text_length'],
                    js_data['answer_span'], js_data['text_tok'],
                    js_data['original_text'])
            if (question_length + text_length <= 512):
                concat_text = question + text
                token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize(
                    [concat_text], args.cuda)
                question_lengths = torch.tensor([question_length])
                question_lengths = question_lengths.cuda()

                text_lengths = torch.tensor([text_length])
                text_lengths = text_lengths.cuda()

                fw_args = (token_tensor, segment_tensor, mask_tensor,
                           question_lengths, text_lengths)
                net_out = net(*fw_args)

                net_out = torch.squeeze(net_out)
                net_out = net_out[question_length:question_length +
                                  text_length]
                leng = net_out.size(0)
                # Binarize per-token scores at 0.5; if nothing is selected,
                # retry with a much lower threshold.
                propuse = [1 if net_out[i].item() > 0.5 else 0 for i in range(leng)]
                if 1 not in propuse:
                    propuse = [1 if net_out[i].item() > 1e-4 else 0 for i in range(leng)]
                bulid = []
                output = ''

                for t in range(len(propuse)):
                    if propuse[t] == 1:
                        bulid.append(text[t])
                        output += text_tok[t] if text_tok[t] != '[UNK]' else ''
                output = output.replace('##', '')
                print(output)
                bulid = [str(x) for x in bulid]
                answer_index = answer_span.index(1)
                one = answer_span.count(1)  # number of tokens in the gold answer span
                print(one)
                answer = text[answer_index:answer_index + one]

                # answer = text[answer_index:answer_index + len(answer_span)]
                answer = [str(x) for x in answer]
                answers.append(answer)
                with open(
                        join(os.path.join(TEST_DIR, "decoded"),
                             "%d_decoded.txt" % index), 'w') as f:
                    # for i, item in enumerate(bulids):

                    f.write(' '.join(bulid))

                with open(
                        join(os.path.join(TEST_DIR, "reference"),
                             "%d_reference.txt" % index),
                        'w',
                ) as f:
                    # for i, item in enumerate(answers):
                    #   print(item)
                    f.write(' '.join(answer))

            else:
                # Long input: slide a 412-token window over the text with a
                # stride of 312 and collect candidate sub-texts, then let a
                # matcher pick the best window below.
                sp = 0
                ep = 412
                sub_text_arr = []
                sub_text_length_arr = []
                start_index = []
                while (True):
                    if (ep >= text_length and sp < text_length):

                        sub_text = text[sp:text_length]
                        sub_text_arr.append(sub_text)

                        sub_text_length = text_length - sp
                        sub_text_length_arr.append(sub_text_length)
                        start_index.append(sp)

                        assert question_length + text_length - sp <= 512
                        sp += 312
                        ep += 312

                    else:
                        if (ep > text_length):
                            break
                        else:
                            sub_text = text[sp:ep]
                            sub_text_arr.append(sub_text)

                            sub_text_length = ep - sp
                            sub_text_length_arr.append(sub_text_length)
                            start_index.append(sp)

                            assert question_length + ep - sp <= 512

                            sp += 312
                            ep += 312

                # NOTE: the matcher model is re-loaded for every long example;
                # hoisting this block out of the loop would avoid repeated loads.
                meta_s = json.load(open(join('matcher', 'meta.json')))
                nargs_s = meta_s['net_args']
                ckpt_s = load_best_ckpt('matcher')
                net_s = BertMatcher(**nargs_s)
                net_s.load_state_dict(ckpt_s)
                if args.cuda:
                    net_s = net_s.cuda()
                net_s.eval()
                with torch.no_grad():
                    highest_score = 0.0
                    current = -1
                    for i in range(len(sub_text_arr)):
                        concat_text = question + sub_text_arr[i]
                        token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize(
                            [concat_text], args.cuda)
                        fw_args = (token_tensor, segment_tensor, mask_tensor)
                        net_out = net_s(*fw_args)
                        score = net_out[0][0].item()
                        # Keep only the highest-scoring window.
                        if score > highest_score:
                            highest_score = score
                            current = i
                used_text = sub_text_arr[current]

                propuse = [0] * text_length

                concat_text = question + used_text

                token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize(
                    [concat_text], args.cuda)
                question_lengths = torch.tensor([question_length])
                question_lengths = question_lengths.cuda()

                text_lengths = torch.tensor([sub_text_length_arr[current]])
                text_lengths = text_lengths.cuda()

                fw_args = (token_tensor, segment_tensor, mask_tensor,
                           question_lengths, text_lengths)
                net_out = net(*fw_args)

                net_out = torch.squeeze(net_out)
                net_out = net_out[question_length:question_length +
                                  text_length]
                leng = net_out.size(0)

                for ga in range(leng):
                    if net_out[ga].item() > 0.5:
                        propuse[ga + start_index[current]] = 1

                if 1 not in propuse:
                    for ga in range(leng):
                        if net_out[ga].item() > 1e-4:
                            propuse[ga + start_index[current]] = 1

                bulid = []
                output = ''
                for t in range(len(propuse)):
                    if propuse[t] == 1:
                        bulid.append(text[t])
                        output += text_tok[t] if text_tok[t] != '[UNK]' else ''
                output = output.replace('##', '')
                print(output)

                bulid = [str(x) for x in bulid]
                answer_index = answer_span.index(1)
                one = answer_span.count(1)  # number of tokens in the gold answer span
                print(one)
                answer = text[answer_index:answer_index + one]
                # answer = text[answer_index:answer_index + len(answer_span)]
                answer = [str(x) for x in answer]
                answers.append(answer)
                with open(
                        join(os.path.join(TEST_DIR, "decoded"),
                             "%d_decoded.txt" % index), 'w') as f:
                    # for i, item in enumerate(bulids):

                    f.write(' '.join(bulid))

                with open(
                        join(os.path.join(TEST_DIR, "reference"),
                             "%d_reference.txt" % index),
                        'w',
                ) as f:
                    # for i, item in enumerate(answers):
                    #   print(item)
                    f.write(' '.join(answer))

    r = pyrouge.Rouge155('/home/wanglihan/ROUGE/RELEASE-1.5.5/')
    r.model_filename_pattern = '#ID#_reference.txt'
    r.system_filename_pattern = r'(\d+)_decoded.txt'
    r.model_dir = os.path.join(TEST_DIR, "reference")
    r.system_dir = os.path.join(TEST_DIR, "decoded")
    rouge_results = r.convert_and_evaluate(
        '/home/wanglihan/ROUGE/RELEASE-1.5.5/')
    print(rouge_results)
Example #22
    gradient_accumulation_steps = 1
    train_batch_size = 32
    eval_batch_size = 128
    train_batch_size = train_batch_size // gradient_accumulation_steps
    output_dir = OutputDir
    num_train_epochs = NUMofEPOCH
    num_train_optimization_steps = int(
        len(TrainExamples) / train_batch_size /
        gradient_accumulation_steps) * num_train_epochs
    cache_dir = CacheDir
    learning_rate = LearningRate
    warmup_proportion = 0.1
    max_seq_length = MAXSEQLEN

    # Load model
    tokenizer = BertTokenizer.from_pretrained(BERTModel)
    Model = BertForSequenceClassification.from_pretrained(
        BERTModel, cache_dir=cache_dir, num_labels=len(LabelList))
    Model.to(device)
    if n_gpu > 1:
        Model = torch.nn.DataParallel(Model)

    # Load a trained model and config that you have fine-tuned
    # tokenizer = BertTokenizer.from_pretrained(BERTModel)
    # config = BertConfig(load_config_file)
    # Model = BertForSequenceClassification(config, num_labels = len(LabelList))
    # Model.load_state_dict(torch.load(load_model_file))
    # Model.to(device)  # important: move the model to the specified device
    # if n_gpu > 1:
    # 	Model = torch.nn.DataParallel(Model)
Example #23
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
labels = df.leaning.values
for i in range(len(labels)):
    if labels[i] == 'right':
        labels[i] = 0
    if labels[i] == 'left':
        labels[i] = 1

labels = np.array(labels, dtype=np.int64)
print(labels.dtype)
"""## Inputs

Next, import the BERT tokenizer, used to convert our text into tokens that correspond to BERT's vocabulary.
"""

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print("Tokenize the first sentence:")
print(tokenized_texts[0])
"""BERT requires specifically formatted inputs. For each tokenized input sentence, we need to create:

- **input ids**: a sequence of integers identifying each input token to its index number in the BERT tokenizer vocabulary
- **segment mask**: (optional) a sequence of 1s and 0s used to identify whether the input is one sentence or two sentences long. For one sentence inputs, this is simply a sequence of 0s. For two sentence inputs, there is a 0 for each token of the first sentence, followed by a 1 for each token of the second sentence
- **attention mask**: (optional) a sequence of 1s and 0s, with 1s for all input tokens and 0s for all padding tokens (we'll detail this in the next paragraph)
- **labels**: a single value of 1 or 0. In our task 1 means "left" and 0 means "right", matching the leaning labels built above

Although we can have variable-length input sentences, BERT does require our input arrays to be the same size. We address this by first choosing a maximum sentence length, and then padding and truncating our inputs until every input sequence is of the same length.

To "pad" our inputs in this context means that if a sentence is shorter than the maximum sentence length, we simply add 0s to the end of the sequence until it reaches the maximum sentence length. A short sketch of this step follows.
"""
Example #24
import nltk
import numpy as np
import os
import random
import string
import torch
from fitbert import FitBert
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM
from tqdm import tqdm
from utils import color_print_top_words

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Initialize BERT vocabulary...')
bert_tokenizer = BertTokenizer(vocab_file='data/BERT_model_reddit/vocab.txt')
print('Initialize BERT model...')
bert_model = BertForMaskedLM.from_pretrained('data/BERT_model_reddit').to(
    device)
bert_model.eval()


def MLM(sgs, drug_formal, thres=1, skip_flag=1):
    def to_bert_input(tokens, bert_tokenizer):
        token_idx = torch.tensor(bert_tokenizer.convert_tokens_to_ids(tokens))
        sep_idx = tokens.index('[SEP]')
        segment_idx = token_idx * 0
        segment_idx[(sep_idx + 1):] = 1
        mask = (token_idx != 0)
        return token_idx.unsqueeze(0).to(device), segment_idx.unsqueeze(0).to(
            device), mask.unsqueeze(0).to(device)
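    # The rest of MLM() is truncated in this listing. A hedged sketch of how
    # the helper above is used (the sentence and [MASK] slot are invented for
    # illustration; they are not from the original code):
    tokens = ['[CLS]', 'where', 'to', 'buy', '[MASK]', 'online', '[SEP]']
    token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer)
    with torch.no_grad():
        logits = bert_model(token_idx, segment_idx, mask)  # (1, seq_len, vocab)
    _, top_ids = torch.topk(logits[0, tokens.index('[MASK]')], k=10)
    print(bert_tokenizer.convert_ids_to_tokens(top_ids.tolist()))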
Example #25
def model_go(th_filter_prob=0.2, top_k_sent=5):
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert'
    lazy = False
    # lazy = True
    forward_size = 32
    # batch_size = 64
    # batch_size = 192
    batch_size = 32
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    # schedule_type = 'warmup_constant'
    # 'warmup_cosine': warmup_cosine,
    # 'warmup_constant': warmup_constant,
    # 'warmup_linear': warmup_linear,
    schedule_type = 'warmup_linear'
    learning_rate = 5e-5
    num_train_epochs = 5
    eval_frequency = 4000
    do_lower_case = True
    pair_order = 'cq'
    # debug_mode = True
    debug_mode = False
    do_ema = True

    maxout_model = False
    # est_datasize = 900_000

    num_class = 3
    # num_train_optimization_steps
    top_k = top_k_sent

    train_sent_filtering_prob = th_filter_prob
    dev_sent_filtering_prob = th_filter_prob
    experiment_name = f'fever_v2_nli_th{train_sent_filtering_prob}_tk{top_k}'

    # Load the dataset and upstream sentence-level results.
    dev_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl")
    train_sent_results_list = common.load_jsonl(
        config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl")

    dev_fitems, dev_list = get_nli_pair('dev', is_training=False,
                                        sent_level_results_list=dev_sent_results_list, debug=debug_mode,
                                        sent_top_k=top_k_sent, sent_filter_value=dev_sent_filtering_prob)
    train_fitems, train_list = get_nli_pair('train', is_training=True,
                                            sent_level_results_list=train_sent_results_list, debug=debug_mode,
                                            sent_top_k=top_k_sent, sent_filter_value=train_sent_filtering_prob)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # workaround for initializing the vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace('SUPPORTS', namespace='labels')
    vocab.add_token_to_namespace('REFUTES', namespace='labels')
    vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels')
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels')

    if debug_mode:
        dev_list = dev_list[:100]
        train_list = train_list[:100]
        eval_frequency = 2

    est_datasize = len(train_fitems)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case,
                                                   cache_dir=bert_pretrain_path)
    bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64,
                                        example_filter=None, max_l=384, pair_order=pair_order)

    bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path)
    if not maxout_model:
        model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1,
                                                act_type='tanh', use_pretrained_pooler=True, use_sigmoid=False)
    else:
        model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class, act_type="gelu", num_of_out_layers=2)

    ema = None
    if do_ema:
        ema = EMA(model, model.named_parameters(), device_num=1)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)
    print("Do EMA:", do_ema)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps,
                         schedule=schedule_type)

    dev_instances = bert_cs_reader.read(dev_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    file_path_prefix = '.'
    if not debug_mode:
        file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
        # # # Create Log File
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)

        train_fitems_list, _ = get_nli_pair('train', is_training=True,
                                            sent_level_results_list=train_sent_results_list, debug=debug_mode,
                                            sent_top_k=top_k_sent, sent_filter_value=train_sent_filtering_prob)

        random.shuffle(train_fitems_list)
        train_instance = bert_cs_reader.read(train_fitems_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            if not maxout_model:
                loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                             mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                             labels=labels_ids)
            else:
                loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                             s1_span=s1_span, s2_span=s2_span,
                             mode=BertPairMaxOutMatcher.ForwardMode.TRAIN,
                             labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                if ema is not None and do_ema:
                    updated_model = model.module if hasattr(model, 'module') else model
                    ema(updated_model.named_parameters())
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    # dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
                    #
                    # cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True,
                    #                                    feed_input_span=maxout_model)
                    #
                    # ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
                    # copied_dev_list = copy.deepcopy(dev_list)
                    # list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                    #                                                   'id', 'predicted_label')
                    #
                    # mode = {'standard': True}
                    # strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                    #                                                                 mode=mode, max_evidence=5)
                    # logging_item = {
                    #     'ss': strict_score, 'ac': acc_score,
                    #     'pr': pr, 'rec': rec, 'f1': f1,
                    # }
                    #
                    # if not debug_mode:
                    #     save_file_name = f'i({update_step})|e({epoch_i})' \
                    #         f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \
                    #         f'|seed({seed})'
                    #
                    #     common.save_jsonl(copied_dev_list, Path(file_path_prefix) /
                    #                       f"{save_file_name}_dev_nli_results.json")
                    #
                    #     # print(save_file_name)
                    #     logging_agent.incorporate_results({}, save_file_name, logging_item)
                    #     logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")
                    #
                    #     model_to_save = model.module if hasattr(model, 'module') else model
                    #     output_model_file = Path(file_path_prefix) / save_file_name
                    #     torch.save(model_to_save.state_dict(), str(output_model_file))

                    if do_ema and ema is not None:
                        ema_model = ema.get_inference_model()
                        ema_device_num = 0
                        ema_model = ema_model.to(device)
                        ema_model = torch.nn.DataParallel(ema_model)
                        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
                        cur_ema_eval_results_list = eval_model(ema_model, dev_iter, ema_device_num, with_probs=True,
                                                               make_int=True,
                                                               feed_input_span=maxout_model)

                        ema_results_dict = list_dict_data_tool.list_to_dict(cur_ema_eval_results_list, 'oid')
                        copied_dev_list = copy.deepcopy(dev_list)
                        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                                          'id', 'predicted_label')

                        mode = {'standard': True}
                        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                                        mode=mode, max_evidence=5)
                        ema_logging_item = {
                            'label': 'ema',
                            'ss': strict_score, 'ac': acc_score,
                            'pr': pr, 'rec': rec, 'f1': f1,
                        }

                        if not debug_mode:
                            save_file_name = f'ema_i({update_step})|e({epoch_i})' \
                                f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \
                                f'|seed({seed})'

                            common.save_jsonl(copied_dev_list, Path(file_path_prefix) /
                                              f"{save_file_name}_dev_nli_results.json")

                            # print(save_file_name)
                            logging_agent.incorporate_results({}, save_file_name, ema_logging_item)
                            logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                            model_to_save = ema_model.module if hasattr(ema_model, 'module') else ema_model
                            output_model_file = Path(file_path_prefix) / save_file_name
                            torch.save(model_to_save.state_dict(), str(output_model_file))
Example #26
import sys
import pickle
import random

from pytorch_pretrained_bert import BertTokenizer  # Use BertTokenizer for tokenization

random.seed(2018)

tokenizer = BertTokenizer.from_pretrained(
    '../../data/bert-base-chinese-vocab.txt'
)  # Initialize the tokenizer from a local vocab file


def load_cluster_data(if_union=False):
    # if_union=False: keep only the data where both annotations agree;
    # if_union=True: take the union of both annotations.
    if not if_union:
        file = open('../../data/pickle/clusters_separation.pickle', 'rb')
    else:
        file = open('../../data/pickle/clusters_separation_union.pickle', 'rb')
    training_clusters, validation_clusters, test_clusters = pickle.load(file)
    return training_clusters, validation_clusters, test_clusters


def data_generator(training_clusters,
                   validation_clusters,
                   test_clusters,
                   if_union=False):

    cluster2pair(training_clusters, 'training', if_union)
    cluster2pair(validation_clusters, 'validation', if_union)
    cluster2pair(test_clusters, 'test', if_union)
Example #27
    if col2[i] == []:
        del col1[i]
        del col2[i]

#print (len(df1))

# Convert to the tokenization scheme supported by BERT

labels = col2

MAX_LEN = 64

device = torch.device("cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
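        # (The listing is truncated here; the two lines below complete the
        # snippet exactly as the comment above describes.)
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels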
Example #28

# Split the training set
train1_data, train2_data = train_test_split(train_data,
                                            test_size=0.1,
                                            random_state=1)
trainloader1 = torch.utils.data.DataLoader(dataset=MyDataset(
    train1_data, subject_data, stockname_data),
                                           batch_size=BS,
                                           shuffle=True,
                                           collate_fn=collate_fn_link)

for num_words in [num_words_]:
    for max_len in [max_len_]:
        for embedding_name in [Bert_name]:  # ['roberta_wwm','wwm','ernie']
            bert_path = './Bert/' + embedding_name + '/'
            dataset.tokenizer = BertTokenizer.from_pretrained(bert_path +
                                                              'vocab.txt')
            dataset.BERT = BertModel.from_pretrained(bert_path).to(device)
            dataset.BERT.eval()
            dataset.max_len = max_len
            for loss_weight in [loss_weight_]:
                accu_ = 0
                while accu_ < k:
                    # vocab_size includes pad and unknown tokens, hence len(word_index) + 2
                    model = Net(vocab_size=len(word_index) + 2,
                                embedding_dim=EMBEDDING_DIM,
                                num_layers=num_layers,
                                hidden_dim=hidden_dim,
                                embedding=embedding,
                                device=device).to(device)

                    optimizer = optim.Adam(model.parameters(), lr=lr)
Example #29
    def __init__(self, config: dict):
        super().__init__(config)
        self.save_treshold = 0.55
        self.modeltype = config["variant"]
        self.tokenizer = BertTokenizer.from_pretrained(
            self.modeltype, cache_dir="./.BERTcache", do_lower_case=True)
Example #30
def impute():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    id_list = []
    labels = []
    texts = []
    predict_texts = []
    with open('masked_f1.tsv', 'r') as f:
        read_tsv = csv.reader(f, delimiter="\t")
        for row in read_tsv:
            id_list.append(row[0])
            labels.append(row[1])
            texts.append(row[3])

    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()

    for i in range(len(texts)):
        repeat_flag = True
        next_predict_text = texts[i]
        while repeat_flag:
            repeat_flag = False
            # if i % 100 == 0:
            # 	print("Now: ", i/len(texts))
            text = next_predict_text
            # Keep at most the first 290 whitespace-separated words.
            words = text.split()[:290]
            texts[i] = " ".join(words) + " "
            text = texts[i]
            # print(text)
            tokenized_text = tokenizer.tokenize(text)
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

            # Create the segments tensors.
            segments_ids = [0] * len(tokenized_text)

            # Convert inputs to PyTorch tensors
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])

            # Predict all tokens
            with torch.no_grad():
                predictions = model(tokens_tensor, segments_tensors)
            if '[MASK]' not in tokenized_text:
                indices = []
                prev_sent_indices = []
            else:
                indices = [
                    p for p, x in enumerate(tokenized_text) if x == '[MASK]'
                ]
                prev_sent_indices = [
                    q for q, x in enumerate(text.split()) if x == '[MASK]'
                ]

            # print(predictions)
            # print(indices)
            # print("Previous:\n%s" %(text))
            # print(tokenized_text)

            last_index = -2  # sentinel: the first mask can never look consecutive
            predict_result = []
            for each_index in indices:
                # Consecutive [MASK] tokens are deferred to the next pass.
                if last_index + 1 != each_index:
                    # predicted_index = torch.argmax(predictions[0, each_index]).item()
                    # print(predictions[0, masked_index])
                    sort_result = torch.sort(predictions[0, each_index])[1]
                    final_result = []
                    for j in range(20):
                        curr_item = tokenizer.convert_ids_to_tokens(
                            [sort_result[-j - 1].item()])
                        # Skip punctuation-only predictions.
                        if curr_item[0] not in ['.', ',', '-', ';', '?', '!', '|']:
                            final_result += [curr_item]
                    # print(tokenizer.convert_ids_to_tokens([sort_result[-j-1].item()]))
                    # predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
                    predict_result += [final_result[0]]
                else:
                    repeat_flag = True
                    predict_result += [['[MASK]']]
                last_index = each_index

            if not repeat_flag:
                words = text.split()
                result = ""

                for k in range(len(words)):
                    if words[k] == '[CLS]' or words[k] == '[SEP]':
                        continue
                    elif k not in prev_sent_indices:
                        result += words[k]
                    else:
                        # print(predict_result[indices.index(k)])
                        result += predict_result[prev_sent_indices.index(
                            k)][0].upper()
                    result += ' '
                # print("After: \n%s" %(result))
                predict_texts.append(result)
                # print("DONE///////////")
                # print(result)
                # print('///////////////')
            else:
                words = text.split()
                result = ""
                for k in range(len(words)):
                    if k not in prev_sent_indices:
                        result += words[k]
                    else:
                        # print(predict_result[indices.index(k)])
                        result += predict_result[prev_sent_indices.index(
                            k)][0].upper()
                    result += ' '
                # print("Next predict: " + result)
                next_predict_text = result

    df_bert = pd.DataFrame({
        'id': id_list,
        'label': labels,
        'alpha': ['a'] * len(predict_texts),
        'text': predict_texts
    })

    df_bert.to_csv('bert_test_f1_seq.tsv', sep='\t', index=False, header=False)
Example #31

from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
## Download links for the pretrained weights can be found below:
# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py
# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)
home = os.getenv('HOME')

##################################################################
## BERT
##################################################################
## BertTokenizer
tokenizer = BertTokenizer.from_pretrained(home + '/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased-vocab.txt')  # Load pre-trained model tokenizer (vocabulary)
print(tokenizer.max_len)  # 1000000000000 (effectively unlimited); 512 for non-large models
print(len(tokenizer.vocab))  # 30522; words
print(type(tokenizer.vocab))  # <class 'collections.OrderedDict'>
print(tokenizer.vocab.get('hello', 0))  # 7592
print(tokenizer.vocab.get('helloworld', 0))  # 0
print(tokenizer.ids_to_tokens.get(7592, 'hello'))  # hello
print(tokenizer.ids_to_tokens.get(75920, 'hello'))  # hello (the default: id 75920 is out of vocabulary)
print(tokenizer.convert_ids_to_tokens([0, 1, 99, 100, 101, 102, 103, 104, 998, 999]))  # ['[PAD]', '[unused0]', '[unused98]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '[unused99]', '[unused993]', '!']
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)  # Tokenized input
print(tokenized_text)  # ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', 'henson', 'was', 'a', 'puppet', '##eer', '[SEP]']

## Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
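
# The listing is truncated here. A hedged sketch of the canonical next step
# (predicting the masked token with BertForMaskedLM); the model name below is
# an assumption chosen to match the large-uncased vocab loaded above.
import torch
from pytorch_pretrained_bert import BertForMaskedLM

indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0] * 7 + [1] * (len(tokenized_text) - 7)  # sentence A ends at the first [SEP]
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

model = BertForMaskedLM.from_pretrained('bert-large-uncased')
model.eval()
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)
predicted_index = torch.argmax(predictions[0, masked_index]).item()
print(tokenizer.convert_ids_to_tokens([predicted_index]))  # should recover 'henson'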
Example #32
class Generater:
    def __init__(self, bert_path):
        vocab_file_name = 'vocab.txt'
        # Load Juman so Japanese text can be segmented before feeding it to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the pre-trained BERT model
        self.model = BertModel.from_pretrained(bert_path)
        # Load the pre-trained BERT model's tokenizer
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False, do_basic_tokenize=False)
        self.vocab_size = len(self.bert_tokenizer.vocab)

        # Load the pre-trained BERT MaskedLM model
        # (note: this overwrites the plain BertModel loaded above)
        self.model = BertForMaskedLM.from_pretrained(bert_path)

        # Header-like and special tokens to exclude
        except_tokens = ["[MASK]",
                         # "[PAD]",
                         "[UNK]", "[CLS]", "[SEP]",
                         "(", ")", "・", "/", "、", "。", "!", "?", "「", "」", "…", "’", "』", "『", ":", "※"
                         ]
        self.except_ids = [self.bert_tokenizer.vocab[token] for token in except_tokens]

        # Out of vocab_size, every id except the excluded ones is usable
        self.candidate_ids = [i for i in range(self.vocab_size)
                              if i not in self.except_ids]


    def _preprocess_text(self, text):
        # Preprocessing: remove half-width spaces (and '#') from the text
        return text.replace(" ", "").replace('#', '')  # for Juman

    def text2tokens(self, text):
        # Remove half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # Segment the Japanese text into a list of tokens
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with spaces and run WordPiece tokenization
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # The sequence is capped at 128 tokens, so build [CLS] + up to 126 tokens + [SEP],
        # then convert the tokens to ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        generated_token_ids = torch.tensor(ids).reshape(1, -1)
        return generated_token_ids

    def tokens2text(self, tokens):
        sampled_sequence = [self.bert_tokenizer.ids_to_tokens[token_id]
                                        for token_id in tokens[0].cpu().numpy()]
        sampled_sequence = "".join(
            [
                token[2:] if token.startswith("##") else token
                for token in list(filter(lambda x: x != '[PAD]' and x != '[CLS]' and x != '[SEP]', sampled_sequence))
            ]
        )
        return sampled_sequence


    def likelihood(self, tokens):
        outputs = self.model(tokens)
        predictions = outputs[0]

        score_sum = 0.0
        for idx, scores in zip(tokens[0].tolist(), predictions[0].tolist()):
            score_sum += scores[idx]
        return score_sum

    def initialization_text(self, length=10):
        init_tokens = []
        # Header
        init_tokens.append(self.bert_tokenizer.vocab["[CLS]"])
        for _ in range(length):
            # Pick a random candidate token
            init_tokens.append(random.choice(self.candidate_ids))
        # Footer
        init_tokens.append(self.bert_tokenizer.vocab["[SEP]"])

        return torch.tensor(init_tokens).reshape(1, -1)

    def scoring(self, tokens):
        return self.likelihood(tokens) + self.juman_tokenizer.tanka_score_subsets(self.tokens2text(tokens)) + self.juman_tokenizer.tanka_score_flow(self.tokens2text(tokens))

    def select(self, l_tokens, size=5):
        scores = list(map(self.scoring, l_tokens))
        print(sorted(scores, reverse=True)[:3])
        selected = list(map(
            lambda x: x[0],
            sorted(
                list(zip(l_tokens, scores)), 
                key=lambda x: x[1],
                reverse=True
            )
        ))

        return selected

    def crossover(self, tokens_0, tokens_1):
        l_tokens_0 = tokens_0.numpy().reshape(-1).tolist()
        l_tokens_1 = tokens_1.numpy().reshape(-1).tolist()

        start = random.randint(1, len(l_tokens_0) - 3)
        end = random.randint(start, len(l_tokens_0) - 2)

        for num in range(start, end):
            l_tokens_0[num] = l_tokens_1[num]

        return torch.tensor(l_tokens_0).reshape(1, -1)

    def mutation(self, tokens, N=3):
        l_tokens = tokens.numpy().reshape(-1).tolist()

        for num in range(N):
            num = random.randint(1, len(l_tokens) - 2)
            l_tokens[num] = self.bert_tokenizer.vocab["[MASK]"]
            
            outputs = self.model(torch.tensor(l_tokens).reshape(1, -1))
            predictions = outputs[0]
            _, predicted_indexes = torch.topk(predictions[0, num], k=10)

            # random_tokens = [random.choice(self.candidate_ids) for i in range(1)]
            random_tokens = []

            predicted_indexes = list(
                set(predicted_indexes.tolist() + random_tokens) - set(self.except_ids)
            )

            predicted_tokens = self.bert_tokenizer.convert_ids_to_tokens(predicted_indexes)
            predict_token = random.choice(predicted_indexes)

            l_tokens[num] = predict_token

        return torch.tensor(l_tokens).reshape(1, -1)
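
# A hedged usage sketch of the genetic loop these methods imply; the model
# path, population size, and generation count below are assumptions, not
# values from the original code.
if __name__ == '__main__':
    gen = Generater('./Japanese_L-12_H-768_A-12')
    population = [gen.initialization_text(length=20) for _ in range(10)]
    for _ in range(30):
        pairs = [random.sample(population, 2) for _ in range(10)]
        children = [gen.mutation(gen.crossover(a, b)) for a, b in pairs]
        population = gen.select(population + children)[:10]
    print(gen.tokens2text(population[0]))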