def __init__(self, embedding_dir, model_name="bert-base-multilingual-cased", layer=-2):
    super(BertEncoder, self).__init__(embedding_dir)
    # Load pre-trained model (weights) and set to evaluation mode (no more training)
    self.model = BertModel.from_pretrained(model_name)
    self.model.eval()
    # Load word piece tokenizer
    self.tokenizer = BertTokenizer.from_pretrained(model_name)
    # Layer from which to get the embeddings
    self.layer = layer
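# The class body is not shown beyond __init__; the method below is a
# hypothetical companion sketch (not the original code) showing how embeddings
# could be pulled from the configured layer. With pytorch_pretrained_bert,
# BertModel returns one tensor per encoder layer, so layer=-2 selects the
# second-to-last layer. Assumes `torch` is imported in the surrounding module.
def encode(self, text):
    tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    tokens_tensor = torch.tensor([token_ids])
    with torch.no_grad():
        encoded_layers, _ = self.model(tokens_tensor)  # list of [1, seq_len, hidden] tensors
    return encoded_layers[self.layer].squeeze(0)       # [seq_len, hidden]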
def model_eval_ablation(model_path, filter_value=0.2, top_k_sent=5): bert_model_name = 'bert-base-uncased' bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert' lazy = False forward_size = 32 do_lower_case = True pair_order = 'cq' debug_mode = False maxout_model = False num_class = 3 tag = 'dev' exp = 'no_re_train' print("Filter value:", filter_value) print("top_k_sent:", top_k_sent) train_sent_filtering_prob = 0.2 dev_sent_filtering_prob = filter_value test_sent_filtering_prob = 0.2 # Data dataset and upstream sentence results. dev_sent_results_list = common.load_jsonl( config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl") # train_sent_results_list = common.load_jsonl( # config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl") test_sent_results_list = common.load_jsonl( config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_test_results.jsonl") dev_fitems, dev_list = get_nli_pair('dev', is_training=False, sent_level_results_list=dev_sent_results_list, debug=debug_mode, sent_top_k=top_k_sent, sent_filter_value=dev_sent_filtering_prob) # train_fitems, train_list = get_nli_pair('train', is_training=True, # sent_level_results_list=train_sent_results_list, debug=debug_mode, # sent_top_k=5, sent_filter_value=train_sent_filtering_prob) test_fitems, test_list = get_nli_pair('test', is_training=False, sent_level_results_list=test_sent_results_list, debug=debug_mode, sent_top_k=top_k_sent, sent_filter_value=test_sent_filtering_prob) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device_num = 0 if torch.cuda.is_available() else -1 n_gpu = torch.cuda.device_count() unk_token_num = {'tokens': 1} # work around for initiating vocabulary. 
vocab = ExVocabulary(unk_token_num=unk_token_num) vocab.add_token_to_namespace('SUPPORTS', namespace='labels') vocab.add_token_to_namespace('REFUTES', namespace='labels') vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels') vocab.add_token_to_namespace("hidden", namespace="labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels') if debug_mode: dev_list = dev_list[:100] # train_list = train_list[:100] test_list = test_list[:100] eval_frequency = 2 # est_datasize = len(train_fitems) bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case, cache_dir=bert_pretrain_path) bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64, example_filter=None, max_l=384, pair_order=pair_order) bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path) if not maxout_model: model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1, act_type='tanh', use_pretrained_pooler=True, use_sigmoid=False) else: model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class, act_type="gelu", num_of_out_layers=2) model.load_state_dict(torch.load(model_path)) dev_instances = bert_cs_reader.read(dev_fitems) # train_instances = bert_cs_reader.read(train_fitems) test_instances = bert_cs_reader.read(test_fitems) biterator = BasicIterator(batch_size=forward_size) biterator.index_with(vocab) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) if tag == 'dev': dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False) cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True, feed_input_span=maxout_model, show_progress=True) common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{dev_sent_filtering_prob}_{exp}.jsonl") ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid') copied_dev_list = copy.deepcopy(dev_list) list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict, 'id', 'predicted_label') common.save_jsonl(copied_dev_list, f"nli_{tag}_cp_results_th{dev_sent_filtering_prob}_{exp}.jsonl") mode = {'standard': True} strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list, mode=mode, max_evidence=5) logging_item = { 'ss': strict_score, 'ac': acc_score, 'pr': pr, 'rec': rec, 'f1': f1, } print(logging_item) common.save_json(logging_item, f"nli_th{dev_sent_filtering_prob}_{exp}_ss:{strict_score}_ac:{acc_score}_pr:{pr}_rec:{rec}_f1:{f1}.jsonl") elif tag == 'test': test_iter = biterator(test_instances, num_epochs=1, shuffle=False) cur_eval_results_list = eval_model(model, test_iter, device_num, with_probs=True, make_int=True, feed_input_span=maxout_model, show_progress=True) common.save_jsonl(cur_eval_results_list, f"nli_{tag}_label_results_th{test_sent_filtering_prob}.jsonl") ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid') copied_test_list = copy.deepcopy(test_list) list_dict_data_tool.append_item_from_dict_to_list(copied_test_list, ema_results_dict, 'id', 'predicted_label') common.save_jsonl(copied_test_list, f"nli_{tag}_cp_results_th{test_sent_filtering_prob}.jsonl")
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# Load pre-trained model tokenizer (vocabulary)
modelpath = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(modelpath)

text = "dummy. although he had already eaten a large meal, he was still very hungry."
target = "hungry"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = tokenized_text.index(target)
tokenized_text[masked_index] = '[MASK]'

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [1] * len(tokenized_text)
# this is for the dummy first sentence.
segments_ids[0] = 0
segments_ids[1] = 0

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained(modelpath)
model.eval()

# Predict all tokens
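# The original snippet stops here; what follows is an assumed continuation
# using the same pytorch_pretrained_bert API (in this older library,
# BertForMaskedLM returns the prediction logits directly).
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)  # [1, seq_len, vocab_size]

# Recover the most likely token at the masked position.
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)  # should ideally recover "hungry"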
from decouple import config
import tweepy
import basilica

# Pytorch and BERT
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
mallet_path = '/Users/mattkirby/Social-Analysis/tweet-analysis/mallet-2.0.8/bin/mallet'

# Spacy for lemmatization
import spacy
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

# NLTK
import nltk
from nltk.corpus import stopwords


# Preprocess for BERT
def bert_preprocess(list_of_strings):
    sentences = []
    begin_tag = "[CLS] "
    end_tag = " [SEP]"
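    # The function is cut off in the original; the loop below is an assumed
    # continuation (not the original code): wrap each string with the
    # [CLS]/[SEP] markers BERT expects and return the list.
    for text in list_of_strings:
        sentences.append(begin_tag + str(text) + end_tag)
    return sentences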
from my_utils.log_wrapper import create_logger
from my_utils.tokenizer import END, build_vocab
from my_utils.utils import set_environment
from my_utils.word2vec_utils import load_emb_vocab, build_embedding
from pytorch_pretrained_bert import BertTokenizer

"""
This script preprocesses the SQuAD dataset.
"""

# Turn off
DEBUG_ON = False
DEBUG_SIZE = 2000

NLP = spacy.load('en_core_web_md', disable=['vectors', 'textcat', 'parser'])
BERT_TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased')


def load_data(path, is_train=True, v2_on=False):
    rows = []
    with open(path, encoding="utf8") as f:
        data = json.load(f)['data']
    for article in tqdm.tqdm(data, total=len(data)):
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            if v2_on:
                context = '{} {}'.format(context, END)
            for qa in paragraph['qas']:
                uid, question = qa['id'], qa['question']
                answers = qa.get('answers', [])  # used for v2.0
print("F1 score: ", f1_score_curr, " at threshold: ", threshold) if f1_score_curr > best_f1_score: best_f1_score = f1_score_curr best_threshold = threshold return best_threshold if __name__ == '__main__': # load the dataset data = json.load(open('../data/data_with_cuis.json', 'r')) # concept to cui mappings cui_to_concept, concept_to_cui = concept_cui_mapping(data, aspect) # # create the vocabulary for the input idx_to_cui, cui_to_idx = extract_target_vocab(data, aspect) # Load pre-trained model tokenizer (vocabulary) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', max_len=512) # load label count label_cnt = json.load(open('../data/label_counts.json', 'r')) p_label_cnt = label_cnt['p_label_cnt'] i_label_cnt = label_cnt['i_label_cnt'] o_label_cnt = label_cnt['o_label_cnt'] # Split train and test data train_idx = rd.sample(range(len(data)), int(0.8 * len(data))) test_idx = [i for i in range(len(data)) if i not in train_idx] train_data = [data[i] for i in train_idx] test_data = [data[i] for i in test_idx] val_idx = rd.sample(range(len(train_data)), int(0.1 * len(train_data)))
def process_mrc_example(): csv_reader = csv.reader(open(TRAIN_DIR), delimiter='\t') rows = [row for row in csv_reader] docid_name = rows[0][1] question_name = rows[0][2] answer_name = rows[0][3] json_positive_dirs = join(MRC_DIR, '200_sample') if not exists(json_positive_dirs): os.makedirs(json_positive_dirs) print('Dir used for mrc samples Created ') with open(REALATE_DIR, 'rb') as v: relation_dict = pickle.load(v) sample_rows = rows[:200] tmp_dict = {} count = 0 maxlen = 0 for i, sample_raw in enumerate(sample_rows): if (i == 0): continue else: print('start processing {}'.format(i)) try: new_docid = relation_dict[sample_raw[1]] tmp_dict['new_docid'] = new_docid with open( join(join(MRC_DIR, 'context'), '{}.json'.format(new_docid)), 'rb') as p: context = json.load(p) except KeyError: print('mrc sample {} - related document not found') # tmp_dict[docid_name] = sample_raw[1] tokenizer = BertTokenizer.from_pretrained('./MRC_pretrain') text = context['text'] text_tok = tokenizer.tokenize(text) text_id = tokenizer.convert_tokens_to_ids(text_tok) text_len = len(text_id) question = filter_text(sample_raw[2].replace(' ', '').replace( ' ', '')) ques_tok = tokenizer.tokenize("[CLS] " + question + " [SEP]") ques_id = tokenizer.convert_tokens_to_ids(ques_tok) question_len = len(ques_id) maxlen = question_len if question_len > maxlen else maxlen answer = filter_text(sample_raw[3].replace(' ', '').replace( ' ', '')) ans_tok = tokenizer.tokenize(answer) ans_id = tokenizer.convert_tokens_to_ids(ans_tok) ans_len = len(ans_id) suppose_start = [] #可能的start位置 for i in range(text_len): if (text_id[i] == ans_id[0]): suppose_start.append(i) s = 0 e = 0 if (len(suppose_start) <= 0): continue else: for t in range(len(suppose_start)): start = suppose_start[t] end = suppose_start[t] for m in range(ans_len): if (m + start >= text_len): break elif (ans_id[m] == text_id[m + start]): end += 1 else: break if (end - start != ans_len): continue else: s = suppose_start[t] e = end break if (s == 0 and e == 0): continue else: span_arr = [0] * (s - 0) + [1] * (e - s) + [0] * (text_len - e) assert len(span_arr) == text_len tmp_dict['question'] = ques_id tmp_dict['question_length'] = question_len tmp_dict['text'] = text_id tmp_dict['text_length'] = text_len tmp_dict['answer_span'] = span_arr tmp_dict['text_tok'] = text_tok tmp_dict['original_text'] = text with open(join(json_positive_dirs, '{}.json'.format(count)), 'w', encoding='utf-8') as f: json.dump(tmp_dict, f, ensure_ascii=False) count += 1 # print('sample index larger than 512 is {}'.format(count)) print('Pre-processed {} mrc samples finished'.format(count))
def create_dataloader(self): # 读取输入输出 print("Load data") train_comments = self.train_df["comment_text"].astype(str) train_label = self.train_df["target"].values train_type_labels = self.train_df[self.toxicity_type_list].values # 新的 np 任务 train_np_labels = np.zeros((len(self.train_df), 4)) train_np_identity_labels = np.zeros( (len(self.train_df), len(self.identity_list) * 4)) train_df_copy = self.train_df[self.identity_list + ["target"]] for column in self.identity_list + ["target"]: train_df_copy[column] = np.where(train_df_copy[column] > 0.5, True, False) pp_label_bool = train_df_copy["target"] & np.where( train_df_copy[self.identity_list].sum(axis=1) > 0, True, False) np_label_bool = ~train_df_copy["target"] & np.where( train_df_copy[self.identity_list].sum(axis=1) > 0, True, False) pn_label_bool = train_df_copy["target"] & np.where( (train_df_copy[self.identity_list]).sum(axis=1) == 0, True, False) nn_label_bool = ~train_df_copy["target"] & np.where( (train_df_copy[self.identity_list]).sum(axis=1) == 0, True, False) train_np_labels[:, 0] = np.where(pp_label_bool > 0, 1, 0) train_np_labels[:, 1] = np.where(np_label_bool > 0, 1, 0) train_np_labels[:, 2] = np.where(pn_label_bool > 0, 1, 0) train_np_labels[:, 3] = np.where(nn_label_bool > 0, 1, 0) for i, column in enumerate(self.identity_list): pp_label_bool = train_df_copy["target"] & train_df_copy[column] np_label_bool = ~train_df_copy["target"] & train_df_copy[column] pn_label_bool = train_df_copy["target"] & (~train_df_copy[column]) nn_label_bool = ~train_df_copy["target"] & (~train_df_copy[column]) train_np_identity_labels[:, i * 4 + 0] = np.where( pp_label_bool > 0, 1, 0) train_np_identity_labels[:, i * 4 + 1] = np.where( np_label_bool > 0, 1, 0) train_np_identity_labels[:, i * 4 + 2] = np.where( pn_label_bool > 0, 1, 0) train_np_identity_labels[:, i * 4 + 3] = np.where( nn_label_bool > 0, 1, 0) # 身份原始值 train_identity_values = self.train_df[self.identity_list].fillna( 0.).values # 所有身份原始值之和 train_identity_sum = train_identity_values.sum(axis=1) # 将身份之和限制在1以下(sigmoid) train_identity_sum_label = np.where(train_identity_sum > 1, 1, train_identity_sum) # 身份01值 train_identity_binary = copy.deepcopy( self.train_df[self.identity_list]) for column in self.identity_list: train_identity_binary[column] = np.where( train_identity_binary[column] > 0.5, 1, 0) # 身份01值有一个就算1 train_identity_binary_sum = train_identity_binary.sum(axis=1) train_identity_or_binary = np.where(train_identity_binary_sum >= 1, 1, 0) # 所有身份标签 train_identity_type_labels = train_identity_values train_identity_type_binary_lables = train_identity_binary train_identity_sum_label = train_identity_sum_label train_identity_binary_label = train_identity_or_binary # tokenizer 训练 print("Init tokenizer") bert_tokenizer = BertTokenizer.from_pretrained(self.bert_model_path, cache_dir=None, do_lower_case=True) print("Tokenizing") train_bert_tokens = self.convert_lines( self.train_df["comment_text"].fillna("DUMMY_VALUE"), self.max_len, bert_tokenizer) # 划分训练集和验证集 valid_tokens = train_bert_tokens[self.train_len:] valid_label = train_label[self.train_len:] valid_type_labels = train_type_labels[self.train_len:] train_tokens = train_bert_tokens[:int( self.train_len * 0.5 )] if self.half == 1 else train_bert_tokens[int(self.train_len * 0.5):self.train_len] train_label = train_label[:int( self.train_len * 0.5)] if self.half == 1 else train_label[int(self.train_len * 0.5):self.train_len] train_type_labels = train_type_labels[:int( self.train_len * 0.5 )] if self.half == 1 else 
train_type_labels[int(self.train_len * 0.5):self.train_len] valid_identity_type_labels = train_identity_type_labels[self. train_len:] train_identity_type_labels = train_identity_type_labels[:int( self.train_len * 0.5)] if self.half == 1 else train_identity_type_labels[ int(self.train_len * 0.5):self.train_len] valid_identity_type_binary_lables = train_identity_type_binary_lables[ self.train_len:] train_identity_type_binary_lables = train_identity_type_binary_lables[:int( self.train_len * 0.5)] if self.half == 1 else train_identity_type_binary_lables[ int(self.train_len * 0.5):self.train_len] valid_identity_sum_label = train_identity_sum_label[self.train_len:] train_identity_sum_label = train_identity_sum_label[:int( self.train_len * 0.5)] if self.half == 1 else train_identity_sum_label[ int(self.train_len * 0.5):self.train_len] valid_identity_binary_label = train_identity_binary_label[self. train_len:] train_identity_binary_label = train_identity_binary_label[:int( self.train_len * 0.5)] if self.half == 1 else train_identity_binary_label[ int(self.train_len * 0.5):self.train_len] valid_np_labels = train_np_labels[self.train_len:] train_np_labels = train_np_labels[:int( self.train_len * 0.5 )] if self.half == 1 else train_np_labels[int(self.train_len * 0.5):self.train_len] valid_np_identity_labels = train_np_identity_labels[self.train_len:] train_np_identity_labels = train_np_identity_labels[:int( self.train_len * 0.5)] if self.half == 1 else train_np_identity_labels[ int(self.train_len * 0.5):self.train_len] # 计算样本权重 target_weight, aux_weight, identity_weight, np_weight, np_identity_weight = self.cal_sample_weights( ) # 将符号化数据转成 tensor train_x_tensor = torch.tensor(train_tokens, dtype=torch.long) valid_x_tensor = torch.tensor(valid_tokens, dtype=torch.long) train_y_tensor = torch.tensor(np.hstack([ train_label[:, np.newaxis], train_type_labels, train_identity_type_labels, train_np_labels ]), dtype=torch.float32) valid_y_tensor = torch.tensor(np.hstack([ valid_label[:, np.newaxis], valid_type_labels, valid_identity_type_labels, valid_np_labels ]), dtype=torch.float32) target_weight_tensor = torch.tensor(target_weight, dtype=torch.float32) aux_weight_tensor = torch.tensor(aux_weight, dtype=torch.float32) identity_weight_tensor = torch.tensor(identity_weight, dtype=torch.float32) np_weight_tensor = torch.tensor(np_weight, dtype=torch.float32) np_identity_weight_tensor = torch.tensor(np_identity_weight, dtype=torch.float32) if torch.cuda.is_available(): train_x_tensor = train_x_tensor.to(self.device) valid_x_tensor = valid_x_tensor.to(self.device) train_y_tensor = train_y_tensor.to(self.device) valid_y_tensor = valid_y_tensor.to(self.device) target_weight_tensor = target_weight_tensor.to(self.device) aux_weight_tensor = aux_weight_tensor.to(self.device) identity_weight_tensor = identity_weight_tensor.to(self.device) np_weight_tensor = np_weight_tensor.cuda() np_identity_weight_tensor = np_identity_weight_tensor.cuda() # 将 tensor 转成 dataset,训练数据和标签一一对应,用 dataloader 加载的时候 dataset[:-1] 是 x,dataset[-1] 是 y train_dataset = data.TensorDataset(train_x_tensor, train_y_tensor, target_weight_tensor, aux_weight_tensor, identity_weight_tensor, np_weight_tensor, np_identity_weight_tensor) valid_dataset = data.TensorDataset(valid_x_tensor, valid_y_tensor) # 将 dataset 转成 dataloader train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=self.base_batch_size, shuffle=True) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=self.base_batch_size, shuffle=False) # 返回训练数据 return 
train_loader, valid_loader
        print(encoded_layers)
        for i in encoded_layers:
            enc, _ = self.rnn(i)
        logits = self.fc(enc)
        if y is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(logits.view(-1, logits.shape[-1]), y.view(-1))
            return loss
        return logits


if __name__ == '__main__':
    input_text = '[CLS] I go to school by bus [SEP]'
    target_text = '我搭公車上學'
    tokenizer = BertTokenizer.from_pretrained('./vocab.txt')
    example_pair = dict()
    # Data preprocessing
    for i in range(0, len(target_text) + 1):
        tokenized_text = tokenizer.tokenize(input_text)  # tokenize the input text
        tokenized_text.extend(target_text[:i])  # each step appends one more decoded character to the input
        tokenized_text.append('[MASK]')
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        # Positions marked with -1 are excluded from the loss
        loss_ids = [-1] * (len(tokenizer.convert_tokens_to_ids(tokenized_text)) - 1)
        if i == len(target_text):
            loss_ids.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize('[SEP]'))[0])  # append [SEP] at the end
        else:
def process_ace34_file(path): BERT_ML = ACE34_UNCASE['bert_ml'] ORI_ML = ACE34_UNCASE['ori_ml'] pretrained_bert_name = ACE34_UNCASE['pretrained_bert_name'] tokenizer = BertTokenizer.from_pretrained(pretrained_bert_name) fin = open('{}.graph'.format(path), 'rb') idx2gragh = pickle.load(fin) fin.close() raw_data = read_ace34_file(path) data = [] for sample_id, tokens, label, target, anchor_index in raw_data: sent_len = len(tokens) # On raw text tok_bert_indices = [] for tok in tokens: tok = tok.replace('...', '.').replace('...', '.').replace('...', '.') bert_tokens = tokenizer.tokenize(tok) tok_bert_indices.append( tokenizer.convert_tokens_to_ids(bert_tokens)) bert_len = sum([len(x) for x in tok_bert_indices]) assert bert_len <= BERT_ML, 'Bert length: {}\n{}'.format( bert_len, tokens) assert sent_len <= ORI_ML, 'Ori length: {}'.format(sent_len) transform = zeros(ORI_ML, BERT_ML, 0.0) # Create transform to convert tokenized length to original length offset = 1 raw_text_bert_indices = [] # print(sample_id) # print(sum([len(x) for x in tok_bert_indices])) # print(' '.join(tokens)) for i, indices in enumerate(tok_bert_indices): l = len(indices) raw_text_bert_indices += indices for j in range(l): assert i <= ORI_ML, '| i={}'.format(i) assert offset + j < BERT_ML, '| offset={} j={}, sum={}'.format( offset, j, offset + j) transform[i][offset + j] = 1 / l # if i == anchor_index: # bert_anchor_index = offset offset += l assert offset == 1 + len(raw_text_bert_indices), "Wrong offset" dep_matrix = idx2gragh[sample_id] aspect_indices = tok_bert_indices[anchor_index] # CLS + Sentence + SEP + aspect + SEP cls_text_sep_aspect_sep_indices = [101] + raw_text_bert_indices + [ 102 ] + aspect_indices + [102] cls_text_sep_aspect_sep_length = len(cls_text_sep_aspect_sep_indices) assert cls_text_sep_aspect_sep_length <= BERT_ML, 'CLS indices length: {}\n{}'.format( cls_text_sep_aspect_sep_length, tokens) cls_text_sep_aspect_sep_mask = [ 0 for _ in range(cls_text_sep_aspect_sep_length) ] + [1 for _ in range(BERT_ML - cls_text_sep_aspect_sep_length)] cls_text_sep_aspect_sep_indices += [ 0 for _ in range(BERT_ML - cls_text_sep_aspect_sep_length) ] cls_text_sep_aspect_sep_segments_ids = [0 for _ in range(BERT_ML)] for i in range(bert_len + 2, bert_len + 2 + len(aspect_indices) + 1): cls_text_sep_aspect_sep_segments_ids[i] = 1 cls_text_sep_aspect_sep_aspect_mask = [1 for _ in range(BERT_ML)] for i in range(offset + 1, offset + 1 + len(aspect_indices)): cls_text_sep_aspect_sep_aspect_mask[i] = 0 # CLS + sentence + SEP cls_text_sep_indices = [101] + raw_text_bert_indices + [102] cls_text_sep_length = len(cls_text_sep_indices) cls_text_sep_indices += [ 0 for x in range(BERT_ML - cls_text_sep_length) ] cls_text_sep_segments_ids = [0 for _ in range(BERT_ML)] # Original sentence mask = [0 for _ in range(sent_len) ] + [1 for _ in range(ORI_ML - sent_len)] dist = [-1 for i in range(sent_len)] dist_to_target = get_dist_to_target(dep_matrix, anchor_index, dist, sent_len) dist_padding = [0] * (ORI_ML - len(dist_to_target)) pad_dist_to_target = dist_to_target + dist_padding # pad_aspect_indices = aspect_indices + [BERT_ML - len(aspect_indices)] item = { 'token': tokens, 'cls_text_sep_aspect_sep_indices': cls_text_sep_aspect_sep_indices, 'cls_text_sep_aspect_sep_length': cls_text_sep_aspect_sep_length, 'cls_text_sep_aspect_sep_segments_ids': cls_text_sep_aspect_sep_segments_ids, 'cls_text_sep_aspect_sep_aspect_mask': cls_text_sep_aspect_sep_aspect_mask, 'cls_text_sep_aspect_sep_mask': cls_text_sep_aspect_sep_mask, 
'cls_text_sep_indices': cls_text_sep_indices, 'cls_text_sep_length': cls_text_sep_length, 'cls_text_sep_segments_ids': cls_text_sep_segments_ids, 'anchor_index': anchor_index, 'transform': transform, 'sentence_length': sent_len, 'bert_length': bert_len, 'polarity': target, 'dependency_graph': dep_matrix, 'mask': mask, 'dist_to_target': pad_dist_to_target } # offset += len(aspect_indices) data.append(item) preprocessed_file = path.replace('.tsv', '.proc') with open(preprocessed_file, 'wb') as f: pickle.dump(data, f) print(preprocessed_file)
def __init__(self, pretrained_model: str):
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
def process_ace_file(path): BERT_ML = ACE_CASE['bert_ml'] ORI_ML = ACE_CASE['ori_ml'] MAX_TARGET_VALUE = ACE_CASE['n_class'] tokenizer = BertTokenizer.from_pretrained(ACE_CASE['pretrained_bert_name'], do_lower_case=False) fin = open('{}.graph'.format(path), 'rb') idx2gragh = pickle.load(fin) fin.close() raw_data = read_ace_file(path) data = [] for sample_id, tokens, labels, targets in raw_data: sent_len = len(tokens) # On raw text tok_bert_indices = [] for tok in tokens: bert_tokens = tokenizer.tokenize(tok) tok_bert_indices.append( tokenizer.convert_tokens_to_ids(bert_tokens)) bert_len = sum([len(x) for x in tok_bert_indices]) transform = zeros(ORI_ML, BERT_ML, 0.0) # Create transform to convert tokenized length to original length offset = 1 raw_text_bert_indices = [] # print(sample_id) # print(sum([len(x) for x in tok_bert_indices])) # print(' '.join(tokens)) for i, indices in enumerate(tok_bert_indices): l = len(indices) raw_text_bert_indices += indices for j in range(l): transform[i][offset + j] = 1 / l offset += l dep_matrix = idx2gragh[sample_id] offset = 1 # Because of the CLS for anchor_index, (aspect_indices, target) in enumerate(zip(tok_bert_indices, targets)): # Discard O label (it is not Other label) if target > MAX_TARGET_VALUE: continue # CLS + Sentence + SEP + aspect + SEP cls_text_sep_aspect_sep_indices = [101] + raw_text_bert_indices + [ 102 ] + aspect_indices + [102] cls_text_sep_aspect_sep_length = len( cls_text_sep_aspect_sep_indices) cls_text_sep_aspect_sep_mask = [ 0 for _ in range(cls_text_sep_aspect_sep_length) ] + [1 for _ in range(BERT_ML - cls_text_sep_aspect_sep_length)] cls_text_sep_aspect_sep_indices += [ 0 for _ in range(BERT_ML - cls_text_sep_aspect_sep_length) ] cls_text_sep_aspect_sep_segments_ids = [0 for _ in range(BERT_ML)] for i in range(bert_len + 2, bert_len + 2 + len(aspect_indices) + 1): cls_text_sep_aspect_sep_segments_ids[i] = 1 cls_text_sep_aspect_sep_aspect_mask = [1 for _ in range(BERT_ML)] for i in range(offset, offset + len(aspect_indices)): cls_text_sep_aspect_sep_aspect_mask[i] = 0 offset += len(aspect_indices) # CLS + sentence + SEP cls_text_sep_indices = [101] + raw_text_bert_indices + [102] cls_text_sep_length = len(cls_text_sep_indices) cls_text_sep_indices += [ 0 for x in range(BERT_ML - cls_text_sep_length) ] cls_text_sep_segments_ids = [0 for _ in range(BERT_ML)] # Original sentence mask = [0 for _ in range(sent_len) ] + [1 for _ in range(ORI_ML - sent_len)] dist = [-1 for i in range(sent_len)] dist_to_target = get_dist_to_target(dep_matrix, anchor_index, dist, sent_len) max_dist = max(dist_to_target) + 1 dist_padding = [max_dist] * (ORI_ML - len(dist_to_target)) pad_dist_to_target = dist_to_target + dist_padding item = { 'cls_text_sep_aspect_sep_indices': cls_text_sep_aspect_sep_indices, 'cls_text_sep_aspect_sep_length': cls_text_sep_aspect_sep_length, 'cls_text_sep_aspect_sep_segments_ids': cls_text_sep_aspect_sep_segments_ids, 'cls_text_sep_aspect_sep_aspect_mask': cls_text_sep_aspect_sep_aspect_mask, 'cls_text_sep_aspect_sep_mask': cls_text_sep_aspect_sep_mask, 'cls_text_sep_indices': cls_text_sep_indices, 'cls_text_sep_length': cls_text_sep_length, 'cls_text_sep_segments_ids': cls_text_sep_segments_ids, 'anchor_index': anchor_index, 'transform': transform, 'sentence_length': sent_len, 'bert_length': bert_len, 'polarity': target, 'dependency_graph': dep_matrix, 'mask': mask, 'dist_to_target': pad_dist_to_target } data.append(item) preprocessed_file = path.replace('.tsv', '.proc') with open(preprocessed_file, 'wb') as f: 
pickle.dump(data, f) print(preprocessed_file)
class BertWithJumanModel():
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        # Load Juman so that Japanese sentences can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the MaskedLM task model of the pre-trained BERT model
        self.model = BertForMaskedLM.from_pretrained(bert_path)
        # Load the tokenizer of the pre-trained BERT model
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        # Read the flag that controls whether the CUDA GPU is used
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        # Preprocessing: remove half-width spaces from the text
        return text.replace(" ", "")  # for Juman

    def paraphrase(self, text):
        # Remove half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # Segment the Japanese text and convert it into a list of tokens
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with half-width spaces into a str
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # The text size is limited to 128, so build header + 126 tokens + footer
        # Replace the tokens with ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        generated_token_ids = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            # Check for and use the GPU
            generated_token_ids = generated_token_ids.to('cuda')
            self.model.to('cuda')

        # Switch the model to evaluation mode
        self.model.eval()
        with torch.no_grad():
            for i in range(10):
                for j, _ in enumerate(tokens):
                    # Replace one token of the sentence with [MASK]
                    # Skip the header, so start from +1
                    masked_index = j + 1
                    pre_token = generated_token_ids[0, masked_index].item()
                    generated_token_ids[0, masked_index] = self.bert_tokenizer.vocab["[MASK]"]
                    outputs = self.model(generated_token_ids)
                    predictions = outputs[0]
                    _, predicted_indexes = torch.topk(predictions[0, masked_index], k=5)
                    predicted_tokens = self.bert_tokenizer.convert_ids_to_tokens(
                        predicted_indexes.tolist())
                    print(predicted_tokens)
                    predict_token = predicted_indexes.tolist()[0]
                    # if pre_token == predict_token:
                    #     predict_token = predicted_indexes.tolist()[1]
                    generated_token_ids[0, masked_index] = predict_token

                # Convert the ids back to strings and join them
                sampled_sequence = [
                    self.bert_tokenizer.ids_to_tokens[token_id]
                    for token_id in generated_token_ids[0].cpu().numpy()
                ]
                sampled_sequence = "".join([
                    token[2:] if token.startswith("##") else token
                    for token in list(filter(lambda x: x != '[PAD]', sampled_sequence))
                ])
                logger.info("sampled sequence: {}".format(sampled_sequence))
model_dir_zh = "model/BERT/newdata_5fold/zh/"
MAX_fold = 5
PATH_list_en = [
    os.path.join(model_dir_en, "{}fold_bert.model".format(fold))
    for fold in range(1, MAX_fold + 1, 1)
]
PATH_list_zh = [
    os.path.join(model_dir_zh, "{}fold_bert.model".format(fold))
    for fold in range(1, MAX_fold + 1, 1)
]

y_dummy = torch.empty(len(test1_en), dtype=torch.long).random_(5)

# tokenizer_en = BertTokenizer.from_pretrained('bert-large-uncased')
tokenizer_en = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer_zh = BertTokenizer.from_pretrained('bert-base-chinese')

test_dataset_en = BERTDataset(test1_en, test2_en, y_dummy, tokenizer_en, seq_length=max_seq_en)
test_dataset_zh = BERTDataset(test1_zh, test2_zh, y_dummy, tokenizer_zh, seq_length=max_seq_zh)
test_loader_en = DataLoader(test_dataset_en, batch_size=batch, shuffle=False)
test_loader_zh = DataLoader(test_dataset_zh, batch_size=batch, shuffle=False)
def __init__( self, emb_dim: int = 768, dropout_value: float = 0.0, aggregation_type: str = "sum", bert_type: str = "bert-base-uncased", device: Union[torch.device, str] = torch.device("cpu"), ): """ Bert Embedder that embeds the given instance to BERT embeddings Parameters ---------- emb_dim : int Embedding dimension dropout_value : float The amount of dropout to be added after the embedding aggregation_type : str The kind of aggregation of different layers. BERT produces representations from different layers. This specifies the strategy to aggregating them One of sum Sum the representations from all the layers average Average the representations from all the layers bert_type : type The kind of BERT embedding to be used bert-base-uncased 12 layer transformer trained on lowercased vocab bert-large-uncased: 24 layer transformer trained on lowercased vocab bert-base-cased: 12 layer transformer trained on cased vocab bert-large-cased: 24 layer transformer train on cased vocab scibert-base-cased 12 layer transformer trained on scientific document on cased normal vocab scibert-sci-cased 12 layer transformer trained on scientific documents on cased scientifc vocab scibert-base-uncased 12 layer transformer trained on scientific docments on uncased normal vocab scibert-sci-uncased 12 layer transformer train on scientific documents on ncased scientific vocab device : Union[torch.device, str] The device on which the model is run. """ super(BertEmbedder, self).__init__() self.emb_dim = emb_dim self.dropout_value = dropout_value self.aggregation_type = aggregation_type self.bert_type = bert_type if isinstance(device, str): self.device = torch.device(device) else: self.device = device self.msg_printer = wasabi.Printer() self.allowed_bert_types = [ "bert-base-uncased", "bert-large-uncased", "bert-base-cased", "bert-large-cased", "scibert-base-cased", "scibert-sci-cased", "scibert-base-uncased", "scibert-sci-uncased", ] self.scibert_foldername_mapping = { "scibert-base-cased": "scibert_basevocab_cased", "scibert-sci-cased": "scibert_scivocab_cased", "scibert-base-uncased": "scibert_basevocab_uncased", "scibert-sci-uncased": "scibert_scivocab_uncased", } self.model_type_or_folder_url = None self.vocab_type_or_filename = None assert self.bert_type in self.allowed_bert_types if "scibert" in self.bert_type: foldername = self.scibert_foldername_mapping[self.bert_type] self.model_type_or_folder_url = os.path.join( EMBEDDING_CACHE_DIR, foldername, "weights.tar.gz") self.vocab_type_or_filename = os.path.join(EMBEDDING_CACHE_DIR, foldername, "vocab.txt") else: self.model_type_or_folder_url = self.bert_type self.vocab_type_or_filename = self.bert_type # load the bert model with self.msg_printer.loading(" Loading Bert tokenizer and model. "): self.bert_tokenizer = BertTokenizer.from_pretrained( self.vocab_type_or_filename) self.model = BertModel.from_pretrained( self.model_type_or_folder_url) self.model.eval() self.model.to(self.device) self.msg_printer.good( f"Finished Loading {self.bert_type} model and tokenizer")
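# The docstring above describes "sum" and "average" aggregation over BERT's
# layers, but the snippet ends before the forward pass. The helper below is a
# hypothetical sketch of that aggregation step (not part of the original
# class); with pytorch_pretrained_bert, BertModel returns one tensor per
# encoder layer, which is what `encoded_layers` is assumed to be here.
import torch

def aggregate_layers(encoded_layers, aggregation_type="sum"):
    # encoded_layers: list of [batch, seq_len, hidden] tensors, one per layer
    stacked = torch.stack(encoded_layers, dim=0)  # [num_layers, batch, seq_len, hidden]
    if aggregation_type == "sum":
        return torch.sum(stacked, dim=0)
    elif aggregation_type == "average":
        return torch.mean(stacked, dim=0)
    raise ValueError("Unknown aggregation type: {}".format(aggregation_type))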
import uvicorn
from fastai.text import *
from fastai.vision import *
from fastapi import FastAPI
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from sklearn.model_selection import train_test_split
from starlette.middleware.cors import CORSMiddleware
from starlette.requests import Request
from starlette.responses import HTMLResponse, JSONResponse
from starlette.responses import Response
from starlette.staticfiles import StaticFiles

gc.collect()

bert_tok = BertTokenizer.from_pretrained("bert-base-uncased")


class FastAiBertTokenizer(BaseTokenizer):
    """Wrapper around BertTokenizer to be compatible with fast.ai"""

    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int = 128, **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        return self

    def tokenizer(self, t: str) -> List[str]:
        """Limits the maximum sequence length"""
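        # The body is cut off in the original; the return below is an assumed
        # continuation based on the common fast.ai + BERT wrapper pattern
        # (not the original code): truncate to max_seq_len - 2 word pieces
        # and add BERT's special tokens.
        return ["[CLS]"] + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] + ["[SEP]"]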
def report_on_stdin(args):
    """Runs a trained structural probe on sentences piped to stdin.

    Sentences should be space-tokenized.
    A single distance image and depth image will be printed for each line of stdin.

    Args:
        args: the yaml config dictionary
    """
    # Define the BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    model = BertModel.from_pretrained('bert-large-cased')
    LAYER_COUNT = 24
    FEATURE_COUNT = 1024
    model.to(args['device'])
    model.eval()

    # Define the distance probe
    distance_probe = probe.TwoWordPSDProbe(args)
    distance_probe.load_state_dict(torch.load(args['probe']['distance_params_path'],
                                              map_location=args['device']))

    # Define the depth probe
    depth_probe = probe.OneWordPSDProbe(args)
    depth_probe.load_state_dict(torch.load(args['probe']['depth_params_path'],
                                           map_location=args['device']))

    for index, line in tqdm(enumerate(sys.stdin), desc='[demoing]'):
        # Tokenize the sentence and create tensor inputs to BERT
        untokenized_sent = line.strip().split()
        tokenized_sent = tokenizer.wordpiece_tokenizer.tokenize(
            '[CLS] ' + ' '.join(line.strip().split()) + ' [SEP]')
        untok_tok_mapping = data.SubwordDataset.match_tokenized_to_untokenized(
            tokenized_sent, untokenized_sent)

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_sent)
        segment_ids = [1 for x in tokenized_sent]

        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segment_ids])
        tokens_tensor = tokens_tensor.to(args['device'])
        segments_tensors = segments_tensors.to(args['device'])

        with torch.no_grad():
            # Run sentence tensor through BERT after averaging subwords for each token
            print('no gradient area')
            print('tokens & segments')
            print(tokens_tensor.shape, segments_tensors.shape)
            encoded_layers, _ = model(tokens_tensor, segments_tensors)
            print('num layers & each layer')
            print(len(encoded_layers), encoded_layers[0].shape)
            single_layer_features = encoded_layers[args['model']['model_layer']]
            representation = torch.stack(
                [torch.mean(single_layer_features[0, untok_tok_mapping[i][0]:untok_tok_mapping[i][-1] + 1, :], dim=0)
                 for i in range(len(untokenized_sent))], dim=0)
            representation = representation.view(1, *representation.size())

        # Run BERT token vectors through the trained probes
        distance_predictions = distance_probe(representation.to(args['device'])).detach().cpu()[0][:len(untokenized_sent), :len(untokenized_sent)].numpy()
        depth_predictions = depth_probe(representation).detach().cpu()[0][:len(untokenized_sent)].numpy()
        print(distance_predictions)
        print(depth_predictions)

        # Print results visualizations
        print_distance_image(args, untokenized_sent, distance_predictions, index)
        print_depth_image(args, untokenized_sent, depth_predictions, index)
        predicted_edges = reporter.prims_matrix_to_edges(distance_predictions, untokenized_sent, untokenized_sent)
def main(): #parse arguments config.parse() args = config.args for k, v in vars(args).items(): logger.info(f"{k}:{v}") #set seeds torch.manual_seed(args.random_seed) torch.cuda.manual_seed_all(args.random_seed) np.random.seed(args.random_seed) random.seed(args.random_seed) #arguments check device, n_gpu = args_check(args) os.makedirs(args.output_dir, exist_ok=True) forward_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) args.forward_batch_size = forward_batch_size #load bert config bert_config_S = BertConfig.from_json_file(args.bert_config_file_S) assert args.max_seq_length <= bert_config_S.max_position_embeddings #Prepare GLUE task processor = processors[args.task_name]() args.output_mode = output_modes[args.task_name] label_list = processor.get_labels() num_labels = len(label_list) #read data train_dataset = None eval_datasets = None num_train_steps = None tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) if args.do_train: train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) if args.aux_task_name: aux_train_dataset = load_and_cache_examples(args, args.aux_task_name, tokenizer, evaluate=False, is_aux=True) train_dataset = torch.utils.data.ConcatDataset( [train_dataset, aux_train_dataset]) num_train_steps = int( len(train_dataset) / args.train_batch_size) * args.num_train_epochs if args.do_predict: eval_datasets = [] eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else ( args.task_name, ) for eval_task in eval_task_names: eval_datasets.append( load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)) logger.info("Data loaded") #Build Model and load checkpoint model_S = BertForGLUESimple(bert_config_S, num_labels=num_labels, args=args) #Load student if args.load_model_type == 'bert': assert args.init_checkpoint_S is not None state_dict_S = torch.load(args.init_checkpoint_S, map_location='cpu') if args.only_load_embedding: state_weight = { k[5:]: v for k, v in state_dict_S.items() if k.startswith('bert.embeddings') } missing_keys, _ = model_S.bert.load_state_dict(state_weight, strict=False) logger.info(f"Missing keys {list(missing_keys)}") else: state_weight = { k[5:]: v for k, v in state_dict_S.items() if k.startswith('bert.') } missing_keys, _ = model_S.bert.load_state_dict(state_weight, strict=False) assert len(missing_keys) == 0 logger.info("Model loaded") elif args.load_model_type == 'all': assert args.tuned_checkpoint_S is not None state_dict_S = torch.load(args.tuned_checkpoint_S, map_location='cpu') model_S.load_state_dict(state_dict_S) logger.info("Model loaded") else: logger.info("Model is randomly initialized.") model_S.to(device) if args.local_rank != -1 or n_gpu > 1: if args.local_rank != -1: raise NotImplementedError elif n_gpu > 1: model_S = torch.nn.DataParallel(model_S) #,output_device=n_gpu-1) if args.do_train: #parameters params = list(model_S.named_parameters()) all_trainable_params = divide_parameters(params, lr=args.learning_rate) logger.info("Length of all_trainable_params: %d", len(all_trainable_params)) optimizer = BERTAdam(all_trainable_params, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps, schedule=args.schedule, s_opt1=args.s_opt1, s_opt2=args.s_opt2, s_opt3=args.s_opt3) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Forward batch size = %d", forward_batch_size) logger.info(" Num backward steps = %d", num_train_steps) ########### DISTILLATION 
########### train_config = TrainingConfig( gradient_accumulation_steps=args.gradient_accumulation_steps, ckpt_frequency=args.ckpt_frequency, log_dir=args.output_dir, output_dir=args.output_dir, device=args.device) distiller = BasicTrainer(train_config=train_config, model=model_S, adaptor=BertForGLUESimpleAdaptorTraining) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: raise NotImplementedError train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.forward_batch_size, drop_last=True) callback_func = partial(predict, eval_datasets=eval_datasets, args=args) with distiller: distiller.train(optimizer, scheduler=None, dataloader=train_dataloader, num_epochs=args.num_train_epochs, callback=callback_func) if not args.do_train and args.do_predict: res = predict(model_S, eval_datasets, step=0, args=args) print(res)
def __init__(self, model_name):
    self.tokenizer = BertTokenizer.from_pretrained(model_name)
    self.model = BertModel.from_pretrained(model_name).eval()
    self.model.cuda()
@author: peterawest
"""

import random
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
import numpy as np
import math
import os
import time
from word_embeddings import BERT_word_embedding

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')
bert.eval()
bert.to('cuda')

sig = torch.nn.Sigmoid()

from torch import nn

MAX_LENGTH = 100

sm_1 = torch.nn.Softmax(dim=1)
sm = torch.nn.Softmax()

max_posts = 1000


# calculates position embeddings given seq_len and n_dim
def position_embeddings(n_dim, seq_len):
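    # The body is missing in the original; the implementation below is an
    # assumption using the standard sinusoidal scheme from "Attention Is All
    # You Need", not necessarily what the original file did.
    positions = np.arange(seq_len)[:, None]                          # [seq_len, 1]
    dims = np.arange(n_dim)[None, :]                                 # [1, n_dim]
    angle_rates = 1.0 / np.power(10000.0, (2 * (dims // 2)) / n_dim)
    angles = positions * angle_rates                                 # [seq_len, n_dim]
    emb = np.zeros((seq_len, n_dim))
    emb[:, 0::2] = np.sin(angles[:, 0::2])                           # even dims: sin
    emb[:, 1::2] = np.cos(angles[:, 1::2])                           # odd dims: cos
    return torch.tensor(emb, dtype=torch.float32)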
def main(args): # process_mrc_example() if not exists(TEST_DIR): os.makedirs(TEST_DIR) if not exists(os.path.join(TEST_DIR, "reference")): os.makedirs(os.path.join(TEST_DIR, "reference")) if not exists(os.path.join(TEST_DIR, "decoded")): os.makedirs(os.path.join(TEST_DIR, "decoded")) meta = json.load(open(join(DATA_DIR, 'meta.json'))) nargs = meta['net_args'] ckpt = load_best_ckpt(DATA_DIR) net = BertReader(**nargs) net.load_state_dict(ckpt) if args.cuda: net = net.to('cuda') net.eval() tokenizer = BertTokenizer.from_pretrained('./MRC_pretrain') count = 0 bulids = [] answers = [] with torch.no_grad(): for index in range(197): with open( join(join(MRC_DIR, '200_sample'), '{}.json'.format(index))) as f: js_data = json.load(f) print('loading: {}'.format(index)) question, question_length, text, text_length, answer_span, text_tok, original_text = ( js_data['question'], js_data['question_length'], js_data['text'], js_data['text_length'], js_data['answer_span'], js_data['text_tok'], js_data['original_text']) if (question_length + text_length <= 512): concat_text = question + text token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize( [concat_text], args.cuda) question_lengths = torch.tensor([question_length]) question_lengths = question_lengths.cuda() text_lengths = torch.tensor([text_length]) text_lengths = text_lengths.cuda() fw_args = (token_tensor, segment_tensor, mask_tensor, question_lengths, text_lengths) net_out = net(*fw_args) net_out = torch.squeeze(net_out) net_out = net_out[question_length:question_length + text_length] leng = net_out.size(0) propuse = [] for i in range(leng): if (net_out[i].item() > 0.5): propuse.append(1) else: propuse.append(0) if (not (1 in propuse)): propuse.clear() for i in range(leng): if (net_out[i].item() > 1e-4): propuse.append(1) else: propuse.append(0) bulid = [] output = '' for t in range(len(propuse)): if (propuse[t] == 1): bulid.append(text[t]) output += text_tok[t] if ( text_tok[t] != '[UNK]') else '' output = output.replace('##', '') print(output) bulid = [str(x) for x in bulid] answer_index = answer_span.index(1) one = 0 for o in range(len(answer_span)): if answer_span[o] == 1: one += 1 print(one) answer = text[answer_index:answer_index + one] # answer = text[answer_index:answer_index + len(answer_span)] answer = [str(x) for x in answer] answers.append(answer) with open( join(os.path.join(TEST_DIR, "decoded"), "%d_decoded.txt" % index), 'w') as f: # for i, item in enumerate(bulids): f.write(' '.join(bulid)) with open( join(os.path.join(TEST_DIR, "reference"), "%d_reference.txt" % index), 'w', ) as f: # for i, item in enumerate(answers): # print(item) f.write(' '.join(answer)) else: sp = 0 ep = 412 sub_text_arr = [] sub_text_length_arr = [] start_index = [] while (True): if (ep >= text_length and sp < text_length): sub_text = text[sp:text_length] sub_text_arr.append(sub_text) sub_text_length = text_length - sp sub_text_length_arr.append(sub_text_length) start_index.append(sp) assert question_length + text_length - sp <= 512 sp += 312 ep += 312 else: if (ep > text_length): break else: sub_text = text[sp:ep] sub_text_arr.append(sub_text) sub_text_length = ep - sp sub_text_length_arr.append(sub_text_length) start_index.append(sp) assert question_length + ep - sp <= 512 sp += 312 ep += 312 meta_s = json.load(open(join('matcher', 'meta.json'))) nargs_s = meta_s['net_args'] ckpt_s = load_best_ckpt('matcher') net_s = BertMatcher(**nargs_s) net_s.load_state_dict(ckpt_s) if args.cuda: net_s = net_s.cuda() net_s.eval() with torch.no_grad(): 
highest_score = [0] current = -1 for i in range(len(sub_text_arr)): concat_text = question + sub_text_arr[i] token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize( [concat_text], args.cuda) fw_args = (token_tensor, segment_tensor, mask_tensor) net_out = net_s(*fw_args) if (net_out[0][0].item() > highest_score[-1]): highest_score.clear() highest_score.append(net_out[0][0].item()) current = i used_text = sub_text_arr[current] propuse = [0] * text_length concat_text = question + used_text token_tensor, segment_tensor, mask_tensor = pad_batch_tensorize( [concat_text], args.cuda) question_lengths = torch.tensor([question_length]) question_lengths = question_lengths.cuda() text_lengths = torch.tensor([sub_text_length_arr[current]]) text_lengths = text_lengths.cuda() fw_args = (token_tensor, segment_tensor, mask_tensor, question_lengths, text_lengths) net_out = net(*fw_args) net_out = torch.squeeze(net_out) net_out = net_out[question_length:question_length + text_length] leng = net_out.size(0) for ga in range(leng): if (net_out[ga].item() > 0.5): propuse[ga + start_index[current]] = 1 if (not (1 in propuse)): for ga in range(leng): if (net_out[ga].item() > 1e-4): propuse[ga + start_index[current]] = 1 bulid = [] output = '' for t in range(len(propuse)): if (propuse[t] == 1): bulid.append(text[t]) output += text_tok[t] if ( text_tok[t] != '[UNK]') else '' output = output.replace('##', '') print(output) bulid = [str(x) for x in bulid] answer_index = answer_span.index(1) one = 0 for o in range(len(answer_span)): if answer_span[o] == 1: one += 1 print(one) answer = text[answer_index:answer_index + one] # answer = text[answer_index:answer_index + len(answer_span)] answer = [str(x) for x in answer] answers.append(answer) with open( join(os.path.join(TEST_DIR, "decoded"), "%d_decoded.txt" % index), 'w') as f: # for i, item in enumerate(bulids): f.write(' '.join(bulid)) with open( join(os.path.join(TEST_DIR, "reference"), "%d_reference.txt" % index), 'w', ) as f: # for i, item in enumerate(answers): # print(item) f.write(' '.join(answer)) r = pyrouge.Rouge155('/home/wanglihan/ROUGE/RELEASE-1.5.5/') r.model_filename_pattern = '#ID#_reference.txt' r.system_filename_pattern = '(\d+)_decoded.txt' r.model_dir = os.path.join(TEST_DIR, "reference") r.system_dir = os.path.join(TEST_DIR, "decoded") rouge_results = r.convert_and_evaluate( '/home/wanglihan/ROUGE/RELEASE-1.5.5/') print(rouge_results)
gradient_accumulation_steps = 1
train_batch_size = 32
eval_batch_size = 128
train_batch_size = train_batch_size // gradient_accumulation_steps

output_dir = OutputDir
num_train_epochs = NUMofEPOCH
num_train_optimization_steps = int(
    len(TrainExamples) / train_batch_size / gradient_accumulation_steps) * num_train_epochs
cache_dir = CacheDir
learning_rate = LearningRate
warmup_proportion = 0.1
max_seq_length = MAXSEQLEN

# Load model
tokenizer = BertTokenizer.from_pretrained(BERTModel)
Model = BertForSequenceClassification.from_pretrained(
    BERTModel, cache_dir=cache_dir, num_labels=len(LabelList))
Model.to(device)
if n_gpu > 1:
    Model = torch.nn.DataParallel(Model)

# Load a trained model and config that you have fine-tuned
# tokenizer = BertTokenizer.from_pretrained(BERTModel)
# config = BertConfig(load_config_file)
# Model = BertForSequenceClassification(config, num_labels=len(LabelList))
# Model.load_state_dict(torch.load(load_model_file))
# Model.to(device)  # important to specific device
# if n_gpu > 1:
#     Model = torch.nn.DataParallel(Model)
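# The snippet computes num_train_optimization_steps but stops before building
# an optimizer. A sketch of the usual next step with pytorch_pretrained_bert
# (the common BertAdam pattern, assumed here rather than taken from the
# original file): exclude biases and LayerNorm weights from weight decay.
from pytorch_pretrained_bert.optimization import BertAdam

param_optimizer = list(Model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=learning_rate,
                     warmup=warmup_proportion,
                     t_total=num_train_optimization_steps)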
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences] labels = df.leaning.values for i in range(len(labels)): if labels[i] == 'right': labels[i] = 0 if labels[i] == 'left': labels[i] = 1 labels = np.array(labels, dtype=np.int64) print(labels.dtype) """## Inputs Next, import the BERT tokenizer, used to convert our text into tokens that correspond to BERT's vocabulary. """ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences] print("Tokenize the first sentence:") print(tokenized_texts[0]) """BERT requires specifically formatted inputs. For each tokenized input sentence, we need to create: - **input ids**: a sequence of integers identifying each input token to its index number in the BERT tokenizer vocabulary - **segment mask**: (optional) a sequence of 1s and 0s used to identify whether the input is one sentence or two sentences long. For one sentence inputs, this is simply a sequence of 0s. For two sentence inputs, there is a 0 for each token of the first sentence, followed by a 1 for each token of the second sentence - **attention mask**: (optional) a sequence of 1s and 0s, with 1s for all input tokens and 0s for all padding tokens (we'll detail this in the next paragraph) - **labels**: a single value of 1 or 0. In our task 1 means "grammatical" and 0 means "ungrammatical" Although we can have variable length input sentences, BERT does requires our input arrays to be the same size. We address this by first choosing a maximum sentence length, and then padding and truncating our inputs until every input sequence is of the same length. To "pad" our inputs in this context means that if a sentence is shorter than the maximum sentence length, we simply add 0s to the end of the sequence until it is the maximum sentence length.
import nltk
import numpy as np
import os
import random
import string
import torch
from fitbert import FitBert
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM
from tqdm import tqdm
from utils import color_print_top_words

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Initialize BERT vocabulary...')
bert_tokenizer = BertTokenizer(vocab_file='data/BERT_model_reddit/vocab.txt')
print('Initialize BERT model...')
bert_model = BertForMaskedLM.from_pretrained('data/BERT_model_reddit').to(device)
bert_model.eval()


def MLM(sgs, drug_formal, thres=1, skip_flag=1):

    def to_bert_input(tokens, bert_tokenizer):
        token_idx = torch.tensor(bert_tokenizer.convert_tokens_to_ids(tokens))
        sep_idx = tokens.index('[SEP]')
        segment_idx = token_idx * 0
        segment_idx[(sep_idx + 1):] = 1
        mask = (token_idx != 0)
        return (token_idx.unsqueeze(0).to(device),
                segment_idx.unsqueeze(0).to(device),
                mask.unsqueeze(0).to(device))
def model_go(th_filter_prob=0.2, top_k_sent=5): seed = 12 torch.manual_seed(seed) # bert_model_name = 'bert-large-uncased' bert_model_name = 'bert-base-uncased' bert_pretrain_path = config.PRO_ROOT / '.pytorch_pretrained_bert' lazy = False # lazy = True forward_size = 32 # batch_size = 64 # batch_size = 192 batch_size = 32 gradient_accumulate_step = int(batch_size / forward_size) warmup_proportion = 0.1 # schedule_type = 'warmup_constant' # 'warmup_cosine': warmup_cosine, # 'warmup_constant': warmup_constant, # 'warmup_linear': warmup_linear, schedule_type = 'warmup_linear' learning_rate = 5e-5 num_train_epochs = 5 eval_frequency = 4000 do_lower_case = True pair_order = 'cq' # debug_mode = True # debug_mode = True debug_mode = False do_ema = True maxout_model = False # est_datasize = 900_000 num_class = 3 # num_train_optimization_steps top_k = top_k_sent train_sent_filtering_prob = th_filter_prob dev_sent_filtering_prob = th_filter_prob experiment_name = f'fever_v2_nli_th{train_sent_filtering_prob}_tk{top_k}' # Data dataset and upstream sentence results. dev_sent_results_list = common.load_jsonl( config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_dev_results.jsonl") train_sent_results_list = common.load_jsonl( config.PRO_ROOT / "data/p_fever/fever_sentence_level/04-24-00-11-19_fever_v0_slevel_retri_(ignore_non_verifiable-True)/fever_s_level_train_results.jsonl") dev_fitems, dev_list = get_nli_pair('dev', is_training=False, sent_level_results_list=dev_sent_results_list, debug=debug_mode, sent_top_k=top_k_sent, sent_filter_value=dev_sent_filtering_prob) train_fitems, train_list = get_nli_pair('train', is_training=True, sent_level_results_list=train_sent_results_list, debug=debug_mode, sent_top_k=top_k_sent, sent_filter_value=train_sent_filtering_prob) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device_num = 0 if torch.cuda.is_available() else -1 n_gpu = torch.cuda.device_count() unk_token_num = {'tokens': 1} # work around for initiating vocabulary. 
vocab = ExVocabulary(unk_token_num=unk_token_num) vocab.add_token_to_namespace('SUPPORTS', namespace='labels') vocab.add_token_to_namespace('REFUTES', namespace='labels') vocab.add_token_to_namespace('NOT ENOUGH INFO', namespace='labels') vocab.add_token_to_namespace("hidden", namespace="labels") vocab.change_token_with_index_to_namespace("hidden", -2, namespace='labels') if debug_mode: dev_list = dev_list[:100] train_list = train_list[:100] eval_frequency = 2 est_datasize = len(train_fitems) bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=do_lower_case, cache_dir=bert_pretrain_path) bert_cs_reader = BertFeverNLIReader(bert_tokenizer, lazy, is_paired=True, query_l=64, example_filter=None, max_l=384, pair_order=pair_order) bert_encoder = BertModel.from_pretrained(bert_model_name, cache_dir=bert_pretrain_path) if not maxout_model: model = BertMultiLayerSeqClassification(bert_encoder, num_labels=num_class, num_of_pooling_layer=1, act_type='tanh', use_pretrained_pooler=True, use_sigmoid=False) else: model = BertPairMaxOutMatcher(bert_encoder, num_of_class=num_class, act_type="gelu", num_of_out_layers=2) ema = None if do_ema: ema = EMA(model, model.named_parameters(), device_num=1) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \ num_train_epochs if debug_mode: num_train_optimization_steps = 100 print("Estimated training size", est_datasize) print("Number of optimization steps:", num_train_optimization_steps) print("Do EMA:", do_ema) optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=warmup_proportion, t_total=num_train_optimization_steps, schedule=schedule_type) dev_instances = bert_cs_reader.read(dev_fitems) biterator = BasicIterator(batch_size=forward_size) biterator.index_with(vocab) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) forbackward_step = 0 update_step = 0 logging_agent = save_tool.ScoreLogger({}) file_path_prefix = '.' if not debug_mode: file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}") # # # Create Log File # Save the source code. 
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)

        train_fitems_list, _ = get_nli_pair('train', is_training=True,
                                            sent_level_results_list=train_sent_results_list, debug=debug_mode,
                                            sent_top_k=top_k_sent, sent_filter_value=train_sent_filtering_prob)
        random.shuffle(train_fitems_list)
        train_instance = bert_cs_reader.read(train_fitems_list)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            if not maxout_model:
                loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                             mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                             labels=labels_ids)
            else:
                loss = model(paired_sequence, token_type_ids=paired_segments_ids, attention_mask=att_mask,
                             s1_span=s1_span, s2_span=s2_span,
                             mode=BertPairMaxOutMatcher.ForwardMode.TRAIN,
                             labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                if ema is not None and do_ema:
                    updated_model = model.module if hasattr(model, 'module') else model
                    ema(updated_model.named_parameters())
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    # dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
                    #
                    # cur_eval_results_list = eval_model(model, dev_iter, device_num, with_probs=True, make_int=True,
                    #                                    feed_input_span=maxout_model)
                    #
                    # ema_results_dict = list_dict_data_tool.list_to_dict(cur_eval_results_list, 'oid')
                    # copied_dev_list = copy.deepcopy(dev_list)
                    # list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                    #                                                   'id', 'predicted_label')
                    #
                    # mode = {'standard': True}
                    # strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                    #                                                                 mode=mode, max_evidence=5)
                    # logging_item = {
                    #     'ss': strict_score, 'ac': acc_score,
                    #     'pr': pr, 'rec': rec, 'f1': f1,
                    # }
                    #
                    # if not debug_mode:
                    #     save_file_name = f'i({update_step})|e({epoch_i})' \
                    #                      f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \
                    #                      f'|seed({seed})'
                    #
                    #     common.save_jsonl(copied_dev_list, Path(file_path_prefix) /
                    #                       f"{save_file_name}_dev_nli_results.json")
                    #
                    #     # print(save_file_name)
                    #     logging_agent.incorporate_results({}, save_file_name, logging_item)
                    #     logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")
                    #
                    #     model_to_save = model.module if hasattr(model, 'module') else model
                    #     output_model_file = Path(file_path_prefix) / save_file_name
                    #     torch.save(model_to_save.state_dict(), str(output_model_file))

                    if do_ema and ema is not None:
                        ema_model = ema.get_inference_model()
                        ema_device_num = 0
                        ema_model = ema_model.to(device)
                        ema_model = torch.nn.DataParallel(ema_model)
                        dev_iter = biterator(dev_instances, num_epochs=1, shuffle=False)
                        cur_ema_eval_results_list = eval_model(ema_model, dev_iter, ema_device_num,
                                                               with_probs=True, make_int=True,
                                                               feed_input_span=maxout_model)

                        ema_results_dict = list_dict_data_tool.list_to_dict(cur_ema_eval_results_list, 'oid')
                        copied_dev_list = copy.deepcopy(dev_list)
                        list_dict_data_tool.append_item_from_dict_to_list(copied_dev_list, ema_results_dict,
                                                                          'id', 'predicted_label')

                        mode = {'standard': True}
                        strict_score, acc_score, pr, rec, f1 = fever_scorer.fever_score(copied_dev_list, dev_list,
                                                                                        mode=mode, max_evidence=5)
                        ema_logging_item = {
                            'label': 'ema',
                            'ss': strict_score, 'ac': acc_score,
                            'pr': pr, 'rec': rec, 'f1': f1,
                        }

                        if not debug_mode:
                            save_file_name = f'ema_i({update_step})|e({epoch_i})' \
                                             f'|ss({strict_score})|ac({acc_score})|pr({pr})|rec({rec})|f1({f1})' \
                                             f'|seed({seed})'

                            common.save_jsonl(copied_dev_list, Path(file_path_prefix) /
                                              f"{save_file_name}_dev_nli_results.json")

                            # print(save_file_name)
                            logging_agent.incorporate_results({}, save_file_name, ema_logging_item)
                            logging_agent.logging_to_file(Path(file_path_prefix) / "log.json")

                            model_to_save = ema_model.module if hasattr(ema_model, 'module') else ema_model
                            output_model_file = Path(file_path_prefix) / save_file_name
                            torch.save(model_to_save.state_dict(), str(output_model_file))
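# Hypothetical entry point (not part of the original source): kick off training
# with the default sentence-filtering probability and top-k sentence count.
if __name__ == '__main__':
    model_go(th_filter_prob=0.2, top_k_sent=5)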
import sys
import pickle
import random

from pytorch_pretrained_bert import BertTokenizer  # Use BertTokenizer for tokenization

random.seed(2018)

# Initialize the tokenizer; here it is loaded from a local vocabulary file.
tokenizer = BertTokenizer.from_pretrained(
    '../../data/bert-base-chinese-vocab.txt'
)


def load_cluster_data(if_union=False):
    # if_union=False: keep only the examples on which both annotations agree;
    # if_union=True: take the union of both annotations.
    if not if_union:
        path = '../../data/pickle/clusters_separation.pickle'
    else:
        path = '../../data/pickle/clusters_separation_union.pickle'
    with open(path, 'rb') as f:
        training_clusters, validation_clusters, test_clusters = pickle.load(f)
    return training_clusters, validation_clusters, test_clusters


def data_generator(training_clusters, validation_clusters, test_clusters, if_union=False):
    cluster2pair(training_clusters, 'training', if_union)
    cluster2pair(validation_clusters, 'validation', if_union)
    cluster2pair(test_clusters, 'test', if_union)
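# Hypothetical usage sketch (not part of the original source): load the cluster
# splits and write out sentence pairs for each split. Assumes cluster2pair() is
# defined elsewhere in the same module.
if __name__ == '__main__':
    train_c, valid_c, test_c = load_cluster_data(if_union=False)
    data_generator(train_c, valid_c, test_c, if_union=False)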
    if col2[i] == []:
        del col1[i]
        del col2[i]
# print(len(df1))

# Convert to the tokenization scheme supported by BERT.
labels = col2
MAX_LEN = 64
device = torch.device("cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)


def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        # Tokenize the word and count the number of subwords it is broken into.
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list.
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times.
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels
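# Hypothetical usage sketch (not in the original snippet): apply the function to
# every sentence/label pair. Assumes col1 holds word-tokenized sentences and col2
# the corresponding per-word labels, as suggested by the code above.
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(col1, col2)
]
tokenized_texts = [pair[0] for pair in tokenized_texts_and_labels]
subword_labels = [pair[1] for pair in tokenized_texts_and_labels]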
# Split the training set.
train1_data, train2_data = train_test_split(train_data, test_size=0.1, random_state=1)
trainloader1 = torch.utils.data.DataLoader(dataset=MyDataset(train1_data, subject_data, stockname_data),
                                           batch_size=BS,
                                           shuffle=True,
                                           collate_fn=collate_fn_link)

for num_words in [num_words_]:
    for max_len in [max_len_]:
        for embedding_name in [Bert_name]:  # ['roberta_wwm', 'wwm', 'ernie']
            bert_path = './Bert/' + embedding_name + '/'
            dataset.tokenizer = BertTokenizer.from_pretrained(bert_path + 'vocab.txt')
            dataset.BERT = BertModel.from_pretrained(bert_path).to(device)
            dataset.BERT.eval()
            dataset.max_len = max_len
            for loss_weight in [loss_weight_]:
                accu_ = 0
                while accu_ < k:
                    # vocab_size includes the pad and unknown tokens, hence the +2.
                    model = Net(vocab_size=len(word_index) + 2,
                                embedding_dim=EMBEDDING_DIM,
                                num_layers=num_layers,
                                hidden_dim=hidden_dim,
                                embedding=embedding,
                                device=device).to(device)
                    optimizer = optim.Adam(model.parameters(), lr=lr)
def __init__(self, config: dict):
    super().__init__(config)
    self.save_treshold = 0.55
    self.modeltype = config["variant"]
    self.tokenizer = BertTokenizer.from_pretrained(
        self.modeltype, cache_dir="./.BERTcache", do_lower_case=True)
import csv

import pandas as pd
import torch
from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM


def impute():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    id_list = []
    labels = []
    texts = []
    predict_texts = []

    with open('masked_f1.tsv', 'r') as f:
        read_tsv = csv.reader(f, delimiter="\t")
        for row in read_tsv:
            id_list.append(row[0])
            labels.append(row[1])
            texts.append(row[3])

    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()

    for i in range(len(texts)):
        repeat_flag = True
        next_predict_text = texts[i]
        while repeat_flag:
            repeat_flag = False
            # if i % 100 == 0:
            #     print("Now: ", i / len(texts))
            text = next_predict_text
            words = text.split()[:290]
            tmp_str = ""
            for word_idx in range(len(words)):
                tmp_str += words[word_idx]
                tmp_str += " "
            texts[i] = tmp_str
            text = texts[i]
            # print(text)
            tokenized_text = tokenizer.tokenize(text)
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

            # Create the segments tensors.
            segments_ids = [0] * len(tokenized_text)

            # Convert inputs to PyTorch tensors.
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])

            # Predict all tokens.
            with torch.no_grad():
                predictions = model(tokens_tensor, segments_tensors)

            if '[MASK]' not in tokenized_text:
                indices = []
                prev_sent_indices = []
            else:
                indices = [p for p, x in enumerate(tokenized_text) if x == '[MASK]']
                prev_sent_indices = [q for q, x in enumerate(text.split()) if x == '[MASK]']
            # print(predictions)
            # print(indices)
            # print("Previous:\n%s" % (text))
            # print(tokenized_text)

            last_index = -2
            predict_result = []
            for each_index in indices:
                if last_index + 1 != each_index:
                    # predicted_index = torch.argmax(predictions[0, each_index]).item()
                    # print(predictions[0, masked_index])
                    sort_result = torch.sort(predictions[0, each_index])[1]
                    final_result = []
                    for j in range(20):
                        curr_item = tokenizer.convert_ids_to_tokens([sort_result[-j - 1].item()])
                        if curr_item[0] in ['.', ',', '-', ';', '?', '!', '|']:
                            pass
                        else:
                            final_result += [curr_item]
                        # print(tokenizer.convert_ids_to_tokens([sort_result[-j-1].item()]))
                    # predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
                    predict_result += [final_result[0]]
                else:
                    repeat_flag = True
                    predict_result += [['[MASK]']]
                last_index = each_index

            if not repeat_flag:
                words = text.split()
                result = ""
                for k in range(len(words)):
                    if words[k] == '[CLS]' or words[k] == '[SEP]':
                        continue
                    elif k not in prev_sent_indices:
                        result += words[k]
                    else:
                        # print(predict_result[indices.index(k)])
                        result += predict_result[prev_sent_indices.index(k)][0].upper()
                    result += ' '
                # print("After: \n%s" % (result))
                predict_texts.append(result)
            else:
                words = text.split()
                result = ""
                for k in range(len(words)):
                    if k not in prev_sent_indices:
                        result += words[k]
                    else:
                        # print(predict_result[indices.index(k)])
                        result += predict_result[prev_sent_indices.index(k)][0].upper()
                    result += ' '
                # print("Next predict: " + result)
                next_predict_text = result

    df_bert = pd.DataFrame({
        'id': id_list,
        'label': labels,
        'alpha': ['a'] * len(predict_texts),
        'text': predict_texts
    })
    df_bert.to_csv('bert_test_f1_seq.tsv', sep='\t', index=False, header=False)
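# Hypothetical invocation (not in the original snippet): running impute() reads
# masked_f1.tsv and writes the imputed sentences to bert_test_f1_seq.tsv in a
# four-column TSV layout (id, label, alpha, text).
if __name__ == '__main__':
    impute()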
import os  # needed below for os.getenv

from pytorch_pretrained_bert import BertTokenizer  # needed for the BERT examples below
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel

# Download links for the pretrained weights and vocabularies can be found in:
# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py
# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

home = os.getenv('HOME')

##################################################################
## BERT
##################################################################
## BertTokenizer
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained(home + '/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased-vocab.txt')
print(tokenizer.max_len)                     # 1000000000000 (512 for the non-large models)
print(len(tokenizer.vocab))                  # 30522 words
print(type(tokenizer.vocab))                 # <class 'collections.OrderedDict'>
print(tokenizer.vocab.get('hello', 0))       # 7592
print(tokenizer.vocab.get('helloworld', 0))  # 0
print(tokenizer.ids_to_tokens.get(7592, 'hello'))   # hello
print(tokenizer.ids_to_tokens.get(75920, 'hello'))  # hello
print(tokenizer.convert_ids_to_tokens([0, 1, 99, 100, 101, 102, 103, 104, 998, 999]))
# ['[PAD]', '[unused0]', '[unused98]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '[unused99]', '[unused993]', '!']

text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)  # Tokenized input
print(tokenized_text)
# ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', 'henson', 'was', 'a', 'puppet', '##eer', '[SEP]']

## Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
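# A minimal continuation sketch (not part of the original snippet): predict the masked
# token with BertForMaskedLM, following the standard pytorch_pretrained_bert usage.
# The checkpoint name 'bert-large-uncased' is an assumption; the snippet above loads
# only a local vocabulary file, so local weights could be substituted.
import torch
from pytorch_pretrained_bert import BertForMaskedLM

indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0] * 7 + [1] * (len(tokenized_text) - 7)  # first sentence = 0, second sentence = 1
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

masked_lm = BertForMaskedLM.from_pretrained('bert-large-uncased')
masked_lm.eval()
with torch.no_grad():
    predictions = masked_lm(tokens_tensor, segments_tensors)
predicted_index = torch.argmax(predictions[0, masked_index]).item()
print(tokenizer.convert_ids_to_tokens([predicted_index])[0])  # in the library's README example this is 'henson'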
class Generater:

    def __init__(self, bert_path):
        vocab_file_name = 'vocab.txt'
        # Load Juman so that Japanese text can be segmented before being fed to BERT.
        self.juman_tokenizer = JumanTokenizer()
        # Load the tokenizer of the pre-trained BERT model.
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False, do_basic_tokenize=False)
        self.vocab_size = len(self.bert_tokenizer.vocab)
        # Load the MaskedLM head of the pre-trained BERT model.
        self.model = BertForMaskedLM.from_pretrained(bert_path)

        # Special/header tokens and punctuation to exclude.
        except_tokens = ["[MASK]",
                         # "[PAD]",
                         "[UNK]", "[CLS]", "[SEP]",
                         "(", ")", "・", "/", "、", "。", "!", "?", "「", "」", "…", "’", "』", "『", ":", "※"
                         ]
        self.except_ids = [self.bert_tokenizer.vocab[token] for token in except_tokens]
        # Every id in the vocabulary except the excluded ones is usable as a candidate.
        self.candidate_ids = [i for i in range(self.vocab_size) if i not in self.except_ids]

    def _preprocess_text(self, text):
        # Preprocessing: strip half-width spaces (and '#', for Juman).
        return text.replace(" ", "").replace('#', '')

    def text2tokens(self, text):
        # Remove half-width spaces from the text.
        preprocessed_text = self._preprocess_text(text)
        # Segment the Japanese text into a list of tokens.
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # Join the tokens with half-width spaces and run the BERT word-piece tokenizer.
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # The sequence length is capped at 128: header + 126 tokens + footer.
        # Convert the tokens to ids.
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len - 2
        generated_token_ids = torch.tensor(ids).reshape(1, -1)
        return generated_token_ids

    def tokens2text(self, tokens):
        sampled_sequence = [self.bert_tokenizer.ids_to_tokens[token_id]
                            for token_id in tokens[0].cpu().numpy()]
        sampled_sequence = "".join(
            [
                token[2:] if token.startswith("##") else token
                for token in list(filter(lambda x: x != '[PAD]' and x != '[CLS]' and x != '[SEP]',
                                         sampled_sequence))
            ]
        )
        return sampled_sequence

    def likelihood(self, tokens):
        outputs = self.model(tokens)
        predictions = outputs[0]
        score_sum = 0.0
        for idx, scores in zip(tokens[0].tolist(), predictions[0].tolist()):
            score_sum += scores[idx]
        return score_sum

    def initialization_text(self, length=10):
        init_tokens = []
        # Header
        init_tokens.append(self.bert_tokenizer.vocab["[CLS]"])
        for _ in range(length):
            # Pick a token at random.
            init_tokens.append(random.choice(self.candidate_ids))
        # Footer
        init_tokens.append(self.bert_tokenizer.vocab["[SEP]"])
        return torch.tensor(init_tokens).reshape(1, -1)

    def scoring(self, tokens):
        return (self.likelihood(tokens)
                + self.juman_tokenizer.tanka_score_subsets(self.tokens2text(tokens))
                + self.juman_tokenizer.tanka_score_flow(self.tokens2text(tokens)))

    def select(self, l_tokens, size=5):
        scores = list(map(self.scoring, l_tokens))
        print(sorted(scores, reverse=True)[:3])
        selected = list(map(
            lambda x: x[0],
            sorted(
                list(zip(l_tokens, scores)),
                key=lambda x: x[1],
                reverse=True
            )
        ))
        return selected

    def crossover(self, tokens_0, tokens_1):
        l_tokens_0 = tokens_0.numpy().reshape(-1).tolist()
        l_tokens_1 = tokens_1.numpy().reshape(-1).tolist()
        start = random.randint(1, len(l_tokens_0) - 3)
        end = random.randint(start, len(l_tokens_0) - 2)
        for num in range(start, end):
            l_tokens_0[num] = l_tokens_1[num]
        return torch.tensor(l_tokens_0).reshape(1, -1)

    def mutation(self, tokens, N=3):
        l_tokens = tokens.numpy().reshape(-1).tolist()
        for num in range(N):
            num = random.randint(1, len(l_tokens) - 2)
            l_tokens[num] = self.bert_tokenizer.vocab["[MASK]"]
            outputs = self.model(torch.tensor(l_tokens).reshape(1, -1))
            predictions = outputs[0]
            _, predicted_indexes = torch.topk(predictions[0, num], k=10)
            # random_tokens = [random.choice(self.candidate_ids) for i in range(1)]
            random_tokens = []
            predicted_indexes = list(
                set(predicted_indexes.tolist() + random_tokens) - set(self.except_ids)
            )
            predicted_tokens = self.bert_tokenizer.convert_ids_to_tokens(predicted_indexes)
            predict_token = random.choice(predicted_indexes)
            l_tokens[num] = predict_token
        return torch.tensor(l_tokens).reshape(1, -1)
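# Hypothetical driver sketch (not part of the original source): evolve a population of
# token sequences with the genetic-algorithm primitives defined above. The bert_path
# value, the population size, and the number of generations are assumptions.
if __name__ == '__main__':
    generater = Generater('./Japanese_L-12_H-768_A-12_E-30_BPE')
    population = [generater.initialization_text(length=20) for _ in range(20)]
    for generation in range(10):
        population = generater.select(population)[:10]  # keep the best-scoring sequences
        children = [generater.crossover(random.choice(population), random.choice(population))
                    for _ in range(10)]
        population = population + [generater.mutation(child) for child in children]
    best = generater.select(population)[0]
    print(generater.tokens2text(best))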