def maximize_coverage(source: CoNLL2003Dataset, target: CoNLL2003Dataset, n: int, tokenizer: BertTokenizer) -> CoNLL2003Dataset: MAX_SEQ_LEN = 150 target_vocab = set() for document in target.documents: for sentence in document: for token, ner_tag in sentence: if token is None: continue word_pieces = tokenizer.tokenize(token) target_vocab.update(word_pieces) annotated_train_sentences = [] for document in source.documents: annotated_train_sentences += document tokenized_train_sentences = [] for i, annotated_sentence in enumerate(annotated_train_sentences): sentence_word_pieces = [] for token, _ in annotated_sentence: if token is None: continue word_pieces = tokenizer.tokenize(token) sentence_word_pieces += word_pieces sentence_word_pieces = set(sentence_word_pieces[:MAX_SEQ_LEN]) coverage = len(target_vocab & sentence_word_pieces) tokenized_train_sentences.append({ "id": i, "set": sentence_word_pieces, "coverage": coverage, }) selected_train_sentences = [] for i in range(n): tokenized_train_sentences.sort(key=lambda s: s["coverage"]) best_sentence = tokenized_train_sentences.pop() selected_train_sentences.append( annotated_train_sentences[best_sentence["id"]]) new_word_pieces = target_vocab & best_sentence["set"] for new_word_piece in new_word_pieces: target_vocab.remove(new_word_piece) for j in range(len(tokenized_train_sentences)): if new_word_piece in tokenized_train_sentences[j]["set"]: tokenized_train_sentences[j]["set"].remove(new_word_piece) tokenized_train_sentences[j]["coverage"] -= 1 output = deepcopy(source) output.documents = [[sentence] for sentence in selected_train_sentences] return output
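# A minimal, self-contained sketch of the same greedy coverage selection as maximize_coverage
# above, using plain Python sets instead of CoNLL2003Dataset/BertTokenizer objects; the toy
# data and the helper name greedy_cover are invented purely for illustration.
def greedy_cover(candidate_sets, target_vocab, n):
    remaining = set(target_vocab)
    candidates = [set(s) for s in candidate_sets]
    chosen = []
    for _ in range(min(n, len(candidates))):
        # pick the candidate covering the most still-uncovered target pieces
        best = max(range(len(candidates)), key=lambda i: len(candidates[i] & remaining))
        chosen.append(best)
        remaining -= candidates[best]
        candidates[best] = set()
    return chosen

print(greedy_cover([{"bank", "loan"}, {"rate", "fed", "loan"}], {"loan", "fed"}, 1))  # [1]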
class NERDataSet(Dataset): def __init__(self, data_path, config, add_cls=False, add_sep=False): self.config = config self.sents, self.tags = load_tsv(data_path, add_cls=add_cls, add_sep=add_sep) self.tokenizer = BertTokenizer(vocab_file=config.vocab_path, do_lower_case=False) self.tokenize() def __len__(self): return len(self.sents) def tokenize(self): alltok_sents, alltok_tags = [], [] for sent_words, sent_tags in zip(self.sents, self.tags): tok_sent, tok_tag = [], [] for w, t in zip(sent_words, sent_tags): # tokenize the words tokens = self.tokenizer.tokenize(w) tok_ids = self.tokenizer.convert_tokens_to_ids(tokens) tok_tags = [t] + [self.config.piece_tag] * (len(tokens) - 1) ttags_ids = [self.config.tag2idx[tt] for tt in tok_tags] tok_sent.extend(tok_ids) tok_tag.extend(ttags_ids) alltok_sents.append(tok_sent) alltok_tags.append(tok_tag) self.tok_sents = alltok_sents self.tok_tags = alltok_tags def __getitem__(self, idx): return self.tok_sents[idx], self.tok_tags[idx]
class JapaneseWorker:
    def __init__(self):
        self.juman_tokenizer = JumanTokenizer()
        self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'],
                                            do_basic_tokenize=False)
        self.cls_id = self.bert_tokenizer.vocab['[CLS]']
        self.mask_id = self.bert_tokenizer.vocab['[MASK]']
        self.bert_model = 'PATH_TO_BERTJPN'
        self.cp = 'checkpoint/jp/cp_step_710000.pt'
        self.opt = 'checkpoint/jp/opt_step_710000.pt'

    @staticmethod
    def linesplit(src):
        """
        :param src: type str, article as a single string
        :return: type list, punctuation-separated sentences
        """
        def remove_newline(x):
            x = x.replace('\n', '')
            return x

        def remove_blank(x):
            x = x.replace(' ', '')
            return x

        def remove_unknown(x):
            unknown = ['\u3000']
            for h in unknown:
                x = x.replace(h, '')
            return x

        src = remove_blank(src)
        src = remove_newline(src)
        src = remove_unknown(src)
        src_line = re.split('。(?<!」)|!(?<!」)|?(?!」)', src)
        src_line = [x for x in src_line if x != '']
        return src_line

    def tokenizer(self, src):
        """
        :param src: type list, punctuation-separated sentences
        :return: token: type list, word-piece tokens
                 token_id: type list, numericized token ids
        """
        token = []
        token_id = []

        def _preprocess_text(text):
            return text.replace(" ", "")  # for Juman

        for sentence in src:
            preprocessed_text = _preprocess_text(sentence)
            juman_tokens = self.juman_tokenizer(preprocessed_text)
            tokens = self.bert_tokenizer.tokenize(" ".join(juman_tokens))
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
            token += tokens
            token_id += ids
        return token, token_id
class BertWithJumanModel(): def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False,is_tokenized=False): self.juman_tokenizer = JumanTokenizer() self.model = BertModel.from_pretrained(bert_path) self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name, do_lower_case=False, do_basic_tokenize=False) self.use_cuda = use_cuda self.is_tokenized = is_tokenized def _preprocess_text(self, text): return text.replace(" ", "") # for Juman def get_sentence_embedding(self, text, pooling_layer=-2, pooling_strategy="REDUCE_MEAN"): if not self.is_tokenized: preprocessed_text = self._preprocess_text(text) tokens = self.juman_tokenizer.tokenize(preprocessed_text) bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens)) else: bert_tokens = self.bert_tokenizer.tokenize(" ".join(text)) ids = self.bert_tokenizer.convert_tokens_to_ids(["[CLS]"] + bert_tokens[:126] + ["[SEP]"]) # max_seq_len-2 tokens_tensor = torch.tensor(ids).reshape(1, -1) if self.use_cuda: tokens_tensor = tokens_tensor.to('cuda') self.model.to('cuda') self.model.eval() with torch.no_grad(): all_encoder_layers, _ = self.model(tokens_tensor) embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0] if pooling_strategy == "REDUCE_MEAN": return np.mean(embedding, axis=0) elif pooling_strategy == "REDUCE_MAX": return np.max(embedding, axis=0) elif pooling_strategy == "REDUCE_MEAN_MAX": return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)] elif pooling_strategy == "CLS_TOKEN": return embedding[0] else: raise ValueError("specify valid pooling_strategy: {REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}")
def get_sample_bert_token_id_seq(bert_tokenizer: BertTokenizer, left_seq_str,
                                 right_seq_str, max_seq_len):
    left_bert_token_seq = bert_tokenizer.tokenize(left_seq_str)
    right_bert_token_seq = bert_tokenizer.tokenize(right_seq_str)
    if len(right_bert_token_seq) + 3 > max_seq_len:
        right_bert_token_seq = right_bert_token_seq[:max_seq_len - 3]
    if len(right_bert_token_seq) + len(left_bert_token_seq) + 3 > max_seq_len:
        left_bert_token_seq = left_bert_token_seq[:max_seq_len - len(right_bert_token_seq) - 3]
    bert_token_seq = ['[CLS]'] + left_bert_token_seq + ['[SEP]'] + right_bert_token_seq + ['[SEP]']
    # print(bert_token_seq)
    bert_token_id_seq = bert_tokenizer.convert_tokens_to_ids(bert_token_seq)
    return bert_token_id_seq
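# Hedged usage sketch for get_sample_bert_token_id_seq above; it assumes the
# pytorch_pretrained_bert package and the public 'bert-base-uncased' vocabulary
# are available, which may differ from the original project's setup.
from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
id_seq = get_sample_bert_token_id_seq(tokenizer,
                                      'what is the capital of france ?',
                                      'paris is the capital of france .',
                                      max_seq_len=32)
# id_seq encodes: [CLS] left tokens [SEP] right tokens [SEP], truncated to max_seq_len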
class BertWithJumanModel(): """学習済みBertを使うやつ Fork:https://github.com/yagays/pytorch_bert_japanese""" def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False): self.juman_tokenizer = JumanTokenizer() self.model = BertModel.from_pretrained(bert_path) self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name, do_lower_case=False, do_basic_tokenize=False) self.use_cuda = use_cuda def _preprocess_text(self, text): return text.replace(" ", "") def get_sentence_embedding(self, text, pooling_layer=-2, pooling_strategy="REDUCE_MEAN"): preprocessed_text = self._preprocess_text(text) n = math.ceil(len(preprocessed_text) / 2048) result = [ preprocessed_text[idx:idx + n] for idx in range(0, len(preprocessed_text), n) ] tokens = [] for t in result: tokens += self.juman_tokenizer.tokenize(t) bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens)) ids = self.bert_tokenizer.convert_tokens_to_ids( ["[CLS]"] + bert_tokens[:126] + ["[SEP]"]) # max_seq_len-2 tokens_tensor = torch.tensor(ids).reshape(1, -1) if self.use_cuda: tokens_tensor = tokens_tensor.to('cuda') self.model.to('cuda') self.model.eval() with torch.no_grad(): all_encoder_layers, _ = self.model(tokens_tensor) embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0] if pooling_strategy == "REDUCE_MEAN": return np.mean(embedding, axis=0) elif pooling_strategy == "REDUCE_MAX": return np.max(embedding, axis=0) elif pooling_strategy == "REDUCE_MEAN_MAX": return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)] elif pooling_strategy == "CLS_TOKEN": return embedding[0] else: raise ValueError( "specify valid pooling_strategy: {REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}" )
def get_words_for_blank_slow_decode(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer): random.seed(42) np.random.seed(42) torch.manual_seed(42) mask_positions = [] tokenized_text = tokenizer.tokenize(text) top_words_all = [] for i in range(len(tokenized_text)): if tokenized_text[i] == '_': tokenized_text[i] = '[MASK]' mask_positions.append(i) while mask_positions: top_words = [] # Convert tokens to vocab indices token_ids = tokenizer.convert_tokens_to_ids(tokenized_text) tokens_tensor = torch.tensor([token_ids]) # Call BERT to calculate unnormalized probabilities for all pos model.eval() predictions = model(tokens_tensor) # get predictions mask_preds = predictions[0, mask_positions, :] candidates = [] #(word, prob) for mask_pos in mask_positions: mask_preds = predictions[0, mask_pos, :] top_idxs = mask_preds.detach().numpy().argsort()[::-1] top_idx = top_idxs[0] top_prob = mask_preds[top_idx] top_word = tokenizer.ids_to_tokens[top_idx] candidates.append((top_word, top_prob.detach().item())) top_words_pos = [] for i in top_idxs[:20]: top_words_pos.append((tokenizer.ids_to_tokens[i], mask_preds[i].detach().item())) top_words.append(top_words_pos) best_candidate = max(candidates, key = lambda x: x[1]) best_pos = mask_positions[candidates.index(best_candidate)] tokenized_text[best_pos] = best_candidate[0] mask_positions = [i for i in mask_positions if i != best_pos] top_words_all.append(top_words[candidates.index(best_candidate)]) pred_sent = ' '.join(tokenized_text).replace(' ##', '') return (pred_sent, top_words_all)
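# Hedged usage sketch for get_words_for_blank_slow_decode above, assuming the
# 'bert-base-uncased' weights and vocabulary are available via
# pytorch_pretrained_bert; the original caller may load a different checkpoint.
from pytorch_pretrained_bert import BertForMaskedLM, BertTokenizer

tok = BertTokenizer.from_pretrained('bert-base-uncased')
mlm = BertForMaskedLM.from_pretrained('bert-base-uncased')
sentence, top_words = get_words_for_blank_slow_decode('the _ sat on the _ .', mlm, tok)
# blanks ('_') are filled one at a time, most confident position first
print(sentence)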
class FedPredictDataset(Dataset): def __init__(self, texts, vocab_path, max_seq_length=512, vocab='finance-uncased'): self.texts = texts self.dict_labels = {'lower': 0, 'maintain': 1, 'raise': 2} self.max_seq_length = max_seq_length self.vocab = vocab if self.vocab == 'finance-uncased': self.tokenizer = BertTokenizer(vocab_file=vocab_path, do_lower_case=True, do_basic_tokenize=True) def __len__(self): return len(self.texts) def __getitem__(self, index): tokenized_review = self.tokenizer.tokenize(self.texts[index]) if len(tokenized_review) > self.max_seq_length: tokenized_review = tokenized_review[:self.max_seq_length] ids_review = self.tokenizer.convert_tokens_to_ids(tokenized_review) mask_input = [1] * len(ids_review) padding = [0] * (self.max_seq_length - len(ids_review)) ids_review += padding mask_input += padding input_type = [0] * self.max_seq_length assert len(ids_review) == self.max_seq_length assert len(mask_input) == self.max_seq_length assert len(input_type) == self.max_seq_length ids_review = torch.tensor(ids_review) mask_input = torch.tensor(mask_input) input_type = torch.tensor(input_type) input_feature = { "token_type_ids": input_type, "attention_mask": mask_input, "input_ids": ids_review } return input_feature
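# Hedged usage sketch for FedPredictDataset above; 'finance_vocab.txt' is a
# placeholder for the FinBERT vocabulary file the class expects.
from torch.utils.data import DataLoader

texts = ['The committee decided to maintain the target range for the federal funds rate.']
dataset = FedPredictDataset(texts, vocab_path='finance_vocab.txt', max_seq_length=512)
loader = DataLoader(dataset, batch_size=8, shuffle=False)
for features in loader:
    # features['input_ids'], features['attention_mask'], features['token_type_ids']
    pass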
def _bert_embed_sentence(sentence, bert_model: BertModel, bert_tokenizer: BertTokenizer):
    text = "[CLS] {} [SEP]".format(sentence)
    tokenized_text = bert_tokenizer.tokenize(text)
    indexed_tokens = bert_tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_ids = [0] * len(indexed_tokens)
    segments_tensors = torch.tensor([segments_ids])
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokens_tensor = tokens_tensor.to(device)
    segments_tensors = segments_tensors.to(device)
    with torch.no_grad():
        encoded_layers, _ = bert_model(tokens_tensor, segments_tensors,
                                       output_all_encoded_layers=False)
    # Embedding of the [CLS] token
    return encoded_layers[0][0]
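# Hedged usage sketch for _bert_embed_sentence above; output_all_encoded_layers
# is a pytorch_pretrained_bert argument, so that package is assumed here, and the
# model is explicitly moved to the same device the function puts its tensors on.
import torch
from pytorch_pretrained_bert import BertModel, BertTokenizer

bert_tok = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')
bert.to('cuda' if torch.cuda.is_available() else 'cpu')
bert.eval()
cls_vector = _bert_embed_sentence('markets rallied after the announcement', bert, bert_tok)
print(cls_vector.shape)  # 768-dimensional [CLS] embedding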
def convert_data2(path1, path2, max_length, number, seq1, seq2):
    """Convert tokens to ids and pad, after adding the [CLS] and [SEP] tokens."""
    tokenizer = BertTokenizer('./model/bert-base-chinese/vocab.txt')
    input_id = []
    input_mask = []
    segment_id = []
    # number = 0
    print(len(seq1))
    for i in range(number):
        tokens_a = tokenizer.tokenize(seq1[i])
        tokens_b = tokenizer.tokenize(seq2[i])
        # print(seq2[i])
        # print(tokens_b)
        while True:
            if (len(tokens_a) + len(tokens_b)) <= max_length - 3:
                break
            else:
                # print(tokens_b)
                # tokens_b.pop()
                tokens_a = tokens_a[: int((max_length - 3) * len(tokens_a) / (len(tokens_a) + len(tokens_b)))]
                tokens_b = tokens_b[: int((max_length - 3) * len(tokens_b) / (len(tokens_a) + len(tokens_b)))]
        # add the [CLS] and [SEP] tags at the start and end
        tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
        tokens = tokens_a + tokens_b + ['[SEP]']
        input_id_ = tokenizer.convert_tokens_to_ids(tokens)
        segment_id_ = [0] * len(tokens_a) + [1] * (len(tokens_b) + 1)
        input_mask_ = [1] * len(tokens)
        # segment_id distinguishes tokens_a from tokens_b
        # input_mask distinguishes real tokens from padding
        padding_ = [0] * (max_length - len(tokens))
        # every input fed to BERT must be padded up to max_length
        input_id_ += padding_
        segment_id_ += padding_
        input_mask_ += padding_
        # append each sentence to the lists: shape [sentence_num, MAX_LENGTH]
        input_id.append(input_id_)
        input_mask.append(input_mask_)
        segment_id.append(segment_id_)
    return input_id, input_mask, segment_id
def predict_word(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer, tgt_word: str, tgt_pos: int): # print('Template sentence: ', text) mask_positions = [] # insert mask tokens tokenized_text = tokenizer.tokenize(text) for i in range(len(tokenized_text)): if tokenized_text[i] == '_': tokenized_text[i] = '[MASK]' mask_positions.append(i) # Convert tokens to vocab indices token_ids = tokenizer.convert_tokens_to_ids(tokenized_text) tokens_tensor = torch.tensor([token_ids]) # Call BERT to calculate unnormalized probabilities for all pos model.eval() predictions = model(tokens_tensor) # normalize by softmax predictions = F.softmax(predictions, dim=2) # For the target word position, get probabilities for each word of interest normalized = predictions[0, tgt_pos, :] out_prob = normalized[tokenizer.vocab[tgt_word]].item() # Also, fill in all blanks by max prob, and print for inspection for mask_pos in mask_positions: predicted_index = torch.argmax(predictions[0, mask_pos, :]).item() predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] tokenized_text[mask_pos] = predicted_token for mask_pos in mask_positions: tokenized_text[mask_pos] = "_" + tokenized_text[mask_pos] + "_" pred_sent = ' '.join(tokenized_text).replace(' ##', '') # print(pred_sent) return out_prob, pred_sent
class mod_eventclass(BasePlugin): """ Web Scraping plugin: mod_eventclass For classifying news events. """ minArticleLengthInChars = 400 pluginType = Types.MODULE_DATA_PROCESSOR # implies data post-processor dataFrame = None device = None model = None sentencesColList = [ 'url', 'sentence', 'sentence_no', 'neutral_prob', 'positive_prob', 'negative_prob' ] sentencesRec = None def __init__(self): """ Initialize the object """ super().__init__() def additionalConfig(self, sessionHistoryObj): """ Perform additional configuration that is specific to this plugin. :param sessionHistoryObj: The session history object to be used by this plugin for putting items into the data processing competed queue. :return: """ self.workDir = self.app_config.data_dir self.sessionHistDB = sessionHistoryObj self.pretuned_modelfile = self.app_config.checkAndSanitizeConfigString( 'plugins', 'mod_eventclass_modelfile') self.model_weights_path = self.app_config.checkAndSanitizeConfigString( 'plugins', 'mod_eventclass_weightspath') self.vocab_path = self.app_config.checkAndSanitizeConfigString( 'plugins', 'mod_eventclass_vocab_path') self.labels = {0: 'neutral', 1: 'positive', 2: 'negative'} # TODO: fix model load error: self.setupModel() self.sentencesRec = pd.DataFrame(np.zeros( (1, len(self.sentencesColList)), dtype=np.unicode_), columns=self.sentencesColList) # convert last 4 into float32 dtype for colname in [ "sentence_no", "neutral_prob", "positive_prob", "negative_prob" ]: self.sentencesRec[colname] = pd.to_numeric( self.sentencesRec[colname]) def setupModel(self): """ Load the classification model. """ num_labels = len(self.labels) vocab_type = "finance-uncased" self.max_seq_length = 256 if torch.cuda.is_available(): self.device = torch.device("cuda") else: self.device = torch.device("cpu") self.model = BertClassification(weight_path=self.model_weights_path, num_labels=num_labels, vocab=vocab_type) self.model.load_state_dict( torch.load(self.pretuned_modelfile, map_location=self.device)) self.model.to(self.device) self.tokenizer = BertTokenizer(vocab_file=self.vocab_path, do_lower_case=True, do_basic_tokenize=True) def processDataObj(self, newsEventObj): """ Process given data object by this plugin. :param newsEventObj: The NewsEvent object to be classified. :type newsEventObj: NewsEvent """ assert type(newsEventObj) == NewsEvent # Do not proceed if the articles has already been classified, i.e. contains scores if newsEventObj.getClassification() is None: # TODO: lock file to avoid conflicting writes, release lock at the end of the method logger.debug( f"Started news event classification for data in: {newsEventObj.getFileName()}" ) classificationObj = self.classifyText(newsEventObj.getText(), newsEventObj.getURL()) # put classification field in NewsEvent document: newsEventObj.setClassification(classificationObj) # prepare filename: fileNameWOExt = newsEventObj.getFileName().replace('.json', '') # save document to file: newsEventObj.writeFiles(fileNameWOExt, '', saveHTMLFile=False) logger.info( f"Completed classifying news event in: {fileNameWOExt} as: {classificationObj}" ) def classifyText(self, textValue, url): """ Examine and classify the text from the document and return classification scores text. :param textValue: Text to be examined and classified. 
:type textValue: str :return: Classification scores :rtype: dict{str:float} """ sentenceDF = None classificationScores = { 'positive': 0.0, 'neutral': 0.0, 'negative': 0.0 } try: logger.debug( f'Classifying using finbert model for text of length {len(textValue)}' ) if len(textValue) > self.minArticleLengthInChars: thisRec = self.sentencesRec.copy(deep=True) thisRec['url'] = url sentences = sent_tokenize(textValue.lower()) self.model.eval() for index, sent in enumerate(sentences): thisRec['sentence'] = sent thisRec['sentence_no'] = index # apply model on the sentence to get classification scores [neutralProb, positiveProb, negativeProb] = self.classifySentences(sent) thisRec['neutral_prob'] = neutralProb thisRec['positive_prob'] = positiveProb thisRec['negative_prob'] = negativeProb if sentenceDF is None: sentenceDF = thisRec else: sentenceDF = sentenceDF.append(thisRec) aggscores = sentenceDF.groupby('url').agg({ 'neutral_prob': 'sum', 'positive_prob': 'sum', 'negative_prob': 'sum' }) classificationScores = { 'positive': aggscores['positive_prob'][0], 'neutral': aggscores['neutral_prob'][0], 'negative': aggscores['negative_prob'][0] } except Exception as e: print("Error getting sentence classification:", e) return (classificationScores) def classifySentences(self, sent): """ Classify one text sentence at a time. """ tokenized_sent = self.tokenizer.tokenize(sent) if len(tokenized_sent) > self.max_seq_length: tokenized_sent = tokenized_sent[:self.max_seq_length] ids_review = self.tokenizer.convert_tokens_to_ids(tokenized_sent) mask_input = [1] * len(ids_review) padding = [0] * (self.max_seq_length - len(ids_review)) ids_review += padding mask_input += padding input_type = [0] * self.max_seq_length input_ids = torch.tensor(ids_review).to(self.device).reshape(-1, 256) attention_mask = torch.tensor(mask_input).to(self.device).reshape( -1, 256) token_type_ids = torch.tensor(input_type).to(self.device).reshape( -1, 256) with torch.set_grad_enabled(False): outputs = self.model(input_ids, token_type_ids, attention_mask) outputs = F.softmax(outputs, dim=1) # print('\n FinBERT predicted sentiment: ', labels[torch.argmax(outputs).item()]) return ([i.item() for i in outputs.data[0]])
cnn_sentence = word_tokenize(text.lower())
cnn_sentence = [cnn_vocabulary.w2i[w] for w in cnn_sentence]
cnn_sentence += [cnn_vocabulary.w2i['[SEP]']]
cnn_sentence = [cnn_vocabulary.w2i['[CLS]']] + cnn_sentence
sent_len = [len(cnn_sentence)]
cnn_sentence = torch.tensor([cnn_sentence]).type(torch.LongTensor).to(device)
cnn_prob = cnn_model(cnn_sentence, (cnn_sentence > 0))
final_prob += cnn_prob
lstm_prob = lstm_model(cnn_sentence, sent_len)
final_prob += lstm_prob
bert_sentence = tokenizer.tokenize(text)
bert_sentence = [bert_vocabulary.w2i[w] for w in bert_sentence]
bert_sentence += [bert_vocabulary.w2i['[SEP]']]
bert_sentence = [bert_vocabulary.w2i['[CLS]']] + bert_sentence
bert_sentence = torch.tensor([bert_sentence]).type(torch.LongTensor).to(device)
bert_sent_prob = bert_sent_model(bert_sentence)
final_prob += bert_sent_prob
bert_word_prob = bert_word_model(bert_sentence)
final_prob += bert_word_prob
_, pred_topic = torch.max(final_prob, 1)
pred_topic = pred_topic.cpu().numpy()[0]
results.append(i2t[pred_topic])
def evaluate(args:Dict): model_root = args['--model-root'] if args['--model-root'] else './models' print("load model from {}".format(model_root), file=sys.stderr) dataLoader = sentence.Sentence(args['--test-src']) device = torch.device("cuda:0" if args['--cuda'] else "cpu") output_model_file = os.path.join(model_root, "model_file.bin") output_config_file = os.path.join(model_root, "config_file.bin") output_vocab_file = os.path.join(model_root, "vocab.txt") config = BertConfig.from_json_file(output_config_file) model = BertForTokenClassification(config,num_labels=len(dataLoader.tag2idx)) state_dict = torch.load(output_model_file) model.load_state_dict(state_dict) tokenizer = BertTokenizer(output_vocab_file, do_lower_case=False) tokenized_texts = [tokenizer.tokenize(sent) for sent in dataLoader.sentences] if args['--cuda']: model = model.to(torch.device("cuda:0")) MAX_LEN = int(args['--max-len']) input_ids_test = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") tags_test = pad_sequences([[dataLoader.tag2idx.get(l) for l in lab] for lab in dataLoader.labels], maxlen=MAX_LEN, value=dataLoader.tag2idx["O"], padding="post", dtype="long", truncating="post") attention_masks_test = [[float(i > 0) for i in ii] for ii in input_ids_test] for i, inp in enumerate(input_ids_test): if (102 not in inp): inp[-1] = 102 tags_test[i][-1] = dataLoader.tag2idx.get("O") te_inputs = torch.tensor(input_ids_test).to(torch.int64) te_tags = torch.tensor(tags_test).to(torch.int64) te_masks = torch.tensor(attention_masks_test) test_data = TensorDataset(te_inputs, te_masks, te_tags) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=int(args['--batch-size'])) model.eval() predictions = [] true_labels = [] eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in test_dataloader: batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch with torch.no_grad(): tmp_eval_loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) logits = logits.detach().cpu().numpy() predictions.extend([list(p) for p in np.argmax(logits, axis=2)]) label_ids = b_labels.to('cpu').numpy() true_labels.append(label_ids) tmp_eval_accuracy = flat_accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += b_input_ids.size(0) nb_eval_steps += 1 pred_tags = [[dataLoader.tags_vals[p_i] for p_i in p] for p in predictions] test_tags = [[dataLoader.tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l] tags_test_fin = list() for l in tags_test: temp_tag = list() for l_i in l: temp_tag.append(dataLoader.tags_vals[l_i]) tags_test_fin.append(temp_tag) print("Test loss: {}".format(eval_loss / nb_eval_steps)) print("Test Accuracy: {}".format(eval_accuracy / nb_eval_steps)) print("Test F1-Score: {}".format(f1_score(tags_test_fin, pred_tags))) print(classification_report(tags_test_fin, pred_tags)) print("Number of Test sentences: ", len(tags_test_fin))
class Vocabulary:
    """Vocabulary for Dataset
    Read the train dataset's words into the vocabulary;
    Conduct necessary preprocessing;
    Obtain the word2index and topic2index dictionaries;
    """

    def __init__(self):
        self.tokenizer = BertTokenizer(
            vocab_file=os.path.join(main_dir, 'pretrained_bert',
                                    'uncased_L-12_H-768_A-12', 'vocab.txt'))
        # generate w2i, t2i, and train data
        self.get_vocab()
        # self.get_dataset(split='train')

    def get_num_words(self):
        return len(self.w2i)

    def get_num_topics(self):
        return len(self.t2i)

    def get_dataset(self, split):
        if split == 'train':
            try:
                return self.train_data
            except:
                self.train_data = self.read_dataset(
                    os.path.join(main_dir, 'data/topicclass_train.txt'))
                return self.train_data
        elif split == 'valid':
            try:
                return self.valid_data
            except:
                self.valid_data = self.read_dataset(
                    os.path.join(main_dir, 'data/topicclass_valid.txt'))
                return self.valid_data
        elif split == 'test':
            try:
                return self.test_data
            except:
                self.test_data = self.read_dataset(
                    os.path.join(main_dir, 'data/topicclass_test.txt'))
                return self.test_data
        else:
            raise ValueError("Unknown split, split must be in train/valid/test!")

    def get_vocab(self):
        """ Generate vocabulary from the train dataset """
        # create word2index and topic2index dicts
        w2i = defaultdict(lambda: len(w2i))
        filename = os.path.join(
            main_dir, 'pretrained_bert/uncased_L-12_H-768_A-12/vocab.txt')
        with open(filename, "r") as f:
            for word in f:
                index = w2i[word.rstrip('\n')]
        UNK = w2i['[UNK]']
        # freeze word2index so that any new words in the valid and test datasets map to unknown
        self.w2i = defaultdict(lambda: UNK, w2i)
        # self.t2i
        self.t2i = defaultdict(lambda: len(self.t2i))
        filename = os.path.join(main_dir, 'data/topicclass_train.txt')
        self.train_data = []
        with open(filename, "r") as f:
            for line in tqdm(f):
                topic, text = line.lower().strip().split(" ||| ")
                sentence = self.tokenizer.tokenize(text)
                sentence = [self.w2i[w] for w in sentence]
                sentence += [self.w2i['[SEP]']]
                sentence = [self.w2i['[CLS]']] + sentence
                # make train data
                self.train_data.append((sentence, self.t2i[topic]))

    def read_dataset(self, filename):
        """ Read raw data using word2index and topic2index """
        data = []
        logger.info("Reading {} into dataset...".format(filename))
        with open(filename, "r") as f:
            for line in tqdm(f):
                topic, text = line.lower().strip().split(" ||| ")
                sentence = self.tokenizer.tokenize(text)
                sentence = [self.w2i[w] for w in sentence]
                sentence += [self.w2i['[SEP]']]
                sentence = [self.w2i['[CLS]']] + sentence
                data.append((sentence, self.t2i[topic]))
        return data
def bert_sentence_pair_preprocessing(dataset: pd.DataFrame, tokenizer: BertTokenizer, max_sequence_length=64):
    max_bert_input_length = 70
    dataset_input_ids = torch.empty((len(dataset), max_bert_input_length), dtype=torch.long)
    dataset_token_type_ids = torch.empty((len(dataset), max_bert_input_length), dtype=torch.long)
    dataset_attention_masks = torch.empty((len(dataset), max_bert_input_length), dtype=torch.long)
    dataset_lengths = torch.empty((len(dataset), 1), dtype=torch.long)
    dataset_labels = torch.empty((len(dataset), 1), dtype=torch.long)
    dataset_other_type_ids = torch.empty((len(dataset), 18), dtype=torch.long)
    # dataset_input_tensors = torch.empty(len(dataset), 4, max_bert_input_length, dtype=torch.float)
    for idx, data in dataset.iterrows():
        tokens = []
        input_type_ids = []
        # preprocess the additional (addr/phone/cate/cname) features
        other_type_ids = []
        other_type_ids.append(data['addr0'])
        other_type_ids.append(data['addr1'])
        other_type_ids.append(data['addr2'])
        other_type_ids.append(data['addr3'])
        other_type_ids.append(data['addr4'])
        other_type_ids.append(data['addr5'])
        other_type_ids.append(data['phone0'])
        other_type_ids.append(data['phone1'])
        other_type_ids.append(data['phone2'])
        other_type_ids.append(data['phone3'])
        other_type_ids.append(data['cate0'])
        other_type_ids.append(data['cate1'])
        other_type_ids.append(data['cate2'])
        other_type_ids.append(data['cate3'])
        other_type_ids.append(data['cate4'])
        other_type_ids.append(data['cname0'])
        other_type_ids.append(data['cname1'])
        other_type_ids.append(data['cname2'])
        dataset_other_type_ids[idx] = torch.tensor(other_type_ids, dtype=torch.long)
        sentence_1_tokenized, sentence_2_tokenized = tokenizer.tokenize(data['full_placename1']), tokenizer.tokenize(data['full_placename2'])
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in sentence_1_tokenized:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)
        for token in sentence_2_tokenized:
            tokens.append(token)
            input_type_ids.append(1)
        tokens.append("[SEP]")
        input_type_ids.append(1)
        # convert the preprocessed tokens to vocabulary indices
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # build the attention mask
        attention_masks = [1] * len(input_ids)
        # store the length of input_ids
        dataset_lengths[idx] = torch.tensor(len(input_ids), dtype=torch.long)
        while len(input_ids) < max_bert_input_length:
            input_ids.append(0)
            attention_masks.append(0)
            input_type_ids.append(0)
        dataset_input_ids[idx] = torch.tensor(input_ids, dtype=torch.long)
        dataset_token_type_ids[idx] = torch.tensor(input_type_ids, dtype=torch.long)
        dataset_attention_masks[idx] = torch.tensor(attention_masks, dtype=torch.long)
        dataset_labels[idx] = torch.tensor(data['label'], dtype=torch.long)
    return dataset_input_ids, dataset_token_type_ids, dataset_attention_masks, dataset_other_type_ids, dataset_lengths, dataset_labels
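# Hedged usage sketch for bert_sentence_pair_preprocessing above; the one-row
# DataFrame, its values, and the choice of 'bert-base-multilingual-cased' are
# assumptions made for illustration, mirroring the columns read inside the function.
import pandas as pd
from pytorch_pretrained_bert import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
row = {'full_placename1': 'seoul coffee gangnam', 'full_placename2': 'seoul coffee', 'label': 1}
row.update({f'addr{i}': 1 for i in range(6)})
row.update({f'phone{i}': 0 for i in range(4)})
row.update({f'cate{i}': 1 for i in range(5)})
row.update({f'cname{i}': 1 for i in range(3)})
train_df = pd.DataFrame([row])
input_ids, token_type_ids, attention_masks, other_type_ids, lengths, labels = \
    bert_sentence_pair_preprocessing(train_df, tokenizer)
train_dataset = TensorDataset(input_ids, token_type_ids, attention_masks, other_type_ids, lengths, labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)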
class for_BERT(): def __init__(self, mode='training'): self.mode = mode with open(dir_path + '/data/tag2idx.json', 'r') as f: self.tag2idx = json.load(f) self.idx2tag = dict(zip(self.tag2idx.values(), self.tag2idx.keys())) # load pretrained BERT tokenizer self.tokenizer = BertTokenizer.from_pretrained( 'bert-base-multilingual-cased', do_lower_case=False) # load BERT tokenizer with untokenizing frames never_split_tuple = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]") added_never_split = [] added_never_split.append('<tgt>') added_never_split.append('</tgt>') added_never_split_tuple = tuple(added_never_split) never_split_tuple += added_never_split_tuple vocab_file_path = dir_path + '/data/bert-multilingual-cased-dict-add-frames' self.tokenizer_with_frame = BertTokenizer( vocab_file_path, do_lower_case=False, max_len=256, never_split=never_split_tuple) def idx2tag(self, predictions): pred_tags = [self.idx2tag[p_i] for p in predictions for p_i in p] # bert tokenizer and assign to the first token def bert_tokenizer(self, text): orig_tokens = text.split(' ') bert_tokens = [] orig_to_tok_map = [] bert_tokens.append("[CLS]") for orig_token in orig_tokens: orig_to_tok_map.append(len(bert_tokens)) bert_tokens.extend(self.tokenizer_with_frame.tokenize(orig_token)) bert_tokens.append("[SEP]") return orig_tokens, bert_tokens, orig_to_tok_map def convert_to_bert_input(self, input_data): tokenized_texts, args = [], [] orig_tok_to_maps = [] for i in range(len(input_data)): data = input_data[i] text = ' '.join(data[0]) orig_tokens, bert_tokens, orig_to_tok_map = self.bert_tokenizer( text) orig_tok_to_maps.append(orig_to_tok_map) tokenized_texts.append(bert_tokens) if self.mode == 'training': ori_args = data[2] arg_sequence = [] for i in range(len(bert_tokens)): if i in orig_to_tok_map: idx = orig_to_tok_map.index(i) ar = ori_args[idx] arg_sequence.append(ar) else: arg_sequence.append('X') args.append(arg_sequence) input_ids = pad_sequences([ self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts ], maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") orig_tok_to_maps = pad_sequences(orig_tok_to_maps, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=-1) if self.mode == 'training': arg_ids = pad_sequences([[self.tag2idx.get(ar) for ar in arg] for arg in args], maxlen=MAX_LEN, value=self.tag2idx["X"], padding="post", dtype="long", truncating="post") attention_masks = [[float(i > 0) for i in ii] for ii in input_ids] data_inputs = torch.tensor(input_ids) data_orig_tok_to_maps = torch.tensor(orig_tok_to_maps) data_masks = torch.tensor(attention_masks) if self.mode == 'training': data_args = torch.tensor(arg_ids) bert_inputs = TensorDataset(data_inputs, data_orig_tok_to_maps, data_args, data_masks) else: bert_inputs = TensorDataset(data_inputs, data_orig_tok_to_maps, data_masks) return bert_inputs
class NER: def __init__(self, encoding, base_model="bert-base-uncased", num_ner=0, tag_dropout=0.3, pos_dropout=0.3, ner_dropout=None, tag_dropout_2=0.3, pos_dropout_2=0.3, ner_dropout_2=None, architecture="simple", ner=False, middle_layer=None): """ There are only two base_model options allowed: "bert-base-uncased" and "finbert-uncased" """ # Fine Tuning parameters self.ner = ner self.num_ner = num_ner self.ner_dropout = ner_dropout self.architecture = architecture self.middle_layer = middle_layer self.tag_dropout = tag_dropout self.pos_dropout = pos_dropout self.tag_dropout_2 = tag_dropout_2 self.pos_dropout_2 = pos_dropout_2 self.ner_dropout_2 = ner_dropout_2 # configuration self.config = config # Accuracies and Losses self.list_train_losses = [] self.list_test_losses = [] self.list_tag_acc = [] self.list_pos_acc = [] # std means standardized, in our case the tags are replaced by integers for the classification self.pos_std = None self.tag_std = None self.device = None # define the encoding of the dataframe if "utf" in encoding.lower(): self.encoding = "utf-8" elif "latin-1" in encoding.lower(): self.encoding = "latin-1" else: self.encoding = encoding # be sure the model's name follows the correct structure self.base_model = base_model.replace("_", "-") # Fix the tokenizer and special tokens if base_model == "bert-base-uncased": self.tokenizer = BertTokenizer( vocab_file=config.BERT_UNCASED_VOCAB, do_lower_case=True, do_basic_tokenize=True) self.special_tokens_dict = special_tokens_dict( config.BERT_UNCASED_VOCAB) elif base_model == "finbert-uncased": self.tokenizer = BertTokenizer( vocab_file=config.FINBERT_UNCASED_VOCAB, do_lower_case=True, do_basic_tokenize=True) self.special_tokens_dict = special_tokens_dict( config.FINBERT_UNCASED_VOCAB) def training(self, saving=True): logger.info("Preprocessing data ...") # We preprocess and normalize (as categories) the data and output it as np.arrays/ pd.series sentences, pos, tag, self.pos_std, self.tag_std = preprocess_data_BERT( self.config.TRAINING_FILE, self.encoding) logger.info("Data has been preprocessed") # Checkpoint for the standardized pos and tag. 
tag <-> integer value logger.info("Making checkpoint for the preprocessed data ...") if saving: data_check_pt = {"pos_std": self.pos_std, "tag_std": self.tag_std} joblib.dump(value=data_check_pt, filename=config.CHECKPOINTS_META_PATH) else: pass # Save the number of classes per classification problem num_tag = len(list(self.tag_std.classes_)) num_pos = len(list(self.pos_std.classes_)) data4 = np.array(num_pos) np.savez(join(config.BASE_DATA_PATH, "num_pos"), data4) data3 = np.array(num_tag) np.savez(join(config.BASE_DATA_PATH, "num_tag"), data3) # Split training set with skl logger.info(" Splitting data and creating data sets ...") self.train_sentences, self.test_sentences, self.train_pos, self.test_pos, self.train_tag, self.test_tag \ = train_test_split(sentences, pos, tag, random_state=42, test_size=0.2) # Format based on Entities_dataset: getitem outputs pandas dataframes self.train = dataset.Entities_dataset( texts=self.train_sentences, pos=self.train_pos, tags=self.train_tag, tokenizer=self.tokenizer, special_tokens=self.special_tokens_dict, model_name=self.base_model) self.test = dataset.Entities_dataset( texts=self.test_sentences, pos=self.test_pos, tags=self.test_tag, tokenizer=self.tokenizer, special_tokens=self.special_tokens_dict, model_name=self.base_model) # Loaders from torch: it formats the data for pytorch and fixes the batch and the num of kernels # "workers" means subprocess no gpus in the cuda self.train_data_loader = DataLoader( self.train, batch_size=self.config.TRAIN_BATCH_SIZE, num_workers=4) self.test_data_loader = DataLoader( self.test, batch_size=self.config.VALID_BATCH_SIZE, num_workers=4) # Load model to device and hyperparameters logger.info("Moving model to cuda ...") self.model_device(phase="train", num_tag=num_tag, num_pos=num_pos) self.hyperparameters() # initialize the loss best_loss = np.inf best_tag_acc = 0 best_pos_acc = 0 # EPOCHS logger.info("Starting Fine-tuning ...") for epoch in range(self.config.EPOCHS): # Training logger.info("Start epoch {}".format(epoch + 1)) train_loss = train_val_loss.train(self.train_data_loader, self.model, self.optimizer, self.device, self.scheduler) test_loss, tag_acc, pos_acc = train_val_loss.validation( self.test_data_loader, self.model, self.device) # Accuracies and Losses logger.info("Train Loss = {}".format(train_loss)) logger.info("Test Loss = {}".format(test_loss)) logger.info("Accuracy for tags is = {}".format(tag_acc)) logger.info("Accuracy for pos is = {}".format(pos_acc)) self.list_train_losses.append(float(train_loss)) self.list_test_losses.append(float(test_loss)) self.list_tag_acc.append(float(tag_acc)) self.list_pos_acc.append(float(pos_acc)) logger.info("End epoch {}".format(epoch + 1)) logger.info("Testing epoch {}".format(epoch + 1)) if test_loss < best_loss: torch.save(self.model.state_dict(), self.config.CHECKPOINTS_MODEL_PATH) best_loss = test_loss if pos_acc > best_pos_acc: best_pos_acc = pos_acc if tag_acc > best_tag_acc: best_tag_acc = tag_acc logger.info("End epoch {} with loss {} asnd best loss {}".format( epoch + 1, test_loss, best_loss)) logger.info("Fine-tuning finished") logger.info("With training losses: {}".format(self.list_train_losses)) logger.info("With test losses: {}".format(self.list_test_losses)) # plotting losses_accuracies = { "Tag accuracy": self.list_tag_acc, "Pos accuracy": self.list_pos_acc, "Train loss": self.list_train_losses, "Test loss": self.list_test_losses } name = "model=" + self.base_model + "_epochs=" + str( config.EPOCHS) + "_test_batch=" name += 
str(config.VALID_BATCH_SIZE) + "_train_batch=" + str( config.TRAIN_BATCH_SIZE) + "_max_len=" name += str(config.MAX_LEN) + "_dropouts=" + str( self.tag_dropout) + "_" + str(self.pos_dropout) name += "_" + str(self.ner_dropout) + "_architecture=" + str( self.architecture) name += '_POS=' + str(best_pos_acc) + '_TAG=' + str(best_tag_acc) ploter(output_path=config.BASE_DATA_PATH, name=name, num_epochs=self.config.EPOCHS, **losses_accuracies) # Saving results data_pos = np.array(self.list_pos_acc) np.savez(join(config.BASE_DATA_PATH, "pos_accuracies_" + name), data_pos) data_tag = np.array(self.list_tag_acc) np.savez(join(config.BASE_DATA_PATH, "tag_accuracies_" + name), data_tag) data1 = np.array(self.list_train_losses) np.savez(join(config.BASE_DATA_PATH, "train_losses_" + name), data1) data2 = np.array(self.list_test_losses) np.savez(join(config.BASE_DATA_PATH, "test_losses_" + name), data2) return best_loss def predict(self, text): """ Given a example text it predicts and prints the tokens and their labels for tag and pos""" # Loading the results num_tag = np.load(join(config.BASE_DATA_PATH, "num_tag.npz")) num_tag = num_tag.f.arr_0 num_pos = np.load(join(config.BASE_DATA_PATH, "num_pos.npz")) num_pos = num_pos.f.arr_0 # check pos and tag if self.pos_std is None: std_data = joblib.load(config.CHECKPOINTS_META_PATH) self.pos_std = std_data["pos_std"] self.tag_std = std_data["tag_std"] else: pass # preprocessing sentence = text.split() # tokenizing tokenized_text = self.tokenizer.tokenize(text) # converting into iterable input for the model tets_text = dataset.Entities_dataset( texts=[sentence], pos=[[0] * len(sentence)], tags=[[0] * len(sentence)], tokenizer=self.tokenizer, special_tokens=self.special_tokens_dict, model_name=self.base_model) # move model to device and fix not update for the gradients since it is a prediction self.model_device(phase="predict", num_tag=num_tag, num_pos=num_pos) with torch.no_grad(): data = tets_text[0] for k, v in data.items(): data[k] = v.to(self.device).unsqueeze(0) tag, pos, _ = self.model(**data) # argmax: max value axis 2, the distribution ; cpu().numpy(): convert to cuda variable print(tokenized_text) print( self.tag_std.inverse_transform( tag.argmax(2).cpu().numpy().reshape(-1)) [1:len(tokenized_text) + 1]) print( self.pos_std.inverse_transform( pos.argmax(2).cpu().numpy().reshape(-1)) [1:len(tokenized_text) + 1]) def model_device(self, phase, num_tag, num_pos): """ Use GPU, load model and move it there -- device or cpu if cuda is not available """ self.device = check_device() self.model = BERT_NER(num_tag=num_tag, num_pos=num_pos, num_ner=self.num_ner, base_model=self.base_model, tag_dropout=self.tag_dropout, pos_dropout=self.pos_dropout, ner_dropout=self.ner_dropout, tag_dropout_2=self.tag_dropout_2, pos_dropout_2=self.pos_dropout_2, ner_dropout_2=self.ner_dropout_2, architecture=self.architecture, ner=self.ner, middle_layer=self.middle_layer) if phase == "train": self.model.to(self.device) elif phase == "predict": self.model.load_state_dict( torch.load(self.config.CHECKPOINTS_MODEL_PATH)) self.model.to(self.device) else: pass def hyperparameters(self): """ This method fix the parameters and makes a filter over to exclude LayerNorm and biases """ # nn.module list of parameters: all parameters from BERT plus the pos and tag layer self.param_optimizer = list(self.model.named_parameters()) # exclude LayerNorm and biases no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_parameters = [{ "params": [ p for n, p in self.param_optimizer if 
not any(nd in n for nd in no_decay) ], "weight_decay": 0.001 }, { "params": [ p for n, p in self.param_optimizer if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }] num_train_steps = int( len(self.train_sentences) / self.config.TRAIN_BATCH_SIZE * self.config.EPOCHS) self.optimizer = AdamW(optimizer_parameters, lr=3e-5) # Scheduler self.scheduler = get_linear_schedule_with_warmup( self.optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
bert.eval()

# ## Setup tokenizer

# In[ ]:

tokenizer = BertTokenizer(
    vocab_file='../input/torch-bert-weights/bert-base-uncased-vocab.txt')

# ## Make prediction

# In[ ]:

# let's tokenize some text (I intentionally misspelled 'plastic' to check BERT's subword handling)
text = 'hi my name is Dieter and I like wearing my yellow pglastic hat while coding.'
tokens = tokenizer.tokenize(text)
tokens

# In[ ]:

# add start and end tokens and convert to ids
tokens = ["[CLS]"] + tokens + ["[SEP]"]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids

# In[ ]:

# put input on gpu and make prediction
bert_output = bert(torch.tensor([input_ids]).cuda())
bert_output
                    help='show top k predictions')

if __name__ == '__main__':
    args = parser.parse_args()
    bert_tokenizer = BertTokenizer(
        vocab_file='/media/lonelyprince7/mydisk/NLP-dataset/bert_models/bert-base-uncased-vocab.txt')
    bert_model = BertForMaskedLM.from_pretrained(
        '/media/lonelyprince7/mydisk/NLP-dataset/bert_models/bert-base-uncased.tar.gz')
    sentences, res = read_data()
    print(res)
    predict_res = []
    mask_cnt = 0
    for sentence in sentences:
        sentence = sentence.strip()
        sentence = sentence.replace('_', '[MASK]')
        # print(sentence)
        tokens = bert_tokenizer.tokenize(sentence)
        if len(tokens) == 0:
            continue
        if tokens[0] != CLS:
            tokens = [CLS] + tokens
        if tokens[-1] != SEP:
            tokens.append(SEP)
        token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer)
        with torch.no_grad():
            logits = bert_model(token_idx, segment_idx, mask, masked_lm_labels=None)
        logits = logits.squeeze(0)
        probs = torch.softmax(logits, dim=-1)
        for idx, token in enumerate(tokens):
            if token == MASK:
                mask_cnt += 1
def main(annotated_text_file=str, text_file=str, vocab_file: str, word_file: FileType, threshold: int, missing_tokens_file: str, output_file: str): nlp = spacy.load('en_core_web_lg', disable=['tokenizer', 'tagger', 'ner', 'textcat']) ###### Find missing tokens # subprocess.call("1_extract_vocab.sh", shell = True) subprocess.check_call("1_extract_vocab.sh -i %s -o %s" % (annotated_text_file, missing_tokens_file), shell=True) vocab_file = expanduser(vocab_file) tokenizer = BertTokenizer(vocab_file, do_lower_case=False) f = open(missing_tokens_file, "w+") print('count,original,splitted', file=f) # file header for line in tqdm(word_file, 'words'): c_word = line.strip().split() if len(c_word) == 1: # word is a space continue count, word = c_word count = int(count) if count < threshold: break tokens = tokenizer.tokenize(word) if len(tokens) > 1: # we have subwords if len(tokens) == 2 and tokens[1] == '##s': continue print(count, word, '#'.join(t.strip('#') for t in tokens), sep=',', file=f) # create csv from that output ######2nd Stage count_unused = 0 vocab = [] count = 0 f = open(missing_tokens_file, "r") for x in f: ### the first line is a warning from bert # if count > 0: vocab.append(x.replace("\n", "").split(',')) count += 1 new_vocab = pd.DataFrame(vocab[1:], columns=vocab[0]) new_vocab['count'] = new_vocab['count'].apply(int) new_vocab.sort_values('count', ascending=False).to_csv('new_vocab.csv') missing_tokens = pd.read_csv('new_vocab.csv') with open(output_file, 'w') as write: with open(vocab_file, 'r') as read: for line in tqdm(read): if '[unused' in line and count_unused < missing_tokens.shape[0]: write.write(missing_tokens.iloc[count_unused]['original'] + '\n') count_unused += 1 else: write.write(line) ### TO SEPARATE THE FULL TEXT INTO DOCUMENTS df = pd.read_csv(text_file, delimiter="\n\n", header=None) docs = df[0].apply(lambda x: x.replace('Operator', "")) #### LOAD THE ANNOTATED DATA with open(annotated_text_file) as json_file: data_annotated = json.load(json_file) content = [] sentiment = [] for i in range(len(data_annotated['data'])): try: content.append(data_annotated['data'][i]['content']) try: sentiment.append( data_annotated['data'][i]['annotation']['sentiment']) except: print('pb sentiment') del content[i] except: print('pb') continue docs2 = pd.Series(content) docs = pd.concat([docs, docs2]) # vocab = [] # count = 0 # f = open("new_vocab.txt", "r") # for x in f: # vocab.append(x.replace("\n", "").split(',')) # count += 1 # # ###### ADDING NEW VOCABULARY # new_vocab = pd.DataFrame(vocab[1:], columns=vocab[0]) # new_vocab['count'] = new_vocab['count'].apply(int) # new_vocab.sort_values('count', ascending=False).to_csv('new_vocab.csv') documents_liste = docs.tolist() ## Writing the documents to separate files : propportion to fasten the execution for i, document in enumerate(documents_liste): if i < len(documents_liste) * 0.99: output_file = 'data/transcript_' + str(i) + '.txt' else: output_file = 'test/transcript_' + str(i) + '.txt' new_file = open(output_file, mode="w+", encoding="utf-8") new_file.write(document) new_file.close()
class WordPieceVectorizer1D(AbstractVectorizer): """Define a Baseline Vectorizer that can do WordPiece with BERT tokenizer If you use tokens=wordpiece, this vectorizer is used, and so then there is a dependency on bert_pretrained_pytorch """ def __init__(self, **kwargs): """Loads a BertTokenizer using bert_pretrained_pytorch :param kwargs: """ super(WordPieceVectorizer1D, self).__init__(kwargs.get('transform_fn')) from pytorch_pretrained_bert import BertTokenizer self.max_seen = 128 handle = kwargs.get('embed_file') custom_vocab = kwargs.get('vocab_file') if custom_vocab is None: self.tokenizer = BertTokenizer.from_pretrained(handle, do_lower_case=True) else: special_tokens = kwargs.get('special_tokens') never_split = ('[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]') + special_tokens self.tokenizer = BertTokenizer(custom_vocab, do_basic_tokenize=True, never_split=never_split) self.mxlen = kwargs.get('mxlen', -1) @property def vocab(self): return self.tokenizer.vocab def count(self, tokens): seen = 0 counter = Counter() for tok in self.iterable(tokens): counter[tok] += 1 seen += 1 self.max_seen = max(self.max_seen, seen) return counter def iterable(self, tokens): for tok in tokens: if tok == '<unk>': yield '[UNK]' elif tok == '<EOS>': yield '[SEP]' else: for subtok in self.tokenizer.tokenize(tok): yield subtok def _next_element(self, tokens, vocab): for atom in self.iterable(tokens): value = vocab.get(atom) if value is None: value = vocab['[UNK]'] yield value def run(self, tokens, vocab): if self.mxlen < 0: self.mxlen = self.max_seen vec1d = np.zeros(self.mxlen, dtype=np.long) for i, atom in enumerate(self._next_element(tokens, vocab)): if i == self.mxlen: i -= 1 break vec1d[i] = atom valid_length = i + 1 return vec1d, valid_length def get_dims(self): return self.mxlen,
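# Hedged usage sketch for WordPieceVectorizer1D above; passing
# 'bert-base-uncased' as the embed_file handle is an assumption, and it relies
# on the surrounding module providing AbstractVectorizer as in the class above.
vectorizer = WordPieceVectorizer1D(embed_file='bert-base-uncased', mxlen=32)
ids, valid_length = vectorizer.run('markets fell sharply <EOS>'.split(), vectorizer.vocab)
# ids is a fixed-length (mxlen) vector of word-piece indices, zero-padded; '<EOS>' maps to '[SEP]'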
class text_dataset(Dataset): def __init__(self, x_y_list, vocab_path, max_seq_length=256, vocab='base-cased', transform=None): self.max_seq_length = max_seq_length self.x_y_list = x_y_list self.vocab = vocab if self.vocab == 'base-cased': self.tokenizer = BertTokenizer.from_pretrained( 'bert-base-cased', do_lower_case=False, do_basic_tokenize=True) elif self.vocab == 'finance-cased': self.tokenizer = BertTokenizer(vocab_file=vocab_path, do_lower_case=False, do_basic_tokenize=True) elif self.vocab == 'base-uncased': self.tokenizer = BertTokenizer.from_pretrained( 'bert-base-uncased', do_lower_case=True, do_basic_tokenize=True) elif self.vocab == 'finance-uncased': self.tokenizer = BertTokenizer(vocab_file=vocab_path, do_lower_case=True, do_basic_tokenize=True) def __getitem__(self, index): tokenized_review = self.tokenizer.tokenize(self.x_y_list[0][index]) if len(tokenized_review) > self.max_seq_length: tokenized_review = tokenized_review[:self.max_seq_length] ids_review = self.tokenizer.convert_tokens_to_ids(tokenized_review) mask_input = [1] * len(ids_review) padding = [0] * (self.max_seq_length - len(ids_review)) ids_review += padding mask_input += padding input_type = [0] * self.max_seq_length assert len(ids_review) == self.max_seq_length assert len(mask_input) == self.max_seq_length assert len(input_type) == self.max_seq_length ids_review = torch.tensor(ids_review) mask_input = torch.tensor(mask_input) input_type = torch.tensor(input_type) sentiment = self.x_y_list[1][index] list_of_labels = [torch.from_numpy(np.array(sentiment))] input_feature = { "token_type_ids": input_type, "attention_mask": mask_input, "input_ids": ids_review } return input_feature, list_of_labels[0] def __len__(self): return len(self.x_y_list[0])
class Preprocess: def __init__(self): self.juman_tokenizer = JumanTokenizer() self.rouge_calculator = RougeNCalc() self.bert_tokenizer = BertTokenizer(config['DEFAULT']['vocab_path'], do_lower_case=False, do_basic_tokenize=False) self.trim_input = 0 self.trim_clss = 0 def __call__(self, data_dic, length): self.src_body = data_dic['body'] self.src_summary = data_dic['summary'].split('<sep>') self._init_data() if self.src_body is '': raise ValueError('Empty data') # step 1. article to lines self._split_line() # step 2. pick extractive summary by rouge self._rougematch() # step 3. tokenize self._tokenize() # step 4. clss process self._prep_clss() # step 5. segs process self._prep_segs() # step 6. trim length for input self._set_length(length) return { 'src': self.tokenid, 'labels': self.label, 'segs': self.segs, 'mask': self.mask, 'mask_cls': self.mask_cls, 'clss': self.clss, 'src_str': self.src_line } def _init_data(self): self.src_line = [] self.label = [] self.tokenid = [] self.token = [] self.clss = [] self.segs = [] self.mask = [] self.mask_cls = [] # step 1. def _split_line(self): # regex note: (?!...) Negative Lookahead # e.g. /foo(?!bar)/ for "foobar foobaz" get "foobaz" only self.src_line = re.split('。(?<!」)|!(?<!」)|?(?!」)', self.src_body) self.src_line = [x for x in self.src_line if x is not ''] # step 2. def _rougematch(self): self.label = [0] * len(self.src_line) for summ in self.src_summary: scores = [self.rouge_calculator(x, summ) for x in self.src_line] self.label[scores.index(max(scores))] = 1 # step 3. def _tokenize(self): def _preprocess_text(text): return text.replace(" ", "") # for Juman for sentence in self.src_line: preprocessed_text = _preprocess_text(sentence) juman_tokens = self.juman_tokenizer(preprocessed_text) tokens = self.bert_tokenizer.tokenize(" ".join(juman_tokens)) tokens = ["[CLS]"] + tokens + ["[SEP]"] ids = self.bert_tokenizer.convert_tokens_to_ids(tokens) self.token += tokens self.tokenid += ids # step 4. def _prep_clss(self): self.clss = [ i for i, x in enumerate(self.tokenid) if x == self.bert_tokenizer.vocab['[CLS]'] ] # step 5. def _prep_segs(self): flag = 1 for idx in self.tokenid: if idx == self.bert_tokenizer.vocab['[CLS]']: flag = not flag self.segs.append(int(flag)) # step 6. def _set_length(self, n): self.__trim_data(n) self.__add_mask(n) def __trim_data(self, n): if len(self.tokenid) > n: # If last sentence starts after 512 if self.clss[-1] > 512: for i, idx in enumerate(self.clss): if idx > n: # Index of last [SEP] in length=n self.trim_input = self.clss[i - 1] - 1 # Index of last [CLS] index in clss self.trim_clss = i - 2 break # If src longer than 512 but last sentence start < 512 else: self.trim_input = self.clss[len(self.clss) - 1] - 1 self.trim_clss = len(self.clss) - 2 # Do nothing if length < n if self.trim_clss * self.trim_input == 0: return self.tokenid = self.tokenid[:(self.trim_input + 1)] self.segs = self.segs[:(self.trim_input + 1)] self.clss = self.clss[:(self.trim_clss + 1)] self.label = self.label[:(self.trim_clss + 1)] self.src_line = self.src_line[:(self.trim_clss + 1)] def __add_mask(self, n): # from index to len: +1 pad_len = (n - len(self.tokenid)) self.tokenid = self.tokenid + ([self.bert_tokenizer.vocab['[MASK]']] * pad_len) self.segs = self.segs + ([int(not self.segs[-1])] * pad_len)
def annotate_example_for_bert( example: Dict, table: Dict, bert_tokenizer: BertTokenizer, table_representation_method: Optional[str] = 'canonical'): e_id = example['id'] # sub-tokenize the question question_tokens = example['tokens'] example['original_tokens'] = question_tokens token_position_map = OrderedDict( ) # map of token index before and after sub-tokenization question_feature = example['features'] cur_idx = 0 new_question_feature = [] question_subtokens = [] for old_idx, token in enumerate(question_tokens): if token == '<DECODE>': token = '[MASK]' if token == '<START>': token = '[MASK]' sub_tokens = bert_tokenizer.tokenize(token) question_subtokens.extend(sub_tokens) token_new_idx_start = cur_idx token_new_idx_end = cur_idx + len(sub_tokens) token_position_map[old_idx] = (token_new_idx_start, token_new_idx_end) new_question_feature.extend([question_feature[old_idx]] * len(sub_tokens)) cur_idx = token_new_idx_end token_position_map[len(question_tokens)] = (len(question_subtokens), len(question_subtokens)) example['tokens'] = question_subtokens example['features'] = new_question_feature for entity in example['entities']: old_token_start = entity['token_start'] old_token_end = entity['token_end'] new_token_start = token_position_map[old_token_start][0] new_token_end = token_position_map[old_token_end][0] entity['token_start'] = new_token_start entity['token_end'] = new_token_end if table_representation_method == 'concate': columns, column_info = get_columns_concate(example, table, bert_tokenizer) elif table_representation_method == 'canonical': columns, column_info = get_columns_canonical(example, table) else: raise RuntimeError('Unknown table representation') # gather table data for column in columns: column.name_tokens = bert_tokenizer.tokenize(str(column.name)) column.sample_value_tokens = bert_tokenizer.tokenize( str(column.sample_value)) rows = [table['kg'][row_id] for row_id in sorted(table['kg'])] valid_rows = [] untokenized_rows = [] for row in rows: valid_row = {} untokenized_row = {} for col in columns: cell_val = row.get(col.raw_name, []) if cell_val: cell_val = str(cell_val[0]) untokenized_row[col.name] = cell_val cell_tokens = bert_tokenizer.tokenize(cell_val) else: cell_tokens = [] untokenized_row[col.name] = '' valid_row[col.name] = cell_tokens valid_rows.append(valid_row) untokenized_rows.append(untokenized_row) table = Table(id=example['context'], header=columns, data=valid_rows, column_info=column_info) untokenized_table = Table(id=example['context'], header=columns, data=untokenized_rows) example['table'] = table example['untokenized_table'] = untokenized_table return example
def home():
    # global model, BERT_FP, bert, tokenizer, nlp
    model = torch.load('model_sciBERT_CRF10.pth')
    BERT_FP = 'scibert_scivocab_uncased'
    bert = BertModel.from_pretrained(BERT_FP)
    tokenizer = BertTokenizer(vocab_file=BERT_FP + '/vocab.txt')
    nlp = en_core_web_sm.load()
    datatowrite = []
    result = ''
    if (request.method == 'POST'):
        token_indices = []
        file_raw = request.form.get('abstract')
        actual_file = open('abstract_str/abstract.txt', 'w')
        actual_file.write(file_raw)
        actual_file.close()
        file = file_raw.lower()
        tokens_list = tokenizer.tokenize(file)
        n = 0
        for i, item in enumerate(tokens_list):
            try:
                start_index = file.index(item.strip('#'))
            except:
                start_index = 100
            # NOTE: `unk` is not defined in this snippet; it is presumably a
            # module-level flag in the original application.
            if ((start_index < 5 or unk == 1) and item != '[UNK]'):
                token_indices.append(
                    (start_index + n, n + start_index + len(item.strip('#'))))
                n = token_indices[-1][-1]
                file = file[start_index + len(item.strip('#')):]
            else:
                token_indices.append((-1, -1))
                if (item != '[UNK]'):
                    n += len(item.strip('#'))
                    file = file[len(item.strip('#')):]
        with torch.no_grad():
            inputs = tokenizer.convert_tokens_to_ids(tokens_list)
            inputs = bert(torch.tensor([inputs]))[0]
            for j in range(len(inputs)):
                inputs[j] = inputs[j].numpy()
            inputs = torch.tensor(np.array(inputs))
            prediction = model(inputs.permute(1, 2, 0, 3).squeeze(0))
        output = prediction[0]
        dic = {}
        dataarr = file_raw
        tagsarr = output
        indicesarr = token_indices
        indicesdata = []
        datatowrite = []
        for j in range(len(tagsarr)):
            if (tagsarr[j] == 0 or tagsarr[j] == 4):
                indicesdata.append(list(indicesarr[j]))
            if (tagsarr[j] == 1 or tagsarr[j] == 2):
                indicesdata[-1][1] = indicesarr[j][1]
        indicestowrite = indicesdata
        ind_temp = []
        data_temp = []
        for j in indicestowrite:
            ind_temp.append(j)
            data_temp.append(dataarr[j[0]:j[1]])
        indicestowrite = []
        datatowrite = []
        for j in range(len(ind_temp)):
            temp = nlp(data_temp[j])
            count = 0
            for k in temp:
                count += 1
            if (count == 1):
                ind = [
                    [k.start() + 1, k.start() + 1 + len(data_temp[j])]
                    for k in re.finditer(
                        '[^a-z]' + re.escape(data_temp[j].lower()) + '[^a-z]',
                        dataarr.lower())
                    if [k.start() + 1, k.start() + 1 + len(data_temp[j])] not in ind_temp
                    and [k.start() + 1, k.start() + 1 + len(data_temp[j])] not in indicestowrite
                ]
                temp_ind = []
                dat = []
                for l in ind:
                    if (dataarr[l[0]:l[1]].lower() != dataarr[l[0]:l[1]]):
                        dat.append(dataarr[l[0]:l[1]])
                        temp_ind.append(l)
                indicestowrite += temp_ind
                datatowrite += dat
        ind_temp = ind_temp + indicestowrite
        data_temp = data_temp + datatowrite
        indicestowrite = []
        datatowrite = []
        for j in range(len(data_temp)):
            temp_2 = nlp(data_temp[j])
            temp = []
            for word in temp_2:
                temp.append((len(word.text), word.text))
            if (len(temp) == 1):
                if (str(temp[0][1]).lower() != str(temp[0][1])
                        or re.match('^[a-z]+$', temp[0][1]) == None
                        or len(temp[0][1]) > 3):
                    indicestowrite.append(ind_temp[j])
                    datatowrite.append(data_temp[j])
            else:
                indicestowrite.append(ind_temp[j])
                datatowrite.append(data_temp[j])
        indicestowrite = sorted(indicestowrite, key=lambda x: x[0])
        if (len(indicestowrite) == 0):
            return render_template("index.html", keyphrases=file_raw)
        print(indicestowrite)
        annotation_file = open('abstract_str/abstract.ann', 'w')
        for qwe in range(len(indicestowrite)):
            annotation_file.write(
                'T' + str(qwe + 1) + '\t' + 'Process ' +
                str(indicestowrite[qwe][0]) + ' ' + str(indicestowrite[qwe][1]) +
                '\t' + file_raw[indicestowrite[qwe][0]:indicestowrite[qwe][1]] + '\n')
        annotation_file.close()
        X_test, y_test_gold, _, test_entities = read_and_map('abstract_str', mapper)
        loaded_model = pickle.load(open('finalized_model_joined.sav', 'rb'))
        predictions = loaded_model.predict(X_test)
        y_values = ['Process', 'Material', 'Task']
        document_abbr = {}
        asd = os.listdir('abstract_str')
        for i in range(len(asd)):
            document_abbr[asd[i][:-4]] = {}
        for i in range(len(predictions)):
            if (test_entities[i].string == test_entities[i].string.upper()
                    and len(test_entities[i].string) > 1):
                if (y_values[predictions[i]] == "Material"):
                    predictions[i] = y_values.index("Process")
            if (test_entities[i].string == test_entities[i].string.capitalize()
                    and len(test_entities[i].string) == 2):
                predictions[i] = y_values.index("Material")
            tmp = test_entities[i].string.split(" ")
            if (len(tmp) == 1):
                if (test_entities[i].string == test_entities[i].string.upper()
                        and hasNumbers(test_entities[i].string)):
                    predictions[i] = y_values.index("Material")
                if (test_entities[i].string == test_entities[i].string.upper()):
                    try:
                        predictions[i] = document_abbr[test_entities[i].docid][
                            test_entities[i].string]
                    except:
                        obracket = test_entities[i].start - 1
                        cbracket = test_entities[i].end
                        file = open(
                            'abstract_str/' + test_entities[i].docid + '.txt',
                            'r').read()
                        if (file[obracket] == '(' and file[cbracket] == ')'):
                            if (test_entities[i].start - test_entities[i - 1].end == 2):
                                # print(test_entities[i].string, '\t', test_entities[i-1].string, '\t', test_entities[i].start, '\t', test_entities[i-1].end)
                                document_abbr[test_entities[i].docid][
                                    test_entities[i].string] = predictions[i - 1]
                                predictions[i] = predictions[i - 1]
            for j in range(len(tmp)):
                if (len(tmp[j]) == 1 and tmp[j] == tmp[j].upper()):
                    predictions[i] = y_values.index("Material")
        # print(predictions)
        n = 0
        result = []
        last_closing = 0
        for i in range(len(indicestowrite)):
            qwe_temp = file_raw[n:indicestowrite[i][0]]
            if (qwe_temp != ''):
                result.append(qwe_temp)
            temp = ''
            if (predictions[i] == 0):
                temp = '<span style="background-color:rgba(152, 252, 3, 0.5);"><strong>' + file_raw[
                    indicestowrite[i][0]:indicestowrite[i][1]] + '</strong></span>'
            elif (predictions[i] == 1):
                temp = '<span style="background-color:rgba(252, 152, 3, 0.5);"><strong>' + file_raw[
                    indicestowrite[i][0]:indicestowrite[i][1]] + '</strong></span>'
            elif (predictions[i] == 2):
                temp = '<span style="background-color:rgba(3, 152, 252, 0.5);"><strong>' + file_raw[
                    indicestowrite[i][0]:indicestowrite[i][1]] + '</strong></span>'
            if (indicestowrite[i][1] > last_closing):
                result.append(temp)
                last_closing = indicestowrite[i][1]
                n = indicestowrite[i][1]
            # else:
            #     ov_string = file_raw[indicestowrite[i][0]:indicestowrite[i][1]]
            #     temp_start = result[-1].index(ov_string)
            #     result[-1] = result[-1][:temp_start] + temp + result[-1][temp_start + indicestowrite[i][1] - indicestowrite[i][0]:]
        # result += '<span style="background-color:rgba(152, 252, 3, 0.5);"><strong>' + file_raw[i[0]:i[1]] + '</strong></span>'
        result += file_raw[n:]
        # print(result)
        result = "".join(result)
    return render_template("index.html", keyphrases=result)
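For reference, a small self-contained sketch of the brat-style standoff lines this handler writes to abstract.ann; the text, spans, and file name below are made up for illustration:

text = 'Process engineering of nanostructures'
spans = [(0, 7), (23, 37)]  # hypothetical keyphrase character offsets
with open('abstract.ann', 'w') as ann:
    for i, (start, end) in enumerate(spans, start=1):
        # one line per entity: T<id> <TAB> <label> <start> <end> <TAB> <surface text>
        ann.write('T{}\tProcess {} {}\t{}\n'.format(i, start, end, text[start:end]))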
class for_BERT():
    def __init__(self, mode='training', language='ko', version=1.0):
        version = str(version)
        self.mode = mode
        if language == 'en':
            data_path = dir_path + '/koreanframenet/resource/info/fn' + version + '_'
        else:
            data_path = dir_path + '/koreanframenet/resource/info/kfn' + version + '_'
        with open(data_path + 'lu2idx.json', 'r') as f:
            self.lu2idx = json.load(f)
        if version == '1.5':
            fname = dir_path + '/koreanframenet/resource/info/fn1.5_frame2idx.json'
        else:
            fname = dir_path + '/koreanframenet/resource/info/fn1.7_frame2idx.json'
        with open(fname, 'r') as f:
            #self.sense2idx = json.load(f)
            self.frame2idx = json.load(f)
        with open(data_path + 'lufrmap.json', 'r') as f:
            #self.lusensemap = json.load(f)
            self.lufrmap = json.load(f)
        with open(dir_path + '/koreanframenet/resource/info/fn1.7_fe2idx.json', 'r') as f:
            self.arg2idx = json.load(f)
        with open(dir_path + '/koreanframenet/resource/info/fn1.7_frargmap.json', 'r') as f:
            self.frargmap = json.load(f)
        with open(dir_path + '/koreanframenet/resource/info/fn1.7_bio_fe2idx.json', 'r') as f:
            self.bio_arg2idx = json.load(f)
        with open(dir_path + '/koreanframenet/resource/info/fn1.7_bio_frargmap.json', 'r') as f:
            self.bio_frargmap = json.load(f)

        self.idx2frame = dict(zip(self.frame2idx.values(), self.frame2idx.keys()))
        self.idx2lu = dict(zip(self.lu2idx.values(), self.lu2idx.keys()))
        self.idx2arg = dict(zip(self.arg2idx.values(), self.arg2idx.keys()))
        self.idx2bio_arg = dict(zip(self.bio_arg2idx.values(), self.bio_arg2idx.keys()))

        # load pretrained BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', do_lower_case=False)

        # load BERT tokenizer with untokenizing frames
        never_split_tuple = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
        added_never_split = []
        added_never_split.append('<tgt>')
        added_never_split.append('</tgt>')
        # for frame in self.frame2idx:
        #     added_never_split.append('['+frame+']')
        added_never_split_tuple = tuple(added_never_split)
        never_split_tuple += added_never_split_tuple
        vocab_file_path = dir_path + '/data/bert-multilingual-cased-dict-add-frames'
        self.tokenizer_with_frame = BertTokenizer(
            vocab_file_path,
            do_lower_case=False,
            max_len=512,
            never_split=never_split_tuple)

    def idx2tag(self, predictions, model='frameid'):
        if model == 'frameid':
            pred_tags = [self.idx2frame[p_i] for p in predictions for p_i in p]
        elif model == 'argclassification':
            pred_tags = [self.idx2arg[p_i] for p in predictions for p_i in p]
        elif model == 'argid':
            pred_tags = [self.idx2bio_arg[p_i] for p in predictions for p_i in p]
        return pred_tags

    def get_masks(self, datas, model='frameid'):
        if model == 'frameid':
            mapdata = self.lufrmap
            num_label = len(self.frame2idx)
        elif model == 'argclassification':
            mapdata = self.frargmap
            num_label = len(self.arg2idx)
        elif model == 'argid':
            mapdata = self.bio_frargmap
            num_label = len(self.bio_arg2idx)
        masks = []
        for idx in datas:
            mask = torch.zeros(num_label)
            try:
                candis = mapdata[str(int(idx[0]))]
            except KeyboardInterrupt:
                raise
            except:
                candis = mapdata[int(idx[0])]
            for candi_idx in candis:
                mask[candi_idx] = 1
            masks.append(mask)
        masks = torch.stack(masks)
        return masks

    # bert tokenizer and assign to the first token
    def bert_tokenizer(self, text):
        orig_tokens = text.split(' ')
        bert_tokens = []
        orig_to_tok_map = []
        bert_tokens.append("[CLS]")
        for orig_token in orig_tokens:
            orig_to_tok_map.append(len(bert_tokens))
            bert_tokens.extend(self.tokenizer_with_frame.tokenize(orig_token))
        bert_tokens.append("[SEP]")
        return orig_tokens, bert_tokens, orig_to_tok_map

    def convert_to_bert_input_frameid(self, input_data):
        tokenized_texts, lus, frames = [], [], []
        for i in range(len(input_data)):
            data = input_data[i]
            text = ' '.join(data[0])
            orig_tokens, bert_tokens, orig_to_tok_map = self.bert_tokenizer(text)
            tokenized_texts.append(bert_tokens)

            ori_lus = data[1]
            lu_sequence = []
            for i in range(len(bert_tokens)):
                if i in orig_to_tok_map:
                    idx = orig_to_tok_map.index(i)
                    l = ori_lus[idx]
                    lu_sequence.append(l)
                else:
                    lu_sequence.append('_')
            lus.append(lu_sequence)

            if self.mode == 'training':
                ori_frames = data[2]
                frame_sequence = []
                for i in range(len(bert_tokens)):
                    if i in orig_to_tok_map:
                        idx = orig_to_tok_map.index(i)
                        l = ori_frames[idx]
                        frame_sequence.append(l)
                    else:
                        frame_sequence.append('_')
                frames.append(frame_sequence)

        input_ids = pad_sequences(
            [self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
            maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

        tgt_seq, lu_seq, frame_seq = [], [], []
        for sent_idx in range(len(lus)):
            lu_items = lus[sent_idx]
            tgt, lu = [], []
            for idx in range(len(lu_items)):
                if lu_items[idx] != '_':
                    if len(tgt) == 0:
                        tgt.append(idx)
                        lu.append(self.lu2idx[lu_items[idx]])
            tgt_seq.append(tgt)
            lu_seq.append(lu)
            if self.mode == 'training':
                frame_items = frames[sent_idx]
                frame = []
                for idx in range(len(frame_items)):
                    if frame_items[idx] != '_':
                        if len(frame) == 0:
                            frame.append(self.frame2idx[frame_items[idx]])
                frame_seq.append(frame)

        attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]

        data_inputs = torch.tensor(input_ids)
        data_tgt_idx = torch.tensor(tgt_seq)
        data_lus = torch.tensor(lu_seq)
        data_frames = torch.tensor(frame_seq)
        data_masks = torch.tensor(attention_masks)

        if self.mode == 'training':
            bert_inputs = TensorDataset(data_inputs, data_tgt_idx, data_lus,
                                        data_frames, data_masks)
        else:
            bert_inputs = TensorDataset(data_inputs, data_tgt_idx, data_lus,
                                        data_masks)
        return bert_inputs
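A small self-contained sketch of the candidate-masking idea behind get_masks, where only the frames licensed by a given lexical unit stay scorable; the LU-to-frame mapping below is made up:

import torch

lufrmap = {'run.v': [3, 17], 'bank.n': [5]}  # hypothetical LU -> candidate frame ids
num_label = 20

mask = torch.zeros(num_label)
for candi_idx in lufrmap['run.v']:
    mask[candi_idx] = 1

# suppress logits outside the candidate set before taking the argmax
logits = torch.randn(num_label)
masked_logits = logits.masked_fill(mask == 0, float('-inf'))
predicted_frame_idx = int(masked_logits.argmax())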
def __init__(self,
             path: str,
             fields: List[Tuple[str, tt.data.Field]],
             tokenizer: BertTokenizer,
             max_length: int = 512,
             include_features=False,
             **kwargs):
    max_length = max_length - 3  # count without special tokens
    with open(path) as dataf:
        data_json = json.load(dataf)

    examples = []
    # Each input needs to have at most 2 segments
    # We will create the following input
    # - [CLS] source post, previous post [SEP] choice_1 [SEP]
    for example in data_json["Examples"]:
        make_ids = lambda x: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x))
        text = make_ids(example["spacy_processed_text"])
        prev = make_ids(example["spacy_processed_text_prev"])
        src = make_ids(example["spacy_processed_text_src"])
        segment_A = src
        segment_C = prev
        segment_B = text
        text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \
                   [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]]
        # truncate if the input exceeds the max length
        if len(text_ids) > max_length:
            # truncate segment C (previous post) first
            segment_C = segment_C[:max_length // 2]
            text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \
                       [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]]
        if len(text_ids) > max_length:
            # then truncate segment A (source post)
            segment_A = segment_A[:max_length // 2]
            text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \
                       [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]]
        if len(text_ids) > max_length:
            # finally truncate segment B as well
            segment_B = segment_B[:max_length // 2]
            text_ids = [tokenizer.vocab["[CLS]"]] + segment_A + [tokenizer.vocab["[SEP]"]] + segment_C + \
                       [tokenizer.vocab["[SEP]"]] + segment_B + [tokenizer.vocab["[SEP]"]]
        segment_ids = [0] * (len(segment_A) + 2) + [2] * (len(segment_C) + 1) + [1] * (len(segment_B) + 1)
        # example_list = list(example.values())[:-3] + [text_ids, segment_ids]
        if include_features:
            example_list = list(example.values()) + [text_ids, segment_ids]
        else:
            example_list = [
                example["id"], example["branch_id"], example["tweet_id"],
                example["stance_label"], example["veracity_label"],
                "\n-----------\n".join([
                    example["raw_text_src"], example["raw_text_prev"],
                    example["raw_text"]
                ]),
                example["issource"]
            ] + [text_ids, segment_ids]
        examples.append(Example.fromlist(example_list, fields))
    super(RumourEval2019Dataset_BERTTriplets_3Segments, self).__init__(examples, fields, **kwargs)
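A minimal stand-alone sketch of the three-segment layout assembled above; the token ids are placeholders rather than real vocabulary entries:

CLS, SEP = 101, 102              # assumed [CLS] / [SEP] ids
segment_A = [1001, 1002]         # source post
segment_C = [2001]               # previous post
segment_B = [3001, 3002, 3003]   # target post

text_ids = [CLS] + segment_A + [SEP] + segment_C + [SEP] + segment_B + [SEP]
segment_ids = [0] * (len(segment_A) + 2) + [2] * (len(segment_C) + 1) + [1] * (len(segment_B) + 1)
assert len(text_ids) == len(segment_ids)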
class BertWithJumanModel():
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        # load Juman so that Japanese text can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # load the pretrained BERT model
        self.model = BertModel.from_pretrained(bert_path)
        # load the tokenizer of the pretrained BERT model
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        # flag that controls whether CUDA (GPU) is used
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        # preprocessing: strip half-width spaces for Juman
        try:
            return text.replace(" ", "")  # for Juman
        except:
            return ''

    def get_sentence_embedding(self, text, pooling_layer=-2,
                               pooling_strategy="REDUCE_MEAN"):
        # remove half-width spaces from the text
        preprocessed_text = self._preprocess_text(text)
        # segment the Japanese text with Juman and get a list of morphemes
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        # join the morphemes with spaces and run WordPiece tokenization
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # the input size is limited to 128, so build header + 126 tokens + footer
        # and convert the tokens to ids
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len - 2
        tokens_tensor = torch.tensor(ids).reshape(1, -1)

        if self.use_cuda:
            # move the input and the model to the GPU if requested
            tokens_tensor = tokens_tensor.to('cuda')
            self.model.to('cuda')

        # switch the model to evaluation mode
        self.model.eval()
        with torch.no_grad():  # disable autograd (saves memory, runs faster)
            # compute the vector representations from the id sequence
            all_encoder_layers, _ = self.model(tokens_tensor)

        # as in SWEM, pool the token vectors along the time axis so the output
        # dimension stays fixed regardless of the sentence length
        # https://yag-ays.github.io/project/swem/
        embedding = all_encoder_layers[pooling_layer].cpu().numpy()[0]
        if pooling_strategy == "REDUCE_MEAN":
            return np.mean(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MAX":
            return np.max(embedding, axis=0)
        elif pooling_strategy == "REDUCE_MEAN_MAX":
            return np.r_[np.max(embedding, axis=0), np.mean(embedding, axis=0)]
        elif pooling_strategy == "CLS_TOKEN":
            return embedding[0]
        else:
            raise ValueError(
                "specify valid pooling_strategy: {REDUCE_MEAN, REDUCE_MAX, REDUCE_MEAN_MAX, CLS_TOKEN}")
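A possible usage sketch for the class above; the checkpoint directory is a placeholder and is assumed to contain a Japanese BERT model plus a Juman-compatible vocab.txt:

# Assumes BertWithJumanModel (defined above) is in scope and that
# '/path/to/Japanese_L-12_H-768_A-12' holds the pretrained weights and vocabulary.
model = BertWithJumanModel(bert_path='/path/to/Japanese_L-12_H-768_A-12')
embedding = model.get_sentence_embedding('今日は良い天気です。',
                                         pooling_layer=-2,
                                         pooling_strategy='REDUCE_MEAN')
print(embedding.shape)  # (768,) for a base-size model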
if __name__ == '__main__':
    args = parser.parse_args()
    assert os.path.exists(args.bert_model), '{} does not exist'.format(args.bert_model)
    assert os.path.exists(args.bert_vocab), '{} does not exist'.format(args.bert_vocab)
    assert args.topk > 0, '{} should be positive'.format(args.topk)

    print('Initialize BERT vocabulary from {}...'.format(args.bert_vocab))
    bert_tokenizer = BertTokenizer(vocab_file=args.bert_vocab)
    print('Initialize BERT model from {}...'.format(args.bert_model))
    bert_model = BertForMaskedLM.from_pretrained(args.bert_model)

    while True:
        message = input('Enter your message: ').strip()
        tokens = bert_tokenizer.tokenize(message)
        if len(tokens) == 0:
            continue
        if tokens[0] != CLS:
            tokens = [CLS] + tokens
        if tokens[-1] != SEP:
            tokens.append(SEP)
        token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer)
        with torch.no_grad():
            logits = bert_model(token_idx, segment_idx, mask, masked_lm_labels=None)
        logits = logits.squeeze(0)
        probs = torch.softmax(logits, dim=-1)
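A self-contained sketch of the same masked-token lookup, assuming the Hugging Face transformers package and the public 'bert-base-uncased' weights rather than the checkpoint the script loads from its command-line arguments:

# Illustrative only: model name, library, and k value are assumptions.
import torch
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

tokens = tokenizer.tokenize('[CLS] the capital of france is [MASK] . [SEP]')
token_idx = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
with torch.no_grad():
    logits = model(token_idx)[0]  # first output holds the vocabulary logits
probs = torch.softmax(logits.squeeze(0), dim=-1)

mask_pos = tokens.index('[MASK]')
topk_probs, topk_ids = probs[mask_pos].topk(5)
print(tokenizer.convert_ids_to_tokens(topk_ids.tolist()))  # 'paris' should rank near the top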