def generate_joshi_jsondocs(task_map_items, tokenizer: BertTokenizerFast, nlp: spacy.language.Language) -> Iterator[dict]:
    """Yield one JSON-serialisable document per task item.

    For each ``(task_id, task_records)`` pair the news and science texts of
    the first record are parsed with ``nlp``; every sentence is then split
    into sub-tokens with ``tokenizer`` (special tokens included).

    Args:
        task_map_items: Iterable of ``(task_id, task_records)`` pairs.
            ``task_id[0]`` is used to build the document ids and
            ``task_records[0]`` must expose ``news_text`` and ``sci_text``.
        tokenizer: Tokenizer used to split each sentence into sub-tokens.
        nlp: spaCy pipeline used to parse the texts into sentences.

    Yields:
        A dict with the keys ``doc_key``, ``clusters``, ``subtoken_map``,
        ``sentence_map``, ``sentences``, ``speakers``, ``doc_ids`` and
        ``doc_boundaries``.
    """
    for task_id, task_records in tqdm(task_map_items):
        first_record = task_records[0]
        parsed_docs = (nlp(first_record.news_text), nlp(first_record.sci_text))

        sentences = []
        speakers = []
        sentence_map = []
        subtoken_map = []
        doc_boundaries = []

        sentence_index = 0
        word_index = 0
        for parsed in parsed_docs:
            # Record the running word counter, then restart it for this doc.
            doc_boundaries.append(word_index)
            word_index = 0

            for sentence in parsed.sents:
                pieces = tokenizer.tokenize(sentence.text, add_special_tokens=True)
                sentences.append(pieces)
                speakers.append(['[SPL]', *(['-'] * (len(pieces) - 2)), '[SPL]'])

                # The last sub-token of every sentence gets no map entry.
                for piece in pieces[:-1]:
                    sentence_map.append(sentence_index)
                    subtoken_map.append(word_index)
                    # Only pieces that are neither [CLS] nor a "##"
                    # continuation advance the word counter.
                    if piece != '[CLS]' and not piece.startswith('##'):
                        word_index += 1

                sentence_index += 1

        yield {
            'doc_key': "nw",
            "clusters": [],
            'subtoken_map': subtoken_map,
            'sentence_map': sentence_map,
            'sentences': sentences,
            'speakers': speakers,
            "doc_ids": [f"news_{task_id[0]}", f"science_{task_id[0]}"],
            'doc_boundaries': doc_boundaries,
        }
def inference_model(
    model: BertForSquad,
    tokenizer: BertTokenizerFast,
    context: str,
    question: str,
    input_ids: List[int],
    token_type_ids: List[int]
) -> str:
    """ Inference function with the model

    Runs the model on a single encoded sample and turns the predicted span
    into an answer string, using inference_start_end and inference_answer.

    Arguments:
    model -- Model you have trained.
    tokenizer -- Tokenizer to encode and decode the string
    context -- Context string
    question -- Question string
    input_ids -- Input ids
    token_type_ids -- Token type ids

    Return:
    answer -- Answer string
    """
    import torch  # local import: only needed for the no-grad context below

    # A batch of exactly one sample; -1 fills the two answer-position slots
    # that the collate function expects but that are unknown at inference.
    sample = (input_ids, token_type_ids, -1, -1)
    batch_ids, attention_mask, batch_type_ids, _, _ = squad_feature_collate_fn([sample])
    batch_ids = batch_ids.to(device)
    attention_mask = attention_mask.to(device)
    batch_type_ids = batch_type_ids.to(device)

    # Gradients are never used here, so skip building the autograd graph.
    with torch.no_grad():
        start_out, end_out = model(batch_ids, attention_mask, batch_type_ids)

    # The context begins after [CLS] + question tokens + [SEP]; the last
    # position of the sequence is excluded from the searchable range.
    context_start = len(tokenizer.tokenize(question)) + 2
    context_end = len(batch_ids[0]) - 2

    start, end = inference_start_end(start_out[0], end_out[0], context_start, context_end)
    return inference_answer(
        question, context, batch_ids, batch_type_ids,
        start.to('cpu'), end.to('cpu'), tokenizer)
def paragraphs2batch(paragraphs: List[str], tokenizer: BertTokenizerFast) -> \
        Tuple[List[List[str]], Dict]:
    """
    Turn a list of paragraphs into a padded batch.

    Steps:
        1. Tokenize every paragraph.
        2. Convert tokens to ids, pad them to a common length with 0 and cut
           every row to at most ``max_input`` columns.
        3. Build a float attention mask (1 for tokens, 0 for padding) with
           the same padding and truncation.

    :param paragraphs: List of paragraphs.
    :param tokenizer: The BERT tokenizer.
    :return: The (untruncated) tokenized paragraphs and a dict holding the
             ``input_ids`` and ``attention_mask`` tensors.
    """
    all_tokenized = [tokenizer.tokenize(paragraph) for paragraph in paragraphs]

    id_tensors = [
        torch.tensor(tokenizer.convert_tokens_to_ids(tokens), dtype=torch.long)
        for tokens in all_tokenized
    ]
    mask_tensors = [torch.ones_like(ids, dtype=torch.float) for ids in id_tensors]

    padded_ids = pad_sequence(id_tensors, batch_first=True, padding_value=0)
    padded_mask = pad_sequence(mask_tensors, batch_first=True, padding_value=0.)

    batch = {
        'input_ids': padded_ids[:, :max_input],
        'attention_mask': padded_mask[:, :max_input],
    }
    return all_tokenized, batch
def inference_answer(
    question: str,
    context: str,
    input_ids: List[int],
    token_type_ids: List[int],
    start_pos: int,
    end_pos: int,
    tokenizer: BertTokenizerFast
) -> str:
    """ Inference function for the answer.

    Because the tokenizer lowers the capital letters and splits punctuation
    marks, you may get wrong answer words if you detokenize directly.
    For example, if you encode "$5.00 USD" and decode it, you get different
    words from the original.

    "$5.00 USD" --(Tokenize)--> ["$", "5", ".", "00", "usd"]
                --(Detokenize)--> "$ 5. 00 usd"

    Thus the answer is cut out of the original context using the character
    offsets of the predicted start and end tokens. The offsets come from the
    fast tokenizer itself (``return_offsets_mapping=True``), so tokens whose
    text differs from the context (lower-casing, accent stripping, [UNK])
    are still mapped to the right characters.

    Note: the token positions are assumed to follow the tokenizer's normal
    tokenization of the context, i.e. no manually re-split sub-words.

    Arguments:
    question -- Question string
    context -- Context string
    input_ids -- Input ids (unused; kept for interface compatibility)
    token_type_ids -- Token type ids (unused; kept for interface compatibility)
    start_pos -- Predicted start token position of the answer
    end_pos -- Predicted end token position of the answer
    tokenizer -- Tokenizer to encode and decode the string

    Return:
    answer -- Answer string taken verbatim from the context. An empty string
              is returned when either position falls outside the context
              tokens or when the end precedes the start.
    """
    # (char_start, char_end_exclusive) of every context token, in the
    # coordinates of the original, un-normalised context string.
    offsets = tokenizer(
        context,
        add_special_tokens=False,
        return_offsets_mapping=True,
    )['offset_mapping']

    # Context tokens start after [CLS] + question tokens + [SEP].
    question_len = len(tokenizer.tokenize(question)) + 2
    first_token = int(start_pos) - question_len
    last_token = int(end_pos) - question_len

    # Explicit range check: a negative index must not silently wrap around.
    if not (0 <= first_token < len(offsets) and 0 <= last_token < len(offsets)):
        return ""

    char_start = offsets[first_token][0]
    char_end = offsets[last_token][1]
    return context[char_start:char_end]
def main(model, tokenizer: BertTokenizerFast, condition_type: str, metrics_output_path: str):
    """Compute name/condition similarities with BERT and report the scores.

    For a fixed random sample of 10000 subjects, one template is generated
    per condition code, the templates are passed to
    ``get_name_condition_similarities`` and the resulting similarities are
    turned into differential scores against the subject's condition labels.
    Mean and standard deviation of the scores are printed and written to
    ``{metrics_output_path}/results.txt``.

    Args:
        model: Model forwarded to ``get_name_condition_similarities``.
        tokenizer: Tokenizer used to locate the name and condition spans.
        condition_type: Selects which patient/condition tables are loaded.
        metrics_output_path: Directory that receives ``results.txt``.
    """
    ## Load the lookup tables for this condition type
    patients_by_subject = get_subject_id_to_patient_info(
        condition_type=condition_type)
    count_by_code = get_condition_code_to_count(
        condition_type=condition_type)
    description_by_code = get_condition_code_to_descriptions(
        condition_type=condition_type)

    set_to_use = filter_condition_code_by_count(count_by_code,
                                                min_count=0,
                                                max_count=500000)
    condition_code_to_index: Dict[str, int] = {
        code: position for position, code in enumerate(set_to_use)
    }

    mean_scores, max_scores, all_pair_scores = [], [], []

    ## Deterministic sample of subjects, visited in sorted order
    candidate_ids = sorted(patients_by_subject.keys())
    sampled_ids = sorted(
        resample(candidate_ids,
                 replace=False,
                 n_samples=10000,
                 random_state=2021))

    for subject_id in tqdm(sampled_ids):
        patient = patients_by_subject[subject_id]

        # One template per condition code, in the order of set_to_use.
        templates = [
            generate_template(patient.FIRST_NAME, patient.LAST_NAME,
                              patient.GENDER, description_by_code[code])
            for code in set_to_use
        ]

        name = patient.FIRST_NAME + " " + patient.LAST_NAME
        name_length = len(tokenizer.tokenize(name))

        ## The indices below depend on the template structure
        ## [CLS] {title} {name} is a yo patient with {condition} [SEP]
        ## and must be revisited if that structure changes.
        example_template = tokenizer.tokenize(templates[0])
        name_start_index = 2  # Name starts after [CLS] {title}
        name_end_index = name_start_index + name_length
        condition_start_index = example_template.index("patient") + 2
        condition_end_index = -1

        # Sanity checks on the first template; a failure drops into the
        # debugger because breakpoint() is evaluated as the assert message.
        name_span = example_template[name_start_index:name_end_index]
        expected_name = " ".join(name.lower().split())
        assert (tokenizer.convert_tokens_to_string(name_span) ==
                expected_name), breakpoint()

        condition_span = example_template[
            condition_start_index:condition_end_index]
        expected_condition = " ".join(
            description_by_code[set_to_use[0]].lower().split())
        assert (tokenizer.convert_tokens_to_string(condition_span) ==
                expected_condition), breakpoint()

        ## Pass all templates to BERT and collect the similarities
        similarities = get_name_condition_similarities(
            model,
            tokenizer,
            templates,
            name_start_index,
            name_end_index,
            condition_start_index,
            condition_end_index,
        )
        mean_similarities, max_similarities, all_pair_similarities = similarities

        condition_labels = get_condition_labels_as_vector(
            patient.CONDITIONS, condition_code_to_index)

        mean_scores.append(
            differential_score(condition_labels, mean_similarities))
        max_scores.append(
            differential_score(condition_labels, max_similarities))
        all_pair_scores.append(
            differential_score(condition_labels, all_pair_similarities))

    print(f"Mean Mean Pos-Neg {np.average(mean_scores)}")
    print(f"SD Mean Pos-Neg {np.std(mean_scores)}")

    print(f"Mean Max Pos-Neg {np.average(max_scores)}")
    print(f"SD Max Pos-Neg {np.std(max_scores)}")

    print(f"Mean All Pair Pos-Neg {np.average(all_pair_scores)}")
    print(f"SD All Pair Pos-Neg {np.std(all_pair_scores)}")

    # Imported here, after the computation, exactly as in the original flow.
    from experiments.MLM.common import mean_std_as_string

    report_rows = (
        ("Mean Sim", mean_scores),
        ("Max Sim", max_scores),
        ("All Pair Sim", all_pair_scores),
    )
    with open(f"{metrics_output_path}/results.txt", "w") as f:
        for label, scores in report_rows:
            f.write(mean_std_as_string(label, scores))
def _create_xtoken_df(morph_df: pd.DataFrame, xtokenizer: BertTokenizerFast, sos, eos) -> pd.DataFrame:
    """Expand every token of every sentence into its tokenizer sub-tokens.

    Args:
        morph_df: Frame with at least the columns ``sent_id``, ``token_id``
            and ``token``; duplicate rows over these columns are dropped.
        xtokenizer: Tokenizer used to split each token; its ``cls_token``
            and ``sep_token`` frame every sentence.
        sos: Value stored in the ``token`` column of the leading row.
        eos: Value stored in the ``token`` column of the trailing row.

    Returns:
        A frame with the columns ``sent_id``, ``token_id``, ``token`` and
        ``xtoken``, one row per sub-token. Each sentence starts with a row
        ``(sent_id, 0, sos, cls_token)`` and ends with a row
        ``(sent_id, max token_id + 1, eos, sep_token)``.
    """
    token_df = morph_df[['sent_id', 'token_id', 'token']].drop_duplicates()

    # Group by the column *label*: grouping by a one-element list makes
    # pandas >= 2.0 yield 1-tuples as keys, which would end up in the
    # ``sent_id`` column. sort=True visits the sentences in ascending order.
    sent_groups = token_df.groupby('sent_id', sort=True)

    data_rows = []
    progress = tqdm(sent_groups, total=sent_groups.ngroups, desc="Sentence")
    for sent_id, sent_df in progress:
        data_rows.append((sent_id, 0, sos, xtokenizer.cls_token))
        for token_id, token in zip(sent_df.token_id, sent_df.token):
            # A token that yields no sub-tokens contributes no rows.
            data_rows.extend(
                (sent_id, token_id, token, xtoken)
                for xtoken in xtokenizer.tokenize(token))
        data_rows.append(
            (sent_id, sent_df.token_id.max() + 1, eos, xtokenizer.sep_token))

    return pd.DataFrame(data_rows, columns=['sent_id', 'token_id', 'token', 'xtoken'])
def squad_features(
    context: str,
    question: str,
    answer: Union[str, None],
    start_char_pos: Union[int, None],
    tokenizer: BertTokenizerFast
) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor

    Builds model inputs of the form [CLS + question + SEP + context + SEP].
    Because start_char_pos is a character index, it is converted to a token
    index. When the answer starts or ends in the middle of a word, the word
    is re-split so that the answer occupies whole tokens.

    Note: input_ids and token_type_ids follow the transformers library
    documentation https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, start_token_pos and
              end_token_pos are returned as None
    start_char_pos -- Character index at which the answer starts in the
                      context. None when the answer is None.
    tokenizer -- Tokenizer to encode text strings.

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids (0 for [CLS] + question + [SEP],
                      1 for context + [SEP])
    start_token_pos -- Token index at which the answer starts in input_ids.
                       None if no answer is given.
    end_token_pos -- Token index (inclusive) at which the answer ends in
                     input_ids. None if no answer is given.
    """
    ### YOUR CODE HERE (~18 lines)
    tokens = ["[CLS]"] + tokenizer.tokenize(question) + ["[SEP]"]
    token_type_ids = [0] * len(tokens)

    if start_char_pos is None:
        # No answer case
        token_context = tokenizer.tokenize(context)
        start_token_pos = None
        end_token_pos = None
    else:
        # Answer available
        token_answer = tokenizer.tokenize(answer)
        answer_end = start_char_pos + len(answer)
        tail_tokens = tokenizer.tokenize(context[start_char_pos:])
        last = len(token_answer) - 1

        # The answer stops inside a word when a non-space character follows
        # it AND tokenizing from the answer start does not reproduce the
        # answer's last token. Tokens are compared by value (==), not by
        # identity, and an answer that ends the context is handled.
        ends_mid_word = (
            answer_end < len(context)
            and not _is_whitespace(context[answer_end])
            and (last >= len(tail_tokens)
                 or tail_tokens[last] != token_answer[-1])
        )
        if ends_mid_word:
            back_context = tokenizer.tokenize(context[answer_end:])
            if back_context:
                back_context[0] = "##" + back_context[0]
        else:
            back_context = tail_tokens[len(token_answer):]

        if start_char_pos == 0:
            front_context = []
        else:
            front_context = tokenizer.tokenize(context[:start_char_pos])
            # NOTE(review): any non-space predecessor (including
            # punctuation) marks the answer as a word continuation here;
            # confirm this matches the expected tokenization.
            if not _is_whitespace(context[start_char_pos - 1]):
                token_answer[0] = "##" + token_answer[0]

        start_token_pos = len(tokens) + len(front_context)
        end_token_pos = start_token_pos + len(token_answer) - 1
        token_context = front_context + token_answer + back_context

    # +1 accounts for the closing [SEP] of the context segment.
    token_type_ids = token_type_ids + [1] * (len(token_context) + 1)
    tokens = tokens + token_context + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ### END YOUR CODE

    return input_ids, token_type_ids, start_token_pos, end_token_pos
def classify(
    model: BertForTokenClassification,
    tokenizer: BertTokenizerFast,  # input
    sequence: str,
    labels: List[int] = None,
) -> List[Entity]:
    """
    Run the token-classification model on one sequence.

    Given a model, a tokenizer and a sequence:
      * when ``labels`` is provided, the loss of the model on that sequence
        is returned;
      * when ``labels`` is None, the entities recognised in the sequence are
        returned as a list.
    A trained model can be used directly.

    Raises:
        ValueError: if the model is not configured with ``return_dict=True``.
    """
    # The code below reads named fields from the model output, so the model
    # has to return a dict-like object.
    config = model.config
    if not config.return_dict:
        raise ValueError(
            'Model should be instantiated with `return_dict=True`')

    # Pack the input sequence (and the optional labels) into model inputs.
    # No device transfer happens here; the inputs are used as returned.
    inputs, mask = pack_sequence_as_inputs(
        tokenizer=tokenizer,
        sequence=sequence,
        labels=labels,
        max_token_length=config.max_position_embeddings,
    )

    # Labels given: the caller wants the loss, so the forward pass must run
    # with gradients enabled.
    if labels is not None:
        outputs = model(**inputs)
        return outputs.loss

    # No labels: the caller wants the entities, so the forward pass can run
    # without a grad context.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits.cpu()

    # Decode the model output: drop the first and last position of every
    # sequence, then apply the mask.
    token_logits = logits[:, 1:-1][mask]
    raw_entities = extract_entities(
        sequence=sequence,
        logits=token_logits,
        encode=tokenizer.encode,
        decode=tokenizer.decode,
    )
    aligned_entities = realign_extracted_entities(
        sequence=sequence,
        tokens=tokenizer.tokenize(sequence),
        entities=raw_entities,
        vocab=tokenizer.get_vocab(),
    )
    return list(aligned_entities)
def squad_features(
    context: str,
    question: str,
    answer: Union[str, None],
    start_char_pos: Union[int, None],
    tokenizer: BertTokenizerFast
) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor

    Builds model inputs of the form [CLS + question + SEP + context + SEP]
    and converts the character-based answer start into token indices.

    Note: input_ids and token_type_ids follow the transformers library
    documentation https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, start_token_pos and
              end_token_pos are returned as None
    start_char_pos -- Character index at which the answer starts in the
                      context. None when the answer is None.
    tokenizer -- Tokenizer to encode text strings.

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_token_pos -- Token index at which the answer starts in input_ids.
                       None if no answer is given.
    end_token_pos -- Token index (inclusive) at which the answer ends in
                     input_ids. None if no answer is given.
    """
    ### YOUR CODE HERE (~18 lines)
    encoded_dict = tokenizer.encode_plus(question, context)
    input_ids = encoded_dict["input_ids"]
    token_type_ids = encoded_dict["token_type_ids"]

    # Either value missing means there is no usable answer span.
    if answer is None or start_char_pos is None:
        return input_ids, token_type_ids, None, None

    # The number of segment-0 entries is the length of the question segment
    # and therefore the index of the first context token.
    start_token_pos = token_type_ids.count(0)
    start_token_pos += len(tokenizer.tokenize(context[:start_char_pos]))
    end_token_pos = start_token_pos + len(tokenizer.tokenize(answer)) - 1

    # Extract tokenized answer part only
    span_tokens = tokenizer.convert_ids_to_tokens(
        input_ids[start_token_pos:end_token_pos + 1])
    tokenized_answer = " ".join(span_tokens)
    subword_prefix_original = "##" if "##" in tokenized_answer else ""
    subword_prefix = "##"
    tokenized_answer = tokenized_answer.replace('#', '')

    # Both checks are case-insensitive so that a capitalised answer is still
    # found inside a lower-cased token.
    answer_folded = answer.lower()
    span_folded = tokenized_answer.lower()
    if (span_folded != answer_folded
            and start_token_pos == end_token_pos
            and answer_folded in span_folded):
        # A single word but different subword tokenization case: split the
        # token into the answer part and the remainder.
        new_subword_list = [
            subword_prefix_original + tokenized_answer[:len(answer)],
            subword_prefix + tokenized_answer[len(answer):]
        ]
        input_ids = (input_ids[:start_token_pos]
                     + tokenizer.convert_tokens_to_ids(new_subword_list)
                     + input_ids[end_token_pos + 1:])
        # One token became two, so the context segment grows by one.
        token_type_ids.append(1)

    assert len(input_ids) == len(token_type_ids)
    ### END YOUR CODE

    return input_ids, token_type_ids, start_token_pos, end_token_pos
def squad_features_1(
    context: str,
    question: str,
    answer: Union[str, None],
    start_char_pos: Union[int, None],
    tokenizer: BertTokenizerFast
) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor

    Builds model inputs of the form [CLS + question + SEP + context + SEP]
    and converts the character-based answer span into token indices using
    the character offsets reported by the fast tokenizer.

    Note: input_ids and token_type_ids follow the transformers library
    documentation https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, start_token_pos and
              end_token_pos are returned as None
    start_char_pos -- Character index at which the answer starts in the
                      context. None when the answer is None.
    tokenizer -- Tokenizer to encode text strings.

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_token_pos -- Index in input_ids of the token containing the first
                       answer character. None if no answer is given.
    end_token_pos -- Index in input_ids (inclusive) of the last token that
                     starts before the answer ends. None if no answer is
                     given.

    Raises:
    ValueError -- if no context token covers start_char_pos.
    """
    encoded_dict = tokenizer.encode_plus(question, context)
    input_ids = encoded_dict['input_ids']
    token_type_ids = encoded_dict['token_type_ids']

    if answer is None:
        return input_ids, token_type_ids, None, None

    # (char_start, char_end_exclusive) of every context token, measured on
    # the original context string.
    offsets = tokenizer(
        context,
        add_special_tokens=False,
        return_offsets_mapping=True,
    )['offset_mapping']
    answer_end = start_char_pos + len(answer)

    # First token that ends after the answer start, i.e. the token that
    # contains (or begins at) the first answer character.
    start_token_pos = next(
        (idx for idx, (_, tok_end) in enumerate(offsets)
         if tok_end > start_char_pos),
        None,
    )
    if start_token_pos is None:
        raise ValueError(
            f"start_char_pos={start_char_pos} is not covered by any context token")

    # Last token that starts before the answer ends; defaults to the final
    # context token when the answer runs to the end of the context.
    end_token_pos = len(offsets) - 1
    for idx in range(start_token_pos, len(offsets)):
        if offsets[idx][0] >= answer_end:
            end_token_pos = idx - 1
            break
    # Never report an end that precedes the start.
    end_token_pos = max(end_token_pos, start_token_pos)

    # Shift by the [CLS] + question + [SEP] prefix, found via the first
    # separator id of the tokenizer instead of a hard-coded vocabulary id.
    num_tokens_before_context = input_ids.index(tokenizer.sep_token_id) + 1
    start_token_pos += num_tokens_before_context
    end_token_pos += num_tokens_before_context

    return input_ids, token_type_ids, start_token_pos, end_token_pos
def process_data(data_file, output_file, vocab_file):
    """
    Convert paragraph data into tokenized JSON lines.

    Adapted from the `gap_to_jsonlines.py` for propara data prep.

    Args:
        data_file: Path to a JSON file holding a list of paragraphs; each
            paragraph provides ``para_id`` and ``sentence_texts``.
        output_file: Path of the JSON-lines file to write, one paragraph per
            line.
        vocab_file: Vocabulary file used to build the BERT tokenizer.
    """
    tokenizer = BertTokenizerFast(vocab_file=vocab_file)
    # A second, word-level tokenizer is needed to build the sub-token to
    # token map.
    basic_tokenizer = BasicTokenizer(do_lower_case=False)

    # Load data; the encoding is explicit so the result does not depend on
    # the platform default.
    with open(data_file, 'r', encoding='utf-8') as fp:
        data = json.load(fp)

    output_jsons = []

    # Format as jsonlines & tokenize
    for para in tqdm(data):
        paragraph_text = " ".join(para['sentence_texts'])

        # Sentence map: the leading 0 stands for [CLS]; the closing entry
        # repeats the last sentence index for [SEP].
        sentence_map = [0]
        for sent_num, sent in enumerate(para['sentence_texts']):
            sentence_map += [sent_num] * len(tokenizer.tokenize(sent))
        sentence_map += [sentence_map[-1]]

        # All tokens. The paragraph is tokenized as a whole here, while the
        # sentence map above was computed sentence by sentence.
        tokenized_paragraph = tokenizer(paragraph_text,
                                        return_offsets_mapping=True)
        # One token string per input id, special tokens included.
        paragraph_tokens = tokenizer.convert_ids_to_tokens(
            tokenized_paragraph['input_ids'])
        token_character_offsets = tokenized_paragraph['offset_mapping']

        # Subtoken map: the leading 0 stands for [CLS]
        subtoken_map = [0]
        for tok_id, token in enumerate(
                basic_tokenizer.tokenize(paragraph_text)):
            subtoken_map += [tok_id] * len(tokenizer.tokenize(token))
        # Add on last subtoken for SEP
        subtoken_map += [subtoken_map[-1]]

        output = {
            'para_id': para['para_id'],
            'speakers': [['[SPL]'] + ['-'] * (len(paragraph_tokens) - 2) + ['[SPL]']],
            'sentences': [paragraph_tokens],
            'sentence_map': sentence_map,
            'clusters': [[]],
            'subtoken_map': subtoken_map,
            'token_char_spans': token_character_offsets,
            'original_text': paragraph_text,
            'doc_key': "wb",
        }
        # TODO: test, if we know we have a mention on tokens 2-8, how do we
        # translate that to a span in the original sentence?
        output_jsons.append(output)

    # Written only after every paragraph has been processed, so a failure
    # above never leaves a partial output file behind.
    with open(output_file, 'w', encoding='utf-8') as fp:
        for out in output_jsons:
            fp.write(json.dumps(out) + '\n')