Example #1
def generate_joshi_jsondocs(task_map_items, tokenizer: BertTokenizerFast,
                            nlp: spacy.language.Language) -> Iterator[dict]:
    """Given a set of task items generate json docs to be serialised"""

    for task_id, task_records in tqdm(task_map_items):

        news_doc = nlp(task_records[0].news_text)
        sci_doc = nlp(task_records[0].sci_text)

        jsondoc = {
            'doc_key': "nw",
            "clusters": [],
            'subtoken_map': [],
            'sentence_map': [],
            'sentences': [],
            'speakers': [],
            "doc_ids": [f"news_{task_id[0]}", f"science_{task_id[0]}"],
            'doc_boundaries': []
        }

        sent_id = 0
        i = 0
        for doc in [news_doc, sci_doc]:

            jsondoc['doc_boundaries'].append(i)
            i = 0
            for sent in doc.sents:
                r = tokenizer.tokenize(sent.text, add_special_tokens=True)

                jsondoc['sentences'].append(r)

                jsondoc['speakers'].append(['[SPL]'] + (['-'] * (len(r) - 2)) +
                                           ['[SPL]'])

                for tok in r[:-1]:
                    jsondoc['sentence_map'].append(sent_id)
                    jsondoc['subtoken_map'].append(i)

                    if tok != '[CLS]' and not tok.startswith('##'):
                        i += 1

                # increment sentence id
                sent_id += 1

        yield jsondoc
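A minimal usage sketch for the generator above, assuming the task map, tokenizer and spaCy pipeline are already loaded (the model names and output path are illustrative, not from the original project):

import json
import spacy
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
nlp = spacy.load("en_core_web_sm")

# task_map: dict of task_id -> task records, assumed to be built elsewhere
with open("joshi_docs.jsonlines", "w") as fp:
    for jsondoc in generate_joshi_jsondocs(task_map.items(), tokenizer, nlp):
        fp.write(json.dumps(jsondoc) + "\n")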
def inference_model(
    model: BertForSquad,
    tokenizer: BertTokenizerFast,

    context: str,
    question: str,
    input_ids: List[int],
    token_type_ids: List[int]
) -> str:
    """ Inferene function with the model 
    Because we don't know how your model works, we can't not infer the answer from your model.
    Implement inference process for you model.
    Please use inference_start_end and inference_answer functions you have implemented
    
    Argumentes:
    model -- Model you have trained.
    tokenizer -- Tokenizer to encode and decode the string
    context -- Context string
    question -- Question string
    input_ids -- Input ids
    token_type_dis -- Token type ids

    Return:
    answer -- Answer string
    """
    features = (input_ids, token_type_ids, -1, -1)  # -1s stand in for the unused start/end positions
    batch = [features]
    input_ids, attention_mask, token_type_ids, _, _ = squad_feature_collate_fn(batch)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    token_type_ids = token_type_ids.to(device)
    start, end = model(input_ids, attention_mask, token_type_ids)
    context_start = len(tokenizer.tokenize(question)) + 2  # context begins after [CLS] + question + [SEP]
    context_end = len(input_ids[0]) - 2  # last context token index; the final position is [SEP]
    start, end = inference_start_end(start[0], end[0],context_start, context_end)
    answer = inference_answer(question, context, input_ids, token_type_ids, start.to('cpu'), end.to('cpu'), tokenizer)

    return answer 
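A hedged usage sketch, assuming a fine-tuned BertForSquad checkpoint is already loaded on `device` and that squad_features (shown in the later examples) was used to build the ids; the strings are made up:

context = "BERT was released by Google in 2018."
question = "Who released BERT?"

# build [CLS] question [SEP] context [SEP] ids; answer positions are not needed at inference time
input_ids, token_type_ids, _, _ = squad_features(context, question, None, None, tokenizer)
answer = inference_model(model, tokenizer, context, question, input_ids, token_type_ids)
print(answer)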
Example #3
def paragraphs2batch(paragraphs: List[str], tokenizer: BertTokenizerFast) -> \
        Tuple[List[List[str]], Dict]:
    """
    Convert a list of paragraphs to a batch. This essentially does these things:

    1. Tokenize paragraphs.
    2. Pad all input_ids tensors, remove excessively long input_ids.
    3. Generate the correct attention_masks.

    :param paragraphs: List of paragraphs.
    :param tokenizer: The BERT tokenizer.
    :return: Tokenized paragraphs and the batch that can be used as model inputs.
    """
    all_tokenized = []
    input_ids = []
    attention_mask = []

    for p in paragraphs:
        tokenized = tokenizer.tokenize(p)
        all_tokenized.append(tokenized)
        token_ids = tokenizer.convert_tokens_to_ids(tokenized)

        input_ids.append(torch.tensor(token_ids, dtype=torch.long))
        attention_mask.append(torch.ones_like(input_ids[-1],
                                              dtype=torch.float))

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    input_ids = input_ids[:, :max_input]

    attention_mask = pad_sequence(attention_mask,
                                  batch_first=True,
                                  padding_value=0.)
    attention_mask = attention_mask[:, :max_input]

    return all_tokenized, {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
    }
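A minimal usage sketch, assuming max_input is a module-level constant (e.g. 512) that paragraphs2batch reads, and a plain BERT encoder as the model:

import torch
from transformers import BertModel, BertTokenizerFast

max_input = 512  # assumed truncation length used by paragraphs2batch
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

paragraphs = ["First paragraph.", "A second, somewhat longer paragraph."]
tokens, batch = paragraphs2batch(paragraphs, tokenizer)
with torch.no_grad():
    outputs = model(**batch)  # batch carries padded input_ids and the matching attention_mask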
def inference_answer(
    question: str,
    context: str,

    input_ids: List[int],
    token_type_ids: List[int],
    start_pos: int,
    end_pos: int,

    tokenizer: BertTokenizerFast
) -> str:
    """ Inference fucntion for the answer.

    Because the tokenizer lowers the capital letters and splits punctuation marks,
    you may get wrong answer words if you detokenize it directly.
    For example, if you encode "$5.000 Dollars" and decode it, you get different words from the orignal.

    "$5.00 USD" --(Tokenize)--> ["$", "5", ".", "00", "usd"] --(Detokenize)--> "$ 5. 00 usd"

    Thus, you should find the original words in the context by the start and end token positions of the answer.
    Implement the function inferencing the answer from the context and the answer token postion.

    Note 1: We have already implmented direct decoding so you can skip this problem if you want.

    Note 2: When we implement squad_feature, we have arbitrarily split tokens if the answer is a subword,
            so it is very tricky to extract the original word by start_pos and end_pos.`
            However, as None is entered into the answer when evaluating,
            you can assume the word tokens follow general tokenizing rule in this problem.
            In fact, the most appropriate solution is storing the character index when tokenizing them.

    Hint: You can find a simple solution if you carefully search the documentation of the transformers library.
    Library Link: https://huggingface.co/transformers/index.html

    Arguments:
    question -- Question string
    context -- Context string

    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_pos -- Predicted start token position of the answer
    end_pos -- Predicted end token position of the answer

    tokenizer -- Tokenizer to encode and decode the string

    Return:
    answer -- Answer string
    """
    tokens = tokenizer.tokenize(context)
    context_as_is = context
    context = context.lower()
    token2char_map = {}
    start = 0
    # Map each context token index to its [start, end] character span in the
    # lowercased context, stripping the leading "##" subword prefix first.
    for j in range(len(tokens)):
        for i in range(len(tokens[j])):
            if tokens[j][i] != '#':
                break
        token = tokens[j][i:]
        start = context.find(token, start)
        end = start + len(token)
        token2char_map[j] = [start, end - 1]
        start = end
    question_tokens = ['[CLS]'] + tokenizer.tokenize(question) + ['[SEP]']
    # start_pos/end_pos index into the full [CLS] question [SEP] context sequence,
    # so shift them back to context-token indices before looking up character spans.
    start = token2char_map[int(start_pos) - len(question_tokens)][0]
    end = token2char_map[int(end_pos) - len(question_tokens)][1]
    answer = context_as_is[start:end + 1]

    return answer
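The hint in the docstring points at the fast tokenizer's offset mapping; a sketch of that alternative (not the implementation above, and the predicted span below is made up) could look like this:

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
context = "The ticket costs $5.00 USD at the door."
encoding = tokenizer(context, return_offsets_mapping=True, add_special_tokens=False)

# offset_mapping[i] is the (start_char, end_char) span of token i in the original string,
# so the answer can be sliced out of `context` without any detokenization artifacts.
start_pos, end_pos = 3, 7  # hypothetical predicted token span
start_char = encoding["offset_mapping"][start_pos][0]
end_char = encoding["offset_mapping"][end_pos][1]
answer = context[start_char:end_char]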
Example #5
def main(model, tokenizer: BertTokenizerFast, condition_type: str,
         metrics_output_path: str):
    """Compute the BERT representations + cosine similarities."""

    ## Get Relevant data

    subject_id_to_patient_info = get_subject_id_to_patient_info(
        condition_type=condition_type)
    condition_code_to_count = get_condition_code_to_count(
        condition_type=condition_type)
    condition_code_to_description = get_condition_code_to_descriptions(
        condition_type=condition_type)

    set_to_use = filter_condition_code_by_count(condition_code_to_count,
                                                min_count=0,
                                                max_count=500000)

    condition_code_to_index: Dict[str, int] = dict(
        zip(set_to_use, range(len(set_to_use))))

    mean_differential_sim, max_differential_sim, all_pair_differential_sim = [], [], []

    ## For each patient and condition, get a template, pass through BERT and return similarities

    all_subject_ids = sorted(list(subject_id_to_patient_info.keys()))
    all_subject_ids = sorted(
        resample(all_subject_ids,
                 replace=False,
                 n_samples=10000,
                 random_state=2021))

    for subject_id in tqdm(all_subject_ids):
        patient_info = subject_id_to_patient_info[subject_id]
        templates = []
        for condition in set_to_use:
            desc = condition_code_to_description[condition]
            templates.append(
                generate_template(patient_info.FIRST_NAME,
                                  patient_info.LAST_NAME, patient_info.GENDER,
                                  desc))

        name = patient_info.FIRST_NAME + " " + patient_info.LAST_NAME
        name_length = len(tokenizer.tokenize(name))

        ## Following info may change if we change the template structure.
        ## Following are on basis of structure [CLS] {title} {name} is a yo patient with {condition} [SEP]
        example_template = tokenizer.tokenize(templates[0])
        name_start_index = 2  # Name Starts after [CLS] {title}
        name_end_index = name_start_index + name_length
        condition_start_index = example_template.index("patient") + 2
        condition_end_index = -1

        ## Sanity-check the template index arithmetic on the first template;
        ## breakpoint() is only evaluated (dropping into the debugger) if an assert fails.
        assert (tokenizer.convert_tokens_to_string(
            example_template[name_start_index:name_end_index]) == " ".join(
                name.lower().split())), breakpoint()
        assert (tokenizer.convert_tokens_to_string(
            example_template[condition_start_index:condition_end_index]) ==
                " ".join(condition_code_to_description[
                    set_to_use[0]].lower().split())), breakpoint()

        ## Pass all templates to BERT and return similarities

        mean_similarities, max_similarities, all_pair_similarities = get_name_condition_similarities(
            model,
            tokenizer,
            templates,
            name_start_index,
            name_end_index,
            condition_start_index,
            condition_end_index,
        )

        condition_labels = get_condition_labels_as_vector(
            patient_info.CONDITIONS, condition_code_to_index)

        mean_differential_sim.append(
            differential_score(condition_labels, mean_similarities))
        max_differential_sim.append(
            differential_score(condition_labels, max_similarities))
        all_pair_differential_sim.append(
            differential_score(condition_labels, all_pair_similarities))

    print(f"Mean Mean Pos-Neg {np.average(mean_differential_sim)}")
    print(f"SD Mean Pos-Neg {np.std(mean_differential_sim)}")
    print(f"Mean Max Pos-Neg {np.average(max_differential_sim)}")
    print(f"SD Max Pos-Neg {np.std(max_differential_sim)}")
    print(f"Mean All Pair Pos-Neg {np.average(all_pair_differential_sim)}")
    print(f"SD All Pair Pos-Neg {np.std(all_pair_differential_sim)}")

    from experiments.MLM.common import mean_std_as_string

    with open(f"{metrics_output_path}/results.txt", "w") as f:
        f.write(mean_std_as_string("Mean Sim", mean_differential_sim))
        f.write(mean_std_as_string("Max Sim", max_differential_sim))
        f.write(mean_std_as_string("All Pair Sim", all_pair_differential_sim))
Example #6
def _create_xtoken_df(morph_df: pd.DataFrame, xtokenizer: BertTokenizerFast, sos, eos) -> pd.DataFrame:
    token_df = morph_df[['sent_id', 'token_id', 'token']].drop_duplicates()
    sent_groups = sorted(token_df.groupby([token_df.sent_id]))
    num_sentences = len(sent_groups)
    tq = tqdm(total=num_sentences, desc="Sentence")
    data_rows = []
    for sent_id, sent_df in sent_groups:
        xtokens = [(tid, t, xt) for tid, t in zip(sent_df.token_id, sent_df.token) for xt in xtokenizer.tokenize(t)]
        sent_token_indices = [0] + [tid for tid, t, xt in xtokens] + [sent_df.token_id.max() + 1]
        sent_tokens = [sos] + [t for tid, t, xt in xtokens] + [eos]
        sent_xtokens = [xtokenizer.cls_token] + [xt for tid, t, xt in xtokens] + [xtokenizer.sep_token]
        sent_index = [sent_id] * len(sent_xtokens)
        data_rows.extend(list(zip(sent_index, sent_token_indices, sent_tokens, sent_xtokens)))
        tq.update(1)
    tq.close()
    return pd.DataFrame(data_rows, columns=['sent_id', 'token_id', 'token', 'xtoken'])
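A minimal usage sketch with a toy morphological DataFrame (the sos/eos markers and model name are illustrative):

import pandas as pd
from transformers import BertTokenizerFast

xtokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
morph_df = pd.DataFrame({
    "sent_id": [1, 1, 1],
    "token_id": [1, 2, 3],
    "token": ["The", "quick", "fox"],
})
xtoken_df = _create_xtoken_df(morph_df, xtokenizer, sos="<s>", eos="</s>")
print(xtoken_df)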
Example #7
def squad_features(
        context: str, question: str, answer: Union[str, None],
        start_char_pos: Union[int, None],
        tokenizer: BertTokenizerFast) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor
    Implement the feature extractor from a Squad sample for your model
    Return values should follow [CLS + question + SEP + context + SEP] form.
    In addition, because start_char_pos is based on character index, you should convert it to proper token index.
    Check the test cases to know the functionality in detail.

    Note: input_ids and token_type_ids follow the transformers library documentation
    https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, return None for start_token_pos and end_token_pos
    start_char_pos -- Character index which the answer starts from in the context.
                      If the answer is None, this argument is also None.
    tokenizer -- Tokenizer to encode text strings.
                 Explanation: https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids 
    start_token_pos -- Token index at which the answer starts in the input_ids list.
                       None if no answer is given.
    end_token_pos -- Token index at which the answer ends in the input_ids list.
                     This index is inclusive (it points at the last answer token).
                     None if no answer is given.
    """
    ### YOUR CODE HERE (~18 lines)
    input_ids: List[int] = None
    token_type_ids: List[int] = None
    start_token_pos: int = None
    end_token_pos: int = None

    token_question = tokenizer.tokenize(question)

    tokens = ["[CLS]"] + token_question + ["[SEP]"]

    token_type_ids = [0] * len(tokens)

    # Answer available
    if start_char_pos is not None:

        token_answer = tokenizer.tokenize(answer)
        back_context_ = tokenizer.tokenize(context[start_char_pos:])

        answer_end = start_char_pos + len(answer)
        # If the answer is not immediately followed by whitespace, the characters after it
        # belong to the same word and the trailing context must be re-joined as a subword.
        if answer_end < len(context) and not _is_whitespace(context[answer_end]):
            if back_context_[len(token_answer) - 1] != token_answer[-1]:
                back_context = tokenizer.tokenize(context[answer_end:])
                back_context[0] = "##" + back_context[0]
            else:
                back_context = back_context_[len(token_answer):]

        else:
            back_context = back_context_[len(token_answer):]

        if start_char_pos == 0:
            front_context = []
            token_answer = tokenizer.tokenize(answer)

        else:
            if _is_whitespace(context[start_char_pos - 1]):
                front_context = tokenizer.tokenize(context[:start_char_pos])

            # if previous chr of answer is not space
            else:

                front_context = tokenizer.tokenize(context[:start_char_pos])
                token_answer[0] = "##" + token_answer[0]

        start_token_pos = len(tokens) + len(front_context)
        end_token_pos = start_token_pos + len(token_answer) - 1

        token_context = front_context + token_answer + back_context
        token_type_ids = token_type_ids + [1] * (len(token_context) + 1)

        tokens = tokens + token_context + ["[SEP]"]

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # No answer case
    else:
        token_context = tokenizer.tokenize(context)
        tokens = tokens + token_context + ["[SEP]"]
        token_type_ids = token_type_ids + [1] * (len(token_context) + 1)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        start_token_pos = None
        end_token_pos = None

    ### END YOUR CODE

    return input_ids, token_type_ids, start_token_pos, end_token_pos
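The character-to-token conversion above re-tokenizes slices of the context; a hedged alternative sketch that leans on BertTokenizerFast's offset mapping instead (char_to_token_span is an illustrative helper, not part of the assignment):

from typing import List, Optional, Tuple
from transformers import BertTokenizerFast

def char_to_token_span(context: str, question: str, answer: str, start_char_pos: int,
                       tokenizer: BertTokenizerFast) -> Tuple[List[int], List[int], Optional[int], Optional[int]]:
    enc = tokenizer(question, context, return_offsets_mapping=True)
    end_char_pos = start_char_pos + len(answer)
    start_tok = end_tok = None
    for i, (s, e) in enumerate(enc["offset_mapping"]):
        # skip the question segment and the special tokens, whose offsets are (0, 0)
        if enc["token_type_ids"][i] != 1 or s == e:
            continue
        if start_tok is None and e > start_char_pos:
            start_tok = i
        if s < end_char_pos:
            end_tok = i
    return enc["input_ids"], enc["token_type_ids"], start_tok, end_tok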
def classify(
    model: BertForTokenClassification,
    tokenizer: BertTokenizerFast,
    # input sequence
    sequence: str,
    labels: List[int] = None,
) -> List[Entity]:
    """
    classify的功能:
    给定model、tokenizer,sequence:
    给定label(对应以下label不为空的情况),计算loss;
    label为空:预测entities

    训练好的模型,可以直接使用
    """
    # ensure model is configured to return dict
    # otherwise this code will break
    if not model.config.return_dict:
        raise ValueError(
            'Model should be instantiated with `return_dict=True`')

    # convert input sequence (and optional labels) into an inputs bundle
    inputs, mask = pack_sequence_as_inputs(
        tokenizer=tokenizer,
        sequence=sequence,
        labels=labels,
        max_token_length=model.config.max_position_embeddings,
    )

    # put data on the gpu (if available)
    # if torch.cuda.is_available():
    #     model.cuda()
    #     inputs = {k: v.cuda() for k, v in inputs.items()}

    # if labels is not None, it means that the caller is interested in the loss
    # value of the given input sequence. So, this should be done in a grad context.
    if labels is not None:
        return model(**inputs).loss

    # if labels is None, it means that the caller is interested in the entities
    # to be recognized by the model. In this case, the outputs can be computed
    # without a grad context
    with torch.no_grad():
        logits = model(**inputs).logits.cpu()

    # decode model's output
    entities = extract_entities(
        sequence=sequence,
        logits=logits[:, 1:-1][mask],
        encode=tokenizer.encode,
        decode=tokenizer.decode,
    )
    entities = realign_extracted_entities(
        sequence=sequence,
        tokens=tokenizer.tokenize(sequence),
        entities=entities,
        vocab=tokenizer.get_vocab(),
    )

    return list(entities)
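A usage sketch, assuming a fine-tuned token-classification checkpoint and that the helpers referenced above (pack_sequence_as_inputs, extract_entities, realign_extracted_entities) are importable; the checkpoint path is a placeholder:

from transformers import BertForTokenClassification, BertTokenizerFast

model = BertForTokenClassification.from_pretrained("path/to/ner-checkpoint", return_dict=True)
tokenizer = BertTokenizerFast.from_pretrained("path/to/ner-checkpoint")

entities = classify(model, tokenizer, sequence="Apple opened a new office in Shanghai.")
for entity in entities:
    print(entity)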
Example #9
def squad_features(
        context: str, question: str, answer: Union[str, None],
        start_char_pos: Union[int, None],
        tokenizer: BertTokenizerFast) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor
    Implement the feature extractor from a Squad sample for your model
    Return values should follow [CLS + question + SEP + context + SEP] form.
    In addition, because start_char_pos is based on character index, you should convert it to proper token index.
    Check the test cases to know the functionality in detail.

    Note: input_ids and token_type_ids follow the transformers library documentation
    https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, return None for start_token_pos and end_token_pos
    start_char_pos -- Character index which the answer starts from in the context.
                      If the answer is None, this argument is also None.
    tokenizer -- Tokenizer to encode text strings.
                 Explanation: https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_token_pos -- Token index at which the answer starts in the input_ids list.
                       None if no answer is given.
    end_token_pos -- Token index at which the answer ends in the input_ids list.
                     This index is inclusive (it points at the last answer token).
                     None if no answer is given.
    """
    ### YOUR CODE HERE (~18 lines)
    encoded_dict = tokenizer.encode_plus(question, context)
    input_ids = encoded_dict["input_ids"]
    token_type_ids = encoded_dict["token_type_ids"]
    input_ids_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # print("Input (tokens): ", input_ids_tokens)
    if answer is None and start_char_pos is None:
        start_token_pos = None
        end_token_pos = None
        return input_ids, token_type_ids, start_token_pos, end_token_pos

    # Tokens with token_type_id 0 cover [CLS] + question + [SEP]; the context starts right after them.
    start_token_pos = token_type_ids.count(0)
    start_token_pos += len(tokenizer.tokenize(context[:start_char_pos]))
    end_token_pos = start_token_pos + len(tokenizer.tokenize(answer)) - 1
    # Extract tokenized answer part only
    tokenized_answer = " ".join(
        tokenizer.convert_ids_to_tokens(
            input_ids[start_token_pos:end_token_pos + 1]))

    subword_prefix_original = "##" if "##" in tokenized_answer else ""
    subword_prefix = "##"
    tokenized_answer = tokenized_answer.replace('#', '')
    if (tokenized_answer != answer.lower() and start_token_pos == end_token_pos
            and answer in tokenized_answer):
        # A single word whose subword split does not line up with the answer boundary:
        # re-split that token so the answer span matches exactly.
        new_subword_list = [
            subword_prefix_original + tokenized_answer[:len(answer)],
            subword_prefix + tokenized_answer[len(answer):]
        ]
        input_ids = (input_ids[:start_token_pos]
                     + tokenizer.convert_tokens_to_ids(new_subword_list)
                     + input_ids[end_token_pos + 1:])
        token_type_ids.append(1)

    # print("Input ids: ", input_ids)
    # input_ids_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # print("Input (tokens) (ADJUSTED): ", input_ids_tokens)
    # print("Segmend Ids: ", token_type_ids)
    # print('START_CHAR_POS: ', start_char_pos)
    # print("ANSWER: ", answer)
    # print("START: ", start_token_pos)
    # print("END: ", end_token_pos)
    # print("ANSWER SPAN: ", input_ids_tokens[start_token_pos:end_token_pos+1])
    assert len(input_ids) == len(token_type_ids)

    ### END YOUR CODE

    return input_ids, token_type_ids, start_token_pos, end_token_pos
Example #10
def squad_features_1(
    context: str,
    question: str,
    answer: Union[str, None],
    start_char_pos: Union[int, None],
    tokenizer: BertTokenizerFast
) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor
    Implement the feature extractor from a Squad sample for your model
    Return values should follow [CLS + question + SEP + context + SEP] form.
    In addition, because start_char_pos is based on character index, you should convert it to proper token index.
    Check the test cases to know the functionality in detail.

    Note: input_ids and token_type_ids follow the transformers library documentation
    https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, return None for start_token_pos and end_token_pos
    start_char_pos -- Character index which the answer starts from in the context.
                      If the answer is None, this argument is also None.
    tokenizer -- Tokenizer to encode text strings.
                 Explanation: https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids 
    start_token_pos -- Token index at which the answer starts in the input_ids list.
                       None if no answer is given.
    end_token_pos -- Token index at which the answer ends in the input_ids list.
                     This index is inclusive (it points at the last answer token).
                     None if no answer is given.
    """
    input_ids: List[int] = None
    token_type_ids: List[int] = None
    start_token_pos: int = None
    end_token_pos: int = None

    encoded_dict = tokenizer.encode_plus(question, context)
    tokens = tokenizer.tokenize(context)
    input_ids = encoded_dict['input_ids']
    token_type_ids = encoded_dict['token_type_ids']

    if answer is None:
        return input_ids, token_type_ids, None, None
    context = context.lower()
    token2char_map = {}
    start = 0
    # Map each context token index to its [start, end] character span in the
    # lowercased context, stripping the leading "##" subword prefix first.
    for j in range(len(tokens)):
        for i in range(len(tokens[j])):
            if tokens[j][i] != '#':
                break
        token = tokens[j][i:]
        start = context.find(token, start)
        end = start + len(token)
        token2char_map[j] = [start, end - 1]
        start = end

    # First context token whose span starts at or after the answer's first character.
    for i in range(len(tokens)):
        if token2char_map[i][0] >= start_char_pos:
            start_token_pos = i
            break
    end_token_pos = len(tokens) - 1
    # Last context token that starts before the answer's final character.
    for i in range(start_token_pos, len(tokens)):
        if token2char_map[i][0] >= start_char_pos + len(answer):
            end_token_pos = i - 1
            break

    # Shift from context-token indices to positions in the full input_ids, which begin
    # right after [CLS] + question + [SEP] (id 102 is [SEP] in the standard BERT vocab).
    num_tokens_before_context = input_ids.index(102) + 1
    start_token_pos += num_tokens_before_context
    end_token_pos += num_tokens_before_context

    return input_ids, token_type_ids, start_token_pos, end_token_pos
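A quick illustrative check of the char-to-token conversion on a simple case (the strings below are made up, not from the original test suite):

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
context = "BERT was released in 2018."
question = "When was BERT released?"
answer = "2018"
start_char_pos = context.index(answer)

input_ids, token_type_ids, start_tok, end_tok = squad_features_1(
    context, question, answer, start_char_pos, tokenizer)
# decoding the predicted span should recover the answer text
print(tokenizer.decode(input_ids[start_tok:end_tok + 1]))  # expected: "2018"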
Example #11
def process_data(data_file, output_file, vocab_file):
    """
    Adapted from the `gap_to_jsonlines.py` for propara data prep
    """
    tokenizer = BertTokenizerFast(vocab_file=vocab_file)
    # Need to have this other tokenizer so we can build the sub-token
    # to token map. It seems huggingface tokenizer doesn't have this
    # functionality
    basic_tokenizer = BasicTokenizer(do_lower_case=False)

    # Load data
    with open(data_file, 'r') as fp:
        data = json.load(fp)

    output_jsons = []
    # Format as jsonlines & tokenize
    for para in tqdm(data):
        output = {}
        paragraph_text = " ".join(para['sentence_texts'])

        # Sentence map
        sentence_map = [0]
        for sent_num, sent in enumerate(para['sentence_texts']):
            tokens = tokenizer.tokenize(sent)
            sentence_map += [sent_num] * len(tokens)
        sentence_map += [sentence_map[-1]]

        # All tokens
        # Note this is the same as what we used to calculate the sentence map
        # even though they are done separately
        tokenized_paragraph = tokenizer(paragraph_text,
                                        return_offsets_mapping=True)
        paragraph_tokens = tokenizer.batch_decode(
            tokenized_paragraph['input_ids'])
        token_character_offsets = tokenized_paragraph['offset_mapping']

        # Subtoken map
        # 0 element is for CLS
        subtoken_map = [0]
        for tok_id, token in enumerate(
                basic_tokenizer.tokenize(paragraph_text)):
            subtokens = tokenizer.tokenize(token)
            subtoken_map += [tok_id] * len(subtokens)
        # Add on last subtoken for SEP
        subtoken_map += [subtoken_map[-1]]

        output['para_id'] = para['para_id']
        output['speakers'] = [['[SPL]'] + ['-'] * (len(paragraph_tokens) - 2) + ['[SPL]']]
        output['sentences'] = [paragraph_tokens]
        output['sentence_map'] = sentence_map
        output['clusters'] = [[]]
        output['subtoken_map'] = subtoken_map
        output['token_char_spans'] = token_character_offsets
        output['original_text'] = paragraph_text
        output['doc_key'] = "wb"

        # Test, if we know we have a mention on tokens 2-8
        # how do we translate that to a span in the original sentence?
        output_jsons.append(output)

    # output to output_file
    with open(output_file, 'w') as fp:
        for out in output_jsons:
            fp.write(json.dumps(out) + '\n')
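A minimal invocation sketch (the file paths are placeholders):

if __name__ == "__main__":
    process_data(
        data_file="propara_train.json",
        output_file="propara_train.jsonlines",
        vocab_file="bert-base-cased-vocab.txt",
    )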