Example #1
import numpy as np

def convert_examples_to_features(example, tokenizer=None, truncate_length=512, cls_token=None,
                                 sep_token=None, class_labels=None, label_alias=None, vocab=None,
                                 is_test=False):
    """convert glue examples into necessary features"""
    if not is_test:
        label_dtype = 'int32' if class_labels else 'float32'
        # get the label
        label = example[-1]
        example = example[:-1]
        # create a label map for classification tasks
        if class_labels:
            label_map = {}
            for (i, l) in enumerate(class_labels):
                label_map[l] = i
            if label_alias:
                for key in label_alias:
                    label_map[key] = label_map[label_alias[key]]
            label = label_map[label]
        label = np.array([label], dtype=label_dtype)

    # tokenize raw text
    tokens_raw = [tokenizer(l) for l in example]
    # truncate to truncate_length
    tokens_trun = truncate_seqs_equal(tokens_raw, truncate_length)
    # concatenate the sequences with special tokens
    tokens_trun[0] = [cls_token] + tokens_trun[0]
    tokens, segment_ids, _ = concat_sequences(tokens_trun, [[sep_token]] * len(tokens_trun))
    # convert the tokens to ids
    input_ids = vocab[tokens]
    valid_length = len(input_ids)
    if not is_test:
        return input_ids, segment_ids, valid_length, label
    else:
        return input_ids, segment_ids, valid_length
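
A minimal usage sketch (not part of the original example), assuming GluonNLP is installed, that truncate_seqs_equal and concat_sequences are importable from the same module, and that raw rows have the form [sentence_a, sentence_b, label]; the model name and label set below are illustrative:

import functools

import gluonnlp as nlp

# Pre-trained vocabulary and matching wordpiece tokenizer (illustrative checkpoint).
_, vocab = nlp.model.get_model('bert_12_768_12',
                               dataset_name='book_corpus_wiki_en_uncased',
                               pretrained=True,
                               use_classifier=False,
                               use_decoder=False)
tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)

# Bind the static arguments once, then featurize raw rows one by one.
featurize = functools.partial(convert_examples_to_features,
                              tokenizer=tokenizer,
                              truncate_length=128,
                              cls_token=vocab.cls_token,
                              sep_token=vocab.sep_token,
                              class_labels=['0', '1'],
                              vocab=vocab)

input_ids, segment_ids, valid_length, label = featurize(['first sentence', 'second sentence', '1'])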
Example #2
import numpy as np

def convert_examples_to_features(example, tokenizer=None, truncate_length=512, cls_token=None,
                                 sep_token=None, class_labels=None, label_alias=None, vocab=None,
                                 is_test=False):
    """Convert GLUE/SuperGLUE classification and regression examples into 
        the necessary features"""
    if not is_test:
        label_dtype = 'int32' if class_labels else 'float32'
        example, label = example[:-1], example[-1]
        # create a label map for classification tasks
        if class_labels:
            label_map = {}
            for (i, l) in enumerate(class_labels):
                label_map[l] = i
            if label_alias:
                for key in label_alias:
                    label_map[key] = label_map[label_alias[key]]
            # Fix for BoolQ, WSC, and MultiRC: JSON labels are loaded as booleans rather
            # than strings.
            if isinstance(label, bool):
                label = "true" if label else "false"
            # Fix for COPA: integer labels are normalized to their string form.
            if isinstance(label, int):
                label = "0" if label == 0 else "1"
            label = label_map[label]
        label = np.array([label], dtype=label_dtype)
    tokens_raw = [tokenizer(l) for l in example]
    tokens_trun = truncate_seqs_equal(tokens_raw, truncate_length)
    tokens_trun[0] = [cls_token] + tokens_trun[0]
    tokens, segment_ids, _ = concat_sequences(
        tokens_trun, [[sep_token]] * len(tokens_trun))
    input_ids = vocab[tokens]
    valid_length = len(input_ids)

    if not is_test:
        return input_ids, segment_ids, valid_length, label
    else:
        return input_ids, segment_ids, valid_length
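
This variant's boolean and integer normalization can be exercised with a BoolQ-style row. In the sketch below (not from the source), tokenizer and vocab are assumed to be set up as in Example #1, and the 'false'/'true' label set is an assumption for illustration:

import functools

# tokenizer and vocab as in Example #1 (assumed); the label set is illustrative.
featurize = functools.partial(convert_examples_to_features,
                              tokenizer=tokenizer,
                              truncate_length=256,
                              cls_token=vocab.cls_token,
                              sep_token=vocab.sep_token,
                              class_labels=['false', 'true'],
                              vocab=vocab)

# BoolQ-style row [question, passage, label]: the Python bool True is first
# normalized to the string 'true', then mapped to class index 1.
input_ids, segment_ids, valid_length, label = featurize(
    ['does water boil at 100 degrees celsius', 'at standard pressure, water boils at 100 C.', True])
# label == np.array([1], dtype='int32')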
Example #3
def convert_examples_to_features(example,
                                 tokenizer=None,
                                 cls_token=None,
                                 sep_token=None,
                                 vocab=None,
                                 max_seq_length=384,
                                 doc_stride=128,
                                 max_query_length=64,
                                 cls_index=0):
    """convert the examples to the BERT features"""
    query_tokenized = [cls_token] + tokenizer(
        example.question_text)[:max_query_length]
    # tokenize the paragraph and get the answer's start/end positions in the tokenized text
    tok_start_position, tok_end_position, all_doc_tokens, _, tok_to_orig_index = \
        tokenize_and_align_positions(example.doc_tokens,
                                     example.start_position,
                                     example.end_position,
                                     tokenizer)
    # get doc spans using sliding window
    doc_spans, doc_spans_indices = get_doc_spans(
        all_doc_tokens, max_seq_length - len(query_tokenized) - 2, doc_stride)

    if not example.is_impossible:
        (tok_start_position, tok_end_position) = improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            example.orig_answer_text)
        # get the new start/end position
        positions = [
            align_position2doc_spans([tok_start_position, tok_end_position],
                                     doc_idx,
                                     offset=len(query_tokenized) + 1,
                                     default_value=0)
            for doc_idx in doc_spans_indices
        ]
    else:
        # if the question is impossible to answer, set the start/end position to cls index
        positions = [[cls_index, cls_index] for _ in doc_spans_indices]

    # record whether the tokens in a docspan have max context
    token_is_max_context = [{
        len(query_tokenized) + p:
        check_is_max_context(doc_spans_indices, i, p + doc_spans_indices[i][0])
        for p in range(len(doc_span))
    } for (i, doc_span) in enumerate(doc_spans)]

    token_to_orig_map = [{
        len(query_tokenized) + p + 1:
        tok_to_orig_index[p + doc_spans_indices[i][0]]
        for p in range(len(doc_span))
    } for (i, doc_span) in enumerate(doc_spans)]

    # get sequence features: tokens, segment_ids, p_masks
    seq_features = [
        concat_sequences([query_tokenized, doc_span], [[sep_token]] * 2)
        for doc_span in doc_spans
    ]

    features = [
        SquadBERTFeautre(example_id=example.example_id,
                         qas_id=example.qas_id,
                         doc_tokens=example.doc_tokens,
                         valid_length=len(tokens),
                         tokens=tokens,
                         token_to_orig_map=t2o,
                         token_is_max_context=is_max,
                         input_ids=vocab[tokens],
                         p_mask=p_mask,
                         segment_ids=segment_ids,
                         start_position=start,
                         end_position=end,
                         is_impossible=example.is_impossible)
        for (tokens, segment_ids, p_mask), (start, end), is_max, t2o in zip(
            seq_features, positions, token_is_max_context, token_to_orig_map)
    ]
    return features
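
A sketch of driving this SQuAD-style variant (not from the source): the example object is a stand-in built with SimpleNamespace that only exposes the attributes the function reads, and tokenizer/vocab are assumed to be the BERT wordpiece tokenizer and vocabulary from Example #1; all field values are placeholders.

import functools
from types import SimpleNamespace

# Stand-in for a preprocessed SQuAD example; positions index into doc_tokens.
example = SimpleNamespace(example_id=0,
                          qas_id='qid-0',
                          question_text='who won the game ?',
                          doc_tokens=['the', 'home', 'team', 'won', 'the', 'game', 'in', 'overtime', '.'],
                          orig_answer_text='the home team',
                          start_position=0,
                          end_position=2,
                          is_impossible=False)

featurize = functools.partial(convert_examples_to_features,
                              tokenizer=tokenizer,   # BERT wordpiece tokenizer, as in Example #1
                              cls_token=vocab.cls_token,
                              sep_token=vocab.sep_token,
                              vocab=vocab,
                              max_seq_length=384,
                              doc_stride=128,
                              max_query_length=64)

# One SquadBERTFeautre per sliding-window doc span.
features = featurize(example)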