示例#1
0
def convert_examples_to_features(example, tokenizer=None, truncate_length=512, cls_token=None,
                                 sep_token=None, class_labels=None, label_alias=None, vocab=None,
                                 is_test=False):
    """Turn one GLUE example into model input features.

    Unless ``is_test`` is set, the last element of ``example`` is taken as the
    label and the remaining elements as the text fields. When ``class_labels``
    is truthy the label is mapped to its position in ``class_labels``
    (``label_alias`` may map extra label names onto existing ones) and stored
    as an ``int32`` array of length one; otherwise the raw label is stored as
    a ``float32`` array of length one.

    Each text field is tokenized with ``tokenizer``, the token lists are
    passed through ``truncate_seqs_equal`` with ``truncate_length``,
    ``cls_token`` is prepended to the first list, and the lists are joined by
    ``concat_sequences`` using ``sep_token`` as the separator for each one.

    Returns ``(input_ids, segment_ids, valid_length)`` for test examples and
    ``(input_ids, segment_ids, valid_length, label)`` otherwise, where
    ``input_ids`` is ``vocab[tokens]`` and ``valid_length`` is its length.
    """
    label_array = None
    if not is_test:
        is_classification = bool(class_labels)
        # The trailing field of a labelled example is its label.
        raw_label = example[-1]
        example = example[:-1]
        if is_classification:
            # Class name -> class index, in the order given by class_labels.
            label_to_index = {name: index for index, name in enumerate(class_labels)}
            if label_alias:
                for alias in label_alias:
                    label_to_index[alias] = label_to_index[label_alias[alias]]
            label_array = np.array([label_to_index[raw_label]], dtype='int32')
        else:
            label_array = np.array([raw_label], dtype='float32')

    # Tokenize every text field, then truncate the resulting token lists.
    token_lists = truncate_seqs_equal([tokenizer(text) for text in example], truncate_length)
    # The classification token leads the first sequence.
    token_lists[0] = [cls_token] + token_lists[0]
    separators = [[sep_token]] * len(token_lists)
    tokens, segment_ids, _ = concat_sequences(token_lists, separators)
    # Look the tokens up in the vocabulary to obtain their ids.
    input_ids = vocab[tokens]
    features = (input_ids, segment_ids, len(input_ids))
    if is_test:
        return features
    return features + (label_array,)
示例#2
0
def convert_examples_to_features(example, tokenizer=None, truncate_length=512, cls_token=None,
                                 sep_token=None, class_labels=None, label_alias=None, vocab=None,
                                 is_test=False):
    """Convert a GLUE/SuperGLUE classification or regression example into features.

    Parameters
    ----------
    example : sequence
        The text fields of the example. Unless ``is_test`` is set, the last
        element is the label and is split off before tokenization.
    tokenizer : callable
        Called on every text field to produce its token list.
    truncate_length : int
        Length limit handed to ``truncate_seqs_equal``.
    cls_token, sep_token
        ``cls_token`` is prepended to the first token list; ``sep_token`` is
        the separator passed to ``concat_sequences`` for every token list.
    class_labels : sequence or None
        When truthy the task is treated as classification: the label is
        replaced by its index in ``class_labels`` and stored as ``int32``.
        Otherwise the raw label is stored as ``float32``.
    label_alias : mapping or None
        Extra label names, each mapped to a name already in the label map.
    vocab
        Indexed with the token list to obtain the input ids.
    is_test : bool
        When true, ``example`` carries no label and none is returned.

    Returns
    -------
    tuple
        ``(input_ids, segment_ids, valid_length)`` when ``is_test`` is true,
        otherwise ``(input_ids, segment_ids, valid_length, label)`` where
        ``label`` is a length-one numpy array.

    Raises
    ------
    KeyError
        If the (possibly normalized) label is not a known class label or alias.
    """
    if not is_test:
        label_dtype = 'int32' if class_labels else 'float32'
        example, label = example[:-1], example[-1]
        # Create the label map if this is a classification task.
        if class_labels:
            label_map = {name: index for index, name in enumerate(class_labels)}
            if label_alias:
                for key in label_alias:
                    label_map[key] = label_map[label_alias[key]]
            # Only normalize labels that are not already valid keys, so that
            # non-string class labels (e.g. integers) keep working.
            if label not in label_map:
                # BoolQ, WSC and MultiRC: JSON values are loaded as booleans
                # rather than strings. bool must be tested before int because
                # bool is a subclass of int.
                if isinstance(label, bool):
                    label = "true" if label else "false"
                # COPA: JSON labels are loaded as integers. Using str() keeps
                # 0 -> "0" and 1 -> "1" while letting any other integer fail
                # the lookup instead of being silently treated as class "1".
                elif isinstance(label, int):
                    label = str(label)
            label = label_map[label]
        label = np.array([label], dtype=label_dtype)

    # Tokenize each text field and truncate the token lists.
    tokens_raw = [tokenizer(text) for text in example]
    tokens_trun = truncate_seqs_equal(tokens_raw, truncate_length)
    # Prepend the classification token, then join the lists with separators.
    tokens_trun[0] = [cls_token] + tokens_trun[0]
    tokens, segment_ids, _ = concat_sequences(
        tokens_trun, [[sep_token]] * len(tokens_trun))
    # Convert the tokens to ids.
    input_ids = vocab[tokens]
    valid_length = len(input_ids)

    if is_test:
        return input_ids, segment_ids, valid_length
    return input_ids, segment_ids, valid_length, label