Example #1
def map_fn_senti(examples, tokenizer):
    log.debug('load data %d' % len(examples))
    contexts = [example['context'] for example in examples]
    tokenized_examples = tokenizer(contexts, max_seq_len=args.max_seq_len)
    tokenized_examples = convert_tokenizer_res_to_old_version(
        tokenized_examples)

    return tokenized_examples
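All of these snippets pass the tokenizer output through a convert_tokenizer_res_to_old_version helper that is not shown here. As a rough, hedged sketch (an assumption about its purpose, not the project's actual implementation): if the tokenizer returns a dict of per-feature lists, the helper presumably regroups it into the list-of-dicts format that the rest of the code indexes into.

def convert_tokenizer_res_to_old_version(tokenized_examples):
    # Hypothetical sketch only. If the result is already a list of dicts
    # (the "old" format), pass it through unchanged.
    if isinstance(tokenized_examples, list):
        return tokenized_examples
    # Otherwise assume a dict of equal-length lists (one entry per feature)
    # and regroup it into one dict per feature.
    keys = list(tokenized_examples.keys())
    num_features = len(tokenized_examples[keys[0]])
    return [{key: tokenized_examples[key][i]
             for key in keys} for i in range(num_features)]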
Example #2
def map_fn_DuCheckList(examples, args, tokenizer):
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit with the context of the previous feature.
    # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is
    # that HuggingFace uses an ArrowTable as the basic data structure, while we use a list of dictionaries instead.
    if args.language == 'en':
        contexts = [
            examples[i]['context'].encode('ascii',
                                          errors='replace').decode('UTF-8')
            for i in range(len(examples))
        ]
        questions = [
            examples[i]['question'].encode('ascii',
                                           errors='replace').decode('UTF-8')
            for i in range(len(examples))
        ]
    else:
        contexts = [examples[i]['context'] for i in range(len(examples))]
        questions = [examples[i]['question'] for i in range(len(examples))]

    tokenized_examples = tokenizer(questions,
                                   contexts,
                                   stride=args.doc_stride,
                                   max_seq_len=args.max_seq_len)
    tokenized_examples = convert_tokenizer_res_to_old_version(
        tokenized_examples)

    # For validation, there is no need to compute start and end positions
    for i, tokenized_example in enumerate(tokenized_examples):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_example['token_type_ids']

        # One example can give several spans; this is the index of the example containing this span of text.
        sample_index = tokenized_example['overflow_to_sample']
        tokenized_examples[i]["example_id"] = examples[sample_index]['id']

        # Set the offset_mapping entries that are not part of the context to None, so it is easy to determine
        # whether a token position is part of the context or not.
        if args.language == 'ch':
            tokenized_examples[i]["offset_mapping"] = [
                (o if sequence_ids[k] == 1 else None)
                for k, o in enumerate(tokenized_example["offset_mapping"])
            ]
        else:
            n = tokenized_example['offset_mapping'].index(
                (0, 0), 1) + 2  # context start position
            m = len(tokenized_example['offset_mapping']
                    ) - 1  # context end position + 1
            tokenized_examples[i]["offset_mapping"] = [
                (o if n <= k <= m else None)
                for k, o in enumerate(tokenized_example["offset_mapping"])
            ]
    return tokenized_examples
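To see why the non-context positions are masked with None above, here is a small self-contained illustration (toy offsets, not produced by a real tokenizer) of how a masked offset_mapping is later used to map a predicted token span back to a character span of the original context:

# Toy feature: the first three positions are question/special tokens, the rest cover the context.
offset_mapping = [None, None, None, (0, 5), (6, 11), (12, 19), (20, 25)]
context = "Alpha bravo charlie delta"

start_token, end_token = 4, 5  # a predicted token span (inclusive)
if offset_mapping[start_token] is not None and offset_mapping[end_token] is not None:
    start_char = offset_mapping[start_token][0]
    end_char = offset_mapping[end_token][1]
    print(context[start_char:end_char])  # -> "bravo charlie"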
Example #3
def map_fn_DuCheckList(examples, args, tokenizer):
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit with the context of the previous feature.
    questions = [examples[i]['question'] for i in range(len(examples))]
    contexts = [
        examples[i]['context'] + examples[i]['title']
        for i in range(len(examples))
    ]

    tokenized_examples = tokenizer(questions,
                                   contexts,
                                   stride=args.doc_stride,
                                   max_seq_len=args.max_seq_len)
    tokenized_examples = convert_tokenizer_res_to_old_version(
        tokenized_examples)

    log.debug('\nexample: %d' % len(examples))
    log.debug('feature: %d\n' % len(tokenized_examples))

    # For validation, there is no need to compute start and end positions
    for i, tokenized_example in enumerate(tokenized_examples):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_example['token_type_ids']

        # One example can give several spans; this is the index of the example containing this span of text.
        sample_index = tokenized_example['overflow_to_sample']
        tokenized_examples[i]["example_id"] = examples[sample_index]['id']

        # Set the offset_mapping entries that are not part of the context to None, so it is easy to determine
        # whether a token position is part of the context or not.
        if args.language == 'ch':
            tokenized_examples[i]["offset_mapping"] = [
                (o if sequence_ids[k] == 1 else None)
                for k, o in enumerate(tokenized_example["offset_mapping"])
            ]
        else:
            n = tokenized_example['offset_mapping'].index(
                (0, 0), 1) + 2  # context start position
            m = len(tokenized_example['offset_mapping']
                    ) - 1  # context end position + 1
            tokenized_examples[i]["offset_mapping"] = [
                (o if n <= k < m else None)
                for k, o in enumerate(tokenized_example["offset_mapping"])
            ]

        tokenized_examples[i]['question'] = examples[sample_index]['question']
        tokenized_examples[i]['context'] = examples[sample_index]['context']
        #tokenized_examples[i]['answer'] = examples[sample_index]['answers'][0]
        #tokenized_examples[i]['is_impossible'] = examples[sample_index]['is_impossible']

    return tokenized_examples
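The stride/overflow behaviour that the DuCheckList mapping functions rely on can be illustrated without the real tokenizer. A toy sketch (my own simplification, ignoring special tokens) of how one long token sequence becomes several overlapping features, each remembering which example it came from, mirroring the overflow_to_sample field:

def split_with_stride(token_ids, max_len, stride, sample_index):
    # Toy overflow handling: fixed-size windows that overlap by `stride` tokens.
    features = []
    step = max_len - stride
    for start in range(0, len(token_ids), step):
        window = token_ids[start:start + max_len]
        features.append({"input_ids": window, "overflow_to_sample": sample_index})
        if start + max_len >= len(token_ids):
            break
    return features

print(split_with_stride(list(range(10)), max_len=4, stride=2, sample_index=0))
# -> 4 overlapping windows, all pointing back to example 0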
Example #4
def map_fn_senti(examples, tokenizer, language):
    print('load data %d' % len(examples))
    if language == 'ch':
        q_name = "query"
        t_name = "title"
    else:
        q_name = "sentence1"
        t_name = "sentence2"
    queries = [example[q_name] for example in examples]
    titles = [example[t_name] for example in examples]
    tokenized_examples = tokenizer(
        queries, titles, max_seq_len=args.max_seq_len)
    tokenized_examples = convert_tokenizer_res_to_old_version(
        tokenized_examples)

    return tokenized_examples
Example #5
def map_fn_senti(examples, tokenizer):
    print('load data %d' % len(examples))
    if args.language == 'ch':
        query = 'query'
        title = 'title'
    else:
        query = 'sentence1'
        title = 'sentence2'
    queries = [example[query] for example in examples]
    titles = [example[title] for example in examples]
    tokenized_examples = tokenizer(queries,
                                   titles,
                                   max_seq_len=args.max_seq_len)
    tokenized_examples = convert_tokenizer_res_to_old_version(
        tokenized_examples)

    return tokenized_examples
Example #6
def map_fn_senti(examples, tokenizer, args):
    log.debug('load data %d' % len(examples))
    if args.language == 'en':
        contexts = [
            example['context'].encode('ascii',
                                      errors='replace').decode('UTF-8')
            for example in examples
        ]
    else:
        contexts = [example['context'] for example in examples]
    tokenized_examples = tokenizer(contexts, max_seq_len=args.max_seq_len)
    tokenized_examples = convert_tokenizer_res_to_old_version(
        tokenized_examples)
    for i in range(len(tokenized_examples)):
        tokenized_examples[i]['offset_mapping'] = [
            (0, 0)
        ] + tokenizer.get_offset_mapping(
            contexts[i])[:args.max_seq_len - 2] + [(0, 0)]
    return tokenized_examples
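The function above builds the offset mapping by hand: a (0, 0) placeholder for the leading special token, the character offsets of the (truncated) context from tokenizer.get_offset_mapping, and a trailing (0, 0) for the final special token. A toy illustration with made-up offsets:

max_seq_len = 6
context_offsets = [(0, 3), (4, 7), (8, 12), (13, 18), (19, 22)]  # one (start, end) pair per context token

# Keep at most max_seq_len - 2 context tokens so the two special-token placeholders still fit.
offset_mapping = [(0, 0)] + context_offsets[:max_seq_len - 2] + [(0, 0)]
print(offset_mapping)
# -> [(0, 0), (0, 3), (4, 7), (8, 12), (13, 18), (0, 0)]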
Example #7
def map_fn_DuCheckList(examples, args, tokenizer):
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit with the context of the previous feature.
    if args.language == 'en':
        questions = [
            examples[i]['question'].encode('ascii',
                                           errors='replace').decode('UTF-8')
            for i in range(len(examples))
        ]
        contexts = [
            examples[i]['context'].encode('ascii',
                                          errors='replace').decode('UTF-8')
            for i in range(len(examples))
        ]
    else:
        questions = [examples[i]['question'] for i in range(len(examples))]
        contexts = [examples[i]['context'] for i in range(len(examples))]
    tokenized_examples = tokenizer(questions,
                                   contexts,
                                   stride=args.doc_stride,
                                   max_seq_len=args.max_seq_len)
    tokenized_examples = convert_tokenizer_res_to_old_version(
        tokenized_examples)

    log.debug('\nexample: %d' % len(examples))
    log.debug('feature: %d\n' % len(tokenized_examples))

    # For validation, there is no need to compute start and end positions
    for i, tokenized_example in enumerate(tokenized_examples):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_example['token_type_ids']

        # One example can give several spans; this is the index of the example containing this span of text.
        sample_index = tokenized_example['overflow_to_sample']
        tokenized_examples[i]["example_id"] = examples[sample_index]['id']
        tokenized_examples[i]['question'] = examples[sample_index]['question']
        tokenized_examples[i]['context'] = examples[sample_index]['context']
        tokenized_examples[i]['sent_token'] = examples[sample_index][
            'sent_token']

    return tokenized_examples
Example #8
def map_fn_senti(examples, tokenizer, language):
    print('load data %d' % len(examples))
    if language == 'ch':
        q_name = "query"
        t_name = "title"
        queries = [example[q_name] for example in examples]
        titles = [example[t_name] for example in examples]
    else:
        q_name = "sentence1"
        t_name = "sentence2"
        queries = [
            example[q_name].encode('ascii', errors='replace').decode('UTF-8')
            for example in examples
        ]
        titles = [
            example[t_name].encode('ascii', errors='replace').decode('UTF-8')
            for example in examples
        ]
    tokenized_examples = tokenizer(queries,
                                   titles,
                                   max_seq_len=args.max_seq_len)

    tokenized_examples = convert_tokenizer_res_to_old_version(
        tokenized_examples)

    for i in range(len(tokenized_examples)):
        tokenized_examples[i]['query_offset_mapping'] = [
            (0, 0)
        ] + tokenizer.get_offset_mapping(
            queries[i])[:args.max_seq_len - 2] + [(0, 0)]
        tokenized_examples[i]['title_offset_mapping'] = [
            (0, 0)
        ] + tokenizer.get_offset_mapping(
            titles[i])[:args.max_seq_len - 2] + [(0, 0)]

    return tokenized_examples
Example #9
def map_fn_DuCheckList_finetune(examples):
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit with the context of the previous feature.
    questions = [examples[i]['question'] for i in range(len(examples))]
    contexts = [
        examples[i]['context'] + examples[i]['title']
        for i in range(len(examples))
    ]

    tokenized_examples = tokenizer(
        questions,
        contexts,
        stride=args.doc_stride,
        max_seq_len=args.max_seq_len)
    tokenized_examples = convert_tokenizer_res_to_old_version(
        tokenized_examples)

    for i, tokenized_example in enumerate(tokenized_examples):

        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_example["input_ids"]  # list(seq)
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_example['token_type_ids']  # list(seq)

        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offsets = tokenized_example['offset_mapping']  # list(seq)

        # One example can give several spans; this is the index of the example containing this span of text.  # int
        sample_index = tokenized_example['overflow_to_sample']  # int
        if args.language == 'ch':
            answers = examples[sample_index]['answers']  # list
            answer_starts = examples[sample_index]['answer_starts']  # list
        else:
            example = examples[sample_index]
            example['question_len'] = len(example['question'].split())
            example['context_len'] = len(example['context'].split())

            answers = example['answers']  # list
            answer_starts = example['answer_starts']  # list

        # If no answers are given, set the cls_index as the answer.
        if len(answer_starts) == 0:
            tokenized_examples[i]["start_positions"] = cls_index
            tokenized_examples[i]["end_positions"] = cls_index
            tokenized_examples[i]['answerable_label'] = 0
        else:
            # Start/end character index of the answer in the text.
            start_char = answer_starts[0]
            end_char = start_char + len(answers[0])
            if args.language == 'ch':
                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 2
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1
            else:

                token_start_index = tokenized_example['context_start_id']
                token_end_index = tokenized_example['context_end_id']

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and
                    offsets[token_end_index][1] >= end_char):
                tokenized_examples[i]["start_positions"] = cls_index
                tokenized_examples[i]["end_positions"] = cls_index
                tokenized_examples[i]['answerable_label'] = 0
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[
                        token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples[i]["start_positions"] = token_start_index - 1
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples[i]["end_positions"] = token_end_index + 1
                tokenized_examples[i]['answerable_label'] = 1

    return tokenized_examples
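The start/end search at the end of Example #9 is easier to follow on concrete numbers. Here is a self-contained toy run of the same logic (toy offsets, no real tokenizer) showing how a character-level answer span becomes token-level start_positions/end_positions:

# Toy feature: tokens 2-5 cover the context, with these character offsets.
offsets = [(0, 0), (0, 0), (0, 5), (6, 11), (12, 19), (20, 25), (0, 0)]
token_start_index, token_end_index = 2, 5  # context token range (inclusive)
start_char, end_char = 6, 19               # the answer covers characters 6..19

# Same search as in map_fn_DuCheckList_finetune:
while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
    token_start_index += 1
start_positions = token_start_index - 1    # -> 3

while offsets[token_end_index][1] >= end_char:
    token_end_index -= 1
end_positions = token_end_index + 1        # -> 4

print(start_positions, end_positions)      # tokens 3..4 cover characters 6..19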