Example #1
def get_feature_from_data(tokenizer,
                          maxlen,
                          input,
                          previous,
                          target=None,
                          ntarget=None,
                          reserved_len=0,
                          handle_exceed='noop',
                          **kwargs):
    feature_dict_list = []
    pred_len = len(tokenizer.convert_tokens_to_ids(
        target)) if target is not None else len(previous)
    t_input_list, _ = tok.handle_exceed(tokenizer, input,
                                        maxlen - 3 - pred_len - reserved_len,
                                        handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        t_input = [tok.tok_begin(tokenizer)
                   ] + t_input + [tok.tok_sep(tokenizer)]
        t_input.extend(previous)
        t_input_id = tokenizer.convert_tokens_to_ids(t_input)
        target_start = len(t_input_id) - 1

        row_dict['target'] = [-1] * maxlen
        row_dict['ntarget'] = [-1] * maxlen

        if target is not None:
            t_input_id.extend(tokenizer.convert_tokens_to_ids(target))
            tokenized_target_id = [-1] * target_start
            tokenized_target_id.extend(
                tokenizer.convert_tokens_to_ids(target +
                                                [tok.tok_sep(tokenizer)]))
            tokenized_target_id.extend([-1] *
                                       (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
        if ntarget is not None and len(tokenizer.tokenize(ntarget)) > 0:
            tokenized_ntarget = tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(ntarget))
            tokenized_ntarget_id = [-1] * target_start
            tokenized_ntarget_id.extend(tokenized_ntarget)
            tokenized_ntarget_id.extend([-1] *
                                        (maxlen - len(tokenized_ntarget_id)))
            if len(tokenized_ntarget_id) <= maxlen:
                row_dict['ntarget'] = tokenized_ntarget_id

        mask_id = [1] * len(t_input_id)
        t_input_id.extend(
            tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)]) *
            (maxlen - len(t_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['input'] = t_input_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        feature_dict_list.append(row_dict)

    return feature_dict_list
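A minimal usage sketch for the builder above, hedged: it assumes the `tok` helper module these examples import (a tfkit-style utility) and a HuggingFace BERT-style tokenizer; `previous` holds the tokens generated so far and is empty on the first step, while `target` is passed as a token list.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')  # assumed checkpoint
features = get_feature_from_data(
    tokenizer,
    maxlen=128,
    input="a sentence to condition the generator on",
    previous=[],                                    # nothing generated yet
    target=tokenizer.tokenize("the target text"))   # token list, as the code expects
print(features[0]['start'], len(features[0]['input']))  # start index and padded length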
Example #2
def get_feature_from_data(tokenizer,
                          maxlen,
                          tasks,
                          task,
                          input,
                          target=None,
                          handle_exceed='slide',
                          **kwargs):
    feature_dict_list = []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 2,
                                        handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        row_dict['task'] = task
        input_token = [tok.tok_begin(tokenizer)
                       ] + t_input + [tok.tok_sep(tokenizer)]
        tokenized_input_id = tokenizer.convert_tokens_to_ids(input_token)
        mask_id = [1] * len(tokenized_input_id)
        tokenized_input_id.extend([tokenizer.pad_token_id] *
                                  (maxlen - len(tokenized_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['mask'] = mask_id
        row_dict['target'] = [-1]
        if target is not None:
            if 'multi_label' in task:
                mlb = MultiLabelBinarizer(classes=tasks[task])
                tar = mlb.fit_transform([target])
                tokenize_label = tar
            else:
                tokenize_label = [tasks[task].index(target[0])]
            row_dict['target'] = tokenize_label
        feature_dict_list.append(row_dict)
    return feature_dict_list
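A hedged usage sketch for the classification builder above: it assumes a single-label task (no 'multi_label' in the task name), with `tasks` mapping a task name to its label list and `target` a one-element list holding the gold label.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # assumed checkpoint
tasks = {'sentiment': ['negative', 'positive']}
features = get_feature_from_data(
    tokenizer,
    maxlen=128,
    tasks=tasks,
    task='sentiment',
    input="the movie was great",
    target=['positive'])
print(features[0]['target'])  # [1], the index of 'positive' in tasks['sentiment']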
Example #3
def get_feature_from_data(tokenizer,
                          maxlen,
                          input,
                          target=None,
                          ntarget=None,
                          reserved_len=0,
                          handle_exceed='start_slice',
                          add_end_tok=True,
                          **kwargs):
    feature_dict_list = []
    tokenized_target = tokenizer.tokenize(target) if target is not None else []
    t_input_list, _ = tok.handle_exceed(tokenizer, input,
                                        maxlen - 3 - len(tokenized_target),
                                        handle_exceed)
    for t_input in t_input_list:  # -3 for cls, sep and prediction end sep
        row_dict = dict()
        tokenized_input = [
            tok.tok_begin(tokenizer)
        ] + t_input[:maxlen - reserved_len - 3] + [tok.tok_sep(tokenizer)]

        row_dict['target'] = [-1] * maxlen
        row_dict['target_once'] = [-1] * maxlen
        tokenized_input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        target_start = len(tokenized_input_id)
        target_end = maxlen
        target_length = target_end - target_start

        if target is not None:
            if add_end_tok:
                tokenized_target += [tok.tok_sep(tokenizer)]
            tokenized_target_id = []
            tokenized_target_once_id = [-1] * len(tokenized_input)
            target_ids = tokenizer.convert_tokens_to_ids(tokenized_target)
            target_length = len(target_ids)
            tokenized_target_id.extend(target_ids)
            tokenized_target_once_id.extend(target_ids)
            target_end = len(tokenized_target_id) - 1
            tokenized_target_id.extend([-1] *
                                       (maxlen - len(tokenized_target_id)))
            tokenized_target_once_id.extend(
                [-1] * (maxlen - len(tokenized_target_once_id)))
            row_dict['target'] = tokenized_target_id
            row_dict['target_once'] = tokenized_target_once_id

        input_length = min(maxlen, target_start * 3)
        tokenized_input_id.extend([tokenizer.mask_token_id] *
                                  (maxlen - len(tokenized_input_id)))
        mask_id = [1] * input_length
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        row_dict['end'] = target_end
        row_dict['input_length'] = input_length
        row_dict['target_length'] = target_length
        feature_dict_list.append(row_dict)

    return feature_dict_list
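A hedged usage sketch for the fill-at-once variant above, under the same assumptions; note that `target` here is a plain string (it is tokenized inside), unlike the token list in Example #1.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')  # assumed checkpoint
features = get_feature_from_data(
    tokenizer,
    maxlen=128,
    input="a sentence to condition on",
    target="the target text")
print(features[0]['start'], features[0]['target_length'])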
Example #4
def get_feature_from_data(tokenizer,
                          maxlen,
                          input,
                          previous,
                          target=None,
                          ntarget=None,
                          reserved_len=0,
                          handle_exceed='noop',
                          **kwargs):
    feature_dict_list = []
    t_input_list, _ = tok.handle_exceed(tokenizer, input,
                                        maxlen - 2 - len(previous) - 1,
                                        handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        t_input = [tok.tok_begin(tokenizer)] + \
                  t_input[:maxlen - reserved_len - 2] + \
                  [tok.tok_sep(tokenizer)]
        t_input.extend(previous)
        t_input.append(tok.tok_mask(tokenizer))
        t_input_id = tokenizer.convert_tokens_to_ids(t_input)
        mask_id = [1] * len(t_input)
        target_start = len(t_input_id) - 1
        target_end = maxlen
        t_input_id.extend([0] * (maxlen - len(t_input_id)))
        row_dict['target'] = [-1] * maxlen
        row_dict['ntarget'] = [-1] * maxlen
        tokenized_target_id = None
        if target is not None:
            tokenized_target_id = [-1] * target_start
            tokenized_target_id.append(
                tokenizer.convert_tokens_to_ids(target)[-1])
            target_end = len(tokenized_target_id) - 1
            tokenized_target_id.extend([-1] *
                                       (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
        if ntarget is not None and len(tokenizer.tokenize(ntarget)) > 0:
            tokenized_ntarget = tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(ntarget))
            tokenized_ntarget_id = [-1] * target_start
            tokenized_ntarget_id.extend(tokenized_ntarget)
            tokenized_ntarget_id.extend([-1] *
                                        (maxlen - len(tokenized_ntarget_id)))
            if len(tokenized_ntarget_id) <= maxlen:
                row_dict['ntarget'] = tokenized_ntarget_id

        mask_id.extend([0] * (maxlen - len(mask_id)))
        type_id = [0] * len(t_input)
        type_id.extend([1] * (maxlen - len(type_id)))
        row_dict['input'] = t_input_id
        row_dict['type'] = type_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        row_dict['end'] = target_end
        feature_dict_list.append(row_dict)

    return feature_dict_list
Example #5
def get_feature_from_data(tokenizer, maxlen, input, target=None, handle_exceed='start_slice', **kwargs):
    feature_dict_list = []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 2, handle_exceed)

    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input + [tok.tok_sep(tokenizer)]
        tokenized_input_id = tokenizer.convert_tokens_to_ids(tokenized_input)

        row_dict['target'] = [-1] * maxlen
        if target is not None:
            tokenized_target = []
            targets_pointer = 0
            for tok_pos, text in enumerate(tokenized_input):
                if text == tok.tok_mask(tokenizer):
                    if targets_pointer == int(target):
                        tok_target = 1
                    else:
                        tok_target = 0
                    tokenized_target.extend([tok_target])
                    targets_pointer += 1
                else:
                    tokenized_target.append(-1)
            tokenized_target.extend([-1] * (maxlen - len(tokenized_target)))
            row_dict['target'] = tokenized_target
        target_pos_list = []
        for tok_pos, text in enumerate(tokenized_input):
            if text == tok.tok_mask(tokenizer):
                target_pos_list.append(tok_pos)
        target_pos_list.extend([0] * (4 - len(target_pos_list)))
        if len(target_pos_list) != 4:
            continue
        row_dict['target_pos'] = target_pos_list

        mask_id = [1] * len(tokenized_input)
        type_id = [0] * len(tokenized_input)
        tokenized_input_id.extend(
            [tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)])[0]] * (maxlen - len(tokenized_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        type_id.extend([1] * (maxlen - len(type_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['type'] = type_id
        row_dict['mask'] = mask_id
        feature_dict_list.append(row_dict)
    return feature_dict_list
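A hedged usage sketch for the mask-selection builder above. Assumptions: a BERT-style tokenizer whose [MASK] literal survives `tok.handle_exceed` as the mask token, and `target` given as the index of the correct [MASK] among at most four candidates.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # assumed checkpoint
features = get_feature_from_data(
    tokenizer,
    maxlen=64,
    input="the answer is [MASK] rather than [MASK]",
    target=1)                     # the second [MASK] is the correct choice
print(features[0]['target_pos'])  # [MASK] positions, zero-padded to length 4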
Example #6
def get_feature_from_data(tokenizer, maxlen, input, target=None, ntarget=None, reserved_len=0,
                          handle_exceed='start_slice', add_end_tok=True, **kwargs):
    feature_dict_list = []
    tokenized_target = tokenizer.tokenize(target) if target is not None else []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 3 - len(tokenized_target), handle_exceed)
    for t_input in t_input_list:  # -3 for cls, sep and prediction end sep
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input[:maxlen - reserved_len - 3] + [tok.tok_sep(tokenizer)]
        mask_id = [1] * len(tokenized_input)
        type_id = [0] * len(tokenized_input)

        row_dict['target'] = [-1] * maxlen
        row_dict['ntarget'] = [-1] * maxlen

        tokenized_input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        target_start = len(tokenized_input_id)
        if target is not None:
            if add_end_tok:
                tokenized_target += [tok.tok_sep(tokenizer)]
            tokenized_target_id = [-1] * len(tokenized_input)
            tokenized_target_id.extend(tokenizer.convert_tokens_to_ids(tokenized_target))
            tokenized_target_id.extend([-1] * (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id

        if ntarget is not None:
            tokenized_ntarget = tokenizer.tokenize(ntarget)
            tokenized_ntarget_id = [-1] * target_start
            tokenized_ntarget_id.extend(tokenizer.convert_tokens_to_ids(tokenized_ntarget))
            tokenized_ntarget_id.extend([-1] * (maxlen - len(tokenized_ntarget_id)))
            row_dict['ntarget'] = tokenized_ntarget_id

        tokenized_input_id.extend([tokenizer.mask_token_id] * (maxlen - len(tokenized_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        type_id.extend([1] * (maxlen - len(type_id)))

        row_dict['input'] = tokenized_input_id
        row_dict['type'] = type_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        feature_dict_list.append(row_dict)
    return feature_dict_list
Example #7
def preprocessing_data(item,
                       tokenizer,
                       maxlen=512,
                       handle_exceed='start_slice',
                       likelihood=['none', 'pos', 'neg', 'both'],
                       reserved_len=0,
                       **kwargs):
    likelihood = likelihood[0] if isinstance(likelihood, list) else likelihood
    tasks, task, input, targets = item
    p_target, n_target = targets
    input = input.strip()

    tokenized_target = tokenizer.tokenize(" ".join(p_target))
    param_dict = {
        'tokenizer': tokenizer,
        'maxlen': maxlen,
        'handle_exceed': handle_exceed,
        'reserved_len': reserved_len
    }

    if "neg" in likelihood or 'both' in likelihood:
        # formatting neg data in csv
        if n_target is None:
            ntext_arr = [
                tokenizer.convert_tokens_to_string([tok.tok_begin(tokenizer)] +
                                                   tokenized_target)
            ]
        elif "[SEP]" in n_target:
            ntext_arr = [ntext.strip() for ntext in n_target.split("[SEP]")]
        else:
            ntext_arr = [n_target.strip()]
        for neg_text in ntext_arr:
            yield get_feature_from_data, {
                **{
                    'input': input,
                    'previous': [],
                    'target': tokenized_target,
                    'ntarget': neg_text
                },
                **param_dict
            }
    else:
        yield get_feature_from_data, {
            **{
                'input': input,
                'previous': [],
                'target': tokenized_target,
                'ntarget': None
            },
            **param_dict
        }

    # whole sentence masking
    if 'pos' in likelihood:
        yield once.get_feature_from_data, {
            **{
                'input': input,
                'target': " ".join(p_target)
            },
            **param_dict
        }
    elif 'both' in likelihood:
        # formatting neg data in csv
        if n_target is None:
            ntext_arr = []
        elif "[SEP]" in n_target:
            ntext_arr = [ntext.strip() for ntext in n_target.split("[SEP]")]
        else:
            ntext_arr = [n_target.strip()]
        for neg_text in ntext_arr:
            yield once.get_feature_from_data, {
                **{
                    'input': input,
                    'target': " ".join(p_target),
                    'ntarget': neg_text
                },
                **param_dict
            }
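A hedged sketch of driving the generator above. The `item` layout `(tasks, task, input, targets)` mirrors the unpacking at the top of the function, and `once.get_feature_from_data` is assumed to be one of the builders shown in the other examples.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')  # assumed checkpoint
item = (None, 'gen', "text to condition on", (["target", "words"], None))
for build_fn, kwargs in preprocessing_data(item, tokenizer, maxlen=128,
                                           likelihood='none'):
    features = build_fn(**kwargs)  # each yield pairs a builder with its keyword arguments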
Example #8
def get_feature_from_data(tokenizer,
                          maxlen,
                          input,
                          previous,
                          target=None,
                          ntarget=None,
                          reserved_len=0,
                          handle_exceed='noop',
                          **kwargs):
    feature_dict_list = []

    pred_len = len(tokenizer.convert_tokens_to_ids(
        target)) + 1 if target is not None else len(previous) - 1
    t_input_list, _ = tok.handle_exceed(tokenizer, input,
                                        maxlen - 2 - pred_len, handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        t_input = [tok.tok_begin(tokenizer)] + \
                  t_input[:maxlen - reserved_len - 2] + \
                  [tok.tok_sep(tokenizer)]
        t_input_id = tokenizer.convert_tokens_to_ids(t_input)
        encoder_mask_id = [1] * (len(t_input))
        encoder_mask_id.extend([0] * (maxlen - len(encoder_mask_id)))
        t_input_id.extend(
            tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)]) *
            (maxlen - len(t_input_id)))

        if target is not None:
            tokenized_target_id = []
            tokenized_prev_id = []
            tokenized_prev_id.extend(
                tokenizer.convert_tokens_to_ids([tok.tok_begin(tokenizer)] +
                                                target))
            tokenized_target_id.extend(
                tokenizer.convert_tokens_to_ids(target +
                                                [tok.tok_sep(tokenizer)]))
            decoder_mask_id = [1] * (len(tokenized_prev_id))
            decoder_mask_id.extend([0] * (maxlen - len(decoder_mask_id)))
            tokenized_prev_id.extend(
                tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)]) *
                (maxlen - len(tokenized_prev_id)))
            tokenized_target_id.extend([-100] *
                                       (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
            row_dict['prev'] = tokenized_prev_id
            if ntarget is not None and len(tokenizer.tokenize(ntarget)) > 0:
                tokenized_ntarget = tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize(ntarget))
                tokenized_ntarget_id = tokenized_ntarget
                tokenized_ntarget_id.extend(
                    [-100] * (maxlen - len(tokenized_ntarget_id)))
                if len(tokenized_ntarget_id) <= maxlen:
                    row_dict['ntarget'] = tokenized_ntarget_id
        else:
            tokenized_prev_id = [
                tokenizer.convert_tokens_to_ids(tok.tok_begin(tokenizer))
            ]
            tokenized_prev_id.extend(tokenizer.convert_tokens_to_ids(previous))
            target_start = len(tokenized_prev_id) - 1
            row_dict['start'] = target_start
            decoder_mask_id = [1] * (len(tokenized_prev_id))
            row_dict['prev'] = tokenized_prev_id

        row_dict['input'] = t_input_id
        row_dict['encoder_mask'] = encoder_mask_id
        row_dict['decoder_mask'] = decoder_mask_id
        feature_dict_list.append(row_dict)

    return feature_dict_list
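A hedged usage sketch for the encoder-decoder variant above, under the same assumptions; at training time `target` is a token list, while at inference time only `previous` (the decoded prefix) is supplied.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')  # assumed checkpoint
train_features = get_feature_from_data(
    tokenizer,
    maxlen=128,
    input="a source sentence",
    previous=[],
    target=tokenizer.tokenize("a target sentence"))
print(sorted(train_features[0]))  # ['decoder_mask', 'encoder_mask', 'input', 'prev', 'target']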
Example #9
def get_feature_from_data(tokenizer,
                          labels,
                          input,
                          target=None,
                          maxlen=512,
                          separator=" ",
                          handle_exceed='slide'):
    feature_dict_list = []

    mapping_index = []
    pos = 1  # cls as start 0
    for i in input.split(" "):
        for _ in range(len(tokenizer.tokenize(i))):
            if _ < 1:
                mapping_index.append({'char': i, 'pos': pos})
            pos += 1
    if target is not None:
        target = target.split(separator)

    t_input_list, t_pos_list = tok.handle_exceed(tokenizer,
                                                 input,
                                                 maxlen - 2,
                                                 mode=handle_exceed,
                                                 keep_after_sep=False)
    for t_input, t_pos in zip(t_input_list, t_pos_list):  # -2 for cls and sep
        # ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)
                           ] + t_input + [tok.tok_sep(tokenizer)]
        input_id = tokenizer.convert_tokens_to_ids(tokenized_input)

        if target is not None:
            target_token = []
            pev = 0

            for tok_map, target_label in zip(mapping_index, target):
                if t_pos[0] < tok_map['pos'] <= t_pos[1]:
                    for _ in range(tok_map['pos'] - pev):
                        target_token += [labels.index(target_label)]
                pev = tok_map['pos']

            if "O" in labels:
                target_id = [labels.index("O")
                             ] + target_token + [labels.index("O")]
            else:
                target_id = [target_token[0]
                             ] + target_token + [target_token[-1]]

            if len(input_id) != len(target_id):
                print("input target len not equal ", len(input_id),
                      len(target_id))
            target_id.extend([0] * (maxlen - len(target_id)))
            row_dict['target'] = target_id

        map_start = 0
        map_end = len(mapping_index)
        for pos, tok_map in enumerate(mapping_index):
            if t_pos[0] == tok_map['pos']:
                map_start = pos
            elif t_pos[1] == tok_map['pos']:
                map_end = pos

        row_dict['mapping'] = mapping_index[map_start:map_end + 1]
        mask_id = [1] * len(input_id)
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['mask'] = mask_id
        row_dict['end'] = len(input_id)
        input_id.extend([0] * (maxlen - len(input_id)))
        row_dict['input'] = input_id
        row_dict['pos'] = [map_start, map_end]
        feature_dict_list.append(row_dict)

    return feature_dict_list
Example #10
def get_feature_from_data(tokenizer,
                          labels,
                          input,
                          target=None,
                          maxlen=512,
                          separator=" ",
                          handle_exceed='slide'):
    feature_dict_list = []

    word_token_mapping = []
    token_word_mapping = []
    pos = 0
    for word_i, word in enumerate(input.split(separator)):
        tokenize_word = tokenizer.tokenize(word)
        for _ in range(len(tokenize_word)):
            if _ < 1:  # only record first token (one word one record)
                word_token_mapping.append({
                    'char': word,
                    'pos': pos,
                    'len': len(tokenize_word)
                })
            token_word_mapping.append({
                'tok': tokenize_word[_],
                'word': word,
                'pos': len(word_token_mapping) - 1
            })
            pos += 1

    t_input_list, t_pos_list = tok.handle_exceed(tokenizer,
                                                 input,
                                                 maxlen - 1,
                                                 mode=handle_exceed,
                                                 keep_after_sep=False)
    for t_input, t_pos in zip(t_input_list, t_pos_list):  # -1 for cls
        # ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input
        input_id = tokenizer.convert_tokens_to_ids(tokenized_input)

        if target is not None:
            target_token = []
            for input_word, target_label in zip(word_token_mapping,
                                                target.split(separator)):
                if t_pos[0] <= input_word['pos'] < t_pos[1]:
                    for _ in range(input_word['len']):
                        target_token += [labels.index(target_label)]

            if "O" in labels:
                target_id = [labels.index("O")] + target_token
            else:
                target_id = [target_token[0]] + target_token

            if len(input_id) != len(target_id):
                print(
                    list(zip(input.split(separator), target.split(separator))))
                print(tokenizer.decode(input_id))
                print(input_id)
                print(target_id)
                print("input target len not equal ", len(input_id),
                      len(target_id))
                continue

            target_id.extend([0] * (maxlen - len(target_id)))
            row_dict['target'] = target_id

        row_dict['word_token_mapping'] = word_token_mapping
        row_dict['token_word_mapping'] = token_word_mapping
        mask_id = [1] * len(input_id)
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['mask'] = mask_id
        row_dict['end'] = len(input_id)
        row_dict['pos'] = t_pos
        input_id.extend([0] * (maxlen - len(input_id)))
        row_dict['input'] = input_id
        feature_dict_list.append(row_dict)

    return feature_dict_list
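A hedged usage sketch for the tagging variant above, assuming a word-piece tokenizer and a BIO-style label set; `target` carries one label per whitespace-separated word of `input`.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')  # assumed checkpoint
labels = ['O', 'B-PER', 'I-PER']
features = get_feature_from_data(
    tokenizer,
    labels,
    input="John Smith lives here",
    target="B-PER I-PER O O",
    maxlen=64)
print(features[0]['target'][:features[0]['end']])  # label ids aligned to the word pieces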