Example #1
import os
import pickle

from sklearn.preprocessing import MultiLabelBinarizer

# NOTE: create_path, load_transformer_tokenizer, BOS_TOKEN, EOS_TOKEN and
# LabelEncoder (with encode_dict, zero_class support and dump/load) are
# project-local helpers, not sklearn or stdlib names.


def get_or_make_label_encoder(params,
                              problem,
                              mode,
                              label_list=None,
                              zero_class=None):
    """Simple function to create or load existing label encoder
    If mode is train, alway create new label_encder

    Arguments:
        problem {str} -- problem name
        mode {str} -- running mode, e.g. 'train'

    Keyword Arguments:
        label_list {list} -- label list to fit the encoder (default: {None})
        zero_class {str} -- label to encode as id 0 (default: {None})

    Returns:
        LabelEncoder -- fitted or loaded encoder (a tokenizer for seq2seq_text,
            a MultiLabelBinarizer for multi_cls)
    """
    if label_list is None:
        return None
    problem_path = params.ckpt_dir
    create_path(problem_path)
    le_path = os.path.join(problem_path, '%s_label_encoder.pkl' % problem)
    is_seq2seq_text = params.problem_type[problem] == 'seq2seq_text'
    is_multi_cls = params.problem_type[problem] == 'multi_cls'
    is_seq2seq_tag = params.problem_type[problem] == 'seq2seq_tag'

    if mode == 'train' and not os.path.exists(le_path):

        if is_seq2seq_text:
            label_encoder = load_transformer_tokenizer(
                params.transformer_tokenizer_name)
            with open(le_path, 'wb') as f:
                pickle.dump(label_encoder, f)

        elif is_multi_cls:
            label_encoder = MultiLabelBinarizer()
            label_encoder.fit(label_list)
            with open(le_path, 'wb') as f:
                pickle.dump(label_encoder, f)

        else:
            if isinstance(label_list[0], list):
                label_list = [
                    item for sublist in label_list for item in sublist
                ]
                if is_seq2seq_tag:
                    label_list.extend([BOS_TOKEN, EOS_TOKEN])
            label_encoder = LabelEncoder()

            label_encoder.fit(label_list, zero_class=zero_class)
            label_encoder.dump(le_path)

    else:

        if is_seq2seq_text or is_multi_cls:
            with open(le_path, 'rb') as f:
                label_encoder = pickle.load(f)
        else:
            label_encoder = LabelEncoder()
            label_encoder.load(le_path)

    if not is_seq2seq_text:
        if is_multi_cls:
            params.num_classes[problem] = label_encoder.classes_.shape[0]
        else:
            params.num_classes[problem] = len(label_encoder.encode_dict)
            if EOS_TOKEN in label_encoder.encode_dict:
                params.eos_id[problem] = int(
                    label_encoder.transform([EOS_TOKEN])[0])
    else:
        params.num_classes[problem] = len(label_encoder.vocab)
        # convert_tokens_to_ids returns a list here; store the id itself so the
        # type matches the int stored in the encode_dict branch above
        params.eos_id[problem] = label_encoder.convert_tokens_to_ids(
            [EOS_TOKEN])[0]

    return label_encoder
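
A minimal usage sketch for Example #1. The params namespace, the 'weibo_ner' problem name and the path are hypothetical stand-ins; only the attributes the function actually reads are set, and create_path/LabelEncoder are the project-local helpers noted above.

from types import SimpleNamespace

# Hypothetical stand-in for the project's params object.
params = SimpleNamespace(
    ckpt_dir='models/demo_ckpt',
    problem_type={'weibo_ner': 'seq_tag'},
    num_classes={},
    eos_id={},
)

# Per-sentence tag lists are flattened inside the function before fitting.
labels = [['O', 'B-PER', 'I-PER'], ['O', 'B-LOC']]
le = get_or_make_label_encoder(params, problem='weibo_ner', mode='train',
                               label_list=labels, zero_class='O')
print(params.num_classes['weibo_ner'])  # 4 distinct tags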
Example #2
from typing import Union

from transformers import PreTrainedTokenizer

# os, pickle, MultiLabelBinarizer and the project-local helpers are as in Example #1.


def get_or_make_label_encoder(params, problem: str, mode: str, label_list=None) -> Union[LabelEncoder, MultiLabelBinarizer, PreTrainedTokenizer]:
    """Function to unify ways to get or create label encoder for various
    problem type.

    cls: LabelEncoder
    seq_tag: LabelEncoder
    multi_cls: MultiLabelBinarizer
    seq2seq_text: Tokenizer

    Arguments:
        problem {str} -- problem name
        mode {str} -- running mode, e.g. 'train'

    Keyword Arguments:
        label_list {list} -- label list to fit the encoder (default: {None})

    Returns:
        Union[LabelEncoder, MultiLabelBinarizer, PreTrainedTokenizer] --
            encoder matching the problem type (see mapping above)
    """

    problem_path = params.ckpt_dir
    create_path(problem_path)
    le_path = os.path.join(problem_path, '%s_label_encoder.pkl' % problem)
    is_seq2seq_text = params.problem_type[problem] == 'seq2seq_text'
    is_multi_cls = params.problem_type[problem] == 'multi_cls'
    is_seq = params.problem_type[problem] == 'seq_tag'
    is_pretrain = params.problem_type[problem] == 'pretrain'

    if is_pretrain:
        return None

    if mode == 'train' and not os.path.exists(le_path):
        if is_seq2seq_text:
            label_encoder = load_transformer_tokenizer(
                params.transformer_decoder_tokenizer_name, params.transformer_decoder_tokenizer_loading)
            with open(le_path, 'wb') as f:
                pickle.dump(label_encoder, f)

        elif is_multi_cls:
            label_encoder = MultiLabelBinarizer()
            label_encoder.fit(label_list)
            with open(le_path, 'wb') as f:
                pickle.dump(label_encoder, f)

        else:
            if isinstance(label_list[0], list):
                label_list = [
                    item for sublist in label_list for item in sublist]
            if is_seq:
                label_list.append('[PAD]')

            label_encoder = LabelEncoder()

            label_encoder.fit(label_list)
            label_encoder.dump(le_path)

    else:

        if is_seq2seq_text or is_multi_cls:
            with open(le_path, 'rb') as f:
                label_encoder = pickle.load(f)
        else:
            label_encoder = LabelEncoder()
            label_encoder.load(le_path)

    if not is_seq2seq_text:
        if is_multi_cls:
            params.num_classes[problem] = label_encoder.classes_.shape[0]
        else:
            params.num_classes[problem] = len(label_encoder.encode_dict)
    else:
        try:
            params.num_classes[problem] = len(label_encoder.vocab)
        except AttributeError:
            # for models like XLNet, the vocab size can only be retrieved from
            # the config, not the tokenizer
            params.num_classes[problem] = params.bert_decoder_config.vocab_size

    return label_encoder
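
The LabelEncoder used in all these examples is not sklearn.preprocessing.LabelEncoder: it exposes an encode_dict, accepts zero_class at fit time (Example #1) and has its own dump/load. A minimal sketch of that assumed interface, for readers outside the repo:

import pickle


class LabelEncoder:
    """Minimal sketch of the assumed project-local encoder."""

    def __init__(self):
        self.encode_dict = {}  # label -> int id
        self.decode_dict = {}  # int id -> label

    def fit(self, label_list, zero_class=None):
        labels = sorted(set(label_list))
        if zero_class is not None:
            # put zero_class first so it is encoded as id 0
            labels = [zero_class] + [l for l in labels if l != zero_class]
        self.encode_dict = {label: i for i, label in enumerate(labels)}
        self.decode_dict = {i: label for label, i in self.encode_dict.items()}

    def transform(self, labels):
        return [self.encode_dict[label] for label in labels]

    def dump(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self.__dict__, f)

    def load(self, path):
        with open(path, 'rb') as f:
            self.__dict__.update(pickle.load(f))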
Example #3
# Imports and project-local helpers as in Example #1.
def get_or_make_label_encoder(params,
                              problem,
                              mode,
                              label_list=None,
                              zero_class=None):
    """Simple function to create or load existing label encoder
    If mode is train, alway create new label_encder

    Arguments:
        problem {str} -- problem name
        mode {str} -- running mode, e.g. 'train'

    Keyword Arguments:
        label_list {list} -- label list to fit the encoder (default: {None})
        zero_class {str} -- label to encode as id 0 (default: {None})

    Returns:
        LabelEncoder -- fitted or loaded encoder (a tokenizer for seq2seq_text,
            a MultiLabelBinarizer for multi_cls)
    """
    if label_list is None:
        return None
    problem_path = params.ckpt_dir
    create_path(problem_path)
    le_path = os.path.join(problem_path, '%s_label_encoder.pkl' % problem)
    is_seq2seq_text = params.problem_type[problem] == 'seq2seq_text'
    is_multi_cls = params.problem_type[problem] == 'multi_cls'
    is_seq2seq_tag = params.problem_type[problem] == 'seq2seq_tag'

    if mode == 'train' and not os.path.exists(le_path):

        if is_seq2seq_text:
            label_encoder = load_transformer_tokenizer(
                params.transformer_decoder_tokenizer_name,
                params.transformer_decoder_tokenizer_loading)
            with open(le_path, 'wb') as f:
                pickle.dump(label_encoder, f)

        elif is_multi_cls:
            label_encoder = MultiLabelBinarizer()
            label_encoder.fit(label_list)
            with open(le_path, 'wb') as f:
                pickle.dump(label_encoder, f)

        else:
            if isinstance(label_list[0], list):
                label_list = [
                    item for sublist in label_list for item in sublist
                ]
            label_encoder = LabelEncoder()

            label_encoder.fit(label_list, zero_class=zero_class)
            label_encoder.dump(le_path)

    else:

        if is_seq2seq_text or is_multi_cls:
            with open(le_path, 'rb') as f:
                label_encoder = pickle.load(f)
        else:
            label_encoder = LabelEncoder()
            label_encoder.load(le_path)

    if not is_seq2seq_text:
        if is_multi_cls:
            params.num_classes[problem] = label_encoder.classes_.shape[0]
        else:
            params.num_classes[problem] = len(label_encoder.encode_dict)
    else:
        try:
            params.num_classes[problem] = len(label_encoder.vocab)
        except AttributeError:
            # for models like XLNet, the vocab size can only be retrieved from
            # the config, not the tokenizer
            params.num_classes[problem] = params.bert_decoder_config.vocab_size

    return label_encoder
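
A round-trip sketch of the train/load branches in Example #3. The 'demo_cls' problem name and the params namespace are hypothetical; the point is that the second call reloads the dumped encoder instead of refitting.

import os
from types import SimpleNamespace

params = SimpleNamespace(
    ckpt_dir='models/demo_ckpt',
    problem_type={'demo_cls': 'cls'},
    num_classes={},
)

# first call in train mode fits the encoder and dumps it into ckpt_dir
le_train = get_or_make_label_encoder(params, 'demo_cls', 'train',
                                     label_list=['neg', 'pos'])
assert os.path.exists(os.path.join(params.ckpt_dir, 'demo_cls_label_encoder.pkl'))

# any later call (eval/predict, or train with the file present) reloads it
le_eval = get_or_make_label_encoder(params, 'demo_cls', 'eval',
                                    label_list=['neg', 'pos'])
assert le_eval.encode_dict == le_train.encode_dict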