import os
import pickle
from typing import Union

from sklearn.preprocessing import MultiLabelBinarizer
from transformers import PreTrainedTokenizer

# Project-local helpers assumed importable from this package:
# LabelEncoder, create_path, load_transformer_tokenizer, BOS_TOKEN, EOS_TOKEN.


def get_or_make_label_encoder(params, problem, mode, label_list=None,
                              zero_class=None):
    """Simple function to create or load an existing label encoder.

    If mode is train, always create a new label encoder.

    Arguments:
        problem {str} -- problem name
        mode {str} -- mode

    Keyword Arguments:
        label_list {list} -- label list to fit the encoder (default: {None})
        zero_class {str} -- label to assign id 0 (default: {None})

    Returns:
        LabelEncoder -- label encoder
    """
    if label_list is None:
        return None

    problem_path = params.ckpt_dir
    create_path(problem_path)
    le_path = os.path.join(problem_path, '%s_label_encoder.pkl' % problem)

    is_seq2seq_text = params.problem_type[problem] == 'seq2seq_text'
    is_multi_cls = params.problem_type[problem] == 'multi_cls'
    is_seq2seq_tag = params.problem_type[problem] == 'seq2seq_tag'

    if mode == 'train' and not os.path.exists(le_path):
        if is_seq2seq_text:
            # seq2seq_text reuses the transformer tokenizer as its "encoder".
            label_encoder = load_transformer_tokenizer(
                params.transformer_tokenizer_name)
            pickle.dump(label_encoder, open(le_path, 'wb'))
        elif is_multi_cls:
            label_encoder = MultiLabelBinarizer()
            label_encoder.fit(label_list)
            pickle.dump(label_encoder, open(le_path, 'wb'))
        else:
            # Flatten nested label lists (per-example tag sequences).
            if isinstance(label_list[0], list):
                label_list = [
                    item for sublist in label_list for item in sublist]
            if is_seq2seq_tag:
                label_list.extend([BOS_TOKEN, EOS_TOKEN])
            label_encoder = LabelEncoder()
            label_encoder.fit(label_list, zero_class=zero_class)
            label_encoder.dump(le_path)
    else:
        if is_seq2seq_text or is_multi_cls:
            label_encoder = pickle.load(open(le_path, 'rb'))
        else:
            label_encoder = LabelEncoder()
            label_encoder.load(le_path)

    if not is_seq2seq_text:
        if is_multi_cls:
            params.num_classes[problem] = label_encoder.classes_.shape[0]
        else:
            params.num_classes[problem] = len(label_encoder.encode_dict)
            if EOS_TOKEN in label_encoder.encode_dict:
                params.eos_id[problem] = int(
                    label_encoder.transform([EOS_TOKEN])[0])
    else:
        params.num_classes[problem] = len(label_encoder.vocab)
        params.eos_id[problem] = label_encoder.convert_tokens_to_ids(
            [EOS_TOKEN])

    return label_encoder
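

# The LabelEncoder used above is a project-local class, not sklearn's.
# A minimal, hypothetical sketch of the interface this function relies on,
# inferred only from the calls made here (fit with an optional zero_class,
# an encode_dict mapping, transform, dump/load); the real class may differ.
class MinimalLabelEncoder:
    def __init__(self):
        self.encode_dict = {}

    def fit(self, label_list, zero_class=None):
        # Optionally pin one label to id 0, then assign ids to the rest.
        labels = sorted(set(label_list))
        if zero_class is not None:
            labels = [zero_class] + [lab for lab in labels if lab != zero_class]
        self.encode_dict = {label: i for i, label in enumerate(labels)}

    def transform(self, labels):
        return [self.encode_dict[label] for label in labels]

    def dump(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self.encode_dict, f)

    def load(self, path):
        with open(path, 'rb') as f:
            self.encode_dict = pickle.load(f)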


def get_or_make_label_encoder(
        params, problem: str, mode: str, label_list=None
) -> Union[LabelEncoder, MultiLabelBinarizer, PreTrainedTokenizer]:
    """Unified way to get or create a label encoder for the various problem types.

    cls: LabelEncoder
    seq_tag: LabelEncoder
    multi_cls: MultiLabelBinarizer
    seq2seq_text: Tokenizer

    Arguments:
        problem {str} -- problem name
        mode {str} -- mode

    Keyword Arguments:
        label_list {list} -- label list to fit the encoder (default: {None})

    Returns:
        LabelEncoder / MultiLabelBinarizer / tokenizer, depending on the
        problem type
    """
    problem_path = params.ckpt_dir
    create_path(problem_path)
    le_path = os.path.join(problem_path, '%s_label_encoder.pkl' % problem)

    is_seq2seq_text = params.problem_type[problem] == 'seq2seq_text'
    is_multi_cls = params.problem_type[problem] == 'multi_cls'
    is_seq = params.problem_type[problem] == 'seq_tag'
    is_pretrain = params.problem_type[problem] == 'pretrain'

    if is_pretrain:
        return None

    if mode == 'train' and not os.path.exists(le_path):
        if is_seq2seq_text:
            label_encoder = load_transformer_tokenizer(
                params.transformer_decoder_tokenizer_name,
                params.transformer_decoder_tokenizer_loading)
            pickle.dump(label_encoder, open(le_path, 'wb'))
        elif is_multi_cls:
            label_encoder = MultiLabelBinarizer()
            label_encoder.fit(label_list)
            pickle.dump(label_encoder, open(le_path, 'wb'))
        else:
            if isinstance(label_list[0], list):
                label_list = [
                    item for sublist in label_list for item in sublist]
            if is_seq:
                # Sequence tagging needs a padding label.
                label_list.append('[PAD]')
            label_encoder = LabelEncoder()
            label_encoder.fit(label_list)
            label_encoder.dump(le_path)
    else:
        if is_seq2seq_text or is_multi_cls:
            label_encoder = pickle.load(open(le_path, 'rb'))
        else:
            label_encoder = LabelEncoder()
            label_encoder.load(le_path)

    if not is_seq2seq_text:
        if is_multi_cls:
            params.num_classes[problem] = label_encoder.classes_.shape[0]
        else:
            params.num_classes[problem] = len(label_encoder.encode_dict)
    else:
        try:
            params.num_classes[problem] = len(label_encoder.vocab)
        except AttributeError:
            # The vocab size of models like XLNet can only be retrieved
            # from the config instead of the tokenizer.
            params.num_classes[problem] = params.bert_decoder_config.vocab_size

    return label_encoder
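

# A hedged usage sketch for the variant above. The params object here is a
# hypothetical stand-in exposing only the attributes the function touches
# (ckpt_dir, problem_type, num_classes); the real params class has more.
def _demo_seq_tag_usage():
    from types import SimpleNamespace

    params = SimpleNamespace(
        ckpt_dir='/tmp/ckpt_demo',
        problem_type={'demo_ner': 'seq_tag'},
        num_classes={},
    )
    le = get_or_make_label_encoder(
        params, problem='demo_ner', mode='train',
        label_list=[['B-PER', 'I-PER', 'O'], ['O', 'O']])
    # num_classes now counts the tag set plus the added '[PAD]' label.
    print(params.num_classes['demo_ner'], le.encode_dict)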


def get_or_make_label_encoder(params, problem, mode, label_list=None,
                              zero_class=None):
    """Simple function to create or load an existing label encoder.

    If mode is train, always create a new label encoder.

    Arguments:
        problem {str} -- problem name
        mode {str} -- mode

    Keyword Arguments:
        label_list {list} -- label list to fit the encoder (default: {None})
        zero_class {str} -- label to assign id 0 (default: {None})

    Returns:
        LabelEncoder -- label encoder
    """
    if label_list is None:
        return None

    problem_path = params.ckpt_dir
    create_path(problem_path)
    le_path = os.path.join(problem_path, '%s_label_encoder.pkl' % problem)

    is_seq2seq_text = params.problem_type[problem] == 'seq2seq_text'
    is_multi_cls = params.problem_type[problem] == 'multi_cls'
    is_seq2seq_tag = params.problem_type[problem] == 'seq2seq_tag'

    if mode == 'train' and not os.path.exists(le_path):
        if is_seq2seq_text:
            label_encoder = load_transformer_tokenizer(
                params.transformer_decoder_tokenizer_name,
                params.transformer_decoder_tokenizer_loading)
            pickle.dump(label_encoder, open(le_path, 'wb'))
        elif is_multi_cls:
            label_encoder = MultiLabelBinarizer()
            label_encoder.fit(label_list)
            pickle.dump(label_encoder, open(le_path, 'wb'))
        else:
            if isinstance(label_list[0], list):
                label_list = [
                    item for sublist in label_list for item in sublist]
            label_encoder = LabelEncoder()
            label_encoder.fit(label_list, zero_class=zero_class)
            label_encoder.dump(le_path)
    else:
        if is_seq2seq_text or is_multi_cls:
            label_encoder = pickle.load(open(le_path, 'rb'))
        else:
            label_encoder = LabelEncoder()
            label_encoder.load(le_path)

    if not is_seq2seq_text:
        if is_multi_cls:
            params.num_classes[problem] = label_encoder.classes_.shape[0]
        else:
            params.num_classes[problem] = len(label_encoder.encode_dict)
    else:
        try:
            params.num_classes[problem] = len(label_encoder.vocab)
        except AttributeError:
            # The vocab size of models like XLNet can only be retrieved
            # from the config instead of the tokenizer.
            params.num_classes[problem] = params.bert_decoder_config.vocab_size

    return label_encoder
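

# For the multi_cls branch, the encoder is sklearn's MultiLabelBinarizer.
# A small self-contained illustration of the calls the function makes on it
# (fit, classes_) and the transform applied later at feature-creation time:
def _demo_multi_cls_encoder():
    from sklearn.preprocessing import MultiLabelBinarizer

    mlb = MultiLabelBinarizer()
    # For multi_cls problems, label_list is a list of label collections.
    mlb.fit([['sports', 'news'], ['news'], ['finance']])
    assert mlb.classes_.shape[0] == 3  # what num_classes is set to
    # transform yields one multi-hot row per example.
    print(mlb.transform([['news', 'finance']]))  # [[1 1 0]]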