def index_datasets(self, *datasets, field_name, add_cls_sep=True, add_prefix_space=True): r""" 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input。如果首尾不是 [CLS]与[SEP]会在首尾额外加入[CLS]与[SEP], 且将word_pieces这一列的pad value设置为了bert的pad value。 :param datasets: DataSet对象 :param field_name: 基于哪一列index :param bool add_cls_sep: 是否在句首句尾添加cls和sep的index :param bool add_prefix_space: 是否在句子开头添加空格,预训练时RoBERTa该值为True :return: """ encode_func = partial(self.tokenzier.encode, add_special_tokens=add_cls_sep, add_prefix_space=add_prefix_space) for index, dataset in enumerate(datasets): try: dataset.apply_field(encode_func, field_name=field_name, new_field_name='word_pieces', is_input=True) dataset.set_pad_val('word_pieces', self._wordpiece_pad_index) except Exception as e: logger.error( f"Exception happens when processing the {index} dataset.") raise e
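# Illustrative, self-contained sketch of the functools.partial pattern used in index_datasets
# above: the partially-applied encode_func is what apply_field later calls on every cell of the
# field. `fake_encode` is a stand-in here, not the real tokenizer.
from functools import partial

def fake_encode(text, add_special_tokens=True, add_prefix_space=True):
    return (text, add_special_tokens, add_prefix_space)

encode_func = partial(fake_encode, add_special_tokens=False, add_prefix_space=True)
assert encode_func('hello') == ('hello', False, True)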
def index_dataset(self, *datasets, field_name, add_cls_sep=True):
    """
    Use the bert tokenizer to generate a new word_pieces field, add it to the datasets and set it as input.
    If the sequence does not already start with [CLS] and end with [SEP], they are added at the two ends,
    and the pad value of the word_pieces field is set to bert's pad value.

    :param datasets: DataSet objects
    :param field_name: which field to index
    :return:
    """

    def convert_words_to_word_pieces(words):
        word_pieces = []
        for word in words:
            tokens = self.tokenzier.wordpiece_tokenizer.tokenize(word)
            word_piece_ids = self.tokenzier.convert_tokens_to_ids(tokens)
            word_pieces.extend(word_piece_ids)
        if add_cls_sep:
            if word_pieces[0] != self._cls_index:
                word_pieces.insert(0, self._cls_index)
            if word_pieces[-1] != self._sep_index:
                word_pieces.append(self._sep_index)  # append [SEP] at the end, not before the last piece
        return word_pieces

    for index, dataset in enumerate(datasets):
        try:
            dataset.apply_field(convert_words_to_word_pieces, field_name=field_name,
                                new_field_name='word_pieces', is_input=True)
            dataset.set_pad_val('word_pieces', self._wordpiece_pad_index)
        except Exception as e:
            logger.error(f"Exception happens when processing the {index} dataset.")
            raise e
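# Self-contained sketch of the [CLS]/[SEP] wrapping performed by convert_words_to_word_pieces
# above. The ids 101/102 are the conventional bert-base [CLS]/[SEP] indices and are used here for
# illustration only; the real method reads them from self._cls_index / self._sep_index.
def _wrap_with_cls_sep(word_pieces, cls_index=101, sep_index=102):
    if word_pieces[0] != cls_index:
        word_pieces.insert(0, cls_index)   # prepend [CLS]
    if word_pieces[-1] != sep_index:
        word_pieces.append(sep_index)      # append [SEP]
    return word_pieces

assert _wrap_with_cls_sep([7592, 2088]) == [101, 7592, 2088, 102]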
def _get_bert_dir(model_dir_or_name: str = 'en-base-uncased'):
    if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
        model_url = _get_embedding_url('bert', model_dir_or_name.lower())
        model_dir = cached_path(model_url, name='embedding')
    # otherwise check whether a local directory with that path exists
    elif os.path.isdir(os.path.abspath(os.path.expanduser(model_dir_or_name))):
        model_dir = os.path.abspath(os.path.expanduser(model_dir_or_name))
    else:
        logger.error(f"Cannot recognize BERT dir or name ``{model_dir_or_name}``.")
        raise ValueError(f"Cannot recognize BERT dir or name ``{model_dir_or_name}``.")
    return str(model_dir)
def index_datasets(self, *datasets, field_name, add_cls_sep=True, add_prefix_space=True):
    encode_func = partial(self.tokenzier.encode,
                          add_special_tokens=add_cls_sep)  # , add_prefix_space=add_prefix_space)

    for index, dataset in enumerate(datasets):
        try:
            dataset.apply_field(encode_func, field_name=field_name, new_field_name='word_pieces',
                                is_input=True)
            dataset.set_pad_val('word_pieces', self._wordpiece_pad_index)
        except Exception as e:
            logger.error(f"Exception happens when processing the {index} dataset.")
            raise e
def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>',
                     error='ignore', init_method=None):
    """
    Extract the embeddings of the words in vocab from the pre-trained embeddings at embed_filepath.
    EmbedLoader automatically detects whether embed_filepath is in word2vec format (the first line has
    only two elements) or in glove format.

    :param str embed_filepath: path to the pre-trained embedding.
    :param vocab: the vocabulary, of type :class:`~fastNLP.Vocabulary`; only embeddings of words that
        appear in vocab are read. Words whose embedding is not found are sampled from a normal
        distribution fitted to the found embeddings, so that the whole Embedding shares one distribution.
    :param dtype: dtype of the embeddings that are read in
    :param str padding: the padding token of the vocabulary
    :param str unknown: the unknown token of the vocabulary
    :param str error: `ignore` or `strict`; with `ignore` errors are skipped automatically, with `strict`
        they are raised. Errors mainly come from empty lines in the embedding file or vectors with
        inconsistent dimensions.
    :param init_method: how to initialize words that are not found. Any method from torch.nn.init.* can
        be used. Defaults to torch.nn.init.zeros_
    :return torch.tensor: shape [len(vocab), dimension]; dimension is determined by the pre-trained embedding.
    """
    assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported."
    if not os.path.exists(embed_filepath):
        raise FileNotFoundError("`{}` does not exist.".format(embed_filepath))
    with open(embed_filepath, 'r', encoding='utf-8') as f:
        line = f.readline().strip()
        parts = line.split()
        start_idx = 0
        if len(parts) == 2:
            dim = int(parts[1])
            start_idx += 1
        else:
            dim = len(parts) - 1
            f.seek(0)
        matrix = {}
        if vocab.padding:
            matrix[vocab.padding_idx] = torch.zeros(dim)
        if vocab.unknown:
            matrix[vocab.unknown_idx] = torch.zeros(dim)
        found_count = 0
        found_unknown = False
        for idx, line in enumerate(f, start_idx):
            try:
                parts = line.strip().split()
                word = ''.join(parts[:-dim])
                nums = parts[-dim:]
                # align the unk and pad tokens of the file with those of the vocabulary
                if word == padding and vocab.padding is not None:
                    word = vocab.padding
                elif word == unknown and vocab.unknown is not None:
                    word = vocab.unknown
                    found_unknown = True
                if word in vocab:
                    index = vocab.to_index(word)
                    matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ',
                                                                   dtype=dtype, count=dim))
                    if self.only_norm_found_vector:
                        matrix[index] = matrix[index] / np.linalg.norm(matrix[index])
                    found_count += 1
            except Exception as e:
                if error == 'ignore':
                    warnings.warn("Error occurred at the {} line.".format(idx))
                else:
                    logger.error("Error occurred at the {} line.".format(idx))
                    raise e
        logger.info("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab)))
        for word, index in vocab:
            if index not in matrix and not vocab._is_word_no_create_entry(word):
                if found_unknown:  # if an unknown vector was found, initialize with it
                    matrix[index] = matrix[vocab.unknown_idx]
                else:
                    matrix[index] = None  # None marks words in matrix that still need an entry

        vectors = self._randomly_init_embed(len(matrix), dim, init_method)

        if vocab.unknown is None:  # create a dedicated unknown vector
            unknown_idx = len(matrix)
            vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous()
        else:
            unknown_idx = vocab.unknown_idx
        self.register_buffer('words_to_words', torch.full((len(vocab),), fill_value=unknown_idx).long())
        for index, (index_in_vocab, vec) in enumerate(matrix.items()):
            if vec is not None:
                vectors[index] = vec
            self.words_to_words[index_in_vocab] = index

        return vectors
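# Minimal sketch of the header detection described in the docstring of _load_with_vocab: a
# word2vec file starts with a "<vocab_size> <dim>" header, while a glove file starts directly
# with "<word> <v1> ... <v_dim>". The helper name below is illustrative, not library API.
def _peek_embedding_dim(embed_filepath):
    with open(embed_filepath, 'r', encoding='utf-8') as f:
        parts = f.readline().strip().split()
    if len(parts) == 2:
        return int(parts[1]), True    # word2vec: dim comes from the header, skip the first line
    return len(parts) - 1, False      # glove: dim is inferred, the first line is a real vector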
def __init__(self, vocab_size=50257, n_positions=1024, n_ctx=1024, n_embd=768, n_layer=12, n_head=12,
             resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, layer_norm_epsilon=1e-5,
             initializer_range=0.02, summary_type="cls_index", summary_use_proj=True,
             summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, **kwargs):
    """Constructs GPT2Config.

    Args:
        vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
        n_positions: Number of positional embeddings.
        n_ctx: Size of the causal mask (usually same as n_positions).
        n_embd: Dimensionality of the embeddings and hidden states.
        n_layer: Number of hidden layers in the Transformer encoder.
        n_head: Number of attention heads for each attention layer in the Transformer encoder.
        layer_norm_epsilon: epsilon to use in the layer norm layers
        resid_pdrop: The dropout probability for all fully connected layers in the embeddings,
            encoder, and pooler.
        attn_pdrop: The dropout ratio for the attention probabilities.
        embd_pdrop: The dropout ratio for the embeddings.
        initializer_range: The stddev of the truncated_normal_initializer for initializing all
            weight matrices.
    """
    self.output_attentions = kwargs.pop("output_attentions", False)
    self.output_hidden_states = kwargs.pop("output_hidden_states", False)
    self.output_past = kwargs.pop("output_past", True)  # Not used by all models
    self.torchscript = kwargs.pop("torchscript", False)  # Only used by PyTorch models
    self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
    self.pruned_heads = kwargs.pop("pruned_heads", {})

    # is_decoder is used in encoder-decoder models to differentiate the encoder from the decoder
    self.is_decoder = kwargs.pop("is_decoder", False)

    # Parameters for sequence generation
    self.max_length = kwargs.pop("max_length", 20)
    self.do_sample = kwargs.pop("do_sample", False)
    self.num_beams = kwargs.pop("num_beams", 1)
    self.temperature = kwargs.pop("temperature", 1.0)
    self.top_k = kwargs.pop("top_k", 50)
    self.top_p = kwargs.pop("top_p", 1.0)
    self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0)
    self.bos_token_id = kwargs.pop("bos_token_id", 0)
    self.pad_token_id = kwargs.pop("pad_token_id", 0)
    self.eos_token_ids = kwargs.pop("eos_token_ids", 0)
    self.length_penalty = kwargs.pop("length_penalty", 1.0)
    self.num_return_sequences = kwargs.pop("num_return_sequences", 1)

    # Fine-tuning task arguments
    self.finetuning_task = kwargs.pop("finetuning_task", None)
    self.num_labels = kwargs.pop("num_labels", 2)
    self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)})
    self.id2label = dict((int(key), value) for key, value in self.id2label.items())
    self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys())))
    self.label2id = dict((key, int(value)) for key, value in self.label2id.items())

    # Additional attributes without default values
    for key, value in kwargs.items():
        try:
            setattr(self, key, value)
        except AttributeError as err:
            logger.error("Can't set {} with value {} for {}".format(key, value, self))
            raise err

    self.vocab_size = vocab_size
    self.n_ctx = n_ctx
    self.n_positions = n_positions
    self.n_embd = n_embd
    self.n_layer = n_layer
    self.n_head = n_head
    self.resid_pdrop = resid_pdrop
    self.embd_pdrop = embd_pdrop
    self.attn_pdrop = attn_pdrop
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range
    self.summary_type = summary_type
    self.summary_use_proj = summary_use_proj
    self.summary_activation = summary_activation
    self.summary_first_dropout = summary_first_dropout
    self.summary_proj_to_labels = summary_proj_to_labels
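# Small usage sketch, assuming this __init__ belongs to the GPT2Config class named in the
# docstring: explicit arguments become attributes directly, while extra keyword arguments are
# attached by the final kwargs loop.
config = GPT2Config(n_layer=6, n_head=8, n_embd=512, num_labels=3, output_attentions=True)
assert config.num_labels == 3 and config.id2label[2] == "LABEL_2"
assert config.output_attentions is True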
def from_pretrained(cls, model_dir_or_name, layer_num=12, *inputs, **kwargs):
    if layer_num > 12:
        return None
    state_dict = kwargs.get('state_dict', None)
    kwargs.pop('state_dict', None)
    kwargs.pop('cache_dir', None)
    kwargs.pop('from_tf', None)

    # get model dir from name or dir
    pretrained_model_dir = _get_bert_dir(model_dir_or_name)

    # Load config
    config_file = _get_file_name_base_on_postfix(pretrained_model_dir, '.json')
    config = BertConfig.from_json_file(config_file)

    if state_dict is None:
        weights_path = _get_file_name_base_on_postfix(pretrained_model_dir, '.bin')
        state_dict = torch.load(weights_path, map_location='cpu')
    else:
        logger.error('Cannot load parameters through `state_dict` variable.')
        raise RuntimeError('Cannot load parameters through `state_dict` variable.')

    model_type = 'BERT'
    old_keys = []
    new_keys = []
    for key in state_dict.keys():
        new_key = None
        for key_name in BERT_KEY_RENAME_MAP_1:
            if key_name in key:
                new_key = key.replace(key_name, BERT_KEY_RENAME_MAP_1[key_name])
                if 'distilbert' in key:
                    model_type = 'DistilBert'
                break
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)

    old_keys = []
    new_keys = []
    for key in state_dict.keys():
        new_key = None
        for key_name in BERT_KEY_RENAME_MAP_2:
            if key_name in key:
                new_key = key.replace(key_name, BERT_KEY_RENAME_MAP_2[key_name])
                break
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)

    ## the following renaming pass is required for fastHan's processing
    old_keys = []
    new_keys = []
    for key in state_dict.keys():
        new_key = None
        for key_name in ['embed.model.encoder']:
            if key_name in key:
                new_key = key.replace(key_name, 'bert')
                break
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)

    # Instantiate model.
    config.num_hidden_layers = layer_num
    model = cls(config, model_type=model_type, *inputs, **kwargs)

    missing_keys = []
    unexpected_keys = []
    error_msgs = []
    # copy state_dict so _load_from_state_dict can modify it
    metadata = getattr(state_dict, '_metadata', None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    def load(module, prefix=''):
        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
        module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                     missing_keys, unexpected_keys, error_msgs)
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + '.')

    load(model, prefix='' if hasattr(model, 'bert') else 'bert.')

    if len(missing_keys) > 0:
        logger.warning("Weights of {} not initialized from pretrained model: {}".format(
            model.__class__.__name__, missing_keys))
    # if len(unexpected_keys) > 0:
    #     logger.warning("Weights from pretrained model not used in {}: {}".format(
    #         model.__class__.__name__, unexpected_keys))

    logger.info(f"Load pre-trained {model_type} parameters from file {weights_path}.")
    return model
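# Self-contained sketch of the key-renaming passes used in from_pretrained above: any key
# containing a pattern from a rename map is rewritten. The toy map below is illustrative and does
# not reproduce the actual contents of BERT_KEY_RENAME_MAP_1/2.
def _rename_state_dict_keys(state_dict, rename_map):
    old_keys, new_keys = [], []
    for key in state_dict.keys():
        for pattern, replacement in rename_map.items():
            if pattern in key:
                old_keys.append(key)
                new_keys.append(key.replace(pattern, replacement))
                break
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)
    return state_dict

assert _rename_state_dict_keys({'embed.model.encoder.layer.0.weight': 0},
                               {'embed.model.encoder': 'bert'}) == {'bert.layer.0.weight': 0}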