def __init__(self, device, tag_to_ix, n_layers, hidden_dim, hidden_dim_pp, char_cnn, n_chars,
             char_cnn_filters, pairwise_gate, train_type="sequence", normalization="weight",
             elmo_dropout_ratio=0., dropout_ratio=0., shared_lstm=False, inp_config="full",
             pairwise_query_type='mul', bilinear_dim=300, elmo_dim=1024, attn='multi',
             all_test=False, gate_bias=-1., monitor=None, logger=None):
    super(CRF_FB, self).__init__()
    self.device = device
    self.hidden_dim = hidden_dim
    self.hidden_dim_pp = hidden_dim_pp
    self.bilinear_dim = bilinear_dim
    self.tag_to_ix = tag_to_ix
    self.tagset_size = len(tag_to_ix)
    self.monitor = monitor
    self.embedding_dim = elmo_dim
    self.normalization = normalization
    self.elmo_dropout_ratio = elmo_dropout_ratio
    self.dropout_ratio = dropout_ratio
    self.train_type = train_type.lower()
    self.n_layers = n_layers
    self.char_cnn = char_cnn
    self.pairwise_gate = pairwise_gate
    self.bilinear_inp_dim = self.embedding_dim
    self.bilinear_out_dim = hidden_dim
    self.char_cnn_highway_bias = -1.
    self.query_dim = hidden_dim
    self.attn_dim = hidden_dim
    self.inp_config = inp_config
    self.shared_lstm = shared_lstm
    self.pairwise_query_type = pairwise_query_type
    self.pairwise_bilinear_pooling = True
    self.all_test = all_test
    self.logger = logger
    self.logger.info("Pairwise Type = {}".format(self.pairwise_query_type))

    if self.inp_config != "w2v":
        self.elmo = Elmo(ELMO_OPTIONS_FILE, ELMO_WEIGHT_FILE, 1, requires_grad=False,
                         dropout=self.elmo_dropout_ratio)
        self.elmo.to(self.device)

    self.act = nn.ELU()
    self.layer_norm = nn.LayerNorm(self.embedding_dim)

    if self.train_type != "no_unary":
        self.logger.info("Unary Config")
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, num_layers=self.n_layers,
                            dropout=self.dropout_ratio, bidirectional=True).to(device=device)
        self.unary_fc = weight_norm(
            nn.Linear(2 * hidden_dim, 2 * hidden_dim, bias=True).to(device=device), dim=None)
        self.init_parameters(self.unary_fc, 'relu')
        self.out_dropout_u_fc = nn.Dropout(self.dropout_ratio)
        self.out_dropout_u_skip = nn.Dropout(self.dropout_ratio)
        self.hidden2tag = weight_norm(
            nn.Linear(2 * hidden_dim, self.tagset_size).to(device=device), dim=None)
        self.init_parameters(self.hidden2tag, 'linear')
        tran_init = torch.empty(self.tagset_size, self.tagset_size, dtype=torch.float,
                                requires_grad=True)
        torch.nn.init.normal_(tran_init, mean=0.0, std=1.)
        self.transitions = nn.Parameter(tran_init.to(device=device))
        self.transitions.data[:, tag_to_ix[DatasetPreprosessed.__START_TAG__]] = -100.
        self.transitions.data[tag_to_ix[DatasetPreprosessed.__STOP_TAG__], :] = -100.

    if self.train_type != "no_pairwise":
        self.logger.info("Pairwise Config")
        if not self.shared_lstm:
            self.logger.info("Separate LSTMs")
            self.lstm_pairwise = nn.LSTM(self.embedding_dim, self.hidden_dim,
                                         num_layers=self.n_layers, dropout=self.dropout_ratio,
                                         bidirectional=True).to(device=device)
        else:
            self.logger.info("Shared LSTM")
        self.U = weight_norm(
            nn.Linear(2 * self.hidden_dim, self.hidden_dim_pp).to(device=device), dim=None)
        self.init_parameters(self.U, 'relu')
        self.V = weight_norm(
            nn.Linear(2 * self.hidden_dim, self.hidden_dim_pp).to(device=device), dim=None)
        self.init_parameters(self.V, 'relu')
        self.P = weight_norm(
            nn.Linear(self.hidden_dim_pp, self.bilinear_dim).to(device=device), dim=None)
        self.init_parameters(self.P, 'relu')
        self.pairwise_fc = weight_norm(
            nn.Linear(self.bilinear_dim, self.bilinear_dim, bias=True).to(device=device), dim=None)
        self.init_parameters(self.pairwise_fc, 'relu')
        self.dropout_p_mul = nn.Dropout(self.dropout_ratio)
        self.out_dropout_p_fc = nn.Dropout(self.dropout_ratio)
        self.out_dropout_p_skip = nn.Dropout(self.dropout_ratio)
        self.hidden2tag_pp = weight_norm(
            nn.Linear(self.bilinear_dim, self.tagset_size**2).to(device=device), dim=None)
        self.init_parameters(self.hidden2tag_pp, 'linear')

    self.__start__ = torch.tensor(self.tag_to_ix[DatasetPreprosessed.__START_TAG__],
                                  dtype=torch.long).to(device=device)
    self.__stop__ = torch.tensor(self.tag_to_ix[DatasetPreprosessed.__STOP_TAG__],
                                 dtype=torch.long).to(device=device)
def __init__(self, config):
    super().__init__()
    inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
    self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
    self.attn = GPTJAttention(config)
    self.mlp = GPTJMLP(inner_dim, config)
def forward(self, inputs):
    residual = inputs
    output = nn.ReLU()(self.w_1(inputs))
    output = self.w_2(output)
    # self.layer_norm is assumed to be an nn.LayerNorm(d_model) created in __init__;
    # constructing a new LayerNorm inside forward would re-initialize its parameters
    # on every call (and d_model is not in scope here).
    return self.layer_norm(output + residual)
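
# A minimal self-contained sketch (not from the source) of the module the forward above
# assumes: the feed-forward weights and the LayerNorm are created once in __init__ and
# reused on every call. The names w_1 / w_2 / layer_norm mirror the snippet; d_model and
# d_hidden are illustrative.
import torch
import torch.nn as nn

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_hidden, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_hidden)
        self.w_2 = nn.Linear(d_hidden, d_model)
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        residual = inputs
        output = self.w_2(self.dropout(torch.relu(self.w_1(inputs))))
        return self.layer_norm(output + residual)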
def __init__(self, layer, num_layers):
    super(Decoder, self).__init__()
    self.layers = clones(layer, num_layers)
    self.norm = nn.LayerNorm(layer.size)
def __init__(self, config):
    super().__init__()
    self.save_hyperparameters()

    bert_config = BertConfig(
        vocab_size=config["vocab_size"],
        hidden_size=config["hidden_size"],
        num_hidden_layers=config["num_layers"],
        num_attention_heads=config["num_heads"],
        intermediate_size=config["hidden_size"] * config["mlp_ratio"],
        max_position_embeddings=config["max_text_len"],
        hidden_dropout_prob=config["drop_rate"],
        attention_probs_dropout_prob=config["drop_rate"],
    )

    self.tempeture_max_OT = config['tempeture_max_OT']
    self.text_embeddings = BertEmbeddings(bert_config)
    self.text_embeddings.apply(objectives.init_weights)

    self.token_type_embeddings = nn.Embedding(2, config["hidden_size"])
    self.token_type_embeddings.apply(objectives.init_weights)

    import vilt.modules.vision_transformer as vit

    if self.hparams.config["load_path"] == "":
        self.transformer = getattr(vit, self.hparams.config["vit"])(
            pretrained=config["pretrained_flag"], config=self.hparams.config)
    else:
        self.transformer = getattr(vit, self.hparams.config["vit"])(
            pretrained=False, config=self.hparams.config)

    self.pooler = heads.Pooler(config["hidden_size"])
    self.pooler.apply(objectives.init_weights)

    if config["loss_names"]["mlm"] > 0:
        self.mlm_score = heads.MLMHead(bert_config)
        self.mlm_score.apply(objectives.init_weights)

    if config["loss_names"]["itm"] > 0:
        self.itm_score = heads.ITMHead(config["hidden_size"])
        self.itm_score.apply(objectives.init_weights)

    if config["loss_names"]["mpp"] > 0:
        self.mpp_score = heads.MPPHead(bert_config)
        self.mpp_score.apply(objectives.init_weights)

    # ===================== Downstream ===================== #
    if (
        self.hparams.config["load_path"] != ""
        and not self.hparams.config["test_only"]
    ):
        ckpt = torch.load(self.hparams.config["load_path"], map_location="cpu")
        state_dict = ckpt["state_dict"]
        self.load_state_dict(state_dict, strict=False)
        print(f'Loading checkpoint from {self.hparams.config["load_path"]}')

    hs = self.hparams.config["hidden_size"]

    if self.hparams.config["loss_names"]["vqa"] > 0:
        vs = self.hparams.config["vqav2_label_size"]
        self.vqa_classifier = nn.Sequential(
            nn.Linear(hs, hs * 2),
            nn.LayerNorm(hs * 2),
            nn.GELU(),
            nn.Linear(hs * 2, vs),
        )
        self.vqa_classifier.apply(objectives.init_weights)

    if self.hparams.config["loss_names"]["nlvr2"] > 0:
        self.nlvr2_classifier = nn.Sequential(
            nn.Linear(hs * 2, hs * 2),
            nn.LayerNorm(hs * 2),
            nn.GELU(),
            nn.Linear(hs * 2, 2),
        )
        self.nlvr2_classifier.apply(objectives.init_weights)
        emb_data = self.token_type_embeddings.weight.data
        self.token_type_embeddings = nn.Embedding(3, hs)
        self.token_type_embeddings.apply(objectives.init_weights)
        self.token_type_embeddings.weight.data[0, :] = emb_data[0, :]
        self.token_type_embeddings.weight.data[1, :] = emb_data[1, :]
        self.token_type_embeddings.weight.data[2, :] = emb_data[1, :]

    if self.hparams.config["loss_names"]["irtr"] > 0:
        self.rank_output = nn.Linear(hs, 1)
        self.rank_output.weight.data = self.itm_score.fc.weight.data[1:, :]
        self.rank_output.bias.data = self.itm_score.fc.bias.data[1:]
        self.margin = 0.2
        for p in self.itm_score.parameters():
            p.requires_grad = False

    vilt_utils.set_metrics(self)
    self.current_tasks = list()

    # ===================== load downstream (test_only) ======================
    if self.hparams.config["load_path"] != "" and self.hparams.config["test_only"]:
        ckpt = torch.load(self.hparams.config["load_path"], map_location="cpu")
        state_dict = ckpt["state_dict"]
        self.load_state_dict(state_dict, strict=False)
        print(f'Loading checkpoint from {self.hparams.config["load_path"]}')
def block(inp, out, activation, block_device):
    return nn.Sequential(
        nn.Linear(inp, out, bias=False),
        nn.LayerNorm(out),  # Recommended by Gulrajani et al 2017
        activation(),
    ).to(block_device)
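
# A small usage sketch (not from the source): stacking a few of these blocks into a simple
# MLP head. The sizes, activation, and device below are illustrative only.
import torch
import torch.nn as nn

net = nn.Sequential(
    block(128, 256, nn.ReLU, "cpu"),
    block(256, 64, nn.ReLU, "cpu"),
    nn.Linear(64, 1),
)
out = net(torch.randn(8, 128))  # -> shape (8, 1)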
def __init__(self, dim_model):
    super(SublayerConnection, self).__init__()
    self.norm = nn.LayerNorm(dim_model)
def __init__(self, size, dropout):
    super(SublayerConnection, self).__init__()
    self.norm = nn.LayerNorm(size)
    self.dropout = nn.Dropout(dropout)
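
# A hedged sketch (not from the source) of the forward pass that usually accompanies this
# SublayerConnection: pre-norm, apply the sublayer, dropout, then add the residual, as in
# the Annotated Transformer. The sublayer argument is assumed to be a callable.
def forward(self, x, sublayer):
    return x + self.dropout(sublayer(self.norm(x)))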
def __init__(self, backbone, hidden_size=2560, class_num=168 * 11 * 7):
    super(BengalModel, self).__init__()
    self.backbone = backbone
    self._avg_pooling = nn.AdaptiveAvgPool2d(1)
    self.fc = nn.Linear(hidden_size, class_num)
    self.ln = nn.LayerNorm(hidden_size)
def __init__(self, hidden_dim: int, sublayer: nn.Module):
    super(AddAndNorm, self).__init__()
    self.norm = nn.LayerNorm(hidden_dim)
    self.sublayer = sublayer
def __init__(self, layer, N):
    super(Decoder, self).__init__()
    self.layers = clones(layer, N)
    self.norm = nn.LayerNorm(layer.size)
    self.count = nn.Embedding(200, 256)
def __init__(self, classifier_dims, num_classes, gaussian_noise, dropout, internal_dims, n_layers,
             featurizer, final_layer_builder, n_tokens_in=64, n_tokens_out=16,
             use_as_super=False, **kwargs):
    embedding_dims = 768
    super(AlbertClassifer, self).__init__(classifier_dims, num_classes, embedding_dims,
                                          gaussian_noise, dropout, internal_dims, n_layers,
                                          featurizer, final_layer_builder, n_tokens_in,
                                          n_tokens_out, True, **kwargs)
    self.word_masking_proba = kwargs["word_masking_proba"] if "word_masking_proba" in kwargs else 0.0
    self.need_fasttext = "fasttext_vector_config" in kwargs

    if "fasttext_vector_config" in kwargs:
        import fasttext
        ftvc = kwargs["fasttext_vector_config"]
        gru_layers = ftvc.pop("gru_layers", 0)
        fasttext_crawl = fasttext.load_model("crawl-300d-2M-subword.bin")
        fasttext_wiki = fasttext.load_model("wiki-news-300d-1M-subword.bin")
        bpe = BPEmb(dim=200)
        cngram = CharNGram()
        self.word_vectorizers = dict(fasttext_crawl=fasttext_crawl, fasttext_wiki=fasttext_wiki,
                                     bpe=bpe, cngram=cngram)
        crawl_nn = ExpandContract(900, embedding_dims, dropout, use_layer_norm=True,
                                  unit_norm=False, groups=(4, 4))
        self.crawl_nn = crawl_nn
        n_tokens_in = n_tokens_in + (8 * int(self.n_tokens_in / (8 * 1.375) + 1))
        if gru_layers > 0:
            lstm = nn.Sequential(GaussianNoise(gaussian_noise),
                                 nn.GRU(embedding_dims, int(embedding_dims / 2), gru_layers,
                                        batch_first=True, bidirectional=True, dropout=dropout))
            pre_query_layer = nn.Sequential(lstm, LambdaLayer(lambda x: x[0]),
                                            nn.LayerNorm(embedding_dims))
        else:
            pre_query_layer = nn.LayerNorm(embedding_dims)
        self.pre_query_layer = pre_query_layer

    if not use_as_super:
        model = kwargs["model"] if "model" in kwargs else 'albert-base-v2'
        global_dir = get_global("models_dir")
        model = os.path.join(global_dir, model) if model in os.listdir(global_dir) else model
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModel.from_pretrained(model)
        print("Pick stored Model", model, "Model Class = ", type(self.model),
              "Tokenizer Class = ", type(self.tokenizer))
        if featurizer == "cnn":
            self.featurizer = CNN1DFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                              classifier_dims, internal_dims, n_layers,
                                              gaussian_noise, dropout)
        elif featurizer == "gru":
            self.featurizer = GRUFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                            classifier_dims, internal_dims, n_layers,
                                            gaussian_noise, dropout)
        elif featurizer == "basic":
            self.featurizer = BasicFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                              classifier_dims, internal_dims, n_layers,
                                              gaussian_noise, dropout)
        elif featurizer == "transformer":
            self.attention_drop_proba = kwargs["attention_drop_proba"] if "attention_drop_proba" in kwargs else 0.0
            n_encoders = kwargs.pop("n_encoders", n_layers)
            n_decoders = kwargs.pop("n_decoders", n_layers)
            self.featurizer = TransformerFeaturizer(n_tokens_in, embedding_dims, n_tokens_out,
                                                    classifier_dims, internal_dims,
                                                    n_encoders, n_decoders, gaussian_noise,
                                                    dropout, self.attention_drop_proba)
        else:
            raise NotImplementedError()
        self.final_layer = final_layer_builder(classifier_dims, n_tokens_out, num_classes,
                                               dropout, **kwargs)

    if "stored_model" in kwargs:
        load_stored_params(self, kwargs["stored_model"])

    self.reg_layers = [(c, c.p if hasattr(c, "p") else c.sigma) for c in self.children()
                       if c.__class__ == GaussianNoise or c.__class__ == nn.Dropout]
def LayerNorm(embedding_dim):
    m = nn.LayerNorm(embedding_dim)
    return m
def __init__(self, d_in, d_hid, dropout):
    super().__init__()
    self.w_1 = nn.Conv1d(d_in, d_hid, 1)  # position-wise
    self.w_2 = nn.Conv1d(d_hid, d_in, 1)  # position-wise
    self.layer_norm = nn.LayerNorm(d_in)
    self.dropout = nn.Dropout(dropout)
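
# A hedged sketch (not from the source) of the forward pass commonly paired with this
# Conv1d-based position-wise feed-forward block: transpose to (batch, channels, length)
# for the 1x1 convolutions, transpose back, then dropout, residual, and LayerNorm.
import torch.nn.functional as F

def forward(self, x):
    residual = x                               # (batch, seq_len, d_in)
    output = x.transpose(1, 2)                 # (batch, d_in, seq_len)
    output = self.w_2(F.relu(self.w_1(output)))
    output = output.transpose(1, 2)            # back to (batch, seq_len, d_in)
    output = self.dropout(output)
    return self.layer_norm(output + residual)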
def __init__(self, params, dico, is_encoder, with_output):
    """
    Transformer model (encoder or decoder).
    """
    super().__init__()

    # encoder / decoder, output layer
    self.is_encoder = is_encoder
    self.is_decoder = not is_encoder
    self.with_output = with_output

    # dictionary / languages
    self.n_langs = params.n_langs
    self.n_words = params.n_words
    self.eos_index = params.eos_index
    self.pad_index = params.pad_index
    self.dico = dico
    self.id2lang = params.id2lang
    self.lang2id = params.lang2id
    self.use_lang_emb = getattr(params, 'use_lang_emb', True)
    assert len(self.dico) == self.n_words
    assert len(self.id2lang) == len(self.lang2id) == self.n_langs

    # model parameters
    self.dim = params.emb_dim_encoder if is_encoder else params.emb_dim_decoder  # 512 by default
    self.hidden_dim = self.dim * 4  # 2048 by default
    self.n_heads = params.n_heads  # 8 by default
    self.n_layers = params.n_layers_encoder if is_encoder else params.n_layers_decoder
    self.dropout = params.dropout
    self.attention_dropout = params.attention_dropout
    assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'

    # embeddings
    self.position_embeddings = Embedding(N_MAX_POSITIONS, self.dim)
    if params.sinusoidal_embeddings:
        create_sinusoidal_embeddings(N_MAX_POSITIONS, self.dim, out=self.position_embeddings.weight)
    if params.n_langs > 1 and self.use_lang_emb:
        self.lang_embeddings = Embedding(self.n_langs, self.dim)
    self.embeddings = Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
    self.layer_norm_emb = nn.LayerNorm(self.dim, eps=1e-12)

    # transformer layers
    self.attentions = nn.ModuleList()
    self.layer_norm1 = nn.ModuleList()
    self.ffns = nn.ModuleList()
    self.layer_norm2 = nn.ModuleList()
    if self.is_decoder:
        self.layer_norm15 = nn.ModuleList()
        self.encoder_attn = nn.ModuleList()

    self.cache = None

    for layer_id in range(self.n_layers):
        self.attentions.append(
            MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
        self.layer_norm1.append(nn.LayerNorm(self.dim, eps=1e-12))
        if self.is_decoder:
            self.layer_norm15.append(nn.LayerNorm(self.dim, eps=1e-12))
            self.encoder_attn.append(
                MultiHeadAttention(self.n_heads, self.dim, dim_encoder=params.emb_dim_encoder,
                                   dropout=self.attention_dropout))
        self.ffns.append(
            TransformerFFN(self.dim, self.hidden_dim, self.dim, dropout=self.dropout,
                           gelu_activation=params.gelu_activation))
        self.layer_norm2.append(nn.LayerNorm(self.dim, eps=1e-12))

    # output layer
    if self.with_output:
        self.pred_layer = PredLayer(params)
        if params.share_inout_emb:
            self.pred_layer.proj.weight = self.embeddings.weight
def __init__(self, input_size, hidden_size, bias=True):
    super().__init__(input_size, hidden_size, bias)
    self.ln_ih = nn.LayerNorm(4 * hidden_size)
    self.ln_hh = nn.LayerNorm(4 * hidden_size)
    self.ln_ho = nn.LayerNorm(hidden_size)
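
# A hedged sketch (not from the source) of how such a layer-normalized LSTM cell is
# commonly evaluated: the class is assumed to subclass nn.LSTMCell, so weight_ih / weight_hh
# and bias_ih / bias_hh come from the parent, and the gate pre-activations are normalized
# before the usual LSTM update.
import torch
import torch.nn.functional as F

def forward(self, input, hx=None):
    if hx is None:
        zeros = input.new_zeros(input.size(0), self.hidden_size)
        hx = (zeros, zeros)
    h, c = hx
    gates = (self.ln_ih(F.linear(input, self.weight_ih, self.bias_ih))
             + self.ln_hh(F.linear(h, self.weight_hh, self.bias_hh)))
    i, f, g, o = gates.chunk(4, dim=1)
    c_new = f.sigmoid() * c + i.sigmoid() * g.tanh()
    h_new = o.sigmoid() * torch.tanh(self.ln_ho(c_new))
    return h_new, c_new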
def __init__(self, n_features):
    super(LayerNorm, self).__init__()
    self.layer_norm = nn.LayerNorm(n_features)
def __init__(self, dim, fn, context_dim=None):
    super().__init__()
    self.fn = fn
    self.norm = nn.LayerNorm(dim)
    self.norm_context = nn.LayerNorm(context_dim) if exists(context_dim) else None
def build(self):
    """ Construct Submodule And Prepare Parameters """
    self.state_ndim = len(self.state_shape)
    self.state_size = np.prod(self.state_shape)
    last_size = self.state_size

    # fully connected before LSTM
    if (self.fc_config_before_lstm is not None
            and len(self.fc_config_before_lstm) > 0):
        submodule = OrderedDict()
        for i_layer, layer_config in enumerate(self.fc_config_before_lstm):
            num_hidden_unit, add_bias, activation = layer_config[:3]
            normalization_config = (layer_config[3] if len(layer_config) > 3 else None)
            add_bias = (add_bias and (normalization_config is None))
            last_layer = submodule['fc%d' % i_layer] = nn.Linear(
                last_size, num_hidden_unit, bias=add_bias)
            nn.init.xavier_uniform_(last_layer.weight,
                                    calculate_gain_from_activation(activation))
            if add_bias:
                nn.init.constant_(last_layer.bias, 0)
            last_size = num_hidden_unit
            if (normalization_config is not None
                    and normalization_config == 'layernorm'):
                submodule['fc_layernorm%d' % i_layer] = nn.LayerNorm([last_size])
            if activation is not None:
                activation_type, activation_module = get_activation(activation)
                submodule[activation_type + str(i_layer)] = activation_module
        self.fc_function_before_lstm = nn.Sequential(submodule)
    else:
        self.fc_function_before_lstm = None

    # LSTM
    self.lstm_list = []
    if self.contain_lstm():
        module_id = 'lstm'
        self.lstm_list.append(module_id)
        self.__setattr__(module_id, nn.LSTM(last_size, self.lstm_h_size))
        nn.init.xavier_uniform_(self.__getattr__(module_id).weight_hh_l0)
        nn.init.xavier_uniform_(self.__getattr__(module_id).weight_ih_l0)
        nn.init.constant_(self.__getattr__(module_id).bias_ih_l0, 0)
        nn.init.constant_(self.__getattr__(module_id).bias_hh_l0, 0)
        last_size = self.lstm_h_size

    # fully connected after LSTM
    if (self.fc_config_after_lstm is not None
            and len(self.fc_config_after_lstm) > 0):
        submodule = OrderedDict()
        for i_layer, layer_config in enumerate(self.fc_config_after_lstm):
            num_hidden_unit, add_bias, activation = layer_config[:3]
            normalization_config = (layer_config[3] if len(layer_config) > 3 else None)
            add_bias = (add_bias and (normalization_config is None))
            last_layer = submodule['fc%d' % i_layer] = nn.Linear(
                last_size, num_hidden_unit, bias=add_bias)
            nn.init.xavier_uniform_(last_layer.weight,
                                    calculate_gain_from_activation(activation))
            if add_bias:
                nn.init.constant_(last_layer.bias, 0)
            last_size = num_hidden_unit
            if (normalization_config is not None
                    and normalization_config == 'layernorm'):
                submodule['fc_layernorm%d' % i_layer] = nn.LayerNorm([last_size])
            if activation is not None:
                activation_type, activation_module = get_activation(activation)
                submodule[activation_type + str(i_layer)] = activation_module
        self.fc_function_after_lstm = nn.Sequential(submodule)
    else:
        self.fc_function_after_lstm = None

    # policy and value
    self.policy_branch = nn.Linear(last_size, self.n_action, bias=False)
    nn.init.xavier_uniform_(self.policy_branch.weight)
    self.value_branch = nn.Linear(last_size, 1, bias=False)
    nn.init.xavier_uniform_(self.value_branch.weight)
    self.policy_softmax = nn.Softmax(dim=-1)
def __init__(self, dim_model, dim_ff, dropout=0.1):
    super(PositionwiseFeedForward, self).__init__()
    self.linear_1 = nn.Linear(dim_model, dim_ff)
    self.linear_2 = nn.Linear(dim_ff, dim_model)
    self.dropout = nn.Dropout(dropout)
    self.layer_norm = nn.LayerNorm(dim_model)
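
# A hedged sketch (not from the source) of a typical forward for this feed-forward block:
# two linear layers with ReLU and dropout, followed by a residual connection and LayerNorm.
import torch.nn.functional as F

def forward(self, x):
    residual = x
    x = self.linear_2(self.dropout(F.relu(self.linear_1(x))))
    return self.layer_norm(x + residual)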
def __init__(self, layer, num_layers):
    super(Encoder, self).__init__()
    self.layers = clones(layer, num_layers)
    self.norm = nn.LayerNorm(layer.dim_model)
def __init__(self, dim_model, dim_hidden, dropout=0.1):
    super(PositionwiseFeedForwardWithConv, self).__init__()
    self.conv_1 = nn.Conv1d(dim_model, dim_hidden, 1)
    self.conv_2 = nn.Conv1d(dim_hidden, dim_model, 1)
    self.dropout = nn.Dropout(dropout)
    self.layer_norm = nn.LayerNorm(dim_model)
def __init__(self, channels):
    super().__init__()
    self.ln = nn.LayerNorm(channels)
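
# A hedged sketch (not from the source): a common forward for a channel LayerNorm wrapper
# like this one moves the channel axis last, applies nn.LayerNorm, and moves it back, so
# (batch, channels, time) inputs are normalized over the channel dimension.
def forward(self, x):
    x = x.transpose(1, -1)       # (batch, ..., channels)
    x = self.ln(x)
    return x.transpose(1, -1)    # restore (batch, channels, ...)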
def __init__(self, num_inputs, num_actions, hidden_size, action_range=1., init_w=3e-3,
             log_std_min=-20, log_std_max=2):
    super(PolicyNetwork, self).__init__()
    self.log_std_min = log_std_min
    self.log_std_max = log_std_max

    # self.linear1 = nn.Linear(num_inputs, hidden_size)
    # self.linear2 = nn.Linear(hidden_size, hidden_size)
    # self.linear3 = nn.Linear(hidden_size, hidden_size)
    # self.linear4 = nn.Linear(hidden_size, hidden_size)
    # self.tcn = TemporalConvNet(input_channels, num_channels, kernel_size=kernel_size, dropout=dropout)
    # self.tcn1 = nn.Conv1d(input_channels, out_channels=256, kernel_size=kernel_size, stride=1, padding=0, dilation=1)
    # self.tcn2 = nn.Conv1d(256, out_channels=256, kernel_size=kernel_size, stride=1, padding=0, dilation=1)
    # torch.nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros')
    # self.fc1 = nn.Linear(num_channels[-1], hidden_size)
    # self.linear1 = nn.Linear(num_channels[-1], hidden_size)
    # self.conv1d1 = nn.Conv1d(input_channels, out_channels=hidden_size, kernel_size=kernel_size, stride=1, padding=0, dilation=1)
    # self.conv1d2 = nn.Conv1d(hidden_size, out_channels=hidden_size, kernel_size=kernel_size, stride=1, padding=0, dilation=1)
    # self.conv1d3 = nn.Conv1d(hidden_size, out_channels=hidden_size, kernel_size=kernel_size, stride=1, padding=0, dilation=1)
    # self.LN1 = nn.LayerNorm([-1, hidden_size, state_seq_len], elementwise_affine=True)

    # NOTE: input_channels and kernel_size are referenced below but are not parameters of
    # this __init__; they are assumed to be defined elsewhere in the original source.
    self.model_conv = nn.Sequential(
        nn.Conv1d(input_channels, out_channels=hidden_size, kernel_size=kernel_size,
                  stride=1, padding=0, dilation=1),
        # nn.LayerNorm([-1, hidden_size, state_seq_len], elementwise_affine=True),
        nn.ReLU(),  # activation function
        nn.Conv1d(hidden_size, out_channels=hidden_size, kernel_size=kernel_size,
                  stride=1, padding=0, dilation=1),
        # nn.LayerNorm([-1, hidden_size, state_seq_len], elementwise_affine=True),
        nn.ReLU(),
        nn.Conv1d(hidden_size, out_channels=hidden_size, kernel_size=kernel_size,
                  stride=1, padding=0, dilation=1),
        # nn.LayerNorm([-1, hidden_size, state_seq_len], elementwise_affine=True),
        nn.ReLU(),
    )

    # self.model_lstm = nn.Sequential(
    #     nn.LSTM(hidden_size, hidden_size, 2),
    #     nn.LSTM(10, 20, 2),
    # )
    # self.lstm = nn.LSTM(hidden_size, hidden_size, 1, batch_first=True)

    self.model = nn.Sequential(
        nn.Linear(hidden_size, hidden_size),
        nn.LayerNorm(hidden_size, elementwise_affine=True),
        nn.ReLU(),  # activation function
        nn.Linear(hidden_size, hidden_size),
        nn.LayerNorm(hidden_size, elementwise_affine=True),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.LayerNorm(hidden_size, elementwise_affine=True),
        nn.ReLU(),
    )
    # self.model = nn.Sequential(
    #     nn.Linear(num_inputs, hidden_size),
    #     nn.LayerNorm(hidden_size, elementwise_affine=True),
    #     nn.ReLU(),
    # )

    self.mean_linear = nn.Linear(hidden_size, num_actions)
    self.mean_linear.weight.data.uniform_(-init_w, init_w)
    self.mean_linear.bias.data.uniform_(-init_w, init_w)

    self.log_std_linear = nn.Linear(hidden_size, num_actions)
    self.log_std_linear.weight.data.uniform_(-init_w, init_w)
    self.log_std_linear.bias.data.uniform_(-init_w, init_w)

    self.action_range = action_range
    self.num_actions = num_actions
    print('#########')
def LayerNorm(embedding_dim):
    m = nn.LayerNorm(embedding_dim)
    nn.init.constant_(m.weight, 1)
    nn.init.constant_(m.bias, 0)
    return m
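
# A brief usage sketch (not from the source): the factory above returns a standard
# nn.LayerNorm whose affine parameters are explicitly reset to their defaults
# (weight = 1, bias = 0), so it behaves like a freshly constructed module.
import torch

norm = LayerNorm(embedding_dim=16)
x = torch.randn(4, 16)
y = norm(x)                    # normalized over the last dimension
print(y.mean(-1), y.std(-1))   # per-row mean ~0, std ~1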
def __init__(self, params, id2word, is_encoder, with_output):
    """
    Transformer model (encoder or decoder).
    """
    super().__init__()

    # encoder / decoder, output layer
    self.is_encoder = is_encoder
    self.is_decoder = not is_encoder
    self.with_output = with_output

    # dictionary
    self.n_words = params.n_words
    self.eos_index = params.eos_index
    self.pad_index = params.pad_index
    self.id2word = id2word
    assert len(self.id2word) == self.n_words

    # model parameters
    self.dim = params.emb_dim  # 512 by default
    self.hidden_dim = self.dim * 4  # 2048 by default
    self.n_heads = params.n_heads  # 8 by default
    self.n_layers = params.n_enc_layers if is_encoder else params.n_dec_layers
    self.dropout = params.dropout
    self.attention_dropout = params.attention_dropout
    self.nb_features = params.nb_features
    assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'

    # embeddings
    self.position_embeddings = Embedding(N_MAX_POSITIONS, self.dim)
    if params.sinusoidal_embeddings:
        create_sinusoidal_embeddings(N_MAX_POSITIONS, self.dim, out=self.position_embeddings.weight)
    self.embeddings = Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
    self.layer_norm_emb = nn.LayerNorm(self.dim, eps=1e-12)

    # transformer layers
    self.attentions = nn.ModuleList()
    self.layer_norm1 = nn.ModuleList()
    self.ffns = nn.ModuleList()
    self.layer_norm2 = nn.ModuleList()
    if self.is_decoder:
        self.layer_norm15 = nn.ModuleList()
        self.encoder_attn = nn.ModuleList()

    for layer_id in range(self.n_layers):
        self.attentions.append(
            MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout,
                               nb_features=self.nb_features, causal=False))
        self.layer_norm1.append(nn.LayerNorm(self.dim, eps=1e-12))
        if self.is_decoder:
            self.layer_norm15.append(nn.LayerNorm(self.dim, eps=1e-12))
            self.encoder_attn.append(
                MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout,
                                   nb_features=self.nb_features, causal=True))
        self.ffns.append(
            TransformerFFN(self.dim, self.hidden_dim, self.dim, dropout=self.dropout))
        self.layer_norm2.append(nn.LayerNorm(self.dim, eps=1e-12))

    # output layer
    if self.with_output:
        self.proj = nn.Linear(self.dim, params.n_words, bias=True)
        if params.share_inout_emb:
            self.proj.weight = self.embeddings.weight
import torch
import torch.nn as nn


def layer_norm(batch_x, gamma, beta, eps=1e-5):
    # Manual implementation: normalize each row over its features
    n, d = batch_x.shape
    sample_mean = batch_x.mean(axis=1).view(n, 1)
    sample_var = batch_x.var(axis=1, unbiased=False).view(n, 1)
    std = torch.sqrt(sample_var + eps)
    x_centered = batch_x - sample_mean
    x_norm = x_centered / std
    out = gamma * x_norm + beta
    cache = (x_norm, x_centered, std, gamma)
    return out, cache


x = torch.rand(2, 3)
print(x)
x_norm, cache = layer_norm(x, gamma=0.02, beta=0.01)
print(x_norm)
print(cache[0])

# Pytorch implementation
# With/Without Learnable Parameters
model = nn.LayerNorm(normalized_shape=3)
output = model(x)
print(output)
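
# A quick consistency check (not from the source), continuing the snippet above and
# assuming gamma and beta broadcast as scalars: F.layer_norm with matching per-feature
# weight and bias should reproduce the manual result up to numerical tolerance.
import torch.nn.functional as F

weight = torch.full((3,), 0.02)
bias = torch.full((3,), 0.01)
reference = F.layer_norm(x, normalized_shape=(3,), weight=weight, bias=bias, eps=1e-5)
print(torch.allclose(x_norm, reference, atol=1e-6))  # expected: True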
def __init__(self, dim, fn):
    super().__init__()
    self.norm = nn.LayerNorm(dim)
    self.fn = fn
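
# A hedged sketch (not from the source) of the forward pass that typically goes with this
# pre-norm wrapper: normalize the input, then delegate to the wrapped callable.
def forward(self, x, **kwargs):
    return self.fn(self.norm(x), **kwargs)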
def __init__(
        self,
        input_width,
        input_height,
        input_channels,
        output_size,
        kernel_sizes,
        n_channels,
        strides,
        paddings,
        hidden_sizes=None,
        added_fc_input_size=0,
        conv_normalization_type='none',
        fc_normalization_type='none',
        init_w=1e-4,
        hidden_init=nn.init.xavier_uniform_,
        hidden_activation=nn.ReLU(),
        output_activation=identity,
        output_conv_channels=False,
        pool_type='none',
        pool_sizes=None,
        pool_strides=None,
        pool_paddings=None,
        image_augmentation=False,
        image_augmentation_padding=4,
):
    if hidden_sizes is None:
        hidden_sizes = []
    assert len(kernel_sizes) == \
        len(n_channels) == \
        len(strides) == \
        len(paddings)
    assert conv_normalization_type in {'none', 'batch', 'layer'}
    assert fc_normalization_type in {'none', 'batch', 'layer'}
    assert pool_type in {'none', 'max2d'}
    if pool_type == 'max2d':
        assert len(pool_sizes) == len(pool_strides) == len(pool_paddings)
    super().__init__()

    self.hidden_sizes = hidden_sizes
    self.input_width = input_width
    self.input_height = input_height
    self.input_channels = input_channels
    self.output_size = output_size
    self.output_activation = output_activation
    self.hidden_activation = hidden_activation
    self.conv_normalization_type = conv_normalization_type
    self.fc_normalization_type = fc_normalization_type
    self.added_fc_input_size = added_fc_input_size
    self.conv_input_length = self.input_width * self.input_height * self.input_channels
    self.output_conv_channels = output_conv_channels
    self.pool_type = pool_type
    self.image_augmentation = image_augmentation
    self.image_augmentation_padding = image_augmentation_padding

    self.conv_layers = nn.ModuleList()
    self.conv_norm_layers = nn.ModuleList()
    self.pool_layers = nn.ModuleList()
    self.fc_layers = nn.ModuleList()
    self.fc_norm_layers = nn.ModuleList()

    for i, (out_channels, kernel_size, stride, padding) in enumerate(
            zip(n_channels, kernel_sizes, strides, paddings)):
        conv = nn.Conv2d(input_channels, out_channels, kernel_size,
                         stride=stride, padding=padding)
        hidden_init(conv.weight)
        conv.bias.data.fill_(0)
        conv_layer = conv
        self.conv_layers.append(conv_layer)
        input_channels = out_channels
        if pool_type == 'max2d':
            if pool_sizes[i] > 1:
                self.pool_layers.append(
                    nn.MaxPool2d(
                        kernel_size=pool_sizes[i],
                        stride=pool_strides[i],
                        padding=pool_paddings[i],
                    ))

    # use torch rather than ptu because initially the model is on CPU
    test_mat = torch.zeros(
        1,
        self.input_channels,
        self.input_width,
        self.input_height,
    )
    # find output dim of conv_layers by trial and add norm conv layers
    for i, conv_layer in enumerate(self.conv_layers):
        test_mat = conv_layer(test_mat)
        if self.conv_normalization_type == 'batch':
            self.conv_norm_layers.append(nn.BatchNorm2d(test_mat.shape[1]))
        if self.conv_normalization_type == 'layer':
            self.conv_norm_layers.append(nn.LayerNorm(test_mat.shape[1:]))
        if self.pool_type != 'none' and len(self.pool_layers) > i:
            test_mat = self.pool_layers[i](test_mat)

    self.conv_output_flat_size = int(np.prod(test_mat.shape))
    if self.output_conv_channels:
        self.last_fc = None
    else:
        fc_input_size = self.conv_output_flat_size
        # used only for injecting input directly into fc layers
        fc_input_size += added_fc_input_size
        for idx, hidden_size in enumerate(hidden_sizes):
            fc_layer = nn.Linear(fc_input_size, hidden_size)
            fc_input_size = hidden_size
            fc_layer.weight.data.uniform_(-init_w, init_w)
            fc_layer.bias.data.uniform_(-init_w, init_w)
            self.fc_layers.append(fc_layer)
            if self.fc_normalization_type == 'batch':
                self.fc_norm_layers.append(nn.BatchNorm1d(hidden_size))
            if self.fc_normalization_type == 'layer':
                self.fc_norm_layers.append(nn.LayerNorm(hidden_size))
        self.last_fc = nn.Linear(fc_input_size, output_size)
        self.last_fc.weight.data.uniform_(-init_w, init_w)
        self.last_fc.bias.data.uniform_(-init_w, init_w)

    if self.image_augmentation:
        self.augmentation_transform = RandomCrop(
            input_height, self.image_augmentation_padding, device='cuda')
def __init__(self, size, dropout, layer_norm_rescale=True):
    super(SublayerConnection, self).__init__()
    self.norm = nn.LayerNorm(size, elementwise_affine=layer_norm_rescale)
    self.dropout = nn.Dropout(dropout)