def test_transformer_parameters_to_attention(self):
    with self.assertRaises(ValueError):
        transformer = TransformerEncoderBuilder.from_kwargs(
            attention_type="test-attention"
        ).get()

    transformer = TransformerEncoderBuilder.from_kwargs(
        attention_type="test-attention",
        n_heads=8,
        query_dimensions=64
    ).get()
def test_builder_factory_methods(self):
    builder = TransformerEncoderBuilder.from_kwargs(
        n_layers=1,
        n_heads=4,
        attention_type="linear"
    )

    with self.assertRaises(ValueError):
        TransformerEncoderBuilder.from_kwargs(foobar=1)
    TransformerEncoderBuilder.from_kwargs(foobar=1, strict=False)

    parser = argparse.ArgumentParser()
    parser.add_argument("--n_layers", type=int)
    parser.add_argument("--n_heads", type=int)
    args = parser.parse_args(["--n_heads", "42"])

    builder = TransformerEncoderBuilder.from_namespace(args)
    self.assertEqual(builder.n_heads, 42)
    self.assertTrue(builder.n_layers is None)
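# Minimal usage sketch (not part of the tests above; the sizes are illustrative
# assumptions): a builder configured through from_kwargs is turned into an
# nn.Module with .get() and called on a (batch, length, n_heads * value_dimensions)
# tensor, optionally with a LengthMask for padded batches.
import torch
from fast_transformers.builders import TransformerEncoderBuilder
from fast_transformers.masking import LengthMask

encoder = TransformerEncoderBuilder.from_kwargs(
    n_layers=2,
    n_heads=4,
    query_dimensions=32,
    value_dimensions=32,
    feed_forward_dimensions=256,
    attention_type="linear"
).get()

x = torch.rand(8, 100, 4 * 32)  # (N, L, d_model) with d_model = n_heads * value_dimensions
lengths = LengthMask(torch.full((8,), 100, dtype=torch.long))
y = encoder(x, length_mask=lengths)  # output has the same shape as x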
def __init__(self, config, output_attentions=False,
             keep_multihead_output=False):
    super(TransformerEncoder, self).__init__()
    self.output_attentions = output_attentions
    self.pre_layer_norm = config.pre_layer_norm

    # Map the BERT-style config onto the fast-transformers builder
    builder = TransformerEncoderBuilder.from_kwargs(
        n_layers=config.num_hidden_layers,
        n_heads=config.num_attention_heads,
        feed_forward_dimensions=config.intermediate_size,
        query_dimensions=int(config.hidden_size / config.num_attention_heads),
        value_dimensions=int(config.hidden_size / config.num_attention_heads),
        dropout=config.hidden_dropout_prob
    )

    if config.softmax_temp:
        builder.attention.softmax_temp = config.softmax_temp
    builder.attention.attention_dropout = config.attention_probs_dropout_prob
    builder.attention.clusters = config.clusters
    builder.attention.bits = config.bits
    builder.attention.hash_bias = config.hash_bias
    builder.attention.iterations = config.iterations
    builder.attention.topk = config.topk
    builder.attention.local_context = config.local_context
    builder.attention.length_limit = config.length_limit

    attention_type = config.attention_type
    if attention_type == "improved-clustered":
        # Compose improved-clustered with conditional full attention so that
        # sequences shorter than length_limit fall back to full attention.
        attention_type = "conditional-full:improved-clustered"
    builder.attention_type = attention_type

    self.transformer = builder.get()
def test_longformer(self):
    config = LongformerConfig()
    config.attention_mode = "n2"
    config.attention_window = [256] * 12
    config.attention_dilation = [1] * 12
    longformer = Longformer(config)
    encoder = TransformerEncoderBuilder.from_kwargs(
        n_layers=12,
        n_heads=12,
        query_dimensions=64,
        value_dimensions=64,
        feed_forward_dimensions=3072,
        attention_type="full",
        final_normalization=False,
        activation="gelu"
    ).get()
    longformer.eval()
    encoder.eval()

    # Before the weight copy they should be different
    x = torch.rand(3, 10, 768)
    o1 = longformer.encoder(x, head_mask=[None] * 12)[0]
    o2 = encoder(x)
    self.assertGreater(torch.abs(o1 - o2).max().item(), 1)

    # And after the copy they should be exactly the same
    encoder.load_state_dict(LongformerMapper().map(
        longformer.encoder.state_dict()))
    o1 = longformer.encoder(x, head_mask=[None] * 12)[0]
    o2 = encoder(x)
    self.assertLess(torch.abs(o1 - o2).max().item(), 1e-4)
def test_huggin_bert(self):
    bert = BertModel(BertConfig())
    encoder = TransformerEncoderBuilder.from_kwargs(
        n_layers=12,
        n_heads=12,
        query_dimensions=64,
        value_dimensions=64,
        feed_forward_dimensions=3072,
        attention_type="full",
        final_normalization=False,
        activation="gelu"
    ).get()
    bert.eval()
    encoder.eval()

    # Before the weight copy they should be different
    x = torch.rand(3, 10, 768)
    o1 = bert.encoder(x, head_mask=[None] * 12)[0]
    o2 = encoder(x)
    self.assertGreater(torch.abs(o1 - o2).max().item(), 1)

    # And after the copy they should be exactly the same
    encoder.load_state_dict(HugginfaceBertEncoderMapper().map(
        bert.encoder.state_dict()))
    o1 = bert.encoder(x, head_mask=[None] * 12)[0]
    o2 = encoder(x)
    self.assertLess(torch.abs(o1 - o2).max().item(), 1e-4)
def __init__(self, num_feats, output_feats, lstm_layers, n_layers, n_heads,
             hidden_dim, ff_dim, tf_depth=3, dropout=0.15):
    super(LSTUT, self).__init__()

    self.num_feats = num_feats
    self.output_feats = output_feats
    self.lstm_layers = lstm_layers
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.hidden_dim = hidden_dim
    self.ff_dim = ff_dim
    self.tf_depth = tf_depth
    self.d_model = self.hidden_dim * self.n_heads

    encoder_builder = TransformerEncoderBuilder.from_kwargs(
        n_layers=self.n_layers,
        n_heads=self.n_heads,
        query_dimensions=self.hidden_dim,
        value_dimensions=self.hidden_dim,
        feed_forward_dimensions=self.ff_dim,
        attention_type='linear',
        dropout=dropout
    )

    self.initial_ff = nn.Linear(self.num_feats, self.d_model)
    self.lstm1 = nn.LSTM(self.d_model, self.d_model // 2, self.lstm_layers,
                         batch_first=True, bidirectional=True)
    self.encoder = encoder_builder.get()
    self.lstm2 = nn.LSTM(self.d_model, self.d_model // 2, self.lstm_layers,
                         batch_first=True, bidirectional=True)
    self.final_ff = nn.Linear(self.d_model, self.output_feats)
def test_attention_composition(self):
    # Composite attention types are written as "outer:inner"
    transformer = TransformerEncoderBuilder.from_kwargs(
        attention_type="conditional-full:improved-clustered",
        attention_dropout=0.1,
        softmax_temp=0.125,
        clusters=256,
        bits=32,
        topk=32,
        length_limit=512
    ).get()

    # A composite attention without an inner attention to wrap cannot be built
    with self.assertRaises(TypeError):
        transformer = TransformerEncoderBuilder.from_kwargs(
            attention_type="conditional-full",
            attention_dropout=0.1,
            softmax_temp=0.125,
            length_limit=512
        ).get()
def __init__(
    self,
    attention_type,
    out_channels,
    num_layers=6,
    nhead=8,
    d_ffn=1024,
    dropout=0,
    activation="relu",
    reformer_bucket_size=32,
):
    super(FastTransformerBlock, self).__init__()

    from fast_transformers.builders import TransformerEncoderBuilder

    builder = TransformerEncoderBuilder.from_kwargs(
        attention_type=attention_type,
        n_layers=num_layers,
        n_heads=nhead,
        feed_forward_dimensions=d_ffn,
        query_dimensions=out_channels // nhead,
        value_dimensions=out_channels // nhead,
        dropout=dropout,
        attention_dropout=dropout,
        chunk_size=reformer_bucket_size,
    )
    self.mdl = builder.get()
    self.attention_type = attention_type
    self.reformer_bucket_size = reformer_bucket_size
def __init__(
        self,
        file_re,
        input_dimensions=5,
        # project_dimension=128,
        n_layers=8,
        n_heads=8,
        query_dimensions=64,
        value_dimensions=64,
        feed_forward_dimensions=1024,
        attention_type='full',
        num_workers=1,
        batch_size=2,
        lr=1e-7,
        seq_len=1000,
        seed=100,
        **kwargs):
    super(TimeSeriesTransformer, self).__init__()

    builder = TransformerEncoderBuilder.from_kwargs(
        n_layers=n_layers,
        n_heads=n_heads,
        query_dimensions=query_dimensions,
        value_dimensions=value_dimensions,
        feed_forward_dimensions=feed_forward_dimensions)

    # Build a transformer with softmax attention
    builder.attention_type = attention_type
    self.transformer = builder.get()

    project_dimension = query_dimensions * n_heads
    self.project_dimension = project_dimension
    # self.stock_num = stock_num
    # self.stock_mapping_fn = stock_mapping_fn
    # self.model_projection = nn.Conv1d(input_dimensions, project_dimension, 1)
    self.model_projection = nn.Linear(input_dimensions, project_dimension)
    self.positional_encoder = PositionalEncoding(project_dimension)
    self.seq_len = seq_len
    self.loss = MaskedRMSLE()
    self.file_re = file_re
    self.batch_size = batch_size
    self.metric_object = MaskedAPE()
    self.output_projection = nn.Linear(n_heads * value_dimensions, 1)
    # self.output_projection = nn.Conv1d(n_heads * value_dimensions, 1, 1)
    self.filenames = glob.glob(self.file_re)
    self.lr = lr
    self.seed = seed
    np.random.seed(seed)
    # self.split_date = split_date
    # self.end_date = end_date
    # random.shuffle(self.filenames)
    # self.training_files = self.filenames
    # self.valid_files = glob.glob(valid_file_re)
    # if not self.training_files:
    # # if not self.training_files or (not self.valid_files):
    #     raise ValueError(f"input file train {self.training_files} is empty")
    self.num_workers = num_workers
def __init__(self, input_feats, output_feats, n_layers, n_heads, hidden_dim,
             ff_dim, tf_depth=3, dropout=0.15):
    super(TransformerEncoderDecoder, self).__init__()

    self.input_feats = input_feats
    self.output_feats = output_feats
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.hidden_dim = hidden_dim
    self.ff_dim = ff_dim
    self.d_model = hidden_dim * n_heads
    self.tf_depth = tf_depth

    self.pe = PositionalEncoding(self.d_model, dropout=dropout, max_len=4096)
    self.A = nn.GELU()

    encoder_builder = TransformerEncoderBuilder.from_kwargs(
        n_layers=n_layers,
        n_heads=n_heads,
        query_dimensions=hidden_dim,
        value_dimensions=hidden_dim,
        feed_forward_dimensions=ff_dim,
        attention_type='linear',
        dropout=dropout
    )

    decoder_builder = TransformerDecoderBuilder.from_kwargs(
        n_layers=n_layers,
        n_heads=n_heads,
        query_dimensions=hidden_dim,
        value_dimensions=hidden_dim,
        feed_forward_dimensions=ff_dim,
        cross_attention_type='linear',
        self_attention_type='causal-linear',
        dropout=dropout
    )

    self.encoder = encoder_builder.get()
    self.decoder = decoder_builder.get()

    self.src_embed = nn.Linear(self.input_feats, self.d_model)
    self.tgt_embed = nn.Linear(self.output_feats, self.d_model)
    self.final_ff = nn.Linear(self.d_model, self.output_feats)

    self.tgt_mask = fast_transformers.masking.TriangularCausalMask(N=12)
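# Standalone sketch of how an encoder/decoder pair built this way is driven in
# fast-transformers (sizes and the mask usage are illustrative assumptions, not
# taken from the class above): the decoder receives the target sequence, the
# encoder output as memory, and a triangular causal mask over target positions.
import torch
from fast_transformers.builders import (TransformerDecoderBuilder,
                                        TransformerEncoderBuilder)
from fast_transformers.masking import TriangularCausalMask

enc = TransformerEncoderBuilder.from_kwargs(
    n_layers=2, n_heads=4, query_dimensions=32, value_dimensions=32,
    feed_forward_dimensions=256, attention_type='linear').get()
dec = TransformerDecoderBuilder.from_kwargs(
    n_layers=2, n_heads=4, query_dimensions=32, value_dimensions=32,
    feed_forward_dimensions=256, self_attention_type='causal-linear',
    cross_attention_type='linear').get()

src = torch.rand(2, 100, 4 * 32)  # (N, L_src, d_model)
tgt = torch.rand(2, 50, 4 * 32)   # (N, L_tgt, d_model)
memory = enc(src)
out = dec(tgt, memory, x_mask=TriangularCausalMask(tgt.shape[1]))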
def __init__(self, d_model, coords_target, flatten_order_target,
             attention_type="linear", n_layers=4, n_heads=4, d_query=32,
             dropout=0.1, attention_dropout=0.1, d_conv=8):
    super(TRecOnlyFBP, self).__init__()

    self.fbp_fourier_coefficient_embedding = torch.nn.Linear(2, d_model // 2)

    self.pos_embedding_target = PositionalEncoding2D(
        d_model // 2,
        coords=coords_target,
        flatten_order=flatten_order_target
    )

    self.encoder = TransformerEncoderBuilder.from_kwargs(
        attention_type=attention_type,
        n_layers=n_layers,
        n_heads=n_heads,
        feed_forward_dimensions=n_heads * d_query * 4,
        query_dimensions=d_query,
        value_dimensions=d_query,
        dropout=dropout,
        attention_dropout=attention_dropout
    ).get()

    self.predictor_amp = torch.nn.Linear(
        n_heads * d_query,
        1
    )
    self.predictor_phase = torch.nn.Linear(
        n_heads * d_query,
        1
    )

    self.conv_block = torch.nn.Sequential(
        torch.nn.Conv2d(1, d_conv, kernel_size=3, stride=1, padding=1),
        torch.nn.ReLU(),
        torch.nn.BatchNorm2d(d_conv),
        torch.nn.Conv2d(d_conv, 1, kernel_size=1, stride=1, padding=0)
    )
def test_mapping(self):
    t1 = nn.TransformerEncoder(
        nn.TransformerEncoderLayer(128, 4, dim_feedforward=256),
        4
    )
    t2 = TransformerEncoderBuilder.from_kwargs(
        n_layers=4,
        n_heads=4,
        query_dimensions=128 // 4,
        value_dimensions=128 // 4,
        feed_forward_dimensions=256,
        attention_type="full",
        final_normalization=False
    ).get()
    t1.eval()
    t2.eval()

    with self.assertRaises(RuntimeError):
        t2.load_state_dict(t1.state_dict())
    t2.load_state_dict(PytorchMapper().map(t1.state_dict()))

    x = torch.rand(3, 10, 128)
    o1 = t2(x)
    o2 = t1(x.permute(1, 0, 2)).permute(1, 0, 2)
    self.assertLess(torch.abs(o1 - o2).max().item(), 1e-5)
def __init__(self, d_model, coords, flatten_order, attention_type="linear",
             n_layers=4, n_heads=4, d_query=32, dropout=0.1,
             attention_dropout=0.1):
    super(SResTransformerTrain, self).__init__()

    self.fourier_coefficient_embedding = torch.nn.Linear(2, d_model // 2)

    self.pos_embedding = PositionalEncoding2D(
        d_model // 2,
        coords=coords,
        flatten_order=flatten_order,
        persistent=False
    )

    self.encoder = TransformerEncoderBuilder.from_kwargs(
        attention_type=attention_type,
        n_layers=n_layers,
        n_heads=n_heads,
        feed_forward_dimensions=n_heads * d_query * 4,
        query_dimensions=d_query,
        value_dimensions=d_query,
        dropout=dropout,
        attention_dropout=attention_dropout
    ).get()

    self.predictor_amp = torch.nn.Linear(
        n_heads * d_query,
        1
    )
    self.predictor_phase = torch.nn.Linear(
        n_heads * d_query,
        1
    )
def __init__(self, d_model, sequence_length, mixtures, attention_type="full",
             n_layers=4, n_heads=4, d_query=32, dropout=0.1, softmax_temp=None,
             attention_dropout=0.1, bits=32, rounds=4, chunk_size=32,
             masked=True):
    super(ImageGenerator, self).__init__()

    self.pos_embedding = PositionalEncoding(
        d_model // 2,
        max_len=sequence_length
    )
    self.value_embedding = torch.nn.Embedding(256, d_model // 2)
    self.transformer = TransformerEncoderBuilder.from_kwargs(
        attention_type=attention_type,
        n_layers=n_layers,
        n_heads=n_heads,
        feed_forward_dimensions=n_heads * d_query * 4,
        query_dimensions=d_query,
        value_dimensions=d_query,
        dropout=dropout,
        softmax_temp=softmax_temp,
        attention_dropout=attention_dropout,
        bits=bits,
        rounds=rounds,
        chunk_size=chunk_size,
        masked=masked
    ).get()

    hidden_size = n_heads * d_query
    self.predictor = torch.nn.Linear(hidden_size, mixtures * 3)
def __init__(self, input_shape, bins, width=128, depth=2, heads=1,
             attn_dropout=0.0, resid_dropout=0.0, emb_dropout=0.0, mask=True,
             zero_out=False, pos_init=False, x_cond=False, y_cond=False,
             encoder_dims=0, only_encode=False, merged_decoder=False,
             prime_len=None, m_attn=0.25, m_mlp=1, init_scale=1.0,
             checkpoint_res=0, train=False):
    super().__init__()
    self.input_shape = input_shape
    self.input_dims = input_dims = np.prod(input_shape)
    self.encoder_dims = encoder_dims
    self.bins = bins
    self.width = width
    self.depth = depth

    self.x_emb = nn.Embedding(bins, width)
    nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
    self.x_emb_dropout = nn.Dropout(emb_dropout)
    self.y_cond = y_cond
    self.x_cond = x_cond
    if not y_cond:
        self.start_token = nn.Parameter(
            get_normal(1, width, std=0.01 * init_scale))

    self.pos_emb = PositionEmbedding(input_shape=input_shape, width=width,
                                     init_scale=init_scale, pos_init=pos_init)
    self.pos_emb_dropout = nn.Dropout(emb_dropout)

    if train:
        self.transformer = TransformerEncoderBuilder.from_kwargs(
            n_layers=depth,
            n_heads=heads,
            feed_forward_dimensions=int(m_mlp * width),
            model_dimensions=width,
            query_dimensions=int(m_attn * width) // heads,
            value_dimensions=int(m_attn * width) // heads,
            activation='gelu',
            dropout=attn_dropout,
            attention_type="causal-linear",
        ).get()
    else:
        # encoder (inference)
        print(' [o] using RNN backend.')
        self.transformer = RecurrentEncoderBuilder.from_kwargs(
            n_layers=depth,
            n_heads=heads,
            model_dimensions=width,
            feed_forward_dimensions=int(m_mlp * width),
            query_dimensions=int(m_attn * width) // heads,
            value_dimensions=int(m_attn * width) // heads,
            dropout=attn_dropout,
            activation='gelu',
            attention_type="causal-linear",
        ).get()

    self.only_encode = only_encode
    self.prime_len = prime_len
    if merged_decoder:
        # Merged piped model uses this setup
        self.add_cond_after_transformer = False
        self.share_x_emb_x_out = False
    else:
        self.add_cond_after_transformer = True
        self.share_x_emb_x_out = True

    if not only_encode:
        self.x_out = nn.Linear(width, bins, bias=False)
        if self.share_x_emb_x_out:
            self.x_out.weight = self.x_emb.weight
        self.loss = t.nn.CrossEntropyLoss()
def __init__(self, n_token, is_training=True):
    super(TransformerModel, self).__init__()

    # --- params config --- #
    self.n_token = n_token
    self.d_model = D_MODEL
    self.n_layer = N_LAYER  #
    self.dropout = 0.1
    self.n_head = N_HEAD  #
    self.d_head = D_MODEL // N_HEAD
    self.d_inner = 2048
    self.loss_func = nn.CrossEntropyLoss(reduction='none')
    self.emb_sizes = [128, 256, 64, 32, 512, 128, 128]

    # --- modules config --- #
    # embeddings
    print('>>>>>:', self.n_token)
    self.word_emb_tempo = Embeddings(self.n_token[0], self.emb_sizes[0])
    self.word_emb_chord = Embeddings(self.n_token[1], self.emb_sizes[1])
    self.word_emb_barbeat = Embeddings(self.n_token[2], self.emb_sizes[2])
    self.word_emb_type = Embeddings(self.n_token[3], self.emb_sizes[3])
    self.word_emb_pitch = Embeddings(self.n_token[4], self.emb_sizes[4])
    self.word_emb_duration = Embeddings(self.n_token[5], self.emb_sizes[5])
    self.word_emb_velocity = Embeddings(self.n_token[6], self.emb_sizes[6])
    self.pos_emb = PositionalEncoding(self.d_model, self.dropout)

    # linear
    self.in_linear = nn.Linear(np.sum(self.emb_sizes), self.d_model)

    # encoder
    if is_training:
        # encoder (training)
        self.transformer_encoder = TransformerEncoderBuilder.from_kwargs(
            n_layers=self.n_layer,
            n_heads=self.n_head,
            query_dimensions=self.d_model // self.n_head,
            value_dimensions=self.d_model // self.n_head,
            feed_forward_dimensions=2048,
            activation='gelu',
            dropout=0.1,
            attention_type="causal-linear",
        ).get()
    else:
        # encoder (inference)
        print(' [o] using RNN backend.')
        self.transformer_encoder = RecurrentEncoderBuilder.from_kwargs(
            n_layers=self.n_layer,
            n_heads=self.n_head,
            query_dimensions=self.d_model // self.n_head,
            value_dimensions=self.d_model // self.n_head,
            feed_forward_dimensions=2048,
            activation='gelu',
            dropout=0.1,
            attention_type="causal-linear",
        ).get()

    # blend with type
    self.project_concat_type = nn.Linear(self.d_model + 32, self.d_model)

    # individual output
    self.proj_tempo = nn.Linear(self.d_model, self.n_token[0])
    self.proj_chord = nn.Linear(self.d_model, self.n_token[1])
    self.proj_barbeat = nn.Linear(self.d_model, self.n_token[2])
    self.proj_type = nn.Linear(self.d_model, self.n_token[3])
    self.proj_pitch = nn.Linear(self.d_model, self.n_token[4])
    self.proj_duration = nn.Linear(self.d_model, self.n_token[5])
    self.proj_velocity = nn.Linear(self.d_model, self.n_token[6])
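# Sketch of how the two backends above differ at call time (sizes are
# illustrative assumptions): the batch encoder consumes whole sequences during
# training, while the recurrent encoder consumes one token embedding per step
# and carries an explicit state during autoregressive inference.
import torch
from fast_transformers.builders import (RecurrentEncoderBuilder,
                                        TransformerEncoderBuilder)

kwargs = dict(n_layers=2, n_heads=4, query_dimensions=32, value_dimensions=32,
              feed_forward_dimensions=256, activation='gelu', dropout=0.1,
              attention_type="causal-linear")
train_encoder = TransformerEncoderBuilder.from_kwargs(**kwargs).get()
infer_encoder = RecurrentEncoderBuilder.from_kwargs(**kwargs).get()

y = train_encoder(torch.rand(2, 100, 4 * 32))  # full sequence: (N, L, d_model)

state = None
for _ in range(100):
    # one step at a time: input is (N, d_model); the returned state carries
    # the running attention summaries between steps
    y_t, state = infer_encoder(torch.rand(2, 4 * 32), state)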
def __init__(self, embedding_dim, hidden_dim, problem, attention_type,
             n_encode_layers=2, feed_forward_dim=512, tanh_clipping=10.,
             mask_inner=True, mask_logits=True, normalization='batch',
             n_heads=8, encoding_knn_size=None, decoding_knn_size=None,
             checkpoint_encoder=False, shrink_size=None):
    super(AttentionModel, self).__init__()

    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.n_encode_layers = n_encode_layers
    self.decode_type = None
    self.temp = 1.0
    self.allow_partial = problem.NAME == 'sdvrp'
    self.is_vrp = problem.NAME == 'cvrp' or problem.NAME == 'sdvrp'
    self.is_orienteering = problem.NAME == 'op'
    self.is_pctsp = problem.NAME == 'pctsp'

    self.tanh_clipping = tanh_clipping
    self.mask_inner = mask_inner
    self.mask_logits = mask_logits
    self.problem = problem
    self.n_heads = n_heads
    self.checkpoint_encoder = checkpoint_encoder
    self.shrink_size = shrink_size

    # Problem specific context parameters (placeholder and step context dimension)
    if self.is_vrp or self.is_orienteering or self.is_pctsp:
        # Embedding of last node + remaining_capacity / remaining length / remaining prize to collect
        step_context_dim = embedding_dim + 1

        if self.is_pctsp:
            node_dim = 4  # x, y, expected_prize, penalty
        else:
            node_dim = 3  # x, y, demand / prize

        # Special embedding projection for depot node
        self.init_embed_depot = nn.Linear(2, embedding_dim)

        if self.is_vrp and self.allow_partial:
            # Need to include the demand if split delivery allowed
            self.project_node_step = nn.Linear(1, 3 * embedding_dim, bias=False)
    else:  # TSP
        assert problem.NAME == "tsp", "Unsupported problem: {}".format(
            problem.NAME)
        step_context_dim = 2 * embedding_dim  # Embedding of first and last node
        node_dim = 2  # x, y

        # Learned input symbols for first action
        self.W_placeholder = nn.Parameter(torch.Tensor(2 * embedding_dim))
        self.W_placeholder.data.uniform_(
            -1, 1)  # Placeholder should be in range of activations

    self.encoding_knn_size = encoding_knn_size
    self.decoding_knn_size = decoding_knn_size

    self.init_embed = nn.Linear(node_dim, embedding_dim)

    self.attention_type = attention_type
    if attention_type == 'original':
        self.embedder = GraphAttentionEncoder(
            n_heads=n_heads,
            embed_dim=embedding_dim,
            feed_forward_dim=feed_forward_dim,
            n_layers=n_encode_layers,
            normalization=normalization)
    else:
        self.embedder = TransformerEncoderBuilder.from_kwargs(
            n_layers=n_encode_layers,
            n_heads=n_heads,
            query_dimensions=embedding_dim // n_heads,
            value_dimensions=embedding_dim // n_heads,
            feed_forward_dimensions=512,
            attention_dropout=0.0,
            local_context=20,
            clusters=5,
            topk=20,
            feature_map=Favor.factory(n_dims=128),
            attention_type=attention_type).get()

    # For each node we compute (glimpse key, glimpse value, logit key) so 3 * embedding_dim
    self.project_node_embeddings = nn.Linear(embedding_dim, 3 * embedding_dim, bias=False)
    self.project_fixed_context = nn.Linear(embedding_dim, embedding_dim, bias=False)
    self.project_step_context = nn.Linear(step_context_dim, embedding_dim, bias=False)
    assert embedding_dim % n_heads == 0
    # Note n_heads * val_dim == embedding_dim so input to project_out is embedding_dim
    self.project_out = nn.Linear(embedding_dim, embedding_dim, bias=False)
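# Note on the feature_map argument used above (a minimal sketch; the concrete
# sizes and the import of Favor from the fast-transformers feature map module
# are assumptions): Favor.factory(...) returns a feature-map constructor that
# the builder hands to the linear attention.
import torch
from fast_transformers.builders import TransformerEncoderBuilder
from fast_transformers.feature_maps import Favor

embedder = TransformerEncoderBuilder.from_kwargs(
    n_layers=2,
    n_heads=8,
    query_dimensions=128 // 8,
    value_dimensions=128 // 8,
    feed_forward_dimensions=512,
    attention_type="linear",
    feature_map=Favor.factory(n_dims=128)
).get()

y = embedder(torch.rand(2, 50, 128))  # (N, num_nodes, embedding_dim)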
def __init__(self, num_feats, num_output_points, lstm_layers, n_layers,
             n_heads, hidden_dim, ff_dim, tf_depth=3, dropout=0.15):
    super(SetTransformer, self).__init__()

    self.num_feats = num_feats
    self.k = num_output_points

    def dup(x):
        return (x, x) if type(x) == int else x

    self.lstm_layers = lstm_layers
    self.n_layers = dup(n_layers)
    self.n_heads = dup(n_heads)
    self.hidden_dim = dup(hidden_dim)
    self.ff_dim = dup(ff_dim)
    self.tf_depth = dup(tf_depth)
    self.d_model = [self.hidden_dim[i] * self.n_heads[i] for i in [0, 1]]

    encoder_builder_pre = TransformerEncoderBuilder.from_kwargs(
        n_layers=self.n_layers[0],
        n_heads=self.n_heads[0],
        query_dimensions=self.hidden_dim[0],
        value_dimensions=self.hidden_dim[0],
        feed_forward_dimensions=self.ff_dim[0],
        attention_type='linear',
        dropout=dropout)

    encoder_builder_post = TransformerEncoderBuilder.from_kwargs(
        n_layers=self.n_layers[1],
        n_heads=self.n_heads[1],
        query_dimensions=self.hidden_dim[1],
        value_dimensions=self.hidden_dim[1],
        feed_forward_dimensions=self.ff_dim[1],
        attention_type='linear',
        dropout=dropout)

    self.seeds = nn.Parameter(torch.normal(0, 1, (self.k, self.d_model[0])))

    self.encoder_pre = encoder_builder_pre.get()
    self.encoder_post = encoder_builder_post.get()
    self.initial_ff = nn.Linear(self.num_feats, self.d_model[0])
    # self.pos_encoding = PositionalEncoding(self.d_model[0], dropout=dropout)
    self.lstm = nn.LSTM(self.d_model[0], self.d_model[0], 2,
                        batch_first=True, bidirectional=False)
    self.attn_pooling = AttentionLayer(LinearAttention(self.d_model[0]),
                                       self.d_model[0], self.n_heads[0])
    self.final_ff = nn.Linear(self.d_model[1], self.num_feats)

    # init masks to meaningless values, doesn't matter what. these are all empty anyway.
    self.mask = FullMask(N=self.k, M=5)
    self.kl_mask = LengthMask(torch.ones(5) * 5)
    self.ql_mask = LengthMask(torch.ones(5) * self.k)