def test_cutoffs_no_projection_bind(self):
    """Save/load round-trip: cutoffs with div_val, projections disabled,
    softmax weights bound to the embedding layer's outputs."""
    inputs = keras.layers.Input(shape=(None, ))
    embed_outputs = AdaptiveEmbedding(
        input_dim=30,
        output_dim=8,
        cutoffs=[10, 20, 25],
        div_val=2,
        mask_zero=True,
        force_projection=False,
        return_embeddings=True,
        return_projections=True,
    )(inputs)
    probs = AdaptiveSoftmax(
        input_dim=8,
        output_dim=30,
        cutoffs=[10, 20, 25],
        div_val=2,
        force_projection=False,
        bind_embeddings=True,
        bind_projections=True,
    )(embed_outputs)
    model = keras.models.Model(inputs, probs)
    save_path = os.path.join(
        tempfile.gettempdir(),
        'test_ada_softmax_%f.h5' % np.random.random())
    model.save(save_path)
    custom_objects = {
        'AdaptiveEmbedding': AdaptiveEmbedding,
        'AdaptiveSoftmax': AdaptiveSoftmax,
    }
    model = keras.models.load_model(save_path, custom_objects=custom_objects)
    model.summary()
def test_force_projection_no_binding(self):
    """Save/load round-trip with forced projections and no weight binding."""
    token_input = keras.layers.Input(shape=(None, ))
    embedding = AdaptiveEmbedding(
        input_dim=3,
        output_dim=16,
        force_projection=True,
        return_embeddings=True,
        return_projections=True,
    )(token_input)
    output = AdaptiveSoftmax(
        input_dim=16,
        output_dim=3,
        force_projection=True,
    )(embedding)
    model = keras.models.Model(token_input, output)
    file_name = 'test_ada_softmax_%f.h5' % np.random.random()
    model_path = os.path.join(tempfile.gettempdir(), file_name)
    model.save(model_path)
    model = keras.models.load_model(
        model_path,
        custom_objects={
            'AdaptiveEmbedding': AdaptiveEmbedding,
            'AdaptiveSoftmax': AdaptiveSoftmax,
        })
    model.summary()
def test_fit(self):
    """Train an adaptive embedding + adaptive softmax autoencoder and check
    it reconstructs its input after a save/load round-trip."""
    token_input = keras.layers.Input(shape=(None, ))
    embed_results = AdaptiveEmbedding(
        input_dim=30,
        output_dim=32,
        cutoffs=[5, 15, 25],
        div_val=2,
        return_embeddings=True,
        return_projections=True,
        mask_zero=True,
    )(token_input)
    hidden = keras.layers.Dense(
        units=32,
        activation='tanh',
    )(embed_results[0])
    # The softmax consumes the dense features plus the embedding/projection
    # tensors returned by the embedding layer, so the weights can be bound.
    predictions = AdaptiveSoftmax(
        input_dim=32,
        output_dim=30,
        cutoffs=[5, 15, 25],
        div_val=2,
        bind_embeddings=True,
        bind_projections=True,
    )([hidden] + embed_results[1:])
    model = keras.models.Model(inputs=token_input, outputs=predictions)
    model.compile('adam', 'sparse_categorical_crossentropy')
    model.summary()
    train_x = np.random.randint(0, 30, (4096, 10))
    train_y = np.expand_dims(train_x, axis=-1)
    early_stop = keras.callbacks.EarlyStopping(
        monitor='loss', min_delta=1e-4, patience=2)
    model.fit(
        train_x,
        train_y,
        epochs=100,
        callbacks=[early_stop],
    )
    model = keras.models.Model(token_input, predictions)
    model_path = os.path.join(
        tempfile.gettempdir(),
        'test_ada_softmax_%f.h5' % np.random.random())
    model.save(model_path)
    model = keras.models.load_model(model_path, custom_objects={
        'AdaptiveEmbedding': AdaptiveEmbedding,
        'AdaptiveSoftmax': AdaptiveSoftmax,
    })
    test_x = np.random.randint(0, 30, (128, 10))
    predicted = model.predict(test_x).argmax(axis=-1)
    # Token 0 is masked (mask_zero=True), so zero out those positions
    # before comparing against the inputs.
    predicted *= np.not_equal(test_x, 0).astype('int32')
    mismatches = np.sum(np.not_equal(test_x, predicted))
    self.assertLess(mismatches, 5)
def test_sample_default(self):
    """With default settings, looking up tokens [0, 1, 2] should reproduce
    the returned embedding table exactly."""
    token_input = keras.layers.Input(shape=(None, ))
    results = AdaptiveEmbedding(
        input_dim=3,
        output_dim=16,
        return_embeddings=True,
        return_projections=True,
    )(token_input)
    compute = K.function([token_input], results)
    values = compute([np.array([[0, 1, 2]])])
    self.assertTrue(np.allclose(values[0], values[1]))
def test_single_projection(self):
    """Save/load round-trip when embed_dim != output_dim forces a single
    projection matrix."""
    token_input = keras.layers.Input(shape=(None, ))
    outputs = AdaptiveEmbedding(
        input_dim=3,
        output_dim=16,
        embed_dim=5,
        return_embeddings=True,
        return_projections=True,
    )(token_input)
    model = keras.models.Model(token_input, outputs)
    save_path = os.path.join(
        tempfile.gettempdir(),
        'test_ada_embed_%f.h5' % np.random.random())
    model.save(save_path)
    model = keras.models.load_model(
        save_path,
        custom_objects={'AdaptiveEmbedding': AdaptiveEmbedding})
    model.summary()
def build_albert(token_num,
                 pos_num=512,
                 seq_len=512,
                 embed_dim=128,
                 hidden_dim=768,
                 transformer_num=12,
                 head_num=12,
                 feed_forward_dim=3072,
                 dropout_rate=0.1,
                 attention_activation=None,
                 feed_forward_activation='gelu',
                 training=True,
                 trainable=None,
                 output_layers=None):
    """Get ALBERT model.

    See: https://arxiv.org/pdf/1909.11942.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param hidden_dim: Dimensions of hidden layers.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention
                     in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer
                             in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned
                     if it is `True`, otherwise the input layers and the last
                     feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layers: A list of indices of output layers.
    :return: A compiled-free `keras.models.Model` when `training` is True or
             `output_layers` is given; otherwise `(inputs, transformed)` —
             the input layers and the last feature tensor.
    """
    # Resolve the string shortcut 'gelu' to the actual activation callable.
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        # `trainable` may be a collection of layer-name prefixes: a layer is
        # trainable iff its name starts with one of them. Otherwise it is
        # treated as a plain boolean applied to every layer.
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    # Build inputs
    input_token = keras.layers.Input(shape=(seq_len, ), name='Input-Token')
    input_segment = keras.layers.Input(shape=(seq_len, ), name='Input-Segment')
    inputs = [input_token, input_segment]
    # Build embeddings: the adaptive embedding factorizes the token embedding
    # (embed_dim -> hidden_dim) and also returns its weight and projection
    # tensors so the MLM softmax below can bind to them.
    embed_token, embed_weights, embed_projection = AdaptiveEmbedding(
        input_dim=token_num,
        output_dim=hidden_dim,
        embed_dim=embed_dim,
        mask_zero=True,
        trainable=trainable,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(input_token)
    embed_segment = keras.layers.Embedding(
        input_dim=2,
        output_dim=hidden_dim,
        trainable=trainable,
        name='Embed-Segment',
    )(input_segment)
    embed_layer = keras.layers.Add(name='Embed-Token-Segment')(
        [embed_token, embed_segment])
    embed_layer = PositionEmbedding(
        input_dim=pos_num,
        output_dim=hidden_dim,
        mode=PositionEmbedding.MODE_ADD,
        trainable=trainable,
        name='Embedding-Position',
    )(embed_layer)
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)
    # Build shared transformer: the same attention, normalization and
    # feed-forward layer instances are applied in every block below —
    # ALBERT's cross-layer parameter sharing.
    attention_layer = MultiHeadAttention(
        head_num=head_num,
        activation=attention_activation,
        name='Attention',
    )
    attention_normal = LayerNormalization(name='Attention-Normal')
    feed_forward_layer = FeedForward(
        units=feed_forward_dim,
        activation=feed_forward_activation,
        name='Feed-Forward')
    feed_forward_normal = LayerNormalization(name='Feed-Forward-Normal')
    transformed = embed_layer
    transformed_layers = []
    for i in range(transformer_num):
        # Self-attention sub-layer with residual connection.
        attention_input = transformed
        transformed = attention_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Attention-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Attention-Add-{}'.format(i + 1),
        )([attention_input, transformed])
        transformed = attention_normal(transformed)
        # Feed-forward sub-layer with residual connection.
        feed_forward_input = transformed
        transformed = feed_forward_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Feed-Forward-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Feed-Forward-Add-{}'.format(i + 1),
        )([feed_forward_input, transformed])
        transformed = feed_forward_normal(transformed)
        transformed_layers.append(transformed)
    if training:
        # Build tasks: masked language model (MLM) head sharing the embedding
        # weights, plus a sentence-order prediction (SOP) head on position 0.
        mlm_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = AdaptiveSoftmax(
            input_dim=hidden_dim,
            output_dim=token_num,
            embed_dim=embed_dim,
            bind_embeddings=True,
            bind_projections=True,
            name='MLM-Sim',
        )([mlm_norm_layer, embed_weights, embed_projection])
        # NOTE(review): the mask positions are taken from inputs[-1], which is
        # the segment input — confirm callers feed the MLM mask through that
        # input (there is no dedicated 'Input-Masked' layer here).
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation='tanh',
            name='SOP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='SOP',
        )(nsp_dense_layer)
        model = keras.models.Model(
            inputs=inputs,
            outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    if output_layers is not None:
        # Either concatenate several intermediate block outputs, or return a
        # single block output selected by index.
        if isinstance(output_layers, list):
            output_layers = [
                transformed_layers[index] for index in output_layers
            ]
            output = keras.layers.Concatenate(name='Output', )(output_layers)
        else:
            output = transformed_layers[output_layers]
        model = keras.models.Model(inputs=inputs, outputs=output)
        return model
    # Feature-extraction mode: build the model only to set layer
    # trainability, then hand back the raw input layers and features.
    model = keras.models.Model(inputs=inputs, outputs=transformed)
    for layer in model.layers:
        layer.trainable = _trainable(layer)
    return inputs, transformed
def build_transformer_xl(units,
                         embed_dim,
                         hidden_dim,
                         num_token,
                         num_block,
                         num_head,
                         batch_size,
                         memory_len,
                         target_len,
                         dropout=0.0,
                         attention_dropout=0.0,
                         cutoffs=None,
                         div_val=1,
                         force_projection=None,
                         bind_embeddings=True,
                         bind_projections=True,
                         clamp_len=None,
                         share_biases=True):
    """Build transformer-XL model.

    :param units: Units inside the transformer.
    :param embed_dim: Dimension of embeddings.
    :param hidden_dim: Dimension inside position-wise feed-forward layer.
    :param num_token: Number of distinct input tokens.
    :param num_block: Number of basic encoder blocks.
    :param num_head: Number of heads for attention.
    :param batch_size: Maximum batch size.
    :param memory_len: The maximum length of memories.
    :param target_len: The length of prediction block.
    :param dropout: General dropout rate.
    :param attention_dropout: Dropout rate inside attention layer.
    :param cutoffs: Cutoffs of adaptive embedding.
    :param div_val: Scale factor of adaptive embedding.
    :param force_projection: Add projection when the dimensions are equal.
    :param bind_embeddings: Whether to bind embeddings to adaptive softmax.
    :param bind_projections: Whether to bind projections to adaptive softmax.
    :param clamp_len: The maximum value of relative position.
    :param share_biases: Whether to use the same biases for all layers.
    :return: The built model.
    """
    token_input = keras.layers.Input(shape=(target_len,), name='Input-Token')
    memory_length_input = keras.layers.Input(shape=(1,), name='Input-Memory-Length')
    inputs = [token_input, memory_length_input]
    # The adaptive embedding returns the looked-up embeddings first, followed
    # by its weight/projection tensors, which are later fed to the adaptive
    # softmax so the output layer can share (bind) them.
    results = AdaptiveEmbedding(
        input_dim=num_token,
        output_dim=units,
        embed_dim=embed_dim,
        cutoffs=cutoffs,
        div_val=div_val,
        mask_zero=True,
        force_projection=force_projection,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(token_input)
    token_embed, embedding_weights = results[0], results[1:]
    # Scale embeddings by sqrt(units) before entering the blocks.
    token_embed = Scale(scale=np.sqrt(units), name='Embed-Token-Scaled')(token_embed)
    # Memory of the embedding output; each block's attention attends over the
    # memory produced from the previous block's output.
    last_memory = Memory(
        batch_size=batch_size,
        memory_len=memory_len,
        target_len=target_len,
        output_dim=units,
        name='Memory-0',
    )([token_embed, memory_length_input])
    position_embed = PositionalEmbedding(
        output_dim=units,
        clamp_len=clamp_len,
        name='Embed-Position',
    )([token_input, last_memory])
    if 0.0 < dropout < 1.0:
        token_embed = keras.layers.Dropout(rate=dropout,
                                           name='Embed-Token-Dropped')(token_embed)
        position_embed = keras.layers.Dropout(rate=dropout,
                                              name='Embed-Position-Dropped')(position_embed)
    context_bias, relative_bias = None, None
    if share_biases:
        # One shared pair of content/position biases across all blocks.
        context_bias, relative_bias = RelativeBias(units=units,
                                                   name='Biases')(last_memory)
    outputs = [token_embed]
    for i in range(num_block):
        block_input, block_output = outputs[-1], outputs[-1]
        if not share_biases:
            # Fresh biases per block when sharing is disabled.
            context_bias, relative_bias = RelativeBias(units=units,
                                                       name='Biases-{}'.format(i + 1))(last_memory)
        # Relative multi-head self-attention sub-layer with residual + norm.
        block_output = RelativePartialMultiHeadSelfAttention(
            units=units,
            num_head=num_head,
            use_bias=False,
            attention_dropout=attention_dropout,
            name='Attention-{}'.format(i + 1),
        )([block_output, position_embed, last_memory, context_bias, relative_bias])
        if 0.0 < dropout < 1.0:
            block_output = keras.layers.Dropout(rate=dropout,
                                                name='Attention-Dropped-{}'.format(i + 1))(block_output)
        block_output = keras.layers.Add(name='Attention-Res-{}'.format(i + 1))([block_input, block_output])
        block_output = LayerNormalization(name='Attention-Norm-{}'.format(i + 1))(block_output)
        # Position-wise feed-forward sub-layer with residual + norm.
        block_input = block_output
        block_output = FeedForward(
            units=hidden_dim,
            dropout_rate=dropout,
            name='FeedForward-{}'.format(i + 1),
        )(block_output)
        if 0.0 < dropout < 1.0:
            block_output = keras.layers.Dropout(rate=dropout,
                                                name='FeedForward-Dropped-{}'.format(i + 1))(block_output)
        block_output = keras.layers.Add(name='FeedForward-Res-{}'.format(i + 1))([block_input, block_output])
        block_output = LayerNormalization(name='FeedForward-Norm-{}'.format(i + 1))(block_output)
        # Update the memory for the next block (the last block needs none).
        if i < num_block - 1:
            last_memory = Memory(
                batch_size=batch_size,
                memory_len=memory_len,
                target_len=target_len,
                output_dim=units,
                name='Memory-{}'.format(i + 1),
            )([block_output, memory_length_input])
        outputs.append(block_output)
    if 0.0 < dropout < 1.0:
        outputs[-1] = keras.layers.Dropout(rate=dropout,
                                           name='Output-Dropped')(outputs[-1])
    # The adaptive softmax consumes the final features plus the embedding
    # weight/projection tensors returned by the embedding layer above.
    softmax = AdaptiveSoftmax(
        input_dim=units,
        output_dim=num_token,
        embed_dim=embed_dim,
        cutoffs=cutoffs,
        div_val=div_val,
        force_projection=force_projection,
        bind_embeddings=bind_embeddings,
        bind_projections=bind_projections,
        name='Softmax',
    )(outputs[-1:] + embedding_weights)
    model = keras.models.Model(inputs=inputs, outputs=softmax)
    return model
def test_sample_cutoffs(self):
    """Check adaptive-embedding lookups against precomputed expected values.

    With input_dim=30, cutoffs=[10, 20, 25] and div_val=2, the vocabulary is
    split into four clusters whose embedding widths shrink by div_val per
    cluster (8, 4, 2, 1 here, with embed_dim=8); each cluster also carries a
    projection to the common output_dim=3. Fixed weights are injected via
    ``set_weights`` and the lookup of all 30 tokens is compared to the
    expected projections.
    """
    # Cluster 0: tokens 0-9, embedding width 8.
    embed_0 = np.array([
        [
            0.7562694862279867, -0.7532437781410828, -0.2882295795429552,
            -1.6990371818805843, -0.09864164817566004, -0.5235034477186453,
            -1.600153091413999, 0.03441732751250957,
        ],
        [
            -0.3680529905261407, 1.1673600332887637, -0.6914459306809843,
            -0.7645030146906124, 2.0434827620248606, -0.2743642839675437,
            0.04834288951969495, -1.0368596183756285,
        ],
        [
            -0.8440324158987662, 0.05585795322288273, -0.5827731797867599,
            1.502853709909658, -0.09311037618863122, 1.366316512453695,
            -0.3834091917878978, -1.2647642860801802,
        ],
        [
            1.5212768184170435, -0.7854311748221854, -0.4674213048014483,
            -1.0460200278367862, 0.3705555995848165, -0.12273261562651422,
            1.8138708310050653, -0.26957084415202764,
        ],
        [
            -0.15162771245260723, -0.19654664890167275, -1.77930041719533,
            -0.6987101769248606, 0.32681036318004547, 0.19156716698736181,
            0.8386004334587568, -1.8390076172747616,
        ],
        [
            -1.1363779747587972, -0.15233131547247872, 0.158423477487577,
            -0.6984487776859649, 1.2424950830966563, -0.16130616338419873,
            -1.6298737099566283, 1.7229575808498785,
        ],
        [
            0.613169803410901, -1.5391239758406403, -1.2476893436624792,
            -0.05514513857644962, -0.5537408608863357, -0.9965187549427492,
            -0.6842234254089083, -1.2420165307152238,
        ],
        [
            -0.4086071455923046, -0.7286151488450243, 1.2938629380821804,
            0.7450912596769113, -0.13042129128885002, -1.4269400640112133,
            -0.713571658756806, -0.5036154349645774,
        ],
        [
            0.7326026846217363, 0.12752591749386982, 0.7387086112901271,
            -1.4161019970745967, -0.6396944907214142, -2.0010110577965783,
            0.5843029435066284, -0.4033331631189724,
        ],
        [
            1.22301664512685, -0.024541032664251092, -0.27128167541306714,
            1.910258142043872, -0.9673069099782774, 0.6614265651081772,
            -1.165650716838653, -0.5085143504562967,
        ],
    ])
    # Cluster 1: tokens 10-19, embedding width 4.
    embed_1 = np.array([
        [
            0.6593494357199338, -0.06233478795012013,
            0.3394579881849406, 0.05894554241531747
        ],
        [
            1.0015451559801243, 0.7487130375684998,
            -0.4244371286817957, -0.45182923128222996
        ],
        [
            -0.41965070720383035, -0.2875756074838825,
            1.8712603426351773, 2.531083895835167
        ],
        [
            -0.6800689195006436, -0.39454047242128376,
            0.5442439581019961, -0.21672610899025968
        ],
        [
            -1.3119449289237803, 1.5645034642903253,
            1.3203132828621442, 1.7673879116655695
        ],
        [
            -0.8817194029613362, -0.6655645822150862,
            0.2341787847442309, -0.7641095447924122
        ],
        [
            -0.47497798682688624, 1.0109350638555383,
            -0.5514102704837403, -0.1450007600387442
        ],
        [
            -0.531267085230172, 0.12862169808408846,
            0.18339345878624577, 1.5279135983387981
        ],
        [
            0.43338928943049837, 0.2660771849859784,
            1.4227633495535283, -0.5072818940455809
        ],
        [
            0.8704222505796531, 0.9361117741463981,
            0.7442665348863866, 0.91392694614948
        ],
    ])
    # Cluster 2: tokens 20-24, embedding width 2.
    embed_2 = np.array([
        [1.2712292341556446, 1.009655780936284],
        [0.4420362222435132, 1.5186087787070979],
        [-0.10018465175352317, -0.09182475290216006],
        [-1.246047485363712, 1.6404603895987184],
        [1.4427767754835976, 1.2102150762070925],
    ])
    # Cluster 3: tokens 25-29, embedding width 1.
    embed_3 = np.array([
        [0.8285545743394414],
        [0.7111875779008273],
        [0.35799413043562894],
        [-0.15005629449852656],
        [0.6263946579941496],
    ])
    # Per-cluster projections to the shared output_dim=3.
    proj_0 = np.array([
        [0.3409731658714878, 0.032745006392315756, 0.668797744010083],
        [-0.3082491589087075, -1.0028023345331745, 0.2122102239605163],
        [-0.3751562822576601, -0.5825445529201775, 0.43389258576225614],
        [0.26067868083146517, 0.8192897299406429, 0.073726048897453],
        [1.1346146882950412, -2.456072992985481, -0.054474463562940736],
        [-1.0283521269636255, -0.1983876737118115, 1.0132159972212373],
        [2.72334361610427, 0.5683724225575054, 2.403638230905517],
        [-0.2137114185905606, 0.3048293347650425, 1.510425235737199],
    ])
    proj_1 = np.array([
        [0.42186259731067743, 0.6034344571434473, 2.362015513199549],
        [-0.9313583984951119, -0.8242699945665621, 0.2596454482698166],
        [0.8871149648450185, -0.663397984939589, -1.195129355668761],
        [0.8016784490871957, 0.13830808473255815, -0.6580242457235711],
    ])
    proj_2 = np.array([
        [1.4802477891158519, 0.12638370704617574, -0.18503256737397666],
        [-0.3900434531439191, 0.14771223879593204, -0.8863321455068343],
    ])
    proj_3 = np.array(
        [[-0.589729339138385, 2.018799784975004, -0.08431336326635828]])
    input_layer = keras.layers.Input(shape=(None, ))
    embed_layer = AdaptiveEmbedding(
        input_dim=30,
        output_dim=3,
        embed_dim=8,
        cutoffs=[10, 20, 25],
        div_val=2,
        mask_zero=True,
        return_embeddings=True,
        return_projections=True,
    )
    # Build the function first so the layer's weights exist, then overwrite
    # them with the fixed fixtures above.
    func = K.function([input_layer], embed_layer(input_layer))
    embed_layer.set_weights([
        embed_0, proj_0,
        embed_1, proj_1,
        embed_2, proj_2,
        embed_3, proj_3,
    ])
    # Look up every token id 0..29 in a single batch.
    outputs = func([np.array([list(range(30))])])
    # Expected projected embeddings, one row of size 3 per token.
    expected = np.array([
        [-3.783413887023926, -0.9968423843383789, -4.223631381988525],
        [2.528728485107422, -6.659335613250732, -2.194012403488159],
        [-1.9791769981384277, 0.8412808179855347, -2.137157917022705],
        [6.2075581550598145, 0.31576472520828247, 4.379002094268799],
        [3.3448808193206787, -0.268412709236145, -1.552351474761963],
        [-3.813311815261841, -3.9697980880737305, -2.3214385509490967],
        [-0.06424117088317871, 3.0353987216949463, -4.962082862854004],
        [-0.7221541404724121, 0.6183103322982788, -3.726100444793701],
        [2.573601245880127, 0.48284363746643066, -0.4642190933227539],
        [-3.8191750049591064, 3.2147698402404785, -2.0111422538757324],
        [0.6846045255661011, 0.23221178352832794, 1.0967247486114502],
        [-1.013551950454712, 0.20630428194999695, 3.3646368980407715],
        [3.7799394130706787, -0.9075126051902771, -4.967802047729492],
        [0.3896251916885376, -0.4761944115161896, -2.216604709625244],
        [0.5775725841522217, -2.712695360183716, -5.433547496795654],
        [-0.1569119393825531, -0.24449113011360168, -2.0325169563293457],
        [-1.7473266124725342, -0.7741519212722778, -0.10500013828277588],
        [1.04367196559906, -0.33694392442703247, -2.4460482597351074],
        [0.7904950380325317, -0.971816897392273, -0.2738245725631714],
        [0.8882685303688049, -0.6137074828147888, 0.8081271648406982],
        [1.487924575805664, 0.3098011910915375, -1.130109190940857],
        [0.06199967861175537, 0.2801832854747772, -1.4277828931808472],
        [-0.1124824583530426, -0.026225347071886063, 0.09992465376853943],
        [-2.484309673309326, 0.0848359763622284, -1.2234333753585815],
        [1.6636306047439575, 0.3611070513725281, -1.3396131992340088],
        [-0.4886229634284973, 1.6726857423782349, -0.06985822319984436],
        [-0.4194082021713257, 1.435745358467102, -0.059962619096040726],
        [-0.2111196517944336, 0.7227184772491455, -0.030183689668774605],
        [0.08849260210990906, -0.30293360352516174, 0.012651749886572361],
        [-0.36940330266952515, 1.264565348625183, -0.05281343683600426],
    ])
    self.assertTrue(np.allclose(expected, outputs[0][0]))