class TiedGraphAutoencoderFP(Layer):
    def __init__(self,
                 inner_layer_arg,
                 activ,
                 bias,
                 init,
                 original_atom_bond_features,
                 tied_to=None,
                 encode=False,
                 decode=False,
                 activity_reg=None,
                 **kwargs):
        # Initialise
        self.tied_to = tied_to
        self.encode = encode
        self.decode = decode
        self.original_atom_bond_features = original_atom_bond_features
        self.bias = bias
        self.reg = activity_reg

        if isinstance(inner_layer_arg, (int, np.int64)):
            self.fp_length = inner_layer_arg
            self.create_inner_layer_fn = lambda: DenseTied(
                self.fp_length,
                activation=activ,
                use_bias=bias,
                kernel_initializer=init,
                tied_to=self.tied_to,
                idx=None,
                activity_regularizer=self.reg,
                **kwargs)  # extra kwargs are forwarded to the DenseTied layer
        else:
            raise ValueError(
                'TiedGraphAutoencoderFP has to be initialised with an int fp_length.')

        super(TiedGraphAutoencoderFP, self).__init__(**kwargs)

    def build(self, inputs_shape):
        # Import dimensions
        (max_atoms, _, num_atom_features, num_bond_features,
         _) = mol_shapes_to_dims(mol_shapes=inputs_shape)

        # Add the dense layer that contains the trainable parameters
        # Initialise dense layer with specified params (kwargs) and name
        self.trainable_weights = []
        self.non_trainable_weights = []

        inner_layer = self.create_inner_layer_fn()
        inner_layer_type = inner_layer.__class__.__name__.lower()
        inner_layer.name = self.name + '_inner_' + inner_layer_type

        # Initialise TimeDistributed layer wrapper in order to parallelise
        #   dense layer across atoms
        inner_3D_layer_name = self.name + '_inner_timedistributed'
        self.inner_3D_layer = TimeDistributed(inner_layer,
                                              name=inner_3D_layer_name)

        # Build the TimeDistributed layer (which will build the Dense layer)
        if self.encode:
            self.inner_3D_layer.build(
                (None, max_atoms, num_atom_features + num_bond_features))
        else:
            self.inner_3D_layer.build((None, max_atoms, self.fp_length))

        # Store the inner 3D layer and its weights

        if self.tied_to is not None:
            self.non_trainable_weights.append(self.inner_3D_layer.layer.kernel)
            if self.bias:
                self.trainable_weights.append(self.inner_3D_layer.layer.bias)
        else:
            self.trainable_weights = self.inner_3D_layer.trainable_weights

    def call(self, inputs, mask=None):

        if self.encode:
            return self.encoder(inputs)
        elif self.decode:
            return self.decoder(inputs)
        else:
            raise ValueError(
                'TiedGraphAutoencoderFP must be used with encode=True or decode=True.')

    def encoder(self, inputs):
        atoms, bonds, edges = inputs

        final_fp_out = self.process_through_layers(atoms, bonds, edges)
        return final_fp_out

    def decoder(self, inputs):
        fp_out, _, _ = inputs

        vxi_dot = self.inner_3D_layer(fp_out)
        return vxi_dot

    def process_through_layers(self, atoms, bonds, edges):
        # Create a matrix that stores each atom's degree and use it to build
        #   a general atom mask (unused atoms are zero-padded).
        # The edge vector has to be used for this because, in theory, a
        #   convolution could produce a zero vector for an atom that is
        #   actually present in the molecule.
        atom_degrees = K.sum(K.cast(K.not_equal(edges, -1), 'float32'),
                             axis=-1,
                             keepdims=True)
        general_atom_mask = K.cast(K.not_equal(atom_degrees, 0), K.floatx())

        # Sum the edge features for each atom
        summed_bond_features = K.sum(bonds, axis=-2)

        # Concatenate the summed atom and bond features
        atoms_bonds_features = keras.layers.Concatenate(axis=-1)(
            [atoms, summed_bond_features])

        # Compute fingerprint

        fingerprint_out_unmasked = self.inner_3D_layer(atoms_bonds_features)

        # Do explicit masking because TimeDistributed does not support masking
        fingerprint_out_masked = fingerprint_out_unmasked * general_atom_mask
        final_fp_out = fingerprint_out_masked

        # Sum across all atoms
        # final_fp_out = K.sum(fingerprint_out_masked, axis=-2, keepdims = False)

        return final_fp_out

    def compute_output_shape(self, inputs_shape):

        # Import dimensions
        (max_atoms, _, _, _,
         num_samples) = mol_shapes_to_dims(mol_shapes=inputs_shape)

        if self.encode:
            return (num_samples, max_atoms, self.fp_length)
        else:
            return (num_samples, max_atoms, self.original_atom_bond_features)

    def get_config(self):
        config = super(TiedGraphAutoencoderFP, self).get_config()

        # Store config of inner layer of the 3D wrapper
        inner_layer = self.inner_3D_layer.layer
        config['inner_layer_config'] = dict(
            config=inner_layer.get_config(),
            class_name=inner_layer.__class__.__name__)
        return config
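
# Hedged usage sketch (added for illustration, not from the original source).
# It wires an encoding TiedGraphAutoencoderFP to a decoding one tied to it.
# It assumes tied_to accepts the encoder layer instance and that DenseTied
# recovers the original feature size through the transposed tied kernel;
# all sizes below are hypothetical.
def _tied_fp_sketch(max_atoms=30, max_degree=5,
                    num_atom_features=62, num_bond_features=6,
                    fp_length=32):
    num_features = num_atom_features + num_bond_features

    atoms = Input(shape=(max_atoms, num_atom_features), name='atoms')
    bonds = Input(shape=(max_atoms, max_degree, num_bond_features),
                  name='bonds')
    edges = Input(shape=(max_atoms, max_degree), dtype='int32', name='edges')

    fp_encoder = TiedGraphAutoencoderFP(
        fp_length, 'relu', True, 'glorot_uniform',
        original_atom_bond_features=num_features, encode=True)
    fp_decoder = TiedGraphAutoencoderFP(
        fp_length, 'linear', True, 'glorot_uniform',
        original_atom_bond_features=num_features,
        tied_to=fp_encoder, decode=True)  # kernel frozen, bias trainable

    fingerprint = fp_encoder([atoms, bonds, edges])
    reconstruction = fp_decoder([fingerprint, bonds, edges])
    return Model(inputs=[atoms, bonds, edges], outputs=reconstruction)
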
class TiedGraphAutoencoder(Layer):
    def __init__(self,
                 inner_layer_arg,
                 activ,
                 bias,
                 init,
                 original_atom_bond_features=None,
                 tied_to=None,
                 encode_only=False,
                 decode_only=False,
                 activity_reg=None,
                 **kwargs):
        # Initialise inner dense layers using convolution width
        # Check if inner_layer_arg is conv_width
        self.tied_to = tied_to
        self.encode_only = encode_only
        self.decode_only = decode_only
        self.bias = bias
        self.original_atom_bond_features = original_atom_bond_features
        self.activ = activ
        self.init = init
        self.reg = activity_reg

        # Case 1: check if conv_width is given
        if isinstance(inner_layer_arg, (int, np.int64)):
            self.conv_width = inner_layer_arg
            self.create_inner_layer_fn = lambda: DenseTied(
                self.conv_width,
                activation=self.activ,
                use_bias=bias,
                kernel_initializer=init,
                tied_to=self.tied_to,
                idx=self.idx,
                activity_regularizer=self.reg,
                **kwargs)
        # Case 2: check if an initialised keras layer is given
        elif isinstance(inner_layer_arg, Layer):
            assert not inner_layer_arg.built, 'When initialising with a keras layer, it cannot be built.'
            _, self.conv_width = inner_layer_arg.get_output_shape_for(
                (None, None))
            # layer_from_config mutates the config dict, so wrap the
            #   construction in a factory function
            self.create_inner_layer_fn = lambda: layer_from_config(
                dict(class_name=inner_layer_arg.__class__.__name__,
                     config=inner_layer_arg.get_config()))
        else:
            raise ValueError(
                'TiedGraphAutoencoder has to be initialised with 1) an int conv_width or 2) an unbuilt keras layer instance.'
            )

        super(TiedGraphAutoencoder, self).__init__(**kwargs)

    def build(self, inputs_shape):
        # Import dimensions
        (max_atoms, max_degree, num_atom_features, num_bond_features,
         _) = mol_shapes_to_dims(mol_shapes=inputs_shape)

        # Add the dense layers (that contain trainable params)
        #   (for each degree we convolve with a different weight matrix)
        self.trainable_weights = []
        self.non_trainable_weights = []
        self.inner_3D_layers = []
        self.all_layers = []

        self.idx = max_degree
        self_layer = self.create_inner_layer_fn()
        self_layer_type = self_layer.__class__.__name__.lower()
        self_layer.name = self.name + '_self_' + self_layer_type + '_'

        #Time Distributed layer wrapper
        self.self_3D_layer_name = self.name + '_self_timedistributed'
        self.self_3D_layer = TimeDistributed(self_layer,
                                             name=self.self_3D_layer_name)
        if self.encode_only:
            self.self_3D_layer.build(
                (None, max_atoms, num_atom_features + num_bond_features))
        else:
            self.self_3D_layer.build((None, max_atoms, self.conv_width))

        for degree in range(max_degree):
            self.idx = degree
            # Initialise inner layer, and rename it
            inner_layer = self.create_inner_layer_fn()
            inner_layer_type = inner_layer.__class__.__name__.lower()
            inner_layer.name = self.name + '_inner_' + inner_layer_type + '_' + str(
                degree)

            # Initialise TimeDistributed layer wrapper in order to parallelise
            #   dense layer across atoms (3D)
            inner_3D_layer_name = self.name + '_inner_timedistributed_' + str(
                degree)
            inner_3D_layer = TimeDistributed(inner_layer,
                                             name=inner_3D_layer_name)

            # Build the TimeDistributed layer (which will build the Dense layer)
            if self.encode_only:
                inner_3D_layer.build(
                    (None, max_atoms, num_bond_features + num_atom_features))
            else:
                inner_3D_layer.build((None, max_atoms, self.conv_width))

            # Store inner_3D_layer and its weights
            self.inner_3D_layers.append(inner_3D_layer)
            self.all_layers.append(inner_3D_layer)
            if self.tied_to is not None:
                self.non_trainable_weights.append(inner_3D_layer.layer.kernel)
                if self.bias:
                    self.trainable_weights.append(inner_3D_layer.layer.bias)
            else:
                self.trainable_weights += inner_3D_layer.trainable_weights

        if self.tied_to is not None:
            self.trainable_weights.append(self.self_3D_layer.layer.bias)
            self.non_trainable_weights.append(self.self_3D_layer.layer.kernel)
        else:
            self.trainable_weights += self.self_3D_layer.trainable_weights

        self.all_layers.append(self_layer)

    def call(self, inputs, mask=None):
        atoms, bonds, edges = inputs

        if self.encode_only:
            return self.encode(inputs)
        elif self.decode_only:
            return self.decode(atoms, bonds, edges)
        else:
            return self.decode(self.encode(inputs), bonds, edges)

    def encode(self, inputs):
        atoms, bonds, edges = inputs

        # Import dimensions
        max_atoms = atoms._keras_shape[1]
        num_atom_features = atoms._keras_shape[-1]
        num_bond_features = bonds._keras_shape[-1]
        max_degree = 5  # NOTE: maximum atom degree is hardcoded here

        # Look up the neighbours, sum the edge features and create vni
        summed_features, atom_degrees = self.mask_atoms_by_degree(
            atoms, edges, bonds)

        new_features_by_degree = self.create_layer_by_deg(
            max_degree, atom_degrees,
            (max_atoms, num_atom_features, num_bond_features), summed_features)
        zni = add(new_features_by_degree)
        summed_bonds = K.sum(bonds, axis=-2)
        vxi = K.concatenate([atoms, summed_bonds], axis=-1)
        zxi = self.self_3D_layer(vxi)

        vxi_plus_one = keras.layers.add([zni, zxi])

        return vxi_plus_one

    def decode(self, vxi_plus_one, bonds, edges):
        atoms = vxi_plus_one

        # Import dimensions
        max_atoms = atoms.shape[1]
        num_atom_features = atoms.shape[-1]
        num_bond_features = bonds._keras_shape[-1]
        max_degree = 5  # NOTE: maximum atom degree is hardcoded here

        _, atom_degrees = self.mask_atoms_by_degree(atoms, edges, bonds=None)
        td_denses_by_degree = self.create_layer_by_deg(
            max_degree, atom_degrees,
            [max_atoms, num_atom_features, num_bond_features], vxi_plus_one)
        vni_dot = keras.layers.add(td_denses_by_degree)
        vxi_dot = self.self_3D_layer(vxi_plus_one)
        return [vni_dot, vxi_dot]

    def mask_atoms_by_degree(self, atoms, edges, bonds=None):

        # Create a matrix that stores each atom's degree
        atom_degrees = K.sum(K.cast(K.not_equal(edges, -1), 'float32'),
                             axis=-1,
                             keepdims=True)

        # For each atom, look up the features of its neighbours
        neighbour_atom_features = neighbour_lookup(atoms,
                                                   edges,
                                                   include_self=False)

        # Sum along degree axis to get summed neighbour features
        summed_atom_features = K.sum(neighbour_atom_features, axis=-2)

        # Sum the edge features for each atom and concatenate them with the
        #   summed neighbour atom features (when bond features are given)
        if bonds is not None:
            summed_bond_features = K.sum(bonds, axis=-2)
            summed_features = K.concatenate(
                [summed_atom_features, summed_bond_features], axis=-1)
        else:
            summed_features = summed_atom_features

        return summed_features, atom_degrees

    def create_layer_by_deg(self, max_deg, atom_degrees, inputs,
                            summed_features):
        # For each degree we convolve with a different weight matrix
        [max_atoms, num_atom_features, num_bond_features] = inputs
        new_features_by_degree = []
        for degree in range(max_deg):

            # Create mask for this degree
            atom_masks_this_degree = K.cast(K.equal(atom_degrees, degree),
                                            K.floatx())

            # Multiply with hidden merge layer
            #   (use TimeDistributed because we are dealing with 2D input/3D for batches)
            # Set _keras_shape to let keras know the dimensions
            if self.encode_only:
                summed_features._keras_shape = (None, max_atoms,
                                                num_atom_features +
                                                num_bond_features)
            else:
                summed_features._keras_shape = (None, max_atoms,
                                                self.conv_width)

            new_unmasked_features = self.inner_3D_layers[degree](
                summed_features)
            # Do explicit masking because TimeDistributed does not support masking
            new_masked_features = new_unmasked_features * atom_masks_this_degree

            new_features_by_degree.append(new_masked_features)

        return new_features_by_degree

    def compute_output_shape(self, inputs_shape):

        # Normalise the first input shape (atoms) before importing dimensions
        inputs_shape[0] = (None, int(inputs_shape[0][1]), inputs_shape[0][2])

        (max_atoms, _, _, _,
         num_samples) = mol_shapes_to_dims(mol_shapes=inputs_shape)

        if self.encode_only:
            return (num_samples, max_atoms, self.conv_width)
        else:
            return [(num_samples, max_atoms, self.original_atom_bond_features),
                    (num_samples, max_atoms, self.original_atom_bond_features)]
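
# Hedged round-trip sketch (added for illustration, not from the original
# source): encode with one TiedGraphAutoencoder, then decode with a second
# one tied to it, so the decoder kernels are frozen copies of the encoder's
# and only the decoder biases train. Assumes tied_to accepts the encoder
# layer instance; all sizes below are hypothetical.
def _tied_graph_autoencoder_sketch(max_atoms=30, max_degree=5,
                                   num_atom_features=62,
                                   num_bond_features=6, conv_width=128):
    num_features = num_atom_features + num_bond_features

    atoms = Input(shape=(max_atoms, num_atom_features))
    bonds = Input(shape=(max_atoms, max_degree, num_bond_features))
    edges = Input(shape=(max_atoms, max_degree), dtype='int32')

    encoder = TiedGraphAutoencoder(conv_width, 'relu', True,
                                   'glorot_uniform', encode_only=True)
    hidden = encoder([atoms, bonds, edges])

    decoder = TiedGraphAutoencoder(
        conv_width, 'linear', True, 'glorot_uniform',
        original_atom_bond_features=num_features,
        tied_to=encoder, decode_only=True)
    # decode() returns the neighbour reconstruction (vni_dot) and the
    #   self reconstruction (vxi_dot) as a pair
    vni_dot, vxi_dot = decoder([hidden, bonds, edges])
    return Model(inputs=[atoms, bonds, edges], outputs=[vni_dot, vxi_dot])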

# Example 4
# Note: the original listing omitted the class declaration; it is restored
#   here from the super() call below.
class DIIN(Model):
    def __init__(self,
                 p=None,
                 h=None,
                 include_word_vectors=True,
                 word_embedding_weights=None,
                 train_word_embeddings=True,
                 include_chars=True,
                 chars_per_word=16,
                 char_embedding_size=8,
                 char_conv_filters=100,
                 char_conv_kernel_size=5,
                 include_syntactical_features=True,
                 syntactical_feature_size=50,
                 include_exact_match=True,
                 dropout_initial_keep_rate=1.,
                 dropout_decay_rate=0.977,
                 dropout_decay_interval=10000,
                 first_scale_down_ratio=0.3,
                 transition_scale_down_ratio=0.5,
                 growth_rate=20,
                 layers_per_dense_block=8,
                 nb_dense_blocks=3,
                 nb_labels=3,
                 inputs=None,
                 outputs=None,
                 name='DIIN'):
        """
        :ref https://openreview.net/forum?id=r1dHXnH6-&noteId=r1dHXnH6-

        :param p: sequence length of premise
        :param h: sequence length of hypothesis
        :param include_word_vectors: whether or not to include word vectors in the model
        :param word_embedding_weights: matrix of weights for word embeddings (GloVe pre-trained vectors)
        :param train_word_embeddings: whether or not to modify word embeddings while training
        :param include_chars: whether or not to include character embeddings in the model
        :param chars_per_word: number of characters per word (a fixed number)
        :param char_embedding_size: output size of the character-embedding layer
        :param char_conv_filters: number of conv filters applied on character embeddings
        :param char_conv_kernel_size: size of the kernel applied on character embeddings
        :param include_syntactical_features: whether or not to include syntactical features (POS tags) in the model
        :param syntactical_feature_size: size of the syntactical feature vector for each word
        :param include_exact_match: whether or not to include exact match features in the model
        :param dropout_initial_keep_rate: initial keep rate of dropout
        :param dropout_decay_rate: how much to change dropout at each interval
        :param dropout_decay_interval: number of steps (batches) between dropout updates
        :param first_scale_down_ratio: first scale down ratio in densenet
        :param transition_scale_down_ratio: transition scale down ratio in densenet
        :param growth_rate: growing rate in densenet
        :param layers_per_dense_block: number of layers in one dense-block
        :param nb_dense_blocks: number of dense blocks in densenet
        :param nb_labels: number of labels (3 labels by default: entailment, contradiction, neutral)
        """

        if inputs or outputs:
            super(DIIN, self).__init__(inputs=inputs,
                                       outputs=outputs,
                                       name=name)
            return

        if include_word_vectors:
            assert word_embedding_weights is not None, 'Word embedding weights are needed'
        inputs = []
        premise_embeddings = []
        hypothesis_embeddings = []
        '''Embedding layer'''
        # 1. Word embedding input
        if include_word_vectors:
            premise_word_input = Input(shape=(p, ),
                                       dtype='int64',
                                       name='PremiseWordInput')
            hypothesis_word_input = Input(shape=(h, ),
                                          dtype='int64',
                                          name='HypothesisWordInput')
            inputs.append(premise_word_input)
            inputs.append(hypothesis_word_input)

            word_embedding = Embedding(
                input_dim=word_embedding_weights.shape[0],
                output_dim=word_embedding_weights.shape[1],
                weights=[word_embedding_weights],
                trainable=train_word_embeddings,
                name='WordEmbedding')
            premise_word_embedding = word_embedding(premise_word_input)
            hypothesis_word_embedding = word_embedding(hypothesis_word_input)

            premise_word_embedding = DecayingDropout(
                initial_keep_rate=dropout_initial_keep_rate,
                decay_interval=dropout_decay_interval,
                decay_rate=dropout_decay_rate,
                name='PremiseWordEmbeddingDropout')(premise_word_embedding)
            hypothesis_word_embedding = DecayingDropout(
                initial_keep_rate=dropout_initial_keep_rate,
                decay_interval=dropout_decay_interval,
                decay_rate=dropout_decay_rate,
                name='HypothesisWordEmbeddingDropout')(
                    hypothesis_word_embedding)
            premise_embeddings.append(premise_word_embedding)
            hypothesis_embeddings.append(hypothesis_word_embedding)

        # 2. Character input
        if include_chars:
            premise_char_input = Input(shape=(
                p,
                chars_per_word,
            ),
                                       name='PremiseCharInput')
            hypothesis_char_input = Input(shape=(
                h,
                chars_per_word,
            ),
                                          name='HypothesisCharInput')
            inputs.append(premise_char_input)
            inputs.append(hypothesis_char_input)

            # Share weights of character-level embedding for premise and hypothesis
            character_embedding_layer = TimeDistributed(Sequential([
                Embedding(input_dim=100,
                          output_dim=char_embedding_size,
                          input_length=chars_per_word),
                Conv1D(filters=char_conv_filters,
                       kernel_size=char_conv_kernel_size),
                GlobalMaxPooling1D()
            ]),
                                                        name='CharEmbedding')
            character_embedding_layer.build(input_shape=(None, None,
                                                         chars_per_word))
            premise_char_embedding = character_embedding_layer(
                premise_char_input)
            hypothesis_char_embedding = character_embedding_layer(
                hypothesis_char_input)
            premise_embeddings.append(premise_char_embedding)
            hypothesis_embeddings.append(hypothesis_char_embedding)

        # 3. Syntactical features
        if include_syntactical_features:
            premise_syntactical_input = Input(shape=(
                p,
                syntactical_feature_size,
            ),
                                              name='PremiseSyntacticalInput')
            hypothesis_syntactical_input = Input(
                shape=(
                    h,
                    syntactical_feature_size,
                ),
                name='HypothesisSyntacticalInput')
            inputs.append(premise_syntactical_input)
            inputs.append(hypothesis_syntactical_input)
            premise_embeddings.append(premise_syntactical_input)
            hypothesis_embeddings.append(hypothesis_syntactical_input)

        # 4. One-hot exact match feature
        if include_exact_match:
            premise_exact_match_input = Input(shape=(p, ),
                                              name='PremiseExactMatchInput')
            hypothesis_exact_match_input = Input(
                shape=(h, ), name='HypothesisExactMatchInput')
            premise_exact_match = Reshape(target_shape=(
                p,
                1,
            ))(premise_exact_match_input)
            hypothesis_exact_match = Reshape(target_shape=(
                h,
                1,
            ))(hypothesis_exact_match_input)
            inputs.append(premise_exact_match_input)
            inputs.append(hypothesis_exact_match_input)
            premise_embeddings.append(premise_exact_match)
            hypothesis_embeddings.append(hypothesis_exact_match)

        # Concatenate all features
        premise_embedding = Concatenate(
            name='PremiseEmbedding')(premise_embeddings)
        hypothesis_embedding = Concatenate(
            name='HypothesisEmbedding')(hypothesis_embeddings)
        d = K.int_shape(hypothesis_embedding)[-1]
        '''Encoding layer'''
        # Now we have the embedded premise [pxd] along with embedded hypothesis [hxd]
        premise_encoding = Encoding(name='PremiseEncoding')(premise_embedding)
        hypothesis_encoding = Encoding(
            name='HypothesisEncoding')(hypothesis_embedding)
        '''Interaction layer'''
        interaction = Interaction(name='Interaction')(
            [premise_encoding, hypothesis_encoding])
        '''Feature Extraction layer'''
        feature_extractor_input = Conv2D(filters=int(d *
                                                     first_scale_down_ratio),
                                         kernel_size=1,
                                         activation=None,
                                         name='FirstScaleDown')(interaction)
        feature_extractor = DenseNet(
            include_top=False,
            input_tensor=Input(shape=K.int_shape(feature_extractor_input)[1:]),
            nb_dense_block=nb_dense_blocks,
            nb_layers_per_block=layers_per_dense_block,
            compression=transition_scale_down_ratio,
            growth_rate=growth_rate)(feature_extractor_input)
        '''Output layer'''
        features = DecayingDropout(initial_keep_rate=dropout_initial_keep_rate,
                                   decay_interval=dropout_decay_interval,
                                   decay_rate=dropout_decay_rate,
                                   name='Features')(feature_extractor)
        out = Dense(units=nb_labels, activation='softmax',
                    name='Output')(features)
        super(DIIN, self).__init__(inputs=inputs, outputs=out, name=name)
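
# Hedged usage sketch (added for illustration, not from the original source):
# a DIIN for 3-way NLI built from a pre-trained embedding matrix. Sequence
# lengths and the optimizer choice are hypothetical; with the default flags
# the model also expects char, syntactical and exact-match inputs.
def _diin_sketch(word_embedding_matrix, p=32, h=32):
    model = DIIN(p=p, h=h,
                 word_embedding_weights=word_embedding_matrix,
                 chars_per_word=16,
                 nb_labels=3)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
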
def build_model(cfg, summary=False, word_embedding_matrix=None):
    def _get_model(base_dir, cfg_=None):
        config_file = os.path.join(base_dir, 'bert_config.json')
        checkpoint_file = os.path.join(base_dir, 'bert_model.ckpt')
        if not os.path.exists(config_file):
            config_file = os.path.join(base_dir, 'bert_config_large.json')
            checkpoint_file = os.path.join(base_dir, 'roberta_l24_large_model')
        print(config_file, checkpoint_file)
        #         model = load_trained_model_from_checkpoint(config_file, checkpoint_file, training=True, seq_len=cfg_['maxlen'])
        model = load_trained_model_from_checkpoint(
            config_file,
            checkpoint_file,
            training=False,
            trainable=cfg_["bert_trainable"],
            output_layer_num=cfg["cls_num"],
            seq_len=cfg_['maxlen'])
        return model

    def get_opt(num_example, warmup_proportion=0.1, lr=2e-5, min_lr=None):
        if cfg["opt"].lower() == "nadam":
            opt = Nadam(lr=lr)
        else:
            total_steps, warmup_steps = calc_train_steps(
                num_example=num_example,
                batch_size=B_SIZE,
                epochs=MAX_EPOCH,
                warmup_proportion=warmup_proportion,
            )

            opt = AdamWarmup(total_steps, warmup_steps, lr=lr, min_lr=min_lr)

        return opt

    model1 = _get_model(cfg["base_dir"], cfg)

    model1 = Model(inputs=model1.inputs[:2], outputs=model1.layers[-7].output)

    # embed_layer is used unconditionally below (token and char embeddings),
    #   so the word embedding matrix must be provided
    assert word_embedding_matrix is not None
    embed_layer = Embedding(input_dim=word_embedding_matrix.shape[0],
                            output_dim=word_embedding_matrix.shape[1],
                            weights=[word_embedding_matrix],
                            trainable=cfg["trainable"],
                            name="embed_layer")

    inp_token1 = Input(shape=(None, ),
                       dtype=np.int32,
                       name="query_token_input")
    inp_segm1 = Input(shape=(None, ),
                      dtype=np.float32,
                      name="query_segm_input")

    #     inp_token2 = Input(shape=(None, ), dtype=np.int32)
    #     inp_segm2 = Input(shape=(None, ), dtype=np.float32)

    inp_image = Input(shape=(None, 2048), dtype=np.float32, name="image_input")
    inp_image_mask = Input(shape=(None, ),
                           dtype=np.float32,
                           name="image_mask_input")
    inp_pos = Input(shape=(None, 5), dtype=np.float32, name="image_pos_input")
    inp_image_char = Input(shape=(None, cfg["max_char"]),
                           dtype=np.int32,
                           name='image_char_input')

    mask = Lambda(lambda x: K.cast(K.not_equal(x, cfg["x_pad"]), 'float32'),
                  name="token_mask")(inp_token1)
    word_embed = embed_layer(inp_token1)
    word_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))(
        [word_embed, mask])
    word_embed = Bidirectional(LSTM(cfg["unit1_1"], return_sequences=True),
                               merge_mode="sum")(word_embed)
    word_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))(
        [word_embed, mask])

    sequence_output = model1([inp_token1, inp_segm1])
    sequence_output = Concatenate(axis=-1)([sequence_output, word_embed])
    text_pool = Lambda(lambda x: x[:, 0, :])(sequence_output)

    # Share weights of character-level embedding for premise and hypothesis
    character_embedding_layer = TimeDistributed(
        Sequential([
            embed_layer,
            # Embedding(input_dim=100, output_dim=char_embedding_size, input_length=chars_per_word),
            Conv1D(filters=128, kernel_size=3, name="char_embed_conv1d"),
            GlobalMaxPooling1D()
        ]),
        name='CharEmbedding')
    character_embedding_layer.build(input_shape=(None, None, cfg["max_char"]))
    image_char_embed = character_embedding_layer(inp_image_char)
    image_embed = Concatenate(axis=-1)([image_char_embed, inp_image])
    image_embed = Dense(512, activation='relu',
                        name='image_embed')(image_embed)
    image_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))(
        [image_embed, inp_image_mask])
    pos_embed = Dense(512, activation='relu', name='pos_embed')(inp_pos)
    pos_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))(
        [pos_embed, inp_image_mask])
    embed = Add()([image_embed, pos_embed])  # batch, maxlen(10), 1024+128

    image_embed = Bidirectional(LSTM(1152, return_sequences=True),
                                merge_mode="sum")(embed)
    image_embed = Lambda(lambda x: x[0] * K.expand_dims(x[1], axis=-1))(
        [image_embed, inp_image_mask])

    image_pool = Lambda(lambda x: x[:, 0, :])(image_embed)

    pool = Concatenate(axis=-1)([image_pool, text_pool])
    pool = Dense(2048, activation="relu")(pool)
    pool = Dense(512, activation="relu")(pool)
    pool = Dense(128, activation="relu")(pool)

    output = Dense(2, activation='softmax', name='output')(pool)

    opt = get_opt(num_example=cfg["num_example"],
                  lr=cfg["lr"],
                  min_lr=cfg['min_lr'])
    model = Model(inputs=[
        inp_token1, inp_segm1, inp_image, inp_image_mask, inp_pos,
        inp_image_char
    ],
                  outputs=[output])

    model.compile(optimizer=opt,
                  loss={'output': 'sparse_categorical_crossentropy'},
                  metrics=['accuracy'])
    if summary:
        model.summary()

    return model
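
# Hedged usage sketch (added for illustration, not from the original source):
# a cfg dict covering every key that build_model reads. All values are
# placeholders, and B_SIZE / MAX_EPOCH must exist as module-level globals
# because get_opt reads them.
def _build_model_sketch(word_embedding_matrix):
    cfg = {
        'base_dir': '/path/to/bert',  # dir with bert_config.json + checkpoint
        'bert_trainable': True,
        'cls_num': 1,         # number of BERT output layers to use
        'maxlen': 128,
        'opt': 'adamwarmup',  # anything other than "nadam" uses AdamWarmup
        'x_pad': 0,           # token id used for padding
        'unit1_1': 256,       # BiLSTM units over the word embeddings
        'trainable': False,   # freeze the word-embedding layer
        'max_char': 6,        # chars per image token
        'num_example': 100000,
        'lr': 2e-5,
        'min_lr': 1e-6,
    }
    return build_model(cfg, summary=True,
                       word_embedding_matrix=word_embedding_matrix)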

# Example 6
# Note: the original listing omitted the class declaration; it is restored
#   here from the super() call below.
class DIINModel(Model):
    def __init__(self,
                 p=None,
                 h=None,
                 use_word_embedding=True,
                 word_embedding_weights=None,
                 train_word_embeddings=False,
                 dropout_init_keep_rate=1.0,
                 dropout_decay_interval=10000,
                 dropout_decay_rate=0.977,
                 use_chars=False,
                 chars_per_word=16,
                 char_input_dim=100,
                 char_embedding_size=8,
                 char_conv_filters=100,
                 char_conv_kernel_size=5,
                 use_syntactical_features=False,
                 syntactical_feature_size=50,
                 use_exact_match=False,
                 first_scale_down_ratio=0.3,
                 nb_dense_blocks=3,
                 layers_per_dense_block=8,
                 nb_labels=3,
                 growth_rate=20,
                 transition_scale_down_ratio=0.5,
                 inputs=None,
                 outputs=None,
                 name="DIIN"):
        """Densely Interactive Inference Network(DIIN)

        Model from paper `Natural Language Inference over Interaction Space`
        (https://openreview.net/forum?id=r1dHXnH6-&noteId=r1dHXnH6-)

        :param p: sequence length of premise
        :param h: sequence length of hypothesis
        :param use_word_embedding: whether or not to include word vectors in the model
        :param use_chars: whether or not to include character embeddings in the model
        :param use_syntactical_features: whether or not to include syntactical features (POS tags) in the model
        :param use_exact_match: whether or not to include exact match features in the model
        :param word_embedding_weights: matrix of weights for word embeddings(pre-trained vectors)
        :param train_word_embeddings: whether or not to modify word embeddings while training
        :param dropout_init_keep_rate: initial keep rate of dropout
        :param dropout_decay_interval: number of steps (batches, not epochs) between dropout updates
        :param dropout_decay_rate: how much to change dropout at each interval
        :param chars_per_word: number of characters per word
        :param char_input_dim: number of unique characters (character vocabulary size)
        :param char_embedding_size: output size of the character-embedding layer
        :param char_conv_filters: filters of the kernel applied on character embeddings
        :param char_conv_kernel_size: size of the kernel applied on character embeddings
        :param syntactical_feature_size: size of the syntactical feature vector for each word
        :param first_scale_down_ratio: scale ratio of map features as the input of first Densenet block
        :param nb_dense_blocks: number of dense blocks in densenet
        :param layers_per_dense_block: number of layers in one dense block
        :param nb_labels: number of labels
        :param growth_rate: growth rate in the DenseNet
        :param transition_scale_down_ratio: transition scale down ratio in the DenseNet
        :param inputs: inputs of a keras Model (skips building when given)
        :param outputs: outputs of a keras Model (skips building when given)
        :param name: model name
        """

        if inputs or outputs:
            super(DIINModel, self).__init__(inputs=inputs,
                                            outputs=outputs,
                                            name=name)
            return

        if use_word_embedding:
            assert word_embedding_weights is not None, "Word embedding weights are needed"

        inputs = []
        premise_features = []
        hypothesis_features = []
        """Embedding layer"""
        # Input: word embedding
        if use_word_embedding:
            premise_word_input = Input(shape=(p, ),
                                       dtype="int64",
                                       name="premise_word_input")
            hypothesis_word_input = Input(shape=(h, ),
                                          dtype="int64",
                                          name="hypothesis_word_input")
            inputs.append(premise_word_input)
            inputs.append(hypothesis_word_input)

            word_embedding = Embedding(
                input_dim=word_embedding_weights.shape[0],
                output_dim=word_embedding_weights.shape[1],
                weights=[word_embedding_weights],
                trainable=train_word_embeddings,
                name="word_embedding")
            premise_word_embedding = word_embedding(premise_word_input)
            hypothesis_word_embedding = word_embedding(hypothesis_word_input)

            premise_word_embedding = DecayingDropout(
                init_keep_rate=dropout_init_keep_rate,
                decay_interval=dropout_decay_interval,
                decay_rate=dropout_decay_rate,
                name="premise_word_dropout")(premise_word_embedding)
            hypothesis_word_embedding = DecayingDropout(
                init_keep_rate=dropout_init_keep_rate,
                decay_interval=dropout_decay_interval,
                decay_rate=dropout_decay_rate,
                name="hypothesis_word_dropout")(hypothesis_word_embedding)

            premise_features.append(premise_word_embedding)
            hypothesis_features.append(hypothesis_word_embedding)

        # Input: character embedding
        if use_chars:
            premise_char_input = Input(shape=(p, chars_per_word),
                                       dtype="int64",
                                       name="premise_char_input")
            hypothesis_char_input = Input(shape=(h, chars_per_word),
                                          dtype="int64",
                                          name="hypothesis_char_input")
            inputs.append(premise_char_input)
            inputs.append(hypothesis_char_input)

            # Share weights of character-level embedding for premise and hypothesis
            character_embedding = TimeDistributed(Sequential([
                Embedding(input_dim=char_input_dim,
                          output_dim=char_embedding_size,
                          input_length=chars_per_word),
                Conv1D(filters=char_conv_filters,
                       kernel_size=char_conv_kernel_size),
                GlobalMaxPooling1D(),
            ]),
                                                  name="char_embedding")
            character_embedding.build(
                input_shape=(None, None, chars_per_word))  # Set input shape

            premise_char_embedding = character_embedding(premise_char_input)
            hypothesis_char_embedding = character_embedding(
                hypothesis_char_input)
            premise_features.append(premise_char_embedding)
            hypothesis_features.append(hypothesis_char_embedding)

        # Input: syntactical features
        if use_syntactical_features:
            premise_syntactical_input = Input(shape=(p,
                                                     syntactical_feature_size),
                                              name="premise_syntactical_input")
            hypothesis_syntactical_input = Input(
                shape=(h, syntactical_feature_size),
                name="hypothesis_syntactical_input")
            inputs.append(premise_syntactical_input)
            inputs.append(hypothesis_syntactical_input)
            premise_features.append(premise_syntactical_input)
            hypothesis_features.append(hypothesis_syntactical_input)

        # Input: one-hot exact match feature
        if use_exact_match:
            premise_exact_match_input = Input(shape=(p, ),
                                              name='premise_exact_match_input')
            hypothesis_exact_match_input = Input(
                shape=(h, ), name='hypothesis_exact_match_input')
            inputs.append(premise_exact_match_input)
            inputs.append(hypothesis_exact_match_input)

            premise_exact_match = Reshape(
                target_shape=(p, 1))(premise_exact_match_input)
            hypothesis_exact_match = Reshape(
                target_shape=(h, 1))(hypothesis_exact_match_input)
            premise_features.append(premise_exact_match)
            hypothesis_features.append(hypothesis_exact_match)

        # Concatenate all features
        if len(premise_features) > 1:
            premise_embedding = Concatenate()(premise_features)
            hypothesis_embedding = Concatenate()(hypothesis_features)
        else:
            premise_embedding = premise_features[0]
            hypothesis_embedding = hypothesis_features[0]
        d = K.int_shape(premise_embedding)[-1]
        """Encoding layer"""
        premise_encoding = Encoding(name="premise_encoding")(premise_embedding)
        hypothesis_encoding = Encoding(
            name="hypothesis_encoding")(hypothesis_embedding)
        """Interaction layer"""
        interaction = Interaction(name="interaction")(
            [premise_encoding, hypothesis_encoding])
        """Feature extraction layer"""
        feature_extractor_input = Conv2D(
            filters=int(d * first_scale_down_ratio),
            kernel_size=1,
            activation=None,
            name="bottleneck")(interaction)  # Bottleneck layer
        feature_extractor = DenseNet(
            input_tensor=Input(shape=K.int_shape(feature_extractor_input)[1:]),
            include_top=False,
            nb_dense_block=nb_dense_blocks,
            nb_layers_per_block=layers_per_dense_block,
            growth_rate=growth_rate,
            compression=transition_scale_down_ratio)(feature_extractor_input)
        """Output layer"""
        features = DecayingDropout(init_keep_rate=dropout_init_keep_rate,
                                   decay_interval=dropout_decay_interval,
                                   decay_rate=dropout_decay_rate,
                                   name="features")(feature_extractor)
        if nb_labels == 2:
            out = Dense(1, activation="sigmoid", name="output")(features)
        else:
            out = Dense(nb_labels, activation="softmax",
                        name="output")(features)
        super(DIINModel, self).__init__(inputs=inputs, outputs=out, name=name)
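
# Hedged follow-up (added for illustration, not from the original source):
# the output head above switches between a single sigmoid unit and a softmax
# layer depending on nb_labels, so the loss has to switch with it when
# compiling.
def _compile_diin_model(model, nb_labels=3):
    loss = ('binary_crossentropy' if nb_labels == 2
            else 'categorical_crossentropy')
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    return model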