Example #1
    def init_from_params(cls, params, vocab):
        config_file = params.pop('config_file', None)
        if config_file is None:
            raise ConfigureError(
                "Please provide an ELMo config file for the ELMo embedding.")
        # weight_file = params.pop('weight_file', None)
        # if weight_file is None:
        #     logger.warning("The ELMo embedding is initialize randomly.")
        encoder_name = params.pop("encoder_name", "elmo")
        vocab_namespace = params.pop('namespace', 'elmo_characters')
        dropout_rate = params.pop_float('dropout_rate', 0.0)

        ckpt_to_initialize_from = params.pop('ckpt_to_initialize_from', None)
        weight_file = params.pop('weight_file', None)
        if ckpt_to_initialize_from is None and weight_file is None:
            logger.warning("The ELMo embedding is initialize randomly.")

        # tmp_dir = params.pop('tmp_dir', None)
        # if tmp_dir is None:
        #     if weight_file:
        #         tmp_dir = os.path.dirname(weight_file)
        #     else:
        #         tmp_dir = "./"

        params.assert_empty(cls.__name__)

        return cls(config_file=config_file,
                   ckpt_to_initialize_from=ckpt_to_initialize_from,
                   dropout_rate=dropout_rate,
                   encoder_name=encoder_name,
                   vocab_namespace=vocab_namespace,
                   weight_file=weight_file)
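
A minimal sketch of the parameter block this factory consumes, inferred from the pops above; the file paths are hypothetical placeholders:

    elmo_params = {
        'config_file': 'elmo_options.json',            # required, otherwise ConfigureError
        'encoder_name': 'elmo',                        # optional, defaults to "elmo"
        'namespace': 'elmo_characters',                # optional, defaults to 'elmo_characters'
        'dropout_rate': 0.1,                           # optional, defaults to 0.0
        'ckpt_to_initialize_from': 'elmo/model.ckpt',  # optional; random init if both this and
        'weight_file': None,                           # 'weight_file' are missing
    }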
Example #2
    def init_from_params(cls, params, vocab):
        config_file = params.pop('config_file', None)
        if config_file is None:
            raise ConfigureError("Please provide bert config file for bert embedding.")
        old_vocab_file = params.pop('vocab_file', None)
        if old_vocab_file is None:
            logger.warning("The vocab file is not provided. We consider the embedding vocab is the same as the data "
                           "vocab acquiescently.")
        ckpt_to_initialize_from = params.pop('ckpt_to_initialize_from', None)
        if ckpt_to_initialize_from is None:
            logger.warning("The bert embedding is initialize randomly.")
        num_oov_buckets = params.pop_int("num_oov_buckets", 0)
        use_one_hot_embeddings = params.pop_bool("use_one_hot_embeddings", False)
        encoder_name = params.pop("encoder_name", "bert")
        vocab_namespace = params.pop("namespace", 'tokens')
        mask_namespace = params.pop("mask_namespace", None)
        new_vocab_file = vocab.get_vocab_path(vocab_namespace)
        new_vocab_size = vocab.get_vocab_size(vocab_namespace)
        projection_dim = params.pop_int("projection_dim", None)
        dropout_rate = params.pop_float("dropout_rate", 0.0)
        remove_bos_eos = params.pop_bool("remove_bos_eos", True)
        params.assert_empty(cls.__name__)

        return cls(config_file=config_file, ckpt_to_initialize_from=ckpt_to_initialize_from,
                   new_vocab_file=new_vocab_file, new_vocab_size=new_vocab_size, num_oov_buckets=num_oov_buckets,
                   old_vocab_file=old_vocab_file, vocab_namespace=vocab_namespace,
                   remove_bos_eos=remove_bos_eos,
                   mask_namespace=mask_namespace, projection_dim=projection_dim, dropout_rate=dropout_rate,
                   use_one_hot_embeddings=use_one_hot_embeddings, encoder_name=encoder_name)
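
Analogously, a sketch of the keys the BERT factory pops; only 'config_file' is required and the values shown are hypothetical:

    bert_params = {
        'config_file': 'bert_config.json',
        'vocab_file': 'bert_vocab.txt',               # old vocab; data vocab is assumed if absent
        'ckpt_to_initialize_from': 'bert_model.ckpt', # random init if absent
        'num_oov_buckets': 0,
        'use_one_hot_embeddings': False,
        'encoder_name': 'bert',
        'namespace': 'tokens',
        'mask_namespace': None,
        'projection_dim': None,
        'dropout_rate': 0.1,
        'remove_bos_eos': True,
    }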
Example #3
 def generate_input_map(self, signature_def, features, labels=None):
     features_mapping = {
         "input_query": "premise/tokens",
         "input_title": "hypothesis/tokens"
     }
     inputs = signature_def.inputs
     input_map = {}
     for (key, tensor_info) in inputs.items():
         input_name = tensor_info.name
         if ':' in input_name:
             input_name = input_name[:input_name.find(':')]
         control_dependency_name = '^' + input_name
         if features_mapping is not None and key in features_mapping:
             feature_key = features_mapping[key]
         else:
             feature_key = key
         if feature_key in features:
             check_same_dtype_and_shape(features[feature_key], tensor_info,
                                        key)
             input_map[input_name] = input_map[
                 control_dependency_name] = features[feature_key]
         elif labels is not None and feature_key in labels:
             check_same_dtype_and_shape(labels[feature_key], tensor_info,
                                        key)
             input_map[input_name] = input_map[
                 control_dependency_name] = labels[feature_key]
         else:
             logger.warning(
                 'Key \"%s\" not found in features or labels passed in to the model '
                 'function. All required keys: %s' %
                 (feature_key, inputs.keys()))
     return input_map
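
A minimal sketch of the tensor-name handling above: a SignatureDef tensor name such as "input_query:0" is stripped to its op name, and a "^"-prefixed variant is added so control dependencies on that op are remapped as well (names here are illustrative):

    tensor_name = "input_query:0"         # hypothetical tensor_info.name
    op_name = tensor_name.split(':')[0]   # -> "input_query"
    control_dependency = '^' + op_name    # -> "^input_query"
    # Both "input_query" and "^input_query" then map to features["premise/tokens"].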
Example #4
def get_signature_def_for_mode(saved_model_loader, mode):
    meta_graph_def = get_meta_graph_def_for_mode(saved_model_loader, mode)
    sig_def_key = (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                   if mode == model_fn.ModeKeys.PREDICT else mode)
    if sig_def_key not in meta_graph_def.signature_def:
        logger.warning('Metagraph for mode %s was found, but SignatureDef with'
                       ' key \"%s\" is missing.' % (mode, sig_def_key))
        return None
    return meta_graph_def.signature_def[sig_def_key]
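
A hedged usage sketch combining this helper with generate_input_map from example #3; the loader, features, and the model object are assumed to exist:

    sig_def = get_signature_def_for_mode(saved_model_loader,
                                         tf.estimator.ModeKeys.PREDICT)
    if sig_def is not None:
        # model is a hypothetical object exposing the method from example #3
        input_map = model.generate_input_map(sig_def, features)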
Example #5
 def get_warm_start_setting(self):
     warm_start_settings = None
     for encoder in self._encoders.values():
         warm_start_settings_namespace = encoder.get_warm_start_setting()
         if isinstance(warm_start_settings_namespace, tf.estimator.WarmStartSettings):
             if warm_start_settings is None:
                 warm_start_settings = warm_start_settings_namespace
             else:
                 logger.warning("There are two pretrained embedding, which is not supported int this toolkit now.")
     return warm_start_settings
Example #6
    def save_to_files(self, directory):
        os.makedirs(directory, exist_ok=True)
        if os.listdir(directory):
            logger.warning("Vocabulary directory %s is not empty", directory)

        save_to_txt(self._non_padded_namespaces, os.path.join(directory, NAMESPACE_PADDING_FILE))
        for namespace in self._token_to_index:
            vocab_namespace = [self._index_to_token[namespace][i] for i in range(len(self._index_to_token[namespace]))]
            vocab_namespace_file = os.path.join(directory, VOCAB_FILE % namespace)
            self._namespace_to_path[namespace] = vocab_namespace_file
            save_to_txt(vocab_namespace, vocab_namespace_file)
Example #7
def _read_pretrained_embeddings_text(pretrained_file, embedding_dim, vocab,
                                     vocab_namespace):
    vocab_tokens = vocab.get_vocab_tokens(vocab_namespace)
    vocab_size = vocab.get_vocab_size(vocab_namespace)
    embeddings = {}
    logger.info("Reading pretrained embeddings from: %s" % pretrained_file)
    with open(pretrained_file, 'r', encoding='utf-8') as embeddings_file:
        for line in tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in vocab_tokens:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1, line)
                    continue

                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigureError(
            "The embedding_dim or vocabulary does not fit the pretrained embedding."
        )
    all_embeddings = np.asarray(list(embeddings.values()))
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_std = float(np.std(all_embeddings))
    embedding_matrix = np.random.normal(embeddings_mean, embeddings_std,
                                        (vocab_size, embedding_dim))
    embedding_matrix = embedding_matrix.astype(np.float32)
    num_tokens_found = 0
    index_to_tokens = vocab.get_vocab_index_to_token(vocab_namespace)
    for i in range(vocab_size):
        token = index_to_tokens[i]
        if token in embeddings:
            embedding_matrix[i] = embeddings[token]
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.",
                token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)
    return embedding_matrix
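
The reader expects a GloVe-style text file: one token per line followed by embedding_dim space-separated floats. A hypothetical usage, assuming a 300-dimensional file and an existing vocab object:

    embedding_matrix = _read_pretrained_embeddings_text(
        'glove.840B.300d.txt', 300, vocab, 'tokens')
    # shape (vocab_size, 300); tokens absent from the file keep the random
    # initialization drawn from the mean/std of the vectors that were found.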
Example #8
    def forward(self, features, labels, mode, params):
        outputs = dict()
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        for (feature_key, feature) in features.items():
            if '/' not in feature_key:
                continue
            feature_key_fields = feature_key.split("/")
            feature_namespace = feature_key_fields[1].strip()
            field_name = feature_key_fields[0].strip()
            if feature_namespace == self._vocab_namespace:
                with tf.variable_scope("embedding/"+self._vocab_namespace, reuse=tf.AUTO_REUSE):
                    input_ids = feature
                    input_mask = None
                    if self._mask_namespace:
                        mask_feature_key = field_name+"/"+self._mask_namespace
                        if mask_feature_key in features:
                            input_mask = features[field_name+"/"+self._mask_namespace]
                        else:
                            logger.warning("The mask namespace %s with field name %s is not in features (%s)"
                                           % (self._mask_namespace, field_name, mask_feature_key))
                    if input_mask is None:
                        input_length, input_mask = nn.length(input_ids)
                    else:
                        input_length, _ = nn.length(input_ids)
                    model = BertModel(
                        config=self._bert_config,
                        is_training=is_training,
                        input_ids=input_ids,
                        input_mask=input_mask,
                        use_one_hot_embeddings=self._use_one_hot_embeddings)

                    embedding_output = model.get_sequence_output()

                    if self._remove_bos_eos:
                        embedding_output = nn.remove_bos_eos(embedding_output, input_length)

                    dropout_rate = params.get('dropout_rate')
                    if dropout_rate is None:
                        dropout_rate = self._dropout_rate
                    emb_drop = tf.layers.dropout(embedding_output, dropout_rate, training=is_training)
                    if self._projection_dim:
                        emb_drop = tf.layers.dense(emb_drop, self._projection_dim, use_bias=False,
                                                   kernel_initializer=initializers.xavier_initializer())
                    outputs[feature_key] = emb_drop
        return outputs
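
For context, the forward pass above expects feature keys of the form "<field_name>/<namespace>"; a hypothetical features dict for a sentence-pair task might look like:

    features = {
        'premise/tokens': premise_ids,        # int ids, shape [batch, premise_len]
        'hypothesis/tokens': hypothesis_ids,  # int ids, shape [batch, hypothesis_len]
    }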
Example #9
def extract_available_modes(saved_model_loader):
    """Return list of modes found in SavedModel."""
    available_modes = []
    logger.info('Checking available modes.')
    for mode in [
            tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL,
            tf.estimator.ModeKeys.PREDICT
    ]:
        try:
            get_meta_graph_def_for_mode(saved_model_loader, mode)
        except RuntimeError:
            logger.warning('%s mode not found in SavedModel.' % mode)
            continue

        if get_signature_def_for_mode(saved_model_loader, mode) is not None:
            available_modes.append(mode)

    logger.info('Available modes: %s' % available_modes)
    return available_modes
Example #10
    def load_from_files(self, directory):
        if not os.path.exists(directory):
            logger.warning("Vocabulary directory %s does not exist.", directory)
            return False
        namespaces_file = os.path.join(directory, NAMESPACE_PADDING_FILE)
        if not os.path.exists(namespaces_file):
            logger.warning("Vocabulary namespaces file %s does not exist", namespaces_file)
            return False

        vocab_filenames = [filename for filename in os.listdir(directory)
                            if filename.startswith(VOCAB_FILE[:6]) and filename.endswith(VOCAB_FILE[-4:])]
        if len(vocab_filenames) == 0:
            logger.warning("No vocabulary files found in directory %s.", directory)

        self._non_padded_namespaces = load_from_txt(namespaces_file)

        for vocab_filename in vocab_filenames:
            namespace = vocab_filename[6:-4]
            vocab_namespace_file = os.path.join(directory, vocab_filename)
            self._namespace_to_path[namespace] = vocab_namespace_file
            vocab_namespace = load_from_txt(vocab_namespace_file)
            self._index_to_token[namespace] = dict((index, token) for index, token in enumerate(vocab_namespace))
            self._token_to_index[namespace] = dict((token, index) for index, token in enumerate(vocab_namespace))

        if self.valid():
            return True
        else:
            raise ConfigureError("Vocabulary valid error")
Example #11
    def forward(self, features, labels, mode, params):
        outputs = dict()
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        for (feature_key, feature) in features.items():
            if '/' not in feature_key:
                continue
            feature_namespace = feature_key.split("/")[1].strip()
            if feature_namespace == self._vocab_namespace:
                with tf.variable_scope("embedding/" + self._vocab_namespace,
                                       reuse=tf.AUTO_REUSE):
                    if self._weight is None:
                        if not self._trainable:
                            logger.warning(
                                "No pretrained embedding is assigned, so the randomly "
                                "initialized embedding should be trainable."
                            )
                        logger.debug("loading random embedding.")
                        if self._padding_zero:
                            word_embeddings = tf.get_variable(
                                "embedding_weight",
                                shape=(self._num_embeddings - 1,
                                       self._embedding_dim),
                                initializer=initializers.xavier_initializer(),
                                trainable=self._trainable)
                            pad_embeddings = tf.constant(np.zeros(
                                [1, self._embedding_dim]),
                                                         dtype=tf.float32)
                            self._embeddings = tf.concat(
                                [pad_embeddings, word_embeddings], axis=0)
                        else:
                            self._embeddings = tf.get_variable(
                                "embedding_weight",
                                shape=(self._num_embeddings,
                                       self._embedding_dim),
                                initializer=initializers.xavier_initializer(),
                                trainable=self._trainable)
                    else:
                        if self._weight.shape != (self._num_embeddings,
                                                  self._embedding_dim):
                            raise ConfigureError(
                                "The embedding layer is configured with shape (%s, %s), "
                                "but the pretrained embedding has shape %s." %
                                (self._num_embeddings, self._embedding_dim,
                                 self._weight.shape))
                        logger.debug(
                            "loading pretrained embedding with trainable %s." %
                            self._trainable)
                        if self._padding_zero:
                            word_embeddings = tf.get_variable(
                                "embedding_weight",
                                initializer=self._weight[1:, :],
                                trainable=self._trainable)
                            pad_embeddings = tf.constant(np.zeros(
                                [1, self._embedding_dim]),
                                                         dtype=tf.float32)
                            self._embeddings = tf.concat(
                                [pad_embeddings, word_embeddings], axis=0)
                        else:
                            self._embeddings = tf.get_variable(
                                "embedding_weight",
                                initializer=self._weight,
                                trainable=self._trainable)
                            # tf.Variable(self._weight, trainable=self._trainable, name='embedding_weight')
                    emb = tf.nn.embedding_lookup(self._embeddings, feature)

                    dropout_rate = params.get('dropout_rate')
                    if dropout_rate is None:
                        dropout_rate = self._dropout_rate
                    emb_drop = tf.layers.dropout(emb,
                                                 dropout_rate,
                                                 training=is_training)
                    if self._projection_dim:
                        emb_drop = tf.layers.dense(
                            emb_drop,
                            self._projection_dim,
                            use_bias=False,
                            kernel_initializer=initializers.xavier_initializer(
                            ))
                    outputs[feature_key] = emb_drop
        return outputs
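
A minimal numpy sketch of the padding_zero branch above: row 0 (the padding index) is pinned to a zero vector while the remaining rows stay trainable; sizes are made up:

    import numpy as np

    embedding_dim = 4
    word_embeddings = np.random.randn(9, embedding_dim)  # the trainable part
    pad_embeddings = np.zeros((1, embedding_dim))         # constant padding row
    embeddings = np.concatenate([pad_embeddings, word_embeddings], axis=0)
    assert (embeddings[0] == 0).all()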
Example #12
    def forward(self, features, labels, mode, params):
        if self._sim_func != 'tensor' and self._num_tensor_dim != 1:
            self._num_tensor_dim = 1
            logger.warning(
                "The similarity function is not the tensor layer, so the number "
                "of tensor dims has no effect and is reset to 1."
            )
        features_embedding = self._embedding_mapping.forward(
            features, labels, mode, params)
        with tf.variable_scope(self._model_name):
            is_training = (mode == tf.estimator.ModeKeys.TRAIN)

            premise_tokens_ids = features.get('premise/tokens', None)
            if premise_tokens_ids is None:
                premise_tokens_ids = features.get('premise/elmo_characters',
                                                  None)
            hypothesis_tokens_ids = features.get('hypothesis/tokens', None)
            if hypothesis_tokens_ids is None:
                hypothesis_tokens_ids = features.get(
                    'hypothesis/elmo_characters', None)

            if premise_tokens_ids is None:
                raise ConfigureError(
                    "The input features should contain premise with vocabulary namespace tokens "
                    "or elmo_characters.")
            if hypothesis_tokens_ids is None:
                raise ConfigureError(
                    "The input features should contain hypothesis with vocabulary namespace tokens "
                    "or elmo_characters.")

            prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
            hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids)
            if features.get(
                    'premise/elmo_characters', None) is not None or isinstance(
                        self._embedding_mapping.get_encoder('tokens'), Bert):
                prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
                prem_seq_lengths -= 2
            if features.get('hypothesis/elmo_characters',
                            None) is not None or isinstance(
                                self._embedding_mapping.get_encoder('tokens'),
                                Bert):
                hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths)
                hyp_seq_lengths -= 2
            prem_mask = tf.expand_dims(prem_mask, -1)
            hyp_mask = tf.expand_dims(hyp_mask, -1)
            prem_hyp_mask = tf.matmul(prem_mask, hyp_mask, transpose_b=True)

            premise_tokens = features_embedding.get('premise/tokens', None)
            if premise_tokens is None:
                premise_tokens = features_embedding.get(
                    'premise/elmo_characters', None)
            hypothesis_tokens = features_embedding.get('hypothesis/tokens',
                                                       None)
            if hypothesis_tokens is None:
                hypothesis_tokens = features_embedding.get(
                    'hypothesis/elmo_characters', None)

            premise_outs, c1 = nn.bi_lstm(premise_tokens,
                                          self._hidden_dim,
                                          seq_len=prem_seq_lengths,
                                          name='premise')
            hypothesis_outs, c2 = nn.bi_lstm(hypothesis_tokens,
                                             self._hidden_dim,
                                             seq_len=hyp_seq_lengths,
                                             name='hypothesis')
            premise_bi = tf.concat(premise_outs, axis=2)
            hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

            max_premise_length = premise_tokens.shape[1].value
            max_hypothesis_length = hypothesis_tokens.shape[1].value

            if self._sim_func == 'tensor':
                M = tf.Variable(
                    tf.random_normal([
                        self._num_tensor_dim, 2 * self._hidden_dim,
                        2 * self._hidden_dim
                    ],
                                     stddev=0.1))
                W = tf.Variable(
                    tf.random_normal([4 * self._hidden_dim, 1], stddev=0.1))
                bias = tf.Variable(tf.zeros([1]), name="tensor_bias")
                premise_ex = tf.tile(tf.expand_dims(premise_bi, axis=2),
                                     [1, 1, max_hypothesis_length, 1])
                hypothesis_ex = tf.tile(tf.expand_dims(hypothesis_bi, axis=1),
                                        [1, max_premise_length, 1, 1])
                tensor = []
                tmp2 = tf.einsum("abcd,df->abcf",
                                 tf.concat([premise_ex, hypothesis_ex],
                                           axis=3), W)  # [N, L1, L2, 1]
                tmp2 = tf.squeeze(tmp2, axis=3)
                for i in range(self._num_tensor_dim):
                    tmp1 = tf.einsum("abc,cd->abd", premise_bi,
                                     M[i])  # [N, L1, 2d]
                    tmp1 = tf.matmul(tmp1, hypothesis_bi,
                                     transpose_b=True)  # [N, L1, L2]
                    tensor.append(tf.nn.relu(tmp1 + tmp2 + bias))
                tensor = tf.concat([tensor], axis=0)
            elif self._sim_func == 'cosine':
                tensor = tf.matmul(tf.nn.l2_normalize(premise_bi, axis=-1),
                                   tf.nn.l2_normalize(hypothesis_bi, axis=-1),
                                   transpose_b=True)  # [N, L1, L2]
            elif self._sim_func == 'bilinear':
                M = tf.Variable(
                    tf.random_normal(
                        [2 * self._hidden_dim, 2 * self._hidden_dim],
                        stddev=0.1))
                b = tf.Variable(
                    tf.random_normal(
                        [max_premise_length, max_hypothesis_length],
                        stddev=0.1))
                bilinear = tf.einsum("abc,cd->abd", premise_bi,
                                     M)  # [N, L1, 2d]
                tensor = tf.matmul(bilinear, hypothesis_bi,
                                   transpose_b=True) + b  # [N, L1, L2]
            else:
                raise ConfigureError(
                    "The similarity function %s is not supported. "
                    "The mvlstm only supports the similarity functions [cosine, bilinear, tensor]."
                    % self._sim_func)

            tensor *= prem_hyp_mask
            # 3.1 k-Max Pooling
            matrix_in = tf.reshape(
                tensor, [-1, max_premise_length * max_hypothesis_length])
            values, indices = tf.nn.top_k(matrix_in,
                                          k=self._num_k,
                                          sorted=False)
            kmax = tf.reshape(values, [-1, self._num_tensor_dim * self._num_k])

            # MLP layer
            h_mlp_1 = tf.contrib.layers.fully_connected(kmax,
                                                        self._num_tensor_dim *
                                                        self._num_k,
                                                        scope='fc1')
            h_mlp_1_drop = tf.layers.dropout(h_mlp_1,
                                             self._dropout_rate,
                                             training=is_training)
            h_mlp_2 = tf.contrib.layers.fully_connected(h_mlp_1_drop,
                                                        self._num_tensor_dim *
                                                        self._num_k // 2,
                                                        scope='fc2')

            # Dropout applied to classifier
            h_drop = tf.layers.dropout(h_mlp_2,
                                       self._dropout_rate,
                                       training=is_training)
            # Get prediction
            output_dict = self._make_output(h_drop, params)

            if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
                if 'label/labels' not in features:
                    raise ConfigureError(
                        "The input features should contain label with vocabulary namespace "
                        "labels in %s dataset." % mode)
                labels_embedding = features_embedding['label/labels']
                labels = features['label/labels']

                loss = self._make_loss(labels=labels_embedding,
                                       logits=output_dict['logits'],
                                       params=params)
                output_dict['loss'] = loss
                metrics = dict()
                metrics['accuracy'] = tf.metrics.accuracy(
                    labels=labels, predictions=output_dict['predictions'])
                metrics['precision'] = tf.metrics.precision(
                    labels=labels, predictions=output_dict['predictions'])
                metrics['recall'] = tf.metrics.recall(
                    labels=labels, predictions=output_dict['predictions'])

                #tf.metrics.auc(labels=labels, predictions=predictions)
                output_dict['metrics'] = metrics
                # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi,
                #                          premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits]
            return output_dict
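
A minimal numpy sketch of the k-max pooling step above: the L1 x L2 interaction matrix is flattened per example and only the num_k largest scores are kept (with sorted=False the order inside the top-k is not guaranteed); values are made up:

    import numpy as np

    num_k = 3
    sim = np.array([[0.1, 0.9, 0.3],
                    [0.7, 0.2, 0.5]])        # hypothetical 2 x 3 interaction matrix
    flat = sim.reshape(1, -1)                # [1, L1 * L2]
    kmax = np.sort(flat, axis=1)[:, -num_k:] # -> [[0.5, 0.7, 0.9]]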