Example #1
    def _make_parameters(self):
        super()._make_parameters()
        TokenEmbedder.make_parameters('token', self.parameters, self.metadata, self.hyperparameters)

        seq_layer_type = self.hyperparameters['seq_layer_type'].lower()
        if seq_layer_type == 'birnn':
            self.parameters['seq_layers'] = [
                tf.keras.layers.Bidirectional(tf.keras.layers.GRU(self.hyperparameters['birnn_hidden_size'], return_sequences=True))
                for _ in range(self.hyperparameters['num_seq_layers'])
            ]
            out_size = 2 * self.hyperparameters['birnn_hidden_size']
        elif seq_layer_type == 'selfatt':
            self.parameters['seq_layers'] = [
                lambda inputs: transformer_model(
                    inputs,
                    hidden_size=self.hyperparameters['tranformer_hidden_size'],
                    num_hidden_layers=1,
                    num_attention_heads=self.hyperparameters['tranformer_num_attention_heads'],
                    intermediate_size=self.hyperparameters['tranformer_intermediate_size'])
                for _ in range(self.hyperparameters['num_seq_layers'])
            ]
            out_size = self.hyperparameters['tranformer_hidden_size']
        else:
            raise ValueError('Unrecognized sequential layer type: %s' % seq_layer_type)

        self.__type_classification._make_parameters(representation_size=out_size)
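
A minimal sketch of what the 'birnn' branch above produces, assuming TensorFlow 2.x eager execution and hypothetical sizes; it only illustrates why out_size is 2 * birnn_hidden_size:

    import tensorflow as tf

    batch, seq_len, embed_dim, hidden = 4, 7, 16, 32  # hypothetical sizes
    layer = tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(hidden, return_sequences=True))
    x = tf.random.normal((batch, seq_len, embed_dim))
    y = layer(x)
    print(y.shape)  # (4, 7, 64): forward and backward outputs are concatenated,
                    # hence out_size = 2 * birnn_hidden_size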
Example #2
 def _init_metadata(hyperparameters: Dict[str, Any],
                    raw_metadata: Dict[str, Any]) -> None:
     super(Sequence2HybridMetric,
           Sequence2HybridMetric)._init_metadata(hyperparameters,
                                                 raw_metadata)
     TokenEmbedder.init_metadata('token', raw_metadata, hyperparameters)
     TypeClassificationModel._init_metadata(hyperparameters, raw_metadata)
Example #3
 def _init_minibatch(self, batch_data: Dict[str, Any]) -> None:
     super()._init_minibatch(batch_data)
     TokenEmbedder.init_minibatch('token', batch_data, self.hyperparameters)
     batch_data['batch_sequence_lengths'] = []
     batch_data['batch_variable_token_idxs'] = []
     batch_data['batch_variable_idxs'] = []
     self.__type_classification._init_minibatch(batch_data)
Example #4
 def _finalise_metadata(
         self, raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
     final_metadata = super()._finalise_metadata(raw_metadata_list)
     TokenEmbedder.finalise_metadata('token', raw_metadata_list,
                                     final_metadata, self.hyperparameters)
     self.__type_classification._finalise_metadata(raw_metadata_list,
                                                   final_metadata)
     return final_metadata
Example #5
 def _load_metadata_from_sample(hyperparameters: Dict[str, Any],
                                raw_sample: Dict[str, Any],
                                raw_metadata: Dict[str, Any]) -> None:
     super(Sequence2Metric, Sequence2Metric)._load_metadata_from_sample(
         hyperparameters, raw_sample, raw_metadata)
     TokenEmbedder.load_metadata_from_sample('token', [
         raw_sample['nodes'][i] for i in raw_sample['token-sequence']
         if i < len(raw_sample['nodes'])
     ], raw_metadata, hyperparameters)
Example #6
    def _extend_minibatch_by_sample(self, batch_data: Dict[str, Any], sample: Dict[str, Any]) -> bool:
        super()._extend_minibatch_by_sample(batch_data, sample)

        TokenEmbedder.extend_minibatch_by_sample('token', batch_data, sample, self.hyperparameters)
        batch_data['batch_sequence_lengths'].append(sample['sequence_length'])

        batch_data['batch_variable_token_idxs'].append(sample['variable_token_idxs'])
        batch_data['batch_variable_idxs'].extend(sample['variable_idxs'] + len(batch_data['batch_target_variable_class']))

        self.__type_classification._extend_minibatch_by_sample(batch_data, sample)
        return len(batch_data['batch_sequence_lengths']) >= self.hyperparameters['batch_size']
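
The addition of len(batch_data['batch_target_variable_class']) above shifts each sample's local variable indices by the number of variables already accumulated in the batch, so indices stay unique batch-wide. A toy NumPy illustration of that offsetting, with hypothetical values and a plain counter standing in for batch_data:

    import numpy as np

    batch_variable_idxs = []
    num_variables_so_far = 0  # stands in for len(batch_data['batch_target_variable_class'])
    for sample_variable_idxs in [np.array([0, 0, 1]), np.array([0, 1, 1, 2])]:
        batch_variable_idxs.extend((sample_variable_idxs + num_variables_so_far).tolist())
        num_variables_so_far += int(sample_variable_idxs.max()) + 1
    print(batch_variable_idxs)  # [0, 0, 1, 2, 3, 3, 4]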
Example #7
    def _finalise_minibatch(self, batch_data: Dict[str, Any],
                            is_train: bool) -> Dict[tf.Tensor, Any]:
        minibatch = super()._finalise_minibatch(batch_data, is_train)
        TokenEmbedder.finalise_minibatch('token', batch_data,
                                         self.placeholders, minibatch,
                                         self.hyperparameters, is_train)

        max_sequence_len = max(batch_data['batch_sequence_lengths'])
        batch_size = len(batch_data['batch_sequence_lengths'])
        # TODO: Later, force split very long sequences

        # embedding_gather_matrix contains the indices such that
        # embeddings[embedding_gather_matrix] yields a B x max_sequence_len x D matrix.
        embedding_gather_matrix = np.zeros((batch_size, max_sequence_len),
                                           dtype=np.int32)
        current_idx = 0
        for i, length in enumerate(batch_data['batch_sequence_lengths']):
            embedding_gather_matrix[i, :length] = np.arange(
                start=current_idx, stop=current_idx + length)
            current_idx += length
        write_to_minibatch(minibatch,
                           self.placeholders['embedding_gather_matrix'],
                           embedding_gather_matrix)

        total_variable_tokens = len(batch_data['batch_variable_idxs'])
        variable_token_gather_idxs = np.empty(total_variable_tokens,
                                              dtype=np.int32)
        gathered_tokens_so_far = 0
        for i, tidx in enumerate(batch_data['batch_variable_token_idxs']):
            num_variable_tokens = len(tidx)
            variable_token_gather_idxs[
                gathered_tokens_so_far:gathered_tokens_so_far +
                num_variable_tokens] = tidx + i * max_sequence_len
            gathered_tokens_so_far += num_variable_tokens

        write_to_minibatch(minibatch,
                           self.placeholders['variable_bound_token_ids'],
                           variable_token_gather_idxs)

        write_to_minibatch(minibatch, self.placeholders['sequence_lengths'],
                           batch_data['batch_sequence_lengths'])
        write_to_minibatch(minibatch, self.placeholders['token_variable_ids'],
                           batch_data['batch_variable_idxs'])
        self.__type_metric._finalise_minibatch(batch_data, is_train, minibatch)
        self.__type_classification._finalise_minibatch(batch_data, is_train,
                                                       minibatch)
        minibatch[self.placeholders['num_variables']] = len(
            batch_data['batch_target_variable_type'])

        return minibatch
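
The embedding_gather_matrix built above packs the concatenated per-token embeddings (a T x D matrix, with T the sum of all sequence lengths) into a padded B x max_sequence_len x D tensor; padding positions keep index 0 and are masked out later in _make_model. A NumPy emulation with hypothetical lengths:

    import numpy as np

    lengths = [3, 1, 2]                          # batch_sequence_lengths
    embeddings = np.arange(6 * 2).reshape(6, 2)  # T x D, T = sum(lengths)

    gather = np.zeros((len(lengths), max(lengths)), dtype=np.int32)
    current_idx = 0
    for i, length in enumerate(lengths):
        gather[i, :length] = np.arange(current_idx, current_idx + length)
        current_idx += length

    padded = embeddings[gather]  # B x max_sequence_len x D
    print(padded.shape)          # (3, 3, 2); padding slots reuse row 0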
Example #8
    def _make_model(self, is_train: bool = True):
        super()._make_model(is_train)

        initial_token_embeddings = TokenEmbedder.make_model('token', self.placeholders, self.parameters,
                                                            self.hyperparameters, is_train)  # T x D

        sequence_token_embeddings = tf.gather_nd(params=initial_token_embeddings,
                                                 indices=tf.expand_dims(self.placeholders['embedding_gather_matrix'], axis=-1)
                                                 )  # B x max-len x D

        def get_variable_embeddings(all_sequence_embeddings):
            flat_sequence_embeddings = tf.reshape(all_sequence_embeddings, (-1, all_sequence_embeddings.get_shape()[-1]))  # B*max-len x D
            target_token_embeddings = tf.gather(params=flat_sequence_embeddings,
                                                indices=self.placeholders['variable_bound_token_ids'])

            return tf.unsorted_segment_mean(
                data=target_token_embeddings,
                segment_ids=self.placeholders['token_variable_ids'],
                num_segments=self.placeholders['num_variables']  # TODO: Do not depend in any way on the classes.
            ) # num-variables x H


        # TODO: Add positional encoding for Self-Attention.

        # Multiple layers of BiRNN/Transformer and "consistency" layer.
        current_out = sequence_token_embeddings  # B x max-len x D
        for i, seq_layer in enumerate(self.parameters['seq_layers']):
            # Mask out tokens beyond each sequence's length
            mask = tf.cast(
                tf.reshape(tf.range(tf.shape(sequence_token_embeddings)[1]), [1, -1, 1]) < tf.reshape(self.placeholders['sequence_lengths'], [-1, 1, 1]),
                current_out.dtype
            )
            current_out *= mask

            with tf.variable_scope('seqlayer_%s' % i):
                current_out = seq_layer(current_out)

            if i < len(self.parameters['seq_layers']) - 1 and self.hyperparameters['use_consistency_layer']:
                # Apply "consistency" layer to all layers but the last
                variable_embeddings = get_variable_embeddings(current_out) # num-variables x H

                variable_embedding_per_token = tf.gather(params=variable_embeddings, indices=self.placeholders['token_variable_ids'])  # num-variable-tokens x H

                current_out_shape = tf.shape(current_out)
                updates = tf.scatter_nd(
                    indices=tf.expand_dims(self.placeholders['variable_bound_token_ids'], axis=-1),
                    updates=variable_embedding_per_token,
                    shape=[current_out_shape[0] * current_out_shape[1], tf.shape(current_out)[2]]
                )
                current_out += tf.reshape(updates, tf.shape(current_out))

        variable_embeddings = get_variable_embeddings(current_out)
        self.ops['variable_embeddings'] = variable_embeddings

        self.__type_classification._make_model(variable_embeddings, is_train)
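
get_variable_embeddings above pools all tokens bound to the same variable with tf.unsorted_segment_mean. A NumPy stand-in that computes the same per-variable mean, with hypothetical ids and embeddings:

    import numpy as np

    target_token_embeddings = np.array([[1.0, 0.0],
                                        [3.0, 0.0],
                                        [0.0, 2.0]])
    token_variable_ids = np.array([0, 0, 1])  # tokens 0 and 1 belong to variable 0
    num_variables = 2

    sums = np.zeros((num_variables, target_token_embeddings.shape[1]))
    counts = np.zeros(num_variables)
    np.add.at(sums, token_variable_ids, target_token_embeddings)
    np.add.at(counts, token_variable_ids, 1)
    print(sums / counts[:, None])  # [[2. 0.] [0. 2.]]: mean embedding per variable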
Example #9
    def _make_placeholders(self, is_train: bool) -> None:
        super()._make_placeholders(is_train)
        TokenEmbedder.make_placeholders('token', self.placeholders, hyperparameters=self.hyperparameters)

        self.placeholders['embedding_gather_matrix'] = tf.placeholder(tf.int32, shape=(None, None),
                                                                      name='embedding_gather_matrix')

        self.placeholders['sequence_lengths'] = tf.placeholder(
            tf.int32, shape=(None,), name="sequence_lengths")

        self.placeholders['variable_bound_token_ids'] = tf.placeholder(
            tf.int32, shape=(None,), name="variable_bound_token_ids")
        self.placeholders['token_variable_ids'] = tf.placeholder(
            tf.int32, shape=(None,), name="token_variable_ids")
        self.placeholders['num_variables'] = tf.placeholder(tf.int32, shape=[], name="num_variables")

        self.__type_classification._make_placeholders(is_train)
Example #10
    def _load_data_from_sample(hyperparameters: Dict[str, Any],
                               metadata: Dict[str, Any],
                               raw_sample: Dict[str, Any],
                               result_holder: Dict[str, Any],
                               is_train: bool = True) -> bool:
        keep_sample = super(Sequence2HybridMetric,
                            Sequence2HybridMetric)._load_data_from_sample(
                                hyperparameters, metadata, raw_sample,
                                result_holder, is_train)
        if not keep_sample:
            return False

        token_node_idxs = set(raw_sample['token-sequence'])
        node_idx_to_supernode_idx = {}  # type: Dict[int, int]
        for from_idx, to_idxs in raw_sample['edges']['OCCURRENCE_OF'].items():
            from_idx = int(from_idx)
            if from_idx not in token_node_idxs:
                # Some supernodes do not have an associated token (such nodes are
                # attributes); fall back to their rightmost CHILD token.
                if str(from_idx) in raw_sample['edges']['CHILD']:
                    right_token_idx = max(
                        raw_sample['edges']['CHILD'][str(from_idx)])
                    assert right_token_idx in token_node_idxs
                    from_idx = right_token_idx
                else:
                    continue
            for to_idx in to_idxs:
                node_idx_to_supernode_idx[from_idx] = to_idx

        supernodes_with_related_nodes = set(node_idx_to_supernode_idx.values())

        variable_types = []  # type: List[str]
        variable_type_idxs = []  # type: List[int]
        ignored_supernodes = set()
        supernode_idxs_to_annotated_variable_idx = {}  # type: Dict[int, int]
        for node_idx, supernode_data in raw_sample['supernodes'].items():
            node_idx = int(node_idx)
            annotation = supernode_data['annotation']
            if ignore_type_annotation(annotation) and is_train:
                ignored_supernodes.add(node_idx)
                continue
            if node_idx not in supernodes_with_related_nodes:
                ignored_supernodes.add(node_idx)
                continue

            variable_idx = len(supernode_idxs_to_annotated_variable_idx)
            variable_types.append(annotation)
            variable_type_idxs.append(
                TypeClassificationModel._get_idx_for_type(
                    annotation, metadata, hyperparameters))
            supernode_idxs_to_annotated_variable_idx[node_idx] = variable_idx

        if len(variable_types) == 0:
            return False

        token_idx, variable_idx = [], []

        def create_token_sequence():
            for i, node_idx in enumerate(raw_sample['token-sequence']):
                supernode_idx = node_idx_to_supernode_idx.get(node_idx)
                if supernode_idx is not None:
                    annotated_variable_idxs = supernode_idxs_to_annotated_variable_idx.get(
                        supernode_idx)
                    if annotated_variable_idxs is not None:
                        token_idx.append(i)
                        variable_idx.append(annotated_variable_idxs)
                yield raw_sample['nodes'][node_idx]

        token_sequence = list(create_token_sequence())
        if len(token_sequence) > hyperparameters['max_seq_len']:
            return False

        # Did we see at least one token per variable?
        assert len(np.unique(variable_idx)) == len(variable_types)

        TokenEmbedder.load_data_from_sample('token', metadata, token_sequence,
                                            result_holder, hyperparameters,
                                            is_train)

        result_holder['sequence_length'] = len(token_sequence)
        result_holder['variable_token_idxs'] = np.array(token_idx,
                                                        dtype=np.uint32)
        result_holder['variable_idxs'] = np.array(variable_idx,
                                                  dtype=np.uint32)
        result_holder['target_type'] = variable_types
        result_holder['variable_target_class'] = np.array(variable_type_idxs,
                                                          dtype=np.uint32)
        result_holder['ignored_supernodes'] = ignored_supernodes
        return keep_sample
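
The OCCURRENCE_OF loop near the top of this example inverts the edge direction: each token node is mapped to the supernode (variable) it is an occurrence of, and a supernode without a token of its own falls back to its rightmost CHILD token. A self-contained toy run over a hypothetical sample:

    raw_sample = {
        'token-sequence': [0, 1, 2],
        'edges': {
            'OCCURRENCE_OF': {'1': [10], '5': [11]},  # node 5 is not a token node
            'CHILD': {'5': [0, 2]},                   # its rightmost child, node 2, is
        },
    }
    token_node_idxs = set(raw_sample['token-sequence'])
    node_idx_to_supernode_idx = {}
    for from_idx, to_idxs in raw_sample['edges']['OCCURRENCE_OF'].items():
        from_idx = int(from_idx)
        if from_idx not in token_node_idxs:
            if str(from_idx) in raw_sample['edges']['CHILD']:
                from_idx = max(raw_sample['edges']['CHILD'][str(from_idx)])
            else:
                continue
        for to_idx in to_idxs:
            node_idx_to_supernode_idx[from_idx] = to_idx
    print(node_idx_to_supernode_idx)  # {1: 10, 2: 11}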
Example #11
 def _load_metadata_from_sample(hyperparameters: Dict[str, Any],
                                raw_sample: Dict[str, Any],
                                raw_metadata: Dict[str, Any]) -> None:
     super(Sequence2Annotation,
           Sequence2Annotation)._load_metadata_from_sample(
               hyperparameters, raw_sample, raw_metadata)
     TokenEmbedder.load_metadata_from_sample('token', [
         raw_sample['nodes'][i] for i in raw_sample['token-sequence']
         if i < len(raw_sample['nodes'])
     ], raw_metadata, hyperparameters)
     TypeClassificationModel._load_metadata_from_sample(
         hyperparameters, raw_sample, raw_metadata)
Example #12
 def _init_metadata(hyperparameters: Dict[str, Any],
                    raw_metadata: Dict[str, Any]) -> None:
     super(Sequence2Metric,
           Sequence2Metric)._init_metadata(hyperparameters, raw_metadata)
     TokenEmbedder.init_metadata('token', raw_metadata, hyperparameters)