Example #1
File: helper.py Project: braaannigan/ernie
# Assumed imports (not shown in the snippet). Note that tokenizer.max_len
# comes from pre-4.0 releases of transformers; it was later renamed
# model_max_length.
from tensorflow import TensorShape, data, int32, int64


def get_features(tokenizer, sentences, labels):
    features = []
    for i, sentence in enumerate(sentences):
        # Tokenize and add the model's special tokens (e.g. [CLS]/[SEP]).
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=tokenizer.max_len)
        input_ids = inputs['input_ids']
        token_type_ids = inputs['token_type_ids']
        padding_length = tokenizer.max_len - len(input_ids)

        # Pad every sequence out to tokenizer.max_len, marking the padding
        # positions with 0 in the attention mask.
        if tokenizer.padding_side == 'right':
            attention_mask = [1] * len(input_ids) + [0] * padding_length
            input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
            token_type_ids = (token_type_ids +
                              [tokenizer.pad_token_type_id] * padding_length)
        else:
            attention_mask = [0] * padding_length + [1] * len(input_ids)
            input_ids = [tokenizer.pad_token_id] * padding_length + input_ids
            token_type_ids = ([tokenizer.pad_token_type_id] * padding_length +
                              token_type_ids)

        assert (tokenizer.max_len == len(attention_mask) ==
                len(input_ids) == len(token_type_ids)), \
            f'{tokenizer.max_len}, {len(attention_mask)}, ' \
            f'{len(input_ids)}, {len(token_type_ids)}'

        feature = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'label': int(labels[i])
        }

        features.append(feature)

    def gen():
        for feature in features:
            yield (
                {
                    'input_ids': feature['input_ids'],
                    'attention_mask': feature['attention_mask'],
                    'token_type_ids': feature['token_type_ids'],
                },
                feature['label'],
            )

    # Every sequence is already padded to a fixed length, but the shapes are
    # declared as TensorShape([None]) so variable lengths would also be
    # accepted. Newer TF versions prefer the output_signature argument over
    # the (output_types, output_shapes) pair used here.
    dataset = data.Dataset.from_generator(
        gen,
        ({
            'input_ids': int32,
            'attention_mask': int32,
            'token_type_ids': int32
        }, int64),
        (
            {
                'input_ids': TensorShape([None]),
                'attention_mask': TensorShape([None]),
                'token_type_ids': TensorShape([None]),
            },
            TensorShape([]),
        ),
    )

    return dataset
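A minimal sketch of how get_features might be called; the model name, sentences, and labels below are illustrative, and it assumes a pre-4.0 transformers release where the tokenizer still exposes max_len:

from transformers import BertTokenizer

# Hypothetical inputs, not from the original project.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sentences = ['a glowing review', 'a scathing review']
labels = [1, 0]

dataset = get_features(tokenizer, sentences, labels)
for inputs, label in dataset.batch(2).take(1):
    print(inputs['input_ids'].shape, label)  # (2, max_len) and the two labels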
Example #2
def _assert_compatible_shape(shape: tf.TensorShape, example_shape):
    # TensorShape.is_compatible_with treats unknown dimensions (None) as
    # matching anything, so only genuinely conflicting shapes are rejected.
    if not shape.is_compatible_with(example_shape):
        raise ValueError(f"example shape {example_shape} is incompatible with "
                         f"feature shape {shape}")
Example #3
def _hash_dtype_and_shape(dtype: tf.DType, shape: tf.TensorShape) -> int:
    if shape.rank is not None:
        # as_list() raises ValueError on shapes of unknown rank, so only
        # call it when the rank is known.
        return hash((dtype.name, tuple(shape.as_list())))
    return hash((dtype.name, None))
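A short sketch of the fallback, with illustrative values:

import tensorflow as tf

# Known (or partially known) rank: hash the dimension tuple.
h1 = _hash_dtype_and_shape(tf.float32, tf.TensorShape([None, 128]))

# Unknown rank: TensorShape(None).rank is None, so the function hashes
# (dtype, None) instead of calling as_list(), which would raise.
h2 = _hash_dtype_and_shape(tf.float32, tf.TensorShape(None))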
Example #4
def _hash_dtype_and_shape(dtype: tf.DType, shape: tf.TensorShape) -> int:
    # Simpler variant of example #3: raises ValueError if the shape has
    # unknown rank, since as_list() needs a known rank.
    return hash((dtype.name, tuple(shape.as_list())))
Example #5
File: res.py Project: MilesGrey/emd
def compute_output_shape(self, input_shape):
    # Spatial dimensions shrink by the stride; the channel count grows to
    # filters * k.
    return TensorShape(
        (input_shape[0], input_shape[1] // self.stride,
         input_shape[2] // self.stride, self.filters * self.k))
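Illustrative only: the layer class itself is not shown, so a stand-in object carrying the attributes the method reads (stride, filters, k) is used here:

from types import SimpleNamespace
from tensorflow import TensorShape

layer = SimpleNamespace(stride=2, filters=64, k=4)
print(compute_output_shape(layer, TensorShape([None, 32, 32, 3])))
# (None, 16, 16, 256)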
Example #6
def compute_output_shape(self, input_shape):
    # Output is (batch_size, number_of_classes), as for a classification head.
    return TensorShape((input_shape[0], self.number_of_classes))
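Again illustrative, with a stand-in carrying the one attribute the method reads:

from types import SimpleNamespace
from tensorflow import TensorShape

layer = SimpleNamespace(number_of_classes=10)
print(compute_output_shape(layer, TensorShape([None, 2048])))  # (None, 10)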