Example #1
# Imports assume the TF 2.3/2.4-era experimental preprocessing API
# (the snippet relies on StringLookup.vocab_size()).
import pathlib
import string

import tensorflow as tf
from tensorflow.data import TextLineDataset
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.strings import unicode_split


class DataManager:
    def __init__(self, log_dir):
        self.log_dir = log_dir
        self.START_TOKEN = '[SOS]'
        self.END_TOKEN = '[EOS]'
        # Character-level vocabulary: all printable ASCII plus the two special tokens
        self.vocab = sorted(set(string.printable)) + [self.START_TOKEN, self.END_TOKEN]
        self.chars_to_ids = StringLookup(vocabulary=self.vocab)
        self.vocab_size = self.chars_to_ids.vocab_size()

    def load_dataset(self):
        # Each line of file_names.txt is the basename of one svg/png pair
        ds = TextLineDataset(str(pathlib.Path(self.log_dir, 'file_names.txt')))
        ds = ds.take(5)
        ds = ds.map(self.parse_svg_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        # Pad each sequence in the batch to the longest one
        ds = ds.padded_batch(2, drop_remainder=True)

        return ds

    def parse_svg_img(self, file_name):
        svg_path = tf.strings.join([self.log_dir, '/svgs/', file_name, '.svg'])
        img_path = tf.strings.join([self.log_dir, '/imgs/', file_name, '.png'])

        # Read the SVG source, wrap it in start/end tokens, map characters to ids
        svg = tf.io.read_file(svg_path)
        svg = tf.concat([[self.START_TOKEN], unicode_split(svg, 'UTF-8'), [self.END_TOKEN]], axis=0)
        svg = self.chars_to_ids(svg)

        # Read the rendered raster image and scale pixel values to [0, 1]
        img = tf.io.read_file(img_path)
        img = tf.io.decode_png(img, channels=3)
        img = tf.cast(img, tf.float32)
        img = img / 255.0

        return (svg, img), svg
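
A minimal usage sketch for this snippet; the directory layout and the path 'logs/run1' are assumptions, not from the original:

manager = DataManager('logs/run1')
dataset = manager.load_dataset()

# Each element is ((padded character ids, image batch), target character ids)
for (svg_ids, images), targets in dataset:
    print(svg_ids.shape, images.shape)
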
Example #2
def load_data():
    # Imports assume the TF 2.3/2.4-era experimental preprocessing API
    # (max_tokens, vocab_size)
    import numpy as np
    import pandas as pd
    import tensorflow as tf
    from sklearn.model_selection import train_test_split
    from tensorflow.keras import Input, Model, utils
    from tensorflow.keras.layers import Concatenate
    from tensorflow.keras.layers.experimental.preprocessing import (
        CategoryEncoding, Normalization, StringLookup)

    # Load the Titanic training CSV into a dataframe
    data = pd.read_csv(
        "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
    labels = data.pop('survived')
    label_names = ["Not survived", "Survived"]
    features = {}

    # Create a symbolic Keras Input for each CSV column

    for name, column in data.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32
        features[name] = Input(shape=(1, ), name=name, dtype=dtype)

    # Extracting and normalizing numeric features
    numeric_features = {
        name: feature
        for name, feature in features.items() if feature.dtype == tf.float32
    }

    x = Concatenate()(list(numeric_features.values()))
    norm = Normalization()
    norm.adapt(np.array(data[numeric_features.keys()]))
    all_numeric = norm(x)

    processed_features = [all_numeric]
    # Extracting and one-hot encoding non-numeric features

    for name, feature in features.items():
        if feature.dtype == tf.float32:
            continue
        word = StringLookup(vocabulary=np.unique(data[name]))
        one_hot = CategoryEncoding(max_tokens=word.vocab_size())

        x = word(feature)
        x = one_hot(x)
        processed_features.append(x)

    processed_features = Concatenate()(processed_features)
    preprocessing_model = Model(features, processed_features)

    utils.plot_model(model=preprocessing_model,
                     rankdir='LR',
                     dpi=72,
                     show_shapes=True)

    feature_dict = {name: np.array(value) for name, value in data.items()}

    train_features, test_features, train_labels, test_labels = train_test_split(
        preprocessing_model(feature_dict).numpy(), labels, test_size=0.2)
    return train_features, train_labels, test_features, test_labels
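
A minimal sketch of consuming the returned arrays; the classifier architecture here is an arbitrary illustration, not part of the original:

import tensorflow as tf

train_features, train_labels, test_features, test_labels = load_data()

# Small binary classifier over the preprocessed feature matrix
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(train_features, train_labels, epochs=5,
          validation_data=(test_features, test_labels))
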
Example #3
    def _encode_categorical_feature(
        feature: KerasTensor,
        name: str,
        dataset: Optional[BatchDataset],
    ) -> KerasTensor:
        """One-hot encode categorical features.

        Args:
            feature: The input layer of the feature.
            name: The feature's name (its column name in the original dataframe).
            dataset: The training data; if not given, a placeholder tensor is returned.

        Returns:
            The one-hot encoded tensor of the input feature.

        """
        # Return a generic placeholder tensor for the tuner initialization
        if not dataset:
            return KerasTensor(type_spec=TensorSpec(
                shape=(None, 1), dtype=tf.float32, name=None))

        # Create a StringLookup layer which will turn strings into integer indices
        index = StringLookup()

        # Prepare a Dataset that only yields our feature
        feature_ds = dataset.map(lambda x, y: x[name])
        feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

        # Learn the set of possible string values and assign them a fixed integer index
        index.adapt(feature_ds)

        # Turn the string input into integer indices
        encoded_feature = index(feature)

        # Create a CategoryEncoding for our integer indices
        encoder = CategoryEncoding(output_mode="binary")

        # Learn the space of possible indices
        encoder.adapt(np.arange(index.vocab_size()))

        # Apply one-hot encoding to our indices
        encoded_feature = encoder(encoded_feature)

        return encoded_feature
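
A hedged sketch of how this helper might be wired up inside its class; the column name 'sex' and the dataset variable train_ds are hypothetical:

# Inside the surrounding class, with train_ds a BatchDataset of (features, label)
sex_input = tf.keras.Input(shape=(1,), name='sex', dtype='string')
sex_encoded = self._encode_categorical_feature(sex_input, 'sex', train_ds)
# sex_encoded is a one-hot float tensor, ready to concatenate with other features
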
Example #4
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.python.keras.utils import tf_utils

# lower_case, upper_case, title_case and char_category are custom string ops
# provided elsewhere in this project (its text-ops module).


class WordShape(tf.keras.layers.Layer):
    SHAPE_HAS_CASE = 1
    SHAPE_LOWER_CASE = 2
    SHAPE_UPPER_CASE = 4
    SHAPE_TITLE_CASE = 8
    SHAPE_MIXED_CASE = 16
    SHAPE_ALL_CASES = SHAPE_HAS_CASE | SHAPE_LOWER_CASE | SHAPE_UPPER_CASE | SHAPE_TITLE_CASE | SHAPE_MIXED_CASE

    # Mean and std of token length from Universal Dependencies and large Russian POS corpora
    # Tokens (split_words): 3.057 and 3.118
    # Words: 4.756 and 3.453
    SHAPE_LENGTH_NORM = 32

    SHAPE_LEFT_SAME = 64
    SHAPE_RIGHT_SAME = 128
    SHAPE_LEFT2_SAME = 256
    SHAPE_RIGHT2_SAME = 512
    SHAPE_ALL_SAME = SHAPE_LEFT_SAME | SHAPE_RIGHT_SAME | SHAPE_LEFT2_SAME | SHAPE_RIGHT2_SAME

    SHAPE_CHAR_CAT_FIRST = 1024
    SHAPE_CHAR_CAT_LAST = 2048
    SHAPE_CHAR_CAT_BOTH = SHAPE_CHAR_CAT_FIRST | SHAPE_CHAR_CAT_LAST

    SHAPE_ALL = SHAPE_ALL_CASES | SHAPE_LENGTH_NORM | SHAPE_ALL_SAME | SHAPE_CHAR_CAT_BOTH

    def __init__(self,
                 options,
                 mean_len=3.906,
                 std_len=3.285,
                 char_embed=5,
                 *args,
                 **kwargs):
        super(WordShape, self).__init__(*args, **kwargs)
        self.input_spec = tf.keras.layers.InputSpec(dtype='string')
        self._supports_ragged_inputs = True

        if not options:
            raise ValueError('At least one shape option should be selected')

        self.options = options
        self.mean_len = mean_len
        self.std_len = std_len

    @tf_utils.shape_type_conversion
    def build(self, input_shape):
        if self.options & WordShape.SHAPE_CHAR_CAT_FIRST or self.options & WordShape.SHAPE_CHAR_CAT_LAST:
            category_vocab = [
                'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Me', 'Mc', 'Nd', 'Nl',
                'No', 'Zs', 'Zl', 'Zp', 'Cc', 'Cf', 'Co', 'Cs', 'Pd', 'Ps',
                'Pe', 'Pc', 'Po', 'Sm', 'Sc', 'Sk', 'So', 'Pi', 'Pf'
            ]
            self.cat_lookup = StringLookup(num_oov_indices=0,
                                           oov_token='Cn',
                                           vocabulary=category_vocab)
            if self.cat_lookup.vocab_size() != 30:
                raise ValueError('Wrong vocabulary size')

        super(WordShape, self).build(input_shape)

    def call(self, inputs, **kwargs):
        outputs_one, outputs_many = [], []

        # Case
        any_case = self.SHAPE_HAS_CASE | self.SHAPE_LOWER_CASE | self.SHAPE_UPPER_CASE | self.SHAPE_TITLE_CASE | \
                   self.SHAPE_MIXED_CASE
        if self.options & any_case:
            inputs_lower = lower_case(inputs)
            inputs_upper = upper_case(inputs)
            has_case = tf.not_equal(inputs_lower, inputs_upper)

        if self.options & self.SHAPE_HAS_CASE:
            outputs_one.append(has_case)

        if self.options & self.SHAPE_LOWER_CASE or self.options & self.SHAPE_MIXED_CASE:
            is_lower = tf.logical_and(has_case, tf.equal(inputs, inputs_lower))
        if self.options & self.SHAPE_LOWER_CASE:
            outputs_one.append(is_lower)

        if self.options & self.SHAPE_UPPER_CASE or self.options & self.SHAPE_MIXED_CASE:
            is_upper = tf.logical_and(has_case, tf.equal(inputs, inputs_upper))
        if self.options & self.SHAPE_UPPER_CASE:
            outputs_one.append(is_upper)

        if self.options & self.SHAPE_TITLE_CASE or self.options & self.SHAPE_MIXED_CASE:
            inputs_title = title_case(inputs)
            is_title = tf.logical_and(has_case, tf.equal(inputs, inputs_title))
        if self.options & self.SHAPE_TITLE_CASE:
            outputs_one.append(is_title)

        if self.options & self.SHAPE_MIXED_CASE:
            no_case = tf.logical_not(has_case)
            is_mixed = tf.logical_not(
                tf.logical_or(tf.logical_or(no_case, is_lower),
                              tf.logical_or(is_upper, is_title)))
            outputs_one.append(is_mixed)

        # Length
        if self.options & self.SHAPE_LENGTH_NORM:
            length_norm = tf.strings.length(inputs, unit='UTF8_CHAR')
            length_norm = (tf.cast(length_norm, self.compute_dtype) -
                           self.mean_len) / self.std_len
            outputs_one.append(length_norm)

        # Same
        any_same = self.SHAPE_LEFT_SAME | self.SHAPE_RIGHT_SAME | self.SHAPE_LEFT2_SAME | self.SHAPE_RIGHT2_SAME
        if self.options & any_same:
            empty_pad = tf.zeros_like(inputs[..., :1])
            inputs_padded = tf.concat(
                [empty_pad, empty_pad, inputs, empty_pad, empty_pad], axis=-1)

        if self.options & (self.SHAPE_LEFT_SAME | self.SHAPE_RIGHT_SAME):
            same_one = tf.equal(inputs_padded[..., 1:],
                                inputs_padded[..., :-1])

        if self.options & self.SHAPE_LEFT_SAME:
            same_left = same_one[..., 1:-2]
            outputs_one.append(same_left)

        if self.options & self.SHAPE_RIGHT_SAME:
            same_right = same_one[..., 2:-1]
            outputs_one.append(same_right)

        if self.options & (self.SHAPE_LEFT2_SAME | self.SHAPE_RIGHT2_SAME):
            same_two = tf.equal(inputs_padded[..., 2:],
                                inputs_padded[..., :-2])

        if self.options & self.SHAPE_LEFT2_SAME:
            same_left2 = same_two[..., :-2]
            outputs_one.append(same_left2)

        if self.options & self.SHAPE_RIGHT2_SAME:
            same_right2 = same_two[..., 2:]
            outputs_one.append(same_right2)

        # Char category
        if self.options & WordShape.SHAPE_CHAR_CAT_FIRST:
            first_cats = char_category(inputs)
            first_ids = self.cat_lookup(first_cats)
            first_feats = tf.one_hot(first_ids, depth=30)
            outputs_many.append(first_feats)

        if self.options & WordShape.SHAPE_CHAR_CAT_LAST:
            last_cats = char_category(inputs, first=False)
            last_ids = self.cat_lookup(last_cats)
            last_feats = tf.one_hot(last_ids, depth=30)
            outputs_many.append(last_feats)

        outputs_one = [tf.cast(o, self.compute_dtype) for o in outputs_one]
        outputs_many = [tf.cast(o, self.compute_dtype) for o in outputs_many]

        if not outputs_one:
            return tf.concat(outputs_many, axis=-1)

        outputs_one = tf.stack(outputs_one, axis=-1)
        if not outputs_many:
            return outputs_one

        return tf.concat([outputs_one, *outputs_many], axis=-1)

    @tf_utils.shape_type_conversion
    def compute_output_shape(self, input_shape):
        units = 0
        options = [
            self.SHAPE_HAS_CASE, self.SHAPE_LOWER_CASE, self.SHAPE_UPPER_CASE,
            self.SHAPE_TITLE_CASE, self.SHAPE_MIXED_CASE,
            self.SHAPE_LENGTH_NORM, self.SHAPE_LEFT_SAME,
            self.SHAPE_RIGHT_SAME, self.SHAPE_LEFT2_SAME,
            self.SHAPE_RIGHT2_SAME
        ]
        for opt in options:
            if self.options & opt:
                units += 1

        if self.options & WordShape.SHAPE_CHAR_CAT_FIRST:
            units += 30
        if self.options & WordShape.SHAPE_CHAR_CAT_LAST:
            units += 30

        return input_shape + (units, )

    def get_config(self):
        config = super().get_config()
        config.update({
            'options': self.options,
            'mean_len': self.mean_len,
            'std_len': self.std_len
        })

        return config
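
A minimal usage sketch, assuming the project's custom string ops are available and using an arbitrary option combination:

import tensorflow as tf

# Ragged batch: two sentences with different token counts
tokens = tf.ragged.constant([['Hello', 'World', '!'],
                             ['HELLO']])

layer = WordShape(options=WordShape.SHAPE_ALL_CASES | WordShape.SHAPE_LENGTH_NORM)
features = layer(tokens)
# Last dimension is 6: five case flags plus the normalized length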