def audio_feature(folder, **kwargs):
    feature = {
        "name": "audio_" + random_string(),
        "type": "audio",
        "preprocessing": {
            "audio_feature": {
                "type": "fbank",
                "window_length_in_s": 0.04,
                "window_shift_in_s": 0.02,
                "num_filter_bands": 80,
            },
            "audio_file_length_limit_in_s": 3.0,
        },
        "encoder": "stacked_cnn",
        "should_embed": False,
        "conv_layers": [
            {"filter_size": 400, "pool_size": 16, "num_filters": 32, "regularize": "false"},
            {"filter_size": 40, "pool_size": 10, "num_filters": 64, "regularize": "false"},
        ],
        "fc_size": 256,
        "destination_folder": folder,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
def h3_feature(**kwargs):
    feature = {"name": "h3_" + random_string(), "type": "h3"}
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
def numerical_feature(normalization=None, **kwargs):
    feature = {
        "name": "num_" + random_string(),
        "type": "numerical",
        "preprocessing": {"normalization": normalization},
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
def vector_feature(**kwargs):
    feature = {
        "type": VECTOR,
        "vector_size": 5,
        "name": "vector_" + random_string(),
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
def timeseries_feature(**kwargs):
    feature = {
        "name": "timeseries_" + random_string(),
        "type": "timeseries",
        "max_len": 7,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
def date_feature(**kwargs):
    feature = {
        "name": "date_" + random_string(),
        "type": "date",
        "preprocessing": {"datetime_format": random.choice(list(DATETIME_FORMATS.keys()))},
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
def category_feature(**kwargs):
    feature = {
        "type": "category",
        "name": "category_" + random_string(),
        "vocab_size": 10,
        "embedding_size": 5,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
def bag_feature(**kwargs):
    feature = {
        "name": "bag_" + random_string(),
        "type": "bag",
        "max_len": 5,
        "vocab_size": 10,
        "embedding_size": 5,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
def text_feature(**kwargs):
    feature = {
        "name": "text_" + random_string(),
        "type": "text",
        "reduce_input": None,
        "vocab_size": 5,
        "min_len": 7,
        "max_len": 7,
        "embedding_size": 8,
        "state_size": 8,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
def __init__(self, feature, *args, **kwargs):
    super().__init__()

    if NAME not in feature:
        raise ValueError("Missing feature name")
    self.feature_name = feature[NAME]

    if COLUMN not in feature:
        feature[COLUMN] = self.feature_name
    self.column = feature[COLUMN]

    if PROC_COLUMN not in feature:
        feature[PROC_COLUMN] = compute_feature_hash(feature)
    self.proc_column = feature[PROC_COLUMN]
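# A hedged sketch of the contract the constructor above enforces; the
# concrete feature class is passed in as a parameter because its name is
# not shown here, and this helper is illustrative only.
def _example_base_feature_contract(feature_cls):
    config = {NAME: "num_x", "type": "numerical"}
    feat = feature_cls(config)
    # COLUMN defaults to NAME, and PROC_COLUMN is filled in with a
    # deterministic hash of the feature config.
    assert feat.column == "num_x"
    assert feat.proc_column == config[PROC_COLUMN]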
def image_feature(folder, **kwargs):
    feature = {
        "type": "image",
        "name": "image_" + random_string(),
        "encoder": "resnet",
        "preprocessing": {"in_memory": True, "height": 12, "width": 12, "num_channels": 3},
        "resnet_size": 8,
        "destination_folder": folder,
        "fc_size": 8,
        "num_filters": 8,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
def sequence_feature(**kwargs):
    feature = {
        "type": "sequence",
        "name": "sequence_" + random_string(),
        "vocab_size": 10,
        "max_len": 7,
        "encoder": "embed",
        "embedding_size": 8,
        "fc_size": 8,
        "state_size": 8,
        "num_filters": 8,
        "hidden_size": 8,
    }
    feature.update(kwargs)
    feature[COLUMN] = feature[NAME]
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
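# A minimal usage sketch (not part of the original helpers; the wrapper
# function name is hypothetical): every generator above returns a
# ready-to-use feature config, any kwarg overrides a default, and COLUMN
# is then derived from NAME before the preprocessing hash is computed.
def _example_generator_usage():
    num = numerical_feature(normalization="zscore", name="price")
    assert num["type"] == "numerical"
    assert num[COLUMN] == num[NAME] == "price"
    assert PROC_COLUMN in num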
def _set_proc_column(config: dict) -> None:
    for feature in config["input_features"] + config["output_features"]:
        if PROC_COLUMN not in feature:
            feature[PROC_COLUMN] = compute_feature_hash(feature)
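# A hedged illustration of _set_proc_column (the helper name below is
# hypothetical): features that lack a PROC_COLUMN receive a deterministic
# hash, while features that already carry one are left untouched.
def _example_set_proc_column():
    config = {
        "input_features": [{NAME: "x", COLUMN: "x", "type": "numerical"}],
        "output_features": [],
    }
    _set_proc_column(config)
    assert PROC_COLUMN in config["input_features"][0]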
def numerical_feature():
    feature = {NAME: "x", COLUMN: "x", "type": "numerical"}
    feature[PROC_COLUMN] = compute_feature_hash(feature)
    return feature
def test_encoder(test_case):
    # set up required directories for images if needed
    shutil.rmtree(IMAGE_DIR, ignore_errors=True)
    os.mkdir(IMAGE_DIR)

    # reproducible synthetic data set
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

    # create synthetic data for the test
    features = [
        test_case.syn_data.feature_generator(
            *test_case.syn_data.feature_generator_args,
            **test_case.syn_data.feature_generator_kwargs)
    ]
    name = features[0][NAME]
    proc_column = compute_feature_hash(features[0])
    features[0][PROC_COLUMN] = proc_column
    data_generator = build_synthetic_dataset(BATCH_SIZE, features)
    data_list = list(data_generator)
    raw_data = [x[0] for x in data_list[1:]]
    df = pd.DataFrame({data_list[0][0]: raw_data})

    # minimal config sufficient to create the input feature
    config = {'input_features': features, 'output_features': []}
    training_set, _, _, training_set_metadata = preprocess_for_training(
        config,
        training_set=df,
        skip_save_processed_input=True,
        random_seed=RANDOM_SEED)

    # run through each type of regularizer for the encoder
    regularizer_losses = []
    for regularizer in [None, 'l1', 'l2', 'l1_l2']:
        # start with a clean slate and make the run reproducible
        tf.keras.backend.clear_session()
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)

        # set up kwargs for the regularizer parameters
        x_coder_kwargs = dict(
            zip(test_case.regularizer_parm_names,
                len(test_case.regularizer_parm_names) * [regularizer]))

        # combine with the other keyword parameters
        x_coder_kwargs.update(test_case.XCoder_other_parms)
        features[0].update(x_coder_kwargs)

        # shim code to support sequence and sequence-like features
        if features[0]['type'] in SEQUENCE_TYPES.union({'category', 'set'}):
            features[0]['vocab'] = training_set_metadata[name]['idx2str']
            training_set.dataset[proc_column] = \
                training_set.dataset[proc_column].astype(np.int32)

        input_def_obj = build_single_input(features[0], None)

        inputs = training_set.dataset[proc_column]
        # make sure the input is at least a rank-2 tensor
        if len(inputs.shape) == 1:
            inputs = inputs.reshape(-1, 1)

        # special handling for the image feature
        if features[0]['type'] == 'image':
            inputs = tf.cast(inputs, tf.float32) / 255

        input_def_obj.encoder_obj(inputs)
        regularizer_loss = tf.reduce_sum(input_def_obj.encoder_obj.losses)
        regularizer_losses.append(regularizer_loss)

    # check regularization loss values
    # None should be zero
    assert regularizer_losses[0] == 0

    # l1, l2 and l1_l2 should be greater than zero
    assert np.all([t > 0.0 for t in regularizer_losses[1:]])

    # with default settings, l1 + l2 == l1_l2 losses
    assert np.isclose(
        regularizer_losses[1].numpy() + regularizer_losses[2].numpy(),
        regularizer_losses[3].numpy())

    # cleanup
    shutil.rmtree(IMAGE_DIR, ignore_errors=True)
def test_decoder(test_case):
    # reproducible synthetic data set
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

    # create synthetic data for the test
    features = [
        test_case.syn_data.feature_generator(
            *test_case.syn_data.feature_generator_args,
            **test_case.syn_data.feature_generator_kwargs)
    ]
    feature_name = features[0][NAME]
    proc_column = compute_feature_hash(features[0])
    features[0][PROC_COLUMN] = proc_column
    data_generator = build_synthetic_dataset(BATCH_SIZE, features)
    data_list = list(data_generator)
    raw_data = [x[0] for x in data_list[1:]]
    df = pd.DataFrame({data_list[0][0]: raw_data})

    # create synthetic combiner layer outputs
    combiner_outputs_rank2 = {
        'combiner_output': tf.random.normal(
            [BATCH_SIZE, HIDDEN_SIZE], dtype=tf.float32)
    }
    combiner_outputs_rank3 = {
        'combiner_output': tf.random.normal(
            [BATCH_SIZE, SEQ_SIZE, HIDDEN_SIZE], dtype=tf.float32),
        'encoder_output_state': tf.random.normal(
            [BATCH_SIZE, HIDDEN_SIZE], dtype=tf.float32),
        'lengths': tf.convert_to_tensor(
            np.array(BATCH_SIZE * [SEQ_SIZE]), dtype=tf.int32)
    }

    # minimal config sufficient to create the output feature
    config = {'input_features': [], 'output_features': features}
    training_set, _, _, training_set_metadata = preprocess_for_training(
        config,
        training_set=df,
        skip_save_processed_input=True,
        random_seed=RANDOM_SEED)

    # run through each type of regularizer
    regularizer_losses = []
    for regularizer in [None, 'l1', 'l2', 'l1_l2']:
        # start with a clean slate and make the run reproducible
        tf.keras.backend.clear_session()
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)

        # set up kwargs for the regularizer parameters
        x_coder_kwargs = dict(
            zip(test_case.regularizer_parm_names,
                len(test_case.regularizer_parm_names) * [regularizer]))

        # combine with the other keyword parameters
        x_coder_kwargs.update(test_case.XCoder_other_parms)
        features[0].update(x_coder_kwargs)

        # sequence features need a vocabulary size and rank-3 combiner outputs
        if features[0]['type'] in SEQUENCE_TYPES:
            features[0]['num_classes'] = \
                training_set_metadata[feature_name]['vocab_size'] + 1
            training_set.dataset[proc_column] = \
                training_set.dataset[proc_column].astype(np.int32)
            combiner_outputs = combiner_outputs_rank3
        else:
            combiner_outputs = combiner_outputs_rank2

        output_def_obj = build_single_output(features[0], None, None)

        targets = training_set.dataset[proc_column]
        # make sure the targets are at least a rank-2 tensor
        if len(targets.shape) == 1:
            targets = targets.reshape(-1, 1)

        output_def_obj(((combiner_outputs, None), targets),
                       training=True, mask=None)
        regularizer_loss = tf.reduce_sum(output_def_obj.decoder_obj.losses)
        regularizer_losses.append(regularizer_loss)

    # check regularization loss values
    # None should be zero
    assert regularizer_losses[0] == 0

    # l1, l2 and l1_l2 should be greater than zero
    assert np.all([t > 0.0 for t in regularizer_losses[1:]])

    # with default settings, l1 + l2 == l1_l2 losses
    assert np.isclose(
        regularizer_losses[1].numpy() + regularizer_losses[2].numpy(),
        regularizer_losses[3].numpy())
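# A hedged sketch of the shape `test_case` is expected to have in the two
# tests above. The field names come from the attribute accesses in the
# tests; the namedtuple containers themselves are an assumption.
import collections

SyntheticData = collections.namedtuple(
    "SyntheticData",
    ["feature_generator", "feature_generator_args", "feature_generator_kwargs"])

TestCase = collections.namedtuple(
    "TestCase",
    ["syn_data", "XCoder_other_parms", "regularizer_parm_names"])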