def generate_sequence_training_data():
    input_features = [
        sequence_feature(
            vocab_size=TEST_VOCAB_SIZE,
            embedding_size=TEST_EMBEDDING_SIZE,
            state_size=TEST_STATE_SIZE,
            hidden_size=TEST_HIDDEN_SIZE,
            num_filters=TEST_NUM_FILTERS,
            min_len=5,
            max_len=10,
            encoder="rnn",
            cell_type="lstm",
            reduce_output=None,
        )
    ]

    output_features = [
        sequence_feature(
            min_len=5,
            max_len=10,
            decoder="generator",
            cell_type="lstm",
            attention="bahdanau",
            reduce_input=None,
        )
    ]

    # generate a synthetic dataset for testing
    dataset = build_synthetic_dataset(
        150, copy.deepcopy(input_features) + copy.deepcopy(output_features)
    )
    raw_data = "\n".join([r[0] + "," + r[1] for r in dataset])
    df = pd.read_csv(StringIO(raw_data))

    return df, input_features, output_features

def generate_sequence_training_data():
    input_features = [
        sequence_feature(
            min_len=5,
            max_len=10,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None,
        )
    ]

    output_features = [
        sequence_feature(
            min_len=5,
            max_len=10,
            decoder='generator',
            cell_type='lstm',
            attention='bahdanau',
            reduce_input=None,
        )
    ]

    # generate a synthetic dataset for testing
    dataset = build_synthetic_dataset(
        150, copy.deepcopy(input_features) + copy.deepcopy(output_features)
    )
    raw_data = '\n'.join([r[0] + ',' + r[1] for r in dataset])
    df = pd.read_csv(StringIO(raw_data))

    return df, input_features, output_features

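# Hedged usage sketch (added for illustration, not part of the original tests):
# shows how the DataFrame and feature specs returned by
# generate_sequence_training_data() could drive an end-to-end training run.
# The `from ludwig.api import LudwigModel` import path and the
# `training: {epochs: 2}` override are assumptions made only for this example.
def _example_sequence_training_usage():
    from ludwig.api import LudwigModel  # assumed import path

    df, input_features, output_features = generate_sequence_training_data()
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2},  # small run; illustrative value only
    }
    model = LudwigModel(config)
    # train directly on the synthetic DataFrame
    train_results = model.train(dataset=df)
    return train_results
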
def generate_data(
    input_features,
    output_features,
    filename="test_csv.csv",
    num_examples=25,
    nan_percent=0.0,
):
    """Helper method to generate synthetic data based on input/output feature specs.

    :param input_features: schema of the input features
    :param output_features: schema of the output features
    :param filename: path to the CSV file where the generated data is stored
    :param num_examples: number of examples to generate
    :param nan_percent: percentage of values to replace with NaN
    :return: path to the generated CSV file
    """
    features = input_features + output_features
    df = build_synthetic_dataset(num_examples, features)
    # the first item yielded by the generator is the header row
    data = [next(df) for _ in range(num_examples + 1)]

    dataframe = pd.DataFrame(data[1:], columns=data[0])
    if nan_percent > 0:
        add_nans_to_df_in_place(dataframe, nan_percent)
    dataframe.to_csv(filename, index=False)

    return filename

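# Hedged usage sketch (added for illustration): generate_data() is typically
# called with feature specs like the dicts used in test_build_synthetic_dataset
# below. The specific feature mix, file name and nan_percent value here are
# assumptions chosen only to show the call shape.
def _example_generate_data_usage():
    input_features = [
        {'name': 'num_in', 'type': 'number'},
        {'name': 'cat_in', 'type': 'category'},
    ]
    output_features = [{'name': 'binary_out', 'type': 'binary'}]

    csv_path = generate_data(
        input_features,
        output_features,
        filename='example_synthetic.csv',  # hypothetical file name
        num_examples=50,
        nan_percent=0.1,  # replace roughly 10% of values with NaN
    )
    return pd.read_csv(csv_path)
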
def test_build_synthetic_dataset():
    features = [
        {"name": "text", "type": "text"},
        {"name": "category", "type": "category"},
        {"name": "number", "type": "number"},
        {"name": "binary", "type": "binary"},
        {"name": "set", "type": "set"},
        {"name": "bag", "type": "bag"},
        {"name": "sequence", "type": "sequence"},
        {"name": "timeseries", "type": "timeseries"},
        {"name": "date", "type": "date"},
        {"name": "h3", "type": "h3"},
        {"name": "vector", "type": "vector"},
        {"name": "audio", "type": "audio"},
        {"name": "image", "type": "image"},
    ]

    # 100 examples plus one extra item for the header
    assert len(list(
        dataset_synthesizer.build_synthetic_dataset(100, features)
    )) == 101

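# Hedged illustration (added, not part of the original tests):
# build_synthetic_dataset yields the header row first and then one row per
# example, which is why the test above expects 101 items for 100 examples.
# The two-feature spec below is an assumption used only to show the generator
# semantics.
def _example_build_synthetic_dataset_rows():
    features = [
        {'name': 'category', 'type': 'category'},
        {'name': 'number', 'type': 'number'},
    ]
    generator = dataset_synthesizer.build_synthetic_dataset(3, features)
    header = next(generator)  # first yielded item is the column header
    rows = list(generator)    # remaining items are the synthetic records
    assert len(rows) == 3
    return header, rows
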
def test_decoder(test_case):
    # reproducible synthetic data set
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

    # create synthetic data for the test
    features = [
        test_case.syn_data.feature_generator(
            *test_case.syn_data.feature_generator_args,
            **test_case.syn_data.feature_generator_kwargs
        )
    ]
    feature_name = features[0]['name']
    data_generator = build_synthetic_dataset(BATCH_SIZE, features)
    data_list = list(data_generator)
    raw_data = [x[0] for x in data_list[1:]]
    df = pd.DataFrame({data_list[0][0]: raw_data})

    # create synthetic combiner layer outputs
    combiner_outputs_rank2 = {
        'combiner_output': tf.random.normal(
            [BATCH_SIZE, HIDDEN_SIZE], dtype=tf.float32
        )
    }

    combiner_outputs_rank3 = {
        'combiner_output': tf.random.normal(
            [BATCH_SIZE, SEQ_SIZE, HIDDEN_SIZE], dtype=tf.float32
        ),
        'encoder_output_state': tf.random.normal(
            [BATCH_SIZE, HIDDEN_SIZE], dtype=tf.float32
        ),
        'lengths': tf.convert_to_tensor(
            np.array(BATCH_SIZE * [SEQ_SIZE]), dtype=tf.int32
        )
    }

    # minimal config sufficient to create the output feature
    config = {'input_features': [], 'output_features': features}
    training_set, _, _, training_set_metadata = preprocess_for_training(
        config,
        training_set=df,
        skip_save_processed_input=True,
        random_seed=RANDOM_SEED
    )

    # run through each type of regularizer
    regularizer_losses = []
    for regularizer in [None, 'l1', 'l2', 'l1_l2']:
        # start with a clean slate and make the run reproducible
        tf.keras.backend.clear_session()
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)

        # set up kwargs for the regularizer parameters
        x_coder_kwargs = dict(
            zip(test_case.regularizer_parm_names,
                len(test_case.regularizer_parm_names) * [regularizer])
        )

        # combine with the other keyword parameters
        x_coder_kwargs.update(test_case.XCoder_other_parms)
        features[0].update(x_coder_kwargs)

        if features[0]['type'] in SEQUENCE_TYPES:
            features[0]['num_classes'] = \
                training_set_metadata[feature_name]['vocab_size'] + 1
            training_set.dataset[feature_name] = \
                training_set.dataset[feature_name].astype(np.int32)
            combiner_outputs = combiner_outputs_rank3
        else:
            combiner_outputs = combiner_outputs_rank2

        output_def_obj = build_single_output(features[0], None, None)

        targets = training_set.dataset[feature_name]
        if len(targets.shape) == 1:
            targets = targets.reshape(-1, 1)

        output_def_obj(
            ((combiner_outputs, None), targets),
            training=True,
            mask=None
        )
        regularizer_loss = tf.reduce_sum(output_def_obj.decoder_obj.losses)
        regularizer_losses.append(regularizer_loss)

    # check regularization loss values
    # None should be zero
    assert regularizer_losses[0] == 0

    # l1, l2 and l1_l2 should be greater than zero
    assert np.all([t > 0.0 for t in regularizer_losses[1:]])

    # with default settings, l1 + l2 == l1_l2 losses
    assert np.isclose(
        regularizer_losses[1].numpy() + regularizer_losses[2].numpy(),
        regularizer_losses[3].numpy())

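# Hedged sketch (added for illustration): test_decoder and test_encoder expect
# a parametrized `test_case` object exposing `syn_data.feature_generator`,
# `syn_data.feature_generator_args`, `syn_data.feature_generator_kwargs`,
# `regularizer_parm_names` and `XCoder_other_parms`. The namedtuple names, the
# sequence-feature example and the regularizer parameter names below are
# assumptions about how such a case could be constructed; the real test module
# may define these differently.
from collections import namedtuple  # assumed import for this sketch

SyntheticDataExample = namedtuple(
    'SyntheticDataExample',
    ['feature_generator', 'feature_generator_args', 'feature_generator_kwargs']
)
RegularizerCaseExample = namedtuple(
    'RegularizerCaseExample',
    ['syn_data', 'XCoder_other_parms', 'regularizer_parm_names']
)

_example_decoder_test_case = RegularizerCaseExample(
    syn_data=SyntheticDataExample(
        feature_generator=sequence_feature,
        feature_generator_args=(),
        feature_generator_kwargs={'min_len': 5, 'max_len': 10},
    ),
    XCoder_other_parms={'decoder': 'generator', 'cell_type': 'lstm'},
    # hypothetical parameter names; the real cases may regularize other knobs
    regularizer_parm_names=['weights_regularizer', 'bias_regularizer'],
)
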
def test_encoder(test_case):
    # set up required directories for images if needed
    shutil.rmtree(IMAGE_DIR, ignore_errors=True)
    os.mkdir(IMAGE_DIR)

    # reproducible synthetic data set
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

    # create synthetic data for the test
    features = [
        test_case.syn_data.feature_generator(
            *test_case.syn_data.feature_generator_args,
            **test_case.syn_data.feature_generator_kwargs
        )
    ]
    feature_name = features[0]['name']
    data_generator = build_synthetic_dataset(BATCH_SIZE, features)
    data_list = list(data_generator)
    raw_data = [x[0] for x in data_list[1:]]
    df = pd.DataFrame({data_list[0][0]: raw_data})

    # minimal config sufficient to create the input feature
    config = {'input_features': features, 'output_features': []}
    training_set, _, _, training_set_metadata = preprocess_for_training(
        config,
        training_set=df,
        skip_save_processed_input=True,
        random_seed=RANDOM_SEED
    )

    # run through each type of regularizer for the encoder
    regularizer_losses = []
    for regularizer in [None, 'l1', 'l2', 'l1_l2']:
        # start with a clean slate and make the run reproducible
        tf.keras.backend.clear_session()
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)

        # set up kwargs for the regularizer parameters
        x_coder_kwargs = dict(
            zip(test_case.regularizer_parm_names,
                len(test_case.regularizer_parm_names) * [regularizer])
        )

        # combine with the other keyword parameters
        x_coder_kwargs.update(test_case.XCoder_other_parms)
        features[0].update(x_coder_kwargs)

        # shim code to support sequence and sequence-like features
        if features[0]['type'] in SEQUENCE_TYPES.union({'category', 'set'}):
            features[0]['vocab'] = \
                training_set_metadata[feature_name]['idx2str']
            training_set.dataset[feature_name] = \
                training_set.dataset[feature_name].astype(np.int32)

        input_def_obj = build_single_input(features[0], None)

        inputs = training_set.dataset[feature_name]
        # make sure the input is at least a rank 2 tensor
        if len(inputs.shape) == 1:
            inputs = inputs.reshape(-1, 1)

        # special handling for the image feature
        if features[0]['type'] == 'image':
            inputs = tf.cast(inputs, tf.float32) / 255

        input_def_obj.encoder_obj(inputs)
        regularizer_loss = tf.reduce_sum(input_def_obj.encoder_obj.losses)
        regularizer_losses.append(regularizer_loss)

    # check regularization loss values
    # None should be zero
    assert regularizer_losses[0] == 0

    # l1, l2 and l1_l2 should be greater than zero
    assert np.all([t > 0.0 for t in regularizer_losses[1:]])

    # with default settings, l1 + l2 == l1_l2 losses
    assert np.isclose(
        regularizer_losses[1].numpy() + regularizer_losses[2].numpy(),
        regularizer_losses[3].numpy())

    # clean up
    shutil.rmtree(IMAGE_DIR, ignore_errors=True)