def test_experiment_h3(encoder, csv_filename):
    """Run an experiment with a single H3 input and a binary output.

    The encoder is assigned to the input feature only after the dataset has
    been generated.

    :param encoder: encoder name stored under the input feature's
        ``encoder`` key.
    :param csv_filename: filename handed to ``generate_data``.
    """
    inputs = [h3_feature()]
    outputs = [binary_feature()]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    h3_input = inputs[0]
    h3_input["encoder"] = encoder
    run_experiment(inputs, outputs, dataset=data_path)
def test_image_resizing_num_channel_handling(csv_filename):
    """Train on a mix of 3-channel and 1-channel images.

    Two image datasets are generated (``num_channels`` 3, then 1),
    concatenated and written back to the CSV. The experiment is then run
    twice: once with ``num_channels`` present in the preprocessing config
    (must succeed) and once with it removed (must raise ``ValueError``).

    The generated image folder is removed in a ``finally`` clause so a
    failing run does not leave files behind in the working directory.

    :param csv_filename: filename handed to ``generate_data``.
    :return: None
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    try:
        # Resnet encoder
        input_features = [
            image_feature(
                folder=image_dest_folder,
                encoder='resnet',
                preprocessing={
                    'in_memory': True,
                    'height': 8,
                    'width': 8,
                    'num_channels': 3,
                    'num_processes': 5
                },
                fc_size=8,
                num_filters=8
            ),
            text_feature(encoder='embed', min_len=1),
            numerical_feature(normalization='minmax')
        ]
        output_features = [binary_feature(), numerical_feature()]

        # First dataset: 3-channel images.
        rel_path = generate_data(
            input_features, output_features, csv_filename, num_examples=50
        )
        df1 = read_csv(rel_path)

        # Second dataset: 1-channel images, written to the same CSV path.
        input_features[0]['preprocessing']['num_channels'] = 1
        rel_path = generate_data(
            input_features, output_features, csv_filename, num_examples=50
        )
        df2 = read_csv(rel_path)

        df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
        df.to_csv(rel_path, index=False)

        # Here the user specifies number of channels.
        # Exception shouldn't be thrown
        run_experiment(input_features, output_features, dataset=rel_path)

        del input_features[0]['preprocessing']['num_channels']

        # User now doesn't specify num channels. Should throw exception
        with pytest.raises(ValueError):
            run_experiment(input_features, output_features, dataset=rel_path)
    finally:
        # Delete the temporary data created, even when the test fails.
        # ignore_errors avoids masking the real failure if the folder was
        # never created.
        shutil.rmtree(image_dest_folder, ignore_errors=True)
def test_experiment_date(encoder, csv_filename):
    """Run an experiment with a date input and a 2-class category output.

    :param encoder: encoder name stored under the date feature's ``encoder``
        key after the data has been generated.
    :param csv_filename: filename handed to ``generate_data``.
    """
    inputs = [date_feature()]
    outputs = [category_feature(vocab_size=2)]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    date_input = inputs[0]
    date_input["encoder"] = encoder
    run_experiment(inputs, outputs, dataset=data_path)
def test_experiment_multiclass_with_class_weights(csv_filename):
    """Run an experiment whose category output carries loss class weights.

    :param csv_filename: filename handed to ``generate_data``.
    """
    # Single category input, single category output
    class_weighted_loss = {"class_weights": [0, 1, 2, 3]}
    inputs = [category_feature(vocab_size=10)]
    outputs = [category_feature(vocab_size=3, loss=class_weighted_loss)]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    run_experiment(inputs, outputs, dataset=data_path)
def test_image_resizing_num_channel_handling(csv_filename):
    """Train on a mix of 3-channel and 1-channel images.

    Two image datasets are generated (``num_channels`` 3, then 1),
    concatenated and written back to the CSV. The experiment is then run
    twice: once with ``num_channels`` present in the preprocessing config and
    once with it removed. Neither run is expected to raise.

    The generated image folder is removed in a ``finally`` clause so a
    failing run does not leave files behind in the working directory.

    :param csv_filename: filename handed to ``generate_data``.
    :return: None
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    try:
        # Resnet encoder
        input_features = [
            image_feature(
                folder=image_dest_folder,
                encoder="resnet",
                preprocessing={
                    "in_memory": True,
                    "height": 8,
                    "width": 8,
                    "num_channels": 3,
                    "num_processes": 5,
                },
                fc_size=8,
                num_filters=8,
            ),
            text_feature(encoder="embed", min_len=1),
            numerical_feature(normalization="minmax"),
        ]
        output_features = [binary_feature(), numerical_feature()]

        # First dataset: 3-channel images.
        rel_path = generate_data(input_features, output_features, csv_filename, num_examples=50)
        df1 = read_csv(rel_path)

        # Second dataset: 1-channel images, written to the same CSV path.
        input_features[0]["preprocessing"]["num_channels"] = 1
        rel_path = generate_data(input_features, output_features, csv_filename, num_examples=50)
        df2 = read_csv(rel_path)

        df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
        df.to_csv(rel_path, index=False)

        # Here the user specifies number of channels. Exception shouldn't be thrown
        run_experiment(input_features, output_features, dataset=rel_path)

        del input_features[0]["preprocessing"]["num_channels"]

        # User doesn't specify num channels, but num channels is inferred.
        # Exception shouldn't be thrown
        run_experiment(input_features, output_features, dataset=rel_path)
    finally:
        # Delete the temporary data created, even when the test fails.
        # ignore_errors avoids masking the real failure if the folder was
        # never created.
        shutil.rmtree(image_dest_folder, ignore_errors=True)
def test_sequence_tagger_text(csv_filename):
    """Run an experiment pairing an rnn-encoded text input with a tagger output.

    Both features use ``max_len=10`` and disable reduction
    (``reduce_output=None`` / ``reduce_input=None``).

    :param csv_filename: filename handed to ``generate_data``.
    """
    # Define input and output features
    text_input = text_feature(max_len=10, encoder="rnn", reduce_output=None)
    tagger_output = sequence_feature(max_len=10, decoder="tagger", reduce_input=None)
    inputs = [text_input]
    outputs = [tagger_output]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    # run the experiment
    run_experiment(inputs, outputs, dataset=data_path)
def test_experiment_timeseries(csv_filename):
    """Run a timeseries-input, binary-output experiment with the transformer encoder.

    :param csv_filename: filename handed to ``generate_data``.
    """
    inputs = [timeseries_feature()]
    outputs = [binary_feature()]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    # Encoders exercised by this test; the data is generated once and reused.
    for encoder_name in ('transformer',):
        inputs[0]['encoder'] = encoder_name
        run_experiment(inputs, outputs, dataset=data_path)
def test_experiment_multi_input_intent_classification(csv_filename, encoder):
    """Run a text + category input experiment with a single category output.

    :param csv_filename: filename handed to ``generate_data``.
    :param encoder: encoder name stored under the text feature's ``encoder``
        key after the data has been generated.
    """
    # Multiple inputs, Single category output
    sparse_text = text_feature(vocab_size=10, min_len=1, representation="sparse")
    category_input = category_feature(vocab_size=10)
    inputs = [sparse_text, category_input]
    outputs = [category_feature(vocab_size=2, reduce_input="sum")]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    inputs[0]["encoder"] = encoder
    run_experiment(inputs, outputs, dataset=data_path)
def test_reduction(reduce_output, csv_filename):
    """Run a sequence-input experiment with the given ``reduce_output`` mode.

    :param reduce_output: value forwarded to ``sequence_feature`` as
        ``reduce_output``.
    :param csv_filename: filename handed to ``generate_data``.
    """
    inputs = [sequence_feature(reduce_output=reduce_output)]
    outputs = [category_feature()]

    data_path = generate_data(inputs, outputs, csv_filename)
    run_experiment(inputs, outputs, dataset=data_path)

    # Drop the local references to the feature lists once the run is done.
    del inputs, outputs
def test_experiment_audio_inputs(tmpdir):
    """Run an experiment with one audio input and a binary output.

    Both the generated audio folder and the dataset CSV are placed under
    ``tmpdir``.

    :param tmpdir: base directory for all generated files.
    """
    # Audio Inputs
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")
    dataset_csv = os.path.join(tmpdir, "dataset.csv")

    inputs = [audio_feature(folder=audio_dest_folder)]
    outputs = [binary_feature()]

    data_path = generate_data(inputs, outputs, dataset_csv)

    run_experiment(inputs, outputs, dataset=data_path)
def test_experiment_text_feature_non_HF(encoder, csv_filename):
    """Run a text-input experiment using the ``space`` word tokenizer.

    :param encoder: encoder name forwarded to ``text_feature``.
    :param csv_filename: filename handed to ``generate_data``.
    """
    text_input = text_feature(
        vocab_size=30,
        min_len=1,
        encoder=encoder,
        preprocessing={'word_tokenizer': 'space'},
    )
    inputs = [text_input]
    outputs = [category_feature(vocab_size=2)]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    run_experiment(inputs, outputs, dataset=data_path)
def test_experiment_text_feature_HF(encoder, csv_filename):
    """Run a text-input experiment using the ``hf_tokenizer`` word tokenizer.

    The text input keeps its full sequence output (``reduce_output=None``)
    and the category output reduces its input with ``sum``.

    :param encoder: encoder name forwarded to ``text_feature``.
    :param csv_filename: filename handed to ``generate_data``.
    """
    input_features = [
        text_feature(
            vocab_size=30,
            min_len=1,
            reduce_output=None,
            encoder=encoder,
            preprocessing={'word_tokenizer': 'hf_tokenizer'}
        )
    ]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # NOTE(review): the original passed ``data_csv=``; every sibling test
    # passes the generated file as ``dataset=``, so this is aligned with
    # them. Confirm against ``run_experiment``'s accepted keywords.
    run_experiment(input_features, output_features, dataset=rel_path)
def test_experiment_multiple_seq_seq(csv_filename, output_features):
    """Run an experiment with five heterogeneous inputs and supplied outputs.

    :param csv_filename: filename handed to ``generate_data``.
    :param output_features: output feature list used as-is for data
        generation and for the experiment.
    """
    text_input = text_feature(vocab_size=100, min_len=1, encoder='stacked_cnn')
    numeric_input = numerical_feature(normalization='zscore')
    category_input = category_feature(vocab_size=10, embedding_size=5)
    set_input = set_feature()
    sequence_input = sequence_feature(vocab_size=10, max_len=10, encoder='embed')

    inputs = [text_input, numeric_input, category_input, set_input, sequence_input]

    data_path = generate_data(inputs, output_features, csv_filename)
    run_experiment(inputs, output_features, dataset=data_path)
def test_experiment_timeseries(csv_filename):
    """Run a timeseries-input, binary-output experiment for several encoders.

    The dataset is generated once and reused for every encoder.

    :param csv_filename: filename handed to ``generate_data``.
    """
    inputs = [timeseries_feature()]
    outputs = [binary_feature()]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    encoder_names = (
        'rnn',
        'cnnrnn',
        'stacked_cnn',
        'parallel_cnn',
        'stacked_parallel_cnn',
    )
    for encoder_name in encoder_names:
        inputs[0]['encoder'] = encoder_name
        run_experiment(inputs, outputs, dataset=data_path)
def test_experiment_audio_inputs(csv_filename):
    """Run an experiment with one audio input and a binary output.

    Audio files are generated into ``generated_audio`` under the current
    working directory. The folder is removed in a ``finally`` clause so a
    failing run does not leave files behind.

    :param csv_filename: filename handed to ``generate_data``.
    """
    # Audio Inputs
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    try:
        input_features = [audio_feature(folder=audio_dest_folder)]
        output_features = [binary_feature()]

        rel_path = generate_data(input_features, output_features, csv_filename)

        run_experiment(input_features, output_features, dataset=rel_path)
    finally:
        # Delete the temporary data created, even when the test fails.
        # ignore_errors avoids masking the real failure if the folder was
        # never created.
        shutil.rmtree(audio_dest_folder, ignore_errors=True)
def test_experiment_sampled_softmax(csv_filename):
    """Run a text-to-category experiment using the sampled softmax loss.

    The output has ``vocab_size=500`` and 10000 examples are generated.

    :param csv_filename: filename handed to ``generate_data``.
    """
    sampled_softmax_loss = {'type': 'sampled_softmax_cross_entropy'}

    inputs = [text_feature(vocab_size=10, min_len=1)]
    outputs = [category_feature(vocab_size=500, loss=sampled_softmax_loss)]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename, num_examples=10000)

    run_experiment(inputs, outputs, dataset=data_path)
def test_experiment_seq_seq(csv_filename):
    """Run a text-to-text tagger experiment for the encoders that work.

    The features are created with the ``rnn`` encoder for data generation;
    the experiment itself is run once per encoder in the loop below.

    :param csv_filename: filename handed to ``generate_data``.
    """
    # Single Sequence input, single sequence output
    inputs = [text_feature(reduce_output=None, encoder='rnn')]
    outputs = [text_feature(reduce_input=None, decoder='tagger')]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    # Only the following encoders are working
    for encoder_name in ('cnnrnn', 'stacked_cnn'):
        logger.info('seq to seq test, Encoder: {0}'.format(encoder_name))
        inputs[0]['encoder'] = encoder_name
        run_experiment(inputs, outputs, dataset=data_path)
def test_experiment_seq_seq(csv_filename):
    """Run a text-to-text tagger experiment for the encoders that work.

    The features are created with the ``rnn`` encoder for data generation;
    the experiment itself is run once per encoder in the loop below.

    :param csv_filename: filename handed to ``generate_data``.
    """
    # Single Sequence input, single sequence output
    inputs = [text_feature(reduce_output=None, encoder="rnn")]
    outputs = [text_feature(reduce_input=None, decoder="tagger")]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    # Only the following encoders are working
    for encoder_name in ("cnnrnn", "stacked_cnn"):
        logger.info(f"seq to seq test, Encoder: {encoder_name}")
        inputs[0]["encoder"] = encoder_name
        run_experiment(inputs, outputs, dataset=data_path)
def test_basic_image_feature(num_channels, image_source, in_memory,
                             skip_save_processed_input, csv_filename):
    """Run a stacked_cnn image experiment from file paths or in-memory arrays.

    When ``image_source`` is ``'file'`` the generated CSV is used directly;
    otherwise the CSV is loaded into a DataFrame and each image path is
    replaced by the result of ``imread`` before running.

    The generated image folder is removed in a ``finally`` clause so a
    failing run does not leave files behind in the working directory.

    :param num_channels: value for the ``num_channels`` preprocessing key.
    :param image_source: ``'file'`` to train from the CSV; any other value
        trains from the DataFrame with loaded images.
    :param in_memory: value for the ``in_memory`` preprocessing key.
    :param skip_save_processed_input: forwarded to ``run_experiment``.
    :param csv_filename: filename handed to ``generate_data``.
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    try:
        input_features = [
            image_feature(
                folder=image_dest_folder,
                encoder='stacked_cnn',
                preprocessing={
                    'in_memory': in_memory,
                    'height': 12,
                    'width': 12,
                    'num_channels': num_channels,
                    'num_processes': 5
                },
                fc_size=16,
                num_filters=8
            )
        ]
        output_features = [
            category_feature(vocab_size=2, reduce_input='sum')
        ]

        rel_path = generate_data(input_features, output_features, csv_filename)

        if image_source == 'file':
            # use images from file
            run_experiment(
                input_features,
                output_features,
                dataset=rel_path,
                skip_save_processed_input=skip_save_processed_input
            )
        else:
            # import image from file and store in dataframe as ndarrays
            df = pd.read_csv(rel_path)
            image_feature_name = input_features[0]['name']
            df[image_feature_name] = df[image_feature_name].apply(
                lambda x: imread(x))

            run_experiment(
                input_features,
                output_features,
                dataset=df,
                skip_save_processed_input=skip_save_processed_input
            )
    finally:
        # Delete the temporary data created, even when the test fails.
        shutil.rmtree(image_dest_folder, ignore_errors=True)
def run_experiment_with_encoder(encoder, csv_filename):
    """Generate data and run a text-to-category experiment for ``encoder``.

    :param encoder: encoder name forwarded to ``text_feature``.
    :param csv_filename: filename handed to ``generate_data``.
    """
    # Run in a subprocess to clear TF and prevent OOM
    # This also allows us to use GPU resources
    # NOTE(review): the subprocess launch is not part of this function;
    # presumably a caller uses it as the process target - confirm.
    text_input = text_feature(vocab_size=30, min_len=1, encoder=encoder)
    inputs = [text_input]
    outputs = [category_feature(vocab_size=2)]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    run_experiment(inputs, outputs, dataset=data_path)
def test_experiment_multi_input_intent_classification(csv_filename):
    """Run a text + category input experiment once per entry in ``ENCODERS``.

    The dataset is generated once; only the text feature's encoder changes
    between runs.

    :param csv_filename: filename handed to ``generate_data``.
    """
    # Multiple inputs, Single category output
    sparse_text = text_feature(vocab_size=10, min_len=1, representation='sparse')
    category_input = category_feature(vocab_size=10)
    inputs = [sparse_text, category_input]
    outputs = [category_feature(vocab_size=2, reduce_input='sum')]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    for encoder_name in ENCODERS:
        inputs[0]['encoder'] = encoder_name
        run_experiment(inputs, outputs, dataset=data_path)
def test_tied_macro_level(tied_use_case, csv_filename):
    """Run an experiment in which two input features are tied together.

    Four inputs are built: a numerical feature, two features produced by
    ``tied_use_case.input_feature()`` and a category feature. The second
    tied-use-case feature gets a ``tied`` key naming the first one.

    :param tied_use_case: object providing ``input_feature()`` and
        ``output_feature()`` factories.
    :param csv_filename: filename handed to ``generate_data``.
    """
    input_features = [
        numerical_feature(),  # Other feature
        tied_use_case.input_feature(),  # first feature to be tied
        tied_use_case.input_feature(),  # second feature to be tied
        category_feature()  # other feature
    ]
    # tie second feature to first feature
    input_features[2]['tied'] = input_features[1]['name']

    # setup output feature
    output_features = [tied_use_case.output_feature()]

    # Generate test data and run full_experiment
    rel_path = generate_data(input_features, output_features, csv_filename)

    # NOTE(review): the original passed ``data_csv=``; every sibling test
    # passes the generated file as ``dataset=``, so this is aligned with
    # them. Confirm against ``run_experiment``'s accepted keywords.
    run_experiment(input_features, output_features, dataset=rel_path)
def test_tied_macro_level(tied_use_case: TiedUseCase, csv_filename: str):
    """Run an experiment in which two input features are tied together.

    Four inputs are built: a number feature, two features produced by
    ``tied_use_case.input_feature()`` and a category feature. The second
    tied-use-case feature gets a ``tied`` key naming the first one.

    :param tied_use_case: provides the ``input_feature()`` and
        ``output_feature()`` factories.
    :param csv_filename: filename handed to ``generate_data``.
    """
    leading_feature = number_feature()  # Other feature
    tie_target = tied_use_case.input_feature()  # first feature to be tied
    tie_source = tied_use_case.input_feature()  # second feature to be tied
    trailing_feature = category_feature()  # other feature

    # tie second feature to first feature
    tie_source["tied"] = tie_target["name"]

    inputs = [leading_feature, tie_target, tie_source, trailing_feature]

    # setup output feature
    outputs = [tied_use_case.output_feature()]

    # Generate test data and run full_experiment
    data_path = generate_data(inputs, outputs, csv_filename)
    run_experiment(inputs, outputs, dataset=data_path)
def test_experiment_tied_weights(csv_filename):
    """Run a two-text-input experiment with tied weights per ``ENCODERS`` entry.

    ``text_feature2`` declares ``tied_weights="text_feature1"``. For each
    encoder both text features are switched to it before running.

    :param csv_filename: filename handed to ``generate_data``.
    """
    # Two text inputs (the second tied to the first), single category output
    first_text = text_feature(
        name="text_feature1",
        min_len=1,
        encoder="cnnrnn",
        reduce_output="sum",
    )
    second_text = text_feature(
        name="text_feature2",
        min_len=1,
        encoder="cnnrnn",
        reduce_output="sum",
        tied_weights="text_feature1",
    )
    inputs = [first_text, second_text]
    outputs = [category_feature(vocab_size=2, reduce_input="sum")]

    # Generate test data
    data_path = generate_data(inputs, outputs, csv_filename)

    for encoder_name in ENCODERS:
        # Both tied features use the same encoder in each run.
        for text_input in inputs:
            text_input["encoder"] = encoder_name
        run_experiment(inputs, outputs, dataset=data_path)
def test_sequence_generator(
        enc_encoder,
        enc_cell_type,
        dec_cell_type,
        dec_attention,
        dec_beam_width,
        csv_filename
):
    """Run a sequence-to-sequence generator experiment inside ``graph_mode()``.

    Data is generated from default rnn/lstm feature definitions; the encoder
    and decoder settings under test are written into the features afterwards.

    :param enc_encoder: encoder name for the input sequence feature.
    :param enc_cell_type: cell type for the input sequence feature.
    :param dec_cell_type: cell type for the output sequence feature.
    :param dec_attention: attention setting for the output sequence feature.
    :param dec_beam_width: beam width for the output sequence feature.
    :param csv_filename: filename handed to ``generate_data``.
    """
    # Module-level option on tensorflow-addons; set before entering graph mode.
    tfa.options.TF_ADDONS_PY_OPS = True

    with graph_mode():
        # Define input and output features
        source_sequence = sequence_feature(
            min_len=5,
            max_len=10,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None
        )
        target_sequence = sequence_feature(
            min_len=5,
            max_len=10,
            decoder='generator',
            cell_type='lstm',
            attention='bahdanau',
            reduce_input=None
        )
        inputs = [source_sequence]
        outputs = [target_sequence]

        # Generate test data
        data_path = generate_data(inputs, outputs, csv_filename)

        # setup encoder specification
        encoder_spec = (
            ('encoder', enc_encoder),
            ('cell_type', enc_cell_type),
        )
        for key, value in encoder_spec:
            source_sequence[key] = value

        # setup decoder specification
        decoder_spec = (
            ('cell_type', dec_cell_type),
            ('attention', dec_attention),
            ('beam_width', dec_beam_width),
        )
        for key, value in decoder_spec:
            target_sequence[key] = value

        # run the experiment
        run_experiment(inputs, outputs, dataset=data_path)
def test_experiment_seq_seq_model_def_file(csv_filename, yaml_filename):
    """Run a seq-to-seq experiment configured from a YAML file.

    The config is dumped to ``yaml_filename`` and the experiment is started
    with no in-memory feature lists, only the config path and the dataset.

    :param csv_filename: filename handed to ``generate_data``.
    :param yaml_filename: path the config is written to and then passed to
        ``run_experiment`` as ``config``.
    """
    # seq-to-seq test to use config file instead of dictionary
    inputs = [text_feature(reduce_output=None, encoder="embed")]
    outputs = [text_feature(reduce_input=None, vocab_size=3, decoder="tagger")]

    # Save the config to a yaml file
    combiner_section = {"type": "concat", "fc_size": 14}
    training_section = {"epochs": 2}
    config = {
        "input_features": inputs,
        "output_features": outputs,
        "combiner": combiner_section,
        "training": training_section,
    }
    with open(yaml_filename, "w") as config_file:
        yaml.safe_dump(config, config_file)

    data_path = generate_data(inputs, outputs, csv_filename)
    run_experiment(None, None, dataset=data_path, config=yaml_filename)
def test_visual_question_answering(csv_filename):
    """Run an image + text input experiment with a generated sequence output.

    Images are generated into ``generated_images`` under the current working
    directory. The folder is removed in a ``finally`` clause so a failing run
    does not leave files behind.

    :param csv_filename: filename handed to ``generate_data``.
    """
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    try:
        input_features = [
            image_feature(
                folder=image_dest_folder,
                encoder="resnet",
                preprocessing={
                    "in_memory": True,
                    "height": 8,
                    "width": 8,
                    "num_channels": 3,
                    "num_processes": 5,
                },
                fc_size=8,
                num_filters=8,
            ),
            text_feature(encoder="embed", min_len=1, level="word"),
        ]
        output_features = [sequence_feature(decoder="generator", cell_type="lstm")]

        rel_path = generate_data(input_features, output_features, csv_filename)
        run_experiment(input_features, output_features, dataset=rel_path)
    finally:
        # Delete the temporary data created, even when the test fails.
        # ignore_errors avoids masking the real failure if the folder was
        # never created.
        shutil.rmtree(image_dest_folder, ignore_errors=True)
def test_experiment_seq_seq_train_test_valid(tmpdir):
    """Run a seq-to-seq experiment from separate train/test/validation files.

    The experiment runs twice on the same three files: first with the ``rnn``
    encoder, then with ``parallel_cnn``.

    :param tmpdir: directory in which the three CSV files are generated.
    """
    # seq-to-seq test to use train, test, validation files
    inputs = [text_feature(reduce_output=None, encoder="rnn")]
    outputs = [text_feature(reduce_input=None, vocab_size=3, decoder="tagger")]

    train_path = os.path.join(tmpdir, "train.csv")
    test_path = os.path.join(tmpdir, "test.csv")
    validation_path = os.path.join(tmpdir, "val.csv")

    # Generation order is kept: train, then test, then validation.
    split_files = {
        "training_set": generate_data(inputs, outputs, train_path),
        "test_set": generate_data(inputs, outputs, test_path, 20),
        "validation_set": generate_data(inputs, outputs, validation_path, 20),
    }

    run_experiment(inputs, outputs, **split_files)

    # Second run: same split files, different encoder.
    inputs[0]["encoder"] = "parallel_cnn"
    run_experiment(inputs, outputs, **split_files)
def test_basic_image_feature(num_channels, image_source, in_memory, skip_save_processed_input, csv_filename):
    """Run a stacked_cnn image experiment from file paths or in-memory tensors.

    When ``image_source`` is ``"file"`` the generated CSV is used directly;
    otherwise the CSV is loaded into a DataFrame and each image path is
    replaced by the result of ``torchvision.io.read_image`` before running.

    The generated image folder is removed in a ``finally`` clause so a
    failing run does not leave files behind in the working directory.

    :param num_channels: value for the ``num_channels`` preprocessing key.
    :param image_source: ``"file"`` to train from the CSV; any other value
        trains from the DataFrame with loaded images.
    :param in_memory: value for the ``in_memory`` preprocessing key.
    :param skip_save_processed_input: forwarded to ``run_experiment``.
    :param csv_filename: filename handed to ``generate_data``.
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    try:
        input_features = [
            image_feature(
                folder=image_dest_folder,
                encoder="stacked_cnn",
                preprocessing={
                    "in_memory": in_memory,
                    "height": 12,
                    "width": 12,
                    "num_channels": num_channels,
                    "num_processes": 5,
                },
                fc_size=16,
                num_filters=8,
            )
        ]
        output_features = [category_feature(vocab_size=2, reduce_input="sum")]

        rel_path = generate_data(input_features, output_features, csv_filename)

        if image_source == "file":
            # use images from file
            run_experiment(
                input_features,
                output_features,
                dataset=rel_path,
                skip_save_processed_input=skip_save_processed_input,
            )
        else:
            # import image from file and store in dataframe as tensors.
            df = pd.read_csv(rel_path)
            image_feature_name = input_features[0]["name"]
            df[image_feature_name] = df[image_feature_name].apply(
                lambda x: torchvision.io.read_image(x))

            run_experiment(
                input_features,
                output_features,
                dataset=df,
                skip_save_processed_input=skip_save_processed_input,
            )
    finally:
        # Delete the temporary data created, even when the test fails.
        shutil.rmtree(image_dest_folder, ignore_errors=True)
def test_experiment_sequence_combiner_with_reduction_fails(csv_filename):
    """Expect ``ValueError`` when the sequence combiner gets only reduced inputs.

    Both sequence inputs use the ``embed`` encoder with
    ``reduce_output="sum"`` and are fed to a ``sequence`` combiner.

    :param csv_filename: filename handed to ``generate_data``.
    """
    first_sequence = sequence_feature(
        name="seq1",
        min_len=5,
        max_len=5,
        encoder="embed",
        cell_type="lstm",
        reduce_output="sum",
    )
    second_sequence = sequence_feature(
        name="seq2",
        min_len=5,
        max_len=5,
        encoder="embed",
        cell_type="lstm",
        reduce_output="sum",
    )
    inputs = [first_sequence, second_sequence, category_feature(vocab_size=5)]
    outputs = [category_feature(reduce_input="sum", vocab_size=5)]

    sequence_combiner = {
        "type": "sequence",
        "encoder": "rnn",
        "main_sequence_feature": "seq1",
        "reduce_output": None,
    }
    config = {
        "input_features": inputs,
        "output_features": outputs,
        TRAINER: {"epochs": 2},
        "combiner": sequence_combiner,
    }

    # Generate test data
    data_path = generate_data(config["input_features"], config["output_features"], csv_filename)

    # Encoding sequence features with 'embed' should fail with SequenceConcatCombiner, since at least one sequence
    # feature should be rank 3.
    with pytest.raises(ValueError):
        run_experiment(config=config, dataset=data_path)