Example #1
def test_experiment_h3(encoder, csv_filename):
    input_features = [h3_feature()]
    output_features = [binary_feature()]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    input_features[0]['encoder'] = encoder
    run_experiment(input_features, output_features, dataset=rel_path)
Example #2
def test_image_resizing_num_channel_handling(csv_filename):
    """
    This test creates two image datasets with 3 channels and 1 channel. The
    combination of this data is used to train a model. This checks the cases
    where the user may or may not specify a number of channels in the
    config
    :param csv_filename:
    :return:
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(folder=image_dest_folder,
                      encoder='resnet',
                      preprocessing={
                          'in_memory': True,
                          'height': 8,
                          'width': 8,
                          'num_channels': 3,
                          'num_processes': 5
                      },
                      fc_size=8,
                      num_filters=8),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='minmax')
    ]
    output_features = [binary_feature(), numerical_feature()]
    rel_path = generate_data(input_features,
                             output_features,
                             csv_filename,
                             num_examples=50)

    df1 = read_csv(rel_path)

    input_features[0]['preprocessing']['num_channels'] = 1
    rel_path = generate_data(input_features,
                             output_features,
                             csv_filename,
                             num_examples=50)
    df2 = read_csv(rel_path)

    df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
    df.to_csv(rel_path, index=False)

    # Here the user specifies the number of channels. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features[0]['preprocessing']['num_channels']

    # User now doesn't specify num channels. Should throw exception
    with pytest.raises(ValueError):
        run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #3
def test_experiment_date(encoder, csv_filename):
    input_features = [date_feature()]
    output_features = [category_feature(vocab_size=2)]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    input_features[0]['encoder'] = encoder
    run_experiment(input_features, output_features, dataset=rel_path)
Example #4
def test_experiment_multiclass_with_class_weights(csv_filename):
    # Single category input, single category output with class weights
    input_features = [category_feature(vocab_size=10)]
    output_features = [category_feature(vocab_size=3,
                                        loss={"class_weights": [0, 1, 2, 3]})]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)
Example #5
def test_image_resizing_num_channel_handling(csv_filename):
    """This test creates two image datasets with 3 channels and 1 channel. The combination of this data is used to
    train a model. This checks the cases where the user may or may not specify a number of channels in the config.

    :param csv_filename:
    :return:
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 8,
                "width": 8,
                "num_channels": 3,
                "num_processes": 5
            },
            fc_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="minmax"),
    ]
    output_features = [binary_feature(), numerical_feature()]
    rel_path = generate_data(input_features,
                             output_features,
                             csv_filename,
                             num_examples=50)

    df1 = read_csv(rel_path)

    input_features[0]["preprocessing"]["num_channels"] = 1
    rel_path = generate_data(input_features,
                             output_features,
                             csv_filename,
                             num_examples=50)
    df2 = read_csv(rel_path)

    df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
    df.to_csv(rel_path, index=False)

    # Here the user specifies number of channels. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features[0]["preprocessing"]["num_channels"]

    # User doesn't specify num channels, but num channels is inferred. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #6
def test_sequence_tagger_text(csv_filename):
    # Define input and output features
    input_features = [text_feature(max_len=10, encoder="rnn", reduce_output=None)]
    output_features = [sequence_feature(max_len=10, decoder="tagger", reduce_input=None)]
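    # The tagger decoder emits one label per input token, so the encoder
    # output is left unreduced (reduce_output=None / reduce_input=None)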

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # run the experiment
    run_experiment(input_features, output_features, dataset=rel_path)
Example #7
def test_experiment_timeseries(csv_filename):
    input_features = [timeseries_feature()]
    output_features = [binary_feature()]

    encoders2 = ['transformer']
    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    for encoder in encoders2:
        input_features[0]['encoder'] = encoder
        run_experiment(input_features, output_features, dataset=rel_path)
Example #8
def test_experiment_multi_input_intent_classification(csv_filename, encoder):
    # Multiple inputs, Single category output
    input_features = [text_feature(vocab_size=10, min_len=1, representation="sparse"), category_feature(vocab_size=10)]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    input_features[0]["encoder"] = encoder
    run_experiment(input_features, output_features, dataset=rel_path)
Example #9
def test_reduction(reduce_output, csv_filename):
    input_features = [sequence_feature(reduce_output=reduce_output)]

    output_features = [category_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features
    del output_features
Example #10
def test_experiment_audio_inputs(tmpdir):
    # Audio Inputs
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")
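    # tmpdir is a pytest fixture; the generated files are managed by pytest,
    # so no explicit shutil.rmtree cleanup is needed here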

    input_features = [audio_feature(folder=audio_dest_folder)]
    output_features = [binary_feature()]

    rel_path = generate_data(input_features, output_features,
                             os.path.join(tmpdir, "dataset.csv"))

    run_experiment(input_features, output_features, dataset=rel_path)
Example #11
def test_experiment_text_feature_non_HF(encoder, csv_filename):
    input_features = [
        text_feature(vocab_size=30,
                     min_len=1,
                     encoder=encoder,
                     preprocessing={'word_tokenizer': 'space'})
    ]
    output_features = [category_feature(vocab_size=2)]
    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)
Example #12
def test_experiment_text_feature_HF(encoder, csv_filename):
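    # HuggingFace encoders expect text preprocessed with the matching
    # 'hf_tokenizer' word tokenizer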
    input_features = [
        text_feature(vocab_size=30,
                     min_len=1,
                     reduce_output=None,
                     encoder=encoder,
                     preprocessing={'word_tokenizer': 'hf_tokenizer'})
    ]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]
    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)
Example #13
def test_experiment_multiple_seq_seq(csv_filename, output_features):
    input_features = [
        text_feature(vocab_size=100, min_len=1, encoder='stacked_cnn'),
        numerical_feature(normalization='zscore'),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder='embed')
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)
Example #14
def test_experiment_timeseries(csv_filename):
    input_features = [timeseries_feature()]
    output_features = [binary_feature()]

    encoders2 = [
        'rnn', 'cnnrnn', 'stacked_cnn', 'parallel_cnn', 'stacked_parallel_cnn'
    ]
    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    for encoder in encoders2:
        input_features[0]['encoder'] = encoder
        run_experiment(input_features, output_features, dataset=rel_path)
Example #15
def test_experiment_audio_inputs(csv_filename):
    # Audio Inputs
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [audio_feature(folder=audio_dest_folder)]
    output_features = [binary_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(audio_dest_folder)
Example #16
def test_experiment_sampled_softmax(csv_filename):
    # Single text input, single category output with sampled softmax loss
    input_features = [text_feature(vocab_size=10, min_len=1)]
    output_features = [category_feature(
        vocab_size=500,
        loss={'type': 'sampled_softmax_cross_entropy'}
    )]

    # Generate test data
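    # (sampled softmax scores only a sample of the 500 output classes per
    # step, so a larger dataset is generated to cover the vocabulary)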
    rel_path = generate_data(input_features, output_features, csv_filename,
                             num_examples=10000)

    run_experiment(input_features, output_features, dataset=rel_path)
Example #17
def test_experiment_seq_seq(csv_filename):
    # Single Sequence input, single sequence output
    # Only the following encoders are working
    input_features = [text_feature(reduce_output=None, encoder='rnn')]
    output_features = [text_feature(reduce_input=None, decoder='tagger')]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    encoders2 = ['cnnrnn', 'stacked_cnn']
    for encoder in encoders2:
        logger.info('seq to seq test, Encoder: {0}'.format(encoder))
        input_features[0]['encoder'] = encoder
        run_experiment(input_features, output_features, dataset=rel_path)
Example #18
def test_experiment_seq_seq(csv_filename):
    # Single Sequence input, single sequence output
    # Only the following encoders are working
    input_features = [text_feature(reduce_output=None, encoder="rnn")]
    output_features = [text_feature(reduce_input=None, decoder="tagger")]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    encoders2 = ["cnnrnn", "stacked_cnn"]
    for encoder in encoders2:
        logger.info(f"seq to seq test, Encoder: {encoder}")
        input_features[0]["encoder"] = encoder
        run_experiment(input_features, output_features, dataset=rel_path)
Example #19
def test_basic_image_feature(num_channels, image_source, in_memory,
                             skip_save_processed_input, csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='stacked_cnn',
            preprocessing={
                'in_memory': in_memory,
                'height': 12,
                'width': 12,
                'num_channels': num_channels,
                'num_processes': 5
            },
            fc_size=16,
            num_filters=8
        )
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum')
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)

    if image_source == 'file':
        # use images from file
        run_experiment(
            input_features,
            output_features,
            dataset=rel_path,
            skip_save_processed_input=skip_save_processed_input
        )
    else:
        # import image from file and store in dataframe as ndarrays
        df = pd.read_csv(rel_path)
        image_feature_name = input_features[0]['name']
        df[image_feature_name] = df[image_feature_name].apply(
            lambda x: imread(x))

        run_experiment(
            input_features,
            output_features,
            dataset=df,
            skip_save_processed_input=skip_save_processed_input
        )

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder, ignore_errors=True)
Example #20
def run_experiment_with_encoder(encoder, csv_filename):
    # Run in a subprocess to clear TF and prevent OOM
    # This also allows us to use GPU resources
    input_features = [
        text_feature(
            vocab_size=30,
            min_len=1,
            encoder=encoder,
        )
    ]
    output_features = [category_feature(vocab_size=2)]
    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)
Example #21
def test_experiment_multi_input_intent_classification(csv_filename):
    # Multiple inputs, Single category output
    input_features = [
        text_feature(vocab_size=10, min_len=1, representation='sparse'),
        category_feature(vocab_size=10)
    ]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    for encoder in ENCODERS:
        input_features[0]['encoder'] = encoder
        run_experiment(input_features, output_features, dataset=rel_path)
Example #22
def test_tied_macro_level(tied_use_case, csv_filename):
    input_features = [
        numerical_feature(),  # Other feature
        tied_use_case.input_feature(),  # first feature to be tied
        tied_use_case.input_feature(),  # second feature to be tied
        category_feature()  # other feature
    ]
    # tie second feature to first feature
    input_features[2]['tied'] = input_features[1]['name']

    # setup output feature
    output_features = [tied_use_case.output_feature()]

    # Generate test data and run full_experiment
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)
Example #23
def test_tied_macro_level(tied_use_case: TiedUseCase, csv_filename: str):
    input_features = [
        number_feature(),  # Other feature
        tied_use_case.input_feature(),  # first feature to be tied
        tied_use_case.input_feature(),  # second feature to be tied
        category_feature(),  # other feature
    ]
    # tie second feature to first feature
    input_features[2]["tied"] = input_features[1]["name"]

    # setup output feature
    output_features = [tied_use_case.output_feature()]

    # Generate test data and run full_experiment
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)
Example #24
def test_experiment_tied_weights(csv_filename):
    # Two text inputs with tied encoder weights, single category output
    input_features = [
        text_feature(name="text_feature1", min_len=1, encoder="cnnrnn", reduce_output="sum"),
        text_feature(
            name="text_feature2", min_len=1, encoder="cnnrnn", reduce_output="sum", tied_weights="text_feature1"
        ),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    for encoder in ENCODERS:
        input_features[0]["encoder"] = encoder
        input_features[1]["encoder"] = encoder
        run_experiment(input_features, output_features, dataset=rel_path)
Example #25
def test_sequence_generator(
        enc_encoder,
        enc_cell_type,
        dec_cell_type,
        dec_attention,
        dec_beam_width,
        csv_filename
):
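    # Use the pure-Python implementations of TensorFlow Addons ops; the
    # compiled custom ops can be binary-incompatible with some TF builds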
    tfa.options.TF_ADDONS_PY_OPS = True

    with graph_mode():
        # Define input and output features
        input_features = [
            sequence_feature(
                min_len=5,
                max_len=10,
                encoder='rnn',
                cell_type='lstm',
                reduce_output=None
            )
        ]
        output_features = [
            sequence_feature(
                min_len=5,
                max_len=10,
                decoder='generator',
                cell_type='lstm',
                attention='bahdanau',
                reduce_input=None
            )
        ]

        # Generate test data
        rel_path = generate_data(input_features, output_features, csv_filename)

        # setup encoder specification
        input_features[0]['encoder'] = enc_encoder
        input_features[0]['cell_type'] = enc_cell_type

        # setup decoder specification
        output_features[0]['cell_type'] = dec_cell_type
        output_features[0]['attention'] = dec_attention
        output_features[0]['beam_width'] = dec_beam_width

        # run the experiment
        run_experiment(input_features, output_features, dataset=rel_path)
Example #26
def test_experiment_seq_seq_model_def_file(csv_filename, yaml_filename):
    # seq-to-seq test to use config file instead of dictionary
    input_features = [text_feature(reduce_output=None, encoder="embed")]
    output_features = [text_feature(reduce_input=None, vocab_size=3, decoder="tagger")]

    # Save the config to a yaml file
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {"epochs": 2},
    }
    with open(yaml_filename, "w") as yaml_out:
        yaml.safe_dump(config, yaml_out)

    rel_path = generate_data(input_features, output_features, csv_filename)
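    # Features are passed as None; the config loaded from the yaml file
    # defines them instead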
    run_experiment(None, None, dataset=rel_path, config=yaml_filename)
Example #27
def test_visual_question_answering(csv_filename):
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={"in_memory": True, "height": 8, "width": 8, "num_channels": 3, "num_processes": 5},
            fc_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1, level="word"),
    ]
    output_features = [sequence_feature(decoder="generator", cell_type="lstm")]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #28
def test_experiment_seq_seq_train_test_valid(tmpdir):
    # seq-to-seq test to use train, test, validation files
    input_features = [text_feature(reduce_output=None, encoder="rnn")]
    output_features = [text_feature(reduce_input=None, vocab_size=3, decoder="tagger")]

    train_csv = generate_data(input_features, output_features, os.path.join(tmpdir, "train.csv"))
    test_csv = generate_data(input_features, output_features, os.path.join(tmpdir, "test.csv"), 20)
    validation_csv = generate_data(input_features, output_features, os.path.join(tmpdir, "val.csv"), 20)

    run_experiment(
        input_features, output_features, training_set=train_csv, test_set=test_csv, validation_set=validation_csv
    )

    input_features[0]["encoder"] = "parallel_cnn"
    # Rerun with a different text encoder
    run_experiment(
        input_features, output_features, training_set=train_csv, test_set=test_csv, validation_set=validation_csv
    )
Example #29
def test_basic_image_feature(num_channels, image_source, in_memory,
                             skip_save_processed_input, csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="stacked_cnn",
            preprocessing={
                "in_memory": in_memory,
                "height": 12,
                "width": 12,
                "num_channels": num_channels,
                "num_processes": 5,
            },
            fc_size=16,
            num_filters=8,
        )
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    if image_source == "file":
        # use images from file
        run_experiment(input_features,
                       output_features,
                       dataset=rel_path,
                       skip_save_processed_input=skip_save_processed_input)
    else:
        # import image from file and store in dataframe as tensors.
        df = pd.read_csv(rel_path)
        image_feature_name = input_features[0]["name"]
        df[image_feature_name] = df[image_feature_name].apply(
            lambda x: torchvision.io.read_image(x))

        run_experiment(input_features,
                       output_features,
                       dataset=df,
                       skip_save_processed_input=skip_save_processed_input)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder, ignore_errors=True)
Example #30
def test_experiment_sequence_combiner_with_reduction_fails(csv_filename):
    config = {
        "input_features": [
            sequence_feature(
                name="seq1",
                min_len=5,
                max_len=5,
                encoder="embed",
                cell_type="lstm",
                reduce_output="sum",
            ),
            sequence_feature(
                name="seq2",
                min_len=5,
                max_len=5,
                encoder="embed",
                cell_type="lstm",
                reduce_output="sum",
            ),
            category_feature(vocab_size=5),
        ],
        "output_features":
        [category_feature(reduce_input="sum", vocab_size=5)],
        TRAINER: {
            "epochs": 2
        },
        "combiner": {
            "type": "sequence",
            "encoder": "rnn",
            "main_sequence_feature": "seq1",
            "reduce_output": None,
        },
    }

    # Generate test data
    rel_path = generate_data(config["input_features"],
                             config["output_features"], csv_filename)

    # Encoding sequence features with 'embed' should fail with SequenceConcatCombiner, since at least one sequence
    # feature should be rank 3.
    with pytest.raises(ValueError):
        run_experiment(config=config, dataset=rel_path)