Example #1
def _preprocess_df_for_training(
        features,
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed):
    """ Method to pre-process dataframes. This doesn't have the optoin to save the
    processed data as hdf5 as we don't expect users to do this as the data can
    be processed in memory
    """

    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        data_df = concatenate_df(data_train_df, data_validation_df,
                                 data_test_df)

    data, train_set_metadata = build_dataset_df(data_df,
                                                features,
                                                preprocessing_params,
                                                random_seed=random_seed)
    training_set, test_set, validation_set = split_dataset_tvt(
        data, data['split'])
    return training_set, test_set, validation_set, train_set_metadata
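The split at the end of this example relies on a 'split' column (0 = train, 1 = validation, 2 = test) that build_dataset_df is expected to add. A minimal pandas sketch of the same idea, using a made-up toy frame rather than Ludwig's actual split_dataset_tvt:

import pandas as pd

# Toy frame standing in for the processed data; the 'split' column marks
# train (0), validation (1) and test (2) rows.
data = pd.DataFrame({
    'feature': [10, 20, 30, 40, 50, 60],
    'split': [0, 0, 0, 1, 2, 2],
})

# Rough equivalent of split_dataset_tvt: select rows by split value.
training_set = data[data['split'] == 0]
validation_set = data[data['split'] == 1]
test_set = data[data['split'] == 2]

print(len(training_set), len(validation_set), len(test_set))  # 3 1 2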
Example #2
def test_number_feature_wrong_dtype(csv_filename, tmpdir):
    """Tests that a number feature with all string values is treated as having missing values by default."""
    data_csv_path = os.path.join(tmpdir, csv_filename)

    num_feat = number_feature()
    input_features = [num_feat]
    output_features = [binary_feature()]
    config = {
        "input_features": input_features,
        "output_features": output_features
    }

    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # convert numbers to random strings
    def random_string():
        letters = string.ascii_lowercase
        return "".join(random.choice(letters) for _ in range(10))

    df[num_feat[COLUMN]] = df[num_feat[COLUMN]].apply(
        lambda _: random_string())

    # run preprocessing
    backend = LocalTestBackend()
    ludwig_model = LudwigModel(config, backend=backend)
    train_ds, val_ds, test_ds, _ = ludwig_model.preprocess(dataset=df)

    concatenated_df = concatenate_df(train_ds.to_df(), val_ds.to_df(),
                                     test_ds.to_df(), backend)

    # check that invalid values were replaced with the missing value fill (0.0) in every split
    assert len(concatenated_df) == len(df)
    assert np.all(concatenated_df[num_feat[PROC_COLUMN]] == 0.0)
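Note that this newer concatenate_df takes a backend as its fourth argument, unlike the three-argument form in Examples #1 and #5. For a purely local pandas workflow, the concatenation step can be sketched with plain pandas (illustrative only; Ludwig's concatenate_df also supports other backends):

import pandas as pd

def concat_splits(train_df, val_df, test_df):
    # Concatenate whichever splits are present into a single frame,
    # mirroring what the concatenate_df call above produces.
    parts = [p for p in (train_df, val_df, test_df) if p is not None]
    return pd.concat(parts, ignore_index=True)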
Example #3
def test_image_resizing_num_channel_handling(csv_filename):
    """
    This test creates two image datasets, one with 3 channels and one with
    1 channel, and trains a model on their combination. It checks the cases
    where the user may or may not specify the number of channels in the
    config.
    :param csv_filename:
    :return:
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(folder=image_dest_folder,
                      encoder='resnet',
                      preprocessing={
                          'in_memory': True,
                          'height': 8,
                          'width': 8,
                          'num_channels': 3,
                          'num_processes': 5
                      },
                      fc_size=8,
                      num_filters=8),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='minmax')
    ]
    output_features = [binary_feature(), numerical_feature()]
    rel_path = generate_data(input_features,
                             output_features,
                             csv_filename,
                             num_examples=50)

    df1 = read_csv(rel_path)

    input_features[0]['preprocessing']['num_channels'] = 1
    rel_path = generate_data(input_features,
                             output_features,
                             csv_filename,
                             num_examples=50)
    df2 = read_csv(rel_path)

    df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
    df.to_csv(rel_path, index=False)

    # Here the user specifies the number of channels. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features[0]['preprocessing']['num_channels']

    # User now doesn't specify num channels. Should throw exception
    with pytest.raises(ValueError):
        run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #4
def test_image_resizing_num_channel_handling(csv_filename):
    """This test creates two image datasets with 3 channels and 1 channel. The combination of this data is used to
    train a model. This checks the cases where the user may or may not specify a number of channels in the config.

    :param csv_filename:
    :return:
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 8,
                "width": 8,
                "num_channels": 3,
                "num_processes": 5
            },
            fc_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="minmax"),
    ]
    output_features = [binary_feature(), numerical_feature()]
    rel_path = generate_data(input_features,
                             output_features,
                             csv_filename,
                             num_examples=50)

    df1 = read_csv(rel_path)

    input_features[0]["preprocessing"]["num_channels"] = 1
    rel_path = generate_data(input_features,
                             output_features,
                             csv_filename,
                             num_examples=50)
    df2 = read_csv(rel_path)

    df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
    df.to_csv(rel_path, index=False)

    # Here the user specifies number of channels. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    del input_features[0]["preprocessing"]["num_channels"]

    # User doesn't specify num channels, but num channels is inferred. Exception shouldn't be thrown
    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
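Examples #3 and #4 differ only in the last step: the older test expects a ValueError when num_channels is missing, while this version expects the channel count to be inferred from the data. A conceptual sketch of that kind of inference (not Ludwig's actual implementation; the path is illustrative):

import numpy as np
from PIL import Image

def infer_num_channels(image_path):
    # Grayscale images load as 2-D arrays; otherwise the last axis is the channel count.
    arr = np.asarray(Image.open(image_path))
    return 1 if arr.ndim == 2 else arr.shape[-1]

# e.g. infer_num_channels("generated_images/random_image_0.jpg")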
Example #5
def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed):
    # Check if hdf5 and json already exist
    data_hdf5_fp = None
    data_train_hdf5_fp = None
    data_validation_hdf5_fp = None
    data_test_hdf5_fp = None
    train_set_metadata_json_fp = 'metadata.json'
    if data_csv is not None:
        data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(data_csv)[0] + '.json'
        if (os.path.isfile(data_hdf5_fp)
                and os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename '
                         'as the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_train_csv is not None:
        data_train_hdf5_fp = os.path.splitext(data_train_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(
            data_train_csv)[0] + '.json'
        if (os.path.isfile(data_train_hdf5_fp)
                and os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename '
                         'as the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = data_train_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_validation_csv is not None:
        data_validation_hdf5_fp = os.path.splitext(
            data_validation_csv)[0] + '.hdf5'
        if os.path.isfile(data_validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename '
                         'as the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = data_validation_hdf5_fp

    if data_test_csv is not None:
        data_test_hdf5_fp = os.path.splitext(data_test_csv)[0] + '.hdf5'
        if os.path.isfile(data_test_hdf5_fp):
            logging.info('Found hdf5 with the same filename '
                         'as the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = data_test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    # Decide whether to preprocess or just load
    features = (model_definition['input_features'] +
                model_definition['output_features'])
    (concatenate_csv, concatenate_df, build_dataset,
     build_dataset_df) = get_dataset_fun(dataset_type)

    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset_df(data_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
        logging.info('Writing train set metadata with vocabulary')
        data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_df(data_train_df, data_validation_df,
                                         data_test_df)
        data, train_set_metadata = build_dataset_df(concatenated_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp, training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp, validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp, test_set,
                                     train_set_metadata)
        logging.info('Writing train set metadata with vocabulary')
        data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)

    elif data_csv is not None:
        # Use data and ignore _train, _validation and _test.
        # Also ignore hdf5 data and train set metadata; needs preprocessing
        logging.info('Using full raw csv, no hdf5 and json files '
                     'with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(data_csv,
                                                 features,
                                                 preprocessing_params,
                                                 random_seed=random_seed)
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])

    elif data_train_csv is not None:
        # use data_train (including _validation and _test if they are present)
        # and ignore data and train set metadata
        # needs preprocessing
        logging.info('Using training raw csv, no hdf5 and json '
                     'files with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(data_train_csv, data_validation_csv,
                                          data_test_csv)
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(concatenated_df,
                                                    features,
                                                    preprocessing_params,
                                                    random_seed=random_seed)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp, training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp, validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp, test_set,
                                     train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)

    elif data_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True)
        train_set_metadata = load_metadata(train_set_metadata_json)

    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using hdf5 and json')
        training_set = load_data(data_train_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        train_set_metadata = load_metadata(train_set_metadata_json)
        if data_validation_hdf5 is not None:
            validation_set = load_data(data_validation_hdf5,
                                       model_definition['input_features'],
                                       model_definition['output_features'],
                                       split_data=False)
        else:
            validation_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(data_test_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        else:
            test_set = None

    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(model_definition,
                               [training_set, validation_set, test_set])

    training_dataset = Dataset(training_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(validation_set,
                                     model_definition['input_features'],
                                     model_definition['output_features'],
                                     data_hdf5_fp)

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(test_set, model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    return (training_dataset, validation_dataset, test_dataset,
            train_set_metadata)
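The branching at the top of this example hinges on cache files sitting next to each raw csv: the hdf5 and json paths are derived by swapping the extension. A small illustration of that derivation (the path is made up):

import os

data_csv = '/data/reviews.csv'  # illustrative path
data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'  # '/data/reviews.hdf5'
train_set_metadata_json_fp = os.path.splitext(data_csv)[0] + '.json'  # '/data/reviews.json'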
Example #6
def test_image_resizing_num_channel_handling(csv_filename):
    """
    This test creates two image datasets, one with 3 channels and one with
    1 channel, and trains a model on their combination. It checks the cases
    where the user may or may not specify the number of channels in the
    model definition.
    :param csv_filename:
    :return:
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    input_features_template = Template(
        "[{type: text, name: random_text, vocab_size: 100,"
        " max_len: 10, encoder: stacked_cnn}, {type: numerical,"
        " name: random_number}, "
        "{type: image, name: random_image, encoder: ${encoder}, preprocessing:"
        " {width: 10, in_memory: ${in_memory},"
        " height: 10, num_channels: 3},"
        " resnet_size: 8, destination_folder: ${folder}}]")

    # Resnet encoder
    input_features = input_features_template.substitute(
        encoder='resnet',
        folder=image_dest_folder,
        in_memory='true',
    )
    output_features = "[{type: binary, name: intent, reduce_input: sum}, " \
                      "{type: numerical, name: random_num_output}]"

    rel_path = generate_data(input_features, output_features, csv_filename)

    df1 = pd.read_csv(rel_path)

    input_features_template = Template(
        "[{type: text, name: random_text, vocab_size: 100,"
        " max_len: 10, encoder: stacked_cnn}, {type: numerical,"
        " name: random_number}, "
        "{type: image, name: random_image, preprocessing: {width: 10,"
        " in_memory: ${in_memory}, height: 10, num_channels: 1},"
        " encoder: ${encoder},"
        " resnet_size: 8, destination_folder: ${folder}}]")

    input_features = input_features_template.substitute(
        encoder='resnet',
        folder=image_dest_folder,
        in_memory='true',
    )
    rel_path = generate_data(input_features, output_features, csv_filename)
    df2 = pd.read_csv(rel_path)

    df = concatenate_df(df1, df2, None)
    df.to_csv(rel_path, index=False)

    # Here the user specifies the number of channels. Exception shouldn't be thrown
    run_experiment(input_features, output_features, rel_path)

    input_features_template = Template(
        "[{type: text, name: random_text, vocab_size: 100,"
        " max_len: 10, encoder: stacked_cnn}, {type: numerical,"
        " name: random_number}, "
        "{type: image, name: random_image, preprocessing: {width: 10, "
        "in_memory: ${in_memory}, height: 10} , encoder: ${encoder},"
        " resnet_size: 8, destination_folder: ${folder}}]")

    input_features = input_features_template.substitute(
        encoder='resnet',
        folder=image_dest_folder,
        in_memory='true',
    )
    # User now doesn't specify num channels. Should throw exception
    with pytest.raises(ValueError):
        run_experiment(input_features, output_features, rel_path)

    # Delete the temporary data created
    all_images = glob.glob(os.path.join(image_dest_folder, '*.jpg'))
    for im in all_images:
        os.remove(im)

    os.rmdir(image_dest_folder)
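This older variant deletes the generated jpg files one by one before os.rmdir; the newer tests (Examples #3 and #4) do the same cleanup in a single call:

import os
import shutil

image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
# Remove the generated image folder and everything in it in one call.
shutil.rmtree(image_dest_folder, ignore_errors=True)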