Example #1
    def add_feature_data(feature_config, input_df, proc_df, metadata,
                         preprocessing_parameters, backend,
                         skip_save_processed_input):
        set_default_value(feature_config["preprocessing"], "in_memory",
                          preprocessing_parameters["in_memory"])

        if "audio_feature" not in preprocessing_parameters:
            raise ValueError(
                "audio_feature dictionary has to be present in preprocessing "
                "for audio.")
        if TYPE not in preprocessing_parameters["audio_feature"]:
            raise ValueError(
                "type has to be present in audio_feature dictionary "
                "for audio.")

        name = feature_config[NAME]
        column = feature_config[COLUMN]
        proc_column = feature_config[PROC_COLUMN]

        src_path = None
        # this is not super nice, but it works with both DFs and lists
        first_path = "."
        for first_path in input_df[column]:
            break
        if SRC in metadata:
            src_path = os.path.dirname(os.path.abspath(metadata.get(SRC)))
        if src_path is None and not os.path.isabs(first_path):
            raise ValueError("Audio file paths must be absolute")

        num_audio_utterances = len(input_df[feature_config[COLUMN]])
        padding_value = preprocessing_parameters["padding_value"]
        normalization_type = preprocessing_parameters["norm"]

        feature_dim = metadata[name]["feature_dim"]
        max_length = metadata[name]["max_length"]
        audio_feature_dict = preprocessing_parameters["audio_feature"]
        audio_file_length_limit_in_s = preprocessing_parameters[
            "audio_file_length_limit_in_s"]

        if num_audio_utterances == 0:
            raise ValueError(
                "There are no audio files in the dataset provided.")

        if feature_config[PREPROCESSING]["in_memory"]:
            audio_features = AudioFeatureMixin._process_in_memory(
                input_df[feature_config[NAME]],
                src_path,
                audio_feature_dict,
                feature_dim,
                max_length,
                padding_value,
                normalization_type,
                audio_file_length_limit_in_s,
                backend,
            )
            proc_df[proc_column] = audio_features
        else:
            backend.check_lazy_load_supported(feature_config)

        return proc_df
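
Every example on this page is built on two small Ludwig utility helpers, set_default_value and set_default_values. Their implementation is not shown in any of the snippets; the following is a minimal sketch consistent with how they are used here (the actual functions live in Ludwig's utilities and may differ in detail):

def set_default_value(dictionary, key, value):
    # Fill in the value only when the user has not already set the key.
    if key not in dictionary:
        dictionary[key] = value


def set_default_values(dictionary, default_value_dictionary):
    # Apply set_default_value for every (key, value) pair of defaults.
    for key, value in default_value_dictionary.items():
        set_default_value(dictionary, key, value)

Both helpers mutate the dictionary in place, which is why the populate_defaults and merge_with_defaults functions in these examples can layer defaults onto a user config without overwriting anything the user has already specified.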
Example #2
    def populate_defaults(output_feature):
        # If Loss is not defined, set an empty dictionary
        set_default_value(output_feature, LOSS, {})

        # Populate the default values for LOSS if they aren't defined already
        set_default_values(
            output_feature[LOSS],
            {
                TYPE: "softmax_cross_entropy",
                "labels_smoothing": 0,
                "class_weights": 1,
                "robust_lambda": 0,
                "confidence_penalty": 0,
                "class_similarities_temperature": 0,
                "weight": 1,
            },
        )

        if output_feature[LOSS][TYPE] == "sampled_softmax_cross_entropy":
            set_default_values(
                output_feature[LOSS],
                {"sampler": "log_uniform", "unique": False, "negative_samples": 25, "distortion": 0.75},
            )

        set_default_values(
            output_feature, {"top_k": 3, "dependencies": [], "reduce_input": SUM, "reduce_dependencies": SUM}
        )
Example #3
    def populate_defaults(output_feature):
        # If Loss is not defined, set an empty dictionary
        set_default_value(output_feature, LOSS, {})
        set_default_values(
            output_feature[LOSS],
            {
                'robust_lambda': 0,
                'confidence_penalty': 0,
                'positive_class_weight': 1,
                'weight': 1
            }
        )

        set_default_value(output_feature[LOSS], 'robust_lambda', 0)
        set_default_value(output_feature[LOSS], 'confidence_penalty', 0)
        set_default_value(output_feature[LOSS], 'positive_class_weight', 1)
        set_default_value(output_feature[LOSS], 'weight', 1)

        set_default_values(
            output_feature,
            {
                'threshold': 0.5,
                'dependencies': [],
                'reduce_input': SUM,
                'reduce_dependencies': SUM
            }
        )
Example #4
    def populate_defaults(output_feature):
        # If Loss is not defined, set an empty dictionary
        set_default_value(output_feature, LOSS, {})
        set_default_values(
            output_feature[LOSS],
            {
                "robust_lambda": 0,
                "confidence_penalty": 0,
                "positive_class_weight": 1,
                "weight": 1,
            },
        )

        set_default_value(output_feature[LOSS], "robust_lambda", 0)
        set_default_value(output_feature[LOSS], "confidence_penalty", 0)
        set_default_value(output_feature[LOSS], "positive_class_weight", 1)
        set_default_value(output_feature[LOSS], "weight", 1)

        set_default_values(
            output_feature,
            {
                "threshold": 0.5,
                "dependencies": [],
                "reduce_input": SUM,
                "reduce_dependencies": SUM,
            },
        )
Example #5
    def populate_defaults(output_feature):
        # If Loss is not defined, set an empty dictionary
        set_default_value(output_feature, LOSS, {})

        # Populate the default values for LOSS if they aren't defined already
        set_default_values(
            output_feature[LOSS], {
                TYPE: 'softmax_cross_entropy',
                'labels_smoothing': 0,
                'class_weights': 1,
                'robust_lambda': 0,
                'confidence_penalty': 0,
                'class_similarities_temperature': 0,
                'weight': 1
            })

        if output_feature[LOSS][TYPE] == 'sampled_softmax_cross_entropy':
            set_default_values(
                output_feature[LOSS], {
                    'sampler': 'log_uniform',
                    'unique': False,
                    'negative_samples': 25,
                    'distortion': 0.75
                })

        set_default_values(
            output_feature, {
                'top_k': 3,
                'dependencies': [],
                'reduce_input': SUM,
                'reduce_dependencies': SUM
            })
Example #6
def merge_with_defaults(config):
    config = copy.deepcopy(config)
    _perform_sanity_checks(config)
    _set_feature_column(config)
    _set_proc_column(config)
    _merge_hyperopt_with_training(config)

    # ===== Preprocessing =====
    config["preprocessing"] = merge_dict(default_preprocessing_parameters,
                                         config.get("preprocessing", {}))

    stratify = config["preprocessing"]["stratify"]
    if stratify is not None:
        features = config["input_features"] + config["output_features"]
        feature_names = {f[COLUMN] for f in features}
        if stratify not in feature_names:
            logger.warning("Stratify is not among the features. "
                           "Cannot establish if it is a binary or category")
        elif [f for f in features
              if f[COLUMN] == stratify][0][TYPE] not in {BINARY, CATEGORY}:
            raise ValueError("Stratify feature must be binary or category")

    # ===== Training =====
    set_default_value(config, TRAINING, default_training_params)

    for param, value in default_training_params.items():
        set_default_value(config[TRAINING], param, value)

    set_default_value(
        config[TRAINING],
        "validation_metric",
        output_type_registry[config["output_features"][0]
                             [TYPE]].default_validation_metric,
    )

    # ===== Training Optimizer =====
    optimizer = config[TRAINING]["optimizer"]
    default_optimizer_params = get_default_optimizer_params(optimizer[TYPE])
    for param in default_optimizer_params:
        set_default_value(optimizer, param, default_optimizer_params[param])

    # ===== Input Features =====
    for input_feature in config["input_features"]:
        get_from_registry(input_feature[TYPE],
                          input_type_registry).populate_defaults(input_feature)

    # ===== Combiner =====
    set_default_value(config, "combiner", {TYPE: default_combiner_type})

    # ===== Output features =====
    for output_feature in config["output_features"]:
        get_from_registry(
            output_feature[TYPE],
            output_type_registry).populate_defaults(output_feature)

    return config
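
A minimal, hypothetical call site for merge_with_defaults (the field names follow the config conventions visible in the example; the exact keys required by _perform_sanity_checks depend on the Ludwig version):

config = {
    "input_features": [{"name": "review", "type": "text"}],
    "output_features": [{"name": "label", "type": "category"}],
}
config = merge_with_defaults(config)
# The returned copy now also carries "preprocessing", "training", "combiner",
# and per-feature defaults filled in by each feature's populate_defaults.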
Example #7
def merge_with_defaults(model_definition):
    _perform_sanity_checks(model_definition)

    # ===== Preprocessing =====
    model_definition['preprocessing'] = merge_dict(
        default_preprocessing_parameters,
        model_definition.get('preprocessing', {}))

    stratify = model_definition['preprocessing']['stratify']

    if stratify is not None:
        if stratify not in [
                x['name'] for x in model_definition['output_features']
        ]:
            raise ValueError('Stratify must be in output features')
        if ([
                x for x in model_definition['output_features']
                if x['name'] == stratify
        ][0][TYPE] not in [BINARY, CATEGORY]):
            raise ValueError('Stratify feature must be binary or category')
    # ===== Model =====
    set_default_value(model_definition, 'combiner',
                      {'type': default_combiner_type})

    # ===== Training =====
    set_default_value(model_definition, TRAINING, default_training_params)

    for param, value in default_training_params.items():
        set_default_value(model_definition[TRAINING], param, value)

    set_default_value(
        model_definition[TRAINING], 'validation_metric',
        output_type_registry[model_definition['output_features'][0]
                             [TYPE]].default_validation_metric)

    # ===== Training Optimizer =====
    optimizer = model_definition[TRAINING]['optimizer']
    default_optimizer_params = get_default_optimizer_params(optimizer[TYPE])
    for param in default_optimizer_params:
        set_default_value(optimizer, param, default_optimizer_params[param])

    # ===== Input Features =====
    for input_feature in model_definition['input_features']:
        get_from_registry(input_feature[TYPE],
                          input_type_registry).populate_defaults(input_feature)

    # ===== Output features =====
    for output_feature in model_definition['output_features']:
        get_from_registry(
            output_feature['type'],
            output_type_registry).populate_defaults(output_feature)

    return model_definition
Example #8
    def add_feature_data(feature, input_df, proc_df, metadata,
                         preprocessing_parameters, backend,
                         skip_save_processed_input):
        set_default_value(feature['preprocessing'], 'in_memory',
                          preprocessing_parameters['in_memory'])

        if 'audio_feature' not in preprocessing_parameters:
            raise ValueError(
                'audio_feature dictionary has to be present in preprocessing '
                'for audio.')
        if TYPE not in preprocessing_parameters['audio_feature']:
            raise ValueError(
                'type has to be present in audio_feature dictionary '
                'for audio.')

        name = feature[NAME]
        column = feature[COLUMN]
        proc_column = feature[PROC_COLUMN]

        src_path = None
        # this is not super nice, but it works with both DFs and lists
        first_path = '.'
        for first_path in input_df[column]:
            break
        if SRC in metadata:
            src_path = os.path.dirname(os.path.abspath(metadata.get(SRC)))
        if src_path is None and not os.path.isabs(first_path):
            raise ValueError('Audio file paths must be absolute')

        num_audio_utterances = len(input_df[feature[COLUMN]])
        padding_value = preprocessing_parameters['padding_value']
        normalization_type = preprocessing_parameters['norm']

        feature_dim = metadata[name]['feature_dim']
        max_length = metadata[name]['max_length']
        audio_feature_dict = preprocessing_parameters['audio_feature']
        audio_file_length_limit_in_s = preprocessing_parameters[
            'audio_file_length_limit_in_s']

        if num_audio_utterances == 0:
            raise ValueError(
                'There are no audio files in the dataset provided.')

        if feature[PREPROCESSING]['in_memory']:
            audio_features = AudioFeatureMixin._process_in_memory(
                input_df[feature[NAME]], src_path, audio_feature_dict,
                feature_dim, max_length, padding_value, normalization_type,
                audio_file_length_limit_in_s, backend)
            proc_df[proc_column] = audio_features
        else:
            backend.check_lazy_load_supported(feature)

        return proc_df
Example #9
def merge_with_defaults(config):
    _perform_sanity_checks(config)
    _set_feature_column(config)
    _set_proc_column(config)
    _merge_hyperopt_with_training(config)

    # ===== Preprocessing =====
    config['preprocessing'] = merge_dict(default_preprocessing_parameters,
                                         config.get('preprocessing', {}))

    stratify = config['preprocessing']['stratify']
    if stratify is not None:
        features = (config['input_features'] + config['output_features'])
        feature_names = set(f[COLUMN] for f in features)
        if stratify not in feature_names:
            logger.warning('Stratify is not among the features. '
                           'Cannot establish if it is a binary or category')
        elif ([f for f in features if f[COLUMN] == stratify][0][TYPE]
              not in {BINARY, CATEGORY}):
            raise ValueError('Stratify feature must be binary or category')

    # ===== Training =====
    set_default_value(config, TRAINING, default_training_params)

    for param, value in default_training_params.items():
        set_default_value(config[TRAINING], param, value)

    set_default_value(
        config[TRAINING], 'validation_metric', output_type_registry[
            config['output_features'][0][TYPE]].default_validation_metric)

    # ===== Training Optimizer =====
    optimizer = config[TRAINING]['optimizer']
    default_optimizer_params = get_default_optimizer_params(optimizer[TYPE])
    for param in default_optimizer_params:
        set_default_value(optimizer, param, default_optimizer_params[param])

    # ===== Input Features =====
    for input_feature in config['input_features']:
        get_from_registry(input_feature[TYPE],
                          input_type_registry).populate_defaults(input_feature)

    # ===== Combiner =====
    set_default_value(config, 'combiner', {TYPE: default_combiner_type})

    # ===== Output features =====
    for output_feature in config['output_features']:
        get_from_registry(
            output_feature[TYPE],
            output_type_registry).populate_defaults(output_feature)

    return config
Example #10
File: run.py  Project: ludwig-ai/ludwig
def update_hyperopt_params_with_defaults(hyperopt_params):
    from ludwig.hyperopt.execution import executor_registry

    set_default_value(hyperopt_params, EXECUTOR, {})
    set_default_value(hyperopt_params, "split", VALIDATION)
    set_default_value(hyperopt_params, "output_feature", COMBINED)
    set_default_value(hyperopt_params, "metric", LOSS)
    set_default_value(hyperopt_params, "goal", MINIMIZE)

    set_default_values(hyperopt_params[EXECUTOR], {TYPE: "ray"})
    executor = get_from_registry(hyperopt_params[EXECUTOR][TYPE],
                                 executor_registry)
    executor_defaults = {
        k: v
        for k, v in executor.__dict__.items()
        if k in get_class_attributes(executor)
    }
    set_default_values(
        hyperopt_params[EXECUTOR],
        executor_defaults,
    )
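
The executor_defaults comprehension above copies the executor class's own attribute values into the user's hyperopt parameters, but only for keys the user left unset. A self-contained toy illustration of that pattern (get_class_attributes here is a hypothetical stand-in, not Ludwig's implementation):

def get_class_attributes(cls):
    # Hypothetical stand-in: public, non-callable class attributes.
    return {
        name for name in vars(cls)
        if not name.startswith("_") and not callable(getattr(cls, name))
    }


class ToyExecutor:
    num_samples = 10
    cpu_resources_per_trial = 1


params = {"num_samples": 50}  # user-provided
defaults = {
    k: v
    for k, v in ToyExecutor.__dict__.items()
    if k in get_class_attributes(ToyExecutor)
}
for key, value in defaults.items():
    params.setdefault(key, value)  # same effect as set_default_values
print(params)  # {'num_samples': 50, 'cpu_resources_per_trial': 1}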
Example #11
    def populate_defaults(output_feature):
        # If Loss is not defined, set an empty dictionary
        set_default_value(output_feature, LOSS, {})

        # Populate the default values for LOSS if they aren't defined already
        set_default_values(
            output_feature[LOSS],
            {
                TYPE: "softmax_cross_entropy",
                "class_weights": 1,
                "robust_lambda": 0,
                "confidence_penalty": 0,
                "class_similarities_temperature": 0,
                "weight": 1,
            },
        )

        set_default_values(
            output_feature, {
                "top_k": 3,
                "dependencies": [],
                "reduce_input": SUM,
                "reduce_dependencies": SUM
            })
Example #12
    def populate_defaults(output_feature):
        set_default_value(output_feature, LOSS, {TYPE: "mean_squared_error", "weight": 1})
        set_default_value(output_feature[LOSS], TYPE, "mean_squared_error")
        set_default_value(output_feature[LOSS], "weight", 1)

        set_default_values(
            output_feature,
            {
                "clip": None,
                "dependencies": [],
                "reduce_input": SUM,
                "reduce_dependencies": SUM,
            },
        )
Example #13
    def populate_defaults(output_feature):
        set_default_value(output_feature, LOSS, {
            TYPE: 'mean_squared_error',
            'weight': 1
        })
        set_default_value(output_feature[LOSS], TYPE, 'mean_squared_error')
        set_default_value(output_feature[LOSS], 'weight', 1)

        set_default_values(
            output_feature, {
                'clip': None,
                'dependencies': [],
                'reduce_input': SUM,
                'reduce_dependencies': SUM
            })
Example #14
    def populate_defaults(input_feature):
        set_default_value(input_feature, TIED, None)
        set_default_value(input_feature, 'encoder', 'parallel_cnn')
Example #15
    def populate_defaults(output_feature):
        set_default_value(output_feature, LOSS, {TYPE: SIGMOID_CROSS_ENTROPY, "weight": 1})
        set_default_value(output_feature[LOSS], "weight", 1)
        set_default_value(output_feature[LOSS], "class_weights", None)

        set_default_value(output_feature, "threshold", 0.5)
        set_default_value(output_feature, "dependencies", [])
        set_default_value(output_feature, "reduce_input", SUM)
        set_default_value(output_feature, "reduce_dependencies", SUM)
Example #16
    def populate_defaults(input_feature):
        set_default_value(input_feature, TIED, None)
        set_default_value(input_feature, PREPROCESSING, {})
Example #17
    def add_feature_data(feature, input_df, proc_df, metadata,
                         preprocessing_parameters, backend):
        set_default_value(feature[PREPROCESSING], 'in_memory',
                          preprocessing_parameters['in_memory'])
        set_default_value(feature[PREPROCESSING], 'num_processes',
                          preprocessing_parameters['num_processes'])
        src_path = None
        if hasattr(input_df, 'src'):
            src_path = os.path.dirname(os.path.abspath(input_df.src))

        num_images = len(input_df)
        if num_images == 0:
            raise ValueError('There are no images in the dataset provided.')

        first_path = next(iter(input_df[feature[COLUMN]]))

        if src_path is None and not os.path.isabs(first_path):
            raise ValueError('Image file paths must be absolute')

        first_path = get_abs_path(src_path, first_path)

        (should_resize, width, height, num_channels,
         user_specified_num_channels,
         first_image) = ImageFeatureMixin._finalize_preprocessing_parameters(
             preprocessing_parameters, first_path)

        metadata[feature[NAME]][PREPROCESSING]['height'] = height
        metadata[feature[NAME]][PREPROCESSING]['width'] = width
        metadata[feature[NAME]][PREPROCESSING]['num_channels'] = num_channels

        read_image_and_resize = partial(
            ImageFeatureMixin._read_image_and_resize,
            img_width=width,
            img_height=height,
            should_resize=should_resize,
            num_channels=num_channels,
            resize_method=preprocessing_parameters['resize_method'],
            user_specified_num_channels=user_specified_num_channels)

        if feature[PREPROCESSING]['in_memory']:
            # Number of processes to run in parallel for preprocessing
            num_processes = feature[PREPROCESSING]['num_processes']
            metadata[
                feature[NAME]][PREPROCESSING]['num_processes'] = num_processes

            # Split the dataset into pools only if we have an explicit request to use
            # multiple processes. In case we have multiple input images use the
            # standard code anyway.
            if backend.supports_multiprocessing and (num_processes > 1
                                                     or num_images > 1):
                all_file_paths = [
                    get_abs_path(src_path, file_path)
                    for file_path in input_df[feature[NAME]]
                ]

                with Pool(num_processes) as pool:
                    logger.debug(
                        'Using {} processes for preprocessing images'.format(
                            num_processes))
                    proc_df[feature[PROC_COLUMN]] = pool.map(
                        read_image_and_resize, all_file_paths)
            else:
                # If we're not running multiple processes and we are only processing one
                # image just use this faster shortcut, bypassing multiprocessing.Pool.map
                logger.debug(
                    'No process pool initialized. Using internal process for preprocessing images'
                )

                proc_df[feature[PROC_COLUMN]] = backend.df_engine.map_objects(
                    input_df[feature[COLUMN]],
                    lambda file_path: read_image_and_resize(
                        get_abs_path(src_path, file_path)))
        else:
            backend.check_lazy_load_supported(feature)

            all_file_paths = [
                get_abs_path(src_path, file_path)
                for file_path in input_df[feature[NAME]]
            ]

            data_fp = os.path.splitext(input_df.src)[0] + '.hdf5'
            mode = 'w'
            if os.path.isfile(data_fp):
                mode = 'r+'

            with h5py.File(data_fp, mode) as h5_file:
                # todo future add multiprocessing/multithreading
                image_dataset = h5_file.create_dataset(
                    feature[PROC_COLUMN] + '_data',
                    (num_images, height, width, num_channels),
                    dtype=np.uint8)
                for i, filepath in enumerate(all_file_paths):
                    image_dataset[i, :height, :width, :] = (
                        read_image_and_resize(filepath))
                h5_file.flush()

            proc_df[feature[PROC_COLUMN]] = np.arange(num_images)
        return proc_df
Example #18
    def add_feature_data(feature, dataset_df, dataset, metadata,
                         preprocessing_parameters):
        set_default_value(feature['preprocessing'], 'in_memory',
                          preprocessing_parameters['in_memory'])

        if 'audio_feature' not in preprocessing_parameters:
            raise ValueError(
                'audio_feature dictionary has to be present in preprocessing '
                'for audio.')
        if TYPE not in preprocessing_parameters['audio_feature']:
            raise ValueError(
                'type has to be present in audio_feature dictionary '
                'for audio.')

        src_path = None
        # this is not super nice, but it works with both DFs and lists
        first_path = '.'
        for first_path in dataset_df[feature[NAME]]:
            break
        if hasattr(dataset_df, 'src'):
            src_path = os.path.dirname(os.path.abspath(dataset_df.src))
        if src_path is None and not os.path.isabs(first_path):
            raise ValueError('Audio file paths must be absolute')

        num_audio_utterances = len(dataset_df)
        padding_value = preprocessing_parameters['padding_value']
        normalization_type = preprocessing_parameters['norm']
        feature_name = feature[NAME]

        feature_dim = metadata[feature_name]['feature_dim']
        max_length = metadata[feature_name]['max_length']
        audio_feature_dict = preprocessing_parameters['audio_feature']
        audio_file_length_limit_in_s = preprocessing_parameters[
            'audio_file_length_limit_in_s']

        if num_audio_utterances == 0:
            raise ValueError(
                'There are no audio files in the dataset provided.')
        audio_stats = {
            'count': 0,
            'mean': 0,
            'var': 0,
            'std': 0,
            'max': 0,
            'min': float('inf'),
            'cropped': 0,
            'max_length_in_s': audio_file_length_limit_in_s
        }

        if feature['preprocessing']['in_memory']:
            dataset[feature[NAME]] = np.empty(
                (num_audio_utterances, max_length, feature_dim),
                dtype=np.float32)
            for i, path in enumerate(dataset_df[feature[NAME]]):
                filepath = get_abs_path(src_path, path)
                audio_feature = AudioFeatureMixin._read_audio_and_transform_to_feature(
                    filepath, audio_feature_dict, feature_dim, max_length,
                    padding_value, normalization_type, audio_stats)

                dataset[feature[NAME]][i, :, :] = audio_feature

            audio_stats['std'] = np.sqrt(audio_stats['var'] /
                                         float(audio_stats['count']))
            print_statistics = ("{} audio files loaded.\n"
                                "Statistics of audio file lengths:\n"
                                "- mean: {:.4f}\n"
                                "- std: {:.4f}\n"
                                "- max: {:.4f}\n"
                                "- min: {:.4f}\n"
                                "- cropped audio_files: {}\n"
                                "Max length was given as {}s").format(
                                    audio_stats['count'], audio_stats['mean'],
                                    audio_stats['std'], audio_stats['max'],
                                    audio_stats['min'], audio_stats['cropped'],
                                    audio_stats['max_length_in_s'])
            logger.debug(print_statistics)
Example #19
    def populate_defaults(output_feature):
        set_default_value(output_feature, LOSS, {})
        set_default_value(output_feature[LOSS], TYPE, MEAN_SQUARED_ERROR)
        set_default_value(output_feature[LOSS], "weight", 1)
        set_default_value(output_feature, "reduce_input", None)
        set_default_value(output_feature, "reduce_dependencies", None)
        set_default_value(output_feature, "decoder", "projector")
        set_default_value(output_feature, "dependencies", [])
Example #20
    def populate_defaults(output_feature):
        set_default_value(output_feature, LOSS, {})
        set_default_value(output_feature[LOSS], TYPE, MEAN_SQUARED_ERROR)
        set_default_value(output_feature[LOSS], 'weight', 1)
        set_default_value(output_feature, 'reduce_input', None)
        set_default_value(output_feature, 'reduce_dependencies', None)
        set_default_value(output_feature, 'decoder', 'projector')
        set_default_value(output_feature, 'dependencies', [])
Example #21
def update_hyperopt_params_with_defaults(hyperopt_params):
    set_default_value(hyperopt_params, STRATEGY, {})
    set_default_value(hyperopt_params, EXECUTOR, {})
    set_default_value(hyperopt_params, "split", VALIDATION)
    set_default_value(hyperopt_params, "output_feature", COMBINED)
    set_default_value(hyperopt_params, "metric", LOSS)
    set_default_value(hyperopt_params, "goal", MINIMIZE)

    set_default_values(hyperopt_params[STRATEGY], {TYPE: "random"})

    strategy = get_from_registry(hyperopt_params[STRATEGY][TYPE],
                                 sampler_registry)
    strategy_defaults = {
        k: v
        for k, v in strategy.__dict__.items()
        if k in get_class_attributes(strategy)
    }
    set_default_values(
        hyperopt_params[STRATEGY],
        strategy_defaults,
    )

    set_default_values(hyperopt_params[EXECUTOR], {TYPE: "serial"})

    executor = get_from_registry(hyperopt_params[EXECUTOR][TYPE],
                                 executor_registry)
    executor_defaults = {
        k: v
        for k, v in executor.__dict__.items()
        if k in get_class_attributes(executor)
    }
    set_default_values(
        hyperopt_params[EXECUTOR],
        executor_defaults,
    )
Example #22
    def populate_defaults(output_feature):
        set_default_value(
            output_feature,
            LOSS,
            {
                TYPE: "sequence_softmax_cross_entropy",
                "class_weights": 1,
                "robust_lambda": 0,
                "confidence_penalty": 0,
                "class_similarities_temperature": 0,
                "weight": 1,
            },
        )

        set_default_value(output_feature[LOSS], "unique", False)
        set_default_value(output_feature, "decoder", "generator")

        if output_feature["decoder"] == "tagger":
            set_default_value(output_feature, "reduce_input", None)

        set_default_value(output_feature, "dependencies", [])
        set_default_value(output_feature, "reduce_input", SUM)
        set_default_value(output_feature, "reduce_dependencies", SUM)
Example #23
    def populate_defaults(input_feature):
        set_default_value(input_feature, TIED, None)
        set_default_value(input_feature, "encoder", "parallel_cnn")
Example #24
    def populate_defaults(output_feature):
        set_default_value(
            output_feature, LOSS, {
                TYPE: 'softmax_cross_entropy',
                'sampler': None,
                'negative_samples': 0,
                'distortion': 1,
                'labels_smoothing': 0,
                'class_weights': 1,
                'robust_lambda': 0,
                'confidence_penalty': 0,
                'class_similarities_temperature': 0,
                'weight': 1
            })
        set_default_value(output_feature[LOSS], TYPE, 'softmax_cross_entropy')
        set_default_value(output_feature[LOSS], 'labels_smoothing', 0)
        set_default_value(output_feature[LOSS], 'class_weights', 1)
        set_default_value(output_feature[LOSS], 'robust_lambda', 0)
        set_default_value(output_feature[LOSS], 'confidence_penalty', 0)
        set_default_value(output_feature[LOSS],
                          'class_similarities_temperature', 0)
        set_default_value(output_feature[LOSS], 'weight', 1)

        if output_feature[LOSS][TYPE] == 'sampled_softmax_cross_entropy':
            set_default_value(output_feature[LOSS], 'sampler', 'log_uniform')
            set_default_value(output_feature[LOSS], 'negative_samples', 25)
            set_default_value(output_feature[LOSS], 'distortion', 0.75)
        else:
            set_default_value(output_feature[LOSS], 'sampler', None)
            set_default_value(output_feature[LOSS], 'negative_samples', 0)
            set_default_value(output_feature[LOSS], 'distortion', 1)

        set_default_value(output_feature[LOSS], 'unique', False)

        set_default_value(output_feature, 'decoder', 'generator')

        if output_feature['decoder'] == 'tagger':
            set_default_value(output_feature, 'reduce_input', None)

        set_default_value(output_feature, 'dependencies', [])
        set_default_value(output_feature, 'reduce_input', SUM)
        set_default_value(output_feature, 'reduce_dependencies', SUM)
Example #25
    def populate_defaults(input_feature):
        set_default_value(input_feature, TIED, None)
        set_default_value(input_feature, "preprocessing", {})
Example #26
    def add_feature_data(feature, dataset_df, data, metadata,
                         preprocessing_parameters):
        set_default_value(feature['preprocessing'], 'in_memory',
                          preprocessing_parameters['in_memory'])
        set_default_value(feature['preprocessing'], 'num_processes',
                          preprocessing_parameters['num_processes'])
        csv_path = None
        if hasattr(dataset_df, 'csv'):
            csv_path = os.path.dirname(os.path.abspath(dataset_df.csv))

        num_images = len(dataset_df)
        if num_images == 0:
            raise ValueError('There are no images in the dataset provided.')

        # this is not super nice, but it works with both DFs and lists
        first_path = '.'
        for first_path in dataset_df[feature['name']]:
            break

        if csv_path is None and not os.path.isabs(first_path):
            raise ValueError('Image file paths must be absolute')

        first_path = get_abs_path(csv_path, first_path)

        (should_resize, width, height, num_channels,
         user_specified_num_channels,
         first_image) = ImageFeatureMixin._finalize_preprocessing_parameters(
             preprocessing_parameters, first_path)

        metadata[feature['name']]['preprocessing']['height'] = height
        metadata[feature['name']]['preprocessing']['width'] = width
        metadata[
            feature['name']]['preprocessing']['num_channels'] = num_channels

        read_image_and_resize = partial(
            ImageFeatureMixin._read_image_and_resize,
            img_width=width,
            img_height=height,
            should_resize=should_resize,
            num_channels=num_channels,
            resize_method=preprocessing_parameters['resize_method'],
            user_specified_num_channels=user_specified_num_channels)
        all_file_paths = [
            get_abs_path(csv_path, file_path)
            for file_path in dataset_df[feature['name']]
        ]

        if feature['preprocessing']['in_memory']:
            # Number of processes to run in parallel for preprocessing
            num_processes = feature['preprocessing']['num_processes']
            metadata[feature['name']]['preprocessing'][
                'num_processes'] = num_processes

            data[feature['name']] = np.empty(
                (num_images, height, width, num_channels), dtype=np.uint8)
            # Split the dataset into pools only if we have an explicit request to use
            # multiple processes. In case we have multiple input images use the
            # standard code anyway.
            if num_processes > 1 or num_images > 1:
                with Pool(num_processes) as pool:
                    logger.debug(
                        'Using {} processes for preprocessing images'.format(
                            num_processes))
                    data[feature['name']] = np.array(
                        pool.map(read_image_and_resize, all_file_paths))

            else:
                # If we're not running multiple processes and we are only processing one
                # image just use this faster shortcut, bypassing multiprocessing.Pool.map
                logger.debug(
                    'No process pool initialized. Using one process for preprocessing images'
                )
                img = read_image_and_resize(all_file_paths[0])
                data[feature['name']] = np.array([img])
        else:
            data_fp = os.path.splitext(dataset_df.csv)[0] + '.hdf5'
            mode = 'w'
            if os.path.isfile(data_fp):
                mode = 'r+'

            with h5py.File(data_fp, mode) as h5_file:
                # TODO add multiprocessing/multithreading
                image_dataset = h5_file.create_dataset(
                    feature['name'] + '_data',
                    (num_images, height, width, num_channels),
                    dtype=np.uint8)
                for i, filepath in enumerate(all_file_paths):
                    image_dataset[i, :height, :width, :] = (
                        read_image_and_resize(filepath))

            data[feature['name']] = np.arange(num_images)
Example #27
    def populate_defaults(input_feature):
        set_default_value(input_feature, TIED, None)
Example #28
    def add_feature_data(feature, input_df, proc_df, metadata,
                         preprocessing_parameters, backend,
                         skip_save_processed_input):
        set_default_value(feature['preprocessing'], 'in_memory',
                          preprocessing_parameters['in_memory'])

        if 'audio_feature' not in preprocessing_parameters:
            raise ValueError(
                'audio_feature dictionary has to be present in preprocessing '
                'for audio.')
        if TYPE not in preprocessing_parameters['audio_feature']:
            raise ValueError(
                'type has to be present in audio_feature dictionary '
                'for audio.')

        name = feature[NAME]
        column = feature[COLUMN]
        proc_column = feature[PROC_COLUMN]

        src_path = None
        # this is not super nice, but it works with both DFs and lists
        first_path = '.'
        for first_path in input_df[column]:
            break
        if hasattr(input_df, 'src'):
            src_path = os.path.dirname(os.path.abspath(input_df.src))
        if src_path is None and not os.path.isabs(first_path):
            raise ValueError('Audio file paths must be absolute')

        num_audio_utterances = len(input_df)
        padding_value = preprocessing_parameters['padding_value']
        normalization_type = preprocessing_parameters['norm']

        feature_dim = metadata[name]['feature_dim']
        max_length = metadata[name]['max_length']
        audio_feature_dict = preprocessing_parameters['audio_feature']
        audio_file_length_limit_in_s = preprocessing_parameters[
            'audio_file_length_limit_in_s']

        if num_audio_utterances == 0:
            raise ValueError(
                'There are no audio files in the dataset provided.')

        if feature[PREPROCESSING]['in_memory']:
            audio_features, audio_stats = AudioFeatureMixin._process_in_memory(
                input_df[feature[NAME]], src_path, audio_feature_dict,
                feature_dim, max_length, padding_value, normalization_type,
                audio_file_length_limit_in_s, backend)
            proc_df[proc_column] = audio_features

            audio_stats['std'] = np.sqrt(audio_stats['var'] /
                                         float(audio_stats['count']))
            print_statistics = ("{} audio files loaded.\n"
                                "Statistics of audio file lengths:\n"
                                "- mean: {:.4f}\n"
                                "- std: {:.4f}\n"
                                "- max: {:.4f}\n"
                                "- min: {:.4f}\n"
                                "- cropped audio_files: {}\n"
                                "Max length was given as {}s").format(
                                    audio_stats['count'], audio_stats['mean'],
                                    audio_stats['std'], audio_stats['max'],
                                    audio_stats['min'], audio_stats['cropped'],
                                    audio_file_length_limit_in_s)
            logger.debug(print_statistics)
        else:
            backend.check_lazy_load_supported(feature)

        return proc_df
Example #29
    def populate_defaults(output_feature):
        set_default_value(output_feature, 'level', 'word')
        SequenceOutputFeature.populate_defaults(output_feature)
Example #30
    def populate_defaults(output_feature):
        set_default_value(output_feature, LOSS, {
            TYPE: SIGMOID_CROSS_ENTROPY,
            'weight': 1
        })
        set_default_value(output_feature[LOSS], 'weight', 1)
        set_default_value(output_feature[LOSS], 'class_weights', 1)

        set_default_value(output_feature, 'threshold', 0.5)
        set_default_value(output_feature, 'dependencies', [])
        set_default_value(output_feature, 'reduce_input', SUM)
        set_default_value(output_feature, 'reduce_dependencies', SUM)