Пример #1
0
def _init_miscellaneous(args: dict) -> None:
    """ Initialize all the others variables inside the `env` module. """
    env.current_phase = Phase.PREPROCESSING

    # Build the dialects used to read the input database and to write the
    # output files, from the values previously stored in `env`.
    env.dialect_input = Dialect(
        encoding=env.encoding_input,
        delimiter=env.delimiter_input,
        quoting=env.quoting_input,
        quote_char=env.quote_character_input,
        line_delimiter=env.line_delimiter_input,
        skip_initial_space=True)
    env.dialect_output = Dialect(
        encoding=env.encoding_output,
        delimiter=env.delimiter_output,
        quoting=env.quoting_output,
        quote_char=env.quote_character_output,
        line_delimiter=env.line_delimiter_output,
        skip_initial_space=True)

    # Collect the distinct values found in the class column of the database.
    class_column = get_column(path=args.get(gpn.database()),
                              column=args.get(gpn.class_name()),
                              have_header=args.get(gpn.have_header()),
                              dialect=env.dialect_input)
    env.possible_classes = list(set(class_column))

    # A class sharing its name with the true-class directory would clash with
    # the directory layout created later on.
    if env.true_class_directory in env.possible_classes:
        raise TrueClassNameOverrideOneClass(env.true_class_directory)

    number_of_tnorms = args.get(gpn.number_of_tnorms())
    if number_of_tnorms:
        env.t_norms_names = [tnorm_to_str(index)
                             for index in range(number_of_tnorms + 1)]
Пример #2
0
def _clean_column_index_or_name(args: dict, param_name: str,
                                column_name: str) -> None:
    """ If the specified name value is a column name, convert it to its respective index. Otherwise, check if it's
    inbounds and convert it to an integer.

    :param args: Dictionary of cleaned command-line arguments.
    :param param_name: Key of the parameter to clean inside `args`.
    :param column_name: Human-readable column name used in error messages.
    :raises IndexOutOfBounds: If an index value exceeds the number of columns.
    """
    # Every helper below reads the database with the same input dialect;
    # build it once instead of repeating the construction three times.
    dialect = Dialect(encoding=args[gpn.encoding_input()],
                      delimiter=args[gpn.delimiter_input()],
                      quoting=args[gpn.quoting_input()],
                      quote_char=args[gpn.quote_char_input()],
                      skip_initial_space=True)

    if (not is_an_int(args[param_name])) and isinstance(args[param_name], str):
        # User asked for a named class, we retrieve its index then change it
        args[param_name] = find_index_with_class(path=args[gpn.database()],
                                                 class_name=args[param_name],
                                                 dialect=dialect)
    else:
        # User asked for an index, we convert it to int then check if it's inbound
        args[param_name] = int(args[param_name])
        if not index_in_bounds(input_path=args[gpn.database()],
                               index=args[param_name],
                               dialect=dialect):
            raise IndexOutOfBounds(
                index=args[param_name],
                column=column_name,
                length=get_number_of_columns(path=args[gpn.database()],
                                             dialect=dialect))
Пример #3
0
def clean_args(args: dict) -> None:
    """ Clean the command-line arguments parsed by the `docopt` package.
    It mainly converts string values to their numeric and enum counterparts. If a parameter requiring an index or a
    column name has been completed with a name, it changes it to its corresponding index. It also checks if some of the
    parameters are invalid and raises exceptions accordingly.
    """
    # Rename parameter database
    args[gpn.database()] = args["<" + gpn.database() + ">"]
    del args["<" + gpn.database() + ">"]

    # Rename parameter parent_dir
    args[gpn.parent_dir()] = args["<" + gpn.parent_dir() + ">"]
    del args["<" + gpn.parent_dir() + ">"]

    # Clean important args used by other functions
    args[gpn.quoting_input()] = str_to_quoting(args[gpn.quoting_input()])

    # Captured BEFORE format_output is converted to its enum counterpart in
    # the loop below, while it is still a raw string.
    extension = args[gpn.format_output()].lower()

    # Only values are reassigned inside the loop; no key is added or removed,
    # so iterating the dict directly is safe.
    for param_name in args:
        if param_name == gpn.parent_dir():
            if not args[param_name]:
                args[param_name] = get_absolute_path(".")
            else:
                args[param_name] = get_absolute_path(args[param_name])
        if param_name == gpn.class_name():
            _check_key_exists(args,
                              param_name,
                              custom_exception=MissingClassificationAttribute)
            _clean_column_index_or_name(args=args,
                                        param_name=param_name,
                                        column_name="class")
        elif param_name in (gpn.discretization_threshold(),
                            gpn.number_of_tnorms(), gpn.trees_in_forest()):
            args[param_name] = int(args[param_name])
        elif param_name in (gpn.format_input(), gpn.format_output()):
            args[param_name] = str_to_format(args[param_name])
        elif param_name == gpn.entropy_measure():
            args[param_name] = str_to_entropymeasure(args[param_name])
        elif param_name in (gpn.entropy_threshold(), gpn.quality_threshold()):
            if not is_a_percentage(args[param_name]):
                raise InvalidPercentage(args[param_name])
        elif param_name == gpn.identifier():
            if _check_default_value_id(args[param_name], gdv.identifier()):
                # We must add a column as an identifier. It will be done in the preprocessing function
                args[param_name] = None
            else:
                _clean_column_index_or_name(args=args,
                                            param_name=param_name,
                                            column_name="identifier")
        elif param_name in (gpn.initial_split_method(),
                            gpn.reference_split_method(),
                            gpn.subsubtrain_split_method()):
            args[param_name] = str_to_splittingmethod(args[param_name])
            # Keeping the class distribution requires a class attribute.
            if args[param_name] == SplittingMethod.KEEP_DISTRIBUTION and args[
                    gpn.class_name()] is None:
                raise MissingClassificationAttribute()
        elif param_name == gpn.quality_computing_method():
            args[param_name] = str_to_qualitycomputingmethod(args[param_name])
        elif param_name == gpn.clustering_trees_method():
            args[param_name] = str_to_clusteringtreesmethod(args[param_name])
        elif param_name == gpn.quoting_output():
            args[param_name] = str_to_quoting(args[param_name])
        elif param_name == gpn.main_directory():
            if args[param_name] is None:
                args[param_name] = get_filename(args[gpn.database()])
        elif param_name == gpn.preprocessed_database_name():
            if args[param_name] is None:
                args[param_name] = _get_preprocessed_db_name(
                    database_name=args[gpn.database()], extension=extension)
            else:
                args[param_name] = get_filename(args[param_name],
                                                with_extension=True)
        elif param_name in (gpn.reference_name(), gpn.subtrain_name(),
                            gpn.test_name(), gpn.train_name()):
            args[param_name] = "{}.{}".format(
                get_filename(args[param_name], with_extension=False),
                extension)
        elif param_name == gpn.header_name():
            args[param_name] = "{}.{}".format(
                get_filename(args[param_name], with_extension=False),
                args[gpn.header_extension()])
        elif param_name == gpn.verbosity():
            args[param_name] = str_to_verbosity(args[param_name])
        elif param_name in (gpn.line_delimiter_input(),
                            gpn.line_delimiter_output()):
            if args[param_name] not in (None, "", "\n", "\r", "\r\n"):
                if args[param_name] == "\\n":
                    args[param_name] = "\n"
                else:
                    raise IllegalLineDelimiter(args[param_name])
        elif param_name in (gpn.last_phase(), gpn.resume_phase()):
            args[param_name] = str_to_phase(args[param_name])
        # NOTE: a second, unreachable `elif param_name ==
        # gpn.clustering_trees_method()` branch (duplicate of the one above)
        # was removed — it could never be taken.
Пример #4
0
    doc_encoding_input=gpd.encoding_input(),
    doc_encoding_output=gpd.encoding_output(),
    doc_format_input=gpd.format_input(),
    doc_format_output=gpd.format_output(),
    doc_delimiter_input=gpd.delimiter_input(),
    doc_delimiter_output=gpd.delimiter_output(),
    doc_quoting_input=gpd.quoting_input(),
    doc_quoting_output=gpd.quoting_output(),
    doc_quote_char_input=gpd.quote_char_input(),
    doc_quote_char_output=gpd.quote_char_output(),
    doc_line_delimiter_input=gpd.line_delimiter_input(),
    doc_line_delimiter_output=gpd.line_delimiter_output(),
    doc_verbosity=gpd.verbosity(),

    # Parameters
    param_database=gpn.database(),
    param_parent_dir=gpn.parent_dir(),
    param_training_value=gpn.training_value(),
    param_reference_value=gpn.reference_value(),
    param_trees_in_forest=gpn.trees_in_forest(),
    param_quality_threshold=gpn.quality_threshold(),
    param_initial_split_method=gpn.initial_split_method(),
    param_reference_split_method=gpn.reference_split_method(),
    param_subsubtrain_split_method=gpn.subsubtrain_split_method(),
    param_quality_computing_method=gpn.quality_computing_method(),
    param_clustering_trees_method=gpn.clustering_trees_method(),
    param_train_name=gpn.train_name(),
    param_test_name=gpn.test_name(),
    param_preprocessed_db_name=gpn.preprocessed_database_name(),
    param_subtrain_name=gpn.subtrain_name(),
    param_reference_name=gpn.reference_name(),
Пример #5
0
def _init_command_line_parameters(args: dict) -> None:
    """ Initialize all the command-line-parameters-related variables located inside the `env` module.

    Each `env` variable is filled from the `args` dictionary. Keys are looked
    up as `gpn.X().split()[-1]`: presumably `gpn.X()` may return a multi-token
    option string whose last whitespace-separated token is the actual key in
    `args` -- TODO confirm against the `gpn` module (note that `clean_args`
    indexes `args` with the bare `gpn.X()` value).
    """
    env.cclassified_vector_prefix = args.get(
        gpn.cclassified_vector_prefix().split()[-1])
    env.class_name = args.get(gpn.class_name().split()[-1])
    env.class_matrix_prefix = args.get(gpn.class_matrix_prefix().split()[-1])
    env.classes_matrices_directory = args.get(
        gpn.classes_matrices_directory().split()[-1])
    env.clustering_trees_directory = args.get(
        gpn.clustering_trees_directory().split()[-1])
    env.clustering_trees_method = args.get(
        gpn.clustering_trees_method().split()[-1])
    env.clustering_trees_prefix = args.get(
        gpn.clustering_trees_prefix().split()[-1])
    env.delimiter_input = args.get(gpn.delimiter_input().split()[-1])
    env.delimiter_output = args.get(gpn.delimiter_output().split()[-1])
    env.difficulty_vector_prefix = args.get(
        gpn.difficulty_vector_prefix().split()[-1])
    env.discretization_threshold = args.get(
        gpn.discretization_threshold().split()[-1])
    env.encoding_input = args.get(gpn.encoding_input().split()[-1])
    env.encoding_output = args.get(gpn.encoding_output().split()[-1])
    env.entropy_measure = args.get(gpn.entropy_measure().split()[-1])
    env.entropy_threshold = args.get(gpn.entropy_threshold().split()[-1])
    env.format_input = args.get(gpn.format_input().split()[-1])
    env.format_output = args.get(gpn.format_output().split()[-1])
    env.have_header = args.get(gpn.have_header().split()[-1])
    env.header_extension = args.get(gpn.header_extension().split()[-1])
    env.header_name = args.get(gpn.header_name().split()[-1])
    env.identifier = args.get(gpn.identifier().split()[-1])
    env.last_phase = args.get(gpn.last_phase().split()[-1])
    env.initial_database_name = args.get(gpn.database().split()[-1])
    env.initial_split_method = args.get(gpn.initial_split_method().split()[-1])
    env.line_delimiter_input = args.get(gpn.line_delimiter_input().split()[-1])
    env.line_delimiter_output = args.get(
        gpn.line_delimiter_output().split()[-1])
    env.main_directory = args.get(gpn.main_directory().split()[-1])
    env.minimal_size_leaf = args.get(gpn.min_size_leaf().split()[-1])
    # Only entry looked up without `.split()[-1]`: `clean_args` renames this
    # parameter and stores it under the bare `gpn.parent_dir()` key.
    env.parent_dir = args.get(gpn.parent_dir())
    env.preprocessed_database_name = args.get(
        gpn.preprocessed_database_name().split()[-1])
    env.quality_threshold = args.get(gpn.quality_threshold().split()[-1])
    env.salammbo_vector_prefix = args.get(
        gpn.salammbo_vector_prefix().split()[-1])
    env.quality_computing_method = args.get(
        gpn.quality_computing_method().split()[-1])
    env.quality_file_prefix = args.get(gpn.quality_file_prefix().split()[-1])
    env.quote_character_input = args.get(gpn.quote_char_input().split()[-1])
    env.quote_character_output = args.get(gpn.quote_char_output().split()[-1])
    env.quoting_input = args.get(gpn.quoting_input().split()[-1])
    env.quoting_output = args.get(gpn.quoting_output().split()[-1])
    env.reference_database_name = args.get(gpn.reference_name().split()[-1])
    env.reference_split_method = args.get(
        gpn.reference_split_method().split()[-1])
    env.reference_value = args.get(gpn.reference_value().split()[-1])
    env.resume_phase = args.get(gpn.resume_phase().split()[-1])
    env.statistics_file_name = args.get(gpn.statistics_file_name().split()[-1])
    env.subsubtrain_directory = args.get(
        gpn.subsubtrain_directory().split()[-1])
    env.subsubtrain_directory_pattern = args.get(
        gpn.subsubtrain_directory_pattern().split()[-1])
    env.subsubtrain_name_pattern = args.get(
        gpn.subsubtrain_name_pattern().split()[-1])
    env.subsubtrain_split_method = args.get(
        gpn.subsubtrain_split_method().split()[-1])
    env.subtrain_directory = args.get(gpn.subtrain_directory().split()[-1])
    env.subtrain_name = args.get(gpn.subtrain_name().split()[-1])
    # NOTE(review): stored under `t_norms` although the parameter is named
    # `number_of_tnorms` -- downstream code treats it as a count.
    env.t_norms = args.get(gpn.number_of_tnorms().split()[-1])
    env.test_database_name = args.get(gpn.test_name().split()[-1])
    env.train_database_name = args.get(gpn.train_name().split()[-1])
    env.training_value = args.get(gpn.training_value().split()[-1])
    env.tree_file_extension = args.get(gpn.tree_file_extension().split()[-1])
    env.trees_in_forest = args.get(gpn.trees_in_forest().split()[-1])
    env.true_class_directory = args.get(gpn.true_class_directory().split()[-1])
    env.vector_file_extension = args.get(
        gpn.vector_file_extension().split()[-1])
    env.verbosity = args.get(gpn.verbosity().split()[-1])
Пример #6
0
def _init_names(args: dict) -> None:
    """ Initialize the name-related variables inside the `env` module. """
    database_path = args.get(gpn.database())
    env.original_database_name = get_filename(database_path)
Пример #7
0
def _init_paths(args: dict) -> None:
    """ Initialize all the path-related variables inside the `env` module.

    Paths are assembled from directory variables already stored in `env` and
    from the cleaned command-line arguments. The per-tree and per-t-norm path
    collections are only built when `env.trees_in_forest` / `env.t_norms` are
    set (truthy), exactly as before.
    """
    env.statistics_file_path = "{}/{}".format(env.main_directory_path,
                                              env.statistics_file_name)
    env.original_database_path = args.get(gpn.database())
    env.preprocessed_database_path = "{}/{}".format(
        env.main_directory_path, args.get(gpn.preprocessed_database_name()))
    env.header_path = "{}/{}".format(env.main_directory_path, env.header_name)
    env.test_database_path = "{}/{}".format(env.main_directory_path,
                                            args.get(gpn.test_name()))
    env.train_database_path = "{}/{}".format(env.main_directory_path,
                                             args.get(gpn.train_name()))
    env.reference_database_path = "{}/{}".format(
        env.subtrain_directory_path, args.get(gpn.reference_name()))
    env.subtrain_database_path = "{}/{}".format(env.subtrain_directory_path,
                                                args.get(gpn.subtrain_name()))
    if env.trees_in_forest:
        # Hoisted loop invariants: these were recomputed for every tree and
        # every t-norm inside the comprehensions below.
        output_format = format_to_str(args.get(gpn.format_output()))
        tnorm_names = [
            tnorm_to_str(tnorm_index)
            for tnorm_index in range(env.t_norms + 1)
        ]
        index_width = len(str(env.trees_in_forest))
        env.subsubtrain_databases_paths = [
            "{}/{}.{}".format(
                env.subsubtrain_directories_path[tree_index],
                env.subsubtrain_directory_pattern %
                str(tree_index + 1).zfill(index_width),
                output_format.lower())
            for tree_index in range(env.trees_in_forest)
        ]
        env.cclassified_vectors_paths = {
            tnorm: [
                "{}/{}{}.{}".format(
                    env.subsubtrain_directories_path[tree_index - 1],
                    env.cclassified_vector_prefix, tnorm,
                    env.vector_file_extension)
                for tree_index in range(1, env.trees_in_forest + 1)
            ]
            for tnorm in tnorm_names
        }
        env.salammbo_vectors_paths = {
            tnorm: [
                "{}/{}{}.{}".format(
                    env.subsubtrain_directories_path[tree_index - 1],
                    env.salammbo_vector_prefix, tnorm,
                    env.vector_file_extension)
                for tree_index in range(1, env.trees_in_forest + 1)
            ]
            for tnorm in tnorm_names
        }
    if env.t_norms:
        # Recomputed here (not shared with the branch above) so this branch
        # stays independent when only one of the two guards is truthy.
        output_format = format_to_str(args.get(gpn.format_output()))
        tnorm_names = [
            tnorm_to_str(tnorm_index)
            for tnorm_index in range(env.t_norms + 1)
        ]
        env.difficulty_vectors_paths = {
            tnorm: "{}/{}{}.{}".format(env.subtrain_directory_path,
                                       env.difficulty_vector_prefix, tnorm,
                                       env.vector_file_extension)
            for tnorm in tnorm_names
        }
        env.quality_files_paths = {
            tnorm: "{}/{}{}.{}".format(env.subtrain_directory_path,
                                       env.quality_file_prefix, tnorm,
                                       output_format)
            for tnorm in tnorm_names
        }
        env.classes_matrices_files_paths = {
            class_name: {
                tnorm: "{}/{}{}_{}.{}".format(
                    env.classes_matrices_directories_path[class_name],
                    env.class_matrix_prefix, class_name, tnorm,
                    output_format)
                for tnorm in tnorm_names
            }
            for class_name in env.possible_classes
        }
        env.clustering_trees_files_paths = {
            class_name: {
                tnorm: "{}/{}{}_{}.{}".format(
                    env.clustering_trees_directories_path[class_name],
                    env.clustering_trees_prefix, class_name, tnorm,
                    output_format)
                for tnorm in tnorm_names
            }
            for class_name in env.possible_classes
        }