def _init_miscellaneous(args: dict) -> None:
    """Initialize the remaining variables of the `env` module.

    Sets the current phase, builds the input/output CSV dialects from the
    already-initialized `env` fields, collects the distinct class values
    from the database, and precomputes the t-norm names when requested.

    :raises TrueClassNameOverrideOneClass: if the reserved true-class
        directory name collides with one of the database's class values.
    """
    env.current_phase = Phase.PREPROCESSING

    env.dialect_input = Dialect(encoding=env.encoding_input,
                                delimiter=env.delimiter_input,
                                quoting=env.quoting_input,
                                quote_char=env.quote_character_input,
                                line_delimiter=env.line_delimiter_input,
                                skip_initial_space=True)
    env.dialect_output = Dialect(encoding=env.encoding_output,
                                 delimiter=env.delimiter_output,
                                 quoting=env.quoting_output,
                                 quote_char=env.quote_character_output,
                                 line_delimiter=env.line_delimiter_output,
                                 skip_initial_space=True)

    # Deduplicate the class column's values (order is irrelevant here).
    class_column = get_column(path=args.get(gpn.database()),
                              column=args.get(gpn.class_name()),
                              have_header=args.get(gpn.have_header()),
                              dialect=env.dialect_input)
    env.possible_classes = list(set(class_column))

    # The directory reserved for the "true class" must not shadow a real class.
    if env.true_class_directory in env.possible_classes:
        raise TrueClassNameOverrideOneClass(env.true_class_directory)

    number_of_tnorms = args.get(gpn.number_of_tnorms())
    if number_of_tnorms:
        # One name per t-norm index, 0 through number_of_tnorms inclusive.
        env.t_norms_names = list(map(tnorm_to_str, range(number_of_tnorms + 1)))
def _clean_column_index_or_name(args: dict, param_name: str,
                                column_name: str) -> None:
    """Normalize a column parameter to a numeric index, in place.

    If ``args[param_name]`` is a (non-integer) string, it is treated as a
    column name and converted to its index in the database. Otherwise it is
    converted to ``int`` and checked against the database's column count.

    :param args: the parsed command-line arguments dictionary (mutated).
    :param param_name: key of the parameter to clean inside ``args``.
    :param column_name: human-readable column role used in error messages.
    :raises IndexOutOfBounds: if the numeric index is out of bounds.
    """
    # The original built this exact same input dialect three times; build it once.
    input_dialect = Dialect(encoding=args[gpn.encoding_input()],
                            delimiter=args[gpn.delimiter_input()],
                            quoting=args[gpn.quoting_input()],
                            quote_char=args[gpn.quote_char_input()],
                            skip_initial_space=True)

    if (not is_an_int(args[param_name])) and isinstance(args[param_name], str):
        # User asked for a named column; retrieve its index then store it.
        args[param_name] = find_index_with_class(path=args[gpn.database()],
                                                 class_name=args[param_name],
                                                 dialect=input_dialect)
    else:
        # User asked for an index; convert to int then check it's in bounds.
        args[param_name] = int(args[param_name])
        if not index_in_bounds(input_path=args[gpn.database()],
                               index=args[param_name],
                               dialect=input_dialect):
            raise IndexOutOfBounds(
                index=args[param_name],
                column=column_name,
                length=get_number_of_columns(path=args[gpn.database()],
                                             dialect=input_dialect))
def clean_args(args: dict) -> None:
    """Clean the command-line arguments parsed by the `docopt` package.

    Mainly converts string values to their numeric and enum counterparts.
    If a parameter requiring a column index was given a column name, it is
    replaced by the corresponding index. Invalid parameter values raise the
    appropriate exception.

    :param args: the `docopt` arguments dictionary (mutated in place).
    :raises MissingClassificationAttribute: if the class attribute is
        missing or required by a KEEP_DISTRIBUTION split.
    :raises InvalidPercentage: if a threshold is not a valid percentage.
    :raises IllegalLineDelimiter: if a line delimiter is unsupported.
    """
    # Rename parameter database: "<database>" -> "database".
    args[gpn.database()] = args["<" + gpn.database() + ">"]
    del args["<" + gpn.database() + ">"]

    # Rename parameter parent_dir: "<parent_dir>" -> "parent_dir".
    args[gpn.parent_dir()] = args["<" + gpn.parent_dir() + ">"]
    del args["<" + gpn.parent_dir() + ">"]

    # Clean important args used by other functions.
    args[gpn.quoting_input()] = str_to_quoting(args[gpn.quoting_input()])
    extension = args[gpn.format_output()].lower()

    for param_name in args:
        if param_name == gpn.parent_dir():
            # Default the parent directory to the current directory.
            if not args[param_name]:
                args[param_name] = get_absolute_path(".")
            else:
                args[param_name] = get_absolute_path(args[param_name])

        if param_name == gpn.class_name():
            _check_key_exists(args, param_name,
                              custom_exception=MissingClassificationAttribute)
            _clean_column_index_or_name(args=args, param_name=param_name,
                                        column_name="class")
        elif param_name in (gpn.discretization_threshold(),
                            gpn.number_of_tnorms(), gpn.trees_in_forest()):
            args[param_name] = int(args[param_name])
        elif param_name in (gpn.format_input(), gpn.format_output()):
            args[param_name] = str_to_format(args[param_name])
        elif param_name == gpn.entropy_measure():
            args[param_name] = str_to_entropymeasure(args[param_name])
        elif param_name in (gpn.entropy_threshold(), gpn.quality_threshold()):
            if not is_a_percentage(args[param_name]):
                raise InvalidPercentage(args[param_name])
        elif param_name == gpn.identifier():
            if _check_default_value_id(args[param_name], gdv.identifier()):
                # We must add a column as an identifier. It will be done in
                # the preprocessing function.
                args[param_name] = None
            else:
                _clean_column_index_or_name(args=args, param_name=param_name,
                                            column_name="identifier")
        elif param_name in (gpn.initial_split_method(),
                            gpn.reference_split_method(),
                            gpn.subsubtrain_split_method()):
            args[param_name] = str_to_splittingmethod(args[param_name])
            # KEEP_DISTRIBUTION needs a class attribute to stratify on.
            if args[param_name] == SplittingMethod.KEEP_DISTRIBUTION and args[
                    gpn.class_name()] is None:
                raise MissingClassificationAttribute()
        elif param_name == gpn.quality_computing_method():
            args[param_name] = str_to_qualitycomputingmethod(args[param_name])
        elif param_name == gpn.clustering_trees_method():
            args[param_name] = str_to_clusteringtreesmethod(args[param_name])
        elif param_name == gpn.quoting_output():
            args[param_name] = str_to_quoting(args[param_name])
        elif param_name == gpn.main_directory():
            if args[param_name] is None:
                args[param_name] = get_filename(args[gpn.database()])
        elif param_name == gpn.preprocessed_database_name():
            if args[param_name] is None:
                args[param_name] = _get_preprocessed_db_name(
                    database_name=args[gpn.database()], extension=extension)
            else:
                args[param_name] = get_filename(args[param_name],
                                                with_extension=True)
        elif param_name in (gpn.reference_name(), gpn.subtrain_name(),
                            gpn.test_name(), gpn.train_name()):
            args[param_name] = "{}.{}".format(
                get_filename(args[param_name], with_extension=False),
                extension)
        elif param_name == gpn.header_name():
            args[param_name] = "{}.{}".format(
                get_filename(args[param_name], with_extension=False),
                args[gpn.header_extension()])
        elif param_name == gpn.verbosity():
            args[param_name] = str_to_verbosity(args[param_name])
        elif param_name in (gpn.line_delimiter_input(),
                            gpn.line_delimiter_output()):
            if args[param_name] not in (None, "", "\n", "\r", "\r\n"):
                # Accept the literal two-character "\n" typed on the shell.
                if args[param_name] == "\\n":
                    args[param_name] = "\n"
                else:
                    raise IllegalLineDelimiter(args[param_name])
        elif param_name in (gpn.last_phase(), gpn.resume_phase()):
            args[param_name] = str_to_phase(args[param_name])
        # NOTE(review): the original ended with a second, unreachable
        # `elif param_name == gpn.clustering_trees_method():` branch — it was
        # shadowed by the identical branch above and has been removed.
doc_encoding_input=gpd.encoding_input(), doc_encoding_output=gpd.encoding_output(), doc_format_input=gpd.format_input(), doc_format_output=gpd.format_output(), doc_delimiter_input=gpd.delimiter_input(), doc_delimiter_output=gpd.delimiter_output(), doc_quoting_input=gpd.quoting_input(), doc_quoting_output=gpd.quoting_output(), doc_quote_char_input=gpd.quote_char_input(), doc_quote_char_output=gpd.quote_char_output(), doc_line_delimiter_input=gpd.line_delimiter_input(), doc_line_delimiter_output=gpd.line_delimiter_output(), doc_verbosity=gpd.verbosity(), # Parameters param_database=gpn.database(), param_parent_dir=gpn.parent_dir(), param_training_value=gpn.training_value(), param_reference_value=gpn.reference_value(), param_trees_in_forest=gpn.trees_in_forest(), param_quality_threshold=gpn.quality_threshold(), param_initial_split_method=gpn.initial_split_method(), param_reference_split_method=gpn.reference_split_method(), param_subsubtrain_split_method=gpn.subsubtrain_split_method(), param_quality_computing_method=gpn.quality_computing_method(), param_clustering_trees_method=gpn.clustering_trees_method(), param_train_name=gpn.train_name(), param_test_name=gpn.test_name(), param_preprocessed_db_name=gpn.preprocessed_database_name(), param_subtrain_name=gpn.subtrain_name(), param_reference_name=gpn.reference_name(),
def _init_command_line_parameters(args: dict) -> None:
    """Initialize all the command-line-parameters-related variables located
    inside the `env` module.

    Each `gpn.*()` accessor returns an option specification whose last
    whitespace-separated token is the key used in the `args` dictionary.
    """

    def _opt(option_spec: str):
        # Look up the docopt value under the last token of the option spec.
        return args.get(option_spec.split()[-1])

    env.cclassified_vector_prefix = _opt(gpn.cclassified_vector_prefix())
    env.class_name = _opt(gpn.class_name())
    env.class_matrix_prefix = _opt(gpn.class_matrix_prefix())
    env.classes_matrices_directory = _opt(gpn.classes_matrices_directory())
    env.clustering_trees_directory = _opt(gpn.clustering_trees_directory())
    env.clustering_trees_method = _opt(gpn.clustering_trees_method())
    env.clustering_trees_prefix = _opt(gpn.clustering_trees_prefix())
    env.delimiter_input = _opt(gpn.delimiter_input())
    env.delimiter_output = _opt(gpn.delimiter_output())
    env.difficulty_vector_prefix = _opt(gpn.difficulty_vector_prefix())
    env.discretization_threshold = _opt(gpn.discretization_threshold())
    env.encoding_input = _opt(gpn.encoding_input())
    env.encoding_output = _opt(gpn.encoding_output())
    env.entropy_measure = _opt(gpn.entropy_measure())
    env.entropy_threshold = _opt(gpn.entropy_threshold())
    env.format_input = _opt(gpn.format_input())
    env.format_output = _opt(gpn.format_output())
    env.have_header = _opt(gpn.have_header())
    env.header_extension = _opt(gpn.header_extension())
    env.header_name = _opt(gpn.header_name())
    env.identifier = _opt(gpn.identifier())
    env.last_phase = _opt(gpn.last_phase())
    env.initial_database_name = _opt(gpn.database())
    env.initial_split_method = _opt(gpn.initial_split_method())
    env.line_delimiter_input = _opt(gpn.line_delimiter_input())
    env.line_delimiter_output = _opt(gpn.line_delimiter_output())
    env.main_directory = _opt(gpn.main_directory())
    env.minimal_size_leaf = _opt(gpn.min_size_leaf())
    # parent_dir is stored under its bare key, so no split is applied here.
    env.parent_dir = args.get(gpn.parent_dir())
    env.preprocessed_database_name = _opt(gpn.preprocessed_database_name())
    env.quality_threshold = _opt(gpn.quality_threshold())
    env.salammbo_vector_prefix = _opt(gpn.salammbo_vector_prefix())
    env.quality_computing_method = _opt(gpn.quality_computing_method())
    env.quality_file_prefix = _opt(gpn.quality_file_prefix())
    env.quote_character_input = _opt(gpn.quote_char_input())
    env.quote_character_output = _opt(gpn.quote_char_output())
    env.quoting_input = _opt(gpn.quoting_input())
    env.quoting_output = _opt(gpn.quoting_output())
    env.reference_database_name = _opt(gpn.reference_name())
    env.reference_split_method = _opt(gpn.reference_split_method())
    env.reference_value = _opt(gpn.reference_value())
    env.resume_phase = _opt(gpn.resume_phase())
    env.statistics_file_name = _opt(gpn.statistics_file_name())
    env.subsubtrain_directory = _opt(gpn.subsubtrain_directory())
    env.subsubtrain_directory_pattern = _opt(gpn.subsubtrain_directory_pattern())
    env.subsubtrain_name_pattern = _opt(gpn.subsubtrain_name_pattern())
    env.subsubtrain_split_method = _opt(gpn.subsubtrain_split_method())
    env.subtrain_directory = _opt(gpn.subtrain_directory())
    env.subtrain_name = _opt(gpn.subtrain_name())
    env.t_norms = _opt(gpn.number_of_tnorms())
    env.test_database_name = _opt(gpn.test_name())
    env.train_database_name = _opt(gpn.train_name())
    env.training_value = _opt(gpn.training_value())
    env.tree_file_extension = _opt(gpn.tree_file_extension())
    env.trees_in_forest = _opt(gpn.trees_in_forest())
    env.true_class_directory = _opt(gpn.true_class_directory())
    env.vector_file_extension = _opt(gpn.vector_file_extension())
    env.verbosity = _opt(gpn.verbosity())
def _init_names(args: dict) -> None:
    """Initialize all the names-related variables inside the `env` module."""
    database_path = args.get(gpn.database())
    env.original_database_name = get_filename(database_path)
def _init_paths(args: dict) -> None:
    """Initialize all the path-related variables inside the `env` module.

    Builds the main database/header/train/test paths, then — when a forest
    is requested — the per-subsubtrain-directory database and vector paths,
    and — when t-norms are requested — the per-t-norm difficulty/quality/
    matrix/clustering paths.
    """
    env.statistics_file_path = "{}/{}".format(env.main_directory_path,
                                              env.statistics_file_name)
    env.original_database_path = args.get(gpn.database())
    env.preprocessed_database_path = "{}/{}".format(
        env.main_directory_path, args.get(gpn.preprocessed_database_name()))
    env.header_path = "{}/{}".format(env.main_directory_path, env.header_name)
    env.test_database_path = "{}/{}".format(env.main_directory_path,
                                            args.get(gpn.test_name()))
    env.train_database_path = "{}/{}".format(env.main_directory_path,
                                             args.get(gpn.train_name()))
    env.reference_database_path = "{}/{}".format(
        env.subtrain_directory_path, args.get(gpn.reference_name()))
    env.subtrain_database_path = "{}/{}".format(env.subtrain_directory_path,
                                                args.get(gpn.subtrain_name()))

    if env.trees_in_forest:
        # Hoisted loop invariants (the original recomputed these per path).
        output_format = format_to_str(args.get(gpn.format_output()))
        zero_fill_width = len(str(env.trees_in_forest))
        env.subsubtrain_databases_paths = [
            "{}/{}.{}".format(
                env.subsubtrain_directories_path[tree_index],
                env.subsubtrain_directory_pattern %
                str(tree_index + 1).zfill(zero_fill_width),
                output_format.lower()) for tree_index in
            range(env.trees_in_forest)
        ]
        # One t-norm name per index, 0 through env.t_norms inclusive;
        # hoisted out of the dict comprehensions below.
        tnorm_names = [
            tnorm_to_str(tnorm_index)
            for tnorm_index in range(env.t_norms + 1)
        ]
        env.cclassified_vectors_paths = {
            tnorm: [
                "{}/{}{}.{}".format(
                    env.subsubtrain_directories_path[tree_index - 1],
                    env.cclassified_vector_prefix, tnorm,
                    env.vector_file_extension)
                for tree_index in range(1, env.trees_in_forest + 1)
            ]
            for tnorm in tnorm_names
        }
        env.salammbo_vectors_paths = {
            tnorm: [
                "{}/{}{}.{}".format(
                    env.subsubtrain_directories_path[tree_index - 1],
                    env.salammbo_vector_prefix, tnorm,
                    env.vector_file_extension)
                for tree_index in range(1, env.trees_in_forest + 1)
            ]
            for tnorm in tnorm_names
        }

    if env.t_norms:
        # Hoisted loop invariants for the t-norm-indexed path tables.
        output_format = format_to_str(args.get(gpn.format_output()))
        tnorm_names = [
            tnorm_to_str(tnorm_index)
            for tnorm_index in range(env.t_norms + 1)
        ]
        env.difficulty_vectors_paths = {
            tnorm: "{}/{}{}.{}".format(env.subtrain_directory_path,
                                       env.difficulty_vector_prefix, tnorm,
                                       env.vector_file_extension)
            for tnorm in tnorm_names
        }
        env.quality_files_paths = {
            tnorm: "{}/{}{}.{}".format(env.subtrain_directory_path,
                                       env.quality_file_prefix, tnorm,
                                       output_format)
            for tnorm in tnorm_names
        }
        env.classes_matrices_files_paths = {
            class_name: {
                tnorm: "{}/{}{}_{}.{}".format(
                    env.classes_matrices_directories_path[class_name],
                    env.class_matrix_prefix, class_name, tnorm,
                    output_format)
                for tnorm in tnorm_names
            }
            for class_name in env.possible_classes
        }
        env.clustering_trees_files_paths = {
            class_name: {
                tnorm: "{}/{}{}_{}.{}".format(
                    env.clustering_trees_directories_path[class_name],
                    env.clustering_trees_prefix, class_name, tnorm,
                    output_format)
                for tnorm in tnorm_names
            }
            for class_name in env.possible_classes
        }