def compute_first_phase() -> Phase:
    """Determine the phase the software must start from.

    Only the part of the command line needed to know whether the user asked
    to resume at a specific phase is examined. The chosen phase is stored in
    `env.current_phase` and returned.

    Raises:
        UnprocessablePhase: resuming was requested for a phase whose input
            data have not been computed by a previous run.
    """
    main_dir_name = _get_main_dir_name()
    environment_file_path = os.path.join(os.getcwd(), main_dir_name,
                                         ENVIRONMENT_FILE_NAME)
    env_file_found = _env_file_exists(environment_file_path)

    if not (env_file_found and _resume_phase_asked()):
        # Compute every phase from scratch, regardless of previous
        # computations (either resuming was not asked, or it was asked but
        # the environment file could not be found).
        if _resume_phase_asked() and not env_file_found:
            vprint(Message.ENVIRONMENT_FILE_NOT_FOUND)
        env.current_phase = Phase.PARSING
        return Phase.PARSING

    # The user wants to resume where he stopped last time.
    load_environment_file(path=environment_file_path)
    current_phase = str_to_phase(
        sys.argv[sys.argv.index(gpn.resume_phase()) + 1])

    # Check that the data needed to process the requested phase have been
    # computed by a previous run.
    if not phase_processable(phase_to_compute=current_phase,
                             last_phase_computed=env.last_phase):
        raise UnprocessablePhase(phase_to_str(current_phase),
                                 phase_to_str(env.last_phase))

    env.current_phase = current_phase
    return current_phase
def _resume_phase_asked() -> bool:
    """Return True if the user asked to resume at a specific phase.

    The information is retrieved directly from the command line.
    """
    # Membership test is the idiomatic (EAFP-free) equivalent of calling
    # `list.index` and catching the ValueError raised on a missing element.
    return gpn.resume_phase() in sys.argv
param_class_matrix_prefix=gpn.class_matrix_prefix(), param_clustering_trees_prefix=gpn.clustering_trees_prefix(), param_main_directory=gpn.main_directory(), param_subtrain_directory=gpn.subtrain_directory(), param_subsubtrain_directory=gpn.subsubtrain_directory(), param_true_class_directory=gpn.true_class_directory(), param_classes_matrices_directory=gpn.classes_matrices_directory(), param_clustering_trees_directory=gpn.clustering_trees_directory(), param_subsubtrain_directory_pattern=gpn.subsubtrain_directory_pattern(), param_discretization_threshold=gpn.discretization_threshold(), param_entropy_threshold=gpn.entropy_threshold(), param_min_size_leaf=gpn.min_size_leaf(), param_entropy_measure=gpn.entropy_measure(), param_number_of_tnorms=gpn.number_of_tnorms(), param_last_phase=gpn.last_phase(), param_resume_phase=gpn.resume_phase(), param_help=gpn.help_param(), param_identifier=gpn.identifier(), param_class_name=gpn.class_name(), param_have_header=gpn.have_header(), param_encoding_input=gpn.encoding_input(), param_encoding_output=gpn.encoding_output(), param_format_input=gpn.format_input(), param_format_output=gpn.format_output(), param_delimiter_input=gpn.delimiter_input(), param_delimiter_output=gpn.delimiter_output(), param_quoting_input=gpn.quoting_input(), param_quoting_output=gpn.quoting_output(), param_quote_char_input=gpn.quote_char_input(), param_quote_char_output=gpn.quote_char_output(), param_line_delimiter_input=gpn.line_delimiter_input(),
def clean_args(args: dict) -> None:
    """Clean, in place, the command-line arguments parsed by `docopt`.

    It mainly converts string values to their numeric and enum counterparts.
    If a parameter requiring an index or a column name has been completed
    with a name, it is changed to its corresponding index. It also checks if
    some of the parameters are invalid and raises exceptions accordingly.

    Raises:
        MissingClassificationAttribute: the class attribute is missing while
            a splitting method requires it.
        InvalidPercentage: a threshold parameter is not a valid percentage.
        IllegalLineDelimiter: an unsupported line delimiter was given.
    """
    # Rename parameter database: docopt stores positionals as "<name>".
    args[gpn.database()] = args["<" + gpn.database() + ">"]
    del args["<" + gpn.database() + ">"]

    # Rename parameter parent_dir
    args[gpn.parent_dir()] = args["<" + gpn.parent_dir() + ">"]
    del args["<" + gpn.parent_dir() + ">"]

    # Clean important args used by other functions
    args[gpn.quoting_input()] = str_to_quoting(args[gpn.quoting_input()])
    extension = args[gpn.format_output()].lower()

    for param_name in args.keys():
        if param_name == gpn.parent_dir():
            # Default to the current working directory when none was given.
            if not args[param_name]:
                args[param_name] = get_absolute_path(".")
            else:
                args[param_name] = get_absolute_path(args[param_name])

        if param_name == gpn.class_name():
            _check_key_exists(args, param_name,
                              custom_exception=MissingClassificationAttribute)
            _clean_column_index_or_name(args=args, param_name=param_name,
                                        column_name="class")
        elif param_name in (gpn.discretization_threshold(),
                            gpn.number_of_tnorms(), gpn.trees_in_forest()):
            args[param_name] = int(args[param_name])
        elif param_name in (gpn.format_input(), gpn.format_output()):
            args[param_name] = str_to_format(args[param_name])
        elif param_name == gpn.entropy_measure():
            args[param_name] = str_to_entropymeasure(args[param_name])
        elif param_name in (gpn.entropy_threshold(), gpn.quality_threshold()):
            if not is_a_percentage(args[param_name]):
                raise InvalidPercentage(args[param_name])
        elif param_name == gpn.identifier():
            if _check_default_value_id(args[param_name], gdv.identifier()):
                # We must add a column as an identifier. It will be done in
                # the preprocessing function.
                args[param_name] = None
            else:
                _clean_column_index_or_name(args=args, param_name=param_name,
                                            column_name="identifier")
        elif param_name in (gpn.initial_split_method(),
                            gpn.reference_split_method(),
                            gpn.subsubtrain_split_method()):
            args[param_name] = str_to_splittingmethod(args[param_name])
            # KEEP_DISTRIBUTION needs the class attribute to stratify on.
            if args[param_name] == SplittingMethod.KEEP_DISTRIBUTION and \
                    args[gpn.class_name()] is None:
                raise MissingClassificationAttribute()
        elif param_name == gpn.quality_computing_method():
            args[param_name] = str_to_qualitycomputingmethod(args[param_name])
        elif param_name == gpn.clustering_trees_method():
            args[param_name] = str_to_clusteringtreesmethod(args[param_name])
        elif param_name == gpn.quoting_output():
            args[param_name] = str_to_quoting(args[param_name])
        elif param_name == gpn.main_directory():
            if args[param_name] is None:
                args[param_name] = get_filename(args[gpn.database()])
        elif param_name == gpn.preprocessed_database_name():
            if args[param_name] is None:
                args[param_name] = _get_preprocessed_db_name(
                    database_name=args[gpn.database()], extension=extension)
            else:
                args[param_name] = get_filename(args[param_name],
                                                with_extension=True)
        elif param_name in (gpn.reference_name(), gpn.subtrain_name(),
                            gpn.test_name(), gpn.train_name()):
            args[param_name] = "{}.{}".format(
                get_filename(args[param_name], with_extension=False),
                extension)
        elif param_name == gpn.header_name():
            args[param_name] = "{}.{}".format(
                get_filename(args[param_name], with_extension=False),
                args[gpn.header_extension()])
        elif param_name == gpn.verbosity():
            args[param_name] = str_to_verbosity(args[param_name])
        elif param_name in (gpn.line_delimiter_input(),
                            gpn.line_delimiter_output()):
            if args[param_name] not in (None, "", "\n", "\r", "\r\n"):
                # Accept the escaped form typed on the command line.
                if args[param_name] == "\\n":
                    args[param_name] = "\n"
                else:
                    raise IllegalLineDelimiter(args[param_name])
        elif param_name in (gpn.last_phase(), gpn.resume_phase()):
            args[param_name] = str_to_phase(args[param_name])
        # NOTE: a second, identical `elif param_name ==
        # gpn.clustering_trees_method()` branch used to sit here; it was
        # unreachable (shadowed by the branch above) and has been removed.
def _init_command_line_parameters(args: dict) -> None:
    """Initialize all the command-line-parameters-related variables located
    inside the `env` module.

    Every `env` attribute receives the value parsed for its corresponding
    parameter; the parameter name is reduced to its last whitespace-separated
    token (the long-option form) before the dictionary lookup.
    """
    # (env attribute, parameter-name getter) pairs. The attribute name does
    # not always match the getter name (e.g. `t_norms` <- `number_of_tnorms`).
    attribute_to_parameter = (
        ("cclassified_vector_prefix", gpn.cclassified_vector_prefix),
        ("class_name", gpn.class_name),
        ("class_matrix_prefix", gpn.class_matrix_prefix),
        ("classes_matrices_directory", gpn.classes_matrices_directory),
        ("clustering_trees_directory", gpn.clustering_trees_directory),
        ("clustering_trees_method", gpn.clustering_trees_method),
        ("clustering_trees_prefix", gpn.clustering_trees_prefix),
        ("delimiter_input", gpn.delimiter_input),
        ("delimiter_output", gpn.delimiter_output),
        ("difficulty_vector_prefix", gpn.difficulty_vector_prefix),
        ("discretization_threshold", gpn.discretization_threshold),
        ("encoding_input", gpn.encoding_input),
        ("encoding_output", gpn.encoding_output),
        ("entropy_measure", gpn.entropy_measure),
        ("entropy_threshold", gpn.entropy_threshold),
        ("format_input", gpn.format_input),
        ("format_output", gpn.format_output),
        ("have_header", gpn.have_header),
        ("header_extension", gpn.header_extension),
        ("header_name", gpn.header_name),
        ("identifier", gpn.identifier),
        ("last_phase", gpn.last_phase),
        ("initial_database_name", gpn.database),
        ("initial_split_method", gpn.initial_split_method),
        ("line_delimiter_input", gpn.line_delimiter_input),
        ("line_delimiter_output", gpn.line_delimiter_output),
        ("main_directory", gpn.main_directory),
        ("minimal_size_leaf", gpn.min_size_leaf),
        ("preprocessed_database_name", gpn.preprocessed_database_name),
        ("quality_threshold", gpn.quality_threshold),
        ("salammbo_vector_prefix", gpn.salammbo_vector_prefix),
        ("quality_computing_method", gpn.quality_computing_method),
        ("quality_file_prefix", gpn.quality_file_prefix),
        ("quote_character_input", gpn.quote_char_input),
        ("quote_character_output", gpn.quote_char_output),
        ("quoting_input", gpn.quoting_input),
        ("quoting_output", gpn.quoting_output),
        ("reference_database_name", gpn.reference_name),
        ("reference_split_method", gpn.reference_split_method),
        ("reference_value", gpn.reference_value),
        ("resume_phase", gpn.resume_phase),
        ("statistics_file_name", gpn.statistics_file_name),
        ("subsubtrain_directory", gpn.subsubtrain_directory),
        ("subsubtrain_directory_pattern", gpn.subsubtrain_directory_pattern),
        ("subsubtrain_name_pattern", gpn.subsubtrain_name_pattern),
        ("subsubtrain_split_method", gpn.subsubtrain_split_method),
        ("subtrain_directory", gpn.subtrain_directory),
        ("subtrain_name", gpn.subtrain_name),
        ("t_norms", gpn.number_of_tnorms),
        ("test_database_name", gpn.test_name),
        ("train_database_name", gpn.train_name),
        ("training_value", gpn.training_value),
        ("tree_file_extension", gpn.tree_file_extension),
        ("trees_in_forest", gpn.trees_in_forest),
        ("true_class_directory", gpn.true_class_directory),
        ("vector_file_extension", gpn.vector_file_extension),
        ("verbosity", gpn.verbosity),
    )
    for attribute, parameter in attribute_to_parameter:
        setattr(env, attribute, args.get(parameter().split()[-1]))

    # `parent_dir` was renamed during argument cleaning, so its key is used
    # verbatim (no `.split()[-1]` reduction).
    env.parent_dir = args.get(gpn.parent_dir())