def cli_detect(argv: List[str] = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser

    :param argv: Argument list to parse; None means use sys.argv[1:].
    :return: 0 if everything is fine, anything else equal trouble
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode.")

    parser.add_argument("files",
                        type=argparse.FileType("rb"),
                        nargs="+",
                        help="File(s) to be analysed")
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help=
        "Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help=
        "Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help=
        "Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help=
        "Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help=
        "Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.1,
        type=float,
        dest="threshold",
        help=
        "Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {}".format(
            __version__, python_version()),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    # Reject flag combinations that cannot be honoured, before touching files.
    if args.replace is True and args.normalize is False:
        print("Use --replace in addition of --normalize only.", file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        print("Use --force in addition of --replace only.", file=sys.stderr)
        return 1

    if args.threshold < 0.0 or args.threshold > 1.0:
        print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
        return 1

    x_ = []  # accumulated CliDetectionResult entries, one (or more) per file

    for my_file in args.files:
        matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)

        best_guess = matches.best()

        if best_guess is None:
            print(
                'Unable to identify originating encoding for "{}". {}'.format(
                    my_file.name,
                    "Maybe try increasing maximum amount of chaos."
                    if args.threshold < 1.0 else "",
                ),
                file=sys.stderr,
            )
            x_.append(
                CliDetectionResult(
                    abspath(my_file.name),
                    None,
                    [],
                    [],
                    "Unknown",
                    [],
                    False,
                    1.0,
                    0.0,
                    None,
                    True,
                ))
        else:
            # Keep the result for THIS file in a local so the normalize step
            # below updates the right entry. The original wrote to x_[0],
            # which always pointed at the first file's result when several
            # files were given.
            current_result = CliDetectionResult(
                abspath(my_file.name),
                best_guess.encoding,
                best_guess.encoding_aliases,
                [
                    cp for cp in best_guess.could_be_from_charset
                    if cp != best_guess.encoding
                ],
                best_guess.language,
                best_guess.alphabets,
                best_guess.bom,
                best_guess.percent_chaos,
                best_guess.percent_coherence,
                None,
                True,
            )
            x_.append(current_result)

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    # BUGFIX: the original tested "el != matches", comparing a
                    # single match against the whole container (always true),
                    # which duplicated the best guess into the alternatives.
                    if el != best_guess:
                        x_.append(
                            CliDetectionResult(
                                abspath(my_file.name),
                                el.encoding,
                                el.encoding_aliases,
                                [
                                    cp for cp in el.could_be_from_charset
                                    if cp != el.encoding
                                ],
                                el.language,
                                el.alphabets,
                                el.bom,
                                el.percent_chaos,
                                el.percent_coherence,
                                None,
                                False,
                            ))

            if args.normalize is True:
                # Content already in a unicode encoding needs no rewrite.
                if best_guess.encoding.startswith("utf") is True:
                    print(
                        '"{}" file does not need to be normalized, as it already came from unicode.'
                        .format(my_file.name),
                        file=sys.stderr,
                    )
                    if my_file.closed is False:
                        my_file.close()
                    continue

                o_ = my_file.name.split(".")  # type: List[str]

                if args.replace is False:
                    # Insert the detected encoding before the extension so the
                    # normalized copy does not clobber the original file.
                    o_.insert(-1, best_guess.encoding)
                    if my_file.closed is False:
                        my_file.close()
                else:
                    if (args.force is False and query_yes_no(
                            'Are you sure to normalize "{}" by replacing it ?'.
                            format(my_file.name),
                            "no",
                    ) is False):
                        if my_file.closed is False:
                            my_file.close()
                        continue

                try:
                    current_result.unicode_path = abspath("./{}".format(
                        ".".join(o_)))

                    with open(current_result.unicode_path,
                              "w",
                              encoding="utf-8") as fp:
                        fp.write(str(best_guess))
                except IOError as e:
                    print(str(e), file=sys.stderr)
                    if my_file.closed is False:
                        my_file.close()
                    return 2

        if my_file.closed is False:
            my_file.close()

    if args.minimal is False:
        print(
            dumps(
                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                ensure_ascii=True,
                indent=4,
            ))
    else:
        # Failed detections carry encoding=None; show "undefined" instead.
        print(", ".join(
            [el.encoding if el.encoding else "undefined" for el in x_]))

    return 0
# NOTE(review): this second definition of cli_detect shadows the one above at
# import time — one of the two should probably be removed from this module.
def cli_detect(argv=None):
    """
    CLI assistant using ARGV and ArgumentParser

    :param argv: Argument list to parse; None means use sys.argv[1:].
    :return: 0 if everything is fine, anything else equal trouble
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode.")

    parser.add_argument('files',
                        type=argparse.FileType('rb'),
                        nargs='+',
                        help='File(s) to be analysed')
    parser.add_argument(
        '-v',
        '--verbose',
        action="store_true",
        default=False,
        dest='verbose',
        help=
        'Display complementary information about file if any. Stdout will contain logs about the detection process.'
    )
    parser.add_argument(
        '-a',
        '--with-alternative',
        action="store_true",
        default=False,
        dest='alternatives',
        help=
        'Output complementary possibilities if any. Top-level JSON WILL be a list.'
    )
    parser.add_argument(
        '-n',
        '--normalize',
        action="store_true",
        default=False,
        dest='normalize',
        help=
        'Permit to normalize input file. If not set, program does not write anything.'
    )
    parser.add_argument(
        '-m',
        '--minimal',
        action="store_true",
        default=False,
        dest='minimal',
        help=
        'Only output the charset detected to STDOUT. Disabling JSON output.')
    parser.add_argument(
        '-r',
        '--replace',
        action="store_true",
        default=False,
        dest='replace',
        help=
        'Replace file when trying to normalize it instead of creating a new one.'
    )
    parser.add_argument(
        '-f',
        '--force',
        action="store_true",
        default=False,
        dest='force',
        help=
        'Replace file without asking if you are sure, use this flag with caution.'
    )
    parser.add_argument(
        '-t',
        '--threshold',
        action="store",
        default=0.1,
        type=float,
        dest='threshold',
        help=
        "Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1."
    )
    parser.add_argument("--version",
                        action="version",
                        version="Charset-Normalizer {} - Python {}".format(
                            __version__, python_version()),
                        help="Show version information and exit.")

    args = parser.parse_args(argv)

    # Reject flag combinations that cannot be honoured, before touching files.
    if args.replace is True and args.normalize is False:
        print('Use --replace in addition of --normalize only.', file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        print('Use --force in addition of --replace only.', file=sys.stderr)
        return 1

    if args.threshold < 0. or args.threshold > 1.:
        print('--threshold VALUE should be between 0. AND 1.', file=sys.stderr)
        return 1

    x_ = []  # accumulated CliDetectionResult entries, one (or more) per file

    for my_file in args.files:
        matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)

        if len(matches) == 0:
            print(
                'Unable to identify originating encoding for "{}". {}'.format(
                    my_file.name,
                    'Maybe try increasing maximum amount of chaos.'
                    if args.threshold < 1. else ''),
                file=sys.stderr)
            x_.append(
                CliDetectionResult(abspath(my_file.name), None, [], [],
                                   "Unknown", [], False, 1., 0., None, True))
        else:
            r_ = matches.best()
            p_ = r_.first()

            # Keep the result for THIS file in a local so the normalize step
            # below updates the right entry. The original wrote to x_[0],
            # which always pointed at the first file's result when several
            # files were given.
            current_result = CliDetectionResult(
                abspath(my_file.name), p_.encoding, p_.encoding_aliases, [
                    cp for cp in p_.could_be_from_charset if cp != p_.encoding
                ], p_.language, p_.alphabets, p_.bom, p_.percent_chaos,
                p_.percent_coherence, None, True)
            x_.append(current_result)

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    if el != p_:
                        x_.append(
                            CliDetectionResult(
                                abspath(my_file.name), el.encoding,
                                el.encoding_aliases, [
                                    cp for cp in el.could_be_from_charset
                                    if cp != el.encoding
                                ], el.language, el.alphabets, el.bom,
                                el.percent_chaos, el.percent_coherence, None,
                                False))

            if args.normalize is True:
                # Content already in a unicode encoding needs no rewrite.
                if p_.encoding.startswith('utf') is True:
                    print(
                        '"{}" file does not need to be normalized, as it already came from unicode.'
                        .format(my_file.name),
                        file=sys.stderr)
                    if my_file.closed is False:
                        my_file.close()
                    continue

                o_ = my_file.name.split('.')  # type: list[str]

                if args.replace is False:
                    # Insert the detected encoding before the extension so the
                    # normalized copy does not clobber the original file.
                    o_.insert(-1, p_.encoding)
                    if my_file.closed is False:
                        my_file.close()
                else:
                    if args.force is False and query_yes_no(
                            'Are you sure to normalize "{}" by replacing it ?'.
                            format(my_file.name), 'no') is False:
                        if my_file.closed is False:
                            my_file.close()
                        continue

                try:
                    current_result.unicode_path = abspath('./{}'.format(
                        '.'.join(o_)))

                    with open(current_result.unicode_path, 'w',
                              encoding='utf-8') as fp:
                        fp.write(str(p_))
                except IOError as e:
                    print(str(e), file=sys.stderr)
                    if my_file.closed is False:
                        my_file.close()
                    return 2

        if my_file.closed is False:
            my_file.close()

    if args.minimal is False:
        print(
            dumps([el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                  ensure_ascii=True,
                  indent=4))
    else:
        # BUGFIX: el.encoding is None for files whose encoding could not be
        # identified; str.join raises TypeError on non-str items, so map the
        # None case to "undefined" (matching the sibling implementation).
        print(', '.join(
            [el.encoding if el.encoding else 'undefined' for el in x_]))

    return 0