Пример #1
0
def cli_detect(argv: List[str] = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser.

    Runs charset detection on every file given on the command line, prints the
    findings to STDOUT (JSON by default, bare charset names with --minimal)
    and, with --normalize, writes a UTF-8 version of each non-unicode file.

    :param argv: Arguments to parse. None is forwarded to
                 ArgumentParser.parse_args, which then reads sys.argv.
    :return: 0 if everything is fine, anything else equal trouble
             (1: invalid option combination or threshold,
             2: the normalized file could not be written)
    """
    # Local import: only the normalization path needs these helpers, and the
    # module-level import block is not visible from here.
    from os.path import basename, dirname, join

    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode.")

    parser.add_argument("files",
                        type=argparse.FileType("rb"),
                        nargs="+",
                        help="File(s) to be analysed")
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help=
        "Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help=
        "Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help=
        "Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help=
        "Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help=
        "Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.1,
        type=float,
        dest="threshold",
        help=
        "Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {}".format(
            __version__, python_version()),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    x_ = []

    # argparse.FileType already opened every input file; the finally clause
    # guarantees they are all closed on every exit path, early returns included.
    try:
        if args.replace is True and args.normalize is False:
            print("Use --replace in addition of --normalize only.",
                  file=sys.stderr)
            return 1

        if args.force is True and args.replace is False:
            print("Use --force in addition of --replace only.",
                  file=sys.stderr)
            return 1

        if args.threshold < 0.0 or args.threshold > 1.0:
            print("--threshold VALUE should be between 0. AND 1.",
                  file=sys.stderr)
            return 1

        for my_file in args.files:

            matches = from_fp(my_file,
                              threshold=args.threshold,
                              explain=args.verbose)

            best_guess = matches.best()

            if best_guess is None:
                print(
                    'Unable to identify originating encoding for "{}". {}'.
                    format(
                        my_file.name,
                        "Maybe try increasing maximum amount of chaos."
                        if args.threshold < 1.0 else "",
                    ),
                    file=sys.stderr,
                )
                x_.append(
                    CliDetectionResult(
                        abspath(my_file.name),
                        None,
                        [],
                        [],
                        "Unknown",
                        [],
                        False,
                        1.0,
                        0.0,
                        None,
                        True,
                    ))
                continue

            # Keep a handle on THIS file's main result: x_ accumulates the
            # results of every file, so x_[0] is only right for the first one.
            primary_result = CliDetectionResult(
                abspath(my_file.name),
                best_guess.encoding,
                best_guess.encoding_aliases,
                [
                    cp for cp in best_guess.could_be_from_charset
                    if cp != best_guess.encoding
                ],
                best_guess.language,
                best_guess.alphabets,
                best_guess.bom,
                best_guess.percent_chaos,
                best_guess.percent_coherence,
                None,
                True,
            )
            x_.append(primary_result)

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    # Skip the best guess itself, it is already reported above.
                    if el != best_guess:
                        x_.append(
                            CliDetectionResult(
                                abspath(my_file.name),
                                el.encoding,
                                el.encoding_aliases,
                                [
                                    cp for cp in el.could_be_from_charset
                                    if cp != el.encoding
                                ],
                                el.language,
                                el.alphabets,
                                el.bom,
                                el.percent_chaos,
                                el.percent_coherence,
                                None,
                                False,
                            ))

            if args.normalize is False:
                continue

            if best_guess.encoding.startswith("utf") is True:
                print(
                    '"{}" file does not need to be normalized, as it already came from unicode.'
                    .format(my_file.name),
                    file=sys.stderr,
                )
                continue

            # Work on the file name only, so dots in directory names and
            # absolute paths cannot corrupt the destination path.
            source_path = abspath(my_file.name)
            o_ = basename(source_path).split(".")  # type: List[str]

            if args.replace is False:
                # "name.ext" becomes "name.<encoding>.ext"
                o_.insert(-1, best_guess.encoding)
            elif (args.force is False and query_yes_no(
                    'Are you sure to normalize "{}" by replacing it ?'.format(
                        my_file.name),
                    "no",
            ) is False):
                continue

            primary_result.unicode_path = join(dirname(source_path),
                                               ".".join(o_))

            # Release the read handle before (re)writing; with --replace the
            # destination is the very same file.
            if my_file.closed is False:
                my_file.close()

            try:
                with open(primary_result.unicode_path, "w",
                          encoding="utf-8") as fp:
                    fp.write(str(best_guess))
            except IOError as e:
                print(str(e), file=sys.stderr)
                return 2
    finally:
        for opened_file in args.files:
            if opened_file.closed is False:
                opened_file.close()

    if args.minimal is False:
        # A single result is printed as an object, several as a list.
        print(
            dumps(
                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                ensure_ascii=True,
                indent=4,
            ))
    else:
        print(", ".join(
            [el.encoding if el.encoding else "undefined" for el in x_]))

    return 0
Пример #2
0
def cli_detect(argv=None):
    """
    CLI assistant using ARGV and ArgumentParser.

    Runs charset detection on every file given on the command line, prints the
    findings to STDOUT (JSON by default, bare charset names with --minimal)
    and, with --normalize, writes a UTF-8 version of each non-unicode file.

    :param argv: Arguments to parse. None is forwarded to
                 ArgumentParser.parse_args, which then reads sys.argv.
    :return: 0 if everything is fine, anything else equal trouble
             (1: invalid option combination or threshold,
             2: the normalized file could not be written)
    """
    # Local import: only the normalization path needs these helpers, and the
    # module-level import block is not visible from here.
    from os.path import basename, dirname, join

    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode.")

    parser.add_argument('files',
                        type=argparse.FileType('rb'),
                        nargs='+',
                        help='File(s) to be analysed')
    parser.add_argument(
        '-v',
        '--verbose',
        action="store_true",
        default=False,
        dest='verbose',
        help=
        'Display complementary information about file if any. Stdout will contain logs about the detection process.'
    )
    parser.add_argument(
        '-a',
        '--with-alternative',
        action="store_true",
        default=False,
        dest='alternatives',
        help=
        'Output complementary possibilities if any. Top-level JSON WILL be a list.'
    )
    parser.add_argument(
        '-n',
        '--normalize',
        action="store_true",
        default=False,
        dest='normalize',
        help=
        'Permit to normalize input file. If not set, program does not write anything.'
    )
    parser.add_argument(
        '-m',
        '--minimal',
        action="store_true",
        default=False,
        dest='minimal',
        help=
        'Only output the charset detected to STDOUT. Disabling JSON output.')
    parser.add_argument(
        '-r',
        '--replace',
        action="store_true",
        default=False,
        dest='replace',
        help=
        'Replace file when trying to normalize it instead of creating a new one.'
    )
    parser.add_argument(
        '-f',
        '--force',
        action="store_true",
        default=False,
        dest='force',
        help=
        'Replace file without asking if you are sure, use this flag with caution.'
    )
    parser.add_argument(
        '-t',
        '--threshold',
        action="store",
        default=0.1,
        type=float,
        dest='threshold',
        help=
        "Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1."
    )
    parser.add_argument("--version",
                        action="version",
                        version="Charset-Normalizer {} - Python {}".format(
                            __version__, python_version()),
                        help="Show version information and exit.")

    args = parser.parse_args(argv)

    x_ = []

    # argparse.FileType already opened every input file; the finally clause
    # guarantees they are all closed on every exit path, early returns included.
    try:
        if args.replace is True and args.normalize is False:
            print('Use --replace in addition of --normalize only.',
                  file=sys.stderr)
            return 1

        if args.force is True and args.replace is False:
            print('Use --force in addition of --replace only.',
                  file=sys.stderr)
            return 1

        if args.threshold < 0. or args.threshold > 1.:
            print('--threshold VALUE should be between 0. AND 1.',
                  file=sys.stderr)
            return 1

        for my_file in args.files:

            matches = from_fp(my_file,
                              threshold=args.threshold,
                              explain=args.verbose)

            if len(matches) == 0:
                print(
                    'Unable to identify originating encoding for "{}". {}'.
                    format(
                        my_file.name,
                        'Maybe try increasing maximum amount of chaos.'
                        if args.threshold < 1. else ''),
                    file=sys.stderr)
                x_.append(
                    CliDetectionResult(abspath(my_file.name), None, [], [],
                                       "Unknown", [], False, 1., 0., None,
                                       True))
                continue

            r_ = matches.best()
            p_ = r_.first()

            # Keep a handle on THIS file's main result: x_ accumulates the
            # results of every file, so x_[0] is only right for the first one.
            primary_result = CliDetectionResult(
                abspath(my_file.name), p_.encoding, p_.encoding_aliases,
                [cp for cp in p_.could_be_from_charset if cp != p_.encoding],
                p_.language, p_.alphabets, p_.bom, p_.percent_chaos,
                p_.percent_coherence, None, True)
            x_.append(primary_result)

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    # Skip the best guess itself, it is already reported above.
                    if el != p_:
                        x_.append(
                            CliDetectionResult(
                                abspath(my_file.name), el.encoding,
                                el.encoding_aliases, [
                                    cp for cp in el.could_be_from_charset
                                    if cp != el.encoding
                                ], el.language, el.alphabets, el.bom,
                                el.percent_chaos, el.percent_coherence, None,
                                False))

            if args.normalize is False:
                continue

            if p_.encoding.startswith('utf') is True:
                print(
                    '"{}" file does not need to be normalized, as it already came from unicode.'
                    .format(my_file.name),
                    file=sys.stderr)
                continue

            # Work on the file name only, so dots in directory names and
            # absolute paths cannot corrupt the destination path.
            source_path = abspath(my_file.name)
            o_ = basename(source_path).split('.')  # type: list[str]

            if args.replace is False:
                # "name.ext" becomes "name.<encoding>.ext"
                o_.insert(-1, p_.encoding)
            elif args.force is False and query_yes_no(
                    'Are you sure to normalize "{}" by replacing it ?'.format(
                        my_file.name), 'no') is False:
                continue

            primary_result.unicode_path = join(dirname(source_path),
                                               '.'.join(o_))

            # Release the read handle before (re)writing; with --replace the
            # destination is the very same file.
            if my_file.closed is False:
                my_file.close()

            try:
                with open(primary_result.unicode_path, 'w',
                          encoding='utf-8') as fp:
                    fp.write(str(p_))
            except IOError as e:
                print(str(e), file=sys.stderr)
                return 2
    finally:
        for opened_file in args.files:
            if opened_file.closed is False:
                opened_file.close()

    if args.minimal is False:
        # A single result is printed as an object, several as a list.
        print(
            dumps([el.__dict__
                   for el in x_] if len(x_) > 1 else x_[0].__dict__,
                  ensure_ascii=True,
                  indent=4))
    else:
        # Unidentified files carry encoding=None, which str.join rejects.
        print(', '.join(
            [el.encoding if el.encoding else 'undefined' for el in x_]))

    return 0