예제 #1
0
def _detect_encoding_by_bom(
        sample: typing.AnyStr,
        default: typing.Optional[str] = None) -> typing.Optional[str]:
    """
    Guess which Unicode encoding `sample` is stored in.

    Only the first four items of `sample` are inspected. Two strategies
    are tried in order:

    1. A BOM (Byte Order Mark), as defined in the `codecs` module, at
       the very start of the data.
    2. The position of null items in the prefix. This heuristic relies
       on the text starting with ASCII-range characters (as JSON always
       does), so the padding nulls reveal the code-unit width and the
       byte order.

    :param sample: the data to inspect; only ``sample[:4]`` is used.
    :param default: value returned when no encoding can be determined.
    :return: a codec name (``'utf-32'``, ``'utf-8-sig'``, ``'utf-16'``,
        ``'utf-16-be'``, ``'utf-16-le'``, ``'utf-32-be'`` or
        ``'utf-32-le'``), or `default` if nothing matched.
    """
    head = sample[:4]

    utf32_boms = (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)
    utf16_boms = (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)

    # UTF-32 has to be tested before UTF-16, because the UTF-32-LE BOM
    # starts with the same two bytes as the UTF-16-LE BOM.
    if head in utf32_boms:
        return 'utf-32'  # the codec consumes the BOM itself
    if head[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM present, MS style (discouraged)
    if head[:2] in utf16_boms:
        return 'utf-16'  # the codec consumes the BOM itself

    # No BOM found: fall back to the null-pattern heuristic.
    # NOTE(review): `_null`, `_null2` and `_null3` are defined elsewhere
    # in the module; presumably one, two and three null items - confirm.
    zeros = head.count(_null)

    if zeros == 2:
        # Two nulls among four items: UTF-16 ASCII-range characters.
        if head[::2] == _null2:  # positions 1 and 3 are null
            return 'utf-16-be'
        if head[1::2] == _null2:  # positions 2 and 4 are null
            return 'utf-16-le'
    elif zeros == 3:
        # Three nulls among four items: one UTF-32 ASCII-range character.
        if head[:3] == _null3:  # leading nulls
            return 'utf-32-be'
        if head[1:] == _null3:  # trailing nulls
            return 'utf-32-le'

    # Either no nulls at all, or a null layout that matches no encoding.
    return default
예제 #2
0
def charCountBiggerEqualThanX(text: typing.AnyStr, cha: typing.AnyStr):
    """
    Return how many times `cha` occurs in `text`.

    The result is whatever ``text.count(cha)`` yields. Despite the
    function's name, no comparison against a threshold is made here;
    the caller receives the raw count.

    When the module-level `debug` flag is truthy, the count is also
    printed before being returned.

    :param text: the string (or bytes) to search in.
    :param cha: the substring (or byte sequence) to look for.
    :return: the value of ``text.count(cha)``.
    """
    occurrences = text.count(cha)
    if debug:
        # Diagnostic output only; does not affect the returned value.
        print(occurrences)
    return occurrences