Python items 예제들, encodings.aliases.aliases.items Python 예제들

예제 #1

0

파일 보기

파일: common.py 프로젝트: ClarkOh/MyWork

def findCodecName(text,
                  displayEncodingCodecName=sys.stdout.encoding,
                  encodingErrorFlag=0):
    """Print ``text`` decoded with every known codec, re-encoded for display.

    Each canonical codec name listed in ``encodings.aliases`` is tried in
    turn: ``text`` is decoded with that codec and the result is encoded
    with ``displayEncodingCodecName``.  Every codec for which both steps
    succeed is printed together with the converted text.

    Args:
        text: Encoded byte string to probe (it must provide ``decode``).
        displayEncodingCodecName: Codec used to re-encode the decoded text.
            It has to be one of the canonical names from
            ``encodings.aliases`` written with hyphens instead of
            underscores (for example ``utf-8``).
        encodingErrorFlag: When ``1``, codecs that fail are reported too;
            any other value skips them silently.

    Returns:
        None.  All results are written to standard output.
    """
    from encodings.aliases import aliases

    print(text)

    # Canonical codec names in hyphenated form; the set removes duplicates.
    encodingCodecSet = set(
        codecName.replace('_', '-') for codecName in aliases.values())

    if displayEncodingCodecName not in encodingCodecSet:
        print('invalid displayEncodingCodecName : %s' % (
            displayEncodingCodecName))
        return

    for encodingCodec in encodingCodecSet:
        try:
            encodedStr = text.decode(encodingCodec).encode(
                displayEncodingCodecName)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C and SystemExit still
            # stop the scan instead of being treated as a codec failure.
            if encodingErrorFlag == 1:
                print('" %s " -> " %s ": %s' % (
                    encodingCodec, displayEncodingCodecName, e))
            continue

        # Label the line with the codec that was really used for encoding.
        print('" %s " -> " %s ": %s' % (
            encodingCodec, displayEncodingCodecName, encodedStr))

예제 #2

0

파일 보기

파일: utils.py 프로젝트: nayanasp/project

def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
    """
    Look for an explicitly declared encoding near the start of a payload.

    At most ``search_zone`` leading bytes are decoded as ASCII (bytes that cannot be decoded are dropped)
    and scanned with RE_POSSIBLE_ENCODING_INDICATION. The first hit that equals an alias or a codec name
    from ``encodings.aliases`` is returned as the codec name; None is returned when nothing matches.
    Raises TypeError when ``sequence`` is not a bytes object.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    head = sequence[: min(len(sequence), search_zone)]
    ascii_text = head.decode("ascii", errors="ignore")

    candidates = findall(RE_POSSIBLE_ENCODING_INDICATION, ascii_text)  # type: List[str]

    for candidate in candidates:
        # Alias table entries are lower case and use underscores.
        normalized = candidate.lower().replace("-", "_")

        for encoding_alias, encoding_iana in aliases.items():
            if normalized in (encoding_alias, encoding_iana):
                return encoding_iana

    return None

예제 #3

0

파일 보기

def check_encoding_supported(encoding):
    """Return True when *encoding* is an alias or a codec name in ``aliases``."""
    return any(
        encoding in (alias_name, codec_name)
        for alias_name, codec_name in aliases.items()
    )

예제 #4

0

파일 보기

파일: common.py 프로젝트: ClarkOh/snowball

def findCodecName(text,
                  displayEncodingCodecName=sys.stdout.encoding,
                  encodingErrorFlag=0):
    """Print ``text`` decoded with every known codec, re-encoded for display.

    Each canonical codec name listed in ``encodings.aliases`` is tried in
    turn: ``text`` is decoded with that codec and the result is encoded
    with ``displayEncodingCodecName``.  Every codec for which both steps
    succeed is printed together with the converted text.

    Args:
        text: Encoded byte string to probe (it must provide ``decode``).
        displayEncodingCodecName: Codec used to re-encode the decoded text.
            It has to be one of the canonical names from
            ``encodings.aliases`` written with hyphens instead of
            underscores (for example ``utf-8``).
        encodingErrorFlag: When ``1``, codecs that fail are reported too;
            any other value skips them silently.

    Returns:
        None.  All results are written to standard output.
    """
    from encodings.aliases import aliases

    print(text)

    # Canonical codec names in hyphenated form; the set removes duplicates.
    encodingCodecSet = set(
        codecName.replace('_', '-') for codecName in aliases.values())

    if displayEncodingCodecName not in encodingCodecSet:
        print('invalid displayEncodingCodecName : %s' % (
            displayEncodingCodecName))
        return

    for encodingCodec in encodingCodecSet:
        try:
            encodedStr = text.decode(encodingCodec).encode(
                displayEncodingCodecName)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C and SystemExit still
            # stop the scan instead of being treated as a codec failure.
            if encodingErrorFlag == 1:
                print('" %s " -> " %s ": %s' % (
                    encodingCodec, displayEncodingCodecName, e))
            continue

        # Label the line with the codec that was really used for encoding.
        print('" %s " -> " %s ": %s' % (
            encodingCodec, displayEncodingCodecName, encodedStr))

예제 #5

0

파일 보기

파일: utils.py 프로젝트: daviderill/giswater_qgis_plugin

    def populate_cmb_unicodes(self, combo):
        """ Populate combo with full list of codes

        Every alias name from ``encodings.aliases`` is collected, sorted
        case-insensitively and passed to ``utils_giswater.set_autocompleter``
        together with ``combo``.
        """

        # Sort once after collecting.  Sorting inside the loop repeated the
        # work for every alias and left the list unbound for an empty table.
        sorted_list = sorted((str(alias) for alias in aliases), key=str.lower)
        utils_giswater.set_autocompleter(combo, sorted_list)

예제 #6

0

파일 보기

파일: models.py 프로젝트: ivanjo39191/vue-blog-backend

 def encoding_aliases(self) -> List[str]:
     """
     List the other names under which this encoding appears in ``encodings.aliases``, e.g. to find IBM855
     when it is listed as CP855. Aliases map to their codec name and codec names map back to their aliases.
     """
     own_name = self.encoding
     other_names = []  # type: List[str]
     for alias_name, codec_name in aliases.items():
         if alias_name == own_name:
             other_names.append(codec_name)
             continue
         if codec_name == own_name:
             other_names.append(alias_name)
     return other_names

예제 #7

0

파일 보기

파일: utils.py 프로젝트: nayanasp/project

def iana_name(cp_name: str, strict: bool = True) -> str:
    """
    Resolve an encoding name to the codec name used by ``encodings.aliases``.

    The name is lower-cased and hyphens become underscores before the lookup. When nothing matches,
    ValueError is raised in strict mode; otherwise the normalized name is returned unchanged.
    """
    normalized = cp_name.lower().replace("-", "_")

    for alias_name, codec_name in aliases.items():
        if normalized in (alias_name, codec_name):
            return codec_name

    if not strict:
        return normalized

    raise ValueError("Unable to retrieve IANA for '{}'".format(normalized))

예제 #8

0

파일 보기

파일: tools_qt.py 프로젝트: Giswater/giswater_qgis_plugin

def fill_combo_unicodes(combo):
    """ Offer the utf8*, windows* and latin* alias names as completions for combo.

    The names come from ``encodings.aliases`` and are sorted
    case-insensitively; nothing is set when no alias matches.
    """

    prefixes = ("utf8", "windows", "latin")
    names = [str(alias) for alias in aliases if str(alias).startswith(prefixes)]
    names.sort(key=str.lower)

    if names:
        set_autocompleter(combo, names)

예제 #9

0

파일 보기

파일: CSVDialogs.py 프로젝트: miketian2020/fracturing

def _calculateEncodingKey(comparator):
    """Return the first key in ``_encodings`` whose value equals the comparator.

    Args:
        comparator (string): A view name for an encoding.

    Returns:
        str: The matching key, or None when no value equals the comparator.

    """
    candidates = list(_encodings.items())
    return next(
        (key for key, viewName in candidates if viewName == comparator),
        None,
    )

예제 #10

0

파일 보기

파일: BiBiParrotFormatTextHandler.py 프로젝트: bibiparrot/bibiparrot

def findEncodings(q):
    """Return the (alias, codec) pairs from ``aliases`` whose alias or codec name contains q."""
    hits = []
    for alias_name, codec_name in aliases.items():
        if q in alias_name or q in codec_name:
            hits.append((alias_name, codec_name))
    return hits

예제 #11

0

파일 보기

파일: charset_inspect.py 프로젝트: firsov/charsetinspect

# Search-position flags; the code below treats them as mutually exclusive.
parser.add_argument("-s", "--starts-with", help="Searches the start of the multi-byte character", action="store_true")
parser.add_argument("-c", "--contains", help="Searches entire multi-byte character", action="store_true")
args = parser.parse_args()
# Refuse to run when more than one of the three position flags is set.
if(args.ends_with and args.starts_with) or (args.ends_with and args.contains) or (args.starts_with and args.contains):
  print("You may not select more than one search position at a time")
  print("Please choose only one of the following arguments: -e, -s, -c")
  exit()
# Announce the search mode; "ends with" is used when neither -s nor -c is given.
if args.starts_with:
  print("You are currently searching for a character set that begins with: " + args.needle)
elif args.contains:
  print("You are currently searching for a character set that contains: " + args.needle)
else:
  print("You are currently searching for a character set that ends with: " + args.needle)
# Every printable character in the full Unicode code point range.
chars = list(str for str in map(chr, range(0,1114112)) if str.isprintable())
# Maps a codec name to the needle's encoded bytes, written as space-separated hex values.
search_needles = dict()
for v,encoding in aliases.items():
  for char in chars:
    try:
      if char == args.needle:
        search_needles[encoding] = ' '.join(map(hex,char.encode(encoding)))
    # NOTE(review): only LookupError is handled here; str.encode can also raise
    # UnicodeEncodeError when the codec cannot represent the needle - confirm
    # whether that case is meant to abort the script.
    except LookupError:
      pass
for encoding,code in search_needles.items():
  for char in chars:
    try:
      if args.starts_with:
        if ' '.join(map(hex,char.encode(encoding))).startswith(code) and char != args.needle:
          print(encoding + " | " + char +  " | " + ' '.join(map(hex,char.encode(encoding))) + " | " + unicodedata.name(char))
      elif args.contains:
        if code in ' '.join(map(hex,char.encode(encoding))) and char != args.needle:
          print(encoding + " | " + char +  " | " + ' '.join(map(hex,char.encode(encoding))) + " | " + unicodedata.name(char))

예제 #12

0

파일 보기

파일: ch4-encoding.py 프로젝트: kyleburton/sandbox

from encodings.aliases import aliases

s = 'El Niño'
print("Codecs")

# Show the bytes produced for the same text by a few common codecs.
for codec in ('latin_1', 'utf_8', 'utf_16'):
    encoded = s.encode(codec)
    print("codec({}) {}: {}".format(codec, s, encoded))

# NB: stackoverflow suggests this is an incomplete set, still interesting IMO
# Collect every alias name together with the codec name it maps to.
all_the_things = set()
for alias_name, codec_name in aliases.items():
    all_the_things.update((alias_name, codec_name))

def chunks(l, n):
    """Yield consecutive slices of *l*, each holding at most *n* items."""
    offsets = range(0, len(l), n)
    for start in offsets:
        stop = start + n
        yield l[start:stop]

# Print the names eight per line.  They are sorted first because plain set
# iteration order for strings can change from one interpreter run to the
# next, which made the listing differ between runs.
for elts in chunks(sorted(all_the_things), 8):
    print(", ".join(elts))

예제 #13

0

파일 보기

# Copyright: ZopeChina Corp, Ltd. http://zopechina.com
# hack python's default encoding to 'utf-8'

import sys
# reload() re-executes the sys module; presumably this is what makes
# sys.setdefaultencoding available again (Python 2 only) - TODO confirm.
reload(sys)
sys.setdefaultencoding('utf-8')
# Remove the setter again once the default encoding has been changed.
del sys.setdefaultencoding

import os
from encodings.aliases import aliases

# gb2312 is obsoleted, use gbk
# Re-point every alias that resolves to the cjkcodecs gb2312 codec at gbk.
# A snapshot is iterated because the mapping is modified inside the loop.
for k, v in list(aliases.items()):
    if v == 'cjkcodecs.gb2312':
        aliases[k] = 'cjkcodecs.gbk'

if os.name == 'nt':
    import encodings
    # On Windows, route the Chinese encodings that have no alias yet to 'mbcs'.
    for ec in ['gb2312', 'gbk', 'gb18030', 'big5']:
        # "in" replaces dict.has_key(), which no longer exists in Python 3.
        if ec not in encodings.aliases.aliases:
            encodings.aliases.aliases[ec] = 'mbcs'
            # clear cache
            if ec in encodings._cache:
                del encodings._cache[ec]


import ZopePak
import StructuredTextPak
import setup
try:
    import PlonePak

예제 #14

0

파일 보기

파일: findingEncoding.py 프로젝트: ClarkOh/MyWork

# -*- coding: utf-8 -*-

#from cxFile import cxFile
from sets import Set
from encodings.aliases import aliases

#import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')

#resultFile = cxFile()
# Canonical codec names from the alias table, written with hyphens instead of
# underscores (e.g. 'utf_8' -> 'utf-8').  The builtin set replaces the
# deprecated sets.Set and drops the duplicates.
encodingCodecSet = set(
    codecName.replace('_', '-') for codecName in aliases.values())

# NOTE(review): this literal looks like multi-byte text shown through a
# single-byte codec - confirm the intended source encoding.
testString = '¿ÀÁø¿ø'

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(testString)

import sys
print(sys.stdout.encoding)

예제 #15

0

파일 보기

# -*- coding: utf-8 -*-

#from cxFile import cxFile
from sets import Set
from encodings.aliases import aliases

#import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')

#resultFile = cxFile()
# Canonical codec names from the alias table, written with hyphens instead of
# underscores (e.g. 'utf_8' -> 'utf-8').  The builtin set replaces the
# deprecated sets.Set and drops the duplicates.
encodingCodecSet = set(
    codecName.replace('_', '-') for codecName in aliases.values())

# NOTE(review): this literal looks like multi-byte text shown through a
# single-byte codec - confirm the intended source encoding.
testString = '¿ÀÁø¿ø'

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(testString)

import sys
print(sys.stdout.encoding)

예제 #16

0

파일 보기

파일: normalizer.py 프로젝트: jayvdb/charset_normalizer

    def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
        """
        Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported
        charset encoding.

        Each distinct codec from ``encodings.aliases`` is tried once. A codec is kept when the whole input decodes
        with it and none of the sampled chunks has a chaos ratio above ``threshold``. The scan stops early when
        ASCII is kept with a mean chaos ratio of zero.

        :param bytearray sequences: Actual sequence of bytes to analyse
        :param float threshold: Maximum amount of chaos allowed on first pass
        :param int chunk_size: Size to extract and analyse in each step
        :param int steps: Number of steps
        :return: List of potential matches
        :rtype: CharsetNormalizerMatches
        """
        py_v = [int(el) for el in python_version_tuple()]
        py_need_sort = py_v[0] < 3 or (py_v[0] == 3 and py_v[1] < 6)

        supported = sorted(
            aliases.items()) if py_need_sort else aliases.items()
        tested = set()
        working = dict()

        maximum_length = len(sequences)

        # range() raises ValueError for a zero step, which used to happen for
        # every input shorter than `steps` (the empty input included).
        step_size = max(1, int(maximum_length / steps))

        for _, p in supported:

            if p in tested:
                continue

            tested.add(p)

            # First pass: the whole input must decode with this codec.
            try:
                str(sequences, encoding=p)
            except UnicodeDecodeError:
                continue
            except LookupError:
                continue

            chaos_measures = list()
            ranges_encountered_t = dict()
            # NOTE(review): never updated below, so working[p]['len'] is
            # always 0 - confirm whether it should sum the decoded lengths.
            decoded_len_t = 0

            for i in range(0, maximum_length, step_size):

                chunk = sequences[i:i + chunk_size]
                decoded = str(chunk, encoding=p, errors='ignore')

                probe_chaos = ProbeChaos(decoded)
                chaos_measure = probe_chaos.ratio
                ranges_encountered = probe_chaos.encountered_unicode_range_occurrences

                # Add this chunk's unicode range occurrences to the totals.
                for range_name, occurrences in ranges_encountered.items():
                    ranges_encountered_t[range_name] = (
                        ranges_encountered_t.get(range_name, 0) + occurrences)

                if chaos_measure > threshold:
                    # One chunk above the threshold rejects the codec.
                    working.pop(p, None)
                    break

                chaos_measures.append(chaos_measure)
                working.setdefault(p, dict())

            if p in working:
                working[p]['ratio'] = statistics.mean(chaos_measures)
                working[p]['ranges'] = ranges_encountered_t
                working[p]['chaos'] = sum(chaos_measures)
                working[p]['len'] = decoded_len_t

            # ASCII may have been rejected above (or never probed for an
            # empty input), so check membership before reading its ratio.
            if p == 'ascii' and p in working and working[p]['ratio'] == 0.:
                break

        return CharsetNormalizerMatches([
            CharsetNormalizerMatch(sequences, enc, working[enc]['ratio'],
                                   working[enc]['ranges'])
            for enc in (
                sorted(working.keys()) if py_need_sort else working.keys())
            if working[enc]['ratio'] <= threshold
        ])

예제 #17

0

파일 보기

파일: charset_inspect.py 프로젝트: firsov/charsetinspect

    print("You may not select more than one search position at a time")
    print("Please choose only one of the following arguments: -e, -s, -c")
    exit()
# Announce the search mode; "ends with" is used when neither starts_with nor
# contains is set.
if args.starts_with:
    print(
        "You are currently searching for a character set that begins with: " +
        args.needle)
elif args.contains:
    print("You are currently searching for a character set that contains: " +
          args.needle)
else:
    print("You are currently searching for a character set that ends with: " +
          args.needle)
# Every printable character in the full Unicode code point range.
chars = list(str for str in map(chr, range(0, 1114112)) if str.isprintable())
# Maps a codec name to the needle's encoded bytes, written as space-separated
# hex values.
search_needles = dict()
for v, encoding in aliases.items():
    for char in chars:
        try:
            if char == args.needle:
                search_needles[encoding] = ' '.join(
                    map(hex, char.encode(encoding)))
        # NOTE(review): only LookupError is handled here; str.encode can also
        # raise UnicodeEncodeError when the codec cannot represent the
        # needle - confirm whether that case is meant to abort the script.
        except LookupError:
            pass
for encoding, code in search_needles.items():
    for char in chars:
        try:
            if args.starts_with:
                if ' '.join(map(hex, char.encode(encoding))).startswith(
                        code) and char != args.needle:
                    print(encoding + " | " + char + " | " +
                          ' '.join(map(hex, char.encode(encoding))) + " | " +

예제 #18

0

파일 보기

파일: datasets.py 프로젝트: Nathanlauga/transparentai-ui

from ..utils import add_in_db, exists_in_db

# Controller for Dataset components, wired with its format, control and
# background module-loading callbacks.
dataset_controller = Controller(component=Dataset,
                                format_fn=format_dataset,
                                control_fn=control_dataset,
                                module_fn=load_dataset_modules_in_background)

# Controller for ModulePandasProfiling components, using the callbacks from
# the pandas_profiling module.
pandas_prof_controller = Controller(component=ModulePandasProfiling,
                                    format_fn=pandas_profiling.format_module,
                                    control_fn=pandas_profiling.control_module)

# Controller for ModuleBias components, using the callbacks from the bias
# module.
bias_controller = Controller(component=ModuleBias,
                             format_fn=bias.format_module,
                             control_fn=bias.control_module)

# Distinct codec names from the alias table, in alphabetical order.
encodings = sorted(set(aliases.values()))


def index():
    """Render datasets/index.html with the session, the header attributes
    and everything returned by ``dataset_controller.index()``."""
    title = _('Datasets')
    page_header = get_header_attributes()
    all_datasets = dataset_controller.index()

    context = dict(session=session, datasets=all_datasets, header=page_header)
    return render_template("datasets/index.html", **context)


def get_all_instances_json():
    datasets = dataset_controller.index()

예제 #19

0

파일 보기

파일: SecurityEditor.py 프로젝트: mauricelambert/SecurityEditor

    def __init__(self, parent):
        """Build the menu bar and attach it to ``parent.master``.

        One drop-down is created per feature group (file, themes,
        encryption, checksums, compression, Python tools, encodings, about)
        and each one is passed to ``self.config`` before the cascades are
        added to the menu bar.

        Args:
            parent: Object owning the window.  This method reads
                ``parent.master``, ``parent.constante``, ``parent.textarea``
                and binds several of its methods as menu commands.
        """

        menubar = tk.Menu(parent.master)
        # NOTE(review): self.config is defined outside this block; presumably
        # it applies the current theme/settings to the menu - confirm.
        self.config(menubar, parent)
        parent.master.config(menu = menubar)

        # --- File menu: new / open / re-open / save / info / find / quit ---
        file_dropdown = tk.Menu(menubar)
        file_dropdown.add_command(label = "Nouveau",
            accelerator = "Ctrl+N", command = parent.new_file)
        file_dropdown.add_command(label = "Ouvrir",
            accelerator = "Ctrl+O", command = parent.ask_file)

        # Re-open submenu: one entry per listed file that still exists.
        reopen_dropdown = tk.Menu(file_dropdown)
        for file in parent.constante.reopen_files :
            if path.isfile(file) :
                # file=file binds the current value; a plain closure would
                # make every entry open the last file of the loop.
                reopen_dropdown.add_command(label = file, 
                    command = lambda file = file : parent.open_file(file))
        self.config(reopen_dropdown, parent)
        file_dropdown.add_cascade(label = "Ré-ouvrir",
            menu = reopen_dropdown)
        
        file_dropdown.add_command(label = "Sauvegarder",
            accelerator = "Ctrl+S", command = parent.save)
        file_dropdown.add_command(label = "Sauvegarder sous...",
            accelerator = "Ctrl+Shift+S", command = parent.save_as)
        
        file_dropdown.add_separator()
        file_dropdown.add_command(label = "info", accelerator = "Ctrl+I",
            command = parent.get_info)
        file_dropdown.add_command(label = "Rechercher", accelerator = "Ctrl+F",
            command = parent.find)
        file_dropdown.add_separator()
        
        file_dropdown.add_command(label = "Quitter",
            command = parent.exit)
        self.config(file_dropdown, parent)

        # --- Theme menu: dark, light, or no theme (modif_config without a name) ---
        theme_dropdown = tk.Menu(menubar)
        theme_dropdown.add_command(label = "Sombre",
            command = lambda : self.modif_config(parent, "Sombre"))
        theme_dropdown.add_command(label = "Clair",
            command = lambda : self.modif_config(parent, "Clair"))
        theme_dropdown.add_command(label = "Pas de theme",
            command = lambda : self.modif_config(parent))
        self.config(theme_dropdown, parent)

        # --- Encryption menu ---
        # Each command is a lambda returning a tuple, so its steps run left to
        # right; the walrus assignments keep names such as f/key/file local to
        # the lambda.
        # "faible": the text is ROT13-encoded, then base16, then base85, and
        # written to <file>.crypt; decryption reverses those steps.
        # "fort": the text bytes go through File.XOR with a key asked from the
        # user.
        # NOTE(review): files are closed with explicit f.close() calls, so an
        # exception in an earlier tuple element leaves the handle open.
        crypt_dropdown = tk.Menu(menubar)
        crypt_dropdown.add_command(label = "Cryptage : faible (sans clés)", command = lambda : (f:=open(
            parent.constante.file_full_name+".crypt",'wb'),f.write(b85encode(b16encode(codecs.encode(parent.textarea.get(
                1.0,tk.END),"rot13").encode()))),f.close(),print("Le fichier a été crypté avec succès.")))
        crypt_dropdown.add_command(label = "Cryptage : fort (avec clés)", command = lambda : (key:=simpledialog.askstring(
            "Clef","Clef de cryptage",show="*"),f:=open(parent.constante.file_full_name+".crypt","wb"),f.write(File.XOR(
                parent.textarea.get(1.0,tk.END).encode(),key)),f.close(),print("Le fichier a été crypté avec succès")))
        crypt_dropdown.add_command(label = "Décryptage : faible (sans clés)", command = lambda : (
            file:=filedialog.askopenfilename(defaultextension=".crypt").replace("/", "\\"),f:=open(file,'rb'),code:=f.read(),
            f.close(),f:=open(file+".decrypt","w"),f.write(codecs.decode(b16decode(b85decode(code)).decode(),"rot13")
                ),f.close(),print("Le fichier a été décrypté avec succès.")))
        crypt_dropdown.add_command(label = "Décrypter : fort (avec clés)", command = lambda : (file:=filedialog.askopenfilename(
            defaultextension=".crypt").replace("/", "\\"),key:=simpledialog.askstring("Clef","Clef de cryptage",show="*"),
            f:=open(file,'rb'),code:=File.XOR(f.read(),key),f.close(),f:=open(file+".decrypt",'wb'),f.write(code),f.close(),
            print("Le fichier a été décrypté avec succès.")))
        self.config(crypt_dropdown, parent)

        # --- Checksum menu: print File.hashs for the selection (when there is
        # one) and for the whole text, or write them to <file>.hash ---
        checksums_dropdown = tk.Menu(menubar)
        checksums_dropdown.add_command(label = "Voir les checksums", command = lambda : (print(f"""SELECT :\n{File.hashs(
            parent.get_select())}""")if parent.get_select()else None,print(f"""FILE :\n{File.hashs(parent.textarea.get(1.0,
                tk.END))}""")))
        checksums_dropdown.add_command(label = "Généré un fichier de checksums", command = lambda : (f:=open(
            parent.constante.file_full_name+".hash","w", encoding = "utf-8"),f.write(File.hashs(parent.textarea.get(1.0,tk.END
                ))),f.close(),print("Le fichier a été créé avec succès.")))
        self.config(checksums_dropdown, parent)

        # --- Compression menu: delegates to File.compress / File.decompress;
        # decompression first asks for the archive and a password ---
        compress_dropdown = tk.Menu(menubar)
        compress_dropdown.add_command(label = "Compresser le fichier", command = lambda : File.compress(parent.constante))
        compress_dropdown.add_command(label = "Décompresser le fichier", command = lambda : (file:=filedialog.askopenfilename(
            defaultextension=".zip").replace("/", "\\"),pwd:=simpledialog.askstring("Mot de passe",
            "Mot de passe (facultatif) : ",show="*"),File.decompress(file,pwd)))
        self.config(compress_dropdown, parent)

        # --- Python menu: both commands act only when the current file name
        # ends in .py and print a message otherwise ---
        python_dropdown = tk.Menu(menubar)
        python_dropdown.add_command(label = "Compilation du fichier", command = lambda : (py_compile.compile(
            parent.constante.file_full_name,cfile=parent.constante.file_full_name+"c")if re.match(r"^(.*)\.py$",
            parent.constante.file_full_name)else print("Ce fichier n'est pas un fichier Python..."),
            print("Fin de la compilation")))
        python_dropdown.add_command(label = "Script en 1 ligne", command = lambda : (File.script_one_line(
            parent)if re.match(r"^(.*)\.py$", parent.constante.file_full_name)else print(
            "Ce fichier n'est pas un fichier Python..."),print("Fin de la transformation")))
        self.config(python_dropdown, parent)

        # --- Encoding menu: one entry per configured encoding (save, then
        # re-read the file with it), a hexadecimal view, and a command that
        # prints every alias name from encodings.aliases ---
        encoding_dropdown = tk.Menu(menubar)
        for encoding in parent.constante.config["general"]["encodings"] :
            # enc=encoding binds the current loop value to this entry.
            encoding_dropdown.add_command(label = encoding, command = lambda enc=encoding : (parent.save(),
                parent.read_file([enc])))
        encoding_dropdown.add_command(label = "Hexadecimal", command = lambda : (parent.save(),
                parent.read_file(["hex"])))
        encoding_dropdown.add_command(label = "Liste des encodings",
            command = lambda : [print(alias) for alias, enc in aliases.items()])
        self.config(encoding_dropdown, parent)

        # --- About menu ---
        about_dropdown = tk.Menu(menubar)
        about_dropdown.add_command(label = "Version",
                                   command = self.show_release_notes)
        about_dropdown.add_separator()
        about_dropdown.add_command(label = "A propos...",
                                   command = self.show_about_message)
        self.config(about_dropdown, parent)

        # Attach the drop-downs to the menu bar in display order.
        menubar.add_cascade(label = "Fichier", menu = file_dropdown)
        menubar.add_cascade(label = "Themes", menu = theme_dropdown)
        menubar.add_cascade(label = "Encoding", menu = encoding_dropdown)
        menubar.add_cascade(label = "Compression", menu = compress_dropdown)
        menubar.add_cascade(label = "Cryptage", menu = crypt_dropdown)
        menubar.add_cascade(label = "Checksums", menu = checksums_dropdown)
        menubar.add_cascade(label = "Script Python", menu = python_dropdown)
        # NOTE(review): this entry has an accelerator label but no command is
        # given here - confirm where the f5 action is bound.
        menubar.add_command(label = "Execute", accelerator = "f5")       
        menubar.add_cascade(label = "A propos", menu = about_dropdown)

예제 #20

0

파일 보기

    def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
        """
        Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported
        charset encoding.
        Will test input like this (with steps=4 & chunk_size=4) --> [####     ####     ####     ####]

        Each distinct codec from ``encodings.aliases`` is tried once. A codec is dropped when the payload cannot
        be decoded with it, when there is no chunk left to probe, or when the sampled chunks are too chaotic.
        A candidate sharing its fingerprint with an earlier match is attached to that match as a submatch. The
        search stops early on a chaos-free ASCII result or as soon as a byte order mark is recognised.

        :param bytes sequences: Actual sequence of bytes to analyse
        :param float threshold: Maximum amount of chaos allowed on first pass
        :param int chunk_size: Size to extract and analyse in each step
        :param int steps: Number of steps
        :return: List of potential matches
        :rtype: CharsetNormalizerMatches
        """
        py_v = [int(el) for el in python_version_tuple()]
        py_need_sort = py_v[0] < 3 or (py_v[0] == 3 and py_v[1] < 6)

        supported = sorted(aliases.items()) if py_need_sort else aliases.items()

        tested = set()
        matches = list()

        maximum_length = len(sequences)

        # Inputs that fit in one chunk are probed in a single step.
        if maximum_length <= chunk_size:
            chunk_size = maximum_length
            steps = 1

        # range() raises ValueError for a zero step, which used to happen for
        # an empty input and whenever `steps` exceeded the input length.
        step_size = max(1, int(maximum_length / steps))

        for _, p in supported:

            if p in tested:
                continue

            tested.add(p)

            bom_available = False
            bom_len = None

            try:
                if p in BYTE_ORDER_MARK:
                    marks = BYTE_ORDER_MARK[p]

                    if isinstance(marks, bytes) and sequences.startswith(marks):
                        bom_available = True
                        bom_len = len(marks)
                    elif isinstance(marks, list):
                        bom_c_list = [sequences.startswith(el) for el in marks]
                        if any(bom_c_list):
                            bom_available = True
                            bom_len = len(marks[bom_c_list.index(True)])

                # The payload is the input without its byte order mark.
                payload = sequences[bom_len:] if bom_available else sequences

                # First pass: the whole payload must decode with this codec.
                str(payload, encoding=p)

            except UnicodeDecodeError:
                continue
            except LookupError:
                continue

            r_ = range(
                bom_len if bom_available else 0,
                maximum_length,
                step_size
            )

            measures = [
                ProbeChaos(
                    str(sequences[i:i + chunk_size], encoding=p, errors='ignore'),
                    giveup_threshold=threshold
                )
                for i in r_
            ]

            if not measures:
                # Nothing to probe (empty input, or the byte order mark is
                # the whole input); statistics.mean() would raise on an empty
                # list, so this candidate is skipped.
                # NOTE(review): confirm skipping is the wanted outcome for an
                # input that consists of a byte order mark only.
                continue

            ratios = [el.ratio for el in measures]
            nb_gave_up = [el.gave_up is True or el.ratio >= threshold for el in measures].count(True)

            chaos_means = statistics.mean(ratios)
            chaos_median = statistics.median(ratios)

            # Reject when more than a quarter of the chunks gave up (only
            # checked from four chunks on) or the median chaos is too high.
            if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_median > threshold:
                continue

            # Sum the unicode range occurrences over all probed chunks.
            encountered_unicode_range_occurrences = dict()

            for el in measures:
                for u_name, u_occ in el.encountered_unicode_range_occurrences.items():
                    encountered_unicode_range_occurrences[u_name] = (
                        encountered_unicode_range_occurrences.get(u_name, 0) + u_occ)

            cnm = CharsetNormalizerMatch(
                payload,
                p,
                chaos_means,
                encountered_unicode_range_occurrences,
                bom_available
            )

            fingerprint_tests = [el.fingerprint == cnm.fingerprint for el in matches]

            if any(fingerprint_tests):
                matches[fingerprint_tests.index(True)].submatch.append(cnm)
            else:
                # Reuse the match built above instead of constructing an
                # identical second object from the same arguments.
                matches.append(cnm)

            if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
                return CharsetNormalizerMatches([matches[-1]])

        return CharsetNormalizerMatches(matches)