def __init__(self, type=1):
    """Set up the 10jqka "lhb" broker-seat crawler state.

    :param type: selects which URL table gets pre-built:
                 0 or 1 -> lhb list pages, 2 -> detail pages, 3 -> market pages.
    """
    # List-page endpoints, each paired with an (initially empty) result bucket.
    self.__sbcs_url = 'http://data.10jqka.com.cn/ifmarket/lhbyyb/type/1/tab/sbcs/field/sbcs/sort/desc/page/'
    self.__sbcs_list = []
    self.__zjsl_url = 'http://data.10jqka.com.cn/ifmarket/lhbyyb/type/1/tab/zjsl/field/zgczje/sort/desc/page/'
    self.__zjsl_list = []
    self.__btcz_url = 'http://data.10jqka.com.cn/ifmarket/lhbyyb/type/1/tab/btcz/field/xsjs/sort/desc/page/'
    self.__btcz_list = []
    self.__orgcode_url = 'http://data.10jqka.com.cn/ifmarket/xtyyb/orgcode/'
    self.__orgcode_list = []
    self.__market_lhb_url = 'http://data.10jqka.com.cn/market/lhbyyb/orgcode/'
    self.__market_lhb_list = []
    # Build the concrete URL set for the requested mode.
    if type in (0, 1):
        self.__init_lhburl()
    elif type == 2:
        self.__init_detailurl()
    elif type == 3:
        self.__init_marketurl()
    self.__client = Client('data.10jqka.com.cn')
    self.__detector = icu.CharsetDetector()
def inteli_open(__path, opt='rb'):
    """Open *__path* with its automatically detected character encoding.

    Reads the file's raw bytes, lets ICU's charset detector guess the
    encoding, then re-opens the file via ``codecs.open`` with that encoding.

    :param __path: path of the file to open.
    :param opt: mode string passed through to ``codecs.open`` (default 'rb').
    :return: tuple of ``(open file object, detected encoding name)``.
    """
    # Read the raw bytes for detection; the context manager closes the
    # handle on exit, so the explicit stuff.close() the original called
    # inside the with-block was redundant (a double close) and is removed.
    with open(__path, 'rb') as raw_file:
        data = raw_file.read()
    coding = icu.CharsetDetector(data).detect().getName()
    print(__path, ' - Encoding : ', coding)
    # Re-open in the requested mode, decoding with the detected charset.
    stuff = codecs.open(__path, opt, encoding=coding)
    return stuff, coding
def detect_encoding(filename, default_to_utf8=True, **kwargs):
    """
    Detect the named file's character encoding.

    If the first parts of the file appear to be ASCII, this function returns
    'UTF-8', as that's a safe superset of ASCII. This can be switched off by
    changing the `default_to_utf8` parameter.

    :param str filename: The name of the file to detect the encoding of.
    :param bool default_to_utf8: Defaults to True. Set to False to disable
        treating ASCII files as UTF-8.
    :param bool read_all: Keyword argument; if True, reads the entire file
        for encoding detection.
    :return: The file encoding.
    :rtype: str
    """
    import os.path
    import codecs

    filename = from_posix(filename)

    # Sample only the head of the file unless the caller asked for all of it.
    total_len = os.path.getsize(filename)
    sample_len = total_len if kwargs.get('read_all', False) else min(32, total_len)

    with open(filename, 'rb') as handle:
        sample = handle.read(sample_len)

    # An explicit UTF-8 BOM settles the question immediately.
    if sample.startswith(codecs.BOM_UTF8):
        return 'utf-8-sig'

    try:
        # Prefer ICU when available; it reports ASCII heads as ISO-8859-1,
        # which isn't exactly wrong, but maybe optimistic.
        import icu
        encoding = icu.CharsetDetector(sample).detect().getName().lower()
    except ImportError:
        # Fall back to chardet - no native components, but less precise.
        import chardet
        encoding = chardet.detect(sample)['encoding'].lower()

    # chardet reports 'ascii' when no byte has a high bit set; bump it up
    # to emulate ICU's answer.
    if encoding == 'ascii':
        encoding = 'iso-8859-1'

    # Optionally upgrade ASCII-ish verdicts to the safe superset UTF-8.
    if default_to_utf8 and encoding in ('ascii', 'iso-8859-1'):
        encoding = 'utf-8'

    return encoding
def _get_encoding(self, csvFileName):
    """Guess the character encoding of *csvFileName*.

    Falls back to the locale's preferred encoding when ICU support is
    unavailable (``self.icuOk`` is false); otherwise feeds a sample of
    ``self.sampleSize`` characters to ICU's charset detector.
    """
    # Without ICU we can only trust the system locale.
    if not self.icuOk:
        return locale.getpreferredencoding()
    # Read a bounded sample so huge files stay cheap to probe.
    with open(csvFileName) as handle:
        snippet = handle.read(self.sampleSize)
    detector = icu.CharsetDetector()
    detector.setText(snippet)
    return detector.detect().getName()
def convert(self, data, outputCoding='utf-8'):
    # Re-encode the byte string *data* to *outputCoding* (default UTF-8).
    # NOTE(review): Python 2 code ('print' statement, 'unicode' builtin).
    # Ask ICU to guess the source charset from the raw bytes.
    coding = icu.CharsetDetector(data).detect().getName()
    # Log the detected source encoding.
    print coding
    # Transcode only when the detected charset differs (case-insensitive);
    # undecodable sequences are substituted ("replace") rather than raising.
    if outputCoding.upper() != coding.upper():
        data = unicode(data, coding, "replace").encode(outputCoding)
    return data
def __init__(self, name, type, timeout=5):
    """Bundle a SQLiteExt engine with an ICU charset detector.

    :param name: database name; also stored on the instance.
    :param type: engine type, forwarded verbatim to SQLiteExt.
    :param timeout: engine timeout, default 5.
    """
    # Detector is built before the engine, preserving the original order
    # of side effects.
    self.detector = icu.CharsetDetector()
    self.engine = SQLiteExt(name, type, timeout)
    self.name = name
    # No requests are waiting yet.
    self.wait_queue = []
def main(argv):
    # Fetch an airport's recommended scenery pack from the X-Plane gateway
    # API, unpack the apt.dat from the base64 zip blob it returns, and load
    # each airport record into a PostgreSQL table via insert_or_update().
    # NOTE(review): Python 2 script (httplib, 'print' statement, 'unicode').
    # Usage: -h help, -i ICAO (default KATL), -s explicit scenery id.
    icao = "KATL"
    sid = ""
    sids = []
    author = ""
    # Connect to the database up front; bail out if unreachable.
    try:
        conn = psycopg2.connect(**db_params)
    except:
        print("Cannot connect to database.", db_params)
        sys.exit()
    cur = conn.cursor()
    # Parse command-line options.
    try:
        opts, args = getopt.getopt(argv, "hi:s:")
    except getopt.GetoptError:
        print(helptext)
        sys.exit(2)
    for opt, arg in opts:
        if opt == "-h":
            print(helptext)
            sys.exit()
        elif opt == "-i":
            icao = str.upper(arg)
        elif opt == "-s":
            sid = arg
    htconn = httplib.HTTPConnection("gateway.x-plane.com")
    # No explicit scenery id given: ask the gateway which pack it recommends.
    if sid == "":
        htconn.request("GET", "/apiv1/airport/%s" % icao)
        r1 = htconn.getresponse()
        r2 = r1.read()
        result = json.loads(r2)
        sid = result["airport"]["recommendedSceneryId"]
        sceneries = result["airport"]["scenery"]
        for s in sceneries:
            # Dump every scenery field except bookkeeping/ID noise.
            for k2, v2 in s.items():
                if k2 in [
                        "dateDeclined", "dateAccepted", "DateAccepted",
                        "DateApproved", "DateDeclined", "userId", "type"
                ]:
                    pass
                else:
                    print(k2, v2)
            if s["Status"] == "Approved":
                sids.append(s["sceneryId"])
            if s["sceneryId"] == sid:
                author = s["userName"]
            print("-----------------")
        print("approved scenery ids for %s: %s" % (icao, sids))
        #print("highest approved id:", max(sids))
        print("recommended SceneryId: %s by author: %s" % (sid, author))
    # Fetch the chosen scenery pack itself.
    htconn.request("GET", "/apiv1/scenery/%s" % str(sid))
    r1 = htconn.getresponse()
    r2 = r1.read()
    result = json.loads(r2)
    # The pack arrives as a base64-encoded zip blob.
    zip_base64 = result["scenery"]["masterZipBlob"]
    zip_blob = base64.b64decode(zip_base64)
    print("writing %s.zip" % icao)
    # file = open("%s.zip" % icao, "wb")
    # file.write(zip_blob)
    # file.close()
    # Open the zip in memory - no temp file needed.
    zip_bytearray = io.BytesIO(zip_blob)
    zip_fhandle = zipfile.ZipFile(zip_bytearray)
    print("reading %s.zip" % icao)
    # myZip = zipfile.ZipFile("%s.zip" % icao, "r")
    datstring = zip_fhandle.read("%s.dat" % icao)
    # A companion .txt marks a 3D scenery pack; its absence means 2D only.
    try:
        txtstring = zip_fhandle.read("%s.txt" % icao)
    except:
        print "(2D)"
    else:
        print " 3D :-)"
    dat_handle = StringIO()
    dat_handle.write(datstring)
    dat_handle.seek(0)
    print("writing %s.dat" % icao)
    # myZip.extract("%s.dat" % icao)
    #
    # print("deleting %s.zip" % icao)
    # os.remove("%s.zip" % icao)
    #
    print(datstring)
    #################################
    # apt.dat header/record layout notes:
    #I
    #1000 Version - data cycle 2013.10, build 20131335, metadata AptXP1000.
    #
    #1  1906 1 0 LOWI Innsbruck Kranebitten
    #01234567890123456789
    #          1 ^
    # "2015" file:
    #A
    #1000 Generated by WorldEditor
    #
    #1  1470 0 0 0B7 Warren-Sugarbush
    #01234567890123456789
    #          1 ^
    #XP10 custom cenery pack:
    #A
    #1000 Generated by WorldEditor
    #
    #1  1906 1 0 LOWI Innsbruck Kranebitten
    #01234567890123456789
    #          1 ^
    #   elev 0 0 ICAO Name
    #known bugs:
    # LOWI #invalid byte sequence for encoding "UTF8": 0xf6 0x64 0x20 0x74
    # EDRZ #invalid byte sequence for encoding "UTF8": 0xfc
    # fix:
    # iconv...
    icao = ""
    counter = 0
    # filename = "apt.dat.lowi-in"
    # main loop: walk the apt.dat lines, batching each airport's rows into
    # one $$-delimited string (linearray) and flushing it on the next header.
    for line in dat_handle:
        line = line.replace("\r\n", os.linesep)
        line = line.strip()
        # print(line)
        # 1 for airports, 16 for seaports, ....
        if line.startswith("1 ") or line.startswith("16 ") or line.startswith(
                "17 "):
            #the previous icao:
            if icao != "":
                #for testing
                #if icao == "LOWI":
                # write previous airport to DB
                print(icao, counter)
                counter = counter + 1
                insert_or_update(cur, icao, linearray)
                # Commit every 1000 airports to bound transaction size.
                if (counter % 1000 == 0):
                    conn.commit()
                    print("=============COMMIT==============")
            #the next airport:
            apt_header = line.split()
            icao = apt_header[4]
            name = ' '.join(apt_header[5:])
            #print(icao, name)
            linearray = "$$%s$$" % line
        else:
            #read all the lines of that airport
            if icao != "" and line != "" and line != "99":
                linearray += ", $$%s$$" % line
    # last airport in apt.dat:
    # Some packs are not UTF-8 (see known bugs above); transcode before insert.
    encoding = icu.CharsetDetector(linearray).detect().getName()
    new_encoding = "utf-8"
    if new_encoding.upper() != encoding.upper():
        # encoding = "windows-1250"
        # encoding = "cp1252"
        # encoding = "utf-8"
        # encoding = "latin1"
        print(" --- Converting from file encoding: %s" % str(encoding))
        linearray = unicode(linearray, encoding).encode(new_encoding).replace(
            "\r\n", os.linesep)
    if icao != "":
        # print(linearray)
        insert_or_update(cur, icao, linearray)
        print(icao, counter)
    conn.commit()
    cur.close()
    conn.close()
def detect_encoding(filename, default_to_utf8=True, **kwargs):
    """
    Detect the named file's character encoding.

    If the first parts of the file appear to be ASCII, this function returns
    "UTF-8", as that's a safe superset of ASCII. This can be switched off by
    changing the `default_to_utf8` parameter.

    :param str filename: The name of the file to detect the encoding of.
    :param bool default_to_utf8: Defaults to True. Set to False to disable
        treating ASCII files as UTF-8.
    :param bool read_all: Keyword argument; if True, reads the entire file
        for encoding detection.
    :return: The file encoding.
    :rtype: str
    """
    import os.path
    import codecs

    filename = from_posix(filename)

    # Decide how much of the file to sample for detection.
    total_len = os.path.getsize(filename)
    sample_len = total_len if kwargs.get("read_all", False) \
        else min(_READ_CHUNK_SIZE, total_len)

    with open(filename, "rb") as handle:
        sample = handle.read(sample_len)

    # An explicit UTF-8 BOM settles the question immediately.
    if sample.startswith(codecs.BOM_UTF8):
        return "utf-8-sig"

    try:
        # ICU is the preferred detector; it reports ASCII heads as
        # ISO-8859-1, which isn't exactly wrong, but maybe optimistic.
        import icu
        encoding = icu.CharsetDetector(sample).detect().getName().lower()
    except ImportError:  # pragma: nocover
        # chardet is the pure-python fallback - no native components,
        # but less precise.
        import chardet
        encoding = chardet.detect(sample)["encoding"].lower()

    # chardet reports 'ascii' when no byte has a high bit set; bump it up
    # to emulate ICU.
    if encoding == "ascii":
        encoding = "iso-8859-1"

    # Both chardet and ICU may report ISO-8859-x for data that is really
    # UTF-8, so verify by attempting a UTF-8 decode before defaulting.
    if default_to_utf8 and encoding in ("ascii", "iso-8859-1", "windows-1252"):
        try:
            sample.decode("utf-8")
            # If this worked... well there's no guarantee it's utf-8, to
            # be honest - but it is at least consistent with it.
            encoding = "utf-8"
        except UnicodeDecodeError:
            # Decoding as utf-8 failed, so we can't default to it.
            pass

    return encoding
def convertData(data):
    # Re-encode the byte string *data* to the module-level targetFormat.
    # NOTE(review): Python 2 code ('unicode' builtin); targetFormat is
    # presumably a global encoding name - verify where it is set.
    # detect encoding with IBMs ICU Detector... python wrapper of the system lib :)
    coding = icu.CharsetDetector(data).detect().getName()
    # Transcode only when the detected charset differs (case-insensitive).
    if targetFormat.upper() != coding.upper():
        data = unicode(data, coding).encode(targetFormat)
    return data
def convert_encoding(data, new_coding='UTF-8'):
    # Transcode the byte string *data* to *new_coding* (default UTF-8).
    # NOTE(review): Python 2 code ('unicode' builtin).
    # Ask ICU to guess the source charset from the raw bytes.
    coding = icu.CharsetDetector(data).detect().getName()
    # Transcode only when the detected charset differs (case-insensitive).
    if new_coding.upper() != coding.upper():
        data = unicode(data, coding).encode(new_coding)
    return data
def convert_encoding(self, data, new_coding='UTF-8'):
    """Return *data* transcoded to *new_coding* (default UTF-8).

    The source charset is guessed with ICU's detector; when it already
    matches *new_coding* (case-insensitively) the bytes pass through
    unchanged.
    """
    detected = icu.CharsetDetector(data).detect().getName()
    print("Detected coding {}".format(detected))
    if detected.upper() != new_coding.upper():
        data = data.decode(detected).encode(new_coding)
    return data