def __init__(self, type=1):
    """Set up the 10jqka "lhb" broker-seat crawler state.

    :param type: selects which URL table gets pre-built:
                 0 or 1 -> lhb list pages, 2 -> detail pages, 3 -> market pages.
    """
    # List-page endpoints, each paired with an (initially empty) result bucket.
    self.__sbcs_url = 'http://data.10jqka.com.cn/ifmarket/lhbyyb/type/1/tab/sbcs/field/sbcs/sort/desc/page/'
    self.__sbcs_list = []
    self.__zjsl_url = 'http://data.10jqka.com.cn/ifmarket/lhbyyb/type/1/tab/zjsl/field/zgczje/sort/desc/page/'
    self.__zjsl_list = []
    self.__btcz_url = 'http://data.10jqka.com.cn/ifmarket/lhbyyb/type/1/tab/btcz/field/xsjs/sort/desc/page/'
    self.__btcz_list = []
    self.__orgcode_url = 'http://data.10jqka.com.cn/ifmarket/xtyyb/orgcode/'
    self.__orgcode_list = []
    self.__market_lhb_url = 'http://data.10jqka.com.cn/market/lhbyyb/orgcode/'
    self.__market_lhb_list = []
    # Build the concrete URL set for the requested mode.
    if type in (0, 1):
        self.__init_lhburl()
    elif type == 2:
        self.__init_detailurl()
    elif type == 3:
        self.__init_marketurl()
    self.__client = Client('data.10jqka.com.cn')
    self.__detector = icu.CharsetDetector()
def inteli_open(__path, opt='rb'):
    """Open *__path* with its automatically detected character encoding.

    Reads the file's raw bytes, lets ICU's charset detector guess the
    encoding, then re-opens the file via ``codecs.open`` with that encoding.

    :param __path: path of the file to open.
    :param opt: mode string passed through to ``codecs.open`` (default 'rb').
    :return: tuple of ``(open file object, detected encoding name)``.
    """
    # Read the raw bytes for detection; the context manager closes the
    # handle on exit, so the explicit stuff.close() the original called
    # inside the with-block was redundant (a double close) and is removed.
    with open(__path, 'rb') as raw_file:
        data = raw_file.read()
    coding = icu.CharsetDetector(data).detect().getName()
    print(__path, ' - Encoding : ', coding)
    # Re-open in the requested mode, decoding with the detected charset.
    stuff = codecs.open(__path, opt, encoding=coding)
    return stuff, coding
def detect_encoding(filename, default_to_utf8=True, **kwargs):
    """
    Detect the named file's character encoding.

    If the first parts of the file appear to be ASCII, this function returns
    'UTF-8', as that's a safe superset of ASCII. This can be switched off by
    changing the `default_to_utf8` parameter.

    :param str filename: The name of the file to detect the encoding of.
    :param bool default_to_utf8: Defaults to True. Set to False to disable
        treating ASCII files as UTF-8.
    :param bool read_all: Keyword argument; if True, reads the entire file
        for encoding detection.
    :return: The file encoding.
    :rtype: str
    """
    import os.path
    import codecs

    filename = from_posix(filename)

    # Sample only the head of the file unless the caller asked for all of it.
    total_len = os.path.getsize(filename)
    sample_len = total_len if kwargs.get('read_all', False) else min(32, total_len)

    with open(filename, 'rb') as handle:
        sample = handle.read(sample_len)

    # An explicit UTF-8 BOM settles the question immediately.
    if sample.startswith(codecs.BOM_UTF8):
        return 'utf-8-sig'

    try:
        # Prefer ICU when available; it reports ASCII heads as ISO-8859-1,
        # which isn't exactly wrong, but maybe optimistic.
        import icu
        encoding = icu.CharsetDetector(sample).detect().getName().lower()
    except ImportError:
        # Fall back to chardet - no native components, but less precise.
        import chardet
        encoding = chardet.detect(sample)['encoding'].lower()

    # chardet reports 'ascii' when no byte has a high bit set; bump it up
    # to emulate ICU's answer.
    if encoding == 'ascii':
        encoding = 'iso-8859-1'

    # Optionally upgrade ASCII-ish verdicts to the safe superset UTF-8.
    if default_to_utf8 and encoding in ('ascii', 'iso-8859-1'):
        encoding = 'utf-8'

    return encoding
def _get_encoding(self, csvFileName):
    """Guess the character encoding of *csvFileName*.

    Falls back to the locale's preferred encoding when ICU support is
    unavailable (``self.icuOk`` is false); otherwise feeds a sample of
    ``self.sampleSize`` characters to ICU's charset detector.
    """
    # Without ICU we can only trust the system locale.
    if not self.icuOk:
        return locale.getpreferredencoding()
    # Read a bounded sample so huge files stay cheap to probe.
    with open(csvFileName) as handle:
        snippet = handle.read(self.sampleSize)
    detector = icu.CharsetDetector()
    detector.setText(snippet)
    return detector.detect().getName()
def convert(self, data, outputCoding='utf-8'):
    # Re-encode the byte string *data* to *outputCoding* (default UTF-8).
    # NOTE(review): Python 2 code ('print' statement, 'unicode' builtin).
    # Ask ICU to guess the source charset from the raw bytes.
    coding = icu.CharsetDetector(data).detect().getName()
    # Log the detected source encoding.
    print coding
    # Transcode only when the detected charset differs (case-insensitive);
    # undecodable sequences are substituted ("replace") rather than raising.
    if outputCoding.upper() != coding.upper():
        data = unicode(data, coding, "replace").encode(outputCoding)
    return data
def __init__(self, name, type, timeout=5):
    """Bundle a SQLiteExt engine with an ICU charset detector.

    :param name: database name; also stored on the instance.
    :param type: engine type, forwarded verbatim to SQLiteExt.
    :param timeout: engine timeout, default 5.
    """
    # Detector is built before the engine, preserving the original order
    # of side effects.
    self.detector = icu.CharsetDetector()
    self.engine = SQLiteExt(name, type, timeout)
    self.name = name
    # No requests are waiting yet.
    self.wait_queue = []
def main(argv):
    # Fetch an airport's recommended scenery pack from the X-Plane gateway
    # API, unpack the apt.dat from the base64 zip blob it returns, and load
    # each airport record into a PostgreSQL table via insert_or_update().
    # NOTE(review): Python 2 script (httplib, 'print' statement, 'unicode').
    # Usage: -h help, -i ICAO (default KATL), -s explicit scenery id.
    icao = "KATL"
    sid = ""
    sids = []
    author = ""
    # Connect to the database up front; bail out if unreachable.
    try:
        conn = psycopg2.connect(**db_params)
    except:
        print("Cannot connect to database.", db_params)
        sys.exit()
    cur = conn.cursor()
    # Parse command-line options.
    try:
        opts, args = getopt.getopt(argv, "hi:s:")
    except getopt.GetoptError:
        print(helptext)
        sys.exit(2)
    for opt, arg in opts:
        if opt == "-h":
            print(helptext)
            sys.exit()
        elif opt == "-i":
            icao = str.upper(arg)
        elif opt == "-s":
            sid = arg
    htconn = httplib.HTTPConnection("gateway.x-plane.com")
    # No explicit scenery id given: ask the gateway which pack it recommends.
    if sid == "":
        htconn.request("GET", "/apiv1/airport/%s" % icao)
        r1 = htconn.getresponse()
        r2 = r1.read()
        result = json.loads(r2)
        sid = result["airport"]["recommendedSceneryId"]
        sceneries = result["airport"]["scenery"]
        for s in sceneries:
            # Dump every scenery field except bookkeeping/ID noise.
            for k2, v2 in s.items():
                if k2 in [
                        "dateDeclined", "dateAccepted", "DateAccepted",
                        "DateApproved", "DateDeclined", "userId", "type"
                ]:
                    pass
                else:
                    print(k2, v2)
            if s["Status"] == "Approved":
                sids.append(s["sceneryId"])
            if s["sceneryId"] == sid:
                author = s["userName"]
            print("-----------------")
        print("approved scenery ids for %s: %s" % (icao, sids))
        #print("highest approved id:", max(sids))
        print("recommended SceneryId: %s by author: %s" % (sid, author))
    # Fetch the chosen scenery pack itself.
    htconn.request("GET", "/apiv1/scenery/%s" % str(sid))
    r1 = htconn.getresponse()
    r2 = r1.read()
    result = json.loads(r2)
    # The pack arrives as a base64-encoded zip blob.
    zip_base64 = result["scenery"]["masterZipBlob"]
    zip_blob = base64.b64decode(zip_base64)
    print("writing %s.zip" % icao)
    # file = open("%s.zip" % icao, "wb")
    # file.write(zip_blob)
    # file.close()
    # Open the zip in memory - no temp file needed.
    zip_bytearray = io.BytesIO(zip_blob)
    zip_fhandle = zipfile.ZipFile(zip_bytearray)
    print("reading %s.zip" % icao)
    # myZip = zipfile.ZipFile("%s.zip" % icao, "r")
    datstring = zip_fhandle.read("%s.dat" % icao)
    # A companion .txt marks a 3D scenery pack; its absence means 2D only.
    try:
        txtstring = zip_fhandle.read("%s.txt" % icao)
    except:
        print "(2D)"
    else:
        print " 3D :-)"
    dat_handle = StringIO()
    dat_handle.write(datstring)
    dat_handle.seek(0)
    print("writing %s.dat" % icao)
    # myZip.extract("%s.dat" % icao)
    #
    # print("deleting %s.zip" % icao)
    # os.remove("%s.zip" % icao)
    #
    print(datstring)
    #################################
    # apt.dat header/record layout notes:
    #I
    #1000 Version - data cycle 2013.10, build 20131335, metadata AptXP1000.
    #
    #1  1906 1 0 LOWI Innsbruck Kranebitten
    #01234567890123456789
    #          1 ^
    # "2015" file:
    #A
    #1000 Generated by WorldEditor
    #
    #1  1470 0 0 0B7 Warren-Sugarbush
    #01234567890123456789
    #          1 ^
    #XP10 custom cenery pack:
    #A
    #1000 Generated by WorldEditor
    #
    #1  1906 1 0 LOWI Innsbruck Kranebitten
    #01234567890123456789
    #          1 ^
    #   elev 0 0 ICAO Name
    #known bugs:
    # LOWI #invalid byte sequence for encoding "UTF8": 0xf6 0x64 0x20 0x74
    # EDRZ #invalid byte sequence for encoding "UTF8": 0xfc
    # fix:
    # iconv...
    icao = ""
    counter = 0
    # filename = "apt.dat.lowi-in"
    # main loop: walk the apt.dat lines, batching each airport's rows into
    # one $$-delimited string (linearray) and flushing it on the next header.
    for line in dat_handle:
        line = line.replace("\r\n", os.linesep)
        line = line.strip()
        # print(line)
        # 1 for airports, 16 for seaports, ....
        if line.startswith("1 ") or line.startswith("16 ") or line.startswith(
                "17 "):
            #the previous icao:
            if icao != "":
                #for testing
                #if icao == "LOWI":
                # write previous airport to DB
                print(icao, counter)
                counter = counter + 1
                insert_or_update(cur, icao, linearray)
                # Commit every 1000 airports to bound transaction size.
                if (counter % 1000 == 0):
                    conn.commit()
                    print("=============COMMIT==============")
            #the next airport:
            apt_header = line.split()
            icao = apt_header[4]
            name = ' '.join(apt_header[5:])
            #print(icao, name)
            linearray = "$$%s$$" % line
        else:
            #read all the lines of that airport
            if icao != "" and line != "" and line != "99":
                linearray += ", $$%s$$" % line
    # last airport in apt.dat:
    # Some packs are not UTF-8 (see known bugs above); transcode before insert.
    encoding = icu.CharsetDetector(linearray).detect().getName()
    new_encoding = "utf-8"
    if new_encoding.upper() != encoding.upper():
        # encoding = "windows-1250"
        # encoding = "cp1252"
        # encoding = "utf-8"
        # encoding = "latin1"
        print(" --- Converting from file encoding: %s" % str(encoding))
        linearray = unicode(linearray, encoding).encode(new_encoding).replace(
            "\r\n", os.linesep)
    if icao != "":
        # print(linearray)
        insert_or_update(cur, icao, linearray)
        print(icao, counter)
    conn.commit()
    cur.close()
    conn.close()
def detect_encoding(filename, default_to_utf8=True, **kwargs):
    """
    Detect the named file's character encoding.

    If the first parts of the file appear to be ASCII, this function returns
    "UTF-8", as that's a safe superset of ASCII. This can be switched off by
    changing the `default_to_utf8` parameter.

    :param str filename: The name of the file to detect the encoding of.
    :param bool default_to_utf8: Defaults to True. Set to False to disable
        treating ASCII files as UTF-8.
    :param bool read_all: Keyword argument; if True, reads the entire file
        for encoding detection.
    :return: The file encoding.
    :rtype: str
    """
    import os.path
    import codecs

    filename = from_posix(filename)

    # Decide how much of the file to sample for detection.
    total_len = os.path.getsize(filename)
    sample_len = total_len if kwargs.get("read_all", False) \
        else min(_READ_CHUNK_SIZE, total_len)

    with open(filename, "rb") as handle:
        sample = handle.read(sample_len)

    # An explicit UTF-8 BOM settles the question immediately.
    if sample.startswith(codecs.BOM_UTF8):
        return "utf-8-sig"

    try:
        # ICU is the preferred detector; it reports ASCII heads as
        # ISO-8859-1, which isn't exactly wrong, but maybe optimistic.
        import icu
        encoding = icu.CharsetDetector(sample).detect().getName().lower()
    except ImportError:  # pragma: nocover
        # chardet is the pure-python fallback - no native components,
        # but less precise.
        import chardet
        encoding = chardet.detect(sample)["encoding"].lower()

    # chardet reports 'ascii' when no byte has a high bit set; bump it up
    # to emulate ICU.
    if encoding == "ascii":
        encoding = "iso-8859-1"

    # Both chardet and ICU may report ISO-8859-x for data that is really
    # UTF-8, so verify by attempting a UTF-8 decode before defaulting.
    if default_to_utf8 and encoding in ("ascii", "iso-8859-1", "windows-1252"):
        try:
            sample.decode("utf-8")
            # If this worked... well there's no guarantee it's utf-8, to
            # be honest - but it is at least consistent with it.
            encoding = "utf-8"
        except UnicodeDecodeError:
            # Decoding as utf-8 failed, so we can't default to it.
            pass

    return encoding
def convertData(data):
    # Re-encode the byte string *data* to the module-level targetFormat.
    # NOTE(review): Python 2 code ('unicode' builtin); targetFormat is
    # presumably a global encoding name - verify where it is set.
    # detect encoding with IBMs ICU Detector... python wrapper of the system lib :)
    coding = icu.CharsetDetector(data).detect().getName()
    # Transcode only when the detected charset differs (case-insensitive).
    if targetFormat.upper() != coding.upper():
        data = unicode(data, coding).encode(targetFormat)
    return data
def convert_encoding(data, new_coding='UTF-8'):
    # Transcode the byte string *data* to *new_coding* (default UTF-8).
    # NOTE(review): Python 2 code ('unicode' builtin).
    # Ask ICU to guess the source charset from the raw bytes.
    coding = icu.CharsetDetector(data).detect().getName()
    # Transcode only when the detected charset differs (case-insensitive).
    if new_coding.upper() != coding.upper():
        data = unicode(data, coding).encode(new_coding)
    return data
def convert_encoding(self, data, new_coding='UTF-8'):
    """Return *data* transcoded to *new_coding* (default UTF-8).

    The source charset is guessed with ICU's detector; when it already
    matches *new_coding* (case-insensitively) the bytes pass through
    unchanged.
    """
    detected = icu.CharsetDetector(data).detect().getName()
    print("Detected coding {}".format(detected))
    if detected.upper() != new_coding.upper():
        data = data.decode(detected).encode(new_coding)
    return data