Exemplo n.º 1
0
def _OSdownload(SubId, SubCodec):
    """Download one subtitle through the OpenSubtitles API and return its text.

    The payload is base64-decoded, gunzipped and then decoded to text with
    the codec selected below. Every failure path (no-op check failed, API
    call raised, non-200 status, empty or corrupt payload, decode error) is
    logged and returns ``None``; nothing is raised to the caller.

    SubId    -- subtitle id handed to the ``DownloadSubtitles`` API call.
    SubCodec -- encoding name reported for the subtitle, or a falsy value
                when no encoding is known.
    """
    log.debug("Download subtitle: %s" % SubId)
    # Fixed pause before each download (presumably API rate limiting -- confirm).
    time.sleep(6)
    if not OS_NoOp():
        return None
    try:
        Result = autosub.OPENSUBTITLESSERVER.DownloadSubtitles(
            autosub.OPENSUBTITLESTOKEN, [SubId])
    except Exception:
        # The stored token is cleared whenever the API call itself fails.
        autosub.OPENSUBTITLESTOKEN = None
        log.error('Error from Opensubtitles download API. DownloadId is: %s' %
                  SubId)
        return None

    if Result['status'] != '200 OK':
        # A 406 status also clears the stored token.
        if Result['status'][:3] == '406':
            autosub.OPENSUBTITLESTOKEN = None
        log.error('Message : %s' % Result['status'])
        return None

    try:
        # NOTE: the 'base64' str codec only exists on Python 2.
        CompressedData = Result['data'][0]['data'].decode('base64')
    except Exception as error:
        log.error(
            'Error decompressing sub from opensubtitles. Message is: %s' %
            error)
        return None
    if not CompressedData:
        log.debug(
            'No data returned from DownloadSubtitles API call. Skipping this one.'
        )
        return None
    try:
        # A truncated or corrupt gzip stream raises here; treat it like any
        # other decompression failure instead of crashing the caller.
        SubDataBytes = gzip.GzipFile(fileobj=io.BytesIO(CompressedData)).read()
    except Exception as error:
        log.error(
            'Error decompressing sub from opensubtitles. Message is: %s' %
            error)
        return None

    # Opensubtitles makes no difference between UTF-8 and UTF8-SIG so we check
    # the correct encoding with chardet. If the encoding cannot be determined
    # we assume windows-1252 is used.
    if SubCodec:
        if 'UTF' in SubCodec.upper() or SubCodec == 'Unknown':
            # chardet reports None when it cannot guess.
            SubCodec = chardet.detect(SubDataBytes)['encoding'] or u'cp1252'
        elif '1252' in SubCodec:
            SubCodec = u'cp1252'
        elif '850' in SubCodec:
            SubCodec = u'cp850'
    else:
        SubCodec = chardet.detect(SubDataBytes)['encoding']
        # Guard against None before calling .upper() on the detected name.
        if not SubCodec or 'UTF' not in SubCodec.upper():
            SubCodec = u'cp1252'
    try:
        SubData = SubDataBytes.decode(SubCodec, errors='replace')
    except Exception as error:
        log.error('Error decoding sub from opensubtitles. Message is: %s' %
                  error)
        return None
    return SubData
Exemplo n.º 2
0
def _getzip(Session, url):
    """Download a zip archive and return the first usable subtitle as text.

    The archive at *url* is fetched with ``Session.get`` and searched for
    members whose lower-cased name ends in ``srt``. The first member that
    decodes to a non-empty string is returned. ``None`` is returned when the
    download fails, the payload is not a zip file, or no member qualifies.

    Session -- object providing a requests-style ``get`` method.
    url     -- address of the zip archive.
    """
    try:
        Result = Session.get(url, verify=autosub.CERTIFICATEPATH, timeout=22)
    except Exception:
        log.debug("Zip file at %s couldn't be retrieved" % url)
        return None
    try:
        Archive = zipfile.ZipFile(io.BytesIO(Result.content))
    except Exception:
        log.debug("Expected a zip file but got error for link %s" % url)
        log.debug("%s is likely a dead link" % url)
        return None
    try:
        for name in Archive.namelist():
            # sometimes .nfo files are in the zip container
            if not name.lower().endswith('srt'):
                continue
            try:
                Data = Archive.read(name)
                if Data.startswith(codecs.BOM_UTF8):
                    # Strip the 3-byte UTF-8 BOM before decoding.
                    SubData = unicode(Data[3:], 'UTF-8')
                else:
                    Codec = chardet.detect(Data)['encoding']
                    if not Codec:
                        # chardet could not guess; skip this member with a
                        # readable message instead of a TypeError.
                        log.error(
                            "Could not detect the encoding of %s in zip file %s"
                            % (name, url))
                        continue
                    SubData = unicode(Data, Codec)
                if SubData:
                    return SubData
            except Exception as error:
                # Log the exception itself: ``.message`` is empty for
                # multi-argument exceptions such as UnicodeDecodeError.
                log.error(error)
    finally:
        Archive.close()
    log.debug("No subtitle files was found in the zip archive for %s" % url)
    return None
Exemplo n.º 3
0
    def follow_meta_redirects(url, redirects, **kwargs):
        """Resolve *url* to its final address, following refresh redirects.

        The URL is fetched with ``s.get`` (HTTP redirects are followed by
        the request itself). If the response carries a ``refresh`` header,
        or the first body chunk contains a ``<meta http-equiv="refresh">``
        tag, the target is resolved recursively. Every URL encountered is
        recorded as a key of the enclosing ``urls_history`` mapping.

        url       -- address to fetch.
        redirects -- number of refresh hops still allowed.
        **kwargs  -- forwarded to ``s.get`` for this request and for every
                     follow-up request.

        Returns the final URL. Raises ValueError once ``redirects`` has
        dropped below zero.
        """
        urls_history[url] = True

        if redirects < 0:
            raise ValueError("Cannot resolve real url with max_redirects=%s" %
                             max_redirects)

        redirects -= 1

        with closing(s.get(url, allow_redirects=True, stream=True,
                           **kwargs)) as resp:
            if resp.history:
                for r in resp.history:
                    urls_history[r.url] = True

            # An empty body yields no chunk at all; default to b'' so that
            # StopIteration does not escape to the caller.
            head = next(resp.iter_content(chunk_size), b'')
            real_url = resp.url

            encoding = resp.encoding
            if encoding is None:
                # detect encoding
                encoding = chardet.detect(head)['encoding']

            try:
                head = str(head, encoding, errors='replace')
            except (LookupError, TypeError):
                # Unknown codec name, or no encoding at all (None).
                head = str(head, errors='replace')

        # Removing html blocks in <noscript></noscript>
        if remove_noscript:
            head = re.sub(r'<noscript[^>]*>.*</noscript[^>]*>',
                          '',
                          head,
                          flags=re.DOTALL)

        redirect = None
        if 'refresh' in resp.headers:
            redirect = resp.headers['refresh']
        else:
            # No header: look for a meta refresh tag (the last match wins).
            for tag in get_tags(head, 'meta'):
                if tag.get('http-equiv', '') == 'refresh':
                    redirect = tag.get('content', None)

        if redirect:
            m = re.search(r'url\s*=\s*([^\s;]+)', redirect, re.I)
            if m:
                m = m.group(1)

                # fixing case url='#url here#'
                if m.startswith(('"', "'")) and m.endswith(('"', "'")):
                    m = m[1:-1]

                # Forward kwargs so follow-up requests use the same options
                # as the first one.
                real_url = follow_meta_redirects(urljoin(resp.url, m),
                                                 redirects, **kwargs)

        urls_history[real_url] = True

        return real_url
Exemplo n.º 4
0
def ensure_text_type(value):
    """Return *value* as text, decoding it when it is a byte string.

    UTF-8 is tried first. Objects without a ``decode`` method are returned
    unchanged. If UTF-8 decoding fails, the encoding guessed by chardet is
    used, with UTF-8 as the last resort when chardet has no guess.
    """
    try:
        text = value.decode('utf-8')
    except AttributeError:
        # No ``decode`` attribute: assume the value is already text.
        text = value
    except UnicodeDecodeError:
        from requests.packages.chardet import detect
        guessed = detect(value).get('encoding')
        text = value.decode(guessed if guessed else 'utf-8')
    return text
Exemplo n.º 5
0
def host_check(host_ip):
    """Request one IP over HTTP and HTTPS with a forced ``Host`` header.

    For each scheme the IP is fetched with certificate verification
    disabled, redirects not followed and a 30 second timeout. On success a
    summary line is appended to ``success_list``, echoed through ``pbar``,
    printed, and appended to ``hosts_ok.txt``. On any exception an
    "Access failed!" line is echoed instead. ``pbar.update()`` is called
    once per scheme in either case.

    host_ip -- a ``(host, ip)`` pair.
    """
    host, ip = host_ip
    # Identical for both schemes, so build it once.
    headers = {
        'Host':
        host.strip(),
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }
    for scheme in ("http://", "https://"):
        url = scheme + ip
        try:
            requests.packages.urllib3.disable_warnings()
            # Close the session (and its pooled connections) once the body
            # has been decoded; the original never closed it.
            with requests.session() as r:
                res = r.get(url,
                            verify=False,
                            headers=headers,
                            allow_redirects=False,
                            timeout=30)
                charset = chardet.detect(res.content)["encoding"]
                res.encoding = charset
                text = res.text
            # Get the page title, if there is one.
            match = re.search('<title>(.*)</title>', text)
            title = match.group(1) if match else "Failed to get title!"
            info = '%s,%s,%s,Packet size:%d,Title:%s' % (
                ip, host, scheme + host, len(text), title)
            # assumes ``lock`` supports the context-manager protocol, as
            # threading.Lock does -- confirm against its definition
            with lock:
                success_list.append(info)
                pbar.echo(info)
                pbar.update_suc()
                with open('hosts_ok.txt', 'a+', encoding="utf-8") as f:
                    print(info + "\n")
                    f.write(info + "\n")
        except Exception:
            with lock:
                error = "%s,%s,%s,Access failed!" % (ip, host, scheme + host)
                pbar.echo(error)
        finally:
            pbar.update()
Exemplo n.º 6
0
 def get_valid_response(self, response):
     """Return ``response.body`` as a native ``str`` with newlines removed.

     The body is decoded with the encoding chardet detects (UTF-8 when
     chardet has no guess). On Python 2 the text is re-encoded to UTF-8
     bytes; on Python 3 the decoded text is returned as is.
     """
     html_content = response.body
     content_type = chardet.detect(html_content)
     print(content_type['encoding'])
     # chardet reports None when it cannot guess the encoding.
     encoding = content_type['encoding'] or "utf-8"
     html_content = html_content.decode(encoding)
     if not isinstance(html_content, str):
         # Python 2: decode() gave ``unicode``; return UTF-8 bytes as before.
         html_content = html_content.encode("utf-8")
     # On Python 3 the text is returned directly: str() on bytes would give
     # the "b'...'" repr and leave the newlines escaped.
     return html_content.replace("\n", "")
Exemplo n.º 7
0
 def get_valid_response(self, response):
     """Return ``response.body`` as a native ``str`` with newlines removed.

     The body is decoded with the encoding chardet detects (UTF-8 when
     chardet has no guess). On Python 2 the text is re-encoded to UTF-8
     bytes; on Python 3 the decoded text is returned as is.
     """
     html_content = response.body
     content_type = chardet.detect(html_content)
     print(content_type['encoding'])
     # chardet reports None when it cannot guess the encoding.
     encoding = content_type['encoding'] or "utf-8"
     html_content = html_content.decode(encoding)
     if not isinstance(html_content, str):
         # Python 2: decode() gave ``unicode``; return UTF-8 bytes as before.
         html_content = html_content.encode("utf-8")
     # On Python 3 the text is returned directly: str() on bytes would give
     # the "b'...'" repr and leave the newlines escaped.
     return html_content.replace("\n", "")
Exemplo n.º 8
0
def ensure_text_type(value):
    """Coerce *value* to text.

    Byte strings are decoded as UTF-8. When that fails, chardet's guess is
    used instead (UTF-8 again if chardet reports no encoding). Anything
    lacking a ``decode`` method is handed back untouched.
    """
    try:
        decoded = value.decode('utf-8')
    except AttributeError:
        # Nothing to decode: treat the value as text already.
        decoded = value
    except UnicodeDecodeError:
        from requests.packages.chardet import detect
        detected = detect(value).get('encoding')
        if not detected:
            detected = 'utf-8'
        decoded = value.decode(detected)
    return decoded
Exemplo n.º 9
0
    def follow_meta_redirects(url, redirects, **kwargs):
        """Resolve *url* to its final address, following refresh redirects.

        The URL is fetched with ``s.get`` (HTTP redirects are followed by
        the request itself). If the response carries a ``refresh`` header,
        or the first body chunk contains a ``<meta http-equiv="refresh">``
        tag, the target is resolved recursively. Every URL encountered is
        recorded as a key of the enclosing ``urls_history`` mapping.

        url       -- address to fetch.
        redirects -- number of refresh hops still allowed.
        **kwargs  -- forwarded to ``s.get`` for this request and for every
                     follow-up request.

        Returns the final URL. Raises ValueError once ``redirects`` has
        dropped below zero.
        """
        urls_history[url] = True

        if redirects < 0:
            raise ValueError("Cannot resolve real url with max_redirects=%s" % max_redirects)

        redirects -= 1

        with closing(s.get(url, allow_redirects=True, stream=True, **kwargs)) as resp:
            if resp.history:
                for r in resp.history:
                    urls_history[r.url] = True

            # An empty body yields no chunk at all; default to b'' so that
            # StopIteration does not escape to the caller.
            head = next(resp.iter_content(chunk_size), b'')
            real_url = resp.url

            encoding = resp.encoding
            if encoding is None:
                # detect encoding
                encoding = chardet.detect(head)['encoding']

            try:
                head = unicode(head, encoding, errors='replace')
            except (LookupError, TypeError):
                # Unknown codec name, or no encoding at all (None).
                head = unicode(head, errors='replace')

        # Removing html blocks in <noscript></noscript>
        if remove_noscript:
            head = re.sub(r'<noscript[^>]*>.*</noscript[^>]*>', '', head, flags=re.DOTALL)

        redirect = None
        if 'refresh' in resp.headers:
            redirect = resp.headers['refresh']
        else:
            # No header: look for a meta refresh tag (the last match wins).
            for tag in get_tags(head, 'meta'):
                if tag.get('http-equiv', '') == 'refresh':
                    redirect = tag.get('content', None)

        if redirect:
            m = re.search(r'url\s*=\s*([^\s;]+)', redirect, re.I)
            if m:
                m = m.group(1)

                # fixing case url='#url here#'
                if m.startswith(('"', "'")) and m.endswith(('"', "'")):
                    m = m[1:-1]

                # Forward kwargs so follow-up requests use the same options
                # as the first one.
                real_url = follow_meta_redirects(urljoin(resp.url, m), redirects, **kwargs)

        urls_history[real_url] = True

        return real_url
Exemplo n.º 10
0
def _ensure_text_type(value):
    """Return *value* as text, decoding byte strings when necessary.

    UTF-8 is tried first. Values without a ``decode`` method are returned
    unchanged. When UTF-8 decoding fails, the encoding detected by chardet
    is used, falling back to UTF-8 if chardet reports none.
    """
    # copying here from conda/common/compat.py to avoid the import
    try:
        result = value.decode('utf-8')
    except AttributeError:
        # No ``decode`` attribute: assume the value is already text.
        result = value
    except UnicodeDecodeError:
        try:
            from requests.packages.chardet import detect
        except ImportError:  # pragma: no cover
            from pip._vendor.requests.packages.chardet import detect
        guess = detect(value).get('encoding')
        result = value.decode(guess if guess else 'utf-8')
    return result
Exemplo n.º 11
0
def _open_resource(xml_resource, detect_encoding=False):
    """Return a ``StringIO`` holding the XML described by *xml_resource*.

    xml_resource    -- a string that is either a path to an existing file
                       or the XML text itself; any non-string value is
                       serialised with ``ET.tostring``.
    detect_encoding -- when true, string input that chardet identifies as
                       UTF-16LE/UTF-16BE is re-encoded to UTF-8 first.
    """
    if isinstance(xml_resource, basestring):
        if detect_encoding:
            encoding = chardet.detect(xml_resource)['encoding']
            if encoding in ('UTF-16LE', 'UTF-16BE'):
                xml_resource = xml_resource.decode('UTF-16').encode('utf-8')

        try:  # https://github.com/IATI/iati-datastore/issues/160
            xml_resource_is_path = os.path.exists(xml_resource)
        except TypeError:
            xml_resource_is_path = False

        if xml_resource_is_path:
            # https://bugzilla.redhat.com/show_bug.cgi?id=874546
            # Read the file into memory and close the handle right away;
            # the original left it open.
            with open(xml_resource) as f:
                lines = f.read()
            xmlfile = StringIO(lines)
        else:
            xmlfile = StringIO(xml_resource)
    else:
        # so it's a xml literal, probably from a test. It shouldn't be
        # big enough that a round trip through the serializer is a problem
        xmlfile = StringIO(ET.tostring(xml_resource))
    return xmlfile
Exemplo n.º 12
0
def _open_resource(xml_resource, detect_encoding=False):
    """Return a ``StringIO`` holding the XML described by *xml_resource*.

    xml_resource    -- a string that is either a path to an existing file
                       or the XML text itself; any non-string value is
                       serialised with ``ET.tostring``.
    detect_encoding -- when true, string input that chardet identifies as
                       UTF-16LE/UTF-16BE is re-encoded to UTF-8 first.
    """
    if isinstance(xml_resource, basestring):
        if detect_encoding:
            encoding = chardet.detect(xml_resource)['encoding']
            if encoding in ('UTF-16LE', 'UTF-16BE'):
                xml_resource = xml_resource.decode('UTF-16').encode('utf-8')

        try:  # https://github.com/IATI/iati-datastore/issues/160
            xml_resource_is_path = os.path.exists(xml_resource)
        except TypeError:
            xml_resource_is_path = False

        if xml_resource_is_path:
            # https://bugzilla.redhat.com/show_bug.cgi?id=874546
            # Read the file into memory and close the handle right away;
            # the original left it open.
            with open(xml_resource) as f:
                lines = f.read()
            xmlfile = StringIO(lines)
        else:
            xmlfile = StringIO(xml_resource)
    else:
        # so it's a xml literal, probably from a test. It shouldn't be
        # big enough that a round trip through the serializer is a problem
        xmlfile = StringIO(ET.tostring(xml_resource))
    return xmlfile
Exemplo n.º 13
0
def to_unicode(content):
    """Decode *content* with the encoding chardet detects for it.

    Returns an empty string when chardet reports no encoding.
    """
    from requests.packages import chardet
    detection = chardet.detect(content)
    encode_name = detection.get('encoding')
    if not encode_name:
        return ''
    return unicode(content, encode_name)
Exemplo n.º 14
0
def to_unicode(content):
    """Convert *content* to unicode using chardet's detected encoding.

    An empty string is returned if no encoding was detected.
    """
    from requests.packages import chardet
    guessed = chardet.detect(content).get('encoding')
    if guessed:
        return unicode(content, guessed)
    return ''
Exemplo n.º 15
0
# -*- coding: utf-8 -*-
# Time: 2019/6/7 17:20
# Author: laugc
# Email: [email protected]
# File: py70_chardet.py

from requests.packages import chardet
"""
编码
"""

# Plain ASCII bytes.
ascii_guess = chardet.detect(b'Hello, world!')
print(ascii_guess)

# The same Chinese text encoded as GBK ...
data = '离离原上草,一岁一枯荣'.encode('gbk')
gbk_guess = chardet.detect(data)
print(gbk_guess)

# ... and as UTF-8.
data1 = '离离原上草,一岁一枯荣'.encode('utf-8')
utf8_guess = chardet.detect(data1)
print(utf8_guess)

# Japanese text encoded as EUC-JP.
data2 = '最新の主要ニュース'.encode('euc-jp')
eucjp_guess = chardet.detect(data2)
print(eucjp_guess)
Exemplo n.º 16
0
 def set_output(self,text,charset=None):
     """Store *text* on ``self.output`` as decoded text.

     text    -- byte string to decode, or text that is stored unchanged.
     charset -- encoding used to decode *text*; when falsy the encoding is
                detected with chardet (UTF-8 if chardet has no guess).
     """
     # Already text (``unicode`` on Python 2, ``str`` on Python 3):
     # there is nothing to decode.
     if isinstance(text, type(u'')):
         self.output = text
         return
     if not charset:
         # chardet reports None when it cannot guess the encoding.
         charset = chardet.detect(text)['encoding'] or 'utf-8'
     self.output = text.decode(charset)
Exemplo n.º 17
0
from requests.packages import chardet

# Read the whole file as raw bytes, then print chardet's guess for it.
with open('test', 'rb') as f:
    raw_bytes = f.read()
print(chardet.detect(raw_bytes))
Exemplo n.º 18
0
 def apparent_encoding(self):
     """Return the encoding chardet detects for ``self.content``."""
     detection = chardet.detect(self.content)
     return detection['encoding']