Exemplo n.º 1
0
 def test_speed(self):
     """Print detection throughput for chardet (if importable) and cchardet.

     The same sample file is passed to each detector ``rounds`` times; the
     summed wall-clock time is converted into an average calls-per-second
     figure and printed. Nothing is asserted.
     """
     import time
     try:
         import chardet
     except ImportError:
         # chardet is optional: without it only cchardet is measured.
         chardet = None

     rounds = 5
     sample_path = r"testdata/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt"
     with open(sample_path, 'rb') as sample_file:
         sample = sample_file.read()

     def total_seconds(detect):
         """Return the summed duration of ``rounds`` calls to ``detect``."""
         spent = 0
         for _ in range(rounds):
             began = time.time()
             detect(sample)
             spent += (time.time() - began)
         return spent

     if chardet is not None:
         print('chardet:',1/(total_seconds(chardet.detect)/rounds), 'call(s)/s')
     print('cchardet:',1/(total_seconds(cchardet.detect)/rounds), 'call(s)/s')
Exemplo n.º 2
0
def _decode_response_body(raw, declared_charset):
    """Best-effort decode of a response body.

    Tries the charset declared by the response first (when there is one),
    then the encoding guessed by cchardet, and finally returns the raw,
    undecoded body when both attempts fail.
    """
    if declared_charset is not None:
        try:
            return raw.decode(declared_charset)
        except Exception:
            # Wrong or unknown declared charset: fall through to detection.
            pass
    try:
        # detect() may report no encoding; decode() then raises and the raw
        # body is returned instead.
        return raw.decode(cchardet.detect(raw)['encoding'])
    except Exception:
        return raw


def get(qstring):
    """Build and return a JSON reply with the fetched content and metadata.

    ``qstring`` is a URL query string. Recognised parameters:

    * ``url``      -- address to fetch; ``http://`` is prepended when no
      http/https scheme is present.
    * ``headers``  -- when equal to ``"true"`` the response headers are
      included in the reply.
    * ``callback`` -- when present the reply is wrapped as
      ``callback(<json>)`` (JSONP).

    The reply object has the keys ``content``, ``headers`` and ``status``
    (``url``, ``http_code`` and, on failure, ``reason``).

    Returns the serialised reply as a string.
    """
    args = dict(urlparse.parse_qsl(qstring))

    reply = {"headers": {}, "status": {}}

    if "url" in args and _validateUrl(args["url"]):
        reply["status"]["url"] = args["url"]

        if not args["url"].startswith(("http://", "https://")):
            args["url"] = "http://" + args["url"]
        hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
        req = urllib2.Request(args["url"], headers=hdr)

        try:
            response = urllib2.urlopen(req)
            charset = response.headers.getparam('charset')
            reply["content"] = _decode_response_body(response.read(), charset)
            reply["status"]["http_code"] = response.code

            if args.get("headers") == "true":
                reply["headers"] = dict(response.info())

        except (urllib2.HTTPError, urllib2.URLError) as e:
            # Second chance: retry once through a cookie-aware opener.
            try:
                reply['content'] = urllib2.build_opener(urllib2.HTTPCookieProcessor).open(args["url"]).read()
            except Exception:
                reply["status"]["reason"] = str(e.reason)
                reply["content"] = None
                reply["status"]["http_code"] = e.code if hasattr(e, 'code') else 0
    else:
        reply["content"] = None
        reply["status"]["http_code"] = 400
        reply["status"]["reason"] = "The url parameter value is missing or invalid"

    # Attach callback to reply if jsonp request.
    # NOTE(review): the callback name is interpolated unvalidated -- confirm
    # that callers sanitise it before this output reaches a browser.
    if "callback" in args:
        return "{0}({1})".format(args["callback"], json.dumps(reply))
    return json.dumps(reply, ensure_ascii=False)
Exemplo n.º 3
0
    def test_github_issue_20(self):
        """
        Feed a lone 0x8f byte to both cchardet APIs.

        Nothing is asserted; the test passes when neither call raises.
        https://github.com/PyYoshi/cChardet/issues/20
        """
        lone_byte = b'\x8f'

        # One-shot API.
        cchardet.detect(lone_byte)

        # Incremental API.
        incremental = cchardet.UniversalDetector()
        incremental.feed(lone_byte)
        incremental.close()
Exemplo n.º 4
0
 def detect(self, data, safe = False):
     """Run cchardet detection on ``data`` and return its result.

     A ``TypeError`` from the first attempt triggers one retry with
     ``str(data)``; an error from that retry propagates regardless of
     ``safe``. Any other exception from the first attempt is re-raised,
     unless ``safe`` is true, in which case ``None`` is returned.
     """
     try:
         return cchardet.detect(data)
     except TypeError:
         # TypeError is usually raised when cchardet expects a string
         # instead of unicode. Give it one last try before giving up;
         # a failure here is deliberately not intercepted.
         return cchardet.detect(str(data))
     except Exception:  # pragma: no cover
         if not safe:
             raise
         return None
Exemplo n.º 5
0
    def _convert_to_unicode(self, content):
        """
        Return ``content`` as unicode, recursing into dict values.

        Byte strings are decoded with the encoding chardet reports (utf-8
        when it reports none); if strict decoding fails, the error is
        logged and decoding is repeated with the 'replace' handler.

        NOTE: dicts are updated in place. Lists, tuples and every other
        type are returned untouched -- support for them is not needed at
        the moment.

        :param content: content to convert
        :type content: object

        :rtype: object
        """
        if isinstance(content, unicode):
            return content

        if isinstance(content, str):
            guessed = chardet.detect(content)["encoding"]
            codec = guessed if guessed else "utf-8"
            try:
                return content.decode(codec)
            except UnicodeError as e:
                logger.error("Unicode error: {0!r}. Using 'replace'".format(e))
                return content.decode(codec, 'replace')

        if isinstance(content, dict):
            for key in content.keys():
                content[key] = self._convert_to_unicode(content[key])
        return content
def versioned_static(file_path):
    """
    Given the path for a static file,
    output a URL path with a hex hash query string for versioning.

    The hash is the first 7 hex characters of the SHA-1 of the file
    contents, re-encoded to UTF-8 when their encoding can be detected and
    decoded; otherwise the raw bytes are hashed as they are.

    When the file cannot be located a warning is logged and the plain
    static URL is returned without a version suffix.
    """

    full_path = find(file_path)
    url = static(file_path)

    if not full_path:
        msg = 'Could not find static file: {0}'.format(file_path)
        logger.warning(msg)
        return url

    with open(full_path, 'rb') as file_contents:
        file_data = file_contents.read()

    # Normalise encoding. Detection can report no encoding at all, name a
    # codec Python does not know (LookupError), or be wrong about the data
    # (UnicodeDecodeError, a ValueError); in each case keep the raw bytes.
    try:
        encoding = cchardet.detect(file_data)['encoding']
        if encoding:
            file_data = file_data.decode(encoding).encode('utf-8')
    except (ValueError, LookupError):
        pass

    # 7 chars of sha1 hex
    sha1_hex = sha1(file_data).hexdigest()[:7]

    return url + '?v=' + sha1_hex
Exemplo n.º 7
0
    def _write(self, raw_data, filename) :
        """
        Write raw data to a bz2-compressed file.

        The data is re-encoded to UTF-8 first, unless the encoding chosen
        for it is already a UTF-8 alias. The detected encoding is used only
        when it was reported with a confidence above 0.5; otherwise UTF-8
        is assumed.

        @arg raw_data: The raw_data to be compressed and written
        @type raw_data: byte string
        @arg filename: The intended name of the outfile
        @type filename: unicode

        @return: outfile ; The full path and name of the file written
        @rtype: unicode
        """
        result = chardet.detect(raw_data)
        # Some detectors report None instead of a number when they cannot
        # decide; treat that as "no confidence" rather than comparing None.
        if result['encoding'] and (result['confidence'] or 0) > 0.5:
            encoding = result['encoding']
        else:
            encoding = 'utf-8'

        if not util.is_utf8_alias(encoding):
            raw_data = raw_data.decode(encoding).encode('utf-8')

        # Compress the data to save disk space.
        comp = bz2.BZ2Compressor()
        data = comp.compress(raw_data)
        data += comp.flush()

        # The context manager closes the handle even if the write fails.
        with open(self._nametofile(filename), "wb") as out_handle:
            out_handle.write(data)

        return out_handle.name      # return the full path to the file
Exemplo n.º 8
0
 def lazy_chardet_encoding(data):
     """Return the encoding name chardet reports for ``data``.

     A missing/empty result becomes ``''``. A result that is an instance
     of ``bytes_`` is additionally passed through
     ``.encode('ascii', 'ignore')``.
     """
     guessed = chardet.detect(data)['encoding'] or ''
     # NOTE(review): bytes_ is defined outside this block -- presumably the
     # platform byte-string type; confirm against its definition.
     if isinstance(guessed, bytes_):
         guessed = guessed.encode('ascii', 'ignore')
     return guessed
Exemplo n.º 9
0
    def d(url, bytes_data):
        """Return the encoding to use for ``url``, detected from ``bytes_data``.

        Results are memoised per URL in ``Fetcher.encoding_cache`` while
        ``Fetcher.lock`` is held. A detection is trusted only when its
        confidence exceeds 0.8; the detected name is then mapped through
        ``Fetcher.lookup_encoding`` and reported on stdout. Otherwise, or
        when chardet is unavailable, ``''`` is returned.
        """
        if chardet is None:
            return ''

        Fetcher.lock.acquire()
        try:
            if url in Fetcher.encoding_cache:
                return Fetcher.encoding_cache[url]

            cd_r = chardet.detect(bytes_data)
            cd_confidence = cd_r['confidence']
            cd_encoding = cd_r['encoding']

            # A detector may report no confidence at all; never compare
            # None against a number.
            if cd_confidence is not None and cd_confidence > 0.8:
                ret = Fetcher.lookup_encoding(cd_encoding)
                print('\n%s\nchardet[encoding:%s, confidence:%.5f]' % \
                      (url, ret, cd_confidence)
                      )
            else:
                ret = ''

            # add to cache
            Fetcher.encoding_cache[url] = ret
            return ret
        finally:
            # Always release, even when detection or lookup raises;
            # otherwise every later caller would block on the lock.
            Fetcher.lock.release()
Exemplo n.º 10
0
def convert_encoding(data, new_coding='UTF-8'):
    """Re-encode byte string ``data`` to ``new_coding``.

    The source encoding is guessed with cchardet. ``data`` is returned
    unchanged when no encoding could be detected or when the detected
    name already equals ``new_coding`` (case-insensitive comparison).

    Raises UnicodeError when the data does not actually decode with the
    detected encoding or cannot be represented in ``new_coding``.
    """
    encoding = cchardet.detect(data)['encoding']

    if encoding is None:
        # Nothing detected: there is no safe way to transcode.
        return data

    if new_coding.upper() != encoding.upper():
        # The second positional argument of decode() is the error-handler
        # name; the payload itself must not be passed there.
        data = data.decode(encoding).encode(new_coding)

    return data
Exemplo n.º 11
0
 def test_detect_zh_gb18030(self):
     """The GB18030 Chinese sample must be detected as GB18030."""
     expected = "GB18030"
     sample_path = r"testdata/zh/GB18030/wikitop_zh_GB18030.txt"
     with open(sample_path, 'rb') as sample:
         payload = sample.read()
     guess = cchardet.detect(payload)
     # Names are compared case-insensitively.
     eq_(expected.lower(),guess['encoding'].lower())
Exemplo n.º 12
0
def main(path, n, encoding=None, no_more=False):
    """Display the head of ``path`` (as returned by ``fileecho.head``).

    The pieces returned by ``fileecho.head(f, n)`` are joined with
    newlines and decoded with ``encoding`` when given, otherwise with the
    encoding chardet reports -- provided its confidence exceeds 0.7. Below
    that threshold a warning is printed and nothing is shown.

    The text goes through ``color.print_info`` when ``no_more`` is true and
    through ``more`` otherwise. Missing files and permission problems are
    reported through ``color.print_err``.
    """
    try:
        with open(path, 'rb') as f:
            head_bytes = b'\n'.join(fileecho.head(f, n))
            guess = chardet.detect(head_bytes)

            codec = encoding
            if codec is None:
                if not guess['confidence'] > 0.7:
                    color.print_warn('Not Known encoding, may be %s.\n'
                                     'Please point it explictly' % guess['encoding'])
                    return
                codec = guess['encoding']

            # Undecodable bytes are dropped rather than aborting the output.
            text = head_bytes.decode(codec, errors='ignore')
            if no_more:
                color.print_info(text)
            else:
                more(text, print_color=True)

    except FileNotFoundError:
        color.print_err('%s not found' % path)
    except PermissionError:
        color.print_err('Permission denied: %s' % path)
Exemplo n.º 13
0
 def test_detect_bg_iso88595(self):
     """The ISO-8859-5 Bulgarian sample must be detected as ISO-8859-5."""
     wanted = "ISO-8859-5"
     fixture = r"testdata/bg/ISO-8859-5/wikitop_bg_ISO-8859-5.txt"
     with open(fixture, 'rb') as handle:
         raw = handle.read()
     report = cchardet.detect(raw)
     # Names are compared case-insensitively.
     eq_(wanted.lower(),report['encoding'].lower())
Exemplo n.º 14
0
def stdEncode(sourceFile,destinationDir,suffix='',ext='txt',sourceEncoding='utf-16',destinationEncoding='utf-8',
              detectEncoding=True, overwrite=False, verbose = True):
    """Re-encode ``sourceFile`` and write the result under ``destinationDir``.

    Arguments:
        sourceFile: path of the file to convert.
        destinationDir: output directory; created when it does not exist.
        suffix, ext: every ``.<1-6 alphanumerics>`` run in ``sourceFile`` is
            replaced by ``suffix + '.' + ext`` to form the output name.
        sourceEncoding: encoding used to decode the input when
            ``detectEncoding`` is false.
        destinationEncoding: encoding of the written file.
        detectEncoding: when true, ``sourceEncoding`` is replaced by the
            encoding cchardet detects in the file contents.
        overwrite: when false an existing output file is left untouched.
        verbose: when true a message is printed for skipped files.

    Returns None. The function returns without writing anything when the
    input cannot be decoded or the text cannot be encoded.
    """
    print(destinationDir)
    print(sourceFile)
    # make destination dir if necessary
    if not os.path.exists(destinationDir):
        os.mkdir(destinationDir)

    # NOTE(review): the pattern is not anchored to the end of the name, so
    # names with several dots get every match replaced -- confirm this is
    # intended before tightening it.
    destFile = re.sub(r'(\.[a-zA-Z0-9]{1,6})', suffix+'.'+ext, sourceFile)
    destPath = os.path.join(destinationDir, destFile)

    if not overwrite and os.path.exists(destPath):
        if verbose:
            print(destPath + " already exists; skipping.")
        return

    # Read bytes: both encoding detection and decode() operate on the raw
    # content, not on text already decoded by the platform default.
    with open(sourceFile, mode="rb") as infile:
        raw = infile.read()

    try:
        if detectEncoding:
            sourceEncoding = cchardet.detect(raw)['encoding']
        encoded = raw.decode(sourceEncoding).encode(destinationEncoding)
    except (UnicodeError, LookupError, TypeError):
        # Undecodable data, unknown codec name, or no encoding detected.
        return

    with open(destPath, "wb") as outfile:
        outfile.write(encoded)
Exemplo n.º 15
0
 def test_detect_de_windows1252(self):
     """The WINDOWS-1252 German sample must be detected as WINDOWS-1252."""
     expected_name = "WINDOWS-1252"
     sample_file = r"testdata/de/WINDOWS-1252/wikitop_de_WINDOWS-1252.txt"
     with open(sample_file, 'rb') as stream:
         content = stream.read()
     outcome = cchardet.detect(content)
     # Show the full detection result in the test output.
     print(outcome)
     eq_(expected_name.lower(),outcome['encoding'].lower())
Exemplo n.º 16
0
 def test_detect_de_utf8(self):
     """The UTF-8 German sample must be detected as UTF-8."""
     target = "UTF-8"
     source = r"testdata/de/UTF-8/wikitop_de_UTF-8.txt"
     with open(source, 'rb') as fh:
         blob = fh.read()
     detection = cchardet.detect(blob)
     # Show the full detection result in the test output.
     print(detection)
     eq_(target.lower(),detection['encoding'].lower())
Exemplo n.º 17
0
 def test_detect_cz_iso88592(self):
     """The ISO-8859-2 Czech sample must be detected as ISO-8859-2."""
     expected = "ISO-8859-2"
     sample_path = r"testdata/cz/ISO-8859-2/wikitop_cz_ISO-8859-2.txt"
     with open(sample_path, 'rb') as sample:
         data = sample.read()
     found = cchardet.detect(data)
     # Show the full detection result in the test output.
     print(found)
     eq_(expected.lower(),found['encoding'].lower())
Exemplo n.º 18
0
 def test_detect_el_iso88597(self):
     """The ISO-8859-7 Greek sample must be detected as ISO-8859-7."""
     wanted = "ISO-8859-7"
     fixture = r"testdata/el/ISO-8859-7/wikitop_el_ISO-8859-7.txt"
     with open(fixture, 'rb') as handle:
         raw = handle.read()
     report = cchardet.detect(raw)
     # Show the full detection result in the test output.
     print(report)
     eq_(wanted.lower(),report['encoding'].lower())
Exemplo n.º 19
0
 def test_detect_tr_iso88599(self):
     """The ISO-8859-9 Turkish sample must be detected as ISO-8859-9."""
     expected_name = "ISO-8859-9"
     sample_file = r"testdata/tr/ISO-8859-9/wikitop_tr_ISO-8859-9.txt"
     with open(sample_file, 'rb') as stream:
         content = stream.read()
     outcome = cchardet.detect(content)
     # Show the full detection result in the test output.
     print(outcome)
     eq_(expected_name.lower(),outcome['encoding'].lower())
Exemplo n.º 20
0
 def test_detect_th_tis620_2(self):
     """The TIS-620 Thai sample must be detected as TIS-620."""
     target = "TIS-620"
     source = r"testdata/th/TIS-620/wikitop_th_TIS-620.txt"
     with open(source, 'rb') as fh:
         blob = fh.read()
     detection = cchardet.detect(blob)
     # Show the full detection result in the test output.
     print(detection)
     eq_(target.lower(),detection['encoding'].lower())
Exemplo n.º 21
0
 def test_detect_ru_maccyrillic(self):
     """The Mac Cyrillic Russian sample must be detected as MAC-CYRILLIC."""
     expected = "MAC-CYRILLIC"
     sample_path = r"testdata/ru/X-MAC-CYRILLIC/wikitop_ru_MACCYRILLIC.txt"
     with open(sample_path, 'rb') as sample:
         data = sample.read()
     found = cchardet.detect(data)
     # Show the full detection result in the test output.
     print(found)
     eq_(expected.lower(),found['encoding'].lower())
Exemplo n.º 22
0
 def test_detect_ru_ibm855(self):
     """The IBM855 Russian sample must be detected as IBM855."""
     wanted = "IBM855"
     fixture = r"testdata/ru/IBM855/wikitop_ru_IBM855.txt"
     with open(fixture, 'rb') as handle:
         raw = handle.read()
     report = cchardet.detect(raw)
     # Show the full detection result in the test output.
     print(report)
     eq_(wanted.lower(),report['encoding'].lower())
    def _detect_encoding(self):
        """Guess the encoding of the resource file from its full contents.

        Reads the file at ``self.resource['path']`` in one go, logs the
        encoding cchardet reports and returns it.
        """
        with open(self.resource['path'], 'rb') as handle:
            raw = handle.read()
        guessed = cchardet.detect(raw)['encoding']
        logging.info('Detected %s encoding with cchardet', guessed)
        return guessed
Exemplo n.º 24
0
    def detect(self, data, safe = False):
        """Run cchardet detection on ``data`` and return its result.

        When detection raises, the error is re-raised unless ``safe`` is
        true, in which case ``None`` is returned instead.
        """
        try:
            return cchardet.detect(data)
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are never swallowed in safe mode.
            if safe:
                return None

            raise
Exemplo n.º 25
0
    def fetch_html_with_response(self, response):
        """Store ``response`` and its HTML body with script elements removed.

        The response encoding is overridden with the one cchardet detects
        in the raw content before the text is read. Sets ``self.response``
        and ``self.html_body``.
        """
        encoding_detected_by_cchardet = cchardet.detect(response.content)['encoding']
        response.encoding = encoding_detected_by_cchardet
        html_body = response.text
        self.response = response  # kept for Ad (translated original note)

        # Raw string: the same pattern without an invalid "\/" escape.
        # NOTE(review): "." does not match newlines here, so a script
        # element spanning several lines is left in place -- confirm
        # whether re.DOTALL is wanted.
        script_pattern = re.compile(r'<script.*?<\/script>')
        self.html_body = script_pattern.sub('', html_body)
Exemplo n.º 26
0
 def test_detect_ru_koi8r(self):
     """The KOI8-R Russian sample must be detected as KOI8-R."""
     expected_name = "KOI8-R"
     sample_file = r"testdata/ru/KOI8-R/wikitop_ru_KOI8-R.txt"
     with open(sample_file, 'rb') as stream:
         content = stream.read()
     outcome = cchardet.detect(content)
     # Show the full detection result in the test output.
     print(outcome)
     eq_(expected_name.lower(),outcome['encoding'].lower())
Exemplo n.º 27
0
    def _file_encoding(file_name):
        """Return the encoding cchardet reports for the file's contents.

        The whole file is read in binary mode. An ``IOError`` is wrapped
        in ``Error`` and re-raised.
        """
        try:
            with open(file_name, 'rb') as handle:
                raw = handle.read()
                return cchardet.detect(raw).get('encoding')
        except IOError as e:
            raise Error(e)
Exemplo n.º 28
0
    def guess_encoding(self):
        """ Expensive charset guess over the whole document source.

        Returns the ``codec_info`` of the webencodings entry matching the
        detected name, or None when nothing is detected or the name is
        unknown to webencodings.
        """

        # TODO: would it be faster to look only in the first N thousand bytes?
        name = cchardet.detect(self.doc.source_data).get("encoding")
        if not name:
            return None

        entry = webencodings.lookup(name)
        if not entry:
            return None
        return entry.codec_info
Exemplo n.º 29
0
def guess_encoding(filename_or_bytes):
    """Return the encoding chardet reports for bytes or for a file's content.

    A ``bytes`` argument is analysed directly; anything else is treated as
    a path whose entire content is read in binary mode first.
    """
    if not isinstance(filename_or_bytes, bytes):
        # TODO: Use UniversalDetector to detect encoding incrementally (for large files)
        # http://chardet.readthedocs.org/en/latest/usage.html#example-using-the-detect-function
        with open(filename_or_bytes, 'rb') as f:
            filename_or_bytes = f.read()
    return chardet.detect(filename_or_bytes)['encoding']
Exemplo n.º 30
0
 def test_ascii(self):
     """The lowercase Latin alphabet must be detected as ascii."""
     expected = 'ascii'
     result = cchardet.detect(b'abcdefghijklmnopqrstuvwxyz')
     actual = result['encoding'].lower()
     eq_(expected, actual, 'Expected %s, but got %s' % (expected, actual))
Exemplo n.º 31
0
def get_encoding(auto, raw):
    """
    Automatically detect character encoding.

    Arguments:
        auto (str or bool): auto-detection of character encoding - can be
            either 'chardet', 'cchardet' (case-insensitive), False, or True
            (the latter prefers cchardet and falls back to chardet)
        raw (bytes): array of bytes to detect from

    Returns:
        The encoding name reported by the detection library, or None when
        detection is disabled (``auto`` is False or None) or when ``auto``
        is True and neither library can be imported.

    Raises:
        ValueError: if ``auto`` is a string naming an unknown method.
        ImportError: if the explicitly requested library is not installed.

    """
    if auto is None or auto is False:
        # Detection explicitly disabled: same "no answer" result as when no
        # library is available.
        return None

    if auto is True:
        try:
            import cchardet as chardet
        except ImportError:
            try:
                import chardet
            except ImportError:
                logger.debug(
                    "chardet or cchardet is recommended for automatic"
                    " detection of character encodings. Instead trying some"
                    " common encodings."
                )
                return None
            else:
                method = "chardet"
        else:
            method = "cchardet"
    else:
        method = str(auto).lower()
        if method == "chardet":
            import chardet
        elif method == "cchardet":
            import cchardet as chardet
        else:
            raise ValueError(
                "auto must be True, False, 'chardet' or 'cchardet', "
                "got {!r}".format(auto)
            )

    logger.debug("get_encoding Using %s", method)
    result = chardet.detect(raw)
    logger.debug(
        "{} method detected encoding of {} at confidence {}".format(
            method, result["encoding"], result["confidence"]
        )
    )
    return result["encoding"]
Exemplo n.º 32
0
def quick_detect_encoding(string):
    """
    Detect the encoding of the byte string ``string``.

    cchardet is tried first; whenever it raises, returns nothing, or
    reports no encoding, the answer comes from detect_encoding instead.
    """
    assert isinstance(string, bytes)
    try:
        info = cchardet.detect(string)
        name = info.get('encoding') if info else None
        if info:
            return name or detect_encoding(string)
    except Exception:
        # Deliberate best effort: any failure leads to the fallback below.
        pass
    return detect_encoding(string)
Exemplo n.º 33
0
def set_response_encoding(request):
    """Make sure ``request.encoding`` is set, detecting it if necessary.

    A falsy ``request`` is ignored. 'ISO-8859-1' is replaced by its
    superset 'cp1252'. When no encoding is set at all, it is detected from
    the content with the module's ``chardet`` (cchardet, for performance).
    """
    if not request:
        return

    # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
    if request.encoding == 'ISO-8859-1':
        request.encoding = 'cp1252'

    if request.encoding is not None:
        return

    # Requests detects the encoding from the HTTP headers when the item is
    # GET'ed and otherwise on first access of r.text (with vanilla
    # chardet). Setting it here first means the faster detector is used
    # instead. This is a big performance boon, and can be removed once
    # requests is upgraded.
    body = request.content
    if isinstance(body, text_type):
        body = body.encode()
    request.encoding = chardet.detect(body)['encoding']
Exemplo n.º 34
0
def decode_response(response, chunk_size=65536):
    """Decode the response body using an encoding guessed from its start.

    Only the first ``chunk_size`` bytes of ``response.content`` are used
    for detection, but the whole content is decoded. ``response.text`` is
    returned instead when nothing is detected, when the detected codec
    name is unknown to Python, or when decoding with it fails.
    """
    guessed_encoding = chardet.detect(
        response.content[:chunk_size])['encoding']
    LOGGER.debug('response/guessed encoding: %s / %s', response.encoding,
                 guessed_encoding)
    if guessed_encoding is None:
        return response.text
    try:
        return response.content.decode(guessed_encoding)
    except (UnicodeDecodeError, LookupError):
        # LookupError: the detector named a codec Python does not provide.
        return response.text
Exemplo n.º 35
0
 def _get_encoded_text(self):
     if self.successfully_read:
         encoding = cchardet.detect(self.text)['encoding']
         if encoding is None:
             encoding = "utf-8"
         if len(self.text) > 0:
             # We convert, even if the text is detected to be UTF8 so, if it is an error and conversion fails,
             # the error is caught here
             for enc in [encoding, 'utf-8', 'iso-8859-1', 'windows‑1252']:
                 try:
                     return enc, self.text.decode(enc)
                 except:
                     pass
     return None, ''
Exemplo n.º 36
0
def guess_encoding(text, default='utf-8'):
    """Guess string encoding.

    Runs character encoding detection on ``text`` and returns the detected
    name, lower-cased and stripped. ``default`` is returned when nothing
    is detected or when the detected encoding is plain ascii.
    """
    detection = chardet.detect(text)
    name = detection.get('encoding') if detection else None
    if name is None:
        return default

    name = name.lower().strip()
    return default if name == 'ascii' else name
Exemplo n.º 37
0
    def separete_file_sentences(self):
        """Split the file at ``self._file_path`` into sentences.

        The encoding is detected from the raw bytes, the file is then read
        as text with that encoding and tokenised with NLTK. Every
        non-empty sentence is appended to ``self._sentences``.
        """
        with open(self._file_path, "rb") as raw_file:
            detection = chardet.detect(raw_file.read())

        with codecs.open(self._file_path, "r", encoding=detection["encoding"]) as text_file:
            sentences = nltk.tokenize.sent_tokenize(text_file.read())

            for sentence in sentences:
                if len(sentence) > 0:
                    self._sentences.append(sentence)
Exemplo n.º 38
0
    def infer(self):
        """https://github.com/frictionlessdata/datapackage-py#resource

        Fill the missing parts of the descriptor (name, format, mediatype,
        encoding, schema, profile) from the inspected source, store the
        result as the current descriptor, rebuild the resource and return
        the descriptor. A blank source returns the descriptor copy as is.
        """
        descriptor = deepcopy(self.__current_descriptor)

        # Blank -> Stop
        if self.__source_inspection.get('blank'):
            return descriptor

        # Name
        if not descriptor.get('name'):
            descriptor['name'] = self.__source_inspection['name']

        # Only for non inline/storage
        if not self.inline and not self.__storage:

            # Format
            if not descriptor.get('format'):
                descriptor['format'] = self.__source_inspection['format']

            # Mediatype
            if not descriptor.get('mediatype'):
                descriptor['mediatype'] = 'text/%s' % descriptor['format']

            # Encoding (sniffed from roughly the first 1000 bytes)
            if not descriptor.get('encoding'):
                contents = b''
                with self.raw_iter(stream=True) as stream:
                    for chunk in stream:
                        contents += chunk
                        if len(contents) > 1000: break
                detected = cchardet.detect(contents)['encoding']
                # Detection reports no encoding for empty or undecidable
                # input; fall back to utf-8 instead of calling .lower() on
                # None. Plain ascii is widened to utf-8 as well.
                encoding = detected.lower() if detected else 'utf-8'
                descriptor[
                    'encoding'] = 'utf-8' if encoding == 'ascii' else encoding

        # Schema
        if not descriptor.get('schema'):
            if self.tabular:
                descriptor['schema'] = self.__get_table().infer()

        # Profile
        if descriptor.get('profile') == config.DEFAULT_RESOURCE_PROFILE:
            if self.tabular:
                descriptor['profile'] = 'tabular-data-resource'

        # Save descriptor
        self.__current_descriptor = descriptor
        self.__build()

        return descriptor
Exemplo n.º 39
0
def detect_encoding(bytesobject):
    """Return the encoding of ``bytesobject``.

    'UTF-8' is returned directly when ``isutf8`` accepts the input;
    otherwise the encoding guessed by cchardet is logged and returned.
    """
    # unicode-test
    if isutf8(bytesobject):
        return 'UTF-8'
    guess = cchardet.detect(bytesobject)
    LOGGER.debug('guessed encoding: %s', guess['encoding'])
    return guess['encoding']
Exemplo n.º 40
0
    def fetch(self, url: str) -> str:
        """Return the HTML for ``url``, using the storage as a cache.

        A stored entry is returned directly. Otherwise the page is
        downloaded, decoded with the encoding cchardet detects (utf-8 when
        it detects none), stored under ``url`` and returned.
        """
        cached = self._storage.get(url)
        if cached is not None:
            logger.info(Fore.BLUE, 'Storage', f'Get<{url}>')
            return cached

        r = requests.get(url)
        body = r.content
        detected_name = cchardet.detect(body)['encoding']
        html = body.decode(detected_name or 'utf-8')
        logger.info(Fore.GREEN, 'Sent', f'{url} {len(html)} {r.status_code}')

        self._storage[url] = html
        logger.info(Fore.BLUE, 'Storage', f'Set<{url}>')
        return html
Exemplo n.º 41
0
    def _get_file_encoding(self, file_path):
        """Detect a file's encoding with a minimum level of confidence and
           return that encoding or the default value.

        The detected encoding is used only when it was reported with a
        confidence above 0.75; otherwise ``self.INPUT_DEFAULT_ENCODING`` is
        returned.

        Args:
            file_path (str): Path of the file.

        Returns:
            str: Encoding of the file.
        """
        with open(file_path, 'rb') as f:
            info = cchardet.detect(f.read())
        # Both values are None when nothing could be detected (e.g. for an
        # empty file); never compare None against a number.
        confidence = info['confidence'] or 0
        if info['encoding'] and confidence > 0.75:
            return info['encoding']
        return self.INPUT_DEFAULT_ENCODING
Exemplo n.º 42
0
    def download(self, url):
        """Download ``url`` and return its body as text.

        The body is decoded with the encoding cchardet detects, or with
        utf-8 when it detects none. Returns None when ``url`` is None or
        the response status is not 200.
        """
        if url is None:
            return None
        s = requests.Session()
        s.headers[
            'User-Agent'] = 'Mozilla / 5.0(Windows NT 10.0;WOW64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 63.0.3239.132Safari / 537.36'
        res = s.get(url)

        if res.status_code != 200:
            return None

        # detect() reports no encoding for empty/undecidable bodies;
        # decode(None) would raise a TypeError.
        encoding = cchardet.detect(res.content)['encoding'] or 'utf-8'
        return res.content.decode(encoding)
Exemplo n.º 43
0
def main():
    """Benchmark chardet against cchardet on one Shift-JIS sample file.

    Each library's ``detect`` is called ``rounds`` times on the same
    bytes; the average rate is printed as calls per second together with
    the library version.
    """
    rounds = 100
    sample_path = r'tests/samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt'
    with open(sample_path, 'rb') as sample_file:
        sample = sample_file.read()

        def total_seconds(detect):
            """Return the summed wall-clock time of ``rounds`` calls."""
            spent = 0
            for _ in range(rounds):
                began = time.time()
                detect(sample)
                spent += (time.time() - began)
            return spent

        # Test chardet
        chardet_seconds = total_seconds(chardet.detect)
        print('chardet v%s:' % (chardet.__version__),
              1 / (chardet_seconds / rounds), 'call(s)/s')

        # Test cchardet
        cchardet_seconds = total_seconds(cchardet.detect)
        print('cchardet v%s:' % (cchardet.__version__),
              1 / (cchardet_seconds / rounds), 'call(s)/s')
Exemplo n.º 44
0
 def search_page(self, query, language=None, num=None, start=0, pause=2):
     """
     Google search: fetch one result page and return it as text.

     :param query: Keyword
     :param language: Language
     :param num: value for the ``num`` field of the URL template, if any
     :param start: value for the ``start`` field; a positive value selects
         the URL_NEXT template
     :param pause: seconds to sleep before the request is sent
     :return: result, or None when the request or decoding raised
     """
     time.sleep(pause)
     fields = dict(domain=self.get_random_domain(),
                   language=language,
                   query=quote_plus(query))
     if start > 0:
         url = URL_NEXT.format(num=num, start=start, **fields)
     elif num is None:
         url = URL_SEARCH.format(**fields)
     else:
         url = URL_NUM.format(num=num, **fields)

     if language is None:
         # Drop the language parameter that was rendered as "None".
         url = url.replace('hl=None&', '')

     # Add headers
     headers = {'user-agent': self.get_random_user_agent()}
     try:
         requests.packages.urllib3.disable_warnings(
             requests.packages.urllib3.exceptions.InsecureRequestWarning)
         r = requests.get(url=url,
                          proxies=self.proxies,
                          headers=headers,
                          allow_redirects=False,
                          verify=False,
                          timeout=30)
         LOGGER.info(url)
         body = r.content
         detection = cchardet.detect(body)
         return body.decode(detection['encoding'])
     except Exception as e:
         LOGGER.exception(e)
         return None
Exemplo n.º 45
0
def csvencode(filename) :
    """Convert a CSV file to UTF-8 with ``iconv`` and normalise line ends.

    The converted copy is written next to the input, with ``.csv`` replaced
    by ``_utf8.csv`` in the name. The source encoding is detected from the
    file's bytes; a detected ISO-8859-1 is treated as CP1252. In the
    converted file every ``\\r`` becomes ``\\n`` and doubled newlines are
    then collapsed once.

    Raises:
        ValueError: when no encoding can be detected for the input.
        OSError: when the ``iconv`` executable cannot be started.
    """
    import subprocess

    convfile = filename.replace(".csv", "_utf8.csv")

    # Read bytes: detection works on the raw content, and text mode could
    # already fail to decode a file that is not in the platform encoding.
    with open(filename, 'rb') as f:
        data = f.read()
    detected = c.detect(data)['encoding']
    if detected is None:
        raise ValueError('could not detect the encoding of %s' % filename)
    enc = detected.upper()
    print('enc',enc)
    if 'ISO-8859-1' in enc:
        enc = 'CP1252'

    # Argument list instead of a shell string: file names containing spaces
    # or shell metacharacters are passed to iconv verbatim.
    with open(convfile, 'wb') as converted:
        subprocess.call(
            ['iconv', '-c', '-f', enc, '-t', 'utf-8', filename],
            stdout=converted)

    # Rewrite the converted file in place with Unix line ends.
    # NOTE(review): the file is opened with the platform default text
    # encoding although it now holds UTF-8 -- confirm the target runtime.
    with open(convfile, 'r+') as f:
        text = f.read()
        final = text.replace('\r', '\n').replace('\n\n', '\n')

        f.seek(0)
        f.write(final)
        f.truncate()

    print("File successfuly converted")
Exemplo n.º 46
0
def decode_bytes(content):
    """Decode ``content`` to text.

    Content starting with one of the known headers (UTF8_HEADER,
    BIG_ENDIAN_HEADER, LITTLE_ENDIAN_HEADER) is decoded with the matching
    codec; anything else is decoded as UTF-8. If any of that raises, the
    encoding detected by cchardet is logged and used instead.
    """
    try:
        # (header, codec, number of leading bytes to skip); utf-8-sig
        # consumes its own header, the UTF-16 codecs do not.
        header_table = (
            (UTF8_HEADER, 'utf-8-sig', 0),
            (BIG_ENDIAN_HEADER, 'utf-16-be', len(BIG_ENDIAN_HEADER)),
            (LITTLE_ENDIAN_HEADER, 'utf-16-le', len(LITTLE_ENDIAN_HEADER)),
        )
        for header, codec, skip in header_table:
            if content.startswith(header):
                return content[skip:].decode(codec)
        return content.decode('utf-8')
    except Exception:
        encoding_info = cchardet.detect(content)
        logger.info(
            f"Encoding detected to be {encoding_info['encoding']} with confidence of {encoding_info['confidence']}."
        )
        return content.decode(encoding_info['encoding'])
Exemplo n.º 47
0
    def get_encoding(self):
        """Print the chardet detection result for every requested file.

        Each positional ``file`` argument may be a file or a directory.
        Directories are walked through ``self.getallfiles`` when the
        ``recursion`` option is set; otherwise only the files directly
        inside them are examined. Anything that is neither a file nor a
        directory prints a message and exits the program.
        """

        def report(path):
            """Print ``path`` followed by chardet's result for its bytes."""
            with open(path, "rb") as f:
                print(path, chardet.detect(f.read()))

        for f_or_d in vars(self.args())['file']:
            # The positional argument is a file
            if os.path.isfile(f_or_d):
                report(f_or_d)

            # The positional argument is a directory
            elif os.path.isdir(f_or_d):
                # Optional argument -r given: recurse
                if self.args().recursion:
                    for i in self.getallfiles(f_or_d):
                        report(i)
                # No -r: only handle files directly in this folder and
                # ignore sub-directories
                else:
                    base = os.path.abspath(f_or_d)
                    for m in os.listdir(base):
                        # Absolute path of the entry inside the folder
                        candidate = os.path.join(base, m)
                        # Only regular files are examined
                        if os.path.isfile(candidate):
                            report(candidate)

            else:
                print("no such file or directory")
                sys.exit()
Exemplo n.º 48
0
    def _get_encoding(self):
        """Return the encoding to use for the body.

        Order of preference: the ``charset`` parameter of the Content-Type
        header, utf-8 for ``application/json``, the encoding chardet
        detects in the content, and finally utf-8.
        """
        ctype = self.headers.get(hdrs.CONTENT_TYPE, '').lower()
        mtype, stype, _, params = helpers.parse_mimetype(ctype)

        declared = params.get('charset')
        if declared:
            return declared

        if mtype == 'application' and stype == 'json':
            # RFC 7159 states that the default encoding is UTF-8.
            return 'utf-8'

        return chardet.detect(self._content)['encoding'] or 'utf-8'
Exemplo n.º 49
0
def GetHtml(url, Htype='get', data=None):
    """Fetch ``url`` and return the response body decoded as text.

    The request goes through ``r``, which is not defined in this function.
    NOTE(review): presumably ``r`` is a module-level ``requests`` session
    that carries cookies between calls -- confirm against the rest of the
    module.

    Args:
        url: Address to request.
        Htype: ``'get'`` or ``'post'``.
        data: Form payload for a POST request; ignored for GET. ``None``
            (the default) sends an empty payload.

    Returns:
        The response text, decoded with the encoding that ``cchardet``
        detects from the raw body.

    Raises:
        ValueError: If ``Htype`` is neither ``'get'`` nor ``'post'``.
    """
    print('session', r)
    print('cookies', r.cookies.get_dict())
    # Submit the request and fetch the page.
    if Htype == 'get':
        req = r.get(url)
    elif Htype == 'post':
        # A fresh dict per call avoids sharing one mutable default payload.
        req = r.post(url, {} if data is None else data)
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError("unsupported Htype: %r" % (Htype,))
    # Detect the encoding from the raw bytes.
    codetype = cchardet.detect(req.content)
    # Apply the detected encoding before decoding the body.
    req.encoding = codetype["encoding"]
    return req.text
Exemplo n.º 50
0
def html2Unicode(bData):
    """Decode the byte string ``bData`` to text using encoding detection.

    ``cchardet`` is preferred, ``chardet`` is the fallback; when neither is
    installed the data is decoded strictly as UTF-8.

    Args:
        bData: Raw bytes to decode.

    Returns:
        The decoded text. Undecodable bytes are replaced when a detected
        encoding is used.

    Raises:
        UnicodeError: If the detector is not confident enough (confidence
            of 0.5 or less), reports no encoding, or names an encoding
            Python has no codec for.
        UnicodeDecodeError: If no detector is installed and the data is not
            valid UTF-8.
    """
    if bData == b'':
        # Nothing to detect; empty input decodes to '' under any codec.
        return ''
    try:
        import cchardet as chardet
    except ImportError:
        try:
            import chardet
        except ImportError:
            return bData.decode('utf-8')
    mResult = chardet.detect(bData)
    sEnc = mResult['encoding']
    # Either field may be None when the input cannot be classified;
    # comparing None with a float would raise TypeError.
    fConfidence = mResult['confidence'] or 0
    if sEnc and fConfidence > 0.5:
        try:
            return bData.decode(sEnc, 'replace')
        except LookupError:
            # The detector named an encoding that has no Python codec.
            raise UnicodeError('can not identify the encoding')
    raise UnicodeError('can not identify the encoding')
Exemplo n.º 51
0
def get_html_soup(url, **kwargs):
    """Download ``url`` and return it parsed as a ``BeautifulSoup`` tree.

    A random User-Agent from ``config.HEADER.USER_AGENT`` is used unless the
    caller supplies one in ``headers``. When the response does not declare a
    charset in its Content-Type header, the encoding is detected from the
    body with ``cchardet``.

    Args:
        url: Address to fetch.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``.

    Returns:
        The parsed soup, or ``None`` if anything failed (the failure is
        written to the "error-log" logger).
    """
    # Work on a copy so the caller's headers dict is never mutated.
    headers = dict(kwargs.get("headers") or {})
    headers.setdefault('User-Agent', random.choice(config.HEADER.USER_AGENT))
    kwargs["headers"] = headers
    try:
        html = requests.get(url, timeout=config.request_timeout, **kwargs)
        # A missing Content-Type header counts as "no charset declared".
        if 'charset' not in html.headers.get('content-type', ''):
            html.encoding = cchardet.detect(html.content)['encoding']
        return BeautifulSoup(html.text, "lxml")
    except Exception as e:
        msg = u"get [%s] : %s" % (url, e)
        logger.get("error-log").error(msg)
        return None
Exemplo n.º 52
0
def to_buffer(buffer_or_path):
    """Normalise a path-or-buffer argument into a ``(path, buffer)`` pair.

    A ``str`` is treated as a file path: its encoding is detected with
    ``chardet`` from the raw bytes and the file is reopened in text mode
    with that encoding, ignoring undecodable bytes. The returned handle is
    left open for the caller. Anything else is returned unchanged as the
    buffer, with ``None`` as the path.

    Raises:
        FileNotFoundError: If a path is given that is not an existing file.
    """
    if not isinstance(buffer_or_path, str):
        return None, buffer_or_path

    if not os.path.isfile(buffer_or_path):
        raise FileNotFoundError(
            f"no file found at given path: {buffer_or_path}")

    with open(buffer_or_path, "rb") as raw:
        detection = chardet.detect(raw.read())

    handle = open(buffer_or_path,
                  encoding=detection["encoding"],
                  errors="ignore")
    return buffer_or_path, handle
Exemplo n.º 53
0
def get_html_by_requests(url, headers, timeout=15):
    """Fetch ``url`` with ``requests`` and return the body as text.

    The body is decoded with the encoding ``cchardet`` detects from the raw
    bytes, falling back to UTF-8 when detection yields no encoding.

    NOTE(review): certificate verification is disabled (``verify=False``).

    :param url: address to fetch
    :param headers: HTTP headers to send with the request
    :param timeout: timeout passed to ``requests.get``
    :return: the decoded text, or ``None`` if the request or the decoding
        failed (the exception is logged through ``LOGGER``)
    """
    try:
        response = requests.get(url=url, headers=headers, verify=False, timeout=timeout)
        response.raise_for_status()
        content = response.content
        charset = cchardet.detect(content)
        # Detection can yield no encoding (e.g. for an empty body); without
        # a fallback, decode(None) raises TypeError and a valid response
        # would be logged as a failure and dropped.
        text = content.decode(charset['encoding'] or 'utf-8')
        return text
    except Exception as e:
        LOGGER.exception(e)
        return None
Exemplo n.º 54
0
def decode_string(string, encoding):
    """Decode ``string`` with ``encoding``, falling back to detection.

    If the hinted encoding fails or is unknown, and the ``chardet`` module
    is available, the detected encoding is tried instead. Input that is
    confidently detected as ASCII is decoded as ASCII with undecodable
    bytes dropped. Every remaining case is handed to
    ``force_string_decode``.

    Args:
        string: The byte string to decode.
        encoding: Name of the encoding to try first.

    Returns:
        The decoded text.
    """
    try:
        return string.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        pass

    if chardet:
        enc = chardet.detect(string)
        detected = enc['encoding']
        # A detector result without an encoding cannot be used for decoding.
        if detected is not None:
            try:
                if not (enc['confidence'] == 1 and detected == 'ascii'):
                    return string.decode(detected)
                return string.decode('ascii', 'ignore')
            except (UnicodeDecodeError, LookupError):
                # Wrong guess, or an encoding Python has no codec for.
                pass

    # Last resort; also reached when chardet is unavailable, which
    # previously left the result unbound.
    return force_string_decode(string)
Exemplo n.º 55
0
async def main():
    """Convert every ``.srt`` subtitle found under ``PATH`` to ``.ssa``.

    Walks ``PATH`` recursively. Each ``.srt`` file is read as bytes, decoded
    with the encoding detected by ``cchardet`` (using ``DECODE_ERRORS`` as
    the error policy), converted with ``convert`` and written as UTF-8 next
    to the source file under the name ``<stem><SUFFIX>.ssa``. When
    ``REMOVE_OPTION`` is set, a successfully converted source file is
    deleted. A summary of found, converted and failed files is printed at
    the end.
    """
    # The return value is not used here; the call is kept in case it
    # performs set-up work.
    init_ssa()
    print('media library path:', PATH)
    success = []
    fail = []
    print('finding and converting started...')
    for dir_path, _dir_names, file_names in os.walk(PATH):
        for file_name in file_names:
            if file_name[-4:].lower() != '.srt':
                continue
            srt_path = os.path.join(dir_path, file_name)
            print('processing %s' % srt_path)
            try:
                with open(srt_path, 'rb') as srt_file:
                    srt_raw = srt_file.read()
                encoding = cchardet.detect(srt_raw)
                srt_text = srt_raw.decode(encoding['encoding'],
                                          errors=DECODE_ERRORS)
                # Convert before opening the output so that a failed
                # conversion does not leave an empty .ssa file behind.
                converted = convert(srt_text)
                ssa_path = os.path.join(
                    dir_path,
                    os.path.splitext(file_name)[0] + SUFFIX + '.ssa')
                # The context manager flushes and closes the output before
                # the source file may be removed.
                with codecs.open(ssa_path, 'w', encoding='utf-8') as ssa_file:
                    ssa_file.write(converted)
                success.append(file_name)
                if REMOVE_OPTION:
                    os.remove(srt_path)
            except Exception:
                # Best effort: record the failure and continue with the
                # remaining files.
                fail.append(file_name)

    srt_list = list(set(success) | set(fail))
    print('\nfound .srt subtitles:')
    for srt in srt_list:
        print(srt)

    if len(success) > 0:
        print('\nworked .srt subtitles:')
        for srt in success:
            print(srt)

    if len(fail) > 0:
        print('\nfailed .srt subtitles:')
        for srt in fail:
            print(srt)

    if REMOVE_OPTION:
        print('\nworked srt files are removed due to removal option')
Exemplo n.º 56
0
def decode_string(string: bytes, encoding: typing.Optional[str]) -> str:
    """Try anything possible to parse an encoded bytes string and return the result.

    The encoding hint is tried first. If it fails and the chardet module is
    available, the detected encoding is used with undecodable bytes
    replaced; input that is confidently detected as ASCII, or that cannot
    be classified, is decoded as ASCII with replacement. Without chardet,
    or when chardet names an encoding Python has no codec for, latin-1,
    then utf-8 and finally ASCII (dropping undecodable bytes) are tried.
    In any case we always return something.

    Args:
        string (bytes): The bytes string to be decoded.
        encoding (str, optional): An optional encoding hint.

    Returns:
        str: A decoded form of the string.
    """
    if string == b'':
        return ''

    if encoding is not None:
        try:
            return string.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            pass

    if chardet:
        enc = chardet.detect(string)
        detected = enc['encoding']
        confidence = enc['confidence']
        if (confidence is None or detected is None
                or (confidence == 1 and detected == 'ascii')):
            return string.decode('ascii', 'replace')
        try:
            return string.decode(detected, 'replace')
        except LookupError:
            # The detected name has no Python codec; use the manual
            # fallbacks below instead of raising.
            pass

    # NOTE(review): latin-1 maps every byte value, so the first candidate
    # always succeeds and the later fallbacks are effectively unreachable.
    # The order is kept as documented; confirm whether utf-8 should go first.
    for candidate in ('latin1', 'utf-8'):
        try:
            return string.decode(candidate)
        except UnicodeDecodeError:
            continue

    return string.decode('ascii', 'ignore')
Exemplo n.º 57
0
def get_text_v3(address, stream, mapped=False, decode=True):
    """faster way to extract strings from mdf versions 2 and 3 TextBlock

    The block must start with the two-byte id ``TX``, followed by a 16-bit
    size field that counts the 4 header bytes. The text is cut at the first
    NUL byte and trailing whitespace/NUL bytes are stripped.

    Parameters
    ----------
    address : int
        TextBlock address; 0 yields an empty result
    stream : handle
        file IO handle, or an indexable buffer when *mapped* is true
    mapped : bool
        read by slicing *stream* instead of using ``seek``/``read``;
        default *False*
    decode : bool
        return ``str`` when true, the raw ``bytes`` otherwise; default
        *True*

    Returns
    -------
    text : str | bytes
        unicode string (or bytes when *decode* is false); empty when the
        address is 0 or the block id is not ``TX``

    """

    empty = "" if decode else b""

    if address == 0:
        return empty

    if mapped:
        if stream[address : address + 2] != b"TX":
            return empty
        (size,) = UINT16_uf(stream, address + 2)
        raw = stream[address + 4 : address + size]
    else:
        stream.seek(address)
        if stream.read(2) != b"TX":
            return empty
        size = UINT16_u(stream.read(2))[0] - 4
        raw = stream.read(size)

    text_bytes = raw.split(b"\0")[0].rstrip(b" \r\t\n\0")

    if not decode:
        return text_bytes

    try:
        # NOTE(review): latin-1 maps every byte value, so for bytes input
        # the detection fallback below is not expected to run.
        return text_bytes.decode("latin-1")
    except UnicodeDecodeError:
        try:
            encoding = detect(text_bytes)["encoding"]
            return text_bytes.decode(encoding, "ignore")
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return "<!text_decode_error>"
Exemplo n.º 58
0
def requests_target_fetch(url):
    """Fetch ``url`` with a random User-Agent and return the body as text.

    The body is decoded with the encoding ``cchardet`` detects from the raw
    bytes, falling back to UTF-8 when detection yields no encoding.

    NOTE(review): certificate verification is disabled (``verify=False``)
    and no timeout is passed to ``requests.get``.

    :param url: address to fetch
    :return: the decoded text, or ``None`` if the request or the decoding
        failed (the exception is logged through ``LOGGER``)
    """
    try:
        headers = {'user-agent': get_random_user_agent()}
        response = requests.get(url=url, headers=headers, verify=False)
        response.raise_for_status()
        content = response.content
        charset = cchardet.detect(content)
        # Detection can yield no encoding (e.g. for an empty body); without
        # a fallback, decode(None) raises TypeError and a valid response
        # would be logged as a failure and dropped.
        text = content.decode(charset['encoding'] or 'utf-8')
        return text
    except Exception as e:
        LOGGER.exception(e)
        return None
Exemplo n.º 59
0
def detect_encoding(sample, encoding=None):
    """Return the encoding to use for the byte string ``sample``.

    An explicit ``encoding`` is only normalised and returned. Otherwise the
    sample is run through ``cchardet``; the configured default encoding is
    returned when the detection confidence is below
    ``config.ENCODING_CONFIDENCE`` or when the result is plain ASCII.
    """
    # Imported lazily to reduce tabulator import time
    from cchardet import detect

    if encoding is not None:
        return normalize_encoding(sample, encoding)

    guess = detect(sample)
    score = guess['confidence'] or 0
    name = normalize_encoding(sample, guess['encoding'] or 'ascii')

    if score < config.ENCODING_CONFIDENCE or name == 'ascii':
        return config.DEFAULT_ENCODING
    return name
Exemplo n.º 60
0
def convert_encoding(data):
    """Decode ``data`` and report which encoding worked.

    The encoding detected by ``cchardet`` is tried first (UTF-8 when nothing
    is detected), followed by UTF-8, ISO-8859-1 and Windows-1252.

    Args:
        data: The bytes to decode.

    Returns:
        A ``(encoding, text)`` tuple for the first encoding that decodes
        ``data`` without error, or ``(None, '')`` when ``data`` is empty or
        no candidate succeeds.
    """
    encoding = cchardet.detect(data)['encoding']

    if encoding is None:
        encoding = "utf-8"

    if len(data) > 0:
        # We convert even if the text is detected to be UTF-8 so that, if
        # the detection is wrong and the conversion fails, the error is
        # caught here.
        # The last entry used to be spelled with a non-breaking hyphen
        # (U+2011), which is not a valid codec name.
        # NOTE(review): iso-8859-1 accepts every byte value, so the entry
        # after it is only a formal fallback; confirm the intended order.
        for enc in [encoding, 'utf-8', 'iso-8859-1', 'windows-1252']:
            try:
                return enc, data.decode(enc)
            except (UnicodeError, LookupError):
                # Wrong candidate or unknown codec name: try the next one.
                pass

    return None, ''