def __init__(self, extractor='DefaultExtractor', **kwargs):
    if kwargs.get('url'):
        request = urllib.request.Request(kwargs['url'], headers=self.headers)
        connection = urllib.request.urlopen(request)
        self.data = connection.read()
        encoding = connection.headers['content-type'].lower().split('charset=')[-1]
        if encoding.lower() == 'text/html':
            encoding = charade.detect(self.data)['encoding']
        self.data = str(self.data, encoding)
    elif kwargs.get('html'):
        self.data = kwargs['html']
        if not isinstance(self.data, str):
            self.data = str(self.data, charade.detect(self.data)['encoding'])
    else:
        raise Exception('No text or url provided')
    try:
        # make it thread-safe
        if threading.activeCount() > 1:
            if not jpype.isThreadAttachedToJVM():
                jpype.attachThreadToJVM()
        lock.acquire()
        self.extractor = jpype.JClass(
            "de.l3s.boilerpipe.extractors." + extractor).INSTANCE
    finally:
        lock.release()
    reader = StringReader(self.data)
    self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
    self.extractor.process(self.source)
def __init__(self, extractor='DefaultExtractor', **kwargs):
    if kwargs.get('url'):
        request = urllib2.Request(kwargs['url'], headers=self.headers)
        connection = urllib2.urlopen(request, timeout=10)
        self.data = connection.read()
        encoding = connection.headers['content-type'].lower().split(
            'charset=')[-1]
        if encoding.lower() == 'text/html':
            encoding = charade.detect(self.data)['encoding']
            if encoding is None:
                encoding = 'utf-8'
        self.data = str(self.data, encoding, errors='ignore')
    elif kwargs.get('html'):
        self.data = kwargs['html']
        if not isinstance(self.data, str):
            self.data = str(self.data, charade.detect(self.data)['encoding'])
    else:
        raise Exception('No text or url provided')
    try:
        # make it thread-safe
        if threading.activeCount() > 1:
            if not jpype.isThreadAttachedToJVM():
                jpype.attachThreadToJVM()
        lock.acquire()
        self.extractor = jpype.JClass("de.l3s.boilerpipe.extractors." +
                                      extractor).INSTANCE
    finally:
        lock.release()
    reader = StringReader(self.data)
    self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
    self.extractor.process(self.source)
def __init__(self, extractor='DefaultExtractor', **kwargs):
    if kwargs.get('url'):
        request = urllib2.Request(kwargs['url'], headers=self.headers)
        # Version without headers
        # request = urllib2.Request(kwargs['url'])
        connection = urllib2.urlopen(request)
        self.data = connection.read()
        encoding = connection.headers['content-type'].lower().split('charset=')[-1]
        # Try requests
        # request = requests.get(kwargs['url'], headers=self.headers, verify=False)
        # self.data = request.text
        # encoding = request.headers['content-type'].lower().split('charset=')[-1]
        if encoding.lower() == 'text/html':
            encoding = charade.detect(self.data)['encoding']
        try:
            self.data = unicode(self.data, encoding, errors='replace')
        except LookupError as e:
            print e
    elif kwargs.get('html'):
        self.data = kwargs['html']
        if not isinstance(self.data, unicode):
            self.data = unicode(self.data, charade.detect(self.data)['encoding'],
                                errors='replace')
    else:
        raise Exception('No text or url provided')
    try:
        # make it thread-safe
        if threading.activeCount() > 1:
            if not jpype.isThreadAttachedToJVM():
                jpype.attachThreadToJVM()
        lock.acquire()
        self.extractor = jpype.JClass(
            "de.l3s.boilerpipe.extractors." + extractor).INSTANCE
    finally:
        lock.release()
    reader = StringReader(self.data)
    self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
    self.extractor.process(self.source)
def __init__(self, extractor='DefaultExtractor', **kwargs):
    if kwargs.get('url'):
        # Correctly encode url
        url = unicode(kwargs['url'])
        if re_rus.search(url):
            url = re_http.sub("", url)
            url = re_slash.sub("", url)
            url = url.encode("idna")
            url = "http://" + url
        # Set header
        h = {'User-Agent': self.headers[0], 'Accept': '*/*'}
        # Download the page
        request = urllib2.Request(url, headers=h)
        connection = urllib2.urlopen(request)
        self.data = connection.read()
        encoding = connection.headers['content-type'].lower().split(
            'charset=')[-1]
        # Decode the page contents in the correct encoding
        if self.data is None:
            raise Exception('Html data cannot be extracted.')
        if encoding.lower() == 'text/html':
            encoding = charade.detect(self.data)['encoding']
        old = encoding
        encoding = re_enc_error.sub("", encoding)
        encoding = re_enc_error2.sub("", encoding)
        encoding = re_enc_win.sub("windows-1251", encoding)
        if re_enc_def.search(encoding):
            encoding = DEFAULT_ENCODING
        self.data = unicode(self.data, encoding, "ignore")
        connection.close()
    elif kwargs.get('html'):
        self.data = kwargs['html']
        if not isinstance(self.data, unicode):
            self.data = unicode(self.data, charade.detect(self.data)['encoding'])
    else:
        raise Exception('No text or url provided')
    try:
        # make it thread-safe
        if threading.activeCount() > 1:
            if not jpype.isThreadAttachedToJVM():
                jpype.attachThreadToJVM()
        lock.acquire()
        self.extractor = jpype.JClass("de.l3s.boilerpipe.extractors." +
                                      extractor).INSTANCE
    finally:
        lock.release()
    reader = StringReader(self.data)
    self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
    self.extractor.process(self.source)
def __init__(self, extractor='DefaultExtractor', **kwargs):
    if kwargs.get('logger'):
        self.logger = kwargs['logger']
    else:
        self.logger = None
    if kwargs.get('url'):
        request = urllib2.Request(kwargs['url'], headers=self.headers)
        try:
            connection = urllib2.urlopen(request)
        except:
            connection = None
            if self.logger is not None:
                self.logger.exception(
                    'boilerpipe extractor failed on urlopen() for uri %s'
                    % kwargs['url'])
        if connection is not None:
            self.data = connection.read()
            encoding = connection.headers['content-type'].lower().split(
                'charset=')[-1]
            if encoding.lower() == 'text/html':
                encoding = charade.detect(self.data)['encoding']
            self.data = unicode(self.data, encoding)
        else:
            if self.logger is not None:
                self.logger.debug(
                    'boilerpipe execution continues with empty document')
            self.data = u''
    elif kwargs.get('html'):
        self.data = kwargs['html']
        if not isinstance(self.data, unicode):
            self.data = unicode(self.data, charade.detect(self.data)['encoding'])
    else:
        raise Exception('No text or url provided')
    try:
        # make it thread-safe
        if threading.activeCount() > 1:
            if not jpype.isThreadAttachedToJVM():
                jpype.attachThreadToJVM()
        lock.acquire()
        self.extractor = jpype.JClass("de.l3s.boilerpipe.extractors." +
                                      extractor).INSTANCE
    finally:
        lock.release()
    reader = StringReader(self.data)
    self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
    self.extractor.process(self.source)
def decode_html(html):
    """
    Converts a byte stream containing an HTML page into Unicode.
    Tries to guess the character encoding from the meta tag or with the
    "charade" library.
    """
    if isinstance(html, unicode):
        return html
    match = CHARSET_META_TAG_PATTERN.search(html)
    if match:
        declared_encoding = match.group(1).decode("ASCII")
        # treat an unknown declared encoding as if it wasn't found at all
        with ignored(LookupError):
            return html.decode(declared_encoding, "ignore")
    # try to enforce UTF-8 first
    with ignored(UnicodeDecodeError):
        return html.decode("utf8")
    text = TAG_MARK_PATTERN.sub(to_bytes(" "), html)
    diff = text.decode("utf8", "ignore").encode("utf8")
    sizes = len(diff), len(text)
    # 99% of the text is valid UTF-8
    if abs(len(text) - len(diff)) < max(sizes) * 0.01:
        return html.decode("utf8", "ignore")
    # try to detect the encoding
    encoding = "utf8"
    encoding_detector = charade.detect(text)
    if encoding_detector["encoding"]:
        encoding = encoding_detector["encoding"]
    return html.decode(encoding, "ignore")
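A minimal, self-contained sketch of the same fallback order used by decode_html() above (declared meta charset, then UTF-8, then a charade guess), written for Python 3; the function name and regex below are illustrative and not taken from the original module.

import re
import charade

# Illustrative only: a simplified re-implementation of the strategy above.
META_CHARSET = re.compile(br'<meta[^>]+charset=["\']?([\w-]+)', re.I)

def decode_html_sketch(html_bytes):
    match = META_CHARSET.search(html_bytes)
    if match:
        try:
            # trust the declared charset, ignoring undecodable bytes
            return html_bytes.decode(match.group(1).decode('ascii'), 'ignore')
        except LookupError:
            pass  # unknown declared encoding: fall through
    try:
        return html_bytes.decode('utf-8')
    except UnicodeDecodeError:
        pass
    guess = charade.detect(html_bytes)['encoding'] or 'utf-8'
    return html_bytes.decode(guess, 'ignore')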
def decode(content, language):
    """Decode subtitle `content` in a specified `language`

    :param bytes content: content of the subtitle
    :param language: language of the subtitle
    :type language: :class:`babelfish.Language`
    :return: the decoded `content`
    :rtype: string

    """
    # always try utf-8 first
    encodings = ['utf-8']
    # add language-specific encodings
    if language.alpha3 == 'zho':
        encodings.extend(['gb18030', 'big5'])
    elif language.alpha3 == 'jpn':
        encodings.append('shift-jis')
    elif language.alpha3 == 'ara':
        encodings.append('windows-1256')
    elif language.alpha3 == 'heb':
        encodings.append('windows-1255')
    else:
        encodings.append('latin-1')
    # try to decode
    for encoding in encodings:
        try:
            return content.decode(encoding)
        except UnicodeDecodeError:
            pass
    # fall back on charade
    logger.warning('Could not decode content with encodings %r', encodings)
    return content.decode(charade.detect(content)['encoding'], 'replace')
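A usage sketch for the subtitle decode() helper above, assuming babelfish is installed; the file path is hypothetical.

from babelfish import Language

# Hypothetical usage; 'movie.zh.srt' is an example path only.
with open('movie.zh.srt', 'rb') as f:
    text = decode(f.read(), Language('zho'))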
def download_subtitle(self, subtitle):
    soup = self.get(subtitle.link, is_xml=False)
    pre_link = soup.find('a', href=self.pre_link_re)
    if not pre_link:
        raise ProviderError('Cannot find the pre-download link')
    pre_link = self.server + \
        self.pre_link_re.match(pre_link['href']).group('link')
    # Continue following the link
    soup = self.get(
        pre_link,
        headers={
            'Referer': self.server,
        },
        is_xml=False,
    )
    link = soup.find('a', href=self.link_re)
    if not link:
        raise ProviderError('Cannot find the download link')
    try:
        r = self.session.get(self.server + self.link_re.match(link['href']).group('link'),
                             timeout=10)
    except requests.Timeout:
        raise ProviderNotAvailable('Timeout after 10 seconds')
    if r.status_code != 200:
        raise ProviderNotAvailable('Request failed with status code %d' % r.status_code)
    with contextlib.closing(zipfile.ZipFile(io.BytesIO(r.content))) as zf:
        if len(zf.namelist()) > 1:
            raise ProviderError('More than one file to unzip')
        subtitle_bytes = zf.read(zf.namelist()[0])
    subtitle_text = subtitle_bytes.decode(charade.detect(subtitle_bytes)['encoding'], 'replace')
    if not is_valid_subtitle(subtitle_text):
        raise InvalidSubtitle
    return subtitle_text
def __init__(self, extractor='DefaultExtractor', **kwargs):
    if kwargs.get('url'):
        response = requests.request('GET', kwargs['url'], headers=self.headers)
        self.data = response.text
    elif kwargs.get('html'):
        self.data = kwargs['html']
        if not isinstance(self.data, unicode):
            self.data = unicode(self.data, charade.detect(self.data)['encoding'])
    else:
        raise Exception('No text or url provided')
    try:
        # make it thread-safe
        if threading.activeCount() > 1:
            if not jpype.isThreadAttachedToJVM():
                jpype.attachThreadToJVM()
        lock.acquire()
        self.extractor = jpype.JClass(
            "de.l3s.boilerpipe.extractors." + extractor).INSTANCE
    finally:
        lock.release()
    reader = StringReader(self.data)
    self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
    self.extractor.process(self.source)
def _bytes_to_unicode(self, byte_data):
    """
    Decode an HTTP byte response to unicode.

    Tries to decode the byte stream with the encoding guessed by charade.
    If that fails, decoding is retried with UTF-8, and finally the data is
    run through BeautifulSoup as a last resort.

    :param byte_data: A byte stream.
    :returns: A unicode string.
    """
    try:
        encoding = charade.detect(byte_data).get('encoding')
        return byte_data.decode(encoding)
    except (TypeError, AttributeError, UnicodeError) as e:
        print('Error decoding bytes with charade.', e)
        try:
            return byte_data.decode('utf-8')
        except (TypeError, AttributeError, UnicodeError) as e:
            print('Error decoding bytes to utf-8.', e)
            try:
                return str(BeautifulSoup(byte_data))
            except Exception as e:
                print('Exception in downloadqueue while trying to encode with BeautifulSoup:', e)
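The TypeError branch above matters because charade.detect() may report no encoding at all (an encoding of None) for empty or very short input, and bytes.decode(None) raises TypeError. A small self-contained illustration of that fallback, with arbitrary input:

import charade

sample = b''  # arbitrary: empty or very short input may yield no guess
encoding = charade.detect(sample).get('encoding')  # can be None
try:
    text = sample.decode(encoding)
except TypeError:
    # no usable guess from charade, fall back to UTF-8 as above
    text = sample.decode('utf-8', 'replace')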
def download_subtitle(self, subtitle):
    params = {"action": "download",
              "hash": subtitle.hash,
              "language": subtitle.language.alpha2}
    r = self.get(params)
    if r.status_code != 200:
        raise ProviderError("Request failed with status code %d" % r.status_code)
    subtitle_text = r.content.decode(charade.detect(r.content)["encoding"], "replace")
    if not is_valid_subtitle(subtitle_text):
        raise InvalidSubtitle
    subtitle.content = subtitle_text
def detect(s):
    '''
    >>> detect('ascii')
    {'confidence': 1.0, 'encoding': 'ascii'}
    >>> detect('abcdé')
    {'confidence': 0.505, 'encoding': 'utf-8'}
    >>> detect(bytes('abcdé', 'utf-8'))
    {'confidence': 0.505, 'encoding': 'utf-8'}
    >>> detect(bytes('\222\222\223\225', 'latin-1'))
    {'confidence': 0.5, 'encoding': 'windows-1252'}
    '''
    try:
        if isinstance(s, str):
            return charade.detect(s.encode())
        else:
            return charade.detect(s)
    except UnicodeDecodeError:
        return charade.detect(s.encode('utf-8'))
def download_subtitle(self, subtitle):
    params = {'action': 'download',
              'hash': subtitle.hash,
              'language': subtitle.language.alpha2}
    r = self.get(params)
    if r.status_code != 200:
        raise ProviderError('Request failed with status code %d' % r.status_code)
    subtitle_text = r.content.decode(charade.detect(r.content)['encoding'])
    if not is_valid_subtitle(subtitle_text):
        raise InvalidSubtitle
    return subtitle_text
def bp_extract(url):
    request = urllib2.Request(url, headers=headers)
    connection = urllib2.urlopen(request)
    data = connection.read()
    encoding = connection.headers['content-type'].lower().split('charset=')[-1]
    encoding = charade.detect(data)['encoding']
    extr = Extractor(extractor='ArticleExtractor', url=url)
    return extr.getText().encode(encoding).decode('iso-8859-15')
def trans_url_to_utf8(url):
    debug('trans url to utf-8 encoding')
    if isinstance(url, unicode):
        url = url.encode('utf-8')
    encoding = charade.detect(url)['encoding']
    if encoding.lower() in ('gb2312', 'gbk'):
        encoding = 'gb18030'
    url = url.decode(encoding).encode('utf-8')
    return url
def set_encoding(soup):
    try:
        enc = charade.detect(soup.get_text())['encoding']
    except:
        log.warning('Could not detect encoding, good luck!')
        return soup
    return soup.encode(enc)
def _detect_encoding(cls, path):
    sample = open(path).read(1024)
    for bom, encoding in BOMS:
        if sample.startswith(bom):
            return encoding
    report = charade.detect(sample)
    encoding = report.get('encoding')
    if not encoding:
        return cls.DEFAULT_ENCODING
    return cls._normalize_encoding(encoding)
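The BOMS table referenced above is not shown in this snippet; a hypothetical version built from the codecs module might look like the following. The UTF-32 BOMs must come before the UTF-16 ones, because the UTF-32-LE BOM starts with the same two bytes as the UTF-16-LE BOM.

import codecs

# Hypothetical BOM table assumed by _detect_encoding() above; the real
# constant lives elsewhere in that project and may differ.
BOMS = [
    (codecs.BOM_UTF8, 'utf-8-sig'),
    (codecs.BOM_UTF32_LE, 'utf-32-le'),
    (codecs.BOM_UTF32_BE, 'utf-32-be'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
]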
def download_subtitle(self, subtitle):
    try:
        r = self.session.get(self.server + subtitle.download_link, timeout=10,
                             headers={'Referer': self.server + subtitle.referer})
    except requests.Timeout:
        raise ProviderNotAvailable('Timeout after 10 seconds')
    if r.status_code != 200:
        raise ProviderNotAvailable('Request failed with status code %d' % r.status_code)
    subtitle_text = r.content.decode(charade.detect(r.content)['encoding'])
    if not is_valid_subtitle(subtitle_text):
        raise InvalidSubtitle
    return subtitle_text
def serial(result, fname="temp.bin"):
    if charade.detect(fname)['encoding'] == 'utf-8':
        fname = convert(fname)
    root_dir = os.path.dirname(__file__)
    fname = root_dir + "\\" + fname
    f = open(fname, "wb")
    p = cPickle.Pickler(f)
    p.clear_memo()
    p.fast = True
    p.dump(result)
    f.close()
def __init__(self, extractor='DefaultExtractor', **kwargs):
    if kwargs.get('url'):
        request = urllib2.Request(kwargs['url'], headers=self.headers)
        connection = urllib2.urlopen(request)
        self.data = connection.read()
        encoding = connection.headers['content-type'].lower().split('charset=')[-1]
        if encoding.lower() == 'text/html':
            encoding = charade.detect(self.data)['encoding']
        # self.data = unicode(self.data, 'gbk')
        # self.data = self.data.decode(encoding, 'ignore')
        try:
            self.data = unicode(self.data, charade.detect(self.data)['encoding'])
        except UnicodeError:
            encoding = charade.detect(self.data)['encoding']
            self.data = self.data.decode(encoding, 'ignore')
    elif kwargs.get('html'):
        self.data = kwargs['html']
        if not isinstance(self.data, unicode):
            try:
                self.data = unicode(self.data, 'gbk')
                # self.data = unicode(self.data, charade.detect(self.data)['encoding'])
            except UnicodeError:
                encoding = charade.detect(self.data)['encoding']
                print "charset is:", encoding
                self.data = self.data.decode(encoding, 'ignore')
    # Extractor(extractor='ArticleExtractor', file='/tmp/a.html')
    elif kwargs.get('file'):
        path = kwargs['file']
        f = open(path, 'r')
        self.data = f.read()
        f.close()
        if not isinstance(self.data, unicode):
            try:
                self.data = unicode(self.data, charade.detect(self.data)['encoding'])
            except UnicodeError:
                encoding = charade.detect(self.data)['encoding']
                self.data = self.data.decode(encoding, 'ignore')
    else:
        raise Exception('No text or url provided')
    try:
        # make it thread-safe
        if threading.activeCount() > 1:
            if not jpype.isThreadAttachedToJVM():
                jpype.attachThreadToJVM()
        lock.acquire()
        self.extractor = jpype.JClass(
            "de.l3s.boilerpipe.extractors." + extractor).INSTANCE
    finally:
        lock.release()
    reader = StringReader(self.data)
    self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
    self.extractor.process(self.source)
def guess_text_charset(text, is_html=False):
    if is_html:
        rules = isinstance(text, bytes) and RULES_B or RULES_U
        for meta in rules.re_meta.findall(text):
            if rules.re_is_http_equiv.findall(meta):
                for content in rules.re_parse_http_equiv.findall(meta):
                    for charset in rules.re_charset.findall(content):
                        return to_native(charset)
            else:
                for charset in rules.re_charset.findall(meta):
                    return to_native(charset)
    # guess by chardet
    if isinstance(text, bytes):
        return to_native(charade.detect(text)['encoding'])
def download_subtitle(self, subtitle):
    try:
        response = self.server.DownloadSubtitles(self.token, [subtitle.id])
    except xmlrpclib.ProtocolError:
        raise ProviderNotAvailable
    if response['status'] != '200 OK':
        raise ProviderError('Download failed with status %r' % response['status'])
    if not response['data']:
        raise ProviderError('Nothing to download')
    subtitle_bytes = zlib.decompress(base64.b64decode(response['data'][0]['data']), 47)
    subtitle_text = subtitle_bytes.decode(charade.detect(subtitle_bytes)['encoding'])
    if not is_valid_subtitle(subtitle_text):
        raise InvalidSubtitle
    return subtitle_text
def download_subtitle(self, subtitle):
    try:
        r = self.session.get(self.server + subtitle.download_link, timeout=10,
                             headers={'Referer': self.server + subtitle.referer})
    except requests.Timeout:
        raise ProviderNotAvailable('Timeout after 10 seconds')
    if r.status_code != 200:
        raise ProviderNotAvailable('Request failed with status code %d' % r.status_code)
    if r.headers['Content-Type'] == 'text/html':
        raise ProviderNotAvailable('Download limit exceeded')
    subtitle_text = r.content.decode(charade.detect(r.content)['encoding'], 'replace')
    if not is_valid_subtitle(subtitle_text):
        raise InvalidSubtitle
    subtitle.content = subtitle_text
def _convert_encoding(self, str):
    try:
        encoded = str.encode("iso-8859-1")
    except (UnicodeEncodeError,  # encoding was determined correctly by Mutagen
            AttributeError):     # object has no encode() method
        return str
    charset = charade.detect(encoded)
    try:
        return encoded.decode(charset['encoding'])
    except (TypeError,           # Charade could not determine an encoding
            UnicodeDecodeError,  # encoding was determined incorrectly by Charade
            LookupError):        # encoding determined by Charade is unknown
        return str
def download_subtitle(self, subtitle):
    try:
        r = self.session.get(self.server + '/download-{subtitle_id}.html'.format(subtitle_id=subtitle.id),
                             timeout=10)
    except requests.Timeout:
        raise ProviderNotAvailable('Timeout after 10 seconds')
    if r.status_code != 200:
        raise ProviderNotAvailable('Request failed with status code %d' % r.status_code)
    with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
        if len(zf.namelist()) > 1:
            raise ProviderError('More than one file to unzip')
        subtitle_bytes = zf.read(zf.namelist()[0])
    subtitle_text = subtitle_bytes.decode(charade.detect(subtitle_bytes)['encoding'])
    if not is_valid_subtitle(subtitle_text):
        raise InvalidSubtitle
    return subtitle_text
def download_subtitle(self, subtitle):
    try:
        r = self.session.get(self.server + '/download-{subtitle_id}.html'.format(subtitle_id=subtitle.id),
                             timeout=10)
    except requests.Timeout:
        raise ProviderNotAvailable('Timeout after 10 seconds')
    if r.status_code != 200:
        raise ProviderNotAvailable('Request failed with status code %d' % r.status_code)
    with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
        if len(zf.namelist()) > 1:
            raise ProviderError('More than one file to unzip')
        subtitle_bytes = zf.read(zf.namelist()[0])
    subtitle_text = subtitle_bytes.decode(charade.detect(subtitle_bytes)['encoding'], 'replace')
    if not is_valid_subtitle(subtitle_text):
        raise InvalidSubtitle
    subtitle.content = subtitle_text
def detect_encoding_from_requests_response(response):
    """ :param:`response` being a :module:`requests` response, this function
        will try to detect the encoding as much as possible. First, the
        "normal" response encoding will be tried, else the headers will be
        parsed, and finally the ``<head>`` of the ``<html>`` content will be
        parsed. If nothing succeeds, we will rely on :module:`charade` to
        guess from the content.

        .. todo:: we have to check if content-type is HTML before parsing
            the headers. For now you should use this function only on
            responses which you are sure will contain HTML.
    """
    if getattr(response, 'encoding', None):
        return response.encoding

    # In case the headers don't contain a content-type, we get()
    # 'text/html' as a fallback value, which will trigger the same
    # behaviour as having a content-type header with no charset value.
    encoding = response.headers.get(
        'content-type', 'text/html').lower().split('charset=')[-1]

    if encoding.lower() == 'text/html':
        # HTTP headers don't contain any encoding.
        # Search in page head, then try to detect from data.
        html_content = BeautifulSoup(response.content, 'lxml')

        for meta_header in html_content.head.findAll('meta'):
            for attribute, value in meta_header.attrs.items():
                if attribute.lower() == 'http-equiv':
                    if value.lower() == 'content-type':
                        content = meta_header.attrs.get('content')
                        encoding = content.lower().split('charset=')[-1]
                        break

        if encoding.lower() == 'text/html':
            # If we couldn't find an encoding in the HTML <head>,
            # try to detect it manually with charade. This can
            # eventually fail, too… In this case, OMG… We are alone.
            try:
                return charade.detect(response.content)['encoding']
            except:
                LOGGER.critical('Could not detect encoding of %s', response)
                return None

    return encoding
def download_subtitle(self, subtitle):
    try:
        response = self.server.DownloadSubtitles(self.token, [subtitle.id])
    except xmlrpclib.ProtocolError:
        raise ProviderNotAvailable
    if response['status'] != '200 OK':
        raise ProviderError('Download failed with status %r' % response['status'])
    if not response['data']:
        raise ProviderError('Nothing to download')
    subtitle_bytes = zlib.decompress(
        base64.b64decode(response['data'][0]['data']), 47)
    subtitle_text = subtitle_bytes.decode(
        charade.detect(subtitle_bytes)['encoding'], 'replace')
    if not is_valid_subtitle(subtitle_text):
        raise InvalidSubtitle
    return subtitle_text
def download_subtitle(self, subtitle):
    soup = self.get(subtitle.link, is_xml=False)
    link = soup.find('a', href=self.link_re)
    if not link:
        raise ProviderError('Cannot find the download link')
    try:
        r = self.session.get(self.server + self.link_re.match(link['href']).group('link'),
                             timeout=10)
    except requests.Timeout:
        raise ProviderNotAvailable('Timeout after 10 seconds')
    if r.status_code != 200:
        raise ProviderNotAvailable('Request failed with status code %d' % r.status_code)
    with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
        if len(zf.namelist()) > 1:
            raise ProviderError('More than one file to unzip')
        subtitle_bytes = zf.read(zf.namelist()[0])
    subtitle_text = subtitle_bytes.decode(charade.detect(subtitle_bytes)['encoding'], 'replace')
    if not is_valid_subtitle(subtitle_text):
        raise InvalidSubtitle
    subtitle.content = subtitle_text
def __init__(self, data_path, verbose=False):
    # data_path is the path to the TAC_2014_BiomedSumm folder
    self.docs = {}
    self.verbose = verbose
    for topic_path in listfulldir(os.path.join(data_path, 'data')):
        topic = os.path.split(topic_path)[1].lower()
        self.docs.setdefault(topic, {})
        for doc_path in listfulldir(os.path.join(topic_path, 'Documents_Text')):
            doc = os.path.split(doc_path)[1][:-4].lower()
            with codecs.open(doc_path, mode='rb', encoding='utf-8',
                             errors='strict') as df:
                try:
                    self.docs[topic][doc] = df.read().replace('\r', '')
                except UnicodeDecodeError:
                    with file(doc_path, mode='rb') as df:
                        frmt = charade.detect(df.read())['encoding']
                    with codecs.open(doc_path, mode='rb', encoding=frmt,
                                     errors='strict') as df:
                        self.docs[topic][doc] = df.read().replace('\r', '')
    if self.verbose:
        print('list of topics: %s' % '; '.join(self.docs.keys()))
        dnames = set(chain(*[d.keys() for d in self.docs.itervalues()]))
        print('list of doc_name: %s' % '; '.join(dnames))
    # create name aliases for inconsistencies caused by ES
    for topic in self.docs.keys():
        for doc in self.docs[topic].keys():
            if doc.find(',') >= 0 or doc.find('\'') >= 0:
                new_doc = doc.replace(',', '').replace('\'', '"')
                self.docs[topic][new_doc] = self.docs[topic][doc]
    self.para_index = {}
    for topic in self.docs:
        self.para_index.setdefault(topic, {})
        for doc, data in self.docs[topic].iteritems():
            paragraphs = para_tokenize(self.docs[topic][doc])
            soff = [(s, e) for s, e in sorted(paragraphs['offsets'],
                                              key=lambda x: x[0], reverse=True)]
            self.para_index[topic][doc] = OrderedDict(soff)
def text_reader(file):
    try:
        with open(file, 'rb') as f_obj:
            origin = f_obj.read()
        chartype = charade.detect(origin)
        try:
            if 'GB' in chartype['encoding']:
                article = origin.decode('gbk')
            else:
                article = origin.decode(chartype['encoding'])
        except (UnicodeDecodeError, TypeError):
            error_box.append('Error - File decode failed: ' + file)
            article = None
    except FileNotFoundError:
        error_box.append('Error - File does not exist: ' + file)
        article = None
    if article:
        while '=' in article:
            article = article.replace('=', '等于')
    return article
def serial(result, fname="temp.bin"):
    if isinstance(result, pd.DataFrame) or isinstance(result, pd.Panel):
        fname = str(fname).replace('.searial', '.df')
    elif isinstance(result, np.ndarray):
        fname = str(fname).replace('.searial', '.csv')
    if charade.detect(fname)['encoding'] == 'utf-8':
        fname = convert(fname)
    if isinstance(result, pd.DataFrame) or isinstance(result, pd.Panel):
        result.to_pickle(fname)
        # result.to_csv(fname)
    elif isinstance(result, np.ndarray):
        np.savetxt(fname, result, delimiter=',', fmt='%.3f')
    else:
        f = open(fname, "wb")
        p = cPickle.Pickler(f)
        p.clear_memo()
        p.fast = True
        p.dump(result)
        f.close()
def determine_encoding(page):
    encoding = "utf8"
    text = TAG_MARK_PATTERN.sub(to_bytes(" "), page)
    # don't venture to guess
    if not text.strip() or len(text) < 10:
        return encoding
    # try to enforce UTF-8
    diff = text.decode(encoding, "ignore").encode(encoding)
    sizes = len(diff), len(text)
    # 99% of the page is valid UTF-8
    if abs(len(text) - len(diff)) < max(sizes) * 0.01:
        return encoding
    # try to detect the encoding
    encoding_detector = charade.detect(text)
    if encoding_detector["encoding"]:
        encoding = encoding_detector["encoding"]
    return encoding
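A usage sketch for determine_encoding() above; it assumes TAG_MARK_PATTERN and to_bytes are defined as in the surrounding module, and the URL is an example only. The function expects the raw byte payload, not already-decoded text.

import requests

# Hypothetical usage; example.com stands in for any HTML page.
raw = requests.get('http://example.com/').content
text = raw.decode(determine_encoding(raw), 'ignore')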
def dec(raw):
    print("mmm", charade.detect(raw)['encoding'])
    encoding = None
    for enc in ('utf-8', "CP1252", 'utf-16', 'utf-32'):
        try:
            sdec = raw.decode(enc)
            encoding = enc
            break
            # print "good", encoding
            # return sdec
        except UnicodeDecodeError:
            print("error", enc)
    if encoding:
        print("found encoding", encoding)
        # print sdec
        if "é" in sdec:
            print("ooooooooooooooooooo")
    else:
        findEncodingInfo(raw)
        decode(raw)
    print("chardet.detect(raw)", chardet.detect(raw))
def detect_encoding_from_requests_response(response, meta=False, deep=False):
    """ Try to detect encoding as much as possible.

        :param:`response` being a :module:`requests` response, this function
        will try to detect the encoding as much as possible. First, the
        "normal" response encoding will be tried, else the headers will be
        parsed, and finally the ``<head>`` of the ``<html>`` content will be
        parsed. If nothing succeeds, we will rely on :module:`charade` to
        guess from the content.

        .. todo:: we have to check if content-type is HTML before parsing
            the headers. For now you should use this function only on
            responses which you are sure will contain HTML.
    """
    if getattr(response, 'encoding', None) and not (meta or deep):
        # To understand, please read
        # http://docs.python-requests.org/en/latest/user/advanced/#encodings
        if response.encoding.lower() != 'iso-8859-1':
            if __debug__:
                LOGGER.debug(u'detect_encoding_from_requests_response(): '
                             u'detected %s via `requests` module.',
                             response.encoding)
            return response.encoding

    # If requests doesn't bring us any encoding or returns 'iso-8859-1',
    # we have 3 fallback options:
    # - inspect the server headers ourselves. This is fast, but rarely
    #   do they exist (that's probably why requests failed), and sometimes
    #   they disagree with META tags,
    # - look up the META tags. This is fast too, but sometimes the tag
    #   is not present or the value is wrong too,
    # - detect it via `charade`. Quite slower, but gives accurate results.
    content_type = response.headers.get('content-type', None)

    # If found and no deeper search is wanted, return it.
    if content_type is not None and 'charset' in content_type \
            and not (meta or deep):
        encoding = content_type.lower().split('charset=')[-1]
        if __debug__:
            LOGGER.debug(u'detect_encoding_from_requests_response(): '
                         u'detected %s via server headers.', encoding)
        return encoding

    # HTTP headers don't contain any encoding.
    # Search in page head, then try to detect from data.
    html_content = BeautifulSoup(response.content, 'lxml')
    found = False

    try:
        metas = html_content.head.findAll('meta')
    except AttributeError:
        # Happens on non-HTML pages (eg. RSS feeds, other XML resources…)
        metas = []

    for meta_header in metas:
        for attribute, value in meta_header.attrs.items():
            if attribute.lower() == 'charset':
                encoding = value
                found = True
                break
            elif attribute.lower() == 'http-equiv':
                if value.lower() == 'content-type':
                    # OMG o_O took time to find this one:
                    #
                    # In [73]: meta_header
                    # Out[73]: <meta content="text/html; charset=utf-8" …
                    # In [74]: meta_header.get('content')
                    # Out[74]: u'text/html; charset=iso-8859-1'
                    #
                    # We cannot rely on get('content') and need to
                    # fall back to good ol' RE searching. Thanks BS4.
                    content = unicode(meta_header).lower()
                    if 'charset' in content:
                        encoding = re.search('charset=([\w-]*)',
                                             content, re.I | re.U).group(1)
                        found = True
                        break
        if found:
            break

    # If no deeper search is wanted, return it now.
    if found and encoding not in ('text/html', '', None) and not deep:
        if __debug__:
            LOGGER.debug(u'detect_encoding_from_requests_response(): '
                         u'detected %s via HTML meta tags.', encoding)
        return encoding

    try:
        charade_result = charade.detect(response.content)
    except:
        pass
    else:
        if __debug__:
            LOGGER.debug(u'detect_encoding_from_requests_response(): '
                         u'detected %s via `charade` module (with %s%% '
                         u'confidence).', charade_result['encoding'],
                         charade_result['confidence'] * 100)
        return charade_result['encoding']

    LOGGER.critical(u'detect_encoding_from_requests_response(): could not '
                    u'detect encoding of %s via all test methods.', response)
    return None
def CodingDetermine(self):
    coding = charade.detect(self.textEdit.toPlainText())
    ExMess = QtGui.QMessageBox.question(self, u'Coding!', str(coding.items()),
                                        QtGui.QMessageBox.Yes,
                                        QtGui.QMessageBox.No)
help="Tries, default: 10", type=int, default="10") parser.add_argument("-r", "--remove_punct", help="Remove punctuation,default: False", type=bool, default=False) args = parser.parse_args() print "Reading all files from %s" % args.dir try: files = open_dir(args.dir) result = read_files(files) enc = charade.detect(result) syscodepage = sys.stdout.encoding if args.remove_punct: print "Removing punctuation per %s" % args.remove_punct chars = re.escape(string.punctuation) new_result = re.sub(r'[' + chars + ']', ' ', result.decode(enc['encoding'])) else: new_result = result.decode(enc['encoding']) except: raise print "Tokenizing text..." if args.remove_punct: tokens = nltk.word_tokenize(new_result.lower())
def _detect_encoding(cls, path):
    report = charade.detect(open(path).read())
    encoding = report.get('encoding')
    if not encoding:
        return cls.DEFAULT_ENCODING
    return cls._normalize_encoding(encoding)
def encoding_detect(file_path):
    """ get file's encoding """
    file_buf = open(file_path, 'rb').read()
    result = charade.detect(file_buf)
    return result['encoding']
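A usage sketch for encoding_detect() above; the path is hypothetical, and because the detector can return None a fallback encoding is added here.

# Hypothetical usage; 'notes.txt' is an example path only.
enc = encoding_detect('notes.txt') or 'utf-8'
with open('notes.txt', encoding=enc, errors='replace') as f:
    text = f.read()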
# tested windows-1250 to windows-1258 (1259 doesn't exist)
# utf-16 gives error: UnicodeError: UTF-16 stream does not start with BOM
# macRoman macGreek macturkish maclatin2
# latin-1 latin2 - latin10 nb iso-8859-1 == latin-1 iso-8859-5 to 8
# UTF-16LE UTF-16BE utf_32_le utf_32_be
# ISO-8859-7
# cp500 cp737 cp850 cp852 cp855 cp857 cp858 cp869 cp875 cp1026 cp1140
# greek == iso-8859-7
# ascii (lol)
# import ftfy
rawdata = open(dir + file, 'rb').read()
result = charade.detect(rawdata)
print ftfy.guess_bytes(rawdata)[0]
print rawdata
print result
'''
with codecs.open(dir + file, mode='r', encoding='utf-8') as infile:
# with io.open(dir + file, mode='rb') as infile:
#     data = infile.read().encode('windows-1250')  # .decode('latin1')
    # print data
    for line in infile:
        # line = line.replace(u'ˆ', u'à')
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Generate Markov chains from directory with text files")
    parser.add_argument("-d", "--dir", help="directory with files",
                        required=True)
    parser.add_argument("-n", "--ngram", help="n-Gram, default: 3 (trigram)",
                        type=int, default="3")
    parser.add_argument("-w", "--words", help="Words, default: 100",
                        type=int, default="100")
    parser.add_argument("-t", "--tries", help="Tries, default: 10",
                        type=int, default="10")
    parser.add_argument("-r", "--remove_punct",
                        help="Remove punctuation, default: False",
                        type=bool, default=False)
    args = parser.parse_args()
    print "Reading all files from %s" % args.dir
    try:
        files = open_dir(args.dir)
        result = read_files(files)
        enc = charade.detect(result)
        syscodepage = sys.stdout.encoding
        if args.remove_punct:
            print "Removing punctuation per %s" % args.remove_punct
            chars = re.escape(string.punctuation)
            new_result = re.sub(r'[' + chars + ']', ' ',
                                result.decode(enc['encoding']))
        else:
            new_result = result.decode(enc['encoding'])
    except:
        raise
    print "Tokenizing text..."
    if args.remove_punct:
        tokens = nltk.word_tokenize(new_result.lower())
    else:
def _openLink(self, link):
    if verbose:
        print("trying: --------------- " + link.encode("utf-8") +
              " ---------------------")
    try:
        r = self.groopener.open(link)
        self.urls = []
        self.mime = r.headers.get('content-type', None)  # .info().gettype()
        if verbose:
            print("self.mime", self.mime)
        if self.mime and self.mime.startswith("text"):  # text, html, or xml
            # network.read()
            if verbose:
                print("reading text or html: " + link.encode("utf-8") + " ok")
            # if self.followRedirect: match = redirectre.search(self.source)
            # while self.followRedirect and link not in self.urls and redirectre.search(self.source):
            #     link = urljoin(link, redirectre.search(self.source).groups()[0].strip())
            #     network = self.groopener.open(link)
            #     self.source = network.read()
            #     self.urls += [link]
            self.urls = [x.url for x in r.history]
            if link[-1] == "/":
                link = link[:-1]
            # strangely enough normal pages without redirects are not added to the history
            if link not in self.urls and link + "/" not in self.urls:
                self.urls += [link]
            if self.defaultEncoding.strip():
                # an encoding should be forced. use BeautifulSoup
                self.source = r.content
                self.soup = BeautifulSoup(self.source, "html.parser",
                                          from_encoding=self.defaultEncoding)
                self.encoding = self.soup.original_encoding
                self.source = str(self.soup)
            else:
                self.source = r.content
                self.encoding = charade.detect(r.content)['encoding']
                if self.encoding:
                    self.soup = BeautifulSoup(self.source, "html.parser",
                                              from_encoding=self.encoding)
                    # if u"ĂŠ" in unicode(self.soup) or u"é" in unicode(self.soup):
                    if "Ă" in str(self.soup):
                        # key for finding most wrong windows encodings inside utf-8
                        for g in isoUtf8Garbage:
                            # check whether it's at least one other typical character
                            if "Ă" + g in str(self.soup):
                                if verbose:
                                    print("Ă + some typical garbage - it's in fact utf-8")
                                # typical errors when something is in fact utf-8
                                # but it's decoded as western
                                self.encoding = "utf-8"
                                self.soup = BeautifulSoup(
                                    self.source, "html.parser")  # .decode("utf-8", 'replace')
                                break
                else:
                    self.soup = BeautifulSoup(self.source, "html.parser")
                    self.encoding = r.encoding
                if verbose:
                    print("htmlPage self.soup.contains_replacement_characters",
                          self.soup.contains_replacement_characters)
                    print("self.soup.original_encoding",
                          self.soup.original_encoding)
                # self.encoding = r.encoding
                self.source = str(self.soup)
            # if verbose: print self.source.encode("utf-8")
            self.links = self._getLinks()
            self.source = self.getCleanHtml()
            self.text = self.html2text()
            title = None
            if self.soup.head:
                title = self.soup.head.title
            if not title:
                title = self.soup.title
            if title:
                self.title = title.text
            self.computeStat()
        elif self.mime == "application/pdf" and self.takePdf:  # pdf
            self.source = r.content  # TODO: how about encoding of pdf pages?
            if verbose:
                print("reading pdf: " + link.encode("utf-8") + " ok")
            try:
                self.text = self.pdf2text(self.source)
            except:
                self.text = ""
            self.computeStat()
            self.title = link.split("/")[-1]
            if self.title.endswith(".pdf"):
                self.title = self.title[:-4]
            self.source = "pdf"  # TODO: what to do with the pdf source code??? can't put it easily into sqlite!
        elif verbose:
            print("wrong mime type", self.mime)
    except IOError:
        if verbose:
            print("timeout with ", link.encode("utf-8"))
        self.error = "Timeout"
    except Exception as msg:
        if verbose:
            print(traceback.format_exc())
            print(msg)
            print("problem with ", link.encode("utf-8"))
            self.error = str(traceback.format_exc()) + " " + str(msg)
        else:
            self.error = "other exception:" + str(msg)
def _use_htmlbytes(self, htmlbytes):
    encoding = charade.detect(htmlbytes)["encoding"]
    self.htmldata = htmlbytes.decode(encoding=encoding)
def is_utf8(s):
    return charade.detect(s)['encoding'] == 'utf-8'
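One caveat for is_utf8() above: detectors such as charade usually label purely ASCII input as 'ascii' rather than 'utf-8', so ASCII-only bytes will fail this check even though they are valid UTF-8. A small illustration (exact results depend on the detector version):

# Results depend on the charade version; ASCII-only input is typically
# reported as 'ascii', not 'utf-8'.
print(is_utf8('héllo wörld'.encode('utf-8')))  # likely True
print(is_utf8(b'hello'))                       # likely False ('ascii')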