Example #1
File: java.py  Project: WM-SEMERU/ds4se
def get_unicode(file_path):
    with open(file_path, 'rb') as f:
        detection = chardet.detect(f.read())

    enc = detection["encoding"]
    if detection["encoding"] == "ascii":
        with open(file_path, encoding="ascii") as f:
            data = f.read()
    elif detection["encoding"] == "ISO-8859-9":
        with open(file_path, encoding="utf-8") as f:
            enc = "utf-8"
            data = f.read()
    else:
        try:
            # Try to open the file with the detected encoding
            with open(file_path, encoding=detection["encoding"]) as f:
                data = f.read()
        except Exception as e:
            raise ValueError(f"Cannot return dictionary from empty or invalid csv file {file_path} due to {e}")

    if not data:
        raise ValueError(f"Cannot return dictionary from empty or invalid csv file {file_path}")

    return UnicodeDammit(data).unicode_markup, enc
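A minimal usage sketch for the helper above; the file path is hypothetical, and chardet plus beautifulsoup4 are assumed to be installed.

# Illustrative call to get_unicode() from this example.
text, enc = get_unicode("src/main/java/Foo.java")
print("decoded with", enc)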
Example #2
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating selector."""
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        raise UnicodeDecodeError('Failed to detect encoding, tried [%s]')
    root = fromstring(htmlstring,
                      parser=HTMLParser(recover=True,
                                        encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            if child.tag in BLOCK_ELEMENTS or child.get(
                    'id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip(
        ):
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
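A hypothetical call to parse_rsc_html(); it assumes the lxml names (fromstring, HTMLParser, Element) and the BLOCK_ELEMENTS set are imported elsewhere in the project, and the file name is made up.

# Parse a downloaded RSC article and inspect the repaired tree.
with open("rsc_article.html", "rb") as f:
    root = parse_rsc_html(f.read())
print(len(root.get_element_by_id("wrapper")))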
Example #3
def main(args):
    """
    Main function.
    """
    logger = logging.getLogger(sys._getframe().f_code.co_name)

    path_source = os.path.realpath(args.json[0])

    fields = []
    files = 0

    for dir_name, sub_dir_list, file_list in os.walk(path_source):
        this_dir = os.path.basename(dir_name)
        try:
            dirname = unicode(this_dir)
        except UnicodeDecodeError:
            try:
                dirname = UnicodeDammit(this_dir).unicode_markup
            except UnicodeDecodeError:
                logger.warning('this directory name is unspeakable evil')
                dirname = u'[[[EVIL]]]'

        for file_name_json in file_list:
            files += 1
            with open(os.path.join(dir_name, file_name_json), 'r') as file_json:
                resource = json.load(file_json)
            for field in resource.keys():
                if field not in fields:
                    fields.append(field)
            pprint(resource)
            del resource
            if files % 250 == 0:
                logger.debug(u'parsed {0} files: {1} fields at {2}'.format(files, len(fields), dirname))

    for field in sorted(fields):
        print (field)
Example #4
def get_proxies(n=5):
    """Read some notoriously known sites and extract some public proxies.

    Scrapes
        - http://www.samair.ru/proxy/

    The quality of these proxies is probably not worth mentioning, but they are
    useful for testing how GoogleScraper behaves with low-quality proxies.
    """
    r = requests.get('http://www.samair.ru/proxy/')
    # Try to parse the HTML result using lxml
    try:
        doc = UnicodeDammit(r.text, is_html=True)
        parser = lxml.html.HTMLParser(encoding=doc.declared_html_encoding)
        dom = lxml.html.document_fromstring(r.text, parser=parser)
        dom.resolve_base_href()
    except Exception as e:
        print('Some error occurred while lxml tried to parse: {}'.format(e))

    table = dom.xpath('//table[@id=\'proxylist\']')[0]
    for row in table.findall('tr'):
        print(row.xpath('//td[1]')[0].text_content())

    return GoogleScraper.Proxy()
Example #5
def unicode_dammit_example():
    # Install the 'chardet' or 'cchardet' Python libraries for better guesses

    ### Take a string with unknown encoding and make the string Unicode
    weirdass_string = "Sacr\xc3\xa9 bleu!"
    dammit = UnicodeDammit(weirdass_string)
    print "Original Word with weird encoding:", weirdass_string
    print "Dammit Print:", (dammit.unicode_markup)
    print "Dammit Type:", (dammit.original_encoding)

    ### Take a doc with mostly UTF-8 encoding (and misc encodings due to multiple
    # data sources) and convert to UTF-8 Unicode with UnicodeDammit.detwingle()
    snowmen = (u"\N{SNOWMAN}" * 3)
    quote = (
        u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}"
    )
    doc = snowmen.encode("utf8") + quote.encode("windows-1252")
    # So now we have one doc with two encodings in it, printing is a mess
    #print "Weird Decoding doc with utf8:", doc # messed up, won't print
    #print (doc.decode("windows-1252")) # So messed up it doesn't even print

    # Decode using UnicodeDammit.detwingle() converts the string to pure UTF-8
    new_doc = UnicodeDammit.detwingle(doc)
    print new_doc.decode("utf8")
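The example above uses Python 2 print statements; here is a minimal Python 3 sketch of the same detwingle flow, assuming only that beautifulsoup4 is installed.

from bs4 import UnicodeDammit

snowmen = "\N{SNOWMAN}" * 3
quote = "\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}"
# One byte string containing two encodings: UTF-8 followed by windows-1252.
doc = snowmen.encode("utf-8") + quote.encode("windows-1252")
# detwingle() rewrites the windows-1252 parts as UTF-8 so the whole doc decodes cleanly.
new_doc = UnicodeDammit.detwingle(doc)
print(new_doc.decode("utf-8"))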
Example #6
def get_fileencoding(filename, default=None, detail=None):
	encoding = default
	skip_bytes = 0
	if os.path.isfile(filename):
		f = __builtin__.open(filename, "rb")
		try:
			s = f.read(2)
			"""
			ANSI:				no signature (no BOM);
			Unicode:			first two bytes are FF FE;
			Unicode big endian:	first two bytes are FE FF;
			UTF-8 with BOM:		first three bytes are EF BB BF;
			"""
			if s == chr(0xff) + chr(0xfe):
				encoding = "utf_16_le"
				skip_bytes = 2
			elif s == chr(0xfe) + chr(0xff):
				encoding = "utf_16_be"
				skip_bytes = 2
			elif s == chr(0xef) + chr(0xbb):
				encoding = "utf-8-sig"
				skip_bytes = 3
		except:
			pass
		if not encoding:
			# fall back to BeautifulSoup's UnicodeDammit for encoding detection
			f.seek(0)
			line = f.readline()
			dammit = UnicodeDammit(line)
			# Note: this sometimes reports 'windows-1252' (a Latin charset), so it is not fully reliable.
			encoding = dammit.original_encoding
		f.close()
	if isinstance(detail, dict):
		detail["encoding"] = encoding
		detail["skip_bytes"] = skip_bytes
	return encoding
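A usage sketch under this example's Python 2 assumptions; the file name is hypothetical, and the detail dict receives the detected encoding plus the number of BOM bytes to skip.

detail = {}
enc = get_fileencoding("subtitle.srt", default="utf-8", detail=detail)
# detail now holds {"encoding": ..., "skip_bytes": ...}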
Example #7
def spiderImage(url):
    global urls
    global count
    req = urllib.request.Request(url, headers=header)  # pretend to be a browser
    data = urllib.request.urlopen(req)   # fetch the page
    data = data.read()      # read the response body
    dammit = UnicodeDammit(data, ['utf-8', 'gbk'])
    data = dammit.unicode_markup
    soup = BeautifulSoup(data, "html.parser")
    imgs = soup.select("img")
    for img in imgs:
        try:
            src = img["src"]
            url = urllib.parse.urljoin(start_url, src)
            if url not in urls:
                urls.append(url)
                print(url)
                T = threading.Thread(target=download, args=[url,count])
                T.setDaemon(False)
                T.start()
                threads.append(T)
                count = count + 1
        except Exception as err:
             print(err)
Example #8
def cleanse_href(href_str, base_url):
    """
    Function to sort out the different href parsing methods
    and generate a meaningful URL's to follow
    """
    ret_val = True

    try:
        # getting rid of empties and white spaces
        href_str = href_str.strip()
    except AttributeError:
        ret_val = False

# getting rid of single digit , typically # hrefs
    if ret_val and len(href_str) > 1:
        ret_val = href_str
    else:
        ret_val = False

# converting to unicode
    if ret_val:
        href_str_unicode = UnicodeDammit(href_str)
        href_str = (href_str_unicode.unicode_markup)

# domain specific
    if ret_val and base_url == "http://www.irishtimes.com":
        # irish times puts a counter or a version number at the end
        # of their article pages, like 1.255698, so quick regexp
        # also, putting back the top domain to deliver full URL for irish times

        if re.search(r'\.[0-9]{3,5}', href_str):
            ret_val = TOP_DOMAIN + href_str
        else:
            ret_val = False

    return ret_val
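Two illustrative calls, assuming TOP_DOMAIN is defined elsewhere in the scraper: single-character hrefs such as "#" are rejected, while Irish Times article paths that end in a version number get the top-level domain prepended.

print(cleanse_href(" /news/world/sample-article-1.255698 ",
                   "http://www.irishtimes.com"))          # TOP_DOMAIN + cleaned path
print(cleanse_href("#", "http://www.irishtimes.com"))     # False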
Example #9
 def parse(self, response):
     """
     default parse method, rule is not useful now
     """
     time.sleep(uniform(1, 10))
     print response.url
     # response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
     hxs = HtmlXPathSelector(response)
     index_level = self.determine_level(response)
     if index_level == 1:
         relative_urls = self.get_top_profile(2, hxs)
         if relative_urls is not None:
             for url in relative_urls:
                 yield Request(url, callback=self.parse)
     elif index_level == 2:
         personProfile = HtmlParser.extract_person_profile(hxs)
         linkedin_id = self.get_linkedin_id(response.url)
         linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
         if linkedin_id:
             personProfile['id'] = linkedin_id
             # personProfile['url'] = UnicodeDammit(response.url).markup
             self.mongodb_linkedin.rel_coll.update({'linkedin': response.url}, {'$set': dict(personProfile)})
             print personProfile
             yield personProfile
Example #10
    def _poll_now(self, action_result, param):
        """ Poll data """
        max_containers = param[phantom.APP_JSON_CONTAINER_COUNT]
        disable_max_containers = self.get_config().get('max_containers')
        single_page = False
        paging_data = {
            "page_cnt": 1,
            "alerts_per_page": 50,
            "total_pages": None
        }
        self.save_progress("start_time:{0}".format(
            param[phantom.APP_JSON_START_TIME]))
        # Convert from epoch to ISO 8601 format
        dt_start = datetime.datetime.utcfromtimestamp(
            param[phantom.APP_JSON_START_TIME] / 1000)
        dt_start_formatted = datetime.datetime.strftime(
            dt_start, "%Y-%m-%dT%H:%M:%S")
        self.save_progress(
            "Fetching alerts from {0} to now".format(dt_start_formatted))

        filter_value = (ARBORSIGHTLINE_GET_ALERTS_FILTER.format(
            time=dt_start_formatted))
        # Percent-encode our filter query.
        filter_value = urllib.quote(filter_value, safe='')

        # Add query params
        filter_param = "filter={0}".format(filter_value)
        other_param = "include=annotations"
        params = [filter_param, other_param]

        # Filtering the amount of results per page
        if not disable_max_containers and max_containers < paging_data[
                'alerts_per_page']:
            paging_data['alerts_per_page'] = max_containers
            page_param = "perPage={0}".format(paging_data['alerts_per_page'])
            params.append(page_param)
            single_page = True

        url = "{0}?{1}".format(ARBORSIGHTLINE_GET_ALERTS_ENDPOINT,
                               "&".join(params))
        self.save_progress("Url={0}".format(url))

        # Fetch alerts
        ret_val, response = self._get_alerts(action_result, url, paging_data)
        if (phantom.is_fail(ret_val)):
            try:
                self.error_print(action_result.get_status_message())
                self.save_progress(action_result.get_status_message())
            except:
                self.error_print(ARBORSIGHTLINE_GET_ALERTS_FAILED_MSG)
                self.save_progress(ARBORSIGHTLINE_GET_ALERTS_FAILED_MSG)
            return action_result.get_status()

        # Parse returned alerts
        ret_val, total_alerts = self._parse_alerts(action_result, response)
        if (phantom.is_fail(ret_val)):
            try:
                self.error_print(action_result.get_status_message())
                self.save_progress(action_result.get_status_message())
            except:
                self.error_print(ARBORSIGHTLINE_PARSE_ALERTS_FAILED_MSG)
                self.save_progress(ARBORSIGHTLINE_PARSE_ALERTS_FAILED_MSG)
            return action_result.get_status()

        # Handle case of no alerts found
        if total_alerts < 1:
            self.save_progress(ARBORSIGHTLINE_GET_ALERTS_EMPTY_MSG)
            action_result.set_status(phantom.APP_SUCCESS,
                                     ARBORSIGHTLINE_GET_ALERTS_EMPTY_MSG)
            return action_result.get_status()

        # Handle paging to fetch next alerts
        try:
            if not single_page:
                last_page_link = urllib.unquote(
                    response['links']['last']).replace("&amp;", "&")
                paging_data['total_pages'] = int(
                    urlparse.parse_qs(
                        urlparse.urlparse(last_page_link).query)['page'][0])
                paging_data['page_cnt'] += 1

                while paging_data['page_cnt'] <= paging_data['total_pages']:
                    # Exit strategy with max containers
                    if not disable_max_containers:
                        remaining_alerts = max_containers - total_alerts
                        if remaining_alerts <= 0:
                            self.save_progress(
                                "Maximum amount of containers reached: leaving.."
                            )
                            break

                    page_param = "page={0}".format(paging_data['page_cnt'])
                    params = [filter_param, other_param, page_param]
                    url = "{0}?{1}".format(ARBORSIGHTLINE_GET_ALERTS_ENDPOINT,
                                           "&".join(params))

                    ret_val, response = self._get_alerts(
                        action_result, url, paging_data)
                    if (phantom.is_fail(ret_val)):
                        try:
                            self.error_print(
                                action_result.get_status_message())
                            self.save_progress(
                                action_result.get_status_message())
                        except:
                            self.error_print(
                                ARBORSIGHTLINE_GET_ALERTS_FAILED_MSG)
                            self.save_progress(
                                ARBORSIGHTLINE_GET_ALERTS_FAILED_MSG)
                        return action_result.get_status()

                    # Eventually reduce amount of alerts to speed up processing
                    if not disable_max_containers and remaining_alerts < paging_data[
                            'alerts_per_page']:
                        response['data'] = response['data'][:remaining_alerts]

                    ret_val, page_alerts = self._parse_alerts(
                        action_result, response)
                    if (phantom.is_fail(ret_val)):
                        try:
                            self.error_print(
                                action_result.get_status_message())
                            self.save_progress(
                                action_result.get_status_message())
                        except:
                            self.error_print(
                                ARBORSIGHTLINE_PARSE_ALERTS_FAILED_MSG)
                            self.save_progress(
                                ARBORSIGHTLINE_PARSE_ALERTS_FAILED_MSG)
                        return action_result.get_status()

                    # Update counters
                    paging_data['page_cnt'] += 1
                    total_alerts += page_alerts
        except Exception as e:
            try:
                if e.message:
                    error_msg = UnicodeDammit(
                        e.message).unicode_markup.encode('UTF-8')
                else:
                    error_msg = "Error message unavailable"
            except:
                error_msg = "Unable to parse error message"

            return action_result.set_status(
                phantom.APP_ERROR, '{}. Error message: {}'.format(
                    ARBORSIGHTLINE_GET_ALERTS_PAGINATION_FAILED_MSG,
                    error_msg))

        # if single-page closure

        # Save checkpoint
        self._state['last_ingested_epoch'] = param[phantom.APP_JSON_END_TIME]
        self.debug_print("Got new checkpoint: {}".format(
            self._state['last_ingested_epoch']))

        return action_result.set_status(phantom.APP_SUCCESS)
Example #11
def detect_encoding(html_content):
    # TODO: make a better version which does not ignore Content-Type
    # http://stackoverflow.com/questions/2686709/encoding-in-python-with-lxml-complex-solution
    from bs4 import UnicodeDammit
    ud = UnicodeDammit(html_content, is_html=True)
    return ud.original_encoding
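A short sketch of calling it on raw bytes read from disk; the file name is illustrative.

with open("page.html", "rb") as f:
    print(detect_encoding(f.read()))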
Example #12
def save_subtitles(video,
                   subtitles,
                   single=False,
                   directory=None,
                   encoding=None,
                   encode_with=None,
                   chmod=None,
                   forced_tag=False,
                   path_decoder=None):
    """Save subtitles on filesystem.

    Subtitles are saved in the order of the list. If a subtitle with a language has already been saved, other subtitles
    with the same language are silently ignored.

    The extension used is `.lang.srt` by default or `.srt` if `single` is `True`, with `lang` being the IETF code for
    the :attr:`~subliminal.subtitle.Subtitle.language` of the subtitle.

    :param video: video of the subtitles.
    :type video: :class:`~subliminal.video.Video`
    :param subtitles: subtitles to save.
    :type subtitles: list of :class:`~subliminal.subtitle.Subtitle`
    :param bool single: save a single subtitle, default is to save one subtitle per language.
    :param str directory: path to directory where to save the subtitles, default is next to the video.
    :param str encoding: encoding in which to save the subtitles, default is to keep original encoding.
    :return: the saved subtitles
    :rtype: list of :class:`~subliminal.subtitle.Subtitle`

    patch: unicode path problems
    """
    saved_subtitles = []
    for subtitle in subtitles:
        # check content
        if subtitle.content is None:
            logger.error('Skipping subtitle %r: no content', subtitle)
            continue

        # check language
        if subtitle.language in set(s.language for s in saved_subtitles):
            logger.debug('Skipping subtitle %r: language already saved',
                         subtitle)
            continue

        # create subtitle path
        subtitle_path = get_subtitle_path(
            video.name,
            None if single else subtitle.language,
            forced_tag=forced_tag)
        if directory is not None:
            subtitle_path = os.path.join(directory,
                                         os.path.split(subtitle_path)[1])

        if path_decoder:
            subtitle_path = path_decoder(subtitle_path)

        # force unicode
        subtitle_path = UnicodeDammit(subtitle_path).unicode_markup

        subtitle.storage_path = subtitle_path

        # save content as is or in the specified encoding
        logger.info('Saving %r to %r', subtitle, subtitle_path)
        has_encoder = callable(encode_with)

        if has_encoder:
            logger.info('Using encoder %s' % encode_with.__name__)

        # save normalized subtitle if encoder or no encoding is given
        if has_encoder or encoding is None:
            content = encode_with(
                subtitle.text) if has_encoder else subtitle.content
            with io.open(subtitle_path, 'wb') as f:
                f.write(content)

            # change chmod if requested
            if chmod:
                os.chmod(subtitle_path, chmod)

            if single:
                break
            continue

        # save subtitle if encoding given
        if encoding is not None:
            with io.open(subtitle_path, 'w', encoding=encoding) as f:
                f.write(subtitle.text)

        # change chmod if requested
        if chmod:
            os.chmod(subtitle_path, chmod)

        saved_subtitles.append(subtitle)

        # check single
        if single:
            break

    return saved_subtitles
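A hedged call sketch: scan_video, download_best_subtitles and Language are part of subliminal's and babelfish's public APIs, but this patched save_subtitles lives in a fork, so the exact import paths may differ and the media path is hypothetical.

from babelfish import Language
from subliminal import region, scan_video, download_best_subtitles

region.configure("dogpile.cache.memory")                    # in-memory provider cache
video = scan_video("/movies/Example.Movie.2016.mkv")        # hypothetical path
best = download_best_subtitles({video}, {Language("eng")})
saved = save_subtitles(video, best[video], directory="/movies/subs")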
Example #13
artist = input("Enter the artist name: ")
#calculating time taken in searching lyrics
a = datetime.datetime.now()
url_data = stripper(song, artist)  # generate url path using stripper()
url = 'https://genius.com/{}-lyrics'.format(
    url_data)  # format the url with the url path
page = requests.get(url)
if page.status_code != 200:
    url_data = requests.get('https://aadibajpai.pythonanywhere.com/stripper',
                            data={
                                'song': song,
                                'artist': artist
                            }).text
    url = 'https://genius.com/{}-lyrics'.format(url_data)
    page = requests.get(url)
html = BeautifulSoup(page.text, "html.parser")
# TODO: Add error handling
lyrics_path = html.find(
    "div", class_="lyrics")  # finding div on Genius containing the lyrics
if lyrics_path is None:

    lyrics = 'Couldn\'t get lyrics for {song} by {artist}.\n'.format(
        song=song, artist=artist)

else:
    lyrics = UnicodeDammit(lyrics_path.get_text().strip()).unicode_markup
print(lyrics)
b = datetime.datetime.now()
delta = b - a
print('Time taken in millisecond: ', delta.total_seconds() * 1000)
Example #14
def _toUnicode(value):
    """Convert an unknown string to unicode
    """
    if not isinstance(value, unicode):
        value = UnicodeDammit(value).unicode_markup
    return value
Example #15
def save_subtitles(video,
                   subtitles,
                   single=False,
                   directory=None,
                   chmod=None,
                   formats=("srt", ),
                   forced_tag=False,
                   path_decoder=None,
                   debug_mods=False):
    """Save subtitles on filesystem.

    Subtitles are saved in the order of the list. If a subtitle with a language has already been saved, other subtitles
    with the same language are silently ignored.

    The extension used is `.lang.srt` by default or `.srt` if `single` is `True`, with `lang` being the IETF code for
    the :attr:`~subliminal.subtitle.Subtitle.language` of the subtitle.

    :param formats: list of "srt" and "vtt"
    :param video: video of the subtitles.
    :type video: :class:`~subliminal.video.Video`
    :param subtitles: subtitles to save.
    :type subtitles: list of :class:`~subliminal.subtitle.Subtitle`
    :param bool single: save a single subtitle, default is to save one subtitle per language.
    :param str directory: path to directory where to save the subtitles, default is next to the video.
    :return: the saved subtitles
    :rtype: list of :class:`~subliminal.subtitle.Subtitle`

    patch: unicode path problems
    """

    logger.debug("Subtitle formats requested: %r", formats)

    saved_subtitles = []
    for subtitle in subtitles:
        # check content
        if subtitle.content is None:
            logger.error('Skipping subtitle %r: no content', subtitle)
            continue

        # check language
        if subtitle.language in set(s.language for s in saved_subtitles):
            logger.debug('Skipping subtitle %r: language already saved',
                         subtitle)
            continue

        # create subtitle path
        subtitle_path = get_subtitle_path(
            video.name,
            None if single else subtitle.language,
            forced_tag=forced_tag)
        if directory is not None:
            subtitle_path = os.path.join(directory,
                                         os.path.split(subtitle_path)[1])

        if path_decoder:
            subtitle_path = path_decoder(subtitle_path)

        # force unicode
        subtitle_path = UnicodeDammit(subtitle_path).unicode_markup

        subtitle.storage_path = subtitle_path

        for format in formats:
            if format != "srt":
                subtitle_path = os.path.splitext(subtitle_path)[0] + (u".%s" %
                                                                      format)

            logger.debug(u"Saving %r to %r", subtitle, subtitle_path)
            content = subtitle.get_modified_content(format=format)
            if content:
                with open(subtitle_path, 'w') as f:
                    f.write(content)
            else:
                logger.error(
                    u"Something went wrong when getting modified subtitle for %s",
                    subtitle)

        # change chmod if requested
        if chmod:
            os.chmod(subtitle_path, chmod)

        saved_subtitles.append(subtitle)

        # check single
        if single:
            break

    return saved_subtitles
Example #16
 def handle_text(self):
     '''
     Takes care of converting body text to unicode, if it's text at all.
     Sets self.original_encoding to original char encoding, and converts body
     to unicode if possible. Must come after handle_compression, and after
     self.mediaType is valid.
     '''
     self.encoding = None
     # if the body is text
     if (self.mediaType and (self.mediaType.type == 'text' or
                             (self.mediaType.type == 'application'
                              and 'xml' in self.mediaType.subtype))):
         # if there was a charset parameter in HTTP header, store it
         if 'charset' in self.mediaType.params:
             override_encodings = [self.mediaType.params['charset']]
         else:
             override_encodings = []
         # if there even is data (otherwise,
         # dammit.originalEncoding might be None)
         if self.body != '':
             if UnicodeDammit:
                 # honestly, I don't mind not abiding by RFC 2023.
                 # UnicodeDammit just does what makes sense, and if the
                 # content is remotely standards-compliant, it will do the
                 # right thing.
                 dammit = UnicodeDammit(self.body, override_encodings)
                 self.text = dammit.unicode_markup
                 self.originalEncoding = dammit.original_encoding
                 # if unicode was found
                 #if dammit.unicode:
                 #    self.text = dammit.unicode
                 #    self.originalEncoding = dammit.originalEncoding
                 #else:
                 #    # unicode could not be decoded, at all
                 #    # HAR can't write data, but body might still
                 #    # be useful as-is
                 #    pass
             else:
                 # try the stupid version, just guess content-type or utf-8
                 u = None
                 # try our list of encodings + utf8 with strict errors
                 for e in override_encodings + ['utf8', 'iso-8859-1']:
                     try:
                         u = self.body.decode(e, 'strict')
                         self.originalEncoding = e
                         break  # if ^^ didn't throw, we're done
                     except UnicodeError:
                         pass
                 # if none of those worked, try utf8
                 # with 'replace' error mode
                 if not u:
                     # unicode has failed
                     u = self.body.decode('utf8', 'replace')
                     self.originalEncoding = None  # ???
                 self.text = u or None
     else:
         # body is not text
         # base64 encode it and set self.encoding
         # TODO: check with list that this is right
         self.text = b64encode(self.body)
         self.encoding = 'base64'
Example #17
 def beautify(self, data, charset):
     dammit = UnicodeDammit(data, [charset, "utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
     data = dammit.unicode_markup
     return data
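A small sketch of what the override-encoding list buys, assuming a GBK-encoded byte string.

from bs4 import UnicodeDammit

raw = u"编码测试".encode("gbk")
dammit = UnicodeDammit(raw, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
print(dammit.unicode_markup)       # decoded text
print(dammit.original_encoding)    # reported as one of the GB encodings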
Example #18
def parse(url, path, name):
    #init variables
    spaces_regex = re.compile("^(\s*).*")
    location_regex = re.compile("^\s*(INT\.|EXT\.)")

    BLOCK_TYPES = [
        'character', 'speech', 'stage direction', 'location', 'unknown'
    ]
    CHARACTER = 0
    SPEECH = 1
    DIRECTIONS = 2
    LOCATION = 3

    time_start = time.time()

    if url.endswith('.pdf'):
        print('The file @ %s is a PDF' % (url))
        return

    script_text, soup = get_script(url)
    #write raw file:
    if not os.path.exists(path + 'raw/'):
        os.makedirs(path + 'raw/')
    with open(path + 'raw/' + "%s.txt" % name, "w") as text_file:
        text_file.write(str(script_text))
    #####

    space_vector, character_presence = white_space_analysis(script_text, soup)
    usual_spaces, flag = identify_usual_spaces(space_vector,
                                               character_presence)

    # Define the variables that will be filled with text
    is_intro = True
    movie_script = []
    intro = []
    last_line_type = -1
    last_character = 'unknown'
    text = []
    characters = []

    for block in script_text.descendants:
        # If block is an instance of bs4.Tag, it is wrapped in HTML tags.
        # The next block will contain the same text without the tags,
        # so we skip this block without parsing it.
        if (isinstance(block, Tag)):
            continue

        # UnicodeDammit converts any string to UTF-8
        # does not work so well
        block = UnicodeDammit(block, soup.original_encoding).unicode_markup
        # remove leading and ending end of lines
        block = block.strip('\n').strip('\n\r')

        # if the block doesn't have any text, skip it
        if (re.search('\w', block) == None):
            continue

        for line in block.split('\n'):
            stripped_line = line.strip(' \n\t\r')
            if (re.search('\w', line) == None):
                continue
            # Counting the number of spaces at the beginning of the line
            spmatch = spaces_regex.search(line)
            space_vector.append(len(spmatch.group(1)))
            #print(block)
            #print(line)
            #print(len(spmatch.group(1)))
            line_type = get_line_type(line, stripped_line, usual_spaces)
            #print(line_type)
            #print(line)

            if (last_line_type == -1  # -1 = not initialized
                    or last_line_type == line_type):
                text.append(stripped_line)
            else:
                if (last_line_type == CHARACTER):
                    last_character = '\n'.join(
                        text
                    )  # regex to suppress (parentheses) & replicate speaker
                    if not last_character in characters:
                        characters.append(last_character)
                elif (last_line_type == SPEECH):
                    movie_script.append({
                        'type': BLOCK_TYPES[last_line_type],
                        BLOCK_TYPES[CHARACTER]: last_character,
                        'text': '\n'.join(text)
                    })
                    #print('We just parsed this JSON block:')
                    #print(movie_script[-1])
                else:
                    movie_script.append({
                        'type': BLOCK_TYPES[last_line_type],
                        'text': '\n'.join(text)
                    })
                    #print('We just parsed this JSON block:')
                    #print(movie_script[-1])
                text = [stripped_line]

            last_line_type = line_type
            #print('----------------')

    result = json_normalize(movie_script)
    if flag:
        write_csv(result, name, path)
        print('      Done parsing script at %s in %s' %
              (url, time.time() - time_start))
        print('-----------------')
        return (result)
    else:
        path = path + 'doubtful/'
        write_csv(result, name, path)
        print('      Done parsing script at %s in %s' %
              (url, time.time() - time_start))
        print('-----------------')
        return (result)
Example #19

if __name__ == "__main__":
    file_list = os.listdir()
    if file_list.count('chs') == 0:
        os.mkdir('./chs')
    file_list = [
        x for x in file_list if os.path.isfile(x) and
        (os.path.splitext(x)[1] == '.ass' or os.path.splitext(x)[1] == '.srt')
    ]
    print(file_list)

    for i in file_list:
        # read the file in binary mode
        with open(i, 'rb') as b:
            buf = b.read()
        # result = chardet.detect(buf)
        # detect the encoding
        result2 = UnicodeDammit(buf)
        print('Encoding: ', result2.original_encoding)
        with open(i, 'r', encoding=result2.original_encoding,
                  errors='ignore') as text:
            text_all = text.read()
            #print(text_all)
        with open(os.path.join('./chs', i),
                  'w',
                  encoding=result2.original_encoding,
                  errors='ignore') as text:
            text.write(Traditional2Simplified(text_all))
        print(i, ' is converted')
    print('Success!')
Example #20
# -*- coding:utf-8 -*-
from const_var import *
import downloader
import html2text
from bs4 import UnicodeDammit


def html_to_txt(html):
    h = html2text.HTML2Text()
    h.ignore_links = True
    return h.handle(html)
    pass


# ---|---|---|---
if __name__ == '__main__':
    url = "/webfile/jiaozuo/cgxx/jggg/webinfo/2017/07/1498553692194150.htm"
    # url = "/webfile/luoyang/zgxx/jggg/webinfo/2017/07/1498553690093831.htm"
    url = "%s%s" % (mainHTTP, url)
    html = downloader.Downloader()(url)
    dommit = UnicodeDammit(html)
    text = html_to_txt(dommit.unicode_markup)
    if re.search(r'-+\|+', text, re.DOTALL):
        print True
    # print text11
Example #21
def decode(str_, is_html=False, errors='strict'):
    if isinstance(str_, unicode):
        return str_
    if isinstance(str_, str):
        return UnicodeDammit(str_, ['utf-8'], is_html=is_html).unicode_markup
    return unicode(str_, 'utf-8', errors)
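The helper relies on the Python 2 `unicode` builtin; a usage sketch under that assumption.

# Python 2 sketch: UTF-8 byte strings go through UnicodeDammit,
# unicode objects pass straight through unchanged.
assert decode(b"Sacr\xc3\xa9 bleu!") == u"Sacr\xe9 bleu!"
assert decode(u"d\xe9j\xe0 vu") == u"d\xe9j\xe0 vu"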
Example #22
File: utils.py  Project: jzbjyb/rri_match
def load_from_html(filename,
                   use_boilerpipe=True,
                   use_nltk=True,
                   use_regex=True,
                   binary=False,
                   field=['title', 'body']):
    if binary:
        charset = UnicodeDammit(open(filename, 'rb').read())
        charset = charset.original_encoding
        try:
            content = open(filename, 'r', encoding=charset).read()
        except Exception as e:
            # if has error, return empty results
            logging.warn('encode error: {}, {}'.format(filename, e))
            return {'title': [], 'body': []}
    else:
        content = open(filename, 'r', encoding='utf-8').read()
    start = time.time()
    if not use_regex or not use_boilerpipe:
        bs = BeautifulSoup(content, 'html.parser')
    if 'title' in field:
        if use_regex:
            match = re.search(r'<title.*?>(.+?)</title>', content[:5000],
                              re.DOTALL | re.IGNORECASE)
            title = match.group(1) if match else ''
            title = html.unescape(title).strip()
        else:
            if bs.title != None and bs.title.string != None:
                title = bs.title.string.strip()
            else:
                title = ''
    t1 = time.time() - start
    start = time.time()
    if 'body' in field:
        if use_boilerpipe:
            extractor = Extractor(extractor='ArticleExtractor',
                                  html=content)  # time consuming
            body = extractor.getText()
        else:
            body = bs.select('body')
            if len(body) <= 0:
                body = bs
            else:
                body = body[0]
            # remove all useless label
            [x.extract() for x in body.findAll('script')]
            [x.extract() for x in body.findAll('style')]
            [x.extract() for x in body.findAll('meta')]
            [x.extract() for x in body.findAll('link')]
            body = body.text
    t2 = time.time() - start
    start = time.time()
    result = {}
    if 'title' in field:
        result['title'] = my_word_tokenize(title) if use_nltk else clean_text(
            title).split(' ')
    if 'body' in field:
        result['body'] = my_word_tokenize(body) if use_nltk else clean_text(
            body).split(' ')
    t3 = time.time() - start
    #print('{}\t{}\t{}'.format(t1, t2, t3))
    return result
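A call sketch with the heavier dependencies switched off; my_word_tokenize/clean_text and the boilerpipe Extractor are project helpers assumed to exist alongside this function, and the file name is hypothetical.

doc = load_from_html("example.html", use_boilerpipe=False, use_nltk=False)
print(len(doc["title"]), len(doc["body"]))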
Example #23
    def importFitFromFiles(paths, iportuser=None):
        """
        Imports fits from file(s). First processes all provided paths and stores
        assembled fits into a list. This allows us to call back to the GUI as
        fits are processed as well as when fits are being saved.
        Returns a (success, fits or error message) tuple.
        """

        sFit = svcFit.getInstance()

        fit_list = []
        try:
            for path in paths:
                if iportuser:  # Pulse
                    msg = "Processing file:\n%s" % path
                    pyfalog.debug(msg)
                    processing_notify(
                        iportuser,
                        IPortUser.PROCESS_IMPORT | IPortUser.ID_UPDATE, msg)
                    # wx.CallAfter(callback, 1, msg)

                with open(path, "rb") as file_:
                    srcString = file_.read()
                    dammit = UnicodeDammit(srcString)
                    srcString = dammit.unicode_markup

                if len(srcString) == 0:  # ignore blank files
                    pyfalog.debug("File is blank.")
                    continue

                try:
                    _, fitsImport = Port.importAuto(srcString,
                                                    path,
                                                    iportuser=iportuser)
                    fit_list += fitsImport
                except xml.parsers.expat.ExpatError:
                    pyfalog.warning("Malformed XML in:\n{0}", path)
                    return False, "Malformed XML in %s" % path

            # IDs = []  # NOTE: what use for IDs?
            numFits = len(fit_list)
            for idx, fit in enumerate(fit_list):
                # Set some more fit attributes and save
                fit.character = sFit.character
                fit.damagePattern = sFit.pattern
                fit.targetResists = sFit.targetResists
                if len(fit.implants) > 0:
                    fit.implantLocation = ImplantLocation.FIT
                else:
                    useCharImplants = sFit.serviceFittingOptions[
                        "useCharacterImplantsByDefault"]
                    fit.implantLocation = ImplantLocation.CHARACTER if useCharImplants else ImplantLocation.FIT
                db.save(fit)
                # IDs.append(fit.ID)
                if iportuser:  # Pulse
                    pyfalog.debug(
                        "Processing complete, saving fits to database: {0}/{1}",
                        idx + 1, numFits)
                    processing_notify(
                        iportuser,
                        IPortUser.PROCESS_IMPORT | IPortUser.ID_UPDATE,
                        "Processing complete, saving fits to database\n(%d/%d) %s"
                        % (idx + 1, numFits, fit.ship.name))

        except UserCancelException:
            return False, "Processing has been canceled.\n"
        except Exception as e:
            pyfalog.critical("Unknown exception processing: {0}", path)
            pyfalog.critical(e)
            # TypeError: not all arguments converted during string formatting
            #                 return False, "Unknown Error while processing {0}" % path
            return False, "Unknown error while processing %s\n\n Error: %s" % (
                path, e.message)

        return True, fit_list
Example #24
 def feed(self, data):
     sgmllib.SGMLParser.feed(self, UnicodeDammit(data).unicode_markup)
Example #25
    def guess_encoding(self):
        """Guess encoding using the language, falling back on chardet.

        :return: the guessed encoding.
        :rtype: str

        """
        if self._guessed_encoding:
            return self._guessed_encoding

        logger.info('Guessing encoding for language %s', self.language)

        encodings = ['utf-8']

        # add language-specific encodings
        # http://scratchpad.wikia.com/wiki/Character_Encoding_Recommendation_for_Languages

        if self.language.alpha3 == 'zho':
            encodings.extend(
                ['cp936', 'gb2312', 'cp950', 'gb18030', 'big5', 'big5hkscs'])
        elif self.language.alpha3 == 'jpn':
            encodings.extend([
                'shift-jis',
                'cp932',
                'euc_jp',
                'iso2022_jp',
                'iso2022_jp_1',
                'iso2022_jp_2',
                'iso2022_jp_2004',
                'iso2022_jp_3',
                'iso2022_jp_ext',
            ])
        elif self.language.alpha3 == 'tha':
            encodings.extend(['tis-620', 'cp874'])

        # arabian/farsi
        elif self.language.alpha3 in ('ara', 'fas', 'per'):
            encodings.append('windows-1256')
        elif self.language.alpha3 == 'heb':
            encodings.extend(['windows-1255', 'iso-8859-8'])
        elif self.language.alpha3 == 'tur':
            encodings.extend(['windows-1254', 'iso-8859-9', 'iso-8859-3'])

        # Greek
        elif self.language.alpha3 in ('grc', 'gre', 'ell'):
            encodings.extend([
                'windows-1253', 'cp1253', 'cp737', 'iso8859-7', 'cp875',
                'cp869', 'iso2022_jp_2', 'mac_greek'
            ])

        # Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian, Serbian (Latin script),
        # Romanian and Albanian
        elif self.language.alpha3 in ('pol', 'cze', 'ces', 'slk', 'slo', 'slv',
                                      'hun', 'bos', 'hbs', 'hrv', 'rsb', 'ron',
                                      'rum', 'sqi', 'alb'):

            encodings.extend(['windows-1250', 'iso-8859-2'])

            # Eastern European Group 1
            if self.language.alpha3 == "slv":
                encodings.append('iso-8859-4')

            # Albanian
            elif self.language.alpha3 in ("sqi", "alb"):
                encodings.extend([
                    'windows-1252', 'iso-8859-15', 'iso-8859-1', 'iso-8859-9'
                ])

        # Bulgarian, Serbian, Macedonian, Ukrainian and Russian
        elif self.language.alpha3 in ('bul', 'srp', 'mkd', 'mac', 'rus',
                                      'ukr'):
            # Eastern European Group 2
            if self.language.alpha3 in ('bul', 'mkd', 'mac', 'rus', 'ukr'):
                encodings.extend(['windows-1251', 'iso-8859-5'])

            elif self.language.alpha3 == 'srp':
                if self.language.script == "Latn":
                    encodings.extend(['windows-1250', 'iso-8859-2'])
                elif self.language.script == "Cyrl":
                    encodings.extend(['windows-1251', 'iso-8859-5'])
                else:
                    encodings.extend([
                        'windows-1250', 'windows-1251', 'iso-8859-2',
                        'iso-8859-5'
                    ])

        else:
            # Western European (windows-1252) / Northern European
            encodings.extend([
                'latin-1', 'iso-8859-15', 'iso-8859-9', 'iso-8859-4',
                'iso-8859-1'
            ])

        # try to decode
        logger.debug('Trying encodings %r', encodings)
        for encoding in encodings:
            try:
                self.content.decode(encoding)

            except UnicodeDecodeError:
                pass
            else:
                logger.info('Guessed encoding %s', encoding)
                self._guessed_encoding = encoding
                return encoding

        logger.warning('Could not guess encoding from language')

        # fallback on chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        if not encoding:
            # fallback on bs4
            logger.info('Falling back to bs4 detection')
            a = UnicodeDammit(self.content)

            Log.Debug("bs4 detected encoding: %s" % a.original_encoding)

            if a.original_encoding:
                self._guessed_encoding = a.original_encoding
                return a.original_encoding
            raise ValueError(u"Couldn't guess the proper encoding for %s" %
                             self)

        self._guessed_encoding = encoding
        return encoding
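A standalone sketch of the chardet-then-bs4 fallback used at the end of the method, assuming `content` is a byte string.

import chardet
from bs4 import UnicodeDammit

def sniff_encoding(content):
    # Ask chardet first; fall back to bs4's UnicodeDammit if it gives up.
    encoding = chardet.detect(content)["encoding"]
    if not encoding:
        encoding = UnicodeDammit(content).original_encoding
    return encoding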
Example #26
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('-i',
                        '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    parser.add_argument('-l',
                        '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-q',
                        '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line,
                                 ['utf-8', 'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(
                _pair_to_dict_tuple(pair)
                for pair in feat_map_str.strip().split())
            label_map.update(
                _pair_to_dict_tuple(pair)
                for pair in label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    # Iterate through input file and collect the information we need
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=args.label_col,
                                            id_col=args.id_col)
    feature_set = reader.read()
    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[output_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is DelimitedFileWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = args.arff_regression
        writer_args['relation'] = args.arff_relation
    elif writer_type is LibSVMWriter:
        writer_args['label_map'] = label_map
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
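A hypothetical invocation of the converter above; the file names are illustrative.

# Convert a CSV feature file to LibSVM format with the default id/label columns.
main(["train.csv", "train.libsvm", "--quiet"])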
Example #27
    def _parse_alerts(self, action_result, alerts):
        """ Parse alerts to create containers and artifacts """
        alerts_cnt = 0

        # What happens if you do not have alerts returned?
        # data = [] --> returns alerts_cnt = 0
        if alerts.get('data') is None:
            action_result.set_status(
                phantom.APP_ERROR,
                ARBORSIGHTLINE_ALERTS_DATA_KEY_UNAVAILABLE_MSG)
            return action_result.get_status(), None

        try:
            for data in alerts['data']:
                alert_id = data['id']
                target_address = data['attributes']['subobject'][
                    'host_address']
                impact_bps = data['attributes']['subobject']['impact_bps']
                impact_pps = data['attributes']['subobject']['impact_pps']
                victim_router = data['attributes']['subobject'][
                    'impact_boundary']
                classification = data['attributes']['classification']
                description = ""

                for include in alerts['included']:
                    if include['relationships']['parent']['data'][
                            'type'] == 'alert' and include['relationships'][
                                'parent']['data']['id'] == alert_id:
                        description = include['attributes']['text']
                        break

                # Creating container
                c = {
                    'data': {},
                    'description': 'Ingested from Arbor Sightline',
                    'source_data_identifier': alert_id,
                    'name': '{0} {1}'.format(classification, target_address)
                }

                # self.send_progress('Saving container for alert id {0}...'.format(alert_id))
                status, msg, id_ = self.save_container(c)
                # self.save_progress("Container id : {}, {}, {}".format(id_, status, msg))
                if status == phantom.APP_ERROR:
                    action_result.set_status(
                        phantom.APP_ERROR,
                        ARBORSIGHTLINE_CREATE_CONTAINER_FAILED_MSG.format(msg))
                    return action_result.get_status(), None

                # Creating artifacts
                cef = {
                    'targetAddress': target_address,
                    'impactBps': impact_bps,
                    'impactPps': impact_pps,
                    'victimRouter': victim_router,
                    'classification': classification,
                    'description': description
                }
                art = {
                    'container_id': id_,
                    'name': 'Event Artifact',
                    'label': 'event',
                    'source_data_identifier': c['source_data_identifier'],
                    'cef': cef,
                    'run_automation': True
                }

                # self.send_progress('Saving artifact...')
                status, msg, id_ = self.save_artifact(art)
                if status == phantom.APP_ERROR:
                    action_result.set_status(
                        phantom.APP_ERROR,
                        ARBORSIGHTLINE_CREATE_ARTIFACT_FAILED_MSG.format(msg))
                    return action_result.get_status(), None

                alerts_cnt += 1
        except Exception as e:
            try:
                if e.message:
                    error_msg = UnicodeDammit(
                        e.message).unicode_markup.encode('UTF-8')
                else:
                    error_msg = "Error message unavailable"
            except:
                error_msg = "Unable to parse error message"

            action_result.set_status(
                phantom.APP_ERROR, '{}. Error message: {}'.format(
                    ARBORSIGHTLINE_PARSE_ALERTS_FAILED_MSG, error_msg))
            return action_result.get_status(), None

        return phantom.APP_SUCCESS, alerts_cnt
Example #28
            words[w] = [line]

    header = header + '\t' + '\t'.join([
        'F1_man', 'F2_man', 'F3_man', 'plt_code', 'plt_stress', 'plt_word',
        't_man'
    ]) + '\n'

    fw = open(outputFile, 'w')
    fw.write(header)

    plt_lines = open(pltFile, 'rb').readlines()
    skipped_lines = []
    # skip the first two lines since they contain header information
    for plt_line in plt_lines[2:]:
        print(plt_line)
        plt_line = UnicodeDammit(plt_line,
                                 ['utf-8', 'windows-1252']).unicode_markup
        plt_line = unidecode(plt_line)
        plt_line = plt_line.rstrip()
        plt_F1 = plt_line.split(',')[0]
        # a line beginning with '1' is the first line of the vowel means; this
        # signals the end of the vowel token measurements, so we can stop
        # processing the file
        if plt_F1 == '1':
            break

        plt_w_raw = plt_line.split(',')[5].split(' ')[0]
        plt_w = plt_w_raw.upper()
        plt_w = plt_w.replace('(', '')
        plt_w = plt_w.replace(')', '')
        print(plt_w)
        if plt_w not in words:
Example #29
is_intro = True
movie_script = []
intro = []
last_line_type = -1
last_character = ''
text = []
characters=[]


for block in script_text.descendants:
    if(isinstance(block, Tag)):
        continue

    # UnicodeDammit converts any string to UTF-8
    # does not work so well
    block = UnicodeDammit(block, soup.original_encoding).unicode_markup
    # remove leading and ending end of lines
    block = block.strip('\n')

    # if the block doesn't have any text, skip it
    if( re.search('\w', block) == None ):
        continue

    # bs4 does not always split the blocks cleanly,
    # so it is better to re-split by paragraph and process them one by one
    for line in block.split('\n'):
        stripped_line = line.strip(' \n\t\r')
        if( re.search('\w', line) == None ):
            continue

        line_type = get_line_type(line, stripped_line, usual_spaces)
Example #30
print(soup.p.encode('latin-1'))
print(soup.p.encode("utf-8"))

# Characters the target encoding cannot represent are converted to character references
markup = u"<b>\N{SNOWMAN}</b>"
snowman_soup = BeautifulSoup(markup)
tag = snowman_soup.b

print(tag.encode('utf-8'))
print(tag.encode('latin-1'))
print(tag.encode('ascii'))

### Automatic encoding detection
from bs4 import UnicodeDammit

dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")
print(dammit.unicode_markup)
print(dammit.original_encoding)

dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
print(dammit.unicode_markup)
print(dammit.original_encoding)

# Smart quotes: when converting to Unicode, Microsoft smart quotes can be turned into HTML entities
markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>"

print(
    UnicodeDammit(markup, ['windows-1252'],
                  smart_quotes_to='html').unicode_markup)
print(
    UnicodeDammit(markup, ['windows-1252'],