def generate_toc(content):
    """Build a table of contents from the headers of a generated page.

    Skips static content and pages whose ``toc_run`` metadata/setting is not
    ``'true'``. On success stores serialized TOC HTML on ``content.toc`` and
    writes the (id-annotated) body back to ``content._content``.
    """
    if isinstance(content, contents.Static):
        return
    _toc_run = content.metadata.get(
        'toc_run', content.settings['TOC']['TOC_RUN'])
    if _toc_run != 'true':
        return

    all_ids = set()
    tree = node = HtmlTreeNode(None, '', 'h0', '')
    soup = BeautifulSoup(content._content, 'html.parser')
    settoc = False

    try:
        header_re = re.compile(content.metadata.get(
            'toc_headers', content.settings['TOC']['TOC_HEADERS']))
    except re.error as e:
        # BUG FIX: the format string has two %s placeholders but only one
        # argument was supplied, which itself blew up at logging time.
        logger.error("TOC_HEADERS '%s' is not a valid re\n%s",
                     content.settings['TOC']['TOC_HEADERS'], e)
        raise

    for header in soup.findAll(header_re):
        settoc = True
        node, new_header = node.add(header, all_ids)
        header.replaceWith(new_header)  # to get our ids back into soup

    if settoc:
        tree_string = '{}'.format(tree)
        tree_soup = BeautifulSoup(tree_string, 'html.parser')
        content.toc = tree_soup.decode(formatter='html')
        content._content = soup.decode(formatter='html')
def markdown_figure(instance):
    """Wraps img in figure tags, adds figcaption and, if BOOTSTRAPPER_FIGURES
    is set, adds bootstrap attributes to figure tags."""
    if instance._content is None:
        return
    content = instance._content
    soup = BeautifulSoup(content, 'html.parser')
    defaults = []
    default_attribute_key = 'class'
    figure_default = [{'figure': {'class': ['figure']},
                       'figcaption': {'class': ['figure-caption']},
                       'figure > img': {'class': ['figure-img']}}]

    def is_default_set(setting):
        # True when the setting exists and is truthy.
        return setting in instance.settings and bool(instance.settings[setting])

    if is_default_set('MDF_DEFAULT_CONFIG'):
        defaults.extend([instance.settings['MDF_DEFAULT_CONFIG']])
    if is_default_set('BOOTSTRAPPER_FIGURES'):
        defaults.extend(figure_default)

    figurify(soup)
    for default in defaults:
        # BUG FIX: dict.iteritems() was removed in Python 3; items() is
        # equivalent here (the dicts are small, materializing is fine).
        for selector, value in default.items():
            if isinstance(value, dict):
                for attribute_key, attribute_value in default[selector].items():
                    replace_in_with(soup, selector, attribute_key, attribute_value)
            else:
                replace_in_with(soup, selector, default_attribute_key, value)
    instance._content = soup.decode(formatter="html")
def convert_summary(input):
    """Re-serialize *input* HTML using minimal entity formatting.

    Returns the text untouched if it cannot be parsed.
    """
    try:
        parsed = BeautifulSoup(input, "html.parser")
        return parsed.decode(formatter="minimal")
    except HTMLParseError:
        # Unparseable markup: hand the caller back exactly what came in.
        return input
def content_object_init(instance):
    """Expand ``name>rest`` interlink shorthand in href/src attributes."""
    if instance._content is None:
        return
    html = instance._content
    # use Python's built-in parser so no duplicated html & body tags appear,
    # or use tag.unwrap()
    soup = BeautifulSoup(html, "html.parser")
    marker = re.compile("(.+?)>")

    def _expand(tag, attr):
        # Replace the "name>" prefix with its interlinks target, if known.
        value = tag.get(attr)
        prefix = re.search(r"(.+?)>", value).groups()[0]
        if prefix in interlinks:
            tag[attr] = value.replace(prefix + ">", interlinks[prefix])

    if 'a' in html:
        for anchor in soup.find_all(href=marker):
            _expand(anchor, 'href')
    if 'img' in html:
        for image in soup.find_all('img', src=marker):
            _expand(image, 'src')
    instance._content = soup.decode()
def remove_footnotes(content):
    ''' Strip footnote reference links from 'content' '''
    if content is None:
        return None
    soup = BeautifulSoup(content, "lxml")
    # lxml wraps fragments in <html><body>; unwrap both so the fragment
    # round-trips cleanly.
    soup.html.unwrap()
    soup.body.unwrap()
    for footnote in soup.findAll("a", class_="footnote-reference"):
        # Expected layout:
        #   <a href="...">Actual link</a> <a class="footnote-reference">[1]</a>
        # The node just before the reference must be a single-space string.
        spacer = footnote.previous_sibling
        if spacer.string != " ":
            raise Exception(
                "Unexpected HTML surrounding summary footnote reference!")
        # A NavigableString cannot be removed outright — only replaced.
        spacer.replace_with("")
        footnote.decompose()  # remove and deconstruct the footnote link
    return soup.decode()
def extract_toc(content):
    """Pull the reader-generated table of contents out of the rendered body.

    Handles Markdown (`div.toc`), reST (`div.contents.topic`) and Pandoc
    (`nav#TOC`) markup; whichever is found is removed from the body and its
    serialized HTML stored on ``content.toc``.
    """
    if isinstance(content, contents.Static):
        return
    soup = BeautifulSoup(content._content, 'html.parser')
    filename = content.source_path
    extension = path.splitext(filename)[1][1:]
    toc = None
    # if it is a Markdown file
    if extension in readers.MarkdownReader.file_extensions:
        toc = soup.find('div', class_='toc')
        if toc:
            toc.extract()
    # else if it is a reST file
    elif extension in readers.RstReader.file_extensions:
        toc = soup.find('div', class_='contents topic')
        if toc:
            toc.extract()
    if toc:
        # Normalize the container so every reader yields <div class="toc">.
        tag = BeautifulSoup(str(toc))
        tag.div['class'] = 'toc'
        tag.div['id'] = ''
        p = tag.find('p', class_='topic-title first')
        if p:
            p.extract()
        toc = tag
    else:
        # Pandoc reader
        toc = soup.find('nav', id='TOC')
        if toc:
            toc.extract()
    # BUG FIX: toc.decode() previously ran unconditionally and raised
    # AttributeError whenever the document contained no TOC at all.
    if toc:
        content._content = soup.decode()
        content.toc = toc.decode()
def ENMLtoText(contentENML): soup = BeautifulSoup(contentENML.decode('utf-8')) # In ENML, each line in paragraph have <div> tag. for section in soup.find_all('div'): if not section.br: section.append(soup.new_tag("br")) section.unwrap() for section in soup.select('li > p'): section.replace_with( section.contents[0] ) for section in soup.select('li > br'): if section.next_sibling: next_sibling = section.next_sibling.next_sibling if next_sibling: if next_sibling.find('li'): section.extract() else: section.extract() h2t = html2text.HTML2Text() h2t.body_width = 0 content = h2t.handle(soup.decode()) content = re.sub(r' *\n', os.linesep, content) content = content.replace(unichr(160), " ") return content.encode('utf-8')
def nbsp_footnotes(content):
    ''' Replace space between link and footnote with nbsp. '''
    if content is None:
        return None
    soup = BeautifulSoup(content, "lxml")
    # lxml adds <html><body> wrappers around fragments; strip them so the
    # fragment serializes back as a fragment.
    soup.html.unwrap()
    soup.body.unwrap()
    for footnote in soup.findAll("a", class_="footnote-reference"):
        # Expected layout:
        #   <a href="...">Actual link</a> <a class="footnote-reference">[1]</a>
        # The node just before the reference must be a single-space string.
        spacer = footnote.previous_sibling
        if spacer.string != " ":
            raise Exception(
                "Unexpected HTML surrounding summary footnote reference!")
        spacer.replace_with(u'\xa0')  # U+00A0 = nbsp
    return soup.decode()
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
    """Parse *to_parse* and assert it serializes to *compare_parsed_to*.

    When *compare_parsed_to* is omitted, the markup must round-trip
    unchanged through the default builder.
    """
    if compare_parsed_to is None:
        compare_parsed_to = to_parse
    parsed = BeautifulSoup(to_parse, builder=self.default_builder)
    self.assertEqual(parsed.decode(), self.document_for(compare_parsed_to))
def cleanse(html_source, pretty_print=False, formatter='html'):
    """Clean raw HTML: strip classes/artifacts/meta tags, mark section
    breaks with <hr>, and consolidate blank lines.

    Returns '' for None input, otherwise the cleaned unicode markup.
    """
    if html_source is None:
        return ''
    # convert to unicode
    html_unicode = UnicodeDammit(html_source).unicode_markup
    # strip classes
    html_sans_classes = match_classes.subn(r"<\1\2>", html_unicode)[0]
    # strip artifacts
    html_sans_artifacts = match_artifacts.subn(r"", html_sans_classes)[0]
    # insert <hr> tags at section breaks
    html_with_section_breaks = match_section_breaks.subn(
        r'<hr class="section-break" />\n',
        p_corrector.subn(r'<p></p>', html_sans_artifacts)[0])[0]
    # strip metatags from header
    soup_sans_metatags = BeautifulSoup(html_with_section_breaks)
    # IDIOM FIX: a plain loop instead of a throwaway list comprehension
    # (previously built a list purely for side effects, then del'd it).
    for meta in soup_sans_metatags.findAll(name='meta'):
        meta.replace_with(u'')
    # decode most html entities
    html_sans_most_entities = soup_sans_metatags.decode(
        pretty_print=pretty_print, formatter=formatter)
    # consolidate multiple empty lines and return unicode
    return match_multilines.subn(r"\n\n", html_sans_most_entities)[0]
def wrap_image_tags(p):
    """ Wrap image tags in links to add Lightbox support

    Any image tag in the content with class={LBPREFIX}-{SETNAME} will be
    wrapped with an anchored href with Lightbox support. `LBPREFIX` is
    defined in the settings file as `LIGHTBOX_PREFIX` with a default of
    `'lb-'`.

    :param p: pelican instance
    :return: None
    """
    lbprefix = p.settings.get('LIGHTBOX_PREFIX', 'lb-')
    lbset = p.settings.get('LIGHTBOX_SET', 'images')
    if p._content is None:
        return
    content = p._content
    soup = BeautifulSoup(content)
    # Wrap each image tag in an anchor with a link. Add the
    # attribute for the lightbox set to activate.
    if 'img' in content:
        for tag in soup('img'):
            # Skip if no class tag
            if not tag.has_attr('class'):
                continue
            for c in tag['class']:
                substr = c.split(lbprefix, 1)
                # If the first element of the split is empty then the prefix
                # is at the start of the string c. We also must check that
                # c is not empty.
                if c and not substr[0]:
                    # BUG FIX: the original referenced an undefined name
                    # (lbgallery) when no set name followed the prefix, and
                    # never used the computed gallery. Fall back to the
                    # configured LIGHTBOX_SET instead.
                    gallery = substr[1] if substr[1] else lbset
                    link_wrapper = soup.new_tag("a", href=tag['src'])
                    # data-lightbox must be set via item assignment; as a
                    # keyword argument the '-' parses as a minus sign.
                    link_wrapper['data-lightbox'] = gallery
                    # Set the title (ie: lightbox caption) to the alt-text
                    if tag.has_attr('alt'):
                        link_wrapper['title'] = tag['alt']
                    # Set the title attribute as a caption, if the image is
                    # wrapped in a figure
                    fig = tag.find_parent('div', 'figure')
                    if fig:
                        caption = fig.findChild('p', 'caption')
                        if caption:
                            link_wrapper['title'] = caption.get_text()
                    tag.wrap(link_wrapper)
                    break  # So we only use the first class specified
    p._content = soup.decode()
def performOPFSourceUpdates(data, currentdir, keylist, valuelist):
    """Rewrite relative href attributes in OPF <item>/<reference>/<site>
    tags according to the keylist->valuelist rename mapping.

    :param data: serialized OPF XML
    :param currentdir: directory the hrefs are relative to
    :param keylist: old book-relative paths
    :param valuelist: corresponding new attribute values
    :return: the updated, pretty-printed XML
    """
    # IDIOM FIX: rebuild the serialized lookup dictionary with zip instead
    # of an index loop.
    updates = dict(zip(keylist, valuelist))
    xmlbuilder = LXMLTreeBuilderForXML(parser=None,
                                       empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    for tag in soup.find_all(["item", "reference", "site"]):
        if "href" in tag.attrs:
            href = tag["href"]
            # A ':' indicates a scheme — skip absolute URLs.
            if href.find(":") == -1:
                parts = href.split('#')
                url = parts[0]
                fragment = parts[1] if len(parts) > 1 else ""
                bookrelpath = os.path.join(currentdir, unquoteurl(url))
                bookrelpath = os.path.normpath(bookrelpath)
                bookrelpath = bookrelpath.replace(os.sep, "/")
                if bookrelpath in updates:
                    attribute_value = updates[bookrelpath]
                    if fragment != "":
                        attribute_value = attribute_value + "#" + fragment
                    tag["href"] = quoteurl(attribute_value)
    return soup.decode(pretty_print=True, formatter='minimal')
def grab_and_parse_results(target_url):
    """Fetch *target_url* and return the Japanese sentences found in it."""
    # Grab the HTML from the given url address
    response = urllib.request.urlopen(target_url)
    html_soup = BeautifulSoup(response)
    # Sentences: a CJK start character, a CJK/alphanumeric body, terminated
    # by a closing bracket or CJK sentence punctuation.
    pattern = (u'[\u3030-\u9FAF][\u3030-\u9FAF0-9A-Za-z\u3001\u2026 ]+'
               u'[\u300D|\u3002|\uFF01|\uFF1F|\u2026]+')
    matches = re.findall(pattern, html_soup.decode('utf-8-sig'))
    # findall already yields a list; duplicates are kept in document order.
    return list(matches)
def run(self, text):
    """Split *text* into content <div>s interleaved with "incut" wrapper divs.

    Top-level tags whose name is in ``self.incut_tags`` (or whose single
    child is such a tag) are pulled out into their own wrapper div; runs of
    ordinary tags are collected into content divs in between.
    """
    soup = BeautifulSoup(text, 'html.parser')
    new_soup = BeautifulSoup()
    # Accumulator for consecutive non-incut tags.
    content = new_soup.new_tag('div', **{'class': self.content_class})
    for tag in soup.children:
        if isinstance(tag, NavigableString):
            # Bare text between top-level tags is dropped.
            continue
        # Unwrap a lone incut nested directly inside a plain wrapper tag.
        if tag.name not in self.incut_tags and len(tag.contents) == 1 and tag.contents[0].name in self.incut_tags:
            tag = tag.contents[0]
        if tag.name in self.incut_tags:
            # Flush accumulated content before emitting the incut wrapper.
            if len(content):
                new_soup.append(content)
                content = new_soup.new_tag('div', **{'class': self.content_class})
            klass = self.incut_class
            if tag.name == 'iframe':
                # Videos get an extra class on the wrapper.
                klass += ' ' + self.incut_video_class
            incut = soup.new_tag('div', **{'class': klass})
            incut.append(tag)
            new_soup.append(incut)
        else:
            content.append(tag)
    # Flush any trailing content run.
    if len(content):
        new_soup.append(content)
    return new_soup.decode()
def fix_urls(document, base_url):
    """Absolutize root-relative anchor hrefs by prefixing *base_url*.

    :param document: HTML text to rewrite
    :param base_url: origin to prepend to hrefs that start with '/'
    :return: the rewritten markup
    """
    soup = Soup(document)
    for tag in soup('a'):
        # BUG FIX: anchors without an href raised KeyError; skip them.
        href = tag.get('href')
        if href and href.startswith('/'):
            tag['href'] = base_url + href
    return soup.decode()
def content_object_init(instance):
    """Inline explicit pixel-width styling into each article image."""
    if instance._content is None:
        return
    content = instance._content
    soup = BeautifulSoup(content)
    if "img" in content:
        for img in soup("img"):
            # TODO: Pretty sure this isn't the right way to do this, too hard coded.
            # There must be a setting that I should be using?
            src = instance.settings["PATH"] + "/images/" + os.path.split(img["src"])[1]
            natural_width = Image.open(src).size[0]
            extra_style = "width: {}px; height: auto;".format(natural_width)
            if instance.settings["RESPONSIVE_IMAGES"]:
                extra_style += " max-width: 100%;"
            img["style"] = img["style"] + extra_style if img.get("style") else extra_style
            # Drop alt text that merely repeats the src.
            if img["alt"] == img["src"]:
                img["alt"] = ""
            fig = img.find_parent("div", "figure")
            if fig:
                fig["style"] = fig["style"] + extra_style if fig.get("style") else extra_style
    instance._content = soup.decode()
def harvest_images_in_fragment(fragment, settings):
    """Apply the IMAGE_PROCESS derivative named in each img's
    ``image-process-<name>`` class to that image.

    Dispatches per derivative spec: a list or type 'image' processes the tag
    in place, 'responsive-image' builds a srcset, and 'picture' rewrites the
    enclosing <div>/<picture> group. Returns the (possibly rewritten)
    fragment text.
    """
    fragment_changed = False
    soup = BeautifulSoup(fragment)
    for img in soup.find_all('img', class_=re.compile("image-process-[-a-zA-Z0-9_]+")):
        for c in img['class']:
            match = re.search(r"image-process-([-a-zA-Z0-9_]+)", c)
            if match is not None:
                derivative = match.group(1)
                if derivative not in settings['IMAGE_PROCESS']:
                    raise RuntimeError('Derivative %s undefined.' % (derivative,))
                if isinstance(settings['IMAGE_PROCESS'][derivative], dict) and \
                   'type' not in settings['IMAGE_PROCESS'][derivative]:
                    raise RuntimeError('"type" is mandatory for %s.' % derivative)
                if isinstance(settings['IMAGE_PROCESS'][derivative], list) or \
                   (isinstance(settings['IMAGE_PROCESS'][derivative], dict) and
                    settings['IMAGE_PROCESS'][derivative]['type'] == 'image'):
                    # Single source image specification.
                    process_img_tag(img, settings, derivative)
                    fragment_changed = True
                elif isinstance(settings['IMAGE_PROCESS'][derivative], dict) and \
                     settings['IMAGE_PROCESS'][derivative]['type'] == 'responsive-image':
                    # srcset image specification.
                    build_srcset(img, settings, derivative)
                    fragment_changed = True
                elif isinstance(settings['IMAGE_PROCESS'][derivative], dict) and \
                     settings['IMAGE_PROCESS'][derivative]['type'] == 'picture':
                    # Multiple source (picture) specification.
                    group = img.find_parent()
                    if group.name == 'div':
                        convert_div_to_picture_tag(soup, img, group, settings, derivative)
                    elif group.name == 'picture':
                        process_picture(soup, img, group, settings, derivative)
                    fragment_changed = True
                break  # for c in img['class']
    if fragment_changed:
        # In Python 2, BeautifulSoup put our fragment inside html and
        # body tags, but in Python 3, it does not (maybe because it is
        # not using the same HTML parser).
        body = soup.find('body')
        if body:
            new_fragment = '';
            for element in body.children:
                new_fragment += element.decode()
        else:
            new_fragment = soup.decode()
    else:
        # Untouched: return the caller's original text verbatim.
        new_fragment = fragment
    return new_fragment
def content_object_init(instance):
    """Group linked article images into a single per-page lightbox set."""
    if instance._content is None:
        return
    content = instance._content
    soup = BeautifulSoup(content)
    if 'img' in content:
        for img in soup('img'):
            logger.debug('PATH: %s', instance.settings['PATH'])
            logger.debug('img.src: %s', img['src'])
            img_path, img_filename = path.split(img['src'])
            logger.debug('img_path: %s', img_path)
            logger.debug('img_fname: %s', img_filename)
            # All images on the same page are combined into a set
            lightbox_style = 'image'
            anchor = img.find_parent('a')
            if anchor and not anchor.get('data-lightbox'):
                anchor['data-lightbox'] = lightbox_style
                anchor['data-title'] = img_filename
    instance._content = soup.decode()
def process_summary(article):
    """ Ensures summaries are not cut off. Also inserts mathjax script so
    that math will be rendered."""
    summary = article.summary
    summary_parsed = BeautifulSoup(summary, 'html.parser')
    math = summary_parsed.find_all(class_='math')
    if len(math) > 0:
        last_math_text = math[-1].get_text()
        # A trailing '...' means the last formula was truncated by the
        # summary cut; restore the full formula from the article body.
        if len(last_math_text) > 3 and last_math_text[-3:] == '...':
            content_parsed = BeautifulSoup(article._content, 'html.parser')
            full_text = content_parsed.find_all(
                class_='math'
            )[len(math)-1].get_text()
            math[-1].string = "%s ..." % full_text
            summary = summary_parsed.decode()
        # clear memoization cache so the modified summary is actually served
        import functools
        if isinstance(article.get_summary, functools.partial):
            memoize_instance = article.get_summary.func.__self__
            memoize_instance.cache.clear()
        # Append the mathjax loader script so the formulas render.
        article._summary = (
            "{}<script type='text/javascript'>"
            "{}"
            "</script>"
        ).format(
            summary,
            process_summary.mathjax_script
        )
def parse_images(instance):
    """Annotate article images with explicit dimensions and (optionally)
    lazyload.js data-* attributes.

    Resolves each img src via the MY_IMG_URL2PATH_FUNC setting, reads the
    image dimensions (downloading remote images to a temp file), and writes
    the modified markup back to ``instance._content``.
    """
    if instance._content is None or 'img' not in instance._content:
        return
    content = instance._content[:]
    soup = BeautifulSoup(content, "html.parser")
    for img in soup('img'):
        # Build the source image filename
        my_url2path_func = instance.settings['MY_IMG_URL2PATH_FUNC']
        if not my_url2path_func:
            logger.error('Error: MY_IMG_URL2PATH_FUNC not defined in your pelican configuration.\n\
niux2_lazyload_helper cannot determine the image path from its url.\n')
            return
        imgPath, new_src = my_url2path_func(img['src'])
        if not new_src.startswith('http') and not (path.isfile(imgPath) and access(imgPath, R_OK)):
            logger.error('Error: image file not found: {}'.format(imgPath))
            continue
        img['src'] = new_src
        # Open the source image and query dimensions
        if new_src.startswith('http'):
            img_data = urlopen(new_src).read()
            fid = TemporaryFile('wb+')
            fid.write(img_data)
            fid.flush()
            fid.seek(0)
        else:
            fid = open(imgPath, 'rb')
        try:
            im = Image.open(fid)
            imgWidth, imgHeight = im.size
        finally:
            # BUG FIX: the file handle (temp file or local image) was never
            # closed — one descriptor leaked per image.
            fid.close()
        imgResized = False
        if not img.get('width'):
            img['width'] = str(imgWidth) + 'px'
        else:
            imgResized = True
        # for lazyload.js
        if instance.settings.get('NIUX2_LAZY_LOAD', False):
            if img.get('class'):
                img['class'] += 'lazy'
            else:
                img['class'] = 'lazy'
            img['data-original'] = img['src']
            del img['src']
            if imgResized:
                newImgWidth = int(_width_attr_reg.sub('', img['width']).strip())
                newImgHeight = imgHeight * newImgWidth / imgWidth
                img['data-width'] = str(newImgWidth) + 'px'
                img['data-height'] = str(newImgHeight) + 'px'
            else:
                img['data-width'] = str(imgWidth) + 'px'
                img['data-height'] = str(imgHeight) + 'px'
    instance._content = soup.decode()
def bootstrapify(content):
    """Apply Bootstrap styling to the tables and images of a generated page."""
    if isinstance(content, contents.Static):
        # Static files carry no HTML to rewrite.
        return
    markup = BeautifulSoup(content._content)
    replace_tables(markup)
    replace_images(markup)
    content._content = markup.decode()
def rename_html_img_links(html_input, basename):
    """Rename all ``<img>`` tag ``src`` attributes based on `basename`.

    Each `src` of each ``<img>`` tag in `html_input` is renamed to a new
    location of form ``<BASENAME>_<NUM>.<EXT>`` where ``<BASENAME>`` is the
    basename of `basename`, ``<NUM>`` a unique number starting with 1 (one)
    and ``<EXT>`` the filename extension of the original ``src`` file.

    For example: ``<img src="foo_m1234.jpeg">`` with a `basename`
    ``sample.html`` will be replaced by ``<img src="sample_1.jpeg">`` if
    this is the first ``<img>`` tag in the document.

    Returns a tuple ``<HTML_OUTPUT>, <NAME_MAP>`` where ``<HTML_OUTPUT>`` is
    the modified HTML and ``<NAME_MAP>`` is a dictionary with a mapping from
    old filenames to new ones. The latter can be used to rename any real
    files (which is not done by this function).

    Links to 'external' sources (http and similar) are ignored.

    This funtion expects text as input and returns text, not bytes. I.e.
    you will get unicode snippets under Python 2.x and text (or `str`)
    under Python 3.x.
    """
    soup = BeautifulSoup(html_input, 'html.parser')
    img_tags = soup.findAll('img')
    img_map = {}
    num = 1
    basename = os.path.splitext(basename)[0]
    basename = basename.replace('.', '_')
    for tag in img_tags:
        src = tag.get('src', None)
        if src is None:
            continue
        # IDIOM FIX: membership test directly on the dict instead of
        # materializing list(img_map.keys()) on every iteration.
        if src in img_map:
            # We found a link to the same image already
            tag['src'] = img_map[src]
            continue
        scheme = urlparse(src)[0]
        if scheme not in ['file', '']:
            # only handle local files
            continue
        ext = ''
        if '.' in src:
            ext = os.path.splitext(src)[1]
        new_src = '%s_%s%s' % (basename, num, ext)
        num += 1
        tag['src'] = new_src
        img_map[src] = new_src
    return soup.decode(), img_map
def get_pdf_content(pages, toc):
    """Assemble every TOC-referenced page into one HTML document for PDF
    rendering.

    :type pages: flask.ext.flatpages.flatpages.FlatPages
    :param pages: site page collection to pull content from
    :param toc: list of sections, each with a "title" and "items" (each item
        carrying a "url")
    :return: rendered "pdf.html" template text
    """
    content = []
    for toc_section in toc:
        section = {"id": toc_section["title"].replace(" ", "_"),
                   "title": toc_section["title"],
                   "content": []}
        for reference in toc_section["items"]:
            # Normalize the url to a page path (strip leading '/', '.html').
            url = reference["url"]
            if url.startswith("/"):
                url = url[1:]
            if url.endswith(".html"):
                url = url[:-5]
            if url == "docs/reference/grammar":
                # The grammar page is generated, not a flatpage; <br> must be
                # self-closed for the XML-ish PDF pipeline.
                page_html = render_template("pages/grammar.html", kotlinGrammar=get_grammar()).replace("<br>", "<br/>")
                document = BeautifulSoup(page_html, "html.parser")
                document = document.find("div", {"class": "grammar"})
                page_id = "grammar"
                title = "Grammar"
            else:
                page = pages.get(url)
                if page is None:
                    continue
                title = page.meta["title"]
                document = BeautifulSoup(page.html, "html.parser")
                page_id = page.path.split("/")[-1]
            for element in document.find_all():
                # Prefix every id with the page id so ids stay unique in the
                # merged document.
                if "id" in element.attrs:
                    element.attrs["id"] = page_id + "_" + element.attrs["id"]
                if element.name == "a":
                    if "href" not in element.attrs:
                        continue
                    href = element.attrs["href"]
                    url = urlparse(href)
                    if url.scheme == "":
                        # Rewrite internal links into merged-document anchors.
                        if href.startswith("#"):
                            new_href = page_id + "_" + href[1:]
                        else:
                            url_path = url.path[:-5] if url.path.endswith(".html") else url.path
                            new_href = url_path + ("_" + url.fragment if url.fragment != "" else "")
                        element.attrs["href"] = "#" + new_href
                # Demote headers one level so section titles outrank them.
                header_regex = re.compile("^h(\d)$")
                if header_regex.match(element.name):
                    level = int(header_regex.match(element.name).group(1)) + 1
                    element.name = "h" + str(level)
            section["content"].append({"id": page_id, "title": title, "content": document.decode()})
        content.append(section)
    # Windows drive letters must be normalized to forward-slash paths.
    drive, root_folder_path_rest = path.splitdrive(root_folder_path)
    page_html = render_template(
        "pdf.html",
        content=content,
        root_folder=(drive + root_folder_path_rest).replace("\\", "/")
    )
    return page_html
def content_object_init(instance):
    """Inline explicit width/height styling into each article image,
    based on the real image file's dimensions."""
    if instance._content is None:
        return
    content = instance._content
    soup = BeautifulSoup(content, 'html.parser')
    if 'img' in content:
        for img in soup('img'):
            logger.debug('Better Fig. PATH: %s', instance.settings['PATH'])
            logger.debug('Better Fig. img.src: %s', img['src'])
            img_path, img_filename = path.split(img['src'])
            logger.debug('Better Fig. img_path: %s', img_path)
            logger.debug('Better Fig. img_fname: %s', img_filename)
            # Strip off (unknown), |filename| or /static
            # NOTE(review): '(unknown)' is 9 chars but the slice drops 10 —
            # looks like the literal was mangled (likely '{filename}');
            # confirm against the upstream plugin before relying on it.
            if img_path.startswith(('(unknown)', '|filename|')):
                img_path = img_path[10:]
            elif img_path.startswith('/static'):
                img_path = img_path[7:]
            elif img_path.startswith('data:image'):
                # Image is encoded in-line (not a file).
                continue
            else:
                logger.warning('Better Fig. Error: img_path should start with either (unknown), |filename| or /static')
            # Build the source image filename
            src = instance.settings['PATH'] + img_path + '/' + img_filename
            logger.debug('Better Fig. src: %s', src)
            if not (path.isfile(src) and access(src, R_OK)):
                logger.error('Better Fig. Error: image not found: %s', src)
                # BUG FIX: previously fell through and crashed in
                # Image.open() right after logging the error.
                continue
            # Open the source image and query dimensions; build style string
            im = Image.open(src)
            extra_style = 'width: {}px; height: auto;'.format(im.size[0])
            if 'RESPONSIVE_IMAGES' in instance.settings and instance.settings['RESPONSIVE_IMAGES']:
                extra_style += ' max-width: 100%;'
            if img.get('style'):
                img['style'] += extra_style
            else:
                img['style'] = extra_style
            if img['alt'] == img['src']:
                img['alt'] = ''
            fig = img.find_parent('div', 'figure')
            if fig:
                if fig.get('style'):
                    fig['style'] += extra_style
                else:
                    fig['style'] = extra_style
    instance._content = soup.decode()
def content_object_init(instance):
    """Inline explicit width/height styling into each article image
    (better-figures variant without the data:image handling)."""
    if instance._content is None:
        return
    content = instance._content
    soup = BeautifulSoup(content)
    if "img" in content:
        for img in soup("img"):
            logger.debug("Better Fig. PATH: %s", instance.settings["PATH"])
            logger.debug("Better Fig. img.src: %s", img["src"])
            img_path, img_filename = path.split(img["src"])
            logger.debug("Better Fig. img_path: %s", img_path)
            logger.debug("Better Fig. img_fname: %s", img_filename)
            # Strip off (unknown), |filename| or /static
            # NOTE(review): '(unknown)' is 9 chars but the slice drops 10 —
            # the literal looks mangled (likely '{filename}'); confirm.
            if img_path.startswith(("(unknown)", "|filename|")):
                img_path = img_path[10:]
            elif img_path.startswith("/static"):
                img_path = img_path[7:]
            else:
                logger.warning(
                    "Better Fig. Error: img_path should start with either (unknown), |filename| or /static"
                )
            # Build the source image filename
            src = instance.settings["PATH"] + img_path + "/" + img_filename
            logger.debug("Better Fig. src: %s", src)
            if not (path.isfile(src) and access(src, R_OK)):
                logger.error("Better Fig. Error: image not found: {}".format(src))
                # BUG FIX: previously fell through and crashed in
                # Image.open() right after logging the error.
                continue
            # Open the source image and query dimensions; build style string
            im = Image.open(src)
            extra_style = "width: {}px; height: auto;".format(im.size[0])
            # ROBUSTNESS: .get() avoids a KeyError when the setting is
            # absent (matches the sibling better-figures implementation).
            if instance.settings.get("RESPONSIVE_IMAGES"):
                extra_style += " max-width: 100%;"
            if img.get("style"):
                img["style"] += extra_style
            else:
                img["style"] = extra_style
            if img["alt"] == img["src"]:
                img["alt"] = ""
            fig = img.find_parent("div", "figure")
            if fig:
                if fig.get("style"):
                    fig["style"] += extra_style
                else:
                    fig["style"] = extra_style
    instance._content = soup.decode()
def test_handle_content_no_dorks(self):
    """With dork injection disabled, handled content must round-trip to the
    expected no-dorks fixture."""
    self.handler.no_dorks = True

    async def test():
        self.return_content = await self.handler.handle_content(self.content)

    self.loop.run_until_complete(test())
    soup = BeautifulSoup(self.return_content, "html.parser")
    # NOTE(review): decode()'s first positional parameter is pretty_print,
    # not an encoding — "utf-8" is truthy, so this pretty-prints the markup
    # (decode() always returns str regardless). The fixture presumably
    # matches that pretty-printed output; confirm before changing.
    self.return_content = soup.decode("utf-8")
    self.assertEqual(self.return_content, self.no_dorks_content)
def search(page):
    """Scrape one zhaopin.com result page and append company names to
    result.txt.

    :param page: 1-based result page number to fetch
    """
    search1 = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=java&sm=0&sf=10001&st=15000&we=0103&isfilter=1&fl=530&isadv=0&sg=c38460ed2e6c4041994dc4eaabf942ce&p="
    search_url = search1 + str(page)
    print(search_url)
    data = urllib.request.urlopen(search_url).read()
    soup = BeautifulSoup(data, "html.parser")
    # BUG FIXES: dropped the no-op soup.decode('UTF-8') whose result was
    # discarded, the py2-only xrange (this code already uses
    # urllib.request), the unclosed file handle, and the dead
    # `while(page == 3): break` statement.
    strs = soup.findAll(name='td', attrs={"class": "gsmc"})
    with open("result.txt", 'a') as out:
        for entry in strs:
            companyName = entry.string
            if companyName is None:
                companyName = '123'  # placeholder kept from original behavior
            out.write(companyName + '\n')
def content_object_init(instance):
    """Wrap codehilite tables in a div.hilitewrapper (for scroll styling)."""
    if instance._content is None:
        return
    content = instance._content
    soup = BeautifulSoup(content)
    if 'table' in content:
        for table in soup.find_all('table', class_="codehilitetable"):
            wrapper = soup.new_tag('div')
            wrapper['class'] = 'hilitewrapper'
            table.wrap(wrapper)
    instance._content = soup.decode()
def test_handle_content(self):
    """With dorks enabled (mocked), handled content must round-trip to the
    expected fixture."""
    self.handler.no_dorks = False
    self.handler.get_dorks = AsyncMock(return_value=["test_dork1"])

    async def test():
        self.return_content = await self.handler.handle_content(self.content)

    self.loop.run_until_complete(test())
    soup = BeautifulSoup(self.return_content, "html.parser")
    # NOTE(review): decode()'s first positional parameter is pretty_print,
    # not an encoding — "utf-8" is truthy, so this pretty-prints the markup
    # (decode() always returns str regardless). The fixture presumably
    # matches that pretty-printed output; confirm before changing.
    return_content = soup.decode("utf-8")
    self.assertEqual(return_content, self.expected_content)
def bootstrapify(content):
    """Apply the BOOTSTRAPIFY selector->classes mapping to a generated page."""
    if isinstance(content, contents.Static):
        # Static files carry no HTML to rewrite.
        return
    selector_classes = content.settings['BOOTSTRAPIFY']
    markup = BeautifulSoup(content._content, 'html.parser')
    for selector, classes in selector_classes.items():
        replace_in_with(selector, markup, classes)
    content._content = markup.decode()
def main():
    """Scrape jd.com help-center articles with PhantomJS and dump them to
    result.xls (Python 2 code: print statements, md5 module, py2 except
    syntax)."""
    wb = xlwt.Workbook()
    sheet = wb.add_sheet('sheet1')
    sheet.write(0, 0, 'categoryID')  # categoryID: help center
    sheet.write(0, 1, 'status')  # status: not yet reviewed
    sheet.write(0, 2, 'recommend')  # recommend: not recommended
    sheet.write(0, 3, 'type')  # type: added by operations
    sheet.write(0, 4, 'tag')  # tag label
    sheet.write(0, 5, 'source')  # article source
    sheet.write(0, 6, 'writer')  # author
    sheet.write(0, 7, 'md5_id')
    sheet.write(0, 8, 'title')
    sheet.write(0, 9, 'description')
    sheet.write(0, 10, 'content')
    # One article id per line, e.g. https://article.jd.com/?id=987009
    content_url_list = [
        url.strip() for url in open('jrhelp.jd.com_index_detail_url.txt')
    ]
    service_args = []
    service_args.append('--load-images=no')  # disable image loading
    service_args.append('--disk-cache=yes')  # enable disk cache
    service_args.append('--ignore-ssl-errors=true')  # ignore https errors
    browser = webdriver.PhantomJS(service_args=service_args)
    browser.implicitly_wait(30)  # timeout (seconds)
    browser.set_page_load_timeout(30)  # timeout (seconds)
    for index, k in enumerate(content_url_list):
        content_str = ''
        try:
            # Build the article url from the id read from the file.
            url = 'https://article.jd.com/?id=' + str(
                k.strip())
            print index, url
        except Exception, e:
            print e
        # Content-extraction rules.
        try:
            #browser = webdriver.PhantomJS(executable_path=r'D:\programfiles\anaconda\Lib\site-packages\selenium\webdriver\phantomjs\bin\phantomjs.exe')
            browser.get(url)
            content_main = browser.find_element_by_class_name(
                "detail_cont_main").get_attribute('innerHTML')
            s = BeautifulSoup(content_main, "lxml")
            # Drop the product widgets (type=3) and the header/title section.
            content = [
                soup.extract() for soup in s(
                    'div', attrs={"class": "detail_cm_item detail_cm_goods"})
            ]
            content = [
                soup.extract() for soup in s('div', attrs={"class": "detail_cm_head"})
            ]
            # Lazy loading leaves a 1x1 placeholder in src; the real url
            # lives in data-lazy-img, so swap them.
            imatag = s.find_all('img')
            for itag in imatag:
                if '1x1' in itag.get('src'):
                    itag['src'] = itag.get('data-lazy-img')
                    itag['data-lazy-img'] = 'done'
            content_txt = str(s.get_text()).decode('utf-8')[0:250]
            # Strip the wrapper tags lxml added.
            s = str(s).replace('<html>', '').replace('</html>', '').replace(
                '<body>', '').replace('</body>', '')
            title = browser.find_element_by_tag_name("h3").text
            ''' 
txttag = browser.find_elements_by_class_name("detail_cm_text")#加个s pictag = browser.find_elements_by_class_name("detail_cm_pic")#加个s for t in txttag: content_str += '</p>'+t.text+'</p>' for pic in pictag: print pic.get_attribute('innerHTML') '''
            # Use the middle 16 hex chars of the title's md5 as a stable id.
            m1 = md5.new()
            m1.update(title)
            md5_str = m1.hexdigest()[8:-8]
            content = s.decode('utf-8')
            sheet.write(index + 1, 0, '148')  # categoryID: help center
            sheet.write(index + 1, 1, '1')  # status: not reviewed
            sheet.write(index + 1, 2, '1')  # recommend: not recommended
            sheet.write(index + 1, 3, '0')  # type: added by operations
            sheet.write(index + 1, 4, 'jd')  # tag label
            sheet.write(index + 1, 5, 'jd')  # article source
            sheet.write(index + 1, 6, 'jd')  # author
            sheet.write(index + 1, 7, md5_str)
            sheet.write(index + 1, 8, title)
            if content_txt:
                sheet.write(index + 1, 9, content_txt)
            else:
                sheet.write(index + 1, 9, title)
            # xlwt cells cannot hold more than 32767 characters.
            if len(content) < 32767:
                sheet.write(index + 1, 10, content)
            else:
                sheet.write(index + 1, 10, 'String longer than 32767 characters')
            wb.save("result.xls")
        except Exception, e:
            print e
import xml.etree.cElementTree as ET
from urllib.request import urlopen
import ssl
from bs4 import BeautifulSoup

# Allow fetching from hosts with bad/self-signed certificates.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")
# Parse the fetched document as XML and sum every comment count.
tree = ET.fromstring(soup.decode())
lst = tree.findall('comments/comment')
# BUG FIXES: renamed the accumulator so the builtin `sum` isn't shadowed,
# and dropped the accidental duplicate print of the result.
total = 0
for item in lst:
    total += int(item.find('count').text)
print(total)
while True: cur.execute(''' SELECT wind FROM Winds WHERE year = ? AND month = ?''', (year, month)) row = cur.fetchone() aux_url = str(year) + '-' if month < 10: aux_url = aux_url + '0' aux_url = aux_url + str(month) + '.txt' url = core_url + aux_url html = urllib.request.urlopen(url, context=ctx).read() # print(html) soup = BeautifulSoup(html, 'html.parser') print(soup) data = soup.decode() wind = re.findall(expression, data) # print(wind) w = list() [w.append(float(x)) for x in wind] max_wind = max(w) # print(max_wind) cur.execute( ''' INSERT INTO Winds (year, month, wind) VALUES (?, ?, ?)''', (year, month, max_wind)) if (month == int(end_month)) and (year == int(end_year)): break if month == 12:
# To run this, you can install BeautifulSoup
# https://pypi.python.org/pypi/beautifulsoup4
# Or download the file
# http://www.py4e.com/code3/bs4.zip
# and unzip it in the same directory as this file
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE

#url = input('Enter - ')
url = 'http://py4e-data.dr-chuck.net/comments_42.html'
page_bytes = urllib.request.urlopen(url, context=ssl_context).read()
print(page_bytes.decode())
parsed = BeautifulSoup(page_bytes, 'html.parser')
print(parsed.decode())

# Retrieve all of the anchor tags
for span_tag in parsed.find_all('span'):
    print(span_tag.contents[0])
def get_html_from_filepath(filepath, start=0, end=None, preprocessors=[], template=None):
    """Return the HTML from a Jupyter Notebook

    :param filepath: notebook file to convert
    :param start: first cell index to include (SubCell preprocessor)
    :param end: one-past-last cell index, or None for all
    :param preprocessors: extra nbconvert preprocessors to run
    :param template: optional exporter template path
    :return: (html content, nbconvert resources dict)

    NOTE(review): the mutable default ``preprocessors=[]`` is shared across
    calls — safe only while no caller mutates it; confirm.
    """
    template_file = 'basic'
    extra_loaders = []
    if template:
        extra_loaders.append(
            jinja2.FileSystemLoader([os.path.dirname(template)]))
        template_file = os.path.basename(template)
    config = get_config()
    config.update({
        'CSSHTMLHeaderTransformer': {
            'enabled': True,
            'highlight_class': '.highlight-ipynb'
        },
        'SubCell': {
            'enabled': True,
            'start': start,
            'end': end
        }
    })
    exporter = HTMLExporter(config=config,
                            template_file=template_file,
                            extra_loaders=extra_loaders,
                            filters={'highlight2html': custom_highlighter},
                            preprocessors=[SubCell] + preprocessors)
    config.CSSHTMLHeaderPreprocessor.highlight_class = " .highlight pre "
    content, info = exporter.from_filename(filepath)
    from bs4 import NavigableString
    # Strip notebook chrome (prompts, anchor links) when bs4 is available.
    if BeautifulSoup:
        soup = BeautifulSoup(content, 'html.parser')
        for i in soup.findAll('div', {'class': 'prompt input_prompt'}):
            i.decompose()
        for i in soup.findAll('div', {'class': 'prompt output_prompt'}):
            i.decompose()
        for i in soup.findAll('div', {'class': 'prompt'}):
            i.decompose()
        for i in soup.findAll('a', {'class': 'anchor-link'}):
            i.decompose()
        for i in soup.findAll('code'):
            i.attrs['class'] = 'code-class'
        content = soup.decode(formatter=None)
    # url = 'http://localhost:8800/list-comprehension.html'
    # page = requests.get(url)
    # soup = BeautifulSoup(page.content, 'html.parser')
    # Collect the raw text of all code-input cells into a hidden textarea
    # (used by a copy-to-clipboard widget).
    soup = BeautifulSoup(content, 'html.parser')
    pre_tags = soup.find_all('div', {'class': 'input_area'})
    input_areas = [i for i in pre_tags if i['class'][0] == 'input_area']
    output = '\r\n'.join([i.get_text() for i in input_areas])
    new_div = soup.new_tag('textarea')
    # NOTE(review): this sets a literal text="..." HTML attribute holding
    # the full code dump, in addition to the element text below — possibly
    # unintended; confirm against the consuming javascript.
    new_div['text'] = output
    new_div['id'] = "myInput"
    new_div['type'] = "text"
    new_div['class'] = "codecopy"
    new_div.insert(0, NavigableString(output))
    soup.insert(0, new_div)
    content = soup.decode(formatter=None)
    return content, info
def getContent():
    """Collect content: for every URL in the module-level ``urls`` list,
    fetch the page, extract title/body via readability's ``Document``, run
    Baidu AipNlp analyses (sentiment, ORG entities, comment opinions,
    topic, keywords) and append one result row to ``monitor_result``.

    NOTE(review): relies on module-level globals (``urls``,
    ``monitor_result``, ``success_num``, ``error_num``, ``index``) defined
    outside this chunk; the bare ``success_num +=1`` / ``error_num +=1``
    statements need ``global`` declarations to work — confirm against the
    full module.
    """
    """ 你的 APPID AK SK """
    # Baidu AIP NLP credentials (original note: "your APPID / AK / SK").
    APP_ID = '14658509'
    API_KEY = 'C14bCL7NkReQpak382maUYXi'
    SECRET_KEY = '8vWAXHBTmfL3r96PlKIggpwuXwdNl4wz'
    client = AipNlp(APP_ID, API_KEY, SECRET_KEY)
    # Row layout: [1 URL, 2 title, 3 body, 4 sentiment items dict,
    # 5 organisation list, 6 comment-opinion list, 7 article category,
    # 8 article tags]
    # Example article: http://linyi.iqilu.com/caijing/2018/1117/4113682.shtml
    #monitor_result=[]
    for news_url in urls:
        one_monitor=[]
        one_monitor.append(news_url)  # (1) URL
        try:  # make sure each news item is processed as a complete unit
            # With a timeout, urlopen will not block forever waiting for an
            # unresponsive site.
            news=urlopen(news_url,timeout=15)
            # Raw page source; original note: this read conflicts with
            # parse and the two must not run at the same time.
            news_html=news.read()
            #response = requests.get('http://example.com')
            #doc = Document(response.text)
        except:
            one_monitor.append("urlopen_error")
            monitor_result.append(one_monitor)
            success_num +=1
            print("打开网址错误")
            continue
        try:  # (3) body text; opinion extraction accepts at most ~3000 chars
            news_contents=Document(news_html)
            # (2) title, truncated to 39 chars; encode explicitly as UTF-8
            # (the default ASCII codec would raise on non-ASCII text).
            news_title=news_contents.title().strip(" ")[:39].encode("utf-8")
            #print(news_title)
            one_monitor.append(news_title)
            news_content=BeautifulSoup(news_contents.summary()).get_text().strip(" ")[:2000].encode("utf-8")
            #len(news_content)#print(news_content)
            one_monitor.append(news_content)
            # Slice on the decoded text first so the 500-char truncation
            # can never cut a multi-byte character in half.
            emotion_content=news_content.decode("utf-8")[:500].encode("utf-8")
            #print(emotion_content)
        except:
            one_monitor.append("extract_error")
        try:
            #print(emotion_content)
            emotion=client.sentimentClassify(emotion_content)["items"]  # (4) sentiment
            one_monitor.append(emotion)
        except:
            one_monitor.append("emotion_error")
        try:  # (5) organisation (ORG) named-entity list
            # list(set(...)) deduplicates (and reorders) the entities.
            orgs=[item["item"].encode("utf-8") for item in client.lexer(news_content)["items"] if item["ne"] =="ORG"]
            one_monitor.append(";".join(list(set(orgs))))
            #print(";".join(list(set(orgs))))
        except:
            one_monitor.append("org_error")
        try:  # (6) comment-opinion list
            conments=[item['abstract'].encode("utf-8") for item in client.commentTag(news_content)['items']]
            one_monitor.append(";".join(list(set(conments))))
            #print(";".join(list(set(conments))))
        except:
            one_monitor.append("comment_error")
        try:  # (7) article category
            group=client.topic(news_title, news_content)["item"].values()  # [[dict], [dict]]
            # Flatten the nested lists; float values cannot be joined.
            value_list=[dic[u'tag'] for dic_list in group for dic in dic_list]
            one_monitor.append(u";".join(value_list).encode("utf-8"))
            #print(u";".join(value_list).encode("utf-8"))
        except:
            one_monitor.append("topic_error")
        try:  # (8) article tags
            keyword=client.keyword(news_title, news_content)["items"]  # [dict]
            key_list=[dic[u'tag'] for dic in keyword]
            one_monitor.append(u";".join(key_list).encode("utf-8"))
            #print(u";".join(key_list).encode("utf-8"))
            print("成功%s"%success_num)
        except:
            one_monitor.append("keyword_error")
            error_num +=1
            print("其中有误%s"%error_num)
        monitor_result.append(one_monitor)
        success_num +=1
        #time.sleep(1)
        # Save periodically so a crash cannot lose everything collected.
        if success_num % 200 == 0:
            with open("./temp/risk_monitoring%s.csv"%index,"w") as reader:
                writer = csv.writer(reader)
                writer.writerows(monitor_result)
def clean(cls, html):
    """Return *html* reduced to the allowed elements / attributes.

    The markup is parsed with html5lib, scrubbed in place via
    ``cls.clean_node``, and serialized back to a string.
    """
    parsed = BeautifulSoup(html, 'html5lib')
    cls.clean_node(parsed, parsed)
    return parsed.decode()
class SiteScramble:
    """Produce visually perturbed ("scrambled") variants of a web page.

    Parses the page HTML, collects the colors, image sizes and font sizes
    in use, then rewrites them with random perturbations bounded by
    ``noise_level``.  Referenced CSS files are fetched, recolored and
    written to ``output/`` so the scrambled page can be opened locally.
    """

    def __init__(self, html, noise_level, filenum, url):
        self.filenum = filenum          # index used in output file names
        self.soup = BeautifulSoup(html)
        self.noise_level = noise_level  # maximum random perturbation
        self.url = url                  # site base URL for relative paths
        self.css_file_list = []
        self.att_dict = None            # lazily built by _get_atts_to_change
        self.cwd = os.getcwd()
        self.html = ''

    def _change_image_hrefs(self):
        '''
        Changes hrefs in HTML so they have absolute paths instead of
        relative for images. This way we can open the file locally with
        the right path.
        @returns: str - html with absolute paths
        '''
        for el in self.soup.findAll("img"):
            try:
                # BUGFIX: the original referenced a (possibly undefined)
                # global ``url``; the base URL lives on the instance.
                if el["src"] and el["src"][0] == '/':
                    el["src"] = self.url + el["src"]
            except KeyError:
                # <img> without a src attribute - nothing to rewrite.
                pass
        return str(self.soup)

    def _get_color(self, html_text):
        '''
        Extracts CSS color code from HTML
        @param html_text - HTML fragment with CSS in it
        @returns: str - CSS color code (falsy empty list when absent)
        '''
        color = [x for x in html_text.split(';') if 'color' in x]
        if color:
            return color[0].split(':')[1]
        return color

    def _get_atts_to_change(self):
        '''
        Searches through HTML for colors, images, and font-sizes we want
        to change later; caches them in ``self.att_dict``.
        '''
        self.html = self._change_image_hrefs()
        att_dict = {}
        colors = []
        images = []
        font_sizes = []
        body = self.soup.findAll('body')[0]
        if body.get('bgcolor'):
            colors.append(body['bgcolor'])
        for p in self.soup.findAll('p'):
            if p.get('style') and 'color' in p['style']:
                c = self._get_color(p['style'])
                if c not in colors:
                    colors.append(c)
        for img in self.soup.findAll('img'):
            if img.get('height') and img.get('width') and \
                    [img.get('height'), img.get('width')] not in images:
                images.append([img.get('height'), img.get('width')])
        for el in self.soup.findAll():
            if el.get('style') and 'font-size' in el.get('style'):
                font_sizes.append(el.get('style'))
        if colors:
            att_dict['colors'] = colors
        if images:
            att_dict['images'] = images
        if font_sizes:
            att_dict['font-sizes'] = font_sizes
        self.att_dict = att_dict

    def check_valid_css(self, html_string):
        '''
        Checks whether ``html_string`` consists only of lowercase hex
        digits (0-9, a-f); true for the empty string, like the original.
        '''
        valid_chars = set('abcdef0123456789')
        return all(ch in valid_chars for ch in html_string)

    def _change_css(self, css_file, index, num):
        '''
        Fetches a CSS file, replaces every hex color with a perturbed one
        and writes the result to ``output/newcss<num><index>.css``.
        @param css_file - CSS file URL (absolute or site-relative)
        @param index - index of the CSS file on the page
        @param num - file number used in the output name
        '''
        if not self.att_dict:
            self._get_atts_to_change()
        if 'http' not in css_file:
            css_file = self.url + css_file
        css_html = requests.get(css_file).text
        all_matches = re.findall(r'#.{6}', css_html, re.MULTILINE)
        valid_css = list(set(
            x for x in all_matches if self.check_valid_css(x.replace('#', ''))))
        for val in valid_css:
            css_html = css_html.replace(
                val, self._change_color(val, self.noise_level))
        with open('output/newcss' + str(num) + str(index) + '.css', 'w') as f:
            f.write(str(css_html))

    def get_css(self):
        '''
        Finds the page's stylesheet links, generates a recolored copy of
        each and records their hrefs in ``self.css_file_list``.
        '''
        head_children = self.soup.findAll('head')[0].findChildren()
        css_file_list = []
        index = 0
        for node in head_children:
            if node.get('href') and '.css' in node.get('href'):
                self._change_css(node.get('href'), index, self.filenum)
                css_file_list.append(node.get('href'))
                index += 1
        self.css_file_list = css_file_list

    def _change_color(self, orig_color, color_range):
        '''
        Scrambles a CSS hex color code to produce a new color.
        @param orig_color - original '#rrggbb' color used by HTML
        @param color_range - maximum random bump applied to each hex digit
        @returns: str - new color code (the original on error)
        '''
        hex_list = list(orig_color.replace('#', '').lower())
        color_num = {'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15,
                     10: 'a', 11: 'b', 12: 'c', 13: 'd', 14: 'e', 15: 'f'}
        new_color = []
        try:
            for hex_val in hex_list:
                if hex_val.isalpha():
                    hex_val = color_num[hex_val]
                bump = randint(0, color_range)
                new_val = (int(hex_val) + bump) % 16
                if new_val > 9:
                    new_val = color_num[new_val]
                new_color.append(new_val)
            return '#' + ''.join(str(n) for n in new_color)
        except Exception as e:
            print('error changing color' + str(e))
            return orig_color

    def change_image_hrefs(self):
        '''
        Appends the site URL to relative image hrefs to make them
        absolute, and stores the updated markup on ``self.html``.
        '''
        for el in self.soup.findAll("img"):
            try:
                if el["src"] and el["src"][0] == '/':
                    el["src"] = self.url + el["src"]
            except KeyError:
                pass
        # BUGFIX: the original serialized the soup into a local variable
        # that was immediately discarded; store it on the instance.
        self.html = str(self.soup)

    def change_css_files(self):
        '''
        Replaces the original stylesheet hrefs in ``self.html`` with
        file:// paths to the recolored copies produced by get_css().
        '''
        for index, css_file in enumerate(self.css_file_list):
            newfile = ('file://' + self.cwd + '/output/' + 'newcss' +
                       str(self.filenum) + str(index) + '.css')
            self.html = self.html.replace(css_file, newfile)

    def _change_font_size(self, fs):
        '''
        Scrambles a CSS font-size declaration to produce a new size.
        @param fs - original style text containing 'font-size'
        @returns: str - new declaration (or the original when unparsable)
        '''
        negate = randint(0, 1)
        change_level = randint(0, self.noise_level)
        size = ''
        px = pt = percent = False
        for fss in fs.split(';'):
            if 'font-size' in fss:
                size = fss.replace('font-size:', '')
                if '%' in size:
                    size = size.replace('%', '')
                    percent = True
                if 'pt' in size:
                    size = size.replace('pt', '').strip()
                    pt = True
                if 'px' in size:
                    size = size.replace('px', '').strip()
                    px = True
        if size:
            try:
                int(size)
            except ValueError:
                # Unparseable size: substitute a small random one.
                size = randint(1, 9)
            new_fs = int(size) * change_level
            if negate == 0:
                new_fs = new_fs + int(size)
            new_fs = 'font-size: ' + str(new_fs)
            # BUGFIX: the '%' branch assigned to a typo variable
            # (``new_fx``), so percent sizes silently lost their unit.
            if percent:
                new_fs = new_fs + '%'
            if px:
                new_fs = new_fs + 'px'
            if pt:
                new_fs = new_fs + 'pt'
        else:
            new_fs = fs
        return new_fs

    def _change_image_size(self, im):
        '''
        Scrambles an image's size to produce a new one.
        @param im - [height, width] attribute strings from the HTML
        @returns: tuple - new (height, width) as ints
        '''
        negate = randint(0, 1)
        change_level = randint(0, self.noise_level)
        # BUGFIX: the attribute values are strings; multiplying them by an
        # int repeated the text instead of scaling the number.
        height = int(im[0])
        width = int(im[1])
        new_height = height * change_level
        new_width = width * change_level
        if negate == 0:
            new_height = new_height + height
            # BUGFIX: the original grew the width by the *height* (im[0]).
            new_width = new_width + width
        return (int(new_height), int(new_width))

    def scramble_colors(self):
        '''Scrambles CSS colors in the HTML and its linked stylesheets.'''
        self.get_css()
        if not self.att_dict:
            self._get_atts_to_change()
        self.change_css_files()
        if 'colors' in self.att_dict:
            for c in self.att_dict['colors']:
                self.html = self.html.replace(
                    c, self._change_color(c, self.noise_level))

    def scramble_image_sizes(self):
        '''Scrambles image sizes in the HTML.'''
        if not self.att_dict:
            self._get_atts_to_change()
        if 'images' in self.att_dict:
            for im in self.att_dict['images']:
                # BUGFIX: the original called a nonexistent
                # ``_change_images`` method.
                newim = self._change_image_size(im)
                self.html = self.html.replace(
                    'height="' + im[0], 'height="' + str(newim[0]))
                self.html = self.html.replace(
                    'width="' + im[1], 'width="' + str(newim[1]))

    def scramble_font_sizes(self):
        '''Scrambles font sizes in the HTML.'''
        if not self.att_dict:
            self._get_atts_to_change()
        if 'font-sizes' in self.att_dict:
            for fs in self.att_dict['font-sizes']:
                self.html = self.html.replace(fs, self._change_font_size(fs))
    # NOTE(review): this fragment is the body of a tag-whitelist (XSS
    # filter) loop; the enclosing ``for tag in ...`` header, the parsed
    # ``soup`` and the ``tags`` whitelist dict are defined above this chunk.
    if tag.name in tags:
        pass
    else:
        tag.hidden = True  # hide the disallowed tag itself
        tag.clear()  # and drop all of its children
        continue
    input_attrs = tag.attrs  # {'class': 'c1', 'id': 'i1'}
    valid_attrs = tags[tag.name]  # ['class']
    # Iterate over a copy of the keys so deleting during iteration is safe.
    for k in list(input_attrs.keys()):
        if k in valid_attrs:
            pass
        else:
            del tag.attrs[k]
content = soup.decode()
# print(content)

# pip3 install beatifulsoup4
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(content, 'html.parser')
# tag = soup.find('script')
# tag.hidden = True
# tag.clear()
#
# span = soup.find('span')
# # print(span.attrs)
# del span.attrs['style']
# # content = soup.decode()
# print(content)
def get_pdf_content(pages, toc):
    """Assemble the site's pages into a single HTML document for PDF export.

    Walks the table of contents, renders each referenced page (the grammar
    page is generated specially), prefixes element ids with the page id so
    anchors stay unique in the combined document, rewrites relative links
    as intra-document fragment links, demotes headers by one level, and
    finally renders everything through the ``pdf.html`` template.

    :type pages: flask.ext.flatpages.flatpages.FlatPages
    :param pages: the site's flat-page collection
    :param toc: list of sections, each with ``title`` and ``items`` entries
    :returns: the rendered HTML string
    """
    content = []
    for toc_section in toc:
        section = {
            'id': toc_section['title'].replace(' ', '_'),
            'title': toc_section['title'],
            'content': []
        }
        for reference in toc_section['items']:
            url = reference['url']
            # Normalise the TOC url to a flat-page path.
            if url.startswith('/'):
                url = url[1:]
            if url.endswith('.html'):
                url = url[:-5]
            if url == "docs/reference/grammar":
                # The grammar page is rendered from grammar data rather
                # than stored as a flat page.
                page_html = render_template(
                    'pages/grammar.html',
                    kotlinGrammar=get_grammar()).replace("<br>", "<br/>")
                document = BeautifulSoup(page_html, 'html.parser')
                document = document.find("div", {"class": "grammar"})
                page_id = "grammar"
                title = "Grammar"
            else:
                page = pages.get(url)
                if page is None:
                    continue
                title = page.meta['title']
                document = BeautifulSoup(page.html, 'html.parser')
                page_id = page.path.split('/')[-1]
            for element in document.find_all():
                # Prefix ids so anchors stay unique in the merged document.
                if 'id' in element.attrs:
                    element.attrs['id'] = page_id + '_' + element.attrs['id']
                if element.name == "a":
                    if 'href' not in element.attrs:
                        continue
                    href = element.attrs['href']
                    url = urlparse(href)
                    # Only rewrite relative (schemeless) links.
                    if url.scheme == "":
                        if href.startswith('#'):
                            new_href = page_id + '_' + href[1:]
                        else:
                            url_path = url.path[:-5] if url.path.endswith(
                                ".html") else url.path
                            new_href = url_path + ('_' + url.fragment
                                                   if url.fragment != "" else "")
                        element.attrs['href'] = "#" + new_href
                # Demote h1..h6 by one level so page titles can sit above.
                header_regex = re.compile('^h(\d)$')
                if header_regex.match(element.name):
                    level = int(header_regex.match(element.name).group(1)) + 1
                    element.name = 'h' + str(level)
            section['content'].append({
                'id': page_id,
                'title': title,
                'content': document.decode()
            })
        content.append(section)
    # NOTE(review): ``root_folder_path`` and ``get_grammar`` are names
    # defined at module level outside this chunk.
    drive, root_folder_path_rest = path.splitdrive(root_folder_path)
    page_html = render_template('pdf.html',
                                content=content,
                                root_folder=drive + root_folder_path_rest)
    return page_html
def content_object_init(instance):
    """Content hook ("Better Figures"): give every <img>/<object> in the
    rendered content an explicit pixel width style based on the actual
    image/SVG dimensions on disk, and mirror that style onto a wrapping
    ``div.figure`` when present.

    NOTE(review): appears to be a Pelican ``content_object_init`` signal
    handler (it reads ``instance.settings`` / ``instance._content``) —
    confirm against the plugin registration code outside this chunk.
    """
    if instance._content is not None:
        content = instance._content
        soup = BeautifulSoup(content, 'html.parser')

        for img in soup(['img', 'object']):
            logger.debug('Better Fig. PATH: %s', instance.settings['PATH'])
            # <img> carries its path in src, <object> in data.
            if img.name == 'img':
                logger.debug('Better Fig. img.src: %s', img['src'])
                img_path, img_filename = path.split(img['src'])
            else:
                logger.debug('Better Fig. img.data: %s', img['data'])
                img_path, img_filename = path.split(img['data'])
            logger.debug('Better Fig. img_path: %s', img_path)
            logger.debug('Better Fig. img_fname: %s', img_filename)

            # If the image already has attributes... then we can skip it.
            # Assuming it's already optimised.
            if 'style' in img.attrs:
                sheet = cssutils.parseStyle(img['style'])
                if len(sheet.width) > 0 or len(sheet.height) > 0:
                    continue

            # Pelican 3.5+ supports {attach} macro for auto copy, in this
            # use case the content does not exist in output due to the fact
            # it has not been copied, hence we take it from the source
            # (same as current document).
            src = None
            if img_filename.startswith('{attach}'):
                img_path = os.path.dirname(instance.source_path)
                img_filename = img_filename[8:]
                src = os.path.join(img_path, img_filename)
            elif img_path.startswith(('(unknown)', '|filename|')):
                # Strip off (unknown), |filename| or /static
                img_path = img_path[10:]
            elif img_path.startswith('/static'):
                img_path = img_path[7:]
            elif img_path.startswith('data:image'):
                # Image is encoded in-line (not a file).
                continue
            else:
                # Check the location in the output as some plugins create
                # them there.
                output_path = path.dirname(instance.save_as)
                image_output_location = path.join(
                    instance.settings['OUTPUT_PATH'], output_path,
                    img_filename)
                if path.isfile(image_output_location):
                    src = image_output_location
                    logger.info(
                        '{src} located in output, missing from content.'.
                        format(src=img_filename))
                else:
                    logger.warning(
                        'Better Fig. Error: img_path should start with either {attach}, (unknown), |filename| or /static'
                    )

            if src is None:
                # search src path list
                # 1. Build the source image filename from PATH
                # 2. Build the source image filename from STATIC_PATHS
                # if img_path start with '/', remove it.
                img_path = os.path.sep.join(
                    [el for el in img_path.split("/") if len(el) > 0])
                # style: (unknown)/static/foo/bar.png
                src = os.path.join(instance.settings['PATH'], img_path,
                                   img_filename)
                src_candidates = [src]
                # style: (unknown)../static/foo/bar.png
                src_candidates += [
                    os.path.join(instance.settings['PATH'], static_path,
                                 img_path, img_filename)
                    for static_path in instance.settings['STATIC_PATHS']
                ]
                # Keep only candidates that exist and are readable.
                src_candidates = [
                    f for f in src_candidates
                    if path.isfile(f) and access(f, R_OK)
                ]
                if not src_candidates:
                    logger.error('Better Fig. Error: image not found: %s', src)
                    logger.debug('Better Fig. Skip src: %s',
                                 img_path + '/' + img_filename)
                    continue
                src = src_candidates[0]
            logger.debug('Better Fig. src: %s', src)

            # Open the source image and query dimensions; build style string
            try:
                if img.name == 'img':
                    im = Image.open(src)
                    extra_style = 'width: {}px; height: auto;'.format(
                        im.size[0])
                else:
                    # <object> is assumed to reference an SVG.
                    svg = pysvg.parser.parse(src)
                    extra_style = 'width: {}px; height: auto;'.format(
                        svg.get_width())
            except IOError as e:
                # Unreadable image: fall back to a fluid width.
                logger.debug('Better Fig. Failed to open: %s', src)
                extra_style = 'width: 100%; height: auto;'

            if 'RESPONSIVE_IMAGES' in instance.settings and instance.settings[
                    'RESPONSIVE_IMAGES']:
                extra_style += ' max-width: 100%;'

            if img.get('style'):
                img['style'] += extra_style
            else:
                img['style'] = extra_style

            # Drop redundant alt text that merely repeats the src.
            if img.name == 'img':
                if img['alt'] == img['src']:
                    img['alt'] = ''

            fig = img.find_parent('div', 'figure')
            if fig:
                if fig.get('style'):
                    fig['style'] += extra_style
                else:
                    fig['style'] = extra_style

        instance._content = soup.decode()
def content_object_init(instance):
    """Content hook ("Better Figures"): give every <img> in the rendered
    content an explicit pixel width style based on the source image's
    real dimensions, and mirror the style onto a wrapping ``div.figure``.

    Fixes applied to the original:
    - skip missing images instead of crashing in ``Image.open`` (matches
      the sibling ``content_object_init`` variants in this file);
    - guard the ``RESPONSIVE_IMAGES`` settings lookup so a missing key no
      longer raises ``KeyError``;
    - pass an explicit bs4 parser, consistent with the sibling variants.
    """
    if instance._content is not None:
        content = instance._content
        soup = BeautifulSoup(content, 'html.parser')

        if 'img' in content:
            for img in soup('img'):
                logger.debug('Better Fig. PATH: %s',
                             instance.settings['PATH'])
                logger.debug('Better Fig. img.src: %s', img['src'])
                img_path, img_filename = path.split(img['src'])
                logger.debug('Better Fig. img_path: %s', img_path)
                logger.debug('Better Fig. img_fname: %s', img_filename)

                # Strip off (unknown), |filename| or /static
                if img_path.startswith(('(unknown)', '|filename|')):
                    img_path = img_path[10:]
                elif img_path.startswith('/static'):
                    img_path = img_path[7:]
                else:
                    logger.warning(
                        'Better Fig. Error: img_path should start with either (unknown), |filename| or /static'
                    )

                # Build the source image filename
                src = instance.settings['PATH'] + img_path + '/' + img_filename
                logger.debug('Better Fig. src: %s', src)
                if not (path.isfile(src) and access(src, R_OK)):
                    logger.error(
                        'Better Fig. Error: image not found: {}'.format(src))
                    # BUGFIX: skip unreadable images instead of letting
                    # Image.open() below raise IOError.
                    continue

                # Open the source image and query dimensions; build style
                # string.
                im = Image.open(src)
                extra_style = 'width: {}px; height: auto;'.format(im.size[0])

                # BUGFIX: .get() tolerates a missing RESPONSIVE_IMAGES key.
                if instance.settings.get('RESPONSIVE_IMAGES'):
                    extra_style += ' max-width: 100%;'

                if img.get('style'):
                    img['style'] += extra_style
                else:
                    img['style'] = extra_style

                # Drop redundant alt text that merely repeats the src.
                if img['alt'] == img['src']:
                    img['alt'] = ''

                fig = img.find_parent('div', 'figure')
                if fig:
                    if fig.get('style'):
                        fig['style'] += extra_style
                    else:
                        fig['style'] = extra_style

        instance._content = soup.decode()
def content_object_init(instance):
    """Content hook ("Better Figures"): give every <img> in the rendered
    content an explicit pixel width style based on the source image's
    real dimensions, searching PATH and every STATIC_PATHS entry for the
    file, and mirror the style onto a wrapping ``div.figure``.

    NOTE(review): appears to be a Pelican ``content_object_init`` signal
    handler (it reads ``instance.settings`` / ``instance._content``) —
    confirm against the plugin registration code outside this chunk.
    """
    if instance._content is not None:
        content = instance._content
        soup = BeautifulSoup(content, 'html.parser')

        if 'img' in content:
            for img in soup('img'):
                logger.debug('Better Fig. PATH: %s',
                             instance.settings['PATH'])
                logger.debug('Better Fig. img.src: %s', img['src'])
                img_path, img_filename = path.split(img['src'])
                logger.debug('Better Fig. img_path: %s', img_path)
                logger.debug('Better Fig. img_fname: %s', img_filename)

                # Strip off (unknown), |filename| or /static
                if img_path.startswith(('(unknown)', '|filename|')):
                    img_path = img_path[10:]
                elif img_path.startswith('/static'):
                    img_path = img_path[7:]
                elif img_path.startswith('data:image'):
                    # Image is encoded in-line (not a file).
                    continue
                else:
                    logger.warning(
                        'Better Fig. Error: img_path should start with either (unknown), |filename| or /static'
                    )

                # search src path list
                # 1. Build the source image filename from PATH
                # 2. Build the source image filename from STATIC_PATHS
                # if img_path start with '/', remove it.
                img_path = os.path.sep.join(
                    [el for el in img_path.split("/") if len(el) > 0])
                # style: (unknown)/static/foo/bar.png
                src = os.path.join(instance.settings['PATH'], img_path,
                                   img_filename)
                src_candidates = [src]
                # style: (unknown)../static/foo/bar.png
                src_candidates += [
                    os.path.join(instance.settings['PATH'], static_path,
                                 img_path, img_filename)
                    for static_path in instance.settings['STATIC_PATHS']
                ]
                # Keep only candidates that exist and are readable.
                src_candidates = [
                    f for f in src_candidates
                    if path.isfile(f) and access(f, R_OK)
                ]
                if not src_candidates:
                    logger.error('Better Fig. Error: image not found: %s',
                                 src)
                    logger.debug('Better Fig. Skip src: %s',
                                 img_path + '/' + img_filename)
                    continue
                src = src_candidates[0]
                logger.debug('Better Fig. src: %s', src)

                # Open the source image and query dimensions; build style
                # string.
                im = Image.open(src)
                extra_style = 'width: {}px; height: auto;'.format(im.size[0])
                if 'RESPONSIVE_IMAGES' in instance.settings and instance.settings[
                        'RESPONSIVE_IMAGES']:
                    extra_style += ' max-width: 100%;'
                if img.get('style'):
                    img['style'] += extra_style
                else:
                    img['style'] = extra_style
                # Drop redundant alt text that merely repeats the src.
                if img['alt'] == img['src']:
                    img['alt'] = ''
                fig = img.find_parent('div', 'figure')
                if fig:
                    if fig.get('style'):
                        fig['style'] += extra_style
                    else:
                        fig['style'] = extra_style

        instance._content = soup.decode()
def scrape(self, delta=True):
    """Populate this title's metadata by scraping Nintendo's eShop pages.

    First tries the JP/AU ec.nintendo.com storefronts (embedded NXSTORE
    JSON blob), then falls back to the US site and its
    ``/json/content/get/game/<slug>`` endpoint.  Updates and DLC entries
    are skipped outright.  Finally triggers the banner / box-art / icon /
    screenshot file downloads.

    :param delta: when True, only scrape if no banner URL is cached yet.
    """
    if self.isUpdate or self.isDLC:
        return
    try:
        if (not delta or not self.bannerUrl):
            # NOTE(review): ``id`` shadows the builtin; kept as-is here.
            id = self.id
            if id in titleRedirects:
                id = titleRedirects[id]
            cookies = {'esrb.verified': 'true'}
            for region in ['JP', 'AU']:
                result = grabCachedRedirectUrl(
                    "https://ec.nintendo.com/apps/%s/%s" % (id, region),
                    cookies=cookies)
                _json = ''
                if not result or result.status_code != 200:
                    continue
                # The page embeds its data as a JS assignment; slice the
                # JSON literal out of the script text.
                _json = json.loads(
                    result.text.split('NXSTORE.titleDetail.jsonData = ')
                    [1].split('NXSTORE.titleDetail')[0].replace(';', ''))
                if _json == '' or _json == None:
                    Print.error('Failed to parse json for ' +
                                "https://ec.nintendo.com/apps/%s/%s" %
                                (id, region))
                    continue
                # Copy each field over only when present in the payload.
                if 'hero_banner_url' in _json:
                    self.bannerUrl = _json['hero_banner_url']
                if "release_date_on_eshop" in _json:
                    # 'YYYY-MM-DD' -> YYYYMMDD int.
                    self.releaseDate = int(
                        _json["release_date_on_eshop"].replace('-', ''))
                if "id" in _json:
                    self.nsuId = int("%s" % _json["id"])
                if "formal_name" in _json:
                    self.name = _json["formal_name"].strip()
                if 'screenshots' in _json:
                    self.screenshots = []
                    for i, k in enumerate(_json["screenshots"]):
                        self.screenshots.append(k["images"][0]["url"])
                if "demos" in _json:
                    for demo in _json["demos"]:
                        if "id" in demo:
                            # Only take the demo's nsuId when it belongs to
                            # a different base application.
                            if id[0:12] != _json['applications'][0]['id'][
                                    0:12]:
                                self.nsuId = int(demo["id"])
                        if "name" in demo:
                            self.name = demo["name"].strip()
                if "languages" in _json:
                    self.languages = []
                    for language in _json["languages"]:
                        self.languages.append(language['iso_code'])
                if "genre" in _json:
                    self.category = _json["genre"].split(' / ')
                if "total_rom_size" in _json:
                    self.size = _json["total_rom_size"]
                if "rating_info" in _json:
                    if "rating" in _json["rating_info"]:
                        if "age" in _json["rating_info"]['rating']:
                            self.rating = _json["rating_info"]['rating'][
                                'age']
                    if "content_descriptors" in _json["rating_info"]:
                        content = []
                        for descriptor in _json["rating_info"][
                                "content_descriptors"]:
                            content.append(descriptor['name'])
                        self.ratingContent = content
                if "player_number" in _json:
                    # offline_max intentionally wins over local_max.
                    if 'local_max' in _json["player_number"]:
                        self.numberOfPlayers = _json["player_number"][
                            "local_max"]
                    if 'offline_max' in _json["player_number"]:
                        self.numberOfPlayers = _json["player_number"][
                            "offline_max"]
                if "publisher" in _json:
                    if 'name' in _json["publisher"]:
                        self.publisher = _json["publisher"]["name"]
                    if 'title' in _json["publisher"]:
                        self.publisher = _json["publisher"]["title"]
                if "applications" in _json:
                    if "image_url" in _json["applications"][0]:
                        self.iconUrl = _json["applications"][0][
                            'image_url']
                if "catch_copy" in _json:
                    # Collapse single newlines into spaces, keep paragraph
                    # breaks.
                    intro = re.sub('(?<!\n)\n(?!\n)', ' ',
                                   _json["catch_copy"])
                    intro = re.sub(' ', ' ', intro)
                    self.intro = intro
                if "description" in _json:
                    desc = re.sub('(?<!\n)\n(?!\n)', ' ',
                                  _json["description"])
                    desc = re.sub(' ', ' ', desc)
                    self.description = desc

            # US storefront fallback.
            #<img aria-hidden="true" data-src="https://media.nintendo.com/nintendo/bin/ZppwWK6BnjH5twBNvE5wEEI9aeMGR0XX/hQGr97SGMnlXBWoqOBtgtGX5noK3tNtD.jpg"/>
            result = grabCachedRedirectUrl(
                "https://ec.nintendo.com/apps/%s/US" % id, cookies=cookies)
            if result and result.status_code == 200:
                # Redirecting to the games index means "title not found".
                if result.url != 'https://www.nintendo.com/games/':
                    soup = BeautifulSoup(result.text, "html.parser")
                    if not self.bannerUrl:
                        # Banner is declared in an inline #hero CSS rule.
                        m = re.search(
                            r"#hero\s*{\s*background(-image)?:\s*url\('([^)]+)'\)",
                            result.text, re.DOTALL | re.UNICODE
                            | re.MULTILINE | re.IGNORECASE)
                        if m:
                            banner = m.group(2)
                            if banner[0] == '/':
                                banner = 'https://www.nintendo.com' + banner
                            self.bannerUrl = banner
                    rem = re.finditer(
                        '<img aria-hidden="true" data-src="([^"]+)"',
                        result.text)
                    if rem:
                        ss = []
                        for m in rem:
                            ss.append(m.group(1))
                        if len(ss) > 0:
                            self.screenshots = ss
                    if soup.find("meta", {"property": "og:url"}) != None:
                        # The og:url slug keys the JSON content endpoint.
                        slug = soup.find("meta", {"property": "og:url"
                                                  })["content"].split('/')[-1]
                        infoJson = json.loads(
                            requests.get(
                                "https://www.nintendo.com/json/content/get/game/%s"
                                % slug, cookies=cookies).text)["game"]
                        if "release_date" in infoJson:
                            # 'Mon DD, YYYY' -> YYYYMMDD int.
                            self.releaseDate = int(
                                datetime.datetime.strftime(
                                    datetime.datetime.strptime(
                                        infoJson["release_date"],
                                        "%b %d, %Y"), '%Y%m%d'))
                        if "name" in infoJson:
                            self.name = infoJson["name"].strip()
                        if "nsuid" in infoJson:
                            self.nsuId = int(infoJson["nsuid"])
                        catagories = []
                        if "game_category_ref" in infoJson:
                            # The ref can be a single dict or a list of
                            # dicts; handle both shapes.
                            catindex = 0
                            if "name" in infoJson["game_category_ref"]:
                                catagories.append(
                                    infoJson["game_category_ref"]["name"])
                            elif "title" in infoJson["game_category_ref"]:
                                catagories.append(
                                    infoJson["game_category_ref"]["title"])
                            else:
                                try:
                                    for game_category in infoJson[
                                            "game_category_ref"]:
                                        catagories.append(
                                            infoJson["game_category_ref"]
                                            [catindex]["name"])
                                        catindex += 1
                                except:
                                    pass
                            self.category = catagories
                        esrbcontent = []
                        if "esrb_content_descriptor_ref" in infoJson:
                            # Same dict-or-list shape handling as above.
                            esrbindex = 0
                            if "name" in infoJson[
                                    "esrb_content_descriptor_ref"]:
                                esrbcontent.append(
                                    infoJson["esrb_content_descriptor_ref"]
                                    ["name"])
                            elif "title" in infoJson[
                                    "esrb_content_descriptor_ref"]:
                                esrbcontent.append(
                                    infoJson["esrb_content_descriptor_ref"]
                                    ["title"])
                            else:
                                try:
                                    for descriptor in infoJson[
                                            "esrb_content_descriptor_ref"]:
                                        if 'name' in descriptor:
                                            esrbcontent.append(
                                                descriptor["name"])
                                        if 'title' in descriptor:
                                            esrbcontent.append(
                                                descriptor["title"])
                                except:
                                    pass
                            self.ratingContent = esrbcontent
                        if "number_of_players" in infoJson:
                            # Keep only the digits, e.g. "up to 4 players".
                            self.numberOfPlayers = re.sub(
                                '[^0-9]', '', infoJson["number_of_players"])
                        if "esrb_rating_ref" in infoJson:
                            if "esrb_rating" in infoJson[
                                    "esrb_rating_ref"]:
                                if "short_description" in infoJson[
                                        "esrb_rating_ref"]["esrb_rating"]:
                                    self.rating = infoJson[
                                        "esrb_rating_ref"]["esrb_rating"][
                                            "short_description"]
                        '''
                        if not self.screenshots:
                            try:
                                ss = []
                                for s in infoJson["screenshot_gallery_ref"]["screenshot_gallery"]["screenshots"]:
                                    ss.append(s['image']['large_image']['include']['src'].replace('cocoon:/', ''))
                                self.screenshots = ss
                            except:
                                pass
                        '''
                        if "developer_ref" in infoJson:
                            if "name" in infoJson["developer_ref"]:
                                self.developer = infoJson["developer_ref"][
                                    "name"]
                        if "publisher_ref" in infoJson:
                            if "name" in infoJson["publisher_ref"]:
                                self.publisher = infoJson["publisher_ref"][
                                    "name"]
                            if 'title' in infoJson["publisher_ref"]:
                                self.publisher = infoJson["publisher_ref"][
                                    "title"]
                        if "front_box_art" in infoJson:
                            if "image" in infoJson["front_box_art"]:
                                if "image" in infoJson["front_box_art"][
                                        "image"]:
                                    if "url" in infoJson["front_box_art"][
                                            "image"]["image"]:
                                        self.frontBoxArt = infoJson[
                                            "front_box_art"]["image"][
                                                "image"]["url"]
                        if "intro" in infoJson:
                            # Strip markup and normalise whitespace in the
                            # intro blurb.
                            try:
                                details = BeautifulSoup(
                                    infoJson["intro"][0], "html.parser")
                                try:
                                    details = details.decode(
                                        formatter=None)
                                except:
                                    details = details.decode()
                                details = re.sub('<[^<]+?>', '',
                                                 details).strip()
                                details = re.sub(' +', ' ', details)
                                details = re.sub('\n ', '\n', details)
                                details = re.sub('\n\n+', '\n\n', details)
                                details = re.sub('(?<!\n)\n(?!\n)', ' ',
                                                 details)
                                details = re.sub(' ', ' ', details)
                                self.intro = details
                            except Exception as e:
                                pass
                        if "game_overview_description" in infoJson:
                            # Same markup-stripping pipeline for the full
                            # description.
                            details = BeautifulSoup(
                                infoJson["game_overview_description"][0],
                                "html.parser")
                            try:
                                details = details.decode(formatter=None)
                            except:
                                details = details.decode()
                            details = re.sub('<[^<]+?>', '',
                                             details).strip()
                            details = re.sub(' +', ' ', details)
                            details = re.sub('\n ', '\n', details)
                            details = re.sub('\n\n+', '\n\n', details)
                            details = re.sub('(?<!\n)\n(?!\n)', ' ',
                                             details)
                            details = re.sub(' ', ' ', details)
                            self.description = details
                #else:
                    #f = open("missing.txt", 'a', encoding="utf8")
                    #f.write(rid+"|title doesn't exist at ec.nintendo.com"+'\n')
                    #f.close()
    except BaseException as e:
        # NOTE(review): the ``pass`` is dead code; the error is still
        # printed below. Catching BaseException also swallows
        # KeyboardInterrupt — confirm this is intended.
        pass
        print(repr(e) + ' ' + self.id)
    # Download the media files referenced by the scraped URLs.
    self.bannerFile()
    self.frontBoxArtFile()
    self.iconFile()
    self.screenshotFiles()
import time

print("\n\nSelenium library is present!\n\n ")


def render_page(url):
    """Load *url* in a Chrome webdriver, wait for client-side JS to
    settle, and return the rendered page source."""
    driver = webdriver.Chrome()
    # driver can be manually pointed to as well
    #driver = webdriver.Chrome(r"C:\Users\elili\AppData\Local\Microsoft\WindowsApps\chromedriver.exe")
    driver.get(url)
    time.sleep(3)  # crude fixed wait for the page's JS to render
    r = driver.page_source
    #driver.quit()
    return r


# NOTE(review): the bare string statements below are section markers only;
# ``url``, ``soup``, ``re`` and ``BeautifulSoup`` come from earlier in the
# full script.
'REGEX for all EMAILS'
r = render_page(url)
soup_r = BeautifulSoup(r, "html.parser")
emails = re.findall(r'[\w\.-]+@[\w\.-]+', soup_r.decode('utf-8'))

'RULE, count social media Open Graph tags, "og", checking for two most common meta tags'
# NOTE(review): these two lookups use ``soup`` while everything else uses
# ``soup_r`` — confirm which document is intended.
og_t = soup.find_all("meta", property="og:title")
og_u = soup.find_all("meta", property="og:url")
social_count = len(og_t) + len(og_u)

'RULE, determine presence of contact data'
# NOTE(review): ``mailto_check`` starts as a list but is reassigned to the
# int 1 — inconsistent sentinel types.
mailto_check = []
if "mailto" in soup_r.decode('utf-8'):
    mailto_check = 1

'RULE, validate agreement of email domains against site domain, list foreign domain emails'
consistent_emails = []
suspect_emails = []
def WriteHtml(self, html_template, useAbsolutePaths, filename):
    """Render this post into *filename* using *html_template*.

    Substitutes the ``%token%`` placeholders in the template, turns image
    links into inline ``<img>`` previews, rewrites local file links
    (relative, or ``file://``-absolute when *useAbsolutePaths* is set) and
    writes the result as UTF-8.  Falls back to ``<imageId>.html`` in the
    current directory when *filename* cannot be created.
    """
    info = None
    try:
        PixivHelper.makeSubdirs(filename)
        info = codecs.open(filename, 'wb', encoding='utf-8')
    except IOError:
        info = codecs.open(str(self.imageId) + ".html", 'wb',
                           encoding='utf-8')
        PixivHelper.get_logger().exception(
            "Error when saving article html: %s, file is saved to: %s.html",
            filename, self.imageId)

    cover_image = ""
    if self.coverImageUrl:
        cover_image = f'<div class="cover"><img src="{self.coverImageUrl}"/></div>'
    page = html_template.replace("%coverImage%", cover_image)
    page = page.replace("%coverImageUrl%", self.coverImageUrl or "")
    page = page.replace("%artistName%", self.parent.artistName)
    page = page.replace("%imageTitle%", self.imageTitle)
    page = page.replace("%worksDate%", self.worksDate)

    token_body_text = ""
    token_images = ""
    token_text = ""
    if self.type == "article":
        token_body_text = f'<div class="article caption">{self.body_text}</div>'
    else:
        image_exts = ["jpg", "jpeg", "png", "bmp", "gif"]
        # BUGFIX: the original built f'<img scr="{0}"/>' — an f-string /
        # str.format mix-up that rendered a literal scr="0" attribute
        # (also misspelling src). Each image link now shows an <img>
        # pointing at its own target.
        token_images = '<div class="non-article images">{0}</div>'.format(
            "".join([
                '<a href="{0}">{1}</a>'.format(
                    x,
                    f'<img src="{x}"/>'
                    if x[x.rindex(".") + 1:].lower() in image_exts else x)
                for x in self.images
            ]))
        token_text = '<div class="non-article caption">{0}</div>'.format(
            "".join([
                '<p>{0}</p>'.format(x.rstrip())
                for x in self.body_text.split("\n")
            ]))

    page = page.replace("%body_text(article)%", token_body_text)
    page = page.replace("%images(non-article)%", token_images)
    page = page.replace("%text(non-article)%", token_text)

    # Normalise the document and make every linked image display its link
    # target as its src.
    page = BeautifulSoup(page, features="html5lib")
    imageATags = page.find_all("a", attrs={"href": True})
    for imageATag in imageATags:
        tag = imageATag.img
        if tag:
            tag["src"] = imageATag["href"]
    root = page.find("div", attrs={"class": "main"})
    if root:
        root["class"].append(
            "non-article" if self.type != "article" else "article")
    page = page.decode()

    # Rewrite remote links to their locally-downloaded counterparts.
    html_dir = os.path.dirname(filename)
    for k, v in self.linkToFile.items():
        if not useAbsolutePaths:
            try:
                v = os.path.relpath(v, html_dir)
            except ValueError:
                # BUGFIX: the original log message had no placeholders for
                # its two extra arguments, which raised a logging error.
                PixivHelper.get_logger().exception(
                    "Error when converting local path to relative for %s (%s); absolute path is used",
                    filename, self.imageId)
                v = "file://" + v
        else:
            v = "file://" + v
        page = page.replace(k, v)

    info.write(page)
    info.close()
# NOTE(review): fragment of a table-scraping loop; ``new_soup``, ``title``,
# ``date``, ``time``, ``strings`` and ``parse`` are defined earlier in the
# full script.
for tag in new_soup.find_all('tr'):
    # The header cell (<th>) labels which field this row carries.
    th = tag.th.text.encode('utf-8')
    # The first <td> of the row holds the field's value.
    parsed = BeautifulSoup(
        tag.encode('utf-8'), parse_only=SoupStrainer(
            'td')).find_all('td')[0].text.encode('utf-8').strip()
    '''if b'date:' in th.lower():
        date = parsed.replace(b'\n',b'').replace(b'\r',b'').decode('utf-8')
    elif b'time:' in th.lower():
        time = parsed.replace(b'\n',b'').replace(b'\r',b'').decode('utf-8')
    el'''
    if b'location:' in th.lower():
        location = parsed.replace(b'\n', b'').replace(b'\r',
                                                      b'').decode('utf-8')
    elif b'\xc2' in th.lower():
        # A 0xC2 byte in the header marks the description row (non-ASCII
        # label byte) — TODO confirm against the scraped page's markup.
        description = parsed.decode('utf-8')
        if any(s in description.lower() for s in strings):
            print(title + " on " + date + " at " + time + " at " + location)
            print(description)
            print(str(date + " at " + time))
            start, end = parse(str(date + " at " + time))
            print("Start = " + start)
            print("End = " + end)
            data = {
                "Date": date,
                "Time": time,
                "Description": description,
                "Location": location,
                "Title": title
            }
def _get_new_data(self, url, soup):
    """Scrape one Baike attraction page into self.view_datas.

    Fills view_name, view_message (summary), basic_message (key:value
    pairs from the info table), catalog (table of contents), then
    re-fetches *url* and slices the raw HTML into level-2 / level-3
    sections, printing cleaned paragraph text as it goes.
    """
    if soup.find('div', class_="main-content").find('h1') is not None:
        # Attraction name from the main content header.
        self.view_datas["view_name"] = soup.find(
            'div', class_="main-content").find('h1').get_text()
        print(self.view_datas["view_name"])
    else:
        # Fallback header location for pages using the poster layout.
        self.view_datas["view_name"] = soup.find(
            "div", class_="feature_poster").find("h1").get_text()
    # Short introduction paragraph.
    self.view_datas["view_message"] = soup.find(
        'div', class_="lemma-summary").get_text()
    # Basic-info table, flattened to text and split into lines.
    self.view_datas["basic_message"] = soup.find(
        'div', class_="basic-info cmn-clearfix").get_text()
    self.view_datas["basic_message"] = self.view_datas[
        "basic_message"].split("\n")
    get = []
    for line in self.view_datas["basic_message"]:
        if line != "":
            get.append(line)
    self.view_datas["basic_message"] = get
    # Pair up alternating lines as "key:value" strings.
    i = 1
    get2 = []
    tmp = "%%"
    for line in self.view_datas["basic_message"]:
        if i % 2 == 1:
            tmp = line
        else:
            a = tmp + ":" + line
            get2.append(a)
        i = i + 1
    self.view_datas["basic_message"] = get2
    # Whole table of contents as non-empty lines.
    self.view_datas["catalog"] = soup.find(
        'div', class_="lemma-catalog").get_text().split("\n")
    get = []
    for line in self.view_datas["catalog"]:
        if line != "":
            get.append(line)
    self.view_datas["catalog"] = get
    # Article body: re-fetch the page and slice the raw HTML by markers.
    view_name = self.view_datas["view_name"]
    html = urllib.request.urlopen(url)
    soup2 = BeautifulSoup(html.read(), 'html.parser').decode('utf-8')
    # Head: first level-2 heading marker.
    p = re.compile(r'<div class="para-title level-2"', re.DOTALL)
    r = p.search(soup2)
    content_data_node = soup2[r.span(0)[0]:]
    # Tail: album list marks the end of the body.
    p = p = re.compile(r'<div class="album-list">', re.DOTALL)
    r = p.search(content_data_node)
    content_data = content_data_node[0:r.span(0)[0]]
    lists = content_data.split('<div class="para-title level-2">')
    i = 1
    for list in lists:  # one chunk per level-2 section (shadows builtin `list`)
        final_soup = BeautifulSoup(list, "html.parser")
        name_list = None
        try:
            # Section title without the attraction name.
            part_name = final_soup.find(
                'h2', class_="title-text").get_text().replace(view_name, '').strip()
            part_data = final_soup.get_text().replace(
                view_name, '').replace(part_name, '').replace('编辑', '')
            # Level-3 sub-headings of this section.
            name_list = final_soup.findAll('h3', class_="title-text")
            all_name_list = {}
            na = "part_name" + str(i)
            all_name_list[na] = part_name
            final_name_list = []
            for nlist in name_list:
                nlist = nlist.get_text().replace(view_name, '').strip()
                final_name_list.append(nlist)
            fin = "final_name_list" + str(i)
            all_name_list[fin] = final_name_list
            print(all_name_list)
            i = i + 1
            # Body text of the section.
            try:
                # Split the section on level-3 heading markers.
                p = re.compile(r'<div class="para-title level-3">', re.DOTALL)
                final_soup = final_soup.decode('utf-8')
                r = p.search(final_soup)
                final_part_data = final_soup[r.span(0)[0]:]
                part_lists = final_part_data.split(
                    '<div class="para-title level-3">')
                for part_list in part_lists:
                    final_part_soup = BeautifulSoup(part_list, "html.parser")
                    content_lists = final_part_soup.findAll("div", class_="para")
                    for content_list in content_lists:  # smallest paragraph unit
                        try:
                            # Strip picture captions from the paragraph text.
                            pic_word = content_list.find(
                                "div",
                                class_="lemma-picture text-pic layout-right"
                            ).get_text()
                            try:
                                pic_word2 = content_list.find(
                                    "div", class_="description").get_text()
                                content_list = content_list.get_text(
                                ).replace(pic_word, '').replace(pic_word2, '')
                            except:
                                content_list = content_list.get_text(
                                ).replace(pic_word, '')
                        except:
                            try:
                                pic_word2 = content_list.find(
                                    "div", class_="description").get_text()
                                content_list = content_list.get_text(
                                ).replace(pic_word2, '')
                            except:
                                content_list = content_list.get_text()
                        # Remove footnote markers like [1] / [1a].
                        r_part = re.compile(r'\[\d.\]|\[\d\]')
                        part_result, number = re.subn(
                            r_part, "", content_list)
                        part_result = "".join(part_result.split())
                        #print(part_result)
            except:
                # Section has no level-3 headings: process paragraphs directly.
                final_part_soup = BeautifulSoup(list, "html.parser")
                content_lists = final_part_soup.findAll("div", class_="para")
                for content_list in content_lists:
                    try:
                        pic_word = content_list.find(
                            "div",
                            class_="lemma-picture text-pic layout-right"
                        ).get_text()
                        try:
                            pic_word2 = content_list.find(
                                "div", class_="description").get_text()
                            content_list = content_list.get_text().replace(
                                pic_word, '').replace(pic_word2, '')
                        except:
                            content_list = content_list.get_text().replace(
                                pic_word, '')
                    except:
                        try:
                            pic_word2 = content_list.find(
                                "div", class_="description").get_text()
                            content_list = content_list.get_text().replace(
                                pic_word2, '')
                        except:
                            content_list = content_list.get_text()
                    r_part = re.compile(r'\[\d.\]|\[\d\]')
                    part_result, number = re.subn(r_part, "", content_list)
                    part_result = "".join(part_result.split())
                    #print(part_result)
        except:
            print("error")
    return
def clean_content(self):
    """Form clean hook: sanitize the submitted 'content' HTML.

    Tags not in the whitelist are removed entirely (with their contents);
    whitelisted tags keep only their whitelisted attributes.  Attribute
    names starting with '.' denote CSS style properties accepted by the
    rich-text editor configuration.

    Returns:
        str: the sanitized HTML.
    """
    content = self.cleaned_data.get('content')
    from bs4 import BeautifulSoup
    # Shared attribute whitelists, deduplicated from the original per-tag
    # copies; treated as read-only.
    _block_attrs = [
        'align', '.text-align', '.color', '.background-color', '.font-size',
        '.font-family', '.background', '.font-weight', '.font-style',
        '.text-decoration', '.vertical-align', '.text-indent', '.margin-left'
    ]
    _cell_attrs = [
        'align', 'valign', 'width', 'height', 'colspan', 'rowspan', 'bgcolor',
        '.text-align', '.color', '.background-color', '.font-size',
        '.font-family', '.font-weight', '.font-style', '.text-decoration',
        '.vertical-align', '.background', '.border'
    ]
    # NOTE(review): <sup> is not whitelisted here (it is in the sibling
    # ``clean`` helper) -- confirm whether that is intentional.
    legal_tag_dict = {
        'font': ['color', 'size', 'face', '.background-color'],
        'span': [
            '.color', '.background-color', '.font-size', '.font-family',
            '.background', '.font-weight', '.font-style', '.text-decoration',
            '.vertical-align', '.line-height'
        ],
        'div': [
            'align', '.border', '.margin', '.padding', '.text-align',
            '.color', '.background-color', '.font-size', '.font-family',
            '.font-weight', '.background', '.font-style', '.text-decoration',
            '.vertical-align', '.margin-left'
        ],
        'table': [
            'border', 'cellspacing', 'cellpadding', 'width', 'height',
            'align', 'bordercolor', '.padding', '.margin', '.border',
            'bgcolor', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.font-weight', '.font-style',
            '.text-decoration', '.background', '.width', '.height',
            '.border-collapse'
        ],
        'td': _cell_attrs,
        'th': _cell_attrs,
        'a': ['href', 'target', 'name'],
        'embed': [
            'src', 'width', 'height', 'type', 'loop', 'autostart', 'quality',
            '.width', '.height', 'align', 'allowscriptaccess'
        ],
        'img': [
            'src', 'width', 'height', 'border', 'alt', 'title', 'align',
            '.width', '.height', '.border'
        ],
        'p': _block_attrs,
        'ol': _block_attrs,
        'ul': _block_attrs,
        'li': _block_attrs,
        'blockquote': _block_attrs,
        'h1': _block_attrs,
        'h2': _block_attrs,
        'h3': _block_attrs,
        'h4': _block_attrs,
        'h5': _block_attrs,
        'h6': _block_attrs,
        'pre': ['class'],
        'hr': ['class', '.page-break-after'],
        'br': [],
        'tbody': [],
        'tr': [],
        'strong': [],
        'b': [],
        'sub': [],
        'em': [],
        'i': [],
        'u': [],
        'strike': [],
        's': [],
        'del': [],
    }
    soup = BeautifulSoup(content, 'html.parser')
    for tag in soup.find_all():
        if tag.name not in legal_tag_dict:
            # Unknown tag: drop the node together with its contents.
            tag.decompose()
        elif tag.attrs:
            # Collect illegal attribute names first, then delete, to avoid
            # mutating the dict while iterating it.  (Leftover debug
            # prints from the original were removed here.)
            allowed = legal_tag_dict[tag.name]
            illegal = [attr for attr in tag.attrs if attr not in allowed]
            for attr in illegal:
                del tag.attrs[attr]
    return soup.decode()
def clean(content):
    """Sanitize *content* HTML against a tag/attribute whitelist.

    Non-whitelisted tags are removed with their contents; whitelisted tags
    keep only whitelisted attributes ('.name' entries denote CSS style
    properties accepted by the editor).  <img> tags additionally receive
    an inline style forcing them to the container width.

    Bug fixes vs. the original:
      * whitelist keys 'tr,' and 'b,' contained stray commas, so <tr> and
        <b> elements were always destroyed;
      * attribute filtering read the non-existent ``tag.attr`` (always
        None in bs4) and indexed the ResultSet ``tags`` by tag name, so
        illegal attributes were never actually stripped;
      * a decomposed tag was still being inspected afterwards, and a
        discarded ``prettify()`` call was removed.

    Returns:
        str: the sanitized HTML.
    """
    # Shared attribute whitelists, deduplicated from the original per-tag
    # copies; treated as read-only.
    _block_attrs = [
        'align', '.text-align', '.color', '.background-color', '.font-size',
        '.font-family', '.background', '.font-weight', '.font-style',
        '.text-decoration', '.vertical-align', '.text-indent', '.margin-left'
    ]
    _cell_attrs = [
        'align', 'valign', 'width', 'height', 'colspan', 'rowspan', 'bgcolor',
        '.text-align', '.color', '.background-color', '.font-size',
        '.font-family', '.font-weight', '.font-style', '.text-decoration',
        '.vertical-align', '.background', '.border'
    ]
    valid_tags = {
        'font': ['color', 'size', 'face', '.background-color'],
        'span': [
            '.color', '.background-color', '.font-size', '.font-family',
            '.background', '.font-weight', '.font-style', '.text-decoration',
            '.vertical-align', '.line-height'
        ],
        'div': [
            'align', '.border', '.margin', '.padding', '.text-align',
            '.color', '.background-color', '.font-size', '.font-family',
            '.font-weight', '.background', '.font-style', '.text-decoration',
            '.vertical-align', '.margin-left'
        ],
        'table': [
            'border', 'cellspacing', 'cellpadding', 'width', 'height',
            'align', 'bordercolor', '.padding', '.margin', '.border',
            'bgcolor', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.font-weight', '.font-style',
            '.text-decoration', '.background', '.width', '.height',
            '.border-collapse'
        ],
        'td': _cell_attrs,
        'th': _cell_attrs,
        'a': ['href', 'target', 'name'],
        'embed': [
            'src', 'width', 'height', 'type', 'loop', 'autostart', 'quality',
            '.width', '.height', 'align', 'allowscriptaccess'
        ],
        'img': [
            'src', 'width', 'height', 'border', 'alt', 'title', 'align',
            '.width', '.height', '.border'
        ],
        'p': _block_attrs,
        'ol': _block_attrs,
        'ul': _block_attrs,
        'li': _block_attrs,
        'blockquote': _block_attrs,
        'h1': _block_attrs,
        'h2': _block_attrs,
        'h3': _block_attrs,
        'h4': _block_attrs,
        'h5': _block_attrs,
        'h6': _block_attrs,
        'pre': ['class'],
        'hr': ['class', '.page-break-after'],
        'br': [],
        'tbody': [],
        'tr': [],
        'strong': [],
        'b': [],
        'sub': [],
        'sup': [],
        'em': [],
        'i': [],
        'u': [],
        'strike': [],
        's': [],
        'del': []
    }
    soup = BeautifulSoup(content, 'html.parser')
    for tag in soup.find_all():
        if tag.name not in valid_tags:
            tag.decompose()
            continue  # the tag no longer exists; don't touch it further
        # Keep only whitelisted attributes; collect first to avoid mutating
        # the dict while iterating it.
        allowed = valid_tags[tag.name]
        for k in [k for k in tag.attrs if k not in allowed]:
            del tag.attrs[k]
        if tag.name == 'img':
            # Force images to the container width.  Applied after the
            # attribute filtering so the injected style is not stripped.
            tag['style'] = 'width:100%'
    content = soup.decode()
    return content
def process(filepath):
    """Post-process one generated docbook HTML file in place.

    Rewrites navigation header/footer links (Index / Table of Contents,
    adjusted per file name), injects a doctype and viewport meta tag, then
    syntax-highlights <pre class="CLexer"> blocks and renders
    <span class="mathphrase"> contents via render_latex.
    """
    # print "in process"
    print(filepath)
    with open(filepath, 'rb') as f:
        # print "opened " + filepath
        l = filepath.split('/')
        name = ''
        # Only files directly under a 'build' directory keep their name;
        # others are processed with name == ''.
        if (l[len(l) - 2]) == 'build':
            name = l[len(l) - 1]
        s = f.read()
        #s = s.replace(find, replace)
        # Make intra-book links directory-style and patch the HTML header.
        s = s.replace(b"index.html", b"")
        s = s.replace(b"<html>", b"<!DOCTYPE html lang=\"en\">")
        s = s.replace(
            b'<meta',
            b"<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\"><meta"
        )
        soup = BeautifulSoup(s, "lxml")
        # Rewrite the navigation header: first cell gets an Index link,
        # second an appropriate Table of Contents link.
        for i in soup.find_all("table", attrs={"summary": "Navigation header"}):
            i.contents[0].contents[0].clear()
            if name == "index.html":
                link = BeautifulSoup("<a href=\"ix01.html\">Index</a>", "lxml")
            elif name == "ix01.html":
                link = BeautifulSoup("", "lxml")
            else:
                link = BeautifulSoup("<a href=\"../ix01.html\">Index</a>", "lxml")
            i.contents[0].contents[0].insert(0, link)
            if name == "index.html":
                link = BeautifulSoup("", "lxml")
            elif name == "ix01.html":
                link = BeautifulSoup(
                    "<a href=\"index.html\">Table of Contents</a>", "lxml")
            else:
                link = BeautifulSoup("<a href=\"../\">Table of Contents</a>",
                                     "lxml")
            i.contents[1].contents[1].insert(0, link)
        # Re-parse so the inserted sub-soups become part of the tree.
        soup = BeautifulSoup(soup.renderContents(), "lxml")
        # Same treatment for the navigation footer.
        for j in soup.findAll("table", attrs={"summary": "Navigation footer"}):
            if name == "index.html":
                link = BeautifulSoup("<a href=\"ix01.html\">Index</a>", "lxml")
            elif name == "ix01.html":
                link = BeautifulSoup("", "lxml")
            else:
                link = BeautifulSoup("<a href=\"../ix01.html\">Index</a>", "lxml")
            j.contents[0].contents[1].insert(0, link)
            # NOTE(review): this assignment is immediately overwritten by the
            # if/elif chain below -- it is dead code kept from the original.
            if name == "ix01.html":
                link = BeautifulSoup(
                    "<a href=\"index.html\">Table of Contents</a>", "lxml")
            if name == "index.html":
                link = BeautifulSoup("", "lxml")
            elif name == "ix01.html":
                link = BeautifulSoup(
                    "<a href=\"index.html\">Table of Contents</a>", "lxml")
            else:
                link = BeautifulSoup("<a href=\"../\">Table of Contents</a>",
                                     "lxml")
            #j.contents[0].contents[1].insert(0, link)
            j.contents[1].contents[1].clear()
            j.contents[1].contents[1].insert(0, link)
        # Now mathjax removed
        # p = BeautifulSoup("<h3><a href='/'>Site Home</a></h3><p class='alert alert-danger'>Please see <a href=\"http://caniuse.com/#feat=mathml\">http://caniuse.com/#feat=mathml</a> if your browser supports MathML because certain sections of this book rely on MathML. If your browser does not support MathML please install Firefox from <a href=\"https://www.mozilla.org\">Mozilla</a> because AFAIK Firefox supports MathML. On other browsers Mathjax will take its sweet time to render page.</p>", "lxml")
        #soup.body.insert(0, p)
        soup = BeautifulSoup(soup.renderContents(), "lxml")
        # Syntax-highlight C listings with pygments.
        for i in soup.find_all("pre", attrs={"class": "CLexer"}):
            code = BeautifulSoup(
                highlight(i.string, CLexer(), HtmlFormatter()), "lxml")
            i.string.replace_with(code)
        # Render math phrases to markup via render_latex.
        for i in soup.find_all("span", attrs={"class": "mathphrase"}):
            math = BeautifulSoup(render_latex(i.string), "lxml")
            i.string.replace_with(math)
    # Overwrite the input file with the transformed document.
    with open(filepath, "w") as f:
        f.write(soup.decode(formatter='html'))
valid_tag = {'p': ['class', 'id'], 'img': 'src', 'div': 'class'} #Tag.decompose() 方法将当前节点移除文档树并完全销毁: #找到所有的标签名 tags = soup.find_all() for tag in tags: # print('tag--------',tag) if tag.name in valid_tag: # print(tag.attrs) 取属性 tag.decompose() if tag.attrs: #是否有属性 for k in list(tag.attrs.keys()): #{id:'i1',a=123,b=999} if k not in valid_tag[tag.name]: del tag.attrs[k] content_str = soup.decode() print(content_str) # v = soup.find(name='p',attrs={'id':'i2'}) # print(v) # tag = soup.find(name='p') # sc = tag.find('script') # print(sc) # tag = soup.find(name='p') # sc = tag.find('script') # print(sc)
#print(title) #print(title[0]) #abstract=re.findall('<div id="abstract">\n\s*(.*?)\n\s*</div>',page.decode('utf-8'),re.S) #print(abstract) cnt = 0 for link in page.find_all("dt", class_="ptitle"): uuu = link.find('a') url = str(uuu.get('href')) #print(type(url)) urll = "http://openaccess.thecvf.com/" urll = urll + url print(urll) file1 = urllib.request.urlopen(urll).read() page1 = BeautifulSoup(file1, "html.parser") title = re.findall('<div id="papertitle">\n\s*(.*?)\n\s*</div>', page1.decode('utf-8'), re.S) abstract = re.findall('<div id="abstract">\n\s*(.*?)\n\s*</div>', page1.decode('utf-8'), re.S) if cnt != 0: f.write('\n') f.write('\n') f.write(str(cnt)) f.write('\n') f.write("Title: " + title[0]) f.write('\n') f.write("Abstract: " + abstract[0]) f.write('\n') cnt = cnt + 1
def parse(self, response):
    """Scrapy callback: extract one wiki article page into a WikiItem.

    Collects the title, url, first infobox image, infobox key/value
    rows, right-floated thumbnails with captions, a per-paragraph lead
    summary, the sectioned body text (keyed by section heading) and the
    category links, and stores them all under item['detail'].
    """
    item = WikiItem()
    title = response.xpath(
        '//h1[@id="firstHeading"]/text()').extract_first()
    item['title'] = title
    item['url'] = response.url
    # tr_list = response.xpath('//table[@class="infobox vcard"]/tr')
    tr_list = response.css('.infobox tr')
    image = tr_list.xpath('//a[@class="image"]/img/@src').extract_first()
    if image is not None:
        item['image'] = "https:" + image
    # Strips footnote markers like [1] / [1a] from extracted text.
    r_part = re.compile(r'\[\d.\]|\[\d\]')
    # Infobox table on the right-hand side of the page.
    info_box = []
    for tr in tr_list:
        th = tr.xpath('./th[@scope="row"]//text()').extract_first()
        if th is not None:
            td = re.sub(r_part, "",
                        "".join(tr.xpath('./td//text()').extract()))
            info_box.append({'key': th, 'value': stripTagSimple(td)})
    print(info_box)
    # print(title)
    # Right-floated thumbnails with their captions.
    pic = []
    thumb_tright = response.xpath(
        '//div[@class="thumb tright"]/div[@class="thumbinner"]')
    for p in thumb_tright:
        if p.xpath('./a/img/@src').extract_first() is not None:
            img = 'https:' + p.xpath('./a/img/@src').extract_first()
            img_desc = re.sub(
                r_part, "", "".join(
                    p.xpath(
                        './div[@class="thumbcaption"]//text()').extract()))
            pic.append({'url': img, 'img_desc': stripTagSimple(img_desc)})
    # print(pic)
    item['pic'] = pic
    html_content = response.xpath(
        '//div[@id="mw-content-text"]').extract_first()
    soup = BeautifulSoup(html_content, 'html.parser')
    # Destroy the table-of-contents node.
    catalog = soup.find('div', class_="toc")
    if catalog is not None:
        soup.find('div', class_="toc").decompose()
    # Destroy the references node.
    ref = soup.find('ol', class_="references")
    if ref is not None:
        soup.find('ol', class_="references").decompose()
    # ps holds the lead paragraphs of the article body.
    div = soup.find(name='div', class_='mw-parser-output')
    ps = div.find_all('p', recursive=False)  # only direct children
    # The summary is every leading paragraph up to the first empty one.
    index = 0
    for p in ps:
        if p.get_text() == '':
            break
        index += 1
    summary = {}
    s_index = 0
    while s_index < index:
        summary[f'{s_index}'] = stripTagSimple(ps[s_index].get_text())
        s_index += 1
    print(summary)
    # Slice the remaining HTML into <h2>-delimited sections.
    start = re.compile(r'<p>', re.DOTALL)
    search_result = start.search(soup.decode('utf-8'))
    if search_result is None:
        search_result = re.compile(r'<h2>',
                                   re.DOTALL).search(soup.decode('utf-8'))
    content_text = collections.OrderedDict()
    if search_result is not None:
        start_node = soup.decode('utf-8')[search_result.start():]
        lists = start_node.split('<h2>')
        i = 1
        while i < len(lists):
            lists[i] = '<h2>' + lists[i]
            final_soup = BeautifulSoup(lists[i], 'html.parser')
            para_title = final_soup.find(
                'span', class_="mw-headline").get_text().strip()
            # Skip "external links" and "references"-style sections.
            if para_title == "外部链接" or "参考" in para_title:
                i += 1
                continue
            para_contents = final_soup.find_all(['p', 'li', 'table'])
            texts = []
            for para in para_contents:
                if para.name == 'table':
                    # Tables are kept as (pretty-printed) HTML.
                    texts.append(para.prettify())
                    continue
                texts.append(stripTagSimple(para.get_text('', True)))
            # '.' is not allowed in the storage key; replace it.
            content_text[para_title.replace('.', '点')] = texts
            i += 1
    # Category links at the bottom of the page.
    catlinks = response.xpath(
        '//div[@class="catlinks"]/div[@id="mw-normal-catlinks"]//li')
    tag = {}
    j = 0
    for link in catlinks:
        # NOTE(review): href is computed but never stored -- confirm intent.
        href = 'https://zh.wikipedia.org' + link.xpath(
            './a/@href').extract_first()
        cat = link.xpath('./a/text()').extract_first()
        tag[f'{j}'] = cat
        j += 1
    detail = {
        'title': title,
        'summary': summary,
        'infobox': info_box,
        'content': content_text,
        'category': tag,
    }
    item['detail'] = detail
    now_time = datetime.datetime.fromtimestamp(time.time())
    item['updateAt'] = now_time
    return item
def get_latest_release(distro, release):
    """Look up the newest cloud image for *distro*/*release*.

    Scrapes the distro's public image index, picks the latest build,
    reads the matching sha256 from the published checksum file and
    returns the result as an ImageObject.

    Parameters:
        distro: 'ubuntu' or 'centos' (anything else returns an empty
            ImageObject).
        release: release string, e.g. '18.04' or '7'.

    NOTE(review): the ubuntu branch compares releases as plain strings
    ('16.04' <= / >= '18.04'); releases strictly between the two (e.g.
    '16.10', '17.04') match neither branch and leave image_name as None
    -- confirm the intended coverage.
    """
    # Instantiate ImageObject
    image = ImageObject()
    if distro == 'ubuntu':
        # Define URL & other variables (defines all because of clarity)
        url = 'https://cloud-images.ubuntu.com/releases/{rel}/'.format(rel=release, )
        hashfile_url = 'SHA256SUMS'
        image_hash = None
        image_name = None
        image_url = None
        latest_build = None
        image_suffix = None
        # Retrieve the latest image url
        match_list = []  # Define match_list for later use
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        compile_string = 'release-'
        builds = soup.find_all(href=re.compile(compile_string))
        for build in builds:
            # Remove forward slash in the strings
            match_list.append(build.string.replace('/', ''))
        # Sort Alphanumericly
        sorted_builds = sorted(
            match_list,
            key=lambda item: (int(item.partition(' ')[0])
                              if item[0].isdigit() else float('inf'), item))
        # It is nicer to set a variable that is the last in the list
        latest_build = sorted_builds[-1]
        if release <= '16.04':
            # We want to send back the image name
            image_name = 'ubuntu-{release}-server-cloudimg-amd64-disk1.img'.format(release=release)
            # We also want to send back the url that was used
            image_url = '{url}{build}/ubuntu-{release}-server-cloudimg-amd64-disk1.img'.format(url=url, build=latest_build, release=release)
            image_suffix = 'img'
        elif release >= '18.04':
            # We want to send back the image name
            image_name = 'ubuntu-{release}-server-cloudimg-amd64.img'.format(release=release)
            # We also want to send back the url that was used
            image_url = '{url}{build}/ubuntu-{release}-server-cloudimg-amd64.img'.format(url=url, build=latest_build, release=release)
            image_suffix = 'img'
        # Get sha256 to compare with database
        page = requests.get('{url}{build}/{hashfile}'.format(url=url, build=latest_build, hashfile=hashfile_url))
        soup = BeautifulSoup(page.text, 'html.parser')
        hash_list = soup.decode().split("\n")
        hash_list.pop(-1)  # drop the trailing empty line
        for hash in hash_list:
            # Match the checksum line that ends with our image name.
            search_string = '{}$'.format(image_name)
            if re.search(search_string, hash):
                image_hash = hash.split(' ')[0]
        # v2: ImageObject
        image.name = image_name
        image.sha256 = image_hash
        image.source_url = image_url
        image.build = latest_build
        image.file_suffix = image_suffix
    elif distro == 'centos':
        # Define URL & other variables (defines all because of clarity)
        url = 'http://cloud.centos.org/centos/{rel}/images/'.format(rel=release)
        hashfile_url = 'sha256sum.txt'
        image_name = ""
        image_hash = ""
        image_url = ""
        latest_build = ""
        # Retrieve the latest image url
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        builds = soup.find_all(href=re.compile('CentOS-[0-9]-x86_64-GenericCloud-[0-9]{4}.qcow2'))
        match_list = []
        for build in builds:
            # Skip the compressed (.qcow2c / .qcow2.xz) variants.
            pattern1 = re.compile(r'(CentOS-[0-9]-x86_64-GenericCloud-[0-9]{4}.qcow2c)|CentOS-[0-9]-x86_64-GenericCloud-[0-9]{4}.qcow2.xz')
            match_all_other_files = pattern1.match(build.contents[0])
            if match_all_other_files:
                continue
                #print('DELETE {}'.format(match_all_other_files.group(0)))
            else:
                match_list.append(build.contents[0])
        # Sort Alphanumericly
        sorted_builds = sorted(
            match_list,
            key=lambda item: (int(item.partition(' ')[0])
                              if item[0].isdigit() else float('inf'), item))
        # It is nicer to set a variable that is the last in the list
        latest_build = sorted_builds[-1].split('-')[4].replace('.qcow2', '')
        image_url = '{url}CentOS-{release}-x86_64-GenericCloud-{build}.qcow2'.format(url=url, release=release, build=latest_build)
        image_suffix = 'qcow2'
        # Get sha256 to compare with database
        page = requests.get('{url}{hashfile}'.format(url=url, hashfile=hashfile_url))
        soup = BeautifulSoup(page.text, 'html.parser')
        hash_list = soup.decode().split("\n")
        hash_list.pop(-1)  # drop the trailing empty line
        for hash in hash_list:
            search_string = 'CentOS-{release}-x86_64-GenericCloud-{build}.qcow2$'.format(release=release, build=latest_build)
            if re.search(search_string, hash):
                image_hash = hash.split(' ')[0]
                image_name = hash.split(' ')[1]
        # v2: ImageObject
        image.name = image_name
        image.sha256 = image_hash
        image.source_url = image_url
        image.build = latest_build
        image.file_suffix = image_suffix
    return image
import urllib2 from bs4 import BeautifulSoup import zlib # f=urllib2.urlopen(url) quote_page = 'http://www.emmacloth.com/Tassel-Trim-Dolphin-Hem-Striped-Tee-Dress-p-356028-cat-1727.html' page = urllib2.urlopen(quote_page) decompressed_data = zlib.decompress(page.read(), 16 + zlib.MAX_WBITS) #print decompressed_data soup = BeautifulSoup(page, 'html.parser') print soup.decode('gzip') json_string = soup.find_all('script') print "The json string = ", json_string
def test_rddd001_initial_state(dash_duo):
    """Integration test: verify the initial serialized state of a nested
    Dash layout -- rendered DOM snapshot, redux store layout/paths, and
    the absence of callbacks, console errors and dependencies."""
    app = dash.Dash(__name__)
    my_class_attrs = {
        "id": "p.c.4",
        "className": "my-class",
        "title": "tooltip",
        "style": {
            "color": "red",
            "fontSize": 30
        },
    }
    # Deeply nested layout whose component ids spell out their own
    # props/children ("p.c.") paths in the tree.
    # fmt:off
    app.layout = html.Div([
        'Basic string',
        3.14,
        True,
        None,
        html.Div('Child div with basic string', **my_class_attrs),
        html.Div(id='p.c.5'),
        html.Div([
            html.Div('Grandchild div', id='p.c.6.p.c.0'),
            html.Div([
                html.Div('Great grandchild', id='p.c.6.p.c.1.p.c.0'),
                3.14159,
                'another basic string'
            ], id='p.c.6.p.c.1'),
            html.Div([
                html.Div(html.Div([
                    html.Div([
                        html.Div(id='p.c.6.p.c.2.p.c.0.p.c.p.c.0.p.c.0'),
                        '',
                        html.Div(id='p.c.6.p.c.2.p.c.0.p.c.p.c.0.p.c.2')
                    ], id='p.c.6.p.c.2.p.c.0.p.c.p.c.0')
                ], id='p.c.6.p.c.2.p.c.0.p.c'), id='p.c.6.p.c.2.p.c.0')
            ], id='p.c.6.p.c.2')
        ], id='p.c.6')
    ])
    # fmt:on
    dash_duo.start_server(app)
    # Note: this .html file shows there's no undo/redo button by default
    with open(
            os.path.join(os.path.dirname(__file__),
                         "initial_state_dash_app_content.html")) as fp:
        expected_dom = BeautifulSoup(fp.read().strip(), "lxml")
    fetched_dom = dash_duo.dash_outerhtml_dom
    assert (fetched_dom.decode() == expected_dom.decode()
            ), "the fetching rendered dom is expected"
    assert dash_duo.get_logs(
    ) == [], "Check that no errors or warnings were displayed"
    # The redux store layout must round-trip to the same JSON as app.layout.
    assert dash_duo.driver.execute_script(
        "return JSON.parse(JSON.stringify(window.store.getState().layout))"
    ) == json.loads(json.dumps(app.layout, cls=plotly.utils.PlotlyJSONEncoder)
                    ), "the state layout is identical to app.layout"
    r = requests.get("{}/_dash-dependencies".format(dash_duo.server_url))
    assert r.status_code == 200
    assert r.json(
    ) == [], "no dependencies present in app as no callbacks are defined"
    paths = dash_duo.redux_state_paths
    assert paths["objs"] == {}
    # Abbreviated component ids like 'p.c.4' expand token-by-token into
    # ['props', 'children', 4] store paths.
    assert paths["strs"] == {
        abbr: [
            int(token) if token in string.digits else token.replace(
                "p", "props").replace("c", "children")
            for token in abbr.split(".")
        ]
        for abbr in (child.get("id") for child in fetched_dom.find(
            id="react-entry-point").findChildren(id=True))
    }, "paths should reflect to the component hierarchy"
    assert dash_duo.redux_state_rqs == [], "no callback => no pendingCallbacks"
    dash_duo.percy_snapshot(name="layout")
    assert dash_duo.get_logs() == [], "console has no errors"
async def generate_page(self):
    """Convert self.xml into Telegraph-compatible HTML.

    Walks every tag and normalizes it against the Telegraph whitelist
    constants (inserting line breaks, deleting/replacing/unwrapping tags,
    rendering multi-column tables to images when enabled, and pruning
    attributes), then fills in the telegraph_* metadata fields from the
    feed information.
    """
    soup = BeautifulSoup(self.xml, 'lxml')
    for tag in soup.find_all(recursive=True):
        try:
            # add linebreak after certain tags
            if tag.name in TELEGRAPH_TAGS_INSERT_BR_AFTER:
                tag.insert_after(soup.new_tag('br'))
            # remove tags that are not allowed in <li>
            if tag.name == 'li':
                disallowed_tags = tag.find_all(
                    TELEGRAPH_DISALLOWED_TAGS_IN_LI, recursive=True)
                for disallowed_tag in disallowed_tags:
                    disallowed_tag.replaceWithChildren()
            # deal with tags itself
            if tag.name in TELEGRAPH_DEL_TAGS:
                if tag.name == 'table':
                    rows = tag.find_all('tr')
                    if not rows:
                        tag.decompose()
                        continue
                    for row in rows:
                        columns = row.find_all(('td', 'th'))
                        if len(columns) != 1:
                            # multi-column table: optionally render the whole
                            # table to a PNG and embed it as an image instead
                            if env.TABLE_TO_IMAGE:
                                table_img = await convert_table_to_png(
                                    str(tag))
                                if table_img:
                                    url_l = await apis.get_account(
                                    ).upload(BytesIO(table_img), full=False)
                                    url = url_l[0] if url_l else None
                                    if url:
                                        tag.replaceWith(
                                            soup.new_tag('img', src=url))
                                        continue
                            tag.decompose()
                            continue
                    # single-column table: keep its contents without the table
                    tag.replaceWithChildren()
                else:
                    tag.decompose()
                continue
            elif tag.name in TELEGRAPH_REPLACE_TAGS:
                old_name = tag.name
                new_name = TELEGRAPH_REPLACE_TAGS[old_name]
                tag.name = new_name
                if old_name.startswith('h') and not new_name.startswith(
                        'h') and new_name != 'p':
                    # ensure take a whole line
                    tag.insert_before(soup.new_tag('br')) \
                        if (hasattr(tag.previous_sibling, 'name')
                            and tag.previous_sibling.name not in {'br', 'p'}
                            and not tag.previous_sibling.name.startswith('h')) \
                        else None
                    tag.insert_after(soup.new_tag('br'))
            elif tag.name not in TELEGRAPH_ALLOWED_TAGS:
                tag.replaceWithChildren()  # remove disallowed tags
                continue
            # verify tags
            if tag.name == 'a' and not tag.text:
                tag.replaceWithChildren()  # remove invalid links
                continue
            elif tag.name == 'img' and is_emoticon(tag):
                alt = tag.get('alt')
                tag.replaceWith(
                    alt) if alt else tag.decompose()  # drop emoticon
                continue
            # deal with attributes
            if tag.name not in TELEGRAPH_TAGS_ALLOW_ATTR:
                tag.attrs = {}  # remove all attributes
                continue
            else:
                # keep exactly the one attribute Telegraph allows on this tag
                attr_name = TELEGRAPH_TAGS_ALLOW_ATTR[tag.name]
                attr_content = tag.attrs.get(attr_name)
                if not attr_content:
                    tag.replaceWithChildren()
                    continue
                if self.link:
                    attr_content = resolve_relative_link(
                        self.link, attr_content)
                if not isAbsoluteHttpLink(attr_content):
                    tag.replaceWithChildren()
                    continue
                # route media through the image relay server
                if tag.name in {
                        'video', 'img'
                } and not attr_content.startswith(env.IMG_RELAY_SERVER):
                    attr_content = env.IMG_RELAY_SERVER + attr_content
                tag.attrs = {attr_name: attr_content}
        except (ValueError, AttributeError):
            # best-effort normalization: skip tags that fail to transform
            pass
    if self.feed_title:
        self.telegraph_author = f"{self.feed_title}"
        if self.author and self.author not in self.feed_title:
            self.telegraph_author += f' ({self.author})'
        self.telegraph_author_url = self.link if self.link else ''
    else:
        self.telegraph_author = 'Generated by RSStT'
        self.telegraph_author_url = 'https://github.com/Rongronggg9/RSS-to-Telegram-Bot'
    self.telegraph_title = self.title if self.title else 'Generated by RSStT'
    self.telegraph_html_content = (
        soup.decode() + '<p>Generated by '
        '<a href="https://github.com/Rongronggg9/RSS-to-Telegram-Bot">RSStT</a>. '
        'The copyright belongs to the original author.</p>'
        # "If images cannot be loaded properly due to anti-hotlinking, "
        # "please consider install "
        # "<a href='https://greasyfork.org/scripts/432923'>this userscript</a>."
        + (f'<p><a href="{self.link}">Source</a></p>' if self.link else ''))
def filter_xss(html_str):
    """Sanitize *html_str* with a tag/attribute whitelist and return the result.

    Tags absent from the whitelist are removed entirely, including their
    contents; on surviving tags every attribute not explicitly whitelisted
    for that tag name is stripped.

    NOTE(review): keys starting with '.' look like CSS property names, but
    the 'style' attribute itself is never whitelisted, so those entries are
    currently inert — confirm whether inline-style filtering was intended.
    """
    # Whitelist: tag name -> allowed attribute names.  Comma-separated keys
    # group tags that share the same attribute list.
    valid_dict = {
        "font": ['color', 'size', 'face', '.background-color'],
        "span": [
            '.color', '.background-color', '.font-size', '.font-family',
            '.background', '.font-weight', '.font-style', '.text-decoration',
            '.vertical-align', '.line-height'
        ],
        "div": [
            'align', '.border', '.margin', '.padding', '.text-align', '.color',
            '.background-color', '.font-size', '.font-family', '.font-weight',
            '.background', '.font-style', '.text-decoration', '.vertical-align',
            '.margin-left'
        ],
        "table": [
            'border', 'cellspacing', 'cellpadding', 'width', 'height', 'align',
            'bordercolor', '.padding', '.margin', '.border', 'bgcolor',
            '.text-align', '.color', '.background-color', '.font-size',
            '.font-family', '.font-weight', '.font-style', '.text-decoration',
            '.background', '.width', '.height', '.border-collapse'
        ],
        'td,th': [
            'align', 'valign', 'width', 'height', 'colspan', 'rowspan',
            'bgcolor', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.font-weight', '.font-style',
            '.text-decoration', '.vertical-align', '.background', '.border'
        ],
        "a": ['href', 'target', 'name'],
        "embed": [
            'src', 'width', 'height', 'type', 'loop', 'autostart', 'quality',
            '.width', '.height', 'align', 'allowscriptaccess'
        ],
        "img": [
            'src', 'width', 'height', 'border', 'alt', 'title', 'align',
            '.width', '.height', '.border'
        ],
        'p,ol,ul,li,blockquote,h1,h2,h3,h4,h5,h6': [
            'align', '.text-align', '.color', '.background-color', '.font-size',
            '.font-family', '.background', '.font-weight', '.font-style',
            '.text-decoration', '.vertical-align', '.text-indent',
            '.margin-left'
        ],
        "pre": ['class'],
        "hr": ['class', '.page-break-after'],
        'br,tbody,tr,strong,b,sub,sup,em,i,u,strike,s,del': []
    }
    # Expand the comma-separated group keys ('td,th', 'p,ol,ul,...') into one
    # entry per tag.  Without this expansion those tag names never matched a
    # key, so <td>, <p>, <li>, <br>, <strong>, ... were all wrongly removed.
    allowed = {}
    for group, attr_names in valid_dict.items():
        for tag_name in group.split(','):
            allowed[tag_name] = attr_names

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_str, "html.parser")
    for ele in soup.find_all():
        # Descendants of an already-decomposed ancestor are still present in
        # the find_all() result but have had their state cleared; touching
        # .name/.attrs on them would raise AttributeError.
        if getattr(ele, '_decomposed', False):
            continue
        if ele.name not in allowed:
            # drop the illegal tag together with everything inside it
            ele.decompose()
            continue
        # strip every attribute not whitelisted for this tag
        valid_attrs = allowed[ele.name]
        for key in [k for k in ele.attrs if k not in valid_attrs]:
            del ele.attrs[key]
    return soup.decode()
<span style="font-size: 8px">testspan</span> <script>alter('123')</script> ''' # 黑名单方式删除匹配的标签及属性 #从html格式解析content里面的内容 soup=BeautifulSoup(content,'html.parser') # 查找content里面的script标签 tag=soup.find('script') # 打印查找到的标签 print('打印匹配到的script标签',tag) # 清空tag对象,也就是清空查找到的script标签里面的内容,但是script标签本身还是留下 tag.clear() print('decode前',content) # 将转码成字符串,执行后<script><script>里面的alter('123')被清除了。 content=soup.decode(content) print('decode后,清空了script里面的内容',content) #隐藏匹配到的<script>标签 tag.hidden=True content=soup.decode(content) print('隐藏了script标签',content) span=soup.find('span') # 找到span标签属性并以字典显示 print('找到span标签的属性',span.attrs) del span.attrs['style'] print('删除掉span的sytle属性',span) print(content) # 标签白名单