def write_css_js(self, filepath):
    with html_writer.HTMLWriter(filepath, "a") as zipper, open("chefdata/styles.css") as f:
        content = f.read()
        zipper.write_contents("styles.css", content, directory="css/")

    with html_writer.HTMLWriter(filepath, "a") as zipper, open("chefdata/scripts.js") as f:
        content = f.read()
        zipper.write_contents("scripts.js", content, directory="js/")
def write_images(self, filepath, content):
    self.get_images(content)
    with html_writer.HTMLWriter(filepath, "a") as zipper:
        for img_src, img_filename in self.images.items():
            try:
                zipper.write_url(img_src, img_filename, directory="files")
            except requests.exceptions.HTTPError:
                pass
def write_contents(self, filepath_index, filename, content, directory="files"):
    with html_writer.HTMLWriter(filepath_index, "a") as zipper:
        # Wrap the fragment in a full HTML shell that links the shared css/js assets
        content = '<html><head><meta charset="utf-8"><link rel="stylesheet" href="../css/styles.css"></head><body>{}<script src="../js/scripts.js"></script></body></html>'.format(content)
        zipper.write_contents(filename, content, directory=directory)
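# Usage sketch for the two helpers above (illustrative, not from the source).
# Assumes html_writer is ricecooker.utils.html_writer and that the methods live
# on some chef/scraper class; `chef` and the paths below are hypothetical.
#
# chef.write_css_js("out/article.zip")     # seeds css/styles.css and js/scripts.js
# chef.write_contents("out/article.zip",   # wraps the fragment in a full HTML
#                     "page1.html",        # shell that links those assets
#                     "<h1>Chapter 1</h1>")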
def _download_file(self, write_to_path):
    with html_writer.HTMLWriter(write_to_path) as zipper:
        try:
            self.zipper = zipper
            self.to_zip(filename='index.html')
        except Exception as e:
            # Any errors here will just say the index.html file does not exist,
            # so print out the error for more descriptive debugging
            LOGGER.error(str(e))
def write_images(self, filepath, images):
    with html_writer.HTMLWriter(filepath, "a") as zipper:
        for img_src, img_filename in images.items():
            try:
                # Inline data URIs need no download
                if img_src.startswith("data:image/"):
                    pass
                else:
                    # Probe the URL first so slow hosts raise ConnectTimeout
                    # before write_url attempts the real download
                    requests.get(img_src, timeout=40)
                    zipper.write_url(img_src, img_filename, directory="")
            except requests.exceptions.HTTPError:
                pass
            except requests.exceptions.ConnectTimeout as e:
                LOGGER.info(str(e))
def write_images(self, filepath, images):
    with html_writer.HTMLWriter(filepath, "a") as zipper:
        for img_src, img_filename in images.items():
            try:
                # Inline data URIs and local file:// sources need no download
                if img_src.startswith("data:image/") or img_src.startswith("file://"):
                    pass
                else:
                    # zipper.write_url(img_src, img_filename, directory="")
                    zipper.write_contents(img_filename,
                                          downloader.read(img_src, timeout=5, session=sess),
                                          directory="")
            except (requests.exceptions.HTTPError,
                    requests.exceptions.ConnectTimeout,
                    requests.exceptions.ConnectionError,
                    FileNotFoundError,
                    requests.exceptions.ReadTimeout):
                pass
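# The variant above relies on a module-level `sess` and ricecooker's downloader,
# neither of which is shown in the snippet. A plausible setup, hedged as an
# assumption rather than the source's actual code:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from ricecooker.utils import downloader  # provides downloader.read(url, timeout=..., session=...)

sess = requests.Session()
# Retry transient failures so downloader.read(..., session=sess) survives flaky hosts
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
sess.mount("http://", HTTPAdapter(max_retries=retries))
sess.mount("https://", HTTPAdapter(max_retries=retries))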
def write_index(self, filepath, content):
    with html_writer.HTMLWriter(filepath, "w") as zipper:
        zipper.write_index_contents(content)
def download_file(self, write_to_path):
    # Generate a .zip file
    with html_writer.HTMLWriter(write_to_path) as zipper:
        self.zipper = zipper
        self.to_zip(filename='index.html')
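# Both download helpers above delegate to self.to_zip, which is defined elsewhere
# in each class. A minimal sketch of its expected shape, assuming self.zipper is
# the open HTMLWriter and the HTML body comes from a scrape step (placeholder here):
def to_zip(self, filename='index.html'):
    html = "<html><body><h1>Placeholder page</h1></body></html>"
    # write_index_contents stores the zip's entry point; real subclasses would
    # build `html` from scraped content and write extra assets via self.zipper
    self.zipper.write_index_contents(html)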
def scrape_snack_page(slug, attempts=5):
    """ Writes activity to a zipfile
        Args:
            slug (str): url slug (e.g. /snacks/drawing-board)
            attempts (int): number of times to attempt a download
        Returns
            write_to_path (str): path to generated zip
            tags ([str]): list of tags scraped from activity page
    """
    tags = []
    write_to_path = os.path.sep.join(
        [SNACK_DIRECTORY, "{}.zip".format(slug.split('/')[-1])])

    try:
        contents = BeautifulSoup(read(slug), 'html5lib')
        main_contents = contents.find('div', {'class': 'activity'})

        # Gather keywords from page
        tags.extend(scrape_keywords(main_contents, 'field-name-field-activity-subject'))
        tags.extend(scrape_keywords(main_contents, 'field-name-field-activity-tags'))

        # Don't rezip activities that have already been zipped
        if os.path.isfile(write_to_path):
            return write_to_path, tags

        with html_writer.HTMLWriter(write_to_path) as zipper:
            write_contents = BeautifulSoup("", "html5lib")

            # Scrape stylesheets
            for stylesheet in contents.find_all('link', {'rel': 'stylesheet'}):
                # Don't scrape external style sheets (e.g. fontawesome, google fonts)
                if "exploratorium.edu" not in stylesheet['href']:
                    continue
                style_contents = scrape_style(stylesheet['href'], zipper)
                filename = stylesheet['href'].split('/')[-1]
                stylesheet['href'] = zipper.write_contents(filename, style_contents, directory="css")
                write_contents.head.append(stylesheet)

            # Remove scripts and any unneeded sections
            cluster = main_contents.find('div', {'id': 'curated-cluster'})
            cluster and cluster.decompose()
            service_links = main_contents.find('div', {'class': 'activity-service-links'})
            service_links and service_links.decompose()
            for script in main_contents.find_all("script"):
                script.decompose()

            # Get rid of hardcoded height/width on slideshow element
            slideshow = main_contents.find('div', {'class': 'field-slideshow'})
            if slideshow:
                del slideshow['style']

            # Add images
            for img in main_contents.find_all('img'):
                img['src'] = zipper.write_url(format_url(img['src']),
                                              img['src'].split('/')[-1],
                                              directory="images")

            # Add videos embedded from youtube
            for video in main_contents.find_all('div', {'class': 'yt-player'}):
                yt_video_path = download_web_video(video['data-ytid'],
                                                   "{}.mp4".format(video['data-ytid']))
                video_tag = generate_video_tag(yt_video_path, zipper)
                video_tag['style'] = video.find('div', {'class': 'placeholder'}).get('style')
                video.replaceWith(video_tag)

            # Add videos embedded from brightcove and remove playlist element (if any)
            for k, v in get_brightcove_mapping(main_contents, get_playlist=True).items():
                video_path = download_web_video(v['url'], "{}.mp4".format(k))
                if v.get('original_el'):
                    v['original_el'].replaceWith(generate_video_tag(video_path, zipper))
                elif v.get('append_to'):
                    if v.get('title'):
                        p_tag = contents.new_tag("p")
                        p_tag.string = v['title']
                        p_tag['style'] = "margin-top: 40px; margin-bottom: 10px"
                        v['append_to'].parent.append(p_tag)
                    v['append_to'].parent.append(generate_video_tag(video_path, zipper))
            playlist = main_contents.find('div', {'id': 'media-collection-banner-playlist'})
            if playlist:
                playlist.decompose()

            # Handle links (need to start with parent as beautifulsoup returns parent as None on links)
            for paragraph in main_contents.find_all('p') + main_contents.find_all('li'):
                for link in paragraph.find_all('a'):
                    # Skip any previously parsed links
                    if zipper.contains(link['href']):
                        continue
                    # Just bold activities and remove link
                    elif "exploratorium.edu/snacks/" in link['href']:
                        bold_tag = contents.new_tag("b")
                        bold_tag.string = link.text
                        link.replaceWith(bold_tag)
                    # If it's an image, replace the tag with just the image
                    elif link.find('img'):
                        link.replaceWith(link.find('img'))
                    # Get downloadable files and attach them to new pages
                    elif "/sites/default/files/" in link['href']:
                        link['href'] = generate_download_page(link['href'], zipper)
                    # Get any referenced videos
                    elif "exploratorium.edu" in link['href']:
                        linked_page = BeautifulSoup(read(link['href']), 'html5lib')
                        link.replaceWith(link.text.replace(link['href'], ''))
                        for k, v in get_brightcove_mapping(linked_page).items():
                            video_path = download_web_video(v['url'], "{}.mp4".format(k))
                            paragraph.append(generate_video_tag(video_path, zipper))
                    # Scrape any images
                    elif next((e for e in IMAGE_EXTENSIONS if link['href'].lower().endswith(e)), None):
                        img_tag = contents.new_tag('img')
                        img_tag['src'] = zipper.write_url(link['href'],
                                                          link['href'].split('/')[-1],
                                                          directory="images")
                        img_tag['style'] = "max-width: 100%;"
                        paragraph.append(img_tag)
                        link.replaceWith(link.text)
                    # Remove hyperlink from external links
                    else:
                        if link['href'] not in link.text and link.text not in link['href']:
                            link.string += " ({}) ".format(link['href'])
                        link.replaceWith(link.text)

            # Write contents and custom tags
            write_contents.body.append(main_contents)
            write_contents.head.append(generate_custom_style_tag())      # Add custom style tag
            write_contents.body.append(generate_custom_script_tag())     # Add custom script to handle slideshow

            # Write main index.html file
            zipper.write_index_contents(write_contents.prettify().encode('utf-8-sig'))
    except Exception as e:
        # Reattempt if there are attempts left
        if attempts > 0:
            return scrape_snack_page(slug, attempts=attempts - 1)
        else:
            LOGGER.error("Could not scrape {} ({})".format(slug, str(e)))

    return write_to_path, tags
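# generate_video_tag appears throughout scrape_snack_page but is defined elsewhere.
# From its call sites it takes a downloaded video path plus the open zipper and
# returns a tag to splice into the page. A sketch under those assumptions, also
# assuming HTMLWriter.write_file copies a local file into the zip and returns its
# zip-relative path:
from bs4 import BeautifulSoup

def generate_video_tag(video_path, zipper):
    soup = BeautifulSoup("", "html5lib")
    video_tag = soup.new_tag("video", controls="controls")
    video_tag['style'] = "width: 100%;"
    source_tag = soup.new_tag("source")
    # Copy the already-downloaded mp4 into the zip and point the page at it
    source_tag['src'] = zipper.write_file(video_path, directory="videos")
    video_tag.append(source_tag)
    return video_tag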
def write_img(self, img_url, filepath, img_filename):
    with html_writer.HTMLWriter(filepath, "a") as zipper:
        path = zipper.write_url(img_url, img_filename, directory="files")
    # Return the zip-relative path produced by write_url so callers can
    # reference the image from their HTML
    return path
def write_index(self, content):
    with html_writer.HTMLWriter(self.filename, "w") as zipper:
        zipper.write_index_contents(content)
def write_img(self, img_url, filename):
    with html_writer.HTMLWriter(self.filename, "a") as zipper:
        zipper.write_url(img_url, filename, directory="files")
def write(self, filename, content):
    with html_writer.HTMLWriter(self.filename, "a") as zipper:
        zipper.write_contents(filename, content, directory="files")
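# The final three methods share self.filename, suggesting one small writer class.
# A hedged usage sketch (class name and URL below are hypothetical):
#
# archive = PageArchive("activity.zip")
# archive.write_index("<html><body>Index</body></html>")         # "w" creates the zip
# archive.write_img("https://example.com/logo.png", "logo.png")  # appends files/logo.png
# archive.write("extra.html", "<h1>Extra</h1>")                  # appends files/extra.html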