def generate_download_page(url, zipper):
    """ Create a page for files that are meant to be downloaded (e.g. worksheets)
        Args:
            url (str): url to file that is meant to be downloaded
            zipper (html_writer): where to write download page to
        Returns path to page in zipfile (str)
    """
    # Get template soup
    soup = BeautifulSoup("", "html.parser")
    with open('download.html', 'rb') as templatecode:
        newpage = BeautifulSoup(templatecode.read(), 'html5lib')

    # Determine if link is one of the recognized file types
    download_url = url.split("?")[0]
    filename = download_url.split("/")[-1]
    if download_url.endswith('pdf'):
        render_tag = soup.new_tag('embed')
    elif next((e for e in IMAGE_EXTENSIONS if download_url.lower().endswith(e)), None):
        render_tag = soup.new_tag('img')
    else:
        LOGGER.error("Unknown file type found at {}".format(download_url))
        return ""

    # Add tag to new page and write page to zip
    render_tag['src'] = zipper.write_url(format_url(download_url), filename)
    newpage.body.append(render_tag)
    return zipper.write_contents(filename.split('.')[0] + ".html", newpage.prettify())

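# Hypothetical usage sketch for generate_download_page (not part of the original
# module): the URL and zip path below are illustrative, and html_writer refers to
# ricecooker.utils.html_writer, the same helper used by _download_file elsewhere
# in this codebase.
def example_generate_download_page():
    from ricecooker.utils import html_writer
    with html_writer.HTMLWriter('./downloads.zip') as zipper:
        page_path = generate_download_page(
            'https://example.org/files/worksheet.pdf?version=2',  # assumed URL
            zipper,
        )
        if page_path:
            print('Download page written inside zip at', page_path)
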
def get_parsed_html_from_url(url, *args, **kwargs):
    response = sess.get(url, *args, **kwargs)
    if response.status_code != 200:
        LOGGER.error("STATUS: {}, URL: {}".format(response.status_code, url))
    elif not response.from_cache:
        LOGGER.debug("NOT CACHED: {}".format(url))
    return BeautifulSoup(response.content, "html.parser")

def website_game_webresouce_to_ricecooker_node(lang, web_resource):
    """
    Create Ricecooker Json structure for game from web resource dict `web_resource`.
    """
    game_node = dict(
        kind=content_kinds.HTML5,
        source_id=web_resource['source_id'],
        language=lang,
        title=web_resource['title'],
        description='source_url=' + web_resource['url'] if DEBUG_MODE else '',
        license=PRADIGI_LICENSE,
        thumbnail=web_resource.get('thumbnail_url'),
        files=[],
    )
    zip_tmp_path = get_zip_file(web_resource['url'], web_resource['main_file'])
    if zip_tmp_path:
        zip_file = dict(
            file_type=file_types.HTML5,
            path=zip_tmp_path,
            language=lang,
        )
        game_node['files'].append(zip_file)
        LOGGER.debug('Created HTML5AppNode for game ' + web_resource['title'])
        return game_node
    else:
        LOGGER.error('Failed to create zip for game at url=' + web_resource['url'])
        return None

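# Hypothetical input sketch for website_game_webresouce_to_ricecooker_node (not part
# of the original module): the keys mirror those read in the function body; all
# values and the URL are illustrative.
def example_game_web_resource():
    web_resource = {
        'source_id': 'game-counting-01',
        'title': 'Counting game',
        'url': 'https://example.org/games/counting.zip',
        'main_file': 'counting.html',
        'thumbnail_url': None,
    }
    game_node = website_game_webresouce_to_ricecooker_node('hi', web_resource)
    return game_node
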
def scrape_resource(url, topic):
    resource = BeautifulSoup(downloader.read(url), 'html5lib')
    LOGGER.info(' {}'.format(resource.find('h2').text))

    filepath = download_resource(resource.find('div', {'class': 'decargas'}).find('a')['href'])
    license = None
    author = ''
    for data_section in resource.find('div', {'class': 'datos_generales'}).find_all('h4'):
        if 'Licencia' in data_section.text:
            try:
                license = LICENSE_MAP[data_section.find_next_sibling('p').text](copyright_holder="Ceibal")
            except KeyError as e:
                LOGGER.error(str(e))
                license = licenses.CC_BYLicense(copyright_holder="Ceibal")
        elif 'Autor' in data_section.text:
            author = data_section.find_next_sibling('p').text

    if filepath:
        thumbnail = resource.find('div', {'class': 'img-recurso'}).find('img')['src']
        if thumbnail.endswith('.gif'):
            thumbnail = os.path.sep.join([DOWNLOAD_DIRECTORY, thumbnail.split('/')[-1].replace('.gif', '.png')])
            with open(thumbnail, 'wb') as fobj:
                fobj.write(downloader.read(resource.find('div', {'class': 'img-recurso'}).find('img')['src']))

        topic.add_child(nodes.HTML5AppNode(
            title=resource.find('h2').text,
            source_id=url,
            license=license,
            author=author,
            description=resource.find('form').find_all('p')[1].text,
            thumbnail=thumbnail,
            tags=[tag.text[:30] for tag in resource.find_all('a', {'class': 'tags'})],
            files=[files.HTMLZipFile(path=filepath)],
        ))

def make_request(self, url, timeout=60, *args, method='GET', **kwargs):
    """
    Failure-resistant HTTP GET/HEAD request helper method.
    """
    retry_count = 0
    max_retries = 5
    while True:
        try:
            response = self.SESSION.request(method, url, *args, timeout=timeout, **kwargs)
            break
        except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
            retry_count += 1
            LOGGER.warning("Connection error ('{msg}'); about to perform retry {count} of {trymax}."
                           .format(msg=str(e), count=retry_count, trymax=max_retries))
            time.sleep(retry_count * 1)
            if retry_count >= max_retries:
                LOGGER.error("FAILED TO RETRIEVE:" + str(url))
                return None
    if response.status_code != 200:
        LOGGER.error("ERROR " + str(response.status_code) + ' when getting url=' + url)
        return None
    return response

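# Hypothetical usage sketch for make_request (not part of the original module):
# the helper is written as a method, so it needs a host object exposing SESSION
# as a requests.Session. The class name and URL below are assumptions for
# illustration only.
import requests

class ExampleClient:
    SESSION = requests.Session()
    make_request = make_request  # bind the helper above as a method

def example_make_request():
    response = ExampleClient().make_request('https://example.org/index.html', timeout=30)
    if response is not None:
        print('Fetched', len(response.content), 'bytes')
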
def to_tag(self, filename=None):
    try:
        img = self.create_tag('img')
        img['src'] = self.to_zip(filename=filename)
        return img
    except BROKEN_EXCEPTIONS as e:
        LOGGER.error(str(e))
        return self.create_broken_link_message(self.url)

def download_resource(endpoint):
    try:
        url = '{}{}'.format(BASE_URL, endpoint.lstrip('/'))
        filename, ext = os.path.splitext(endpoint)
        filename = '{}.zip'.format(filename.lstrip('/').replace('/', '-'))
        write_to_path = CeibalPageScraper(url, locale='es').to_file(filename=filename, directory=DOWNLOAD_DIRECTORY)
        return write_to_path
    except Exception as e:
        LOGGER.error(str(e))

def _download_file(self, write_to_path):
    with html_writer.HTMLWriter(write_to_path) as zipper:
        try:
            self.zipper = zipper
            self.to_zip(filename='index.html')
        except Exception as e:
            # Any errors here will just say index.html file does not exist, so
            # print out error for more descriptive debugging
            LOGGER.error(str(e))

def to_tag(self, filename=None):
    try:
        embed = self.create_tag('embed')
        embed['src'] = self.to_zip(filename=filename)
        embed['width'] = '100%'
        embed['style'] = 'height: 500px;max-height: 100vh;'
        return embed
    except BROKEN_EXCEPTIONS as e:
        LOGGER.error(str(e))
        return self.create_broken_link_message(self.url)

def transform_video_vertical(vertical, parent_title=None):
    if 'children' not in vertical:
        return None, []

    # 1. LOOK FOR AN OPTIONAL html PREFIX TO USE AS DESCRIPTION
    description = ''
    # Extract an optional description from the first html node
    first_child = vertical['children'][0]
    if first_child['kind'] == 'html':
        description = extract_text_from_html_item(first_child, translate_from='ar')

    if parent_title:
        video_title = parent_title + ' ' + vertical['display_name']
    else:
        video_title = vertical['display_name']

    # 2. GET THE VIDEO
    videos = [ch for ch in vertical['children'] if ch['kind'] == 'video']
    assert len(videos) == 1, 'multiple videos found'
    video = videos[0]
    video_dict = dict(
        kind=content_kinds.VIDEO,
        source_id=video.get('youtube_id') or video.get('path'),
        title=video_title,
        author='Edraak',
        description=description,
        language=getlang('ar').code,
        license=EDRAAK_LICENSE,
        files=[],
    )
    if 'youtube_id' in video:
        file_dict = dict(
            file_type=content_kinds.VIDEO,
            youtube_id=video['youtube_id'],
            language=getlang('ar').code,
            high_resolution=False,
        )
    elif 'path' in video:
        file_dict = dict(
            file_type=content_kinds.VIDEO,
            path=video['path'],
            language=getlang('ar').code,
            ffmpeg_settings={"crf": 24},
        )
    else:
        LOGGER.error('Video does not have youtube_id or path ' + str(video))
        file_dict = None
    if file_dict:
        video_dict['files'].append(file_dict)

    # 3. LOOK FOR AN OPTIONAL RESOURCES html
    downloadable_resources = []
    htmls = [ch for ch in vertical['children'] if ch['kind'] == 'html']
    for html in htmls:
        if 'downloadable_resources' in html:
            downloadable_resources.extend(html['downloadable_resources'])

    return video_dict, downloadable_resources

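# Hypothetical input sketch for transform_video_vertical (not part of the original
# module): a minimal vertical containing a single YouTube video child; the display
# name, parent title, and video id are illustrative.
def example_transform_video_vertical():
    sample_vertical = {
        'display_name': 'Introduction video',
        'children': [
            {'kind': 'video', 'youtube_id': 'dQw4w9WgXcQ'},  # illustrative id
        ],
    }
    video_dict, resources = transform_video_vertical(sample_vertical, parent_title='Unit 1')
    return video_dict, resources
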
def to_tag(self, filename=None):
    try:
        audio = self.create_tag('audio')
        audio['controls'] = 'controls'
        audio['style'] = 'width: 100%;'
        source = self.create_tag('source')
        source['src'] = self.to_zip(filename=filename)
        audio.append(source)
        return audio
    except BROKEN_EXCEPTIONS as e:
        LOGGER.error(str(e))
        return self.create_broken_link_message(self.url)

def to_tag(self, filename=None):
    try:
        video = self.create_tag('video')
        video['controls'] = 'controls'
        video['style'] = 'width: 100%;'
        video['preload'] = 'auto'
        source = self.create_tag('source')
        source['src'] = self.to_zip(filename=filename)
        video.append(source)
        return video
    except BROKEN_EXCEPTIONS as e:
        LOGGER.error(str(e))
        return self.create_broken_link_message(self.url)

def on_story_resource_page(self, url, page, context):
    LOGGER.debug(' in on_story_resource_page ' + url)
    html = str(page)
    story_resource_url = get_respath_url_from_html(html)
    if story_resource_url:
        page_dict = dict(
            url=story_resource_url,
            children=[],
        )
        page_dict.update(context)
        context['parent']['children'].append(page_dict)
    else:
        LOGGER.error('Failed to find story_resource_url on page %s' % url)

def save_book(book_detail, channel):
    book_id = book_detail["id"]
    book_source_id = get_book_source_id(book_id)
    book_title = book_detail["name"]
    level_id = book_detail["readingLevel"]
    language = book_detail["language"]
    language_id = language["id"]
    tags = book_detail["tags"]
    epub_url = book_detail["epubUrl"]
    pdf_urls = book_detail["pdfUrl"]
    pdf_portrait_url = pdf_urls.get("portraitUrl", "") if pdf_urls else ""
    pdf_landscape_url = pdf_urls.get("landscapeUrl", "") if pdf_urls else ""
    pdf_booklet_url = pdf_urls.get("bookletUrl", "") if pdf_urls else ""
    pdf_url = pdf_portrait_url or pdf_landscape_url or pdf_booklet_url

    if not pdf_url and not epub_url:
        LOGGER.error("No file found for \n {}".format(book_source_id))
        raise NoFileAvailableError()

    book_files = []
    if pdf_url:
        pdf_file = files.DocumentFile(path=pdf_url)
        book_files.append(pdf_file)
    if epub_url:
        epub_file = files.EPubFile(path=epub_url)
        book_files.append(epub_file)

    book = nodes.DocumentNode(
        source_id=book_source_id,
        title=book_title,
        license=licenses.PUBLIC_DOMAIN,  # TODO: get a real license and copyright holder
        files=book_files,
    )

    language_topic = get_or_create_language_topic(language, channel)
    level_topic = get_or_create_level_topic(level_id, language_id, language_topic)

    if not tags:
        level_topic.add_child(book)
        return

    for tag in tags:
        tag_topic = get_or_create_tag_topic(tag, language_id, level_id, level_topic)
        tag_topic.add_child(book)

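# Hypothetical input sketch for save_book (not part of the original module): the keys
# mirror those read in the function body; the ids, URLs, and the `channel` argument
# (a ricecooker ChannelNode built elsewhere) are illustrative assumptions.
def example_save_book(channel):
    book_detail = {
        'id': 123,
        'name': 'A Day at the Market',
        'readingLevel': 2,
        'language': {'id': 11, 'name': 'English'},
        'tags': ['market', 'counting'],
        'epubUrl': 'https://example.org/books/123.epub',
        'pdfUrl': {'portraitUrl': 'https://example.org/books/123-portrait.pdf'},
    }
    save_book(book_detail, channel)
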
def on_story_page(self, url, page, context):
    LOGGER.debug(' in on_story_page ' + url)
    page_dict = dict(
        kind='story_page',
        url=url,
        children=[],
    )
    page_dict.update(context)
    context['parent']['children'].append(page_dict)

    try:
        body_row = page.find('div', {'id': 'body-row'})
        contents_row = body_row.find('div', {'class': 'row'})
    except Exception as e:
        LOGGER.error('ERROR on_story_page: %s : %s' % (e, url))
        return

    contents = contents_row.find_all('div', {'class': 'col-md-3'})
    for content in contents:
        try:
            title = get_text(content.find('div', {'class': 'txtline'}))
            # TODO: description
            thumbnail = content.find('a').find('img')['src']
            thumbnail = get_absolute_path(thumbnail)

            # get_fun_content_link
            link = content.find('a')
            source_id = link['href'][1:]
            story_resource_url = get_absolute_path(link['href'])
            if self.should_ignore_url(story_resource_url):
                print('ignoring story content', title, story_resource_url)
                continue
            LOGGER.debug(' story_resource_page: %s: %s' % (source_id, title))
            context = dict(
                parent=page_dict,
                kind='story_resource_page',
                title=title,
                source_id=source_id,
                thumbnail_url=thumbnail,
            )
            self.enqueue_url_and_context(story_resource_url, context)
        except Exception as e:
            LOGGER.error('on_story_page: %s : %s' % (e, content))

def scrape_video_collection(url, topic):
    """ Scrape videos under video collection and add to the topic node
        Args:
            url (str): url to video page (e.g. https://www.exploratorium.edu/video/inflatable-jimmy-kuehnle)
            topic (TopicNode): topic to add video nodes to
    """
    try:
        collection_contents = BeautifulSoup(read(url), 'html5lib')
        for result in collection_contents.find_all('div', {'class': 'search-result'}):
            header = result.find('div', {'class': 'views-field-field-html-title'})
            LOGGER.info(" {}".format(header.text.strip()))

            # Get video from given url
            description = result.find('div', {'class': 'search-description'})
            video_contents = BeautifulSoup(read(header.find('a')['href']), 'html.parser')
            for k, v in get_brightcove_mapping(video_contents).items():
                video_node = nodes.VideoNode(
                    source_id=k,
                    title=header.text.strip().replace("’", "'"),
                    description=description.text.strip() if description else "",
                    license=LICENSE,
                    copyright_holder=COPYRIGHT_HOLDER,
                    author=v.get('author') or "",
                    files=[files.WebVideoFile(v['url'], high_resolution=False)],
                    thumbnail=get_thumbnail_url(result.find('img')['src']),
                )

                # If video doesn't already exist here, add to topic
                if not next((c for c in topic.children if c.source_id == video_node.source_id), None):
                    topic.add_child(video_node)

        # Scrape next page (if any)
        next_page_url = get_next_page_url(collection_contents)
        if next_page_url:
            scrape_video_collection(next_page_url, topic)

    except requests.exceptions.HTTPError:
        LOGGER.error("Could not read collection at {}".format(url))

def get_phet_zip_file(zip_file_url, main_file_and_query):
    """
    Phet simulations are provided in the zip file `phet.zip`, and the entry point
    is passed as a GET parameter in `main_file_and_query`. To make these compatible
    with Kolibri's default behaviour of loading index.html, we will:
      - Rename index.html to phetindex.html
      - Add a custom index.html that uses a javascript redirect to phetindex.html?{sim_id}
    """
    u = urlparse(main_file_and_query)
    idk, sim_id = u.query.split('=')
    assert idk == 'id', 'unknown query string format found ' + main_file_and_query
    main_file = u.scheme + '://' + u.netloc + u.path  # skip querystring

    destpath = tempfile.mkdtemp()
    LOGGER.info('saving phet zip file in dir ' + destpath)
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)
        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # Rename main_file to phetindex.html
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'phetindex.html')
        os.rename(src, dest)

        # Create the new index.html that redirects to phetindex.html?id={sim_id}
        index_html = PHET_INDEX_HTML_TEMPLATE.format(sim_id=sim_id)
        with open(os.path.join(zip_folder, 'index.html'), 'w') as indexf:
            indexf.write(index_html)

        # Always be zipping!
        return create_predictable_zip(zip_folder)

    except Exception as e:
        LOGGER.error("get_phet_zip_file: %s, %s, %s, %s" % (zip_file_url, main_file_and_query, destpath, e))
        return None

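# Hypothetical usage sketch for get_phet_zip_file (not part of the original module):
# both URLs are illustrative; the second argument carries the simulation id as the
# `id` query parameter, exactly as the assert above expects.
def example_get_phet_zip_file():
    zip_path = get_phet_zip_file(
        'https://example.org/phet/phet.zip',
        'https://example.org/phet/index.html?id=1234',
    )
    if zip_path:
        print('Kolibri-ready PhET zip at', zip_path)
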
def parse_tsv_file(filepath):
    """
    Load data from the TSV file located at `filepath` using csv.DictReader.
    Returns: a dict {id --> datum} of all the rows.
    """
    print('Loading TSV file', filepath)
    data_by_id = {}
    with open(filepath, encoding="utf-8-sig") as tsvfile:
        reader = csv.DictReader(tsvfile, dialect='excel-tab')
        for row in reader:
            if not row['id']:
                raise ValueError("Row with missing id " + str(row))
            try:
                clean_row = clean_tsv_row(row)
                data_by_id[row['id']] = clean_row
            except json.JSONDecodeError as e:
                LOGGER.error('Failed to parse row=' + str(dict(row)))
    return data_by_id

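# Hypothetical usage sketch for parse_tsv_file (not part of the original module):
# the path is illustrative; the TSV is expected to have a header row that includes
# an `id` column, which becomes the key of the returned dict.
def example_parse_tsv_file():
    data_by_id = parse_tsv_file('chefdata/corrections.tsv')
    for row_id, row in data_by_id.items():
        print(row_id, row)
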
def get_subtopics(parent, path):
    doc = get_page(path)
    try:
        menu_row = doc.find('div', {'id': 'body-row'})
        menu_row = menu_row.find('div', {'class': 'col-md-2'})
    except Exception as e:
        LOGGER.error('get_subtopics: %s : %s' % (e, doc))
        return
    for subtopic in menu_row.find_all('a'):
        try:
            title = subtopic.get_text().strip()
            source_id = get_source_id(subtopic['href'])
            LOGGER.info(' subtopic: %s: %s' % (source_id, title))
            node = TopicNode(title=title, source_id=source_id)
            parent.add_child(node)
            get_lessons(node, subtopic['href'])
        except Exception as e:
            LOGGER.error('get_subtopics: %s : %s' % (e, subtopic))

def on_subtopic_page(self, url, page, context):
    LOGGER.debug(' in on_subtopic_page ' + url)
    page_dict = dict(
        kind='subtopic_page',  # redundant...
        url=url,
        children=[],
    )
    page_dict.update(context)
    context['parent']['children'].append(page_dict)

    try:
        menu_row = page.find('div', {'id': 'body-row'})
        menu_row = menu_row.find('div', {'class': 'col-md-9'})
    except Exception as e:
        LOGGER.error('on_subtopic_page: %s : %s' % (e, page))
        return

    for lesson in menu_row.find_all('div', {'class': 'thumbnail'}):
        try:
            title = lesson.find('div', {'class': 'txtline'}).get_text().strip()
            caption = lesson.find('div', class_='caption')
            description = get_text(caption) if caption else ''
            lesson_url = urljoin(url, lesson.find('a')['href'])
            if self.should_ignore_url(lesson_url):
                LOGGER.info('ignoring lesson ' + lesson_url)
                continue
            thumbnail_src = lesson.find('a').find('img')['src']
            thumbnail_url = urljoin(url, thumbnail_src)
            source_id = get_source_id(lesson.find('a')['href'])
            LOGGER.debug(' lesson: %s: %s' % (source_id, title))
            context = dict(
                parent=page_dict,
                kind='lesson_page',
                title=title,
                description=description,
                source_id=source_id,
                thumbnail_url=thumbnail_url,
                children=[],
            )
            self.enqueue_url_and_context(lesson_url, context)
            # get_contents(node, link)
        except Exception as e:
            LOGGER.error('on_subtopic_page: %s : %s' % (e, lesson))

def get_subtitles_using_youtube_dl(youtube_id):
    youtube_url = 'https://youtube.com/watch?v=' + youtube_id
    yt_resource = YouTubeResource(youtube_url)
    lang_codes = []
    try:
        result = yt_resource.get_resource_subtitles()
        # TODO(ivan) Consider including auto-generated subtitles to increase
        # coverage and handle edge cases of videos that are translated
        # but have no metadata: https://www.youtube.com/watch?v=qlGjA9p1UAM
        if result:
            for lang_code, lang_subs in result['subtitles'].items():
                for lang_sub in lang_subs:
                    if 'ext' in lang_sub and lang_sub['ext'] == 'vtt' and lang_code not in lang_codes:
                        lang_codes.append(lang_code)
    except Exception as e:
        LOGGER.error('get_subtitles_using_youtube_dl failed for ' + youtube_url)
        LOGGER.error(str(e))
    return lang_codes

def on_special_subtopic_page(self, url, page, context):
    LOGGER.debug(' in on_special_subtopic_page ' + url)
    page_dict = dict(
        kind='special_subtopic_page',  # redundant... -- mismatch with original special_subtopic_page
        url=url,
        children=[],
    )
    page_dict.update(context)
    context['parent']['children'].append(page_dict)

    try:
        menu_row = page.find('div', {'id': 'body-row'})
        menu_row = menu_row.find('div', {'class': 'col-md-2'})
        print(str(menu_row))
    except Exception as e:
        LOGGER.error('on_special_subtopic_page: %s : %s' % (e, page))
        return

    for link in menu_row.find_all('a', {'class': 'list-group-item'}):
        try:
            title = link.get_text().strip()
            description = ''
            lesson_url = urljoin(url, link['href'])
            if self.should_ignore_url(lesson_url):
                LOGGER.info('ignoring lesson ' + lesson_url)
                continue
            source_id = get_source_id(link['href'])
            LOGGER.debug(' special lesson: %s: %s' % (source_id, title))
            context = dict(
                parent=page_dict,
                kind='fun_page',
                title=title,
                description=description,
                source_id=source_id,
                thumbnail_url=None,
                children=[],
            )
            self.enqueue_url_and_context(lesson_url, context)
            # get_contents(node, link)
        except Exception as e:
            LOGGER.error('on_special_subtopic_page: %s : %s' % (e, link))

def get_topics(parent, path):
    doc = get_page(path)
    try:
        menu_row = doc.find('div', {'id': 'menu-row'})
    except Exception as e:
        LOGGER.error('get_topics: %s : %s' % (e, doc))
        return
    for topic in menu_row.find_all('a'):
        try:
            if topic['href'] == '#':
                continue
            title = topic.get_text().strip()
            source_id = get_source_id(topic['href'])
            LOGGER.info('topic: %s: %s' % (source_id, title))
            node = TopicNode(title=title, source_id=source_id)
            parent.add_child(node)
            get_subtopics(node, topic['href'])
            if DEBUG_MODE:
                return
        except Exception as e:
            LOGGER.error('get_topics: %s : %s' % (e, topic))

def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None):
    youtube_info = None
    # 1. Try to get from cache if allowed:
    if os.path.exists(self.cache_path) and use_cache:
        LOGGER.info("==> [%s] Retrieving cached information...", self.__str__())
        youtube_info = json.load(open(self.cache_path))
    # 2. Fetch info from youtube_dl
    if not youtube_info:
        LOGGER.info("==> [%s] Requesting info from youtube...", self.__str__())
        os.makedirs(self.cache_dir, exist_ok=True)
        youtube_resource = None
        try:
            youtube_resource = YouTubeResource(self.url, useproxy=use_proxy)
        except youtube_dl.utils.ExtractorError as e:
            if "unavailable" in str(e):
                LOGGER.error("==> [%s] Resource unavailable for URL: %s", self.__str__(), self.url)
                return None
        if youtube_resource:
            try:
                # Save YouTube info to JSON cache file
                youtube_info = youtube_resource.get_resource_info(options)
                if youtube_info:
                    json.dump(youtube_info,
                              open(self.cache_path, 'w'),
                              indent=4,
                              ensure_ascii=False,
                              sort_keys=True)
                else:
                    LOGGER.error("==> [%s] Failed to extract YouTube info", self.__str__())
            except Exception as e:
                LOGGER.error("==> [%s] Failed to get YouTube info: %s", self.__str__(), e)
                return None
    return youtube_info

def download(url, write_to_path, attempts=DOWNLOAD_ATTEMPTS):
    """ Download the web video
        Args:
            url (str): url to video to download
            write_to_path (str): where to write video to
            attempts (int): how many times to reattempt a download
    """
    try:
        video_format = "bestvideo[height<=480][ext=mp4]+bestaudio[ext=m4a]/best[height<=480][ext=mp4]"
        with youtube_dl.YoutubeDL({"format": video_format, "outtmpl": write_to_path}) as ydl:
            ydl.download([url])
    except youtube_dl.utils.DownloadError as e:
        # If there are more attempts, try again. Otherwise, return error
        if attempts > 0:
            download(url, write_to_path, attempts=attempts - 1)
        else:
            LOGGER.error("Could not download video {} ({})".format(url, str(e)))
            raise e

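# Hypothetical usage sketch for download (not part of the original module): the
# video URL and output path are illustrative; the format string above caps the
# download at 480p mp4.
def example_download():
    download(
        'https://www.youtube.com/watch?v=dQw4w9WgXcQ',  # illustrative video
        '/tmp/videos/example.mp4',
    )
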
def on_topic_page(self, url, page, context):
    LOGGER.debug('in on_topic_page ' + url)
    page_dict = dict(
        kind='topic_page',
        url=url,
        children=[],
    )
    page_dict.update(context)
    context['parent']['children'].append(page_dict)

    try:
        body_row = page.find('div', {'id': 'body-row'})
        menu_row = body_row.find('div', {'class': 'col-md-2'})
        subtopics = menu_row.find_all('a')
    except Exception as e:
        LOGGER.error('ERROR on_topic_page: %s : %s' % (e, url))
        return

    for subtopic in subtopics:
        try:
            subtopic_url = urljoin(url, subtopic['href'])
            if self.should_ignore_url(subtopic_url):
                print('ignoring subtopic', subtopic_url)
                continue
            title = get_text(subtopic)
            source_id = get_source_id(subtopic['href'])
            LOGGER.debug(' found subtopic: %s: %s' % (source_id, title))
            context = dict(
                parent=page_dict,
                kind='subtopic_page',
                title=title,
                source_id=source_id,
                children=[],
            )
            self.enqueue_url_and_context(subtopic_url, context)
        except Exception as e:
            LOGGER.error('on_topic_page: %s : %s' % (e, subtopic))

def get_subtree_by_subject_en(lang, subject):
    if lang not in PRADIGI_LANG_URL_MAP:
        raise ValueError('Language `lang` must be in PRADIGI_LANG_URL_MAP')
    wrt_filename = 'chefdata/trees/pradigi_{}_web_resource_tree.json'.format(lang)
    with open(wrt_filename) as jsonfile:
        web_resource_tree = json.load(jsonfile)
    subject_subtrees = web_resource_tree['children']
    try:
        for subject_subtree in subject_subtrees:
            if 'subject_en' in subject_subtree and subject_subtree['subject_en'] == subject:
                return subject_subtree
            elif 'source_id' in subject_subtree and subject_subtree['source_id'] == subject:
                return subject_subtree
            else:
                pass
                # print('no subject_en in ' + subject_subtree['source_id'])
    except Exception as e:
        LOGGER.error("in get_subtree_by_subject_en: %s, %s, %s, %s" % (lang, subject, subject_subtree, e))
    return None

def get_lessons(parent, path):
    doc = get_page(path)
    try:
        menu_row = doc.find('div', {'id': 'body-row'})
        menu_row = menu_row.find('div', {'class': 'col-md-9'})
    except Exception as e:
        LOGGER.error('get_lessons: %s : %s' % (e, doc))
        return
    for lesson in menu_row.find_all('div', {'class': 'thumbnail'}):
        try:
            title = lesson.find('div', {'class': 'txtline'}).get_text().strip()
            link = lesson.find('a')['href']
            thumbnail = lesson.find('a').find('img')['src']
            thumbnail = get_absolute_path(thumbnail)
            source_id = get_source_id(link)
            LOGGER.info(' lesson: %s: %s' % (source_id, title))
            node = TopicNode(title=title, source_id=source_id, thumbnail=thumbnail)
            parent.add_child(node)
            get_contents(node, link)
        except Exception as e:
            LOGGER.error('get_lessons: %s : %s' % (e, lesson))

def get_zip_file(zip_file_url, main_file):
    """HTML games are provided as zip files, and the entry point of the game is
    `main_file`. `main_file` needs to be renamed to index.html to make it
    compatible with Kolibri.
    """
    destpath = tempfile.mkdtemp()
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)
        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # In some cases, the files are under the www directory,
        # let's move them up one level.
        www_dir = os.path.join(zip_folder, 'www')
        if os.path.isdir(www_dir):
            files = os.listdir(www_dir)
            for f in files:
                shutil.move(os.path.join(www_dir, f), zip_folder)

        # Rename main_file to index.html.
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'index.html')
        os.rename(src, dest)

        return create_predictable_zip(zip_folder)
    except Exception as e:
        LOGGER.error("get_zip_file: %s, %s, %s, %s" % (zip_file_url, main_file, destpath, e))
        return None

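# Hypothetical usage sketch for get_zip_file (not part of the original module):
# the zip URL and entry-point filename are illustrative; the returned path points
# at a repackaged zip whose entry point is index.html.
def example_get_zip_file():
    zip_path = get_zip_file('https://example.org/games/game.zip', 'game_main.html')
    if zip_path:
        print('Kolibri-ready game zip at', zip_path)
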
def download_zip_file(url):
    if not url:
        return (False, None)
    if get_suffix(url) != '.zip':
        return (False, None)

    response = sess.get(url)
    if response.status_code != 200:
        LOGGER.error("STATUS: {}, URL: {}".format(response.status_code, url))
        return (False, None)
    elif not response.from_cache:
        LOGGER.debug("NOT CACHED: {}".format(url))

    archive = zipfile.ZipFile(io.BytesIO(response.content))
    archive_members = list(filter(lambda f: f.filename.endswith('.pdf'), archive.infolist()))
    archive_member_names = [None] * len(archive_members)
    for i, pdf in enumerate(archive_members):
        path = os.path.join(PDFS_DATA_DIR, pdf.filename)
        archive_member_names[i] = path
        if not os.path.exists(path):
            archive.extract(pdf, PDFS_DATA_DIR)
    return (True, archive_member_names)