async def producer(session, queue, base_path, download_settings, url: URL_CONFIG,
                   regex_patterns: REGEX_PATTERN_CONFIG, headers: HEADERS_CONFIG,
                   basic_auth: BASIC_AUTH_CONFIG):
    session_kwargs = headers_config_to_session_kwargs(headers)
    session_kwargs.update(basic_auth_config_to_session_kwargs(basic_auth, download_settings))

    # Append a trailing slash to directory-like urls so relative links resolve correctly.
    if url and url[-1] != "/" and "." not in url.split("/")[-1]:
        url += "/"

    links = await get_all_file_links(session, url, session_kwargs)

    for regex_pattern in regex_patterns:
        pattern = regex_pattern["pattern"]
        folder_regex = regex_pattern.get("folder") or ""
        file_name_regex = regex_pattern.get("file_name", None)
        link_regex = regex_pattern.get("link_regex", None)

        for orig_link, html_name in links.items():
            if re.search(pattern, orig_link) is None:
                continue

            folder_name = re.sub(pattern, folder_regex, orig_link)
            link = re.sub(pattern, link_regex, orig_link) if link_regex else orig_link

            guess_extension = await cache.check_extension(session, link, session_kwargs=session_kwargs)

            file_name = _get_file_name(guess_extension=guess_extension if guess_extension != "html" else None,
                                       html_name=html_name,
                                       file_name_regex=file_name_regex,
                                       pattern=pattern,
                                       orig_link=orig_link,
                                       link_name=urlparse(link).path.split("/")[-1])

            if guess_extension is None or guess_extension == "html":
                # Not a direct file link; let the single-file parser decide whether it can handle it.
                try:
                    await process_single_file_url(session=session,
                                                  queue=queue,
                                                  base_path=safe_path_join(base_path, folder_name),
                                                  download_settings=download_settings,
                                                  url=link,
                                                  name=file_name)
                except NotSingleFile:
                    pass
            else:
                await queue.put({
                    "url": link,
                    "path": safe_path_join(base_path, folder_name, file_name),
                    "session_kwargs": session_kwargs,
                })

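# Hedged usage sketch (not part of the original module): the producers in this
# file only enqueue download descriptors with "url", "path" and "session_kwargs"
# keys; a consumer along these lines would perform the actual transfer. All
# names here are illustrative placeholders; only asyncio, aiohttp-style session
# calls and standard file APIs are used.
async def _example_consumer(session, queue):
    import os

    while True:
        item = await queue.get()
        os.makedirs(os.path.dirname(item["path"]) or ".", exist_ok=True)
        async with session.get(item["url"], **item.get("session_kwargs", {})) as response:
            with open(item["path"], "wb") as file:
                file.write(await response.read())
        queue.task_done()
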
async def parse_folder(session, queue, base_path, file_id):
    params = _get_folder_params(file_id)
    async with session.get(FOLDER_URL, params=params, headers=REFERER_HEADERS) as response:
        data = await response.json()
    files = data["files"]

    # Follow the pagination until the listing is exhausted.
    while "nextPageToken" in data:
        page_params = {"pageToken": data["nextPageToken"], **params}
        async with session.get(FOLDER_URL, params=page_params, headers=REFERER_HEADERS) as response:
            data = await response.json()
        files += data["files"]

    # Recurse into sub folders first.
    tasks = []
    for file in files:
        path = safe_path_join(base_path, file["name"])
        if file["mimeType"] == MIMETYPE_FOLDER:
            tasks.append(parse_folder(session, queue, path, file["id"]))
    await asyncio.gather(*tasks)

    for file in files:
        with_extension = False
        path = safe_path_join(base_path, file["name"])
        # Native Google formats have to be exported; everything else is a plain download.
        if file["mimeType"] == MIMETYPE_GOOGLE_DOCS:
            url = f"https://docs.google.com/document/d/{file['id']}/export"
            params = {"format": "docx"}
        elif file["mimeType"] == MIMETYPE_DRAWING:
            url = f"https://docs.google.com/drawings/d/{file['id']}/export/png"
            params = {}
        elif file["mimeType"] == MIMETYPE_PRESENTATION:
            url = f"https://docs.google.com/presentation/d/{file['id']}/export/pptx"
            params = {}
        elif file["mimeType"] == MIMETYPE_JAM:
            url = "https://jamboard.google.com/export"
            params = {"id": file["id"]}
        elif file["mimeType"] == MIMETYPE_SPREADSHEET:
            url = f"https://docs.google.com/spreadsheets/d/{file['id']}/export"
            params = {"format": "xlsx"}
        elif "application/vnd.google-apps" in file["mimeType"]:
            # Remaining native types (forms, sites, ...) have no export format.
            continue
        else:
            url = "https://drive.google.com/uc"
            params = _get_download_params(file["id"])
            with_extension = True

        await queue.put({
            "url": url,
            "path": path,
            "checksum": file["modifiedTime"],
            "session_kwargs": {"params": params},
            "with_extension": with_extension,
        })

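# Design sketch (illustrative refactor, not the module's actual code): the
# mimeType chain above can be read as a lookup table from native Google types
# to an export url template plus query parameters. Jamboard is left out because
# its file id travels in the parameters instead of the path.
GOOGLE_EXPORTS = {
    MIMETYPE_GOOGLE_DOCS: ("https://docs.google.com/document/d/{file_id}/export", {"format": "docx"}),
    MIMETYPE_DRAWING: ("https://docs.google.com/drawings/d/{file_id}/export/png", {}),
    MIMETYPE_PRESENTATION: ("https://docs.google.com/presentation/d/{file_id}/export/pptx", {}),
    MIMETYPE_SPREADSHEET: ("https://docs.google.com/spreadsheets/d/{file_id}/export", {"format": "xlsx"}),
}

def _export_target(file):
    template, params = GOOGLE_EXPORTS[file["mimeType"]]
    return template.format(file_id=file["id"]), params
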
async def parse_folder_tree(queue, soup, folder_path, last_updated):
    children = soup.find_all("li", recursive=False)

    for child in children:
        if child.find("div", recursive=False) is not None:
            sub_folder_path = safe_path_join(folder_path, child.div.span.img["alt"])
        else:
            sub_folder_path = folder_path

        if child.find("ul", recursive=False) is not None:
            await parse_folder_tree(queue, child.ul, sub_folder_path, last_updated)

        if child.find("span", recursive=False) is not None:
            url = child.span.a["href"]
            name = child.span.a.find("span", recursive=False, class_="fp-filename").get_text(strip=True)
            item = {"path": safe_path_join(sub_folder_path, name), "url": url, "checksum": last_updated}
            await queue.put(item)

async def _producer(session, queue, url, base_path, session_kwargs):
    if url[-1] != "/":
        url += "/"

    async with session.get(url, **session_kwargs) as response:
        html = await response.text()

    soup = BeautifulSoup(html, BEAUTIFUL_SOUP_PARSER)
    links = soup.find_all("a")

    tasks = []
    for link in links:
        href = link.get("href")
        # On an index page the anchor text equals the href; everything else
        # (sort links, "Parent Directory", ...) is skipped.
        if href != str(link.string).strip():
            continue
        if href[-1] == "/":
            href = href[:-1]

        path = safe_path_join(base_path, href)
        if "." in href:
            checksum = str(link.next_sibling.string).strip()
            await queue.put({"url": url + href, "path": path,
                             "session_kwargs": session_kwargs, "checksum": checksum})
        else:
            # No extension: treat it as a sub directory and recurse.
            coroutine = _producer(session, queue, url + href, path, session_kwargs)
            tasks.append(asyncio.ensure_future(coroutine))

    await asyncio.gather(*tasks)

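# Minimal illustration (assumed Apache-style listing markup) of why the
# href == link-text check above keeps only real listing entries:
def _demo_index_filter():
    html = '<a href="../">Parent Directory</a><a href="notes.pdf">notes.pdf</a>'
    soup = BeautifulSoup(html, "html.parser")
    return [a["href"] for a in soup.find_all("a")
            if a["href"] == str(a.string).strip()]  # -> ["notes.pdf"]
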
async def download(session, queue, base_path, url, password=None, file_name=None):
    domain = re.match(r"https?://([^.]*\.?)zoom\.us", url).group(1)
    agent_header = {
        "referer": f"https://{domain}zoom.us/",
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/74.0.3729.169 "
                       "Safari/537.36")
    }
    async with session.get(url, headers=agent_header) as response:
        html = await response.text()

    if password is not None:
        # Pull the meeting id out of the password form, then validate the
        # password so the session cookie unlocks the recording page.
        meet_id_regex = re.compile("<input[^>]*")
        for inp in meet_id_regex.findall(html):
            input_split = inp.split()
            if input_split[2] == 'id="meetId"':
                meet_id = input_split[3][7:-1]
                break

        data = {
            "id": meet_id,
            "passwd": password,
            "action": "viewdetailpage",
            "recaptcha": ""
        }
        check_url = f"https://{domain}zoom.us/rec/validate_meet_passwd"
        async with session.post(check_url, data=data, headers=agent_header):
            pass

        async with session.get(url, headers=agent_header) as response:
            html = await response.text()

    metadata = _get_page_meta(html, ("viewMp4Url", "topic"))
    if metadata is None:
        logger.warning(f"Zoom url: {url} has no video")
        return None

    vid_url = metadata.get("viewMp4Url", None)
    if vid_url is None:
        raise LoginError("Could not login")

    extension = vid_url.split("?")[0].split("/")[-1].rsplit(".", 1)[-1]
    name = file_name or metadata.get("topic")

    # We need to disable the decoding of the url, because zoom is not RFC-compliant.
    await queue.put({
        "url": URL(vid_url, encoded=True),
        "path": safe_path_join(base_path, f"{name}.{extension}"),
        "session_kwargs": dict(headers=agent_header)
    })

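# Hypothetical sketch of the _get_page_meta helper used above (the real helper
# is defined elsewhere in the project): Zoom's player page embeds the recording
# metadata as key/value pairs in inline JavaScript, so one regex per key is
# enough to recover strings like viewMp4Url and topic.
def _get_page_meta_sketch(html, keys):
    meta = {}
    for key in keys:
        # Matches  viewMp4Url: 'https://...'  as well as  "topic": "..."
        match = re.search(rf'["\']?{re.escape(key)}["\']?\s*:\s*["\']([^"\']*)["\']', html)
        if match is not None:
            meta[key] = match.group(1)
    return meta or None
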
async def process_link(session, queue, base_path, site_settings, url, moodle_id, name, password_mapper):
    if "onedrive.live.com" in url:
        logger.debug(f"Starting one drive from moodle: {moodle_id}")
        await one_drive.producer(session, queue, base_path + f"; {safe_path(name)}",
                                 site_settings=site_settings, url=url)

    elif "polybox" in url:
        logger.debug(f"Starting polybox from moodle: {moodle_id}")
        poly_type, poly_id = [x.strip() for x in url.split("/") if x.strip() != ""][3:5]
        password = match_name_to_password(name, password_mapper)
        await polybox.producer(session, queue, safe_path_join(base_path, name), site_settings,
                               poly_id, poly_type=poly_type, password=password)

    elif "zoom.us/rec/play" in url or "zoom.us/rec/share" in url:
        if is_extension_forbidden("mp4",
                                  site_settings.allowed_extensions + queue.consumer_kwargs["allowed_extensions"],
                                  site_settings.forbidden_extensions + queue.consumer_kwargs["forbidden_extensions"]):
            logger.debug(f"Skipped zoom download from moodle: {moodle_id}")
            return

        logger.debug(f"Starting zoom download from moodle: {moodle_id}")
        password = match_name_to_password(name, password_mapper)
        await zoom.download(session=session, queue=queue, base_path=base_path,
                            url=url, file_name=name, password=password)

async def collect_all_links(session, queue, url, base_path, valid_extensions=None):
    if valid_extensions is None:
        valid_extensions = ["pdf", "mp4"]

    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, BEAUTIFUL_SOUP_PARSER)
    links = soup.find_all("a")
    for link in links:
        href = link.get("href")
        if href is None or "." not in href:
            continue

        # Split on the last dot so names containing dots keep their extension.
        name, extension = href.rsplit(".", 1)
        if extension not in valid_extensions:
            continue

        await queue.put({
            "url": url + href,
            "path": safe_path_join(base_path, href)
        })

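# Hedged example call (the names are placeholders): crawl a page for pdf/mp4
# links and drain the queued items. Only aiohttp and asyncio APIs are used.
async def _example_collect():
    import aiohttp

    queue = asyncio.Queue()
    async with aiohttp.ClientSession() as session:
        await collect_all_links(session, queue, "https://example.com/course/", "course")
    while not queue.empty():
        print(queue.get_nowait()["path"])
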
async def _producer(session, queue, base_path, site_settings, url, etag=None):
    parameters = parse_qs(urlparse(url).query)
    api_url = get_api_url(parameters, children=True)
    authkey = parameters["authkey"][0]

    item_data = await call_function_or_cache(get_json_response, etag, session, api_url)

    tasks = []
    for item in item_data["value"]:
        path = safe_path_join(base_path, item["name"])
        if "@content.downloadUrl" in item:
            checksum = item["file"]["hashes"]["sha256Hash"]
            await queue.put({
                "path": path,
                "url": item["@content.downloadUrl"],
                "checksum": checksum
            })
        elif "folder" in item:
            # Resolve the share url and append the authkey exactly once, so the
            # recursive call can read it back out of the query string.
            folder_url = await check_url_reference(session, item["webUrl"])
            item_etag = item["lastModifiedDateTime"]
            coroutine = _producer(session, queue, path, site_settings,
                                  f"{folder_url}?authkey={authkey}", etag=item_etag)
            tasks.append(asyncio.ensure_future(coroutine))

    await asyncio.gather(*tasks)

async def parse_folder(session, queue, base_path, site_settings, key, secure_hash, sub_path, cut_path_num=0):
    data = _get_data(key, secure_hash, sub_path)
    async with session.post(LIST_ENTRIES_URL, cookies=DEFAULT_COOKIE, data=data) as response:
        result = await response.json()

    tasks = []
    for entry, share_tokens in zip(result["entries"], result["share_tokens"]):
        if entry["is_dir"]:
            coroutine = parse_folder(session=session,
                                     queue=queue,
                                     base_path=base_path,
                                     site_settings=site_settings,
                                     key=share_tokens["linkKey"],
                                     secure_hash=share_tokens["secureHash"],
                                     sub_path=share_tokens["subPath"],
                                     cut_path_num=cut_path_num)
            tasks.append(asyncio.create_task(coroutine))
            continue

        checksum = entry["sjid"]
        # Force the direct-download variant of the share link.
        url = entry["href"].replace("dl=0", "dl=1")
        entry_sub_path = share_tokens["subPath"]
        path = safe_path_join(base_path, *entry_sub_path.split("/")[cut_path_num + 1:])

        await queue.put({"url": url, "path": path, "checksum": checksum})

    await asyncio.gather(*tasks)

def _init_base_path(self, use_folder):
    if not self.use_folder:
        return self.parent.base_path
    if self.folder_name is None:
        return None
    if self.parent.base_path is None:
        return None
    return safe_path_join(self.parent.base_path, self.folder_name)

async def parse_single_file(session, queue, base_path, poly_id, poly_type="s", name=None, password=None):
    if poly_type != "s":
        raise NotSingleFile()

    # Public shares authenticate with the share id as the login.
    auth = BasicAuth(login=poly_id, password="******" if password is None else password)
    async with session.request("PROPFIND", url=WEBDAV_PUBLIC_URL, data=PROPFIND_DATA,
                               headers=BASIC_HEADER, auth=auth) as response:
        xml = await response.text()

    tree = ET.fromstring(xml)
    if len(tree) != 1:
        raise NotSingleFile()

    response = tree[0]
    status = go_down_tree(response, "d:propstat", "d:status", to_text=True)
    if status != "HTTP/1.1 200 OK":
        return

    url = f"{INDEX_URL}s/{poly_id}/download"
    orig_filename = await cache.check_filename(session, url, session_kwargs={"auth": auth})
    if orig_filename is None:
        raise NotSingleFile()

    extension = get_extension(orig_filename)
    if name:
        filename = add_extension(name, extension)
    else:
        filename = orig_filename

    await queue.put({
        "url": url,
        "path": safe_path_join(base_path, filename),
        "session_kwargs": {"auth": auth},
    })

async def parse_assign_files_tree(queue, soup, path):
    for assign_files_tree in soup.find_all("div", id=re.compile("assign_files_tree[0-9a-f]*")):
        for item in assign_files_tree.ul.find_all("li", recursive=False):
            date_time = str(item.find("div", class_="fileuploadsubmissiontime").string)

            fileuploadsubmission_soup = item.find("div", class_="fileuploadsubmission")
            name = str(fileuploadsubmission_soup.a.string)
            url = fileuploadsubmission_soup.a["href"]

            await queue.put({
                "path": safe_path_join(path, name),
                "url": url,
                "checksum": date_time,
            })

async def parse_sections(session, queue, section, base_path, site_settings, moodle_id,
                         process_external_links, last_updated_dict, password_mapper,
                         index=None, keep_section_order=False):
    section_name = str(section["aria-label"])
    if keep_section_order:
        section_name = f"[{index + 1:02}] {section_name}"
    base_path = safe_path_join(base_path, section_name)

    modules = section.find_all("li", id=re.compile("module-[0-9]+"))
    tasks = []
    for module in modules:
        coroutine = parse_mtype(session=session,
                                queue=queue,
                                site_settings=site_settings,
                                base_path=base_path,
                                module=module,
                                last_updated_dict=last_updated_dict,
                                moodle_id=moodle_id,
                                process_external_links=process_external_links,
                                password_mapper=password_mapper)
        tasks.append(asyncio.ensure_future(coroutine))

        if process_external_links:
            # Also follow plain <a> links inside the module description.
            for text_link in module.find_all("a"):
                url = text_link.get("href", None)
                name = text_link.string
                if url is None or name is None:
                    continue
                coroutine = process_link(session=session,
                                         queue=queue,
                                         base_path=base_path,
                                         site_settings=site_settings,
                                         url=url,
                                         moodle_id=moodle_id,
                                         name=str(name),
                                         password_mapper=password_mapper)
                tasks.append(asyncio.ensure_future(exception_handler(coroutine, moodle_id, url)))

    await asyncio.gather(*tasks)

async def parse_sections(session, queue, section, base_path, download_settings, moodle_id,
                         process_external_links, last_updated_dict, password_mapper,
                         index=None, keep_section_order=False, keep_file_order=False):
    if "aria-labelledby" in section.attrs:
        section_title_id = str(section["aria-labelledby"])
        section_name = str(section.find("h3", id=section_title_id).string).strip()
    else:
        section_name = str(section["aria-label"]).strip()

    if keep_section_order:
        section_name = f"[{index + 1:02}] {section_name}"
    base_path = safe_path_join(base_path, section_name)

    title_link = section.find("a", href=re.compile(r"id=[0-9]+&section=[0-9]+"))
    if title_link is not None:
        # Old moodle site where we have to request the section explicitly.
        await parse_single_section(session=session,
                                   queue=queue,
                                   download_settings=download_settings,
                                   base_path=base_path,
                                   href=title_link["href"],
                                   moodle_id=moodle_id,
                                   last_updated_dict=last_updated_dict,
                                   process_external_links=process_external_links,
                                   keep_file_order=keep_file_order,
                                   password_mapper=password_mapper)

    await _parse_section(session=session,
                         queue=queue,
                         download_settings=download_settings,
                         base_path=base_path,
                         section=section,
                         last_updated_dict=last_updated_dict,
                         moodle_id=moodle_id,
                         process_external_links=process_external_links,
                         keep_file_order=keep_file_order,
                         password_mapper=password_mapper)

async def parse_folder(session, queue, site_settings, module, base_path, last_updated):
    # Inline folder: the whole tree is already embedded in the course page.
    folder_tree = module.find("div", id=re.compile("folder_tree[0-9]+"), class_="filemanager")
    if folder_tree is not None:
        await parse_folder_tree(queue, folder_tree.ul, base_path, last_updated)
        return

    # Otherwise the folder has its own page, which we fetch (or take from cache).
    instance = module.find("div", class_="activityinstance")
    folder_name = str(instance.a.span.contents[0])
    folder_path = safe_path_join(base_path, folder_name)
    href = instance.a["href"]

    folder_soup = await call_function_or_cache(get_filemanager, last_updated, session, href)
    await parse_sub_folders(queue, folder_soup, folder_path, last_updated)

async def producer(session, queue, base_path, site_settings, department: DEPARTMENT_CONFIG,
                   year: YEAR_CONFIG, semester: SEMESTER_CONFIG, course_id: COURSE_ID_CONFIG,
                   pwd_username: PWD_USERNAME_CONFIG = None, pwd_password: PWD_PASSWORD_CONFIG = None):
    absolute_path = os.path.join(site_settings.base_path, base_path)
    course_url = f"{BASE_URL}{department}/{year}/{semester}/{course_id}"
    meta_data = await get_meta_data(session, course_url)

    # Skip episodes that already exist on disk.
    if os.path.exists(absolute_path):
        downloaded_episodes = os.listdir(absolute_path)
    else:
        downloaded_episodes = []

    tasks = []
    for episode in meta_data["episodes"]:
        ep_id = episode["id"]
        name = episode["title"]
        date, _ = episode["createdAt"].split("T")
        file_name = f"{date} {name}.mp4"
        if file_name in downloaded_episodes:
            continue

        video_url = f"{course_url}/{ep_id}"
        meta_video_url = video_url + ".series-metadata.json"

        coroutine = put_in_queue(session, queue, safe_path_join(base_path, file_name), site_settings,
                                 department, year, semester, course_id, meta_video_url,
                                 pwd_username, pwd_password)
        tasks.append(asyncio.ensure_future(coroutine))

    await asyncio.gather(*tasks)

async def _parse_tree(session, queue, base_path, url, auth, cut_parts_num=3):
    async with session.request("PROPFIND", url=url, data=PROPFIND_DATA,
                               headers=BASIC_HEADER, auth=auth) as response:
        xml = await response.text()

    tree = ET.fromstring(xml)
    for response in tree:
        href = go_down_tree(response, "d:href", to_text=True)
        prop = go_down_tree(response, "d:propstat", "d:prop")
        checksum = go_down_tree(prop, "oc:checksums", "oc:checksum", to_text=True)
        contenttype = go_down_tree(prop, "d:getcontenttype", to_text=True)
        # Entries without a content type are collections (folders), not files.
        if contenttype is None:
            continue

        path = PurePath(unquote(href))
        path = safe_path_join("", *path.parts[cut_parts_num:])
        if not path:
            raise ValueError("Can not download single file")

        file_url = BASE_URL + href
        await queue.put({
            "url": file_url,
            "path": os.path.join(base_path, path),
            "checksum": checksum,
            "session_kwargs": {"auth": auth},
        })

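# Hypothetical sketch of the go_down_tree helper used by the WebDAV parsers
# above (the real helper is defined elsewhere in the project): it walks a chain
# of namespaced child tags and optionally returns the element's text. The
# namespace URIs are the usual DAV/ownCloud ones and are an assumption here.
_DAV_NAMESPACES = {"d": "DAV:", "oc": "http://owncloud.org/ns"}

def _go_down_tree_sketch(element, *tags, to_text=False):
    for tag in tags:
        if element is None:
            return None
        prefix, _, local_name = tag.partition(":")
        element = element.find(f"{{{_DAV_NAMESPACES[prefix]}}}{local_name}")
    if element is None:
        return None
    return element.text if to_text else element
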
async def process_link(session, queue, base_path, download_settings, url, moodle_id, name, password_mapper):
    try:
        guess_extension = await cache.check_extension(session, url)
    except aiohttp.client_exceptions.ClientResponseError:
        return

    if guess_extension is None or guess_extension in ["html", "json"]:
        # Not a direct file link; see if a site module can handle it as a single file.
        password = match_name_to_password(name, password_mapper)
        try:
            await process_single_file_url(session=session,
                                          queue=queue,
                                          base_path=base_path,
                                          download_settings=download_settings,
                                          url=url,
                                          name=name,
                                          password=password)
        except NotSingleFile:
            pass
    else:
        name = add_extension(name, guess_extension)
        await queue.put({"url": url, "path": safe_path_join(base_path, name)})

async def add_producers(self, producers, session, queue, site_settings, cancellable_pool, signal_handler):
    if check_if_null(self.function_kwargs):
        raise ParseTemplateRuntimeError("Found null field")

    if self.login_module_name is not None:
        login_module = importlib.import_module(self.login_module_name)
        login_function = getattr(login_module, self.login_function_name)
        await safe_login_module(session, site_settings, login_function, self.function_kwargs)

    if self.base_path is None:
        self.folder_name = await self.retrieve_folder_name(session=session,
                                                           signal_handler=signal_handler,
                                                           site_settings=site_settings)
        self.base_path = safe_path_join(self.parent.base_path, self.folder_name)
        signal_handler.update_base_path(self.unique_key, self.base_path)

    queue_wrapper = QueueWrapper(queue,
                                 signal_handler=signal_handler,
                                 unique_key=self.unique_key,
                                 site_settings=site_settings,
                                 cancellable_pool=cancellable_pool,
                                 **self.consumer_kwargs)

    site_module = importlib.import_module(self.module_name)
    producer_function = getattr(site_module, self.function_name)

    coroutine = self.exception_handler(producer_function, signal_handler)(session=session,
                                                                          queue=queue_wrapper,
                                                                          base_path=self.base_path,
                                                                          site_settings=site_settings,
                                                                          **self.function_kwargs)
    producers.append(asyncio.ensure_future(coroutine))

async def producer(session, queue, base_path, site_settings, url: URL_CONFIG,
                   regex_patterns: REGEX_PATTERN_CONFIG, headers: HEADERS_CONFIG,
                   basic_auth: BASIC_AUTH_CONFIG):
    session_kwargs = headers_config_to_session_kwargs(headers)
    session_kwargs.update(basic_auth_config_to_session_kwargs(basic_auth, site_settings))

    links = await get_all_file_links(session, url, session_kwargs)

    for regex_pattern in regex_patterns:
        pattern = regex_pattern["pattern"]
        folder_regex = regex_pattern["folder"]
        if folder_regex is None:
            folder_regex = ""
        file_name_regex = regex_pattern["file_name"]

        for link, html_name in links:
            if re.search(pattern, link) is None:
                continue

            folder_name = re.sub(pattern, folder_regex, link)
            o = urlparse(link)
            file_name = _get_file_name(url_file_name=o.path.split("/")[-1],
                                       html_name=html_name,
                                       file_name_regex=file_name_regex,
                                       pattern=pattern,
                                       link=link)

            await queue.put({
                "url": link,
                "path": safe_path_join(base_path, folder_name, file_name),
                "session_kwargs": session_kwargs,
            })

async def search_tree(session, queue, base_path, download_settings, ilias_id):
    url = GOTO_URL + str(ilias_id)
    async with session.get(url) as response:
        html = await response.text()
        if str(response.url) != url:
            # A redirect means we landed on the login page instead of the folder.
            raise LoginError("Module ilias isn't logged in or you are not allowed to access these files")

    strainer = SoupStrainer("div", attrs={"class": "ilCLI ilObjListRow row"})
    soup = BeautifulSoup(html, get_beautiful_soup_parser(), parse_only=strainer)
    rows = soup.find_all("div", attrs={"class": "ilCLI ilObjListRow row"})

    tasks = []
    for row in rows:
        content = row.find("div", attrs={"class": "ilContainerListItemContent"})
        link = content.find("a")
        href = link["href"]
        name = str(link.string)
        path = safe_path_join(base_path, name)

        if "download" in href:
            extension = str(content.find("span", attrs={"class": "il_ItemProperty"}).string).strip()
            checksum = "".join([str(x.string).strip()
                                for x in content.find_all("span", attrs={"class": "il_ItemProperty"})])

            # Normalise relative dates so the checksum stays stable across days.
            if "Today" in checksum:
                today_date = datetime.datetime.now()
                checksum = checksum.replace(
                    "Today", format_datetime(today_date, locale="en", format="dd. MMM yyyy"))
            elif "Yesterday" in checksum:
                yesterday_date = datetime.datetime.now() - datetime.timedelta(days=1)
                checksum = checksum.replace(
                    "Yesterday", format_datetime(yesterday_date, locale="en", format="dd. MMM yyyy"))

            await queue.put({"url": href, "path": f"{path}.{extension}", "checksum": checksum})
        else:
            ref_id = re.search("ref_id=([0-9]+)&", href).group(1)
            coroutine = search_tree(session, queue, path, download_settings, ref_id)
            tasks.append(asyncio.ensure_future(coroutine))

    await asyncio.gather(*tasks)

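# Worked example of the date normalisation above, assuming babel's
# format_datetime (illustration only):
#
#     format_datetime(datetime.datetime(2021, 3, 5), locale="en",
#                     format="dd. MMM yyyy")        # -> "05. Mar 2021"
#
# so a scraped value like "Today, 10:15" becomes "05. Mar 2021, 10:15" and no
# longer changes its meaning the day after it was scraped.
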
async def parse_mtype(session, queue, site_settings, base_path, module, last_updated_dict,
                      moodle_id, process_external_links, password_mapper):
    mtype = module["class"][1]
    module_id = int(re.search("module-([0-9]+)", module["id"])[1])

    if mtype == MTYPE_FILE:
        instance = module.find("div", class_="activityinstance")
        try:
            file_name = str(instance.a.span.contents[0])
        except AttributeError:
            return
        last_updated = last_updated_dict[module_id]

        with_extension = False
        if "pdf-24" in instance.a.img["src"]:
            file_name += ".pdf"
            with_extension = True

        url = instance.a["href"] + "&redirect=1"
        await queue.put({"path": safe_path_join(base_path, file_name),
                         "url": url,
                         "with_extension": with_extension,
                         "checksum": last_updated})

    elif mtype == MTYPE_DIRECTORY:
        last_updated = last_updated_dict[module_id]
        await parse_folder(session, queue, site_settings, module, base_path, last_updated)

    elif mtype == MTYPE_EXTERNAL_LINK:
        if not process_external_links:
            return

        instance = module.find("div", class_="activityinstance")
        url = instance.a["href"] + "&redirect=1"
        name = str(instance.a.span.contents[0])

        # Follow moodle's redirect first so the site modules see the real target url.
        driver_url = await check_url_reference(session, url)
        await process_link(session=session, queue=queue, base_path=base_path,
                           site_settings=site_settings, url=driver_url, moodle_id=moodle_id,
                           name=name, password_mapper=password_mapper)

    elif mtype == MTYPE_ASSIGN:
        instance = module.find("div", class_="activityinstance")
        link = instance.a
        if link is not None:
            href = link["href"]
            last_updated = last_updated_dict[module_id]
            name = str(link.span.contents[0])

            assign_files_tree_soup = await call_function_or_cache(get_assign_files_tree,
                                                                  last_updated, session, href)
            await parse_assign_files_tree(queue=queue, soup=assign_files_tree_soup,
                                          path=safe_path_join(base_path, name))