def parse_content_page(soup) -> List[Union[SDoc, SFolder, SLecture]]:
    """Collect the folders, lectures and documents listed on a content page.

    The ``<ul id="content_listContainer">`` element is looked up in ``soup``
    and each of its direct children that has a tag name is classified by the
    ``alt`` attribute of the first ``<img>`` found inside it:

    * ``"Content Folder"`` -> ``SFolder`` built from the first ``<a>`` (text
      and ``href``) and the text of the ``div`` with class ``details``.
    * ``"Item"``, or an empty ``alt`` on an entry that contains a ``div``
      with class ``"item clearfix"`` -> the result of ``__item_to_folder``,
      appended only when it is truthy.
    * ``"AcuStudio"`` -> ``SLecture`` built from the first ``<a>``.
    * ``"File"`` -> ``SDoc`` built from the first ``<a>``.

    Entries without an image, or with any other ``alt`` value, are ignored.
    Entries whose link is missing are skipped instead of aborting the whole
    page; a content folder without a details ``div`` gets empty details.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The recognised entries in page order; an empty list when the page
        has no ``content_listContainer`` element.
    """
    # NOTE e.g. "course_id": "_306327_1", "content_id": "_1790226_1"
    content_list = soup.find("ul", {"id": "content_listContainer"})
    if content_list is None:
        return []

    result: List[Union[SDoc, SFolder, SLecture]] = []
    for entry in content_list.children:
        if entry.name is None:
            # Children without a tag name carry no entry markup.
            continue

        img = entry.find("img")
        if img is None:
            continue
        alt = img.get("alt")

        if alt == "Content Folder":
            hyperlink = entry.find("a")
            link = hyperlink.get("href") if hyperlink is not None else None
            if link is None:
                # Same policy as for files below: a folder without a usable
                # link is skipped rather than crashing the whole parse.
                continue
            details = entry.find("div", {"class": "details"})
            details_text = details.text.strip() if details is not None else ""
            result.append(
                SFolder(hyperlink.text.strip(), link.strip(), details_text, None)
            )
        elif alt == "Item" or (
            # some items do not have the item icon
            alt == "" and entry.find("div", {"class": "item clearfix"})
        ):
            folder = __item_to_folder(entry)
            if folder:
                result.append(folder)
        elif alt == "AcuStudio":
            # NOTE(review): this branch used to be annotated "ignore that for
            # now", yet the entry is recorded as a lecture -- confirm intent.
            hyperlink = entry.find("a")
            if hyperlink is None:
                continue
            result.append(SLecture(hyperlink.text.strip(), hyperlink.get("href")))
        elif alt == "File":
            hyperlink = entry.find("a")
            # sometimes file link is broken, in that case no href tag is rendered
            # see: https://github.com/leafgecko/NTULearn-Downloader/issues/8
            if hyperlink is None:
                continue
            result.append(SDoc(hyperlink.text.strip(), hyperlink.get("href")))

    return result
def __item_to_folder(item):
    """Turn a content "item" entry into a folder of its downloadable files.

    The folder name is the text of the item's first ``<h3>``. Every ``<a>``
    inside the item's ``div`` with class ``details`` whose ``href`` is
    accepted by ``is_download_link`` becomes an ``SDoc`` child.

    Args:
        item: Tag-like object for one entry, exposing ``find``.

    Returns:
        An ``SFolder`` (``link=None``, empty details) holding the documents,
        or ``None`` when the item has no details ``div`` or no download links.

    Raises:
        AttributeError: If the item has no ``<h3>`` heading.
    """
    folder_name = item.find("h3").text

    # The attribute filter must be a dict; the previous set literal
    # {"class", "details"} was a typo for this mapping.
    details = item.find("div", {"class": "details"})
    if details is None:
        # No details section means there is nothing to download.
        return None

    children = [
        SDoc(name=a_tag.text.strip(), link=a_tag.get("href"))
        for a_tag in details.find_all("a")
        if is_download_link(a_tag.get("href"))
    ]
    if not children:
        return None
    return SFolder(
        name=folder_name.strip(), link=None, details="", children=children
    )