def download(self, c):
    content = c.get_page(self.json["student_view_url"])
    soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
    html_content = soup.find('div', attrs={"class": "edx-notes-wrapper"})
    if html_content is None:
        html_content = soup.find('div', attrs={"class": "course-wrapper"})
    # dl_dependencies expects markup as a string (see the other call sites).
    self.html = dl_dependencies(str(html_content), self.output_path, self.folder_name, c)
def download(self, c):
    # Fetch the favicon through Google's favicon service, then the homepage.
    download("https://www.google.com/s2/favicons?domain=" + self.instance_url,
             os.path.join(self.output_path, "favicon.png"), None)
    logging.info("Get homepage")
    content = c.get_page(self.course_url)
    make_dir(os.path.join(self.output_path, "home"))
    self.html_homepage = []
    soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
    html_content = soup.find('div', attrs={"class": "welcome-message"})
    if html_content is None:
        # Older layouts expose homepage articles as "info-wrapper" divs.
        html_content = soup.find_all('div', attrs={"class": re.compile("info-wrapper")})
        if html_content == []:
            self.no_homepage = True
        else:
            for article in html_content:
                # Strip interactive widgets that make no sense offline.
                dismiss = article.find("div", attrs={"class": "dismiss-message"})
                if dismiss is not None:
                    dismiss.decompose()
                bookmark = article.find("a", attrs={"class": "action-show-bookmarks"})
                if bookmark is not None:
                    bookmark.decompose()
                for button in article.find_all("button", attrs={"class": "toggle-visibility-button"}):
                    button.decompose()
                article['class'] = "toggle-visibility-element article-content"
                self.html_homepage.append(
                    dl_dependencies(article.prettify(), os.path.join(self.output_path, "home"), "home", c))
    else:
        dismiss = html_content.find("div", attrs={"class": "dismiss-message"})
        if dismiss is not None:
            dismiss.decompose()
        bookmark = html_content.find("a", attrs={"class": "action-show-bookmarks"})
        if bookmark is not None:
            bookmark.decompose()
        for button in html_content.find_all("button", attrs={"class": "toggle-visibility-button"}):
            button.decompose()
        self.html_homepage.append(
            dl_dependencies(html_content.prettify(), os.path.join(self.output_path, "home"), "home", c))
    logging.info("Get content")
    for x in self.object:
        x.download(c)
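# The final loop above fans the download out over the course tree. A minimal
# sketch of that recursive pattern (illustrative only; `Node` is hypothetical
# and stands in for the block classes whose `self.object` holds children):
class Node:
    def __init__(self, children=None):
        self.object = children or []  # child blocks, as in self.object above

    def download(self, c):
        for child in self.object:  # each block recurses into its children
            child.download(c)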
def download(self, c):
    content = c.get_page(self.json["student_view_url"])
    soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
    html_content = soup.find('div', attrs={"class": "edx-notes-wrapper"})
    if html_content is None:
        html_content = soup.find('div', attrs={"class": "course-wrapper"})
    soup = BeautifulSoup.BeautifulSoup(str(html_content), "lxml")
    text_area = soup.find("textarea", attrs={"class": "student_answer"})
    # Keep a reference before removing the button, for the commented-out
    # check_freetext hook below.
    check = soup.find("button", attrs={"class": "check"})
    check.decompose()
    save = soup.find("button", attrs={"class": "save"})
    text_area["id"] = self.id
    # check["onclick"] = 'check_freetext("{}")'.format(self.id)
    save["onclick"] = 'save_freetext("{}")'.format(self.id)
    html_no_answers = ('<div class="noanswers"><p data-l10n-id="no_answers_for_freetext">'
                       '<b>Warning:</b> There is no correction for Freetext blocks.</p></div>')
    self.html = html_no_answers + dl_dependencies(str(soup), self.output_path, self.folder_name, c)
def annexe(self, c):
    logging.info("Try to get specific pages of the MOOC")
    content = c.get_page(self.course_url)
    soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
    # The top-bar container differs between Open edX versions and themes.
    top_bs = (soup.find('ol', attrs={"class": "course-material"})
              or soup.find('ul', attrs={"class": "course-material"})
              or soup.find('ul', attrs={"class": "navbar-nav"})
              or soup.find('ol', attrs={"class": "course-tabs"}))
    if top_bs is not None:
        for top_elem in top_bs.find_all("li"):
            top_elem = top_elem.find("a")
            if top_elem["href"][-1] == "/":
                path = top_elem["href"][:-1].split("/")[-1]
            else:
                path = top_elem["href"].split("/")[-1]
            if path == "course" or "courseware" in path:
                name = top_elem.get_text().replace(", current location", "")
                self.top[name] = "course/" + self.head.folder_name + "/index.html"
            if "info" in path:
                name = top_elem.get_text().replace(", current location", "")
                self.top[name] = "/index.html"
            if path == "course" or "edxnotes" in path or "progress" in path or "info" in path or "courseware" in path:
                continue
            if "wiki" in path:
                self.wiki, self.wiki_name, path = annexe.wiki(c, self)
            elif "forum" in path:
                path = "forum/"
                self.forum_thread, self.forum_category, self.staff_user_forum = annexe.forum(c, self)
            else:
                output_path = os.path.join(self.output_path, path)
                make_dir(output_path)
                page_content = c.get_page(self.instance_url + top_elem["href"])
                soup_page = BeautifulSoup.BeautifulSoup(page_content, 'lxml')
                just_content = soup_page.find('section', attrs={"class": "container"})
                if just_content is not None:
                    html_content = dl_dependencies(str(just_content), output_path, "", c)
                    self.page_annexe.append({"output_path": output_path,
                                             "content": html_content,
                                             "title": soup_page.find('title').get_text()})
                else:
                    book = soup_page.find('section', attrs={"class": "book-sidebar"})
                    if book is not None:
                        self.book_list_list.append({"output_path": output_path,
                                                    "book_list": annexe.booknav(self, book, output_path),
                                                    "dir_path": path})
                    else:
                        logging.warning("It seems we do not support this type of extra content (in top bar): " + path)
                        continue
            self.top[top_elem.get_text()] = path + "/index.html"
def download(self, c):
    content = c.get_page(self.json["student_view_url"])
    soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
    try:
        html_content_from_div = str(soup.find('div', attrs={"class": "problems-wrapper"})['data-content'])
    except (KeyError, TypeError):
        # Some instances do not inline the problem; fetch it through the API.
        problem_json_url = str(soup.find('div', attrs={"class": "problems-wrapper"})['data-url'])
        html_content_from_div = str(c.get_api_json(problem_json_url + "/problem_get")["html"])
    soup = BeautifulSoup.BeautifulSoup(html_content_from_div, 'lxml')
    # self.has_hint = soup.find("button", attrs={"class": "hint-button"})  # Remove comment when hint ok
    for div in soup.find_all('div', attrs={"class": "notification"}):
        div.decompose()
    for input_tag in soup.find_all('input'):
        if input_tag.has_attr("value"):
            input_tag["value"] = ""
        if input_tag.has_attr("checked"):
            del input_tag.attrs['checked']
    soup.find('div', attrs={"class": "action"}).decompose()
    for span in soup.find_all('span', attrs={"class": "unanswered"}):
        span.decompose()
    for span in soup.find_all('span', attrs={"class": "sr"}):
        span.decompose()
    html_content = str(soup)
    html_content = dl_dependencies(html_content, self.output_path, self.folder_name, c)
    self.html_content = str(html_content)

    # Save JSON answers. problem_show answers with a "success" key when it
    # refuses to reveal answers, so we submit via problem_check and retry.
    path_answers = os.path.join(self.output_path, "problem_show")
    answers_content = {"success": None}
    retry = 0
    while "success" in answers_content and retry < 6:
        answers_content = c.get_api_json(
            "/courses/" + self.mooc.course_id + "/xblock/" + self.json["id"]
            + "/handler/xmodule_handler/problem_show")
        if "success" in answers_content:
            """
            # IMPROVEMENT connection, same as hint?
            post_data = urlencode({'event_type': "problem_show",
                                   "event": {"problem": self.json["id"]},
                                   "page": self.json["lms_web_url"]}).encode('utf-8')
            c.get_api_json("/event", post_data)
            """
            c.get_api_json(
                "/courses/" + self.mooc.course_id + "/xblock/" + self.json["id"]
                + "/handler/xmodule_handler/problem_check")
        retry += 1
    if "success" in answers_content:
        logging.warning("Failed to get answers for problem " + self.json["id"]
                        + " (" + self.json["lms_web_url"] + ")")
        self.answers = None
    else:
        with open(path_answers, "w") as f:
            json.dump(answers_content, f)
        self.answers = []
        self.explanation = []
        for qid in answers_content["answers"]:
            if "solution" not in qid:
                for response in answers_content["answers"][qid]:
                    self.answers.append("input_" + qid + "_" + response)
            else:
                self.explanation.append({"name": "solution_" + qid,
                                         "value": json.dumps(answers_content["answers"][qid])})
    self.problem_id = str(uuid4())
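# The answers-retrieval loop above relies on an LMS quirk: problem_show only
# reveals answers once the problem has been submitted. A standalone sketch of
# that handshake, with a hypothetical `client` standing in for `c` and an
# illustrative `block_id` (not scraper code):
def fetch_answers(client, course_id, block_id, max_retry=6):
    base = "/courses/" + course_id + "/xblock/" + block_id + "/handler/xmodule_handler/"
    for _ in range(max_retry):
        answers = client.get_api_json(base + "problem_show")
        if "success" not in answers:  # "success" marks a refusal, oddly
            return answers
        client.get_api_json(base + "problem_check")  # blank submit unlocks answers
    return None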
def wiki(c, mooc):
    # Get redirection to wiki
    first_page = c.get_redirection(mooc.instance_url + "/courses/" + mooc.course_id + "/course_wiki")
    page_to_visit = [first_page]
    # Data for pages already visited:
    # "[url]": {"rooturl":, "path":, "text":, "title":, "dir":, "children": []}
    wiki_data = {}
    # Extract wiki name
    wiki_name = first_page.replace(mooc.instance_url + "/wiki/", "")[:-1]
    wiki_path = os.path.join("wiki", first_page.replace(mooc.instance_url + "/wiki/", ""))
    while page_to_visit:
        get_page_error = False
        url = page_to_visit.pop()
        content = ""  # stays empty if the page cannot be fetched
        try:
            content = c.get_page(url)
        except HTTPError as e:
            if e.code == 404 or e.code == 403:
                get_page_error = True
            else:
                logging.warning("Fail to get " + url + " Error: " + str(e.code))
        wiki_data[url] = {}
        web_path = os.path.join("wiki", url.replace(mooc.instance_url + "/wiki/", ""))
        path = os.path.join(mooc.output_path, web_path)
        make_dir(path)
        wiki_data[url]["path"] = path
        # Relative prefix back to the root: one "../" per path component.
        rooturl = "../"
        for _ in range(len(web_path.split("/"))):
            rooturl += "../"
        wiki_data[url]["rooturl"] = rooturl
        wiki_data[url]["children"] = []
        # Parse page content
        soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
        text = soup.find("div", attrs={"class": "wiki-article"})
        if text is not None:  # it is a page (and not a list of pages)
            # Queue wiki pages linked from this page's content
            for link in text.find_all("a"):
                if link.has_attr("href") and "/wiki/" in link["href"]:
                    if link["href"][0:4] == "http":
                        link_url = link["href"]
                    else:
                        link_url = mooc.instance_url + link["href"]
                    if link_url not in wiki_data and link_url not in page_to_visit:
                        page_to_visit.append(link_url)
                    if link["href"][0:4] != "http":
                        # Rewrite the link to point at the offline page
                        link["href"] = rooturl[:-1] + link["href"].replace(mooc.instance_url, "") + "/index.html"
            wiki_data[url]["text"] = dl_dependencies(str(text), path, "", c)
            wiki_data[url]["title"] = soup.find("title").text
            wiki_data[url]["last-modif"] = soup.find("span", attrs={"class": "date"}).text
        elif get_page_error:
            wiki_data[url]["text"] = """<div><h1 class="page-header">Permission Denied</h1><p class="alert denied">Sorry, you don't have permission to view this page.</p></div>"""
            wiki_data[url]["title"] = "Permission Denied | Wiki"
            wiki_data[url]["last-modif"] = "Unknown"
        # Queue the wiki URLs found in the list-of-children page
        see_children = soup.find('div', attrs={"class": "see-children"})
        if see_children:
            allpage_url = str(see_children.find("a")["href"])
            wiki_data[url]["dir"] = allpage_url
            content = c.get_page(mooc.instance_url + allpage_url)
            soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
            table = soup.find("table")
            if table is not None:
                for link in table.find_all("a"):
                    if link.has_attr("class") and "list-children" in link["class"]:
                        continue
                    child_url = mooc.instance_url + link["href"]
                    if child_url not in wiki_data and child_url not in page_to_visit:
                        page_to_visit.append(child_url)
                    wiki_data[url]["children"].append(child_url)
    return wiki_data, wiki_name, wiki_path
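# A hypothetical consumer of the mapping returned by wiki() (illustrative
# only, not part of the scraper): walk wiki_data depth-first from the first
# page and print the page tree. Children are keyed by full URL, as above.
def print_wiki_tree(wiki_data, url, depth=0):
    entry = wiki_data[url]
    print("  " * depth + entry.get("title", url))
    for child_url in entry["children"]:
        if child_url in wiki_data:
            print_wiki_tree(wiki_data, child_url, depth + 1)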
def forum(c, mooc):
    forum_output = os.path.join(mooc.output_path, "forum")
    make_dir(forum_output)
    content = c.get_page(mooc.instance_url + "/courses/" + mooc.course_id + "/discussion/forum")
    good_content = BeautifulSoup.BeautifulSoup(content, 'lxml').find("script", attrs={"id": "thread-list-template"})
    category = OrderedDict()
    if good_content:
        soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
        all_category = soup.find_all('li', attrs={"class": "forum-nav-browse-menu-item"})
        if len(all_category) == 0:
            # On the FUN platform, the category list lives inside the script
            # tag with id "thread-list-template".
            soup = BeautifulSoup.BeautifulSoup(good_content.text, 'lxml')
            all_category = soup.find_all('li', attrs={"class": "forum-nav-browse-menu-item"})
        for cat in all_category:
            if (cat.has_attr("id") and cat["id"] in ["all_discussions", "posts_following"]) or \
               (cat.has_attr("class") and ("forum-nav-browse-menu-all" in cat["class"]
                                           or "forum-nav-browse-menu-following" in cat["class"])):
                continue
            if not cat.has_attr("data-discussion-id"):  # and cat.find("a") != None:
                category[str(uuid4())] = {
                    "title": cat.find(["a", "span"], attrs={"class": "forum-nav-browse-title"}).text,
                    "catego_with_sub_catego": True}
            else:
                category[cat["data-discussion-id"]] = {"title": str(cat.text).replace("\n", "")}
    else:
        logging.error("No forum category found")
    threads = []

    # Search for staff users
    json_user = {}
    section_user = BeautifulSoup.BeautifulSoup(content, 'lxml').find("section", attrs={"id": "discussion-container"})
    if section_user and section_user.has_attr("data-roles"):
        if "&quot;" in section_user["data-roles"]:
            # The attribute may arrive HTML-escaped; unescape before parsing.
            json_user = json.loads(unescape(section_user["data-roles"]))
        else:
            json_user = json.loads(section_user["data-roles"])
    else:
        section_user = re.search("roles: [^\n]*", content)
        if section_user:  # TODO check ok in this case
            json_user = json.loads(re.sub(r"roles: (.*),", r'\1', section_user.group()))
    staff_user = []
    for x in json_user:
        staff_user += [str(y) for y in json_user[x]]

    # Fetch the thread list of each category, page by page
    for x in category:
        make_dir(os.path.join(forum_output, x))
        url = "/courses/" + mooc.course_id + "/discussion/forum/" + x + "/inline?ajax=1&page=1&sort_key=activity&sort_order=desc"
        data = c.get_api_json(url)
        threads += data["discussion_data"]
        for i in range(1, data["num_pages"]):
            url = "/courses/" + mooc.course_id + "/discussion/forum/" + x + "/inline?ajax=1&page=" + str(i + 1) + "&sort_key=activity&sort_order=desc"
            data = c.get_api_json(url)
            threads += data["discussion_data"]

    for thread in threads:
        url = "/courses/" + mooc.course_id + "/discussion/forum/" + thread["commentable_id"] + "/threads/" + thread["id"] + "?ajax=1&resp_skip=0&resp_limit=100"
        make_dir(os.path.join(forum_output, thread["id"]))
        try:
            thread["data_thread"] = c.get_api_json(url, referer=mooc.instance_url + url.split("?")[0])
            # Responses are paginated 100 at a time
            total_answers = 100
            while total_answers < thread["data_thread"]["content"]["resp_total"]:
                url = "/courses/" + mooc.course_id + "/discussion/forum/" + thread["commentable_id"] + "/threads/" + thread["id"] + "?ajax=1&resp_skip=" + str(total_answers) + "&resp_limit=100"
                new_answers = c.get_api_json(url, referer=mooc.instance_url + url.split("?")[0])["content"]["children"]
                thread["data_thread"]["content"]["children"] += new_answers
                total_answers += 100
        except Exception:
            try:
                thread["data_thread"] = c.get_api_json(url)
            except Exception:
                logging.warning("Can not get " + mooc.instance_url + url + " discussion")
                continue
        if ("endorsed_responses" in thread["data_thread"]["content"]
                or "non_endorsed_responses" in thread["data_thread"]["content"]) \
                and "children" in thread["data_thread"]["content"]:
            logging.warning("Both endorsed responses and children present in thread " + thread["id"])
        if "children" not in thread["data_thread"]["content"]:
            thread["data_thread"]["content"]["children"] = []
        if "endorsed_responses" in thread["data_thread"]["content"]:
            thread["data_thread"]["content"]["children"] += thread["data_thread"]["content"]["endorsed_responses"]
        if "non_endorsed_responses" in thread["data_thread"]["content"]:
            thread["data_thread"]["content"]["children"] += thread["data_thread"]["content"]["non_endorsed_responses"]
        thread["data_thread"]["content"]["body"] = dl_dependencies(
            markdown(thread["data_thread"]["content"]["body"]), os.path.join(forum_output, thread["id"]), "", c)
        for children in thread["data_thread"]["content"]["children"]:
            children["body"] = dl_dependencies(
                markdown(children["body"]), os.path.join(forum_output, thread["id"]), "", c)
            if "children" in children:
                for children_children in children["children"]:
                    children_children["body"] = dl_dependencies(
                        markdown(children_children["body"]), os.path.join(forum_output, thread["id"]), "", c)
    return threads, category, staff_user
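# Hypothetical usage sketch for the values returned by forum(): a renderer
# could badge staff posts by matching usernames against staff_user. The
# "username" field mirrors the discussion API payload; treat it as an
# assumption, not a guarantee of this codebase.
def is_staff_post(post, staff_user):
    return str(post.get("username", "")) in staff_user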