import json
import re
import urllib.parse

# Helpers such as site_url, element_by_css, elements_by_css, elements_by_xpath,
# parse_text, count_tiles, and NotTrying are assumed to be provided by this
# project's own modules; they are not defined here.


async def gacco_parser(site, session):
    # Gacco publishes two JSON feeds; the total is open plus archived courses.
    url = site_url(site, "/data/course/gacco_list.json")
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    data = json.loads(text)
    count = len(data["opened_courses"])
    url = site_url(site, "/data/course/gacco_archive.json")
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    data = json.loads(text)
    count += len(data["archived_courses"])
    return count

async def edcast_org_parser(site, session):
    url = site_url(site, "/search")
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    h4 = element_by_css(text, ".search-navigation-row h4")
    result = parse_text("All Courses ({:d} matches)", h4.text)
    return result[0]

async def hku_hk_parser(site, session):
    url = site_url(site, "/mbbs_admin/public/downloadMbbsJsonFile")
    text = await session.text_from_url(url)
    site.process_text(text)
    data = json.loads(text)
    count = len(data)
    return count

async def count_elements_parser(site, session, rel_url, css):
    """Generic parser: fetch rel_url and count the elements matching a CSS selector."""
    url = site_url(site, rel_url)
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    elts = elements_by_css(text, css)
    count = len(elts)
    return count

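# A hedged usage sketch for the generic parser above: binding rel_url and css
# with functools.partial yields a site-specific parser with the standard
# (site, session) signature. The path and selector below are invented for
# illustration; they are not taken from any real site entry.
import functools

example_tiles_parser = functools.partial(
    count_elements_parser,
    rel_url="/courses",                 # hypothetical catalog path
    css="li.courses-listing-item",      # hypothetical course-tile selector
)
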
async def openedu_ru_parser(site, session):
    url = site_url(site, "/course/")
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    elt = element_by_css(text, "span#courses-found")
    # The element text is a Russian phrase like "NNN курсов"; check for the
    # stem "кур" to be sure we found the right span, then take the number.
    assert " кур" in elt.text
    return int(elt.text.split()[0])

async def campus_il_parser(site, session):
    url = site_url(site, "/course")
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    elt = element_by_css(text, "span#add-sum-course")
    count = int(elt.text)
    return count

async def learning_hku_parser(site, session):
    url = site_url(site, "/catalog/all-courses/")
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    elt = element_by_css(text, "li#course-all span")
    count = int(elt.text)
    return count

async def hku_nursing_parser(site, session):
    url = site_url(site, "/nurs_admin/public/downloadNursJsonFile")
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    data = json.loads(text)
    count = len(data)
    return count

async def enlightme_parser(site, session):
    url = site_url(site, "/courses/")
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    elt = element_by_css(text, ".course-index span")
    result = parse_text("Showing 1-10 of {:d} results", elt.text)
    return result[0]

async def entuze_parser(site, session):
    url = site_url(site, "/course_packages/")
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    elt = element_by_css(text, "div#discovery-message")
    result = parse_text("Viewing {:d} courses", elt.text)
    return result[0]

async def edraak_org_parser(site, session):
    url = site_url(site, "/en/courses/")
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    # Each category in the sidebar shows a parenthesized count; add them up.
    elts = elements_by_css(text, "aside.all-courses div.course span")
    count = 0
    for elt in elts:
        count += int(elt.text.strip(" ()"))
    return count

async def iitbombayx_parser(site, session):
    url = site_url(site, "/courses")
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    # Sum the parenthesized counts shown on the timeline facets.
    elts = elements_by_css(text, "#block-timeline-2 .facet-item__count")
    count = 0
    for elt in elts:
        count += int(elt.text.strip("()"))
    return count

async def studio_to_tiles(site, session):
    """If this is a Studio site, follow its link to the LMS and count tiles there."""
    url = site_url(site, "/")
    text = await session.text_from_url(url)
    site.process_text(text)
    lms_links = elements_by_css(text, "#lms-link")
    if len(lms_links) == 1:
        lms_link = lms_links[0].get("href")
        return await count_tiles(lms_link, site, session)
    raise NotTrying("Not studio I guess")

async def contact_page(site, session):
    # Only try the contact page if we got some data from the site.
    current_courses = site.attempt_course_count()
    if current_courses is None:
        raise NotTrying("No point trying /contact")
    url = site_url(site, "/contact")
    text = await session.text_from_url(url)
    site.process_text(text, fingerprint=False, emails=True)
    raise NotTrying("Not looking for courses on /contact")

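# A hedged sketch of the control flow implied by NotTrying above: presumably a
# runner elsewhere in the project tries parsers in turn and treats NotTrying
# as "this parser declined," not as a failure. The runner below is an invented
# illustration of that protocol, not the project's actual dispatch code.
async def try_parsers_sketch(site, session, parsers):
    for parser in parsers:
        try:
            return await parser(site, session)
        except NotTrying:
            continue    # the parser bowed out; move on to the next one
    return None
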
async def millionlights_parser(site, session):
    url = site_url(site, "/Course/AllCourses")
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    # Find the language-faceted results, and add up their parenthesized
    # numbers.
    elts = elements_by_xpath(
        text,
        "//a[contains(text(), 'English (')]/ancestor::ul//a",
    )
    count = 0
    for elt in elts:
        result = parse_text("{} ({:d})", elt.text)
        count += result[1]
    return count

async def gotoclass_parser(site, session):
    url = site_url(site, "/courses/")
    count = 0
    # Page through the catalog, following the "next" link until it runs out.
    while True:
        text = await session.text_from_url(url)
        site.add_to_fingerprint(text)
        elts = elements_by_css(text, "div.course-block")
        count += len(elts)
        next_a = elements_by_css(text, "a.next.page-numbers")
        if not next_a:
            break
        assert len(next_a) == 1
        url = urllib.parse.urljoin(url, next_a[0].get('href'))
    return count

async def cognitiveclass_parser(site, session):
    url = site_url(site, "/courses")
    count = 0
    while True:
        text = await session.text_from_url(url)
        site.add_to_fingerprint(text)
        elts = elements_by_css(text, "article.course.card")
        count += len(elts)
        # Find the a element with '>' as the text, get its href.
        next_href = elements_by_xpath(text, "//a/span[text() = '>']/../@href")
        if not next_href:
            break
        assert len(next_href) == 1
        url = urllib.parse.urljoin(url, next_href[0])
    return count

async def edx_org_parser(site, session):
    # Walk the paginated catalog API, counting results and tallying course ids.
    url = site_url(site, "/api/v1/catalog/search?page=1&page_size=200")
    count = 0
    while True:
        text = await session.text_from_url(url)
        site.add_to_fingerprint(text)
        data = json.loads(text)
        objs = data['objects']['results']
        count += len(objs)
        for obj in objs:
            course_id = obj.get('key')
            if course_id:
                # site.course_ids is assumed to be a Counter-like mapping.
                site.course_ids[course_id] += 1
        url = data['objects'].get('next')
        if not url:
            break
    return count

async def regex_extract_parser(site, session, rel_url, pattern):
    """Generic parser: fetch rel_url and pull the count out of group 1 of a regex."""
    url = site_url(site, rel_url)
    text = await session.text_from_url(url)
    return int(re.search(pattern, text)[1])

async def courses_page_full_of_tiles(site, session):
    url = site_url(site, "/courses")
    return await count_tiles(url, site, session)

async def json_total_value_parser(site, session, rel_url, key):
    """Generic parser: fetch a JSON document from rel_url and return its `key` value."""
    url = site_url(site, rel_url)
    text = await session.text_from_url(url)
    site.add_to_fingerprint(text)
    data = json.loads(text)
    return data[key]

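# The same hedged functools.partial pattern shown earlier also fits the other
# two generic parsers. Both bindings below are invented examples: the URLs,
# the regex, and the JSON key are placeholders, not real site entries.
example_regex_parser = functools.partial(
    regex_extract_parser,
    rel_url="/about",                    # hypothetical page
    pattern=r"(\d+)\s+courses",          # hypothetical pattern; group 1 is the count
)

example_json_parser = functools.partial(
    json_total_value_parser,
    rel_url="/api/courses/summary.json",  # hypothetical endpoint
    key="total",                          # hypothetical top-level key
)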