def get_course_outline(self) -> str:
    """Scrape and return the plain-text course outline from self.base_url.

    Looks for a heading titled "Course Summary" (WebCMS-style pages) and
    concatenates the text of every sibling element up to the next heading
    of the same level. If no such heading exists, checks for a
    "View in browser" link to an external outline; when found, records
    that URL in self.base_url and returns "".

    Returns:
        The stripped outline text, or "" when no outline could be scraped.
    """
    soup = get_soup(self.base_url)
    course_summary = next(
        (x for x in soup.find_all(["h1", "h2", "h3"])
         if x.get_text(strip=True) == "Course Summary"),
        None,
    )
    outline = ""
    if course_summary is not None:
        # This course uses webcms
        orig = course_summary.name
        course_summary = course_summary.next_sibling
        # BUG FIX: next_sibling eventually yields None at the end of the
        # document; the original `while course_summary.name != orig`
        # raised AttributeError in that case. Guard against None first.
        while course_summary is not None and course_summary.name != orig:
            if not isinstance(course_summary, NavigableString):
                outline += course_summary.get_text(strip=True)
            course_summary = course_summary.next_sibling
    else:
        # Check for iframe
        external = soup.find_all(text=re.compile("View in browser"))
        if external:
            self.base_url = external[0].parent["href"]  # for now, just record the url
            return ""
        else:
            return ""
    return outline.strip()
def get_random_image_url(page_url):
    """Pick one random image URL from the page's lazy-loaded <img> tags.

    Scrapes every 'data-lazy-srcset' attribute, chooses one entry at
    random, extracts the 480p candidate URL from it, and rewrites it to
    request the larger 1280-wide rendition.
    """
    print("...function get_random_image_url called")
    # Fetch and parse the page.
    page = helpers.get_soup(page_url)
    # Gather the srcset of every image that is lazily loaded.
    srcsets = [
        img['data-lazy-srcset']
        for img in page.findAll('img', attrs={'data-lazy-srcset': True})
    ]
    # Pick a random srcset from the list.
    chosen = srcsets[helpers.random_list_number(srcsets)]
    # A srcset looks like "urlA 480w, urlB ..."; keep the second
    # comma-separated entry (which starts with a space), then take the
    # URL token that follows that leading space.
    chosen = chosen.split(',')[1]
    chosen = chosen.split(' ')[1]
    # Swap the 480-wide marker for the 1280-wide rendition.
    chosen = chosen.replace("_480", "1280")
    print("...Using this url: {}".format(chosen))
    return chosen
def update(self):
    """Update self based on information scraped from UQ.

    Only program pages are handled: self.code must contain "acad_prog".
    Populates title, level (first word of the title, lowercased), units,
    and replaces self.code with its trailing 4-digit numeric code.
    """
    if ("acad_prog" not in self.code):
        print("not a program")
        return
    base_url = 'https://my.uq.edu.au{}'.format(str(self.code))
    soup = get_soup(base_url)
    # Hoisted: the title element was previously looked up and
    # text-extracted twice for no reason.
    title = soup.find(id="program-title").get_text()
    self.title = title
    self.level = title.split(' ')[0].lower()
    self.units = int(soup.find(id="program-domestic-units").get_text())
    self.code = int(self.code[-4:])
def home():
    """Render the home page showing the menu for the upcoming week."""
    # we want do display the menu for the upcoming week:
    span = 7
    # Pre-sized vectors, filled by index in the loop below.
    dates = [None] * span
    nice_dates = []
    soups = [None] * span
    lunches = [None] * span
    dinners = [None] * span
    dessert_lunches = [None] * span
    dessert_dins = [None] * span
    # Textual month, day and year (for display in jumbotron)
    d8 = datetime.today()
    d = d8.strftime("%b. %d, %Y")
    for i in range(span):
        x = d8 + timedelta(days=i)
        date = x.date()
        # BUG FIX: was `dates.append(date)`, which grew the pre-sized
        # list to 2*span entries and left the first span entries None.
        # Every other vector here is filled by index; do the same.
        dates[i] = date
        nice_dates.append(date.strftime('%B %d'))
        ## SOUP OF THE DAY
        soup = get_soup(date)
        soups[i] = unique(soup)
        # soup = pd.DataFrame(unique(soup))
        # soups[i] = soup.to_html()
        ## ENTREES
        lunch = get_lunch(date)
        lunches[i] = unique(lunch)
        dinner = get_dinner(date)
        dinners[i] = unique(dinner)
        # DESSERT
        dessert_lunch = get_dessert(date, meal=1)
        dessert_lunches[i] = unique(dessert_lunch)
        dessert_din = get_dessert(date, meal=2)
        dessert_dins[i] = unique(dessert_din)
    return render_template("home.html", dates=dates, nice_dates=nice_dates,
                           soups=soups, lunches=lunches, dinners=dinners,
                           dessert_lunches=dessert_lunches,
                           dessert_dins=dessert_dins, d=d)
def update(self, linkCode: str):
    """Updates self based on information scraped from UQ.

    Fetches the plan page for linkCode, then records its title, its
    trailing 10-character code, and classifies the plan type:
    extended majors and specialisations -> "eMajor", minors -> "minor",
    everything else -> "major".
    """
    soup = get_soup('https://my.uq.edu.au{}'.format(linkCode))
    self.title = soup.find(id="page-head").find("h1").get_text()
    self.code = linkCode[-10:]
    title = self.title
    # Treat specialisations as extended majors
    if "Extended Major" in title or "Specialisation" in title:
        self.type = "eMajor"
    elif "Minor" in title:
        self.type = "minor"
    else:
        self.type = "major"
def get_total_pages(base_url):
    """Return the total number of result pages for the given listing URL.

    Fetches page 1 and reads the page count out of the pagination form.
    """
    print("...function total_pages called")
    # Fetch the first results page; its pagination form holds the count.
    first_page = helpers.get_soup(base_url + "1")
    form = first_page.find(
        'form',
        {'class': 'add_search_params pure-form hide-xs hide-sm hide-md'},
    )
    # The first integer token in the form's text is the total page count.
    digits = [int(token) for token in form.getText().split() if token.isdigit()]
    total_pages = digits[0]
    print("...returning total pages: {}".format(total_pages))
    return total_pages
def get_champs_table(league):
    """Scrape the championship standings for `league` into a DataFrame.

    Looks up the league's URL and CSS selector in championship_urls,
    fetches the page, parses the standings table with pandas, and tags
    each row with a "League" column.

    Returns:
        A pandas DataFrame of the standings with a "League" column added.
    """
    print(f"Workin on {league}")
    url = championship_urls[league][0]
    selector = championship_urls[league][1]
    soup = get_soup(url)
    if league == "La Liga":
        # The La Liga page has several tables matching the class; the
        # last one is the current standings.
        champ_table = soup.find_all("table", {"class": selector})[-1]
    else:
        champ_table = soup.select_one(selector)
    champ_df = pd.read_html(str(champ_table))[0]
    champ_df["League"] = league
    # Removed dead code: `league = league.replace(" ", "")` rebound a
    # local variable that was never read again.
    return champ_df
def update(self):
    """Updates self based on information scraped from UQ.

    Fetches the course page for self.code and populates description,
    title, units, semester-offering flags (sem1/sem2/summer), and the
    prerequisite/incompatibility text when present.

    Returns:
        None when the course page does not exist; otherwise falls
        through after mutating self.
    """
    base_url = 'http://www.uq.edu.au/study/course.html?course_code={}'.format(
        self.code)
    soup = get_soup(base_url)
    if soup is None or soup.find(id="course-notfound"):
        return None
    # Escape quotes for later SQL-style embedding of the text.
    description = soup.find(id="course-summary").get_text().replace(
        '"', '').replace("'", "''")
    # apparent edge case; see STAT2203
    if '\n' in description:
        description = description.split('\n')[0]
    self.description = description
    self.title = soup.find(id="course-title").get_text()[:-11].replace(
        "'", "''")
    self.units = int(soup.find(id="course-units").get_text())
    semester_offerings = str(soup.find_all(id="course-current-offerings"))
    if "Semester 1, " in semester_offerings:
        self.sem1 = 1
    if "Semester 2, " in semester_offerings:
        self.sem2 = 1
    if "Summer Semester, " in semester_offerings:
        self.summer = 1
    prereq = soup.find(id="course-prerequisite")
    if prereq is not None:
        # IDIOM FIX: was `type(prereq) != type("")`; use isinstance for
        # the defensive "already a string" check (soup.find normally
        # returns a Tag, so this is nearly always true).
        if not isinstance(prereq, str):
            prereq = prereq.get_text()
        self.prereq = prereq
    incomp = soup.find(id="course-incompatible")
    if incomp is not None:
        incomp = incomp.get_text()
        self.incomp = incomp
def is_webcms3(course_code, offering_term):
    """Return the WebCMS3 URL for this offering, or "" if no page exists.

    Builds the candidate URL from WEBCMS_URL, fetches it, and checks for
    the site's "page was not found" heading.
    """
    url = WEBCMS_URL.format(course_code, offering_term)
    soup = get_soup(url)
    not_found = soup.find("h2", string="The page was not found.")
    if not_found is None:
        return url
    return ""
courses = json.load(courses_file) # if len(sys.argv) > 1: # courses = [c for c in courses if c["code"] == sys.argv[1]] # with open("src/course_scraper/course_host.json") as course_host_file: # course_hosts = json.load(course_host_file) keywords = ["course objectives", "course summary", "course aims", "aims"] headers = ["h1", "h2", "h3"] for course in courses: if (("outline" in course and course["outline"]) or not course["url"] or "private" in course): continue course_code = course["code"] url = course["url"] soup = get_soup(url) # print(soup.find_all(["h1", "h2", "h3"])[3].get_text(strip=True).lower() in keywords) course_summary = next( (x for x in soup.find_all(headers) if x.get_text(strip=True).lower() in keywords), None, ) print(course_summary) outline = "" if course_summary is not None: orig = course_summary.name course_summary = course_summary.next_sibling while course_summary is not None and ( course_summary.name not in headers or not course_summary.get_text(strip=True)): if not isinstance(course_summary, NavigableString):