def main(start_date, dbFile):
    """Run every site scraper from a start date and record the scan date.

    start_date: optional date string; when falsy, the date is read from the
        'last_scan' file next to this script.
    dbFile: optional path to the talks database; when falsy, the default
        session is used.
    """
    scriptDir = os.path.dirname(os.path.realpath(__file__))
    # Work relative to the script so 'last_scan' resolves predictably.
    os.chdir(scriptDir)
    # Virtual display so the selenium-based scrapers can run headless.
    with Display(visible=False, size=(1200, 1500)):
        if start_date:
            start_date = dateParse(start_date)
        else:
            # BUG FIX: the original opened 'last_scan'.format(scriptDir) —
            # a no-op .format() with no placeholder that silently discarded
            # scriptDir.  The chdir above makes the bare relative name correct.
            with open('last_scan', 'r') as dateFile:
                start_date = dateParse(dateFile.read())
        session = prepareSession(dbFile) if dbFile else prepareSession()
        scrapers = [
            birs.scrape,
            fields.scrape,
            ias.scrape,
            ihes.scrape,
            mathtube.scrape,
            msri.scrape,
            simons.scrape,
        ]
        for scraper in scrapers:
            scraper(start_date=start_date,
                    process=(lambda talk: addTalk(talk, session)))
        # Record a date one week in the past so the next scan overlaps and
        # late-posted talks are not missed.
        with open('last_scan', 'w') as dateFile:
            dateFile.write(str(date.today() - timedelta(weeks=1)))
def scrape(start_date=date(1980, 1, 1), process=None):
    """Scrape BIRS (www.birs.ca) video listings, newest year first.

    start_date: stop once a video published before this date is seen.
    process: optional callback Talk -> None applied to each scraped talk.
    Returns None.
    """
    year = datetime.today().year
    hostname = "http://www.birs.ca"
    # year is forced to 3000 as a sentinel to terminate the loop once a
    # too-old video is encountered.
    while year < 3000:
        URL = "http://www.birs.ca/videos/" + str(year)
        try:
            page = requests.get(URL)
            soup = BeautifulSoup(page.content, 'html.parser')
            workshop_divs = soup.find_all('div', class_="workshop-event")
            if len(workshop_divs) == 0:
                break  # no listings for this year: assume we ran off the end
            for workshop_div in workshop_divs:
                try:
                    workshop = "Birs- " + workshop_div.find(
                        'div', class_='event-title').text.strip()
                    video_divs = workshop_div.find_all('div', class_='video')
                    for video_div in video_divs:
                        # FIX: the original wrapped this in
                        # `except Exception as e: raise e`, a pointless
                        # re-raise; a parse failure still skips the rest of
                        # this workshop via the enclosing handler.
                        # (renamed from `date`, which shadowed the date class)
                        published = dateParse(
                            video_div.find(
                                'span',
                                {"itemprop": "datePublished"}).text)
                        if published < start_date:
                            year = 3000  # sentinel: stop after this year
                            break
                        a_tag = video_div.find(
                            'div', class_="actions").find('a')
                        if a_tag.text == "Watch video":
                            link = hostname + a_tag["href"]
                        else:
                            break  # no watchable video: skip rest of workshop
                        talk = Talk(link, workshop=workshop)
                        try:
                            speaker = video_div.find(
                                'div', class_="lecturer").find(
                                'span', {"itemprop": "author"}).text
                            talk.firstName, talk.lastName = cleanSpeaker(
                                speaker)
                        except BaseException:
                            pass  # best-effort: speaker is optional
                        try:
                            talk.title = video_div.find(
                                'div', class_="talk-title").text
                        except BaseException:
                            pass  # best-effort: title is optional
                        print(talk)
                        if process:
                            process(talk)
                except BaseException:
                    pass  # malformed workshop entry: skip it
        except BaseException:
            pass  # network/parse failure for this year: move on
        year = year - 1
    return None
def urlToMaybeDate(url, driver):
    """Load *url* in the selenium *driver* and return its parsed date.

    The leading character of the '#date' element's text is dropped before
    parsing.  Returns None on any failure (navigation, lookup, or parse).
    """
    try:
        driver.get(url)
        time.sleep(3)  # allow the page's scripts to populate the element
        raw = driver.find_element_by_id('date').text[1:]
        return dateParse(raw)
    except BaseException:
        return None
def scrape(start_date=date(1980, 1, 1),
           process=None):  # process should be Talk -> None
    """Scrape the Fields Institute video archive month by month, newest first.

    Drives a headless Firefox through the browse pages, clicking into each
    talk and navigating back.  Stops once a talk dated before *start_date*
    is seen, or when a month page lists no talks.
    """
    year = datetime.today().year
    month = datetime.today().month
    fireFoxOptions = webdriver.FirefoxOptions()
    # NOTE(review): set_headless/firefox_options are deprecated selenium
    # APIs; kept as-is to match the installed selenium version.
    fireFoxOptions.set_headless()
    driver = webdriver.Firefox(firefox_options=fireFoxOptions)
    # driver = webdriver.Firefox()
    delay = 10
    # Sentinel years (3010/4000) force the `while year < 3000` loop to exit.
    while year < 3000:
        URL = "https://video-archive.fields.utoronto.ca/browse/" + \
            str(year) + "/" + str(month).zfill(2)
        try:
            driver.get(URL)
            time.sleep(4)  # wait for the listing to render
            watch_buttons = driver.find_elements_by_class_name(
                "talk-list-watch")
            if len(watch_buttons) == 0:
                year = 4000  # sentinel: no talks this month -> stop entirely
                break
            # Iterate oldest-listed first so the back/re-find indices stay
            # valid as we revisit the listing page.
            for i in reversed(range(len(watch_buttons))):
                watch_buttons[i].click()
                time.sleep(3)  # wait for the talk page to load
                talk = getTalk(driver)
                if process:
                    process(talk)
                print(talk)
                try:
                    date = driver.find_element_by_class_name(
                        "date-time").text[0:-2]
                    date = dateParse(date)
                    if date < start_date:
                        year = 3010  # sentinel: reached old talks -> stop
                        break
                except BaseException:
                    pass  # best-effort: missing/unparseable date is ignored
                driver.back()
                time.sleep(1)
                # The DOM was rebuilt by navigating back; stale element
                # references must be re-fetched before the next click.
                watch_buttons = driver.find_elements_by_class_name(
                    "talk-list-watch")
        except BaseException:
            pass  # page failure: fall through and try the previous month
        if month > 1:
            month = month - 1
        else:
            month = 12
            year = year - 1
    driver.quit()
    return None
def scrape(start_date=date(1980, 1, 1), process=None):
    """Scrape MSRI (www.msri.org) semester workshop video listings.

    start_date: stop processing a semester once a workshop dated before
        this is reached (listings are newest-first within a semester).
    process: optional callback Talk -> None applied to each scraped talk.
    """
    hostname = "https://www.msri.org"
    URL = "https://www.msri.org/events/semester?from=2021-01-01&to=2021-05-31"
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    # The right-hand menu lists every semester across all years.
    rightPanel = soup.find('ul', id="semester-menu")
    years = rightPanel.find_all('li', recursive=False)
    semesterLinks = [hostname + a['href']
                     for year in years
                     for a in year.find_all('a')]
    for semesterLink in semesterLinks:
        page = requests.get(semesterLink)
        soup = BeautifulSoup(page.content, 'html.parser')
        workshopLis = soup.find('div', class_="page-content").find(
            'ol', recursive=False).find_all('li', recursive=False)
        for workshopLi in workshopLis:
            # Workshop dates look like "Jan 19, 2021 - Jan 22, 2021 (...)";
            # cut at the first '(' or '-' and parse the start date.
            # (renamed from `date`, which shadowed the date class)
            workshop_date = dateParse(
                cutStringUntilSequence(
                    workshopLi.find('time').text, ["(", "-"]))
            if workshop_date < start_date:
                break
            workshop = "MSRI- " + workshopLi.find('a').text
            videoList = workshopLi.find('ul',
                                        class_="schedules-with-videos")
            if videoList is None:
                # ROBUSTNESS FIX: a workshop with no video list previously
                # raised an uncaught AttributeError, aborting the scrape.
                continue
            # BUG FIX: the original passed `recursibe=False` (typo), which
            # BeautifulSoup treated as an attribute filter — the recursive
            # flag was silently ignored.
            talkLis = videoList.find_all('li', recursive=False)
            if len(talkLis) < 4:
                # Very small listings are single events, not workshops.
                workshop = None
            for talkLi in talkLis:
                try:
                    link = hostname + talkLi.find('a', recursive=False)['href']
                    title = talkLi.find('a', recursive=False).text
                    talk = Talk(link, title=title)
                    talk.workshop = workshop
                    if (abstract := urlToMaybeAbstract(link)):
                        talk.abstract = abstract
                    try:
                        speaker = talkLi.find(
                            'span', class_="person").text.strip()
                        talk.firstName, talk.lastName = cleanSpeaker(speaker)
                    except BaseException:
                        pass  # best-effort: speaker is optional
                    # (removed a dead `if ...: pass` no-op from the original)
                    if process:
                        process(talk)
                    print(talk)
                except BaseException:
                    pass  # malformed talk entry: skip it
def scrape(start_date=date(1980, 1, 1), process=None):
    """Scrape IAS (www.ias.edu) video listing pages.

    start_date: stop once a talk dated before this is seen
        (listings are newest-first).
    process: optional callback Talk -> None applied to each scraped talk.
    """
    hostname = "https://www.ias.edu"
    # NOTE(review): pagination starts at 1 here while the site's first page
    # is presumably page=0 — confirm the newest page is not being skipped.
    page_number = 1
    while page_number >= 0:
        URL = 'https://www.ias.edu/video?tags=All&page=' + str(page_number)
        page_number = page_number + 1
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        try:
            talkDivs = soup.find(
                'div', class_="view-content").find_all(
                'div', recursive=False)
            # BUG FIX: terminate when a page yields no talks.  The original
            # swallowed every failure with `pass`, so once the listing ran
            # out (view-content missing) the loop incremented page_number
            # forever.
            if len(talkDivs) == 0:
                break
            for talkDiv in talkDivs:
                try:
                    talk_date = talkDiv.find(
                        'div',
                        class_="field field--name-field-date-single field--type-datetime field--label-hidden").text
                    talk_date = dateParse(talk_date)
                    if talk_date < start_date:
                        page_number = -1  # sentinel: stop paging
                        break
                    title = talkDiv.find(
                        'h3', class_="teaser-full-width__label").text.strip()
                    link = hostname + talkDiv.find(
                        'h3',
                        class_="teaser-full-width__label").find('a')['href']
                    talk = Talk(link, title=title)
                    if (abstract := urlToMaybeAbstract(link)):
                        talk.abstract = abstract
                    try:
                        speaker = talkDiv.find(
                            'div',
                            class_="teaser-full-width__detail").text.strip()
                        talk.firstName, talk.lastName = cleanSpeaker(speaker)
                    except BaseException:
                        pass  # best-effort: speaker is optional
                    if process:
                        process(talk)
                    print(talk)
                except BaseException:
                    pass  # malformed talk entry: skip it
        except BaseException:
            break  # BUG FIX: page structure missing -> stop, don't loop forever
def scrape(start_date=date(1980, 1, 1),
           process=None):  # process should be Talk -> None
    """Scrape mathtube.org talk listings page by page, newest first.

    start_date: stop once a talk dated before this is seen.
    process: optional callback Talk -> None applied to each scraped talk.
    Returns None.
    """
    page_number = 0
    # Setting page_number to -1 is the sentinel that ends the loop.
    while page_number >= 0:
        URL = 'https://mathtube.org/videotype?page=' + str(page_number)
        page_number = page_number + 1
        try:
            listing = requests.get(URL)
            soup = BeautifulSoup(listing.content, 'html.parser')
            try:
                rows = soup.find('tbody').find_all('tr')
            except BaseException:
                rows = []  # no table on this page
            if not rows:
                # Empty page: we have paged past the last talk.
                page_number = -1
                break
            for row in rows:
                posted = dateParse(
                    row.find(
                        'td',
                        class_='views-field views-field-field-date').text)
                if posted < start_date:
                    # Reached talks older than the cutoff: stop paging.
                    page_number = -1
                    break
                link = "https://mathtube.org" + row.find(
                    'td',
                    class_='views-field views-field-title').find('a')['href']
                if (talk := urlToTalk(link)):
                    if process:
                        process(talk)
                    print(talk)
        except BaseException:
            break  # fetch/parse failure: give up on further pages
    return None
def scrape(start_date=date(1980, 1, 1),
           process=None):  # process should be Talk -> None
    """Scrape the Simons Center (scgp.stonybrook.edu) video portal.

    Each talk's metadata is a flat text blob; fields are extracted by
    matching a label token up to the next label token.

    start_date: stop once a talk dated before this is seen.
    process: optional callback Talk -> None applied to each scraped talk.
    Returns None.
    """
    page_number = 1
    hostname = "http://scgp.stonybrook.edu"
    # PERF: token table and field regexes are loop-invariant; the original
    # rebuilt them (and recompiled four patterns) for every talk on every
    # page.  Hoisted and pre-compiled once here.
    all_tokens = "(\nTitle: |\nEvent: |\nName: |\nDate: |\nLocation: |\nview video)"
    flags = re.DOTALL | re.MULTILINE
    date_re = re.compile('%s(.*?)%s' % ('\nDate: ', all_tokens), flags)
    speaker_re = re.compile('%s(.*?)%s' % ('\nName: ', all_tokens), flags)
    title_re = re.compile('%s(.*?)%s' % ('\nTitle: ', all_tokens), flags)
    workshop_re = re.compile('%s(.*?)%s' % ('\nEvent: ', all_tokens), flags)
    while page_number >= 0:
        URL = 'http://scgp.stonybrook.edu/video_portal/index.php?page=' + str(
            page_number)
        page_number = page_number + 1
        try:
            page = requests.get(URL)
            soup = BeautifulSoup(page.content, 'html.parser')
            new_talks_div = soup.find('div', class_="col-lg-7 col-md-9")
            talk_divs = new_talks_div.find_all('div', class_="col-xs-8")
            if len(talk_divs) == 0:
                page_number = -1  # sentinel: no more talks -> stop paging
                break
            for talk_div in talk_divs:
                link = hostname + talk_div.find(
                    'a', class_="btn btn-primary btn-xs")['href']
                talk = Talk(link)
                dataField = talk_div.text
                try:
                    # The raw date may contain an '@' (time separator).
                    # (renamed from `date`, which shadowed the date class)
                    talk_date = dateParse(
                        date_re.search(dataField).group(1).replace('@', ''))
                    if talk_date < start_date:
                        page_number = -1  # sentinel: reached old talks
                        break
                except BaseException:
                    pass  # best-effort: undated talks are kept
                try:
                    speaker = speaker_re.search(dataField).group(1).strip()
                    talk.firstName, talk.lastName = cleanSpeaker(speaker)
                except BaseException:
                    pass  # best-effort: speaker is optional
                try:
                    title = title_re.search(dataField).group(1).strip()
                    if title != '':
                        talk.title = title
                except BaseException:
                    pass  # best-effort: title is optional
                try:
                    workshop = workshop_re.search(dataField).group(1).strip()
                    if workshop != '':
                        talk.workshop = "Simons- " + workshop
                except BaseException:
                    pass  # best-effort: workshop is optional
                if (abstract := urlToMaybeAbstract(link)):
                    talk.abstract = abstract
                if process:
                    process(talk)
                print(talk)
        except BaseException:
            pass  # page failure: try the next page number
    return None