Example #1
def main(start_date, dbFile):

    # Work from the script's own directory so the 'last_scan' marker is read and
    # written next to the script.
    scriptDir = os.path.dirname(os.path.realpath(__file__))
    os.chdir(scriptDir)
    # Run inside a virtual display so the Selenium-based scrapers can work without a screen.
    with Display(visible=False, size=(1200, 1500)):
        if start_date:
            start_date = dateParse(start_date)
        else:
            # No explicit start date: resume from the date recorded by the previous run.
            with open('last_scan', 'r') as dateFile:
                start_date = dateParse(dateFile.read())

        if dbFile:
            session = prepareSession(dbFile)
        else:
            session = prepareSession()

        scrapers = [
            birs.scrape, fields.scrape, ias.scrape, ihes.scrape,
            mathtube.scrape, msri.scrape, simons.scrape
        ]

        # Run every scraper, inserting each talk it finds into the session's database.
        for scraper in scrapers:
            scraper(start_date=start_date,
                    process=(lambda talk: addTalk(talk, session)))

        # Record the scan date, backdated by a week so the next run overlaps recent listings.
        with open('last_scan', 'w') as dateFile:
            dateFile.write(str(date.today() - timedelta(weeks=1)))
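
# A minimal sketch of how main() might be exposed on the command line; the flag
# names and the argparse wiring are assumptions for illustration, not the
# project's actual CLI.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Scrape recent math talks into the database.')
    parser.add_argument('--start-date', default=None,
                        help='Only keep talks on or after this date, e.g. 2022-01-01.')
    parser.add_argument('--db-file', default=None,
                        help='Database file handed to prepareSession().')
    args = parser.parse_args()
    main(args.start_date, args.db_file)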
Example #2
def scrape(start_date=date(1980, 1, 1), process=None):  # process should be Talk -> None

    # Walk the per-year video listings backwards, starting from the current year.
    year = datetime.today().year
    hostname = "http://www.birs.ca"
    while year < 3000:
        URL = hostname + "/videos/" + str(year)
        try:
            page = requests.get(URL)
            soup = BeautifulSoup(page.content, 'html.parser')
            workshop_divs = soup.find_all('div', class_="workshop-event")
            if len(workshop_divs) == 0:
                break
            for workshop_div in workshop_divs:
                try:
                    workshop = "Birs- " + workshop_div.find(
                        'div', class_='event-title').text.strip()
                    video_divs = workshop_div.find_all('div', class_='video')
                    for video_div in video_divs:
                        try:
                            date = dateParse(
                                video_div.find(
                                    'span', {
                                        "itemprop": "datePublished"}).text)
                            if date < start_date:
                                year = 3001  # sentinel: ends the year loop after this page
                                break
                        except Exception:
                            # An unparseable date skips the rest of this workshop
                            # via the enclosing handler.
                            raise
                        a_tag = video_div.find(
                            'div', class_="actions").find('a')
                        if a_tag.text == "Watch video":
                            link = hostname + a_tag["href"]
                        else:
                            break
                        talk = Talk(link, workshop=workshop)
                        try:
                            speaker = video_div.find(
                                'div', class_="lecturer").find(
                                'span', {"itemprop": "author"}).text
                            talk.firstName, talk.lastName = cleanSpeaker(
                                speaker)
                        except BaseException:
                            pass
                        try:
                            talk.title = video_div.find(
                                'div', class_="talk-title").text
                        except BaseException:
                            pass
                        print(talk)
                        if process:
                            process(talk)

                except BaseException:
                    pass
        except BaseException:
            pass
        year = year - 1

    return None
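
# Usage sketch: because scrape() accepts any Talk -> None callback, a caller can
# simply collect the talks instead of persisting them (names below are illustrative).
from datetime import date

talks = []
scrape(start_date=date(2023, 1, 1), process=talks.append)
print(len(talks), "BIRS talks found since 2023-01-01")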
Example #3
def urlToMaybeDate(url, driver):
    # Load the page in the given Selenium driver and return its parsed date,
    # or None if the element is missing or the date cannot be parsed.
    try:
        driver.get(url)
        time.sleep(3)
        date = driver.find_element_by_id('date').text[1:]
        date = dateParse(date)
        return date
    except BaseException:
        return None
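
# The fixed time.sleep(3) wastes time on fast pages and can still be too short on
# slow ones. A variant using Selenium's explicit waits, assuming the page really
# exposes an element with id 'date' and that dateParse comes from dateutil.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from dateutil.parser import parse as dateParse  # assumed source of dateParse


def urlToMaybeDateExplicitWait(url, driver, timeout=10):
    try:
        driver.get(url)
        elem = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, 'date')))
        return dateParse(elem.text[1:])  # the original also drops the leading character
    except Exception:
        return None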
Example #4
def scrape(start_date=date(1980, 1, 1), process=None):  # process should be Talk -> None

    year = datetime.today().year
    month = datetime.today().month
    # Drive a headless Firefox; the commented-out line runs it with a visible window instead.
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.set_headless()
    driver = webdriver.Firefox(firefox_options=fireFoxOptions)
    # driver = webdriver.Firefox()
    delay = 10
    while year < 3000:
        URL = "https://video-archive.fields.utoronto.ca/browse/" + \
            str(year) + "/" + str(month).zfill(2)

        try:
            driver.get(URL)
            time.sleep(4)

            watch_buttons = driver.find_elements_by_class_name(
                "talk-list-watch")
            if len(watch_buttons) == 0:
                break  # no talks listed for this month: stop scanning entirely
            for i in reversed(range(len(watch_buttons))):
                watch_buttons[i].click()
                time.sleep(3)
                talk = getTalk(driver)
                if process:
                    process(talk)
                print(talk)
                try:
                    date = driver.find_element_by_class_name(
                        "date-time").text[0:-2]
                    date = dateParse(date)
                    if date < start_date:
                        year = 3010  # sentinel: ends the month/year loop
                        break
                except BaseException:
                    pass
                driver.back()
                time.sleep(1)
                watch_buttons = driver.find_elements_by_class_name(
                    "talk-list-watch")

        except BaseException:
            pass
        if month > 1:
            month = month - 1
        else:
            month = 12
            year = year - 1

    driver.quit()
    return None
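
# FirefoxOptions.set_headless() and the firefox_options= keyword were deprecated in
# later Selenium releases; on Selenium 4 the equivalent setup looks roughly like this
# (a sketch, not tested against this scraper).
from selenium import webdriver

options = webdriver.FirefoxOptions()
options.add_argument("-headless")          # Selenium 4 replacement for set_headless()
driver = webdriver.Firefox(options=options)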
Example #5
def scrape(start_date=date(1980, 1, 1), process=None):  # process should be Talk -> None
    hostname = "https://www.msri.org"
    URL = "https://www.msri.org/events/semester?from=2021-01-01&to=2021-05-31"
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    rightPanel = soup.find('ul', id="semester-menu")
    years = rightPanel.find_all('li', recursive=False)
    semesterLinks = [hostname + a['href']
                     for year in years for a in year.find_all('a')]
    for semesterLink in semesterLinks:

        page = requests.get(semesterLink)
        soup = BeautifulSoup(page.content, 'html.parser')
        workshopLis = (soup.find('div', class_="page-content")
                       .find('ol', recursive=False)
                       .find_all('li', recursive=False))

        for workshopLi in workshopLis:
            date = dateParse(
                cutStringUntilSequence(
                    workshopLi.find('time').text,
                    ["(", "-"]))
            if date < start_date:
                break
            workshop = "MSRI- " + workshopLi.find('a').text
            talkLis = workshopLi.find(
                'ul', class_="schedules-with-videos").find_all('li', recursive=False)
            if len(talkLis) < 4:
                workshop = None  # very short listings are not labelled as a workshop
            for talkLi in talkLis:
                try:
                    link = hostname + talkLi.find('a',
                                                  recursive=False)['href']
                    title = talkLi.find('a', recursive=False).text
                    talk = Talk(link, title=title)
                    talk.workshop = workshop
                    if (abstract := urlToMaybeAbstract(link)):
                        talk.abstract = abstract

                    try:
                        speaker = talkLi.find(
                            'span', class_="person").text.strip()
                        talk.firstName, talk.lastName = cleanSpeaker(speaker)
                    except BaseException:
                        pass
                    if process:
                        process(talk)
                    print(talk)
                except BaseException:
                    pass
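
# cutStringUntilSequence is a project helper that is not shown here; judging from the
# call site above it truncates the date text at the first '(' or '-'. A hypothetical
# implementation consistent with that usage:
def cutStringUntilSequence(text, sequences):
    # Return text cut off at the first occurrence of any string in sequences
    # (the whole text if none of them occur).
    cut = len(text)
    for seq in sequences:
        idx = text.find(seq)
        if idx != -1:
            cut = min(cut, idx)
    return text[:cut]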
Example #6
def scrape(start_date=date(1980, 1, 1), process=None):  # process should be Talk -> None
    hostname = "https://www.ias.edu"
    page_number = 1
    while page_number >= 0:
        URL = 'https://www.ias.edu/video?tags=All&page=' + str(page_number)
        page_number = page_number + 1
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        try:
            talkDivs = soup.find(
                'div', class_="view-content").find_all(
                'div', recursive=False)
            for talkDiv in talkDivs:
                try:
                    date = talkDiv.find(
                        'div',
                        class_="field field--name-field-date-single field--type-datetime field--label-hidden").text
                    date = dateParse(date)
                    if date < start_date:
                        page_number = -1
                        break
                    title = talkDiv.find(
                        'h3', class_="teaser-full-width__label").text.strip()
                    link = hostname + talkDiv.find(
                        'h3', class_="teaser-full-width__label").find('a')['href']
                    talk = Talk(link, title=title)

                    if (abstract := urlToMaybeAbstract(link)):
                        talk.abstract = abstract

                    try:
                        speaker = talkDiv.find(
                            'div', class_="teaser-full-width__detail").text.strip()
                        talk.firstName, talk.lastName = cleanSpeaker(speaker)
                    except BaseException:
                        pass

                    if process:
                        process(talk)
                    print(talk)

                except BaseException:
                    pass
        except BaseException:
            break  # no talk listing on this page: stop paging
Example #7
def scrape(start_date=date(1980, 1, 1), process=None):  # process should be Talk -> None
    page_number = 0
    while page_number >= 0:
        URL = 'https://mathtube.org/videotype?page=' + str(page_number)
        page_number = page_number + 1
        try:
            page = requests.get(URL)
            soup = BeautifulSoup(page.content, 'html.parser')

            talk_trs = []
            try:
                talk_trs = soup.find('tbody').find_all('tr')
            except BaseException:
                pass
            if len(talk_trs) == 0:
                page_number = -1  # no talks on this page: end the outer loop
                break

            for talk_tr in talk_trs:
                date = dateParse(
                    talk_tr.find(
                        'td',
                        class_='views-field views-field-field-date').text)
                if date < start_date:
                    page_number = -1  # talks are now older than start_date: end the outer loop
                    break
                link = "https://mathtube.org" + talk_tr.find(
                    'td',
                    class_='views-field views-field-title').find('a')['href']
                if (talk := urlToTalk(link)):
                    if process:
                        process(talk)
                    print(talk)
        except BaseException:
            break

    return None
Example #8
def scrape(start_date=date(1980, 1, 1), process=None):  # process should be Talk -> None
    page_number = 1
    hostname = "http://scgp.stonybrook.edu"
    while page_number >= 0:
        URL = 'http://scgp.stonybrook.edu/video_portal/index.php?page=' + str(
            page_number)
        page_number = page_number + 1

        try:
            page = requests.get(URL)
            soup = BeautifulSoup(page.content, 'html.parser')
            new_talks_div = soup.find('div', class_="col-lg-7 col-md-9")
            talk_divs = new_talks_div.find_all('div', class_="col-xs-8")

            if len(talk_divs) == 0:
                page_number = -1
                break

            for talk_div in talk_divs:
                link = hostname + talk_div.find(
                    'a', class_="btn btn-primary btn-xs")['href']
                talk = Talk(link)
                dataField = talk_div.text
                # Tokens delimit the fields in the listing text; the regexes below
                # capture whatever sits between one label and the next.
                all_tokens = "(\nTitle: |\nEvent: |\nName: |\nDate: |\nLocation: |\nview video)"
                tokens = {
                    'title': '\nTitle: ',
                    'workshop': '\nEvent: ',
                    'speaker': '\nName: ',
                    'date': '\nDate: '
                }

                try:
                    date = dateParse(
                        re.search('%s(.*?)%s' % (tokens['date'], all_tokens),
                                  dataField,
                                  flags=re.DOTALL
                                  | re.MULTILINE).group(1).replace('@', ''))
                    if date < start_date:
                        page_number = -1
                        break
                except BaseException:
                    pass

                try:
                    speaker = re.search(
                        '%s(.*?)%s' % (tokens['speaker'], all_tokens),
                        dataField,
                        flags=re.DOTALL | re.MULTILINE).group(1).strip()
                    talk.firstName, talk.lastName = cleanSpeaker(speaker)
                except BaseException:
                    pass

                try:
                    title = re.search(
                        '%s(.*?)%s' % (tokens['title'], all_tokens),
                        dataField,
                        flags=re.DOTALL | re.MULTILINE).group(1).strip()
                    if title != '':
                        talk.title = title
                except BaseException:
                    pass

                try:
                    workshop = re.search(
                        '%s(.*?)%s' % (tokens['workshop'], all_tokens),
                        dataField,
                        flags=re.DOTALL | re.MULTILINE).group(1).strip()
                    if workshop != '':
                        talk.workshop = "Simons- " + workshop
                except BaseException:
                    pass

                if (abstract := urlToMaybeAbstract(link)):
                    talk.abstract = abstract

                if process:
                    process(talk)
                print(talk)

        except BaseException:
            pass
    return None
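
# The field extraction relies on every label in the listing text being preceded by a
# newline, so '%s(.*?)%s' % (token, all_tokens) captures whatever sits between one
# label and the next. A small worked example on made-up listing text:
import re

all_tokens = "(\nTitle: |\nEvent: |\nName: |\nDate: |\nLocation: |\nview video)"
sample = "\nTitle: An example talk\nName: Jane Doe\nDate: 2021-03-01 @ 11:00\nview video"

title = re.search('%s(.*?)%s' % ('\nTitle: ', all_tokens), sample,
                  flags=re.DOTALL | re.MULTILINE).group(1).strip()
print(title)  # -> An example talk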