Example #1
def crawl_show(starting_url, transcript_link, db_cursor, db_connection, title,
               episode_id_start, speaker_id_start, phrase_id_start):
    '''
    Crawl all transcripts for a given show, for the requested time frame.

    Inputs:
        starting_url: (str) link to show page (to help give complete url for a
            transcript)
        transcript_link: (str) link to transcript page
        db_cursor, db_connection: database cursor and connection
        title: (str) name of show
        episode_id_start, speaker_id_start, phrase_id_start: (ints) numbers at
            which to start episodes, speakers, and phrases

    Outputs:
        episode_id_start, speaker_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled
    '''
    episode_id_init = episode_id_start

    # Create soup object from starting page
    transcripts_request = crawler_util.get_request(transcript_link)
    transcripts_text = transcripts_request.text
    articles_soup = bs4.BeautifulSoup(transcripts_text, "html5lib")
    transcripts_by_day = articles_soup.find('div', class_='cnnSectBulletItems')
    transcripts_raw_links = transcripts_by_day.find_all('a', href=True)

    # Use the show name as printed on the transcripts page header
    title = articles_soup.find('p', class_='cnnTransHead').get_text()

    # Walk the day-by-day bullet lists until a transcript falls before
    # LIMIT_YEAR
    most_recent_year = LIMIT_YEAR
    while most_recent_year >= LIMIT_YEAR:
        for article in transcripts_raw_links:
            headline = article.get_text()
            if "Did Not Air" in headline:
                continue
            link = crawler_util.convert_if_relative_url(
                starting_url, article.get('href'))
            link = re.sub("\n", "", link)
            transcript_request = crawler_util.get_request(link)
            if transcript_request is None:
                continue
            most_recent_year, episode_id_start, speaker_id_start, \
                phrase_id_start = crawl_transcript(link, db_cursor,
                    db_connection, title, headline, episode_id_start,
                    speaker_id_start, phrase_id_start)

        transcripts_by_day = transcripts_by_day.find_next_sibling(
            'div', class_='cnnSectBulletItems')
        if transcripts_by_day is None:
            break
        transcripts_raw_links = transcripts_by_day.find_all('a', href=True)

    if episode_id_start > episode_id_init:
        db_cursor.execute('INSERT INTO show VALUES(?, ?)',
                          (title, 'CNN'))

    return episode_id_start, speaker_id_start, phrase_id_start
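The INSERT statements in these crawlers assume a show table with two columns
and an episode table with four. A hypothetical sqlite3 schema consistent with
those statements (the column names are guesses; only the column counts and the
order of values come from the code):

import sqlite3

# Assumed schema: column names are invented, but the arity and value order
# match the INSERT statements used throughout these examples.
SCHEMA = """
CREATE TABLE IF NOT EXISTS show (
    title   TEXT,   -- show name
    network TEXT    -- 'CNN', 'MSNBC', or 'FOX'
);
CREATE TABLE IF NOT EXISTS episode (
    episode_id INTEGER,  -- the episode_id_start counter
    headline   TEXT,     -- article headline
    airtime    TEXT,     -- e.g. "2020-03-15 17:00"
    show_title TEXT      -- matches show.title
);
"""

db_connection = sqlite3.connect("transcripts.db")
db_cursor = db_connection.cursor()
db_cursor.executescript(SCHEMA)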
Example #2
def get_show_transcripts_page(show, starting_url):
    '''
    Get link to the show's transcript page.
    Returns None if there is no valid transcript.
    '''

    for show_link in show.find_all('a'):
        if show_link.get_text().strip() == "Transcripts":
            return crawler_util.convert_if_relative_url(
                starting_url, show_link.get('href'))

    first_show_link = crawler_util.convert_if_relative_url(
        starting_url,
        show.find('a').get('href'))

    # Go to first show link to find transcripts page
    show_page_request = crawler_util.get_request(first_show_link)
    if show_page_request is None:
        return None
    show_page_text = show_page_request.text
    show_page_soup = bs4.BeautifulSoup(show_page_text, "html5lib")
    subnav = show_page_soup.find('nav', class_='show-subnav')
    if subnav:
        for link in subnav.find_all('a'):
            if link.get_text() == "Transcripts":
                return crawler_util.convert_if_relative_url(
                    starting_url, link.get('href'))
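crawler_util.convert_if_relative_url is not shown in these excerpts. Judging
from its call sites, it resolves a possibly relative href against the page URL
and tolerates missing links. A minimal sketch under that assumption, using
urllib.parse.urljoin:

from urllib.parse import urljoin

def convert_if_relative_url(base_url, href):
    """Resolve href against base_url; pass None through for missing links."""
    if href is None:
        return None
    # urljoin leaves absolute URLs untouched and resolves relative ones
    return urljoin(base_url, href)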
Example #3
def go(db_cursor,
       db_connection,
       speaker_id_start=0,
       episode_id_start=0,
       phrase_id_start=0):
    '''
    Crawls the MSNBC transcripts site and updates the database of
    transcripts, speakers, shows, and episodes.
    This function modifies the input database.

    Inputs:
        db_cursor, db_connection: cursor and connection to database
        speaker_id_start, episode_id_start, phrase_id_start: (ints) numbers at
            which to start speakers, episodes, and phrases

    Outputs:
        speaker_id_start, episode_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled
    '''

    starting_url = "http://www.msnbc.com/transcripts"

    # Create soup object from starting page
    starting_request = crawler_util.get_request(starting_url)
    starting_text = starting_request.text
    starting_soup = bs4.BeautifulSoup(starting_text, "html5lib")
    show_list = starting_soup.find('div', class_='item-list').find_all('a')

    # Limited to the first show for testing; loop over the full show_list
    # to crawl every show
    for show in show_list[0:1]:
        link = show.get('href')
        if "/nav-" in link:
            continue
        title = show.get_text()
        transcripts_link = crawler_util.convert_if_relative_url(\
                starting_url, show.get('href'))

        episode_id_start, speaker_id_start, phrase_id_start = crawl_show(\
            starting_url, transcripts_link, db_cursor, db_connection,
            title, episode_id_start, speaker_id_start, phrase_id_start)

    return speaker_id_start, episode_id_start, phrase_id_start
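crawler_util.get_request is not shown either; the callers treat a None return
as a failed fetch, so it presumably wraps requests.get with error handling. A
sketch under that assumption (the timeout value is invented):

import requests

def get_request(url, timeout=10):
    """Fetch url and return the Response, or None on any failure."""
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response
    except requests.RequestException:
        return None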
Example #4
def go(db_cursor, db_connection, speaker_id_start=0, episode_id_start=0,
       phrase_id_start=0):
    '''
    Crawls the CNN transcripts site and updates the database of
    transcripts, speakers, shows, and episodes.
    This function modifies the input database.

    Inputs:
        db_cursor, db_connection: cursor and connection to database
        speaker_id_start, episode_id_start, phrase_id_start: (ints) numbers at
            which to start speakers, episodes, and phrases

    Outputs:
        speaker_id_start, episode_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled
    '''

    starting_url = "http://transcripts.cnn.com/TRANSCRIPTS/"

    # Create soup object from starting page
    starting_request = crawler_util.get_request(starting_url)
    starting_text = starting_request.text
    starting_soup = bs4.BeautifulSoup(starting_text, "html5lib")
    show_subsection = starting_soup.find_all('span',
        class_='cnnSectBulletItems')

    show_dict = {}
    # Limited to the first section and first show for testing; drop the
    # [0:1] slices to crawl everything
    for section in show_subsection[0:1]:
        for show in section.find_all('a')[0:1]:

            title = show.get_text().strip()
            if title in show_dict:
                continue

            transcript_link = crawler_util.convert_if_relative_url(\
                    starting_url, show.get('href'))
            show_dict[title] = transcript_link

            episode_id_start, speaker_id_start, phrase_id_start = crawl_show(\
                starting_url, transcript_link, db_cursor, db_connection,
                title, episode_id_start, speaker_id_start, phrase_id_start)
    db_connection.commit()

    return speaker_id_start, episode_id_start, phrase_id_start
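A hypothetical driver for the go functions above, assuming the sqlite3 schema
sketched earlier; the module names cnn_crawler and msnbc_crawler are
placeholders for wherever the two go functions live:

import sqlite3

import cnn_crawler    # hypothetical module holding the CNN go()
import msnbc_crawler  # hypothetical module holding the MSNBC go()

db_connection = sqlite3.connect("transcripts.db")
db_cursor = db_connection.cursor()

# Thread the id counters through each network so ids stay globally unique
speaker_id, episode_id, phrase_id = cnn_crawler.go(db_cursor, db_connection)
speaker_id, episode_id, phrase_id = msnbc_crawler.go(
    db_cursor, db_connection, speaker_id, episode_id, phrase_id)

db_connection.commit()
db_connection.close()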
Example #5
def crawl_msnbc_transcript(link, db_cursor, db_connection, title,
                           episode_id_start, speaker_id_start,
                           phrase_id_start):
    '''
    Crawl MSNBC transcript.
    '''

    transcript_request = crawler_util.get_request(link)
    if transcript_request is None:
        return episode_id_start, speaker_id_start, phrase_id_start
    transcript_text = transcript_request.text
    article_soup = bs4.BeautifulSoup(transcript_text, "html5lib")

    year, airtime = get_msnbc_transcript_date(article_soup)
    if year != LIMIT_YEAR:
        return episode_id_start, speaker_id_start, phrase_id_start

    headline_raw = article_soup.find('meta',
        property="nv:title").get('content')
    headline_match = re.search("(.*?) TRANSCRIPT", headline_raw)
    headline = headline_match.group(1) if headline_match else headline_raw
    db_cursor.execute('INSERT INTO episode VALUES(?, ?, ?, ?)',
                      (episode_id_start, headline, airtime, title))

    transcript_raw_text = article_soup.find('div', \
        itemprop="articleBody").find_all('p')
    transcript_text = crawler_util.join_text_chunks(transcript_raw_text)

    begin_flag = ".*"
    end_flag = "(THIS IS A RUSH TRANSCRIPT|Copyright 2020).*"
    speaker_id_start, phrase_id_start = crawler_util.\
        crawl_transcript(transcript_text, begin_flag, end_flag,
            episode_id_start, speaker_id_start, phrase_id_start, db_cursor)

    db_connection.commit()
    episode_id_start += 1

    return episode_id_start, speaker_id_start, phrase_id_start
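crawler_util.join_text_chunks takes the list of <p> tags and returns a single
string for the regex pass. A plausible one-liner (the newline separator is an
assumption, but the begin_flag patterns in these examples expect speaker cues
to start on a new line):

def join_text_chunks(tags):
    """Concatenate the text of a list of bs4 tags, one chunk per line."""
    return "\n".join(tag.get_text() for tag in tags)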
Example #6
def crawl_transcript(link, db_cursor, db_connection, title, headline,
                episode_id_start, speaker_id_start, phrase_id_start):
    '''
    Crawl CNN transcript.

    Inputs:
        link: URL to transcript page
        db_cursor: DB cursor to perform SQL operations
        db_connection: connection to database
        title: title of show
        headline: name of article
        episode_id_start: (int) current episode ID
        speaker_id_start: (int) current speaker ID
        phrase_id_start: (int) current text clip ID
    '''

    transcript_request = crawler_util.get_request(link)
    transcript_text = transcript_request.text
    article_soup = bs4.BeautifulSoup(transcript_text, "html5lib")
    subheading = article_soup.find('p', class_="cnnTransSubHead")
    
    year, airtime = get_cnn_transcript_date(article_soup)
    py_date = datetime.date(year, int(airtime[5:7]), int(airtime[8:10]))
    if py_date >= MAX_DATE:
        return year, episode_id_start, speaker_id_start, phrase_id_start, True
    elif py_date <= MIN_DATE:
        return year, episode_id_start, speaker_id_start, phrase_id_start, False
    assert (py_date <= MAX_DATE and py_date >= MIN_DATE)

    if (headline == "White House Coronavirus Update; Federal Reserve Cuts" + \
                "Rate To Zero; Coronavirus Testing Available To All 50" + \
                "States. Aired 5-6p ET" and
        airtime == "2020-03-15 17:00"):
        return year, episode_id_start, speaker_id_start, phrase_id_start, True

    if year != LIMIT_YEAR or (subheading and
                              "Did Not Air" in subheading.get_text()):
        return year, episode_id_start, speaker_id_start, phrase_id_start, True

    for br in article_soup.find_all("br"):
        br.replace_with("\n")
    transcript_text = article_soup.find_all('p','cnnBodyText')[2].getText()

    # Speaker cues look like "\nCOOPER:"; interview markers like
    # "(BEGIN ...)" also open a new chunk
    begin_flag = r'(\n[A-Z][^a-z\n]+?:|\(BEGIN .*?\)).*'
    end_flag = ""

    phrase_id_init = phrase_id_start
    speaker_id_start, phrase_id_start = crawler_util.\
        crawl_transcript(transcript_text, begin_flag, end_flag,
            episode_id_start, speaker_id_start, phrase_id_start, db_cursor)

    if phrase_id_init != phrase_id_start:
        db_cursor.execute('INSERT INTO episode VALUES(?, ?, ?, ?)',
            (episode_id_start, headline, airtime, title))
        episode_id_start += 1

    db_connection.commit()

    return int(year), episode_id_start, speaker_id_start, phrase_id_start, True
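The begin_flag pattern keys on all-caps speaker cues such as "COOPER:" at the
start of a line, plus interview markers like "(BEGIN VIDEOTAPE)". A small
self-contained check of what it matches:

import re

begin_flag = r'(\n[A-Z][^a-z\n]+?:|\(BEGIN .*?\)).*'

sample = ("\nCOOPER: Good evening.\n"
          "(BEGIN VIDEOTAPE)\n"
          "DR. SANJAY GUPTA, CNN CHIEF MEDICAL CORRESPONDENT: Thanks.\n")

for match in re.finditer(begin_flag, sample):
    print(repr(match.group(1)))
# '\nCOOPER:'
# '(BEGIN VIDEOTAPE)'
# '\nDR. SANJAY GUPTA, CNN CHIEF MEDICAL CORRESPONDENT:'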
Example #7
def crawl_show(starting_url, transcripts_link, db_cursor, db_connection, title,
               episode_id_start, speaker_id_start, phrase_id_start):
    '''
    Crawl all transcripts for a given show, for the requested time frame.

    Inputs:
        starting_url: (str) link to show page (to help give complete url for a
            transcript)
        transcripts_link: (str) link to the show's transcripts page
        db_cursor, db_connection: database cursor and connection
        title: (str) name of show
        episode_id_start, speaker_id_start, phrase_id_start: (ints) numbers at
            which to start episodes, speakers, and phrases

    Outputs:
        episode_id_start, speaker_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled
    '''

    episode_id_init = episode_id_start

    # Create soup object from starting page
    transcripts_request = crawler_util.get_request(transcripts_link)
    transcripts_text = transcripts_request.text
    articles_soup = bs4.BeautifulSoup(transcripts_text, "html5lib")
    show_day = articles_soup.find('div', class_='transcript-item')

    year = int(show_day.find('a').get_text()[-4:])
    while year >= LIMIT_YEAR:
        # Crawl transcript
        link = crawler_util.convert_if_relative_url(\
                starting_url, show_day.find('a').get('href'))

        episode_id_start, speaker_id_start, phrase_id_start, \
            within_date_range = crawl_msnbc_transcript(\
                link, db_cursor, db_connection, title, episode_id_start,
                speaker_id_start, phrase_id_start)

        if not within_date_range:
            break

        show_day = show_day.find_next('div', class_='transcript-item')
        if show_day is None:
            break
        year = int(show_day.find('a').get_text()[-4:])

    if episode_id_start > episode_id_init:
        db_cursor.execute('INSERT INTO show VALUES(?, ?)',
                          (title, 'MSNBC'))

    return episode_id_start, speaker_id_start, phrase_id_start
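LIMIT_YEAR, MIN_DATE, and MAX_DATE are module-level constants that never
appear in these excerpts. Judging from the "Copyright 2020" end flag and the
hard-coded 2020-03-15 airtime check, plausible (assumed) values would be:

import datetime

# Assumed values: the excerpts only show that LIMIT_YEAR is an int compared
# against transcript years and that MIN_DATE/MAX_DATE bound a datetime.date
LIMIT_YEAR = 2020
MIN_DATE = datetime.date(2020, 1, 1)
MAX_DATE = datetime.date(2020, 12, 31)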
Example #8
def go(db_cursor,
       db_connection,
       speaker_id_start=0,
       episode_id_start=0,
       phrase_id_start=0):
    '''
    Crawls the Fox transcripts site and updates the database of transcripts,
    speakers, titles, shows, and episodes.
    This function modifies the input database.

    Inputs:
        db_cursor, db_connection: cursor and connection to database
        speaker_id_start, episode_id_start, phrase_id_start: (ints) numbers at
            which to start speakers, episodes, and phrases

    Outputs:
        episode_id_start, speaker_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled

    '''

    starting_url = "https://www.foxnews.com/shows"

    # Create soup object from starting page
    starting_request = crawler_util.get_request(starting_url)
    starting_text = starting_request.text
    starting_soup = bs4.BeautifulSoup(starting_text, "html5lib")

    # Get list of shows to loop through
    show_info_list = starting_soup.find_all('li', class_='showpage')
    # Limited to the first show for testing; drop the [0:1] slice to crawl
    # every show
    for show in show_info_list[0:1]:
        title = show.find('h2', class_='title').get_text().strip()
        print(title)  # progress output

        transcript_link = get_show_transcripts_page(show, starting_url)

        # If show has transcripts available, scrape episodes
        if transcript_link:

            episode_id_start, speaker_id_start, phrase_id_start = crawl_show(\
                starting_url, transcript_link, db_cursor, db_connection,
                title, episode_id_start, speaker_id_start, phrase_id_start)
            db_connection.commit()

    return speaker_id_start, episode_id_start, phrase_id_start
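crawler_util.crawl_transcript does the phrase-level work for all three
networks. Only its signature and the returned (speaker_id, phrase_id)
counters are visible in these excerpts; the sketch below is a guess at the
rest, with invented speaker and phrase tables:

import re

def crawl_transcript(text, begin_flag, end_flag, episode_id,
                     speaker_id_start, phrase_id_start, db_cursor):
    """Split a transcript at speaker cues and store each phrase.

    Hypothetical reconstruction: the table layouts and the speaker
    dedup strategy are guesses.
    """
    # Trim boilerplate after the end marker, if one was given
    if end_flag:
        text = re.split(end_flag, text)[0]

    speakers = {}  # speaker cue -> assigned speaker id
    for turn in re.finditer(begin_flag, text):
        # Note: this keeps only the text on the cue's own line, and markers
        # like "(BEGIN VIDEOTAPE)" land here as colon-less "speakers"; a
        # real implementation would keep whole turns and filter markers
        name, _, phrase = turn.group(0).partition(":")
        name = name.strip()
        if name not in speakers:
            speakers[name] = speaker_id_start
            db_cursor.execute('INSERT INTO speaker VALUES(?, ?)',
                              (speaker_id_start, name))
            speaker_id_start += 1
        db_cursor.execute('INSERT INTO phrase VALUES(?, ?, ?, ?)',
                          (phrase_id_start, speakers[name], episode_id,
                           phrase.strip()))
        phrase_id_start += 1

    return speaker_id_start, phrase_id_start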
Example #9
def crawl_transcripts(starting_url, all_show_transcripts, db_cursor,
                      db_connection, title, episode_id_start, speaker_id_start,
                      phrase_id_start, index_start):
    '''
    Crawl all transcripts for a given show.

    Inputs:
        starting_url: URL to network page
        all_show_transcripts: list of transcripts
        db_cursor: DB cursor to perform SQL operations
        db_connection: connection to database
        title: title of show
        episode_id_start: (int) current episode ID
        speaker_id_start: (int) current speaker ID
        phrase_id_start: (int) current text clip ID
        index_start: index of transcript in list (to reference
            after clicking "Show More" button)
    '''

    year = None
    for transcript in all_show_transcripts[index_start:]:
        link = crawler_util.convert_if_relative_url(
            starting_url,
            transcript.find('a').get('href'))

        # Skip over non-transcripts:
        if "transcript" not in link:
            continue

        transcript_page_request = crawler_util.get_request(link)
        if transcript_page_request is None:
            continue
        transcript_page_text = transcript_page_request.text
        transcript_page_soup = bs4.BeautifulSoup(
            transcript_page_text, "html5lib")

        year, airtime = get_fox_transcript_date(transcript_page_soup)
        if year != LIMIT_YEAR:
            continue

        meta_data = transcript_page_soup.find("script",
                                              {"type": "application/ld+json"})
        meta_data_dict = json.loads("".join(meta_data.contents))
        headline = meta_data_dict['headline']

        db_cursor.execute('INSERT INTO episode VALUES(?, ?, ?, ?)',
                          (episode_id_start, headline, airtime, title))

        # https://stackoverflow.com/questions/54162988/how-to-find-a-tag-
        # without-specific-attribute-using-beautifulsoup
        transcript_raw_text = transcript_page_soup.find_all(
            'p', class_='speakable') + transcript_page_soup.find_all(
                'p', class_=None)

        transcript_text = crawler_util.join_text_chunks(transcript_raw_text)

        begin_flag = r'(\n[A-Z][^a-z\n]+?:|\(BEGIN .*?\)).*'
        end_flag = "Content and Programming Copyright.*"

        speaker_id_start, phrase_id_start = crawler_util.\
            crawl_transcript(transcript_text, begin_flag, end_flag,
                episode_id_start, speaker_id_start, phrase_id_start,
                db_cursor)

        db_connection.commit()
        episode_id_start += 1

    return year, len(all_show_transcripts), episode_id_start, \
        speaker_id_start, phrase_id_start
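get_fox_transcript_date is not shown. Since the page already carries an
application/ld+json block (used above for the headline), one plausible
implementation reads its schema.org datePublished field; the field name and
date format are assumptions, and the return shape matches how year and
airtime are used above:

import json

def get_fox_transcript_date(page_soup):
    """Return (year, airtime) for a Fox transcript page (hypothetical)."""
    meta_data = page_soup.find("script", {"type": "application/ld+json"})
    meta_data_dict = json.loads("".join(meta_data.contents))
    # Assumed field, e.g. "2020-03-15T17:00:00-04:00"
    published = meta_data_dict['datePublished']
    airtime = published[:10] + " " + published[11:16]  # "YYYY-MM-DD HH:MM"
    return int(published[:4]), airtime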