Example #1
import bs4

import crawler_util


def get_show_transcripts_page(show, starting_url):
    '''
    Get link to the show's transcript page.
    Returns None if there is no valid transcript.
    '''

    for show_link in show.find_all('a'):
        if show_link.get_text().strip() == "Transcripts":
            return crawler_util.convert_if_relative_url(
                starting_url, show_link.get('href'))

    first_show_link = crawler_util.convert_if_relative_url(
        starting_url,
        show.find('a').get('href'))

    # Go to first show link to find transcripts page
    show_page_request = crawler_util.get_request(first_show_link)
    if show_page_request is None:
        return None
    show_page_text = show_page_request.text
    show_page_soup = bs4.BeautifulSoup(show_page_text, "html5lib")
    subnav = show_page_soup.find('nav', class_='show-subnav')
    if subnav:
        for link in subnav.find_all('a'):
            if link.get_text() == "Transcripts":
                return crawler_util.convert_if_relative_url(
                    starting_url, link.get('href'))
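
A minimal usage sketch for the function above. The driver is hypothetical: it borrows the show-list markup from Example #2 and assumes each show sits in its own <li> container (get_show_transcripts_page calls show.find_all('a'), so show must be an element wrapping the links, not a single <a> tag).

import bs4

import crawler_util

starting_url = "http://www.msnbc.com/transcripts"
starting_soup = bs4.BeautifulSoup(
    crawler_util.get_request(starting_url).text, "html5lib")
for show in starting_soup.find('div', class_='item-list').find_all('li'):
    transcripts_page = get_show_transcripts_page(show, starting_url)
    if transcripts_page is not None:
        print(transcripts_page)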
Example #2
import bs4

import crawler_util


def go(db_cursor,
       db_connection,
       speaker_id_start=0,
       episode_id_start=0,
       phrase_id_start=0):
    '''
    Crawl transcripts of MSNBC shows and write them to the database.

    Returns the updated speaker, episode, and phrase ID counters so the
    next crawl can continue numbering where this one left off.
    '''

    starting_url = ("http://www.msnbc.com/transcripts")

    # Create soup object from starting page
    starting_request = crawler_util.get_request(starting_url)
    starting_text = starting_request.text
    starting_soup = bs4.BeautifulSoup(starting_text, "html5lib")
    show_list = starting_soup.find('div', class_='item-list').find_all('a')

    for show in show_list:
        link = show.get('href')
        if "/nav-" in link:
            continue
        title = show.get_text()
        transcripts_link = crawler_util.convert_if_relative_url(
            starting_url, link)

        episode_id_start, speaker_id_start, phrase_id_start = crawl_show(
            starting_url, transcripts_link, db_cursor, db_connection,
            title, episode_id_start, speaker_id_start, phrase_id_start)

    return speaker_id_start, episode_id_start, phrase_id_start
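
The crawler_util module itself is not shown in any of these examples. Below is a plausible sketch of the two helpers every example leans on, assuming they wrap urllib.parse.urljoin and requests; the real implementations may differ, and this only illustrates the contract the callers appear to rely on (Example #4 expects get_request to return None on failure).

import urllib.parse

import requests


def convert_if_relative_url(current_url, new_url):
    # Resolve a possibly-relative href against the page it was found on.
    if new_url is None:
        return None
    return urllib.parse.urljoin(current_url, new_url)


def get_request(url):
    # Fetch a page, returning None on any network error so that callers
    # can simply skip dead links.
    try:
        return requests.get(url, timeout=10)
    except requests.RequestException:
        return None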
Example #3
import bs4

import crawler_util


def go(db_cursor, db_connection, speaker_id_start=0, episode_id_start=0,
       phrase_id_start=0):
    '''
    Crawl transcripts of CNN shows and write them to the database.

    Returns the updated speaker, episode, and phrase ID counters so the
    next crawl can continue numbering where this one left off.
    '''

    starting_url = ("http://transcripts.cnn.com/TRANSCRIPTS/")

    # Create soup object from starting page
    starting_request = crawler_util.get_request(starting_url)
    starting_text = starting_request.text
    starting_soup = bs4.BeautifulSoup(starting_text, "html5lib")
    show_subsection = starting_soup.find_all(
        'span', class_='cnnSectBulletItems')

    show_dict = {}
    for section in show_subsection:
        for show in section.find_all('a'):

            title = show.get_text().strip()
            if title in show_dict:
                continue

            transcript_link = crawler_util.convert_if_relative_url(
                starting_url, show.get('href'))
            show_dict[title] = transcript_link

            episode_id_start, speaker_id_start, phrase_id_start = crawl_show(
                starting_url, transcript_link, db_cursor, db_connection,
                title, episode_id_start, speaker_id_start, phrase_id_start)
    db_connection.commit()

    return speaker_id_start, episode_id_start, phrase_id_start
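
Either go variant can be driven the same way. A minimal sketch, assuming a SQLite database whose show and episode tables already exist (see the schema sketch after Example #4):

import sqlite3

connection = sqlite3.connect("transcripts.db")
cursor = connection.cursor()
speaker_id, episode_id, phrase_id = go(cursor, connection)
connection.close()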
Example #4
import re

import bs4

import crawler_util

# LIMIT_YEAR (the earliest year of transcripts to keep) is a module-level
# constant defined elsewhere in the project, as is crawl_transcript.


def crawl_show(starting_url, transcript_link, db_cursor, db_connection, title,
               episode_id_start, speaker_id_start, phrase_id_start):
    '''
    Crawl all transcripts for a given show, for the requested time frame.

    Inputs:
        starting_url: (str) link to show page (to help give complete url for a
            transcript)
        transcript_link: (str) link to transcript page
        db_cursor, db_connection: database cursor and connection
        title: (str) name of show
        episode_id_start, speaker_id_start, phrase_id_start: (ints) numbers at
            which to start speakers, episodes, and phrases

    Outputs:
        episode_id_start, speaker_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled
    '''
    episode_id_init = episode_id_start

    # Create soup object from starting page
    transcripts_request = crawler_util.get_request(transcript_link)
    transcripts_text = transcripts_request.text
    articles_soup = bs4.BeautifulSoup(transcripts_text, "html5lib")
    transcripts_by_day = articles_soup.find('div', class_='cnnSectBulletItems')
    transcripts_raw_links = transcripts_by_day.find_all('a', href=True)

    # Prefer the title as printed on the transcript page over the link text.
    title = articles_soup.find('p', class_='cnnTransHead').get_text()

    # Listing pages run newest-first; keep walking day-by-day sections
    # until the transcripts are older than LIMIT_YEAR.
    most_recent_year = LIMIT_YEAR
    while most_recent_year >= LIMIT_YEAR:
        for article in transcripts_raw_links:
            headline = article.get_text()
            if "Did Not Air" in headline:
                continue
            link = crawler_util.convert_if_relative_url(
                starting_url, article.get('href'))
            link = re.sub("\n", "", link)
            transcript_request = crawler_util.get_request(link)
            if transcript_request is None:
                continue
            most_recent_year, episode_id_start, speaker_id_start, \
                phrase_id_start = crawl_transcript(link, db_cursor,
                    db_connection, title, headline, episode_id_start,
                    speaker_id_start, phrase_id_start)

        transcripts_by_day = transcripts_by_day.find_next_sibling(
            'div', class_='cnnSectBulletItems')
        if transcripts_by_day is None:
            break
        transcripts_raw_links = transcripts_by_day.find_all('a', href=True)

    if episode_id_start > episode_id_init:
        db_cursor.execute('INSERT INTO show VALUES(?, ?)', (title, 'CNN'))

    return episode_id_start, speaker_id_start, phrase_id_start
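
The INSERT statements in Examples #4-#6 imply a two-column show table and a four-column episode table. A guess at the corresponding DDL; the column names are invented here, and only the column counts are actually implied by the code:

import sqlite3

connection = sqlite3.connect("transcripts.db")
connection.executescript('''
    CREATE TABLE IF NOT EXISTS show(title TEXT, network TEXT);
    CREATE TABLE IF NOT EXISTS episode(episode_id INTEGER, headline TEXT,
                                       airtime TEXT, show_title TEXT);
''')
connection.close()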
Example #5
import bs4

import crawler_util

# LIMIT_YEAR and crawl_msnbc_transcript are defined elsewhere in the module.


def crawl_show(starting_url, transcripts_link, db_cursor, db_connection, title,
               episode_id_start, speaker_id_start, phrase_id_start):
    '''
    Crawl all transcripts for a given show, for the requested time frame.

    Inputs:
        starting_url: (str) link to show page (to help give complete url for a
            transcript)
        transcripts_link: (str) link to transcript page
        db_cursor, db_connection: database cursor and connection
        title: (str) name of show
        episode_id_start, speaker_id_start, phrase_id_start: (ints) numbers at
            which to start speakers, episodes, and phrases

    Outputs:
        episode_id_start, speaker_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled
    '''

    episode_id_init = episode_id_start

    # Create soup object from starting page
    transcripts_request = crawler_util.get_request(transcripts_link)
    transcripts_text = transcripts_request.text
    articles_soup = bs4.BeautifulSoup(transcripts_text, "html5lib")
    show_day = articles_soup.find('div', class_='transcript-item')

    # The link text ends with the air date; its last four characters
    # are the year.
    year = int(show_day.find('a').get_text()[-4:])
    while year >= LIMIT_YEAR:
        # Crawl transcript
        link = crawler_util.convert_if_relative_url(
            starting_url, show_day.find('a').get('href'))

        episode_id_start, speaker_id_start, phrase_id_start, \
            within_date_range = crawl_msnbc_transcript(
                link, db_cursor, db_connection, title, episode_id_start,
                speaker_id_start, phrase_id_start)

        if not within_date_range:
            break

        show_day = show_day.find_next('div', class_='transcript-item')
        if show_day is None:
            break
        year = int(show_day.find('a').get_text()[-4:])

    if episode_id_start > episode_id_init:
        db_cursor.execute('INSERT INTO show VALUES(?, ?)', (title, 'MSNBC'))

    return episode_id_start, speaker_id_start, phrase_id_start
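
The int(...[-4:]) parse above assumes every transcript link's text ends with a four-digit year. A quick check of that assumption on invented link text:

link_text = "Show name, 12/31/2018"  # hypothetical; real link text not shown
year = int(link_text[-4:])  # 2018; raises ValueError if the format differs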
Example #6
import json

import bs4

import crawler_util

# LIMIT_YEAR and get_fox_transcript_date are defined elsewhere in the module.


def crawl_transcripts(starting_url, all_show_transcripts, db_cursor,
                      db_connection, title, episode_id_start, speaker_id_start,
                      phrase_id_start, index_start):
    '''
    Crawl all transcripts for a given show.

    Inputs:
        starting_url: URL to network page
        all_show_transcripts: list of transcripts
        db_cursor: DB cursor to perform SQL operations
        db_connection: connection to database
        title: title of show
        episode_id_start: (int) current episode ID
        speaker_id_start: (int) current speaker ID
        phrase_id_start: (int) current text clip ID
        index_start: index of transcript in list (to reference
            after clicking "Show More" button)

    Outputs:
        year: (int) year of the last transcript visited (None if none)
        len(all_show_transcripts): (int) number of transcripts seen so far,
            to be passed back in as index_start on the next call
        episode_id_start, speaker_id_start, phrase_id_start: (ints) updated
            ID counters
    '''

    year = None
    for transcript in all_show_transcripts[index_start:]:
        link = crawler_util.convert_if_relative_url(
            starting_url, transcript.find('a').get('href'))

        # Skip over dead links and non-transcripts:
        if link is None or "transcript" not in link:
            continue

        transcript_page_request = crawler_util.get_request(link)
        if transcript_page_request is None:
            continue
        transcript_page_text = transcript_page_request.text
        transcript_page_soup = bs4.BeautifulSoup(
            transcript_page_text, "html5lib")

        year, airtime = get_fox_transcript_date(transcript_page_soup)
        if year != LIMIT_YEAR:
            continue

        meta_data = transcript_page_soup.find("script",
                                              {"type": "application/ld+json"})
        meta_data_dict = json.loads("".join(meta_data.contents))
        headline = meta_data_dict['headline']

        db_cursor.execute('INSERT INTO episode VALUES(?, ?, ?, ?)',
                          (episode_id_start, headline, airtime, title))

        # https://stackoverflow.com/questions/54162988/how-to-find-a-tag-without-specific-attribute-using-beautifulsoup
        transcript_raw_text = transcript_page_soup.find_all(
            'p', class_='speakable') + transcript_page_soup.find_all(
                'p', class_=None)

        transcript_text = crawler_util.join_text_chunks(transcript_raw_text)

        begin_flag = r'(\n[A-Z][^a-z^\n]+?:|\(BEGIN .*?\)).*'
        end_flag = "Content and Programming Copyright.*"

        speaker_id_start, phrase_id_start = crawler_util.crawl_transcript(
            transcript_text, begin_flag, end_flag, episode_id_start,
            speaker_id_start, phrase_id_start, db_cursor)

        db_connection.commit()
        episode_id_start += 1

    return year, len(all_show_transcripts), episode_id_start, \
        speaker_id_start, phrase_id_start
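
The begin_flag pattern trims everything before the first speaker cue (an ALL-CAPS name ending in a colon) or a (BEGIN ...) marker. A short demonstration on an invented transcript fragment; whether crawler_util.crawl_transcript applies re.DOTALL is an assumption here:

import re

begin_flag = r'(\n[A-Z][^a-z^\n]+?:|\(BEGIN .*?\)).*'
sample = ("Promotional text before the transcript.\n"
          "HOST, FOX NEWS: Good evening.\n"
          "GUEST: Thanks for having me.")
match = re.search(begin_flag, sample, re.DOTALL)
print(match.group(0))  # the transcript body, starting at the speaker cue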