def crawl_show(starting_url, transcript_link, db_cursor, db_connection, title,
               episode_id_start, speaker_id_start, phrase_id_start):
    '''
    Crawl all transcripts for a given show, for the requested time frame.

    Inputs:
        starting_url: (str) link to show page (to help give complete url
            for a transcript)
        transcript_link: (str) link to transcript page
        db_cursor, db_connection: database cursor and connection
        title: (str) name of show
        episode_id_start, speaker_id_start, phrase_id_start: (ints) numbers
            at which to start episodes, speakers, and phrases

    Outputs:
        episode_id_start, speaker_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled
    '''
    episode_id_init = episode_id_start

    # Create soup object from starting page
    transcripts_request = crawler_util.get_request(transcript_link)
    transcripts_text = transcripts_request.text
    articles_soup = bs4.BeautifulSoup(transcripts_text, "html5lib")

    transcripts_by_day = articles_soup.find('div', class_='cnnSectBulletItems')
    transcripts_raw_links = transcripts_by_day.find_all('a', href=True)
    # Use the show title as it appears on the transcripts page
    title = articles_soup.find('p', class_='cnnTransHead').get_text()

    most_recent_year = LIMIT_YEAR
    keep_crawling = True
    while most_recent_year >= LIMIT_YEAR and keep_crawling:
        for article in transcripts_raw_links:
            headline = article.get_text()
            if "Did Not Air" in headline:
                continue
            link = crawler_util.convert_if_relative_url(
                starting_url, article.get('href'))
            link = re.sub("\n", "", link)
            transcript_request = crawler_util.get_request(link)
            if transcript_request is None:
                continue
            # crawl_transcript also returns a keep-crawling flag, which is
            # False once transcripts older than MIN_DATE are reached
            most_recent_year, episode_id_start, speaker_id_start, \
                phrase_id_start, keep_crawling = crawl_transcript(
                    link, db_cursor, db_connection, title, headline,
                    episode_id_start, speaker_id_start, phrase_id_start)
            if not keep_crawling:
                break
        transcripts_by_day = transcripts_by_day.find_next_sibling(
            'div', class_='cnnSectBulletItems')
        if transcripts_by_day is None:
            break
        transcripts_raw_links = transcripts_by_day.find_all('a', href=True)

    if episode_id_start > episode_id_init:
        db_cursor.execute('INSERT INTO show VALUES(?, ?)',
                          (title, 'CNN')).fetchall()

    return episode_id_start, speaker_id_start, phrase_id_start


def get_show_transcripts_page(show, starting_url):
    '''
    Get link to the show's transcript page. Returns None if no transcripts
    page can be found for the show.
    '''
    for show_link in show.find_all('a'):
        if show_link.get_text().strip() == "Transcripts":
            return crawler_util.convert_if_relative_url(
                starting_url, show_link.get('href'))

    first_show_link = crawler_util.convert_if_relative_url(
        starting_url, show.find('a').get('href'))

    # Go to the first show link to look for a transcripts page
    show_page_request = crawler_util.get_request(first_show_link)
    show_page_text = show_page_request.text
    show_page_soup = bs4.BeautifulSoup(show_page_text, "html5lib")

    subnav = show_page_soup.find('nav', class_='show-subnav')
    if subnav:
        for link in subnav.find_all('a'):
            if link.get_text() == "Transcripts":
                return crawler_util.convert_if_relative_url(
                    starting_url, link.get('href'))


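# Illustration only: crawler_util.convert_if_relative_url is defined elsewhere
# in this repo and is not reproduced here. The standard-library urljoin below
# shows the same idea of completing a relative href against the page it came
# from; the href used is a hypothetical example, not a real Fox URL.
def _example_convert_relative_url():
    from urllib.parse import urljoin

    starting_url = "https://www.foxnews.com/shows"
    href = "/category/shows/example-show/transcript"   # hypothetical href
    return urljoin(starting_url, href)
    # -> "https://www.foxnews.com/category/shows/example-show/transcript"

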
def go(db_cursor, db_connection, speaker_id_start=0, episode_id_start=0,
       phrase_id_start=0):
    '''
    Crawls the MSNBC transcripts site and updates the database of
    transcripts, speakers, shows, and episodes. This function modifies the
    input database.

    Inputs:
        db_cursor, db_connection: cursor and connection to database
        speaker_id_start, episode_id_start, phrase_id_start: (ints) numbers
            at which to start speakers, episodes, and phrases

    Outputs:
        speaker_id_start, episode_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled
    '''
    starting_url = "http://www.msnbc.com/transcripts"

    # Create soup object from starting page
    starting_request = crawler_util.get_request(starting_url)
    starting_text = starting_request.text
    starting_soup = bs4.BeautifulSoup(starting_text, "html5lib")

    show_list = starting_soup.find('div', class_='item-list').find_all('a')

    # Note: only the first show in the list is crawled here; swap in the
    # commented-out loop to crawl every show.
    # for show in show_list:
    for show in show_list[0:1]:
        link = show.get('href')
        if "/nav-" in link:
            continue
        title = show.get_text()
        transcripts_link = crawler_util.convert_if_relative_url(
            starting_url, show.get('href'))
        episode_id_start, speaker_id_start, phrase_id_start = crawl_show(
            starting_url, transcripts_link, db_cursor, db_connection, title,
            episode_id_start, speaker_id_start, phrase_id_start)

    return speaker_id_start, episode_id_start, phrase_id_start


def go(db_cursor, db_connection, speaker_id_start=0, episode_id_start=0,
       phrase_id_start=0):
    '''
    Crawls the CNN transcripts site and updates the database of transcripts,
    speakers, shows, and episodes. This function modifies the input database.

    Inputs:
        db_cursor, db_connection: cursor and connection to database
        speaker_id_start, episode_id_start, phrase_id_start: (ints) numbers
            at which to start speakers, episodes, and phrases

    Outputs:
        speaker_id_start, episode_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled
    '''
    starting_url = "http://transcripts.cnn.com/TRANSCRIPTS/"

    # Create soup object from starting page
    starting_request = crawler_util.get_request(starting_url)
    starting_text = starting_request.text
    starting_soup = bs4.BeautifulSoup(starting_text, "html5lib")

    show_subsection = starting_soup.find_all('span',
                                             class_='cnnSectBulletItems')

    show_dict = {}
    # Note: the slices limit the crawl to the first section and first show;
    # remove them to crawl every listed show.
    for section in show_subsection[0:1]:
        for show in section.find_all('a')[0:1]:
            title = show.get_text().strip()
            if title in show_dict:
                continue
            transcript_link = crawler_util.convert_if_relative_url(
                starting_url, show.get('href'))
            show_dict[title] = transcript_link
            episode_id_start, speaker_id_start, phrase_id_start = crawl_show(
                starting_url, transcript_link, db_cursor, db_connection,
                title, episode_id_start, speaker_id_start, phrase_id_start)

    db_connection.commit()

    return speaker_id_start, episode_id_start, phrase_id_start


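# Minimal driver sketch. go() assumes the 'show' and 'episode' tables already
# exist; the column names below are assumptions inferred from the INSERT
# statements in this repo (show takes 2 values, episode takes 4). Running
# go() also requires whatever speaker/phrase tables crawler_util's
# crawl_transcript writes to, whose schemas do not appear in this file, so
# this is a partial sketch rather than a complete setup script.
def _example_run_crawler(db_path="transcripts.db"):
    import sqlite3

    db_connection = sqlite3.connect(db_path)
    db_cursor = db_connection.cursor()
    db_cursor.execute('''CREATE TABLE IF NOT EXISTS show
                         (title TEXT, network TEXT)''')
    db_cursor.execute('''CREATE TABLE IF NOT EXISTS episode
                         (id INTEGER, headline TEXT, airtime TEXT,
                          show_title TEXT)''')
    speaker_id, episode_id, phrase_id = go(db_cursor, db_connection)
    db_connection.close()
    return speaker_id, episode_id, phrase_id

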
def crawl_msnbc_transcript(link, db_cursor, db_connection, title,
                           episode_id_start, speaker_id_start,
                           phrase_id_start):
    '''
    Crawl a single MSNBC transcript and store its episode, speakers, and
    phrases in the database.

    Outputs:
        episode_id_start, speaker_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled
        within_date_range: (bool) False if the transcript falls outside
            LIMIT_YEAR, so the caller can stop crawling older transcripts
    '''
    transcript_request = crawler_util.get_request(link)
    transcript_text = transcript_request.text
    article_soup = bs4.BeautifulSoup(transcript_text, "html5lib")

    year, airtime = get_msnbc_transcript_date(article_soup)
    if year != LIMIT_YEAR:
        # Transcript is outside the requested time frame
        return episode_id_start, speaker_id_start, phrase_id_start, False

    headline_raw = article_soup.find(
        'meta', property="nv:title").get('content')
    headline = re.search("(.*?) TRANSCRIPT", headline_raw).group(1)
    db_cursor.execute('INSERT INTO episode VALUES(?, ?, ?, ?)',
                      (episode_id_start, headline, airtime, title))

    transcript_raw_text = article_soup.find(
        'div', itemprop="articleBody").find_all('p')
    transcript_text = crawler_util.join_text_chunks(transcript_raw_text)

    begin_flag = ".*"
    end_flag = "(THIS IS A RUSH TRANSCRIPT|Copyright 2020).*"
    speaker_id_start, phrase_id_start = crawler_util.crawl_transcript(
        transcript_text, begin_flag, end_flag, episode_id_start,
        speaker_id_start, phrase_id_start, db_cursor)
    db_connection.commit()
    episode_id_start += 1

    return episode_id_start, speaker_id_start, phrase_id_start, True


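# Sketch of the headline extraction above, run on a made-up nv:title value.
# The real MSNBC meta tag content is not reproduced here; this only assumes
# it has the shape "<headline> TRANSCRIPT <rest>", which is what the
# non-greedy pattern relies on.
def _example_msnbc_headline():
    import re

    headline_raw = "Example Show with Host Name TRANSCRIPT 3/13/20"  # hypothetical
    headline = re.search("(.*?) TRANSCRIPT", headline_raw).group(1)
    return headline   # -> "Example Show with Host Name"

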
def crawl_transcript(link, db_cursor, db_connection, title, headline,
                     episode_id_start, speaker_id_start, phrase_id_start):
    '''
    Crawl a single CNN transcript.

    Inputs:
        link: URL to transcript page
        db_cursor: DB cursor to perform SQL operations
        db_connection: connection to database
        title: title of show
        headline: name of article
        episode_id_start: (int) current episode ID
        speaker_id_start: (int) current speaker ID
        phrase_id_start: (int) current text clip ID

    Outputs:
        year: (int) year the transcript aired
        episode_id_start, speaker_id_start, phrase_id_start: updated IDs
        keep_crawling: (bool) False once transcripts older than MIN_DATE
            are reached
    '''
    transcript_request = crawler_util.get_request(link)
    transcript_text = transcript_request.text
    article_soup = bs4.BeautifulSoup(transcript_text, "html5lib")
    subheading = article_soup.find('p', class_="cnnTransSubHead")

    year, airtime = get_cnn_transcript_date(article_soup)
    py_date = datetime.date(year, int(airtime[5:7]), int(airtime[8:10]))
    if py_date >= MAX_DATE:
        return year, episode_id_start, speaker_id_start, phrase_id_start, True
    elif py_date <= MIN_DATE:
        return year, episode_id_start, speaker_id_start, phrase_id_start, False
    assert MIN_DATE <= py_date <= MAX_DATE

    # Skip a specific transcript that should not be recorded
    if (headline == "White House Coronavirus Update; Federal Reserve Cuts "
                    "Rate To Zero; Coronavirus Testing Available To All 50 "
                    "States. Aired 5-6p ET"
            and airtime == "2020-03-15 17:00"):
        return year, episode_id_start, speaker_id_start, phrase_id_start, True

    if year != LIMIT_YEAR or "Did Not Air" in subheading.get_text():
        return year, episode_id_start, speaker_id_start, phrase_id_start, True

    # Convert <br> tags to newlines so speaker turns stay on separate lines
    for br in article_soup.find_all("br"):
        br.replace_with("\n")
    transcript_text = article_soup.find_all('p', 'cnnBodyText')[2].getText()

    begin_flag = r'(\n[A-Z][^a-z^\n]+?:|\(BEGIN .*?\)).*'
    end_flag = ""
    phrase_id_init = phrase_id_start
    speaker_id_start, phrase_id_start = crawler_util.crawl_transcript(
        transcript_text, begin_flag, end_flag, episode_id_start,
        speaker_id_start, phrase_id_start, db_cursor)

    # Only record the episode if at least one phrase was extracted
    if phrase_id_init != phrase_id_start:
        db_cursor.execute('INSERT INTO episode VALUES(?, ?, ?, ?)',
                          (episode_id_start, headline, airtime, title))
        episode_id_start += 1
        db_connection.commit()

    return int(year), episode_id_start, speaker_id_start, phrase_id_start, True


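# Sketch of what the begin_flag pattern above matches, run on a hypothetical
# excerpt (not a real CNN transcript). crawler_util.crawl_transcript is not
# shown in this file; this only demonstrates the pattern itself: an all-caps
# "SPEAKER:" tag at the start of a line, or a "(BEGIN ...)" marker.
def _example_cnn_begin_flag():
    import re

    begin_flag = r'(\n[A-Z][^a-z^\n]+?:|\(BEGIN .*?\)).*'
    sample = ("\nHOST: Good evening, thanks for joining us."
              "\n(BEGIN VIDEO CLIP)"
              "\nSMITH, EXAMPLE CORRESPONDENT: The testing is expanding.")
    return [match.group(1) for match in re.finditer(begin_flag, sample)]
    # -> ['\nHOST:', '(BEGIN VIDEO CLIP)', '\nSMITH, EXAMPLE CORRESPONDENT:']

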
def crawl_show(starting_url, transcripts_link, db_cursor, db_connection,
               title, episode_id_start, speaker_id_start, phrase_id_start):
    '''
    Crawl all transcripts for a given show, for the requested time frame.

    Inputs:
        starting_url: (str) link to show page (to help give complete url
            for a transcript)
        transcripts_link: (str) link to transcript page
        db_cursor, db_connection: database cursor and connection
        title: (str) name of show
        episode_id_start, speaker_id_start, phrase_id_start: (ints) numbers
            at which to start episodes, speakers, and phrases

    Outputs:
        episode_id_start, speaker_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled
    '''
    episode_id_init = episode_id_start

    # Create soup object from starting page
    transcripts_request = crawler_util.get_request(transcripts_link)
    transcripts_text = transcripts_request.text
    articles_soup = bs4.BeautifulSoup(transcripts_text, "html5lib")

    show_day = articles_soup.find('div', class_='transcript-item')
    year = int(show_day.find('a').get_text()[-4:])

    while year >= LIMIT_YEAR:
        # Crawl transcript
        link = crawler_util.convert_if_relative_url(
            starting_url, show_day.find('a').get('href'))
        episode_id_start, speaker_id_start, phrase_id_start, \
            within_date_range = crawl_msnbc_transcript(
                link, db_cursor, db_connection, title, episode_id_start,
                speaker_id_start, phrase_id_start)
        if not within_date_range:
            break
        show_day = show_day.find_next('div', class_='transcript-item')
        if show_day is None:
            break
        year = int(show_day.find('a').get_text()[-4:])

    if episode_id_start > episode_id_init:
        db_cursor.execute('INSERT INTO show VALUES(?, ?)',
                          (title, 'MSNBC')).fetchall()

    return episode_id_start, speaker_id_start, phrase_id_start


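# The loop above assumes the visible text of each transcript-item link ends
# with a four-digit year; this made-up anchor text shows the slice it relies
# on. The real MSNBC link text format is not reproduced here.
def _example_msnbc_year_from_link_text():
    link_text = "Example Show Transcript, 3/13/2020"   # hypothetical
    return int(link_text[-4:])   # -> 2020

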
def go(db_cursor, db_connection, speaker_id_start=0, episode_id_start=0,
       phrase_id_start=0):
    '''
    Crawls the Fox transcripts site and updates database of transcripts,
    speakers, titles, shows, and episodes. This function modifies the
    input database.

    Inputs:
        db_cursor, db_connection: cursor and connection to database
        speaker_id_start, episode_id_start, phrase_id_start: (ints) numbers
            at which to start speakers, episodes, and phrases

    Outputs:
        episode_id_start, speaker_id_start, phrase_id_start: (ints) updated
            starting points for the next transcript to be crawled
    '''
    starting_url = "https://www.foxnews.com/shows"

    # Create soup object from starting page
    starting_request = crawler_util.get_request(starting_url)
    starting_text = starting_request.text
    starting_soup = bs4.BeautifulSoup(starting_text, "html5lib")

    # Get list of shows to loop through
    show_info_list = starting_soup.find_all('li', class_='showpage')

    # Note: only the first show is crawled here; remove the slice to crawl
    # every listed show.
    for show in show_info_list[0:1]:
        title = show.find('h2', class_='title').get_text().strip()
        print(title)
        transcript_link = get_show_transcripts_page(show, starting_url)
        # If show has transcripts available, scrape episodes
        if transcript_link:
            episode_id_start, speaker_id_start, phrase_id_start = crawl_show(
                starting_url, transcript_link, db_cursor, db_connection,
                title, episode_id_start, speaker_id_start, phrase_id_start)

    db_connection.commit()

    return speaker_id_start, episode_id_start, phrase_id_start


def crawl_transcripts(starting_url, all_show_transcripts, db_cursor,
                      db_connection, title, episode_id_start,
                      speaker_id_start, phrase_id_start, index_start):
    '''
    Crawl all transcripts for a given show.

    Inputs:
        starting_url: URL to network page
        all_show_transcripts: list of transcript entries
        db_cursor: DB cursor to perform SQL operations
        db_connection: connection to database
        title: title of show
        episode_id_start: (int) current episode ID
        speaker_id_start: (int) current speaker ID
        phrase_id_start: (int) current text clip ID
        index_start: index of transcript in list (to reference after
            clicking "Show More" button)

    Outputs:
        year: year of the last transcript visited (None if none were)
        len(all_show_transcripts): index at which to resume crawling
        episode_id_start, speaker_id_start, phrase_id_start: updated IDs
    '''
    year = None
    for transcript in all_show_transcripts[index_start:]:
        link = crawler_util.convert_if_relative_url(
            starting_url, transcript.find('a').get('href'))
        # Skip over non-transcripts
        if "transcript" not in link:
            continue
        transcript_page_request = crawler_util.get_request(link)
        transcript_page_text = transcript_page_request.text
        transcript_page_soup = bs4.BeautifulSoup(transcript_page_text,
                                                 "html5lib")

        year, airtime = get_fox_transcript_date(transcript_page_soup)
        if year != LIMIT_YEAR:
            continue

        meta_data = transcript_page_soup.find(
            "script", {"type": "application/ld+json"})
        meta_data_dict = json.loads("".join(meta_data.contents))
        headline = meta_data_dict['headline']
        db_cursor.execute('INSERT INTO episode VALUES(?, ?, ?, ?)',
                          (episode_id_start, headline, airtime, title))

        # https://stackoverflow.com/questions/54162988/how-to-find-a-tag-
        # without-specific-attribute-using-beautifulsoup
        transcript_raw_text = (
            transcript_page_soup.find_all('p', class_='speakable') +
            transcript_page_soup.find_all('p', class_=None))
        transcript_text = crawler_util.join_text_chunks(transcript_raw_text)

        begin_flag = r'(\n[A-Z][^a-z^\n]+?:|\(BEGIN .*?\)).*'
        end_flag = "Content and Programming Copyright.*"
        speaker_id_start, phrase_id_start = crawler_util.crawl_transcript(
            transcript_text, begin_flag, end_flag, episode_id_start,
            speaker_id_start, phrase_id_start, db_cursor)
        db_connection.commit()
        episode_id_start += 1

    return year, len(all_show_transcripts), episode_id_start, \
        speaker_id_start, phrase_id_start


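# Sketch of the JSON-LD metadata lookup above, run on a made-up page fragment.
# Real Fox transcript pages embed a much larger application/ld+json block;
# the headline and date below are placeholders, not real values.
def _example_fox_jsonld_headline():
    import json
    import bs4

    html = ('<script type="application/ld+json">'
            '{"@type": "NewsArticle",'
            ' "headline": "Example Show Transcript: Aired 5-6p ET",'
            ' "datePublished": "2020-03-15T17:00:00-04:00"}'
            '</script>')
    soup = bs4.BeautifulSoup(html, "html5lib")
    meta_data = soup.find("script", {"type": "application/ld+json"})
    meta_data_dict = json.loads("".join(meta_data.contents))
    return meta_data_dict['headline']
    # -> "Example Show Transcript: Aired 5-6p ET"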