def nav_scrape(row): """DataFrame wrapper for inserting scraped text into appropriate column.""" if (row.name + 1) % 3 == 0: self.driver.quit() self.driver = ScraperBase(fresh=False).driver print(f'Scraping Source #{row.name}', end='\r') hyperlink = row['hyperlink'] self.driver.get(hyperlink) self.driver.implicitly_wait(1) raw_html = BeautifulSoup(self.driver.page_source, features='lxml') if raw_html.find('p', {'class': 'story-body-text'}): text = ' '.join([ p.get_text() for p in raw_html.find_all( 'p', {'class': 'story-body-text'}) ]) return text elif raw_html.find('p', {'class': 'css-18icg9x'}): text = ' '.join([ p.get_text() for p in raw_html.find_all('p', {'class': 'css-18icg9x'}) ]) return text
def nav_scrape(row, iteration): """DataFrame wrapper for inserting scraped text into appropriate column.""" if (row.name + 1) % 30 == 0: self.driver.quit() self.driver = ScraperBase(fresh=False).driver print(f'Scraping source #{row.name}', end='\r') hyperlink = row['hyperlink'] self.driver.get(hyperlink) self.driver.implicitly_wait(1) raw_html = BeautifulSoup(self.driver.page_source, features='lxml') if raw_html.find('div', {'class': 'story-body__inner'}) or raw_html.find( 'div', {'property': 'articleBody'}): story = raw_html.find('div', {'class': 'story-body__inner'}) story = raw_html.find('div', {'property': 'articleBody'}) text = ' '.join([p.get_text() for p in story.find_all('p')]) return text elif raw_html.find('div', {'class': 'synopsis-toggle__long'}): story = raw_html.find('div', {'class': 'synopsis-toggle__long'}) text = ' '.join([p.get_text() for p in story.find_all('p')]) return text elif raw_html.find('div', {'class': 'vxp-media__summary'}): story = raw_html.find('div', {'class': 'vxp-media__summary'}) text = ' '.join([ p.get_text() for p in story.find_all('p') if not p.findChild('i') ]) return text else: if iteration == 1: nav_scrape(row, 2)
def __init__(self): self.base = ScraperBase(fresh=True) self.driver = self.base.driver self.query_terms = self.base.query_terms
import time

import pandas as pd
from bs4 import BeautifulSoup

# ScraperBase and Actions are assumed to come from this project's own
# helper module: ScraperBase owns the Selenium driver and the query terms,
# and Actions wraps click-and-wait interactions.


class NYTScraper:
    """Selenium connection to NYT search page with bound methods."""

    def __init__(self):
        self.base = ScraperBase(fresh=True)
        self.driver = self.base.driver
        self.query_terms = self.base.query_terms

    def query_nyt(self):
        """Capture URL addresses for search results."""
        query_base = 'https://www.nytimes.com/search?query={}{}{}'
        query_string = query_base.format(*self.query_terms)
        self.driver.get(query_string)
        self.driver.implicitly_wait(20)
        try:
            # If the search resolves to a curated topic page, follow it and
            # scroll until the page height stops growing (no more results).
            topic = True
            nyt_topic = self.driver.find_elements_by_class_name(
                'css-107jdae')[0]
            topic_page = nyt_topic.find_element_by_tag_name('a')
            page_url = topic_page.get_attribute('href')
            self.driver.get(page_url)
            btns = self.driver.find_elements_by_tag_name('button')
            nyt_more_btn = [
                btn for btn in btns if btn.text.lower() == 'show more'
            ]
            Actions(self.driver).click(nyt_more_btn[0]).wait(1).perform()
            current_browser_height = self.driver.execute_script(
                'return document.body.scrollHeight')
            while topic:
                self.driver.execute_script(
                    'window.scrollTo(0, document.body.scrollHeight);')
                time.sleep(3.5)
                new_browser_height = self.driver.execute_script(
                    'return document.body.scrollHeight')
                if new_browser_height == current_browser_height:
                    break
                current_browser_height = new_browser_height
        except Exception:
            # Plain search results page: click 'Show more' repeatedly to
            # load additional results.
            topic = False
            for i in range(0, 80):
                try:
                    btns = self.driver.find_elements_by_tag_name('button')
                    nyt_more_btn = [
                        btn for btn in btns if btn.text.lower() == 'show more'
                    ]
                    Actions(self.driver).click(
                        nyt_more_btn[0]).wait(1).perform()
                except Exception:
                    pass
        raw_html = BeautifulSoup(self.driver.page_source, features='lxml')
        return (raw_html, topic)

    def story_extractor(self, raw_html, topic):
        """Extract dictionary of articles and publication dates."""
        if topic:
            stories = raw_html.find('div', {'class': 'stream'}).find_all('li')
            stories = [
                story for story in stories if 'story-id' in story.get('id')
            ]
            published_dates = [
                article.findChild('time', recursive=True)['datetime']
                for article in stories
            ]
            headlines = [
                article.findChild('h2', recursive=True).get_text().strip()
                for article in stories
            ]
            hyperlinks = [
                article.findChild('a', recursive=True)['href']
                for article in stories
            ]
        else:
            stories = raw_html.find_all('li', {'class': 'css-1l4w6pd'})
            published_dates = [
                article.findChild('time', recursive=True).get_text()
                for article in stories
            ]
            headlines = [
                article.findChild('h4', recursive=True).get_text().strip()
                for article in stories
            ]
            hyperlinks = [
                'https://www.nytimes.com' +
                article.findChild('a', recursive=True)['href']
                for article in stories
            ]
        query = [self.query_terms for i in range(0, len(headlines))]
        source = ['nyt' for i in range(0, len(headlines))]
        index = list(range(0, len(headlines)))
        # Keyed by row index; values ordered to match the content DataFrame
        # columns: [headline, hyperlink, published, query, source].
        elements_extracted = {
            a[0]: [a[4], a[5], a[3], a[2], a[1]]
            for a in zip(index, source, query, published_dates, headlines,
                         hyperlinks)
        }
        print(f'Keyword search results: {len(elements_extracted)}')
        return elements_extracted

    def content_fetcher(self, elements_extracted):
        """Navigate to hyperlink and fetch article text."""

        def nav_scrape(row):
            """DataFrame wrapper for inserting scraped text into appropriate column."""
            # Restart the driver every third article to keep the session fresh.
            if (row.name + 1) % 3 == 0:
                self.driver.quit()
                self.driver = ScraperBase(fresh=False).driver
            print(f'Scraping source #{row.name}', end='\r')
            hyperlink = row['hyperlink']
            self.driver.get(hyperlink)
            self.driver.implicitly_wait(1)
            raw_html = BeautifulSoup(self.driver.page_source, features='lxml')
            # Older NYT articles use 'story-body-text' paragraphs; newer
            # ones use 'css-18icg9x'.
            if raw_html.find('p', {'class': 'story-body-text'}):
                return ' '.join([
                    p.get_text() for p in raw_html.find_all(
                        'p', {'class': 'story-body-text'})
                ])
            elif raw_html.find('p', {'class': 'css-18icg9x'}):
                return ' '.join([
                    p.get_text()
                    for p in raw_html.find_all('p', {'class': 'css-18icg9x'})
                ])

        content_dataframe = pd.DataFrame.from_dict(
            elements_extracted,
            orient='index',
            columns=['headline', 'hyperlink', 'published', 'query', 'source'])
        content_dataframe['content'] = content_dataframe.apply(nav_scrape,
                                                               axis=1)
        return content_dataframe
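# Usage sketch (an assumption, not part of the original module): chain the
# NYTScraper steps end to end. Only this wrapper function is new; the
# methods it calls are defined above.
def run_nyt_pipeline():
    """Run query -> extract -> fetch for the NYT; return the content DataFrame."""
    scraper = NYTScraper()
    raw_html, topic = scraper.query_nyt()
    elements_extracted = scraper.story_extractor(raw_html, topic)
    return scraper.content_fetcher(elements_extracted)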
class BBCScraper:
    """Selenium connection to BBC search page with bound methods."""

    def __init__(self):
        self.base = ScraperBase(fresh=True)
        self.driver = self.base.driver
        self.query_terms = self.base.query_terms

    def query_bbc(self):
        """Capture URL addresses for search results."""
        query_base = 'https://www.bbc.co.uk/search?q={}{}{}&filter=news'
        query_string = query_base.format(*self.query_terms)
        self.driver.get(query_string)
        self.driver.implicitly_wait(15)
        # Click the 'more' pager repeatedly to load additional result pages.
        for i in range(0, 50):
            try:
                bbc_more_btn = self.driver.find_elements_by_class_name('more')
                Actions(self.driver).click(bbc_more_btn[0]).wait(.6).perform()
            except Exception:
                pass
        raw_html = BeautifulSoup(self.driver.page_source, features='lxml')
        return raw_html

    def story_extractor(self, raw_html):
        """Extract dictionary of articles and publication dates."""
        results = raw_html.find_all(
            lambda tag: tag.has_attr('data-result-number'))
        published_dates = [
            article.findChild('time', recursive=True)['datetime']
            for article in results
        ]
        headlines = [
            article.findChild('h1', recursive=True).find('a').get_text()
            for article in results
        ]
        hyperlinks = [
            article.findChild('a', recursive=True)['href']
            for article in results
        ]
        query = [self.query_terms for i in range(0, len(headlines))]
        source = ['bbc' for i in range(0, len(headlines))]
        index = list(range(0, len(headlines)))
        # Keyed by row index; values ordered to match the content DataFrame
        # columns: [headline, hyperlink, published, query, source].
        elements_extracted = {
            a[0]: [a[4], a[5], a[3], a[2], a[1]]
            for a in zip(index, source, query, published_dates, headlines,
                         hyperlinks)
        }
        print(f'Keyword search results: {len(elements_extracted)}')
        return elements_extracted

    def content_fetcher(self, elements_extracted):
        """Navigate to hyperlink and fetch article text."""

        def nav_scrape(row, iteration):
            """DataFrame wrapper for inserting scraped text into appropriate column."""
            # Restart the driver every 30 articles to keep the session fresh.
            if (row.name + 1) % 30 == 0:
                self.driver.quit()
                self.driver = ScraperBase(fresh=False).driver
            print(f'Scraping source #{row.name}', end='\r')
            hyperlink = row['hyperlink']
            self.driver.get(hyperlink)
            self.driver.implicitly_wait(1)
            raw_html = BeautifulSoup(self.driver.page_source, features='lxml')
            # BBC article bodies appear under several layouts; try each in
            # turn and join the paragraph text.
            if (raw_html.find('div', {'class': 'story-body__inner'})
                    or raw_html.find('div', {'property': 'articleBody'})):
                # Use whichever body container is present.
                story = (raw_html.find('div', {'class': 'story-body__inner'})
                         or raw_html.find('div', {'property': 'articleBody'}))
                return ' '.join([p.get_text() for p in story.find_all('p')])
            elif raw_html.find('div', {'class': 'synopsis-toggle__long'}):
                story = raw_html.find('div',
                                      {'class': 'synopsis-toggle__long'})
                return ' '.join([p.get_text() for p in story.find_all('p')])
            elif raw_html.find('div', {'class': 'vxp-media__summary'}):
                story = raw_html.find('div', {'class': 'vxp-media__summary'})
                return ' '.join([
                    p.get_text() for p in story.find_all('p')
                    if not p.findChild('i')
                ])
            elif iteration == 1:
                # Retry once in case the page had not finished rendering.
                return nav_scrape(row, 2)

        content_dataframe = pd.DataFrame.from_dict(
            elements_extracted,
            orient='index',
            columns=['headline', 'hyperlink', 'published', 'query', 'source'])
        content_dataframe['content'] = content_dataframe.apply(nav_scrape,
                                                               iteration=1,
                                                               axis=1)
        return content_dataframe
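# Usage sketch (an assumption, not part of the original module): the BBC
# pipeline mirrors the NYT one, minus the topic-page branch. The __main__
# guard and the run_* wrapper names are illustrative only.
def run_bbc_pipeline():
    """Run query -> extract -> fetch for the BBC; return the content DataFrame."""
    scraper = BBCScraper()
    raw_html = scraper.query_bbc()
    elements_extracted = scraper.story_extractor(raw_html)
    return scraper.content_fetcher(elements_extracted)


if __name__ == '__main__':
    nyt_articles = run_nyt_pipeline()
    bbc_articles = run_bbc_pipeline()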