# WSJ scraper (Selenium-based variant)
from bs4 import BeautifulSoup


def extract_info(tab, driver, url):
    # Skip extraction if the article already exists in the Mongo table
    if already_exists(tab, url):
        return False, 'already exists'

    # Get the html from the site and create a BeautifulSoup object from it
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    try:
        headline = parse_str(soup.find('h1', attrs={
            'class': 'wsj-article-headline',
            'itemprop': 'headline'}).text)
    except Exception:
        print('WARNING: Error extracting headline')
        return False, ''

    # Corrections pages carry no article text worth keeping
    if headline == 'Corrections & Amplifications':
        return False, ''

    try:
        date_published = soup.find('time', attrs={
            'class': 'timestamp'}).text.replace(
            '\n', '').replace('Updated', '').strip()
    except Exception:
        print('WARNING: Error extracting date_published')
        print(url)
        return False, ''

    try:
        author = soup.find('span', attrs={
            'class': 'name', 'itemprop': 'name'}).text
    except Exception:
        author = None

    try:
        tag = soup.find('div', attrs={
            'id': 'wsj-article-wrap',
            'itemprop': 'articleBody'}).findAll('p')
        article_text = parse_str(' \n '.join([line.text for line in tag]))
    except Exception:
        print('WARNING: Error extracting article text')
        print(url)
        return False, ''

    insert = {'url': url,
              'source': 'wsj',
              'headline': headline,
              'date_published': date_published,
              'author': author,
              'article_text': article_text}
    return True, insert
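# The extractors in this module lean on two helpers that are not shown
# here: already_exists and parse_str. Below is a minimal sketch of what
# they might look like, assuming `tab` is a PyMongo collection and that
# parse_str only needs to normalize unicode and strip whitespace; the
# real implementations may differ.
import unicodedata


def already_exists(tab, url):
    # find_one returns None when no document matches the url
    return tab.find_one({'url': url}) is not None


def parse_str(text):
    # Normalize unicode (e.g. smart quotes) and strip surrounding whitespace
    return unicodedata.normalize('NFKD', text).strip()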
# NYT scraper (metadata from the NYT API, article text via requests)
from bs4 import BeautifulSoup
from requests import get


def extract_info(tab, article):
    '''
    INPUT: Mongo table pointer, JSON object from the NYT API
    OUTPUT:
        bool on whether extraction was successful or not (will also
        return False if the url already exists in the Mongo table, if
        the source is no longer available through NYTimes, or if the
        section is something we don't care about)
        Dict to insert into the Mongo database, or an empty string if
        it isn't something we want to insert

    By checking the Mongo table during the extraction process we can
    save time by not fetching the html of a url that already exists in
    the table.
    '''
    date_published = parse_str(article['pub_date'])
    url = article['web_url']
    source = article['source']
    content_type = article['type_of_material']

    # Skip extraction if article already exists in Mongo
    if already_exists(tab, url):
        return False, ''

    # Skip these sections as they don't contain any text that we'd be
    # interested in.
    sections_to_skip = ['Video', 'Interactive Feature', 'Paid Death Notice',
                        'Slideshow', 'Question', 'Review']

    # Skip extraction if the source is 'AP' or 'Reuters' as those sources
    # aren't accessible through NYTimes anymore
    if source in ['AP', 'Reuters'] or content_type in sections_to_skip:
        return False, ''

    html = get(url)
    soup = BeautifulSoup(html.content, 'html.parser')

    try:
        author = ['{} {}'.format(person['firstname'], person['lastname'])
                  for person in article['byline']['person']]
    except Exception:
        author = None

    try:
        # Blog posts and regular stories use different body containers
        if content_type == 'Blog':
            lines = soup.find(
                'div', attrs={'class': 'entry-content'}).findAll('p')
        else:
            lines = soup.find(
                'div', attrs={'class': 'story-body'}).findAll('p')
        article_text = parse_str(' \n '.join([line.text for line in lines]))
    except Exception:
        print('WARNING! Text Extraction Failed')
        print(url, content_type, source)
        return False, ''

    try:
        headline = parse_str(soup.find('h1', attrs={
            'itemprop': 'headline', 'class': 'entry-title'}).text)
    except Exception:
        # Fall back to the headline reported by the API
        headline = parse_str(article['headline']['main'])

    insert = {'url': url,
              'source': 'nyt',
              'content_source': source,
              'content_type': content_type,
              'headline': headline,
              'date_published': date_published,
              'author': author,
              'article_text': article_text}
    return True, insert
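# For context, a sketch of how the NYT variant might be driven. The
# Archive API endpoint is real, but the api-key, year/month, and the
# Mongo database/collection names are illustrative assumptions.
import pymongo
import requests

client = pymongo.MongoClient()
nyt_tab = client['articles']['nyt']

resp = requests.get('https://api.nytimes.com/svc/archive/v1/2016/11.json',
                    params={'api-key': 'YOUR_API_KEY'})

for doc in resp.json()['response']['docs']:
    success, insert = extract_info(nyt_tab, doc)
    if success:
        nyt_tab.insert_one(insert)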
# WSJ scraper, hardened variant: falls back to Selenium for the
# timestamp and bails out early on slideshow pages
from bs4 import BeautifulSoup


def extract_info(tab, driver, url):
    # Skip extraction if the article already exists in the Mongo table
    if already_exists(tab, url):
        return False, 'already exists'

    # Get the html from the site and create a BeautifulSoup object from it
    driver.get(url)
    try:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    except Exception:
        print('WARNING: Error opening BeautifulSoup')
        return False, ''

    try:
        headline = parse_str(soup.find('h1', attrs={
            'class': 'wsj-article-headline',
            'itemprop': 'headline'}).text)
    except Exception:
        print('WARNING: Error extracting headline')
        return False, ''

    try:
        date_published = soup.find('time', attrs={
            'class': 'timestamp'}).text.replace(
            '\n', '').replace('Updated', '').strip()
    except Exception:
        # Fall back to pulling the timestamp straight from the live page
        try:
            date_published = driver.find_elements_by_class_name(
                'timestamp')[0].text.split('\n')[0]
        except Exception:
            print('WARNING: Error extracting date_published')
            print(url)
            return False, ''

    try:
        author = soup.find('span', attrs={
            'class': 'name', 'itemprop': 'name'}).text
    except Exception:
        author = None

    try:
        tag = soup.find('div', attrs={
            'id': 'wsj-article-wrap', 'itemprop': 'articleBody'})
        # Slideshow pages have no article body to extract
        if tag is None:
            print('slideshow', url)
            return False, ''
        tag = tag.findAll('p')
        article_text = parse_str(' \n '.join([line.text for line in tag]))
    except Exception:
        print('WARNING: Error extracting article text')
        print(url)
        return False, ''

    insert = {'url': url,
              'source': 'wsj',
              'headline': headline,
              'date_published': date_published,
              'author': author,
              'article_text': article_text}
    return True, insert
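# Both WSJ variants expect a live Selenium driver. A minimal setup
# sketch, assuming the Selenium 3-era API that
# find_elements_by_class_name above implies; the example url and Mongo
# names are placeholders, and WSJ's paywall login is omitted.
import pymongo
from selenium import webdriver

client = pymongo.MongoClient()
wsj_tab = client['articles']['wsj']

driver = webdriver.Firefox()
try:
    success, insert = extract_info(wsj_tab, driver,
                                   'https://www.wsj.com/articles/example')
    if success:
        wsj_tab.insert_one(insert)
finally:
    driver.quit()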