def get_articles_c_tribune(complement):
    '''
    Given a string (complement) of the form 2011/01/01, get articles
    from the Chicago Tribune Archives.

    Inputs:
        complement: string containing the date for a given day
            ('YYYY/MM/DD', as used in the archive URL)

    Returns:
        Dictionary mapping the article's position on the archive page to a
        dict with title, date, source and nltk scores for that day.
        Also writes a csv file with nltk scores for complement.
    '''
    c_tribune = 'http://articles.chicagotribune.com/'
    archive_url = c_tribune + complement + '/'
    articles = {}
    pm = urllib3.PoolManager()
    html = pm.urlopen(url=archive_url, method="GET").data
    soup = bs4.BeautifulSoup(html, 'lxml')
    # Archive pages list each article inside an <h3> tag.
    tag_list = soup.find_all('h3')
    if tag_list:
        for index, tag in enumerate(tag_list):
            article = c_tribune + tag.a['href']
            config = Configuration()
            config.browser_user_agent = get_user_agent()
            # Pass config so the randomized user agent is actually used
            # (it was previously constructed but never given to Article).
            article_object = Article(article, config=config)
            article_object.download()
            # An Article instance is always truthy, so the original
            # `if article_object:` never detected a failed download;
            # check the download flag instead (as get_info does).
            if not article_object.is_downloaded:
                continue
            article_object.parse()
            # Skip death notices entirely. (Previously an empty dict had
            # already been inserted into `articles` before this check,
            # leaving blank rows in the csv.)
            if 'Death Notice:' in article_object.title:
                continue
            title = article_object.title
            text = article_object.text
            articles[index] = {
                'article': title,
                'pub_date': complement,
                'nltk_score': get_nltk_score(text),
                'nltk_score_title': get_nltk_score(title),
                'source': 'Chicago Tribune',
            }
    write_csv_pro(
        articles,
        'chicago_tribune_' + re.sub("/", "_", complement) + '.csv')
    # Return the info dictionary as documented (the original fell off the
    # end and implicitly returned None), mirroring get_articles_pro.
    return articles
def get_articles_pro(complement):
    '''
    Given a string (complement) of the form 2011/01/01, get articles for
    a given day from ProPublica.

    Inputs:
        complement: string containing the date for a given day
            ('YYYY/MM/DD', as used in the archive URL)

    ProPublica archive markup: tag_type = 'div', class_type = 'excerpt-thumb'.

    Returns:
        Dictionary mapping the article's position on the archive page to a
        dict with title, date, source and nltk scores for that day.
        Also writes a csv file with nltk scores for complement.
    '''
    propublica = 'https://www.propublica.org/archive/'
    archive_url = propublica + complement + '/'
    articles = {}
    pm = urllib3.PoolManager()
    html = pm.urlopen(url=archive_url, method="GET").data
    soup = bs4.BeautifulSoup(html, 'lxml')
    # Each article on the archive page sits in <div class="excerpt-thumb">.
    tag_list = soup.find_all('div', class_='excerpt-thumb')
    if tag_list:
        for index, tag in enumerate(tag_list):
            article = tag.a['href']
            config = Configuration()
            config.browser_user_agent = get_user_agent()
            # Pass config so the randomized user agent is actually used
            # (it was previously constructed but never given to Article).
            article_object = Article(article, config=config)
            article_object.download()
            # An Article instance is always truthy, so the original
            # `if article_object:` never detected a failed download;
            # check the download flag instead (as get_info does). This
            # also avoids inserting an empty dict into `articles` before
            # knowing the download succeeded.
            if not article_object.is_downloaded:
                continue
            article_object.parse()
            title = article_object.title
            text = article_object.text
            articles[index] = {
                'article': title,
                'pub_date': complement,
                'nltk_score': get_nltk_score(text),
                'nltk_score_title': get_nltk_score(title),
                'source': 'ProPublica',
            }
    write_csv_pro(
        articles,
        'propublica_' + re.sub("/", "_", complement) + '.csv')
    return articles
def get_info(dictionary):
    '''
    Get information for all the articles for the selected sections in
    La Jornada.

    Inputs:
        dictionary: dict with selected sections as keys and, as values,
            lists of urls representing the articles in every section

    Returns:
        A dictionary keyed by a running counter, with the translated
        title, publication date, source and nltk scores (title and text)
        for every successfully downloaded article in every section.
    '''
    rv = {}
    count = 0
    for key, item in dictionary.items():
        for i in item:
            config = Configuration()
            config.browser_user_agent = get_user_agent()
            # Pass config so the randomized user agent is actually used
            # (it was previously constructed but never given to Article).
            article = Article(i, language='es', config=config)
            article.download()
            # Idiom fix: test the flag directly rather than `== True`.
            if not article.is_downloaded:
                continue
            irv = {}
            rv[count] = irv
            article.parse()
            count = count + 1
            title = article.title
            # Articles are in Spanish; translate title and body to
            # English before scoring with nltk.
            tr_title = mtranslate.translate(title, "en", "auto")
            # NOTE(review): assumes parse() always yields a publish_date;
            # if it is None this raises AttributeError — TODO confirm.
            date = article.publish_date.date()
            text = article.text
            tr_text = translate_article(text)
            irv['article'] = tr_title
            irv['pub_date'] = date
            # Will be converted into a sentiment score downstream.
            irv['nltk_score'] = get_nltk_score(tr_text)
            irv['source'] = 'Jornada'
            irv['nltk_score_title'] = get_nltk_score(tr_title)
    return rv