Example #1
0
def get_articles_c_tribune(complement):
    '''
    Given a string (complement) of the form 2011/01/01,
    get articles from the Chicago Tribune Archives.

    Inputs:
            complement (str): the date for a given day,
            e.g. '2011/01/01'
    Returns:
            dict mapping archive index -> info dict for each
            article that day; also writes a csv file with nltk
            scores for complement
    '''
    c_tribune = 'http://articles.chicagotribune.com/'
    archive_url = c_tribune + complement + '/'
    articles = {}
    pm = urllib3.PoolManager()
    html = pm.urlopen(url=archive_url, method="GET").data
    soup = bs4.BeautifulSoup(html, 'lxml')
    # Article links on the archive page live inside <h3> tags.
    tag_list = soup.find_all('h3')

    if tag_list:
        for index, tag in enumerate(tag_list):
            article = c_tribune + tag.a['href']
            config = Configuration()
            config.browser_user_agent = get_user_agent()
            # FIX: pass config so the custom user agent is actually used
            # (it was previously built and discarded).
            article_object = Article(article, config=config)
            article_object.download()

            # FIX: `if article_object:` was always true; check the
            # download actually succeeded (same pattern as get_info).
            if article_object.is_downloaded:
                article_object.parse()
                # Skip obituary notices entirely.
                if 'Death Notice:' in article_object.title:
                    continue
                title = article_object.title
                text = article_object.text
                rv = {}
                rv['article'] = title
                rv['pub_date'] = complement
                rv['nltk_score'] = get_nltk_score(text)
                rv['nltk_score_title'] = get_nltk_score(title)
                rv['source'] = 'Chicago Tribune'
                # FIX: only store populated entries, so skipped/failed
                # articles no longer leave empty dicts in the output.
                articles[index] = rv

        # FIX: write the csv once after the loop, not on every
        # iteration (matches get_articles_pro).
        write_csv_pro(
            articles,
            'chicago_tribune_' + re.sub("/", "_", complement) + '.csv')

    # FIX: return the collected info, consistent with get_articles_pro.
    return articles
Example #2
0
def get_articles_pro(complement):
    '''
    Given a string (complement) of the form 2011/01/01,
    get articles for a given day from ProPublica.

    Inputs:
            complement (str): the date for a given day,
            e.g. '2011/01/01'
            (ProPublica archive entries are <div class="excerpt-thumb">)
    Returns:
            dict mapping archive index -> info dict for each
            article that day; also writes a csv file with nltk
            scores for complement
    '''
    propublica = 'https://www.propublica.org/archive/'
    archive_url = propublica + complement + '/'
    articles = {}
    pm = urllib3.PoolManager()
    html = pm.urlopen(url=archive_url, method="GET").data
    soup = bs4.BeautifulSoup(html, 'lxml')
    tag_list = soup.find_all('div', class_='excerpt-thumb')

    if tag_list:
        for index, tag in enumerate(tag_list):
            # ProPublica links are absolute, no base-url prefix needed.
            article = tag.a['href']
            print(article)
            config = Configuration()
            config.browser_user_agent = get_user_agent()
            # FIX: pass config so the custom user agent is actually used
            # (it was previously built and discarded).
            article_object = Article(article, config=config)
            article_object.download()
            # FIX: `if article_object:` was always true; check the
            # download actually succeeded (same pattern as get_info).
            if article_object.is_downloaded:
                article_object.parse()
                title = article_object.title
                text = article_object.text
                rv = {}
                rv['article'] = title
                rv['pub_date'] = complement
                rv['nltk_score'] = get_nltk_score(text)
                rv['nltk_score_title'] = get_nltk_score(title)
                rv['source'] = 'ProPublica'
                # FIX: only store populated entries, so failed downloads
                # no longer leave empty dicts in the output.
                articles[index] = rv

        write_csv_pro(articles,
                      'propublica_' + re.sub("/", "_", complement) + '.csv')

    return articles
Example #3
0
def get_info(dictionary):
    '''
    Get information for all the articles
    for the selected sections in La Jornada.

    Inputs:
            dictionary: dict with selected sections as keys and a
            list of article urls for every section as values
    Returns:
            dict mapping a running counter -> info dict with the
            (translated) title, publication date, source, and nltk
            scores for title and text of every article
    '''
    rv = {}
    count = 0
    for key, item in dictionary.items():
        for url in item:
            config = Configuration()
            config.browser_user_agent = get_user_agent()
            # FIX: pass config so the custom user agent is actually used
            # (it was previously built and discarded).
            article = Article(url, language='es', config=config)
            article.download()
            if article.is_downloaded:  # FIX: was `== True`, non-idiomatic
                article.parse()
                title = article.title
                # Articles are in Spanish; translate to English so the
                # nltk scorer works on English text.
                tr_title = mtranslate.translate(title, "en", "auto")
                # FIX: publish_date can be None (newspaper often fails
                # to extract it), which previously raised AttributeError.
                pub = article.publish_date
                date = pub.date() if pub is not None else None
                tr_text = translate_article(article.text)
                irv = {}
                irv['article'] = tr_title
                irv['pub_date'] = date
                # nltk score will be converted into a sentiment score
                irv['nltk_score'] = get_nltk_score(tr_text)
                irv['source'] = 'Jornada'
                irv['nltk_score_title'] = get_nltk_score(tr_title)
                rv[count] = irv
                count += 1
    return rv