Example No. 1
# Assumed to be defined elsewhere in the source repo: `already_exists(tab, url)`
# (checks whether a URL is already in the Mongo collection) and `parse_str`
# (normalizes extracted text).
from bs4 import BeautifulSoup


def extract_info(tab, driver, url):
    # Skip the page entirely if the URL is already in the Mongo collection
    if already_exists(tab, url):
        return False, 'already exists'

    # Get the html from the site and create a BeautifulSoup object from it
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    try:
        headline = parse_str(
            soup.find('h1',
                      attrs={
                          'class': 'wsj-article-headline',
                          'itemprop': 'headline'
                      }).text)
    except AttributeError:
        print('WARNING: Error extracting headline')
        return False, ''

    if headline == 'Corrections & Amplifications':
        return False, ''

    try:
        date_published = soup.find('time', attrs={
            'class': 'timestamp'
        }).text.replace('\n', '').replace('Updated', '').strip()
    except AttributeError:
        print('WARNING: Error extracting date_published')
        print(url)
        return False, ''
    try:
        author = soup.find('span', attrs={
            'class': 'name',
            'itemprop': 'name'
        }).text
    except AttributeError:
        author = None
    try:
        tag = soup.find('div',
                        attrs={
                            'id': 'wsj-article-wrap',
                            'itemprop': 'articleBody'
                        }).findAll('p')
        article_text = parse_str(' \n '.join([line.text for line in tag]))
    except AttributeError:
        print('WARNING: Error extracting article text')
        print(url)
        return False, ''

    insert = {
        'url': url,
        'source': 'wsj',
        'headline': headline,
        'date_published': date_published,
        'author': author,
        'article_text': article_text
    }
    return True, insert
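
For context, a minimal sketch of how this extractor might be driven. The webdriver choice, the collection name, and the URL list are illustrative assumptions, not part of the original example:

# Hypothetical driver loop; the collection name, URL list, and browser
# are placeholders, not from the original example.
from pymongo import MongoClient
from selenium import webdriver

tab = MongoClient()['articles']['wsj']
driver = webdriver.Firefox()
for url in ['https://www.wsj.com/articles/example']:
    ok, payload = extract_info(tab, driver, url)
    if ok:
        tab.insert_one(payload)  # store the extracted article in Mongo
driver.quit()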
Example No. 2
# Assumed module-level helpers, as in the other examples: `already_exists`
# and `parse_str`.
from bs4 import BeautifulSoup
from requests import get


def extract_info(tab, article):
    '''
    INPUT: Mongo table pointer, JSON object from the NYT API
    OUTPUT:
        bool indicating whether extraction was successful (also False if the
        url already exists in the Mongo table, if the source is no longer
        available through NYTimes, or if the section is one we don't care
        about)
        Dict to insert into the Mongo database, or an empty string if it
        isn't something we want to insert
    Checking the Mongo table during extraction saves time: we never fetch
    the html of a url that already exists in the table.
    '''
    date_published = parse_str(article['pub_date'])
    url = article['web_url']
    source = article['source']
    content_type = article['type_of_material']
    # Skip extraction if article already exists in Mongo
    if already_exists(tab, url):
        return False, ''
    # Skip these sections as they don't contain any text that we'd be
    # interested in.
    sections_to_skip = ['Video', 'Interactive Feature',
                        'Paid Death Notice', 'Slideshow', 'Question', 'Review']
    # Skip extraction if the source is 'AP' or 'Reuters', as those sources
    # aren't accessible through NYTimes anymore
    if source in ['AP', 'Reuters'] or content_type in sections_to_skip:
        return False, ''
    html = get(url)
    soup = BeautifulSoup(html.content, 'html.parser')
    try:
        author = ['{} {}'.format(person['firstname'], person['lastname'])
                  for person in article['byline']['person']]
    except (KeyError, TypeError):
        author = None
    try:
        if content_type == 'Blog':
            lines = soup.find(
                'div', attrs={'class': 'entry-content'}).findAll('p')
        else:
            lines = soup.find(
                'div', attrs={'class': 'story-body'}).findAll('p')
        article_text = parse_str(' \n '.join([line.text for line in lines]))
    except AttributeError:
        print('WARNING! Text Extraction Failed')
        print(url, content_type, source)
        return False, ''
    try:
        headline = parse_str(
            soup.find('h1', attrs={'itemprop': 'headline', 'class': 'entry-title'}).text)
    except AttributeError:
        headline = parse_str(article['headline']['main'])
    insert = {'url': url,
              'source': 'nyt',
              'content_source': source,
              'content_type': content_type,
              'headline': headline,
              'date_published': date_published,
              'author': author,
              'article_text': article_text}
    return True, insert
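
A sketch of feeding NYT Article Search API results through this extractor. The endpoint and response shape follow the public Article Search API, but the query and API key are placeholders:

# Hypothetical harvesting loop; the query and api-key are placeholders.
import requests
from pymongo import MongoClient

tab = MongoClient()['articles']['nyt']
resp = requests.get(
    'https://api.nytimes.com/svc/search/v2/articlesearch.json',
    params={'q': 'economy', 'api-key': 'YOUR_KEY'})
for article in resp.json()['response']['docs']:
    ok, payload = extract_info(tab, article)
    if ok:
        tab.insert_one(payload)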
Example No. 3
# Assumed module-level helpers, as in the other examples: `already_exists`
# and `parse_str`.
from bs4 import BeautifulSoup


def extract_info(tab, driver, url):
    if already_exists(tab, url):
        return False, 'already exists'

    # Get the html from the site and create a BeautifulSoup object from it
    driver.get(url)
    try:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    except Exception:
        # Without this return, `soup` would be undefined below
        print('WARNING: Error opening BeautifulSoup')
        return False, ''
    try:
        headline = parse_str(
            soup.find('h1',
                      attrs={
                          'class': 'wsj-article-headline',
                          'itemprop': 'headline'
                      }).text)
    except AttributeError:
        # Without this return, `headline` would be undefined below
        print('WARNING: Error extracting headline')
        return False, ''

    try:
        date_published = soup.find('time', attrs={
            'class': 'timestamp'
        }).text.replace('\n', '').replace('Updated', '').strip()
    except AttributeError:
        # Fall back to reading the timestamp directly through Selenium
        try:
            date_published = driver.find_elements_by_class_name(
                'timestamp')[0].text.split('\n')[0]
        except (IndexError, AttributeError):
            print('WARNING: Error extracting date_published')
            print(url)
            return False, ''
    try:
        author = soup.find('span', attrs={
            'class': 'name',
            'itemprop': 'name'
        }).text
    except AttributeError:
        author = None
    try:
        tag = soup.find('div',
                        attrs={
                            'id': 'wsj-article-wrap',
                            'itemprop': 'articleBody'
                        })
        if tag is None:
            # Slideshow pages have no article body; skip them
            print('slideshow', url)
            return False, ''
        tag = tag.findAll('p')
        article_text = parse_str(' \n '.join([line.text for line in tag]))
    except AttributeError:
        print('WARNING: Error extracting article text')
        print(url)
        return False, ''

    insert = {
        'url': url,
        'source': 'wsj',
        'headline': headline,
        'date_published': date_published,
        'author': author,
        'article_text': article_text
    }
    return True, insert
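
All of the examples lean on the same two undefined helpers. As a rough sketch of what they might look like (plausible stand-ins, not the repo's actual implementations):

# Hypothetical stand-ins for the helpers the examples assume.
from unicodedata import normalize

def already_exists(tab, url):
    # True if the Mongo collection already contains this URL
    return tab.find_one({'url': url}) is not None

def parse_str(text):
    # Normalize unicode and trim surrounding whitespace
    return normalize('NFKD', text).strip()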