Example #1
def scrape_articles(found, extract, opts):
    """Scrape a list of articles, return error counts.

    found -- list of article contexts to scrape
    extract -- extract function
    opts:
        max_errors -- tolerated number of errors before bailing
        test -- dry run: log what would be stored, then roll back instead of committing
        force_rescrape -- rescrape and update articles already in the DB
        expected_journo -- passed through to Journo.find_or_create() when resolving bylines
        etc...
    """

    extralogging = False
    max_errors = getattr(opts,'max_errors',0)
    expected_journo = getattr(opts,'expected_journo',None)

    # remove dupes (eg often articles appear in more than one RSS feed)
    found = unique_articles(found)

    # randomise the order of articles, so that if the scraper does abort
    # due to too many errors, successive runs should be able to pick up
    # all the scrapable articles.
    random.shuffle(found)
    #assert(len(found)>0)
    ukmedia.DBUG2("%d articles to scrape\n" % (len(found)))

    if opts.test:
        ukmedia.DBUG("DRY RUN\n")

    store = ArticleDB.ArticleDB()


    failcount = 0
    abortcount = 0
    newcount = 0
    had_count = 0
    rescrape_count = 0

    for context in found:

        try:
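            # first existence check, against the URLs we already know about;
            # a second check runs later, once redirects and canonical URLs
            # have been resolved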
            known_urls = set((context['srcurl'], context['permalink']))
            got = store.find_article(known_urls)
            if len(got) > 0:
                if extralogging:
                    for article_id in got:
                        ukmedia.DBUG( u"already got %s [a%s] (attributed to: %s)\n" % (context['srcurl'], article_id,GetAttrLogStr(article_id)))
                if not opts.force_rescrape:
                    had_count += 1
                    continue   # skip it - we've already got it
                else:
                    assert(len(got) == 1)


            #ukmedia.DBUG2( u"fetching %s\n" % (context['srcurl']) )
            resp = urllib2.urlopen( context['srcurl'] )

            # is the server sending a charset encoding?
            kwargs = {}
            content_type = resp.info().getheader('Content-Type','')
            m = re.compile(r';\s*charset\s*=\s*([^;]*)', re.I).search(content_type)
            if m:
                kwargs['encoding'] = m.group(1)

            # grab the content
            html = resp.read()

            # add any URLs we were redirected via...
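            # (note: .redirects is not part of the stock urllib2 response API;
            # this assumes a custom opener/handler that records redirect history)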
            for code,url in resp.redirects:
                known_urls.add(url)
                if code==301:    # permanent redirect
                    context['permalink'] = url

            # check html for a rel="canonical" link:
            canonical_url = extract_canonical_url(html, context['permalink'])
            if canonical_url is not None:
                known_urls.add(canonical_url)
                context['permalink'] = canonical_url

            # strip off "?rss=yes" etc from permalink
            tidied_url = tidy_url(context['permalink'])
            if tidied_url != context['permalink']:
                context['permalink'] = tidied_url
                known_urls.add(tidied_url)

            context['urls'] = known_urls

            # check that all urls are OK (eg express.co.uk have a habit of publishing borked ones for blogs)
            for url in known_urls:
                url.encode('utf-8') # will raise an exception if dud

            # repeat url-based existence check with the urls we now have
            # TODO: if so, add any new urls... maybe rescrape and update article? 
            article_id = None
            got = store.find_article(known_urls)
            if len(got) > 0:
                if extralogging:
                    for article_id in got:
                        ukmedia.DBUG( u"already got %s [a%s] (attributed to: %s)\n" % (context['srcurl'], article_id,GetAttrLogStr(article_id)))
                if not opts.force_rescrape:
                    had_count += 1
                    continue   # skip it - we've already got it
                else:
                    assert(len(got) == 1)
                    article_id = got[0]

            # some extra, last minute context :-)
            context[ 'lastscraped' ] = datetime.now()

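            # hand the page to the scraper-supplied extract function; kwargs
            # may carry the server-declared character encoding found above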
            art = extract(html, context, **kwargs)

            if art:
                # set the srcorg id for the article
                if 'srcorgname' in art and art['srcorgname'] is not None:
                    srcorg = Misc.GetOrgID( art[ 'srcorgname' ] )
                else:
                    # no publication specified - look up using domain name
                    o = urlparse.urlparse(art['permalink'])
                    domain = o[1].lower()
                    srcorg = Publication.find_or_create(domain)
                art['srcorg'] = srcorg


                # resolve bylined authors to journo ids
                authors = Byline.CrackByline(art['byline'])
                attributed = []
                for author in authors:
                    attributed.append(Journo.find_or_create(author, art, expected_journo))
                art['journos'] = attributed

                if opts.test:
                    ukmedia.PrettyDump( art )

                if article_id:
                    # rescraping existing article
                    art['id'] = article_id
                    article_id = store.upsert( art )
                    rescrape_count += 1
                else:
                    # new article
                    article_id = store.upsert( art )
                    newcount += 1



                if opts.test:
                    DB.conn().rollback()
                else:
                    DB.conn().commit()


        except Exception, err:
            DB.conn().rollback()

            # always just bail out upon ctrl-c
            if isinstance( err, KeyboardInterrupt ):
                raise

            failcount = failcount+1
            # TODO: phase out NonFatal! just get scraper to print out a warning message instead
            if isinstance( err, ukmedia.NonFatal ):
                continue

            report = traceback.format_exc()

            if 'title' in context:
                msg = u"FAILED (%s): '%s' (%s)" % (err, context['title'], context['srcurl'])
            else:
                msg = u"FAILED (%s): (%s)" % (err,context['srcurl'])
            ukmedia.DBUG( msg + "\n" )
            ukmedia.DBUG2( report + "\n" )
            ukmedia.DBUG2( '-'*60 + "\n" )

            abortcount = abortcount + 1
            if abortcount > max_errors:
                print >>sys.stderr, "Too many errors - ABORTING"
                raise
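
A minimal driver sketch for scrape_articles() above. Everything here other than scrape_articles itself and the opts attributes it reads (test, force_rescrape, max_errors, expected_journo) is hypothetical; a real scraper would supply its own article-discovery and extract functions:

# hypothetical options holder exposing the attributes scrape_articles() reads
class ScrapeOptions:
    def __init__(self, test=False, force_rescrape=False, max_errors=5, expected_journo=None):
        self.test = test                        # dry run: work is rolled back, not committed
        self.force_rescrape = force_rescrape    # rescrape articles already in the DB
        self.max_errors = max_errors            # abort after this many failures
        self.expected_journo = expected_journo  # optional hint for byline resolution

if __name__ == '__main__':
    opts = ScrapeOptions(test=True)
    found = FindArticles()                 # hypothetical: scraper-specific discovery function
    scrape_articles(found, Extract, opts)  # Extract: scraper-specific extract(html, context, **kwargs)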
Example #2
    def upsert( self, art ):
        """Insert or update an article"""


        # if no separate 'urls' set, create it
        if 'urls' not in art:
            art['urls'] = set((art['permalink'], art['srcurl']))

        # fill in some defaults if missing
        if 'lastscraped' not in art:
            art['lastscraped'] = datetime.now()
        if 'lastseen' not in art:
            art['lastseen'] = datetime.now()
        if 'description' not in art:
            art['description'] = ukmedia.FirstPara(art['content'])

        CheckArticle( art )

        # send text to the DB as utf-8
        title = art['title'].encode( 'utf-8' )
        byline = art[ 'byline' ].encode( 'utf-8' )
        description = art['description'].encode( 'utf-8' )
        pubdate = "%s" %(art['pubdate'])
        lastscraped = "%s" % (art['lastscraped'])
        lastseen = "%s" % (art['lastseen'])
        firstseen = lastseen    # it's a new entry
        srcurl = art['srcurl']
        permalink = art['permalink']
        srcorg = art['srcorg']

        # phasing out srcid...
        if 'srcid' in art:
            srcid = art['srcid']
        else:
            srcid = art['permalink']

        wordcount = None
        content = None
        # does article include content?
        if 'content' in art:
            content = art['content'].encode( 'utf-8' )
            # noddy wordcount
            txt = ukmedia.StripHTML( art['content'] )
            wordcount = len( txt.split() )

        # send to db!
        cursor = DB.conn().cursor()



        updating = False
        if 'id' in art:
            updating = True

        if updating:
            # update existing
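            # (the multi-column "SET (...) = (...)" form is PostgreSQL-specific syntax)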
            article_id = art['id']
            q = 'UPDATE article SET (title, byline, description, lastscraped, pubdate, lastseen, permalink, srcurl, srcorg, srcid, wordcount, last_comment_check) = (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) WHERE id=%s'
            cursor.execute(q, (title, byline, description, lastscraped, pubdate, lastseen, permalink, srcurl, srcorg, srcid, wordcount, lastscraped, article_id))
        else:
            # insert new
            q = 'INSERT INTO article (title, byline, description, lastscraped, pubdate, firstseen, lastseen, permalink, srcurl, srcorg, srcid, wordcount, last_comment_check) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            cursor.execute( q, ( title, byline, description, lastscraped, pubdate, firstseen, lastseen, permalink, srcurl, srcorg, srcid, wordcount, lastscraped ) )
            # get the newly-allocated id
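            # (currval() is session-local in PostgreSQL, so this is safe even
            # with concurrent inserts on other connections)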
            cursor.execute( "select currval('article_id_seq')" )
            article_id = cursor.fetchone()[0]


        # add the known urls for the article
        if updating:
            cursor.execute( "DELETE FROM article_url WHERE article_id=%s", (article_id,))
        for url in set(art['urls']):
            cursor.execute( "INSERT INTO article_url (url,article_id) VALUES (%s,%s)", (url,article_id))

        # update content, if included
        if content is None:
            insert_content = False
        else:
            insert_content = True
            if updating:
                # TODO: keep multiple revisions to track changes
                # has the content actually changed?
                cursor.execute("SELECT id FROM article_content WHERE article_id=%s AND content=%s", (article_id,content))
                existing = cursor.fetchall()     # gah... couldn't get cursor.rowcount to work...
                if len(existing) >= 1:
                    # no change, so just leave it as is
                    insert_content = False

        if insert_content:
            cursor.execute("DELETE FROM article_content WHERE article_id=%s", (article_id,))
            q = 'INSERT INTO article_content (article_id, content,scraped) VALUES ( %s,%s,%s )'
            cursor.execute(q, (article_id, content, lastscraped))

        # queue it for xapian indexing
        cursor.execute("DELETE FROM article_needs_indexing WHERE article_id=%s", (article_id,))
        cursor.execute("INSERT INTO article_needs_indexing (article_id) VALUES (%s)", (article_id,))

        # if there was a scraper error entry for this article, delete it now
        cursor.execute( "DELETE FROM error_articlescrape WHERE srcid=%s", (srcid,) )

        # if there were images, add them too
        if updating:
            cursor.execute("DELETE FROM article_image WHERE article_id=%s", (article_id,))
        if 'images' in art:
            for im in art['images']:
                cap = im['caption'].encode('utf-8')
                cred = ''
                if 'credit' in im:
                    cred = im['credit'].encode('utf-8')
                cursor.execute("INSERT INTO article_image (article_id,url,caption,credit) VALUES (%s,%s,%s,%s)",
                    (article_id, im['url'], cap, cred))

        # if there were commentlinks, add them too
        if 'commentlinks' in art:
            for c in art['commentlinks']:
                c['source'] = art['srcorgname']
                CommentLink.upsert(article_id, c)

        # add tags
        Tags.generate(article_id, art['content'])

        # attribute journos
        assert 'journos' in art
        cursor.execute("DELETE FROM journo_attr WHERE article_id=%s", (article_id,))
        for journo_id in art['journos']:
            cursor.execute("INSERT INTO journo_attr (journo_id,article_id) VALUES (%s,%s)", (journo_id,article_id))

            # make sure journo activates if they meet the criteria
            Journo.update_activation(journo_id)

            # also clear the html cache for that journos page
            cachename = 'j%s' % (journo_id)
            cursor.execute( "DELETE FROM htmlcache WHERE name=%s", (cachename,) )


        op = 'update' if updating else 'new'
        if insert_content:
            op += ' meta+content'
        else:
            op += ' meta'

        ukmedia.DBUG2( u"%s: %s [a%s %s ] ('%s' %s)\n" % (
            art['srcorgname'] if 'srcorgname' in art else srcorg,
            op,
            article_id,
            art['srcurl'],
            art['byline'],
            ','.join( [ '[j%s]'%(j) for j in art['journos'] ] )
            ))
        return article_id
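
A minimal usage sketch for upsert(). The dict values here are illustrative only (CheckArticle() defines the real requirements); description, lastscraped, lastseen and urls are filled in automatically when missing, and committing is left to the caller, as in Example #1:

# hypothetical caller: build a minimal article dict and store it
store = ArticleDB.ArticleDB()
art = {
    'title': u'Example headline',
    'byline': u'By A. Reporter',
    'content': u'<p>Example body text.</p>',
    'pubdate': datetime.now(),
    'permalink': 'http://www.example.com/news/example-headline',
    'srcurl': 'http://www.example.com/news/example-headline?rss=yes',
    'srcorg': 1,       # assumed publication id (cf. Publication.find_or_create)
    'journos': [],     # attributed journo ids; must be present (may be empty)
}
article_id = store.upsert(art)
DB.conn().commit()     # upsert() itself does not commit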