def scrape_articles(found, extract, opts):
    """Scrape a list of articles and return error counts.

    found -- list of article contexts to scrape
    extract -- extract function
    opts:
        max_errors -- tolerated number of errors before bailing
        test
        force_rescrape
        etc...
    """
    extralogging = False
    max_errors = getattr(opts, 'max_errors', 0)
    expected_journo = getattr(opts, 'expected_journo', None)

    # remove dupes (eg often articles appear in more than one RSS feed)
    found = unique_articles(found)

    # randomise the order of articles, so that if the scraper does abort
    # due to too many errors, successive runs should be able to pick up
    # all the scrapable articles.
    random.shuffle(found)

    #assert(len(found) > 0)
    ukmedia.DBUG2("%d articles to scrape\n" % (len(found)))

    if opts.test:
        ukmedia.DBUG("DRY RUN\n")

    store = ArticleDB.ArticleDB()

    failcount = 0
    abortcount = 0
    newcount = 0
    had_count = 0
    rescrape_count = 0

    for context in found:
        try:
            known_urls = set((context['srcurl'], context['permalink']))
            got = store.find_article(known_urls)
            if len(got) > 0:
                if extralogging:
                    for article_id in got:
                        ukmedia.DBUG(u"already got %s [a%s] (attributed to: %s)\n" % (
                            context['srcurl'], article_id, GetAttrLogStr(article_id)))
                if not opts.force_rescrape:
                    had_count += 1
                    continue    # skip it - we've already got it
                else:
                    assert(len(got) == 1)

            #ukmedia.DBUG2(u"fetching %s\n" % (context['srcurl']))
            resp = urllib2.urlopen(context['srcurl'])

            # is the server sending a charset encoding?
            kwargs = {}
            content_type = resp.info().getheader('Content-Type', '')
            m = re.compile(r';\s*charset\s*=\s*([^;]*)', re.I).search(content_type)
            if m:
                kwargs['encoding'] = m.group(1)

            # grab the content
            html = resp.read()

            # add any URLs we were redirected via...
            for code, url in resp.redirects:
                known_urls.add(url)
                if code == 301:
                    # permanent redirect
                    context['permalink'] = url

            # check html for a rel="canonical" link
            canonical_url = extract_canonical_url(html, context['permalink'])
            if canonical_url is not None:
                known_urls.add(canonical_url)
                context['permalink'] = canonical_url

            # strip off "?rss=yes" etc from permalink
            tidied_url = tidy_url(context['permalink'])
            if tidied_url != context['permalink']:
                context['permalink'] = tidied_url
                known_urls.add(tidied_url)

            context['urls'] = known_urls

            # check that all urls are OK (eg express.co.uk have a habit of
            # publishing borked ones for blogs)
            for url in known_urls:
                url.encode('utf-8')     # will raise an exception if dud

            # repeat url-based existence check with the urls we now have
            # TODO: if so, add any new urls... maybe rescrape and update article?
            article_id = None
            got = store.find_article(known_urls)
            if len(got) > 0:
                if extralogging:
                    for article_id in got:
                        ukmedia.DBUG(u"already got %s [a%s] (attributed to: %s)\n" % (
                            context['srcurl'], article_id, GetAttrLogStr(article_id)))
                if not opts.force_rescrape:
                    had_count += 1
                    continue    # skip it - we've already got it
                else:
                    assert(len(got) == 1)
                    article_id = got[0]

            # some extra, last minute context :-)
            context['lastscraped'] = datetime.now()

            art = extract(html, context, **kwargs)

            if art:
                # set the srcorg id for the article
                if 'srcorgname' in art and art['srcorgname'] is not None:
                    srcorg = Misc.GetOrgID(art['srcorgname'])
                else:
                    # no publication specified - look up using domain name
                    o = urlparse.urlparse(art['permalink'])
                    domain = o[1].lower()
                    srcorg = Publication.find_or_create(domain)
                art['srcorg'] = srcorg

                # resolve bylined authors to journo ids
                authors = Byline.CrackByline(art['byline'])
                attributed = []
                for author in authors:
                    attributed.append(Journo.find_or_create(author, art, expected_journo))
                art['journos'] = attributed

                if opts.test:
                    ukmedia.PrettyDump(art)

                if article_id:
                    # rescraping existing article
                    art['id'] = article_id
                    article_id = store.upsert(art)
                    rescrape_count += 1
                else:
                    # new article
                    article_id = store.upsert(art)
                    newcount += 1

            if opts.test:
                DB.conn().rollback()
            else:
                DB.conn().commit()

        except Exception, err:
            DB.conn().rollback()

            # always just bail out upon ctrl-c
            if isinstance(err, KeyboardInterrupt):
                raise

            failcount += 1
            # TODO: phase out NonFatal! just get scraper to print out a warning message instead
            if isinstance(err, ukmedia.NonFatal):
                continue

            report = traceback.format_exc()
            if 'title' in context:
                msg = u"FAILED (%s): '%s' (%s)" % (err, context['title'], context['srcurl'])
            else:
                msg = u"FAILED (%s): (%s)" % (err, context['srcurl'])
            ukmedia.DBUG(msg + "\n")
            ukmedia.DBUG2(report + "\n")
            ukmedia.DBUG2('-' * 60 + "\n")

            abortcount += 1
            if abortcount > max_errors:
                print >>sys.stderr, "Too many errors - ABORTING"
                raise
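
# Illustrative sketch only (not part of the scraping pipeline): a hypothetical
# driver showing the attributes scrape_articles() reads from its opts object.
# The _ExampleOpts class and _example_scrape_run() name are invented for this
# example; in normal use opts presumably comes from the tool's option parser.
def _example_scrape_run(found, extract):
    """Hypothetical helper: run scrape_articles() with explicit options.

    found -- list of article context dicts (each needs 'srcurl' and 'permalink')
    extract -- callable invoked as extract(html, context, **kwargs); kwargs may
               carry an 'encoding' hint taken from the HTTP Content-Type header
    """
    class _ExampleOpts:
        test = True             # dry run: DB work is rolled back, articles are pretty-dumped
        force_rescrape = False  # skip articles already in the database
        max_errors = 10         # abort the run once more than this many hard errors occur
        expected_journo = None  # optional hint passed through to Journo.find_or_create()
    return scrape_articles(found, extract, _ExampleOpts())
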
def upsert(self, art):
    """Insert or update an article"""

    # if no separate 'urls' set, create it
    if 'urls' not in art:
        art['urls'] = set((art['permalink'], art['srcurl']))

    # fill in some defaults if missing
    if 'lastscraped' not in art:
        art['lastscraped'] = datetime.now()
    if 'lastseen' not in art:
        art['lastseen'] = datetime.now()
    if 'description' not in art:
        art['description'] = ukmedia.FirstPara(art['content'])

    CheckArticle(art)

    # send text to the DB as utf-8
    title = art['title'].encode('utf-8')
    byline = art['byline'].encode('utf-8')
    description = art['description'].encode('utf-8')
    pubdate = "%s" % (art['pubdate'])
    lastscraped = "%s" % (art['lastscraped'])
    lastseen = "%s" % (art['lastseen'])
    firstseen = lastseen    # it's a new entry
    srcurl = art['srcurl']
    permalink = art['permalink']
    srcorg = art['srcorg']

    # phasing out srcid...
    if 'srcid' in art:
        srcid = art['srcid']
    else:
        srcid = art['permalink']

    wordcount = None
    content = None
    # does article include content?
    if 'content' in art:
        content = art['content'].encode('utf-8')
        # noddy wordcount
        txt = ukmedia.StripHTML(art['content'])
        wordcount = len(txt.split())

    # send to db!
    cursor = DB.conn().cursor()

    updating = False
    if 'id' in art:
        updating = True

    if updating:
        # update existing
        article_id = art['id']
        q = 'UPDATE article SET (title, byline, description, lastscraped, pubdate, lastseen, permalink, srcurl, srcorg, srcid, wordcount, last_comment_check) = (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) WHERE id=%s'
        cursor.execute(q, (title, byline, description, lastscraped, pubdate, lastseen, permalink, srcurl, srcorg, srcid, wordcount, lastscraped, article_id))
    else:
        # insert new
        q = 'INSERT INTO article (title, byline, description, lastscraped, pubdate, firstseen, lastseen, permalink, srcurl, srcorg, srcid, wordcount, last_comment_check) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        cursor.execute(q, (title, byline, description, lastscraped, pubdate, firstseen, lastseen, permalink, srcurl, srcorg, srcid, wordcount, lastscraped))
        # get the newly-allocated id
        cursor.execute("select currval('article_id_seq')")
        article_id = cursor.fetchone()[0]

    # add the known urls for the article
    if updating:
        cursor.execute("DELETE FROM article_url WHERE article_id=%s", (article_id,))
    for url in set(art['urls']):
        cursor.execute("INSERT INTO article_url (url,article_id) VALUES (%s,%s)", (url, article_id))

    # update content, if included
    if content is None:
        insert_content = False
    else:
        insert_content = True
        if updating:
            # TODO: keep multiple revisions to track changes
            # has the content actually changed?
            cursor.execute("SELECT id FROM article_content WHERE article_id=%s AND content=%s", (article_id, content))
            foo = cursor.fetchall()
            # gah... couldn't get cursor.rowcount to work...
            if len(foo) >= 1:
                # no change, so just leave it as is
                insert_content = False

    if insert_content:
        cursor.execute("DELETE FROM article_content WHERE article_id=%s", (article_id,))
        q = 'INSERT INTO article_content (article_id, content, scraped) VALUES (%s,%s,%s)'
        cursor.execute(q, (article_id, content, lastscraped))
        # queue it for xapian indexing
        cursor.execute("DELETE FROM article_needs_indexing WHERE article_id=%s", (article_id,))
        cursor.execute("INSERT INTO article_needs_indexing (article_id) VALUES (%s)", (article_id,))

    # if there was a scraper error entry for this article, delete it now
    cursor.execute("DELETE FROM error_articlescrape WHERE srcid=%s", (srcid,))

    # if there were images, add them too
    if updating:
        cursor.execute("DELETE FROM article_image WHERE article_id=%s", (article_id,))
    if 'images' in art:
        for im in art['images']:
            cap = im['caption'].encode('utf-8')
            cred = ''
            if 'credit' in im:
                cred = im['credit'].encode('utf-8')
            cursor.execute("INSERT INTO article_image (article_id,url,caption,credit) VALUES (%s,%s,%s,%s)", (article_id, im['url'], cap, cred))

    # if there were commentlinks, add them too
    if 'commentlinks' in art:
        for c in art['commentlinks']:
            c['source'] = art['srcorgname']
            CommentLink.upsert(article_id, c)

    # add tags
    Tags.generate(article_id, art['content'])

    # attribute journos
    assert 'journos' in art
    cursor.execute("DELETE FROM journo_attr WHERE article_id=%s", (article_id,))
    for journo_id in art['journos']:
        cursor.execute("INSERT INTO journo_attr (journo_id,article_id) VALUES (%s,%s)", (journo_id, article_id))
        # make sure journo activates if they meet the criteria
        Journo.update_activation(journo_id)
        # also clear the html cache for that journo's page
        cachename = 'j%s' % (journo_id)
        cursor.execute("DELETE FROM htmlcache WHERE name=%s", (cachename,))

    op = 'update' if updating else 'new'
    if insert_content:
        op += ' meta+content'
    else:
        op += ' meta'

    ukmedia.DBUG2(u"%s: %s [a%s %s ] ('%s' %s)\n" % (
        art['srcorgname'] if 'srcorgname' in art else srcorg,
        op,
        article_id,
        art['srcurl'],
        art['byline'],
        ','.join(['[j%s]' % (j) for j in art['journos']])))

    return article_id
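
# Illustrative sketch only: roughly the kind of dict upsert() expects, with
# made-up values. The keys are inferred from the code above; CheckArticle()
# may enforce further constraints not shown here.
#
#   art = {
#       'title': u'Example headline',
#       'byline': u'By A. Reporter',
#       'content': u'<p>Body text...</p>',    # used for wordcount, article_content and tags
#       'pubdate': datetime.now(),
#       'permalink': 'http://www.example.com/news/example-headline',
#       'srcurl': 'http://www.example.com/news/example-headline?rss=yes',
#       'srcorg': 1,       # publication id (scrape_articles() fills this in)
#       'journos': [],     # journo ids resolved from the byline
#       # 'urls', 'description', 'lastscraped' and 'lastseen' are defaulted
#       # by upsert() if missing
#   }
#   article_id = ArticleDB.ArticleDB().upsert(art)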