# Daily scrape pass.  For every comic returned by getComics('working'):
#   1. skip it when a strip dated today is already stored,
#   2. skip it when the page still contains the most recently stored image,
#   3. otherwise scrape the current comic image.
today = date.today().isoformat()
comics = getComics('working')
total = len(comics)

i = 0  # keeps `i` defined when there are no comics to process
for i, comic in enumerate(comics, 1):
    # Same text as the former multi-item print statement (items were joined
    # by single spaces, hence the double space after "Processing").
    percent = (float(i) / float(total)) * 100.0
    print("Processing  %d / %d [%s%%]" % (i, total, percent))

    # Test to see if the comic is already in the DB.  The comic id is bound
    # as a query parameter, the same way `today` already was, instead of
    # being concatenated into the SQL text.
    strips = db.query(
        'select id from strips where comic_id = %s and date = %s',
        comic['id'], today)
    if strips:
        print("Today's " + comic.name
              + " is already in the database, skipping.")
        continue

    # Scrape the comic image from the comic website
    scraper = ComicScraper(comic)

    # Check for same image on page.  Only the newest stored strip is fetched:
    # selecting MAX(date) next to a bare `url` column (no GROUP BY) produces
    # one row even when the comic has no strips, and on engines that accept
    # such a query the url need not come from the row with the newest date.
    strips = db.query(
        'select url from strips where comic_id = %s '
        'order by date desc limit 1',
        comic['id'])
    if strips:
        print("Checking for known image...")
        url = strips[0]['url']
        if scraper.contentHasImage(url):
            print("Same image as last scrape, skipping.")
            continue

    print("Scraping " + comic.name)
    image = scraper.findComicImage()

    # If the scrape was successful pop it into the strips DB
from datetime import date import re today = date.today().isoformat() i = 0 comic_name = 'lookingforgroup' comic_id = '8' top = 1 for j in range(top): #url = 'http://www.agreeablecomics.com/therack/?p='+ str(j) #url = 'http://www.xkcd.com' url = 'http://lfgcomic.com/page/latest' scraper = ComicScraper({'site_url':url, 'comic_url': u'0'}) i += 1 print "Processing ", i, "/", top, "[" + str( (float(i)/float(top))*100.0 )+"%]" image = scraper.findComicImage() print image # # # # if ('rack-header.jpg') in image['src']: print 'header: BUST!\n' continue # Check for same image on page strips = db.query('select url from strips where comic_id=' + comic_id + ';') if len(strips) > 0: