def handle(self, *args, **options):
    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-agent',
         ('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) '
          'Gecko/20100101 Firefox/40.0'))
    ]
    rl = ign_reviewlist_parser()
    ip = ign_review_parser()
    url = "http://uk.ign.com/movies/reviews"

    # Fetch the review index page and extract the individual review entries.
    f = opener.open(url)
    html = f.read().decode('utf8')
    rl.feed(html)
    reviews = rl.reviews

    for review in reviews:
        # Fetch and parse the full review page.
        g = opener.open("http://uk.ign.com/" + review['url'])
        html_review = g.read().decode('utf8')
        ip.feed(html_review)
        ip.clean()
        review.update(ip.review)

        # Skip reviews without a score.
        if review['score'] == '':
            ip.reset()
            time.sleep(1)
            continue

        # Search for the author, creating the record if it does not exist yet.
        author = MediaPerson.objects.filter(name=review['author'])
        if author.exists():
            author = author[0]
        else:
            author = MediaPerson(name=review['author'])
            author.save()

        # Search for the movie; create it if it has not been crawled yet.
        item = Movie.objects.filter(name=review['name'])
        if item.exists():
            print '"{0}" already crawled. Skipping.'.format(review['name'])
            item = item[0]
        else:
            print 'Crawling "{0}"...'.format(review['name']),
            item = Movie(name=review['name'],
                         image_url=review['imgurl'],
                         author=author,
                         score=0,
                         user_score=0)
            item.MediaItem_save()
            print "Done."

        # Search for the critic, creating it if necessary.
        critic = Critic.objects.filter(name='IGN')
        if critic.exists():
            critic = critic[0]
        else:
            critic = Critic(name='IGN')
            critic.save()

        # Search for an existing critic review; create one if none matches.
        critic_review = CriticReview.objects.filter(media_item=item,
                                                    min_score=0,
                                                    max_score=10,
                                                    critic=critic,
                                                    score=review['score'])
        if critic_review.exists():
            critic_review = critic_review[0]
        else:
            critic_review = CriticReview(media_item=item,
                                         min_score=0,
                                         max_score=10,
                                         critic=critic,
                                         url="http://uk.ign.com" + review['url'],
                                         body=review['body'][0:255],
                                         score=review['score'].strip())
            critic_review.save()

        # Reset the parser and pause between requests.
        ip.reset()
        time.sleep(1)
def handle(self, *args, **options):
    opener = urllib2.build_opener()
    opener.addheaders = [
        ("User-agent",
         ("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) "
          "Gecko/20100101 Firefox/40.0"))
    ]
    pat_album_url = re.compile('/album/(.*?)"')
    nm = am_newmusic_Parser()
    ip = am_InfoParser()
    url = "http://www.allmusic.com/newreleases"

    # Fetch the new-releases page and collect the unique album URL slugs.
    f = opener.open(url)
    html = f.read()
    nm.feed(html)
    html = nm.html
    urls = re.findall(pat_album_url, html)
    album_urls = []
    for i in urls:
        if i not in album_urls:
            album_urls.append(i)

    for u in album_urls:
        # Fetch the album page, decoding with the charset the server reports.
        g = opener.open("http://www.allmusic.com/album/" + u)
        encoding = g.headers.getparam("charset")
        html_review = g.read().decode(encoding, "ignore")
        ip.feed(html_review)
        ip.clean()

        # Skip albums without a score.
        if ip.review["score"] == "":
            ip.reset()
            time.sleep(1)
            continue

        # Search for the author, creating the record if it does not exist yet.
        author = MediaPerson.objects.filter(name=ip.review["author"])
        if author.exists():
            author = author[0]
        else:
            author = MediaPerson(name=ip.review["author"])
            author.save()

        # Search for the album; create it if it has not been crawled yet.
        album = Music.objects.filter(name=ip.review["name"])
        if album.exists():
            print '"{0}" already crawled. Skipping.'.format(smart_str(ip.review["name"]))
            album = album[0]
        else:
            print 'Crawling "{0}"...'.format(smart_str(ip.review["name"])),
            album = Music(name=ip.review["name"],
                          image_url=ip.review["imgurl"],
                          author=author,
                          score=0,
                          user_score=0)
            album.MediaItem_save()
            print "Done."

        # Search for the critic, creating it if necessary.
        critic = Critic.objects.filter(name="allmusic")
        if critic.exists():
            critic = critic[0]
        else:
            critic = Critic(name="allmusic")
            critic.save()

        # Search for an existing critic review; create one if none matches.
        review = CriticReview.objects.filter(media_item=album,
                                             min_score=0,
                                             max_score=10,
                                             critic=critic,
                                             score=ip.review["score"])
        if review.exists():
            review = review[0]
        else:
            review = CriticReview(media_item=album,
                                  min_score=0,
                                  max_score=10,
                                  critic=critic,
                                  url="http://www.allmusic.com/album/" + u,
                                  body=ip.review["body"][0:255],
                                  score=ip.review["score"].strip(),
                                  )
            review.save()

        # Reset the parser and pause between requests.
        ip.reset()
        time.sleep(1)
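# Both handle() methods above use Django's management-command signature, so each
# presumably sits inside a Command class in a module under
# <app>/management/commands/. The skeleton below is a minimal sketch of that
# surrounding boilerplate with the imports the methods rely on; the import paths
# (reviews.models, reviews.parsers) and the command/module names are assumptions,
# not the actual project layout.
import re
import time
import urllib2

from django.core.management.base import BaseCommand
from django.utils.encoding import smart_str

from reviews.models import Critic, CriticReview, MediaPerson, Movie, Music
from reviews.parsers import ign_reviewlist_parser, ign_review_parser


class Command(BaseCommand):
    help = "Crawl IGN movie reviews and store them as CriticReview objects."

    def handle(self, *args, **options):
        # ... body as in the first handle() above ...
        pass

# Saved as e.g. reviews/management/commands/crawl_ign.py (name assumed), the
# command would be run with:
#   python manage.py crawl_ign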