示例#1
0
	def handle(self, *args, **options):
		opener = urllib2.build_opener()
		opener.addheaders = [('User-agent', 
			('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0'))
		]
		
		

		rl = ign_reviewlist_parser()
		ip = ign_review_parser()
		
		i = 1
		url = "http://uk.ign.com/movies/reviews"

		f = opener.open(url)

		
		html = f.read().decode('utf8')
		
		rl.feed(html)

		
		reviews = rl.reviews
		
		for review in reviews:

			g = opener.open("http://uk.ign.com/" + review['url'])
		
			html_review = g.read().decode('utf8')
			
			ip.feed(html_review)

			ip.clean()
			
			review.update(ip.review)
						
			if review['score'] == '':
				ip.reset()
				time.sleep(1)
				continue	
				
			# Search for author
			author = MediaPerson.objects.filter(name=review['author'])
			
			if author.exists():
				author = author[0]
			else:
				author = MediaPerson(name=review['author'])
				author.save()
				
			# Search for user's review of item
			item = Movie.objects.filter(name=review['name'])
			
			if item.exists():
				print '"{0}" already crawled. Skipping.'.format(review['name'])
				item = item[0]
			else:
				print 'Crawling "{0}"...'.format(review['name']),
			
				item = Movie(name=review['name'],
				image_url = review['imgurl'],
				author=author,
				score=0,
				user_score=0)
				item.MediaItem_save()
				
				print "Done."

			# Search for critic 
			critic = Critic.objects.filter(name='IGN')
			if critic.exists():
				critic = critic[0]
			else:
				critic = Critic(name='IGN')
				critic.save()
			
			# Search for critic review
			critic_review = CriticReview.objects.filter(media_item=item,
				min_score=0,
				max_score=10,
				score=review['score'])
			
			# Create review
			if critic_review.exists():
				critic_review = critic_review[0]
			else:
				critic_review = CriticReview(media_item=item,
				min_score=0,
				max_score=10,
				critic=critic,
				url= "http://uk.ign.com" + review['url'],
				body=review['body'][0:255],
				score=review['score'].strip())

				
				critic_review.save()

			ip.reset()

			time.sleep(1)
示例#2
0
    def handle(self, *args, **options):
        opener = urllib2.build_opener()
        opener.addheaders = [
            ("User-agent", ("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0"))
        ]

        pat_album_url = re.compile('/album/(.*?)"')

        nm = am_newmusic_Parser()
        ip = am_InfoParser()

        i = 1
        url = "http://www.allmusic.com/newreleases"

        f = opener.open(url)

        html = f.read()

        nm.feed(html)

        html = nm.html

        urls = re.findall(pat_album_url, html)
        album_urls = []
        for i in urls:
            if i not in album_urls:
                album_urls.append(i)

        for u in album_urls:

            g = opener.open("http://www.allmusic.com/album/" + u)

            encoding = g.headers.getparam("charset")
            html_review = g.read().decode(encoding, "ignore")
            ip.feed(html_review)
            ip.clean()

            if ip.review["score"] == "":
                ip.reset()
                time.sleep(1)
                continue

                # Search for author
            author = MediaPerson.objects.filter(name=ip.review["author"])

            if author.exists():
                author = author[0]
            else:
                author = MediaPerson(name=ip.review["author"])
                author.save()

                # Search for user's review of item
            album = Music.objects.filter(name=ip.review["name"])

            if album.exists():
                print '"{0}" already crawled. Skipping.'.format(smart_str(ip.review["name"]))
                album = album[0]
            else:
                print 'Crawling "{0}"...'.format(smart_str(ip.review["name"])),

                album = Music(
                    name=ip.review["name"], image_url=ip.review["imgurl"], author=author, score=0, user_score=0
                )
                album.MediaItem_save()

                print "Done."

                # Search for critic
            critic = Critic.objects.filter(name="allmusic")
            if critic.exists():
                critic = critic[0]
            else:
                critic = Critic(name="allmusic")
                critic.save()

                # Search for critic review
            review = CriticReview.objects.filter(
                media_item=album, min_score=0, max_score=10, critic=critic, score=ip.review["score"]
            )

            # Create review
            if review.exists():
                review = review[0]
            else:
                review = CriticReview(
                    media_item=album,
                    min_score=0,
                    max_score=10,
                    critic=critic,
                    url="http://www.allmusic.com/album/" + u,
                    body=ip.review["body"][0:255],
                    score=ip.review["score"].strip(),
                )

                review.save()

            ip.reset()

            time.sleep(1)