def parse_grade(self, response):
    """Build a ``ReYingMovie`` item from a movie detail page (source "格瓦拉").

    The name, film id (first run of digits in ``response.url``) and the two
    timestamps are always filled in.  Release date, grade, number of raters
    and the five rating percentages are scraped from the page; if any of
    them cannot be extracted, all of those fields are set to the placeholder
    "暂无" instead.

    Args:
        response: The downloaded detail page.

    Returns:
        The populated ``ReYingMovie`` item.

    Raises:
        IndexError: If ``response.url`` contains no digits.
    """
    selector = Selector(response)
    item = ReYingMovie()
    item["name"] = selector.xpath('//h1/text()').extract_first()
    item["createdtime"] = str(datetime.now())
    item["comefrom"] = "格瓦拉"
    item["filmid"] = re.findall(r'\d+', response.url)[0]
    item["crawldate"] = str(datetime.today())
    try:
        item["movieDate"] = selector.xpath(
            '//div[@id="ui_movieInfo_open"]/div/ul/li[@class="first"]/text()'
        ).extract_first()[5:]
        item["Grade"] = selector.xpath(
            '//span[@class="point"]/text()').extract_first()
        item["gradePeople"] = selector.xpath(
            '//span[@class="txt"]/em/text()').extract_first()[2:-1]
        rating = selector.xpath('//span[@class="pect"]/text()').extract()
        # Indexing (rather than zip) is deliberate: fewer than five
        # percentages must trigger the placeholder fallback below.
        for position, field in enumerate(
                ("five", "four", "three", "two", "one")):
            item[field] = rating[position][1:-1]
    except (IndexError, TypeError):
        # IndexError: fewer than five rating percentages on the page.
        # TypeError: extract_first() returned None (no matching node) and
        # the slice was applied to None.
        for field in ("movieDate", "Grade", "gradePeople",
                      "five", "four", "three", "two", "one"):
            # "Grade" is the key the success path writes; the placeholder
            # has to overwrite that same key.
            item[field] = "暂无"
    return item
def parse_futuregrade(self, response):
    """Yield one ``ReYingMovie`` item per entry of the JSON ``content`` list.

    The response body is decoded as JSON; every element of its ``content``
    list supplies the name, film id, release time and "want" count of one
    item, which is tagged with the source "微博" and the current timestamps.

    Args:
        response: Response whose ``text`` is a JSON document.

    Yields:
        A populated ``ReYingMovie`` for each ``content`` entry.
    """
    payload = json.loads(response.text)
    for entry in payload["content"]:
        movie = ReYingMovie()
        # Values are computed and stored in this fixed order.
        field_values = (
            ("name", entry["name"]),
            ("comefrom", "微博"),
            ("filmid", entry["film_id"]),
            ("crawldate", str(datetime.today())),
            ("createdtime", str(datetime.now())),
            ("movieDate", entry["release_time"]),
            ("want", entry["want_number"]),
        )
        for field, value in field_values:
            movie[field] = value
        yield movie
def parse(self, response):
    """Schedule a detail request for every film listed in the page's JSON.

    The response text embeds a JavaScript assignment of the form
    ``var result_<digits> = {...};var``.  The JSON object is extracted and
    decoded, and for each entry of ``value.hotplayRatingList`` followed by
    each entry of ``value.upcomingTicketList`` a request to
    ``self.detail_url`` (formatted with the entry's ``Id``) is yielded with
    ``self.parse_grade`` as callback.

    Args:
        response: The listing response.

    Yields:
        ``scrapy.Request`` objects, one per listed film.

    Raises:
        IndexError: If the embedded assignment is not found in the text.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    jsonstr = re.findall(r'var result_\d+ = (.*);var', response.text)[0]
    # Decode once and reuse the result for both lists.
    listings = json.loads(jsonstr)['value']
    for list_key in ('hotplayRatingList', 'upcomingTicketList'):
        for film in listings[list_key]:
            url = self.detail_url.format(str(film['Id']))
            yield scrapy.Request(url, callback=self.parse_grade)
def parse_grade(self, response):
    """Yield one ``ReYingMovie`` item per entry of the JSON ``content`` list.

    Each entry contributes its name (from ``trendinfo``), release date,
    "want" count, film id and the score data found under ``markinfo``.
    The good/bad rates are rendered as whole-number percentages.  Items are
    tagged with the source "微博".

    Args:
        response: Response whose ``text`` is a JSON document.

    Yields:
        A populated ``ReYingMovie`` for each ``content`` entry.
    """
    payload = json.loads(response.text)
    for entry in payload["content"]:
        movie = ReYingMovie()
        movie["name"] = entry["trendinfo"]["name"]
        movie["createdtime"] = str(datetime.now())
        movie["movieDate"] = entry["release_date"]
        movie["want"] = entry["want_number"]
        movie["comefrom"] = "微博"
        movie["filmid"] = entry["film_id"]
        movie["crawldate"] = str(datetime.today())

        marks = entry["markinfo"]
        movie["Grade"] = marks["score"]
        movie["gradePeople"] = marks["score_count"]
        movie["good"] = "{:.0%}".format(marks["good_rate"])
        movie["bad"] = "{:.0%}".format(marks["bad_rate"])
        yield movie
def parse_grade(self, response):
    """Yield up to ten ``ReYingMovie`` items from an HTML fragment in JSON.

    The response text contains a JSON object whose ``html`` member is an
    HTML fragment.  Titles, grades, release-date lines and movie ids are
    pulled out of that fragment as four parallel lists and combined
    position by position.  Items are tagged with the source "糯米".

    At most the first ten entries are emitted.  When the page lists fewer
    than ten (or the lists differ in length) iteration stops at the
    shortest list instead of raising ``IndexError`` part-way through.

    Args:
        response: Response whose ``text`` contains the JSON object.

    Yields:
        A populated ``ReYingMovie`` per listed movie.

    Raises:
        IndexError: If no JSON object is found in the text, or a movie's
            ``data-data`` attribute contains no digits.
    """
    get_data = re.findall(r'({.*})', response.text)[0]
    data = json.loads(get_data)["html"]
    page_source = etree.HTML(data)
    titles = page_source.xpath('//a[@class="movie-pic"]/img/@alt')
    grades = page_source.xpath('//span[@class="num nuomi-red"]/text()')
    # Was '///ul[...]'; the extra slash is a typo for the '//' abbreviation.
    release_dates = page_source.xpath('//ul[@class="info"]/li[3]/text()')
    movie_refs = page_source.xpath('//a[@class="movie-pic"]/@data-data')

    limit = 10  # same cap as before: only the first ten entries are used
    rows = zip(titles[:limit], grades[:limit],
               release_dates[:limit], movie_refs[:limit])
    for title, grade, release_date, movie_ref in rows:
        item = ReYingMovie()
        item["name"] = title.strip()
        item["comefrom"] = "糯米"
        # The first five characters of the date line are dropped.
        item["movieDate"] = release_date.strip()[5:]
        item["Grade"] = grade.strip()
        item["createdtime"] = str(datetime.now())
        item["filmid"] = re.findall(r"(\d+)", movie_ref)[0]
        item["crawldate"] = str(datetime.today())
        yield item
def parse_grade(self, response):
    """Build a ``ReYingMovie`` item from a movie detail page (source "豆瓣").

    Fields whose node is missing are stored as ``None`` (the value
    ``extract_first()`` returns).  The star-rating percentages are optional:
    as many of ``five``..``one`` are filled as the page provides, in that
    order.  The "want" count is the link text minus its last three
    characters, or ``None`` when the link is absent.

    Args:
        response: The downloaded detail page.

    Returns:
        The populated ``ReYingMovie`` item.
    """
    selector = Selector(response)
    item = ReYingMovie()
    item["name"] = selector.xpath(
        '//span[@property="v:itemreviewed"]/text()').extract_first()
    item["createdtime"] = str(datetime.now())
    item["comefrom"] = "豆瓣"
    item["filmid"] = selector.xpath(
        '//span[@class="rec"]/a/@share-id').extract_first()
    item["crawldate"] = str(datetime.today())
    item["movieDate"] = selector.xpath(
        '//span[@property="v:initialReleaseDate"]/text()'
    ).extract_first()
    item["Grade"] = selector.xpath(
        '//strong[@property]/text()').extract_first()
    item["gradePeople"] = selector.xpath(
        '//span[@property="v:votes"]/text()').extract_first()

    rating = selector.xpath(
        '//span[@class="rating_per"]/text()').extract()
    # zip stops at the shorter sequence, so a page with no (or fewer than
    # five) percentages simply leaves the remaining fields unset.
    for field, share in zip(("five", "four", "three", "two", "one"), rating):
        item[field] = share

    want_text = selector.xpath(
        '//div[@class="subject-others-interests-ft"]/a[2]/text()'
    ).extract_first()
    # Guard against a missing link: slicing None would raise TypeError.
    item["want"] = want_text[:-3] if want_text is not None else None
    return item
def parse_grade(self, response):
    """Build a ``ReYingMovie`` item from a JSON payload (source "时光").

    The first ``{...}`` span in the response text is decoded as JSON.  The
    title comes from ``value.movieTitle``; every other scraped field comes
    from ``value.movieRating``.  A negative ``RatingFinal`` is stored as
    ``None``.

    Args:
        response: Response whose ``text`` contains the JSON object.

    Returns:
        The populated ``ReYingMovie`` item.
    """
    json_blob = re.findall(r'({.*})', response.text)[0]
    payload = json.loads(json_blob)

    movie = ReYingMovie()
    movie["createdtime"] = str(datetime.now())
    movie["comefrom"] = "时光"
    movie['name'] = payload['value']['movieTitle']

    rating = payload['value']['movieRating']
    score = rating["RatingFinal"]
    movie["Grade"] = score
    if score < 0:
        movie["Grade"] = None

    # item field -> key inside the movieRating object
    rating_fields = (
        ("gradePeople", "Usercount"),
        ("want", "AttitudeCount"),
        ("music", "ROtherFinal"),
        ("frames", "RPictureFinal"),
        ("story", "RStoryFinal"),
        ("director", "RDirectorFinal"),
        ("filmid", 'MovieId'),
    )
    for field, source_key in rating_fields:
        movie[field] = rating[source_key]

    movie["crawldate"] = str(datetime.today())
    return movie
def parse_grade(self, response):
    """Build a ``ReYingMovie`` item from a movie detail page (source "猫眼").

    The ``.woff`` font referenced in the page's first ``<style>`` element is
    downloaded, base64-encoded and handed to ``extract_fonts``; the
    resulting mapping is passed to ``self.decode_value`` for every numeric
    text taken from the page.
    NOTE(review): presumably the page obfuscates digits with that font --
    confirm against ``extract_fonts`` / ``decode_value``.

    When a rater count is present the item gets ``Grade``, ``gradePeople``
    and ``piaofang``; otherwise the headline number is stored as ``want``.

    Args:
        response: The downloaded detail page.

    Returns:
        The populated ``ReYingMovie`` item.

    Raises:
        IndexError: If no ``.woff`` URL is found in the style text or
            ``response.url`` contains no digits.
        requests.RequestException: If the font download fails or times out.
    """
    item = ReYingMovie()

    style_text = response.xpath("//style/text()").extract_first()
    # The dot before "woff" is escaped so only a literal ".woff" ends the URL.
    woff_url = 'http://' + re.findall(r"url\('//(.*?\.woff)", style_text)[0]
    # requests.get blocks; the timeout keeps a stalled download from
    # hanging this callback indefinitely.
    woff_data = requests.get(woff_url, timeout=10).content
    font_mapping = extract_fonts(base64.b64encode(woff_data))

    item["name"] = response.xpath('//h3[@class="name"]/text()').extract_first()
    item["createdtime"] = str(datetime.now())
    item["movieDate"] = response.xpath(
        '//li[@class="ellipsis"][3]/text()').extract_first()[:-4]
    item["comefrom"] = "猫眼"
    item["filmid"] = re.findall(r'\d+', response.url)[0]
    item["crawldate"] = str(datetime.today())

    score_block = '//div[@class="movie-index-content score normal-score"]'
    people = response.xpath(
        score_block + '/div/span/span/text()').extract_first()
    # Headline number of the score block: the grade when a rater count
    # exists, otherwise the "want" count.
    headline = response.xpath(
        score_block + '/span/span/text()').extract_first()

    if people is not None:
        item["Grade"] = self.decode_value(font_mapping, headline)
        item["gradePeople"] = self.decode_value(font_mapping, people)
        box_office = response.xpath(
            '//div[@class="movie-index"][2]/div/span/text()').extract_first()
        box_office_suffix = response.xpath(
            '//div[@class="movie-index"][2]/div/span[2]/text()'
        ).extract_first()
        item['piaofang'] = (
            self.decode_value(font_mapping, box_office) + box_office_suffix)
    else:
        item["want"] = self.decode_value(font_mapping, headline)
    return item
def parse_futuregrade(self, response):
    """Build a ``ReYingMovie`` item for an upcoming movie (source "格瓦拉").

    The release-date text is looked up in three places, in order, and the
    first one that exists is used with its first five characters removed.
    The "want" count is the ``focusCount`` text without its last two
    characters.  Either field is stored as ``None`` when the page does not
    contain it, matching what ``extract_first()`` yields for ``name``.

    Args:
        response: The downloaded detail page.

    Returns:
        The populated ``ReYingMovie`` item.

    Raises:
        IndexError: If ``response.url`` contains no digits.
    """
    item = ReYingMovie()
    item["name"] = response.xpath('//h1/text()').extract_first()
    item["createdtime"] = str(datetime.now())
    item["comefrom"] = "格瓦拉"
    item["filmid"] = re.findall(r'\d+', response.url)[0]
    item["crawldate"] = str(datetime.today())

    # Candidate locations of the release-date text, tried in this order.
    date_xpaths = (
        '//div[@id="ui_movieInfo_open"]/div/ul/li[@class="first"]/text()',
        '//div[@class="toggleInfo clear"]/span[2]/text()',
        '//div[@class="toggleInfo clear"]/span[3]/text()',
    )
    movie_date = None
    for date_xpath in date_xpaths:
        movie_date = response.xpath(date_xpath).extract_first()
        if movie_date is not None:
            break
    # Slicing None would raise TypeError, so missing text stays None.
    item["movieDate"] = movie_date[5:] if movie_date is not None else None

    want_text = response.xpath(
        '//span[@class="focusCount"]/text()').extract_first()
    item["want"] = want_text[:-2] if want_text is not None else None
    return item