def parse_detail(self, response):
    """Parse one JSON page of a douban ranking list and yield one item per movie.

    The human-readable category name is recovered by matching the numeric
    ``type=`` query parameter of the request URL back against
    ``self.movie_category`` (name -> numeric id mapping).

    Fix: the original looked fields up with ``eval(field)`` over local
    variables and had a local named ``titie`` — so ``eval('title')`` raised
    NameError (swallowed by a bare except) and the title field was never
    populated. Values are now collected in an explicit dict.
    """
    type_name = None
    type_id = re.search('type=(.*?)&', unquote(response.url)).group(1)
    for name, value in self.movie_category.items():
        if str(value) == type_id:
            type_name = name
    for data in json.loads(response.text):
        # One entry per declared item field, keyed by the field name.
        values = {
            'type_name': type_name,
            'title': data.get('title'),
            'rank': data.get('rank'),
            'url': data.get('url'),
            'actors': data.get('actors'),
            'cover_url': data.get('cover_url'),
            'regions': data.get('regions'),
            'release_date': data.get('release_date'),
            'score': data.get('score'),
            'types': data.get('types'),
            'vote_count': data.get('vote_count'),
        }
        douban_item = DoubanMovieItem()
        for field in douban_item.fields:
            if field in values:
                douban_item[field] = values[field]
            else:
                print('Field is Not Defined', field)
        yield douban_item
def parse(self, response):
    """Debug scaffold: dumps the '#desktop-4' carousel products to stdout.

    Yields nothing — the item-building code and the Selenium-based variant
    are still commented out below. NOTE(review): ``movie.xpath('//p/text()')``
    inside the loop is an absolute query (it searches the whole document,
    not the current node); a relative ``'.//p/text()'`` is probably what was
    intended — confirm before enabling the commented item code.
    """
    #print(response.body.decode())
    item=DoubanMovieItem()
    #movie_list=json.loads(response.body.decode())
    # chrome_options = Options()
    # chrome_options.add_argument('--headless')
    # chrome_options.add_argument('--disable-gpu')
    #chrome_options.add_argument('--no-sandbox')
    #chrome_driver=r'C:\Users\igogle\Desktop\douban2\douban_movie\chromedriver.exe'
    #browser=webdriver.Chrome(executable_path=chrome_driver)
    #browser.get('https://www.amazon.com/ref=nav_logo')
    #browser.implicitly_wait(5)
    print('s')
    print('s')
    #movies=browser.find_element_by_xpath('//*[@id="desktop-4"]')
    #test=movies[0].get_attribute("li")
    movies= response.xpath('//*[@id="desktop-4"]//li/@data-sgproduct')
    print('s')
    print('s')
    print(movies)
    print(str(len(movies)))
    print('s')
    print('s')
    #print(response.xpath('//*[@id="content"]/div/div[1]/div/div[4]/div/a[1]/p/text()'))
    for movie in movies:
        print('s')
        print(movie.xpath('//p/text()').extract()[0])
        #names=movie.xpath('.//p/text()').extract()[0]
        #item['name']=names
        #item['score']=movie['score']
        #item['url']=movie['url']
        #yield item
def parse_item(self, response):
    """Scrape title, director, actors and rating from a movie detail page."""
    item = DoubanMovieItem()
    sel = Selector(response)
    item['title'] = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
    item['director'] = sel.xpath(
        '//*[@id="info"]/span[1]/span[2]/a/text()').extract()
    item['actor'] = sel.xpath('//*[@id="info"]/span[3]/span[2]/a/text()').extract()
    item['star'] = sel.xpath(
        '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()
    yield item
    # Debug output; runs when the generator is resumed after the yield.
    print(item['title'])
    print(item['director'])
    print(item['actor'])
    print(item['star'])
def parse(self, response):
    """Parse one page of the new_search_subjects JSON feed via regexes.

    Stops this crawl branch when a page yields no titles; otherwise yields
    the batch item and schedules the next page. The global ``START`` is the
    running offset, stepped by 20 per page.

    Fixes: the body was decoded once per field (three times); an empty page
    called ``exit()``, which raises SystemExit inside the generator and
    tears down the whole process instead of just ending this callback.
    """
    time.sleep(random.randint(2, 5))  # crude politeness delay
    global START
    item = DoubanMovieItem()
    body = response.body.decode('utf-8')  # decode once, reuse for all fields
    item['title'] = re.findall(r'title":"(.*?)"', body)
    item['rate'] = re.findall(r'rate":"(.*?)"', body)
    item['url'] = re.findall(r'url":"(.*?)"', body)
    print(item['title'])
    if not item['title']:
        # Empty page: end this branch cleanly (exit() would kill the reactor).
        return
    print('return item success')
    yield item
    print('yield item success')
    next_url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start='
    START = START + 20
    print('next url :' + next_url + str(START))
    yield Request(
        next_url + str(START),
        callback=self.parse,
        headers={
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
        },
        encoding='utf-8')
    print('yield request success')
def next_parse(self, response):
    """Scrape a movie detail page into an item.

    NOTE: the positional ``text()`` indices below (7, 9, 13, 15) are tied to
    the exact layout of the #info block and will break if it changes.
    """
    item2 = DoubanMovieItem()
    print(item2)
    header = response.xpath('//div[@id="content"]/h1')
    item2['_name'] = header.xpath(
        'span[@property="v:itemreviewed"]/text()').getall()
    item2['_year'] = header.xpath('span[@class="year"]/text()').getall()
    item2['_pic'] = response.xpath('//div[@id="mainpic"]/a/img/@src').getall()
    print(item2)
    info = response.xpath('//div[@id="info"]')
    # First two "attrs" links are the directors, the rest are the writers.
    attrs = info.xpath('//span[@class="attrs"]/a/text()').getall()
    item2['_director'] = attrs[:2]
    item2['_writer'] = attrs[2:]
    item2['_cast'] = info.xpath(
        '//span[@class="actor"]/span[@class="attrs"]/span/a/text()').getall()
    item2['_type'] = info.xpath('span[@property="v:genre"]/text()').getall()
    # Bare text nodes of #info, picked out by fixed position.
    plain_text = info.xpath('text()').getall()
    item2['_country'] = plain_text[7]
    item2['_language'] = plain_text[9]
    item2['_premiere'] = info.xpath(
        'span[@property="v:initialReleaseDate"]/text()').getall()
    item2['_episode'] = plain_text[13]
    item2['_runningtime'] = plain_text[15]
    item2['_plot'] = response.xpath(
        '//span[@property="v:summary"]/text()').getall()
    return item2
def parse(self, response):
    """Yield one item per entry of a Top-250 list page, then follow the
    'next' link when present."""
    for entry in Selector(response).xpath('//div[@class="item"]'):
        movie = DoubanMovieItem()
        movie['_id'] = str(ObjectId())  # fresh Mongo-style primary key
        movie['rank'] = entry.xpath('div[@class="pic"]/em/text()').extract_first()
        movie['link'] = entry.xpath('div[@class="pic"]/a/@href').extract_first()
        movie['img'] = entry.xpath('div[@class="pic"]/a/img/@src').extract_first()
        movie['title'] = entry.xpath('div[@class="pic"]/a/img/@alt').extract_first()
        movie['star'] = entry.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract_first()
        movie['quote'] = entry.xpath(
            'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'
        ).extract_first()
        yield movie
    next_page = response.xpath('//span[@class="next"]/a/@href')
    if next_page:
        # The href is relative ("?start=..."), so prefix the list base URL.
        yield Request(
            url='https://movie.douban.com/top250' + next_page[0].extract(),
            callback=self.parse)
def parse_movie(self, response):
    """Parse the movie-list JSON embedded in the response and schedule one
    detail-page request per movie.

    Fixes: ``re.search`` was run with a str pattern against ``response.body``
    (bytes), a TypeError on Python 3; and the original rebuilt dicts by
    string-replacing false/true, stripping backslashes, and ``eval()``-ing
    scraped content — unsafe and fragile. ``json.loads`` handles the JSON
    literals and escape sequences natively.
    """
    import json
    global cookies
    global headers
    # Grab the [{...}] array out of the decoded body text.
    dict_list = json.loads(re.search(r'\[\{.*}]', response.text).group())
    for entry in dict_list:
        item = DoubanMovieItem()
        item['title'] = entry['title']
        item['post_urls'] = [
            entry['cover'],
        ]
        yield scrapy.Request(
            url=entry['url'],
            meta={'item': item},  # hand the item to the detail-page parser
            cookies=cookies,
            headers=headers,
            callback=self.parse_intro,
        )
def parse_movie(self, response):
    """Scrape a douban movie page into a DoubanMovieItem.

    Legacy Python 2 spider: relies on ``HtmlXPathSelector`` and ``unicode``.
    Returns a one-element list as older scrapy callbacks did.
    """
    hxs = HtmlXPathSelector(response)

    def joined(expr):
        # Join every match of the expression and strip surrounding space.
        return "".join(hxs.select(expr).extract()).strip()

    movie = DoubanMovieItem()
    movie['url_in_douban'] = unicode(response.url)
    movie['name_in_douban'] = joined("//span[@property='v:itemreviewed']/text()")
    # The year is wrapped in parentheses, e.g. "(1994)" -> "1994".
    movie['year'] = joined("//span[@class='year']/text()")[1:-1]
    movie['length'] = joined("//span[@property='v:runtime']/@content")
    movie['url_in_imdb'] = joined("//div[@id='info']/a[last()]/@href")
    movie['score'] = joined(
        "//strong[@class='ll rating_num'][@property='v:average']/text()")
    movie['scored_num'] = joined("//span[@property='v:votes']/text()")
    # Pair each tag with its usage count; counts look like "(123)".
    tag_names = hxs.select("//div[@class='tags-body']/a/text()").extract()
    tag_times = hxs.select("//div[@class='tags-body']/a/span/text()").extract()
    movie['tags'] = {
        tag_names[i]: tag_times[i][1:-1] for i in range(len(tag_names))
    }
    return [movie]
def parse_movie(self, response):
    """Scrape one douban movie page into a DoubanMovieItem.

    All values are stored as raw ``extract()`` lists. The commented-out
    country/language/rating-percentage extraction of the original is
    omitted here (it was dead code).
    """
    print(response.status)
    print(response.xpath('//li/span[@class="rec"]/@id'))
    print(response.xpath('//span[@class="rating_per"]/text()'))
    _setDNSCache()

    movie_item = DoubanMovieItem()
    # Field name -> XPath expression, applied uniformly below.
    field_xpaths = {
        'movie_id': '//li/span[@class="rec"]/@id',
        'movie_title': '//*[@id="content"]/h1/span[1]',
        'release_date': './/h1/span[@class="year"]/text()',
        'directedBy': './/a[@rel="v:directedBy"]/text()',
        'starring': './/a[@rel="v:starring"]/text()',
        'genre': './/span[@property="v:genre"]/text()',
        'runtime': './/span[@property="v:runtime"]/text()',
        'rating_num': './/strong[@class="ll rating_num"]/text()',
        'vote_num': './/span[@property="v:votes"]/text()',
    }
    for field, expr in field_xpaths.items():
        movie_item[field] = response.xpath(expr).extract()
    # Synopsis: prefer the full "all hidden" text; fall back to v:summary.
    intro = response.xpath('.//span[@class="all hidden"]/text()').extract()
    if len(intro):
        movie_item['intro'] = intro
    else:
        movie_item['intro'] = response.xpath(
            './/span[@property="v:summary"]/text()').extract()
    yield movie_item
def parse(self, response):
    """Yield a title/rate item for every movie in the JSON response.

    Fix: the original created ONE DoubanMovieItem before the loop and
    re-yielded the same instance after mutating it — every consumer holding
    a reference saw the last movie's values. Each subject now gets its own
    item.
    """
    movie_dict = json.loads(response.text)
    for one_movie in movie_dict["subjects"]:
        item = DoubanMovieItem()
        item["title"] = one_movie["title"]
        item["rate"] = one_movie["rate"]
        yield item
def parse_item(self, response):
    """Load one item per list row via the project's ItemLoader, stamping
    each with the crawl time."""
    rows = response.xpath(
        '//div[@class="body-bg"]/div[@class="w1000"]/div/div/ul/li')
    for row in rows:
        loader = DoubanMovieItemLoader(DoubanMovieItem(), row)
        loader.add_xpath('ebtang_id', './a/@href')
        loader.add_xpath('title', './a/text()')
        loader.add_xpath('date', './span/text()')
        loader.add_value('crawl_time', datetime.now())
        yield loader.load_item()
def parse(self, response):
    """Map each entry of the JSON 'subjects' array to a DoubanMovieItem."""
    content = json.loads(response.body.decode('utf-8'))
    for subject in content['subjects']:
        item = DoubanMovieItem()
        item['movie_info'] = subject['url']
        item['movie_pic'] = subject['cover']
        item['movie_title'] = subject['title']
        item['movie_score'] = subject['rate']
        yield item
def parse_movie_item(self, response):
    """Return an item with url, name, summary and average score scraped
    from a movie detail page."""
    item = DoubanMovieItem()
    item['url'] = response.url
    # Each remaining field is the first text node matching its expression.
    xpaths = {
        'name': '//span[@property="v:itemreviewed"]/text()',
        'summary': '//span[@property="v:summary"]/text()',
        'score': '//strong[@property="v:average"]/text()',
    }
    for field, expr in xpaths.items():
        item[field] = response.xpath(expr).extract_first()
    return item
def parse(self, response):
    """For each list entry, record the title and the short-comments URL,
    then follow that URL with the item attached."""
    for movie in Selector(response=response).xpath('//div[@class="hd"]'):
        item = DoubanMovieItem()
        item['title'] = movie.xpath('./a/span[1]/text()').extract()
        # Detail link + suffix selecting the "watched" short comments.
        comments_url = str(movie.xpath('./a/@href').extract()[0]) + 'comments?status=P'
        item['link'] = comments_url
        yield scrapy.Request(url=comments_url,
                             meta={'item': item},
                             callback=self.parse2)
def parse_item(self, response):
    """Build a DoubanMovieItem (url, name, summary, score) from a detail page.

    Fixes: the original called ``.strip()`` directly on ``extract_first()``,
    which returns None when the summary node is absent (AttributeError), and
    ``float()`` on a possibly-None score (TypeError). Both are now guarded;
    missing values pass through as None.
    """
    i = {}
    i['url'] = response.url
    i['name'] = response.xpath(
        '//span[@property="v:itemreviewed"]/text()').extract_first()
    summary = response.xpath(
        '//span[@property="v:summary"]/text()').extract_first()
    i['summary'] = summary.strip() if summary else summary
    score = response.xpath('//strong/text()').extract_first()
    i['score'] = float(score) if score else None
    return DoubanMovieItem(i)
def parse_item(self, response):
    """Populate one ItemLoader per Top-250 row (mixing XPath and CSS
    selectors as the page structure dictates) and yield the loaded item."""
    for row in response.xpath('//div[@id="content"]/div/div[1]/ol/li'):
        loader = DoubanMovieItemLoader(DoubanMovieItem(), row)
        loader.add_xpath('rank', './div/div/em/text()')
        loader.add_xpath('picture', './div/div/a/img/@src')
        loader.add_xpath('title', './div/div/div/a/span/text()')
        loader.add_xpath('info', './div/div/div/p/text()')
        loader.add_css('star', 'div.star span.rating_num::text')
        loader.add_xpath('people', './div/div/div/div/span[4]/text()')
        loader.add_css('quote', 'p.quote > span.inq::text')
        loader.add_value('crawl_time', datetime.now())
        yield loader.load_item()
def parse_page(self, response):
    """Scrape a Top-250 detail page with BeautifulSoup into an item.

    Fixes: the handler read ``except exception as e`` (lowercase), which is
    itself a NameError at handling time and masked the real parse error —
    now ``except Exception``. The local ``type`` (shadowing the builtin) is
    renamed ``movie_type``; the item key 'type' is unchanged.
    """
    item = DoubanMovieItem()
    try:
        soup = BeautifulSoup(response.body,
                             'html.parser',
                             from_encoding='utf-8')
        movie_name_tag = soup.find('div', id='content').findChild('h1')
        no = soup.find('span', 'top250-no').get_text()
        # Main title + original-language title are the first two <h1> children.
        movie_name = movie_name_tag.findChildren()[0].get_text(
        ) + movie_name_tag.findChildren()[1].get_text()
        director = soup.find('a', rel='v:directedBy').get_text()
        writer = soup.find('span', text='编剧').next_sibling.next_sibling.text
        actor = '/'.join(star.text
                         for star in soup.findAll('a', rel='v:starring'))
        movie_type = '/'.join(
            genre.text for genre in soup.findAll('span', property='v:genre'))
        region = soup.find('span', text='制片国家/地区:').next_sibling
        language = soup.find('span', text='语言:').next_sibling
        date = soup.find('span', property='v:initialReleaseDate').text
        length_tag = soup.find('span', property='v:runtime')
        # Some pages append extra runtime text after the span, before <br/>.
        if str(length_tag.next_sibling) != '<br/>':
            length = length_tag.text + str(length_tag.next_sibling)
        else:
            length = length_tag.text
        another_name = soup.find('span', text='又名:').next_sibling
        introduction = soup.find('span', property='v:summary').text
        grade = soup.find('strong', property='v:average').text
        comment_times = soup.find('span', property='v:votes').text
        item['no'] = no
        item['movie_name'] = movie_name
        item['director'] = director
        item['writer'] = writer
        item['actor'] = actor
        item['type'] = movie_type
        item['region'] = region
        item['language'] = language
        item['date'] = date
        item['length'] = length
        item['another_name'] = another_name
        item['introduction'] = introduction
        item['grade'] = grade
        item['comment_times'] = comment_times
    except Exception as e:
        print('Parse error:', e)
    return item
def parse_item(self, response):
    """Yield url/name/summary/score scraped from a movie detail page.

    Consistency fix: 'name' previously used ``.extract()`` and so stored a
    one-element list, while summary/score (and the parallel
    parse_movie_item elsewhere in the project) store scalars via
    ``.extract_first()``. All three fields are now scalars.
    """
    yield DoubanMovieItem({
        'url':
        response.url,
        'name':
        response.xpath(
            '//span[@property="v:itemreviewed"]/text()').extract_first(),
        'summary':
        response.xpath(
            '//span[@property="v:summary"]/text()').extract_first(),
        'score':
        response.xpath(
            '//strong[@property="v:average"]/text()').extract_first()
    })
def parse(self, response):
    """Yield an item for every movie whose id is new to redis; count
    duplicates in self.repeat_count."""
    datas = json.loads(response.text).get('data')
    logger.info("request successful")
    for data in datas:
        item = DoubanMovieItem()
        # Copy every declared item field present in the payload.
        for field in item.fields:
            if field in data:
                item[field] = data[field]
        movie_id = data['id']
        item['movie_id'] = movie_id
        # add_movie_id returns truthy only when the id was not seen before.
        if redis_db.add_movie_id(movie_id):
            yield item
        else:
            self.repeat_count += 1
def parse(self, response):
    """Yield a poster url/img item per movie, follow each detail page, then
    the list's 'next' link.

    Fix: the item was created once outside the loop, so every yielded item
    and every ``meta={'item': ...}`` reference pointed at the same instance,
    which later iterations kept mutating. Each entry now gets its own
    DoubanMovieItem.
    """
    divlist = response.xpath(r"//div[@class='pic']")
    for div in divlist:
        item = DoubanMovieItem()
        item['Url'] = div.xpath("a/@href").extract_first()
        item['Img_url'] = div.xpath('a/img/@src').extract_first()
        yield item
        yield scrapy.Request(url=item['Url'],
                             meta={'item': item},
                             callback=self.parse_movie)
    next_href = response.xpath('//link[@rel="next"]/@href').extract()
    if next_href:
        nexturl = 'http://movie.douban.com/top250' + next_href[0]
        yield scrapy.Request(url=nexturl, headers=self.headers)
def parse(self, response):
    """Collect all names, urls and ratings from a list page into one item.

    Idiom fix: ``extract()`` already returns a list; the original copied
    each one with a no-op comprehension (``[n for n in xs]``).
    """
    sel = Selector(response)
    item = DoubanMovieItem()
    item['movie_name'] = sel.xpath("//div[@class='pl2']/a/text()").extract()
    item['movie_url'] = sel.xpath("//div[@class='pl2']/a/@href").extract()
    item['movie_star'] = sel.xpath(
        "//div[@class='pl2']/div/span[@class='rating_nums']/text()"
    ).extract()
    yield item
def parse(self, response):
    """Yield rank/title/vote-count per movie and paginate while data remains.

    Fixes: each movie now gets its own item (one shared instance was
    mutated and re-yielded before), and the next page is only requested
    when the current page actually returned data — which is what the
    original Chinese comment said, but the code did unconditionally.
    """
    datas = json.loads(response.body)
    if datas:
        for data in datas:
            item = DoubanMovieItem()
            item['ranking'] = data['rank']
            item['movie_name'] = data['title']
            item['score_num'] = data['vote_count']
            yield item
        # Bump the start= offset by one page (20 entries) and re-request.
        page_num = re.search(r'start=(\d+)', response.url).group(1)
        page_num = 'start=' + str(int(page_num) + 20)
        next_url = re.sub(r'start=\d+', page_num, response.url)
        yield Request(next_url, headers=self.headers)
def parse(self, response):
    """Yield rank/score/name for every Top-250 entry, then follow the
    'next' page link.

    Fix: the item is now created per movie — the original reused a single
    instance across all yields, so consumers holding references saw only
    the last movie's values.
    """
    movies = response.xpath('//ol[@class="grid_view"]/li')
    for movie in movies:
        item = DoubanMovieItem()
        item['rank'] = movie.xpath(
            './/div[@class="pic"]/em/text()').extract()[0]
        item['score'] = movie.xpath(
            './/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        item['name'] = movie.xpath(
            './/div[@class="hd"]/a/span[1]/text()').extract()[0]
        yield item
    next_url = response.xpath('//span[@class="next"]/a/@href').extract()
    if next_url:
        next_url = 'https://movie.douban.com/top250' + next_url[0]
        yield Request(next_url, headers=self.headers)
def parse_item(self, response):
    """Scrape a movie subject page into an item (legacy Python 2 spider).

    NOTE(review): once self.count reaches MAX_MOVIE this deliberately spins
    forever printing a quit reminder instead of stopping the spider —
    confirm that is intended.
    """
    # hxs=HtmlXPathSelector(response)
    # sel=Selector(response)
    item = DoubanMovieItem()
    # Fields declared in items.py:
    # url, ID, name, director, writer, role, types, summary
    # Canonical subject URL and its numeric id (last path segment).
    item['url'] = re.match(
        string=''.join(response.url),
        pattern='(https://movie.douban.com/subject/\d+)/.*').group(1)
    item['movieid'] = item['url'].split('/')[-1]
    # Ids of every linked subject on the page (related movies), '/'-joined.
    item['ID'] = '/'.join(
        response.xpath('//*/a[contains(@href,"subject")]/@href').re(
            'movie.douban.com/subject/(\d+)/(?:\?from|$)'))
    # item['ID']=''.join(response.xpath('//*[@id="content"]/div/div[1]/div[1]/div[3]/ul/li[5]/span/@id').extract())
    item['name'] = ''.join(
        response.xpath('//*[@id="content"]/h1/span[1]/text()').extract())
    item['director'] = '/'.join(
        response.xpath(
            '//*[@id="info"]/span[1]/span[2]/a/text()').extract())
    item['writer'] = '/'.join(
        response.xpath(
            '//*[@id="info"]/span[2]/span[2]/a/text()').extract())
    item['role'] = '/'.join(
        response.xpath(
            '//*[@id="info"]/span[3]/span[2]/a/text()').extract())
    item['types'] = '/'.join(
        response.xpath('//span[@property="v:genre"]/text()').extract())
    item['summary'] = ''.join(
        response.xpath('//span[@property="v:summary"]/text()').extract())
    # Normalise whitespace and strip markup / special characters.
    item['summary'] = item['summary'].strip().\
        replace('<br />', '').replace('\t', ' ').\
        replace('\n', ' ').replace('&', '').replace('"','').replace(u'\u3000', '')
    item['summary'] = re.sub(r' {1,}', ' ', item['summary'])
    if self.count == self.MAX_MOVIE:
        # Cap reached: nag the operator forever (Python 2 print statement).
        while True:
            print 'You have got {0} movies, please quit!'.format(
                self.MAX_MOVIE)
            time.sleep(2)
    self.count += 1
    yield item
def parse(self, response):
    """Scrape the '.list-wp' movie cards into one aggregate item (parallel
    name/score/pic/link lists), then follow every detail link.

    NOTE(review): the dangling triple-quote at the end appears to open a
    commented-out section whose remainder is not visible in this chunk —
    do not remove it without seeing the rest of the file.
    """
    print(response.text)
    item = DoubanMovieItem()
    selector = response.xpath('//div[@class="list-wp"]/a')
    # Parallel lists: index i of each list describes the same card.
    item['name'] = selector.xpath('p/span[@class="title"]/text()').getall()
    item['score'] = selector.xpath('p/span[@class="rate"]/text()').getall()
    item['pic'] = selector.xpath('div/span/img/@src').getall()
    item['link'] = selector.xpath('@href').getall()
    yield item
    print(type(item['link']), len(item['link']))
    for link in item['link']:
        # depth marker tells next_parse this request is one hop deep
        yield response.follow(link, callback=self.next_parse, meta={'depth': 1})
    '''
def parse(self, response):
    """Emit one fully-populated item per Top-250 list row, then follow the
    'next' pagination link."""
    for movie in response.xpath('//ol[@class="grid_view"]//li'):
        item = DoubanMovieItem()
        item['movie_pic'] = movie.xpath(
            './/div[@class="item"]//div[@class="pic"]//img/@src'
        ).extract_first()
        item['movie_title'] = movie.xpath(
            './/div[@class="item"]//div[@class="hd"]//a//span[@class="title"]/text()'
        ).extract_first()
        # 2nd/3rd spans hold alternate titles; either may be missing.
        span2 = movie.xpath(
            './/div[@class="item"]//div[@class="hd"]//a//span[2]/text()'
        ).extract_first()
        span3 = movie.xpath(
            './/div[@class="item"]//div[@class="hd"]//a//span[3]/text()'
        ).extract_first()
        other = (span2 if span2 else '') + (span3 if span3 else '')
        item['movie_other'] = other.replace(u'\xa0', u' ')
        item['movie_introduce'] = movie.xpath(
            'normalize-space(.//div[@class="item"]//div[@class="bd"]//p[1]/text())'
        ).extract_first().replace(u'\xa0', u' ')
        item['movie_star'] = movie.xpath(
            './/div[@class="bd"]//div[@class="star"]/span[2]/text()'
        ).extract_first()
        item['evaluate_num'] = movie.xpath(
            './/div[@class="bd"]//div[@class="star"]/span[4]/text()'
        ).extract_first()
        item['movie_description'] = movie.xpath(
            './/div[@class="bd"]//p[@class="quote"]/span/text()'
        ).extract_first()
        yield item
    next_url = response.xpath('//span[@class="next"]/a/@href').extract_first()
    if next_url:
        print(self.start_urls[0] + next_url)
        yield scrapy.Request(self.start_urls[0] + next_url,
                             callback=self.parse)
def parse(self, response):
    """Yield film items from the JSON feed, download each cover image, and
    paginate (20 per page) up to offset 1000.

    Fix: each film now gets its own item instance — the original mutated
    and re-yielded one shared DoubanMovieItem.
    """
    film_list = json.loads(response.body.decode())
    # Stop when the feed is exhausted or the offset cap is reached.
    if film_list == [] or self.offset > 1000:
        return
    for film in film_list['data']:
        item = DoubanMovieItem()
        item['film_name'] = film['title']
        item['film_directors'] = film['directors']
        item['film_rate'] = film['rate']
        item['film_actors'] = film['casts']
        item['film_image_url'] = film['cover']
        # Save the cover under the configured path, named after the film.
        urllib.request.urlretrieve(
            item['film_image_url'],
            self.file_path + "/" + item['film_name'] + "." +
            item['film_image_url'].split(".")[-1])
        yield item
    self.offset = self.offset + 20
    new_url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=1&start=' + str(
        self.offset)
    yield scrapy.Request(url=new_url, callback=self.parse)
def parse(self, response):
    """Yield name/star/evaluation/introduction for each Top-250 entry,
    then follow the pagination link."""
    for block in response.xpath('//ol[@class="grid_view"]/li'):
        item = DoubanMovieItem()
        item['name'] = block.css('span.title::text').extract_first()
        item['star'] = block.xpath(
            ".//span[@class='rating_num']/text()").extract_first()
        raw_eval = block.xpath(
            ".//div[@class='star']/span[4]/text()").extract_first()
        # group() yields the whole matched string (groups() would be a tuple)
        item['evaluation'] = self.eval_re.search(raw_eval).group()
        item['introduction'] = block.css('span.inq::text').extract_first()
        yield item
    # href of the "next" anchor, e.g. "?start=25&filter=" — relative URL.
    next_url = response.css('span.next > a::attr(href)').extract_first()
    if next_url:
        yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
def parse_item(self, response):
    """Scrape a full movie record from a douban subject page.

    Legacy Python 2 spider (``except Exception, e`` syntax). Collects id,
    name, year, rating and vote count, hot short comments (text + votes +
    commenter star rating), seen/wish counts, and the #info block fields
    (director/writer/actors/runtime/genre plus regex-extracted
    language/region/dialect). Any failure increments the global
    failed_count and is swallowed so the crawl continues.
    """
    global failed_count
    global real_parse_count
    item = DoubanMovieItem()
    try:
        real_parse_count += 1
        print("real parse count = %d" % (real_parse_count))
        # get movie id: second-to-last path segment of the subject URL
        url = response.url
        id = url.split('/')[-2].strip()
        item["movie_id"] = id
        # get movie name
        name = response.xpath(
            '//div[@id="content"]/h1/span[1]/text()').extract_first()
        item["movie_name"] = name.strip() if name else ""
        # get movie year; strips both ASCII and full-width parentheses
        year = response.xpath(
            '//div[@id="content"]/h1/span[2]/text()').extract_first()
        item["movie_year"] = year.strip("()() ") if year else ""
        # get movie rate; "-1" is the sentinel for a missing rating
        rate = response.xpath(
            "//div[@class='rating_self clearfix']/strong/text()"
        ).extract_first()
        item["movie_rate"] = float(rate.strip() if rate else "-1")
        # get movie rate people
        rate_num = response.xpath(
            "//span[@property='v:votes']/text()").extract_first()
        item["movie_rate_people"] = int(
            rate_num.strip() if rate_num else "-1")
        # get hot short comments: three parallel lists scraped separately
        comments = response.xpath(
            "//div[@id='hot-comments']//div[@class='comment-item']//div[@class='comment']/p/text()"
        ).extract()
        votes = response.xpath(
            "//div[@id='hot-comments']//div[@class='comment-item']//div[@class='comment']//span[@class='votes pr5']/text()"
        ).extract()
        rates = response.xpath(
            "//div[@id='hot-comments']//div[@class='comment-item']//span[@class='comment-info']/span[1]/@title"
        ).extract()
        # only store the comments when the three lists line up 1:1
        if len(comments) == len(votes) and len(votes) == len(rates):
            commentsarray = []
            for i in range(len(votes)):
                short_comments = {}
                short_comments['comment'] = comments[i]
                short_comments['votes'] = int(votes[i])
                short_comments['rates'] = rates[i]
                commentsarray.append(short_comments)
            item["movie_hot_short_comments"] = commentsarray
        # seen / wish counts; the [:-3] slice drops the 3-char Chinese
        # suffix ("人看过" / "人想看") leaving the bare number
        seenwish = response.xpath(
            "//div[@class='subject-others-interests-ft']//a//text()"
        ).extract()
        if seenwish and len(seenwish) == 2:
            item['movie_seen'] = int(seenwish[0][:-3])
            item['movie_wishes'] = int(seenwish[1][:-3])
        # get movie info
        info = response.xpath("//div[@id='info']")
        infoarray = info.extract()
        infostr = ''.join(infoarray).strip()
        director = info.xpath("span[1]/span[2]/a/text()").extract()
        self.add_array("movie_director", director, item)
        writor = info.xpath("span[2]/span[2]/a/text()").extract()
        self.add_array("movie_writor", writor, item)
        actors = info.xpath("span[3]/span[2]/a/text()").extract()
        self.add_array("movie_actors", actors, item)
        time = info.xpath(
            "span[@property='v:runtime']/@content").extract_first()
        item["movie_time"] = float(time.strip() if time else "-1")
        types = info.xpath("span[@property='v:genre']/text()").extract()
        self.add_array("movie_type", types, item)
        # language/region/dialect are plain text inside #info — pulled out
        # with module-level regex patterns; any mismatch is ignored
        try:
            lang = re.search(language_pattern, infostr)
            if lang:
                language = lang.group(1).strip()
                item["movie_language"] = language.strip()
        except:
            pass
        try:
            regionmatch = re.search(region_pattern, infostr)
            if regionmatch:
                region = regionmatch.group(1).strip()
                item["movie_region"] = region.strip()
        except:
            pass
        try:
            dialectmatch = re.search(dialect_pattern, infostr)
            if dialectmatch:
                dialect = dialectmatch.group(1).strip()
                item["movie_dialect"] = dialect.strip()
        except:
            pass
        desc = response.xpath("//span[@property='v:summary']/node()"
                              ).extract_first().strip()
        item["movie_desc"] = desc.strip() if desc else ""
        tags = response.xpath(
            "//div[@class='tags-body']/a/text()").extract()
        self.add_array("movie_tags", tags, item)
        pic = response.xpath(
            "//div[@id='mainpic']/a/img/@src").extract_first()
        item["movie_pic_url"] = pic
        yield item
    except Exception, e:
        # do nothing
        logging.info("Parse error:%s" % (str(e)))
        print("failed_count = %d" % (failed_count + 1))
        failed_count += 1
        pass
def parse_content(self, response):
    """Parse a movie detail page, combining list-page data held in
    ``self.movie`` with fields scraped from the page's #info block.

    NOTE(review): the populated item is only printed at the end — it is
    never yielded or returned, so nothing reaches the pipelines; and the
    early ``return [-2]`` sentinel is not a valid scrapy callback result.
    Confirm whether this method is still work-in-progress.
    """
    # Unpack the list-page record carried on the spider instance.
    movieid = self.movie[0]
    tag = self.movie[1]
    title = self.movie[2]
    director = self.movie[3]
    actor = self.movie[4]
    rate = self.movie[5]
    star = self.movie[6]
    cover = self.movie[7]
    html = BeautifulSoup(response.body, 'lxml')
    info = html.select('#info')
    if len(info) == 0:
        # No #info block (likely an anti-bot page): dump and bail out.
        print(response.text)
        return [-2]
    info = html.select('#info')[0].get_text().split('\n')
    print(info)
    # print(len(info))
    category = ''
    district = ''
    showtime = ''
    length = ''
    # Each #info line is "label:value" (full-width colon); pick the fields
    # we keep. Labels: 类型=genre, 制片国家/地区=region, 上映日期=release
    # date (year only), 片长=runtime (digits only).
    for item in info:
        item = item.split(':')
        if item[0] == '类型':
            category = item[-1].strip()
        elif item[0] == '制片国家/地区':
            district = item[-1].strip()
        elif item[0] == '上映日期':
            showtime = item[-1].strip().split('-')[0]
        elif item[0] == '片长':
            length = item[-1].strip()
            length = re.findall('\d+', length)[0]
    category = category.replace(r'/', ',')
    # Clamp to what the storage columns can hold.
    if len(district) > 0:
        district = district[:50]
    if len(category) > 0:
        category = category[:30]
    rate_count = html.select(
        '#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > div > div.rating_sum > a > span'
    )[0].get_text()
    # interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-child(1) > span.rating_per
    # Per-star percentage breakdown, 5 stars down to 1 ("%"-suffix removed).
    rate5 = html.select(
        '#interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-of-type(1) > span.rating_per'
    )[0].get_text().split('%')[0]
    rate4 = html.select(
        '#interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-of-type(2) > span.rating_per'
    )[0].get_text().split('%')[0]
    rate3 = html.select(
        '#interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-of-type(3) > span.rating_per'
    )[0].get_text().split('%')[0]
    rate2 = html.select(
        '#interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-of-type(4) > span.rating_per'
    )[0].get_text().split('%')[0]
    rate1 = html.select(
        '#interest_sectl > div.rating_wrap.clearbox > div.ratings-on-weight > div:nth-of-type(5) > span.rating_per'
    )[0].get_text().split('%')[0]
    item = DoubanMovieItem()
    item['movieid'] = movieid
    item['title'] = title
    item['tag'] = tag
    item['directors'] = director
    item['actors'] = actor
    item['showtime'] = showtime
    item['length'] = length
    item['district'] = district
    item['category'] = category
    item['star'] = star
    item['rate'] = rate
    item['rate_count'] = rate_count
    item['rate5'] = rate5
    item['rate4'] = rate4
    item['rate3'] = rate3
    item['rate2'] = rate2
    item['rate1'] = rate1
    item['cover'] = cover
    print('###### ')
    print(item)
    print('######')