def parse_normal_showing(self, response):
    """Count purchased seats on the order page and yield the booking item."""
    loader = init_show_booking_loader(
        response=response, item=response.meta["data_proto"])
    # already-purchased seats carry this alt text on their marker image
    sold_markers = response.css('[alt~="購入済(選択不可)"]')
    loader.add_value('book_seat_count', len(sold_markers))
    loader.add_time_data()
    yield loader.load_item()
def parse_normal_showing(self, response):
    """Count booked seats on the order page and yield the booking item."""
    # booked seats are rendered with the "seat_no.gif" marker image
    booked = len(response.xpath('//img[contains(@src,"seat_no.gif")]'))
    loader = init_show_booking_loader(
        response=response, item=response.meta["data_proto"])
    loader.add_value('book_seat_count', booked)
    loader.add_time_data()
    yield loader.load_item()
def parse_normal_showing(self, response):
    """Count booked seats on the order page and yield the booking item.

    some cinemas are free seat ordered, so data may not be crawled
    """
    loader = init_show_booking_loader(
        response=response, item=response.meta["data_proto"])
    # booked seats are rendered with the "seat_102.gif" marker image
    sold_markers = response.xpath('//img[contains(@src,"seat_102.gif")]')
    loader.add_value('book_seat_count', len(sold_markers))
    loader.add_time_data()
    yield loader.load_item()
def parse_showing(self, response, curr_showing, data_proto, result_list):
    """Parse one showing row from the schedule page.

    Appends to ``result_list`` either a showing item (when booking data
    is not crawled), a finished booking item (sold out / not sold), or a
    scrapy Request for the order page to count booked seats.

    :param curr_showing: selector for a single showing row.
    :param data_proto: loader holding cinema/movie data shared by all
        showings on this page.
    :param result_list: output accumulator for items and Requests.
    """
    def parse_time(time_str):
        # "HH:MM" -> (hour, minute) as ints
        time = time_str.split(":")
        return (int(time[0]), int(time[1]))
    showing_data_proto = ShowingLoader(response=response)
    # seed the loader with the shared prototype item
    showing_data_proto.add_value(None, data_proto.load_item())
    screen_name = curr_showing.xpath('./th/div/text()').extract_first()
    showing_data_proto.add_screen_name(screen_name)
    start_time = curr_showing.xpath(
        './td[@class="time"]/div/text()').extract_first()
    start_hour, start_minute = parse_time(start_time)
    showing_data_proto.add_value(
        'start_time', self.get_time_from_text(start_hour, start_minute))
    # first character of the end-time text is a separator, drop it
    end_time = curr_showing.xpath(
        './td[@class="time"]/div/span/text()').extract_first()[1:]
    end_hour, end_minute = parse_time(end_time)
    showing_data_proto.add_value(
        'end_time', self.get_time_from_text(end_hour, end_minute))
    showing_data_proto.add_value('seat_type', 'NormalSeat')
    # query screen number from database
    showing_data_proto.add_total_seat_count()
    # check whether need to continue crawl booking data or stop now
    if not self.crawl_booking_data:
        result_list.append(showing_data_proto.load_item())
        return
    booking_data_proto = init_show_booking_loader(response=response)
    booking_data_proto.add_value('showing', showing_data_proto.load_item())
    # alt text of the vacancy icon encodes the booking state
    book_status = curr_showing.xpath(
        './/img[contains(@src,"icon_seat_vacant")]/@alt').extract_first()
    booking_data_proto.add_book_status(book_status, util=KoronaUtil)
    book_status = booking_data_proto.get_output_value('book_status')
    if book_status in ['SoldOut', 'NotSold']:
        # sold out or not sold: seat count is already known, no need to
        # visit the order page
        total_seat_count = showing_data_proto.get_output_value(
            'total_seat_count')
        book_seat_count = (total_seat_count
                           if book_status == 'SoldOut' else 0)
        booking_data_proto.add_value('book_seat_count', book_seat_count)
        booking_data_proto.add_time_data()
        result_list.append(booking_data_proto.load_item())
        return
    else:
        # normal, need to crawl book number on order page
        url = curr_showing.xpath(
            './td[@class="btnReservation"]/div/a/@href').extract_first()
        request = scrapy.Request(url, callback=self.parse_normal_showing)
        request.meta["data_proto"] = booking_data_proto.load_item()
        result_list.append(request)
def parse_normal_showing(self, response):
    """Read seat totals embedded in the page javascript and yield the
    booking item with the derived booked seat count."""
    # seat counts live inside an inline script as quoted digit strings
    script_text = response.xpath(
        '//script[contains(., "seat_info")]/text()').extract_first()

    def extract_count(pattern):
        # pull one quoted integer out of the script text
        return int(re.search(pattern, script_text).group(1))

    total_seat_count = extract_count(r'"total_seats":"(\d+)"')
    unsold_seat_count = extract_count(r'"unsold_seat_number":"(\d+)"')
    loader = init_show_booking_loader(
        response=response, item=response.meta["data_proto"])
    loader.add_value('book_seat_count',
                     total_seat_count - unsold_seat_count)
    loader.add_time_data()
    yield loader.load_item()
def parse_showing_seat_json(self, response):
    """Derive the booked seat count from the empty-seat json api reply."""
    try:
        empty_seats = json.loads(response.text)
    except json.JSONDecodeError:
        # malformed payload: skip this showing
        return
    loader = init_show_booking_loader(
        response=response, item=response.meta["data_proto"])
    # api lists currently empty seats, so booked = total - empty
    total = loader.get_output_value('showing')['total_seat_count']
    loader.add_value('book_seat_count', total - len(empty_seats))
    loader.add_time_data()
    yield loader.load_item()
def parse_showing(self, response, curr_showing, data_proto, result_list):
    """Parse one showing node from the schedule page.

    Appends to ``result_list`` either a showing item (when booking data
    is not crawled), a finished booking item (sold out / not sold), or a
    scrapy Request for the order page to count booked seats.
    """
    def parse_time(time_str):
        # "HH:MM" -> (hour, minute) as ints
        time = time_str.split(":")
        return (int(time[0]), int(time[1]))
    showing_data_proto = ShowingLoader(response=response)
    # seed the loader with the shared prototype item
    showing_data_proto.add_value(None, data_proto.load_item())
    screen_name = curr_showing.xpath('./p/text()').extract_first()
    showing_data_proto.add_screen_name(screen_name)
    start_time = curr_showing.xpath(
        './/span[@class="strong fontXL"]/text()').extract_first()
    start_hour, start_minute = parse_time(start_time)
    showing_data_proto.add_value('start_time', self.get_time_from_text(
        start_hour, start_minute))
    # end time is the sibling text node; first char is a separator
    end_time = curr_showing.xpath(
        './/span[@class="strong fontXL"]/../text()').extract_first()[1:]
    end_hour, end_minute = parse_time(end_time)
    showing_data_proto.add_value('end_time', self.get_time_from_text(
        end_hour, end_minute))
    showing_data_proto.add_value('seat_type', 'NormalSeat')
    # query screen number from database
    showing_data_proto.add_total_seat_count()
    # check whether need to continue crawl booking data or stop now
    if not self.crawl_booking_data:
        result_list.append(showing_data_proto.load_item())
        return
    booking_data_proto = init_show_booking_loader(response=response)
    booking_data_proto.add_value('showing', showing_data_proto.load_item())
    # the status icon's image path encodes the booking state
    book_status = curr_showing.xpath('.//img/@src').extract_first()
    booking_data_proto.add_book_status(book_status, util=MovixUtil)
    book_status = booking_data_proto.get_output_value('book_status')
    if book_status in ['SoldOut', 'NotSold']:
        # sold out or not sold
        total_seat_count = showing_data_proto.get_output_value(
            'total_seat_count')
        book_seat_count = (
            total_seat_count if book_status == 'SoldOut' else 0)
        booking_data_proto.add_value('book_seat_count', book_seat_count)
        booking_data_proto.add_time_data()
        result_list.append(booking_data_proto.load_item())
        return
    else:
        # normal, need to crawl book number on order page
        # order page url is the first quoted argument of the onclick
        # javascript handler
        showing_script = curr_showing.xpath('./@onclick').extract_first()
        url = re.findall(r'\(\'(.+?)\'\,', showing_script)[0]
        request = scrapy.Request(url, callback=self.parse_normal_showing)
        request.meta["data_proto"] = booking_data_proto.load_item()
        result_list.append(request)
def parse_showing(self, response, curr_showing, showing_url_parameter,
                  data_proto, result_list):
    """Parse one showing dict from the schedule json.

    Appends to ``result_list`` either a showing item (when booking data
    is not crawled), a finished booking item (sold out / not sold), or a
    scrapy Request for the order page to count booked seats.

    :param curr_showing: one showing's dict from the schedule json.
    :param showing_url_parameter: url-parameter dict, updated in place
        with this showing's code before building the order-page url.
    """
    def parse_time(time_str):
        """
        ex. "24:40"
        """
        time = time_str.split(":")
        return (int(time[0]), int(time[1]))
    showing_url_parameter['showing_cd'] = curr_showing['code']
    showing_data_proto = ShowingLoader(response=response)
    # seed the loader with the shared prototype item
    showing_data_proto.add_value(None, data_proto.load_item())
    # time like 24:40 can not be directly parsed,
    # so we need to shift time properly
    start_hour, start_minute = parse_time(curr_showing['showingStart'])
    showing_data_proto.add_value('start_time', self.get_time_from_text(
        start_hour, start_minute))
    end_hour, end_minute = parse_time(curr_showing['showingEnd'])
    showing_data_proto.add_value('end_time', self.get_time_from_text(
        end_hour, end_minute))
    showing_data_proto.add_value('seat_type', 'NormalSeat')
    # query screen number from database
    showing_data_proto.add_total_seat_count()
    # check whether need to continue crawl booking data or stop now
    if not self.crawl_booking_data:
        result_list.append(showing_data_proto.load_item())
        return
    booking_data_proto = init_show_booking_loader(response=response)
    booking_data_proto.add_value('showing', showing_data_proto.load_item())
    book_status = curr_showing['unsoldSeatInfo']['unsoldSeatStatus']
    booking_data_proto.add_book_status(book_status, util=TohoUtil)
    book_status = booking_data_proto.get_output_value('book_status')
    if book_status in ['SoldOut', 'NotSold']:
        # sold out or not sold
        total_seat_count = showing_data_proto.get_output_value(
            'total_seat_count')
        book_seat_count = (
            total_seat_count if book_status == 'SoldOut' else 0)
        booking_data_proto.add_value('book_seat_count', book_seat_count)
        booking_data_proto.add_time_data()
        result_list.append(booking_data_proto.load_item())
        return
    else:
        # normal, need to crawl book number on order page
        url = self.generate_showing_url(**showing_url_parameter)
        request = scrapy.Request(url, callback=self.parse_normal_showing)
        request.meta["data_proto"] = booking_data_proto.load_item()
        result_list.append(request)
def parse_seat_json_api(self, response):
    """Count booked seats of every seat category and yield the booking
    item."""
    loader = init_show_booking_loader(
        response=response, item=response.meta["data_proto"])
    # each seat category marks booked seats with its own "-none" class
    booked_selectors = [
        '//a[@class="seat seat-none"]',
        '//a[@class="seat wheelseat-none"]',
        '//a[@class="seat executive-none"]',
    ]
    booked_seat_count = sum(
        len(response.xpath(selector)) for selector in booked_selectors)
    loader.add_value('book_seat_count', booked_seat_count)
    loader.add_time_data()
    yield loader.load_item()
def parse_showing(self, response, curr_showing, data_proto, result_list):
    """Parse one showing dict from the api data.

    Appends to ``result_list`` either a showing item (when booking data
    is not crawled), a finished booking item (sold out / not sold), or a
    scrapy Request for the pre-order page.
    """
    def parse_time(time_str):
        # "HHMM" -> (hour, minute) as ints
        return (int(time_str[:2]), int(time_str[2:]))
    showing_data_proto = ShowingLoader(response=response)
    # seed the loader with the shared prototype item
    showing_data_proto.add_value(None, data_proto.load_item())
    start_hour, start_minute = parse_time(curr_showing['start_time'])
    showing_data_proto.add_value(
        'start_time', self.get_time_from_text(start_hour, start_minute))
    end_hour, end_minute = parse_time(curr_showing['end_time'])
    showing_data_proto.add_value(
        'end_time', self.get_time_from_text(end_hour, end_minute))
    showing_data_proto.add_value('seat_type', 'NormalSeat')
    # TODO get seat type right now
    # query screen number from database
    showing_data_proto.add_total_seat_count()
    # check whether need to continue crawl booking data or stop now
    if not self.crawl_booking_data:
        result_list.append(showing_data_proto.load_item())
        return
    booking_data_proto = init_show_booking_loader(response=response)
    booking_data_proto.add_value('showing', showing_data_proto.load_item())
    booking_data_proto.add_book_status(curr_showing['available'],
                                       util=CinemaSunshineUtil)
    book_status = booking_data_proto.get_output_value('book_status')
    if book_status in ['SoldOut', 'NotSold']:
        # sold out or not sold
        total_seat_count = showing_data_proto.get_output_value(
            'total_seat_count')
        book_seat_count = (total_seat_count
                           if book_status == 'SoldOut' else 0)
        booking_data_proto.add_value('book_seat_count', book_seat_count)
        booking_data_proto.add_time_data()
        result_list.append(booking_data_proto.load_item())
        return
    else:
        # normal, need to crawl book number on order page
        url = curr_showing['url']
        request = scrapy.Request(url, callback=self.parse_pre_ordering)
        request.meta["data_proto"] = booking_data_proto.load_item()
        # isolate cookies per booking request
        request.meta["dont_merge_cookies"] = True
        result_list.append(request)
def parse_normal_showing(self, response):
    """Fix up start/end times from the order page's time label, count
    booked seats, and yield the booking item."""
    loader = init_show_booking_loader(
        response=response, item=response.meta["data_proto"])
    # the order page shows "start - end"; trust it over the schedule page
    time_text = response.xpath(
        '//span[@class="screenTime"]/text()').extract_first()
    time_list = time_text.split('-')
    for field, raw_time in (('start_time', time_list[0]),
                            ('end_time', time_list[1])):
        hour, minute = self.parse_time(raw_time.strip())
        loader.get_output_value('showing')[field] = \
            self.get_time_from_text(hour, minute)
    # booked seats carry the "seatSell seatOff" class
    sold_seats = response.xpath('//li[@class="seatSell seatOff"]')
    loader.add_value('book_seat_count', len(sold_seats))
    loader.add_time_data()
    yield loader.load_item()
def parse_showing(self, response, curr_showing, data_proto, result_list):
    """Parse one showing link from the schedule page.

    Appends to ``result_list`` either a showing item (when booking data
    is not crawled), a finished booking item (sold out / not sold), or a
    scrapy Request for the order page to count booked seats.
    """
    showing_data_proto = ShowingLoader(response=response)
    # seed the loader with the shared prototype item
    showing_data_proto.add_value(None, data_proto.load_item())
    # drop the trailing character of the time text
    # (assumes it is a non-time suffix — TODO confirm against the site)
    start_time = curr_showing.xpath(
        './div/text()').extract_first()[:-1]
    start_hour, start_minute = self.parse_time(start_time)
    showing_data_proto.add_value('start_time', self.get_time_from_text(
        start_hour, start_minute))
    # end time not displayed in schedule page
    showing_data_proto.add_value('seat_type', 'NormalSeat')
    # query screen number from database
    showing_data_proto.add_total_seat_count()
    # check whether need to continue crawl booking data or stop now
    if not self.crawl_booking_data:
        result_list.append(showing_data_proto.load_item())
        return
    booking_data_proto = init_show_booking_loader(response=response)
    booking_data_proto.add_value('showing', showing_data_proto.load_item())
    # css class of the time div encodes the booking state
    book_status = curr_showing.xpath('./div/@class').extract_first()
    booking_data_proto.add_book_status(book_status, util=KinezoUtil)
    book_status = booking_data_proto.get_output_value('book_status')
    if book_status in ['SoldOut', 'NotSold']:
        # sold out or not sold
        total_seat_count = showing_data_proto.get_output_value(
            'total_seat_count')
        book_seat_count = (
            total_seat_count if book_status == 'SoldOut' else 0)
        booking_data_proto.add_value('book_seat_count', book_seat_count)
        booking_data_proto.add_time_data()
        result_list.append(booking_data_proto.load_item())
        return
    else:
        # normal, need to crawl book number on order page
        url = curr_showing.xpath('./@href').extract_first()
        url = response.urljoin(url)
        request = scrapy.Request(url, callback=self.parse_normal_showing)
        request.meta["data_proto"] = booking_data_proto.load_item()
        result_list.append(request)
def parse_showing_json(self, response):
    """
    extract showing info from json data

    Strips the javascript wrapper around the WMC_E_DATA payload, decodes
    it with demjson (the payload is not strict json), and counts booked
    seats in the first seat map.
    """
    # TODO D-Box seat need check if handled right
    # BUG FIX: re.sub's 4th positional argument is ``count``, not
    # ``flags`` — the original passed re.DOTALL (16) as a replacement
    # limit, so only the first 16 matches were ever removed. Pass
    # ``flags=`` explicitly. (copy.deepcopy of an immutable str was
    # also a no-op and is dropped.)
    script_text = response.text
    script_text = re.sub(r'[\t\r\n]', '', script_text, flags=re.DOTALL)
    # remove the leading "if( typeof ... ){}}WMC_E_DATA = " wrapper
    script_text = re.sub(r'if\( typeof.+?{}}WMC_E_DATA = ', '',
                         script_text, flags=re.DOTALL)
    json_data = demjson.decode(script_text)
    result = init_show_booking_loader(
        response=response, item=response.meta["data_proto"])
    # flag '3' marks a booked seat in the first seat map
    booked_seat_count = 0
    for row in json_data['SeatMaps']['FLAG'][0]:
        for curr_num in row:
            if curr_num == '3':
                booked_seat_count += 1
    result.add_value('book_seat_count', booked_seat_count)
    result.add_time_data()
    yield result.load_item()
def parse_normal_showing(self, response):
    """Derive the total seat count from the seat-map page, then request
    the seat json api to learn how many seats are still empty."""
    seat_block = response.xpath('//div[@class="cinema_seets step1"]')
    # total seats = every <li> minus non-seat cells ("none" fillers and
    # row-head labels)
    li_total = len(seat_block.xpath('.//li'))
    filler_count = len(seat_block.xpath('.//li[contains(@class,"none")]'))
    row_head_count = len(
        seat_block.xpath('.//li[contains(@class,"seet_row_head")]'))
    loader = init_show_booking_loader(
        response=response, item=response.meta["data_proto"])
    loader.get_output_value('showing')['total_seat_count'] = (
        li_total - (filler_count + row_head_count))
    # empty seat is generated by json api, so we need another request
    # extract json url from javascript
    script_text = response.xpath(
        '//script[contains(.,"ajax")]/text()').extract_first()
    url_match = re.search(r"url: \"(.+)\"", script_text)
    data_match = re.search(r"data: \"(.+)\"", script_text)
    url = self.generate_seat_json_url(
        tail=url_match.group(1), parameters=data_match.group(1))
    request = scrapy.Request(url, callback=self.parse_showing_seat_json)
    request.meta["data_proto"] = loader.load_item()
    yield request
def parse_showing(self, response, curr_showing, data_proto, result_list):
    """Parse one showing node from the schedule page.

    Appends to ``result_list`` either a showing item (when booking data
    is not crawled), a finished booking item (sold out / not sold), or a
    scrapy Request for the order page to count booked seats.
    """
    def parse_time(time_str):
        # "HH:MM" -> (hour, minute) as ints
        time = time_str.split(":")
        return (int(time[0]), int(time[1]))
    showing_data_proto = ShowingLoader(response=response)
    # seed the loader with the shared prototype item
    showing_data_proto.add_value(None, data_proto.load_item())
    start_time = curr_showing.xpath(
        './span[@class="start-time digit"]/text()').extract_first()
    start_hour, start_minute = parse_time(start_time)
    showing_data_proto.add_value(
        'start_time', self.get_time_from_text(start_hour, start_minute))
    end_time = curr_showing.xpath(
        './span[@class="end-time digit"]/text()').extract_first()
    end_hour, end_minute = parse_time(end_time)
    showing_data_proto.add_value(
        'end_time', self.get_time_from_text(end_hour, end_minute))
    # TODO cinema name extract failed
    # TODO extract name may be different from real name
    cinema_name = curr_showing.xpath(
        './span[@class="movie-info-theater"]/text()').extract_first()
    # if extract cinema name from showing info, use this one
    if cinema_name:
        showing_data_proto.replace_cinema_name(cinema_name)
    screen_name = "unknown"
    url = curr_showing.xpath(
        './span[@class="purchase-block"]/a/@href').extract_first()
    if url:
        # extract screen name by url parameter
        screen_number = re.findall(r'&sc=(\d+)&', url)
        if screen_number:
            screen_number = screen_number[-1]
            screen_name = "シアター" + screen_number
    # CANNOTSOLVE we cannot get screen name from site for
    # sold out and not sold showings so we have to give it a special
    # screen name
    showing_data_proto.add_screen_name(screen_name)
    showing_data_proto.add_value('seat_type', 'NormalSeat')
    # query screen number from database
    showing_data_proto.add_total_seat_count()
    # check whether need to continue crawl booking data or stop now
    if not self.crawl_booking_data:
        result_list.append(showing_data_proto.load_item())
        return
    booking_data_proto = init_show_booking_loader(response=response)
    booking_data_proto.add_value('showing', showing_data_proto.load_item())
    # css class of the purchase link encodes the booking state
    book_status = curr_showing.xpath(
        './span[@class="purchase-block"]/a/@class').extract_first()
    booking_data_proto.add_book_status(book_status, util=ForumUtil)
    book_status = booking_data_proto.get_output_value('book_status')
    if book_status in ['SoldOut', 'NotSold']:
        # sold out or not sold
        total_seat_count = showing_data_proto.get_output_value(
            'total_seat_count')
        book_seat_count = (total_seat_count
                           if book_status == 'SoldOut' else 0)
        booking_data_proto.add_value('book_seat_count', book_seat_count)
        booking_data_proto.add_time_data()
        result_list.append(booking_data_proto.load_item())
        return
    else:
        # normal, need to crawl book number on order page
        url = curr_showing.xpath(
            './span[@class="purchase-block"]/a/@href').extract_first()
        request = scrapy.Request(url, callback=self.parse_normal_showing)
        request.meta["data_proto"] = booking_data_proto.load_item()
        result_list.append(request)
def parse_showing(self, response, curr_showing, data_proto, result_list):
    """Parse one showing node from the schedule page.

    Appends to ``result_list`` either a showing item (when booking data
    is not crawled), a finished booking item (free seat / sold out /
    not sold), or a Request chain that first refreshes the schedule
    page to obtain an independent cookie before visiting the order page.
    """
    def parse_time(time_str):
        # BUG FIX: normalize the *argument*, not the enclosing
        # ``start_time`` variable. The original normalized
        # ``start_time`` on every call, so the end time was parsed
        # from the start-time string.
        time_str = unicodedata.normalize('NFKC', time_str)
        time = time_str.split(":")
        return (int(time[0]), int(time[1]))
    # showing section passed in may be unusable and need to be filtered
    time_section = curr_showing.xpath('./div[@class="time"]')
    if not time_section:
        return
    showing_data_proto = ShowingLoader(response=response)
    # seed the loader with the shared prototype item
    showing_data_proto.add_value(None, data_proto.load_item())
    start_time = time_section.xpath('./span/span/text()').extract_first()
    start_hour, start_minute = parse_time(start_time)
    showing_data_proto.add_value(
        'start_time', self.get_time_from_text(start_hour, start_minute))
    end_time = time_section.xpath('./span/text()').extract_first()
    end_hour, end_minute = parse_time(end_time)
    showing_data_proto.add_value(
        'end_time', self.get_time_from_text(end_hour, end_minute))
    screen_name = curr_showing.xpath('./div[2]/a/text()').extract_first()
    showing_data_proto.add_screen_name(screen_name)
    # when site ordering is stopped stop crawling
    site_status = curr_showing.xpath('./a/span[2]/text()').extract_first()
    if site_status == '予約停止中':
        return
    # handle free order seat type showings
    seat_type = curr_showing.xpath(
        './div[@class="icon"]//img/@alt').extract_first()
    showing_data_proto.add_value('seat_type',
                                 AeonUtil.standardize_seat_type(seat_type))
    # query screen number from database
    showing_data_proto.add_total_seat_count()
    # check whether need to continue crawl booking data or stop now
    if not self.crawl_booking_data:
        result_list.append(showing_data_proto.load_item())
        return
    booking_data_proto = init_show_booking_loader(response=response)
    booking_data_proto.add_value('showing', showing_data_proto.load_item())
    book_status = curr_showing.xpath('./a/span/text()').extract_first()
    booking_data_proto.add_book_status(book_status, util=AeonUtil)
    book_status = booking_data_proto.get_output_value('book_status')
    seat_type = showing_data_proto.get_output_value('seat_type')
    if (seat_type == 'FreeSeat' or book_status in ['SoldOut', 'NotSold']):
        # sold out or not sold
        total_seat_count = showing_data_proto.get_output_value(
            'total_seat_count')
        book_seat_count = (total_seat_count
                           if book_status == 'SoldOut' else 0)
        booking_data_proto.add_value('book_seat_count', book_seat_count)
        booking_data_proto.add_time_data()
        result_list.append(booking_data_proto.load_item())
        return
    else:
        # normal, generate request to showing page
        showing_request = self.generate_agreement_request(
            response=response, curr_showing=curr_showing)
        # go to schedule page again to generate independent cookie
        # for each showing
        schedule_url = response.meta['schedule_url']
        request = scrapy.Request(schedule_url, dont_filter=True,
                                 callback=self.parse_new_cookie)
        request.meta["data_proto"] = booking_data_proto.load_item()
        request.meta["showing_request"] = showing_request
        (performance_id, _, _) = self.extract_showing_parameters(
            curr_showing)
        request.meta["cookiejar"] = performance_id
        result_list.append(request)
def parse_showing(self, response, curr_showing, data_proto, result_list):
    """Parse one showing node from the schedule page.

    Appends to ``result_list`` either a showing item (when booking data
    is not crawled), a finished booking item (free seat / sold out /
    not sold), or a scrapy Request for the order/confirm page to count
    booked seats.
    """
    def parse_time(time_str):
        # "HH:MM" -> (hour, minute) as ints
        time = time_str.split(":")
        return (int(time[0]), int(time[1]))
    showing_data_proto = ShowingLoader(response=response)
    # seed the loader with the shared prototype item
    showing_data_proto.add_value(None, data_proto.load_item())
    start_time = curr_showing.xpath(
        './div/ol/li[@class="startTime"]/text()').extract_first()
    start_hour, start_minute = parse_time(start_time)
    showing_data_proto.add_value('start_time', self.get_time_from_text(
        start_hour, start_minute))
    # first character of the end-time text is a separator, drop it
    end_time = curr_showing.xpath(
        './div/ol/li[@class="endTime"]/text()').extract_first()[1:]
    end_hour, end_minute = parse_time(end_time)
    showing_data_proto.add_value('end_time', self.get_time_from_text(
        end_hour, end_minute))
    # handle free order seat type showings
    seat_type = curr_showing.xpath(
        './div/ul/li[@class="seatIcon"]/img/@src').extract_first()
    showing_data_proto.add_value(
        'seat_type', UnitedUtil.standardize_seat_type(seat_type))
    # query screen number from database
    showing_data_proto.add_total_seat_count()
    # check whether need to continue crawl booking data or stop now
    if not self.crawl_booking_data:
        result_list.append(showing_data_proto.load_item())
        return
    booking_data_proto = init_show_booking_loader(response=response)
    booking_data_proto.add_value('showing', showing_data_proto.load_item())
    # the first icon's image path encodes the booking state
    book_status = curr_showing.xpath(
        './div/ul/li[@class="uolIcon"]//img[1]/@src').extract_first()
    booking_data_proto.add_book_status(book_status, util=UnitedUtil)
    book_status = booking_data_proto.get_output_value('book_status')
    seat_type = showing_data_proto.get_output_value('seat_type')
    if (seat_type == 'FreeSeat' or book_status in ['SoldOut', 'NotSold']):
        # sold out or not sold
        total_seat_count = showing_data_proto.get_output_value(
            'total_seat_count')
        book_seat_count = (
            total_seat_count if book_status == 'SoldOut' else 0)
        booking_data_proto.add_value('book_seat_count', book_seat_count)
        booking_data_proto.add_time_data()
        result_list.append(booking_data_proto.load_item())
        return
    else:
        # normal, need to crawl book number on order page
        # we will visit schedule page again to generate independent cookie
        # as same cookie will lead to confirm page
        url = curr_showing.xpath(
            './div/ul/li[@class="uolIcon"]/a/@href').extract_first()
        # determine if next page is 4dx confirm page by title
        title = showing_data_proto.get_output_value('title')
        if '4DX' in title:
            request = scrapy.Request(
                url, callback=self.parse_4dx_confirm_page)
        else:
            request = scrapy.Request(
                url, callback=self.parse_normal_showing)
        request.meta["data_proto"] = booking_data_proto.load_item()
        # use independent cookie to avoid affecting each other
        request.meta["cookiejar"] = url
        result_list.append(request)