def init(_genre):
    """Initialize module-level state for *_genre*.

    Loads the per-genre settings and ignore list from YAML, prepares the
    Twitter API object, the Google Cloud Vision client, and the four MongoDB
    collections used by the rest of the module.
    """
    global genre
    global setting
    global ignores
    global api
    global vision
    global tweets
    global themes
    global users
    global infos
    genre = _genre
    # load setting file; a shared lock guards against a concurrent writer
    with open('settings.yaml') as f:
        fcntl.flock(f, fcntl.LOCK_SH)
        # safe_load: settings need no arbitrary-object construction, and
        # yaml.load() without an explicit Loader is deprecated and unsafe
        setting = yaml.safe_load(f)[genre]
    with open(setting['ignores']) as f:
        fcntl.flock(f, fcntl.LOCK_SH)
        ignores = yaml.safe_load(f)
    # prepare twitter object
    api = get_api(setting['rt_account'])
    # prepare Google Cloud Vision API
    with open('.google-api-key') as f:
        key = f.read().strip()
    vision = build('vision', 'v1', developerKey=key)
    # open the collections once through a single client handle instead of
    # calling get_mongo_client() four times
    db = get_mongo_client()[genre + '_1draw_collections']
    tweets = db['tweets']
    themes = db['themes']
    users = db['users']
    infos = db['infos']
def insert_docs(docs):
    """Insert *docs* into the configured entries collection.

    Documents whose ``_id`` already exists are skipped.  Returns the list of
    ``_id`` values that were actually inserted.
    """
    ids = []
    c = get_mongo_client()[config['target']].entries
    for doc in docs:
        # find_one is a cheap existence check; Cursor.count() is deprecated
        # in modern pymongo and counted the whole result set needlessly
        if c.find_one({'_id': doc['_id']}, {'_id': 1}) is not None:
            continue
        res = c.insert_one(doc)
        ids.append(res.inserted_id)
    return ids
def index(request):
    """Django view: render the theater/show overview page.

    Builds, for each theater, the show lists of the next ``day_num`` days and
    a few aggregate counts for the template.
    """
    db = get_mongo_client().kinpri_theater_checker
    # newest update timestamp across all theaters
    last_updated = db.theaters.find().sort([('last_updated', -1)
    ]).limit(1)[0]['last_updated']
    # midnight today as a datetime (dates in the DB are datetimes)
    today = datetime.datetime.fromordinal(datetime.date.today().toordinal())
    day_num = 4
    days = [today + datetime.timedelta(days=i) for i in range(day_num)]
    theater_shows = [(theater,
                      [make_shows(db, theater['name'], day) for day in days])
                     for theater in db.theaters.find()]
    support_theater_num = len(db.shows_latest.distinct('theater'))
    # count_documents replaces the deprecated Cursor.count()
    # NOTE(review): '6/10' is parsed relative to the current year — confirm
    # that is the intended cutoff date
    present_total_theater_num = db.theaters.count_documents({
        'start_date': parse('6/10')
    })
    total_theater_num = db.theaters.count_documents({})
    return render(
        request,
        'kinpri_theater_checker/index.html',
        {
            'last_updated': last_updated,
            'days': days,
            'theater_shows': theater_shows,
            'support_theater_num': support_theater_num,
            'total_theater_num': total_theater_num,
            'present_total_theater_num': present_total_theater_num,
        })
class MovixSpider(scrapy.Spider):
    # Scrapes Movix (smt-cinema.com) schedules via Splash-rendered pages.
    name = 'movix'
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        },
        'CONCURRENT_REQUESTS': 2,
    }
    allowed_domains = ['smt-cinema.com']

    # prepare start_urls: theater links are stored in MongoDB
    db = get_mongo_client().kinpri_theater_checker.theaters

    # Splash Lua script: click through each active date tab and collect the
    # rendered HTML for every day.
    # NOTE(review): `splash.execjs(...)` (dot) and `button.mouse_click()` look
    # inconsistent with the `splash:execjs` colon method-call syntax used in
    # main() — confirm this script actually runs as intended.
    script = '''
treat = require("treat")
function get_movie(splash, i)
  local button = splash.execjs(
    "document.querySelectorAll('.scrollDate:not(.nonactive)')[" .. i .. "]")
  button.mouse_click()
  local res = {
    html = splash:html(),
    ok = true,
  }
  return res
end
function main(splash)
  local days = splash:execjs(
    "document.querySelectorAll('.scrollDate:not(.nonactive)').length")
  local movies = treat.as_array({})
  for i = 0, days - 1 do
    movies[i] = get_movie(splash, i)
  end
  return movies
end
'''

    def start_requests(self):
        # Select only theaters whose stored link matches our domains and
        # fetch them through Splash so JavaScript runs.
        theater_regex = re.compile('|'.join(self.allowed_domains))
        start_urls = [t['link'] for t in self.db.find({'link': theater_regex})]
        for url in start_urls:
            yield SplashRequest(
                url=url,
                callback=self.parse,
                args={'wait': 3},
            )

    def parse(self, response):
        # get theater name; on redirect, look up by the originally requested
        # URL (the one stored in the DB)
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')
        date = response.css('#Day_schedule h1::text').extract_first()
        movies = response.css('.scheduleBox')
        for movie in movies:
            title = movie.css('h2 ::text').extract_first()
            # skip the movie is not kinpri
            if not utils.is_title_kinpri(title):
                continue
            shows = movie.css('.scheduleBox>table>tbody>tr>td')
            for s in shows:
                show = Show()
                show['updated'] = datetime.datetime.now()
                show['theater'] = theater
                show['schedule_url'] = response.url
                show['date'] = date
                show['title'] = title
                show['movie_types'] = utils.get_kinpri_types(title)
                # cells without a screen number are padding — stop this row
                screen = s.css('p::text').re(r'\d+')
                if not screen:
                    break
                show['screen'] = screen[0]
                show['start_time'] = s.css('span::text').extract_first()
                show['end_time'] = s.css('tr>td::text').re(r'\d+:\d+')[0]
                show['ticket_state'] = s.css('img::attr(alt)').extract_first()
                # the reservation link is embedded in an onclick handler
                reservation_url = s.css('td[onclick]::attr(onclick)')
                if reservation_url:
                    reservation_url = reservation_url.re(r"'(https://.+?)'")[0]
                    yield scrapy.Request(
                        url=reservation_url,
                        callback=self.parse_reservation,
                        meta={'show': show},
                    )
                else:
                    # no reservation page: emit the show with empty seat info
                    show['remaining_seats_num'] = 0
                    show['total_seats_num'] = None
                    show['reserved_seats'] = None
                    show['remaining_seats'] = []
                    show['reservation_url'] = None
                    yield show

    def parse_reservation(self, response):
        # Count remaining/reserved seats from the seat-map page and attach
        # them to the Show carried in request meta.
        show = response.meta['show']
        seats = response.css('#choice td a.tip')
        remainings = []
        reserveds = []
        for seat in seats:
            # the seat image src encodes its state: seat_no = sold,
            # seat_off = still available
            if seat.css('img[src*="seat_no"]'):
                id = seat.css('::attr(title)').extract_first()
                reserveds.append(id)
            elif seat.css('img[src*="seat_off"]'):
                id = seat.css('::attr(title)').extract_first()
                remainings.append(id)
        show['remaining_seats_num'] = len(remainings)
        show['total_seats_num'] = len(remainings) + len(reserveds)
        show['reserved_seats'] = reserveds
        show['remaining_seats'] = remainings
        yield show
def open_spider(self, spider):
    """Scrapy pipeline hook: open the MongoDB connection when the spider starts."""
    client = get_mongo_client()
    self.client = client
    self.db = client[self.mongo_db]
    # Tail of a helper whose `def` line is above this chunk: atomically
    # increment `key` by `value` on document `id`, creating the document if
    # it does not exist (upsert).
    return db[collection].update_one({'_id': id}, {'$inc': {
        key: value
    }}, upsert=True)

def get_value_db(collection, id, key):
    """Return field *key* of document *id* in *collection*, or None if the
    document does not exist (or lacks the field)."""
    doc = db[collection].find_one({'_id': id})
    if doc:
        return doc.get(key)
    else:
        return None

# prepare db: MongoDB for persistent state, Redis for the kyupikon queues
db = get_mongo_client().nanami_kyupikon
kyupikon_db = redis.Redis()
kyupikons_queue_name = 'twitter_nanami_kyupiko_kyupikons_queue'
kyupikons_reply_queue_name = 'twitter_nanami_kyupiko_kyupikons_reply_queue'

# parse args
parser = argparse.ArgumentParser()
parser.add_argument('--debug', action='store_true',
                    help='enable debug mode to avoid actual tweeting')
parser.add_argument('--reset_counts', action='store_true',
                    help='reset reply counts database')
args = parser.parse_args()

# prepare api object
class CinecittaSpider(scrapy.Spider):
    """Scrape Cinecitta theater schedules for kinpri screenings."""

    name = "cinecitta"
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        },
    }
    allowed_domains = ['cinecitta.co.jp', 'cinecitta.jp']

    # prepare start_urls: theater links are stored in MongoDB
    db = get_mongo_client().kinpri_theater_checker.theaters
    theater_regex = re.compile(r'cinecitta.co.jp')
    start_urls = [t['link'] for t in db.find({'link': theater_regex})]

    def parse(self, response):
        """Resolve the theater name and follow the schedule-calendar iframe."""
        # get theater name; on redirect, look up by the originally requested
        # URL (the one stored in the DB)
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')
        calendar_url = response.urljoin(
            response.css('iframe::attr(src)').extract_first())
        yield scrapy.Request(url=calendar_url,
                             callback=self.parse_calendar,
                             meta={'theater': theater})

    def parse_calendar(self, response):
        """Follow each per-day schedule link, tagging each request with its date."""
        urls_dates = zip(
            response.css('a::attr(href)').extract(),
            response.css('a::text').extract())
        for url, date in urls_dates:
            # BUG FIX: the original mutated response.meta in place and handed
            # the very same dict to every yielded request, so by the time the
            # responses were processed they all carried the *last* date.
            # Give each request its own copy of meta instead.
            meta = dict(response.meta)
            meta['date'] = date
            yield response.request.replace(url=url,
                                           callback=self.parse_schedule_iframe,
                                           meta=meta)

    def parse_schedule_iframe(self, response):
        """Step through the wrapper page into the actual schedule iframe."""
        url = response.urljoin(
            response.css('#ifrParent::attr(src)').extract_first())
        yield response.request.replace(url=url, callback=self.parse_schedule)

    def parse_schedule(self, response):
        """Yield one Show item per kinpri screening found on the page."""
        movies = response.css('table.movietitle')
        for movie in movies:
            title = movie.css('.item1 ::text').extract_first()
            # skip the movie if it is not kinpri
            if not utils.is_title_kinpri(title):
                continue
            screen = movie.css('a.theaterlink::text').re(r'\d+')[0]
            shows = movie.css('.time1, .time2')
            for s in shows:
                show = Show()
                show['updated'] = datetime.datetime.now()
                show['theater'] = response.meta['theater']
                show['schedule_url'] = response.url
                show['date'] = response.meta['date']
                show['title'] = title
                show['screen'] = screen
                show['movie_types'] = utils.get_kinpri_types(title)
                # cells without a time are padding — stop this row
                times = s.css('span::text').re(r'\d+:\d+')
                if not times:
                    break
                show['start_time'] = times[0]
                show['end_time'] = times[1]
                show['ticket_state'] = s.css('img::attr(src)').extract()[-1]
                show['reservation_url'] = s.css('a::attr(href)').extract_first()
                # TODO: check seats
                # if reservation_url:
                #
                #     yield scrapy.Request(url=reservation_url,
                #                          callback=self.parse_reservation,
                #                          meta={'show': show},
                #                          )
                # else:
                # NOTE(review): seat checking is disabled (see the TODO), so
                # these assignments run unconditionally and overwrite the
                # reservation_url extracted above with None — confirm intent
                # before re-enabling the block.
                show['remaining_seats_num'] = 0
                show['total_seats_num'] = None
                show['reserved_seats'] = None
                show['remaining_seats'] = []
                show['reservation_url'] = None
                yield show

    def parse_check_continue(self, response):
        """Handle the 'purchase in progress' interstitial before seat selection."""
        if '購入途中' in response.css('h2::text').extract_first():
            url = response.urljoin(
                response.css('form::attr(action)').extract()[-1])
            yield response.request.replace(url=url,
                                           callback=self.parse_reservation,
                                           method='POST',
                                           body='rm=start')
        yield response.request.replace(callback=self.parse_reservation)

    def parse_reservation(self, response):
        """Count remaining/reserved seats from the seat-map page."""
        show = response.meta['show']
        # td[value="0"] = free seat, td[value="1"] = taken seat
        remainings = [s.css('::attr(id)').extract_first()
                      for s in response.css('#view_seat td[value="0"]')]
        reserveds = [s.css('::attr(id)').extract_first()
                     for s in response.css('#view_seat td[value="1"]')]
        show['remaining_seats_num'] = len(remainings)
        show['total_seats_num'] = len(remainings) + len(reserveds)
        show['reserved_seats'] = reserveds
        show['remaining_seats'] = remainings
        yield show
    # Tail of a settings loader (presumably get_settings(account) — its `def`
    # line is above this chunk): read per-account settings from YAML.
    with open('settings.yaml') as f:
        # NOTE(review): yaml.load without an explicit Loader is deprecated and
        # unsafe on untrusted input — settings.yaml is presumably trusted.
        settings = yaml.load(f).get(account)
        if not settings:
            raise ValueError('There is no account name', account)
        return settings

if __name__ == '__main__':
    print(datetime.datetime.now())
    # ignore lists consulted by the retweet logic
    IGNORE_USERS = get_ignore_users()
    IGNORE_DATES = get_ignore_dates()
    IGNORE_IDS = get_ignore_ids()
    parser = argparse.ArgumentParser()
    parser.add_argument('account')
    parser.add_argument('command', choices=['retweet', 'update_themes'])
    parser.add_argument('--dry-run', action='store_true')
    args = parser.parse_args()
    settings = get_settings(args.account)
    print(settings)
    api = get_api(settings['rt_bot_screen_name'])
    tag = settings['tag']
    # per-account database with tweets and themes collections
    tws = get_mongo_client()[settings['db_name']].tweets
    ths = get_mongo_client()[settings['db_name']].themes
    if args.command == 'retweet':
        retweet()
    elif args.command == 'update_themes':
        update_themes()
    # Remaining `choices` values for the positional `command` argument — the
    # parser.add_argument call starts above this chunk.
    'print_tomorrow',
    'tweet_date',
    'print_date',
    'retweet',
    'follow',
    'run_command_from_tos',
])
parser.add_argument('--date')
parser.add_argument('--delta', type=int, default=1)
parser.add_argument('--screen_names', nargs='+') # for retweet
parser.add_argument('--ids', nargs='+') # for retweet
args = parser.parse_args()

# prepare database
c = get_mongo_client().kinpri_goods_wiki

# get tweepy api; --debug tweets from the test account instead
if args.debug:
    api = get_api('sakuramochi_pre')
else:
    api = get_api('goods_yamada')

# run command
# today
if args.command == 'tweet_today':
    tweet_date_items(get_date())
elif args.command == 'print_today':
    print_date_items(get_date())
        # Continuation of a settings loader — its `def` and `with open(...)`
        # lines are above this chunk.
        # NOTE(review): yaml.load without an explicit Loader is deprecated and
        # unsafe on untrusted input — the settings file is presumably trusted.
        settings = yaml.load(f).get(account)
        if not settings:
            raise ValueError('There is no account name', account)
        return settings

if __name__ == '__main__':
    print(datetime.datetime.now())
    # ignore lists consulted by the retweet logic
    IGNORE_USERS = get_ignore_users()
    IGNORE_DATES = get_ignore_dates()
    IGNORE_IDS = get_ignore_ids()
    parser = argparse.ArgumentParser()
    parser.add_argument('account')
    parser.add_argument('command', choices=['retweet', 'update_themes'])
    parser.add_argument('--dry-run', action='store_true')
    args = parser.parse_args()
    settings = get_settings(args.account)
    print(settings)
    api = get_api(settings['rt_bot_screen_name'])
    tag = settings['tag']
    # per-account database with tweets and themes collections
    tws = get_mongo_client()[settings['db_name']].tweets
    ths = get_mongo_client()[settings['db_name']].themes
    if args.command == 'retweet':
        retweet()
    elif args.command == 'update_themes':
        update_themes()
# Positional `command` argument: every sub-command this script supports.
parser.add_argument('command', type=str, choices=[
    'tweet_today',
    'print_today',
    'tweet_tomorrow',
    'print_tomorrow',
    'tweet_date',
    'print_date',
    'retweet',
    'follow',
    'run_command_from_tos',
])
parser.add_argument('--date')
parser.add_argument('--delta', type=int, default=1)
parser.add_argument('--screen_names', nargs='+') # for retweet
parser.add_argument('--ids', nargs='+') # for retweet
args = parser.parse_args()

# prepare database
c = get_mongo_client().kinpri_goods_wiki

# get tweepy api; --debug tweets from the test account instead
if args.debug:
    api = get_api('sakuramochi_pre')
else:
    api = get_api('goods_yamada')

# run command
# today
if args.command == 'tweet_today':
    tweet_date_items(get_date())
elif args.command == 'print_today':
    print_date_items(get_date())
class TtcgSpider(scrapy.Spider):
    # Scrapes ttcg.jp theater schedules for kinpri screenings.
    name = "ttcg"
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        }
    }
    allowed_domains = ['ttcg.jp']

    # prepare start_urls: theater links are stored in MongoDB
    db = get_mongo_client().kinpri_theater_checker.theaters
    start_urls = [t['link'] for t in db.find({'link': re.compile(r'ttcg.jp')})]

    def parse(self, response):
        # get theater name; on redirect, look up by the originally requested
        # URL (the one stored in the DB)
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')
        url = response.css('#navschedule a::attr(href)').extract_first()
        yield scrapy.Request(url=url, callback=self.parse_schedule,
                             meta={'theater': theater})

    def parse_schedule(self, response):
        # Yield one Show per kinpri screening; then follow the next-day link.
        date = response.css('.today::text').extract_first()
        movies = response.css('.timeschedule')
        for movie in movies:
            title = movie.css(
                '.mtitle span.fontm::text').extract_first().strip()
            # skip not kinpri
            if not utils.is_title_kinpri(title):
                continue
            shows = movie.css('td')
            for s in shows:
                show = Show()
                show['updated'] = datetime.datetime.now()
                show['title'] = title
                show['movie_types'] = utils.get_kinpri_types(title)
                show['date'] = date
                show['theater'] = response.meta['theater']
                show['schedule_url'] = response.url
                show['start_time'] = s.css('.start ::text').extract_first()
                # cells without a start time are padding — stop this row
                if not show['start_time']:
                    break
                show['end_time'] = s.css('.end ::text').extract_first()
                show['screen'] = None
                state = s.css('.icon_kuuseki ::text').extract_first()
                show['ticket_state'] = state
                # seat checking is not implemented yet: reservation_url stays
                # None, so the branch below always falls through to the else
                reservation_url = None
                if reservation_url:
                    show['reservation_url'] = reservation_url
                    yield scrapy.Request(url=reservation_url,
                                         callback=self.parse_reservation,
                                         meta={'show': show})
                else:
                    yield show
        # follow the "next day" link until it points back at the current page
        next_day_url = response.css('.schehead .b-next a::attr(href)') \
            .extract_first()
        if not next_day_url == response.url:
            print(next_day_url)
            yield response.request.replace(url=next_day_url)

    # TODO:
    def parse_reservation(self, response):
        # Count remaining/reserved seats from the seat-map page.
        # NOTE(review): unreachable until reservation_url is populated above.
        show = response.meta['show']
        remaining = [
            s.css('::attr(title)').extract_first()
            for s in response.css('li.seatSell.seatOn')
        ]
        reserved = [
            s.css('::attr(title)').extract_first()
            for s in response.css('li.seatSell.seatOff')
        ]
        show['remaining_seats_num'] = len(remaining)
        show['total_seats_num'] = len(remaining) + len(reserved)
        show['reserved_seats'] = reserved
        show['remaining_seats'] = remaining
        yield show
class TohoSpider(scrapy.Spider):
    """Scrape TOHO theater (tohotheater.jp) schedules for kinpri screenings."""

    name = 'toho'
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        },
        'CONCURRENT_REQUESTS': 2,
    }
    allowed_domains = ['tohotheater.jp']

    # prepare start_urls: theater links are stored in MongoDB
    db = get_mongo_client().kinpri_theater_checker.theaters

    def start_requests(self):
        """Fetch every matching theater page through Splash so JS runs."""
        theater_regex = re.compile('|'.join(self.allowed_domains))
        start_urls = [t['link'] for t in self.db.find({'link': theater_regex})]
        for url in start_urls:
            yield SplashRequest(
                url=url,
                callback=self.parse,
                args={'wait': 3},
            )

    def parse(self, response):
        """Yield one Show item per kinpri screening on the schedule page."""
        # get theater name; on redirect, look up by the originally requested
        # URL (the one stored in the DB)
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')
        date = response.css('.schedule-body-day::text').extract_first()
        movies = response.css('.schedule-body-section-item')
        for movie in movies:
            title = movie.css('.schedule-body-title::text').extract_first()
            # skip the movie if it is not kinpri
            if not utils.is_title_kinpri(title):
                continue
            screens = movie.css('.schedule-screen')
            for s in screens:
                screen = s.css('.schedule-screen-title::text').extract_first()
                # renamed inner loop variable (was also `s`, shadowing the
                # screen selector) for clarity
                shows = s.css('.schedule-item')
                for item in shows:
                    show = Show()
                    show['updated'] = datetime.datetime.now()
                    show['theater'] = theater
                    show['schedule_url'] = response.url
                    show['date'] = date
                    show['title'] = title
                    show['movie_types'] = utils.get_kinpri_types(title)
                    show['screen'] = screen
                    show['start_time'] = item.css(
                        '.time .start::text').extract_first()
                    show['end_time'] = item.css(
                        '.time .end::text').extract_first()
                    show['ticket_state'] = item.css(
                        '.status::attr(class)').extract_first()
                    # seat checking is disabled (see the TODO below); the
                    # selector result is kept for when it gets re-enabled
                    reservation_url = item.css('a')
                    # if reservation_url:
                    #     reservation_url =
                    #     yield scrapy.Request(url=reservation_url,
                    #                          callback=self.parse_reservation,
                    #                          meta={'show': show},
                    #                          )
                    # else:
                    show['remaining_seats_num'] = 0
                    show['total_seats_num'] = None
                    show['reserved_seats'] = None
                    show['remaining_seats'] = []
                    show['reservation_url'] = None
                    yield show

    def parse_reservation(self, response):
        """Count remaining/reserved seats from the seat-map page."""
        show = response.meta['show']
        seats = response.css('#choice td a.tip')
        remainings = []
        reserveds = []
        for seat in seats:
            # BUG FIX: the original read `eat.css(...)` — an undefined name
            # that would raise NameError at runtime; it clearly meant the
            # loop variable `seat`.  Also renamed `id` (shadowed builtin).
            if seat.css('img[src*="seat_no"]'):
                seat_id = seat.css('::attr(title)').extract_first()
                reserveds.append(seat_id)
            elif seat.css('img[src*="seat_off"]'):
                seat_id = seat.css('::attr(title)').extract_first()
                remainings.append(seat_id)
        show['remaining_seats_num'] = len(remainings)
        show['total_seats_num'] = len(remainings) + len(reserveds)
        show['reserved_seats'] = reserveds
        show['remaining_seats'] = remainings
        yield show
class KinezoSpider(scrapy.Spider):
    # Scrapes Kinezo (kinezo.jp / T-Joy) theater schedules, using a mobile
    # User-Agent because the mobile site exposes the schedule markup used here.
    name = "kinezo"
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        },
        'USER_AGENT': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.24 Mobile Safari/537.36 kinpri_theater_checker (+https://skrm.ch/prettyrhythm/kinpri-theater-checker/)',
    }
    allowed_domains = ["kinezo.jp"]

    # prepare start_urls: theater links are stored in MongoDB
    db = get_mongo_client().kinpri_theater_checker.theaters
    kinezo_regex = re.compile(r'kinezo.jp|tjoy.net')
    start_urls = [t['link'] for t in db.find({'link': kinezo_regex})]

    def parse(self, response):
        # get theater name; on redirect, look up by the originally requested
        # URL (the one stored in the DB)
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')
        # locate the event-list page link from the header menu
        event_url = list(
            filter(lambda x: '/event_list' in x,
                   response.css('#headerMenuData a::attr(href)').extract()))[0]
        self.logger.info('event_url: ' + event_url)
        # parse normal list
        movies = response.css('a[name="movieItem"]')
        for movie in movies:
            title = ' '.join(movie.css('span::text').extract())
            if not utils.is_title_kinpri(title):
                continue
            url = movie.css('::attr(href)').extract_first()
            self.logger.info('title: ' + title)
            self.logger.info('url: ' + url)
            yield scrapy.Request(url=url, callback=self.parse_schedule,
                                 meta={'theater': theater})

    def parse_schedule(self, response):
        # Day headers and their schedule lists appear in parallel sequences;
        # zip pairs each day with its list.
        schedule_days = response.css('#schedule p[id^="day_"]')
        schedule_list = response.css('.schedule_list ul')
        # self.logger.info('schedule_days: ' + schedule_days.extract_first())
        for day, ul in zip(schedule_days, schedule_list):
            date = day.css('span::text').extract_first()
            for li in ul.css('li'):
                show = Show()
                # skip no schedule day
                state = li.css('::attr(class)').extract_first()
                if not state or state == 'noSchedule':
                    continue
                show['updated'] = datetime.datetime.now()
                title = ' '.join(response.css('.text span::text').extract())
                show['title'] = title
                show['movie_types'] = utils.get_kinpri_types(title)
                show['date'] = date
                # the li's CSS class doubles as the ticket state indicator
                show['ticket_state'] = state
                show['theater'] = response.meta['theater']
                # the li text holds exactly screen name and time range
                show['screen'], time = li.css('::text').extract()
                show['start_time'], show['end_time'] = time.split(' - ')
                show['schedule_url'] = response.url
                reservation_url = li.css('a::attr(href)').extract_first()
                if state == 'sec05': # soldout
                    show['remaining_seats_num'] = 0
                    show['total_seats_num'] = None
                    show['reserved_seats'] = None
                    show['remaining_seats'] = []
                    show['reservation_url'] = None
                    yield show
                else:
                    show['reservation_url'] = reservation_url
                    yield scrapy.Request(url=reservation_url,
                                         callback=self.parse_reservation,
                                         meta={'show': show})

    def parse_reservation(self, response):
        # Count remaining/reserved seats from the seat-map page: seatOn is
        # still purchasable, seatOff is taken.
        show = response.meta['show']
        remaining = [
            s.css('::attr(title)').extract_first()
            for s in response.css('li.seatSell.seatOn')
        ]
        reserved = [
            s.css('::attr(title)').extract_first()
            for s in response.css('li.seatSell.seatOff')
        ]
        show['remaining_seats_num'] = len(remaining)
        show['total_seats_num'] = len(remaining) + len(reserved)
        show['reserved_seats'] = reserved
        show['remaining_seats'] = remaining
        yield show
    # Remaining `choices` values for the positional `command` argument — the
    # parser.add_argument call starts above this chunk.
    'inform',
    'retweet',
    'fetch_tos',
    'check_replies',
    'convert_birthday_to_csv',
    'update_birthday_spreadsheet',
    'add_ignore_users',
    'remove_ignore_users',
])
parser.add_argument('--users', '-u', nargs='+')
parser.add_argument('--target_works', nargs='+')
parser.add_argument('--ids', nargs='+')
args = parser.parse_args()

api = get_api(args.account)
# per-account databases keyed by the authenticated username
tws = get_mongo_client()[api.auth.username].tweets
replies = get_mongo_client()[api.auth.username].replies

if args.command == 'inform':
    inform()
elif args.command == 'retweet':
    # retweet specific ids when given, otherwise the default set
    if args.ids:
        retweet(args.ids)
    else:
        retweet()
elif args.command == 'fetch_tos':
    fetch_tos()
elif args.command == 'check_replies':
    check_replies()
elif args.command == 'convert_birthday_to_csv':
    convert_birthday_to_csv()
class AeoncinemaSpider(scrapy.Spider):
    # Scrapes AEON Cinema (aeoncinema.com) schedules for kinpri screenings.
    name = "aeoncinema"
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        }
    }
    allowed_domains = ["aeoncinema.com"]

    # prepare start_urls: theater links are stored in MongoDB
    db = get_mongo_client().kinpri_theater_checker.theaters
    aeoncinema_regex = re.compile(r'aeoncinema.com')
    start_urls = [t['link'] for t in db.find({'link': aeoncinema_regex})]

    def parse(self, response):
        # get theater name; on redirect, look up by the originally requested
        # URL (the one stored in the DB)
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')
        url = response.css('li.schedule a::attr(href)').extract_first()
        yield scrapy.Request(url=url, callback=self.parse_schedule,
                             meta={'theater': theater})

    def parse_schedule(self, response):
        # Yield one Show per kinpri screening on the schedule page.
        date = response.css('.today::text').extract_first()
        movies = response.css('.movielist')
        for movie in movies:
            title = movie.css('.main a::text').extract_first().strip()
            # skip not kinpri
            if not utils.is_title_kinpri(title):
                continue
            # first tbl element is the header row, so skip it
            shows = movie.css('.timetbl [class^="tbl"]')[1:]
            for s in shows:
                show = Show()
                show['updated'] = datetime.datetime.now()
                show['title'] = title
                show['movie_types'] = utils.get_kinpri_types(title)
                show['date'] = date
                show['theater'] = response.meta['theater']
                show['schedule_url'] = response.url
                # normalize times like "10時30分" into "10:30"
                start_time, end_time = s.css('.time ::text').extract()
                show['start_time'] = ':'.join(re.findall(r'(\d{1,2})', start_time))
                show['end_time'] = ':'.join(re.findall(r'(\d{1,2})', end_time))
                show['screen'] = s.css('.screen ::text').extract_first()
                state = s.css('.icon_kuuseki ::text').extract_first()
                show['ticket_state'] = state
                # TODO: run javascript via splash
                # seat checking needs JS, so reservation_url stays None and
                # the branch below always falls through to the else
                reservation_url = None
                if reservation_url:
                    show['reservation_url'] = reservation_url
                    yield scrapy.Request(url=reservation_url,
                                         callback=self.parse_reservation,
                                         meta={'show': show})
                else:
                    yield show

    # TODO:
    def parse_reservation(self, response):
        # Count remaining/reserved seats from the seat-map page.
        # NOTE(review): unreachable until reservation_url is populated above.
        show = response.meta['show']
        remaining = [s.css('::attr(title)').extract_first()
                     for s in response.css('li.seatSell.seatOn')]
        reserved = [s.css('::attr(title)').extract_first()
                    for s in response.css('li.seatSell.seatOff')]
        show['remaining_seats_num'] = len(remaining)
        show['total_seats_num'] = len(remaining) + len(reserved)
        show['reserved_seats'] = reserved
        show['remaining_seats'] = remaining
        yield show
def tweet_new_docs():
    """Tweet every entry not yet marked as tweeted, flagging successes.

    Iterates untweeted entries in _id order; each successfully tweeted
    document gets ``meta.tweeted`` set so it is not tweeted again.
    """
    c = get_mongo_client()[config['target']].entries
    docs = c.find({'meta.tweeted': False}).sort('_id')
    for doc in docs:
        success_id = tweet_doc(doc)
        # Only mark the entry when tweet_doc reported success.  The original
        # ran the update unconditionally, so a falsy return produced an
        # update matching nothing — a silent no-op hiding the failure.
        if success_id:
            c.update_one({'_id': success_id},
                         {'$set': {'meta.tweeted': True}})
class KinezoSpider(scrapy.Spider):
    # Scrapes United Cinemas (unitedcinemas.jp) schedules for kinpri shows.
    # NOTE(review): the class is named KinezoSpider but `name` is
    # "unitedcinemas" — looks like a copy-paste from the kinezo spider left
    # unrenamed; if both classes live in one module the second definition
    # silently replaces the first.  Confirm and rename.
    name = "unitedcinemas"
    custom_settings = {
        'ITEM_PIPELINES': {
            'kinpri_theater_checker.pipelines.ShowPipeline': 300,
        },
    }
    allowed_domains = ["unitedcinemas.jp"]

    # prepare start_urls: theater links are stored in MongoDB
    db = get_mongo_client().kinpri_theater_checker.theaters
    theater_regex = re.compile(r'unitedcinemas.jp')
    start_urls = [t['link'] for t in db.find({'link': theater_regex})]

    def parse(self, response):
        # TODO: create start_requests() and get theater & url in it
        # get theater name; on redirect, look up by the originally requested
        # URL (the one stored in the DB)
        if 'redirect_urls' in response.request.meta:
            request_url = response.request.meta['redirect_urls'][0]
        else:
            request_url = response.url
        theater = self.db.find_one({'link': request_url}).get('name')
        # follow every day in the date carousel
        urls = response.css('#carouselCalendar li a::attr(href)').extract()
        for url in urls:
            next_url = response.urljoin(url)
            yield scrapy.Request(url=next_url, callback=self.parse_schedule,
                                 meta={'theater': theater})

    def parse_schedule(self, response):
        # the per-day URL itself identifies the date
        date = response.url
        movies = response.css('#dailyList>li')
        for movie in movies:
            title = movie.css('.movieTitle a::text').extract_first()
            # skip the movie is not kinpri
            if not utils.is_title_kinpri(title):
                continue
            shows_rows = movie.css('.tl>li')
            for shows_row in shows_rows:
                screen = shows_row.css('.screenNumber img::attr(alt)').re(
                    r'(\d+)')[0]
                shows = shows_row.css('div')
                for s in shows:
                    show = Show()
                    show['updated'] = datetime.datetime.now()
                    show['theater'] = response.meta['theater']
                    show['schedule_url'] = response.url
                    show['date'] = date
                    show['title'] = title
                    show['screen'] = screen
                    show['movie_types'] = utils.get_kinpri_types(title)
                    show['start_time'] = s.css(
                        '.startTime::text').extract_first()
                    show['end_time'] = s.css('.endTime::text').extract_first()
                    # the ticket state letter is embedded in the icon alt text
                    # as "[X]"
                    state = s.css('.uolIcon .scheduleIcon::attr(alt)').re(
                        r'\[(.)\]')
                    if state:
                        show['ticket_state'] = state[0]
                    else:
                        show['ticket_state'] = None
                    reservation_url = movie.css(
                        '.uolIcon a::attr(href)').extract_first()
                    if reservation_url:
                        show['reservation_url'] = reservation_url
                        # dont_filter: the same reservation URL may serve
                        # several shows
                        yield scrapy.Request(
                            url=reservation_url,
                            callback=self.parse_check_continue,
                            meta={'show': show},
                            dont_filter=True,
                        )
                    else:
                        show['remaining_seats_num'] = 0
                        show['total_seats_num'] = None
                        show['reserved_seats'] = None
                        show['remaining_seats'] = []
                        show['reservation_url'] = None
                        yield show

    def parse_check_continue(self, response):
        # Handle the 'purchase in progress' (購入途中) interstitial before the
        # seat-map page.
        if '購入途中' in response.css('h2::text').extract_first():
            url = response.urljoin(
                response.css('form::attr(action)').extract()[-1])
            yield response.request.replace(url=url,
                                           callback=self.parse_reservation,
                                           method='POST',
                                           body='rm=start')
        yield response.request.replace(callback=self.parse_reservation)

    def parse_reservation(self, response):
        # Count remaining/reserved seats: td[value="0"] is free,
        # td[value="1"] is taken.
        show = response.meta['show']
        remainings = [
            s.css('::attr(id)').extract_first()
            for s in response.css('#view_seat td[value="0"]')
        ]
        reserveds = [
            s.css('::attr(id)').extract_first()
            for s in response.css('#view_seat td[value="1"]')
        ]
        show['remaining_seats_num'] = len(remainings)
        show['total_seats_num'] = len(remainings) + len(reserveds)
        show['reserved_seats'] = reserveds
        show['remaining_seats'] = remainings
        yield show
# Export kinpri goods items as wiki-table rows (one "| date | link | |" line
# per item) for pasting into the goods wiki.
import datetime
from dateutil.parser import parse
from urllib.parse import unquote
from get_mongo_client import get_mongo_client

cli = get_mongo_client()
c = cli.kinpri_goods_wiki.items
for i in c.find({'date': {'$gte': parse('2015-1-1')}}).sort('date'):
    # date_extra marks an approximate part of the month (e.g. 上/中/下 for
    # early/mid/late — TODO confirm against the data)
    if i['date_extra']:
        date = i['date'].strftime('%m月{}旬'.format(i['date_extra']))
    else:
        date = i['date'].strftime('%m月%d日')
    # strip the series title prefixes so only the item name remains
    name = i['name'].replace('KING OF PRISM by PrettyRhythm',
                             '').replace('KING OF PRISM', '').strip()
    # wiki page names are euc-jp percent-encoded in the stored URL
    page = unquote(i['url'].split('/')[-1], encoding='euc-jp')
    print('| {} | [[{}>{}]] | |'.format(date, name, page))