def get_movies_showtimes(theater, date):
    """Scrape movie names and showtimes for one theater from showtimes.com

    :theater: str (known nickname, else resolved by `get_theaterpg_showtimes`)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    url_template = 'https://www.showtimes.com/movie-theaters/{}'

    # Values are callables so that the fallback theater-page lookup only runs
    # when the theater is not one of the hard-coded nicknames.
    slug_getters = {
        'regal fenway': lambda *args: 'regal-fenway-stadium-13-rpx-6269',
        'ua court st': lambda *args: 'ua-court-street-stadium-12-rpx-6608',
    }

    def starts_new_day(text):
        # button labels containing a comma are day headers, not times
        return ',' in text

    try:
        get_slug = slug_getters.get(theater.lower(), get_theaterpg_showtimes)
        page = soup_me(url_template.format(get_slug(theater)))
        movies = page('li', class_='movie-info-box')
    except Exception as err:
        # best effort: report the problem and carry on with no movies
        print(error_str.format(err))
        movies = []

    # Title = first line of each heading's stripped text, headings concatenated.
    movie_names = []
    for movie in movies:
        first_lines = [
            re.sub('[\r\n].*', '', heading.text.strip())
            for heading in movie('h2', class_='media-heading')
        ]
        movie_names.append(''.join(first_lines))

    # Per movie: [day, time, time, day, time] -> [[day, time, time], [day, time]]
    buttons_by_day = []
    for movie in movies:
        labels = (button.text for button in movie('button', type='button'))
        buttons_by_day.append(list(split_before(labels, starts_new_day)))

    # Keep only the groups whose day header matches the requested date.
    movie_datetimes = []
    for day_groups in buttons_by_day:
        matching = []
        for day, *times in day_groups:
            day_label = day.replace(':', '')
            if convert_date(day_label) == date:
                matching.append(
                    ['{} @ {}'.format(day_label, t) for t in times])
        movie_datetimes.append(flatten(matching))

    upcoming = filter_past(movie_datetimes)
    kept_names, kept_times = filter_movies(movie_names, upcoming)
    movie_names, movie_times = combine_times(kept_names, kept_times)

    return movie_names, movie_times
def get_movies_film_forum(theater, date):
    """Get movie names and times from Film Forum's website

    :theater: str (not referenced by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times);
        ([], []) when the site answers with its robots-block page
    """
    BASE_URL = 'https://filmforum.org/'

    # Fetched through the headless path: per the original author's notes, a
    # plain first request to the site is blocked by its robots check.
    soup = soup_me(BASE_URL, from_headless=True)

    # Explicit check rather than `assert` (assertions are stripped when Python
    # runs with -O); a page without any meta tag is treated as not blocked.
    meta = soup.meta
    if meta is not None and meta.attrs.get('name', '').lower() == 'robots':
        print(error_str.format('robots'))  # error msg only
        return [], []  # blocked from getting movies :(

    # Sidebar list items label the available days; pick the tab for `date`.
    sidebar = soup.find('div', class_='sidebar-container')
    days = [item.text for item in sidebar.find_all('li')]
    iday = index_into_days(days, date=date)

    day = soup.find('div', id='tabs-{}'.format(iday))

    movie_names = [
        # keep only the link's direct text nodes, ignoring nested span tags
        ''.join(txt for txt in link.contents if isinstance(txt, str)).strip()
        for link in day('a', href=re.compile('^https://filmforum.org/film'))
    ]

    # N.B. could have modifier like "♪" after time
    # Raw string: '\*' is an invalid escape sequence in a normal literal.
    PATTERN = re.compile(r'([0-9])\*?$')

    movie_datetimes = [
        [
            # only AM is labeled explicitly, so bare times get ' pm' appended
            '{} @ {}'.format(date, PATTERN.sub(r'\1 pm', span.text))
            for span in p('span', class_=None)
        ]
        for p in day('p')
    ]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times
def get_movies_google(theater, date, *args, **kwargs):
    """Get movie names and times from Google search

    :theater: str
    :date: str (yyyy-mm-dd)
    :args, kwargs: other search terms, e.g. zip code
        (accepted, but not referenced in the current body)
    :returns: (list of movie names, list of lists of movie times), with one
        entry per (movie, format) timelist; a listed format other than
        'Standard' is appended to its times as '[ <format> ]'
    :raises: NoMoviesException if no dated results block is found or its
        date does not match `date`
    """
    # Search by weekday name, or 'today' when the weekday is today's.
    fdate = convert_date(date, fmt_out='%A')  # formatted for search
    if fdate == convert_date('today', fmt_out='%A'):
        fdate = 'today'

    BASE_URL = 'https://www.google.com/search'

    PARAMS = {
        'q': safe_encode('showtimes', '"{}"'.format(theater), fdate),
        'ie': 'utf-8',
        'client': 'firefox-b-1-e'
    }

    # The query string is composed by hand because passing params directly to
    # requests gives problems with extraneous % encoding.
    soup = soup_me(compose_query(BASE_URL, PARAMS))
    # TODO google static html only returns up to 10 movies..

    # NOTE(review): in the `time` pattern `|` binds looser than the anchors,
    # so it reads as "starts with std-ts" OR "ends with lr_c_stnl" rather than
    # an exact match of either class name -- confirm this is intended.
    CLASS = AttrDict(timelist='lr_c_fcc',
                     time=re.compile('^(std-ts)|(lr_c_stnl)$'),
                     fmt='lr_c_vn')

    try:
        relevant_div = soup.find('div', {'data-date': True})

        # check date -- raised explicitly instead of via `assert`, which is
        # stripped under `python -O` and would let a wrong-date page through
        date_found = relevant_div.attrs['data-date']
        if convert_date(date_found) != date:
            raise AssertionError('{} != {}'.format(date_found, date))

        movies = relevant_div('div', {'data-movie-name': True})

    except (AssertionError, AttributeError) as e:
        print(error_str.format('No matching theater on google'))
        raise NoMoviesException(e) from e

    movie_names = [m.span.text for m in movies]

    # One timelist per format per movie; looked up once and reused below.
    timelists_per_movie = [m('div', class_=CLASS.timelist) for m in movies]

    movie_times = [  # nested times per format per movie
        [[t.text for t in timelst('div', class_=CLASS.time)]
         for timelst in timelsts]
        for timelsts in timelists_per_movie
    ]

    movie_formats = [
        [
            # default to None if no format listed
            getattr(timelst.find('div', class_=CLASS.fmt), 'text', None)
            for timelst in timelsts
        ]
        for timelsts in timelists_per_movie
    ]

    # flatten timelists for movies with multiple formats:
    # repeat each name once per timelist so names stay aligned with times
    n_timelists_per_movie = [len(timelsts) for timelsts in movie_times]
    movie_names = list(
        chain.from_iterable(
            [name] * n for name, n in zip(movie_names, n_timelists_per_movie)))

    # annotate with format
    movie_times = [
        (times if fmt == 'Standard' or not times or not fmt
         else times + ['[ {} ]'.format(fmt)])
        for times, fmt in zip(flatten(movie_times), flatten(movie_formats))
    ]

    # no need to filter - tags only correspond to upcoming movie times
    return movie_names, movie_times