Exemplo n.º 1
0
def get_movies_showtimes(theater, date):
    """Scrape movie names and showtimes for one theater from showtimes.com

    :theater: str
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    url_template = 'https://www.showtimes.com/movie-theaters/{}'

    # known theaters -> url slug, wrapped in callables so that they share a
    # call signature with the fallback scraper
    slug_getters = {
        'regal fenway': lambda *args: 'regal-fenway-stadium-13-rpx-6269',
        'ua court st': lambda *args: 'ua-court-street-stadium-12-rpx-6608'
    }

    try:
        # fallback for unlisted theater: the theater-page scraper is only
        # looked up here and only called when the theater is not hardcoded
        get_slug = slug_getters.get(theater.lower(), get_theaterpg_showtimes)
        page = soup_me(url_template.format(get_slug(theater)))
        movie_boxes = page('li', class_='movie-info-box')

    except (Exception) as err:
        print(error_str.format(err))  # error msg only
        movie_boxes = []  # no matching theater

    # one name per movie box: keep only the first line of each heading
    titles = []
    for box in movie_boxes:
        first_lines = (re.sub('[\r\n].*', '', heading.text.strip())
                       for heading in box('h2', class_='media-heading'))
        titles.append(''.join(first_lines))

    def opens_new_day(label):
        # button text containing a comma starts a new group (a day label)
        return ',' in label

    # per movie: [day, time, time, day, time] -> [[day, time, time], [day, time]]
    grouped_labels = []
    for box in movie_boxes:
        labels = (btn.text for btn in box('button', type='button'))
        grouped_labels.append(list(split_before(labels, opens_new_day)))

    # per movie: "<day> @ <time>" strings, restricted to the requested date
    showings = []
    for day_groups in grouped_labels:
        matching = []
        for day, *times in day_groups:
            day_label = day.replace(':', '')
            if convert_date(day_label) == date:
                matching.append(
                    ['{} @ {}'.format(day_label, t) for t in times])
        showings.append(flatten(matching))

    upcoming = filter_past(showings)
    names, times_per_movie = combine_times(*filter_movies(titles, upcoming))

    return names, times_per_movie
Exemplo n.º 2
0
def get_movies_film_forum(theater, date):
    """Get movie names and times from Film Forum's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times);
              ([], []) if the site answered with its robots-blocking page
    """
    BASE_URL = 'https://filmforum.org/'

    # NOTE: an earlier approach replayed hand-copied browser headers/cookies
    # because the first plain request was blocked by the robots page; it was
    # superseded by fetching with from_headless=True.
    soup = soup_me(BASE_URL, from_headless=True)

    # Explicit check rather than `assert`: asserts are stripped under
    # `python -O`, which would silently skip the blocked-page detection.
    # A page without any <meta> tag is treated as not blocked.
    meta = soup.meta
    if meta is not None and meta.attrs.get('name', '').lower() == 'robots':
        print(error_str.format('robots'))  # error msg only
        return [], []  # blocked from getting movies :(

    # sidebar entries are used to locate the tab index for the wanted date
    days = [
        d.text
        for d in (soup.find('div', class_='sidebar-container').find_all('li'))
    ]
    iday = index_into_days(days, date=date)

    day = soup.find('div', id='tabs-{}'.format(iday))

    movie_names = [
        ''.join(
            (txt for txt in mdiv.contents
             if isinstance(txt, str))).strip()  # ignore txt in extra <span>s
        for mdiv in day('a', href=re.compile('^https://filmforum.org/film'))
    ]

    # N.B. could have modifier like "♪" after time; this pattern only allows
    # for an optional trailing "*" after the final digit
    PATTERN = re.compile(r'([0-9])\*?$')

    movie_datetimes = [
        [
            # a time ending in a digit gets " pm" appended (and loses its
            # trailing "*") - only AM is labeled explicitly
            '{} @ {}'.format(date, PATTERN.sub(r'\1 pm', time.text))
            # NOTE(review): class_=None presumably selects <span>s without a
            # class attribute - confirm against BeautifulSoup's matching rules
            for time in p('span', class_=None)
        ] for p in day('p')
    ]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times
Exemplo n.º 3
0
def get_movies_google(theater, date, *args, **kwargs):
    """Get movie names and times from Google search

    :theater: str
    :date: str (yyyy-mm-dd)
    :args, kwargs: accepted for call compatibility; not used in the search
    :returns: (list of movie names, list of lists of movie times), with one
              entry per (movie, format) pair
    :raises NoMoviesException: if no showtimes block is found or the block
                               is for a different date
    """
    # search by weekday name, or "today" when the weekday matches today's
    fdate = convert_date(date, fmt_out='%A')  # formatted for search
    if fdate == convert_date('today', fmt_out='%A'):
        fdate = 'today'

    BASE_URL = 'https://www.google.com/search'

    PARAMS = {
        'q': safe_encode('showtimes', '"{}"'.format(theater), fdate),
        'ie': 'utf-8',
        'client': 'firefox-b-1-e'
    }

    # the url is composed by hand: passing params directly to requests gives
    # problems with extraneous % encoding
    soup = soup_me(compose_query(BASE_URL, PARAMS))

    # TODO google static html only returns up to 10 movies..

    # NOTE(review): in the `time` regex the anchors bind to each alternative
    # separately ("starts with std-ts" OR "ends with lr_c_stnl") - confirm
    # whether '^(std-ts|lr_c_stnl)$' was intended before tightening it
    CLASS = AttrDict(timelist='lr_c_fcc',
                     time=re.compile('^(std-ts)|(lr_c_stnl)$'),
                     fmt='lr_c_vn')

    try:
        # AttributeError below covers the case where no such div exists
        relevant_div = soup.find('div', {'data-date': True})

        # check date - raised explicitly rather than via `assert`, which is
        # stripped under `python -O` and would let a wrong date through
        date_found = relevant_div.attrs['data-date']
        if convert_date(date_found) != date:
            raise AssertionError('{} != {}'.format(date_found, date))

        movies = relevant_div('div', {'data-movie-name': True})

    except (AssertionError, AttributeError) as e:
        print(error_str.format('No matching theater on google'))
        raise NoMoviesException(e) from e

    movie_names = [m.span.text for m in movies]

    movie_times = [  # nested times per format per movie
        [[time.text for time in timelst('div', class_=CLASS.time)]
         for timelst in m('div', class_=CLASS.timelist)] for m in movies
    ]

    movie_formats = [
        [
            getattr(timelst.find('div', class_=CLASS.fmt), 'text',
                    None)  # default if no format listed
            for timelst in m('div', class_=CLASS.timelist)
        ] for m in movies
    ]

    # flatten timelists for movies with multiple formats: repeat each name
    # once per timelist so names stay aligned with the flattened times
    n_timelists_per_movie = [len(timelsts) for timelsts in movie_times]
    movie_names = list(
        chain.from_iterable(
            [name] * n for name, n in zip(movie_names, n_timelists_per_movie)))

    # annotate with format (skipped for 'Standard', missing format, or no times)
    movie_times = [
        (times if fmt == 'Standard' or not times or not fmt else times +
         ['[ {} ]'.format(fmt)])
        for times, fmt in zip(flatten(movie_times), flatten(movie_formats))
    ]

    # no need to filter - tags only correspond to upcoming movie times
    return movie_names, movie_times