示例#1
0
    def scrape_and_write_medias(self):
        """Scrape Showtime movies, then shows, and write both to the database.

        ShoScraper works with the lighter requests/bs4 stack rather than
        selenium/chromedriver, so no driver is started or stopped here.
        """
        provider = 'showtime'

        # movies: gather the movie pages, then the movies themselves
        logging.info('SHOWTIME MOVIE SEARCH')
        self.get_movie_pages()
        self.get_movies()
        found_movies = self.movies
        self.lookup_and_write_medias(medias=found_movies, mtype='movie')

        # shows
        logging.info('SHOWTIME SHOW SEARCH')
        self.get_shows()
        found_shows = self.shows
        self.lookup_and_write_medias(medias=found_shows, mtype='show')

        # drop sources that were not just updated, i.e. media this provider
        # no longer has
        flaskapp.remove_old_sources(provider)
示例#2
0
    def scrape_and_write_medias(self):
        """Gets movies and shows and writes to database.

        Starts the driver with a 1920x6000 window, scrapes '/movies' (adding
        years to the movies) and '/series', and writes each batch through
        lookup_and_write_medias.

        The driver is stopped in a ``finally`` block so that an error during
        scraping does not leave it running.  The error still propagates, and
        old sources are not removed in that case.
        """
        self.start_driver(window_size='--window-size=1920,6000')
        try:
            logging.info('HBO MOVIE SEARCH')
            movies = self.get_medias_from_page('/movies', mtype='movie')
            movies = self.add_years_to_movies(movies)
            self.lookup_and_write_medias(medias=movies, mtype='movie')

            logging.info('HBO SHOW SEARCH')
            shows = self.get_medias_from_page('/series', mtype='show')
            self.lookup_and_write_medias(medias=shows, mtype='show')
        finally:
            # always release the driver, even when a scrape step raised
            self.stop_driver()

        # remove any sources not just updated: media this provider no longer has
        flaskapp.remove_old_sources('hbo')
示例#3
0
    def scrape_and_write_medias(self):
        """Gets movies and shows and writes to database.

        Starts the driver and logs in.  On a successful login the movie genre
        pages and then the show genre pages are scraped and each batch is
        written through lookup_and_write_medias; otherwise an error is logged.

        The driver is stopped in a ``finally`` block so that an error during
        login or scraping does not leave it running.  The error still
        propagates, and the cleanup calls below are skipped in that case.
        """
        self.start_driver()
        try:
            if self.login():
                logging.info('HULU MOVIE SEARCH')
                movie_genre_pages = self.get_genre_pages(mtype='movie')
                movies = self.get_medias(movie_genre_pages)
                self.lookup_and_write_medias(medias=movies, mtype='movie')

                logging.info('HULU SHOW SEARCH')
                show_genre_pages = self.get_genre_pages(mtype='show')
                shows = self.get_medias(show_genre_pages)
                self.lookup_and_write_medias(medias=shows, mtype='show')
            else:
                logging.error('HULU login failed')
        finally:
            # always release the driver, even when a scrape step raised
            self.stop_driver()

        # NOTE(review): these also run after a failed login, when nothing was
        # updated -- confirm remove_old_sources cannot purge valid sources then
        flaskapp.remove_old_sources('hulu')  # remove sources not just updated
        flaskapp.remove_hulu_addon_media()  # remove overlap of sho and hulu
示例#4
0
    def scrape_and_write_medias(self):
        """Gets movies and shows and writes to database.

        Starts the driver and logs in.  On a successful login the shows are
        scraped and written first, then the movies are scraped, the driver is
        restarted, years are added to the movies and they are written.  On a
        failed login an error is logged instead.

        The driver is stopped in a ``finally`` block so that an error during
        login or scraping does not leave it running.  The error still
        propagates, and old sources are not removed in that case.
        """
        self.start_driver()
        try:
            if self.login():
                logging.info('NETFLIX SHOW SEARCH')
                shows = self.get_medias(mtype='show')
                self.lookup_and_write_medias(medias=shows, mtype='show')

                logging.info('NETFLIX MOVIE SEARCH')
                movies = self.get_medias(mtype='movie')
                # restart driver, effectively logging out, for rate limiting
                self.stop_driver()
                self.start_driver()
                # add years to movies
                movies = self.add_years_to_movies(movies)
                self.lookup_and_write_medias(medias=movies, mtype='movie')
            else:
                logging.error('NETFLIX login failed')
        finally:
            # always release the driver, even when a scrape step raised
            # NOTE(review): if the restart above fails between stop and start
            # this stops an already stopped driver -- confirm that is harmless
            self.stop_driver()

        # remove any sources not just updated: media this provider no longer has
        flaskapp.remove_old_sources('netflix')
def search_hulu():
    """Scrape Hulu movies and shows and write them to the database.

    Opens hulu.com in PhantomJS, logs in with ``creds['hulu_u']`` /
    ``creds['hulu_p']``, collects a title/link/year dict for every thumbnail
    row on each movie genre page and then each tv genre page, passes each
    batch to ``lookup_and_write_medias`` and finally calls
    ``flaskapp.remove_old_sources('hulu')``.

    The browser is quit in a ``finally`` block so that an error part-way
    through does not leave the PhantomJS process running.  The error still
    propagates, and old sources are not removed in that case.
    """
    # source dict to be added to media sources[] in db for found titles
    source = {
        'name': 'hulu',
        'display_name': 'Hulu',
        'link': 'http://www.hulu.com'
    }
    # genre pages that are never scraped
    skipped_pages = (
        'https://www.hulu.com/videogames',
        'https://www.hulu.com/latino',  # says movie genre but shows not movies
    )
    # matches text ending in a parenthesised 4-digit year, e.g. "... (1999)"
    year_suffix = re.compile(r'\([0-9][0-9][0-9][0-9]\)$')

    driver = webdriver.PhantomJS(service_log_path='log/phantomjs.log')

    def get_medias_from_genre_pages(genre_pages):
        """Return {'title', 'link', 'year'} dicts found on the genre pages."""
        medias = []
        for page in genre_pages:
            if page in skipped_pages:
                continue
            # get page and pointer to top panel, holding about 6 medias
            driver.get(page)
            logging.info('did get on page: {}'.format(page))
            time.sleep(8)
            top_panel = driver.find_element_by_class_name('tray')
            next_btn = top_panel.find_element_by_class_name('next')
            next_counter = 0

            # get visible media, click next, repeat until no next button
            while True:
                for t in top_panel.find_elements_by_class_name('row'):
                    try:  # get movie year, show first air year not displayed
                        img = t.find_element_by_tag_name('img')
                        alt = img.get_attribute('alt')
                        year = alt[-5:-1] if year_suffix.search(alt) else ''
                    except Exception:
                        # best effort: any failure (no img, missing alt, ...)
                        # just means the year is unknown
                        year = ''
                    try:
                        title = t.find_element_by_class_name('title')
                        title = title.get_attribute('innerHTML')
                        link = t.find_element_by_class_name('beacon-click')
                        link = link.get_attribute('href')
                        medias.append({
                            'title': title,
                            'link': link,
                            'year': year
                        })
                    except NoSuchElementException:
                        logging.warning('no title in row html, blank grid')
                    except StaleElementReferenceException:
                        logging.error('missed a title, may need to wait more')
                if not next_btn.is_displayed():
                    break  # exit loop if next button is not displayed
                next_btn.click()
                next_counter += 1
                if next_counter % 10 == 0:
                    logging.info('clicked next {} times'.format(next_counter))
                if next_counter >= 120:
                    logging.error('next button never went away, may have ' +
                                  'not gotten all media on: {}'.format(page))
                    break  # exit loop, pages should never be this long
                # randomised pause of 1.9s to just under 2.3s between clicks
                time.sleep(float(random.randrange(1900, 2300, 1)) / 1000)
            logging.info('len(medias) so far: {}'.format(len(medias)))
        return medias

    try:
        # go to hulu splash page
        driver.implicitly_wait(10)  # seconds
        driver.set_window_size(1920, 1080)
        driver.get('https://www.hulu.com')
        time.sleep(1.2)

        # click on log in link
        links = driver.find_elements_by_tag_name('a')
        links[2].click()
        time.sleep(1.2)
        logging.info('hulu, clicked on log in link')

        # enter credentials and click login button
        popup = driver.find_element_by_id('login-popup-section')
        form = popup.find_element_by_tag_name('form')
        inputs = form.find_elements_by_tag_name('input')
        email_input = inputs[0]
        pw_input = form.find_elements_by_tag_name('input')[1]
        email_input.send_keys(creds['hulu_u'])
        pw_input.send_keys(creds['hulu_p'])
        logging.info('hulu, pasted u/p')
        form.find_element_by_tag_name('button').click()
        time.sleep(1.2)
        try:  # sometimes first click does not work
            form.find_element_by_tag_name('button').click()
        except Exception:
            # best effort: presumably the form is gone because the first
            # click already went through
            logging.debug('hulu, second login click not possible')
        time.sleep(1.2)

        # switch out of iframe and click profile link
        driver.find_element_by_id('62038018').click()
        time.sleep(1.2)
        logging.info('hulu, clicked profile')
        driver.save_screenshot('static/screenshot2.png')

        # MOVIE SEARCH SECTION
        logging.info('HULU MOVIE SEARCH')
        driver.get('https://www.hulu.com/movies/genres')
        time.sleep(1.5)
        all_genre = driver.find_element_by_id('all_movies_genres')
        anchors = all_genre.find_elements_by_class_name('beacon-click')
        genre_pages = [a.get_attribute('href') for a in anchors]
        logging.info('hulu, got movie genres')
        medias = get_medias_from_genre_pages(genre_pages)
        lookup_and_write_medias(medias, mtype='movie', source=source)

        # SHOW SEARCH SECTION
        logging.info('HULU SHOW SEARCH')
        driver.get('https://www.hulu.com/tv/genres')
        time.sleep(1.5)
        all_genre = driver.find_element_by_id('all_tv_genres')
        anchors = all_genre.find_elements_by_class_name('beacon-click')
        genre_pages = [a.get_attribute('href') for a in anchors]
        logging.info('hulu, got tv genres')
        medias = get_medias_from_genre_pages(genre_pages)
        lookup_and_write_medias(medias, mtype='show', source=source)
    finally:
        # always shut the browser down, even when a scrape step raised
        driver.quit()

    # remove any sources not just updated: media this provider no longer has
    flaskapp.remove_old_sources('hulu')
def search_hbo():
    """Scrape HBO movies, series and documentaries and write them to the db.

    For each of '/movies', '/series' and '/documentaries' the page is loaded
    in PhantomJS and every matching media box becomes a title/link dict.
    For pages of mtype 'movie', each media whose link is not found by
    ``flaskapp.db_lookup_via_link`` is opened to read its year.  Every batch
    goes to ``lookup_and_write_medias`` and at the end
    ``flaskapp.remove_old_sources('hbo')`` is called.

    The browser is quit in a ``finally`` block so that an error part-way
    through does not leave the PhantomJS process running.  The error still
    propagates, and old sources are not removed in that case.
    """
    base_url = 'https://play.hbogo.com'
    source = {'name': 'hbo', 'display_name': 'HBO', 'link': base_url}
    pages = [{
        'url': '/movies',
        'mtype': 'movie'
    }, {
        'url': '/series',
        'mtype': 'show'
    }, {
        'url': '/documentaries',
        'mtype': 'movie'
    }]
    # a text line that starts with 4 digits (taken as the year) and ends
    # with "min"
    year_line = re.compile(r'^\d{4}.+min$')

    driver = webdriver.PhantomJS(service_log_path='log/phantomjs.log')
    try:
        driver.implicitly_wait(10)  # seconds
        driver.set_window_size(1920, 15000)

        for page in pages:
            logging.info('HBO SEARCH OF ' + page['url'])
            driver.get(base_url + page['url'])
            time.sleep(5)
            driver.execute_script("window.scrollTo(0, 10000);")
            time.sleep(15)

            # get all boxes with media image and text
            boxes = driver.find_elements_by_xpath(
                "//a[@class='default class2 class4']")
            logging.info(u'num of media boxes found: {}'.format(len(boxes)))

            # create list of titles and links, replacing newline
            medias = [{
                'title': b.text.replace('\n', ' '),
                'link': b.get_attribute('href')
            } for b in boxes]

            # remove non-media, TODO make not catch false positives
            medias = [m for m in medias if not m['title'].isupper()]

            # get year if not already in database
            logging.info('getting year for all media not in database')
            if page['mtype'] == 'movie':
                for m in medias:
                    if flaskapp.db_lookup_via_link(m['link']):
                        continue  # link already known, no lookup needed
                    driver.get(m['link'])
                    # randomised pause of 5s to just under 10s per lookup
                    time.sleep(
                        float(random.randrange(5000, 10000, 1)) / 1000)
                    body_text = driver.find_element_by_tag_name("body").text
                    first_match = next(
                        (line for line in body_text.split('\n')
                         if year_line.search(line)), None)
                    if first_match is not None:
                        m['year'] = first_match[:4]
                    logging.info('year lookup: {}: {}'.format(
                        m['title'], m.get('year', '')))

            lookup_and_write_medias(
                medias, mtype=page['mtype'], source=source)
    finally:
        # always shut the browser down, even when a scrape step raised
        driver.quit()

    # remove any sources not just updated: media this provider no longer has
    flaskapp.remove_old_sources('hbo')
def search_showtime():
    """Scrape Showtime movies and shows and write them to the database.

    Movies: the genre links on '/movies' (excluding links containing
    'adult') plus their pagination links are fetched to build a catalog of
    title/link dicts.  Each catalog link is then fetched; the item is kept
    only if its page contains the text 'STREAM THIS MOVIE', and gets a
    'year' when the last ``<dd>`` on the page is a 4-digit string.

    Shows: every promo link in the 'All Showtime Series' section of
    '/series' becomes a title/link dict.

    Both batches go to ``lookup_and_write_medias`` and at the end
    ``flaskapp.remove_old_sources('showtime')`` is called.
    """
    # source dict to be added to media sources[] in db for found titles
    base_url = 'http://www.sho.com'
    source = {'name': 'showtime', 'display_name': 'Showtime', 'link': base_url}
    year_only = re.compile(r'^\d{4}$')

    # MOVIE SEARCH SECTION
    logging.info('SHOWTIME MOVIE SEARCH')
    r = requests.get(base_url + '/movies')
    soup = BeautifulSoup(r.text, 'html.parser')

    # get all movie genre pages
    full_mov_lib = soup.find('section', {'data-context': 'slider:genres'})
    genre_links = [
        a['href']
        for a in full_mov_lib.find_all('a', {'class': 'promo__link'})
    ]
    genre_links = [href for href in genre_links if 'adult' not in href]

    # for all root genre pages, get extra pagination links to scrape
    all_extra_pages = []
    for link in genre_links:
        r = requests.get(base_url + link)
        soup = BeautifulSoup(r.text, 'html.parser')
        pagination = soup.find('ul', 'pagination__list')
        if pagination:
            all_extra_pages.extend(
                a['href'] for a in pagination.find_all('a'))
    genre_links.extend(all_extra_pages)

    # for all root and extra genre pages, get movie titles
    catalog = []
    for link in genre_links:
        r = requests.get(base_url + link)
        logging.info('did get on page: {}'.format(link))
        soup = BeautifulSoup(r.text, 'html.parser')

        for a in soup.find_all('a', {'class': 'movies-gallery__item'}):
            # keep what follows the first ':' of the label (the whole label
            # when it has no ':')
            title = a['data-label'].split(':', 1)[-1]
            catalog.append({'title': title, 'link': base_url + a['href']})
    logging.info('will now check avail on {} catalog items'.format(
        len(catalog)))

    # check availability via link, build medias list
    medias = []
    for i, c in enumerate(catalog):
        time.sleep(0.100)
        r = requests.get(c['link'])
        soup = BeautifulSoup(r.text, 'html.parser')

        # a page without any <dd> must not abort the whole run with an
        # IndexError; such an item simply gets no year
        dd_tags = soup.find_all('dd')
        year = dd_tags[-1].text if dd_tags else ''
        if year and year_only.search(year):
            c['year'] = year
        if soup.find(text='STREAM THIS MOVIE'):
            medias.append(c)
        if i % 100 == 0:
            logging.info(u'checked availability on {} items'.format(i))

    lookup_and_write_medias(medias, mtype='movie', source=source)

    # SHOW SEARCH SECTION
    logging.info('SHOWTIME SHOW SEARCH')
    r = requests.get(base_url + '/series')
    soup = BeautifulSoup(r.text, 'html.parser')
    all_series = soup.find('section',
                           {'data-context': 'promo group:All Showtime Series'})

    # get all show titles
    medias = [{
        'title': a.text.strip(),
        'link': base_url + a['href']
    } for a in all_series.find_all('a', {'class': 'promo__link'})]

    lookup_and_write_medias(medias, mtype='show', source=source)

    # remove any sources not just updated: media this provider no longer has
    flaskapp.remove_old_sources('showtime')
def search_netflix():
    """Scrape Netflix movies and shows and write them to the database.

    Logs in through PhantomJS with ``creds['nf_u']`` / ``creds['nf_p']``,
    scrolls through a fixed list of movie genre pages and then a fixed list
    of show genre pages, and builds a title/link dict for every
    'ptrack-content' div found.  Movies additionally go through
    ``get_netflix_year``.  Each batch goes to ``lookup_and_write_medias``
    and at the end ``flaskapp.remove_old_sources('netflix')`` is called.

    The browser is quit in a ``finally`` block so that an error part-way
    through does not leave the PhantomJS process running.  The error still
    propagates, and old sources are not removed in that case.
    """
    # source dict to be added to media sources[] in db for found titles
    base_url = 'http://www.netflix.com'
    source = {'name': 'netflix', 'display_name': 'Netflix', 'link': base_url}

    driver = webdriver.PhantomJS(service_log_path='log/phantomjs.log')

    def get_medias_from_genre_pages(genre_pages):
        """Return {'title', 'link'} dicts for all tiles on the genre pages."""
        medias = []
        for page in genre_pages:
            # get page and scroll to bottom many times
            time.sleep(1.5)
            driver.get(page + '?so=su')
            logging.info('did get on page: {}'.format(page))
            for _ in range(40):
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                # randomised pause of 0.9s to just under 1.4s per scroll
                time.sleep(float(random.randrange(900, 1400, 1)) / 1000)

            # put source into beautifulsoup and get titles
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for d in soup('div', 'ptrack-content'):
                title = d.find('div', 'video-preload-title-label').text
                # the id is whatever follows the first ':' of the first
                # tracking-context field that mentions 'video_id'
                fields = d['data-ui-tracking-context'].split(',')
                vid_field = [f for f in fields if 'video_id' in f][0]
                netflix_id = vid_field[vid_field.find(':') + 1:]
                medias.append({
                    'title': title,
                    'link': base_url + '/title/' + netflix_id
                })
            logging.info('len(medias) so far: {}'.format(len(medias)))
        return medias

    try:
        # log in to provider
        driver.implicitly_wait(10)  # seconds
        driver.set_window_size(1920, 1080)
        driver.get('https://www.netflix.com/login')
        inputs = driver.find_elements_by_tag_name('input')
        inputs[0].send_keys(creds['nf_u'])
        inputs[1].send_keys(creds['nf_p'])
        driver.find_element_by_tag_name('button').click()
        logging.info('netflix, logged in')

        # MOVIE SEARCH SECTION
        logging.info('NETFLIX MOVIE SEARCH')
        genre_pages = [
            'https://www.netflix.com/browse/genre/5977',  # gay
            'https://www.netflix.com/browse/genre/1365',  # action
            'https://www.netflix.com/browse/genre/5763',  # drama
            'https://www.netflix.com/browse/genre/7077',  # indie
            'https://www.netflix.com/browse/genre/8711',  # horror
            'https://www.netflix.com/browse/genre/6548',  # comedy
            'https://www.netflix.com/browse/genre/31574',  # classics
            'https://www.netflix.com/browse/genre/7424',  # anime
            'https://www.netflix.com/browse/genre/783',  # kid
            'https://www.netflix.com/browse/genre/7627',  # cult
            'https://www.netflix.com/browse/genre/6839',  # docs ~1321
            'https://www.netflix.com/browse/genre/78367',  # internat'l
            'https://www.netflix.com/browse/genre/8883',  # romance
            'https://www.netflix.com/browse/genre/1492',  # scifi
            'https://www.netflix.com/browse/genre/8933'  # thrillers
        ]
        medias = get_medias_from_genre_pages(genre_pages)
        medias = get_netflix_year(medias)
        lookup_and_write_medias(medias, mtype='movie', source=source)

        # SHOW SEARCH SECTION
        logging.info('NETFLIX SHOW SEARCH')
        genre_pages = [
            'https://www.netflix.com/browse/genre/83',  # tv popular
            'https://www.netflix.com/browse/genre/10673',  # action
            'https://www.netflix.com/browse/genre/10375',  # com
            'https://www.netflix.com/browse/genre/11714',  # drama
            'https://www.netflix.com/browse/genre/83059',  # horror
            'https://www.netflix.com/browse/genre/4366',  # mystery
            'https://www.netflix.com/browse/genre/52780',  # sci
            'https://www.netflix.com/browse/genre/4814',  # miniseries
            'https://www.netflix.com/browse/genre/46553'  # classic
        ]
        medias = get_medias_from_genre_pages(genre_pages)
        lookup_and_write_medias(medias, mtype='show', source=source)
    finally:
        # always shut the browser down, even when a scrape step raised
        driver.quit()

    # remove any sources not just updated: media this provider no longer has
    flaskapp.remove_old_sources('netflix')