def get_seasons(self, tvshow_page_url):
    """Scrape a tv-show page and return its seasons.

    Args:
        tvshow_page_url: URL of the tv-show page to scrape.

    Returns:
        list: one Season per <li> entry found in the
        "buttons-bar seasons" container of the embedded iframe page.
    """
    seasons = []
    page = scraper_lib.get_page_soup(url=tvshow_page_url)
    iframe = self.__get_iframe_page(page)
    seasons_container = scraper_lib.Container(
        iframe, tag="div", first=True,
        container_class="buttons-bar seasons").get_container()
    season_soup = scraper_lib.get_soup_prettified(seasons_container)
    seasons_block = scraper_lib.Container(season_soup,
                                          tag="li").get_container()
    for season in seasons_block:
        # Extract the season number once and reuse it for the title
        # (the original fetched and stripped the text twice per item).
        season_no = scraper_lib.get_text(season).strip()
        seasons.append(
            Season(title="Stagione {0}".format(season_no),
                   season_no=season_no,
                   ref_url=scraper_lib.get_hrefs(season)[0]))
    return seasons
def get_all_seasons(self):
    """Re-scrape and return every season found in the seasons wrapper.

    Clears the cached `seasons_lst` first so repeated calls do not
    accumulate duplicates. Supports both the modern (accordion-item
    divs, titles in <ul>) and the legacy (<p> tags, titles in <span>)
    page layouts.

    Returns:
        list: the refreshed `self.seasons_lst` of Season objects,
        numbered by position.
    """
    # `del lst[:]` clears in place on both Python 2 and 3, so the
    # original sys.version_info branch was redundant.
    del self.seasons_lst[:]
    if self.is_modern_state:
        season_container = scraper_lib.Container(
            block=self.seasons_wrapper, tag='div',
            container_class='accordion-item')
        title = scraper_lib.Element(block='', el_tag='ul', get_text=True)
    else:
        # Old version: seasons are plain <p> tags, titles in <span>.
        season_container = scraper_lib.Container(
            block=self.seasons_wrapper, tag='p', text=True,
            recursive=False)
        title = scraper_lib.Element(block='', el_tag='span',
                                    get_text=True)
    for i, season_block in enumerate(season_container.get_container()):
        title.block = season_block
        self.seasons_lst.append(
            Season(title=title.get_element(), season_no=i))
    return self.seasons_lst
def get_episodes(self, episodes_url):
    """Scrape the episodes bar of *episodes_url* into Episode objects.

    Each <li> in the "buttons-bar episodes" container yields one
    Episode whose number is the stripped item text and whose url is
    the item's first href.
    """
    soup = scraper_lib.get_page_soup(url=episodes_url)
    bar = scraper_lib.Container(
        soup, tag="div", first=True,
        container_class="buttons-bar episodes").get_container()
    pretty = scraper_lib.get_soup_prettified(bar)
    items = scraper_lib.Container(pretty, tag="li").get_container()
    episodes = []
    for item in items:
        number = scraper_lib.get_text(item).strip()
        episodes.append(
            Episode(title="Episodio {0}".format(number),
                    episode_no=number,
                    urls=scraper_lib.get_hrefs(item)[0]))
    return episodes
def get_all_players(self):
    """Return the name of every player/host listed on the current url.

    Also caches the fetched page in `self.hdpass_page` for later reuse.
    """
    self.hdpass_page = scraper_lib.get_page_soup(self.current_url)
    bar = scraper_lib.Container(self.hdpass_page, 'div',
                                container_class='hosts-bar',
                                first=True).get_container()
    items = scraper_lib.Container(bar, 'li').get_container()
    names = []
    for item in items:
        names.append(scraper_lib.get_text(item))
    return names
def get_episodes_by_season_number(self, season_no):
    """Return all episodes of the season identified by *season_no*.

    *season_no* indexes into the freshly scraped season list;
    out-of-range values fall back to the last season.

    Returns:
        list or None: episodes of the matching season block, or None
        when no seasons exist or no block matches the season title.
    """
    self.get_all_seasons()
    if not self.seasons_lst:
        # The original fell through here with season_obj unbound and
        # raised NameError inside the loop; bail out explicitly.
        return None
    try:
        season_obj = self.seasons_lst[season_no]
    except IndexError:
        season_obj = self.seasons_lst[-1]
    if self.is_modern_state:
        # New version config: accordion items with a <ul> title and a
        # "content" div holding the episodes.
        seasons = scraper_lib.Container(
            self.seasons_wrapper, tag='div',
            container_class='accordion-item').get_container()
        s_title = scraper_lib.Element(block='', el_tag='ul',
                                      get_text=True)
        season_content = scraper_lib.Container(
            block='', tag='div', first=True, container_class='content')
    else:
        # Old version config: seasons are <p> tags; the episode list is
        # the next sibling node.
        seasons = scraper_lib.Container(
            self.seasons_wrapper, tag='p',
            recursive=False).get_container()
        s_title = scraper_lib.Element(block='', el_tag='span',
                                      get_text=True)
    for season in seasons:
        try:
            s_title.block = season
            title = s_title.get_element()
        except Exception:
            # A block without a readable title can never match.
            title = "nd"
        if title == season_obj.title:
            if not self.is_modern_state:
                content = scraper_lib.get_next_sibling(season)
            else:
                season_content.block = season
                content = season_content.get_container()
            return self.get_all_episodes(content)
    return None
def get_search_result(self, keyword):
    """Search for *keyword* and return a list of Movie objects.

    Falls back to a Google lookup when the site search fails, and to a
    single placeholder Movie (image "n.d.") when no result cards can
    be parsed from the page.
    """
    movies_list = []
    raw_keyword = keyword
    keyword = raw_keyword.replace(" ", "+")
    # Keep the searched url in result_url: the placeholder Movie below
    # references it even when the first search succeeds (the original
    # only bound result_url in the failure branch, raising NameError).
    result_url = self.search_url.format(self.domain, keyword)
    search_result = scraper_lib.get_page_soup(url=result_url,
                                              check_result=True)
    if search_result == -1:
        # Site search failed: resolve the movie page through Google.
        result_url = self.get_movie_url_from_google(keyword)
        search_result = scraper_lib.get_page_soup(url=result_url)
    movies = scraper_lib.Container(
        block=search_result, tag='div',
        container_class='col-lg-3 col-md-4 col-xs-4 mb-30'
    ).get_container()
    for movie in movies:
        movies_list.append(self.__get_post_info(movie))
    if not movies_list:
        # No cards parsed: return one placeholder entry so callers
        # still get a usable result.
        try:
            title = scraper_lib.Element(block=search_result,
                                        el_tag='title',
                                        get_text=True).get_element()
        except Exception:
            title = raw_keyword
        movie = Movie(title=title, page_url=result_url)
        movie.image_url = "n.d."
        movies_list.append(movie)
    return movies_list
def get_fpt_posts(self, keyword, media_type):
    """Search filmpertutti for *keyword* and return the parsed posts.

    Returns None when anything in the fetch/parse pipeline fails
    (deliberate best-effort behaviour).
    """
    try:
        query = keyword.replace(" ", "+")
        soup = scraper_lib.get_page_soup(
            self.filmpertutti_url.format(self.domain, query))
        wrapper = scraper_lib.Container(
            block=soup, tag='ul', first=True,
            container_class="posts").get_container()
        items = scraper_lib.Container(block=wrapper,
                                      tag='li').get_container()
        return [self.get_post_info(item, media_type) for item in items]
    except:
        # Any failure is reported as "no results".
        return None
def get_seasons_wrapper(self):
    """Return the html tag that contains all the seasons.

    Tries the modern layout first (class 'seasons-wraper' — the typo
    is in the site's markup); on a miss, flips `is_modern_state` off
    and falls back to the last 'info'/'pad' block of the legacy layout.
    """
    modern = scraper_lib.Container(
        block=self.soup, tag='div', first=True,
        container_class='seasons-wraper').get_container()
    if modern is not None:
        return modern
    self.is_modern_state = False
    legacy = scraper_lib.Container(
        block=self.soup, tag='div', container_id='info',
        container_class='pad').get_container()
    return legacy[-1]
def get_filmpertutti_domain(self):
    """Resolve the current filmpertutti domain.

    Queries the domain-check page for the first link containing
    "filmpertutti."; any failure (timeout, parse error, no link)
    falls back to `self.default_domain`.
    """
    try:
        soup = scraper_lib.get_page_soup(self.url_check_domain,
                                         timeout=5)
        content = scraper_lib.Container(
            block=soup, tag='div', first=True,
            container_class="content").get_container()
        links = scraper_lib.get_hrefs(content, ["filmpertutti."])
        return links[0]
    except:
        return self.default_domain
def get_seasons(self, page_url):
    """Return one numbered placeholder Season per spoiler-title block.

    Only the count of "su-spoiler-title" divs matters; the blocks'
    content is not inspected.
    """
    soup = scraper_lib.get_page_soup(page_url)
    blocks = scraper_lib.Container(
        soup, tag="div",
        container_class="su-spoiler-title").get_container()
    return [Season(title="stagione: {0}".format(idx), season_no=idx)
            for idx, _ in enumerate(blocks, start=1)]
def get_search_result(self, keyword):
    """Search the tv-show archive for *keyword*.

    Returns a list of parsed results; entries that fail to parse are
    silently skipped.
    """
    query = keyword.replace(" ", "+")
    soup = scraper_lib.get_page_soup(
        url=self.search_url.format(self.domain, query))
    posts = scraper_lib.Container(block=soup, tag='ul',
                                  container_class='recent-posts',
                                  first=True).get_container()
    entries = scraper_lib.Container(block=posts,
                                    tag='li').get_container()
    results = []
    for entry in entries:
        try:
            results.append(self.__get_post_info(entry))
        except:
            # Malformed entries are skipped on purpose.
            pass
    return results
def get_embed_values_by_player(self, player_name):
    """Return the "custom-src" iframe value for *player_name*.

    Fetches (and caches) the hdpass page if needed, locates the host
    entry whose text matches *player_name*, follows its link, and
    extracts the iframe source. Returns None when no host matches.
    """
    if self.hdpass_page is None:
        self.hdpass_page = scraper_lib.get_page_soup(self.current_url)
    bar = scraper_lib.Container(self.hdpass_page, 'div',
                                container_class='hosts-bar',
                                first=True).get_container()
    items = scraper_lib.Container(bar, 'li').get_container()
    target = None
    for item in items:
        if scraper_lib.get_text(item) == player_name:
            anchor = scraper_lib.get_tag(item, 'a')
            # Undo the HTML-entity escaping before following the link.
            target = anchor["href"].replace("amp;", "")
            break
    if target is None:
        return None
    page = scraper_lib.get_page_soup(url=target)
    return scraper_lib.Element(page, 'iframe',
                               el_property="custom-src").get_element()
def get_episodes(self, page_url, season_no=1):
    """Return Episode objects for one season of *page_url*.

    Args:
        page_url: page holding "su-spoiler-content" season blocks.
        season_no: 1-based season index selecting which block to parse.
    """
    soup = scraper_lib.get_page_soup(page_url)
    season_block = scraper_lib.Container(
        soup, tag="div",
        container_class="su-spoiler-content"
    ).get_container()[season_no - 1]
    ep_blocks = scraper_lib.Container(
        season_block, tag="div",
        container_class="su-link-ep").get_container()
    result = []
    for number, block in enumerate(ep_blocks, start=1):
        title = scraper_lib.get_text(block).replace('\n', '').strip()
        link = scraper_lib.Element(block, el_tag="a",
                                   el_property="href").get_element()
        result.append(Episode(title=title, urls=[link],
                              episode_no=number))
    return result
def get_search_result(self, keyword):
    """Search the archive for *keyword* and parse every result card."""
    soup = scraper_lib.get_page_soup(
        url=self.search_url.format(self.domain, keyword))
    cards = scraper_lib.Container(
        block=soup, tag='div',
        container_class='col-xl-3 col-lg-3 col-md-3 col-sm-6 col-6'
    ).get_container()
    return [self.__get_post_info(card) for card in cards]
def get_seasons(self, page):
    """Return a Season (with its html block) per season selector button.

    The number of "button-sel-serie" anchors determines the season
    count; each season's block is looked up by its numbered
    "row-stagione-N" class.
    """
    buttons = scraper_lib.Container(
        page, tag="a",
        container_class="button-sel-serie").get_container()
    result = []
    for idx, _ in enumerate(buttons, start=1):
        block = scraper_lib.Element(
            page, el_tag="div",
            el_class="row-stagione-{0}".format(idx)).get_element()
        result.append(Season(title="stagione: {0}".format(idx),
                             season_no=idx,
                             html_block=block))
    return result
def get_episodes(self, page):
    """Parse the episode link boxes of *page* into Episode objects.

    Each "box-link-serie" anchor yields one Episode: its title comes
    from the inner <div> text, its urls from get_playable_urls.
    """
    soup = scraper_lib.get_soup(page)
    boxes = scraper_lib.Container(
        soup, tag="a",
        container_class="box-link-serie").get_container()
    result = []
    for number, box in enumerate(boxes, start=1):
        name = scraper_lib.Element(box, el_tag="div",
                                   get_text=True).get_element()
        result.append(Episode(title=name,
                              urls=self.get_playable_urls(box),
                              episode_no=number))
    return result
def get_search_result(self, keyword):
    """Search via a cloudflare-aware session and return the seasons
    of the first matching show.

    Caches the session in `self.cf_session` so follow-up requests
    reuse it.
    """
    query = keyword.replace(" ", "+")
    self.cf_session = scraper_lib.get_cf_session()
    soup = scraper_lib.get_page_soup(
        url=self.search_url.format(self.domain, query),
        scraper=self.cf_session)
    first_hit = scraper_lib.Container(
        block=soup, tag='div', first=True,
        container_class='col-xs-6 col-sm-2-5').get_container()
    info = self.__get_post_info(first_hit)
    # Brief pause between requests before fetching the show page.
    time.sleep(1)
    show_page = scraper_lib.get_page_soup(info["url"],
                                          scraper=self.cf_session)
    return self.get_seasons(show_page)
def get_all_episodes(self, episode_wrapper):
    """Return episode info for everything inside *episode_wrapper*.

    Handles both layouts: modern pages wrap each episode in an
    "episode-wrap" div; legacy pages separate episodes with <br/>.
    Episodes are numbered by position starting at 0.
    """
    if self.is_modern_state:
        wrappers = scraper_lib.Container(
            block=episode_wrapper, tag='div',
            container_class='episode-wrap').get_container()
        return [self.get_episode_info(ep, idx)
                for idx, ep in enumerate(wrappers)]
    # Old version: split the raw html on <br/> separators.
    chunks = "{0}".format(episode_wrapper).split("<br/>")
    return [self.get_episode_info(chunk.strip(), idx)
            for idx, chunk in enumerate(chunks)]
def __get_post_info(self, block):
    """Build a TvShow from one search-result block.

    Title comes from the <h2> anchor, the page url from the block's
    first anchor href; the thumbnail falls back to "n.d." when the
    image cannot be extracted.
    """
    heading = scraper_lib.Container(block=block, tag="h2",
                                    first=True).get_container()
    title = scraper_lib.Element(block=heading, el_tag="a",
                                get_text=True).get_element()
    url = scraper_lib.Element(block=block, el_tag="a",
                              el_property="href").get_element()
    try:
        image = scraper_lib.Element(block=block, el_tag="img",
                                    el_class="Thumbnail",
                                    el_property="src").get_element()
    except:
        image = "n.d."
    show = TvShow(title=title, page_url=url)
    show.image_url = image
    return show