def __get_yt_json(
    self,
    url: str,
    path: Optional[Union[List[Union[str, int]], Union[str, int]]] = None
) -> Optional[dict]:
    """Fetch *url*, extract the embedded ``ytInitialData`` JSON blob and
    optionally drill down into it.

    Args:
        url: Page URL fetched via ``self._get``.
        path: A single key/index, or a list of keys/indices, walked into
            the parsed JSON (e.g. ``['contents', 0, 'title']``).

    Returns:
        The parsed (and possibly sub-selected) JSON object, or ``None``
        on any failure (network, JSON parse, missing path component).
    """
    try:
        # Pages embed the data as: var ytInitialData = {...};</script>
        # strings.between drops the leading '{', so it is re-added here.
        raw = strings.between(
            self._get(url).text, 'var ytInitialData = {', '</script>'
        ).rstrip().rstrip(';')
        j = json.loads('{' + raw)

        if path:
            # Accept a bare key/index as a one-element path.
            if isinstance(path, (str, int)):
                path = [path]
            for path_component in path:
                j = j[path_component]

        return j
    except Exception as e:
        # Best-effort scraper: swallow everything, optionally log.
        if self.debug:
            print('ERROR - YoutubeScraper - __get_yt_json({})'.format(url), e)
        return None
# ---------------------------------------------------------------------------------------------------------------------------------------- #
def get_channel_video_ids(
    self,
    channel_id: Optional[str] = None,
    ignored_titles: Optional[List[str]] = None
) -> List[str]:
    """Scroll a channel's Videos page to the bottom and collect video ids.

    Args:
        channel_id: Channel to scrape; defaults to ``self.current_user_id``.
        ignored_titles: Titles to skip (compared case-insensitively,
            whitespace-trimmed).

    Returns:
        De-duplicated list of video ids found on the page; best-effort —
        any exception is logged via ``self.print`` and the ids gathered
        so far are returned.
    """
    video_ids = []
    ignored_titles = ignored_titles or []
    channel_id = channel_id or self.current_user_id
    try:
        self.get(self.__channel_videos_url(channel_id))
        last_page_source = self.browser.driver.page_source
        # Keep scrolling until the page source stops growing: after each
        # scroll, poll up to max_i * sleep_time seconds (~10s) for the
        # lazy-loaded content to change the page-source length.
        while True:
            self.browser.scroll(1500)
            i = 0
            max_i = 100
            sleep_time = 0.1
            should_break = True
            while i < max_i:
                i += 1
                time.sleep(sleep_time)
                # Length comparison is a cheap proxy for "new content loaded".
                if len(last_page_source) != len(self.browser.driver.page_source):
                    last_page_source = self.browser.driver.page_source
                    should_break = False
                    break
            if should_break:
                # No new content appeared within the polling window.
                break
        soup = bs(self.browser.driver.page_source, 'lxml')
        # Grid-view video title anchors carry the /watch?v= hrefs.
        elems = soup.find_all('a', {'id':'video-title', 'class':'yt-simple-endpoint style-scope ytd-grid-video-renderer'})
        for elem in elems:
            if 'title' in elem.attrs:
                should_continue = False
                title = elem['title'].strip().lower()
                for ignored_title in ignored_titles:
                    if ignored_title.strip().lower() == title:
                        should_continue = True
                        break
                if should_continue:
                    continue
            if 'href' in elem.attrs and '/watch?v=' in elem['href']:
                # NOTE(review): presumably strings.between returns None when
                # the '&' terminator is absent — verify against the helper.
                vid_id = strings.between(elem['href'], '?v=', '&')
                if vid_id is not None and vid_id not in video_ids:
                    video_ids.append(vid_id)
    except Exception as e:
        self.print(e)
    return video_ids
def get_current_channel_id(self, _click_avatar: bool = False, _get_home_url: bool = False) -> Optional[str]:
    """Return the logged-in user's channel id, or None.

    Escalates through strategies via self-recursion: first scan the
    current page, then retry after clicking the avatar menu
    (``_click_avatar``), then retry after loading the YouTube home page
    (``_get_home_url``).
    """
    if not self.is_logged_in:
        # Bug fix: the message was copy-pasted from 'upload' and named
        # the wrong method.
        print('Error - \'get_current_channel_id\': Isn\'t logged in')
        return None

    if _get_home_url:
        self.get(YT_URL)

    try:
        if _click_avatar:
            avatar_button = self.browser.find_by('button', id_='avatar-btn', timeout=0.5)
            if avatar_button:
                avatar_button.click()

        # The avatar dropdown's compact links contain a channel/<id> href.
        href_containers = self.browser.find_all_by(
            'a',
            class_='yt-simple-endpoint style-scope ytd-compact-link-renderer',
            timeout=0.5
        )
        if href_containers:
            for href_container in href_containers:
                href = href_container.get_attribute('href')
                if href and 'channel/' in href:
                    return strings.between(href, 'channel/', '?')
    except Exception as e:
        self.print(e)

    # Escalate to the next strategy; both flags set means give up.
    if not _click_avatar:
        return self.get_current_channel_id(_click_avatar=True, _get_home_url=_get_home_url)
    elif not _get_home_url:
        return self.get_current_channel_id(_click_avatar=False, _get_home_url=True)

    return None
def get_current_channel_id(self) -> Optional[str]:
    """Load the YouTube home page and read the channel id out of the
    embedded ``ytInitialGuideData`` blob; returns None on any failure."""
    self.browser.get(YT_URL)
    try:
        page = self.browser.driver.page_source
        # Re-attach the closing brace that strings.between strips off.
        raw = strings.between(page, 'var ytInitialGuideData = ', '};') + '}'
        guide_data = json.loads(raw)
        tracking = guide_data['responseContext']['serviceTrackingParams'][2]
        return tracking['params'][0]['value']
    except Exception as e:
        print('get_current_channel_id', e)
        return None
def parse_products_page(self, response):
    """Collect ASIN ids from a product-listing page.

    Scans every ``span.a-declarative`` element's ``data-a-popover``
    attribute for an ``asin=...&`` fragment; elements without the
    attribute or the fragment are silently skipped.
    """
    found_asins = []
    page = BeautifulSoup(response.content, 'lxml')
    for span in page.find_all('span', {'class':'a-declarative'}):
        try:
            candidate = strings.between(span['data-a-popover'], 'asin=', '&')
        except:
            continue
        if candidate is not None:
            found_asins.append(candidate)
    return found_asins
def __init__(self, preview_dict: Dict):
    """Extract the source image (url/width/height) from a Reddit preview
    dict; on any lookup/parse failure all three attributes become None."""
    try:
        source = preview_dict['images'][0]['source']
        image_id = strings.between(source['url'], 'redd.it/', '?')
        # A None image_id raises here (str + None), which also takes the
        # fallback path below.
        self.url = 'https://i.redd.it/' + image_id
        self.width = source['width']
        self.height = source['height']
    except:
        self.url = None
        self.width = None
        self.height = None
# ---------------------------------------------------------------------------------------------------------------------------------------- #
def watch_video(
    self,
    video_id: str,
    percent_to_watch: float = -1,  # 0-100 # -1 means all
    like: bool = False
) -> Tuple[bool, bool]:  # watched, liked
    """Open a video, wait out ads, watch it (fully or partially) and
    optionally like it.

    Args:
        video_id: YouTube video id.
        percent_to_watch: 0-100 portion of the video to watch; -1 means
            watch the whole length.
        like: Also like the video (only when logged in).

    Returns:
        (watched, liked) tuple of booleans; both False if an exception
        occurred before playback started.
    """
    watched = False
    liked = False
    try:
        self.get(YT_WATCH_VIDEO_URL.format(video_id))
        # Video length in seconds is embedded in the page source as
        # 'detailpage\u0026len=<seconds>\' (escaped in the raw HTML).
        length_s = float(
            strings.between(self.browser.driver.page_source,
                            'detailpage\\\\u0026len=', '\\\\'))
        play_button = self.browser.find_by(
            'button', class_='ytp-large-play-button ytp-button', timeout=0.5)
        if play_button and play_button.is_displayed():
            play_button.click()
            time.sleep(1)
        # Poll until the ad overlay disappears before counting watch time.
        while True:
            ad = self.browser.find_by('div', class_='video-ads ytp-ad-module', timeout=0.5)
            if not ad or not ad.is_displayed():
                break
            time.sleep(0.1)
        watched = True
        # -1 (or any negative) means watch the full length.
        seconds_to_watch = percent_to_watch / 100 * length_s if percent_to_watch >= 0 else length_s
        if seconds_to_watch > 0:
            self.print('Goinng to watch', seconds_to_watch)
            time.sleep(seconds_to_watch)
        return watched, self.like(
            video_id) if like and self.is_logged_in else False
    except Exception as e:
        self.print(e)
    return watched, liked
def parse_product(self, response) -> Optional[Dict]:
    """Parse an Amazon product page into a flat dict.

    Reads the core metadata from the embedded ``jQuery.parseJSON('...')``
    blob, then scrapes features, breadcrumb categories, price, the
    product-details table, per-color image URLs, video URLs and
    associated ASINs. Every section fails soft (best-effort scraping);
    returns None only when the embedded JSON blob cannot be decoded.
    """
    categories = []
    features = []
    video_urls = []

    soup = BeautifulSoup(response.content, 'lxml')

    parsed_json = self.__json_loads(strings.between(response.text, 'var obj = jQuery.parseJSON(\'', '\')'))
    if parsed_json is None:
        return None

    title = parsed_json['title']
    asin = parsed_json['mediaAsin']
    images = parsed_json  # same blob also carries colorToAsin/colorImages
    videos = parsed_json['videos']

    # Bullet-point feature list (best effort).
    try:
        for feature in soup.find('div', {'class':'a-section a-spacing-medium a-spacing-top-small'}).find_all('span', {'class':'a-list-item'}):
            try:
                features.append(feature.get_text().strip())
            except:
                pass
    except:
        pass

    # Breadcrumb categories, normalized to lowercase with separators stripped.
    try:
        categories_container = soup.find('div', {'id':'wayfinding-breadcrumbs_container'})
        for category_a in categories_container.find_all('a', {'class':'a-link-normal a-color-tertiary'}):
            try:
                categories.append(BeautifulSoup(category_a.text, "lxml").text.replace('\\', '/').replace('<', ' ').replace('>', ' ').strip().lower())
            except:
                pass
    except:
        pass

    # Price; None when the price block is missing or unparsable.
    try:
        price_text = soup.find('span', {'id':'priceblock_ourprice'}).text.replace('$', '').strip()
        price = float(price_text)
    except:
        price = None

    # Key/value "Product information" table, minus review/rank rows.
    table_for_product_info = soup.find('table', {'id':'productDetails_detailBullets_sections1', 'class':'a-keyvalue prodDetTable'})
    product_information_dict = {}
    if table_for_product_info is not None:
        for tr in table_for_product_info.find_all('tr'):
            key = tr.find('th').get_text().strip()
            if key is not None and key not in ['Customer Reviews', 'Best Sellers Rank']:
                value = tr.find('td').get_text().strip()
                product_information_dict[key] = value

    # Per-variant (color) hi-res image URLs, keyed by the variant ASIN.
    image_details = {}
    if 'colorToAsin' in images and images['colorToAsin'] is not None:
        colors = images['colorToAsin']
        for color_name, color_dict in colors.items():
            _asin = color_dict['asin']
            image_details[_asin] = {
                'name': color_name,
                'image_urls': []
            }
            images_by_color = images['colorImages'][color_name]
            for elem in images_by_color:
                if 'hiRes' in elem:
                    image_details[_asin]['image_urls'].append(elem['hiRes'])

    # Fix: loop variable renamed (entries are dicts, not URLs).
    for video in videos:
        if 'url' in video:
            video_urls.append(video['url'])

    # Fallback for single-variant pages: images live in a 'colorImages'
    # literal instead of colorToAsin.
    if not image_details:
        try:
            images_json = self.__json_loads(strings.between(response.text, '\'colorImages\': { \'initial\': ', '}]},') + '}]')
            if images_json is not None:
                image_details[asin] = {
                    'name': asin,
                    'image_urls': []
                }
                for image_json in images_json:
                    try:
                        image_details[asin]['image_urls'].append(image_json['large'])
                    except Exception as e:
                        print(e)
        except:
            pass

    # Other ASINs of the same product family (sizes/colors).
    associated_asins = []
    try:
        associated_asins_json = self.__json_loads(strings.between(response.text, 'dimensionToAsinMap :', '},').strip() + '}')
        if associated_asins_json is not None:
            for val in associated_asins_json.values():
                associated_asins.append(val)
    except:
        pass

    return {
        'title': title,
        'price': price,
        'categories': categories,
        'features': features,
        'product information': product_information_dict,
        'images': image_details,
        'videos_url': video_urls,
        'associated_asins': associated_asins
    }
def parse_product(cls, response: Optional[Response], debug: bool = False) -> Optional[Product]:
    """Parse an Amazon product page response into a Product.

    Returns None for a missing/non-2xx response or when the embedded
    ``jQuery.parseJSON('...')`` blob cannot be decoded. All other
    sections are scraped best-effort; a failed section leaves that
    field empty.
    """
    if not response or response.status_code not in [200, 201]:
        return None

    categories = []
    features = []
    # Bug fix: 'details' was bound only inside a try whose `except: pass`
    # could leave it unbound, raising NameError at the return below.
    details = {}

    soup = bs(response.content, 'lxml')

    parsed_json = cls.__json_loads(
        strings.between(response.text, 'var obj = jQuery.parseJSON(\'', '\')'))
    if parsed_json is None:
        return None

    images = parsed_json  # same blob also carries colorToAsin/colorImages
    title = parsed_json['title'].strip()
    asin = parsed_json['mediaAsin']
    raw_videos = parsed_json['videos']

    # Bullet-point feature list (best effort).
    try:
        for feature in soup.find(
                'div', {
                    'class': 'a-section a-spacing-medium a-spacing-top-small'
                }).find_all('span', {'class': 'a-list-item'}):
            try:
                features.append(feature.get_text().strip())
            except:
                pass
    except Exception as e:
        if debug:
            print(e)

    # Breadcrumb categories, normalized to lowercase.
    try:
        for cat_a in soup.find('div', {
                'id': 'wayfinding-breadcrumbs_container'
        }).find_all('a', class_='a-link-normal a-color-tertiary'):
            try:
                categories.append(
                    bs(cat_a.text, "lxml").text.replace('\\', '/').replace(
                        '<', ' ').replace('>', ' ').strip().lower())
            except:
                pass
    except Exception as e:
        if debug:
            print(e)

    # Price; None when the price block is missing or unparsable.
    try:
        price_text = soup.find('span', {
            'id': 'priceblock_ourprice'
        }).text.replace('$', '').strip()
        price = float(price_text)
    except:
        price = None

    # Key/value "Product information" table, minus review/rank rows.
    try:
        table_for_product_info = soup.find(
            'table', {
                'id': 'productDetails_detailBullets_sections1',
                'class': 'a-keyvalue prodDetTable'
            })
        if table_for_product_info is not None:
            for tr in table_for_product_info.find_all('tr'):
                key = tr.find('th').get_text().strip()
                if key is not None and key not in [
                        'Customer Reviews', 'Best Sellers Rank'
                ]:
                    value = tr.find('td').get_text().strip()
                    details[key] = value
    except:
        pass

    # Per-variant (color) hi-res image URLs, keyed by the variant ASIN.
    image_details = {}
    if 'colorToAsin' in images and images['colorToAsin'] is not None:
        colors = images['colorToAsin']
        for color_name, color_dict in colors.items():
            _asin = color_dict['asin']
            image_details[_asin] = {'name': color_name, 'image_urls': []}
            images_by_color = images['colorImages'][color_name]
            for elem in images_by_color:
                if 'hiRes' in elem:
                    image_details[_asin]['image_urls'].append(elem['hiRes'])

    # Normalize video entries, de-duplicated by url.
    # Bug fix: the original appended the normalized dicts back onto the
    # list it was iterating, so it re-visited its own output entries.
    videos = []
    added_video_urls = []
    for elem in raw_videos:
        try:
            vid_url = elem['url']
            if debug:
                # Bug fix: was an unconditional leftover debug print.
                print(vid_url)
            if vid_url in added_video_urls:
                continue
            video = {'url': vid_url}
            video['title'] = elem['title'].strip()
            video['height'] = int(elem['videoHeight'] if 'videoHeight' in elem else elem['height'])
            video['width'] = int(elem['videoWidth'] if 'videoWidth' in elem else elem['width'])
            videos.append(video)
            added_video_urls.append(vid_url)
        except Exception as e:
            if debug:
                print(e)

    # Fallback for single-variant pages: images live in a 'colorImages'
    # literal instead of colorToAsin.
    if image_details is None or image_details == {}:
        try:
            images_json = cls.__json_loads(
                strings.between(response.text,
                                '\'colorImages\': { \'initial\': ', '}]},') +
                '}]')
            if images_json is not None:
                image_details[asin] = {'name': asin, 'image_urls': []}
                for image_json in images_json:
                    try:
                        image_details[asin]['image_urls'].append(
                            image_json['large'])
                    except Exception as e:
                        if debug:
                            print(e)
        except:
            pass

    # Other ASINs of the same product family (sizes/colors).
    associated_asins = []
    try:
        associated_asins_json = cls.__json_loads(
            strings.between(response.text, 'dimensionToAsinMap :',
                            '},').strip() + '}')
        if associated_asins_json is not None:
            for val in associated_asins_json.values():
                associated_asins.append(val)
    except:
        pass

    return Product(title, asin, price, categories, features, details,
                   image_details, videos)