def __download_videos(self, descrs):
    """Download every video listed under the channel's Videos tab.

    Videos already present in the cache are skipped. A failed download is
    recorded via ``update_failed_video`` and logged; a failed DB insert is
    logged. Neither failure aborts the remaining videos.
    """
    channel_id = descrs[Tab.HomePage][0]['owner_channel']['id']
    for video_descr in descrs[Tab.Videos]:
        video_id = video_descr['id']
        # Skip anything we have already stored.
        if self.__cache.check_exist_video(video_id):
            logging.info("such video already exist (video_id=%s)" % video_id)
            continue
        # Fetch the full description, retrying through scrappy_decorator.
        try:
            full_video_descr = self.scrappy_decorator(
                self.__video_downloader.load, video_id)
        except Exception as exc:
            self.__cache.update_failed_video(video_id)
            msg = "problem with video downloading (video_id=%s)" % video_id
            logging.warning(utils.CrawlerError(e=exc, msg=msg))
            continue
        data = self.__create_video(video_id, channel_id,
                                   full_video_descr, video_descr)
        # Persist the assembled record; log (but tolerate) insert failures.
        try:
            self.__cache.insert_video_descr(data)
        except Exception as exc:
            msg = "problem with video inserting into db (video_id=%s)" % video_id
            logging.warning(utils.CrawlerError(e=exc, msg=msg))
            logging.error(exc)
def process(self, channel_ids=None):
    """Crawl channels, starting from *channel_ids*, until the cache runs dry.

    Args:
        channel_ids: optional list of seed channel ids.

    Raises:
        utils.CrawlerError: if *channel_ids* is given but is not a list.
    """
    if channel_ids is None:
        channel_ids = []
    if not isinstance(channel_ids, list):
        raise utils.CrawlerError("channel_ids is not list")
    logging.info("setting channel ids from arguments into cache")
    self.__set_base_videos(channel_ids)
    while True:
        # Single fetch point: the cache decides which channel comes next.
        channel_id = self.__cache.get_best_channel_id()
        if channel_id is None:
            break
        full_descr, is_scrappy = self.__scrappy(channel_id)
        if not is_scrappy:
            continue
        self.__set_neighb_channels(full_descr)
        # Downloading youtube for ChannelId
        # TODO: move to scrapper
        self.__download_videos(full_descr)
        # Result is logged inside; either way we proceed to the next channel.
        self.__update_channel_downloaded(channel_id)
def __set_base_videos(self, channel_ids):
    """Seed the cache with the channel ids supplied by the caller.

    Any failure is logged with a traceback; nothing is raised.
    """
    msg = None
    try:
        joined_ids = ','.join(channel_ids)
        # Build the message up-front so the except block can reuse it.
        msg = "set base channels was failed (channel_ids=%s)" % joined_ids
        self.__cache.set_base_channels(channel_ids)
    except Exception as exc:
        logging.exception(utils.CrawlerError(e=exc, msg=msg))
def __update_channel_downloaded(self, channel_id):
    """Mark *channel_id* as downloaded in the cache.

    Returns:
        True on success, False if the cache update failed (the failure
        is logged, never raised).
    """
    try:
        self.__cache.update_channel_downloaded(channel_id)
        return True
    except Exception as exc:
        msg = "problem with update channel_id. " + self.__crash_msg % (
            "channel_id", channel_id)
        logging.error(utils.CrawlerError(e=exc, msg=msg))
        return False
def scrappy_decorator(self, fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` with up to ``self.__max_attempts`` retries.

    Each failed attempt is logged as a warning. Returns fn's result on the
    first success.

    Raises:
        Exception: the last exception raised by *fn* once all attempts
            are exhausted.

    Bug fixed: the original wrote ``except Exception as e`` and then
    ``raise e`` after the loop — but Python 3 deletes the ``as e`` alias
    when the except block ends, so the final re-raise crashed with
    NameError instead of surfacing the scraper error. We now keep an
    explicit reference in ``last_exc``.
    """
    last_exc = None
    for attempt in range(self.__max_attempts):
        try:
            return fn(*args, **kwargs)
        except Exception as e:
            last_exc = e  # 'e' is unbound after this block; keep our own ref
            logging.warning(
                utils.CrawlerError(e=e, msg="problem into scrapper. retry: %d" % attempt))
    raise last_exc
def __set_neighb_channels(self, full_descr):
    """Store the channel's neighbour channels into the cache.

    Errors from the lookup or the cache insert are logged and swallowed.

    Bug fixed: when ``__get_neighb_channels`` itself raised,
    ``neighb_channels`` was still ``None`` and the error handler crashed
    with TypeError while iterating it — masking the original exception.
    The handler now tolerates a ``None`` value.
    """
    neighb_channels = None
    try:
        # Setting neighbours channels into Cache. ChannelId
        neighb_channels = self.__get_neighb_channels(full_descr)
        self.__cache.set_channels(neighb_channels, scrapped=False, valid=True)
    except Exception as e:
        # neighb_channels is None if the lookup failed before assignment.
        ch_ids_str = ','.join([ch['id'] for ch in (neighb_channels or [])])
        e = utils.CrawlerError(e=e, msg=self.__crash_msg % ("channel_ids", ch_ids_str))
        logging.error(e)