def _download_submission(self, submission: praw.models.Submission):
    """Download every resource attached to a single Reddit submission.

    Resolves a site-specific downloader via DownloadFactory, writes each
    resource to disk under self.download_directory, and records content
    hashes in self.master_hash_list for dedup / hard-link handling.
    Failures are logged and abort processing of this submission only.
    """
    # Guard: callers may pass non-submission praw objects (e.g. comments).
    if not isinstance(submission, praw.models.Submission):
        logger.warning(f'{submission.id} is not a submission')
        return
    # URL-based filtering (user-configured skip patterns).
    if not self.download_filter.check_url(submission.url):
        logger.debug(
            f'Download filter removed submission {submission.id} with URL {submission.url}')
        return
    try:
        # Pick the downloader class that knows how to handle this URL.
        downloader_class = DownloadFactory.pull_lever(submission.url)
        downloader = downloader_class(submission)
        logger.debug(f'Using {downloader_class.__name__} with url {submission.url}')
    except errors.NotADownloadableLinkError as e:
        logger.error(f'Could not download submission {submission.id}: {e}')
        return
    try:
        # Enumerate the concrete resources (images, videos, ...) to fetch.
        content = downloader.find_resources(self.authenticator)
    except errors.SiteDownloaderError as e:
        logger.error(
            f'Site {downloader_class.__name__} failed to download submission {submission.id}: {e}')
        return
    for destination, res in self.file_name_formatter.format_resource_paths(
            content, self.download_directory):
        if destination.exists():
            # Already on disk from an earlier run; skip this resource.
            logger.debug(f'File {destination} already exists, continuing')
        else:
            try:
                res.download(self.args.max_wait_time)
            except errors.BulkDownloaderException as e:
                # NOTE(review): this aborts the whole submission, not just
                # this one resource — remaining resources are skipped.
                logger.error(
                    f'Failed to download resource {res.url} with downloader {downloader_class.__name__}: {e}')
                return
            resource_hash = res.hash.hexdigest()
            destination.parent.mkdir(parents=True, exist_ok=True)
            # Dedup: hash seen before means the content exists elsewhere.
            if resource_hash in self.master_hash_list:
                if self.args.no_dupes:
                    logger.info(
                        f'Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere')
                    return
                elif self.args.make_hard_links:
                    # Link to the previously-downloaded copy instead of
                    # writing the bytes again.
                    self.master_hash_list[resource_hash].link_to(destination)
                    logger.info(
                        f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}')
                    return
            with open(destination, 'wb') as file:
                file.write(res.content)
            logger.debug(f'Written file to {destination}')
            self.master_hash_list[resource_hash] = destination
            logger.debug(f'Hash added to master list: {resource_hash}')
            logger.info(
                f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')
def test_is_web_resource(test_url: str, expected: bool):
    """Parametrized check that URL classification matches expectations."""
    assert DownloadFactory.is_web_resource(test_url) == expected
def test_sanitise_url(test_url: str, expected: str):
    """Parametrized check that URL sanitisation produces the expected form."""
    assert DownloadFactory.sanitise_url(test_url) == expected
def test_factory_lever_bad(test_url: str):
    """Non-downloadable URLs must raise NotADownloadableLinkError."""
    # Callable form of pytest.raises — equivalent to the with-block form.
    pytest.raises(NotADownloadableLinkError, DownloadFactory.pull_lever, test_url)
def test_factory_lever_good(test_submission_url: str, expected_class: BaseDownloader, reddit_instance: praw.Reddit):
    """Known URLs must resolve to their dedicated downloader class.

    The reddit_instance fixture is requested to ensure an authenticated
    session exists for the parametrized URLs, even though it is not used
    directly in the assertion.
    """
    selected = DownloadFactory.pull_lever(test_submission_url)
    assert selected is expected_class
def _download_submission(self, submission: praw.models.Submission):
    """Download every resource attached to a single Reddit submission.

    Applies the exclusion/skip/ignore/filter rules, resolves a
    site-specific downloader via DownloadFactory, writes each resource
    to disk under self.download_directory, stamps files with the
    submission's creation time, and records content hashes in
    self.master_hash_list for dedup / hard-link handling. Failures are
    logged and abort processing of this submission only.
    """
    # FIX: the type guard must run before anything dereferences
    # submission attributes — previously it sat after branches that
    # already read submission.id / .subreddit / .author, so it could
    # never fire usefully.
    if not isinstance(submission, praw.models.Submission):
        logger.warning(f'{submission.id} is not a submission')
        return
    if submission.id in self.excluded_submission_ids:
        logger.debug(f'Object {submission.id} in exclusion list, skipping')
        return
    elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
        logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list')
        return
    # FIX: praw returns author=None for deleted accounts; guard before
    # reading .name to avoid an AttributeError.
    elif submission.author is not None and submission.author.name in self.args.ignore_user:
        logger.debug(
            f'Submission {submission.id} in {submission.subreddit.display_name} skipped'
            f' due to {submission.author.name} being an ignored user')
        return
    elif not self.download_filter.check_url(submission.url):
        logger.debug(f'Submission {submission.id} filtered due to URL {submission.url}')
        return
    logger.debug(f'Attempting to download submission {submission.id}')
    try:
        # Pick the downloader class that knows how to handle this URL.
        downloader_class = DownloadFactory.pull_lever(submission.url)
        downloader = downloader_class(submission)
        logger.debug(f'Using {downloader_class.__name__} with url {submission.url}')
    except errors.NotADownloadableLinkError as e:
        logger.error(f'Could not download submission {submission.id}: {e}')
        return
    # User-disabled site modules are matched by lowercased class name.
    if downloader_class.__name__.lower() in self.args.disable_module:
        logger.debug(f'Submission {submission.id} skipped due to disabled module {downloader_class.__name__}')
        return
    try:
        # Enumerate the concrete resources (images, videos, ...) to fetch.
        content = downloader.find_resources(self.authenticator)
    except errors.SiteDownloaderError as e:
        logger.error(f'Site {downloader_class.__name__} failed to download submission {submission.id}: {e}')
        return
    for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
        if destination.exists():
            logger.debug(f'File {destination} from submission {submission.id} already exists, continuing')
            continue
        elif not self.download_filter.check_resource(res):
            logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
            continue
        try:
            res.download({'max_wait_time': self.args.max_wait_time})
        except errors.BulkDownloaderException as e:
            logger.error(f'Failed to download resource {res.url} in submission {submission.id} '
                         f'with downloader {downloader_class.__name__}: {e}')
            return
        resource_hash = res.hash.hexdigest()
        destination.parent.mkdir(parents=True, exist_ok=True)
        # Dedup: hash seen before means the content exists elsewhere.
        if resource_hash in self.master_hash_list:
            if self.args.no_dupes:
                logger.info(
                    f'Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere')
                return
            elif self.args.make_hard_links:
                # Link to the previously-downloaded copy instead of
                # writing the bytes again.
                self.master_hash_list[resource_hash].link_to(destination)
                logger.info(
                    f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}'
                    f' in submission {submission.id}')
                return
        try:
            with open(destination, 'wb') as file:
                file.write(res.content)
            logger.debug(f'Written file to {destination}')
        except OSError as e:
            logger.exception(e)
            logger.error(f'Failed to write file in submission {submission.id} to {destination}: {e}')
            return
        # Stamp the file with the submission's creation time.
        creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())
        os.utime(destination, (creation_time, creation_time))
        self.master_hash_list[resource_hash] = destination
        logger.debug(f'Hash added to master list: {resource_hash}')
        logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')