def test_normal_auto_platform_linux(self, value, expected):
    if isinstance(expected, str):
        sanitized = sanitize_filepath(value, platform="auto")
        assert is_valid_filepath(sanitized, platform="auto")
    else:
        with pytest.raises(expected):
            sanitize_filepath(value, platform="auto")
def determineName(self, filename, ext=None):
    class Default(dict):
        def __missing__(self, key):
            return "{" + key + "}"

    md = self.metdata

    # padding for issue
    md.issue = IssueString(md.issue).asString(pad=self.issue_zero_padding)

    template = self.template
    pathComponents = template.split(os.sep)
    new_name = ""

    fmt = MetadataFormatter(self.smart_cleanup)
    for Component in pathComponents:
        new_name = os.path.join(
            new_name,
            fmt.vformat(Component, args=[], kwargs=Default(vars(md))).replace("/", "-"),
        )

    if ext is None or ext == "":
        ext = os.path.splitext(filename)[1]

    new_name += ext

    # some tweaks to keep various filesystems happy
    new_name = new_name.replace(": ", " - ")
    new_name = new_name.replace(":", "-")

    # remove padding
    md.issue = IssueString(md.issue).asString()

    if self.move:
        return sanitize_filepath(new_name.strip())
    else:
        return os.path.basename(sanitize_filepath(new_name.strip()))
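# A minimal self-contained sketch (not from the original project) of why the
# Default dict above works: str.format_map() calls __missing__ for unknown
# keys, so unresolved template placeholders are left intact instead of
# raising KeyError. The series/issue names here are made up for illustration.
class _DefaultDemo(dict):
    def __missing__(self, key):
        return "{" + key + "}"

print("{series} #{issue}".format_map(_DefaultDemo(series="Saga")))
# -> "Saga #{issue}"  (the missing 'issue' key survives as a placeholder)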
async def download_article(self, article: Article, semaphore: asyncio.Semaphore) -> str:
    async with semaphore:
        await self._init()
        await self._inject_js()
        url = f"https://time.geekbang.org/column/article/{article.id}"
        # print(f'🔗 {url}')
        await self.page.goto(url, waitUntil=["load", "networkidle0"])
        title = await self.page.title()
        # str.strip() removes a *set of characters* from both ends, not a
        # suffix; removesuffix() (Python 3.9+) drops only the trailing
        # " - 极客时间" site name.
        title = title.removesuffix(" - 极客时间").strip()
        foldername = sanitize_filepath(
            # f"{article.column.title}/{article.chapter.id}_{article.chapter.title}"
            article.column.title)
        os.makedirs(name=foldername, exist_ok=True)
        filename = sanitize_filepath(
            f"{foldername}/{article.chapter.id}_{article.chapter.title}_{article.id}_{article.title}"
        )
        filename = filename.replace(" ", "_")
        await self._process_and_print(filename)
        await self.page.browser.close()
        await asyncio.sleep(random.randint(1, 2))
        return title
def create_folder(path):
    # keep the sanitized result; the original call discarded the return value
    path = sanitize_filepath(path)
    if not os.path.exists(path):
        os.makedirs(path)
    os.chdir(path)
    return os.getcwd()
def main():
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    input_parser = create_input_parser()
    args = input_parser.parse_args()

    tululu_category_link = "https://tululu.org/l55/"
    book_folder = sanitize_filepath(os.path.join(args.dest_folder, "books"))
    cover_folder = sanitize_filepath(os.path.join(args.dest_folder, "images"))
    book_description_filepath = sanitize_filepath(args.json_filepath)
    book_descriptions = []

    Path(book_folder).mkdir(parents=True, exist_ok=True)
    Path(cover_folder).mkdir(parents=True, exist_ok=True)
    filepath, filename = os.path.split(book_description_filepath)
    Path(filepath).mkdir(parents=True, exist_ok=True)

    if not args.last_page:
        args.last_page = fetch_category_last_page(tululu_category_link)

    try:
        book_links = get_book_links(tululu_category_link, args.start_page, args.last_page)
    except (
            requests.exceptions.ConnectionError,
            requests.exceptions.HTTPError,
    ):
        print("An error occurred while fetching the book list! Check your internet connection.")
        sys.exit()

    for index, book_link in enumerate(book_links):
        try:
            response = fetch_response(book_link)
            html_soup = BeautifulSoup(response.content, "lxml")
            text_link = fetch_book_text_link(html_soup)
            if not text_link:
                print(f"The book at {book_link} has no text file! Skipping!")
                continue
            book_description = parse_book_page(html_soup)
            if not args.skip_imgs:
                book_description["img_src"] = download_cover(
                    book_link, book_description, cover_folder, index)
            if not args.skip_txt:
                book_title = f"{index}_{book_description['title']}"
                book_description["book_path"] = download_book_text(
                    book_link, text_link["href"], book_folder, book_title)
            book_description.pop("img_src_link", None)
            book_descriptions.append(book_description)
        except (
                requests.exceptions.ConnectionError,
                requests.exceptions.HTTPError,
        ) as error:
            print(f"An error occurred while downloading the book at {book_link}: {error}! Skipping!")
            continue

    with open(book_description_filepath, "a") as file:
        json.dump(book_descriptions, file, ensure_ascii=False, indent=4)
def downloadContent(self,
                    root: Union[Path, str] = Path("Albums"),
                    printProgress: bool = True):
    """
    Downloads all pictures that don't already exist in the directory to the folder `root`

    The progress bar can be disabled by passing False to printProgress

    Returns the list of downloaded files' filepaths
    """
    paths = []
    if isinstance(root, str):
        root = Path(root)
    root = root.joinpath(sanitize_filepath(self.sanitizedName))
    root.mkdir(parents=True, exist_ok=True)
    with trange(len(self.contentUrls), disable=not printProgress, desc=self.name) as tq:
        for i in tq:
            if self.isManga:
                # zero-pad the page index so files sort in reading order
                fpath = root.joinpath(
                    f"{self.sanitizedName}_{str(i).zfill(len(str(self.pictureCount - 1)))}")
            else:
                fpath = root.joinpath(Path(urlparse(self.contentUrls[i]).path).name)
            printName = f'"{self.name}" page {i + 1}/{self.pictureCount}'
            globResult = list(root.glob(f"{fpath.stem}*"))
            if globResult:  # the file (or its _SKIPPED marker) already exists
                tq.set_description(f"{printName} exists")
                paths.append(globResult[0])
                continue
            try:
                r = self.handler.get(self.contentUrls[i])
                # pick the extension from the response's content type
                fpath = fpath.with_suffix(
                    mimetypes.guess_extension(r.headers['content-type']))
                with open(sanitize_filepath(fpath), "wb") as f:
                    f.write(r.content)
                tq.set_description(f'{printName} done')
                paths.append(fpath)
            except Exception as e:
                # leave an empty _SKIPPED marker so this page isn't retried next run
                with open(sanitize_filepath(fpath.with_name(fpath.name + "_SKIPPED")),
                          "wb") as _:
                    pass
                tq.set_description(f'{printName} skipped because {e}')
                paths.append(fpath)
    return paths
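# A minimal usage sketch (the constructor and its arguments are assumptions;
# the class and its HTTP handler come from the surrounding project, not shown
# here): download an album into ./Albums/<sanitized name>/ silently.
#
#   album = Album(...)                                           # hypothetical
#   files = album.downloadContent(root="Albums", printProgress=False)
#   print(f"{len(files)} files on disk")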
def downloadContent(self,
                    downloadQuality: int = 0,
                    root: Union[Path, str] = Path("Videos"),
                    printProgress: bool = True):
    """
    FIXME: for some reason access to videos is forbidden. This was not the
    case before. If anybody can help, feel free to raise an issue or a pull
    request.

    Downloads the video to the folder `root` if it doesn't already exist in
    the directory.

    The quality is chosen by `downloadQuality`, which defaults to the lowest
    quality. `downloadQuality` can be a number from 0 to 3, with 0
    representing 240p (the lowest quality). If the chosen quality is not
    available, this falls back to the highest quality available (which is
    always lower than the chosen quality).

    The progress bar can be disabled by passing False to printProgress

    Returns the path of the downloaded video
    """
    if isinstance(root, str):
        root = Path(root)
    root = root.joinpath(sanitize_filepath(self.sanitizedName))
    root.mkdir(parents=True, exist_ok=True)
    url = self.contentUrls[downloadQuality]
    if not url:
        # fall back to the best quality at or below the requested one
        for i in range(downloadQuality + 1):
            url = self.contentUrls[i] if self.contentUrls[i] else url
    fpath = root.joinpath(self.sanitizedName)
    printName = self.name
    r = self.handler.get(url, stream=True)
    fpath = fpath.with_suffix(mimetypes.guess_extension(r.headers['content-type']))
    total_size_in_bytes = int(r.headers.get('content-length', 0))
    with tqdm(total=total_size_in_bytes,
              disable=not printProgress,
              unit='iB',
              unit_scale=True,
              desc=self.name) as tq:
        with open(sanitize_filepath(fpath), 'wb') as file:
            for data in r.iter_content(1024):
                tq.update(len(data))
                file.write(data)
        if total_size_in_bytes != 0 and tq.n != total_size_in_bytes:
            # incomplete download: leave a _SKIPPED marker next to the file
            with open(sanitize_filepath(fpath.with_name(fpath.name + "_SKIPPED")),
                      "wb") as _:
                pass
            tq.set_description(f'{printName} skipped')
    return fpath
async def run_chapter(chapter):
    # print(chapter)
    base = sanitize_filepath('%03d-%s' % (chapter['id'], chapter['tags']['title']))
    base = os.path.join(outputDirectory, base)
    file = base + '.mp3'
    tempfile = base + '-tmp.mp3'
    if os.path.isfile(file):
        return 1, float(chapter['end_time']) - float(chapter['start_time']), file
    cmd = []
    cmd.extend(ffmpeg)
    cmd.extend(['-activation_bytes', activationBytes])
    cmd.extend(['-i', inputFile])
    cmd.extend(['-ss', chapter['start_time']])
    cmd.extend(['-to', chapter['end_time']])
    cmd.extend(['-vn', '-codec:a', 'libmp3lame'])
    cmd.extend(['-b:a', '%dk' % cbr])
    # cmd.extend(['-filter_complex', '[0:a]channelsplit=channel_layout=stereo:channels=FR[right]', '-map', '[right]'])
    cmd.extend(['-map_metadata', '-1'])
    cmd.extend(metadata)
    cmd.extend(['-metadata', 'track=%d/%d' % (chapter['id'], numChapters)])
    cmd.extend(['-y', tempfile])
    # print(" ".join(map(shlex.quote, cmd)))
    process = await asyncio.create_subprocess_exec(*cmd)
    # , stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
    returncode = await process.wait()
    if returncode == 0:
        os.rename(tempfile, file)
    else:
        os.remove(tempfile)
    return returncode, float(chapter['end_time']) - float(chapter['start_time']), file
def unique_path(path, extension='pdf', sanitize=True):
    """Append a number to the path if it is not unique.

    Parameters
    ----------
    path : str
        Path of the filename without the extension.
    extension : str, optional
        File extension.
    sanitize : bool, optional
        If True, sanitizes the filename by removing illegal characters and
        making the path compatible with the operating system.

    Returns
    -------
    str
        Unique path.
    """
    if sanitize:
        path = sanitize_filepath(path, platform='auto')
    full_path = '{}.{}'.format(path, extension)
    if os.path.exists(full_path):
        number = 1
        while True:
            number += 1
            new_full_path = '{}-{}.{}'.format(path, number, extension)
            if not os.path.exists(new_full_path):
                full_path = new_full_path
                break
    return full_path
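# A minimal usage sketch of unique_path() ('report' and 'figure' are made-up
# names; results depend on what already exists on disk):
#
#   unique_path('report')          # -> 'report.pdf', or 'report-2.pdf' if taken
#   unique_path('figure', 'png')   # checks 'figure.png' instead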
def save_pickle(variable, path, allow_overwrite=False, verbose=False, sanitize=True):
    """Saves variable to a pickle file.

    Parameters
    ----------
    variable : any
        Variable to be saved.
    path : str
        Path to the pickle file, excluding extension.
    allow_overwrite : bool, optional
        If True, does not check for existing files with the same name and
        overwrites them if they exist.
    verbose : bool, optional
        If True, notifies the user that the file has been saved.
    sanitize : bool, optional
        If True, sanitizes the filename by removing illegal characters and
        making the path compatible with the operating system.
    """
    if sanitize:
        path = sanitize_filepath(path, platform='auto')
    if allow_overwrite:
        path = '{}.pickle'.format(path)
    else:
        path = unique_path(path, 'pickle')
    with open(path, 'wb') as handle:
        pickle.dump(variable, handle, protocol=pickle.HIGHEST_PROTOCOL)
    if verbose:
        print('Saved {}.'.format(path))
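# A minimal usage sketch ('results/run' is a made-up path; the directory must
# already exist): repeated saves without allow_overwrite produce numbered
# files via unique_path() above instead of clobbering the first one.
#
#   save_pickle({'a': 1}, 'results/run')                        # results/run.pickle
#   save_pickle({'a': 2}, 'results/run')                        # results/run-2.pickle
#   save_pickle({'a': 3}, 'results/run', allow_overwrite=True)  # overwrites run.pickle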
def __init__(self, base, file_path=None, full_file_path=None):
    super().__init__(base, file_path, full_file_path)
    self._path = self.remove_dotslash(self._path)
    # noinspection PyUnresolvedReferences
    self._path = pathvalidate.sanitize_filepath(self._path, '_').strip(". /\\").strip()
    if not self._path:
        self._path = '_'
def test_normal_pathlike(self, value, replace_text, expected):
    sanitized_name = sanitize_filepath(value, replace_text)
    assert sanitized_name == expected
    assert is_pathlike_obj(sanitized_name)

    validate_filepath(sanitized_name)
    assert is_valid_filepath(sanitized_name)
def update_index(self):
    """Function to download new files from CMS and add them to our MongoDB database"""
    for course_name, files in self.scraper.get_courses_docs():
        if not files:
            continue
        print("Checking", course_name, "for new docs.")
        # Get all the docs of the course
        course_docs = Doc.objects(course=course_name).only("file_path")
        doc_paths = set(doc.file_path for doc in course_docs)
        for file in files:
            file_path: Path = file["file_path"]
            if file_path.suffix not in self.ALLOWED_EXTS:
                continue
            # Remove illegal characters from the path
            sanitized_path = str(sanitize_filepath(file_path))
            if sanitized_path in doc_paths:  # TODO: Also check updated_at of file
                continue  # Already processed the file
            print("\tDownloading", file_path.name, end=". ")
            save_path = get_real_path(sanitized_path)
            self.scraper.download_file(save_path, file["file_url"])
            print("Done.")
            doc = Doc(
                file_path=sanitized_path,
                course=course_name,
                downloaded_at=datetime.now(),
            )
            doc.save()  # Add the new doc to DB
            sentences = extract_sentences(save_path)
            self.add_to_index(doc, sentences)
def download_directory(self, path, node, retry, dcb, ecb):
    disk_path_object = pathlib.Path(path.lstrip('/'))
    disk_path_object = pathlib.Path(sanitize_filepath(str(disk_path_object)))
    if disk_path_object.is_dir():
        self.logger.info('Skipping directory that already exists: {}'
                         .format(str(disk_path_object)))
        return True
    download_ok = False
    for i in range(retry):
        try:
            if i != 0:
                self.logger.error('Failed to create directory {}, trying attempt {}'
                                  .format(str(disk_path_object), i + 1))
            dcb(path, False, None, None, None)
            disk_path_object.mkdir(parents=True, exist_ok=False)
            dcb(path, True, None, None, None)
            ecb(path)
            download_ok = True
            break
        except IOError as err:
            ecb(path)
            self.logger.error(err)
    return download_ok
def download_txt(url, filename, folder=None):
    """Function for downloading text files.

    Args:
        url (str): Link to the text to download.
        filename (str): Name to save the file under.
        folder (str): Folder to save into.

    Returns:
        str: Path to the file where the text was saved.
    """
    try:
        txt_response = request_tululu(url)
    except requests.RequestException:
        return
    if txt_response.status_code != 200:
        return
    correct_filename = f"{get_hash_sum(txt_response)}_{sanitize_filename(filename)}.txt"
    if folder is not None:
        correct_folder = sanitize_filepath(os.path.join(folder, 'books'))
    else:
        correct_folder = "books"
    correct_path = os.path.join(correct_folder, correct_filename).replace("\\", "/")
    os.makedirs(correct_folder, exist_ok=True)
    with open(correct_path, 'w', encoding='utf8', newline='') as file:
        file.write(txt_response.text)
    return correct_path
def download_uri(uri, path, fname):
    """
    Downloads the content at the specified URI to {path}/{fname}.
    """
    path = sanitize_filepath(path, platform="auto")
    os.makedirs(path, exist_ok=True)
    sp.run(['curl', '-s', '-o', osp.join(path, fname), uri])
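# A minimal usage sketch (URL and folder name are made up): with
# platform="auto" the folder is sanitized for the *current* OS before curl
# writes to it, so a ':' in the name survives on Linux but is removed on
# Windows before os.makedirs runs.
#
#   download_uri('https://example.com/data.csv', 'downloads/raw: 2024', 'data.csv')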
def write_url_list(path, text_set):
    """write list of strings to file as lines

    Args:
        path (str): local file path
        text_set (tuple|list|set[str]): iterable of strings to write

    Returns:
        path (str): sanitized local file path
    """
    assert isinstance(text_set, (tuple, list, set)), \
        "write_url_list: text_set is not an iterable."
    assert isinstance(path, str), "write_url_list: path is not a string."
    path = pathvalidate.sanitize_filepath(path, platform='universal')
    directory = os.path.dirname(path)
    # guard against a bare filename, where dirname() returns '' and
    # os.makedirs('') would raise
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    with open(path, 'w') as wf:
        for s in text_set:
            wf.write(s + '\n')
    return path
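# A minimal usage sketch (file name is made up): platform='universal'
# sanitizes against the rules of every supported OS at once, so the written
# path stays portable across machines.
#
#   urls = {'https://example.com/a', 'https://example.com/b'}
#   saved = write_url_list('out/url:list.txt', urls)
#   # ':' is invalid on Windows, so the universal rules drop it -> 'out/urllist.txt'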
def handle_valid_user(user):
    year = date.today().year
    month = date.today().month
    if not database_interface.is_attestation_sent_for_month(user.user_id, month, year):
        print(f"Sending attestation for user {user.first_name} {user.last_name}\n")
        error_msg = ""
        try:
            body, subject = text_generator.get_email_text(
                user.first_name, user.last_name, year, month)
            filename = (f"{user.first_name.upper()}_{user.last_name.upper()}"
                        f"_{month}_{year}_attestation_navigo.pdf")
            filename = sanitize_filepath(filename)
            download_folder = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "downloaded_attestation")
            filename = os.path.join(download_folder, filename)
            download_attestation.download_attestation(
                user.navigo_id, user.navigo_token, filename, month, year)
            organization_email = database_interface.get_organization_email(
                user.organization_id)
            send_email.send_email(subject, body, [user.email, organization_email],
                                  filename=filename)
        except AttributeError as e:
            error_msg = handle_exception(user, str(e))
        database_interface.add_attestation(user_id=user.user_id,
                                           error_msg=str(error_msg))
def exportEntity(self, entity, rootPath, folderName=None):
    from labstep.entities.file.model import File

    if folderName is None:
        if entityNameInFolderName:
            folderName = f'{entity.id} - {entity.name}' if hasattr(entity, 'name') \
                else f'{entity.id}'
        else:
            folderName = str(entity.id)

    entityDir = Path(rootPath).joinpath(sanitize_filepath(folderName))
    entityDir.mkdir(parents=True, exist_ok=True)

    infoFile = entityDir.joinpath('entity.json')

    with open(infoFile, 'w') as out:
        json.dump(entity.__data__, out, indent=2)

    if hasattr(entity, 'file') and entity.file is not None:
        lsFile = File(entity.file, entity.__user__)
        fileDir = entityDir.joinpath('files')
        lsFile.export(fileDir)

    if hasattr(entity, 'files') and entity.files is not None:
        for file in entity.files:
            lsFile = File(file, entity.__user__)
            fileDir = entityDir.joinpath('files')
            lsFile.export(fileDir)

    return entityDir
def get_whole_target_path(self, output_folder: str, name_template: str):
    # First sanitize the whole path with Linux rules (which keep '/' as the
    # separator), then apply the stricter Windows rules to each path segment
    # individually.
    sanitized = sanitize_filepath(
        file_path=path_join(output_folder, self.get_output_name(name_template)),
        platform="Linux")
    return "/".join(
        sanitize_filename(filename=x, platform="Windows")
        for x in sanitized.split("/"))
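# A minimal sketch of the two-pass idea above (the path is made up): Linux
# rules only forbid '/' and NUL, so ':' and '?' survive the first pass, and
# the per-segment Windows pass is what actually strips them.
#
#   sanitize_filepath("out/ep:1/a?.mp3", platform="Linux")   # -> "out/ep:1/a?.mp3"
#   "/".join(sanitize_filename(x, platform="Windows")
#            for x in "out/ep:1/a?.mp3".split("/"))          # -> "out/ep1/a.mp3"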
def download_image(image_link: str, image_folder: str = "./") -> str:
    image_filename = get_image_name(image_link)
    sanitized_folder = sanitize_filepath(image_folder)
    sanitized_filename = sanitize_filename(image_filename)
    filepath = os.path.join(sanitized_folder, sanitized_filename)
    image_data = fetch_response(image_link)
    write_image_to_file(image_data.content, filepath)
    return filepath
def test_normal_str(self, platform, value, replace_text, expected):
    sanitized_name = sanitize_filepath(value, platform=platform,
                                       replacement_text=replace_text)
    assert sanitized_name == expected
    assert isinstance(sanitized_name, str)

    validate_filepath(sanitized_name, platform=platform)
    assert is_valid_filepath(sanitized_name, platform=platform)
def test_normal_multibyte(self, test_platform, value, replace_text, expected):
    sanitized_name = sanitize_filepath(value, replace_text, platform=test_platform)
    assert sanitized_name == expected

    validate_filepath(sanitized_name, platform=test_platform)
    assert is_valid_filepath(sanitized_name, platform=test_platform)
def download_and_save_url(save_path, fname, url):
    # extension taken from the URL path, e.g. '.pdf'
    url_ext = os.path.splitext(urlparse(url).path)[1]
    if len(url_ext) > 0:
        file_name = fname + url_ext
    else:
        # no extension in the URL; try the Content-Disposition header
        r = requests.head(url, stream=True, allow_redirects=True)
        file_name = fname
        content_disposition = r.headers.get("Content-Disposition", None)
        name_error = content_disposition is None
        if not name_error:
            file_name = get_name_from_content_disposition(fname, content_disposition)
        if name_error or file_name is None:
            # HEAD gave nothing useful; retry with GET
            r = requests.get(url, stream=True, allow_redirects=True)
            content_disposition = r.headers.get("Content-Disposition", None)
            if content_disposition is not None:
                file_name = get_name_from_content_disposition(file_name,
                                                              content_disposition)
            else:
                print(url)
                print("Server file name error. Use default name.")
                print(r.headers)
                # fall back to a .bin extension (file_name may be None here)
                file_name = (file_name or fname) + ".bin"

    r = requests.get(url, stream=True, allow_redirects=True)
    file_size = int(r.headers.get('content-length', 0))
    initial_pos = 0
    file_path = sanitize_filepath(os.path.join(save_path, file_name))
    # if len(file_path) > 260:
    #     file_path = '\\\\?\\' + file_path
    # TODO: add check file exists.
    print("Start download url: {0} To: {1}".format(url, file_name))
    try:
        with open(file_path, 'wb') as f:
            with tqdm(total=file_size, unit='B', unit_scale=True, unit_divisor=1024,
                      desc=file_name, initial=initial_pos, ascii=True, miniters=1,
                      file=sys.stdout) as pbar:
                for chunk in r.iter_content(32 * 1024):
                    f.write(chunk)
                    pbar.update(len(chunk))
    except IOError as ioe:
        print(ioe)
        quit()
    print("\n")
def download_image(from_=None, to=None):
    try:
        path = sanitize_filepath(to, platform="auto")
        content = get_content_from_url(from_)
        if not content:
            raise EmptyImageError(f"Got empty image from {from_}")
        save_image(path, content)
    except Exception as e:
        print(e)
def download_txt(download_url, title, folder='books/'):
    response = requests.get(download_url)
    folder = sanitize_filepath(folder)
    file_name = sanitize_filename(title) + '.txt'
    filepath = os.path.join(folder, file_name)
    with open(filepath, "w") as my_file:
        my_file.write(response.text)
    return filepath
def download_txt(from_="", to="", urlparams=None):
    try:
        path = sanitize_filepath(to, platform="auto")
        content = get_text_from_url(from_, urlparams)
        if not content:
            raise EmptyBookError(f"Got empty textfile from {from_}")
        save_book(path, content)
    except Exception as e:
        print(e)
def remove_metadata(file, replace=True):
    ext = os.path.splitext(file)[1]
    new = f'.Noname{ext}'
    ffmpeg_run(file, new, out_options={'c': 'copy'})
    # title = os.path.splitext(file)[0]
    if replace:
        os.remove(file)
        # while os.path.exists(title + '.webm'):
        #     title += '_webm'
        shutil.move(new, sanitize_filepath(file, platform="auto"))
def run_ffmpeg_multiple_files(self, input_paths, out_path, opts, opts_before=[]):
    self.check_version()

    # sanitize file path
    out_path = pathvalidate.sanitize_filepath(out_path)

    oldest_mtime = min(os.stat(encodeFilename(path)).st_mtime for path in input_paths)

    opts += self._configuration_args()

    files_cmd = []
    for path in input_paths:
        files_cmd.extend([
            encodeArgument('-i'),
            encodeFilename(self._ffmpeg_filename_argument(path), True)
        ])
    cmd = [
        encodeFilename(self.executable, True),
        encodeArgument('-y'),
    ]  # without -y, an error is raised if the output file already exists
    if self.basename == 'ffmpeg':
        cmd += [encodeArgument('-loglevel'), encodeArgument('repeat+info')]
    cmd += ([encodeArgument(o) for o in opts_before] +
            files_cmd +
            [encodeArgument(o) for o in opts] +
            [encodeFilename(self._ffmpeg_filename_argument(out_path), True)])

    if self._downloader.params.get('verbose', False):
        self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd))
    p = subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         stdin=subprocess.PIPE,
                         universal_newlines=True)
    last_line = ''
    for line in p.stderr:
        # line = line.decode('utf-8', 'replace')
        if line.find('time=') > 0:
            print('\033[K' + line.replace('\n', '') + '\r', end='')
            last_line = line
    print('')
    std_out, std_err = p.communicate()
    if p.returncode != 0:
        msg = last_line.strip().split('\n')[-1]
        raise FFmpegPostProcessorError(msg)
    self.try_utime(out_path, oldest_mtime, oldest_mtime)
def parse_folders(path):
    config_file = os.path.join(path, "boostnote.json")
    with open(config_file, "r") as f:
        config = json.load(f)
    folders = config["folders"]
    f_dict = {}
    for i in folders:
        f_dict[i["key"]] = sanitize_filepath(i["name"])
    return f_dict
def test_exception_type(self, value, expected):
    with pytest.raises(expected):
        sanitize_filepath(value)
    assert not is_valid_filepath(value)
def test_normal_str(self, platform, value, replace_text, expected):
    sanitized_name = sanitize_filepath(value, platform=platform,
                                       replacement_text=replace_text)
    assert sanitized_name == expected
    assert isinstance(sanitized_name, six.text_type)

    validate_filepath(sanitized_name, platform=platform)
    assert is_valid_filepath(sanitized_name, platform=platform)
def test_normal_reserved_name(self, value, test_platform, expected):
    filename = sanitize_filepath(value, platform=test_platform)
    assert filename == expected
    assert is_valid_filepath(filename, platform=test_platform)
def test_normal_multibyte(self, value, replace_text, expected):
    sanitized_name = sanitize_filepath(value, replace_text)
    assert sanitized_name == expected

    validate_filepath(sanitized_name)
    assert is_valid_filepath(sanitized_name)