def _check_dir(cls, dir: PurePath): """Create empty directory if not exists.""" if not dir.exists(): dir.mkdir(parents=True)
def _load_meta_json(path: Path) -> dict:
    """Return the JSON dict stored at *path*, or an empty dict if the file is absent."""
    if path.exists():
        with open(path, "r", encoding=constants.FILE_ENCODING) as f:
            return json.load(f)
    return {}


def _write_meta_json(path: Path, data: dict) -> None:
    """Serialize *data* as JSON to *path*, creating parent directories as needed."""
    path.parent.mkdir(exist_ok=True, parents=True)
    with open(path, "w", encoding=constants.FILE_ENCODING) as f:
        json.dump(data, f)


def export_space_files(space_id: str, folder: PurePath, auth_token: str,
                       fetch_after_timestamp: int = 0) -> int:
    """Export a space's files, starting with the newest and going back only to
    the fetch_after_timestamp.

    This order is implemented since paging from oldest to newest doesn't seem
    to be implemented for the beta resource API. The downside to this approach
    is that if file name collisions occur, the names will be non-deterministic
    and depend on when the files are created relative to prior executions of
    this method. For instance, myfile.txt and myfile 1.txt will refer to two
    files with title myfile.txt. Because we start from newest to oldest,
    myfile.txt will normally be the newest file, and myfile 1.txt will be
    older. This may already be counter-intuitive, but additionally, if the
    task was already run and an earlier file was downloaded as myfile.txt,
    it's possible the newer file will be named myfile 1.txt.

    Args:
        space_id: id of the space whose files are exported.
        folder: destination directory for downloaded files and metadata.
        auth_token: bearer token passed through to the API queries.
        fetch_after_timestamp: epoch milliseconds; files created strictly
            before this point are skipped. 0 means "export everything".

    Returns:
        Number of new files actually downloaded (excludes files skipped as
        already downloaded and duplicates).
    """
    if fetch_after_timestamp:
        logger.info("Exporting files only back to %s ms", fetch_after_timestamp)

    # Resume state from previous runs: graphql metadata per file id, and the
    # local path each file id was downloaded to.
    file_entries_file_path = folder / constants.FILES_META_FOLDER / constants.FILE_ENTRIES_FILE_NAME
    file_graphqlitem_by_id = _load_meta_json(file_entries_file_path)
    file_paths_file_path = folder / constants.FILES_META_FOLDER / constants.FILE_PATHS_FILE_NAME
    file_path_by_id = _load_meta_json(file_paths_file_path)

    downloaded = 0
    already_downloaded = 0
    duplicates = 0
    try:
        previous_page_ids = set()
        # Upper bound (exclusive-ish cursor) for the next page request; None
        # means "start from the newest".
        next_page_time_in_milliseconds = None
        while True:
            space_files_page = queries.space_files.execute(
                auth_token, spaceid=space_id,
                timestamp=next_page_time_in_milliseconds)
            if space_files_page:
                logger.debug("Fetched page with %s files for space %s",
                             len(space_files_page), space_id)
            elif len(previous_page_ids) == 0:
                logger.debug("No files found for space %s", space_id)
                break
            else:
                # BUGFIX: the %s placeholder previously had no argument, so
                # this record failed to format; pass space_id explicitly.
                logger.error(
                    "Fetched page with no files for space %s, but expected this page to contain at least one file.",
                    space_id)
                break

            folder.mkdir(exist_ok=True, parents=True)
            found_file = False
            page_ids = set()
            for file in space_files_page:
                file_created_ms = int(
                    parse(file["created"]).timestamp() * 1000)
                if file_created_ms >= fetch_after_timestamp:
                    file_graphqlitem_by_id[file["id"]] = file
                    if file["id"] in previous_page_ids:
                        # Pages overlap at the cursor boundary; skip repeats.
                        logger.debug(
                            "skipping file with id %s since it was in the last page",
                            file["id"])
                    else:
                        found_file = True
                        page_ids.add(file["id"])
                        # BUGFIX: compare against None, not truthiness — a
                        # legitimate 0-ms creation time must still advance
                        # (well, pin) the cursor.
                        if next_page_time_in_milliseconds is not None:
                            next_page_time_in_milliseconds = min(
                                next_page_time_in_milliseconds, file_created_ms)
                        else:
                            next_page_time_in_milliseconds = file_created_ms
                        if file["id"] in file_path_by_id and Path(
                                file_path_by_id[file["id"]]).exists():
                            logger.debug(
                                "file %s is already downloaded to %s, skipping download",
                                file["id"], file_path_by_id[file["id"]])
                            already_downloaded += 1
                        else:
                            file_path, new_file = queries.download(
                                file["id"], file["title"], folder, auth_token)
                            file_path_by_id[file["id"]] = str(file_path)
                            if new_file:
                                downloaded += 1
                            else:
                                duplicates += 1
                else:
                    logger.debug(
                        "ignoring file %s since it is before the requested resume point %s",
                        file["id"], fetch_after_timestamp)
            previous_page_ids = page_ids
            if not found_file:
                # Every file on this page was a repeat or too old: done.
                break
    finally:
        # Persist whatever metadata we gathered, even on error, so the next
        # run can resume without re-downloading.
        if len(file_graphqlitem_by_id) > 0:
            _write_meta_json(file_entries_file_path, file_graphqlitem_by_id)
        if len(file_path_by_id) > 0:
            _write_meta_json(file_paths_file_path, file_path_by_id)
        logger.info(
            "Downloaded %s files, %s files were skipped because they were downloaded according to meta files, %s downloaded files were duplicates of files already downloaded",
            downloaded, already_downloaded, duplicates)
    return downloaded
def mkdirs(self, path: PurePath, exist_ok: bool = True) -> None:
    """Create every directory yielded by ``self.dir_paths(path)``.

    Args:
        path: target path whose directory chain should be created.
        exist_ok: forwarded to ``Path.mkdir``; when False, an existing
            directory raises ``FileExistsError``.
    """
    # Use a distinct loop variable: the original rebound the `path`
    # parameter inside its own loop, which only worked because
    # self.dir_paths(path) was evaluated before the first rebinding.
    for dir_path in self.dir_paths(path):
        dir_path.mkdir(exist_ok=exist_ok, parents=True)