def test_remove_from_cache(populated_cache: TaigaCache):
    populated_cache.remove_from_cache(COLUMNAR_FULL_TAIGA_ID, COLUMNAR_FULL_TAIGA_ID)

    assert (
        populated_cache.get_entry(COLUMNAR_FULL_TAIGA_ID, COLUMNAR_FULL_TAIGA_ID)
        is None
    )
    assert (
        populated_cache.get_entry(COLUMNAR_TAIGA_ID_ALIAS, COLUMNAR_FULL_TAIGA_ID)
        is None
    )
    assert (
        populated_cache.get_entry(COLUMNAR_VIRTUAL_TAIGA_ID, COLUMNAR_FULL_TAIGA_ID)
        is None
    )

    c = populated_cache.conn.cursor()
    c.execute(
        """
        SELECT * FROM datafiles
        """
    )
    rows = c.fetchall()
    assert len(rows) == 1
def test_get_entry(
    populated_cache: TaigaCache,
    expected_df: pd.DataFrame,
    full_taiga_id: str,
    taiga_id_alias: str,
    virtual_taiga_id: str,
):
    """
    TODO:
    - get full_taiga_id should return the df
    - get virtual taiga id should not return the df until the link has been added
    - then add virtual taiga id (need to figure out parameters)
    - get virtual taiga id should return df
    - same for alias
    - removing the actual file should remove the entry just for the actual taiga id,
      but not the aliases/virtual taiga id
    """
    df = populated_cache.get_entry(full_taiga_id, full_taiga_id)
    assert df.equals(expected_df)

    df_alias = populated_cache.get_entry(taiga_id_alias, full_taiga_id)
    assert df_alias.equals(df)

    df_virtual = populated_cache.get_entry(virtual_taiga_id, full_taiga_id)
    assert df_virtual.equals(df)
def test_get_raw_entry(tmpdir, populated_cache: TaigaCache):
    p = tmpdir.join("foobar.txt")
    with open(str(p), "w+") as f:
        f.write("baz")

    populated_cache.add_raw_entry(
        str(p), "raw-dataset.1", "raw-dataset.1/some-file", DataFileFormat.Raw
    )

    path_from_cache = populated_cache.get_raw_path(
        "raw-dataset.1", "raw-dataset.1/some-file"
    )
    with open(path_from_cache) as f:
        assert f.read() == "baz"
@pytest.fixture
def populated_cache(tmpdir: py._path.local.LocalPath):
    cache = TaigaCache(str(tmpdir), str(tmpdir.join(CACHE_FILE)))

    p = tmpdir.join("foobar.csv")
    COLUMNAR_DATAFRAME.to_csv(p, index=False)
    cache.add_entry(
        str(p),
        COLUMNAR_FULL_TAIGA_ID,
        COLUMNAR_FULL_TAIGA_ID,
        DataFileFormat.Columnar,
        COLUMNAR_TYPES,
        None,
    )

    MATRIX_DATAFRAME.astype(float).to_csv(p)
    cache.add_entry(
        str(p),
        MATRIX_FULL_TAIGA_ID,
        MATRIX_FULL_TAIGA_ID,
        DataFileFormat.HDF5,
        None,
        None,
    )

    return cache
def test_add_entry(
    populated_cache: TaigaCache,
    full_taiga_id: str,
    taiga_id_alias: str,
    virtual_taiga_id: str,
    datafile_format: DataFileFormat,
    column_types: Optional[Mapping[str, str]],
):
    with patch(
        "taigapy.taiga_cache._write_csv_to_feather"
    ) as mock_write_csv_to_feather:
        populated_cache.add_entry(
            None, full_taiga_id, full_taiga_id, datafile_format, column_types, None
        )
        assert not mock_write_csv_to_feather.called

        populated_cache.add_entry(
            None, taiga_id_alias, full_taiga_id, datafile_format, column_types, None
        )
        assert not mock_write_csv_to_feather.called

        populated_cache.add_entry(
            None, virtual_taiga_id, full_taiga_id, datafile_format, column_types, None
        )
        assert not mock_write_csv_to_feather.called
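# --- Illustrative sketch, not part of the original test module -----------------
# test_get_entry and test_add_entry above receive expected_df / full_taiga_id /
# taiga_id_alias / virtual_taiga_id (and, for test_add_entry, datafile_format /
# column_types) as arguments, so the original module presumably parametrizes
# them. A minimal sketch of how that parametrization could look for the columnar
# case, reusing only constants already referenced in this excerpt (imports are
# omitted here, as elsewhere in the excerpt):
@pytest.mark.parametrize(
    "expected_df,full_taiga_id,taiga_id_alias,virtual_taiga_id",
    [
        (
            COLUMNAR_DATAFRAME,
            COLUMNAR_FULL_TAIGA_ID,
            COLUMNAR_TAIGA_ID_ALIAS,
            COLUMNAR_VIRTUAL_TAIGA_ID,
        )
    ],
)
def test_get_entry_sketch(
    populated_cache: TaigaCache,
    expected_df: pd.DataFrame,
    full_taiga_id: str,
    taiga_id_alias: str,
    virtual_taiga_id: str,
):
    # Entries stored under the canonical ID should also resolve via the alias
    # and virtual IDs once those links exist in the cache.
    assert populated_cache.get_entry(full_taiga_id, full_taiga_id).equals(expected_df)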
class TaigaClient:
    def __init__(
        self,
        url: str = DEFAULT_TAIGA_URL,
        cache_dir: Optional[str] = None,
        token_path: Optional[str] = None,
        figshare_map_file: Optional[str] = None,
    ):
        self.url = url
        self.token = None
        self.token_path = token_path
        self.api: TaigaApi = None

        if cache_dir is None:
            cache_dir = DEFAULT_CACHE_DIR
        self.cache_dir = os.path.expanduser(cache_dir)

        if not os.path.exists(self.cache_dir):
            os.mkdir(self.cache_dir)

        cache_file_path = os.path.join(self.cache_dir, CACHE_FILE)
        self.cache = TaigaCache(self.cache_dir, cache_file_path)

        if figshare_map_file is not None:
            self.figshare_map = parse_figshare_map_file(figshare_map_file)
        else:
            self.figshare_map = None

    def _set_token_and_initialized_api(self):
        if self.token is not None:
            return

        if self.token_path is None:
            token_path = find_first_existing(
                ["./.taiga-token", os.path.join(self.cache_dir, "token")]
            )
        else:
            token_path = find_first_existing([self.token_path])

        with open(token_path, "rt") as r:
            self.token = r.readline().strip()
        self.api = TaigaApi(self.url, self.token)

    def _validate_file_for_download(
        self,
        id_or_permaname: Optional[str],
        dataset_name: Optional[str],
        dataset_version: Optional[str],
        datafile_name: Optional[str],
    ) -> DataFileMetadata:
        if id_or_permaname is None and dataset_name is None:
            # TODO standardize exceptions
            raise ValueError("id or name must be specified")
        elif (
            id_or_permaname is None
            and dataset_name is not None
            and dataset_version is None
        ):
            dataset_metadata: DatasetMetadataDict = (
                self.api.get_dataset_version_metadata(dataset_name, None)
            )
            dataset_version = get_latest_valid_version_from_metadata(dataset_metadata)
            print(
                cf.orange(
                    "No dataset version provided. Using version {}.".format(
                        dataset_version
                    )
                )
            )

        metadata = self.api.get_datafile_metadata(
            id_or_permaname, dataset_name, dataset_version, datafile_name
        )

        if metadata is None:
            raise ValueError(
                "No data for the given parameters. Please check your inputs are correct."
            )

        dataset_version_id = metadata.dataset_version_id
        dataset_permaname = metadata.dataset_permaname
        dataset_version = metadata.dataset_version
        datafile_name = metadata.datafile_name
        data_state = metadata.state
        data_reason_state = metadata.reason_state

        assert dataset_version_id is not None
        assert dataset_permaname is not None
        assert dataset_version is not None
        assert datafile_name is not None

        if data_state == DatasetVersionState.deprecated.value:
            print(
                cf.orange(
                    "WARNING: This version is deprecated. Please use with caution, and see the reason below:"
                )
            )
            print(cf.orange("\t{}".format(data_reason_state)))
        elif data_state == DatasetVersionState.deleted.value:
            self.cache.remove_all_from_cache(
                "{}.{}/".format(dataset_permaname, dataset_version)
            )
            raise TaigaDeletedVersionException(
                "{} version {} is deleted. The data is not available anymore. Contact the maintainer of the dataset.".format(
                    dataset_permaname, dataset_version
                )
            )

        return metadata

    def _download_file_and_save_to_cache(
        self,
        query: str,
        full_taiga_id: str,
        datafile_metadata: DataFileMetadata,
        get_dataframe: bool,
    ) -> Union[str, pd.DataFrame]:
        with tempfile.NamedTemporaryFile(dir=self.cache_dir, delete=False) as tf:
            dataset_permaname = datafile_metadata.dataset_permaname
            dataset_version = datafile_metadata.dataset_version
            datafile_name = datafile_metadata.datafile_name
            datafile_format: DataFileFormat = datafile_metadata.datafile_format

            self.api.download_datafile(
                dataset_permaname, dataset_version, datafile_name, tf.name
            )

            if not get_dataframe:
                return self.cache.add_raw_entry(
                    tf.name,
                    query,
                    full_taiga_id,
                    DataFileFormat(datafile_metadata.datafile_format),
                )

            column_types = None
            if datafile_format == DataFileFormat.Columnar:
                column_types = self.api.get_column_types(
                    dataset_permaname, dataset_version, datafile_name
                )

            return self.cache.add_entry(
                tf.name,
                query,
                full_taiga_id,
                datafile_format,
                column_types,
                datafile_metadata.datafile_encoding,
            )

    def _get_dataframe_or_path_from_figshare(
        self, taiga_id: Optional[str], get_dataframe: bool
    ):
        if taiga_id is None:
            raise ValueError("Taiga ID must be specified to use figshare_file_map")
        if taiga_id in self.figshare_map:
            figshare_file_metadata = self.figshare_map[taiga_id]
        else:
            raise ValueError("{} is not in figshare_file_map".format(taiga_id))

        if get_dataframe and figshare_file_metadata["format"] == DataFileFormat.Raw:
            raise ValueError(
                "The file is a Raw one, please use instead `download_to_cache` with the same parameters"
            )

        get_from_cache = (
            self.cache.get_entry if get_dataframe else self.cache.get_raw_path
        )
        d = get_from_cache(taiga_id, taiga_id)
        if d is not None:
            return d

        with tempfile.NamedTemporaryFile(dir=self.cache_dir, delete=False) as tf:
            download_file_from_figshare(figshare_file_metadata["download_url"], tf.name)

            if not get_dataframe:
                return self.cache.add_raw_entry(
                    tf.name, taiga_id, taiga_id, figshare_file_metadata["format"]
                )
            else:
                return self.cache.add_entry(
                    tf.name,
                    taiga_id,
                    taiga_id,
                    figshare_file_metadata["format"],
                    figshare_file_metadata.get("column_types"),
                    figshare_file_metadata.get("encoding"),
                )

    def _get_dataframe_or_path(
        self,
        id: Optional[str],
        name: Optional[str],
        version: Optional[DatasetVersion],
        file: Optional[str],
        get_dataframe: bool,
    ) -> Optional[Union[str, pd.DataFrame]]:
        if self.figshare_map is not None:
            try:
                return self._get_dataframe_or_path_from_figshare(id, get_dataframe)
            except ValueError as e:
                print(cf.red(str(e)))
                return None

        self._set_token_and_initialized_api()

        if not self.api.is_connected():
            return self._get_dataframe_or_path_offline(
                id, name, version, file, get_dataframe
            )

        # Validate inputs
        try:
            datafile_metadata = self._validate_file_for_download(
                id, name, str(version) if version is not None else version, file
            )
            version = datafile_metadata.dataset_version
        except (TaigaDeletedVersionException, ValueError, Exception) as e:
            print(cf.red(str(e)))
            return None

        datafile_format = datafile_metadata.datafile_format
        if get_dataframe and datafile_format == DataFileFormat.Raw:
            print(
                cf.red(
                    "The file is a Raw one, please use instead `download_to_cache` with the same parameters"
                )
            )
            return None

        # Check the cache
        if id is not None:
            query = id
        else:
            query = format_datafile_id(name, version, file)

        full_taiga_id = format_datafile_id_from_datafile_metadata(datafile_metadata)
        if datafile_metadata.underlying_file_id is not None:
            full_taiga_id = datafile_metadata.underlying_file_id
            underlying_datafile_metadata = self.api.get_datafile_metadata(
                datafile_metadata.underlying_file_id, None, None, None
            )
            if underlying_datafile_metadata.state != DatasetVersionState.approved:
                print(
                    cf.orange(
                        f"The underlying datafile for the file you are trying to download is from a {underlying_datafile_metadata.state.value} dataset version."
                    )
                )

        get_from_cache = (
            self.cache.get_entry if get_dataframe else self.cache.get_raw_path
        )
        try:
            df_or_path = get_from_cache(query, full_taiga_id)
            if df_or_path is not None:
                return df_or_path
        except TaigaCacheFileCorrupted as e:
            print(cf.orange(str(e)))

        # Download from Taiga
        try:
            return self._download_file_and_save_to_cache(
                query, full_taiga_id, datafile_metadata, get_dataframe
            )
        except (Taiga404Exception, ValueError) as e:
            print(cf.red(str(e)))
            return None

    def _get_dataframe_or_path_offline(
        self,
        id: Optional[str],
        name: Optional[str],
        version: Optional[DatasetVersion],
        file: Optional[str],
        get_dataframe: bool,
    ):
        print(
            cf.orange(
                "You are in offline mode, please be aware that you might be out of sync with the state of the dataset version (deprecation)."
            )
        )
        if id is not None:
            query = id
        else:
            if name is None:
                print(cf.red("If id is not specified, name must be specified"))
                return None
            if version is None:
                print(cf.red("Dataset version must be specified"))
                return None
            query = format_datafile_id(name, version, file)

        get_from_cache = (
            self.cache.get_entry if get_dataframe else self.cache.get_raw_path
        )
        try:
            df_or_path = get_from_cache(query, query)
            if df_or_path is not None:
                return df_or_path
        except TaigaRawTypeException as e:
            print(
                cf.red(
                    "The file is a Raw one, please use instead `download_to_cache` with the same parameters"
                )
            )
            return None
        except TaigaCacheFileCorrupted as e:
            print(cf.red(str(e)))
            return None

        print(cf.red("The datafile you requested was not in the cache."))
        return None

    def _validate_create_dataset_arguments(
        self,
        dataset_name: str,
        upload_files: MutableSequence[UploadS3DataFileDict],
        add_taiga_ids: MutableSequence[UploadVirtualDataFileDict],
        folder_id: str,
    ):
        if len(dataset_name) == 0:
            raise ValueError("dataset_name must be a nonempty string.")
        if len(upload_files) == 0 and len(add_taiga_ids) == 0:
            raise ValueError("upload_files and add_taiga_ids cannot both be empty.")

        upload_s3_datafiles, upload_virtual_datafiles = modify_upload_files(
            upload_files, add_taiga_ids
        )

        try:
            self.api.get_folder(folder_id)
        except Taiga404Exception:
            raise ValueError("No folder found with id {}.".format(folder_id))

        return upload_s3_datafiles, upload_virtual_datafiles

    def _validate_update_dataset_arguments(
        self,
        dataset_id: Optional[str],
        dataset_permaname: Optional[str],
        dataset_version: Optional[DatasetVersion],
        changes_description: Optional[str],
        upload_files: MutableSequence[UploadS3DataFileDict],
        add_taiga_ids: MutableSequence[UploadVirtualDataFileDict],
        add_all_existing_files: bool,
    ) -> Tuple[
        List[UploadS3DataFile], List[UploadVirtualDataFile], DatasetVersionMetadataDict
    ]:
        if changes_description is None or changes_description == "":
            raise ValueError("Description of changes cannot be empty.")

        if dataset_id is not None:
            if "." in dataset_id:
                (
                    dataset_permaname,
                    dataset_version,
                    _,
                ) = untangle_dataset_id_with_version(dataset_id)
            else:
                dataset_metadata: DatasetMetadataDict = self._get_dataset_metadata(
                    dataset_id, None
                )
                dataset_permaname = dataset_metadata["permanames"][-1]
        elif dataset_permaname is not None:
            dataset_metadata = self._get_dataset_metadata(dataset_permaname, None)
        else:
            # TODO standardize exceptions
            raise ValueError("Dataset id or name must be specified.")

        if dataset_version is None:
            dataset_version = get_latest_valid_version_from_metadata(dataset_metadata)
            print(
                cf.orange(
                    "No dataset version provided. Using version {}.".format(
                        dataset_version
                    )
                )
            )

        dataset_version_metadata: DatasetVersionMetadataDict = (
            self._get_dataset_metadata(dataset_permaname, dataset_version)
        )

        upload_s3_datafiles, upload_virtual_datafiles = modify_upload_files(
            upload_files,
            add_taiga_ids,
            dataset_version_metadata,
            add_all_existing_files,
        )

        return upload_s3_datafiles, upload_virtual_datafiles, dataset_version_metadata

    async def _upload_to_s3_and_request_conversion(
        self,
        session_id: str,
        upload_file: UploadS3DataFile,
        s3_credentials: S3Credentials,
    ):
        print("Uploading {} to S3".format(upload_file.file_name))
        bucket = s3_credentials.bucket
        partial_prefix = s3_credentials.prefix
        key = "{}{}/{}".format(partial_prefix, session_id, upload_file.file_name)

        session = aiobotocore.get_session()
        async with session.create_client(
            "s3",
            aws_access_key_id=s3_credentials.access_key_id,
            aws_secret_access_key=s3_credentials.secret_access_key,
            aws_session_token=s3_credentials.session_token,
        ) as s3_client:
            with open(upload_file.file_path, "rb") as f:
                resp = await s3_client.put_object(Bucket=bucket, Key=key, Body=f)

        upload_file.add_s3_upload_information(bucket, key)
        print("Finished uploading {} to S3".format(upload_file.file_name))

        print("Uploading {} to Taiga".format(upload_file.file_name))
        self.api.upload_file_to_taiga(session_id, upload_file)
        print("Finished uploading {} to Taiga".format(upload_file.file_name))

    async def _upload_files_async(
        self,
        upload_s3_datafiles: List[UploadS3DataFile],
        upload_virtual_datafiles: List[UploadVirtualDataFile],
        upload_session_id: str,
        s3_credentials: S3Credentials,
    ):
        tasks = [
            asyncio.ensure_future(
                self._upload_to_s3_and_request_conversion(
                    upload_session_id, f, s3_credentials
                )
            )
            for f in upload_s3_datafiles
        ]
        try:
            await asyncio.gather(*tasks)
        except Exception as e:
            for t in tasks:
                t.cancel()
            raise e

        for upload_file in upload_virtual_datafiles:
            print("Linking virtual file {}".format(upload_file.taiga_id))
            self.api.upload_file_to_taiga(upload_session_id, upload_file)

    def _upload_files_serial(
        self,
        upload_s3_datafiles: List[UploadS3DataFile],
        upload_virtual_datafiles: List[UploadVirtualDataFile],
        upload_session_id: str,
        s3_credentials: S3Credentials,
    ):
        # Configuration of the Boto3 client
        s3_client = boto3.client(
            "s3",
            aws_access_key_id=s3_credentials.access_key_id,
            aws_secret_access_key=s3_credentials.secret_access_key,
            aws_session_token=s3_credentials.session_token,
        )

        for upload_file in upload_s3_datafiles:
            bucket = s3_credentials.bucket
            partial_prefix = s3_credentials.prefix
            key = "{}{}/{}".format(
                partial_prefix, upload_session_id, upload_file.file_name
            )

            s3_client.upload_file(upload_file.file_path, bucket, key)
            upload_file.add_s3_upload_information(bucket, key)
            print("Finished uploading {} to S3".format(upload_file.file_name))

            print("Uploading {} to Taiga".format(upload_file.file_name))
            self.api.upload_file_to_taiga(upload_session_id, upload_file)
            print("Finished uploading {} to Taiga".format(upload_file.file_name))

        for upload_virtual_file in upload_virtual_datafiles:
            print("Linking virtual file {}".format(upload_virtual_file.taiga_id))
            self.api.upload_file_to_taiga(upload_session_id, upload_virtual_file)

    def _upload_files(
        self,
        upload_s3_datafiles: List[UploadS3DataFile],
        upload_virtual_datafiles: List[UploadVirtualDataFile],
        upload_async: bool,
    ) -> str:
        upload_session_id = self.api.create_upload_session()
        s3_credentials = self.api.get_s3_credentials()

        if upload_async:
            loop = asyncio.new_event_loop()
            nest_asyncio.apply(loop)
            asyncio.set_event_loop(loop)
            loop.run_until_complete(
                self._upload_files_async(
                    upload_s3_datafiles,
                    upload_virtual_datafiles,
                    upload_session_id,
                    s3_credentials,
                )
            )
            loop.close()
        else:
            self._upload_files_serial(
                upload_s3_datafiles,
                upload_virtual_datafiles,
                upload_session_id,
                s3_credentials,
            )

        return upload_session_id

    def _get_dataset_metadata(
        self, dataset_id: str, version: Optional[DatasetVersion]
    ) -> Optional[Union[DatasetMetadataDict, DatasetVersionMetadataDict]]:
        self._set_token_and_initialized_api()
        if "." in dataset_id:
            dataset_id, version, _ = untangle_dataset_id_with_version(dataset_id)

        return self.api.get_dataset_version_metadata(dataset_id, version)

    # User-facing functions
    def get(
        self,
        id: Optional[str] = None,
        name: Optional[str] = None,
        version: Optional[DatasetVersion] = None,
        file: Optional[str] = None,
    ) -> pd.DataFrame:
        """Retrieves a Table or NumericMatrix datafile from Taiga (or local cache,
        if available) and returns it as a pandas.DataFrame.

        Stores the file in the cache if it is not already stored.

        Errors if the requested datafile is not a Table or NumericMatrix (i.e. is
        a Raw datafile).

        If used while offline, will get datafiles that are already in the cache.

        Keyword Arguments:
            id {Optional[str]} -- Datafile ID of the datafile to get, in the form
                dataset_permaname.dataset_version/datafile_name, or
                dataset_permaname.dataset_version if there is only one file in the
                dataset. Required if dataset_name is not provided. Takes precedence
                if both are provided. (default: {None})
            name {Optional[str]} -- Permaname or id of the dataset with the
                datafile. Required if id is not provided. Not used if both are
                provided. (default: {None})
            version {Optional[Union[str, int]]} -- Version of the dataset. If not
                provided, will use the latest approved (i.e. not deprecated or
                deleted) dataset. Required if id is not provided. Not used if both
                are provided. (default: {None})
            file {Optional[str]} -- Name of the datafile in the dataset. Required
                if id is not provided and the dataset contains more than one file.
                Not used if id is provided. (default: {None})

        Returns:
            pd.DataFrame -- If the file is a NumericMatrix, the row headers will
                be used as the DataFrame's index.
        """
        return self._get_dataframe_or_path(id, name, version, file, get_dataframe=True)

    def download_to_cache(
        self,
        id: Optional[str] = None,
        name: Optional[str] = None,
        version: Optional[DatasetVersion] = None,
        file: Optional[str] = None,
    ) -> str:
        """Retrieves a datafile from Taiga in its raw format (CSV or plain text
        file).

        Keyword Arguments:
            id {Optional[str]} -- Datafile ID of the datafile to get, in the form
                dataset_permaname.dataset_version/datafile_name, or
                dataset_permaname.dataset_version if there is only one file in the
                dataset. Required if dataset_name is not provided. Takes precedence
                if both are provided. (default: {None})
            name {Optional[str]} -- Permaname or id of the dataset with the
                datafile. Required if id is not provided. Not used if both are
                provided. (default: {None})
            version {Optional[Union[str, int]]} -- Version of the dataset. If not
                provided, will use the latest approved (i.e. not deprecated or
                deleted) dataset. Required if id is not provided. Not used if both
                are provided. (default: {None})
            file {Optional[str]} -- Name of the datafile in the dataset. Required
                if id is not provided and the dataset contains more than one file.
                Not used if id is provided. (default: {None})

        Returns:
            str -- The path of the downloaded file.
        """
        return self._get_dataframe_or_path(
            id, name, version, file, get_dataframe=False
        )

    def get_dataset_metadata(
        self, dataset_id: str, version: Optional[DatasetVersion] = None
    ) -> Optional[Union[DatasetMetadataDict, DatasetVersionMetadataDict]]:
        """Get metadata about a dataset.

        Arguments:
            dataset_id {str} -- Dataset ID or permaname, optionally in the form
                dataset_permaname.dataset_version.

        Keyword Arguments:
            version {Optional[Union[str, int]]} -- Version of the dataset. If
                provided, returns metadata for that dataset version rather than
                for the dataset as a whole. (default: {None})

        Returns:
            Union[DatasetMetadataDict, DatasetVersionMetadataDict] -- See docs at
                https://github.com/broadinstitute/taigapy for more details
        """
        try:
            return self._get_dataset_metadata(dataset_id, version)
        except (ValueError, Taiga404Exception) as e:
            print(cf.red(str(e)))
            return None

    def create_dataset(
        self,
        dataset_name: str,
        dataset_description: Optional[str] = None,
        upload_files: Optional[MutableSequence[UploadS3DataFileDict]] = None,
        add_taiga_ids: Optional[MutableSequence[UploadVirtualDataFileDict]] = None,
        folder_id: Optional[str] = None,
        upload_async: bool = True,
    ) -> Optional[str]:
        """Creates a new dataset named dataset_name with local files upload_files
        and virtual datafiles add_taiga_ids in the folder with id folder_id.

        If multiple files in the union of upload_files and add_taiga_ids share the
        same name, Taiga will throw an error and the dataset will not be created.

        Arguments:
            dataset_name {str} -- The name of the new dataset.

        Keyword Arguments:
            dataset_description {Optional[str]} -- Description of the dataset.
                (default: {None})
            upload_files {Optional[List[Dict[str, str]]]} -- List of files to
                upload, where files are provided as dictionary objects d where
                - d["path"] is the path of the file to upload
                - d["name"] is what the file should be named in the dataset. Uses
                  the base name of the file if not provided
                - d["format"] is the Format of the file (as a string).
                And optionally,
                - d["encoding"] is the character encoding of the file. Uses
                  "UTF-8" if not provided
                (default: {None})
            add_taiga_ids {Optional[List[Dict[str, str]]]} -- List of virtual
                datafiles to add, where files are provided as dictionary objects
                with keys
                - "taiga_id" equal to the Taiga ID of the reference datafile in
                  dataset_permaname.dataset_version/datafile_name format
                - "name" (optional) for what the virtual datafile should be called
                  in the new dataset (will use the reference datafile name if not
                  provided).
                (default: {None})
            folder_id {str} -- The ID of the containing folder. If not specified,
                will use home folder of user. (default: {None})
            upload_async {bool} -- Whether to upload asynchronously (parallel) or
                in serial

        Returns:
            Optional[str] -- The id of the new dataset, or None if the operation
                was not successful.
        """
        self._set_token_and_initialized_api()

        if upload_files is None:
            upload_files = []
        if add_taiga_ids is None:
            add_taiga_ids = []

        try:
            if folder_id is None:
                folder_id = self.api.get_user()["home_folder_id"]
            (
                upload_s3_datafiles,
                upload_virtual_datafiles,
            ) = self._validate_create_dataset_arguments(
                dataset_name, upload_files, add_taiga_ids, folder_id
            )
        except ValueError as e:
            print(cf.red(str(e)))
            return None

        if upload_s3_datafiles is None:
            # User declined to upload to public folder
            return None

        try:
            upload_session_id = self._upload_files(
                upload_s3_datafiles, upload_virtual_datafiles, upload_async
            )
        except ValueError as e:
            print(cf.red(str(e)))
            return None

        dataset_id = self.api.create_dataset(
            upload_session_id, folder_id, dataset_name, dataset_description
        )
        print(
            cf.green(
                "Dataset created. Access it directly with this url: {}\n".format(
                    self.url + "/dataset/" + dataset_id
                )
            )
        )
        return dataset_id

    def update_dataset(
        self,
        dataset_id: Optional[str] = None,
        dataset_permaname: Optional[str] = None,
        dataset_version: Optional[DatasetVersion] = None,
        dataset_description: Optional[str] = None,
        changes_description: Optional[str] = None,
        upload_files: Optional[MutableSequence[UploadS3DataFileDict]] = None,
        add_taiga_ids: Optional[MutableSequence[UploadVirtualDataFileDict]] = None,
        add_all_existing_files: bool = False,
        upload_async: bool = True,
    ) -> Optional[str]:
        """Creates a new version of the dataset specified by dataset_id or
        dataset_permaname (and optionally dataset_version).

        Keyword Arguments:
            dataset_id {Optional[str]} -- Generated id or id in the format
                dataset_permaname.dataset_version (default: {None})
            dataset_permaname {Optional[str]} -- Permaname of the dataset to
                update. Must be provided if `dataset_id` is not. (default: {None})
            dataset_version {Optional[Union[str, int]]} -- Dataset version to base
                the new version off of. If not specified, will use the latest
                version. (default: {None})
            dataset_description {Optional[str]} -- Description of new dataset
                version. Uses previous version's description if not specified.
                (default: {None})
            changes_description {Optional[str]} -- Description of changes new to
                this version, required. (default: {None})
            upload_files {Optional[List[Dict[str, str]]]} -- List of files to
                upload, where files are provided as dictionary objects d where
                - d["path"] is the path of the file to upload
                - d["name"] is what the file should be named in the dataset. Uses
                  the base name of the file if not provided
                - d["format"] is the Format of the file (as a string).
                And optionally,
                - d["encoding"] is the character encoding of the file. Uses
                  "UTF-8" if not provided
                (default: {None})
            add_taiga_ids {Optional[List[Dict[str, str]]]} -- List of virtual
                datafiles to add, where files are provided as dictionary objects
                with keys
                - "taiga_id" equal to the Taiga ID of the reference datafile in
                  dataset_permaname.dataset_version/datafile_name format
                - "name" (optional) for what the virtual datafile should be called
                  in the new dataset (will use the reference datafile name if not
                  provided).
                (default: {None})
            add_all_existing_files {bool} -- Whether to add all files from the
                base dataset version as virtual datafiles in the new dataset
                version. If a name collides with one in upload_files or
                add_taiga_ids, that file is ignored. (default: {False})
            upload_async {bool} -- Whether to upload asynchronously (parallel) or
                in serial

        Returns:
            Optional[str] -- The id of the new dataset version, or None if the
                operation was not successful.
        """
        self._set_token_and_initialized_api()

        try:
            (
                upload_s3_datafiles,
                upload_virtual_datafiles,
                dataset_version_metadata,
            ) = self._validate_update_dataset_arguments(
                dataset_id,
                dataset_permaname,
                dataset_version,
                changes_description,
                upload_files or [],
                add_taiga_ids or [],
                add_all_existing_files,
            )
        except (ValueError, Taiga404Exception) as e:
            print(cf.red(str(e)))
            return None

        try:
            upload_session_id = self._upload_files(
                upload_s3_datafiles, upload_virtual_datafiles, upload_async
            )
        except ValueError as e:
            print(cf.red(str(e)))
            return None

        dataset_description = (
            dataset_description
            if dataset_description is not None
            else dataset_version_metadata["datasetVersion"]["description"]
        )

        new_dataset_version_id = self.api.update_dataset(
            dataset_version_metadata["dataset"]["id"],
            upload_session_id,
            dataset_description,
            changes_description,
        )
        print(
            cf.green(
                "Dataset version created. Access it directly with this url: {}".format(
                    self.url + "/dataset_version/" + new_dataset_version_id
                )
            )
        )
        return new_dataset_version_id

    def get_canonical_id(self, queried_taiga_id: str) -> Optional[str]:
        """Get the canonical Taiga ID of a datafile specified by queried_taiga_id.

        A canonical ID is of the form
        dataset_permaname.dataset_version/datafile_name. If the datafile specified
        by queried_taiga_id is a virtual datafile, the canonical ID is that of the
        underlying datafile.

        Arguments:
            queried_taiga_id {str} -- Taiga ID in the form
                dataset_permaname.dataset_version/datafile_name or
                dataset_permaname.dataset_version

        Returns:
            Optional[str] -- The canonical ID, or None if no datafile was found.
        """
        self._set_token_and_initialized_api()

        full_taiga_id = self.cache.get_full_taiga_id(queried_taiga_id)
        if full_taiga_id is not None:
            return full_taiga_id

        try:
            if "." in queried_taiga_id:
                (
                    dataset_permaname,
                    dataset_version,
                    datafile_name,
                ) = untangle_dataset_id_with_version(queried_taiga_id)
                datafile_metadata = self.api.get_datafile_metadata(
                    None, dataset_permaname, dataset_version, datafile_name
                )
            else:
                datafile_metadata = self.api.get_datafile_metadata(
                    queried_taiga_id, None, None, None
                )
        except Taiga404Exception as e:
            print(cf.red(str(e)))
            return None

        dataset_version_metadata: DatasetVersionMetadataDict = (
            self.get_dataset_metadata(
                format_datafile_id_from_datafile_metadata(datafile_metadata)
            )
        )

        # Add canonical IDs for all other files in dataset, while we're at it
        for f in dataset_version_metadata["datasetVersion"]["datafiles"]:
            if "type" not in f.keys():
                # GCS files do not have type, and are not available to interact
                # with, so skip caching them.
                continue
            datafile_id = format_datafile_id(
                datafile_metadata.dataset_permaname,
                datafile_metadata.dataset_version,
                f["name"],
            )
            real_datafile_id = (
                datafile_id
                if "underlying_file_id" not in f
                else f["underlying_file_id"]
            )

            self.cache.add_full_id(
                datafile_id, real_datafile_id, DataFileFormat(f["type"])
            )
            if f["name"] == datafile_metadata.datafile_name:
                self.cache.add_full_id(
                    queried_taiga_id,
                    real_datafile_id,
                    datafile_metadata.datafile_format,
                )

        return self.cache.get_full_taiga_id(queried_taiga_id)

    def upload_to_gcs(self, queried_taiga_id: str, dest_gcs_path: str) -> bool:
        """Upload a Taiga datafile to a specified location in Google Cloud
        Storage.

        The service account [email protected] must have storage.buckets.create
        access for this request.

        Arguments:
            queried_taiga_id {str} -- Taiga ID in the form
                dataset_permaname.dataset_version/datafile_name or
                dataset_permaname.dataset_version
            dest_gcs_path {str} -- Google Storage path to upload to, in the form
                bucket:path

        Returns:
            bool -- Whether the file was successfully uploaded
        """
        self._set_token_and_initialized_api()

        full_taiga_id = self.get_canonical_id(queried_taiga_id)
        if full_taiga_id is None:
            return False

        try:
            self.api.upload_to_gcs(full_taiga_id, dest_gcs_path)
            return True
        except (ValueError, TaigaHttpException) as e:
            print(cf.red(str(e)))
            return False
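# --- Illustrative usage sketch, not part of the original module ----------------
# The docstrings above describe the user-facing entry points of TaigaClient. A
# rough sketch of how a caller might exercise them; every Taiga ID, permaname,
# file path, and format string below is a hypothetical placeholder, not a value
# taken from this codebase:
if __name__ == "__main__":
    tc = TaigaClient()

    # Fetch a Table/NumericMatrix datafile as a DataFrame (cached locally).
    df = tc.get(id="some-dataset.1/some-file")

    # Download any datafile (including Raw) and get its local path instead.
    path = tc.download_to_cache(name="some-dataset", version=1, file="some-file")

    # Resolve an alias or virtual ID to its canonical permaname.version/file form.
    canonical = tc.get_canonical_id("some-dataset.1")

    # Create a new version of an existing dataset from a local file plus a
    # virtual reference to a datafile in another dataset.
    new_version_id = tc.update_dataset(
        dataset_permaname="some-dataset",
        changes_description="Add updated table",
        upload_files=[
            {
                "path": "./table.csv",
                "name": "table",
                "format": "TableCSV",  # format string assumed; see upload_files docs above
            }
        ],
        add_taiga_ids=[{"taiga_id": "other-dataset.2/annotations"}],
    )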
def test_remove_all_from_cache(populated_cache: TaigaCache):
    populated_cache.remove_all_from_cache("columnar-dataset.1/")
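# --- Illustrative sketch, not part of the original test module -----------------
# remove_all_from_cache takes a "permaname.version/" prefix (see the deleted-
# version handling in TaigaClient._validate_file_for_download above). Assuming
# the columnar entry in the fixture lives under that prefix, a natural follow-up
# assertion would be:
def test_remove_all_from_cache_sketch(populated_cache: TaigaCache):
    populated_cache.remove_all_from_cache("columnar-dataset.1/")
    assert (
        populated_cache.get_entry(COLUMNAR_FULL_TAIGA_ID, COLUMNAR_FULL_TAIGA_ID)
        is None
    )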