def get_all_bound_file_paths(ws_namespace, ws_name):
    request = firecloud_api.list_entity_types(ws_namespace, ws_name)
    if request.status_code != 200:
        fail(request.text)

    entity_types_json = request.json()
    attribute_name_for_url_to_entity_json = defaultdict(list)
    referenced_file_paths_in_workspace = []

    for entity_type in entity_types_json:
        entity_count = entity_types_json[entity_type]["count"]
        page_size = 1000
        num_pages = int(math.ceil(float(entity_count) / page_size))

        for i in range(1, num_pages + 1):
            for entity_json in get_entity_by_page(ws_namespace, ws_name,
                                                  entity_type, i, page_size)["results"]:
                for attribute_name, attribute_value in entity_json["attributes"].items():
                    if re.match(r"gs://", str(attribute_value)):
                        referenced_file_paths_in_workspace.append(attribute_value)
                        attribute_name_for_url_to_entity_json[attribute_name].append(entity_json)

    return attribute_name_for_url_to_entity_json
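# The snippets above and below call a `get_entity_by_page` helper that is not
# shown here. A minimal sketch, assuming firecloud.api's paginated entityQuery
# call (get_entities_query); `fail` is the same error helper used above, and
# the returned JSON carries the "results" list the callers index into.
from firecloud import api as firecloud_api

def get_entity_by_page(namespace, workspace, entity_type, page, page_size):
    response = firecloud_api.get_entities_query(namespace, workspace, entity_type,
                                                page=page, page_size=page_size)
    if response.status_code != 200:
        fail(response.text)
    return response.json()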
def copy_workspace_entities_sushma(destination_workspace_namespace, destination_workspace_name,
                                   source_workspace_namespace, source_workspace_name,
                                   destination_workspace_bucket):
    """Copy workspace data tables to destination workspace."""

    source_etypes = fapi.list_entity_types(source_workspace_namespace, source_workspace_name)

    if source_etypes.status_code != 200:  # getting list of data tables fails
        message = f"Failed to retrieve list of data tables (entity types) from: {source_workspace_namespace}/{source_workspace_name}. API error: {source_etypes.text}."
        print(message)
        return False, message

    source_set_etypes = [s for s in list(source_etypes.json().keys()) if s.endswith("_set")]
    source_single_etypes = [s for s in list(source_etypes.json().keys()) if not s.endswith("_set")]

    # for each table that is not a set
    for etype in source_single_etypes:
        # get entity names for etype
        entities = fapi.get_entities(source_workspace_namespace, source_workspace_name, etype)
        if entities.status_code != 200:  # getting an etype's entities fails
            message = f"Failed to retrieve entities (row names) for {etype}. API error: {entities.text}"
            print(message)
            return False, message

        entity_names = [ent["name"] for ent in entities.json()]

        # copy single etype (with entities) to destination workspace
        copy_response = fapi.copy_entities(source_workspace_namespace, source_workspace_name,
                                           destination_workspace_namespace, destination_workspace_name,
                                           etype, entity_names, link_existing_entities=True)
        if copy_response.status_code not in [201, 409]:  # if copying table with entities fails
            message = f"Failed to copy {etype} with entities({entity_names}) to {destination_workspace_namespace}/{destination_workspace_name}. API error: {copy_response.text}."
            print(message)
            return False, message

    # for each set table (copied last so the single entities they reference already exist)
    for set_etype in source_set_etypes:
        # get entity names for set etype
        set_entities = fapi.get_entities(source_workspace_namespace, source_workspace_name, set_etype)
        if set_entities.status_code != 200:  # getting a set etype's entities fails
            message = f"Failed to retrieve entities (row names) for {set_etype}. API error: {set_entities.text}"
            print(message)
            return False, message

        set_entity_names = [ent["name"] for ent in set_entities.json()]

        # copy set etype (with entities) to destination workspace
        set_copy_response = fapi.copy_entities(source_workspace_namespace, source_workspace_name,
                                               destination_workspace_namespace, destination_workspace_name,
                                               set_etype, set_entity_names, link_existing_entities=True)
        if set_copy_response.status_code not in [201, 409]:  # if copying set table with entities fails
            message = f"Failed to copy {set_etype} with entities({set_entity_names}) to {destination_workspace_namespace}/{destination_workspace_name}. API error: {set_copy_response.text}."
            print(message)
            return False, message

    print(f"Successfully copied data tables to {destination_workspace_namespace}/{destination_workspace_name}: {list(source_etypes.json().keys())}")

    # get original workspace bucket id
    get_bucket_success, get_bucket_message = get_workspace_bucket(source_workspace_name, source_workspace_namespace)
    # TODO: handle if getting workspace bucket fails
    source_bucket = json.loads(get_bucket_message)["workspace"]["bucketName"]
    destination_bucket = destination_workspace_bucket.replace("gs://", "")

    # update bucket links in the destination workspace so that they match the path structure
    # the WDL generates when it migrates data:
    # gs://new_bucket_id/original_bucket_id/[original data structure]
    update_entities(destination_workspace_name, destination_workspace_namespace,
                    replace_this=source_bucket,
                    with_this=f"{destination_bucket}/{source_bucket}")

    print(f"Successfully updated data tables with new bucket paths in {destination_workspace_namespace}/{destination_workspace_name}.")

    return True, list(source_etypes.json().keys())
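# `get_workspace_bucket` and `update_entities` are referenced above but not
# shown. Minimal sketches, assuming the standard firecloud.api calls; the
# (success, message) return shape mirrors how the caller unpacks it, and the
# attribute-rewrite logic is an assumption, not the original implementation.
# `get_single_entity_types` is the helper defined further below.
from firecloud import api as fapi

def get_workspace_bucket(workspace, project):
    response = fapi.get_workspace(project, workspace)
    if response.status_code != 200:
        return False, response.text
    # workspace JSON; callers read json.loads(message)["workspace"]["bucketName"]
    return True, response.text

def update_entities(workspace, project, replace_this, with_this):
    # rewrite every string attribute of every non-set entity that contains replace_this
    for etype in get_single_entity_types(workspace, project):
        for entity in fapi.get_entities(project, workspace, etype).json():
            updates = []
            for attr, value in entity["attributes"].items():
                if isinstance(value, str) and replace_this in value:
                    updates.append({"op": "AddUpdateAttribute",
                                    "attributeName": attr,
                                    "addUpdateAttribute": value.replace(replace_this, with_this)})
            if updates:
                fapi.update_entity(project, workspace, etype, entity["name"], updates)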
def callFirecloud():
    try:
        response = fapi.list_entity_types(BILLING_PROJECT_ID, WORKSPACE)
        if response.status_code != 200:
            print("Error in Firecloud, check your billing project ID and the name of your workspace.")
            raise RuntimeError(response.text)
        else:
            print("Firecloud has found your workspace!")
            directory = BUCKET + SUBDIRECTORY
            return directory
    except NameError:
        print("Caught a NameError exception. This may mean the kernel was restarted or you didn't run "
              "the cells above. Try running the cells above again.")
        raise
def get_single_entity_types(workspace, project):
    """Get a list of all non-set entity types in given workspace."""

    # API call to get all entity types in workspace (both set and non-set types)
    res_etypes = fapi.list_entity_types(project, workspace)
    dict_all_etypes = json.loads(res_etypes.text)

    # get non-set entities and add to list
    # "set" entities do not need to be updated because they only reference the unique ID of each single entity
    # the unique ID of any single entity is not modified, so sets should remain the same
    single_etypes_list = [key for key in dict_all_etypes.keys() if not key.endswith("_set")]

    print("List of entity types that will be updated, if applicable:")
    print('\n'.join(['\t' * 7 + c for c in single_etypes_list]))

    return single_etypes_list
def test_list_entity_types(self):
    """Test list_entity_types()."""
    r = fapi.list_entity_types(self.project, self.workspace)
    print(r.status_code, r.content)
    self.assertEqual(r.status_code, 200)
def download_tsv_from_workspace(project, workspace, entity_type, tsv_name,
                                page_size=DEFAULT_PAGE_SIZE, attr_list=None):
    """Download large TSV file from Terra workspace by designated number of rows."""

    # get all entity types in workspace using API call
    # API = https://api.firecloud.org/#!/Entities/getEntityTypes
    response = fapi.list_entity_types(project, workspace)
    if response.status_code != 200:
        print(response.text)
        exit(1)

    # get/report # of entities + associated attributes (column names) of input entity type
    entity_types_json = response.json()
    entity_count = entity_types_json[entity_type]["count"]
    entity_id = entity_types_json[entity_type]["idName"]

    # if user provided a list of specific attributes, return only those; else return all attributes
    if attr_list:
        all_attribute_names = entity_types_json[entity_type]["attributeNames"]
        attribute_names = [attr for attr in all_attribute_names if attr in attr_list]
    else:
        attribute_names = entity_types_json[entity_type]["attributeNames"]

    # add the entity_id value to list of attributes (not a default attribute of API response)
    attribute_names.insert(0, entity_id)

    print(f'{entity_count} {entity_type}(s) to export.')

    with open(tsv_name, "w") as tsvout:
        # add header with attribute values to tsv
        tsvout.write("\t".join(attribute_names) + "\n")

        # set starting row value and calculate number of pages
        row_num = 0
        num_pages = int(math.ceil(float(entity_count) / page_size))

        # get entities by page, where each page has page_size # of rows, using API call
        print(f'Getting all {num_pages} pages of entity data.')
        all_page_responses = []
        for page in tqdm(range(1, num_pages + 1)):
            all_page_responses.append(get_entity_by_page(project, workspace, entity_type, page, page_size))

        # each page response contains a "results" list plus query-parameter metadata
        print(f'Writing {entity_count} attributes to tsv file.')
        for page_response in tqdm(all_page_responses):
            # for each entity in "results", get its attributes and entity_id (name)
            for entity_json in page_response["results"]:
                attributes = entity_json["attributes"]
                name = entity_json["name"]
                # add name and value to dictionary of attributes
                attributes[entity_id] = name

                values = []
                # for each attribute (column name) in list of attribute names (all columns for entity)
                for attribute_name in attribute_names:
                    value = ""
                    # if entity's attribute (column) is present in the response, use its value
                    if attribute_name in attributes:
                        value = attributes[attribute_name]
                    values.append(str(value))

                tsvout.write("\t".join(values) + "\n")
                row_num += 1

    print(f'Finished exporting {entity_type}(s) to tsv with name {tsv_name}.')
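# Example invocation of the downloader above. The billing project, workspace,
# table name, and attribute list are placeholders, and DEFAULT_PAGE_SIZE is
# assumed to be a module-level constant (e.g. 1000).
if __name__ == "__main__":
    download_tsv_from_workspace("my-billing-project", "my-workspace",
                                entity_type="sample", tsv_name="sample.tsv",
                                attr_list=["fastq_1", "fastq_2"])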
# Make sure to include slashes in your SUBDIRECTORY variable.

# ## Environment variables

# In[ ]:

BILLING_PROJECT_ID = os.environ['GOOGLE_PROJECT']
WORKSPACE = os.path.basename(os.path.dirname(os.getcwd()))
BUCKET = os.environ["WORKSPACE_BUCKET"]

# ## Call FireCloud

# In[ ]:

try:
    response = fapi.list_entity_types(BILLING_PROJECT_ID, WORKSPACE)
    if response.status_code != 200:
        print("Error in Firecloud, check your billing project ID and the name of your workspace.")
        raise RuntimeError(response.text)
    else:
        print("Firecloud has found your workspace!")
        directory = BUCKET + SUBDIRECTORY
except NameError:
    print("Caught a NameError exception. This may mean the kernel was restarted or you didn't run "
          "the cells above. Try running the cells above again.")
    raise

# ## Display the contents of your workspace bucket
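# In[ ]:

# A sketch of the cell this heading introduces (an assumption, not part of the
# original export): list the assembled bucket path with gsutil, the way a
# notebook's `!gsutil ls` line is rendered by nbconvert.
get_ipython().system(f'gsutil ls {directory}')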
def get_schema(namespace, workspace):
    """Fetch all entity types."""
    return FAPI.list_entity_types(namespace=namespace, workspace=workspace).json()
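# Example use of get_schema; the workspace coordinates are placeholders. As the
# snippets above rely on, the returned JSON maps each entity type to its
# "count", "idName", and "attributeNames".
schema = get_schema("my-billing-project", "my-workspace")
for etype, info in schema.items():
    print(f"{etype}: {info['count']} rows, id column '{info['idName']}'")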