class SpreadsheetImport(TestCase):
    """Integration test: import a metadata spreadsheet into a new Ingest submission.

    Downloads the test spreadsheet, imports it with ``XlsImporter`` into a
    freshly created submission, then verifies that the entity counts reported
    by the Ingest API match the entities parsed from the spreadsheet, per
    entity type.
    """

    def setUp(self):
        self.test_data_path = os.path.dirname(os.path.realpath(__file__))
        # Pre-set so tearDown is safe even if the test fails before the
        # spreadsheet path is assigned.
        self.metadata_spreadsheet_path = None
        self.configure_ingest_client()

    def configure_ingest_client(self):
        """Build an authenticated IngestApi client from GCP service credentials."""
        # GOOGLE_APPLICATION_CREDENTIALS points at a service-account JSON file
        # used to mint a service-to-service JWT for the Ingest API.
        gcp_credentials_file = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
        self.s2s_token_client = S2STokenClient(
            ServiceCredential.from_file(gcp_credentials_file),
            INGEST_API_JWT_AUDIENCE)
        self.token_manager = TokenManager(self.s2s_token_client)
        self.ingest_api = IngestApi(url=INGEST_API,
                                    token_manager=self.token_manager)

    def test_spreadsheet_import(self):
        self.metadata_spreadsheet_path = os.path.join(self.test_data_path,
                                                      SPREADSHEET_FILE)
        download_file(SPREADSHEET_LOCATION, self.metadata_spreadsheet_path)

        importer = XlsImporter(self.ingest_api)
        submission_resource = self.ingest_api.create_submission()
        # The HAL self link may be templated (".../{?projection}"); strip the
        # template suffix to get a plain URL.
        submission_url = submission_resource["_links"]["self"]["href"].rsplit(
            "{")[0]
        submission, _ = importer.import_file(self.metadata_spreadsheet_path,
                                             submission_url, False)

        # Group the locally parsed entities by type for count comparison.
        entities_by_type = {}
        for entity in submission.get_entities():
            entities_by_type.setdefault(entity.type, []).append(entity)

        files = list(self.ingest_api.get_entities(submission_url, 'files'))
        biomaterials = list(
            self.ingest_api.get_entities(submission_url, 'biomaterials'))
        protocols = list(
            self.ingest_api.get_entities(submission_url, 'protocols'))
        processes = list(
            self.ingest_api.get_entities(submission_url, 'processes'))

        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual.
        self.assertEqual(len(files), len(entities_by_type['file']))
        self.assertEqual(len(biomaterials),
                         len(entities_by_type['biomaterial']))
        self.assertEqual(len(protocols), len(entities_by_type['protocol']))
        self.assertEqual(len(processes), len(entities_by_type['process']))

    def tearDown(self) -> None:
        # Remove the downloaded spreadsheet, if the test got far enough to
        # create it.
        if self.metadata_spreadsheet_path:
            delete_file(self.metadata_spreadsheet_path)
class IngestHydrator(Hydrator):
    """DCP Ingest Service submission hydrator.

    Enables importing of HCA Ingest Service submissions by specifying a
    submission UUID: resolves the submission's project, flattens every
    metadata entity of each of the project's submissions into a graph node,
    and derives relationship edges from the entities' HAL links.
    """

    def __init__(self, graph, submission_uuid):
        super().__init__(graph)
        # Fix: log message previously read "for for submission".
        self._logger.info(
            f"Started ingest hydrator for submission [{submission_uuid}]")
        self._ingest_api = IngestApi(Config['INGEST_API'])
        project_url = self._ingest_api.get_submission_by_uuid(
            submission_uuid)['_links']['relatedProjects']['href']
        # Assumes the submission relates to exactly one project; take the first.
        project = self._ingest_api.get(
            project_url).json()['_embedded']['projects'][0]
        self._logger.info(
            f"Found project for submission {project['uuid']['uuid']}")
        # Entities keyed by uuid so edges can resolve their endpoints.
        self._entities = {}
        for submission in self.fetch_submissions_in_project(project):
            self._logger.info(
                f"Found submission for project with uuid {submission['uuid']['uuid']}"
            )
            for entity in self.build_entities_from_submission(submission):
                self._entities[entity['uuid']] = entity
        self._nodes = self.get_nodes()
        self._edges = self.get_edges()

    def fetch_submissions_in_project(self, project: dict) -> list:
        """Return all submission envelopes belonging to *project*."""
        self._logger.debug(
            f"Fetching submissions for project {project['uuid']['uuid']}")
        return self._ingest_api.get(
            project['_links']['submissionEnvelopes']
            ['href']).json()['_embedded']['submissionEnvelopes']

    def build_entities_from_submission(self, submission: dict):
        """Yield a flattened entity dict for every metadata entity in *submission*.

        Each yielded dict carries the flattened content properties, labels
        (entity type plus the concrete schema type), a human-readable node id,
        the entity's HAL links, and its uuid.
        """
        # Maps each Ingest collection name to the flattened content field that
        # serves as the node's display id. Iterating this map replaces the
        # previously duplicated hard-coded list of the same five types.
        id_field_map = {
            'biomaterials': "biomaterial_core.biomaterial_id",
            'files': "file_core.file_name",
            'processes': "process_core.process_id",
            'projects': "project_core.project_short_name",
            'protocols': "protocol_core.protocol_id",
        }
        for entity_type in id_field_map:
            for entity in self._ingest_api.get_entities(
                    submission['_links']['self']['href'], entity_type):
                properties = flatten(entity['content'])
                new_entity = {
                    'properties': properties,
                    'labels': [entity['type'].lower()],
                    'node_id': properties[id_field_map[entity_type]],
                    'links': entity['_links'],
                    'uuid': entity['uuid']['uuid'],
                }
                # The concrete schema type is the last path segment of the
                # entity's describedBy URL.
                concrete_type = new_entity['properties']['describedBy'].rsplit(
                    '/', 1)[1]
                new_entity['labels'].append(concrete_type)
                yield new_entity

    @benchmark
    def get_nodes(self):
        """Build a graph Node per hydrated entity, keyed by entity uuid."""
        self._logger.debug("importing nodes")
        nodes = {}
        for entity_uuid, entity in self._entities.items():
            node_id = entity['node_id']
            nodes[entity_uuid] = Node(
                *entity['labels'],
                **entity['properties'],
                uuid=entity['uuid'],
                self_link=entity['links']['self']['href'],
                id=node_id)
            self._logger.debug(f"({node_id})")
        self._logger.info(f"imported {len(nodes)} nodes")
        return nodes

    @benchmark
    def get_edges(self):
        """Derive relationship edges from each entity's HAL links."""
        self._logger.debug("importing edges")
        edges = []
        # Maps a HAL link relation to the Ingest collection it resolves to.
        relationship_map = {
            'projects': "projects",
            'protocols': "protocols",
            'inputToProcesses': "processes",
            'derivedByProcesses': "processes",
            'inputBiomaterials': "biomaterials",
            'derivedBiomaterials': "biomaterials",
            'supplementaryFiles': "files",
            'inputFiles': "files",
            'derivedFiles': "files",
        }
        for entity_uuid, entity in self._entities.items():
            for relationship_type, collection in relationship_map.items():
                if relationship_type not in entity['links']:
                    continue
                relationships = self._ingest_api.get_all(
                    entity['links'][relationship_type]['href'], collection)
                # Loop-invariant for all end entities of this link relation.
                start_node = self._nodes[entity_uuid]
                relationship_name = convert_to_macrocase(relationship_type)
                for end_entity in relationships:
                    # Narrowed try: only the node lookup can raise the
                    # KeyError we handle (end entity not hydrated, e.g. it
                    # belongs to another submission).
                    try:
                        end_node = self._nodes[end_entity['uuid']['uuid']]
                    except KeyError:
                        self._logger.debug(
                            f"Missing end node at a [{start_node['id']}] entity."
                        )
                        continue
                    edges.append(
                        Relationship(start_node, relationship_name, end_node))
                    # Adding additional relationships to the graphs.
                    if relationship_name == 'INPUT_TO_PROCESSES':
                        edges.append(
                            Relationship(start_node,
                                         'DUMMY_EXPERIMENTAL_DESIGN',
                                         end_node))
                    if relationship_name == 'DERIVED_BY_PROCESSES':
                        edges.append(
                            Relationship(end_node,
                                         'DUMMY_EXPERIMENTAL_DESIGN',
                                         start_node))
                    self._logger.debug(
                        f"({start_node['id']})-[:{relationship_name}]->({end_node['id']})"
                    )
        self._logger.info(f"imported {len(edges)} edges")
        return edges