def unzip_and_get_tclin_tchem(zip_file: str, output_dir: str) -> List[str]:
    """Unzip ``zip_file`` into ``output_dir`` and locate the tclin/tchem TSVs.

    Args:
        zip_file: Path to the zip archive to extract.
        output_dir: Directory to extract into and then search.

    Returns:
        ``[tclin_path, tchem_path]`` — paths to the two TSV files.

    Raises:
        RuntimeError: If zero or more than one match is found for either file.
    """
    unzip_to_tempdir(zip_file, output_dir)

    def _find_single_tsv(prefix: str) -> str:
        # Exactly one file matching '<prefix>_*.tsv' must exist in output_dir;
        # this replaces the previously duplicated tclin/tchem lookup code.
        matches = [f for f in os.listdir(output_dir)
                   if re.match(prefix + r'_.*\.tsv', f)]
        if len(matches) > 1:
            raise RuntimeError("Found more than one %s file:\n%s" %
                               (prefix, "\n".join(matches)))
        if not matches:
            raise RuntimeError("Couldn't find %s file in zipfile %s" %
                               (prefix, zip_file))
        return os.path.join(output_dir, matches[0])

    return [_find_single_tsv('tclin'), _find_single_tsv('tchem')]
def parse_annotations(self, node_handle: Any, edge_handle: Any,
                      data_file1: str, data_file2: str) -> None:
    """Parse annotations from CORD-19_1_5.zip.

    Args:
        node_handle: File handle for nodes.csv.
        edge_handle: File handle for edges.csv.
        data_file1: Path to first CORD-19_1_5.zip.
        data_file2: Path to second CORD-19_1_5.zip.

    Returns:
        None.
    """
    pbar = tqdm(total=2, desc="Unzipping files")
    # unzip to tmpdir, remove after use, to avoid cluttering raw/ with
    # processed data
    with tempfile.TemporaryDirectory(dir=self.input_base_dir) as tmpdir:
        unzip_to_tempdir(data_file1, tmpdir)
        pbar.update(1)
        unzip_to_tempdir(data_file2, tmpdir)
        pbar.update(1)
        pbar.close()

        subsets = ['pmc_json', 'pdf_json']
        for subset in subsets:
            subset_dir = os.path.join(tmpdir, subset)
            for filename in tqdm(os.listdir(subset_dir)):
                file = os.path.join(subset_dir, filename)
                # Use a context manager so each JSON file handle is closed;
                # the original json.load(open(file)) leaked the handle.
                with open(file) as json_fh:
                    doc = json.load(json_fh)
                self.parse_annotation_doc(node_handle, edge_handle, doc)
def run(self, data_file: Optional[str] = None):
    """Method to run transform to ingest data from IntAct for viral/human PPIs.

    Args:
        data_file: Optional path to the IntAct zip file; defaults to
            ``intact_coronavirus.zip`` under ``self.input_base_dir``.

    Returns:
        None. Writes ``nodes.tsv`` and ``edges.tsv`` to ``self.output_dir``.
    """
    if data_file:
        zip_file = data_file
    else:
        zip_file = os.path.join(self.input_base_dir, 'intact_coronavirus.zip')

    # for tsv output:
    output_node_file = os.path.join(self.output_dir, 'nodes.tsv')
    output_edge_file = os.path.join(self.output_dir, 'edges.tsv')

    # make directory in data/transformed
    os.makedirs(self.output_dir, exist_ok=True)

    with open(output_node_file, 'w') as node, \
            open(output_edge_file, 'w') as edge:
        # write node.tsv header
        node.write('\t'.join(self.node_header) + '\n')
        edge.write('\t'.join(self.edge_header) + '\n')

        # Extract into a temp dir that is removed when we're done
        # (the original used mkdtemp() and never cleaned it up).
        with tempfile.TemporaryDirectory() as xml_tempdir:
            unzip_to_tempdir(zip_file, xml_tempdir)

            extracted_base_dir_list = os.listdir(xml_tempdir)
            file_path = os.path.join(xml_tempdir, extracted_base_dir_list[0])
            for file in os.listdir(file_path):
                if not fnmatch.fnmatch(file, '*.xml'):
                    logging.warning("Skipping non-xml file %s" % file)
                    # BUG FIX: actually skip, as the warning says; the
                    # original fell through and parsed non-xml files too.
                    continue

                nodes_edges = self.parse_xml_to_nodes_edges(
                    os.path.join(file_path, file))

                # write out nodes
                for this_node in nodes_edges['nodes']:
                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=this_node)
                # write out edges
                for this_edge in nodes_edges['edges']:
                    write_node_edge_item(fh=edge,
                                         header=self.edge_header,
                                         data=this_edge)
def run(self, data_file: Optional[str] = None):
    """Ingest PharmGKB relationships and write gene/chemical nodes and edges.

    Unzips relationships.zip plus the gene and drug id-mapping archives,
    builds ``self.gene_id_map`` / ``self.drug_id_map``, then transforms each
    relationship line whose entity types match ``self.edge_of_interest``.

    Args:
        data_file: Unused; kept for interface compatibility.

    Raises:
        PharmGKBFileError: If an expected file is missing from an archive.
        PharmKGBInvalidNodeType: If a matched line has a node type other than
            Gene or Chemical.
    """
    rel_zip_file_name = os.path.join(self.input_base_dir, "relationships.zip")
    relationship_file_name = "relationships.tsv"
    gene_mapping_zip_file = os.path.join(self.input_base_dir, "pharmgkb_genes.zip")
    gene_mapping_file_name = "genes.tsv"
    drug_mapping_zip_file = os.path.join(self.input_base_dir, "pharmgkb_drugs.zip")
    drug_mapping_file_name = "drugs.tsv"

    #
    # file stuff
    #

    # Temp dirs are now context-managed so they are removed after use,
    # resolving the old TODO about unlinking them.

    # get relationship file (what we are ingesting here); it is read later,
    # so its temp dir wraps the whole transform
    with tempfile.TemporaryDirectory() as relationship_tempdir:
        relationship_file_path = os.path.join(relationship_tempdir,
                                              relationship_file_name)
        unzip_to_tempdir(rel_zip_file_name, relationship_tempdir)
        if not os.path.exists(relationship_file_path):
            raise PharmGKBFileError(
                "Can't find relationship file needed for ingest")

        # get mapping file for gene ids; dir can go away once the map is built
        with tempfile.TemporaryDirectory() as gene_id_tempdir:
            gene_mapping_file_path = os.path.join(gene_id_tempdir,
                                                  gene_mapping_file_name)
            unzip_to_tempdir(gene_mapping_zip_file, gene_id_tempdir)
            if not os.path.exists(gene_mapping_file_path):
                raise PharmGKBFileError(
                    "Can't find gene map file needed for ingest")
            self.gene_id_map = self.make_id_mapping_file(gene_mapping_file_path)

        # get mapping file for drug ids; dir can go away once the map is built
        with tempfile.TemporaryDirectory() as drug_id_tempdir:
            drug_mapping_file_path = os.path.join(drug_id_tempdir,
                                                  drug_mapping_file_name)
            unzip_to_tempdir(drug_mapping_zip_file, drug_id_tempdir)
            if not os.path.exists(drug_mapping_file_path):
                raise PharmGKBFileError(
                    "Can't find drug map file needed for ingest")
            self.drug_id_map = self.make_id_mapping_file(drug_mapping_file_path)

        #
        # read in and transform relationship.tsv
        #
        with open(relationship_file_path) as relationships, \
                open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge:

            # write headers (change default node/edge headers if necessary)
            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            rel_header = parse_header(relationships.readline())

            for line in relationships:
                line_data = self.parse_pharmgkb_line(line, rel_header)

                # only ingest lines whose entity types match the edge of
                # interest (e.g. Gene<->Chemical)
                if set(self.edge_of_interest) != \
                        set([line_data['Entity1_type'],
                             line_data['Entity2_type']]):
                    continue

                #
                # Make nodes for drug and chemical
                #
                for entity_id, entity_name, entity_type in [
                        (line_data['Entity1_id'],
                         line_data['Entity1_name'],
                         line_data['Entity1_type']),
                        (line_data['Entity2_id'],
                         line_data['Entity2_name'],
                         line_data['Entity2_type'])]:
                    if entity_type == 'Gene':
                        self.make_pharmgkb_gene_node(
                            fh=node,
                            this_id=entity_id,
                            name=entity_name,
                            biolink_type=self.gene_node_type)
                    elif entity_type == 'Chemical':
                        self.make_pharmgkb_chemical_node(
                            fh=node,
                            chem_id=entity_id,
                            name=entity_name,
                            biolink_type=self.drug_node_type)
                    else:
                        raise PharmKGBInvalidNodeType(
                            "Node type isn't gene or chemical!")

                #
                # Make edge
                #
                self.make_pharmgkb_edge(fh=edge, line_data=line_data)