示例#1
0
def unzip_and_get_tclin_tchem(zip_file: str, output_dir: str) -> List[str]:
    """Extract a zip archive and locate its tclin and tchem TSV files.

    Args:
        zip_file: Path of the zip archive to extract.
        output_dir: Directory the archive is extracted into and then searched.

    Returns:
        A two-item list: the path of the tclin file followed by the path of
        the tchem file, both joined onto output_dir.

    Raises:
        RuntimeError: If output_dir holds no file, or more than one file,
            matching the tclin pattern or the tchem pattern.

    """
    unzip_to_tempdir(zip_file, output_dir)

    # --- tclin: exactly one matching file is expected ---
    tclin_hits = []
    for entry in os.listdir(output_dir):
        if re.match(r'tclin_.*\.tsv', entry):
            tclin_hits.append(entry)
    if not tclin_hits:
        raise RuntimeError("Couldn't find tclin file in zipfile %s" % zip_file)
    if len(tclin_hits) > 1:
        listing = "\n".join(tclin_hits)
        raise RuntimeError("Found more than one tclin file:\n%s" % listing)
    tclin_file: str = os.path.join(output_dir, tclin_hits[0])

    # --- tchem: exactly one matching file is expected ---
    tchem_hits = []
    for entry in os.listdir(output_dir):
        if re.match(r'tchem_.*\.tsv', entry):
            tchem_hits.append(entry)
    if not tchem_hits:
        raise RuntimeError("Couldn't find tchem file in zipfile %s" % zip_file)
    if len(tchem_hits) > 1:
        listing = "\n".join(tchem_hits)
        raise RuntimeError("Found more than one tchem file:\n%s" % listing)
    tchem_file: str = os.path.join(output_dir, tchem_hits[0])

    return [tclin_file, tchem_file]
示例#2
0
    def parse_annotations(self, node_handle: Any, edge_handle: Any,
                          data_file1: str, data_file2: str) -> None:
        """Parse annotations from CORD-19_1_5.zip.

        Both archives are extracted into a single temporary directory created
        under self.input_base_dir (removed again when this method finishes).
        Every file found in the 'pmc_json' and 'pdf_json' subdirectories is
        then loaded as JSON and passed to self.parse_annotation_doc.

        Args:
            node_handle: File handle for nodes.csv.
            edge_handle: File handle for edges.csv.
            data_file1: Path to first CORD-19_1_5.zip.
            data_file2: Path to second CORD-19_1_5.zip.

        Returns:
             None.

        """
        # unzip to tmpdir, remove after use, to avoid cluttering raw/ with processed
        # data
        with tempfile.TemporaryDirectory(dir=self.input_base_dir) as tmpdir:
            # context manager closes the progress bar even if extraction raises
            with tqdm(total=2, desc="Unzipping files") as pbar:
                for zip_path in (data_file1, data_file2):
                    unzip_to_tempdir(zip_path, tmpdir)
                    pbar.update(1)

            for subset in ('pmc_json', 'pdf_json'):
                subset_dir = os.path.join(tmpdir, subset)
                for filename in tqdm(os.listdir(subset_dir)):
                    doc_path = os.path.join(subset_dir, filename)
                    # close each document right after loading; the original
                    # json.load(open(...)) left one open handle per file
                    with open(doc_path) as doc_fh:
                        doc = json.load(doc_fh)
                    self.parse_annotation_doc(node_handle, edge_handle, doc)
示例#3
0
    def run(self, data_file: Optional[str] = None):
        """Method to run transform to ingest data from IntAct for viral/human PPIs

        The zip archive is extracted into a temporary directory (removed when
        the method finishes). Each '*.xml' file inside the first entry of the
        extracted directory is parsed with self.parse_xml_to_nodes_edges, and
        the resulting nodes and edges are written to nodes.tsv and edges.tsv
        in self.output_dir.

        Args:
            data_file: Path to the IntAct zip archive. When empty or None,
                'intact_coronavirus.zip' inside self.input_base_dir is used.

        Returns:
            None.

        """
        zip_file = data_file or os.path.join(self.input_base_dir,
                                             'intact_coronavirus.zip')

        # make directory in data/transformed
        os.makedirs(self.output_dir, exist_ok=True)

        # for tsv output:
        output_node_file = os.path.join(self.output_dir, 'nodes.tsv')
        output_edge_file = os.path.join(self.output_dir, 'edges.tsv')

        # TemporaryDirectory removes the extracted XML once we are done;
        # a bare mkdtemp() left the directory behind on every run
        with open(output_node_file, 'w') as node, \
                open(output_edge_file, 'w') as edge, \
                tempfile.TemporaryDirectory() as xml_tempdir:

            # write node.tsv and edge.tsv headers
            node.write('\t'.join(self.node_header) + '\n')
            edge.write('\t'.join(self.edge_header) + '\n')

            unzip_to_tempdir(zip_file, xml_tempdir)

            # XML files are read from the first entry listed in the
            # extraction directory
            extracted_base_dir_list = os.listdir(xml_tempdir)
            file_path = os.path.join(xml_tempdir, extracted_base_dir_list[0])
            for xml_name in os.listdir(file_path):
                if not fnmatch.fnmatch(xml_name, '*.xml'):
                    logging.warning("Skipping non-xml file %s" % xml_name)
                    # actually skip it: without this the file was still
                    # handed to the XML parser despite the warning
                    continue

                nodes_edges = self.parse_xml_to_nodes_edges(
                    os.path.join(file_path, xml_name))

                # write out nodes
                for this_node in nodes_edges['nodes']:
                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=this_node)
                # write out edges
                for this_edge in nodes_edges['edges']:
                    write_node_edge_item(fh=edge,
                                         header=self.edge_header,
                                         data=this_edge)
    def run(self, data_file: Optional[str] = None):
        """Ingest PharmGKB relationships and write node and edge TSV files.

        relationships.zip, pharmgkb_genes.zip and pharmgkb_drugs.zip are read
        from self.input_base_dir and extracted into temporary directories,
        which are removed when the method finishes. self.gene_id_map and
        self.drug_id_map are built from genes.tsv and drugs.tsv. Then
        relationships.tsv is read line by line: for every line whose pair of
        entity types equals self.edge_of_interest, a node is written for each
        entity and an edge is written for the line.

        Args:
            data_file: Not used by this method; all input paths are derived
                from self.input_base_dir.

        Raises:
            PharmGKBFileError: If relationships.tsv, genes.tsv or drugs.tsv
                is missing after extraction.
            PharmKGBInvalidNodeType: If an entity of a selected line is
                neither 'Gene' nor 'Chemical'.

        """
        rel_zip_file_name = os.path.join(self.input_base_dir,
                                         "relationships.zip")
        relationship_file_name = "relationships.tsv"
        gene_mapping_zip_file = os.path.join(self.input_base_dir,
                                             "pharmgkb_genes.zip")
        gene_mapping_file_name = "genes.tsv"
        drug_mapping_zip_file = os.path.join(self.input_base_dir,
                                             "pharmgkb_drugs.zip")
        drug_mapping_file_name = "drugs.tsv"

        #
        # file stuff
        #
        # The three scratch directories are context-managed so they are
        # deleted on exit (the original mkdtemp() calls were never unlinked).
        # NOTE(review): this assumes make_id_mapping_file reads its file
        # completely before returning - confirm against its implementation.
        with tempfile.TemporaryDirectory() as relationship_tempdir, \
                tempfile.TemporaryDirectory() as gene_id_tempdir, \
                tempfile.TemporaryDirectory() as drug_id_tempdir:

            # get relationship file (what we are ingesting here)
            relationship_file_path = os.path.join(relationship_tempdir,
                                                  relationship_file_name)
            unzip_to_tempdir(rel_zip_file_name, relationship_tempdir)
            if not os.path.exists(relationship_file_path):
                raise PharmGKBFileError(
                    "Can't find relationship file needed for ingest")

            # get mapping file for gene ids
            gene_mapping_file_path = os.path.join(gene_id_tempdir,
                                                  gene_mapping_file_name)
            unzip_to_tempdir(gene_mapping_zip_file, gene_id_tempdir)
            if not os.path.exists(gene_mapping_file_path):
                raise PharmGKBFileError(
                    "Can't find gene map file needed for ingest")
            self.gene_id_map = self.make_id_mapping_file(
                gene_mapping_file_path)

            # get mapping file for drug ids
            drug_mapping_file_path = os.path.join(drug_id_tempdir,
                                                  drug_mapping_file_name)
            unzip_to_tempdir(drug_mapping_zip_file, drug_id_tempdir)
            if not os.path.exists(drug_mapping_file_path):
                raise PharmGKBFileError(
                    "Can't find drug map file needed for ingest")
            self.drug_id_map = self.make_id_mapping_file(
                drug_mapping_file_path)

            # loop-invariant: the unordered pair of entity types to keep
            wanted_types = set(self.edge_of_interest)

            #
            # read in and transform relationship.tsv
            #
            with open(relationship_file_path) as relationships, \
                    open(self.output_node_file, 'w') as node, \
                    open(self.output_edge_file, 'w') as edge:
                # write headers (change default node/edge headers if necessary)
                node.write("\t".join(self.node_header) + "\n")
                edge.write("\t".join(self.edge_header) + "\n")

                rel_header = parse_header(relationships.readline())
                for line in relationships:
                    line_data = self.parse_pharmgkb_line(line, rel_header)

                    line_types = {line_data['Entity1_type'],
                                  line_data['Entity2_type']}
                    if line_types != wanted_types:
                        continue

                    #
                    # Make nodes for drug and chemical
                    #
                    entities = (
                        (line_data['Entity1_id'], line_data['Entity1_name'],
                         line_data['Entity1_type']),
                        (line_data['Entity2_id'], line_data['Entity2_name'],
                         line_data['Entity2_type']),
                    )
                    for entity_id, entity_name, entity_type in entities:
                        if entity_type == 'Gene':
                            self.make_pharmgkb_gene_node(
                                fh=node,
                                this_id=entity_id,
                                name=entity_name,
                                biolink_type=self.gene_node_type)
                        elif entity_type == 'Chemical':
                            self.make_pharmgkb_chemical_node(
                                fh=node,
                                chem_id=entity_id,
                                name=entity_name,
                                biolink_type=self.drug_node_type)
                        else:
                            raise PharmKGBInvalidNodeType(
                                "Node type isn't gene or chemical!")

                    #
                    # Make edge
                    #
                    self.make_pharmgkb_edge(fh=edge, line_data=line_data)