Пример #1
0
    def process_gene_interaction(self, limit):
        """
        The gene interaction file includes identified interactions,
        that are between two or more gene (products).
        In the case of interactions with >2 genes, this requires creating
        groups of genes that are involved in the interaction.
        From the wormbase help list: In the example WBInteraction000007779
        it would likely be misleading to suggest that lin-12 interacts with
        (suppresses in this case) smo-1 ALONE or that lin-12 suppresses let-60
        ALONE; the observation in the paper; see Table V in paper PMID:15990876
        was that a lin-12 allele (heterozygous lin-12(n941/+)) could suppress
        the "multivulva" phenotype induced synthetically by simultaneous
        perturbation of BOTH smo-1 (by RNAi) AND let-60 (by the n2021 allele).
        So this is necessarily a three-gene interaction.

        Therefore, we can create groups of genes based on their "status" of
        Effector | Effected.

        Status:  IN PROGRESS

        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['gene_interaction']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing gene interaction associations")
        line_counter = 0

        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar="'")

            for row in filereader:
                line_counter += 1
                if re.match(r'#', ''.join(row)):
                    continue

                (interaction_num, interaction_type, interaction_subtype,
                 summary, citation) = row[0:5]
                print(row)
                interaction_id = 'WormBase:'+interaction_num

                # TODO deal with subtypes
                interaction_type_id = None
                if interaction_type == 'Genetic':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'genetically_interacts_with']
                elif interaction_type == 'Physical':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'molecularly_interacts_with']
                elif interaction_type == 'Regulatory':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'regulates']
                else:
                    logger.info(
                        "An interaction type I don't understand %s",
                        interaction_type)

                num_interactors = (len(row) - 5) / 3
                if num_interactors != 2:
                    logger.info(
                        "Skipping interactions with !=2 participants:\n %s",
                        str(row))
                    continue

                gene_a_id = 'WormBase:'+row[5]
                gene_b_id = 'WormBase:'+row[8]

                if self.testMode \
                        and gene_a_id not in self.test_ids['gene'] \
                        and gene_b_id not in self.test_ids['gene']:
                    continue

                assoc = InteractionAssoc(
                    self.name, gene_a_id, gene_b_id, interaction_type_id)
                assoc.set_association_id(interaction_id)
                assoc.add_association_to_graph(g)
                assoc_id = assoc.get_association_id()
                # citation is not a pmid or WBref - get this some other way
                gu.addDescription(g, assoc_id, summary)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Пример #2
0
    def _get_interactions(self, limit):
        logger.info("getting interactions")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['interactions']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        matchcounter = 0

        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip comment lines
                if re.match(r'^#', line.decode()):
                    logger.debug("Skipping header line")
                    continue
                line_counter += 1
                line = line.decode().strip()
                # print(line)
                (interactor_a, interactor_b, alt_ids_a, alt_ids_b, aliases_a,
                 aliases_b, detection_method, pub_author, pub_id, taxid_a,
                 taxid_b, interaction_type, source_db, interaction_id,
                 confidence_val) = line.split('\t')

                # get the actual gene ids,
                # typically formated like: gene/locuslink:351|BIOGRID:106848
                gene_a_num = re.search(
                    r'locuslink\:(\d+)\|?', interactor_a).groups()[0]
                gene_b_num = re.search(
                    r'locuslink\:(\d+)\|?', interactor_b).groups()[0]

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if (int(gene_a_num) not in self.test_ids) or\
                            (int(gene_b_num) not in self.test_ids):
                        continue
                else:
                    g = self.graph
                    # when not in test mode, filter by taxon
                    if int(re.sub(r'taxid:', '', taxid_a.rstrip())) not in\
                            self.tax_ids or\
                            int(re.sub(
                                r'taxid:', '', taxid_b.rstrip())) not in\
                            self.tax_ids:
                        continue
                    else:
                        matchcounter += 1

                gene_a = 'NCBIGene:'+gene_a_num
                gene_b = 'NCBIGene:'+gene_b_num

                # get the interaction type
                # psi-mi:"MI:0407"(direct interaction)
                int_type = re.search(r'MI:\d+', interaction_type).group()
                rel = self._map_MI_to_RO(int_type)

                # scrub pubmed-->PMID prefix
                pub_id = re.sub(r'pubmed', 'PMID', pub_id)
                # remove bogus whitespace
                pub_id = pub_id.strip()

                # get the method, and convert to evidence code
                det_code = re.search(r'MI:\d+', detection_method).group()
                evidence = self._map_MI_to_ECO(det_code)

                # note that the interaction_id is some kind of internal biogrid
                # identifier that does not map to a public URI.
                # we will construct a monarch identifier from this

                assoc = InteractionAssoc(self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence)
                assoc.add_source(pub_id)
                assoc.add_association_to_graph(g)
                assoc.load_all_properties(g)

                if not self.testMode and (
                        limit is not None and line_counter > limit):
                    break

        myzip.close()

        return
Пример #3
0
    def process_gene_interaction(self, limit):
        """
        The gene interaction file includes identified interactions,
        that are between two or more gene (products).
        In the case of interactions with >2 genes, this requires creating
        groups of genes that are involved in the interaction.
        From the wormbase help list: In the example WBInteraction000007779
        it would likely be misleading to suggest that lin-12 interacts with
        (suppresses in this case) smo-1 ALONE or that lin-12 suppresses let-60
        ALONE; the observation in the paper; see Table V in paper PMID:15990876
        was that a lin-12 allele (heterozygous lin-12(n941/+)) could suppress
        the "multivulva" phenotype induced synthetically by simultaneous
        perturbation of BOTH smo-1 (by RNAi) AND let-60 (by the n2021 allele).
        So this is necessarily a three-gene interaction.

        Therefore, we can create groups of genes based on their "status" of
        Effector | Effected.

        Status:  IN PROGRESS

        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['gene_interaction']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        logger.info("Processing gene interaction associations")
        line_counter = 0

        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar="'")

            for row in filereader:
                line_counter += 1
                if re.match(r'#', ''.join(row)):
                    continue

                (interaction_num, interaction_type, interaction_subtype,
                 summary, citation) = row[0:5]
                # print(row)
                interaction_id = 'WormBase:' + interaction_num

                # TODO deal with subtypes
                interaction_type_id = None
                if interaction_type == 'Genetic':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'genetically_interacts_with']
                elif interaction_type == 'Physical':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'molecularly_interacts_with']
                elif interaction_type == 'Regulatory':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'regulates']
                else:
                    logger.info("An interaction type I don't understand %s",
                                interaction_type)

                num_interactors = (len(row) - 5) / 3
                if num_interactors != 2:
                    logger.info(
                        "Skipping interactions with !=2 participants:\n %s",
                        str(row))
                    continue

                gene_a_id = 'WormBase:' + row[5]
                gene_b_id = 'WormBase:' + row[8]

                if self.testMode \
                        and gene_a_id not in self.test_ids['gene'] \
                        and gene_b_id not in self.test_ids['gene']:
                    continue

                assoc = InteractionAssoc(g, self.name, gene_a_id, gene_b_id,
                                         interaction_type_id)
                assoc.set_association_id(interaction_id)
                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()
                # citation is not a pmid or WBref - get this some other way
                model.addDescription(assoc_id, summary)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Пример #4
0
    def _get_interactions(self, limit):
        LOG.info("getting interactions")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['interactions']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        matchcounter = 0

        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip comment lines
                if re.match(r'^#', line.decode()):
                    LOG.debug("Skipping header line")
                    continue
                line_counter += 1
                line = line.decode().strip()
                # print(line)
                (interactor_a, interactor_b, alt_ids_a, alt_ids_b, aliases_a,
                 aliases_b, detection_method, pub_author, pub_id, taxid_a,
                 taxid_b, interaction_type, source_db, interaction_id,
                 confidence_val) = line.split('\t')
                taxid_a = taxid_a.rstrip()
                taxid_b = taxid_b.rstrip()

                # get the actual gene ids,
                # typically formated like: gene/locuslink:351|BIOGRID:106848
                gene_a_num = re.search(r'locuslink\:(\d+)\|?',
                                       interactor_a).groups()[0]
                gene_b_num = re.search(r'locuslink\:(\d+)\|?',
                                       interactor_b).groups()[0]

                if self.test_mode:
                    graph = self.testgraph
                    # skip any genes that don't match our test set
                    if (int(gene_a_num) not in self.test_ids) or\
                            (int(gene_b_num) not in self.test_ids):
                        continue
                else:
                    graph = self.graph
                    # when not in test mode, filter by taxon
                    if taxid_a.split(':')[-1] not in self.tax_ids or \
                            taxid_b.split(':')[-1] not in self.tax_ids:
                        continue
                    else:
                        matchcounter += 1

                gene_a = 'NCBIGene:' + gene_a_num
                gene_b = 'NCBIGene:' + gene_b_num

                # get the interaction type
                # psi-mi:"MI:0407"(direct interaction)
                int_type = re.search(r'MI:\d+', interaction_type).group()
                rel = self.resolve(int_type, False)
                if rel == int_type:
                    rel = self.globaltt['interacts with']

                # scrub pubmed-->PMID prefix
                pub_id = re.sub(r'pubmed', 'PMID', pub_id)
                # remove bogus whitespace
                pub_id = pub_id.strip()

                # get the method, and convert to evidence code
                det_code = re.search(r'MI:\d+', detection_method).group()
                evidence = self.resolve(det_code, False)
                if evidence == det_code:
                    evidence = self.globaltt["experimental evidence"]

                # note that the interaction_id is some kind of internal biogrid
                # identifier that does not map to a public URI.
                # we will construct a monarch identifier from this

                assoc = InteractionAssoc(graph, self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence)
                assoc.add_source(pub_id)
                assoc.add_association_to_graph()

                if not self.test_mode and (limit is not None
                                           and line_counter > limit):
                    break

        myzip.close()

        return