Пример #1
0
    def process_rnai_phenotypes(self, limit=None):

        raw = '/'.join((self.rawdir, self.files['rnai_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        # gu = GraphUtils(curie_map.get())  # TODO unused

        logger.info("Processing RNAi phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (gene_num, gene_alt_symbol, phenotype_label, phenotype_id,
                 rnai_and_refs) = row
# WBGene00001908	F17E9.9	locomotion variant	WBPhenotype:0000643	WBRNAi00025129|WBPaper00006395 WBRNAi00025631|WBPaper00006395
# WBGene00001908	F17E9.9	avoids bacterial lawn	WBPhenotype:0000402	WBRNAi00095640|WBPaper00040984
# WBGene00001908	F17E9.9	RAB-11 recycling endosome localization variant	WBPhenotype:0002107	WBRNAi00090830|WBPaper00041129

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:'+gene_num
                # refs = list()  # TODO unused

                # the rnai_and_refs has this so that
                # WBRNAi00008687|WBPaper00005654 WBRNAi00025197|WBPaper00006395 WBRNAi00045381|WBPaper00025054
                # space delimited between RNAi sets;
                # then each RNAi should have a paper

                rnai_sets = re.split(r' ', rnai_and_refs)

                for s in rnai_sets:

                    # get the rnai_id
                    (rnai_num, ref_num) = re.split(r'\|', s)
                    if len(re.split(r'\|', s)) > 2:
                        logger.warning(
                            "There's an unexpected number of items in %s", s)
                    if rnai_num not in self.rnai_gene_map:
                        self.rnai_gene_map[rnai_num] = set()

                    # to use for looking up later
                    self.rnai_gene_map[rnai_num].add(gene_num)

                    rnai_id = 'WormBase:'+rnai_num
                    geno.addGeneTargetingReagent(
                        rnai_id, None, geno.genoparts['RNAi_reagent'], gene_id)

                    # make the "allele" of the gene
                    # that is targeted by the reagent
                    allele_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num, self.nobnodes)
                    allele_label = gene_alt_symbol+'<'+rnai_num+'>'
                    geno.addReagentTargetedGene(
                        rnai_id, gene_id, allele_id, allele_label)

                    assoc = G2PAssoc(self.name, allele_id, phenotype_id)
                    assoc.add_source('WormBase:'+ref_num)
                    # eco_id = 'ECO:0000019'  # RNAi evidence  # TODO unused
                    assoc.add_association_to_graph(g)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Пример #2
0
    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        # gu = GraphUtils(curie_map.get())  # TODO unused

        logger.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_id = None
                if eco_symbol == 'IMP':
                    eco_id = 'ECO:0000015'
                elif eco_symbol.strip() != '':
                    logger.warning(
                        "Encountered an ECO code we don't have: %s",
                        eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    logger.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for a in allele_list:
                        allele_num = re.sub(r'WB:', '', a.strip())
                        allele_id = 'WormBase:'+allele_num
                        gene_id = 'WormBase:'+gene_num

                        if re.search(r'WBRNAi', allele_id):
                            # make the reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            rnai_num = re.sub(r'WormBase:', '', allele_id)
                            rnai_id = allele_id
                            rtg_id = self.make_reagent_targeted_gene_id(
                                gene_num, rnai_num, self.nobnodes)
                            geno.addReagentTargetedGene(
                                rnai_id, 'WormBase:'+gene_num, rtg_id)
                            geno.addGeneTargetingReagent(
                                rnai_id, None, geno.genoparts['RNAi_reagent'],
                                gene_id)
                            allele_id = rtg_id
                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations

                            # the public name will come from elsewhere
                            geno.addSequenceAlteration(allele_id, None)
                            vl_id = '_'+'-'.join((gene_num, allele_num))
                            if self.nobnodes:
                                vl_id = ':'+vl_id
                            geno.addSequenceAlterationToVariantLocus(
                                allele_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)
                        else:
                            logger.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            r = Reference(ref)
                            if re.search(r'Person', ref):
                                r.setType(r.ref_types['person'])
                                # also add
                                # inferred from background scientific knowledge
                                assoc.add_evidence('ECO:0000001')
                            r.addRefToGraph(g)
                            assoc.add_source(ref)

                        assoc.add_association_to_graph(g)

                        # finish looping through all alleles

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Пример #3
0
    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        logger.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_id = None
                if eco_symbol == 'IMP':
                    eco_id = 'ECO:0000015'
                elif eco_symbol.strip() != '':
                    logger.warning("Encountered an ECO code we don't have: %s",
                                   eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    logger.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for a in allele_list:
                        allele_num = re.sub(r'WB:', '', a.strip())
                        allele_id = 'WormBase:' + allele_num
                        gene_id = 'WormBase:' + gene_num

                        if re.search(r'WBRNAi', allele_id):
                            # make the reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            rnai_num = re.sub(r'WormBase:', '', allele_id)
                            rnai_id = allele_id
                            rtg_id = self.make_reagent_targeted_gene_id(
                                gene_num, rnai_num)
                            geno.addReagentTargetedGene(
                                rnai_id, 'WormBase:' + gene_num, rtg_id)
                            geno.addGeneTargetingReagent(
                                rnai_id, None, geno.genoparts['RNAi_reagent'],
                                gene_id)
                            allele_id = rtg_id
                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations

                            # the public name will come from elsewhere
                            geno.addSequenceAlteration(allele_id, None)
                            vl_id = '_:' + '-'.join((gene_num, allele_num))
                            geno.addSequenceAlterationToVariantLocus(
                                allele_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)
                        else:
                            logger.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(g, self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            reference = Reference(g, ref)
                            if re.search(r'Person', ref):
                                reference.setType(
                                    reference.ref_types['person'])
                                # also add
                                # inferred from background scientific knowledge
                                assoc.add_evidence('ECO:0000001')
                            reference.addRefToGraph()
                            assoc.add_source(ref)

                        assoc.add_association_to_graph()

                        # finish looping through all alleles

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Пример #4
0
    def process_rnai_phenotypes(self, limit=None):

        raw = '/'.join((self.rawdir, self.files['rnai_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        logger.info("Processing RNAi phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (gene_num, gene_alt_symbol, phenotype_label, phenotype_id,
                 rnai_and_refs) = row
                # WBGene00001908	F17E9.9	locomotion variant	WBPhenotype:0000643	WBRNAi00025129|WBPaper00006395 WBRNAi00025631|WBPaper00006395
                # WBGene00001908	F17E9.9	avoids bacterial lawn	WBPhenotype:0000402	WBRNAi00095640|WBPaper00040984
                # WBGene00001908	F17E9.9	RAB-11 recycling endosome localization variant	WBPhenotype:0002107	WBRNAi00090830|WBPaper00041129

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:' + gene_num
                # refs = list()  # TODO unused

                # the rnai_and_refs has this so that
                # WBRNAi00008687|WBPaper00005654 WBRNAi00025197|WBPaper00006395 WBRNAi00045381|WBPaper00025054
                # space delimited between RNAi sets;
                # then each RNAi should have a paper

                rnai_sets = re.split(r' ', rnai_and_refs)

                for s in rnai_sets:

                    # get the rnai_id
                    (rnai_num, ref_num) = re.split(r'\|', s)
                    if len(re.split(r'\|', s)) > 2:
                        logger.warning(
                            "There's an unexpected number of items in %s", s)
                    if rnai_num not in self.rnai_gene_map:
                        self.rnai_gene_map[rnai_num] = set()

                    # to use for looking up later
                    self.rnai_gene_map[rnai_num].add(gene_num)

                    rnai_id = 'WormBase:' + rnai_num
                    geno.addGeneTargetingReagent(
                        rnai_id, None, geno.genoparts['RNAi_reagent'], gene_id)

                    # make the "allele" of the gene
                    # that is targeted by the reagent
                    allele_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num)
                    allele_label = gene_alt_symbol + '<' + rnai_num + '>'
                    geno.addReagentTargetedGene(rnai_id, gene_id, allele_id,
                                                allele_label)

                    assoc = G2PAssoc(g, self.name, allele_id, phenotype_id)
                    assoc.add_source('WormBase:' + ref_num)
                    # eco_id = 'ECO:0000019'  # RNAi evidence  # TODO unused
                    assoc.add_association_to_graph()

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Пример #5
0
    def process_rnai_phenotypes(self, limit=None):
        src_key = 'rnai_pheno'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("Processing: %s", self.files[src_key]['file'])
        graph = self.graph
        geno = Genotype(graph)
        col = self.files[src_key]['columns']
        with open(raw, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # no header row to check
            collen = len(col)
            for row in reader:
                if len(row) != collen:
                    LOG.error('In %s line %i expected %i colums but got %s.',
                              self.files[src_key]['file'], reader.line_num,
                              collen, row)
                    pass
                gene_num = row[col.index('gene_num')]
                gene_alt_symbol = row[col.index('gene_alt_symbol')]
                # phenotype_label = row[col.index('phenotype_label')]
                phenotype_id = row[col.index('phenotype_id')]
                rnai_and_refs = row[col.index('rnai_and_refs')]

                gene_curie = 'WormBase:' + gene_num
                '''
WBGene00001908	F17E9.9	locomotion variant	WBPhenotype:0000643	WBRNAi00025129|WBPaper00006395 WBRNAi00025631|WBPaper00006395
WBGene00001908	F17E9.9	avoids bacterial lawn	WBPhenotype:0000402	WBRNAi00095640|WBPaper00040984
WBGene00001908	F17E9.9	RAB-11 recycling endosome localization variant	WBPhenotype:0002107	WBRNAi00090830|WBPaper00041129
                '''

                # the rnai_and_refs has this so that
                '''
WBRNAi00008687|WBPaper00005654 WBRNAi00025197|WBPaper00006395 WBRNAi00045381|WBPaper00025054
                '''
                # space delimited between RNAi sets;
                # then each RNAi should have a paper

                rnai_sets = re.split(r' ', rnai_and_refs)

                for rnais in rnai_sets:
                    # get the rnai_id
                    pair = rnais.split('|')
                    if len(pair) > 2:
                        LOG.warning(
                            "There's an unexpected number of items in %s",
                            rnais)
                    else:
                        (rnai_num, ref_num) = pair
                    if rnai_num not in self.rnai_gene_map:
                        self.rnai_gene_map[rnai_num] = set()

                    # to use for looking up later
                    self.rnai_gene_map[rnai_num].add(gene_num)

                    rnai_curie = 'WormBase:' + rnai_num
                    geno.addGeneTargetingReagent(rnai_curie, None,
                                                 self.globaltt['RNAi_reagent'],
                                                 gene_curie)

                    # make the "allele" of the gene
                    # that is targeted by the reagent
                    allele_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num)
                    allele_label = gene_alt_symbol + '<' + rnai_num + '>'
                    geno.addReagentTargetedGene(rnai_curie, gene_curie,
                                                allele_id, allele_label)

                    assoc = G2PAssoc(graph, self.name, allele_id, phenotype_id)
                    assoc.add_source('WormBase:' + ref_num)
                    # eco_id = 'ECO:0000019'  # RNAi evidence  # TODO unused
                    assoc.add_association_to_graph()

                if limit is not None and reader.line_num > limit:
                    break