Example #1
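# The excerpt below assumes roughly these imports; the dipper module paths
# are taken from the Monarch dipper codebase and may differ across versions.
import csv
import logging
import os
import re
import stat
from datetime import datetime

import pysftp

from dipper import config, curie_map
from dipper.models.Dataset import Dataset
from dipper.models.GenomicFeature import Feature, makeChromID
from dipper.models.Genotype import Genotype
from dipper.models.Reference import Reference
from dipper.models.assoc.Association import Assoc
from dipper.models.assoc.G2PAssoc import G2PAssoc
from dipper.sources.Source import Source
from dipper.utils.DipperUtil import DipperUtil
from dipper.utils.GraphUtils import GraphUtils

logger = logging.getLogger(__name__)
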
class Coriell(Source):
    """
    The Coriell Catalog provided to Monarch includes metadata and descriptions
    of NIGMS, NINDS, NHGRI, and NIA cell lines.  These lines are made available
    for research purposes. Here, we create annotations for the cell lines as
    models of the diseases from which they originate.

    We create a handle for a patient from which the given cell line is derived
    (since there may be multiple cell lines created from a given patient).
    A genotype is assembled for a patient, which includes a karyotype
    (if specified) and/or a collection of variants.
    Both the genotype (has_genotype) and disease are linked to the patient
    (has_phenotype), and the cell line is listed as derived from the patient.
    The cell line is classified by its
    [CLO cell type](http://www.ontobee.org/browser/index.php?o=clo),
    which itself is linked to a tissue of origin.

    Unfortunately, the OMIM numbers listed in this file refer to both genes
    and diseases; we have no way of knowing a priori whether a designated
    OMIM number is a gene or a disease, so we presently link the patient to
    any OMIM id via the has_phenotype relationship.

    Notice: The Coriell catalog is delivered to Monarch in a specific format,
    and requires SSH RSA fingerprint identification.  Other groups wishing to
    obtain this data in its raw form will need to contact Coriell for
    credentials, which must then be placed into your configuration file for
    the fetch to work.

    """

    terms = {
        'cell_line_repository': 'CLO:0000008',
        'race': 'SIO:001015',
        'ethnic_group': 'EFO:0001799',
        'age': 'EFO:0000246',
        'sampling_time': 'EFO:0000689',
        'collection': 'ERO:0002190'
    }

    files = {
        'NINDS': {
            'file': 'NINDS.csv',
            'id': 'NINDS',
            'label': 'NINDS Human Genetics DNA and Cell line Repository',
            'page': 'https://catalog.coriell.org/1/NINDS'},
        'NIGMS': {
            'file': 'NIGMS.csv',
            'id': 'NIGMS',
            'label': 'NIGMS Human Genetic Cell Repository',
            'page': 'https://catalog.coriell.org/1/NIGMS'},
        'NIA': {
            'file': 'NIA.csv',
            'id': 'NIA',
            'label': 'NIA Aging Cell Repository',
            'page': 'https://catalog.coriell.org/1/NIA'},
        'NHGRI': {
            'file': 'NHGRI.csv',
            'id': 'NHGRI',
            'label': 'NHGRI Sample Repository for Human Genetic Research',
            'page': 'https://catalog.coriell.org/1/NHGRI'}
    }

    # the following will house the specific cell lines to use for test output
    test_lines = [
        'ND02380', 'ND02381', 'ND02383', 'ND02384', 'GM17897', 'GM17898',
        'GM17896', 'GM17944', 'GM17945', 'ND00055', 'ND00094', 'ND00136',
        'GM17940', 'GM17939', 'GM20567', 'AG02506', 'AG04407', 'AG07602',
        'AG07601', 'GM19700', 'GM19701', 'GM19702', 'GM00324', 'GM00325',
        'GM00142', 'NA17944', 'AG02505', 'GM01602', 'GM02455', 'AG00364',
        'GM13707', 'AG00780']

    def __init__(self):
        Source.__init__(self, 'coriell')

        self.load_bindings()

        self.dataset = Dataset(
            'coriell', 'Coriell', 'http://ccr.coriell.org/', None)

        # data-source specific warnings
        # (will be removed when issues are cleared)

        logger.warning(
            'We assume that if a species is not provided, '
            'it is a human-derived cell line')
        logger.warning(
            'We map all OMIM ids as disease/phenotype entities; '
            'this should be fixed in the future')

        # check if config exists; if it doesn't, error out and let user know
        if 'dbauth' not in config.get_config() or \
                'coriell' not in config.get_config()['dbauth']:
            logger.error("not configured with FTP user/password.")

        return

    def fetch(self, is_dl_forced=False):
        """
        Here we connect to the Coriell sftp server using private connection
        details.  They dump bi-weekly files with a timestamp in the filename.
        For each catalog, we poll the remote site and pull the most-recently
        updated file, renaming it to our local *_latest.csv.

        Be sure to have the sftp connection details in your conf.json file,
        like:
        dbauth : {
            "coriell" : {
                "user" : "<username>", "password" : "<password>",
                "host" : "<host>", "private_key" : "/path/to/rsa_key"}
        }

        :param is_dl_forced:
        :return:

        """
        host = config.get_config()['dbauth']['coriell']['host']
        user = config.get_config()['dbauth']['coriell']['user']
        passwd = config.get_config()['dbauth']['coriell']['password']
        key = config.get_config()['dbauth']['coriell']['private_key']

        with pysftp.Connection(
                host, username=user, password=passwd, private_key=key) as sftp:
            # check to make sure each file is in there
            # get the remote files
            remote_files = sftp.listdir_attr()
            files_by_repo = {}
            for attr in remote_files:
                # for each catalog, get the most-recent filename
                m = re.match('(NIGMS|NIA|NHGRI|NINDS)', attr.filename)
                if m is not None and len(m.groups()) > 0:
                    # there should just be one now
                    files_by_repo[m.group(1)] = attr
            # sort each array in hash,
            # & get the name and time of the most-recent file for each catalog
            for r in self.files:
                logger.info("Checking on %s catalog file", r)
                fname = self.files[r]['file']
                remotef = files_by_repo[r]
                target_name = '/'.join((self.rawdir, fname))
                # check if the local file is out of date, if so, download.
                # otherwise, skip.
                # we rename (for simplicity) the original file
                st = None
                if os.path.exists(target_name):
                    st = os.stat(target_name)
                    logger.info(
                        "Local file date: %s",
                        datetime.utcfromtimestamp(st[stat.ST_CTIME]))
                if st is None or remotef.st_mtime > st[stat.ST_CTIME]:
                    if st is None:
                        logger.info(
                            "File does not exist locally; downloading...")
                    else:
                        logger.info(
                            "There's a new version of %s catalog available; "
                            "downloading...", r)
                    sftp.get(remotef.filename, target_name)
                    logger.info(
                        "Fetched remote %s -> %s",
                        remotef.filename, target_name)
                    st = os.stat(target_name)
                    filedate = \
                        datetime.utcfromtimestamp(
                            remotef.st_mtime).strftime("%Y-%m-%d")
                    logger.info(
                        "New file date: %s",
                        datetime.utcfromtimestamp(st[stat.ST_CTIME]))

                else:
                    logger.info("File %s exists; using local copy", fname)
                    filedate = \
                        datetime.utcfromtimestamp(
                            st[stat.ST_CTIME]).strftime("%Y-%m-%d")

                self.dataset.setFileAccessUrl(remotef.filename)
                self.dataset.setVersion(filedate)
        return

    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        for f in self.files:
            file = '/'.join((self.rawdir, self.files[f]['file']))
            self._process_collection(
                self.files[f]['id'],
                self.files[f]['label'],
                self.files[f]['page'])
            self._process_data(file, limit)

        logger.info("Finished parsing.")

        self.load_bindings()

        logger.info("Found %d nodes in graph", len(self.graph))
        logger.info("Found %d nodes in testgraph", len(self.testgraph))

        return

    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to wild type).

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

            line_id a CL_0000057,  #fibroblast line
                derives_from patient_id
                part_of :NIGMSrepository
                RO:model_of OMIM:disease_id

            patient id a foaf:person,
                label: "fibroblast from patient 12345 with disease X"
                member_of family_id  #what is the right thing here?
                SIO:race EFO:caucasian  #subclass of EFO:0001799
                in_taxon NCBITaxon:9606
                dc:description Literal(remark)
                RO:has_phenotype OMIM:disease_id
                GENO:has_genotype genotype_id

            family_id a owl:NamedIndividual
                foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

            genotype_id a intrinsic_genotype
                GENO:has_alternate_part allelic_variant_id
                we don't necessarily know much about the genotype
                other than the allelic variant; sex is also recorded here

            pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:
        """
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    pass
                else:
                    line_counter += 1

                    (catalog_id, description, omim_number, sample_type,
                     cell_line_available, dna_in_stock, dna_ref, gender, age,
                     race, ethnicity, affected, karyotype, relprob, mutation,
                     gene, family_id, collection, url, cat_remark, pubmed_ids,
                     family_member, variant_id, dbsnp_id, species) = row

                    # example:
                    # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                    # parent,,,39,NIGMS Human Genetic Cell Repository,
                    # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                    # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                    # 2,,18343,Homo sapiens

                    if self.testMode and catalog_id not in self.test_lines:
                        # skip rows not in our test lines, when in test mode
                        continue

                    # ###########    BUILD REQUIRED VARIABLES    ###########

                    # Make the cell line ID
                    cell_line_id = 'Coriell:'+catalog_id.strip()

                    # Map the cell/sample type
                    cell_type = self._map_cell_type(sample_type)

                    # Make a cell line label
                    line_label = \
                        collection.partition(' ')[0]+'-'+catalog_id.strip()

                    # Map the repository/collection
                    repository = self._map_collection(collection)

                    # Patients are uniquely identified by one of:
                    #   * dbsnp id (which is == an individual haplotype)
                    #   * family id + family member (if present;
                    #     probands are usually family member zero)
                    #   * cell line id
                    # Since some patients have >1 cell line derived from them,
                    # we must make sure that the genotype is attached to
                    # the patient, and can be inferred to the cell line.
                    # Examples of repeated patients are:
                    #   famid=1159, member=1; fam=152, member=1

                    # Make the patient ID

                    # make an anonymous patient
                    patient_id = '_person'
                    if self.nobnodes:
                        patient_id = ':'+patient_id
                    if family_id != '':
                        patient_id = \
                            '-'.join((patient_id, family_id, family_member))
                    else:
                        # make an anonymous patient
                        patient_id = '-'.join((patient_id, catalog_id.strip()))

                    # Properties of the individual patients: sex, family id,
                    # member/relproband, description.  Descriptions are
                    # really long and ugly SCREAMING text, so they need
                    # cleanup.  The control cases are odd with this labeling
                    # scheme, but we'll deal with it as-is for now.
                    short_desc = (description.split(';')[0]).capitalize()
                    if affected == 'Yes':
                        affected = 'affected'
                    elif affected == 'No':
                        affected = 'unaffected'
                    gender = gender.lower()
                    patient_label = ' '.join((affected, gender, relprob))
                    if relprob == 'proband':
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'with', short_desc))
                    else:
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'of proband with',
                                 short_desc))

                    # #############    BUILD THE CELL LINE    #############

                    # Adding the cell line as a typed individual.
                    cell_line_reagent_id = 'CLO:0000031'

                    gu.addIndividualToGraph(
                        g, cell_line_id, line_label, cell_line_reagent_id)

                    # add the equivalent id == dna_ref
                    if dna_ref != '' and dna_ref != catalog_id:
                        equiv_cell_line = 'Coriell:'+dna_ref
                        # some of the equivalent ids are not defined
                        # in the source data; so add them
                        gu.addIndividualToGraph(
                            g, equiv_cell_line, None, cell_line_reagent_id)
                        gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

                    # Cell line derives from patient
                    geno.addDerivesFrom(cell_line_id, patient_id)
                    geno.addDerivesFrom(cell_line_id, cell_type)

                    # Cell line a member of repository
                    gu.addMember(g, repository, cell_line_id)

                    if cat_remark != '':
                        gu.addDescription(g, cell_line_id, cat_remark)

                    # Cell age_at_sampling
                    # TODO add the age nodes when modeled properly in #78
                    # if (age != ''):
                        # this would give a BNode that is an instance of Age.
                        # but i don't know how to connect
                        # the age node to the cell line? we need to ask @mbrush
                        # age_id = '_'+re.sub('\s+','_',age)
                        # gu.addIndividualToGraph(
                        #   g,age_id,age,self.terms['age'])
                        # gu.addTriple(
                        #   g,age_id,self.properties['has_measurement'],age,
                        #   True)

                    # #############    BUILD THE PATIENT    #############

                    # Add the patient ID as an individual.
                    gu.addPerson(g, patient_id, patient_label)
                    # TODO map relationship to proband as a class
                    # (what ontology?)

                    # Add race of patient
                    # FIXME: Adjust for subcategories based on ethnicity field
                    # EDIT: There are 743 different entries for ethnicity...
                    # Too many to map?
                    # Add ethnicity as literal in addition to the mapped race?
                    # Adjust the ethnicity txt (if using)
                    # to initial capitalization to remove ALLCAPS

                    # TODO race should go into the individual's background
                    # and abstracted out to the Genotype class punting for now.
                    # if race != '':
                    #    mapped_race = self._map_race(race)
                    #    if mapped_race is not None:
                    #        gu.addTriple(
                    #           g,patient_id,self.terms['race'],mapped_race)
                    #        gu.addSubclass(
                    #           g,self.terms['ethnic_group'],mapped_race)

                    # #############    BUILD THE FAMILY    #############

                    # Add triples for family_id, if present.
                    if family_id != '':
                        family_comp_id = 'CoriellFamily:'+family_id

                        family_label = \
                            ' '.join(('Family of proband with', short_desc))

                        # Add the family ID as a named individual
                        gu.addIndividualToGraph(
                            g, family_comp_id, family_label,
                            geno.genoparts['family'])

                        # Add the patient as a member of the family
                        gu.addMemberOf(g, patient_id, family_comp_id)

                    # #############    BUILD THE GENOTYPE   #############

                    # the important things to pay attention to here are:
                    #   karyotype = chr rearrangements (somatic?)
                    #   mutation = protein-level mutation as a label,
                    #       often from omim
                    #   gene = gene symbol - TODO get id
                    #   variant_id = omim variant ids (; delimited)
                    #   dbsnp_id = snp individual ids == full genotype?

                    # note GM00633 is a good example of chromosomal variation
                    # - do we have enough to capture this?
                    # GM00325 has both abnormal karyotype and variation

                    # make an assumption that if the taxon is blank,
                    # that it is human!
                    if species is None or species == '':
                        species = 'Homo sapiens'
                    taxon = self._map_species(species)

                    # if there's a dbSNP id,
                    # this is actually the individual's genotype
                    genotype_id = None
                    genotype_label = None
                    if dbsnp_id != '':
                        genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

                    omim_map = {}
                    gvc_id = None

                    # some of the karyotypes are encoded with hidden
                    # control characters; remove them here
                    # (we've seen a <98> character)
                    karyotype = du.remove_control_characters(karyotype)
                    karyotype_id = None
                    if karyotype.strip() != '':
                        karyotype_id = \
                            '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                        if self.nobnodes:
                            karyotype_id = ':'+karyotype_id
                        # add karyotype as karyotype_variation_complement
                        gu.addIndividualToGraph(
                            g, karyotype_id, karyotype,
                            geno.genoparts['karyotype_variation_complement'])
                        # TODO break down the karyotype into parts
                        # and map into GENO. depends on #77

                        # place the karyotype in a location(s).
                        karyo_chrs = \
                            self._get_affected_chromosomes_from_karyotype(
                                karyotype)
                        for c in karyo_chrs:
                            chr_id = makeChromID(c, taxon, 'CHR')
                            # add an anonymous sequence feature,
                            # each located on chr
                            karyotype_feature_id = '-'.join((karyotype_id, c))
                            karyotype_feature_label = \
                                'some karyotype alteration on chr'+str(c)
                            f = Feature(
                                karyotype_feature_id, karyotype_feature_label,
                                geno.genoparts['sequence_alteration'])
                            f.addFeatureStartLocation(None, chr_id)
                            f.addFeatureToGraph(g)
                            f.loadAllProperties(g)
                            geno.addParts(
                                karyotype_feature_id, karyotype_id,
                                geno.object_properties['has_alternate_part'])

                    # NOTE vl is used below even when gene is blank;
                    # initialize it so a stale value from a previous row
                    # can't leak through
                    vl = None
                    if gene != '':
                        vl = gene+'('+mutation+')'

                    # fix the variant_id so it's always in the same order
                    vids = variant_id.split(';')
                    variant_id = ';'.join(sorted(list(set(vids))))
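                    # e.g. '610661.p.R401X;607014.0001;607014.0001'
                    #   -> '607014.0001;610661.p.R401X'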

                    if karyotype.strip() != '' \
                            and not self._is_normal_karyotype(karyotype):
                        mutation = mutation.strip()
                        gvc_id = karyotype_id
                        if variant_id != '':
                            gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                                    + re.sub(r'\w*:', '', karyotype_id)
                        if mutation.strip() != '':
                            gvc_label = '; '.join((vl, karyotype))
                        else:
                            gvc_label = karyotype
                    elif variant_id.strip() != '':
                        gvc_id = '_' + variant_id.replace(';', '-')
                        gvc_label = vl
                    else:
                        # wildtype?
                        pass

                    if gvc_id is not None and gvc_id != karyotype_id \
                            and self.nobnodes:
                        gvc_id = ':'+gvc_id

                    # add the karyotype to the gvc.
                    # use reference if normal karyotype
                    karyo_rel = geno.object_properties['has_alternate_part']
                    if self._is_normal_karyotype(karyotype):
                        karyo_rel = \
                            geno.object_properties['has_reference_part']
                    if karyotype_id is not None \
                            and not self._is_normal_karyotype(karyotype) \
                            and gvc_id is not None and karyotype_id != gvc_id:
                        geno.addParts(karyotype_id, gvc_id, karyo_rel)

                    if variant_id.strip() != '':
                        # Split the variants & add them as part of the
                        # genotype.  We don't necessarily know their zygosity,
                        # just that they are part of the genotype.  Variant
                        # ids are from OMIM, so prefix them as such; we assume
                        # that the sequence alts will be defined in OMIM, not
                        # here.
                        # TODO sort the variant_id list; if the omim prefix is
                        # the same, then assume it's the locus.  Make a
                        # hashmap of the omim id to variant id list, then
                        # build the genotype; the hashmap is also useful for
                        # removing the "genes" from the list of "phenotypes".

                        # will hold gene/locus id to variant list
                        omim_map = {}

                        for v in variant_id.split(';'):
                            # handle omim-style and odd var ids
                            # like 610661.p.R401X
                            m = re.match(r'(\d+)\.+(.*)', v.strip())
                            if m is not None and len(m.groups()) == 2:
                                (locus_num, var_num) = m.groups()
                                if locus_num not in omim_map:
                                    omim_map[locus_num] = [var_num]
                                else:
                                    omim_map[locus_num] += [var_num]

                        for o in omim_map:
                            # gene_id = 'OMIM:' + o  # TODO unused
                            vslc_id = \
                                '_' + '-'.join(
                                    [o + '.' + a for a in omim_map.get(o)])
                            if self.nobnodes:
                                vslc_id = ':'+vslc_id
                            vslc_label = vl
                            # we don't really know the zygosity of
                            # the alleles at all.
                            # so the vslcs are just a pot of them
                            gu.addIndividualToGraph(
                                g, vslc_id, vslc_label,
                                geno.genoparts[
                                    'variant_single_locus_complement'])
                            for v in omim_map.get(o):
                                # this is actually a sequence alt
                                allele1_id = 'OMIM:'+o+'.'+v
                                geno.addSequenceAlteration(allele1_id, None)

                                # assume that the sa -> var_loc -> gene
                                # is taken care of in OMIM
                                geno.addPartsToVSLC(
                                    vslc_id, allele1_id, None,
                                    geno.zygosity['indeterminate'],
                                    geno.object_properties[
                                        'has_alternate_part'])

                            if vslc_id != gvc_id:
                                geno.addVSLCtoParent(vslc_id, gvc_id)

                    if affected == 'unaffected':
                        # let's just say that this person is wildtype
                        gu.addType(g, patient_id, geno.genoparts['wildtype'])
                    elif genotype_id is None:
                        # make an anonymous genotype id
                        genotype_id = '_geno'+catalog_id.strip()
                        if self.nobnodes:
                            genotype_id = ':'+genotype_id

                    # add the gvc
                    if gvc_id is not None:
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])

                        # add the gvc to the genotype
                        if genotype_id is not None:
                            if affected == 'unaffected':
                                rel = \
                                    geno.object_properties[
                                        'has_reference_part']
                            else:
                                rel = \
                                    geno.object_properties[
                                        'has_alternate_part']
                            geno.addParts(gvc_id, genotype_id, rel)
                        if karyotype_id is not None \
                                and self._is_normal_karyotype(karyotype):
                            if gvc_label is not None and gvc_label != '':
                                genotype_label = \
                                    '; '.join((gvc_label, karyotype))
                            else:
                                genotype_label = karyotype
                            if genotype_id is None:
                                genotype_id = karyotype_id
                            else:
                                geno.addParts(
                                    karyotype_id, genotype_id,
                                    geno.object_properties[
                                        'has_reference_part'])
                        else:
                            genotype_label = gvc_label
                            # use the catalog id as the background
                        genotype_label += ' ['+catalog_id.strip()+']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(
                            genotype_id, genotype_label,
                            geno.genoparts['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        gu.addTriple(
                            g, patient_id,
                            geno.properties['has_genotype'], genotype_id)
                    else:
                        geno.addTaxon(taxon, patient_id)

                    # TODO: Add sex/gender  (as part of the karyotype?)

                    # #############    DEAL WITH THE DISEASES   #############

                    # we associate the disease to the patient
                    if affected == 'affected':
                        if omim_number != '':
                            for d in omim_number.split(';'):
                                if d is not None and d != '':
                                    # if the omim number is in omim_map,
                                    # then it is a gene not a pheno
                                    if d not in omim_map:
                                        disease_id = 'OMIM:'+d.strip()
                                        # assume the label is taken care of
                                        gu.addClassToGraph(g, disease_id, None)

                                        # add the association:
                                        #   the patient has the disease
                                        assoc = G2PAssoc(
                                            self.name, patient_id, disease_id)
                                        assoc.add_association_to_graph(g)

                                        # this line is a model of this disease
                                        # TODO abstract out model into
                                        # its own association class?
                                        gu.addTriple(
                                            g, cell_line_id,
                                            gu.properties['model_of'],
                                            disease_id)
                                    else:
                                        logger.info(
                                            'removing %s from disease list ' +
                                            'since it is a gene', d)

                    # #############    ADD PUBLICATIONS   #############

                    if pubmed_ids != '':
                        for s in pubmed_ids.split(';'):
                            pubmed_id = 'PMID:'+s.strip()
                            ref = Reference(pubmed_id)
                            ref.setType(Reference.ref_types['journal_article'])
                            ref.addRefToGraph(g)
                            gu.addTriple(
                                g, pubmed_id, gu.properties['mentions'],
                                cell_line_id)

                    if not self.testMode \
                            and (limit is not None and line_counter > limit):
                        break

            Assoc(self.name).load_all_properties(g)

        return

    def _process_collection(self, collection_id, label, page):
        """
        This function will process the data supplied internally
        about the repository from Coriell.

        Triples:
            Repository a ERO:collection
            rdf:label Literal(label)
            foaf:page Literal(page)

        :param collection_id:
        :param label:
        :param page:
        :return:
        """
        # #############    BUILD THE CELL LINE REPOSITORY    #############
        for g in [self.graph, self.testgraph]:
            # FIXME: How to devise a label for each repository?
            gu = GraphUtils(curie_map.get())
            repo_id = 'CoriellCollection:'+collection_id
            repo_label = label
            repo_page = page

            gu.addIndividualToGraph(
                g, repo_id, repo_label, self.terms['collection'])
            gu.addPage(g, repo_id, repo_page)

        return

    @staticmethod
    def _map_cell_type(sample_type):
        ctype = None
        type_map = {
            # FIXME: mesenchymal stem cell of adipose
            'Adipose stromal cell': 'CL:0002570',
            # FIXME: amniocyte?
            'Amniotic fluid-derived cell line': 'CL:0002323',
            # B cell
            'B-Lymphocyte': 'CL:0000236',
            # FIXME: No Match
            'Chorionic villus-derived cell line': 'CL:0000000',
            # endothelial cell
            'Endothelial': 'CL:0000115',
            # epithelial cell
            'Epithelial': 'CL:0000066',
            # FIXME: No Match. "Abnormal precursor (virally transformed)
            # of mouse erythrocytes that can be grown in culture and
            # induced to differentiate by treatment with, for example, DMSO."
            'Erythroleukemic cell line': 'CL:0000000',

            'Fibroblast': 'CL:0000057',         # fibroblast
            'Keratinocyte': 'CL:0000312',       # keratinocyte
            'Melanocyte': 'CL:0000148',         # melanocyte
            'Mesothelial': 'CL:0000077',
            'Microcell hybrid': 'CL:0000000',   # FIXME: No Match
            'Myoblast': 'CL:0000056',           # myoblast
            'Smooth muscle': 'CL:0000192',      # smooth muscle cell
            'Stem cell': 'CL:0000034',          # stem cell
            'T-Lymphocyte': 'CL:0000084',       # T cell
            # FIXME: No Match. "Cells isolated from a mass of neoplastic cells,
            # i.e., a growth formed by abnormal cellular proliferation."
            # Oncocyte? CL:0002198
            'Tumor-derived cell line': 'CL:0002198'
        }
        if sample_type.strip() in type_map:
            ctype = type_map.get(sample_type.strip())
        else:
            logger.error("Cell type not mapped: %s", sample_type)

        return ctype

    @staticmethod
    def _map_race(race):
        rtype = None
        type_map = {
            'African American': 'EFO:0003150',
            # 'American Indian': 'EFO',
            'Asian': 'EFO:0003152',
            # FIXME: Asian?
            'Asian; Other': 'EFO:0003152',
            # Asian Indian
            'Asiatic Indian': 'EFO:0003153',
            # FIXME: African American? There is also African.
            'Black': 'EFO:0003150',
            'Caucasian': 'EFO:0003156',
            'Chinese': 'EFO:0003157',
            'East Indian': 'EFO:0003158',  # Eastern Indian
            'Filipino': 'EFO:0003160',
            # Hispanic: EFO:0003169, Latino: EFO:0003166 see next
            'Hispanic/Latino': 'EFO:0003169',
            'Japanese': 'EFO:0003164',
            'Korean': 'EFO:0003165',
            # 'More than one race': 'EFO',
            # 'Not Reported': 'EFO',
            # 'Other': 'EFO',
            # Asian/Pacific Islander
            'Pacific Islander': 'EFO:0003154',
            # Asian/Pacific Islander
            'Polynesian': 'EFO:0003154',
            # 'Unknown': 'EFO',
            # Asian
            'Vietnamese': 'EFO:0003152',
        }
        if race.strip() in type_map:
            rtype = type_map.get(race.strip())
        else:
            logger.warning("Race type not mapped: %s", race)

        return rtype

    @staticmethod
    def _map_species(species):
        tax = None
        type_map = {
            'Mus musculus': 'NCBITaxon:10090',
            'Peromyscus peromyscus californicus': 'NCBITaxon:42520',
            'Peromyscus peromyscus maniculatus': 'NCBITaxon:10042',
            'Peromyscus peromyscus leucopus': 'NCBITaxon:10041',
            'Peromyscus peromyscus polionotus': 'NCBITaxon:42413',
            'Macaca fascicularis': 'NCBITaxon:9541',
            'Rattus norvegicus': 'NCBITaxon:10116',
            'Papio anubis': 'NCBITaxon:9555',
            'Cricetulus griseus': 'NCBITaxon:10029',
            'Geochelone elephantopus': 'NCBITaxon:66189',
            'Muntiacus muntjak': 'NCBITaxon:9888',
            'Ailurus fulgens': 'NCBITaxon:9649',
            'Sus scrofa': 'NCBITaxon:9823',
            'Bos taurus': 'NCBITaxon:9913',
            'Oryctolagus cuniculus': 'NCBITaxon:9986',
            'Macaca nemestrina': 'NCBITaxon:9545',
            'Canis familiaris': 'NCBITaxon:9615',
            'Equus caballus': 'NCBITaxon:9796',
            'Macaca mulatta': 'NCBITaxon:9544',
            'Mesocricetus auratus': 'NCBITaxon:10036',
            'Macaca nigra': 'NCBITaxon:54600',
            'Erythrocebus patas': 'NCBITaxon:9538',
            'Pongo pygmaeus': 'NCBITaxon:9600',
            'Callicebus moloch': 'NCBITaxon:9523',
            'Lagothrix lagotricha': 'NCBITaxon:9519',
            'Saguinus fuscicollis': 'NCBITaxon:9487',
            'Saimiri sciureus': 'NCBITaxon:9521',
            'Saguinus labiatus': 'NCBITaxon:78454',
            'Pan paniscus': 'NCBITaxon:9597',
            'Ovis aries': 'NCBITaxon:9940',
            'Felis catus': 'NCBITaxon:9685',
            'Homo sapiens': 'NCBITaxon:9606'
        }
        if species.strip() in type_map:
            tax = type_map.get(species.strip())
        else:
            logger.warning("Species type not mapped: %s", species)

        return tax

    @staticmethod
    def _map_collection(collection):
        ctype = None
        type_map = {
            'NINDS Repository':
                'CoriellCollection:NINDS',
            'NIGMS Human Genetic Cell Repository':
                'CoriellCollection:NIGMS',
            'NIA Aging Cell Culture Repository':
                'CoriellCollection:NIA',
            'NHGRI Sample Repository for Human Genetic Research':
                'CoriellCollection:NHGRI'
        }
        if collection.strip() in type_map:
            ctype = type_map.get(collection.strip())
        else:
            logger.warning("Collection type not mapped: %s", collection)

        return ctype

    @staticmethod
    def _get_affected_chromosomes_from_karyotype(karyotype):
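        """
        Scan a Coriell karyotype string for aberration terms and return the
        set of chromosomes they mention; abnormal sex-chromosome complements
        add 'X' or 'Y'.  A sketch of the expected behavior (hypothetical
        input, per the regexes below):
            '46;XY;del(10)(q23)' -> {'10'}
        :param karyotype:
        :return: the set of affected chromosomes, as strings
        """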

        affected_chromosomes = set()
        chr_regex = r'(\d+|X|Y|M|\?);?'
        aberration_regex = r'(?:add|del|der|i|idic|inv|r|rec|t)\([\w;]+\)'
        sex_regex = r'(?:;)(X{2,}Y+|X?Y{2,}|X{3,}|X|Y)(?:;|$)'

        # first fetch the set of aberrations
        aberrations = re.findall(aberration_regex, karyotype)

        # iterate over them to get the chromosomes
        for a in aberrations:
            chrs = re.findall(chr_regex, a)
            affected_chromosomes = affected_chromosomes.union(set(chrs))

        # remove the ? as a chromosome, since it isn't valid
        if '?' in affected_chromosomes:
            affected_chromosomes.remove('?')

        # check to see if there are any abnormal sex chromosomes
        m = re.search(sex_regex, karyotype)
        if m is not None:
            if re.search(r'X?Y{2,}', m.group(1)):
                # this is the only case where there is an extra Y chromosome
                affected_chromosomes.add('Y')
            else:
                affected_chromosomes.add('X')

        return affected_chromosomes

    @staticmethod
    def _is_normal_karyotype(karyotype):
        """
        This will default to true if no karyotype is provided.
        This is assuming human karyotypes.
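        Per the check below, only exact '46;XX', '46;XY', or an empty
        string count as normal; e.g. '45;X' or '46;XY;del(10)' would
        return False.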
        :param karyotype:
        :return:
        """

        is_normal = True
        if karyotype is not None:
            karyotype = karyotype.strip()
            if karyotype not in ['46;XX', '46;XY', '']:
                is_normal = False

        return is_normal

    def getTestSuite(self):
        import unittest
        from tests.test_coriell import CoriellTestCase
        # TODO add G2PAssoc, Genotype tests

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(CoriellTestCase)

        return test_suite
Example #2
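# Assumed imports for this excerpt (dipper module paths may vary by version):
import csv
import logging
import os
import re
from datetime import datetime
from stat import ST_CTIME

from dipper import config, curie_map
from dipper.models.Dataset import Dataset
from dipper.sources.PostgreSQLSource import PostgreSQLSource
from dipper.utils.GraphUtils import GraphUtils

logger = logging.getLogger(__name__)
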
class EOM(PostgreSQLSource):
    """
    Elements of Morphology is a resource from NHGRI that has definitions of
    morphological abnormalities, together with image depictions.
    We pull those relationships, as well as our local mapping of equivalences
    between EOM and HP terminologies.

    The website is crawled monthly by NIF's DISCO crawler system,
        which we utilize here.
    Be sure to have pg user/password connection details in your conf.json file,
    like:
      dbauth : {
        'disco' : {'user' : '<username>', 'password' : '<password>'}
      }

    Monarch-curated data for the HP to EOM mapping is stored at
        https://phenotype-ontologies.googlecode.com

    Since this resource is so small, the entirety of it is the "test" set.

    """

    # we are using the production view here; should we be using services?
    tables = [
        'dvp.pr_nlx_157874_1'
    ]

    files = {
        'map': {
            'file': 'hp-to-eom-mapping.tsv',
            'url': 'https://phenotype-ontologies.googlecode.com/svn/trunk/src/ontology/hp/mappings/hp-to-eom-mapping.tsv'
        }
    }

    def __init__(self):
        super().__init__('eom')
        self.namespaces.update(curie_map.get())

        # update the dataset object with details about this resource
        # TODO put this into a conf file?
        self.dataset = Dataset(
            'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
            'http://www.genome.gov/copyright.cfm',
            'https://creativecommons.org/publicdomain/mark/1.0/')

        # check if config exists; if it doesn't, error out and let user know
        if 'dbauth' not in config.get_config() or \
                'disco' not in config.get_config()['dbauth']:
            logger.error("not configured with PG user/password.")

        # source-specific warnings.  will be cleared when resolved.

        return

    def fetch(self, is_dl_forced=False):
        '''create the connection details for DISCO'''

        cxn = config.get_config()['dbauth']['disco']
        cxn.update(
            {'host': 'nif-db.crbs.ucsd.edu', 'database': 'disco_crawler',
             'port': 5432})

        self.dataset.setFileAccessUrl(
            ''.join(('jdbc:postgresql://', cxn['host'], ':', str(cxn['port']),
                    '/', cxn['database'])))
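        # yields e.g. jdbc:postgresql://nif-db.crbs.ucsd.edu:5432/disco_crawler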

        # process the tables
        # self.fetch_from_pgdb(self.tables,cxn,100)  #for testing
        self.fetch_from_pgdb(self.tables, cxn)

        self.get_files(is_dl_forced)

        # FIXME: Everything needed for data provenance?
        st = os.stat('/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')))
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        self.dataset.setVersion(filedate)

        return

    def parse(self, limit=None):
        '''
            Override Source.parse, inherited via PostgreSQLSource
        '''

        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        if self.testOnly:
            self.testMode = True

        logger.info("Parsing files...")

        self._process_nlx_157874_1_view('/'.join((self.rawdir,
                                                  'dvp.pr_nlx_157874_1')),
                                        limit)
        self._map_eom_terms('/'.join((self.rawdir, self.files['map']['file'])),
                            limit)

        logger.info("Finished parsing.")

        self.load_bindings()

        # since it's so small,
        # we default to copying the entire graph to the test set
        self.testgraph = self.graph

        logger.info("Found %s nodes", len(self.graph))
        return

    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        This table contains the Elements of Morphology data that has been
        screen-scraped into DISCO.
        Note that foaf:depiction is the inverse of the foaf:depicts
        relationship.

        Since it is bad form to have two definitions,
        we concatenate the two into one string.

        Triples:
            <eom id> a owl:Class
                rdf:label Literal(eom label)
                OIO:hasExactSynonym Literal(synonym list)
                OIO:hasRelatedSynonym Literal(replaced-term list)
                IAO:definition Literal(objective_def. subjective def)
                foaf:depiction Literal(small_image_url),
                               Literal(large_image_url)
                foaf:page Literal(page_url)
                rdfs:comment Literal(long commented text)


        :param raw:
        :param limit:
        :return:
        """

        gu = GraphUtils(curie_map.get())
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
            for line in filereader:
                line_counter += 1
                (morphology_term_id, morphology_term_num,
                 morphology_term_label, morphology_term_url,
                 terminology_category_label, terminology_category_url,
                 subcategory, objective_definition, subjective_definition,
                 comments, synonyms, replaces, small_figure_url,
                 large_figure_url, e_uid, v_uid, v_uuid,
                 v_last_modified) = line

                # note:
                # e_uid v_uuid v_last_modified terminology_category_url
                # subcategory v_uid morphology_term_num
                # terminology_category_label hp_label notes
                # are currently unused.

                # Add morphology term to graph as a class
                # with label, type, and description.
                gu.addClassToGraph(self.graph, morphology_term_id,
                                   morphology_term_label)

                # Assemble the description text

                if subjective_definition != '' and not (
                        re.match(r'.+\.$', subjective_definition)):
                    # add a trailing period.
                    subjective_definition = subjective_definition.strip() + '.'
                if objective_definition != '' and not (
                        re.match(r'.+\.$', objective_definition)):
                    # add a trailing period.
                    objective_definition = objective_definition.strip() + '.'

                definition = \
                    '  '.join(
                        (objective_definition, subjective_definition)).strip()

                gu.addDefinition(self.graph, morphology_term_id, definition)

                # <term id> FOAF:depicted_by literal url
                # <url> type foaf:depiction

                # do we want both images?
                # morphology_term_id has depiction small_figure_url
                if small_figure_url != '':
                    gu.addDepiction(self.graph, morphology_term_id,
                                    small_figure_url)

                # morphology_term_id has depiction large_figure_url
                if large_figure_url != '':
                    gu.addDepiction(self.graph, morphology_term_id,
                                    large_figure_url)

                # morphology_term_id has comment comments
                if comments != '':
                    gu.addComment(self.graph, morphology_term_id,
                                  comments.strip())

                if synonyms != '':
                    for s in synonyms.split(';'):
                        gu.addSynonym(
                            self.graph, morphology_term_id, s.strip(),
                            gu.properties['hasExactSynonym'])

                # morphology_term_id hasRelatedSynonym replaces (; delimited)
                if replaces != '' and replaces != synonyms:
                    for s in replaces.split(';'):
                        gu.addSynonym(
                            self.graph, morphology_term_id, s.strip(),
                            gu.properties['hasRelatedSynonym'])

                # morphology_term_id has page morphology_term_url
                gu.addPage(self.graph, morphology_term_id, morphology_term_url)

                if limit is not None and line_counter > limit:
                    break
        return

    def _map_eom_terms(self, raw, limit=None):
        """
        This table contains the HP ID mappings from the local tsv file.
        Triples:
            <eom id> owl:equivalentClass <hp id>
        :param raw:
        :param limit:
        :return:
        """

        gu = GraphUtils(curie_map.get())

        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            for line in f1:
                line_counter += 1

                (morphology_term_id, morphology_term_label, hp_id, hp_label,
                 notes) = line.split('\t')

                # Sub out the underscores for colons.
                hp_id = re.sub('_', ':', hp_id)
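                # e.g. 'HP_0000175' -> 'HP:0000175'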
                if re.match(".*HP:.*", hp_id):
                    # add the HP term as a class
                    gu.addClassToGraph(self.graph, hp_id, None)
                    # Add the HP ID as an equivalent class
                    gu.addEquivalentClass(
                        self.graph, morphology_term_id, hp_id)
                else:
                    logger.warning('No matching HP term for %s',
                                   morphology_term_label)

                if limit is not None and line_counter > limit:
                    break

        return

    def getTestSuite(self):
        import unittest
        # TODO PYLINT: Unable to import 'tests.test_eom'
        from tests.test_eom import EOMTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(EOMTestCase)

        return test_suite
Example #3
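# Assumed imports for this excerpt (dipper module paths may vary by version):
import csv
import logging
import os
import re
from datetime import datetime
from stat import ST_CTIME

from dipper import curie_map
from dipper.models.Dataset import Dataset
from dipper.models.Genotype import Genotype
from dipper.models.Reference import Reference
from dipper.sources.Source import Source
from dipper.utils.GraphUtils import GraphUtils

logger = logging.getLogger(__name__)
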
class MMRRC(Source):
    """
    Here we process the Mutant Mouse Resource and Research Center
    (https://www.mmrrc.org) strain data,
    which includes:
    *  strains, their mutant alleles
    *  phenotypes of the alleles
    *  descriptions of the research uses of the strains

    Note that some gene identifiers are not included in the raw data
    (for many of the transgenics with human genes).
    We do our best to process the links between the variant and
    the affected gene, but sometimes the mapping is not clear,
    and we do not include it.
    Many of these details will be solved by merging this source with
    the MGI data source, which has the variant-to-gene designations.

    Also note that even though the strain pages at the MMRRC site do list
    phenotypic differences in the context of the strain backgrounds,
    they do not provide that data to us,
    and thus we cannot supply that disambiguation here.
    """

    files = {
        'catalog': {
            'file': 'mmrrc_catalog_data.csv',
            'url': 'https://www.mmrrc.org/about/mmrrc_catalog_data.csv'},
    }

    test_ids = [
        'MMRRC:037507-MU', 'MMRRC:041175-UCD', 'MMRRC:036933-UNC',
        'MMRRC:037884-UCD', 'MMRRC:000255-MU', 'MMRRC:037372-UCD',
        'MMRRC:000001-UNC'
    ]

    def __init__(self):
        Source.__init__(self, 'mmrrc')
        self.strain_hash = {}
        self.id_label_hash = {}
        self.load_bindings()
        self.dataset = Dataset(
            'mmrrc', 'Mutant Mouse Regional Resource Centers',
            'https://www.mmrrc.org', None,
            'https://www.mmrrc.org/about/data_download.php')

        return

    def fetch(self, is_dl_forced=False):

        self.get_files(is_dl_forced)
        fname = '/'.join((self.rawdir, self.files['catalog']['file']))
        st = os.stat(fname)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")

        # TODO note: can set the data version to what is in the header
        # first line like:
        # This MMRRC catalog data file was generated on 2015-04-22

        self.dataset.setVersion(filedate)

        return

    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        self._process_phenotype_data(limit)

        logger.info("Finished parsing.")

        return

    def _process_phenotype_data(self, limit):
        """
        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        gu = GraphUtils(curie_map.get())
        fname = '/'.join((self.rawdir, self.files['catalog']['file']))

        self.strain_hash = {}
        self.id_label_hash = {}
        genes_with_no_ids = set()
        stem_cell_class = 'CL:0000034'
        mouse_taxon = 'NCBITaxon:10090'
        geno = Genotype(g)
        with open(fname, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # skip the first 3 lines which are header, etc.
                if line_counter < 4:
                    continue

                (strain_id, strain_label, strain_type_symbol, strain_state,
                 mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
                 mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
                 mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
                 research_areas) = row

                if self.testMode and (strain_id not in self.test_ids):
                    continue

                # strip off stuff after the dash -
                # is the holding center important?
                # MMRRC:00001-UNC --> MMRRC:00001
                strain_id = re.sub(r'-\w+$', '', strain_id)

                self.id_label_hash[strain_id] = strain_label

                # get the variant or gene to save for later building of
                # the genotype
                if strain_id not in self.strain_hash:
                    self.strain_hash[strain_id] = {'variants': set(),
                                                   'genes': set()}

                # clean up the bad one
                if mgi_allele_id == 'multiple mutation':
                    logger.error("Erroneous gene id: %s", mgi_allele_id)
                    mgi_allele_id = ''

                if mgi_allele_id != '':
                    self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                    self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                    # use the following if needing to add the
                    # sequence alteration types
                    # var_type =
                    #   self._get_variant_type_from_abbrev(mutation_type)
                    # make a sequence alteration for this variant locus,
                    # and link the variation type to it
                    # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                    # if self.nobnodes:
                    #     sa_id = ':'+sa_id
                    # gu.addIndividualToGraph(g, sa_id, None, var_type)
                    # geno.addSequenceAlterationToVariantLocus(sa_id,
                    #                                          mgi_allele_id)

                # scrub out any spaces
                mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
                if mgi_gene_id.strip() != '':
                    if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                        mgi_gene_id = re.sub(r'Gene\s*ID:\s*', 'NCBIGene:',
                                             mgi_gene_id)
                    elif not re.match(r'MGI', mgi_gene_id):
                        logger.info("Gene id not recognized: %s", mgi_gene_id)
                        if re.match(r'\d+$', mgi_gene_id):
                            # assume that if it's all numbers, then it's MGI
                            mgi_gene_id = 'MGI:'+str(mgi_gene_id)
                            logger.info("Assuming numerics are MGI.")
                    self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                    self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

                # catch some errors -
                # some things have gene labels, but no identifiers - report
                if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                    logger.error(
                        "Gene label with no identifier for strain %s: %s",
                        strain_id, mgi_gene_symbol)
                    genes_with_no_ids.add(mgi_gene_symbol.strip())
                    # make a temp id for genes that aren't identified
                    # tmp_gene_id = '_'+mgi_gene_symbol
                    # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol
                    # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

                # split apart the mp ids
                # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
                # mp_ids are now a comma delimited list
                # with MP terms in brackets
                phenotype_ids = []
                if mp_ids != '':
                    for i in re.split(r',', mp_ids):
                        i = i.strip()
                        mps = re.search(r'\[(.*)\]', i)
                        if mps is not None:
                            mp_id = mps.group(1).strip()
                            phenotype_ids.append(mp_id)

                # pubmed ids are space delimited
                pubmed_ids = []
                if pubmed_nums.strip() != '':
                    for i in re.split(r'\s+', pubmed_nums):
                        pmid = 'PMID:'+i.strip()
                        pubmed_ids.append(pmid)
                        r = Reference(pmid,
                                      Reference.ref_types['journal_article'])
                        r.addRefToGraph(g)

                # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
                # is a good example of 4 genotype parts

                gu.addClassToGraph(g, mouse_taxon, None)
                if research_areas.strip() == '':
                    research_areas = None
                else:
                    research_areas = 'Research Areas: '+research_areas
                strain_type = mouse_taxon
                if strain_state == 'ES':
                    strain_type = stem_cell_class
                gu.addIndividualToGraph(
                    g, strain_id, strain_label, strain_type,
                    research_areas)  # an inst of mouse??
                gu.makeLeader(g, strain_id)

                # phenotypes are associated with the alleles
                for pid in phenotype_ids:
                    # assume the phenotype label is in the ontology
                    gu.addClassToGraph(g, pid, None)
                    if mgi_allele_id is not None and mgi_allele_id != '':
                        assoc = G2PAssoc(self.name, mgi_allele_id, pid,
                                         gu.object_properties['has_phenotype'])
                        for p in pubmed_ids:
                            assoc.add_source(p)
                        assoc.add_association_to_graph(g)
                    else:
                        logger.info("Phenotypes and no allele for %s",
                                    strain_id)

                if not self.testMode and (
                        limit is not None and line_counter > limit):
                    break

            # now that we've collected all of the variant information, build it
            # we don't know their zygosities
            for s in self.strain_hash:
                h = self.strain_hash.get(s)
                variants = h['variants']
                genes = h['genes']
                vl_set = set()
                # make variant loci for each gene
                if len(variants) > 0:
                    for v in variants:
                        vl_id = v
                        vl_symbol = self.id_label_hash[vl_id]
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        vl_set.add(vl_id)
                        if len(variants) == 1 and len(genes) == 1:
                            for gene in genes:
                                geno.addAlleleOfGene(vl_id, gene)
                        else:
                            geno.addAllele(vl_id, vl_symbol)
                else:  # len(variants) == 0
                    # it's just anonymous variants in some gene
                    for gene in genes:
                        vl_id = '_'+gene+'-VL'
                        vl_id = re.sub(r':', '', vl_id)
                        if self.nobnodes:
                            vl_id = ':'+vl_id
                        vl_symbol = self.id_label_hash[gene]+'<?>'
                        self.id_label_hash[vl_id] = vl_symbol
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        geno.addGene(gene, self.id_label_hash[gene])
                        geno.addAlleleOfGene(vl_id, gene)
                        vl_set.add(vl_id)

                # make the vslcs
                vl_list = sorted(vl_set)
                vslc_list = []
                for vl in vl_list:
                    # for unknown zygosity
                    vslc_id = '_'+re.sub(r'^_', '', vl)+'U'
                    vslc_id = re.sub(r':', '', vslc_id)
                    if self.nobnodes:
                        vslc_id = ':' + vslc_id
                    vslc_label = self.id_label_hash[vl] + '/?'
                    self.id_label_hash[vslc_id] = vslc_label
                    vslc_list.append(vslc_id)
                    geno.addPartsToVSLC(
                        vslc_id, vl, None, geno.zygosity['indeterminate'],
                        geno.object_properties['has_alternate_part'], None)
                    gu.addIndividualToGraph(
                        g, vslc_id, vslc_label,
                        geno.genoparts['variant_single_locus_complement'])
                if len(vslc_list) > 0:
                    if len(vslc_list) > 1:
                        gvc_id = '-'.join(vslc_list)
                        gvc_id = re.sub(r':', '', gvc_id)
                        if self.nobnodes:
                            gvc_id = ':'+gvc_id
                        gvc_label = \
                            '; '.join(self.id_label_hash[v] for v in vslc_list)
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])
                        for vslc_id in vslc_list:
                            geno.addVSLCtoParent(vslc_id, gvc_id)
                    else:
                        # the GVC == VSLC, so don't have to make an extra piece
                        gvc_id = vslc_list.pop()
                        gvc_label = self.id_label_hash[gvc_id]

                    genotype_label = gvc_label + ' [n.s.]'
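                    # the '[n.s.]' suffix marks the not-specified genomic
                    # background constructed below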
                    bkgd_id = \
                        '_' + re.sub(r':', '', '-'.join(
                            (geno.genoparts['unspecified_genomic_background'],
                             s)))
                    genotype_id = '-'.join((gvc_id, bkgd_id))
                    if self.nobnodes:
                        bkgd_id = ':'+bkgd_id
                    geno.addTaxon(mouse_taxon, bkgd_id)
                    geno.addGenomicBackground(
                        bkgd_id, 'unspecified ('+s+')',
                        geno.genoparts['unspecified_genomic_background'],
                        "A placeholder for the " +
                        "unspecified genetic background for "+s)
                    geno.addGenomicBackgroundToGenotype(
                        bkgd_id, genotype_id,
                        geno.genoparts['unspecified_genomic_background'])
                    geno.addParts(
                        gvc_id, genotype_id,
                        geno.object_properties['has_alternate_part'])
                    geno.addGenotype(genotype_id, genotype_label)
                    gu.addTriple(
                        g, s, geno.object_properties['has_genotype'],
                        genotype_id)
                else:
                    # logger.debug(
                    #   "Strain %s is not making a proper genotype.", s)
                    pass

            gu.loadProperties(
                g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
            gu.loadProperties(
                g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
            gu.loadProperties(
                g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
            gu.loadAllProperties(g)

            logger.warning(
                "The following gene symbols did not list identifiers: %s",
                str(sorted(list(genes_with_no_ids))))

        return

    @staticmethod
    def _get_variant_type_from_abbrev(abbrev):
        """
        All variants are generically typed as "sequence_alterations"
        unless otherwise stated.
        :param abbrev:
        :return:

        """
        variant_type = None

        var_dict = {
            'SM': 'SO:0001059',  # spontaneous mutation
            'TM': 'SO:0001059',  # targeted mutation
            'TG': 'SO:xxxxxxx',  # transgenic
            'GT': 'SO:0001059',  # gene trap
            'CI': 'SO:0001059',  # chemically induced mutation
            'RAD': 'SO:0001059',  # radiation induced mutation
            # chromosomal aberration --> chromosomal structure variation
            'CH': 'SO:1000183',
            'RB': 'SO:1000043',  # Robertsonian translocation
            'TL': 'SO:1000048',  # reciprocal translocation
            'TP': 'SO:0000453',  # transposition
            'INV': 'SO:1000036',  # inversion
            'INS': 'SO:0000667',  # insertion
            'DEL': 'SO:0000159',  # deletion
            'DP': 'SO:1000035',  # duplication
            'OTH': 'SO:0001059'  # other
        }
        if abbrev in var_dict:
            variant_type = var_dict[abbrev]
        else:
            logger.warning("Variant type not recognized: %s", abbrev)

        return variant_type

    def getTestSuite(self):
        import unittest
        from tests.test_mmrrc import MMRRCTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(MMRRCTestCase)

        return test_suite
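
A minimal driver for the source above might look like the following sketch; it assumes the class is named MMRRC (per the test import above) and that fetch() follows the same is_dl_forced signature as the other Source subclasses in this collection:

if __name__ == '__main__':
    # hypothetical usage sketch, not part of the original source
    mmrrc = MMRRC()
    mmrrc.fetch(is_dl_forced=False)  # download/refresh the raw catalog file
    mmrrc.parse(limit=100)           # build the graph from the first 100 rows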
Example #4
class CTD(Source):
    """
    The Comparative Toxicogenomics Database (CTD) includes curated data
    describing cross-species chemical–gene/protein interactions and
    chemical– and gene–disease associations to illuminate molecular mechanisms
    underlying variable susceptibility and environmentally influenced diseases.

    Here, we fetch, parse, and convert data from CTD into triples,
    leveraging only the associations based on DIRECT evidence
    (not using the inferred associations).
    We currently process the following associations:
    * chemical-disease
    * gene-pathway
    * gene-disease

    CTD curates relationships between genes and chemicals/diseases with
    marker/mechanism and/or therapeutic.
    Unfortunately, we cannot disambiguate between marker (gene expression) and
    mechanism (causation) for these associations.  Therefore, we are left to
    relate these simply by "marker".

    CTD also pulls in genes and pathway membership from KEGG and REACTOME.
    We create groups of these following the pattern that the specific pathway
    is a subclass of 'cellular process' (a go process), and the gene is
    "involved in" that process.

    For diseases, we preferentially use OMIM identifiers when they can be used
    uniquely over MESH.  Otherwise, we use MESH ids.

    Note that we scrub the following identifiers and their associated data:
    * REACT:REACT_116125 - generic disease class
    * MESH:D004283 - dog diseases
    * MESH:D004195 - disease models, animal
    * MESH:D030342 - genetic diseases, inborn
    * MESH:D040181 - genetic diseases, X-linked
    * MESH:D020022 - genetic predisposition to a disease
    """

    files = {
        'chemical_disease_interactions': {
            'file': 'CTD_chemicals_diseases.tsv.gz',
            'url': 'http://ctdbase.org/reports/CTD_chemicals_diseases.tsv.gz'
        },
        'gene_pathway': {
            'file': 'CTD_genes_pathways.tsv.gz',
            'url': 'http://ctdbase.org/reports/CTD_genes_pathways.tsv.gz'
        },
        'gene_disease': {
            'file': 'CTD_genes_diseases.tsv.gz',
            'url': 'http://ctdbase.org/reports/CTD_genes_diseases.tsv.gz'
        }
    }
    static_files = {
        'publications': {'file': 'CTD_curated_references.tsv'}
    }

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'ctd')
        self.dataset = Dataset(
            'ctd', 'CTD', 'http://ctdbase.org', None,
            'http://ctdbase.org/about/legal.jsp')

        if 'test_ids' not in config.get_config() \
                or 'gene' not in config.get_config()['test_ids']:
            logger.warning("not configured with gene test ids.")
            self.test_geneids = []
        else:
            self.test_geneids = config.get_config()['test_ids']['gene']

        if 'test_ids' not in config.get_config() \
                or 'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_diseaseids = []
        else:
            self.test_diseaseids = config.get_config()['test_ids']['disease']

        self.g = self.graph
        self.geno = Genotype(self.graph)
        self.pathway = Pathway(self.graph)

        return

    def fetch(self, is_dl_forced=False):
        """
        Override Source.fetch()
        Fetches resources from CTD using the CTD.files dictionary
        Args:
        :param is_dl_forced (bool): Force download
        Returns:
        :return None
        """
        self.get_files(is_dl_forced)

        self._fetch_disambiguating_assoc()

        # consider creating subsets of the files that
        # only have direct annotations (not inferred)
        return

    def parse(self, limit=None):
        """
        Override Source.parse()
        Parses version and interaction information from CTD
        Args:
        :param limit (int, optional) limit the number of rows processed
        Returns:
        :return None
        """
        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")
        # pub_map = dict()
        # file_path = '/'.join((self.rawdir,
        # self.static_files['publications']['file']))
        # if os.path.exists(file_path) is True:
        #     pub_map = self._parse_publication_file(
        #         self.static_files['publications']['file']
        #     )

        if self.testOnly:
            self.testMode = True

        if self.testMode:
            self.g = self.testgraph
        else:
            self.g = self.graph
        self.geno = Genotype(self.g)
        self.pathway = Pathway(self.g)

        self._parse_ctd_file(
            limit, self.files['chemical_disease_interactions']['file'])
        self._parse_ctd_file(limit, self.files['gene_pathway']['file'])
        self._parse_ctd_file(limit, self.files['gene_disease']['file'])
        self._parse_curated_chem_disease(limit)

        logger.info("Done parsing files.")

        return

    def _parse_ctd_file(self, limit, file):
        """
        Parses files in CTD.files dictionary
        Args:
            :param limit (int): limit the number of rows processed
            :param file (str): file name (must be defined in CTD.files)
        Returns:
            :return None
        """
        row_count = 0
        version_pattern = re.compile(r'^# Report created: (.+)$')
        is_versioned = False
        file_path = '/'.join((self.rawdir, file))
        with gzip.open(file_path, 'rt') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                # Scan the header lines until we get the version
                # There is no official version, so we are using
                # the upload timestamp instead
                if is_versioned is False:
                    match = re.match(version_pattern, ' '.join(row))
                    if match:
                        version = re.sub(r'\s|:', '-', match.group(1))
                        # TODO convert this timestamp to a proper timestamp
                        self.dataset.setVersion(version)
                        is_versioned = True
                elif re.match(r'^#', ' '.join(row)):
                    pass
                else:
                    row_count += 1
                    if file == self.files[
                            'chemical_disease_interactions']['file']:
                        self._process_interactions(row)
                    elif file == self.files['gene_pathway']['file']:
                        self._process_pathway(row)
                    elif file == self.files['gene_disease']['file']:
                        self._process_disease2gene(row)

                if not self.testMode and \
                        limit is not None and row_count >= limit:
                    break

        return

    def _process_pathway(self, row):
        """
        Process row of CTD data from CTD_genes_pathways.tsv.gz
        and generate triples
        Args:
            :param row (list): row of CTD data
        Returns:
            :return None
        """
        model = Model(self.g)
        self._check_list_len(row, 4)
        (gene_symbol, gene_id, pathway_name, pathway_id) = row

        if self.testMode and (int(gene_id) not in self.test_geneids):
            return

        entrez_id = 'NCBIGene:' + gene_id

        pathways_to_scrub = [
            'REACT:REACT_116125',  # disease
            "REACT:REACT_111045",  # developmental biology
            "REACT:REACT_200794",  # Mus musculus biological processes
            "REACT:REACT_13685"]   # neuronal system ?

        if pathway_id in pathways_to_scrub:
            # these are lame "pathways" like generic
            # "disease" and "developmental biology"
            return

        # convert KEGG pathway ids... KEGG:12345 --> KEGG-path:map12345
        if re.match(r'KEGG', pathway_id):
            pathway_id = re.sub(r'KEGG:', 'KEGG-path:map', pathway_id)
        # just in case, add it as a class
        model.addClassToGraph(entrez_id, None)

        self.pathway.addPathway(pathway_id, pathway_name)
        self.pathway.addGeneToPathway(entrez_id, pathway_id)

        return

    def _fetch_disambiguating_assoc(self):
        """
        For any of the items in the chemical-disease association file that have
        ambiguous association types we fetch the disambiguated associations
        using the batch query API, and store these in a file. Elsewhere, we can
        loop through the file and create the appropriate associations.

        :return:

        """

        disambig_file = '/'.join(
            (self.rawdir, self.static_files['publications']['file']))
        assoc_file = '/'.join(
            (self.rawdir, self.files['chemical_disease_interactions']['file']))

        # check for a local disambiguation file, and re-download it
        # if it is dated earlier than the chem-disease association file
        if os.path.exists(disambig_file):
            dfile_dt = os.stat(disambig_file).st_mtime
            afile_dt = os.stat(assoc_file).st_mtime
            if dfile_dt < afile_dt:
                logger.info(
                    "Local file date before chem-disease assoc file. "
                    " Downloading...")
            else:
                logger.info(
                    "Local file date after chem-disease assoc file. "
                    " Skipping download.")
                return

        all_pubs = set()
        dual_evidence = re.compile(r'^marker\/mechanism\|therapeutic$')
        # first get all the unique publications
        with gzip.open(assoc_file, 'rt') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                if re.match(r'^#', ' '.join(row)):
                    continue
                self._check_list_len(row, 10)
                (chem_name, chem_id, cas_rn, disease_name, disease_id,
                 direct_evidence, inferred_gene_symbol, inference_score,
                 omim_ids, pubmed_ids) = row
                if direct_evidence == '' or not \
                        re.match(dual_evidence, direct_evidence):
                    continue
                if pubmed_ids is not None and pubmed_ids != '':
                    all_pubs.update(set(re.split(r'\|', pubmed_ids)))
        sorted_pubs = sorted(list(all_pubs))

        # now in batches of 4000, we fetch the chemical-disease associations
        batch_size = 4000
        params = {
            'inputType': 'reference',
            'report': 'diseases_curated',
            'format': 'tsv',
            'action': 'Download'
        }

        url = 'http://ctdbase.org/tools/batchQuery.go?q'
        start = 0
        end = min((batch_size, len(sorted_pubs)))  # batches of 4000
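        # e.g., with 9,500 sorted pubs and batch_size 4000, the windows
        # are [0:4000], [4000:8000], [8000:9500]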

        with open(disambig_file, 'wb') as f:
            while start < len(sorted_pubs):
                params['inputTerms'] = '|'.join(sorted_pubs[start:end])
                # fetch the data from url
                logger.info(
                    'fetching %d (%d-%d) refs: %s',
                    len(re.split(r'\|', params['inputTerms'])),
                    start, end, params['inputTerms'])
                data = urllib.parse.urlencode(params)
                encoding = 'utf-8'
                binary_data = data.encode(encoding)
                req = urllib.request.Request(url, binary_data)
                resp = urllib.request.urlopen(req)
                f.write(resp.read())
                start = end
                end = min((start + batch_size, len(sorted_pubs)))

        return

    def _process_interactions(self, row):
        """
        Process row of CTD data from CTD_chemicals_diseases.tsv.gz
        and generate triples. Only create associations based on direct evidence
        (not using the inferred-via-gene), and unambiguous relationships.
        (Ambiguous ones will be processed in the sister method using the
        disambiguated file). There are no OMIM ids for diseases in these cases,
        so we associate with only the mesh disease ids.
        Args:
            :param row (list): row of CTD data
        Returns:
            :return None
        """
        model = Model(self.g)
        self._check_list_len(row, 10)
        (chem_name, chem_id, cas_rn, disease_name, disease_id, direct_evidence,
         inferred_gene_symbol, inference_score, omim_ids, pubmed_ids) = row

        if direct_evidence == '':
            return

        evidence_pattern = re.compile(r'^(therapeutic|marker\/mechanism)$')
        # dual_evidence = re.compile(r'^marker\/mechanism\|therapeutic$')

        # filter on those diseases that are mapped to omim ids in the test set
        intersect = list(
            set(['OMIM:' + str(i) for i in omim_ids.split('|')] +
                [disease_id]) & set(self.test_diseaseids))
        if self.testMode and len(intersect) < 1:
            return
        chem_id = 'MESH:' + chem_id
        reference_list = self._process_pubmed_ids(pubmed_ids)
        if re.match(evidence_pattern, direct_evidence):
            rel_id = self._get_relationship_id(direct_evidence)
            model.addClassToGraph(chem_id, chem_name)
            model.addClassToGraph(disease_id, None)
            self._make_association(chem_id, disease_id, rel_id, reference_list)
        else:
            # there's dual evidence, but haven't mapped the pubs
            pass
            # logger.debug(
            #   "Dual evidence for %s (%s) and %s (%s)",
            #   chem_name, chem_id, disease_name, disease_id)

        return

    def _process_disease2gene(self, row):
        """
        Here, we process the disease-to-gene associations.
        Note that we ONLY process direct associations
        (not inferred through chemicals).
        Furthermore, we also ONLY process "marker/mechanism" associations.

        We preferentially utilize OMIM identifiers over MESH identifiers
        for disease/phenotype.
        Therefore, if a single OMIM id is listed under the "omim_ids" list,
        we will choose this over any MeSH id that might be listed as
        the disease_id. If multiple OMIM ids are listed in the omim_ids column,
        we toss this for now.
        (Mostly, we are not sure what to do with this information.)

        We associate "some variant of gene X" with the phenotype,
        rather than the gene directly.

        We also pull in the MeSH labels here (but not OMIM) to ensure that
        we have them (as they may not be brought in separately).
        :param row:
        :return:

        """

        # if self.testMode:
        # g = self.testgraph
        # else:
        #     g = self.graph
        # self._check_list_len(row, 9)
        # geno = Genotype(g)
        # gu = GraphUtils(curie_map.get())
        model = Model(self.g)
        (gene_symbol, gene_id, disease_name, disease_id, direct_evidence,
         inference_chemical_name, inference_score, omim_ids, pubmed_ids) = row

        # we only want the direct associations; skipping inferred for now
        if direct_evidence == '' or direct_evidence != 'marker/mechanism':
            return

        # scrub some of the associations...
        # it seems odd to link human genes to the following "diseases"
        diseases_to_scrub = [
            'MESH:D004283',  # dog diseases
            'MESH:D004195',  # disease models, animal
            'MESH:D030342',  # genetic diseases, inborn
            'MESH:D040181',  # genetic diseases, X-linked
            'MESH:D020022']   # genetic predisposition to a disease

        if disease_id in diseases_to_scrub:
            logger.info(
                "Skipping association between NCBIGene:%s and %s",
                str(gene_id), disease_id)
            return

        intersect = list(
            set(['OMIM:' + str(i) for i in omim_ids.split('|')] +
                [disease_id]) & set(self.test_diseaseids))
        if self.testMode and (
                int(gene_id) not in self.test_geneids or len(intersect) < 1):
            return

        # there are three kinds of direct evidence:
        # (marker/mechanism | marker/mechanism|therapeutic | therapeutic)
        # we are only using the "marker/mechanism" for now
        # TODO what does it mean for a gene to be therapeutic for disease?
        # a therapeutic target?

        gene_id = 'NCBIGene:' + gene_id

        preferred_disease_id = disease_id
        if omim_ids is not None and omim_ids != '':
            omim_id_list = re.split(r'\|', omim_ids)
            # If there is only one OMIM ID for the Disease ID
            # or in the omim_ids list,
            # use the OMIM ID preferentially over any MeSH ID.
            if re.match(r'OMIM:.*', disease_id):
                if len(omim_id_list) > 1:
                    # the disease ID is an OMIM ID and
                    # there is more than one OMIM entry in omim_ids.
                    # Currently no entries satisfy this condition
                    pass
                elif disease_id != ('OMIM:' + omim_ids):
                    # the disease ID is an OMIM ID and
                    # there is only one non-equiv OMIM entry in omim_ids
                    # we preferentially use the disease_id here
                    logger.warning(
                        "There may be alternate identifier for %s: %s",
                        disease_id, omim_ids)
                    # TODO: What should be done with the alternate disease IDs?
            else:
                if len(omim_id_list) == 1:
                    # the disease ID is not an OMIM ID
                    # and there is only one OMIM entry in omim_ids.
                    preferred_disease_id = 'OMIM:' + omim_ids
                elif len(omim_id_list) > 1:
                    # This is when the disease ID is not an OMIM ID and
                    # there is more than one OMIM entry in omim_ids.
                    pass

        # we actually want the association between the gene and the disease
        # to be via an alternate locus not the "wildtype" gene itself. So we
        # make an anonymous alternate locus, and put that in the association.
        alt_id = gene_id + '-' + preferred_disease_id + 'VL'
        # can't have colons in the bnodes
        alt_locus = re.sub(r':', '', alt_id)
        alt_locus = "_:" + alt_locus

        alt_label = 'some variant of ' + gene_symbol + ' that is ' \
                    + direct_evidence + ' for ' + disease_name
        model.addIndividualToGraph(
            alt_locus, alt_label,
            self.geno.genoparts['variant_locus'])
        # assume that the label gets added elsewhere
        model.addClassToGraph(gene_id, None)
        self.geno.addAffectedLocus(alt_locus, gene_id)
        model.addBlankNodeAnnotation(alt_locus)

        # not sure if MESH is getting added separately.
        # adding labels here for good measure
        dlabel = None
        if re.match(r'MESH', preferred_disease_id):
            dlabel = disease_name
        model.addClassToGraph(preferred_disease_id, dlabel)

        # Add the disease to gene relationship.
        rel_id = self._get_relationship_id(direct_evidence)
        refs = self._process_pubmed_ids(pubmed_ids)

        self._make_association(alt_locus, preferred_disease_id, rel_id, refs)

        return

    def _make_association(self, subject_id, object_id, rel_id, pubmed_ids):
        """
        Make a reified association given an array of pubmed identifiers.

        Args:
            :param subject_id  id of the subject of the association (gene/chem)
            :param object_id  id of the object of the association (disease)
            :param rel_id  relationship id
            :param pubmed_ids an array of pubmed identifiers
        Returns:
            :return None

        """

        # TODO pass in the relevant Assoc class rather than relying on G2P
        assoc = G2PAssoc(self.g, self.name, subject_id, object_id, rel_id)
        if pubmed_ids is not None and len(pubmed_ids) > 0:
            eco = self._get_evidence_code('TAS')
            for pmid in pubmed_ids:
                r = Reference(
                    self.g, pmid, Reference.ref_types['journal_article'])
                r.addRefToGraph()
                assoc.add_source(pmid)
                assoc.add_evidence(eco)

        assoc.add_association_to_graph()
        return

    @staticmethod
    def _process_pubmed_ids(pubmed_ids):
        """
        Take a list of pubmed IDs and add PMID prefix
        Args:
            :param pubmed_ids -  string representing publication
                                 ids separated by a | symbol
        Returns:
            :return list: Pubmed curies
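            e.g., '12345|6789' (illustrative) -> ['PMID:12345', 'PMID:6789']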

        """
        if pubmed_ids.strip() == '':
            id_list = []
        else:
            id_list = pubmed_ids.split('|')
        for (i, val) in enumerate(id_list):
            id_list[i] = 'PMID:' + val
        return id_list

    @staticmethod
    def _get_evidence_code(evidence):
        """
        Get curie for evidence class label
        Args:
        :param evidence (str): evidence label
        Returns:
        :return str: curie for evidence label from ECO

        """

        eco_map = {
            'TAS': 'ECO:0000033'
        }
        return eco_map[evidence]

    @staticmethod
    def _get_relationship_id(rel):
        """
        Get curie from relationship property label
        Args:
            :param rel (str): relationship label
        Returns:
            :return str: curie for relationship label
        """
        rel_map = {
            'therapeutic': Model.object_properties['substance_that_treats'],
            'marker/mechanism': Model.object_properties['is_marker_for'],
        }
        return str(rel_map[rel])

    @staticmethod
    def _get_class_id(clslab):
        """
        Get curie from CLASS_MAP dictionary
        Args:
            :param clslab (str): class label
        Returns:
            :return str: curie for class label
        """
        class_map = {
            'pathway': 'PW:0000001',
            'signal transduction': 'GO:0007165'
        }

        return class_map[clslab]

    def _parse_curated_chem_disease(self, limit):
        model = Model(self.g)
        line_counter = 0
        file_path = '/'.join(
            (self.rawdir, self.static_files['publications']['file']))
        with open(file_path, 'r') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                # catch comment lines
                if re.match(r'^#', ' '.join(row)):
                    continue
                line_counter += 1
                self._check_list_len(row, 10)
                (pub_id, disease_label, disease_id, disease_cat, evidence,
                 chem_label, chem_id, cas_rn, gene_symbol, gene_acc) = row

                if disease_id.strip() == '' or chem_id.strip() == '':
                    continue

                rel_id = self._get_relationship_id(evidence)
                chem_id = 'MESH:' + chem_id
                model.addClassToGraph(chem_id, chem_label)
                model.addClassToGraph(disease_id, None)
                if pub_id != '':
                    pub_id = 'PMID:' + pub_id
                    r = Reference(
                        self.g, pub_id,
                        Reference.ref_types['journal_article'])
                    r.addRefToGraph()
                    pubids = [pub_id]
                else:
                    pubids = None
                self._make_association(chem_id, disease_id, rel_id, pubids)

                if not self.testMode and limit is not None \
                        and line_counter >= limit:
                    break
        return

    def getTestSuite(self):
        import unittest
        from tests.test_ctd import CTDTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(CTDTestCase)
        # test_suite.addTests(
        #   unittest.TestLoader().loadTestsFromTestCase(InteractionsTestCase))

        return test_suite
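
The sliding-window pagination used in _fetch_disambiguating_assoc() can be illustrated in isolation; this is a self-contained sketch with made-up publication ids, not code from the source above:

def batches(items, batch_size=4000):
    """Yield successive slices of at most batch_size items."""
    start = 0
    while start < len(items):
        end = min(start + batch_size, len(items))
        yield items[start:end]
        start = end

# made-up ids; each joined chunk would become params['inputTerms']
pubs = sorted(str(10000 + i) for i in range(10))
for chunk in batches(pubs, batch_size=4):
    print('|'.join(chunk))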
Example #5
class BioGrid(Source):
    """
    Biogrid interaction data

    """
    # TODO write up class summary for docstring

    files = {
        'interactions': {
            'file': 'interactions.mitab.zip',
            'url': BGDL + '/BIOGRID-ALL-LATEST.mitab.zip'},
        'identifiers':  {
            'file': 'identifiers.tab.zip',
            'url': BGDL + '/BIOGRID-IDENTIFIERS-LATEST.tab.zip'}
    }

    # biogrid-specific identifiers for use in subsetting identifier mapping
    biogrid_ids = [
        106638, 107308, 107506, 107674, 107675, 108277, 108506, 108767, 108814,
        108899, 110308, 110364, 110678, 111642, 112300, 112365, 112771, 112898,
        199832, 203220, 247276, 120150, 120160, 124085]

    def __init__(self, tax_ids=None):
        super().__init__('biogrid')

        self.tax_ids = tax_ids
        self.load_bindings()

        self.dataset = Dataset(
            'biogrid', 'The BioGrid', 'http://thebiogrid.org/', None,
            'http://wiki.thebiogrid.org/doku.php/terms_and_conditions')

        # Defaults
        # our favorite animals
        # taxids = [9606,10090,10116,7227,7955,6239,8355]
        if self.tax_ids is None:
            self.tax_ids = [9606, 10090, 7955]

        if 'test_ids' not in config.get_config() or \
                'gene' not in config.get_config()['test_ids']:
            logger.warning("not configured with gene test ids.")
        else:
            self.test_ids = config.get_config()['test_ids']['gene']

        # data-source specific warnings
        # (will be removed when issues are cleared)
        logger.warning(
            "several MI experimental codes do not exactly map to ECO; "
            "using approximations.")
        return

    def fetch(self, is_dl_forced=False):
        """

        :param is_dl_forced:
        :return:  None
        """

        self.get_files(is_dl_forced)

        # the version number is encoded in the filename in the zip.
        # for example, the interactions file may unzip to
        # BIOGRID-ALL-3.2.119.mitab.txt, where the version number is 3.2.119
        f = '/'.join((self.rawdir, self.files['interactions']['file']))
        st = os.stat(f)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        with ZipFile(f, 'r') as myzip:
            flist = myzip.namelist()
            # assume that the first entry is the item
            fname = flist[0]
            # get the version from the filename
            version = \
                re.match(r'BIOGRID-ALL-(\d+\.\d+\.\d+)\.mitab\.txt', fname)

        self.dataset.setVersion(filedate, str(version.groups()[0]))

        return

    def parse(self, limit=None):
        """

        :param limit:
        :return:

        """
        if self.testOnly:
            self.testMode = True

        self._get_interactions(limit)
        self._get_identifiers(limit)

        self.load_bindings()

        logger.info("Loaded %d test graph nodes", len(self.testgraph))
        logger.info("Loaded %d full graph nodes", len(self.graph))

        return

    def _get_interactions(self, limit):
        logger.info("getting interactions")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['interactions']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        matchcounter = 0

        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip comment lines
                if re.match(r'^#', line.decode()):
                    logger.debug("Skipping header line")
                    continue
                line_counter += 1
                line = line.decode().strip()
                # print(line)
                (interactor_a, interactor_b, alt_ids_a, alt_ids_b, aliases_a,
                 aliases_b, detection_method, pub_author, pub_id, taxid_a,
                 taxid_b, interaction_type, source_db, interaction_id,
                 confidence_val) = line.split('\t')

                # get the actual gene ids,
                # typically formatted like: gene/locuslink:351|BIOGRID:106848
                gene_a_num = re.search(
                    r'locuslink\:(\d+)\|?', interactor_a).groups()[0]
                gene_b_num = re.search(
                    r'locuslink\:(\d+)\|?', interactor_b).groups()[0]

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if (int(gene_a_num) not in self.test_ids) or\
                            (int(gene_b_num) not in self.test_ids):
                        continue
                else:
                    g = self.graph
                    # when not in test mode, filter by taxon
                    if int(re.sub(r'taxid:', '', taxid_a.rstrip())) not in\
                            self.tax_ids or\
                            int(re.sub(
                                r'taxid:', '', taxid_b.rstrip())) not in\
                            self.tax_ids:
                        continue
                    else:
                        matchcounter += 1

                gene_a = 'NCBIGene:'+gene_a_num
                gene_b = 'NCBIGene:'+gene_b_num

                # get the interaction type
                # psi-mi:"MI:0407"(direct interaction)
                int_type = re.search(r'MI:\d+', interaction_type).group()
                rel = self._map_MI_to_RO(int_type)

                # scrub pubmed-->PMID prefix
                pub_id = re.sub(r'pubmed', 'PMID', pub_id)
                # remove bogus whitespace
                pub_id = pub_id.strip()

                # get the method, and convert to evidence code
                det_code = re.search(r'MI:\d+', detection_method).group()
                evidence = self._map_MI_to_ECO(det_code)

                # note that the interaction_id is some kind of internal biogrid
                # identifier that does not map to a public URI.
                # we will construct a monarch identifier from this

                assoc = InteractionAssoc(self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence)
                assoc.add_source(pub_id)
                assoc.add_association_to_graph(g)
                assoc.load_all_properties(g)

                if not self.testMode and (
                        limit is not None and line_counter > limit):
                    break

        myzip.close()

        return

    def _get_identifiers(self, limit):
        """
        This will process the id mapping file provided by Biogrid.
        The file has a very large header, which we scan past,
        then pull the identifiers, and make equivalence axioms

        :param limit:
        :return:

        """

        logger.info("getting identifier mapping")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['identifiers']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        foundheader = False

        gu = GraphUtils(curie_map.get())

        # TODO align this species filter with the one above
        # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
        # Danio rerio,Caenorhabditis elegans,Xenopus laevis'.split(',')

        speciesfilters = 'Homo sapiens,Mus musculus'.split(',')
        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip header lines
                if not foundheader:
                    if re.match(r'BIOGRID_ID', line.decode()):
                        foundheader = True
                    continue

                line = line.decode().strip()
                # BIOGRID_ID
                # IDENTIFIER_VALUE
                # IDENTIFIER_TYPE
                # ORGANISM_OFFICIAL_NAME
                # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                (biogrid_num, id_num, id_type,
                 organism_label) = line.split('\t')

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if int(biogrid_num) not in self.biogrid_ids:
                        continue
                else:
                    g = self.graph

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:'+biogrid_num
                prefix = self._map_idtype_to_prefix(id_type)

                # TODO make these filters available as commandline options
                # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
                #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
                geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')
                # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'
                if (speciesfilters is not None) \
                        and (organism_label.strip() in speciesfilters):
                    line_counter += 1
                    if (geneidtypefilters is not None) \
                            and (prefix in geneidtypefilters):
                        mapped_id = ':'.join((prefix, id_num))
                        gu.addEquivalentClass(g, biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        gu.addClassToGraph(g, biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    #   FIXME - i am not sure these are synonyms, altids?
                    #   gu.addSynonym(g,biogrid_id,id_num)

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        myzip.close()

        return

    @staticmethod
    def _map_MI_to_RO(mi_id):
        rel = InteractionAssoc.interaction_object_properties
        mi_ro_map = {
            # colocalization
            'MI:0403': rel['colocalizes_with'],
            # direct interaction
            'MI:0407': rel['interacts_with'],
            # synthetic genetic interaction defined by inequality
            'MI:0794': rel['genetically_interacts_with'],
            # suppressive genetic interaction defined by inequality
            'MI:0796': rel['genetically_interacts_with'],
            # additive genetic interaction defined by inequality
            'MI:0799': rel['genetically_interacts_with'],
            # association
            'MI:0914': rel['interacts_with'],
            # physical association
            'MI:0915': rel['interacts_with']
        }

        ro_id = rel['interacts_with']  # default
        if mi_id in mi_ro_map:
            ro_id = mi_ro_map.get(mi_id)

        return ro_id

    @staticmethod
    def _map_MI_to_ECO(mi_id):
        eco_id = 'ECO:0000006'  # default to experimental evidence
        mi_to_eco_map = {
            'MI:0018': 'ECO:0000068',  # yeast two-hybrid
            'MI:0004': 'ECO:0000079',  # affinity chromatography
            'MI:0047': 'ECO:0000076',  # far western blotting
            'MI:0055': 'ECO:0000021',  # should be FRET, but using physical_interaction FIXME
            'MI:0090': 'ECO:0000012',  # desired: protein complementation, using: functional complementation
            'MI:0096': 'ECO:0000085',  # desired: pull down, using: immunoprecipitation
            'MI:0114': 'ECO:0000324',  # desired: x-ray crystallography, using: imaging assay
            'MI:0254': 'ECO:0000011',  # desired: genetic interference, using: genetic interaction evidence
            'MI:0401': 'ECO:0000172',  # desired: biochemical, using: biochemical trait evidence
            'MI:0415': 'ECO:0000005',  # desired: enzymatic study, using: enzyme assay evidence
            'MI:0428': 'ECO:0000324',  # imaging
            'MI:0686': 'ECO:0000006',  # desired: unspecified, using: experimental evidence
            'MI:1313': 'ECO:0000006'   # None?
        }
        if mi_id in mi_to_eco_map:
            eco_id = mi_to_eco_map.get(mi_id)
        else:
            logger.warning(
                "unmapped code %s. Defaulting to experimental_evidence", mi_id)

        return eco_id

    @staticmethod
    def _map_idtype_to_prefix(idtype):
        """
        Here we need to reformat the BioGrid source prefixes
        to standard ones used in our curie-map.
        :param idtype:
        :return:

        """
        prefix = idtype
        idtype_to_prefix_map = {
            'XENBASE': 'XenBase',
            'TREMBL': 'TrEMBL',
            'MGI': 'MGI',
            'REFSEQ_DNA_ACCESSION': 'RefSeqNA',
            'MAIZEGDB': 'MaizeGDB',
            'BEEBASE': 'BeeBase',
            'ENSEMBL': 'ENSEMBL',
            'TAIR': 'TAIR',
            'GENBANK_DNA_GI': 'NCBIgi',
            'CGNC': 'CGNC',
            'RGD': 'RGD',
            'GENBANK_GENOMIC_DNA_GI': 'NCBIgi',
            'SWISSPROT': 'Swiss-Prot',
            'MIM': 'OMIM',
            'FLYBASE': 'FlyBase',
            'VEGA': 'VEGA',
            'ANIMALQTLDB': 'AQTLDB',
            'ENTREZ_GENE_ETG': 'ETG',
            'HPRD': 'HPRD',
            'APHIDBASE': 'APHIDBASE',
            'GENBANK_PROTEIN_ACCESSION': 'NCBIProtein',
            'ENTREZ_GENE': 'NCBIGene',
            'SGD': 'SGD',
            'GENBANK_GENOMIC_DNA_ACCESSION': 'NCBIGenome',
            'BGD': 'BGD',
            'WORMBASE': 'WormBase',
            'ZFIN': 'ZFIN',
            'DICTYBASE': 'dictyBase',
            'ECOGENE': 'ECOGENE',
            'BIOGRID': 'BIOGRID',
            'GENBANK_DNA_ACCESSION': 'NCBILocus',
            'VECTORBASE': 'VectorBase',
            'MIRBASE': 'miRBase',
            'IMGT/GENE-DB': 'IGMT',
            'HGNC': 'HGNC',
            'SYSTEMATIC_NAME': None,
            'OFFICIAL_SYMBOL': None,
            'REFSEQ_GENOMIC_DNA_ACCESSION': 'NCBILocus',
            'GENBANK_PROTEIN_GI': 'NCBIgi',
            'REFSEQ_PROTEIN_ACCESSION': 'RefSeqProt',
            'SYNONYM': None,
            'GRID_LEGACY': None,
            # the following showed up in 3.3.124
            'UNIPROT-ACCESSION': 'UniprotKB',
            'SWISS-PROT': 'Swiss-Prot',
            'OFFICIAL SYMBOL': None,
            'ENSEMBL RNA': None,
            'GRID LEGACY': None,
            'ENSEMBL PROTEIN': None,
            'REFSEQ-RNA-GI': None,
            'REFSEQ-RNA-ACCESSION': None,
            'REFSEQ-PROTEIN-GI': None,
            'REFSEQ-PROTEIN-ACCESSION-VERSIONED': None,
            'REFSEQ-PROTEIN-ACCESSION': None,
            'REFSEQ-LEGACY': None,
            'SYSTEMATIC NAME': None,
            'ORDERED LOCUS': None,
            'UNIPROT-ISOFORM': 'UniprotKB',
            'ENSEMBL GENE': 'ENSEMBL',
            'CGD': None,  # Not sure what this is?
            'WORMBASE-OLD': 'WormBase'

        }
        if idtype in idtype_to_prefix_map:
            prefix = idtype_to_prefix_map.get(idtype)
        else:
            logger.warning("unmapped prefix %s", prefix)

        return prefix

    def getTestSuite(self):
        import unittest
        from tests.test_biogrid import BioGridTestCase
        # TODO add InteractionAssoc tests
        # TODO add test about if all prefixes are mapped?

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(BioGridTestCase)

        return test_suite
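
The gene-id extraction in _get_interactions() hinges on the 'gene/locuslink' token inside the MITAB interactor columns; here is a standalone sketch against a fabricated interactor string:

import re

# fabricated MITAB interactor field, following the format noted in the code
interactor = 'entrez gene/locuslink:351|BIOGRID:106331'
gene_num = re.search(r'locuslink:(\d+)\|?', interactor).groups()[0]
print('NCBIGene:' + gene_num)  # -> NCBIGene:351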
Example #6
class HPOAnnotations(Source):
    """
    The [Human Phenotype Ontology](http://human-phenotype-ontology.org) group curates and assembles
    over 115,000 annotations to hereditary diseases using the HPO ontology.
    Here we create OBAN-style associations between diseases and phenotypic features, together with their
    evidence, and age of onset and frequency (if known).
    The parser currently only processes the "abnormal" annotations.  Association to "remarkable normality"
    will be added in the near future.

    In order to properly test this class, you should have a conf.json file configured with some test ids, in
    the structure of:
        <pre>
        test_ids: {
            "disease" : ["OMIM:119600", "OMIM:120160"]  # as examples.  put your favorite ids in the config.
        }
        </pre>
    """

    files = {
        'annot': {
            'file': 'phenotype_annotation.tab',
            'url': 'http://compbio.charite.de/hudson/job/hpo.annotations/lastStableBuild/artifact/misc/phenotype_annotation.tab'},
        'version': {
            'file': 'data_version.txt',
            'url': 'http://compbio.charite.de/hudson/job/hpo.annotations/lastStableBuild/artifact/misc/data_version.txt'},
        # 'neg_annot': {
        #     'file': 'phenotype_annotation.tab',
        #     'url': 'http://compbio.charite.de/hudson/job/hpo.annotations/lastStableBuild/artifact/misc/negative_phenotype_annotation.tab'},
    }

    # note, two of these codes are awaiting term requests.  see #114 and
    # https://code.google.com/p/evidenceontology/issues/detail?id=32
    eco_dict = {
        "ICE": "ECO:0000305",  # FIXME currently using "curator inference used in manual assertion"
        "IEA": "ECO:0000501",  # Inferred from Electronic Annotation
        "PCS": "ECO:0000269",  # FIXME currently using "experimental evidence used in manual assertion"
        "TAS": "ECO:0000304"   # Traceable Author Statement
    }

    def __init__(self):
        Source.__init__(self, 'hpoa')

        self.load_bindings()

        self.dataset = Dataset('hpoa', 'Human Phenotype Ontology',
                               'http://www.human-phenotype-ontology.org', None,
                               'http://www.human-phenotype-ontology.org/contao/index.php/legal-issues.html')

        if 'test_ids' not in config.get_config() or 'disease' not in config.get_config()['test_ids']:
            logger.warn("not configured with disease test ids.")
        else:
            self.test_ids = config.get_config()['test_ids']['disease']

        # data-source specific warnings (will be removed when issues are cleared)
        logger.warn("note that some ECO classes are missing for ICE and PCS; using temporary mappings.")

        return

    def fetch(self, is_dl_forced=False):

        self.get_files(is_dl_forced)

        self.scrub()

        # get the latest build from jenkins
        # NOT DOING THIS ANY MORE - but leaving it in for reference
        # jenkins_info = eval(urllib.request.urlopen('http://compbio.charite.de/hudson/job/hpo.annotations/lastSuccessfulBuild/api/python').read())
        # version = jenkins_info['number']

        # use the files['version'] file as the version
        fname = '/'.join((self.rawdir, self.files['version']['file']))

        with open(fname, 'r', encoding="utf8") as f:
            # 2015-04-23 13:01
            v = f.readline()  # read the first line (the only line, really)
            d = datetime.strptime(v.strip(), '%Y-%m-%d %H:%M').strftime("%Y-%m-%d-%H-%M")

        st = os.stat(fname)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")

        # this will cause two dates to be attached to the dataset (one from the filedate, and the other from here)
        # TODO when #112 is implemented, this will result in only the whole dataset being versioned
        self.dataset.setVersion(filedate, d)

        return

    def scrub(self):
        """
        Perform various data-scrubbing on the raw data files prior to parsing.
        For this resource, this currently includes:
        * revise errors in identifiers for some OMIM and PMIDs
        :return: None
        """
        # scrub file of the oddities...lots of publication rewriting
        f = '/'.join((self.rawdir, self.files['annot']['file']))
        logger.info('scrubbing PubMed:12345 --> PMID:12345')
        pysed.replace("PubMed", 'PMID', f)

        logger.info('scrubbing pmid:12345 --> PMID:12345')
        pysed.replace("pmid", 'PMID', f)

        logger.info('scrubbing PMID12345 --> PMID:12345')
        pysed.replace("PMID([0-9][0-9]*)", 'PMID:\\1', f)

        logger.info('scrubbing MIM12345 --> OMIM:12345')
        pysed.replace('MIM([0-9][0-9]*)', 'OMIM:\\1', f)

        logger.info('scrubbing MIM:12345 --> OMIM:12345')
        pysed.replace(";MIM", ";OMIM", f)

        logger.info('scrubbing ORPHANET --> Orphanet')
        pysed.replace("ORPHANET", "Orphanet", f)
        return

    # here we're reading and building a full named graph of this resource, then dumping it all at the end
    # we can investigate doing this line-by-line later
    # supply a limit if you want to test out parsing the head X lines of the file
    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        self._process_phenotype_tab('/'.join((self.rawdir, self.files['annot']['file'])), limit)

        # TODO add negative phenotype statements #113
        # self._process_negative_phenotype_tab(self.rawfile,self.outfile,limit)

        logger.info("Finished parsing.")

        return

    def _map_evidence_to_codes(self, code_string):
        """
        A simple mapping of the code_string to its ECO class using the dictionary defined here
        Currently includes ICE, IEA, PCS, TAS
        :param code_string:
        :return:
        """
        return self.eco_dict.get(code_string)
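
    # A hypothetical illustration of the lookup above:
    #   self._map_evidence_to_codes('TAS')  -->  'ECO:0000304'
    #   self._map_evidence_to_codes('XYZ')  -->  None (codes outside eco_dict fall through)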

    def _process_phenotype_tab(self, raw, limit):
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        gu = GraphUtils(curie_map.get())
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (db, num, name, qual, pheno_id, publist, eco, onset, freq, w, asp, syn, date, curator) = row
                disease_id = db + ":" + str(num)

                if self.testMode and disease_id.strip() not in config.get_config()['test_ids']['disease']:
                    continue

                # logger.info('adding %s', disease_id)

                gu.addClassToGraph(g, disease_id, None)
                gu.addClassToGraph(g, pheno_id, None)
                eco_id = self._map_evidence_to_codes(eco)
                gu.addClassToGraph(g, eco_id, None)
                if onset is not None and onset.strip() != '':
                    gu.addClassToGraph(g, onset, None)

                # we want to do things differently depending on the aspect of the annotation
                if asp == 'O' or asp == 'M':  # organ abnormality or mortality
                    assoc = D2PAssoc(self.name, disease_id, pheno_id, onset, freq)
                elif asp == 'I':  # inheritance patterns for the whole disease
                    assoc = DispositionAssoc(self.name, disease_id, pheno_id)
                elif asp == 'C':  # clinical course / onset
                    assoc = DispositionAssoc(self.name, disease_id, pheno_id)
                else:
                    logger.error("I don't know what this aspect is: %s", asp)
                    continue  # skip the row; no association was created

                assoc.add_evidence(eco_id)

                publist = publist.split(';')
                # blow these apart if there is a list of pubs
                for pub in publist:
                    pub = pub.strip()
                    if pub != '':
                        # if re.match('http://www.ncbi.nlm.nih.gov/bookshelf/br\.fcgi\?book=gene', pub):
                        #     #http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ced
                        #     m = re.search('part\=(\w+)', pub)
                        #     pub_id = 'GeneReviews:'+m.group(1)
                        # elif re.search('http://www.orpha.net/consor/cgi-bin/OC_Exp\.php\?lng\=en\&Expert\=', pub):
                        #     m = re.search('Expert=(\d+)', pub)
                        #     pub_id = 'Orphanet:'+m.group(1)
                        if not re.match('http', pub):
                            r = Reference(pub)
                            if re.match('PMID', pub):
                                r.setType(Reference.ref_types['journal_article'])
                            r.addRefToGraph(g)
                        # TODO add curator
                        assoc.add_source(pub)

                assoc.add_association_to_graph(g)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

            Assoc(None).load_all_properties(g)

        return

    def getTestSuite(self):
        import unittest
        from tests.test_hpoa import HPOATestCase
        # TODO add D2PAssoc tests

        test_suite = unittest.TestLoader().loadTestsFromTestCase(HPOATestCase)

        return test_suite
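
A minimal, self-contained sketch of what the scrub() rewrites above do to
individual values, using re.sub on in-memory strings rather than
pysed.replace on the raw file; the sample rows are hypothetical:

import re

# hypothetical raw fragments illustrating the identifier oddities scrub() fixes
rows = [
    "PubMed:12345", "pmid:12345", "PMID12345",
    "MIM12345", "OMIM:100100;MIM:100200", "ORPHANET:1234",
]

# (pattern, replacement) pairs mirroring the pysed.replace calls, in order
scrub_rules = [
    ("PubMed", "PMID"),
    ("pmid", "PMID"),
    (r"PMID([0-9][0-9]*)", r"PMID:\1"),
    (r"MIM([0-9][0-9]*)", r"OMIM:\1"),
    (";MIM", ";OMIM"),
    ("ORPHANET", "Orphanet"),
]

for row in rows:
    scrubbed = row
    for pattern, replacement in scrub_rules:
        scrubbed = re.sub(pattern, replacement, scrubbed)
    print(row, "-->", scrubbed)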
Example #7
0
File: BioGrid.py Project: d3borah/dipper
class BioGrid(Source):
    """
    Biogrid interaction data
    """

    # TODO write up class summary for docstring

    files = {
        "interactions": {
            "file": "interactions.mitab.zip",
            "url": "http://thebiogrid.org/downloads/archives/Latest%20Release/BIOGRID-ALL-LATEST.mitab.zip",
        },
        "identifiers": {
            "file": "identifiers.tab.zip",
            "url": "http://thebiogrid.org/downloads/archives/Latest%20Release/BIOGRID-IDENTIFIERS-LATEST.tab.zip",
        },
    }

    # biogrid-specific identifiers for use in subsetting identifier mapping
    biogrid_ids = [
        106638,
        107308,
        107506,
        107674,
        107675,
        108277,
        108506,
        108767,
        108814,
        108899,
        110308,
        110364,
        110678,
        111642,
        112300,
        112365,
        112771,
        112898,
        199832,
        203220,
        247276,
        120150,
        120160,
        124085,
    ]

    def __init__(self, tax_ids=None):
        super().__init__("biogrid")

        self.tax_ids = tax_ids
        self.load_bindings()

        self.dataset = Dataset(
            "biogrid",
            "The BioGrid",
            "http://thebiogrid.org/",
            None,
            "http://wiki.thebiogrid.org/doku.php/terms_and_conditions",
        )

        # Defaults
        # taxids = [9606,10090,10116,7227,7955,6239,8355]  #our favorite animals
        if self.tax_ids is None:
            self.tax_ids = [9606, 10090, 7955]

        if "test_ids" not in config.get_config() or "gene" not in config.get_config()["test_ids"]:
            logger.warn("not configured with gene test ids.")
        else:
            self.test_ids = config.get_config()["test_ids"]["gene"]

        # data-source specific warnings (will be removed when issues are cleared)
        logger.warn("several MI experimental codes do not exactly map to ECO; using approximations.")
        return

    def fetch(self, is_dl_forced=False):
        """

        :param is_dl_forced:
        :return:  None
        """

        self.get_files(is_dl_forced)

        # the version number is encoded in the filename in the zip.
        # for example, the interactions file may unzip to BIOGRID-ALL-3.2.119.mitab.txt,
        # where the version number is 3.2.119
        f = "/".join((self.rawdir, self.files["interactions"]["file"]))
        st = os.stat(f)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        with ZipFile(f, "r") as myzip:
            flist = myzip.namelist()
            # assume that the first entry is the item
            fname = flist[0]
            # get the version from the filename
            version = re.match(r"BIOGRID-ALL-(\d+\.\d+\.\d+)\.mitab\.txt", fname)

        self.dataset.setVersion(filedate, str(version.groups()[0]))

        return

    def parse(self, limit=None):
        """

        :param limit:
        :return:
        """
        if self.testOnly:
            self.testMode = True

        self._get_interactions(limit)
        self._get_identifiers(limit)

        self.load_bindings()

        logger.info("Loaded %d test graph nodes", len(self.testgraph))
        logger.info("Loaded %d full graph nodes", len(self.graph))

        return

    def _get_interactions(self, limit):
        logger.info("getting interactions")
        line_counter = 0
        f = "/".join((self.rawdir, self.files["interactions"]["file"]))
        myzip = ZipFile(f, "r")
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        matchcounter = 0

        with myzip.open(fname, "r") as csvfile:
            for line in csvfile:
                # skip comment lines
                if re.match("^#", line.decode()):
                    logger.debug("Skipping header line")
                    continue
                line_counter += 1
                line = line.decode().strip()
                # print(line)
                (
                    interactor_a,
                    interactor_b,
                    alt_ids_a,
                    alt_ids_b,
                    aliases_a,
                    aliases_b,
                    detection_method,
                    pub_author,
                    pub_id,
                    taxid_a,
                    taxid_b,
                    interaction_type,
                    source_db,
                    interaction_id,
                    confidence_val,
                ) = line.split("\t")

                # get the actual gene ids, typically formatted like: gene/locuslink:351|BIOGRID:106848
                gene_a_num = re.search(r"locuslink:(\d+)\|?", interactor_a).groups()[0]
                gene_b_num = re.search(r"locuslink:(\d+)\|?", interactor_b).groups()[0]

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if (int(gene_a_num) not in self.test_ids) or (int(gene_b_num) not in self.test_ids):
                        continue
                else:
                    g = self.graph
                    # when not in test mode, filter by taxon
                    if (
                        int(re.sub("taxid:", "", taxid_a.rstrip())) not in self.tax_ids
                        or int(re.sub("taxid:", "", taxid_b.rstrip())) not in self.tax_ids
                    ):
                        continue
                    else:
                        matchcounter += 1

                gene_a = "NCBIGene:" + gene_a_num
                gene_b = "NCBIGene:" + gene_b_num

                # get the interaction type
                # psi-mi:"MI:0407"(direct interaction)
                int_type = re.search(r"MI:\d+", interaction_type).group()
                rel = self._map_MI_to_RO(int_type)

                # scrub pubmed-->PMID prefix
                pub_id = re.sub("pubmed", "PMID", pub_id)
                # remove bogus whitespace
                pub_id = pub_id.strip()

                # get the method, and convert to evidence code
                det_code = re.search(r"MI:\d+", detection_method).group()
                evidence = self._map_MI_to_ECO(det_code)

                # note that the interaction_id is some kind of internal biogrid identifier that does not
                # map to a public URI.  we will construct a monarch identifier from this

                assoc = InteractionAssoc(self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence)
                assoc.add_source(pub_id)
                assoc.add_association_to_graph(g)
                assoc.load_all_properties(g)

                if not self.testMode and (limit is not None and line_counter > limit):
                    break

        myzip.close()

        return

    def _get_identifiers(self, limit):
        """
        This will process the id mapping file provided by Biogrid.
        The file has a very large header, which we scan past, then pull the identifiers, and make
        equivalence axioms

        :param limit:
        :return:
        """

        logger.info("getting identifier mapping")
        line_counter = 0
        f = "/".join((self.rawdir, self.files["identifiers"]["file"]))
        myzip = ZipFile(f, "r")
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        foundheader = False

        gu = GraphUtils(curie_map.get())

        # TODO align this species filter with the one above
        # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,Danio rerio,
        # Caenorhabditis elegans,Xenopus laevis'.split(',')

        speciesfilters = "Homo sapiens,Mus musculus".split(",")
        with myzip.open(fname, "r") as csvfile:
            for line in csvfile:
                # skip header lines
                if not foundheader:
                    if re.match("BIOGRID_ID", line.decode()):
                        foundheader = True
                    continue

                line = line.decode().strip()
                # BIOGRID_ID	IDENTIFIER_VALUE	IDENTIFIER_TYPE	ORGANISM_OFFICIAL_NAME
                # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                (biogrid_num, id_num, id_type, organism_label) = line.split("\t")

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if int(biogrid_num) not in self.biogrid_ids:
                        continue
                else:
                    g = self.graph

                # for each one of these, create the node and add equivalent classes
                biogrid_id = "BIOGRID:" + biogrid_num
                prefix = self._map_idtype_to_prefix(id_type)

                # TODO make these filters available as commandline options
                # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,WormBase,XenBase,ENSEMBL,miRBase'.split(',')
                geneidtypefilters = "NCBIGene,MGI,ENSEMBL,ZFIN,HGNC".split(",")
                # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'
                if (speciesfilters is not None) and (organism_label.strip() in speciesfilters):
                    line_counter += 1
                    if (geneidtypefilters is not None) and (prefix in geneidtypefilters):
                        mapped_id = ":".join((prefix, id_num))
                        gu.addEquivalentClass(g, biogrid_id, mapped_id)
                    elif id_type == "OFFICIAL_SYMBOL":  # this symbol will only get attached to the biogrid class
                        gu.addClassToGraph(g, biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    #    gu.addSynonym(g,biogrid_id,id_num)  #FIXME - i am not sure these are synonyms, altids?

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        myzip.close()

        return

    @staticmethod
    def _map_MI_to_RO(mi_id):
        rel = InteractionAssoc.interaction_object_properties
        mi_ro_map = {
            "MI:0403": rel["colocalizes_with"],  # colocalization
            "MI:0407": rel["interacts_with"],  # direct interaction
            "MI:0794": rel["genetically_interacts_with"],  # synthetic genetic interaction defined by inequality
            "MI:0796": rel["genetically_interacts_with"],  # suppressive genetic interaction defined by inequality
            "MI:0799": rel["genetically_interacts_with"],  # additive genetic interaction defined by inequality
            "MI:0914": rel["interacts_with"],  # association
            "MI:0915": rel["interacts_with"],  # physical association
        }

        ro_id = rel["interacts_with"]  # default
        if mi_id in mi_ro_map:
            ro_id = mi_ro_map.get(mi_id)

        return ro_id
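
    # A hypothetical illustration of the fallback above:
    #   _map_MI_to_RO('MI:0403')  -->  the colocalizes_with property
    #   _map_MI_to_RO('MI:9999')  -->  interacts_with (the default for unmapped codes)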

    @staticmethod
    def _map_MI_to_ECO(mi_id):
        eco_id = "ECO:0000006"  # default to experimental evidence
        mi_to_eco_map = {
            "MI:0018": "ECO:0000068",  # yeast two-hybrid
            "MI:0004": "ECO:0000079",  # affinity chromatography
            "MI:0047": "ECO:0000076",  # far western blotting
            "MI:0055": "ECO:0000021",  # should be FRET, but using physical_interaction FIXME
            "MI:0090": "ECO:0000012",  # desired: protein complementation, using: functional complementation
            "MI:0096": "ECO:0000085",  # desired: pull down, using: immunoprecipitation
            "MI:0114": "ECO:0000324",  # desired: x-ray crystallography, using: imaging assay
            "MI:0254": "ECO:0000011",  # desired: genetic interference, using: genetic interaction evidence
            "MI:0401": "ECO:0000172",  # desired: biochemical, using: biochemical trait evidence
            "MI:0415": "ECO:0000005",  # desired: enzymatic study, using: enzyme assay evidence
            "MI:0428": "ECO:0000324",  # imaging
            "MI:0686": "ECO:0000006",  # desired: unspecified, using: experimental evidence
            "MI:1313": "ECO:0000006",  # None?
        }
        if mi_id in mi_to_eco_map:
            eco_id = mi_to_eco_map.get(mi_id)
        else:
            logger.warn("unmapped code %s. Defaulting to experimental_evidence", mi_id)

        return eco_id
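
    # A hypothetical illustration of the mapping above:
    #   _map_MI_to_ECO('MI:0018')  -->  'ECO:0000068' (yeast two-hybrid)
    #   _map_MI_to_ECO('MI:9999')  -->  'ECO:0000006' plus a warning, since unmapped
    #   detection methods default to experimental evidence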

    @staticmethod
    def _map_idtype_to_prefix(idtype):
        """
        Here we need to reformat the BioGrid source prefixes to standard ones used in our curie-map.
        :param idtype:
        :return:
        """
        prefix = idtype
        idtype_to_prefix_map = {
            "XENBASE": "XenBase",
            "TREMBL": "TrEMBL",
            "MGI": "MGI",
            "REFSEQ_DNA_ACCESSION": "RefSeqNA",
            "MAIZEGDB": "MaizeGDB",
            "BEEBASE": "BeeBase",
            "ENSEMBL": "ENSEMBL",
            "TAIR": "TAIR",
            "GENBANK_DNA_GI": "NCBIgi",
            "CGNC": "CGNC",
            "RGD": "RGD",
            "GENBANK_GENOMIC_DNA_GI": "NCBIgi",
            "SWISSPROT": "Swiss-Prot",
            "MIM": "OMIM",
            "FLYBASE": "FlyBase",
            "VEGA": "VEGA",
            "ANIMALQTLDB": "AQTLDB",
            "ENTREZ_GENE_ETG": "ETG",
            "HPRD": "HPRD",
            "APHIDBASE": "APHIDBASE",
            "GENBANK_PROTEIN_ACCESSION": "NCBIProtein",
            "ENTREZ_GENE": "NCBIGene",
            "SGD": "SGD",
            "GENBANK_GENOMIC_DNA_ACCESSION": "NCBIGenome",
            "BGD": "BGD",
            "WORMBASE": "WormBase",
            "ZFIN": "ZFIN",
            "DICTYBASE": "dictyBase",
            "ECOGENE": "ECOGENE",
            "BIOGRID": "BIOGRID",
            "GENBANK_DNA_ACCESSION": "NCBILocus",
            "VECTORBASE": "VectorBase",
            "MIRBASE": "miRBase",
            "IMGT/GENE-DB": "IGMT",
            "HGNC": "HGNC",
            "SYSTEMATIC_NAME": None,
            "OFFICIAL_SYMBOL": None,
            "REFSEQ_GENOMIC_DNA_ACCESSION": "NCBILocus",
            "GENBANK_PROTEIN_GI": "NCBIgi",
            "REFSEQ_PROTEIN_ACCESSION": "RefSeqProt",
            "SYNONYM": None,
            "GRID_LEGACY": None,
            # the following showed up in 3.3.124
            "UNIPROT-ACCESSION": "UniprotKB",
            "SWISS-PROT": "Swiss-Prot",
            "OFFICIAL SYMBOL": None,
            "ENSEMBL RNA": None,
            "GRID LEGACY": None,
            "ENSEMBL PROTEIN": None,
            "REFSEQ-RNA-GI": None,
            "REFSEQ-RNA-ACCESSION": None,
            "REFSEQ-PROTEIN-GI": None,
            "REFSEQ-PROTEIN-ACCESSION-VERSIONED": None,
            "REFSEQ-PROTEIN-ACCESSION": None,
            "REFSEQ-LEGACY": None,
            "SYSTEMATIC NAME": None,
            "ORDERED LOCUS": None,
            "UNIPROT-ISOFORM": "UniprotKB",
            "ENSEMBL GENE": "ENSEMBL",
            "CGD": None,  # Not sure what this is?
            "WORMBASE-OLD": "WormBase",
        }
        if idtype in idtype_to_prefix_map:
            prefix = idtype_to_prefix_map.get(idtype)
        else:
            logger.warn("unmapped prefix %s", prefix)

        return prefix

    def getTestSuite(self):
        import unittest
        from tests.test_biogrid import BioGridTestCase

        # TODO add InteractionAssoc tests
        # TODO add test about if all prefixes are mapped?

        test_suite = unittest.TestLoader().loadTestsFromTestCase(BioGridTestCase)

        return test_suite
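
A minimal sketch of how one MITAB record is picked apart in
_get_interactions() above; the record is hypothetical but follows the
15-column layout the parser unpacks, and the two _map_MI_to_* helpers are
where the extracted MI codes get turned into a relationship and an
evidence code:

import re

# hypothetical, abbreviated MITAB-style record (tab-separated, 15 columns)
line = "\t".join([
    "entrez gene/locuslink:351|BIOGRID:106848",   # interactor A
    "entrez gene/locuslink:348|BIOGRID:106845",   # interactor B
    "-", "-", "-", "-",                           # alt ids and aliases
    'psi-mi:"MI:0018"(two hybrid)',               # detection method
    "-",                                          # first author
    "pubmed:12345678",                            # publication
    "taxid:9606", "taxid:9606",                   # taxa for A and B
    'psi-mi:"MI:0407"(direct interaction)',       # interaction type
    "-", "BIOGRID:123456", "-",                   # source db, id, confidence
])

cols = line.split("\t")
gene_a = "NCBIGene:" + re.search(r"locuslink:(\d+)", cols[0]).group(1)
gene_b = "NCBIGene:" + re.search(r"locuslink:(\d+)", cols[1]).group(1)
det_code = re.search(r"MI:\d+", cols[6]).group()    # feeds _map_MI_to_ECO
int_type = re.search(r"MI:\d+", cols[11]).group()   # feeds _map_MI_to_RO
pub_id = cols[8].replace("pubmed", "PMID").strip()  # scrub pubmed --> PMID

print(gene_a, gene_b, det_code, int_type, pub_id)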
Example #8
0
class HPOAnnotations(Source):
    """
    The [Human Phenotype Ontology](http://human-phenotype-ontology.org) group
    curates and assembles over 115,000 annotations to hereditary diseases
    using the HPO ontology. Here we create OBAN-style associations
    between diseases and phenotypic features, together with their evidence,
    and age of onset and frequency (if known).
    The parser currently only processes the "abnormal" annotations.
    Association to "remarkable normality" will be added in the near future.

    We create additional associations from text mining.  See info at
    http://pubmed-browser.human-phenotype-ontology.org/.

    Also, you can read about these annotations in
    [PMID:26119816](http://www.ncbi.nlm.nih.gov/pubmed/26119816).

    In order to properly test this class,
    you should have a conf.json file configured with some test ids,
    in the structure of the following
    (the ids here are just examples; put your favorite ids in the config):
    <pre>
    test_ids: {"disease" : ["OMIM:119600", "OMIM:120160"]}
    </pre>

    """

    files = {
        'annot': {
            'file': 'phenotype_annotation.tab',
            'url': HPOADL + '/phenotype_annotation.tab'},
        'version': {
            'file': 'data_version.txt',
            'url': HPOADL + '/data_version.txt'},
        # 'neg_annot': {
        #   'file': 'phenotype_annotation.tab',
        #    'url': HPOADL + '/negative_phenotype_annotation.tab'},
        'doid': {
            'file': 'doid.owl',
            'url': 'http://purl.obolibrary.org/obo/doid.owl'
        }
    }

    # note, two of these codes are awaiting term requests.  see #114 and
    # https://code.google.com/p/evidenceontology/issues/detail?id=32
    # TODO TEC see if the GC issue translated into a GH issue
    eco_dict = {
        # FIXME currently using "curator inference used in manual assertion"
        "ICE": "ECO:0000305",
        # Inferred from Electronic Annotation
        "IEA": "ECO:0000501",
        # FIXME currently using "experimental evidence used in manual assertion"
        "PCS": "ECO:0000269",
        # Traceable Author Statement
        "TAS": "ECO:0000304",
        # FIXME currently using computational combinatorial evidence
        # in automatic assertion
        "ITM": "ECO:0000246",
    }

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'hpoa')

        self.dataset = Dataset(
            'hpoa', 'Human Phenotype Ontology',
            'http://www.human-phenotype-ontology.org', None,
            'http://www.human-phenotype-ontology.org/contao/index.php/legal-issues.html')

        self.replaced_id_count = 0

        if 'test_ids' not in config.get_config()\
                or 'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_ids = []
        else:
            self.test_ids = config.get_config()['test_ids']['disease']

        # data-source specific warnings to be removed when issues are cleared
        logger.warning(
            "note that some ECO classes are missing for ICE, PCS, and ITM;" +
            " using temporary mappings.")

        return

    def fetch(self, is_dl_forced=False):

        self.get_files(is_dl_forced)

        self.scrub()

        # get the latest build from jenkins

        # use the files['version'] file as the version
        fname = '/'.join((self.rawdir, self.files['version']['file']))

        with open(fname, 'r', encoding="utf8") as f:
            # 2015-04-23 13:01
            v = f.readline()  # read the first line (the only line, really)
            d = datetime.strptime(
                v.strip(), '%Y-%m-%d %H:%M').strftime("%Y-%m-%d-%H-%M")

        st = os.stat(fname)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")

        # this will cause two dates to be attached to the dataset
        # (one from the filedate, and the other from here)
        # TODO when #112 is implemented,
        # this will result in only the whole dataset being versioned
        self.dataset.setVersion(filedate, d)

        self.get_common_files()

        return

    def scrub(self):
        """
        Perform various data-scrubbing on the raw data files prior to parsing.
        For this resource, this currently includes:
        * revise errors in identifiers for some OMIM and PMIDs

        :return: None

        """

        # scrub file of the oddities...lots of publication rewriting
        f = '/'.join((self.rawdir, self.files['annot']['file']))
        logger.info('scrubbing PubMed:12345 --> PMID:12345')
        pysed.replace(r'PubMed:', 'PMID:', f)

        logger.info('scrubbing pmid:12345 --> PMID:12345')
        pysed.replace(r'pmid:', 'PMID:', f)

        logger.info('scrubbing PMID:    12345 --> PMID:12345')
        pysed.replace(r'PMID:  *', 'PMID:', f)

        logger.info('scrubbing PMID12345 --> PMID:12345')
        pysed.replace(r'PMID([0-9][0-9]*)', r'PMID:\1', f)

        logger.info('scrubbing MIM12345 --> OMIM:12345')
        pysed.replace(r'MIM([0-9][0-9]*)', r'OMIM:\1', f)

        logger.info('scrubbing MIM:12345 --> OMIM:12345')
        pysed.replace(r";MIM", ";OMIM", f)

        logger.info('scrubbing ORPHANET --> Orphanet')
        pysed.replace("ORPHANET", "Orphanet", f)

        logger.info('scrubbing ORPHA --> Orphanet')
        pysed.replace("ORPHA", "Orphanet", f)
        return

    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows", limit)

        self.add_common_files_to_file_list()

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        # rare disease-phenotype associations
        self._process_phenotype_tab('/'.join((self.rawdir,
                                              self.files['annot']['file'])),
                                    limit)

        # TODO add negative phenotype statements #113
        # self._process_negative_phenotype_tab(self.rawfile,self.outfile,limit)

        # common disease-phenotype associations from text mining work
        self.process_all_common_disease_files(limit)

        logger.info("Finished parsing.")

        return

    def _map_evidence_to_codes(self, code_string):
        """
        A simple mapping of the code_string to its ECO class
        using the dictionary defined here
        Currently includes ICE, IEA, PCS, TAS, and ITM
        :param code_string:
        :return:

        """
        return self.eco_dict.get(code_string)

    def _process_phenotype_tab(self, raw, limit):
        """
        see info on format here:
        http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

        :param raw:
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        line_counter = 0
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                row = [str(col).strip() for col in row]
                # Note from Seb in Dec 2017, a 15th column was added
                # inadvertently and will be removed in the winter 2018
                # release of hpo data
                (db, num, name, qual, pheno_id, publist, eco, onset, freq, w,
                 asp, syn, date, curator, extra) = row
                disease_id = db + ":" + num

                if self.testMode:
                    try:
                        id_list = self.test_ids
                        if id_list is None \
                                or disease_id not in id_list:
                            continue
                    except AttributeError:
                        continue

                # logger.info('adding %s', disease_id)

                model.addClassToGraph(disease_id, None)
                model.addClassToGraph(pheno_id, None)
                eco_id = self._map_evidence_to_codes(eco)
                model.addClassToGraph(eco_id, None)
                if onset is not None and onset != '':
                    model.addClassToGraph(onset, None)

                # we want to do things differently depending on
                # the aspect of the annotation
                # TODO PYLINT Redefinition of assoc type from
                #   dipper.models.assoc.D2PAssoc.D2PAssoc to
                #   dipper.models.assoc.DispositionAssoc.DispositionAssoc
                if asp == 'O' or asp == 'M':  # organ abnormality or mortality
                    assoc = D2PAssoc(
                        g, self.name, disease_id, pheno_id, onset, freq)
                elif asp == 'I':  # inheritance patterns for the whole disease
                    assoc = DispositionAssoc(
                        g, self.name, disease_id, pheno_id)
                elif asp == 'C':  # clinical course / onset
                    assoc = DispositionAssoc(
                        g, self.name, disease_id, pheno_id)
                else:
                    logger.error("I don't know what this aspect is: %s", asp)
                    continue  # skip the row; no association was created

                assoc.add_evidence(eco_id)

                publist = re.split(r'[,;]', publist)
                # blow these apart if there is a list of pubs
                for pub in publist:
                    pub = pub.strip()
                    pubtype = None
                    if pub != '':
                        # if re.match(
                        #       r'http://www.ncbi.nlm.nih.gov/bookshelf/br\.fcgi\?book=gene',
                        #        pub):
                        #     #http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ced
                        #     m = re.search(r'part\=(\w+)', pub)
                        #     pub_id = 'GeneReviews:'+m.group(1)
                        # elif re.search(
                        #        r'http://www.orpha.net/consor/cgi-bin/OC_Exp\.php\?lng\=en\&Expert\=',
                        #        pub):
                        #     m = re.search(r'Expert=(\d+)', pub)
                        #     pub_id = 'Orphanet:'+m.group(1)

                        if re.match(r'(PMID|ISBN-13|ISBN-10|ISBN|HPO)', pub):
                            if re.match(r'PMID', pub):
                                pubtype = \
                                    Reference.ref_types['journal_article']
                            elif re.match(r'HPO', pub):
                                pubtype = Reference.ref_types['person']
                            else:
                                pubtype = Reference.ref_types['publication']
                            r = Reference(g, pub, pubtype)
                            r.addRefToGraph()
                        elif re.match(r'(OMIM|Orphanet|DECIPHER)', pub):
                            # make the pubs a reference to the website,
                            # instead of the curie
                            if re.match(r'OMIM', pub):
                                omimnum = re.sub(r'OMIM:', '', pub)
                                omimurl = '/'.join(('http://omim.org/entry',
                                                    str(omimnum).strip()))
                                pub = omimurl
                            elif re.match(r'Orphanet:', pub):
                                orphanetnum = re.sub(r'Orphanet:', '', pub)
                                orphaneturl = \
                                    ''.join((
                                        'http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert=',
                                        str(orphanetnum)))
                                pub = orphaneturl
                            elif re.match(r'DECIPHER:', pub):
                                deciphernum = re.sub(r'DECIPHER:', '', pub)
                                decipherurl = '/'.join(
                                    ('https://decipher.sanger.ac.uk/syndrome',
                                     deciphernum))
                                pub = decipherurl
                            pubtype = Reference.ref_types['webpage']
                        elif re.match(r'http', pub):
                            pass
                        else:
                            logger.error('Unknown pub type for %s: %s',
                                         disease_id, pub)
                            print(disease_id, 'pubs:', str(publist))
                            continue

                        if pub is not None:
                            assoc.add_source(pub)

                        # TODO add curator

                assoc.add_association_to_graph()

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

    def get_common_files(self):
        """
        Fetch the raw hpo-annotation-data by cloning/pulling the
        [repository](https://github.com/monarch-initiative/hpo-annotation-data.git)
        These files get added to the files object,
        and iterated over separately.
        :return:

        """

        repo_dir = '/'.join((self.rawdir, 'git'))
        REMOTE_URL = \
            "git@github.com:monarch-initiative/hpo-annotation-data.git"
        HTTPS_URL = \
            "https://github.com/monarch-initiative/hpo-annotation-data.git"

        # TODO if repo doesn't exist, then clone otherwise pull
        if os.path.isdir(repo_dir):
            shutil.rmtree(repo_dir)

        logger.info("Cloning common disease files from %s", REMOTE_URL)
        try:
            Repo.clone_from(REMOTE_URL, repo_dir)
        except GitCommandError:
            # Try with https and if this doesn't work fail
            Repo.clone_from(HTTPS_URL, repo_dir)

        return

    def add_common_files_to_file_list(self):
        repo_dir = '/'.join((self.rawdir, 'git'))
        common_disease_dir = '/'.join((repo_dir, 'common-diseases'))

        # add the files to the self.files object
        filelist = os.listdir(common_disease_dir)
        fcount = 0
        for f in filelist:
            if not re.search(r'\.tab', f):
                continue
            fcount += 1
            self.files['common'+str(fcount).zfill(7)] = {
                'file': '/'.join((common_disease_dir, f)),
                # TODO add url to reference the file?
                # need to get git version somehow?
            }
            # TODO add this to the dataset
        logger.info("Found %d common disease files", fcount)

        return

    def process_all_common_disease_files(self, limit=None):
        """
        Loop through all of the files that we previously fetched from git,
        creating the disease-phenotype assoc.
        :param limit:
        :return:

        """

        self.replaced_id_count = 0
        unpadded_doids = self.get_doid_ids_for_unpadding()
        total_processed = 0
        logger.info("Iterating over all common disease files")
        common_file_count = 0
        for f in self.files:
            if not re.match(r'common', f):
                continue
            common_file_count += 1
            raw = self.files[f]['file']
            total_processed += self.process_common_disease_file(
                raw, unpadded_doids, limit)
            if not self.testMode \
                    and limit is not None and total_processed > limit:
                break
        logger.info("Finished iterating over all common disease files.")
        logger.info("Fixed %d/%d incorrectly zero-padded ids",
                    self.replaced_id_count, common_file_count)
        return

    def get_doid_ids_for_unpadding(self):
        """
        Here, we fetch the doid owl file, and get all the doids.
        We figure out which are not zero-padded, so we can map the DOID
        to the correct identifier when processing the common annotation files.

        This may become obsolete when
        https://github.com/monarch-initiative/hpo-annotation-data/issues/84
        is addressed.

        :return:

        """

        logger.info("Building list of non-zero-padded DOIDs")
        raw_file = '/'.join((self.rawdir, self.files['doid']['file']))
        doids = set()
        # scan the file and get all doids
        with open(raw_file, 'r', encoding="utf8") as f:
            for line in f:
                matches = re.search(r'(DOID_\d+)', line)
                if matches is not None:
                    for m in matches.groups():
                        doids.add(re.sub(r'_', ':', m))

        nopad_doids = set()
        for d in doids:
            num = re.sub(r'DOID[:_]', '', d)
            # look for things not starting with zero
            if not re.match(r'^0', str(num)):
                nopad_doids.add(num)

        logger.info("Found %d/%d DOIDs are not zero-padded",
                    len(nopad_doids), len(doids))

        return nopad_doids

    def process_common_disease_file(self, raw, unpadded_doids, limit=None):
        """
        Make disease-phenotype associations.
        Some identifiers need clean up:
        * DOIDs are listed as DOID-DOID: --> DOID:
        * DOIDs may be unnecessarily zero-padded;
        these are remapped to their non-padded equivalents.

        :param raw:
        :param unpadded_doids:
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        assoc_count = 0
        replace_id_flag = False

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            header = csvfile.readline()  # skip the header row
            logger.info("HEADER: %s", header)
            disease_id = None
            for row in filereader:

                if 21 == len(row):
                    (did, dname, gid, gene_name, genotype, gene_symbols,
                     phenotype_id, phenotype_name, age_of_onset_id,
                     age_of_onset_name, eid, evidence_name, frequency, sex_id,
                     sex_name, negation_id, negation_name, description,
                     pub_ids, assigned_by,
                     date_created) = [str(col).strip() for col in row]
                else:
                    logger.warning(
                        "Wrong number of columns! expected 21, got: %s in: %s",
                        len(row), raw)
                    logger.warning("%s", row)
                    continue
                # b/c "PMID:    17223397"
                pub_ids = re.sub(r'  *', '', pub_ids)

                disease_id = re.sub(r'DO(ID)?[-\:](DOID:)?', 'DOID:', did)
                disease_id = re.sub(r'MESH-', 'MESH:', disease_id)
                if not re.search(r'(DOID\:|MESH\:\w)\d+', disease_id):
                    logger.warning("Invalid id format: %s", disease_id)

                # figure out if the doid should be unpadded,
                # then use the unpadded version instead
                if re.match(r'DOID', disease_id):
                    unpadded_num = re.sub(r'DOID:', '', disease_id)
                    unpadded_num = unpadded_num.lstrip('0')
                    if unpadded_num in unpadded_doids:
                        fixed_id = 'DOID:' + unpadded_num
                        replace_id_flag = True
                        disease_id = fixed_id.strip()

                if self.testMode and disease_id not in self.test_ids:
                    # since these are broken up into disease-by-disease,
                    # just skip the whole file
                    return 0
                else:
                    line_counter += 1

                if negation_id != '':
                    continue  # TODO add negative associations

                if disease_id != '' and phenotype_id != '':
                    assoc = D2PAssoc(
                        g, self.name, disease_id, phenotype_id.strip())
                    if age_of_onset_id != '':
                        assoc.onset = age_of_onset_id
                    if frequency != '':
                        assoc.frequency = frequency
                    eco_id = self._map_evidence_to_codes(eid)
                    if eco_id is None:
                        eco_id = self._map_evidence_to_codes('ITM')
                    assoc.add_evidence(eco_id)
                    # TODO add sex? - not in dataset yet
                    if description != '':
                        assoc.set_description(description)
                    if pub_ids != '':
                        for p in pub_ids.split(';'):
                            p = re.sub(r'  *', '', p)
                            if re.search(r'(DOID|MESH)', p) \
                                    or re.search(r'Disease name contained',
                                                 description):
                                # skip "pubs" that are derived from
                                # the classes themselves
                                continue
                            assoc.add_source(p.strip())
                    # TODO assigned by?

                    assoc.add_association_to_graph()
                    assoc_count += 1

                if not self.testMode and limit is not None\
                        and line_counter > limit:
                    break

            if replace_id_flag:
                logger.info("replaced DOID with unpadded version")
                self.replaced_id_count += 1
            logger.info(
                "Added %d associations for %s.", assoc_count, disease_id)
        return assoc_count

    def getTestSuite(self):
        import unittest
        from tests.test_hpoa import HPOATestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(HPOATestCase)

        return test_suite
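
A minimal sketch isolating the DOID zero-padding repair used in
process_common_disease_file(); the unpadded_doids set below is a
hypothetical stand-in for what get_doid_ids_for_unpadding() extracts
from doid.owl:

import re

def unpad_doid(disease_id, unpadded_doids):
    """Return the non-zero-padded form of a DOID if the ontology uses one."""
    if not re.match(r'DOID', disease_id):
        return disease_id  # non-DOID ids pass through untouched
    unpadded_num = re.sub(r'DOID:', '', disease_id).lstrip('0')
    if unpadded_num in unpadded_doids:
        return 'DOID:' + unpadded_num
    return disease_id

# hypothetical subset of the non-zero-padded ids found in doid.owl
unpadded_doids = {'9352', '14330'}

print(unpad_doid('DOID:0009352', unpadded_doids))   # --> DOID:9352
print(unpad_doid('DOID:0050117', unpadded_doids))   # unchanged, not in the set
print(unpad_doid('MESH:D003924', unpadded_doids))   # unchanged, not a DOID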
Example #9
0
class EOM(PostgreSQLSource):
    """
    Elements of Morphology is a resource from NHGRI that has definitions of
    morphological abnormalities, together with image depictions.
    We pull those relationships, as well as our local mapping of equivalences
    between EOM and HP terminologies.

    The website is crawled monthly by NIF's DISCO crawler system,
    which we utilize here.
    Be sure to have pg user/password connection details in your conf.json file,
    like:
    dbauth : {'disco' : {'user' : '<username>', 'password' : '<password>'}}

    Monarch-curated data for the HP to EOM mapping is stored at
    https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/src/mappings/hp-to-eom-mapping.tsv

    Since this resource is so small, the entirety of it is the "test" set.

    """

    # we are using the production view here; should we be using services?
    tables = ['dvp.pr_nlx_157874_1']

    files = {
        'map': {
            'file':
            'hp-to-eom-mapping.tsv',
            'url':
            'https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/src/mappings/hp-to-eom-mapping.tsv'
        }
    }

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'eom')

        # update the dataset object with details about this resource
        # TODO put this into a conf file?
        self.dataset = Dataset(
            'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
            'http://www.genome.gov/copyright.cfm',
            'https://creativecommons.org/publicdomain/mark/1.0/')

        # check if config exists; if it doesn't, error out and let user know
        if 'dbauth' not in config.get_config() or \
                'disco' not in config.get_config()['dbauth']:
            logger.error("not configured with PG user/password.")

        # source-specific warnings.  will be cleared when resolved.

        return

    def fetch(self, is_dl_forced=False):
        '''create the connection details for DISCO'''

        cxn = config.get_config()['dbauth']['disco']
        cxn.update({
            'host': 'nif-db.crbs.ucsd.edu',
            'database': 'disco_crawler',
            'port': 5432
        })

        self.dataset.setFileAccessUrl(''.join(
            ('jdbc:postgresql://', cxn['host'], ':', str(cxn['port']), '/',
             cxn['database'])),
                                      is_object_literal=True)

        # process the tables
        # self.fetch_from_pgdb(self.tables,cxn,100)  #for testing
        self.fetch_from_pgdb(self.tables, cxn)

        self.get_files(is_dl_forced)

        # FIXME: Everything needed for data provenance?
        st = os.stat('/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')))
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        self.dataset.setVersion(filedate)

        return

    def parse(self, limit=None):
        '''
            Override Source.parse inherited via PostgreSQLSource
        '''

        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        if self.testOnly:
            self.testMode = True

        logger.info("Parsing files...")

        self._process_nlx_157874_1_view(
            '/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')), limit)
        self._map_eom_terms('/'.join((self.rawdir, self.files['map']['file'])),
                            limit)

        logger.info("Finished parsing.")

        # since it's so small,
        # we default to copying the entire graph to the test set
        self.testgraph = self.graph

        return

    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        This table contains the Elements of Morphology data that has been
        screen-scraped into DISCO.
        Note that foaf:depiction is the inverse of the foaf:depicts relationship.

        Since it is bad form to have two definitions,
        we concatenate the two into one string.

        Triples:
            <eom id> a owl:Class
                rdf:label Literal(eom label)
                OIO:hasRelatedSynonym Literal(synonym list)
                IAO:definition Literal(objective_def. subjective def)
                foaf:depiction Literal(small_image_url),
                               Literal(large_image_url)
                foaf:page Literal(page_url)
                rdfs:comment Literal(long commented text)


        :param raw:
        :param limit:
        :return:
        """

        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
            for line in filereader:
                line_counter += 1
                (morphology_term_id, morphology_term_num,
                 morphology_term_label, morphology_term_url,
                 terminology_category_label, terminology_category_url,
                 subcategory, objective_definition, subjective_definition,
                 comments, synonyms, replaces, small_figure_url,
                 large_figure_url, e_uid, v_uid, v_uuid, v_last_modified,
                 v_status, v_lastmodified_epoch) = line

                # note:
                # e_uid v_uuid v_last_modified terminology_category_url
                # subcategory v_uid morphology_term_num
                # terminology_category_label hp_label notes
                # are currently unused.

                # Add morphology term to graph as a class
                # with label, type, and description.
                model.addClassToGraph(morphology_term_id,
                                      morphology_term_label)

                # Assemble the description text

                if subjective_definition != '' and not (re.match(
                        r'.+\.$', subjective_definition)):
                    # add a trailing period.
                    subjective_definition = subjective_definition.strip() + '.'
                if objective_definition != '' and not (re.match(
                        r'.+\.$', objective_definition)):
                    # add a trailing period.
                    objective_definition = objective_definition.strip() + '.'

                definition = \
                    '  '.join(
                        (objective_definition, subjective_definition)).strip()

                model.addDefinition(morphology_term_id, definition)

                # <term id> FOAF:depicted_by literal url
                # <url> type foaf:depiction

                # do we want both images?
                # morphology_term_id has depiction small_figure_url
                if small_figure_url != '':
                    model.addDepiction(morphology_term_id, small_figure_url)

                # morphology_term_id has depiction large_figure_url
                if large_figure_url != '':
                    model.addDepiction(morphology_term_id, large_figure_url)

                # morphology_term_id has comment comments
                if comments != '':
                    model.addComment(morphology_term_id, comments.strip())

                if synonyms != '':
                    for s in synonyms.split(';'):
                        model.addSynonym(
                            morphology_term_id, s.strip(),
                            model.annotation_properties['hasExactSynonym'])

                # morphology_term_id hasRelatedSynonym replaces (; delimited)
                if replaces != '' and replaces != synonyms:
                    for s in replaces.split(';'):
                        model.addSynonym(
                            morphology_term_id, s.strip(),
                            model.annotation_properties['hasRelatedSynonym'])

                # morphology_term_id has page morphology_term_url
                reference = Reference(self.graph)
                reference.addPage(morphology_term_id, morphology_term_url)

                if limit is not None and line_counter > limit:
                    break
        return

    def _map_eom_terms(self, raw, limit=None):
        """
        This table contains the HP ID mappings from the local tsv file.
        Triples:
            <eom id> owl:equivalentClass <hp id>
        :param raw:
        :param limit:
        :return:
        """

        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            for line in f1:
                line_counter += 1

                (morphology_term_id, morphology_term_label, hp_id, hp_label,
                 notes) = line.split('\t')

                # Sub out the underscores for colons.
                hp_id = re.sub('_', ':', hp_id)
                if re.match(".*HP:.*", hp_id):
                    # add the HP term as a class
                    model.addClassToGraph(hp_id, None)
                    # Add the HP ID as an equivalent class
                    model.addEquivalentClass(morphology_term_id, hp_id)
                else:
                    logger.warning('No matching HP term for %s',
                                   morphology_term_label)

                if limit is not None and line_counter > limit:
                    break

        return

    def getTestSuite(self):
        import unittest
        # TODO PYLINT: Unable to import 'tests.test_eom'
        from tests.test_eom import EOMTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(EOMTestCase)

        return test_suite
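
A minimal sketch of the definition assembly performed in
_process_nlx_157874_1_view() above, reproducing the trailing-period
normalization and two-space concatenation; the input definitions are
hypothetical:

import re

def assemble_definition(objective_definition, subjective_definition):
    """Join the objective and subjective definitions into one string,
    ensuring each non-empty part ends with a period."""
    if subjective_definition != '' and not re.match(r'.+\.$', subjective_definition):
        subjective_definition = subjective_definition.strip() + '.'
    if objective_definition != '' and not re.match(r'.+\.$', objective_definition):
        objective_definition = objective_definition.strip() + '.'
    return '  '.join((objective_definition, subjective_definition)).strip()

print(assemble_definition(
    'Distance between the inner canthi more than two SD above the mean',
    'Apparently increased distance between the inner canthi.'))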
Example #10
0
File: MMRRC.py Project: putmantime/dipper
class MMRRC(Source):
    """
    Here we process the Mutant Mouse Resource and Research Center
    (https://www.mmrrc.org) strain data,
    which includes:
    *  strains, their mutant alleles
    *  phenotypes of the alleles
    *  descriptions of the research uses of the strains

    Note that some gene identifiers are not included
    (for many of the transgenics with human genes) in the raw data.
    We do our best to process the links between the variant and
    the affected gene, but sometimes the mapping is not clear,
    and we do not include it.
    Many of these details will be solved by merging this source with
    the MGI data source, who has the variant-to-gene designations.

    Also note that even though the strain pages at the MMRRC site do list
    phenotypic differences in the context of the strain backgrounds,
    they do not provide that data to us,
    and thus we cannot supply that disambiguation here.
    """

    files = {
        'catalog': {
            'file': 'mmrrc_catalog_data.csv',
            'url': 'https://www.mmrrc.org/about/mmrrc_catalog_data.csv'
        },
    }

    test_ids = [
        'MMRRC:037507-MU', 'MMRRC:041175-UCD', 'MMRRC:036933-UNC',
        'MMRRC:037884-UCD', 'MMRRC:000255-MU', 'MMRRC:037372-UCD',
        'MMRRC:000001-UNC'
    ]

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'mmrrc')
        self.strain_hash = {}
        self.id_label_hash = {}
        self.dataset = Dataset(
            'mmrrc', 'Mutant Mouse Regional Resource Centers',
            'https://www.mmrrc.org', None,
            'https://www.mmrrc.org/about/data_download.php')

        return

    def fetch(self, is_dl_forced=False):

        self.get_files(is_dl_forced)
        fname = '/'.join((self.rawdir, self.files['catalog']['file']))
        st = os.stat(fname)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")

        # TODO note: can set the data version to what is in the header
        # first line like:
        # This MMRRC catalog data file was generated on 2015-04-22

        self.dataset.setVersion(filedate)

        return

    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        self._process_phenotype_data(limit)

        logger.info("Finished parsing.")

        return

    def _process_phenotype_data(self, limit):
        """
        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        line_counter = 0
        fname = '/'.join((self.rawdir, self.files['catalog']['file']))

        self.strain_hash = {}
        self.id_label_hash = {}
        genes_with_no_ids = set()
        stem_cell_class = 'CL:0000034'
        mouse_taxon = 'NCBITaxon:10090'
        geno = Genotype(g)
        with open(fname, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # skip the first 3 lines which are header, etc.
                if line_counter < 4:
                    continue

                (strain_id, strain_label, strain_type_symbol, strain_state,
                 mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
                 mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
                 mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
                 research_areas) = row

                if self.testMode and (strain_id not in self.test_ids) \
                        or mgi_gene_name == 'withdrawn':
                    continue

                # strip off stuff after the dash -
                # is the holding center important?
                # MMRRC:00001-UNC --> MMRRC:00001
                strain_id = re.sub(r'-\w+$', '', strain_id)

                self.id_label_hash[strain_id] = strain_label

                # get the variant or gene to save for later building of
                # the genotype
                if strain_id not in self.strain_hash:
                    self.strain_hash[strain_id] = {
                        'variants': set(),
                        'genes': set()
                    }

                # clean up the bad one
                if mgi_allele_id == 'multiple mutation':
                    logger.error("Erroneous gene id: %s", mgi_allele_id)
                    mgi_allele_id = ''

                if mgi_allele_id != '':
                    self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                    self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                    # use the following if needing to add the
                    # sequence alteration types
                    # var_type =
                    #   self._get_variant_type_from_abbrev(mutation_type)
                    # make a sequence alteration for this variant locus,
                    # and link the variation type to it
                    # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                    # if self.nobnodes:
                    #     sa_id = ':'+sa_id
                    # gu.addIndividualToGraph(g, sa_id, None, var_type)
                    # geno.addSequenceAlterationToVariantLocus(sa_id,
                    #                                          mgi_allele_id)

                # scrub out any spaces
                mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
                if mgi_gene_id.strip() != '':
                    if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                        mgi_gene_id = re.sub(r'Gene\s*ID:\s*', 'NCBIGene:',
                                             mgi_gene_id)
                    elif not re.match(r'MGI', mgi_gene_id):
                        logger.info("Gene id not recognized: %s", mgi_gene_id)
                        if re.match(r'\d+$', mgi_gene_id):
                            # assume that if it's all numbers, then it's MGI
                            mgi_gene_id = 'MGI:' + str(mgi_gene_id)
                            logger.info("Assuming numerics are MGI.")
                    self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                    self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

                # catch some errors -
                # some things have gene labels, but no identifiers - report
                if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                    logger.error(
                        "Gene label with no identifier for strain %s: %s",
                        strain_id, mgi_gene_symbol)
                    genes_with_no_ids.add(mgi_gene_symbol.strip())
                    # make a temp id for genes that aren't identified
                    # tmp_gene_id = '_'+mgi_gene_symbol
                    # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol
                    # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

                # split apart the mp ids
                # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
                # mp_ids are now a comma delimited list
                # with MP terms in brackets
                phenotype_ids = []
                if mp_ids != '':
                    for i in re.split(r',', mp_ids):
                        i = i.strip()
                        mps = re.search(r'\[(.*)\]', i)
                        if mps is not None:
                            mp_id = mps.group(1).strip()
                            phenotype_ids.append(mp_id)

                # pubmed ids are space delimited
                pubmed_ids = []
                if pubmed_nums.strip() != '':
                    for i in re.split(r'\s+', pubmed_nums):
                        pmid = 'PMID:' + i.strip()
                        pubmed_ids.append(pmid)
                        r = Reference(g, pmid,
                                      Reference.ref_types['journal_article'])
                        r.addRefToGraph()

                # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
                # is a good example of 4 genotype parts

                model.addClassToGraph(mouse_taxon, None)
                if research_areas.strip() == '':
                    research_areas = None
                else:
                    research_areas = 'Research Areas: ' + research_areas
                strain_type = mouse_taxon
                if strain_state == 'ES':
                    strain_type = stem_cell_class
                model.addIndividualToGraph(
                    strain_id, strain_label, strain_type,
                    research_areas)  # an inst of mouse??
                model.makeLeader(strain_id)

                # phenotypes are associated with the alleles
                for pid in phenotype_ids:
                    # assume the phenotype label is in the ontology
                    model.addClassToGraph(pid, None)
                    if mgi_allele_id is not None and mgi_allele_id != '':
                        assoc = G2PAssoc(
                            g, self.name, mgi_allele_id, pid,
                            model.object_properties['has_phenotype'])
                        for p in pubmed_ids:
                            assoc.add_source(p)
                        assoc.add_association_to_graph()
                    else:
                        logger.info("Phenotypes and no allele for %s",
                                    strain_id)

                if not self.testMode and (limit is not None
                                          and line_counter > limit):
                    break

            # now that we've collected all of the variant information, build it
            # we don't know their zygosities
            for s in self.strain_hash:
                h = self.strain_hash.get(s)
                variants = h['variants']
                genes = h['genes']
                vl_set = set()
                # type each listed variant as a variant locus
                if len(variants) > 0:
                    for v in variants:
                        vl_id = v
                        vl_symbol = self.id_label_hash[vl_id]
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        vl_set.add(vl_id)
                        if len(variants) == 1 and len(genes) == 1:
                            for gene in genes:
                                geno.addAlleleOfGene(vl_id, gene)
                        else:
                            geno.addAllele(vl_id, vl_symbol)
                else:  # len(variants) == 0
                    # it's just anonymous variants in some gene
                    for gene in genes:
                        vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
                        vl_symbol = self.id_label_hash[gene] + '<?>'
                        self.id_label_hash[vl_id] = vl_symbol
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        geno.addGene(gene, self.id_label_hash[gene])
                        geno.addAlleleOfGene(vl_id, gene)
                        vl_set.add(vl_id)

                # make the vslcs
                vl_list = sorted(vl_set)
                vslc_list = []
                for vl in vl_list:
                    # for unknown zygosity
                    vslc_id = re.sub(r'^_', '', vl) + 'U'
                    vslc_id = re.sub(r':', '', vslc_id)
                    vslc_id = '_:' + vslc_id
                    vslc_label = self.id_label_hash[vl] + '/?'
                    self.id_label_hash[vslc_id] = vslc_label
                    vslc_list.append(vslc_id)
                    geno.addPartsToVSLC(
                        vslc_id, vl, None, geno.zygosity['indeterminate'],
                        geno.object_properties['has_alternate_part'], None)
                    model.addIndividualToGraph(
                        vslc_id, vslc_label,
                        geno.genoparts['variant_single_locus_complement'])
                if len(vslc_list) > 0:
                    if len(vslc_list) > 1:
                        gvc_id = '-'.join(vslc_list)
                        gvc_id = re.sub(r'_|:', '', gvc_id)
                        gvc_id = '_:' + gvc_id
                        gvc_label = \
                            '; '.join(self.id_label_hash[v] for v in vslc_list)
                        model.addIndividualToGraph(
                            gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])
                        for vslc_id in vslc_list:
                            geno.addVSLCtoParent(vslc_id, gvc_id)
                    else:
                        # the GVC == VSLC, so don't have to make an extra piece
                        gvc_id = vslc_list.pop()
                        gvc_label = self.id_label_hash[gvc_id]

                    genotype_label = gvc_label + ' [n.s.]'
                    bkgd_id = \
                        re.sub(r':', '', '-'.join(
                            (geno.genoparts['unspecified_genomic_background'],
                             s)))
                    genotype_id = '-'.join((gvc_id, bkgd_id))
                    bkgd_id = '_:' + bkgd_id
                    geno.addTaxon(mouse_taxon, bkgd_id)
                    geno.addGenomicBackground(
                        bkgd_id, 'unspecified (' + s + ')',
                        geno.genoparts['unspecified_genomic_background'],
                        "A placeholder for the " +
                        "unspecified genetic background for " + s)
                    geno.addGenomicBackgroundToGenotype(
                        bkgd_id, genotype_id,
                        geno.genoparts['unspecified_genomic_background'])
                    geno.addParts(gvc_id, genotype_id,
                                  geno.object_properties['has_alternate_part'])
                    geno.addGenotype(genotype_id, genotype_label)
                    g.addTriple(s, geno.object_properties['has_genotype'],
                                genotype_id)
                else:
                    # logger.debug(
                    #   "Strain %s is not making a proper genotype.", s)
                    pass

            logger.warning(
                "The following gene symbols did not list identifiers: %s",
                str(sorted(list(genes_with_no_ids))))

        return
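
    # A self-contained restatement of the MP-id splitting done in the loop
    # above, kept as a hypothetical helper for documentation (the parser does
    # not call it). It assumes the comma-delimited "label [MP:nnnnnnn]"
    # format noted in the inline comments, e.g.
    # "ataxia [MP:0001393] ,hypoactivity [MP:0001402]"
    #   -> ['MP:0001393', 'MP:0001402']
    @staticmethod
    def _split_mp_ids(mp_ids):
        import re
        phenotype_ids = []
        for chunk in re.split(r',', mp_ids):
            mps = re.search(r'\[(.*)\]', chunk.strip())
            if mps is not None:
                phenotype_ids.append(mps.group(1).strip())
        return phenotype_ids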

    @staticmethod
    def _get_variant_type_from_abbrev(abbrev):
        """
        All variants are generically typed as "sequence_alterations"
        unless otherwise stated.
        :param abbrev:
        :return:

        """
        variant_type = None

        var_dict = {
            'SM': 'SO:0001059',  # spontaneous mutation
            'TM': 'SO:0001059',  # targeted mutation
            'TG': 'SO:xxxxxxx',  # transgenic (placeholder; no SO id assigned)
            'GT': 'SO:0001059',  # gene trap
            'CI': 'SO:0001059',  # chemically induced mutation
            'RAD': 'SO:0001059',  # radiation induced mutation
            # chromosomal aberration --> chromosomal structure variation
            'CH': 'SO:1000183',
            'RB': 'SO:1000043',  # Robertsonian translocation
            'TL': 'SO:1000048',  # reciprocal translocation
            'TP': 'SO:0000453',  # transposition
            'INV': 'SO:1000036',  # inversion
            'INS': 'SO:0000667',  # insertion
            'DEL': 'SO:0000159',  # deletion
            'DP': 'SO:1000035',  # duplication
            'OTH': 'SO:0001059'  # other
        }
        if abbrev in var_dict:
            variant_type = var_dict[abbrev]
        else:
            logger.warning("Variant type not recognized: %s", abbrev)

        return variant_type
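
    # e.g. _get_variant_type_from_abbrev('INV') -> 'SO:1000036' (inversion);
    # unrecognized abbreviations log a warning and return None.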

    def getTestSuite(self):
        import unittest
        from tests.test_mmrrc import MMRRCTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(MMRRCTestCase)

        return test_suite
Example #11
class HPOAnnotations(Source):
    """
    The [Human Phenotype Ontology](http://human-phenotype-ontology.org) group
    curates and assembles over 115,000 annotations to hereditary diseases
    using the HPO ontology. Here we create OBAN-style associations
    between diseases and phenotypic features, together with their evidence,
    and age of onset and frequency (if known).
    The parser currently only processes the "abnormal" annotations.
    Association to "remarkable normality" will be added in the near future.

    We create additional associations from text mining.  See info at
    http://pubmed-browser.human-phenotype-ontology.org/.

    Also, you can read about these annotations in
    [PMID:26119816](http://www.ncbi.nlm.nih.gov/pubmed/26119816).

    In order to properly test this class,
    you should have a conf.json file configured with some test ids, in
    the structure of:
    # as examples.  put your favorite ids in the config.
    <pre>
    test_ids: {"disease" : ["OMIM:119600", "OMIM:120160"]}
    </pre>

    """

    files = {
        'annot': {
            'file': 'phenotype_annotation.tab',
            'url': HPOADL + '/phenotype_annotation.tab'
        },
        'version': {
            'file': 'data_version.txt',
            'url': HPOADL + '/data_version.txt'
        },
        # 'neg_annot': {
        #   'file': 'phenotype_annotation.tab',
        #    'url': HPOADL + '/negative_phenotype_annotation.tab'},
        'doid': {
            'file': 'doid.owl',
            'url': 'http://purl.obolibrary.org/obo/doid.owl'
        }
    }

    # note, two of these codes are awaiting term requests.  see #114 and
    # https://code.google.com/p/evidenceontology/issues/detail?id=32
    # TODO TEC see if the GC issue translated into a GH issue
    eco_dict = {
        # FIXME currently using "curator inference used in manual assertion"
        "ICE": "ECO:0000305",
        # Inferred from Electronic Annotation
        "IEA": "ECO:0000501",
        # FIXME currently is"experimental evidence used in manual assertion"
        "PCS": "ECO:0000269",
        # Traceable Author Statement
        "TAS": "ECO:0000304",
        # FIXME currently using computational combinatorial evidence
        # in automatic assertion
        "ITM": "ECO:0000246",
    }

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'hpoa')

        self.dataset = Dataset(
            'hpoa', 'Human Phenotype Ontology',
            'http://www.human-phenotype-ontology.org', None,
            'http://www.human-phenotype-ontology.org/contao/index.php/legal-issues.html'
        )

        self.replaced_id_count = 0

        if 'test_ids' not in config.get_config()\
                or 'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_ids = []
        else:
            self.test_ids = config.get_config()['test_ids']['disease']

        # data-source specific warnings to be removed when issues are cleared
        logger.warning(
            "note that some ECO classes are missing for ICE, PCS, and ITM;" +
            " using temporary mappings.")

        return

    def fetch(self, is_dl_forced=False):

        self.get_files(is_dl_forced)

        self.scrub()

        # get the latest build from jenkins

        # use the files['version'] file as the version
        fname = '/'.join((self.rawdir, self.files['version']['file']))

        with open(fname, 'r', encoding="utf8") as f:
            # 2015-04-23 13:01
            v = f.readline()  # read the first line (the only line, really)
            d = datetime.strptime(v.strip(),
                                  '%Y-%m-%d %H:%M').strftime("%Y-%m-%d-%H-%M")

        st = os.stat(fname)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")

        # this will cause two dates to be attached to the dataset
        # (one from the filedate, and the other from here)
        # TODO when #112 is implemented,
        # this will result in only the whole dataset being versioned
        self.dataset.setVersion(filedate, d)

        self.get_common_files()

        return

    def scrub(self):
        """
        Perform various data-scrubbing on the raw data files prior to parsing.
        For this resource, this currently includes:
        * revise errors in identifiers for some OMIM and PMIDs

        :return: None

        """

        # scrub file of the oddities...lots of publication rewriting
        f = '/'.join((self.rawdir, self.files['annot']['file']))
        logger.info('scrubbing PubMed:12345 --> PMID:12345')
        pysed.replace(r'PubMed:', 'PMID:', f)

        logger.info('scrubbing pmid:12345 --> PMID:12345')
        pysed.replace(r'pmid:', 'PMID:', f)

        logger.info('scrubbing PMID:    12345 --> PMID:12345')
        pysed.replace(r'PMID:  *', 'PMID:', f)

        logger.info('scrubbing PMID12345 --> PMID:12345')
        pysed.replace(r'PMID([0-9][0-9]*)', r'PMID:\1', f)

        logger.info('scrubbing MIM12345 --> OMIM:12345')
        pysed.replace(r'MIM([0-9][0-9]*)', r'OMIM:\1', f)

        logger.info('scrubbing MIM:12345 --> OMIM:12345')
        pysed.replace(r";MIM", ";OMIM", f)

        logger.info('scrubbing ORPHANET --> Orphanet')
        pysed.replace("ORPHANET", "Orphanet", f)

        logger.info('scrubbing ORPHA --> Orphanet')
        pysed.replace("ORPHA", "Orphanet", f)

        return

    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows", limit)

        self.add_common_files_to_file_list()

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        # rare disease-phenotype associations
        self._process_phenotype_tab(
            '/'.join((self.rawdir, self.files['annot']['file'])), limit)

        # TODO add negative phenotype statements #113
        # self._process_negative_phenotype_tab(self.rawfile,self.outfile,limit)

        # common disease-phenotype associations from text mining work
        self.process_all_common_disease_files(limit)

        logger.info("Finished parsing.")

        return

    def _map_evidence_to_codes(self, code_string):
        """
        A simple mapping of the code_string to its ECO class
        using the dictionary defined here.
        Currently includes ICE, IEA, PCS, TAS, and ITM.
        :param code_string:
        :return:

        """
        return self.eco_dict.get(code_string)
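
    # e.g. _map_evidence_to_codes('TAS') -> 'ECO:0000304'; unmapped codes
    # return None, and callers fall back to the ITM mapping (see
    # process_common_disease_file below).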

    def _process_phenotype_tab(self, raw, limit):
        """
        see info on format here:
        http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

        :param raw:
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        line_counter = 0
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                row = [str(col).strip() for col in row]
                (db, num, name, qual, pheno_id, publist, eco, onset, freq, w,
                 asp, syn, date, curator) = row
                disease_id = db + ":" + num

                if self.testMode:
                    try:
                        id_list = self.test_ids
                        if id_list is None \
                                or disease_id not in id_list:
                            continue
                    except AttributeError:
                        continue

                # logger.info('adding %s', disease_id)

                model.addClassToGraph(disease_id, None)
                model.addClassToGraph(pheno_id, None)
                eco_id = self._map_evidence_to_codes(eco)
                model.addClassToGraph(eco_id, None)
                if onset is not None and onset != '':
                    model.addClassToGraph(onset, None)

                # we want to do things differently depending on
                # the aspect of the annotation
                # TODO PYLINT Redefinition of assoc type from
                #   dipper.models.assoc.D2PAssoc.D2PAssoc to
                #   dipper.models.assoc.DispositionAssoc.DispositionAssoc
                if asp == 'O' or asp == 'M':  # organ abnormality or mortality
                    assoc = D2PAssoc(g, self.name, disease_id, pheno_id, onset,
                                     freq)
                elif asp == 'I':  # inheritance patterns for the whole disease
                    assoc = DispositionAssoc(g, self.name, disease_id,
                                             pheno_id)
                elif asp == 'C':  # clinical course / onset
                    assoc = DispositionAssoc(g, self.name, disease_id,
                                             pheno_id)
                else:
                    logger.error("I don't know what this aspect is: %s", asp)
                    continue  # assoc would be undefined below; skip the row

                assoc.add_evidence(eco_id)

                publist = re.split(r'[,;]', publist)
                # blow these apart if there is a list of pubs
                for pub in publist:
                    pub = pub.strip()
                    pubtype = None
                    if pub != '':
                        # if re.match(
                        #       r'http://www.ncbi.nlm.nih.gov/bookshelf/br\.fcgi\?book=gene',
                        #        pub):
                        #     #http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ced
                        #     m = re.search(r'part\=(\w+)', pub)
                        #     pub_id = 'GeneReviews:'+m.group(1)
                        # elif re.search(
                        #        r'http://www.orpha.net/consor/cgi-bin/OC_Exp\.php\?lng\=en\&Expert\=',
                        #        pub):
                        #     m = re.search(r'Expert=(\d+)', pub)
                        #     pub_id = 'Orphanet:'+m.group(1)

                        if re.match(r'(PMID|ISBN-13|ISBN-10|ISBN|HPO)', pub):
                            if re.match(r'PMID', pub):
                                pubtype = \
                                    Reference.ref_types['journal_article']
                            elif re.match(r'HPO', pub):
                                pubtype = Reference.ref_types['person']
                            else:
                                pubtype = Reference.ref_types['publication']
                            r = Reference(g, pub, pubtype)
                            r.addRefToGraph()
                        elif re.match(r'(OMIM|Orphanet|DECIPHER)', pub):
                            # make the pubs a reference to the website,
                            # instead of the curie
                            if re.match(r'OMIM', pub):
                                omimnum = re.sub(r'OMIM:', '', pub)
                                omimurl = '/'.join(('http://omim.org/entry',
                                                    str(omimnum).strip()))
                                pub = omimurl
                            elif re.match(r'Orphanet:', pub):
                                orphanetnum = re.sub(r'Orphanet:', '', pub)
                                orphaneturl = \
                                    ''.join((
                                        'http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert=',
                                        str(orphanetnum)))
                                pub = orphaneturl
                            elif re.match(r'DECIPHER:', pub):
                                deciphernum = re.sub(r'DECIPHER:', '', pub)
                                decipherurl = '/'.join(
                                    ('https://decipher.sanger.ac.uk/syndrome',
                                     deciphernum))
                                pub = decipherurl
                            pubtype = Reference.ref_types['webpage']
                        elif re.match(r'http', pub):
                            pass
                        else:
                            logger.error('Unknown pub type for %s: %s',
                                         disease_id, pub)
                            print(disease_id, 'pubs:', str(publist))
                            continue

                        if pub is not None:
                            assoc.add_source(pub)

                        # TODO add curator

                assoc.add_association_to_graph()

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
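
    # The curie-to-URL rewriting above, restated as a hypothetical
    # table-style helper for documentation: OMIM/Orphanet/DECIPHER pubs are
    # turned into website references rather than curies.
    @staticmethod
    def _pub_to_url(pub):
        import re
        if re.match(r'OMIM:', pub):
            return 'http://omim.org/entry/' + \
                re.sub(r'OMIM:', '', pub).strip()
        if re.match(r'Orphanet:', pub):
            return 'http://www.orpha.net/consor/cgi-bin/OC_Exp.php' + \
                '?lng=en&Expert=' + re.sub(r'Orphanet:', '', pub)
        if re.match(r'DECIPHER:', pub):
            return 'https://decipher.sanger.ac.uk/syndrome/' + \
                re.sub(r'DECIPHER:', '', pub)
        return pub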

    def get_common_files(self):
        """
        Fetch the raw hpo-annotation-data by cloning/pulling the
        [repository](https://github.com/monarch-initiative/hpo-annotation-data.git)
        These files get added to the files object,
        and iterated over separately.
        :return:

        """

        repo_dir = '/'.join((self.rawdir, 'git'))
        REMOTE_URL = \
            "git@github.com:monarch-initiative/hpo-annotation-data.git"
        HTTPS_URL = \
            "https://github.com/monarch-initiative/hpo-annotation-data.git"

        # TODO pull instead of re-cloning when the repo already exists;
        # for now, remove any existing copy and clone fresh
        if os.path.isdir(repo_dir):
            shutil.rmtree(repo_dir)

        logger.info("Cloning common disease files from %s", REMOTE_URL)
        try:
            Repo.clone_from(REMOTE_URL, repo_dir)
        except GitCommandError:
            # Try with https and if this doesn't work fail
            Repo.clone_from(HTTPS_URL, repo_dir)

        return

    def add_common_files_to_file_list(self):
        repo_dir = '/'.join((self.rawdir, 'git'))
        common_disease_dir = '/'.join((repo_dir, 'common-diseases'))

        # add the files to the self.files object
        filelist = os.listdir(common_disease_dir)
        fcount = 0
        for f in filelist:
            if not re.search(r'\.tab', f):
                continue
            fcount += 1
            self.files['common' + str(fcount).zfill(7)] = {
                'file': '/'.join((common_disease_dir, f)),
                # TODO add url to reference the file?
                # need to get git version somehow?
            }
            # TODO add this to the dataset
        logger.info("Found %d common disease files", fcount)

        return

    def process_all_common_disease_files(self, limit=None):
        """
        Loop through all of the files that we previously fetched from git,
        creating the disease-phenotype assoc.
        :param limit:
        :return:

        """

        self.replaced_id_count = 0
        unpadded_doids = self.get_doid_ids_for_unpadding()
        total_processed = 0
        logger.info("Iterating over all common disease files")
        common_file_count = 0
        for f in self.files:
            if not re.match(r'common', f):
                continue
            common_file_count += 1
            raw = self.files[f]['file']
            total_processed += self.process_common_disease_file(
                raw, unpadded_doids, limit)
            if not self.testMode \
                    and limit is not None and total_processed > limit:
                break
        logger.info("Finished iterating over all common disease files.")
        logger.info("Fixed %d/%d incorrectly zero-padded ids",
                    self.replaced_id_count, common_file_count)
        return

    def get_doid_ids_for_unpadding(self):
        """
        Here, we fetch the doid owl file, and get all the doids.
        We figure out which are not zero-padded, so we can map the DOID
        to the correct identifier when processing the common annotation files.

        This may become obsolete when
        https://github.com/monarch-initiative/hpo-annotation-data/issues/84
        is addressed.

        :return:

        """

        logger.info("Building list of non-zero-padded DOIDs")
        raw_file = '/'.join((self.rawdir, self.files['doid']['file']))
        doids = set()
        # scan the file and get all doids
        with open(raw_file, 'r', encoding="utf8") as f:
            for line in f:
                matches = re.search(r'(DOID_\d+)', line)
                if matches is not None:
                    for m in matches.groups():
                        doids.add(re.sub(r'_', ':', m))

        nopad_doids = set()
        for d in doids:
            num = re.sub(r'DOID[:_]', '', d)
            # look for things not starting with zero
            if not re.match(r'^0', str(num)):
                nopad_doids.add(num)

        logger.info("Found %d/%d DOIDs are not zero-padded", len(nopad_doids),
                    len(doids))

        return nopad_doids
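
    # A worked example of the unpadding decision made with the set returned
    # above: 'DOID:0014667' becomes 'DOID:14667' only if '14667' is known to
    # exist unpadded. `_unpad_doid` is a hypothetical helper restating the
    # logic inside process_common_disease_file().
    @staticmethod
    def _unpad_doid(disease_id, unpadded_doids):
        import re
        num = re.sub(r'DOID:', '', disease_id).lstrip('0')
        if num in unpadded_doids:
            return 'DOID:' + num
        return disease_id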

    def process_common_disease_file(self, raw, unpadded_doids, limit=None):
        """
        Make disease-phenotype associations.
        Some identifiers need clean up:
        * DOIDs are listed as DOID-DOID: --> DOID:
        * DOIDs may be unnecessarily zero-padded;
          these are remapped to their non-padded equivalent.

        :param raw:
        :param unpadded_doids:
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        assoc_count = 0
        replace_id_flag = False

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            header = csvfile.readline()  # skip the header row
            logger.info("HEADER: %s", header)
            disease_id = None
            for row in filereader:

                if 21 == len(row):
                    (did, dname, gid, gene_name, genotype, gene_symbols,
                     phenotype_id, phenotype_name, age_of_onset_id,
                     age_of_onset_name, eid, evidence_name, frequency, sex_id,
                     sex_name, negation_id, negation_name, description,
                     pub_ids, assigned_by,
                     date_created) = [str(col).strip() for col in row]
                else:
                    logger.warning(
                        "Wrong number of columns! expected 21, got: %s in: %s",
                        len(row), raw)
                    logger.warning("%s", row)
                    continue
                # b/c "PMID:    17223397"
                pub_ids = re.sub(r'  *', '', pub_ids)

                disease_id = re.sub(r'DO(ID)?[-\:](DOID:)?', 'DOID:', did)
                disease_id = re.sub(r'MESH-', 'MESH:', disease_id)
                if not re.search(r'(DOID\:|MESH\:\w)\d+', disease_id):
                    logger.warning("Invalid id format: %s", disease_id)

                # figure out if the doid should be unpadded,
                # then use the unpadded version instead
                if re.match(r'DOID', disease_id):
                    unpadded_num = re.sub(r'DOID:', '', disease_id)
                    unpadded_num = unpadded_num.lstrip('0')
                    if unpadded_num in unpadded_doids:
                        fixed_id = 'DOID:' + unpadded_num
                        replace_id_flag = True
                        disease_id = fixed_id.strip()

                if self.testMode and disease_id not in self.test_ids:
                    # since these are broken up into disease-by-disease,
                    # just skip the whole file
                    return 0
                else:
                    line_counter += 1

                if negation_id != '':
                    continue  # TODO add negative associations

                if disease_id != '' and phenotype_id != '':
                    assoc = D2PAssoc(g, self.name, disease_id,
                                     phenotype_id.strip())
                    if age_of_onset_id != '':
                        assoc.onset = age_of_onset_id
                    if frequency != '':
                        assoc.frequency = frequency
                    eco_id = self._map_evidence_to_codes(eid)
                    if eco_id is None:
                        eco_id = self._map_evidence_to_codes('ITM')
                    assoc.add_evidence(eco_id)
                    # TODO add sex? - not in dataset yet
                    if description != '':
                        assoc.set_description(description)
                    if pub_ids != '':
                        for p in pub_ids.split(';'):
                            p = re.sub(r'  *', '', p)
                            if re.search(r'(DOID|MESH)', p) \
                                    or re.search(r'Disease name contained',
                                                 description):
                                # skip "pubs" that are derived from
                                # the classes themselves
                                continue
                            assoc.add_source(p.strip())
                    # TODO assigned by?

                    assoc.add_association_to_graph()
                    assoc_count += 1

                if not self.testMode and limit is not None\
                        and line_counter > limit:
                    break

            if replace_id_flag:
                logger.info("replaced DOID with unpadded version")
                self.replaced_id_count += 1
            logger.info("Added %d associations for %s.", assoc_count,
                        disease_id)
        return assoc_count
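
    # The id-prefix normalization at the top of the row loop, as a
    # hypothetical standalone helper: 'DO-DOID:162' and 'DOID-162' both
    # become 'DOID:162', and 'MESH-D009369' becomes 'MESH:D009369'.
    @staticmethod
    def _normalize_disease_prefix(did):
        import re
        disease_id = re.sub(r'DO(ID)?[-\:](DOID:)?', 'DOID:', did)
        return re.sub(r'MESH-', 'MESH:', disease_id)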

    def getTestSuite(self):
        import unittest
        from tests.test_hpoa import HPOATestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(HPOATestCase)

        return test_suite
Example #12
class Coriell(Source):
    """
    The Coriell Catalog provided to Monarch includes metadata and descriptions
    of NIGMS, NINDS, NHGRI, and NIA cell lines.  These lines are made available
    for research purposes. Here, we create annotations for the cell lines as
    models of the diseases from which they originate.

    We create a handle for a patient from which the given cell line is derived
    (since there may be multiple cell lines created from a given patient).
    A genotype is assembled for a patient, which includes a karyotype
    (if specified) and/or a collection of variants.
    Both the genotype (has_genotype) and disease are linked to the patient
    (has_phenotype), and the cell line is listed as derived from the patient.
    The cell line is classified by its
    [CLO cell type](http://www.ontobee.org/browser/index.php?o=clo),
    which itself is linked to a tissue of origin.

    Unfortunately, the omim numbers listed in this file are both for genes
    & diseases; we have no way of knowing a priori if a designated omim number
    is a gene or disease; so we presently link the patient to any omim id via
    the has_phenotype relationship.

    Notice: The Coriell catalog is delivered to Monarch in a specific format,
    and requires ssh rsa fingerprint identification.  Other groups wishing to
    get this data in its raw form will need to contact Coriell for
    credentials, which must be placed into your configuration file to work.

    """

    terms = {
        'cell_line_repository': 'CLO:0000008',
        'race': 'SIO:001015',
        'ethnic_group': 'EFO:0001799',
        'age': 'EFO:0000246',
        'sampling_time': 'EFO:0000689',
        'collection': 'ERO:0002190'
    }

    files = {
        'NINDS': {
            'file': 'NINDS.csv',
            'id': 'NINDS',
            'label': 'NINDS Human Genetics DNA and Cell line Repository',
            'page': 'https://catalog.coriell.org/1/NINDS'
        },
        'NIGMS': {
            'file': 'NIGMS.csv',
            'id': 'NIGMS',
            'label': 'NIGMS Human Genetic Cell Repository',
            'page': 'https://catalog.coriell.org/1/NIGMS'
        },
        'NIA': {
            'file': 'NIA.csv',
            'id': 'NIA',
            'label': 'NIA Aging Cell Repository',
            'page': 'https://catalog.coriell.org/1/NIA'
        },
        'NHGRI': {
            'file': 'NHGRI.csv',
            'id': 'NHGRI',
            'label': 'NHGRI Sample Repository for Human Genetic Research',
            'page': 'https://catalog.coriell.org/1/NHGRI'
        }
    }

    # the following will house the specific cell lines to use for test output
    test_lines = [
        'ND02380', 'ND02381', 'ND02383', 'ND02384', 'GM17897', 'GM17898',
        'GM17896', 'GM17944', 'GM17945', 'ND00055', 'ND00094', 'ND00136',
        'GM17940', 'GM17939', 'GM20567', 'AG02506', 'AG04407', 'AG07602',
        'AG07601', 'GM19700', 'GM19701', 'GM19702', 'GM00324', 'GM00325',
        'GM00142', 'NA17944', 'AG02505', 'GM01602', 'GM02455', 'AG00364',
        'GM13707', 'AG00780'
    ]

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'coriell')

        self.dataset = Dataset('coriell', 'Coriell', 'http://ccr.coriell.org/',
                               None)

        # data-source specific warnings
        # (will be removed when issues are cleared)

        logger.warning('We assume that if a species is not provided, '
                       'that it is a Human-derived cell line')
        logger.warning('We map all omim ids as a disease/phenotype entity, '
                       'but should be fixed in the future')  # TODO

        # check if config exists; if it doesn't, error out and let user know
        if 'dbauth' not in config.get_config() or \
                'coriell' not in config.get_config()['dbauth']:
            logger.error("not configured with FTP user/password.")

        return

    def fetch(self, is_dl_forced=False):
        """
        Here we connect to the coriell sftp server using private connection
        details.  They dump bi-weekly files with a timestamp in the filename.
        For each catalog, we poll the remote site and pull the most-recently
        updated file, renaming it to our local latest.csv.

        Be sure to have the sftp user/password connection details in your
        conf.json file, like:
        dbauth : {"coriell" : {
        "user" : "<username>", "password" : "<password>",
        "host" : "<host>", "private_key" : "/path/to/rsa_key"}
        }

        :param is_dl_forced:
        :return:

        """

        host = config.get_config()['dbauth']['coriell']['host']
        user = config.get_config()['dbauth']['coriell']['user']
        passwd = config.get_config()['dbauth']['coriell']['password']
        key = config.get_config()['dbauth']['coriell']['private_key']

        with pysftp.Connection(host,
                               username=user,
                               password=passwd,
                               private_key=key) as sftp:
            # check to make sure each file is in there
            # get the remote files
            remote_files = sftp.listdir_attr()
            files_by_repo = {}
            for attr in remote_files:
                # for each catalog, get the most-recent filename
                m = re.match('(NIGMS|NIA|NHGRI|NINDS)', attr.filename)
                if m is not None and len(m.groups()) > 0:
                    # there should just be one now
                    files_by_repo[m.group(1)] = attr
            # sort each array in hash,
            # & get the name and time of the most-recent file for each catalog
            for r in self.files:
                logger.info("Checking on %s catalog file", r)
                fname = self.files[r]['file']
                remotef = files_by_repo[r]
                target_name = '/'.join((self.rawdir, fname))
                # check if the local file is out of date, if so, download.
                # otherwise, skip.
                # we rename (for simplicity) the original file
                st = None
                if os.path.exists(target_name):
                    st = os.stat(target_name)
                    logger.info("Local file date: %s",
                                datetime.utcfromtimestamp(st[stat.ST_CTIME]))
                if st is None or remotef.st_mtime > st[stat.ST_CTIME]:
                    if st is None:
                        logger.info(
                            "File does not exist locally; downloading...")
                    else:
                        logger.info(
                            "There's a new version of %s catalog available; "
                            "downloading...", r)
                    sftp.get(remotef.filename, target_name)
                    logger.info("Fetched remote %s -> %s", remotef.filename,
                                target_name)
                    st = os.stat(target_name)
                    filedate = \
                        datetime.utcfromtimestamp(
                            remotef.st_mtime).strftime("%Y-%m-%d")
                    logger.info("New file date: %s",
                                datetime.utcfromtimestamp(st[stat.ST_CTIME]))

                else:
                    logger.info("File %s exists; using local copy", fname)
                    filedate = \
                        datetime.utcfromtimestamp(
                            st[stat.ST_CTIME]).strftime("%Y-%m-%d")

                self.dataset.setFileAccessUrl(remotef.filename, True)
                self.dataset.setVersion(filedate)
        return
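
    # fetch() assumes a single current dump per catalog on the remote side
    # ("there should just be one now").  If several timestamped dumps per
    # catalog reappeared, the newest could be chosen explicitly; a hedged
    # sketch over pysftp listdir_attr() entries (hypothetical helper, not
    # part of the original source):
    @staticmethod
    def _most_recent_by_repo(remote_files):
        import re
        files_by_repo = {}
        for attr in remote_files:
            m = re.match(r'(NIGMS|NIA|NHGRI|NINDS)', attr.filename)
            if m is None:
                continue
            repo = m.group(1)
            best = files_by_repo.get(repo)
            if best is None or attr.st_mtime > best.st_mtime:
                files_by_repo[repo] = attr
        return files_by_repo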

    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        for f in self.files:
            file = '/'.join((self.rawdir, self.files[f]['file']))
            self._process_collection(self.files[f]['id'],
                                     self.files[f]['label'],
                                     self.files[f]['page'])
            self._process_data(file, limit)

        logger.info("Finished parsing.")
        return

    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:

        """

        logger.info("Processing Data from %s", raw)

        if self.testMode:  # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        family = Family(g)
        model = Model(g)

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    continue  # skip blank rows
                else:
                    line_counter += 1

                    (catalog_id, description, omim_number, sample_type,
                     cell_line_available, dna_in_stock, dna_ref, gender, age,
                     race, ethnicity, affected, karyotype, relprob, mutation,
                     gene, family_id, collection, url, cat_remark, pubmed_ids,
                     family_member, variant_id, dbsnp_id, species) = row

                    # example:
                    # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                    # parent,,,39,NIGMS Human Genetic Cell Repository,
                    # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                    # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                    # 2,,18343,Homo sapiens

                    if self.testMode and catalog_id not in self.test_lines:
                        # skip rows not in our test lines, when in test mode
                        continue

                    # ###########    BUILD REQUIRED VARIABLES    ###########

                    # Make the cell line ID
                    cell_line_id = 'Coriell:' + catalog_id.strip()

                    # Map the cell/sample type
                    cell_type = self._map_cell_type(sample_type)

                    # Make a cell line label
                    line_label = \
                        collection.partition(' ')[0]+'-'+catalog_id.strip()

                    # Map the repository/collection
                    repository = self._map_collection(collection)

                    # patients are uniquely identified by one of:
                    # dbsnp id (which is == an individual haplotype)
                    # family id + family member (if present) OR
                    # probands are usually family member zero
                    # cell line id
                    # since some patients have >1 cell line derived from them,
                    # we must make sure that the genotype is attached to
                    # the patient, and can be inferred to the cell line
                    # examples of repeated patients are:
                    #   famid=1159, member=1; fam=152,member=1

                    # Make the patient ID

                    # make an anonymous patient
                    patient_id = '_:person'
                    if family_id != '':
                        patient_id = \
                            '-'.join((patient_id, family_id, family_member))
                    else:
                        # make an anonymous patient
                        patient_id = '-'.join((patient_id, catalog_id.strip()))

                    # properties of the individual patients:  sex, family id,
                    # member/relproband, description descriptions are
                    # really long and ugly SCREAMING text, so need to clean up
                    # the control cases are so odd with this labeling scheme;
                    # but we'll deal with it as-is for now.
                    short_desc = (description.split(';')[0]).capitalize()
                    if affected == 'Yes':
                        affected = 'affected'
                    elif affected == 'No':
                        affected = 'unaffected'
                    gender = gender.lower()
                    patient_label = ' '.join((affected, gender, relprob))
                    if relprob == 'proband':
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'with', short_desc))
                    else:
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'of proband with',
                                 short_desc))

                    # #############    BUILD THE CELL LINE    #############

                    # Adding the cell line as a typed individual.
                    cell_line_reagent_id = 'CLO:0000031'

                    model.addIndividualToGraph(cell_line_id, line_label,
                                               cell_line_reagent_id)

                    # add the equivalent id == dna_ref
                    if dna_ref != '' and dna_ref != catalog_id:
                        equiv_cell_line = 'Coriell:' + dna_ref
                        # some of the equivalent ids are not defined
                        # in the source data; so add them
                        model.addIndividualToGraph(equiv_cell_line, None,
                                                   cell_line_reagent_id)
                        model.addSameIndividual(cell_line_id, equiv_cell_line)

                    # Cell line derives from patient
                    geno.addDerivesFrom(cell_line_id, patient_id)
                    geno.addDerivesFrom(cell_line_id, cell_type)

                    # Cell line a member of repository
                    family.addMember(repository, cell_line_id)

                    if cat_remark != '':
                        model.addDescription(cell_line_id, cat_remark)

                    # Cell age_at_sampling
                    # TODO add the age nodes when modeled properly in #78
                    # if (age != ''):
                    # this would give a BNode that is an instance of Age.
                    # but i don't know how to connect
                    # the age node to the cell line? we need to ask @mbrush
                    # age_id = '_'+re.sub('\s+','_',age)
                    # gu.addIndividualToGraph(
                    #   g,age_id,age,self.terms['age'])
                    # gu.addTriple(
                    #   g,age_id,self.properties['has_measurement'],age,
                    #   True)

                    # #############    BUILD THE PATIENT    #############

                    # Add the patient ID as an individual.
                    model.addPerson(patient_id, patient_label)
                    # TODO map relationship to proband as a class
                    # (what ontology?)

                    # Add race of patient
                    # FIXME: Adjust for subcategories based on ethnicity field
                    # EDIT: There are 743 different entries for ethnicity...
                    # Too many to map?
                    # Add ethnicity as literal in addition to the mapped race?
                    # Adjust the ethnicity txt (if using)
                    # to initial capitalization to remove ALLCAPS

                    # TODO race should go into the individual's background
                    # and abstracted out to the Genotype class punting for now.
                    # if race != '':
                    #    mapped_race = self._map_race(race)
                    #    if mapped_race is not None:
                    #        gu.addTriple(
                    #           g,patient_id,self.terms['race'],mapped_race)
                    #        model.addSubClass(
                    #           mapped_race,self.terms['ethnic_group'])

                    # #############    BUILD THE FAMILY    #############

                    # Add triples for family_id, if present.
                    if family_id != '':
                        family_comp_id = 'CoriellFamily:' + family_id

                        family_label = \
                            ' '.join(('Family of proband with', short_desc))

                        # Add the family ID as a named individual
                        model.addIndividualToGraph(family_comp_id,
                                                   family_label,
                                                   geno.genoparts['family'])

                        # Add the patient as a member of the family
                        family.addMemberOf(patient_id, family_comp_id)

                    # #############    BUILD THE GENOTYPE   #############

                    # the important things to pay attention to here are:
                    # karyotype = chr rearrangements  (somatic?)
                    # mutation = protein-level mutation as a label,
                    # often from omim
                    # gene = gene symbol - TODO get id
                    # variant_id = omim variant ids (; delimited)
                    # dbsnp_id = snp individual ids = full genotype?

                    # note GM00633 is a good example of chromosomal variation
                    # - do we have enough to capture this?
                    # GM00325 has both abnormal karyotype and variation

                    # make an assumption that if the taxon is blank,
                    # that it is human!
                    if species is None or species == '':
                        species = 'Homo sapiens'
                    taxon = self._map_species(species)

                    # if there's a dbSNP id,
                    # this is actually the individual's genotype
                    genotype_id = None
                    genotype_label = None
                    if dbsnp_id != '':
                        genotype_id = 'dbSNPIndividual:' + dbsnp_id.strip()

                    omim_map = {}
                    gvc_id = None

                    # some of the karyotypes are encoded
                    # with terrible hidden codes. remove them here
                    # i've seen a <98> character
                    karyotype = du.remove_control_characters(karyotype)
                    karyotype_id = None
                    if karyotype.strip() != '':
                        karyotype_id = \
                            '_:'+re.sub(
                                'MONARCH:', '', self.make_id(karyotype))
                        # add karyotype as karyotype_variation_complement
                        model.addIndividualToGraph(
                            karyotype_id, karyotype,
                            geno.genoparts['karyotype_variation_complement'])
                        # TODO break down the karyotype into parts
                        # and map into GENO. depends on #77

                        # place the karyotype in a location(s).
                        karyo_chrs = \
                            self._get_affected_chromosomes_from_karyotype(
                                karyotype)
                        for c in karyo_chrs:
                            chr_id = makeChromID(c, taxon, 'CHR')
                            # add an anonymous sequence feature,
                            # each located on chr
                            karyotype_feature_id = '-'.join((karyotype_id, c))
                            karyotype_feature_label = \
                                'some karyotype alteration on chr'+str(c)
                            f = Feature(g, karyotype_feature_id,
                                        karyotype_feature_label,
                                        geno.genoparts['sequence_alteration'])
                            f.addFeatureStartLocation(None, chr_id)
                            f.addFeatureToGraph()
                            geno.addParts(
                                karyotype_feature_id, karyotype_id,
                                geno.object_properties['has_alternate_part'])

                    vl = None  # guard: gene may be blank for some rows
                    if gene != '':
                        vl = gene + '(' + mutation + ')'

                    # fix the variant_id so it's always in the same order
                    vids = variant_id.split(';')
                    variant_id = ';'.join(sorted(list(set(vids))))

                    if karyotype.strip() != '' \
                            and not self._is_normal_karyotype(karyotype):
                        mutation = mutation.strip()
                        gvc_id = karyotype_id
                        if variant_id != '':
                            gvc_id = \
                                '_:' + variant_id.replace(';', '-') + '-' \
                                + re.sub(r'\w*:', '', karyotype_id)
                        if mutation.strip() != '' and vl is not None:
                            gvc_label = '; '.join((vl, karyotype))
                        else:
                            gvc_label = karyotype
                    elif variant_id.strip() != '':
                        gvc_id = '_:' + variant_id.replace(';', '-')
                        gvc_label = vl
                    else:
                        # wildtype?
                        pass

                    # add the karyotype to the gvc.
                    # use reference if normal karyotype
                    karyo_rel = geno.object_properties['has_alternate_part']
                    if self._is_normal_karyotype(karyotype):
                        karyo_rel = \
                            geno.object_properties['has_reference_part']
                    if karyotype_id is not None \
                            and not self._is_normal_karyotype(karyotype) \
                            and gvc_id is not None and karyotype_id != gvc_id:
                        geno.addParts(karyotype_id, gvc_id, karyo_rel)

                    if variant_id.strip() != '':
                        # split the variants & add them as part of the
                        # genotype. we don't necessarily know their zygosity,
                        # just that they are part of the genotype.
                        # variant ids are from OMIM, so prefix them as such;
                        # we assume the sequence alterations are defined
                        # in OMIM, not here.
                        # TODO sort the variant_id list; if the omim prefix
                        # is the same, assume it is the same locus. build a
                        # map of omim id to variant list, then build the
                        # genotype; the map is also useful for removing the
                        # "genes" from the list of "phenotypes"

                        # will hold gene/locus id to variant list
                        omim_map = {}

                        for v in variant_id.split(';'):
                            # handle omim-style and odd var ids
                            # like 610661.p.R401X
                            m = re.match(r'(\d+)\.+(.*)', v.strip())
                            if m is None or len(m.groups()) != 2:
                                # skip unparseable ids rather than reusing
                                # a stale locus/variant number
                                continue
                            (locus_num, var_num) = m.groups()
                            if locus_num not in omim_map:
                                omim_map[locus_num] = [var_num]
                            else:
                                omim_map[locus_num] += [var_num]
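                        # e.g. variant_id '603903.0001;603903.0004' (invented)
                        # yields omim_map == {'603903': ['0001', '0004']}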

                        for o in omim_map:
                            # gene_id = 'OMIM:' + o  # TODO unused
                            vslc_id = \
                                '_:' + '-'.join(
                                    [o + '.' + a for a in omim_map.get(o)])
                            vslc_label = vl
                            # we don't really know the zygosity of
                            # the alleles at all, so the vslc is just
                            # an unordered collection of them
                            model.addIndividualToGraph(
                                vslc_id, vslc_label,
                                geno.genoparts[
                                    'variant_single_locus_complement'])
                            for v in omim_map.get(o):
                                # this is actually a sequence alt
                                allele1_id = 'OMIM:' + o + '.' + v
                                geno.addSequenceAlteration(allele1_id, None)

                                # assume that the sa -> var_loc -> gene
                                # is taken care of in OMIM
                                geno.addPartsToVSLC(
                                    vslc_id, allele1_id, None,
                                    geno.zygosity['indeterminate'],
                                    geno.object_properties[
                                        'has_alternate_part'])

                            if vslc_id != gvc_id:
                                geno.addVSLCtoParent(vslc_id, gvc_id)

                    if affected == 'unaffected':
                        # let's just say that this person is wildtype
                        model.addType(patient_id, geno.genoparts['wildtype'])
                    elif genotype_id is None:
                        # make an anonymous genotype id
                        genotype_id = '_:geno' + catalog_id.strip()

                    # add the gvc
                    if gvc_id is not None:
                        model.addIndividualToGraph(
                            gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])

                        # add the gvc to the genotype
                        if genotype_id is not None:
                            if affected == 'unaffected':
                                rel = \
                                    geno.object_properties[
                                        'has_reference_part']
                            else:
                                rel = \
                                    geno.object_properties[
                                        'has_alternate_part']
                            geno.addParts(gvc_id, genotype_id, rel)
                        if karyotype_id is not None \
                                and self._is_normal_karyotype(karyotype):
                            if gvc_label is not None and gvc_label != '':
                                genotype_label = \
                                    '; '.join((gvc_label, karyotype))
                            else:
                                genotype_label = karyotype
                            if genotype_id is None:
                                genotype_id = karyotype_id
                            else:
                                geno.addParts(
                                    karyotype_id, genotype_id,
                                    geno.object_properties[
                                        'has_reference_part'])
                        else:
                            genotype_label = gvc_label
                        # use the catalog id as the genomic background
                        genotype_label += ' [' + catalog_id.strip() + ']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(genotype_id, genotype_label,
                                         geno.genoparts['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        g.addTriple(patient_id,
                                    geno.properties['has_genotype'],
                                    genotype_id)
                    else:
                        geno.addTaxon(taxon, patient_id)

                    # TODO: Add sex/gender  (as part of the karyotype?)

                    # #############    DEAL WITH THE DISEASES   #############

                    # we associate the disease to the patient
                    if affected == 'affected':
                        if omim_number != '':
                            for d in omim_number.split(';'):
                                if d is not None and d != '':
                                    # if the omim number is in omim_map,
                                    # then it is a gene not a pheno
                                    if d not in omim_map:
                                        disease_id = 'OMIM:' + d.strip()
                                        # assume the label is taken care of
                                        model.addClassToGraph(disease_id, None)

                                        # add the association:
                                        #   the patient has the disease
                                        assoc = G2PAssoc(
                                            g, self.name, patient_id,
                                            disease_id)
                                        assoc.add_association_to_graph()

                                        # this line is a model of this disease
                                        # TODO abstract out model into
                                        # its own association class?
                                        g.addTriple(
                                            cell_line_id,
                                            model.object_properties[
                                                'model_of'],
                                            disease_id)
                                    else:
                                        logger.info(
                                            'removing %s from disease list '
                                            'since it is a gene', d)

                    # #############    ADD PUBLICATIONS   #############

                    if pubmed_ids != '':
                        for s in pubmed_ids.split(';'):
                            pubmed_id = 'PMID:' + s.strip()
                            ref = Reference(g, pubmed_id)
                            ref.setType(Reference.ref_types['journal_article'])
                            ref.addRefToGraph()
                            g.addTriple(pubmed_id,
                                        model.object_properties['mentions'],
                                        cell_line_id)

                    if not self.testMode \
                            and (limit is not None and line_counter > limit):
                        break
        return

    def _process_collection(self, collection_id, label, page):
        """
        This function will process the data supplied internally
        about the repository from Coriell.

        Triples:
            Repository a ERO:collection
            rdf:label Literal(label)
            foaf:page Literal(page)

        :param collection_id:
        :param label:
        :param page:
        :return:
        """
        # #############    BUILD THE CELL LINE REPOSITORY    #############
        for graph in [self.graph, self.testgraph]:
            # TODO: How to devise a label for each repository?
            model = Model(graph)
            reference = Reference(graph)
            repo_id = 'CoriellCollection:' + collection_id
            repo_label = label
            repo_page = page

            model.addIndividualToGraph(repo_id, repo_label,
                                       self.terms['collection'])
            reference.addPage(repo_id, repo_page)

        return
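
    # A hypothetical usage sketch (identifier, label, and page invented):
    #   self._process_collection(
    #       'EXAMPLE', 'Example Cell Repository',
    #       'https://catalog.coriell.org/1/EXAMPLE')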

    @staticmethod
    def _map_cell_type(sample_type):
        ctype = None
        type_map = {
            # FIXME: mesenchymal stem cell of adipose
            'Adipose stromal cell': 'CL:0002570',
            # FIXME: amniocyte?
            'Amniotic fluid-derived cell line': 'CL:0002323',
            # B cell
            'B-Lymphocyte': 'CL:0000236',
            # FIXME: No Match
            'Chorionic villus-derived cell line': 'CL:0000000',
            # endothelial cell
            'Endothelial': 'CL:0000115',
            # epithelial cell
            'Epithelial': 'CL:0000066',
            # FIXME: No Match. "Abnormal precursor (virally transformed)
            # of mouse erythrocytes that can be grown in culture and
            # induced to differentiate by treatment with, for example, DMSO."
            'Erythroleukemic cell line': 'CL:0000000',
            'Fibroblast': 'CL:0000057',  # fibroblast
            'Keratinocyte': 'CL:0000312',  # keratinocyte
            'Melanocyte': 'CL:0000148',  # melanocyte
            'Mesothelial': 'CL:0000077',
            'Microcell hybrid': 'CL:0000000',  # FIXME: No Match
            'Myoblast': 'CL:0000056',  # myoblast
            'Smooth muscle': 'CL:0000192',  # smooth muscle cell
            'Stem cell': 'CL:0000034',  # stem cell
            'T-Lymphocyte': 'CL:0000084',  # T cell
            # FIXME: No Match. "Cells isolated from a mass of neoplastic cells,
            # i.e., a growth formed by abnormal cellular proliferation."
            # Oncocyte? CL:0002198
            'Tumor-derived cell line': 'CL:0002198',
            'Kidney-derived cell line': 'CLO:0000220'
        }
        if sample_type.strip() in type_map:
            ctype = type_map.get(sample_type)
        else:
            logger.error("Cell type not mapped: %s", sample_type)

        return ctype
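
    # A quick behaviour sketch (invented inputs, not executed here):
    #   Coriell._map_cell_type('Fibroblast')  -> 'CL:0000057'
    #   Coriell._map_cell_type('Plasmablast') -> None, and an error is logged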

    @staticmethod
    def _map_race(race):
        rtype = None
        type_map = {
            'African American': 'EFO:0003150',
            # 'American Indian': 'EFO',
            'Asian': 'EFO:0003152',
            # FIXME: Asian?
            'Asian; Other': 'EFO:0003152',
            # Asian Indian
            'Asiatic Indian': 'EFO:0003153',
            # FIXME: African American? There is also African.
            'Black': 'EFO:0003150',
            'Caucasian': 'EFO:0003156',
            'Chinese': 'EFO:0003157',
            'East Indian': 'EFO:0003158',  # Eastern Indian
            'Filipino': 'EFO:0003160',
            # Hispanic: EFO:0003169, Latino: EFO:0003166 see next
            'Hispanic/Latino': 'EFO:0003169',
            'Japanese': 'EFO:0003164',
            'Korean': 'EFO:0003165',
            # 'More than one race': 'EFO',
            # 'Not Reported': 'EFO',
            # 'Other': 'EFO',
            # Asian/Pacific Islander
            'Pacific Islander': 'EFO:0003154',
            # Asian/Pacific Islander
            'Polynesian': 'EFO:0003154',
            # 'Unknown': 'EFO',
            # Asian
            'Vietnamese': 'EFO:0003152',
        }
        if race.strip() in type_map:
            rtype = type_map.get(race)
        else:
            logger.warning("Race type not mapped: %s", race)

        return rtype

    @staticmethod
    def _map_species(species):
        tax = None
        type_map = {
            'Mus musculus': 'NCBITaxon:10090',
            'Peromyscus peromyscus californicus': 'NCBITaxon:42520',
            'Peromyscus peromyscus maniculatus': 'NCBITaxon:10042',
            'Peromyscus peromyscus leucopus': 'NCBITaxon:10041',
            'Peromyscus peromyscus polionotus': 'NCBITaxon:42413',
            'Macaca fascicularis': 'NCBITaxon:9541',
            'Rattus norvegicus': 'NCBITaxon:10116',
            'Papio anubis': 'NCBITaxon:9555',
            'Cricetulus griseus': 'NCBITaxon:10029',
            'Geochelone elephantopus': 'NCBITaxon:66189',
            'Muntiacus muntjak': 'NCBITaxon:9888',
            'Ailurus fulgens': 'NCBITaxon:9649',
            'Sus scrofa': 'NCBITaxon:9823',
            'Bos taurus': 'NCBITaxon:9913',
            'Oryctolagus cuniculus': 'NCBITaxon:9986',
            'Macaca nemestrina': 'NCBITaxon:9545',
            'Canis familiaris': 'NCBITaxon:9615',
            'Equus caballus': 'NCBITaxon:9796',
            'Macaca mulatta': 'NCBITaxon:9544',
            'Mesocricetus auratus': 'NCBITaxon:10036',
            'Macaca nigra': 'NCBITaxon:54600',
            'Erythrocebus patas': 'NCBITaxon:9538',
            'Pongo pygmaeus': 'NCBITaxon:9600',
            'Callicebus moloch': 'NCBITaxon:9523',
            'Lagothrix lagotricha': 'NCBITaxon:9519',
            'Saguinus fuscicollis': 'NCBITaxon:9487',
            'Saimiri sciureus': 'NCBITaxon:9521',
            'Saguinus labiatus': 'NCBITaxon:78454',
            'Pan paniscus': 'NCBITaxon:9597',
            'Ovis aries': 'NCBITaxon:9940',
            'Felis catus': 'NCBITaxon:9685',
            'Homo sapiens': 'NCBITaxon:9606',
            'Gorilla gorilla': 'NCBITaxon:9593',
            'Peromyscus maniculatus': 'NCBITaxon:10042'
        }
        if species.strip() in type_map:
            tax = type_map.get(species)
        else:
            logger.warning("Species type not mapped: %s", species)

        return tax
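
    # Likewise for species (invented inputs):
    #   Coriell._map_species('Mus musculus')  -> 'NCBITaxon:10090'
    #   Coriell._map_species('Vulpes vulpes') -> None, with a warning logged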

    @staticmethod
    def _map_collection(collection):
        ctype = None
        type_map = {
            'NINDS Repository': 'CoriellCollection:NINDS',
            'NIGMS Human Genetic Cell Repository': 'CoriellCollection:NIGMS',
            'NIA Aging Cell Culture Repository': 'CoriellCollection:NIA',
            'NHGRI Sample Repository for Human Genetic Research':
                'CoriellCollection:NHGRI'
        }
        if collection.strip() in type_map:
            ctype = type_map.get(collection)
        else:
            logger.warning("ERROR: Collection type not mapped: %s", collection)

        return ctype

    @staticmethod
    def _get_affected_chromosomes_from_karyotype(karyotype):

        affected_chromosomes = set()
        chr_regex = r'(\d+|X|Y|M|\?);?'
        aberration_regex = r'(?:add|del|der|i|idic|inv|r|rec|t)\([\w;]+\)'
        sex_regex = r'(?:;)(X{2,}Y+|X?Y{2,}|X{3,}|X|Y)(?:;|$)'

        # first fetch the set of aberrations
        aberrations = re.findall(aberration_regex, karyotype)

        # iterate over them to get the chromosomes
        for a in aberrations:
            chrs = re.findall(chr_regex, a)
            affected_chromosomes = affected_chromosomes.union(set(chrs))

        # remove the ? as a chromosome, since it isn't valid
        if '?' in affected_chromosomes:
            affected_chromosomes.remove('?')

        # check to see if there are any abnormal sex chromosomes
        m = re.search(sex_regex, karyotype)
        if m is not None:
            if re.search(r'X?Y{2,}', m.group(1)):
                # this is the only case where there is an extra Y chromosome
                affected_chromosomes.add('Y')
            else:
                affected_chromosomes.add('X')

        return affected_chromosomes
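
    # A worked example of the extraction above (invented karyotype):
    #   _get_affected_chromosomes_from_karyotype('46;XY;t(9;22)(q34;q11)')
    #   finds the aberration 't(9;22)' and returns {'9', '22'}; the normal
    #   ';XY;' run does not match the abnormal-sex-chromosome pattern.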

    @staticmethod
    def _is_normal_karyotype(karyotype):
        """
        This will default to true if no karyotype is provided.
        This is assuming human karyotypes.
        :param karyotype:
        :return:
        """

        is_normal = True
        if karyotype is not None:
            karyotype = karyotype.strip()
            if karyotype not in ['46;XX', '46;XY', '']:
                is_normal = False

        return is_normal
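
    # Behaviour sketch (invented inputs):
    #   _is_normal_karyotype('46;XX') -> True
    #   _is_normal_karyotype(None)    -> True  (no karyotype defaults normal)
    #   _is_normal_karyotype('45;X')  -> False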

    def getTestSuite(self):
        import unittest
        from tests.test_coriell import CoriellTestCase
        # TODO add G2PAssoc, Genotype tests

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(CoriellTestCase)

        return test_suite
Example #13
class BioGrid(Source):
    """
    Biogrid interaction data

    """
    # TODO write up class summary for docstring

    files = {
        'interactions': {
            'file': 'interactions.mitab.zip',
            'url': BGDL + '/BIOGRID-ALL-LATEST.mitab.zip'
        },
        'identifiers': {
            'file': 'identifiers.tab.zip',
            'url': BGDL + '/BIOGRID-IDENTIFIERS-LATEST.tab.zip'
        }
    }

    # biogrid-specific identifiers for use in subsetting identifier mapping
    biogrid_ids = [
        106638, 107308, 107506, 107674, 107675, 108277, 108506, 108767, 108814,
        108899, 110308, 110364, 110678, 111642, 112300, 112365, 112771, 112898,
        199832, 203220, 247276, 120150, 120160, 124085
    ]

    def __init__(self, graph_type, are_bnodes_skolemized, tax_ids=None):
        super().__init__(graph_type, are_bnodes_skolemized, 'biogrid')

        self.tax_ids = tax_ids

        self.dataset = Dataset(
            'biogrid', 'The BioGrid', 'http://thebiogrid.org/', None,
            'http://wiki.thebiogrid.org/doku.php/terms_and_conditions')

        # Defaults
        # our favorite animals
        # taxids = [9606,10090,10116,7227,7955,6239,8355]
        if self.tax_ids is None:
            self.tax_ids = [9606, 10090, 7955]

        if 'test_ids' not in config.get_config() or \
                'gene' not in config.get_config()['test_ids']:
            logger.warning("not configured with gene test ids.")
        else:
            self.test_ids = config.get_config()['test_ids']['gene']

        # data-source specific warnings
        # (will be removed when issues are cleared)
        logger.warning(
            "several MI experimental codes do not exactly map to ECO; "
            "using approximations.")
        return
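
    # A hypothetical end-to-end run, assuming the local dipper configuration
    # (graph type and skolemization flag may differ in your setup):
    #   bg = BioGrid('rdf_graph', True, tax_ids=[9606])
    #   bg.fetch()
    #   bg.parse(limit=1000)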

    def fetch(self, is_dl_forced=False):
        """

        :param is_dl_forced:
        :return:  None
        """

        self.get_files(is_dl_forced)

        # the version number is encoded in the filename in the zip.
        # for example, the interactions file may unzip to
        # BIOGRID-ALL-3.2.119.mitab.txt, where the version number is 3.2.119
        f = '/'.join((self.rawdir, self.files['interactions']['file']))
        st = os.stat(f)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        with ZipFile(f, 'r') as myzip:
            flist = myzip.namelist()
            # assume that the first zip entry is the data file
            fname = flist[0]
            # get the version from the filename
            version = \
                re.match(r'BIOGRID-ALL-(\d+\.\d+\.\d+)\.mitab\.txt', fname)

        self.dataset.setVersion(filedate, str(version.groups()[0]))

        return

    def parse(self, limit=None):
        """

        :param limit:
        :return:

        """
        if self.testOnly:
            self.testMode = True

        self._get_interactions(limit)
        self._get_identifiers(limit)

        logger.info("Loaded %d test graph nodes", len(self.testgraph))
        logger.info("Loaded %d full graph nodes", len(self.graph))

        return

    def _get_interactions(self, limit):
        logger.info("getting interactions")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['interactions']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first zip entry is the data file
        fname = myzip.namelist()[0]
        matchcounter = 0

        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip comment lines
                if re.match(r'^#', line.decode()):
                    logger.debug("Skipping header line")
                    continue
                line_counter += 1
                line = line.decode().strip()
                # print(line)
                (interactor_a, interactor_b, alt_ids_a, alt_ids_b, aliases_a,
                 aliases_b, detection_method, pub_author, pub_id, taxid_a,
                 taxid_b, interaction_type, source_db, interaction_id,
                 confidence_val) = line.split('\t')
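                # (MITAB 2.5 format: fifteen tab-separated columns per row)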

                # get the actual gene ids,
                # typically formatted like: gene/locuslink:351|BIOGRID:106848
                gene_a_num = re.search(r'locuslink\:(\d+)\|?',
                                       interactor_a).groups()[0]
                gene_b_num = re.search(r'locuslink\:(\d+)\|?',
                                       interactor_b).groups()[0]

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if (int(gene_a_num) not in self.test_ids) or\
                            (int(gene_b_num) not in self.test_ids):
                        continue
                else:
                    g = self.graph
                    # when not in test mode, filter by taxon
                    if int(re.sub(r'taxid:', '', taxid_a.rstrip())) not in\
                            self.tax_ids or\
                            int(re.sub(
                                r'taxid:', '', taxid_b.rstrip())) not in\
                            self.tax_ids:
                        continue
                    else:
                        matchcounter += 1

                gene_a = 'NCBIGene:' + gene_a_num
                gene_b = 'NCBIGene:' + gene_b_num

                # get the interaction type
                # psi-mi:"MI:0407"(direct interaction)
                int_type = re.search(r'MI:\d+', interaction_type).group()
                rel = self._map_MI_to_RO(int_type)

                # scrub pubmed-->PMID prefix
                pub_id = re.sub(r'pubmed', 'PMID', pub_id)
                # remove bogus whitespace
                pub_id = pub_id.strip()

                # get the method, and convert to evidence code
                det_code = re.search(r'MI:\d+', detection_method).group()
                evidence = self._map_MI_to_ECO(det_code)

                # note that the interaction_id is some kind of internal biogrid
                # identifier that does not map to a public URI.
                # we will construct a monarch identifier from this

                assoc = InteractionAssoc(g, self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence)
                assoc.add_source(pub_id)
                assoc.add_association_to_graph()

                if not self.testMode and (limit is not None
                                          and line_counter > limit):
                    break

        myzip.close()

        return

    def _get_identifiers(self, limit):
        """
        This will process the id mapping file provided by Biogrid.
        The file has a very large header, which we scan past,
        then pull the identifiers, and make equivalence axioms

        :param limit:
        :return:

        """

        logger.info("getting identifier mapping")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['identifiers']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first zip entry is the data file
        fname = myzip.namelist()[0]
        foundheader = False

        # TODO align this species filter with the one above
        # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
        # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')

        speciesfilters = 'Homo sapiens,Mus musculus'.split(',')
        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip header lines
                if not foundheader:
                    if re.match(r'BIOGRID_ID', line.decode()):
                        foundheader = True
                    continue

                line = line.decode().strip()
                # BIOGRID_ID
                # IDENTIFIER_VALUE
                # IDENTIFIER_TYPE
                # ORGANISM_OFFICIAL_NAME
                # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                (biogrid_num, id_num, id_type,
                 organism_label) = line.split('\t')

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if int(biogrid_num) not in self.biogrid_ids:
                        continue
                else:
                    g = self.graph

                model = Model(g)

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:' + biogrid_num
                prefix = self._map_idtype_to_prefix(id_type)

                # TODO make these filters available as commandline options
                # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
                #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
                geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')
                # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'
                if (speciesfilters is not None) \
                        and (organism_label.strip() in speciesfilters):
                    line_counter += 1
                    if (geneidtypefilters is not None) \
                            and (prefix in geneidtypefilters):
                        mapped_id = ':'.join((prefix, id_num))
                        model.addEquivalentClass(biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        model.addClassToGraph(biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    #   FIXME - i am not sure these are synonyms, altids?
                    #   gu.addSynonym(g,biogrid_id,id_num)

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        myzip.close()

        return

    @staticmethod
    def _map_MI_to_RO(mi_id):
        rel = InteractionAssoc.interaction_object_properties
        mi_ro_map = {
            # colocalization
            'MI:0403': rel['colocalizes_with'],
            # direct interaction
            'MI:0407': rel['interacts_with'],
            # synthetic genetic interaction defined by inequality
            'MI:0794': rel['genetically_interacts_with'],
            # suppressive genetic interaction defined by inequality
            'MI:0796': rel['genetically_interacts_with'],
            # additive genetic interaction defined by inequality
            'MI:0799': rel['genetically_interacts_with'],
            # association
            'MI:0914': rel['interacts_with'],
            # physical association
            'MI:0915': rel['interacts_with']
        }

        ro_id = rel['interacts_with']  # default
        if mi_id in mi_ro_map:
            ro_id = mi_ro_map.get(mi_id)

        return ro_id

    @staticmethod
    def _map_MI_to_ECO(mi_id):
        eco_id = 'ECO:0000006'  # default to experimental evidence
        mi_to_eco_map = {
            'MI:0018': 'ECO:0000068',  # yeast two-hybrid
            'MI:0004': 'ECO:0000079',  # affinity chromatography
            'MI:0047': 'ECO:0000076',  # far western blotting
            # should be FRET, but using physical_interaction FIXME
            'MI:0055': 'ECO:0000021',
            # desired: protein complementation,
            # using: functional complementation
            'MI:0090': 'ECO:0000012',
            # desired: pull down, using: immunoprecipitation
            'MI:0096': 'ECO:0000085',
            # desired: x-ray crystallography, using: imaging assay
            'MI:0114': 'ECO:0000324',
            # desired: genetic interference,
            # using: genetic interaction evidence
            'MI:0254': 'ECO:0000011',
            # desired: biochemical, using: biochemical trait evidence
            'MI:0401': 'ECO:0000172',
            # desired: enzymatic study, using: enzyme assay evidence
            'MI:0415': 'ECO:0000005',
            'MI:0428': 'ECO:0000324',  # imaging
            # desired: unspecified, using: experimental evidence
            'MI:0686': 'ECO:0000006',
            'MI:1313': 'ECO:0000006'  # None?
        }
        if mi_id in mi_to_eco_map:
            eco_id = mi_to_eco_map.get(mi_id)
        else:
            logger.warning(
                "unmapped code %s. Defaulting to experimental_evidence", mi_id)

        return eco_id
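
    # Mapping sketch (invented calls):
    #   BioGrid._map_MI_to_ECO('MI:0018') -> 'ECO:0000068' (yeast two-hybrid)
    #   BioGrid._map_MI_to_ECO('MI:9999') -> 'ECO:0000006', with a warning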

    @staticmethod
    def _map_idtype_to_prefix(idtype):
        """
        Here we need to reformat the BioGrid source prefixes
        to standard ones used in our curie-map.
        :param idtype:
        :return:

        """
        prefix = idtype
        idtype_to_prefix_map = {
            'XENBASE': 'XenBase',
            'TREMBL': 'TrEMBL',
            'MGI': 'MGI',
            'REFSEQ_DNA_ACCESSION': 'RefSeqNA',
            'MAIZEGDB': 'MaizeGDB',
            'BEEBASE': 'BeeBase',
            'ENSEMBL': 'ENSEMBL',
            'TAIR': 'TAIR',
            'GENBANK_DNA_GI': 'NCBIgi',
            'CGNC': 'CGNC',
            'RGD': 'RGD',
            'GENBANK_GENOMIC_DNA_GI': 'NCBIgi',
            'SWISSPROT': 'Swiss-Prot',
            'MIM': 'OMIM',
            'FLYBASE': 'FlyBase',
            'VEGA': 'VEGA',
            'ANIMALQTLDB': 'AQTLDB',
            'ENTREZ_GENE_ETG': 'ETG',
            'HPRD': 'HPRD',
            'APHIDBASE': 'APHIDBASE',
            'GENBANK_PROTEIN_ACCESSION': 'NCBIProtein',
            'ENTREZ_GENE': 'NCBIGene',
            'SGD': 'SGD',
            'GENBANK_GENOMIC_DNA_ACCESSION': 'NCBIGenome',
            'BGD': 'BGD',
            'WORMBASE': 'WormBase',
            'ZFIN': 'ZFIN',
            'DICTYBASE': 'dictyBase',
            'ECOGENE': 'ECOGENE',
            'BIOGRID': 'BIOGRID',
            'GENBANK_DNA_ACCESSION': 'NCBILocus',
            'VECTORBASE': 'VectorBase',
            'MIRBASE': 'miRBase',
            'IMGT/GENE-DB': 'IGMT',
            'HGNC': 'HGNC',
            'SYSTEMATIC_NAME': None,
            'OFFICIAL_SYMBOL': None,
            'REFSEQ_GENOMIC_DNA_ACCESSION': 'NCBILocus',
            'GENBANK_PROTEIN_GI': 'NCBIgi',
            'REFSEQ_PROTEIN_ACCESSION': 'RefSeqProt',
            'SYNONYM': None,
            'GRID_LEGACY': None,
            # the following showed up in 3.3.124
            'UNIPROT-ACCESSION': 'UniprotKB',
            'SWISS-PROT': 'Swiss-Prot',
            'OFFICIAL SYMBOL': None,
            'ENSEMBL RNA': None,
            'GRID LEGACY': None,
            'ENSEMBL PROTEIN': None,
            'REFSEQ-RNA-GI': None,
            'REFSEQ-RNA-ACCESSION': None,
            'REFSEQ-PROTEIN-GI': None,
            'REFSEQ-PROTEIN-ACCESSION-VERSIONED': None,
            'REFSEQ-PROTEIN-ACCESSION': None,
            'REFSEQ-LEGACY': None,
            'SYSTEMATIC NAME': None,
            'ORDERED LOCUS': None,
            'UNIPROT-ISOFORM': 'UniprotKB',
            'ENSEMBL GENE': 'ENSEMBL',
            'CGD': None,  # Not sure what this is?
            'WORMBASE-OLD': 'WormBase'
        }
        if idtype in idtype_to_prefix_map:
            prefix = idtype_to_prefix_map.get(idtype)
        else:
            logger.warning("unmapped prefix %s", prefix)

        return prefix
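
    # Prefix-mapping sketch (invented calls):
    #   BioGrid._map_idtype_to_prefix('ENTREZ_GENE')     -> 'NCBIGene'
    #   BioGrid._map_idtype_to_prefix('OFFICIAL_SYMBOL') -> None
    #   unknown types pass through unchanged, with a warning logged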

    def getTestSuite(self):
        import unittest
        from tests.test_biogrid import BioGridTestCase
        # TODO add InteractionAssoc tests
        # TODO add test about if all prefixes are mapped?

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(BioGridTestCase)

        return test_suite