Example #1
# Imports assumed from the dipper project layout; the original listing
# omits them, so the dipper module paths below are a best guess.
import unittest
from datetime import datetime

from rdflib import Graph, Literal, URIRef
from rdflib.namespace import XSD

from dipper import curie_map as curiemap          # assumed path
from dipper.graph.RDFGraph import RDFGraph        # assumed path
from dipper.models.Dataset import Dataset         # assumed path

class DatasetTestCase(unittest.TestCase):
    """
    For testing metadata emitted by Dataset class

    Dataset creates a graph describing the metadata associated with the dataset in
    question, which should follow the HCLS specification for dataset descriptions
    https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/
    """
    @classmethod
    def setUpClass(cls):
        cls.curie_map = curiemap.get()

        # parameters passed to code, to be returned in graph
        cls.monarch_archive_curie_prefix = "MonarchArchive"
        cls.identifier = "fakeingest"
        cls.ingest_description = "some ingest description"
        cls.ingest_url = "http://fakeingest.com"
        cls.ingest_title = "this ingest title"
        cls.ingest_logo_url = "logo.png"
        cls.license_url = "https://choosealicense.com/licenses/mit/"
        cls.license_url_default = "https://project-open-data.cio.gov/unknown-license/"
        cls.data_rights = "https://www.gnu.org/licenses/gpl-3.0.html"
        cls.distribution_type = "ttl"

        # parse test graph once, to test triples counts/statistics below
        cls.test_ttl = "tests/resources/fakeingest/test_graph_simple.ttl"
        cls.test_graph = RDFGraph()
        cls.test_graph.parse(cls.test_ttl, format="turtle")

        # expected things:
        cls.expected_curie_prefix = "MonarchArchive"
        cls.timestamp_date = datetime.today().strftime("%Y%m%d")

        cls.base_cito = 'http://purl.org/spar/cito/'
        cls.base_dcat = 'http://www.w3.org/ns/dcat#'
        cls.base_dcterms = 'http://purl.org/dc/terms/'
        cls.base_dctypes = 'http://purl.org/dc/dcmitype/'
        cls.base_pav = 'http://purl.org/pav/'
        cls.base_rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
        cls.base_rdfs = 'http://www.w3.org/2000/01/rdf-schema#'
        cls.base_schema = 'http://schema.org/'
        cls.base_void = 'http://rdfs.org/ns/void#'
        cls.base_owl = 'http://www.w3.org/2002/07/owl#'
        cls.base_logo_url = "https://github.com/monarch-initiative/monarch-ui/blob/master/public/img/sources/"
        # expected summary level IRI
        cls.summary_level_IRI = URIRef(
            cls.curie_map.get(cls.expected_curie_prefix) + "#" +
            cls.identifier)
        # expected version level IRI
        cls.data_release_version = "19700101"
        cls.version_level_IRI = URIRef(
            cls.curie_map.get(cls.expected_curie_prefix) +
            cls.data_release_version + "/" + "#" + cls.identifier)
        cls.version_level_IRI_default_version = \
            URIRef(
                   cls.curie_map.get(cls.expected_curie_prefix) +
                   cls.timestamp_date + "/" +
                   "#" + cls.identifier)

        # expected distribution level IRI (for ttl resource)
        cls.distribution_level_IRI_ttl = \
            URIRef(
                   cls.curie_map.get(cls.expected_curie_prefix) +
                   cls.data_release_version + "/rdf/" +
                   cls.identifier + "." + cls.distribution_type)
        cls.distribution_level_IRI_ttl_default_version = \
            URIRef(
                   cls.curie_map.get(cls.expected_curie_prefix) +
                   cls.timestamp_date + "/rdf/" +
                   cls.identifier + "." + cls.distribution_type)

        # set expected IRIs for predicates and other things
        cls.iri_rdf_type = URIRef(cls.base_rdf + "type")
        cls.iri_title = URIRef(cls.base_dcterms + "title")
        cls.iri_dataset = URIRef(cls.base_dctypes + "Dataset")
        cls.iri_description = URIRef(cls.base_dcterms + "description")
        cls.iri_publisher = URIRef(cls.base_dcterms + "Publisher")
        cls.iri_source = URIRef(cls.base_dcterms + "source")
        cls.iri_logo = URIRef(cls.base_schema + "logo")
        cls.iri_mi_org = URIRef("https://monarchinitiative.org/")
        cls.iri_created = URIRef(cls.base_dcterms + "created")
        cls.iri_version = URIRef(cls.base_pav + "version")
        cls.iri_retrieved_on = URIRef(cls.base_pav + "retrievedOn")
        cls.iri_creator = URIRef(cls.base_dcterms + "creator")
        cls.iri_is_version_of = URIRef(cls.base_dcterms + "isVersionOf")
        cls.iri_distribution = URIRef(cls.base_dcat + "Distribution")
        cls.iri_created_with = URIRef(cls.base_pav + "createdWith")
        cls.iri_format = URIRef(cls.base_dcterms + "format")
        cls.iri_download_url = URIRef(cls.base_dcterms + "downloadURL")
        cls.iri_license = URIRef(cls.base_dcterms + "license")
        cls.iri_data_rights = URIRef(cls.base_dcterms + "rights")
        cls.iri_cites_as_authority = URIRef(cls.base_cito + "citesAsAuthority")
        cls.iri_rdfs_label = URIRef(cls.base_rdfs + "label")
        cls.iri_owl_ontology = URIRef(cls.base_owl + "Ontology")
        cls.iri_owl_version_iri = URIRef(cls.base_owl + "versionIRI")
        cls.iri_owl_version_info = URIRef(cls.base_owl + "versionInfo")
        cls.iri_returned_logo = URIRef(cls.base_logo_url + cls.ingest_logo_url)
        cls.iri_expected_download_url_value = \
            URIRef(
                   cls.curie_map.get("MonarchArchive") +
                   cls.data_release_version + "/rdf/" +
                   cls.identifier + "." + cls.distribution_type)

        cls.iri_dipper = URIRef("https://github.com/monarch-initiative/dipper")
        cls.iri_ttl_spec = URIRef("https://www.w3.org/TR/turtle/")

    @classmethod
    def tearDownClass(cls):
        pass

    def setUp(self):
        self.dataset = Dataset(identifier=self.identifier,
                               data_release_version=self.data_release_version,
                               ingest_name=self.identifier,
                               ingest_title=self.ingest_title,
                               ingest_url=self.ingest_url,
                               ingest_logo=self.ingest_logo_url,
                               ingest_description=self.ingest_description,
                               license_url=self.license_url,
                               data_rights=self.data_rights)

        # put all triples in a list for debugging below
        self.all_triples = list(self.dataset.graph.triples((None, None, None)))

    def tearDown(self):
        pass

    def test_dataset_has_graph(self):
        self.assertIsInstance(self.dataset.graph, Graph,
                              "dataset doesn't contain an RDF graph")

    def test_get_graph(self):
        self.assertIsInstance(self.dataset.get_graph(), RDFGraph,
                              "get_graph() didn't return an RDF graph")

    def test_get_license(self):
        gpl2_iri = "https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html"
        self.dataset.license_url = gpl2_iri
        self.assertEqual(self.dataset.get_license(), gpl2_iri,
                         "get_license() didn't return the expected license_url")

    def test_set_citation(self):
        citation_iri =\
            "http://purl.obolibrary.org/obo/uberon/releases/2016-01-26/uberon.owl"
        self.dataset.set_citation(citation_iri)
        self.assertTrue(self.dataset.citation.issuperset([citation_iri]))
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, URIRef(self.iri_cites_as_authority),
                 URIRef(citation_iri))))
        self.assertTrue(len(triples) == 1, "missing citation triple")

    def test_set_ingest_source_file_version_num(self):
        this_version = "version1234"
        file_iri = "http://somefilesource.org/file.txt"
        self.dataset.set_ingest_source_file_version_num(file_iri, this_version)
        triples = list(
            self.dataset.graph.triples(
                (URIRef(file_iri), self.iri_version, Literal(this_version))))
        self.assertTrue(
            len(triples) == 1, "ingest source file version not set")

    def test_set_ingest_source_file_version_date(self):
        this_version = "1970-01-01"
        file_iri = "http://somefilesource.org/file.txt"
        self.dataset.set_ingest_source_file_version_date(
            file_iri, this_version)

        triples = list(
            self.dataset.graph.triples((URIRef(file_iri), self.iri_version,
                                        Literal(this_version,
                                                datatype=XSD.date))))
        self.assertTrue(
            len(triples) == 1,
            "ingest source file version not set with literal type of date")

    #
    # Test summary level triples:
    #
    def test_summary_level_type(self):
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_rdf_type, self.iri_dataset)))
        self.assertTrue(len(triples) == 1, "missing summary level type triple")

    def test_summary_level_title(self):
        triples = list(
            self.dataset.graph.triples((self.summary_level_IRI, self.iri_title,
                                        Literal(self.ingest_title))))
        self.assertTrue(
            len(triples) == 1, "missing summary level title triple")

    def test_summary_level_description(self):
        # by default, description is the class's docstring
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_description,
                 Literal(self.ingest_description))))
        self.assertTrue(
            len(triples) == 1, "missing summary level description triple")

    def test_summary_level_publisher(self):
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_publisher, self.iri_mi_org)))
        self.assertTrue(
            len(triples) == 1, "missing summary level publisher triple")

    def test_summary_level_source_web_page(self):
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_source,
                 URIRef(self.ingest_url))))
        self.assertTrue(
            len(triples) == 1, "missing summary level source page triple")

    def test_summary_level_source_logo(self):
        triples = list(
            self.dataset.graph.triples((self.summary_level_IRI, self.iri_logo,
                                        URIRef(self.iri_returned_logo))))
        self.assertTrue(
            len(triples) == 1, "missing summary level source logo triple")

    def test_summary_level_ontology_type_declaration(self):
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_rdf_type,
                 self.iri_owl_ontology)))
        self.assertTrue(
            len(triples) == 1,
            "missing distribution level owl ontology type triple")

    def test_summary_level_owl_version_iri(self):
        triples = list(
            self.dataset.graph.triples(
                (self.summary_level_IRI, self.iri_owl_version_iri, None)))
        self.assertTrue(
            len(triples) == 1,
            "missing distribution level owl version iri triple")
        self.assertEqual(triples[0][2], URIRef(self.version_level_IRI),
                         "owl version iri triple has the wrong object")

    #
    # Test version level resource triples:
    #
    def test_version_level_type(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_rdf_type, self.iri_dataset)))
        self.assertTrue(len(triples) == 1, "missing version level type triple")

    def test_version_level_title(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_title, None)))
        self.assertTrue(
            len(triples) == 1, "missing version level title triple")
        self.assertEqual(
            triples[0][2],
            Literal(self.ingest_title + " Monarch version " +
                    self.data_release_version),
            "version level title triple has wrong value")

    def test_version_level_description(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_description,
                 Literal(self.ingest_description))))
        self.assertTrue(
            len(triples) == 1, "missing version level description triple")

    def test_version_level_created(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_created, None)))
        self.assertTrue(
            len(triples) == 1,
            "didn't get exactly 1 version level created triple")
        self.assertEqual(
            triples[0][2],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date),
            "version level created triple has the wrong timestamp")

    def test_version_level_version_default(self):
        # when no data_release_version is given, the version defaults to
        # today's date and hangs off the default version level IRI
        self.dataset = Dataset(identifier=self.identifier,
                               data_release_version=None,
                               ingest_name=self.identifier,
                               ingest_title=self.ingest_title,
                               ingest_url=self.ingest_url,
                               ingest_logo=self.ingest_logo_url,
                               ingest_description=self.ingest_description,
                               license_url=self.license_url,
                               data_rights=self.data_rights)
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI_default_version, self.iri_version,
                 None)))
        self.assertTrue(
            len(triples) == 1,
            "didn't get exactly one version level version triple")
        self.assertEqual(
            triples[0][2], Literal(self.timestamp_date, datatype=XSD.date),
            "version level version triple (default) has the wrong timestamp")

    def test_version_level_version_set_explicitly(self):
        self.dataset = Dataset(identifier=self.identifier,
                               data_release_version=self.data_release_version,
                               ingest_name=self.identifier,
                               ingest_title=self.ingest_title,
                               ingest_url=self.ingest_url,
                               ingest_logo=self.ingest_logo_url,
                               ingest_description=self.ingest_description,
                               license_url=None,
                               data_rights=self.data_rights)
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_version, None)))
        self.assertTrue(
            len(triples) == 1,
            "didn't get exactly one version level version triple")
        self.assertEqual(
            triples[0][2], Literal(self.data_release_version,
                                   datatype=XSD.date),
            "version level version triple (set explicitly) is wrong ")

    def test_version_level_creator(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_creator, self.iri_mi_org)))
        self.assertTrue(
            len(triples) == 1, "missing version level creator triple")

    def test_version_level_publisher(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_publisher, self.iri_mi_org)))
        self.assertTrue(
            len(triples) == 1, "missing version level publisher triple")

    def test_version_level_isVersionOf(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_is_version_of,
                 self.summary_level_IRI)))
        self.assertTrue(
            len(triples) == 1, "missing version level isVersionOf triple")

    def test_version_level_distribution(self):
        triples = list(
            self.dataset.graph.triples(
                (self.version_level_IRI, self.iri_distribution,
                 self.distribution_level_IRI_ttl)))
        self.assertTrue(
            len(triples) == 1, "missing version level distribution triple")

    #
    # test distribution level triples
    #
    def test_distribution_level_dataset_type(self):
        triples = list(
            self.dataset.graph.triples((self.distribution_level_IRI_ttl,
                                        self.iri_rdf_type, self.iri_dataset)))
        self.assertTrue(
            len(triples) == 1,
            "missing distribution level dataset type triple")

    def test_distribution_level_distribution_type(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_rdf_type,
                 self.iri_distribution)))
        self.assertTrue(
            len(triples) == 1,
            "missing distribution level distribution type triple")

    def test_distribution_level_title(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_title, None)))
        self.assertTrue(
            len(triples) == 1, "missing distribution level title triple")
        self.assertEqual(
            triples[0][2],
            Literal(self.ingest_title + " distribution " +
                    self.distribution_type),
            "distribution level title triple has wrong value")

    def test_distribution_level_description(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_description,
                 Literal(self.ingest_description))))
        self.assertTrue(
            len(triples) == 1, "missing distribution level description triple")

    def test_distribution_level_created(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_created, None)))
        self.assertTrue(
            len(triples) == 1,
            "didn't get exactly 1 distribution level created triple")
        self.assertEqual(
            triples[0][2],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date))

    def test_distribution_level_version(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_version, None)))
        self.assertTrue(
            len(triples) == 1,
            "didn't get exactly 1 distribution level version triple")
        self.assertEqual(triples[0][2],
                         Literal(self.data_release_version, datatype=XSD.date))

    def test_distribution_level_creator(self):
        triples = list(
            self.dataset.graph.triples((self.distribution_level_IRI_ttl,
                                        self.iri_creator, self.iri_mi_org)))
        self.assertTrue(
            len(triples) == 1, "missing distribution level creator triple")

    def test_distribution_level_publisher(self):
        triples = list(
            self.dataset.graph.triples((self.distribution_level_IRI_ttl,
                                        self.iri_publisher, self.iri_mi_org)))
        self.assertTrue(
            len(triples) == 1, "missing distribution level publisher triple")

    def test_distribution_level_created_with(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_created_with,
                 self.iri_dipper)))
        self.assertTrue(
            len(triples) == 1, "missing distribution level createdWith triple")

    def test_distribution_level_format(self):
        triples = list(
            self.dataset.graph.triples((self.distribution_level_IRI_ttl,
                                        self.iri_format, self.iri_ttl_spec)))
        self.assertTrue(
            len(triples) == 1, "missing distribution level format triple")

    def test_distribution_level_download_url(self):
        triples = list(
            self.dataset.graph.triples((self.distribution_level_IRI_ttl,
                                        self.iri_download_url, None)))
        self.assertTrue(
            len(triples) == 1, "didn't get exactly 1 downloadURL triple")
        self.assertEqual(triples[0][2], self.iri_expected_download_url_value,
                         "didn't get the expected downloadURL value")

    def test_distribution_level_license_url(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_license,
                 URIRef(self.license_url))))
        self.assertTrue(
            len(triples) == 1, "missing distribution level license triple")

    def test_distribution_level_data_rights(self):
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl, self.iri_data_rights,
                 URIRef(self.data_rights))))
        self.assertTrue(
            len(triples) == 1, "missing distribution level data rights triple")

    def test_distribution_level_no_license_url_default_value(self):
        self.dataset = Dataset(identifier=self.identifier,
                               data_release_version=None,
                               ingest_name=self.identifier,
                               ingest_title=self.ingest_title,
                               ingest_url=self.ingest_url,
                               ingest_logo=self.ingest_logo_url,
                               ingest_description=self.ingest_description,
                               license_url=None,
                               data_rights=self.data_rights)
        triples = list(
            self.dataset.graph.triples(
                (self.distribution_level_IRI_ttl_default_version,
                 self.iri_license, URIRef(self.license_url_default))))
        self.assertTrue(
            len(triples) == 1,
            "distribution level default license triple not set")
Example #2
class GeneReviews(Source):
    """
    Here we process the GeneReviews mappings to OMIM,
    plus inspect the GeneReviews (html) books to pull the clinical descriptions
    in order to populate the definitions of the terms in the ontology.
    We define the GeneReviews items as classes that are either grouping classes
    over OMIM disease ids (gene ids are filtered out),
    or are made as subclasses of DOID:4 (generic disease).

    Note that GeneReviews
    [copyright policy](http://www.ncbi.nlm.nih.gov/books/NBK138602/)
    (as of 2015.11.20) says:

    GeneReviews® chapters are owned by the University of Washington, Seattle,
    © 1993-2015. Permission is hereby granted to reproduce, distribute,
    and translate copies of content materials provided that
    (i) credit for source (www.ncbi.nlm.nih.gov/books/NBK1116/)
    and copyright (University of Washington, Seattle)
    are included with each copy;
    (ii) a link to the original material is provided whenever the material is
    published elsewhere on the Web; and
    (iii) reproducers, distributors, and/or translators comply with this
    copyright notice and the GeneReviews Usage Disclaimer.

    This script doesn't pull the GeneReviews books from the NCBI Bookshelf
    directly; scripting this task is expressly prohibited by
    [NCBI Bookshelf policy](http://www.ncbi.nlm.nih.gov/books/NBK45311/).
    However, assuming you have acquired the books (in HTML format) via
    permissible means, a parser for those books is provided here to extract
    the clinical descriptions to define the NBK-identified classes.

    """

    files = {
        'idmap': {
            'file': 'NBKid_shortname_OMIM.txt',
            'url': GRDL + '/NBKid_shortname_OMIM.txt'
        },
        'titles': {
            'file': 'GRtitle_shortname_NBKid.txt',
            'url': GRDL + '/GRtitle_shortname_NBKid.txt'
        }
    }

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'genereviews')

        self.dataset = Dataset('genereviews', 'Gene Reviews',
                               'http://genereviews.org/', None,
                               'http://www.ncbi.nlm.nih.gov/books/NBK138602/')
        self.dataset.set_citation('GeneReviews:NBK1116')

        self.book_ids = set()
        self.all_books = {}

        if 'test_ids' not in config.get_config() or\
                'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_ids = list()
        else:
            # select only those test ids that are OMIM ids.
            self.test_ids = config.get_config()['test_ids']['disease']

        return

    def fetch(self, is_dl_forced=False):
        """
        We fetch GeneReviews id-label map and id-omim mapping files from NCBI.
        :return: None
        """

        self.get_files(is_dl_forced)

        return

    def parse(self, limit=None):
        """
        :return: None
        """

        if self.testOnly:
            self.testMode = True

        self._get_titles(limit)
        self._get_equivids(limit)

        self.create_books()
        self.process_nbk_html(limit)

        # no test subset for now; test == full graph
        self.testgraph = self.graph
        return

    def _get_equivids(self, limit):
        """
        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Each row represents a mapping between
        a gr id and an omim id. This is a 1:many relationship,
        and some of the omim ids are genes (not diseases).
        Therefore, we need to create a loose coupling here.
        We make the assumption that these NBKs are generally higher-level
        grouping classes; therefore the OMIM ids are treated as subclasses.
        (This assumption is poor for those omims that are actually genes,
        but we have no way of knowing what those are here...
        we will just have to deal with that for now.)
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files['idmap']['file']))
        model = Model(self.graph)
        line_counter = 0

        # we look some stuff up in OMIM, so initialize here
        omim = OMIM(self.graph_type, self.are_bnodes_skized)
        id_map = {}
        allomimids = set()
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter == 1:  # skip header
                    continue
                (nbk_num, shortname, omim_num) = row
                gr_id = 'GeneReviews:' + nbk_num
                omim_id = 'OMIM:' + omim_num
                # in test mode, only process ids in the test set
                if self.testMode and omim_id not in self.test_ids:
                    continue

                # sometimes there's bad omim nums
                if len(omim_num) > 6:
                    logger.warning(
                        "OMIM number incorrectly formatted " +
                        "in row %d; skipping:\n%s", line_counter,
                        '\t'.join(row))
                    continue

                # build up a hashmap of the mappings; then process later
                if nbk_num not in id_map:
                    id_map[nbk_num] = set()
                id_map[nbk_num].add(omim_num)

                # add the class along with the shortname
                model.addClassToGraph(gr_id, None)
                model.addSynonym(gr_id, shortname)

                allomimids.add(omim_num)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

            # end looping through file

        # get the omim ids that are not genes
        entries_that_are_phenotypes = \
            omim.process_entries(
                list(allomimids), filter_keep_phenotype_entry_ids,
                None, None, limit)

        logger.info("Filtered out %d/%d entries that are genes or features",
                    len(allomimids) - len(entries_that_are_phenotypes),
                    len(allomimids))

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:' + nbk_num
            if nbk_num in id_map:
                omim_ids = id_map.get(nbk_num)
                for omim_num in omim_ids:
                    omim_id = 'OMIM:' + omim_num
                    # add the gene reviews as a superclass to the omim id,
                    # but only if the omim id is not a gene
                    if omim_id in entries_that_are_phenotypes:
                        model.addClassToGraph(omim_id, None)
                        model.addSubClass(omim_id, gr_id)
            # add this as a generic subclass of DOID:4
            model.addSubClass(gr_id, 'DOID:4')

        return

    def _get_titles(self, limit):
        """
        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Where each of the rows represents a mapping between
        a gr id and an omim id. These are a 1:many relationship,
        and some of the omim ids are genes (not diseases).
        Therefore, we need to create a loose coupling here.
        We make the assumption that these NBKs are generally higher-level
        grouping classes; therefore the OMIM ids are treated as subclasses.
        (This assumption is poor for those omims that are actually genes,
        but we have no way of knowing what those are here...
        we will just have to deal with that for now.)
        :param limit:
        :return:
        """
        raw = '/'.join((self.rawdir, self.files['titles']['file']))
        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r', encoding='latin-1') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            header = next(filereader)
            line_counter = 1
            colcount = len(header)
            if colcount != 4:  # ('GR_shortname', 'GR_Title', 'NBK_id', 'PMID')
                logger.error("Unexpected header: %s", header)
                exit(-1)
            for row in filereader:
                line_counter += 1
                if len(row) != colcount:
                    logger.error("Unexpected row; got: %s", row)
                    logger.error("Expected data for: %s", header)
                    exit(-1)
                (shortname, title, nbk_num, pmid) = row
                gr_id = 'GeneReviews:' + nbk_num

                self.book_ids.add(nbk_num)  # a global set of the book nums

                if limit is None or line_counter < limit:
                    model.addClassToGraph(gr_id, title)
                    model.addSynonym(gr_id, shortname)
                # TODO include the new PMID?

        return

    def create_books(self):

        # note that although we put in the url to the book,
        # NCBI Bookshelf does not allow robots to download content
        book_item = {'file': 'books/', 'url': ''}

        for nbk in self.book_ids:
            b = book_item.copy()
            b['file'] = '/'.join(('books', nbk + '.html'))
            b['url'] = 'http://www.ncbi.nlm.nih.gov/books/' + nbk
            self.all_books[nbk] = b

        return

    def process_nbk_html(self, limit):
        """
        Here we process the gene reviews books to fetch
        the clinical descriptions to include in the ontology.
        We only use books that have been acquired manually,
        as NCBI Bookshelf does not permit automated downloads.
        This parser will only process the books that are found in
        the ```raw/genereviews/books``` directory,
        permitting partial completion.

        :param limit:
        :return:
        """
        model = Model(self.graph)
        c = 0
        books_not_found = set()
        for nbk in self.book_ids:
            c += 1
            nbk_id = 'GeneReviews:' + nbk
            book_item = self.all_books.get(nbk)
            url = '/'.join((self.rawdir, book_item['file']))

            # figure out if the book is there; if so, process, otherwise skip
            book_dir = '/'.join((self.rawdir, 'books'))
            book_files = os.listdir(book_dir)
            if ''.join((nbk, '.html')) not in book_files:
                # logger.warning("No book found locally for %s; skipping", nbk)
                books_not_found.add(nbk)
                continue
            logger.info("Processing %s", nbk)

            # 'url' here is a local file path under rawdir
            with open(url) as page:
                soup = BeautifulSoup(page.read(), "html.parser")

            # sec0 == clinical description
            clin_summary = \
                soup.find(
                    'div', id=re.compile(".*Summary.sec0"))
            if clin_summary is not None:
                p = clin_summary.find('p')
                ptext = p.text
                ptext = re.sub(r'\s+', ' ', ptext)

                ul = clin_summary.find('ul')
                if ul is not None:
                    item_text = list()
                    for li in ul.find_all('li'):
                        item_text.append(re.sub(r'\s+', ' ', li.text))
                    ptext += ' '.join(item_text)

                # add in the copyright and citation info to description
                ptext = \
                    ' '.join(
                        (ptext,
                         '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                         nbk_id+']'))

                model.addDefinition(nbk_id, ptext.strip())

            # get the pubs
            pmid_set = set()
            pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
            if pub_div is not None:
                ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
                for r in ref_list:
                    for a in r.find_all('a',
                                        attrs={'href': re.compile(r"pubmed")}):
                        if re.match(r'PubMed:', a.text):
                            pmnum = re.sub(r'PubMed:\s*', '', a.text)
                        else:
                            pmnum = \
                                re.search(
                                    r'\/pubmed\/(\d+)$', a['href']).group(1)
                        if pmnum is not None:
                            pmid = 'PMID:' + str(pmnum)
                            self.graph.addTriple(
                                pmid, model.object_properties['is_about'],
                                nbk_id)
                            pmid_set.add(pmnum)
                            reference = Reference(
                                self.graph, pmid,
                                Reference.ref_types['journal_article'])
                            reference.addRefToGraph()

            # TODO add author history, copyright, license to dataset

            # TODO get PMID-NBKID equivalence (near foot of page),
            # and make it "is about" link
            # self.gu.addTriple(
            #   self.graph, pmid,
            #   self.gu.object_properties['is_about'], nbk_id)
            # for example: NBK1191 PMID:20301370

            # add the book to the dataset
            self.dataset.setFileAccessUrl(book_item['url'])

            if limit is not None and c > limit:
                break

            # finish looping through books

        num_missing = len(books_not_found)
        if num_missing > 0:
            if num_missing > 100:
                logger.warning("There were %d books not found.", num_missing)
            else:
                logger.warning(
                    "The following %d books were not found locally: %s",
                    num_missing, str(books_not_found))
        logger.info("Finished processing %d books for clinical descriptions",
                    c - num_missing)

        return

    def getTestSuite(self):
        import unittest
        from tests.test_genereviews import GeneReviewsTestCase

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(GeneReviewsTestCase)

        return test_suite
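
The heart of process_nbk_html is locating the clinical-summary div by a
regex on its id attribute and flattening its text. A self-contained sketch
of that extraction against an inline HTML fragment (the id value and text
are invented, shaped to match the ".*Summary.sec0" pattern used above):

import re
from bs4 import BeautifulSoup

html = """
<div id="NBK1103.Summary.sec0">
  <p>Trimethylaminuria   is characterized by
     a fishy body odor.</p>
  <ul><li>Feature   one</li><li>Feature two</li></ul>
</div>
"""
soup = BeautifulSoup(html, "html.parser")
clin_summary = soup.find("div", id=re.compile(".*Summary.sec0"))
if clin_summary is not None:
    # collapse runs of whitespace, as the parser above does
    ptext = re.sub(r"\s+", " ", clin_summary.find("p").text)
    ul = clin_summary.find("ul")
    if ul is not None:
        ptext += " ".join(
            re.sub(r"\s+", " ", li.text) for li in ul.find_all("li"))
    print(ptext.strip())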
Example #3
class GeneReviews(Source):
    """
    Here we process the GeneReviews mappings to OMIM,
    plus inspect the GeneReviews (html) books to pull the clinical descriptions
    in order to populate the definitions of the terms in the ontology.
    We define the GeneReviews items as classes that are either grouping classes
    over OMIM disease ids (gene ids are filtered out),
    or are made as subclasses of DOID:4 (generic disease).

    Note that GeneReviews
    [copyright policy](http://www.ncbi.nlm.nih.gov/books/NBK138602/)
    (as of 2015.11.20) says:

    GeneReviews® chapters are owned by the University of Washington, Seattle,
    © 1993-2015. Permission is hereby granted to reproduce, distribute,
    and translate copies of content materials provided that
    (i) credit for source (www.ncbi.nlm.nih.gov/books/NBK1116/)
        and copyright (University of Washington, Seattle)
        are included with each copy;
    (ii) a link to the original material is provided whenever the material is
        published elsewhere on the Web; and
    (iii) reproducers, distributors, and/or translators comply with this
        copyright notice and the GeneReviews Usage Disclaimer.

    This script doesn't pull the GeneReviews books from the NCBI Bookshelf
    directly; scripting this task is expressly prohibited by
    [NCBI Bookshelf policy](http://www.ncbi.nlm.nih.gov/books/NBK45311/).
    However, assuming you have acquired the books (in HTML format) via
    permissible means, a parser for those books is provided here to extract
    the clinical descriptions to define the NBK-identified classes.

    """

    files = {
        'idmap': {'file': 'NBKid_shortname_OMIM.txt',
                  'url': GRDL + '/NBKid_shortname_OMIM.txt'},
        'titles': {'file': 'GRtitle_shortname_NBKid.txt',
                   'url': GRDL + '/GRtitle_shortname_NBKid.txt'}
        }

    def __init__(self):
        Source.__init__(self, 'genereviews')

        self.load_bindings()

        self.dataset = Dataset(
            'genereviews', 'Gene Reviews', 'http://genereviews.org/',
            None, 'http://www.ncbi.nlm.nih.gov/books/NBK138602/')
        self.dataset.set_citation('GeneReviews:NBK1116')

        self.gu = GraphUtils(curie_map.get())

        self.book_ids = set()
        self.all_books = {}

        if 'test_ids' not in config.get_config() or\
                'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_ids = list()
        else:
            # select only those test ids that are OMIM ids.
            self.test_ids = config.get_config()['test_ids']['disease']

        return

    def fetch(self, is_dl_forced=False):
        """
        We fetch GeneReviews id-label map and id-omim mapping files from NCBI.
        :return: None
        """

        self.get_files(is_dl_forced)

        return

    def parse(self, limit=None):
        """
        :return: None
        """

        if self.testOnly:
            self.testMode = True

        self._get_titles(limit)
        self._get_equivids(limit)

        self.create_books()
        self.process_nbk_html(limit)

        self.load_bindings()

        # no test subset for now; test == full graph
        self.testgraph = self.graph

        logger.info("Found %d nodes", len(self.graph))

        return

    def _get_equivids(self, limit):
        """
        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Each row represents a mapping between
        a gr id and an omim id. This is a 1:many relationship,
        and some of the omim ids are genes (not diseases).
        Therefore, we need to create a loose coupling here.
        We make the assumption that these NBKs are generally higher-level
        grouping classes; therefore the OMIM ids are treated as subclasses.
        (This assumption is poor for those omims that are actually genes,
        but we have no way of knowing what those are here...
        we will just have to deal with that for now.)
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files['idmap']['file']))
        gu = GraphUtils(curie_map.get())
        line_counter = 0

        # we look some stuff up in OMIM, so initialize here
        omim = OMIM()
        id_map = {}
        allomimids = set()
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter == 1:  # skip header
                    continue
                (nbk_num, shortname, omim_num) = row
                gr_id = 'GeneReviews:'+nbk_num
                omim_id = 'OMIM:'+omim_num
                # in test mode, only process ids in the test set
                if self.testMode and omim_id not in self.test_ids:
                    continue

                # sometimes there's bad omim nums
                if len(omim_num) > 6:
                    logger.warning(
                        "OMIM number incorrectly formatted " +
                        "in row %d; skipping:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # build up a hashmap of the mappings; then process later
                if nbk_num not in id_map:
                    id_map[nbk_num] = set()
                id_map[nbk_num].add(omim_num)

                # add the class along with the shortname
                gu.addClassToGraph(self.graph, gr_id, None)
                gu.addSynonym(self.graph, gr_id, shortname)

                allomimids.add(omim_num)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

            # end looping through file

        # get the omim ids that are not genes
        entries_that_are_phenotypes = \
            omim.process_entries(
                list(allomimids), filter_keep_phenotype_entry_ids,
                None, None, limit)

        logger.info("Filtered out %d/%d entries that are genes or features",
                    len(allomimids)-len(entries_that_are_phenotypes),
                    len(allomimids))

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:'+nbk_num
            if nbk_num in id_map:
                omim_ids = id_map.get(nbk_num)
                for omim_num in omim_ids:
                    omim_id = 'OMIM:'+omim_num
                    # add the gene reviews as a superclass to the omim id,
                    # but only if the omim id is not a gene
                    if omim_id in entries_that_are_phenotypes:
                        gu.addClassToGraph(self.graph, omim_id, None)
                        gu.addSubclass(self.graph, gr_id, omim_id)
            # add this as a generic subclass of DOID:4
            gu.addSubclass(self.graph, 'DOID:4', gr_id)

        return

    def _get_titles(self, limit):
        """
        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Where each of the rows represents a mapping between
        a gr id and an omim id. These are a 1:many relationship,
        and some of the omim ids are genes (not diseases).
        Therefore, we need to create a loose coupling here.
        We make the assumption that these NBKs are generally higher-level
        grouping classes; therefore the OMIM ids are treated as subclasses.
        (This assumption is poor for those omims that are actually genes,
        but we have no way of knowing what those are here...
        we will just have to deal with that for now.)
        :param limit:
        :return:
        """
        raw = '/'.join((self.rawdir, self.files['titles']['file']))
        gu = GraphUtils(curie_map.get())
        line_counter = 0
        with open(raw, 'r', encoding='latin-1') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter == 1:  # skip header
                    continue
                (shortname, title, nbk_num) = row
                gr_id = 'GeneReviews:'+nbk_num

                self.book_ids.add(nbk_num)  # a global set of the book nums

                if limit is None or line_counter < limit:
                    gu.addClassToGraph(self.graph, gr_id, title)
                    gu.addSynonym(self.graph, gr_id, shortname)

        return

    def create_books(self):

        # note that although we put in the url to the book,
        # NCBI Bookshelf does not allow robots to download content
        book_item = {'file': 'books/',
                     'url': ''}

        for nbk in self.book_ids:
            b = book_item.copy()
            b['file'] = '/'.join(('books', nbk+'.html'))
            b['url'] = 'http://www.ncbi.nlm.nih.gov/books/'+nbk
            self.all_books[nbk] = b

        return

    def process_nbk_html(self, limit):
        """
        Here we process the gene reviews books to fetch
        the clinical descriptions to include in the ontology.
        We only use books that have been acquired manually,
        as NCBI Bookshelf does not permit automated downloads.
        This parser will only process the books that are found in
        the ```raw/genereviews/books``` directory,
        permitting partial completion.

        :param limit:
        :return:
        """
        c = 0
        books_not_found = set()
        for nbk in self.book_ids:
            c += 1
            nbk_id = 'GeneReviews:'+nbk
            book_item = self.all_books.get(nbk)
            url = '/'.join((self.rawdir, book_item['file']))

            # figure out if the book is there; if so, process, otherwise skip
            book_dir = '/'.join((self.rawdir, 'books'))
            book_files = os.listdir(book_dir)
            if ''.join((nbk, '.html')) not in book_files:
                # logger.warning("No book found locally for %s; skipping", nbk)
                books_not_found.add(nbk)
                continue
            logger.info("Processing %s", nbk)

            # 'url' here is a local file path under rawdir
            with open(url) as page:
                soup = BeautifulSoup(page.read(), "html.parser")

            # sec0 == clinical description
            clin_summary = \
                soup.find(
                    'div', id=re.compile(".*Summary.sec0"))
            if clin_summary is not None:
                p = clin_summary.find('p')
                ptext = p.text
                ptext = re.sub(r'\s+', ' ', ptext)

                ul = clin_summary.find('ul')
                if ul is not None:
                    item_text = list()
                    for li in ul.find_all('li'):
                        item_text.append(re.sub(r'\s+', ' ', li.text))
                    ptext += ' '.join(item_text)

                # add in the copyright and citation info to description
                ptext = \
                    ' '.join(
                        (ptext,
                         '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                         nbk_id+']'))

                self.gu.addDefinition(self.graph, nbk_id, ptext.strip())

            # get the pubs
            pmid_set = set()
            pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
            if pub_div is not None:
                ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
                for r in ref_list:
                    for a in r.find_all(
                            'a', attrs={'href': re.compile(r"pubmed")}):
                        if re.match(r'PubMed:', a.text):
                            pmnum = re.sub(r'PubMed:\s*', '', a.text)
                        else:
                            pmnum = \
                                re.search(
                                    r'\/pubmed\/(\d+)$', a['href']).group(1)
                        if pmnum is not None:
                            pmid = 'PMID:'+str(pmnum)
                            self.gu.addTriple(
                                self.graph, pmid,
                                self.gu.object_properties['is_about'],
                                nbk_id)
                            pmid_set.add(pmnum)
                            r = Reference(
                                pmid, Reference.ref_types['journal_article'])
                            r.addRefToGraph(self.graph)

            # TODO add author history, copyright, license to dataset

            # TODO get PMID-NBKID equivalence (near foot of page),
            # and make it "is about" link
            # self.gu.addTriple(
            #   self.graph, pmid,
            #   self.gu.object_properties['is_about'], nbk_id)
            # for example: NBK1191 PMID:20301370

            # add the book to the dataset
            self.dataset.setFileAccessUrl(book_item['url'])

            if limit is not None and c > limit:
                break

            # finish looping through books

        num_missing = len(books_not_found)
        if num_missing > 0:
            if num_missing > 100:
                logger.warning("There were %d books not found.", num_missing)
            else:
                logger.warning(
                    "The following %d books were not found locally: %s",
                    num_missing, str(books_not_found))
        logger.info(
            "Finished processing %d books for clinical descriptions",
            c - num_missing)

        return

    def getTestSuite(self):
        import unittest
        from tests.test_genereviews import GeneReviewsTestCase

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(GeneReviewsTestCase)

        return test_suite
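
Both versions of process_nbk_html recover a PubMed number either from the
anchor text ("PubMed: NNN") or, failing that, from the /pubmed/NNN tail of
the href. A standalone sketch of that fallback (the helper name and sample
inputs are invented; unlike the code above, it guards against a
non-matching href rather than calling .group(1) on None):

import re

def extract_pmnum(text, href):
    # prefer the anchor text when it carries an explicit PubMed label
    if re.match(r"PubMed:", text):
        return re.sub(r"PubMed:\s*", "", text)
    # otherwise fall back to the trailing id in the href
    match = re.search(r"/pubmed/(\d+)$", href)
    return match.group(1) if match else None

assert extract_pmnum("PubMed: 20301370", "") == "20301370"
assert extract_pmnum("citation", "/pubmed/20301370") == "20301370"
assert extract_pmnum("citation", "/books/NBK1191") is None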
Example #4
File: MPD.py  Project: DoctorBud/dipper
class MPD(Source):
    """
    From the [MPD](http://phenome.jax.org/) website:
    This resource is a collaborative standardized collection of measured data
    on laboratory mouse strains and populations. It includes baseline
    phenotype data sets as well as studies of drug, diet, disease, and aging
    effects. It also includes protocols, projects and publications, and SNP,
    variation, and gene expression studies.

    Here, we pull the data and model the genotypes using GENO and
    the genotype-to-phenotype associations using the OBAN schema.

    MPD provides measurements for particular assays for several strains.
    Each of these measurements is itself mapped to an MP or VT term
    as a phenotype.  Therefore, we can create a strain-to-phenotype
    association for those strains that lie outside of the "normal" range for
    the given measurement: we compute the mean of the measurements across
    all strains tested, and then flag any measurement that falls beyond some
    threshold from that mean.

    Our default threshold is +/-2 standard deviations from the mean.

    Because the measurements are made and recorded at the level of
    a specific sex of each strain, we associate the MP/VT phenotype with
    the sex-qualified genotype/strain.

    """
    MPDDL = 'http://phenomedoc.jax.org/MPD_downloads'
    files = {
        'ontology_mappings': {
            'file': 'ontology_mappings.csv',
            'url': MPDDL + '/ontology_mappings.csv'},
        'straininfo': {
            'file': 'straininfo.csv',
            'url': MPDDL + '/straininfo.csv'},
        'assay_metadata': {
            'file': 'measurements.csv',
            'url': MPDDL + '/measurements.csv'},
        'strainmeans': {
            'file': 'strainmeans.csv.gz',
            'url': MPDDL + '/strainmeans.csv.gz'},
        # 'mpd_datasets_metadata': { #TEC does not seem to be used
        #    'file': 'mpd_datasets_metadata.xml.gz',
        #    'url': MPDDL + '/mpd_datasets_metadata.xml.gz'},
    }

    # the following are strain ids for testing
    # test_ids = [
    #   "MPD:2", "MPD:3", "MPD:5", "MPD:6", "MPD:9", "MPD:11", "MPD:18",
    #   "MPD:20", "MPD:24", "MPD:28", "MPD:30", "MPD:33", "MPD:34", "MPD:36",
    #   "MPD:37", "MPD:39", "MPD:40", "MPD:42", "MPD:47", "MPD:66", "MPD:68",
    #   "MPD:71", "MPD:75", "MPD:78", "MPD:122", "MPD:169", "MPD:438",
    #   "MPD:457","MPD:473", "MPD:481", "MPD:759", "MPD:766", "MPD:770",
    #   "MPD:849",  "MPD:857", "MPD:955", "MPD:964", "MPD:988", "MPD:1005",
    #   "MPD:1017", "MPD:1204", "MPD:1233", "MPD:1235", "MPD:1236", "MPD:1237"]

    test_ids = [
        'MPD:6', 'MPD:849', 'MPD:425', 'MPD:569', "MPD:10", "MPD:1002",
        "MPD:39", "MPD:2319"]

    mgd_agent_id = "MPD:db/q?rtn=people/allinv"
    mgd_agent_label = "Mouse Phenome Database"
    mgd_agent_type = "foaf:organization"

    def __init__(self, graph_type, are_bnodes_skolemized):
        Source.__init__(self, graph_type, are_bnodes_skolemized, 'mpd')
        # @N, not sure if this step is required
        self.stdevthreshold = 2

        # update the dataset object with details about this resource
        # @N: Note that there is no license as far as I can tell
        self.dataset = Dataset(
            'mpd', 'MPD', 'http://phenome.jax.org', None, None)

        # TODO add a citation for mpd dataset as a whole
        self.dataset.set_citation('PMID:15619963')

        self.assayhash = {}
        self.idlabel_hash = {}
        # to store the mean value for each measure by strain+sex
        self.score_means_by_measure = {}
        # to store the z-score of each strain for each measure, by sex
        self.strain_scores_by_measure = {}

        return

    def fetch(self, is_dl_forced=False):

        self.get_files(is_dl_forced)
        return

    def parse(self, limit=None):
        """
        MPD data is delivered in four separate csv files and one xml file,
        which we process iteratively and write out as
        one large graph.

        :param limit:
        :return:
        """
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", str(limit))

        logger.info("Parsing files...")

        self._process_straininfo(limit)
        # the following will provide us the hash-lookups
        # These must be processed in a specific order

        # mapping between assays and ontology terms
        self._process_ontology_mappings_file(limit)
        # this is the metadata about the measurements
        self._process_measurements_file(limit)
        # get all the measurements per strain
        self._process_strainmeans_file(limit)

        # The following will use the hash populated above
        # to lookup the ids when filling in the graph
        self._fill_provenance_graph(limit)

        logger.info("Finished parsing.")
        return

    def _process_ontology_mappings_file(self, limit):

        # line_counter = 0  # TODO unused

        logger.info("Processing ontology mappings...")
        raw = '/'.join((self.rawdir, 'ontology_mappings.csv'))

        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            self.check_header(self.files['ontology_mappings']['file'], f.readline())
            for row in reader:
                try:
                    (assay_id, ont_term, descrip) = row
                except ValueError:
                    continue
                assay_id = int(assay_id)
                if re.match(r'(MP|VT)', ont_term):
                    # add the mapping de novo
                    if assay_id not in self.assayhash:
                        self.assayhash[assay_id] = {}
                        self.assayhash[assay_id]['ont_terms'] = set()
                    self.assayhash[assay_id]['ont_terms'].add(ont_term)

        return

    def _process_straininfo(self, limit):
        # line_counter = 0  # TODO unused
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            self.check_header(self.files['straininfo']['file'], f.readline())
            for row in reader:
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                # create the strain as an instance of the taxon
                if self.testMode and \
                        'MPD:' + str(mpd_strainid) not in self.test_ids:
                    continue
                strain_id = 'MPD-strain:' + str(mpd_strainid)
                model.addIndividualToGraph(strain_id, strain_name, tax_id)
                if mpdshortname.strip() != '':
                    model.addSynonym(strain_id, mpdshortname.strip())
                self.idlabel_hash[strain_id] = strain_name
                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        jax_id = 'JAX:'+stocknum
                        model.addSameIndividual(strain_id, jax_id)
                    elif vendor == 'Rbrc':
                        # RIKEN BioResource Center
                        riken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum)
                        model.addSameIndividual(strain_id, riken_id)
                    else:
                        if url != '':
                            model.addXref(strain_id, url, True)
                        if vendor != '':
                            model.addXref(
                                strain_id, ':'.join((vendor, stocknum)),
                                True)

                # add the panel information
                if panel != '':
                    desc = panel+' [panel]'
                    model.addDescription(strain_id, desc)

                # TODO make the panels as a resource collection

        return
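
    # Illustrative result for the sample row in the comment above (C57BL/6J):
    # the strain is added as individual MPD-strain:7 of NCBITaxon:10090 and,
    # because the vendor is 'J', declared the same individual as JAX:000664.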

    def _process_measurements_file(self, limit):
        line_counter = 0

        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, 'measurements.csv'))

        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            self.check_header(
                self.files['assay_metadata']['file'], f.readline())
            for row in reader:
                line_counter += 1
                assay_id = int(row[0])
                assay_label = row[4]
                assay_units = row[5]
                assay_type = row[6] if row[6] != '' else None

                if assay_id not in self.assayhash:
                    self.assayhash[assay_id] = {}
                description = self.build_measurement_description(row)
                self.assayhash[assay_id]['description'] = description
                self.assayhash[assay_id]['assay_label'] = assay_label
                self.assayhash[assay_id]['assay_type'] = assay_type
                self.assayhash[assay_id]['assay_units'] = assay_units

                # TODO add projectsym property?
                # TODO add intervention?
                # ageweeks might be useful for adding to phenotype assoc

            # end loop on measurement metadata

        return

    def _process_strainmeans_file(self, limit):
        """
        This will store the entire set of strain means in a hash.
        Not the most efficient representation, but it allows easy access.
        We will loop through this later to apply cutoffs
        and add associations.
        :param limit:
        :return:

        """
        logger.info("Processing strain means ...")
        line_counter = 0
        raw = '/'.join((self.rawdir, self.files['strainmeans']['file']))
        with gzip.open(raw, 'rb') as f:
            f = io.TextIOWrapper(f)
            reader = csv.reader(f)
            self.check_header(self.files['strainmeans']['file'], f.readline())
            score_means_by_measure = {}
            strain_scores_by_measure = {}
            for row in reader:
                try:
                    (measnum, varname, strain, strainid, sex, mean, nmice, sd,
                     sem, cv, minval, maxval, logmean, logsd, zscore,
                     logzscore) = row
                except ValueError:
                    continue
                line_counter += 1
                strain_num = int(strainid)
                assay_num = int(measnum)
                # assuming the zscore is across all the items
                # in the same measure+var+strain+sex
                # note: it seems that there is only ever 1 varname per measnum.
                # note: some assays only tested one sex!
                # we split this here by sex
                if assay_num not in score_means_by_measure:
                    score_means_by_measure[assay_num] = {}
                if sex not in score_means_by_measure[assay_num]:
                    score_means_by_measure[assay_num][sex] = list()
                score_means_by_measure[assay_num][sex].append(float(mean))

                if strain_num not in strain_scores_by_measure:
                    strain_scores_by_measure[strain_num] = {}
                if sex not in strain_scores_by_measure[strain_num]:
                    strain_scores_by_measure[strain_num][sex] = {}
                strain_scores_by_measure[strain_num][sex][assay_num] = \
                    {'mean': float(mean), 'zscore': float(zscore)}

            # end loop over strainmeans
        self.score_means_by_measure = score_means_by_measure
        self.strain_scores_by_measure = strain_scores_by_measure

        return

    def _fill_provenance_graph(self, limit):
        logger.info("Building graph ...")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        taxon_id = 'NCBITaxon:10090'  # hardcode to Mus musculus
        model.addClassToGraph(taxon_id, None)

        scores_passing_threshold_count = 0
        scores_passing_threshold_with_ontologies_count = 0
        scores_not_passing_threshold_count = 0

        # loop through all the strains,
        # and make G2P assoc for those with scores beyond threshold
        for strain_num in self.strain_scores_by_measure:
            if self.testMode and 'MPD:'+str(strain_num) not in self.test_ids:
                continue
            strain_id = 'MPD-strain:'+str(strain_num)
            for sex in self.strain_scores_by_measure[strain_num]:
                measures = self.strain_scores_by_measure[strain_num][sex]
                for m in measures:
                    assay_id = 'MPD-assay:'+str(m)
                    # TODO consider using the means
                    # instead of precomputed zscores
                    if 'zscore' in measures[m]:
                        zscore = measures[m]['zscore']
                        if abs(zscore) >= self.stdevthreshold:
                            scores_passing_threshold_count += 1
                            # logger.info(
                            #   "Score passing threshold: %s | %s | %s",
                            #   strain_id, assay_id, zscore)
                            # add the G2P assoc
                            prov = Provenance(self.graph)
                            try:
                                assay_label = self.assayhash[m]['assay_label']
                                assay_description = \
                                    self.assayhash[m]['description']
                                ont_term_ids = self.assayhash[m].get('ont_terms')
                                comment = ' '.join((assay_label,
                                                   '(zscore='+str(zscore)+')'))
                            except KeyError:
                                assay_label = None
                                assay_description = None
                                ont_term_ids = None
                            if assay_label is not None:
                                assay_label += ' ('+str(m)+')'
                            # TODO unused
                            # assay_type = self.assayhash[m]['assay_type']

                            assay_type_id = Provenance.provenance_types['assay']

                            if ont_term_ids is not None:
                                scores_passing_threshold_with_ontologies_count += 1
                                prov.add_assay_to_graph(
                                    assay_id, assay_label, assay_type_id,
                                    assay_description)
                                self._add_g2p_assoc(
                                    g, strain_id, sex, assay_id, ont_term_ids,
                                    comment)
                        else:
                            scores_not_passing_threshold_count += 1

        logger.info("Scores passing threshold: %d",
                    scores_passing_threshold_count)
        logger.info("Scores passing threshold with ontologies: %d",
                    scores_passing_threshold_with_ontologies_count)
        logger.info("Scores not passing threshold: %d",
                    scores_not_passing_threshold_count)

        return

    def _add_g2p_assoc(self, g, strain_id, sex, assay_id, phenotypes, comment):
        """
        Create an association between a sex-specific strain id
        and each of the phenotypes.
        Here, we create a genotype from the strain,
        and a sex-specific genotype.
        Each of those genotypes is created as an anonymous node.

        The evidence code is hardcoded to be:
            ECO:experimental_phenotypic_evidence.

        :param g:
        :param strain_id:
        :param sex:
        :param assay_id:
        :param phenotypes: a list of phenotypes to associate with the strain
        :param comment:
        :return:

        """
        geno = Genotype(g)
        model = Model(g)
        eco_id = "ECO:0000059"  # experimental_phenotypic_evidence
        strain_label = self.idlabel_hash.get(strain_id)
        # strain genotype
        genotype_id = '_'+'-'.join((re.sub(r':', '', strain_id), 'genotype'))
        if strain_label is not None:
            genotype_label = '[' + strain_label + ']'
        else:
            # fall back to the id so label construction cannot fail on None
            genotype_label = '[' + strain_id + ']'

        sex_specific_genotype_id = '_'+'-'.join((re.sub(r':', '', strain_id),
                                                 sex, 'genotype'))
        if strain_label is not None:
            sex_specific_genotype_label = strain_label + ' (' + sex + ')'
        else:
            sex_specific_genotype_label = strain_id + ' (' + sex + ')'

        genotype_type = Genotype.genoparts['sex_qualified_genotype']
        if sex == 'm':
            genotype_type = Genotype.genoparts['male_genotype']
        elif sex == 'f':
            genotype_type = Genotype.genoparts['female_genotype']

        # add the genotype to strain connection
        geno.addGenotype(
            genotype_id, genotype_label,
            Genotype.genoparts['genomic_background'])
        g.addTriple(
            strain_id, Genotype.object_properties['has_genotype'], genotype_id)

        geno.addGenotype(
            sex_specific_genotype_id, sex_specific_genotype_label,
            genotype_type)

        # add the strain as the background for the genotype
        g.addTriple(
            sex_specific_genotype_id,
            Genotype.object_properties['has_sex_agnostic_genotype_part'],
            genotype_id)

        # #############    BUILD THE G2P ASSOC    #############
        # TODO add more provenance info when that model is completed

        if phenotypes is not None:
            for phenotype_id in phenotypes:
                assoc = G2PAssoc(
                    g, self.name, sex_specific_genotype_id, phenotype_id)
                assoc.add_evidence(assay_id)
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()
                model.addComment(assoc_id, comment)

        return

    def getTestSuite(self):
        import unittest
        from tests.test_mpd import MPDTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(MPDTestCase)

        return test_suite

    @staticmethod
    def normalise_units(units):
        # todo:
        return units

    @staticmethod
    def build_measurement_description(row):
        (measnum,
         mpdsector,
         projsym,
         varname,
         descrip,
         units,
         method,
         intervention,
         paneldesc,
         datatype,
         sextested,
         nstrainstested,
         ageweeks,) = row

        if sextested == 'f':
            sextested = 'female'
        elif sextested == 'm':
            sextested = 'male'
        elif sextested == 'fm':
            sextested = 'male and female'
        else:
            logger.warning("Unknown sex tested key: %s", sextested)
        description = "This is an assay of [" + descrip + "] shown as a [" + \
                      datatype + "] measured in [" + units + "]"

        if intervention is not None and intervention != "":
            description += " in response to [" + intervention + "]"
        """
        As of 9/28/2017 intparm is no longer in the measurements.tsv
        if intparm is not None and intervention != "":
            description += \
                ". This represents the [" + intparm + \
                "] arm, using materials and methods that included [" + \
                method + "]"
        """

        description += \
            ".  The overall experiment is entitled [" + projsym + "].  "

        description += \
            "It was conducted in [" + sextested + "] mice at [" + \
            ageweeks + "] weeks of age in [" + nstrainstested + \
            "] different mouse strains. "
        """
        As of 9/28/2017 cat1-3 are no longer in the measurements.tsv
        description += "Keywords: " + cat1 + \
                       ((", " + cat2) if cat2.strip() is not "" else "") + \
                       ((", " + cat3) if cat3.strip() is not "" else "") + "."
        """
        return description
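
    # Illustrative description produced above (hypothetical field values):
    #   "This is an assay of [body weight] shown as a [mean] measured in [g]
    #   in response to [high fat diet].  The overall experiment is entitled
    #   [Jaxpheno1].  It was conducted in [female] mice at [12] weeks of age
    #   in [30] different mouse strains. "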

    # def _log_missing_ids(self, missing_id, name_of_file_from_which_missing):
    #     if missing_id not in self.missing_assay_hash:
    #         self.missing_assay_hash[missing_id] = set()
    #     self.missing_assay_hash[missing_id].add(name_of_file_from_which_missing)
    #     # todo: remove the offending ids from the hash
    #     return
    @staticmethod
    def check_header(filename, header):
        header = header.rstrip("\n")
        header_map = {
            'strainmeans.csv.gz':
                'measnum,varname,strain,strainid,sex,mean,'
                'nmice,sd,sem,cv,minval,maxval,logmean,'
                'logsd,zscore,logzscore',
            'straininfo.csv':
                'strainname,vendor,stocknum,panel,mpd_strainid,'
                'straintype,n_proj,n_snp_datasets,mpd_shortname,url',
            'measurements.csv':
                'measnum,mpdsector,projsym,varname,descrip,units,'
                'method,intervention,paneldesc,datatype,sextested,'
                'nstrainstested,ageweeks',
            'ontology_mappings.csv':
                'measnum,ont_term,descrip'
        }
        if header != header_map[filename]:
            raise ValueError(
                "header in {} \n {}\n"
                "does not match expected:\n {}"
                .format(filename, header, header_map[filename])
            )
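
    # Illustrative use of check_header (hypothetical call): the first line
    # read from each csv must match the expected header exactly, e.g.
    #   MPD.check_header('ontology_mappings.csv', 'measnum,ont_term,descrip\n')
    # passes, while any drift in the column names raises ValueError.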
Example #5
File: MPD.py Project: putmantime/dipper
class MPD(Source):
    """
    From the [MPD](http://phenome.jax.org/) website:
    This resource is a collaborative standardized collection of measured data
    on laboratory mouse strains and populations. Includes baseline phenotype
    data sets as well as studies of drug, diet, disease and aging effect.
    Also includes protocols, projects and publications, and SNP,
    variation and gene expression studies.

    Here, we pull the data and model the genotypes using GENO and
    the genotype-to-phenotype associations using the OBAN schema.

    MPD provides measurements for particular assays across many strains.
    Each of these measurements is itself mapped to an MP or VT term
    as a phenotype.  Therefore, we can create a strain-to-phenotype association
    for those strains that lie outside of the "normal" range for a given
    measurement.  We compute the average of the measurements
    for all strains tested, and then flag any measurement that falls
    beyond some threshold from that average.

    Our default threshold here is +/- 2 standard deviations from the mean.

    Because the measurements are made and recorded at the level of
    a specific sex of each strain, we associate the MP/VT phenotype with
    the sex-qualified genotype/strain.

    """
    MPDDL = 'http://phenomedoc.jax.org/MPD_downloads'
    files = {
        'ontology_mappings': {
            'file': 'ontology_mappings.csv',
            'url': MPDDL + '/ontology_mappings.csv'},
        'straininfo': {
            'file': 'straininfo.csv',
            'url': MPDDL + '/straininfo.csv'},
        'assay_metadata': {
            'file': 'measurements.csv',
            'url': MPDDL + '/measurements.csv'},
        'strainmeans': {
            'file': 'strainmeans.csv.gz',
            'url': MPDDL + '/strainmeans.csv.gz'},
        # 'mpd_datasets_metadata': { #TEC does not seem to be used
        #    'file': 'mpd_datasets_metadata.xml.gz',
        #    'url': MPDDL + '/mpd_datasets_metadata.xml.gz'},
    }

    # the following are strain ids for testing
    # test_ids = [
    #   "MPD:2", "MPD:3", "MPD:5", "MPD:6", "MPD:9", "MPD:11", "MPD:18",
    #   "MPD:20", "MPD:24", "MPD:28", "MPD:30", "MPD:33", "MPD:34", "MPD:36",
    #   "MPD:37", "MPD:39", "MPD:40", "MPD:42", "MPD:47", "MPD:66", "MPD:68",
    #   "MPD:71", "MPD:75", "MPD:78", "MPD:122", "MPD:169", "MPD:438",
    #   "MPD:457","MPD:473", "MPD:481", "MPD:759", "MPD:766", "MPD:770",
    #   "MPD:849",  "MPD:857", "MPD:955", "MPD:964", "MPD:988", "MPD:1005",
    #   "MPD:1017", "MPD:1204", "MPD:1233", "MPD:1235", "MPD:1236", "MPD:1237"]

    test_ids = [
        'MPD:6', 'MPD:849', 'MPD:425', 'MPD:569', "MPD:10", "MPD:1002",
        "MPD:39", "MPD:2319"]

    mgd_agent_id = "MPD:db/q?rtn=people/allinv"
    mgd_agent_label = "Mouse Phenome Database"
    mgd_agent_type = "foaf:organization"

    def __init__(self, graph_type, are_bnodes_skolemized):
        Source.__init__(self, graph_type, are_bnodes_skolemized, 'mpd')
        # @N, not sure if this step is required
        self.stdevthreshold = 2

        # update the dataset object with details about this resource
        # @N: Note that there is no license as far as I can tell
        self.dataset = Dataset(
            'mpd', 'MPD', 'http://phenome.jax.org', None, None)

        # TODO add a citation for mpd dataset as a whole
        self.dataset.set_citation('PMID:15619963')

        self.assayhash = {}
        self.idlabel_hash = {}
        # to store, for each measure, the list of means split by sex
        self.score_means_by_measure = {}
        # to store each strain's mean and zscore per measure, split by sex
        self.strain_scores_by_measure = {}

        return

    def fetch(self, is_dl_forced=False):

        self.get_files(is_dl_forced)
        return

    def parse(self, limit=None):
        """
        MPD data is delivered in four separate csv files,
        which we process iteratively and write out as
        one large graph.

        :param limit:
        :return:
        """
        if limit is not None:
            logger.info("Only parsing first %s rows fo each file", str(limit))

        logger.info("Parsing files...")

        self._process_straininfo(limit)
        # the following will provide us the hash-lookups
        # These must be processed in a specific order

        # mapping between assays and ontology terms
        self._process_ontology_mappings_file(limit)
        # this is the metadata about the measurements
        self._process_measurements_file(limit)
        # get all the measurements per strain
        self._process_strainmeans_file(limit)

        # The following will use the hash populated above
        # to lookup the ids when filling in the graph
        self._fill_provenance_graph(limit)

        logger.info("Finished parsing.")
        return

    def _process_ontology_mappings_file(self, limit):

        # line_counter = 0  # TODO unused

        logger.info("Processing ontology mappings...")
        raw = '/'.join((self.rawdir, 'ontology_mappings.csv'))

        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            f.readline()
            for row in reader:
                try:
                    (assay_id, ont_term, descrip) = row
                except ValueError:
                    continue
                assay_id = int(assay_id)
                if re.match(r'(MP|VT)', ont_term):
                    # add the mapping de novo
                    if assay_id not in self.assayhash:
                        self.assayhash[assay_id] = {}
                        self.assayhash[assay_id]['ont_terms'] = set()
                    self.assayhash[assay_id]['ont_terms'].add(ont_term)

        return

    def _process_straininfo(self, limit):
        # line_counter = 0  # TODO unused
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            f.readline()  # read the header row; skip
            for row in reader:
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                # create the strain as an instance of the taxon
                if self.testMode and \
                        'MPD:' + str(mpd_strainid) not in self.test_ids:
                    continue
                strain_id = 'MPD-strain:' + str(mpd_strainid)
                model.addIndividualToGraph(strain_id, strain_name, tax_id)
                if mpdshortname.strip() != '':
                    model.addSynonym(strain_id, mpdshortname.strip())
                self.idlabel_hash[strain_id] = strain_name
                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        jax_id = 'JAX:'+stocknum
                        model.addSameIndividual(strain_id, jax_id)
                    elif vendor == 'Rbrc':
                        # RIKEN BioResource Center
                        riken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum)
                        model.addSameIndividual(strain_id, riken_id)
                    else:
                        if url != '':
                            model.addXref(strain_id, url, True)
                        if vendor != '':
                            model.addXref(
                                strain_id, ':'.join((vendor, stocknum)),
                                True)

                # add the panel information
                if panel != '':
                    desc = panel+' [panel]'
                    model.addDescription(strain_id, desc)

                # TODO make the panels as a resource collection

        return

    def _process_measurements_file(self, limit):
        line_counter = 0

        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, 'measurements.csv'))

        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            header = f.readline()
            logger.info("HEADER: %s", header)
            for row in reader:
                # measnum,projsym,varname,descrip,units,cat1,cat2,cat3,
                # intervention,intparm,appmeth,panelsym,datatype,sextested,
                # nstrainstested,ageweeks
                # the file ends with a row-count footer, e.g. '(4486 rows)',
                # so skip any row without the expected 16 columns
                if len(row) != 16:
                    continue
                line_counter += 1
                assay_id = int(row[0])
                assay_label = row[3]
                assay_units = row[4]
                assay_type = row[10] if row[10] != '' else None

                if assay_id not in self.assayhash:
                    self.assayhash[assay_id] = {}
                description = self.build_measurement_description(row)
                self.assayhash[assay_id]['description'] = description
                self.assayhash[assay_id]['assay_label'] = assay_label
                self.assayhash[assay_id]['assay_type'] = assay_type
                self.assayhash[assay_id]['assay_units'] = assay_units

                # TODO add projectsym property?
                # TODO add intervention?
                # ageweeks might be useful for adding to phenotype assoc

            # end loop on measurement metadata

        return

    def _process_strainmeans_file(self, limit):
        """
        This will store the entire set of strain means in a hash.
        Not the most efficient representation, but it allows easy access.
        We will loop through this later to apply cutoffs
        and add associations.
        :param limit:
        :return:

        """
        logger.info("Processing strain means ...")
        line_counter = 0
        raw = '/'.join((self.rawdir, self.files['strainmeans']['file']))
        with gzip.open(raw, 'rb') as f:
            f = io.TextIOWrapper(f)
            reader = csv.reader(f)
            f.readline()  # read the header row; skip
            score_means_by_measure = {}
            strain_scores_by_measure = {}
            for row in reader:
                try:
                    (measnum, varname, strain, strainid, sex, mean, nmice, sd,
                     sem, cv, minval, maxval, logmean, logsd, zscore,
                     logzscore) = row
                except ValueError:
                    continue
                line_counter += 1
                strain_num = int(strainid)
                assay_num = int(measnum)
                # assuming the zscore is across all the items
                # in the same measure+var+strain+sex
                # note: it seems that there is only ever 1 varname per measnum.
                # note: some assays only tested one sex!
                # we split this here by sex
                if assay_num not in score_means_by_measure:
                    score_means_by_measure[assay_num] = {}
                if sex not in score_means_by_measure[assay_num]:
                    score_means_by_measure[assay_num][sex] = list()
                score_means_by_measure[assay_num][sex].append(float(mean))

                if strain_num not in strain_scores_by_measure:
                    strain_scores_by_measure[strain_num] = {}
                if sex not in strain_scores_by_measure[strain_num]:
                    strain_scores_by_measure[strain_num][sex] = {}
                strain_scores_by_measure[strain_num][sex][assay_num] = \
                    {'mean': float(mean), 'zscore': float(zscore)}

            # end loop over strainmeans
        self.score_means_by_measure = score_means_by_measure
        self.strain_scores_by_measure = strain_scores_by_measure

        return

    def _fill_provenance_graph(self, limit):
        logger.info("Building graph ...")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        taxon_id = 'NCBITaxon:10090'  # hardcode to Mus musculus
        model.addClassToGraph(taxon_id, None)

        scores_passing_threshold_count = 0
        scores_passing_threshold_with_ontologies_count = 0
        scores_not_passing_threshold_count = 0

        # loop through all the strains,
        # and make G2P assoc for those with scores beyond threshold
        for strain_num in self.strain_scores_by_measure:
            if self.testMode and 'MPD:'+str(strain_num) not in self.test_ids:
                continue
            strain_id = 'MPD-strain:'+str(strain_num)
            for sex in self.strain_scores_by_measure[strain_num]:
                measures = self.strain_scores_by_measure[strain_num][sex]
                for m in measures:
                    assay_id = 'MPD-assay:'+str(m)
                    # TODO consider using the means
                    # instead of precomputed zscores
                    if 'zscore' in measures[m]:
                        zscore = measures[m]['zscore']
                        if abs(zscore) >= self.stdevthreshold:
                            scores_passing_threshold_count += 1
                            # logger.info(
                            #   "Score passing threshold: %s | %s | %s",
                            #   strain_id, assay_id, zscore)
                            # add the G2P assoc
                            prov = Provenance(self.graph)
                            try:
                                assay_label = self.assayhash[m]['assay_label']
                                assay_description = \
                                    self.assayhash[m]['description']
                                ont_term_ids = self.assayhash[m].get('ont_terms')
                                comment = ' '.join((assay_label,
                                                   '(zscore='+str(zscore)+')'))
                            except KeyError:
                                assay_label = None
                                assay_description = None
                                ont_term_ids = None
                            if assay_label is not None:
                                assay_label += ' ('+str(m)+')'
                            # TODO unused
                            # assay_type = self.assayhash[m]['assay_type']

                            assay_type_id = Provenance.provenance_types['assay']

                            if ont_term_ids is not None:
                                scores_passing_threshold_with_ontologies_count += 1
                                prov.add_assay_to_graph(
                                    assay_id, assay_label, assay_type_id,
                                    assay_description)
                                self._add_g2p_assoc(
                                    g, strain_id, sex, assay_id, ont_term_ids,
                                    comment)
                        else:
                            scores_not_passing_threshold_count += 1

        logger.info("Scores passing threshold: %d",
                    scores_passing_threshold_count)
        logger.info("Scores passing threshold with ontologies: %d",
                    scores_passing_threshold_with_ontologies_count)
        logger.info("Scores not passing threshold: %d",
                    scores_not_passing_threshold_count)

        return

    def _add_g2p_assoc(self, g, strain_id, sex, assay_id, phenotypes, comment):
        """
        Create an association between a sex-specific strain id
        and each of the phenotypes.
        Here, we create a genotype from the strain,
        and a sex-specific genotype.
        Each of those genotypes is created as an anonymous node.

        The evidence code is hardcoded to be:
            ECO:experimental_phenotypic_evidence.

        :param g:
        :param strain_id:
        :param sex:
        :param assay_id:
        :param phenotypes: a list of phenotypes to associate with the strain
        :param comment:
        :return:

        """
        geno = Genotype(g)
        model = Model(g)
        eco_id = "ECO:0000059"  # experimental_phenotypic_evidence
        strain_label = self.idlabel_hash.get(strain_id)
        # strain genotype
        genotype_id = '_'+'-'.join((re.sub(r':', '', strain_id), 'genotype'))
        if strain_label is not None:
            genotype_label = '[' + strain_label + ']'
        else:
            # fall back to the id so label construction cannot fail on None
            genotype_label = '[' + strain_id + ']'

        sex_specific_genotype_id = '_'+'-'.join((re.sub(r':', '', strain_id),
                                                 sex, 'genotype'))
        if strain_label is not None:
            sex_specific_genotype_label = strain_label + ' (' + sex + ')'
        else:
            sex_specific_genotype_label = strain_id + ' (' + sex + ')'

        genotype_type = Genotype.genoparts['sex_qualified_genotype']
        if sex == 'm':
            genotype_type = Genotype.genoparts['male_genotype']
        elif sex == 'f':
            genotype_type = Genotype.genoparts['female_genotype']

        # add the genotype to strain connection
        geno.addGenotype(
            genotype_id, genotype_label,
            Genotype.genoparts['genomic_background'])
        g.addTriple(
            strain_id, Genotype.object_properties['has_genotype'], genotype_id)

        geno.addGenotype(
            sex_specific_genotype_id, sex_specific_genotype_label,
            genotype_type)

        # add the strain as the background for the genotype
        g.addTriple(
            sex_specific_genotype_id,
            Genotype.object_properties['has_sex_agnostic_genotype_part'],
            genotype_id)

        # #############    BUILD THE G2P ASSOC    #############
        # TODO add more provenance info when that model is completed

        if phenotypes is not None:
            for phenotype_id in phenotypes:
                assoc = G2PAssoc(
                    g, self.name, sex_specific_genotype_id, phenotype_id)
                assoc.add_evidence(assay_id)
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()
                model.addComment(assoc_id, comment)

        return

    def getTestSuite(self):
        import unittest
        from tests.test_mpd import MPDTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(MPDTestCase)

        return test_suite

    @staticmethod
    def normalise_units(units):
        # todo:
        return units

    @staticmethod
    def build_measurement_description(row):
        (assay_id, projsym, varname, descrip, units, cat1, cat2, cat3,
         intervention, intparm, appmeth, panelsym, datatype, sextested,
         nstrainstested, ageweeks) = row

        if sextested == 'f':
            sextested = 'female'
        elif sextested == 'm':
            sextested = 'male'
        elif sextested == 'fm':
            sextested = 'male and female'
        else:
            logger.warning("Unknown sex tested key: %s", sextested)
        description = "This is an assay of [" + descrip + "] shown as a [" + \
                      datatype + "] measured in [" + units + "]"

        if intervention is not None and intervention != "":
            description += " in response to [" + intervention + "]"
        if intparm is not None and intparm != "":
            description += \
                ". This represents the [" + intparm + \
                "] arm, using materials and methods that included [" +\
                appmeth + "]"

        description += \
            ".  The overall experiment is entitled [" + projsym + "].  "

        description += \
            "It was conducted in [" + sextested + "] mice at [" + \
            ageweeks + "] weeks of age in [" + nstrainstested + \
            "] different mouse strains. "
        description += "Keywords: " + cat1 + \
                       ((", " + cat2) if cat2.strip() is not "" else "") + \
                       ((", " + cat3) if cat3.strip() is not "" else "") + "."
        return description