Example #1
class EOM(PostgreSQLSource):
    """
    Elements of Morphology is a resource from NHGRI that has definitions of
    morphological abnormalities, together with image depictions.
    We pull those relationships, as well as our local mapping of equivalences
    between EOM and HP terminologies.

    The website is crawled monthly by NIF's DISCO crawler system,
        which we utilize here.
    Be sure to have pg user/password connection details in your conf.json file,
    like:
      dbauth : {
        'disco' : {'user' : '<username>', 'password' : '<password>'}
      }

    Monarch-curated data for the HP to EOM mapping is stored at
        https://phenotype-ontologies.googlecode.com

    Since this resource is so small, the entirety of it is the "test" set.

    """

    # we are using the production view here; should we be using services?
    tables = [
        'dvp.pr_nlx_157874_1'
    ]

    files = {
        'map': {
            'file': 'hp-to-eom-mapping.tsv',
            'url': 'https://phenotype-ontologies.googlecode.com/svn/trunk/src/ontology/hp/mappings/hp-to-eom-mapping.tsv'
        }
    }

    def __init__(self):
        super().__init__('eom')
        self.namespaces.update(curie_map.get())

        # update the dataset object with details about this resource
        # TODO put this into a conf file?
        self.dataset = Dataset(
            'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
            'http://www.genome.gov/copyright.cfm',
            'https://creativecommons.org/publicdomain/mark/1.0/')

        # check that the config has PG credentials; if not, log an error for the user
        if 'dbauth' not in config.get_config() or \
                'disco' not in config.get_config()['dbauth']:
            logger.error("not configured with PG user/password.")

        # source-specific warnings.  will be cleared when resolved.

        return

    def fetch(self, is_dl_forced=False):
        '''create the connection details for DISCO'''

        cxn = config.get_config()['dbauth']['disco']
        cxn.update(
            {'host': 'nif-db.crbs.ucsd.edu', 'database': 'disco_crawler',
             'port': 5432})

        self.dataset.setFileAccessUrl(
            ''.join(('jdbc:postgresql://', cxn['host'], ':', str(cxn['port']),
                    '/', cxn['database'])))

        # process the tables
        # self.fetch_from_pgdb(self.tables,cxn,100)  #for testing
        self.fetch_from_pgdb(self.tables, cxn)

        self.get_files(is_dl_forced)

        # FIXME: Everything needed for data provenance?
        st = os.stat('/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')))
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        self.dataset.setVersion(filedate)

        return

    def parse(self, limit=None):
        '''
            Override Source.parse inherited via PostgreSQLSource
        '''

        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        if self.testOnly:
            self.testMode = True

        logger.info("Parsing files...")

        self._process_nlx_157874_1_view('/'.join((self.rawdir,
                                                  'dvp.pr_nlx_157874_1')),
                                        limit)
        self._map_eom_terms('/'.join((self.rawdir, self.files['map']['file'])),
                            limit)

        logger.info("Finished parsing.")

        self.load_bindings()

        # since it's so small,
        # we default to copying the entire graph to the test set
        self.testgraph = self.graph

        logger.info("Found %s nodes", len(self.graph))
        return

    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        This table contains the Elements of Morphology data that has been
        screen-scraped into DISCO.
        Note that foaf:depiction is the inverse of the foaf:depicts relationship.

        Since it is bad form to have two definitions,
        we concatenate the two into one string.

        Triples:
            <eom id> a owl:Class
                rdf:label Literal(eom label)
                OIO:hasRelatedSynonym Literal(synonym list)
                IAO:definition Literal(objective_def. subjective def)
                foaf:depiction Literal(small_image_url),
                               Literal(large_image_url)
                foaf:page Literal(page_url)
                rdfs:comment Literal(long commented text)


        :param raw:
        :param limit:
        :return:
        """

        gu = GraphUtils(curie_map.get())
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
            for line in filereader:
                line_counter += 1
                (morphology_term_id, morphology_term_num,
                 morphology_term_label, morphology_term_url,
                 terminology_category_label, terminology_category_url,
                 subcategory, objective_definition, subjective_definition,
                 comments, synonyms, replaces, small_figure_url,
                 large_figure_url, e_uid, v_uid, v_uuid,
                 v_last_modified) = line

                # note:
                # e_uid v_uuid v_last_modified terminology_category_url
                # subcategory v_uid morphology_term_num
                # terminology_category_label hp_label notes
                # are currently unused.

                # Add morphology term to graph as a class
                # with label, type, and description.
                gu.addClassToGraph(self.graph, morphology_term_id,
                                   morphology_term_label)

                # Assemble the description text

                if subjective_definition != '' and not (
                        re.match(r'.+\.$', subjective_definition)):
                    # add a trailing period.
                    subjective_definition = subjective_definition.strip() + '.'
                if objective_definition != '' and not (
                        re.match(r'.+\.$', objective_definition)):
                    # add a trailing period.
                    objective_definition = objective_definition.strip() + '.'

                definition = \
                    '  '.join(
                        (objective_definition, subjective_definition)).strip()

                gu.addDefinition(self.graph, morphology_term_id, definition)

                # <term id> FOAF:depicted_by literal url
                # <url> type foaf:depiction

                # do we want both images?
                # morphology_term_id has depiction small_figure_url
                if small_figure_url != '':
                    gu.addDepiction(self.graph, morphology_term_id,
                                    small_figure_url)

                # morphology_term_id has depiction large_figure_url
                if large_figure_url != '':
                    gu.addDepiction(self.graph, morphology_term_id,
                                    large_figure_url)

                # morphology_term_id has comment comments
                if comments != '':
                    gu.addComment(self.graph, morphology_term_id,
                                  comments.strip())

                if synonyms != '':
                    for s in synonyms.split(';'):
                        gu.addSynonym(
                            self.graph, morphology_term_id, s.strip(),
                            gu.properties['hasExactSynonym'])

                # morphology_term_id hasRelatedSynonym replaces (; delimited)
                if replaces != '' and replaces != synonyms:
                    for s in replaces.split(';'):
                        gu.addSynonym(
                            self.graph, morphology_term_id, s.strip(),
                            gu.properties['hasRelatedSynonym'])

                # morphology_term_id has page morphology_term_url
                gu.addPage(self.graph, morphology_term_id, morphology_term_url)

                if limit is not None and line_counter > limit:
                    break
        return

    def _map_eom_terms(self, raw, limit=None):
        """
        This table contains the HP ID mappings from the local tsv file.
        Triples:
            <eom id> owl:equivalentClass <hp id>
        :param raw:
        :param limit:
        :return:
        """

        gu = GraphUtils(curie_map.get())

        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            for line in f1:
                line_counter += 1

                (morphology_term_id, morphology_term_label, hp_id, hp_label,
                 notes) = line.split('\t')

                # Sub out the underscores for colons.
                hp_id = re.sub('_', ':', hp_id)
                if re.match(".*HP:.*", hp_id):
                    # add the HP term as a class
                    gu.addClassToGraph(self.graph, hp_id, None)
                    # Add the HP ID as an equivalent class
                    gu.addEquivalentClass(
                        self.graph, morphology_term_id, hp_id)
                else:
                    logger.warning('No matching HP term for %s',
                                   morphology_term_label)

                if limit is not None and line_counter > limit:
                    break

        return

    def getTestSuite(self):
        import unittest
        # TODO PYLINT: Unable to import 'tests.test_eom'
        from tests.test_eom import EOMTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(EOMTestCase)

        return test_suite
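
A minimal usage sketch for the source above (a hypothetical driver, not part of the listing; it assumes conf.json already provides the 'dbauth'/'disco' credentials checked in __init__):

# hypothetical driver for the EOM source; only the method names and
# signatures come from the class above
eom = EOM()
eom.fetch(is_dl_forced=False)  # pulls dvp.pr_nlx_157874_1 from DISCO plus the HP mapping file
eom.parse(limit=None)          # builds the graph; the test set is a copy of the full graph
suite = eom.getTestSuite()     # unittest suite backed by tests.test_eom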
Example #2
class GeneReviews(Source):
    """
    Here we process the GeneReviews mappings to OMIM,
    plus inspect the GeneReviews (html) books to pull the clinical descriptions
    in order to populate the definitions of the terms in the ontology.
    We define the GeneReviews items as classes that are either grouping classes
    over OMIM disease ids (gene ids are filtered out),
    or are made as subclasses of DOID:4 (generic disease).

    Note that GeneReviews
    [copyright policy](http://www.ncbi.nlm.nih.gov/books/NBK138602/)
    (as of 2015.11.20) says:

    GeneReviews® chapters are owned by the University of Washington, Seattle,
    © 1993-2015. Permission is hereby granted to reproduce, distribute,
    and translate copies of content materials provided that
    (i) credit for source (www.ncbi.nlm.nih.gov/books/NBK1116/)
    and copyright (University of Washington, Seattle)
    are included with each copy;
    (ii) a link to the original material is provided whenever the material is
    published elsewhere on the Web; and
    (iii) reproducers, distributors, and/or translators comply with this
    copyright notice and the GeneReviews Usage Disclaimer.

    This script doesn't pull the GeneReviews books from the NCBI Bookshelf
    directly; scripting this task is expressly prohibited by
    [NCBIBookshelf policy](http://www.ncbi.nlm.nih.gov/books/NBK45311/).
    However, assuming you have acquired the books (in html format) via
    permissible means, a parser for those books is provided here to extract
    the clinical descriptions to define the NBK identified classes.

    """

    files = {
        'idmap': {
            'file': 'NBKid_shortname_OMIM.txt',
            'url': GRDL + '/NBKid_shortname_OMIM.txt'
        },
        'titles': {
            'file': 'GRtitle_shortname_NBKid.txt',
            'url': GRDL + '/GRtitle_shortname_NBKid.txt'
        }
    }

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'genereviews')

        self.dataset = Dataset('genereviews', 'Gene Reviews',
                               'http://genereviews.org/', None,
                               'http://www.ncbi.nlm.nih.gov/books/NBK138602/')
        self.dataset.set_citation('GeneReviews:NBK1116')

        self.book_ids = set()
        self.all_books = {}

        if 'test_ids' not in config.get_config() or\
                'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_ids = list()
        else:
            # select only those test ids that are OMIM ids.
            self.test_ids = config.get_config()['test_ids']['disease']

        return

    def fetch(self, is_dl_forced=False):
        """
        We fetch GeneReviews id-label map and id-omim mapping files from NCBI.
        :return: None
        """

        self.get_files(is_dl_forced)

        return

    def parse(self, limit=None):
        """
        :return: None
        """

        if self.testOnly:
            self.testMode = True

        self._get_titles(limit)
        self._get_equivids(limit)

        self.create_books()
        self.process_nbk_html(limit)

        # no test subset for now; test == full graph
        self.testgraph = self.graph
        return

    def _get_equivids(self, limit):
        """
        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Each row represents a mapping between a GeneReviews (NBK) id and an
        OMIM id. This is a 1:many relationship, and some of the OMIM ids are
        genes (not diseases). Therefore, we need to create a loose coupling
        here.
        We make the assumption that these NBKs are generally higher-level
        grouping classes; therefore the OMIM ids are treated as subclasses.
        (This assumption is poor for those omims that are actually genes,
        but we have no way of knowing what those are here...
        we will just have to deal with that for now.)
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files['idmap']['file']))
        model = Model(self.graph)
        line_counter = 0

        # we look some stuff up in OMIM, so initialize here
        omim = OMIM(self.graph_type, self.are_bnodes_skized)
        id_map = {}
        allomimids = set()
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter == 1:  # skip header
                    continue
                (nbk_num, shortname, omim_num) = row
                gr_id = 'GeneReviews:' + nbk_num
                omim_id = 'OMIM:' + omim_num
                # in test mode, only keep rows whose OMIM id is in the test set
                if self.testMode and omim_id not in self.test_ids:
                    continue

                # sometimes there are malformed OMIM numbers
                if len(omim_num) > 6:
                    logger.warning(
                        "OMIM number incorrectly formatted " +
                        "in row %d; skipping:\n%s", line_counter,
                        '\t'.join(row))
                    continue

                # build up a hashmap of the mappings; then process later
                if nbk_num not in id_map:
                    id_map[nbk_num] = set()
                id_map[nbk_num].add(omim_num)

                # add the class along with the shortname
                model.addClassToGraph(gr_id, None)
                model.addSynonym(gr_id, shortname)

                allomimids.add(omim_num)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

            # end looping through file

        # get the omim ids that are not genes
        entries_that_are_phenotypes = \
            omim.process_entries(
                list(allomimids), filter_keep_phenotype_entry_ids,
                None, None, limit)

        logger.info("Filtered out %d/%d entries that are genes or features",
                    len(allomimids) - len(entries_that_are_phenotypes),
                    len(allomimids))

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:' + nbk_num
            if nbk_num in id_map:
                omim_ids = id_map.get(nbk_num)
                for omim_num in omim_ids:
                    omim_id = 'OMIM:' + omim_num
                    # add the gene reviews as a superclass to the omim id,
                    # but only if the omim id is not a gene
                    if omim_id in entries_that_are_phenotypes:
                        model.addClassToGraph(omim_id, None)
                        model.addSubClass(omim_id, gr_id)
            # add this as a generic subclass of DOID:4
            model.addSubClass(gr_id, 'DOID:4')

        return

    def _get_titles(self, limit):
        """
        The file processed here is of the format:
        #GR_shortname   GR_Title        NBK_id  PMID
        Each row maps a GeneReviews short name and title to its NBK id,
        along with a PMID (currently unused).
        We add each NBK id as a class, using the title as its label
        and the short name as a synonym.
        :param limit:
        :return:
        """
        raw = '/'.join((self.rawdir, self.files['titles']['file']))
        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r', encoding='latin-1') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            header = next(filereader)
            line_counter = 1
            colcount = len(header)
            if colcount != 4:  # ('GR_shortname', 'GR_Title', 'NBK_id', 'PMID')
                logger.error("Unexpected Header ", header)
                exit(-1)
            for row in filereader:
                line_counter += 1
                if len(row) != colcount:
                    logger.error("Unexpected row. got: ", row)
                    logger.error("Expected data for: ", header)
                    exit(-1)
                (shortname, title, nbk_num, pmid) = row
                gr_id = 'GeneReviews:' + nbk_num

                self.book_ids.add(nbk_num)  # a global set of the book nums

                if limit is None or line_counter < limit:
                    model.addClassToGraph(gr_id, title)
                    model.addSynonym(gr_id, shortname)
                # TODO include the new PMID?

        return

    def create_books(self):

        # note that although we put in the url to the book,
        # NCBI Bookshelf does not allow robots to download content
        book_item = {'file': 'books/', 'url': ''}

        for nbk in self.book_ids:
            b = book_item.copy()
            b['file'] = '/'.join(('books', nbk + '.html'))
            b['url'] = 'http://www.ncbi.nlm.nih.gov/books/' + nbk
            self.all_books[nbk] = b

        return

    def process_nbk_html(self, limit):
        """
        Here we process the gene reviews books to fetch
        the clinical descriptions to include in the ontology.
        We only use books that have been acquired manually,
        as NCBI Bookshelf does not permit automated downloads.
        This parser will only process the books that are found in
        the ```raw/genereviews/books``` directory,
        permitting partial completion.

        :param limit:
        :return:
        """
        model = Model(self.graph)
        c = 0
        books_not_found = set()
        for nbk in self.book_ids:
            c += 1
            nbk_id = 'GeneReviews:' + nbk
            book_item = self.all_books.get(nbk)
            url = '/'.join((self.rawdir, book_item['file']))

            # figure out if the book is there; if so, process, otherwise skip
            book_dir = '/'.join((self.rawdir, 'books'))
            book_files = os.listdir(book_dir)
            if ''.join((nbk, '.html')) not in book_files:
                # logger.warning("No book found locally for %s; skipping", nbk)
                books_not_found.add(nbk)
                continue
            logger.info("Processing %s", nbk)

            with open(url) as page:
                soup = BeautifulSoup(page.read(), 'html.parser')

            # sec0 == clinical description
            clin_summary = \
                soup.find(
                    'div', id=re.compile(".*Summary.sec0"))
            if clin_summary is not None:
                p = clin_summary.find('p')
                ptext = p.text
                ptext = re.sub(r'\s+', ' ', ptext)

                ul = clin_summary.find('ul')
                if ul is not None:
                    item_text = list()
                    for li in ul.find_all('li'):
                        item_text.append(re.sub(r'\s+', ' ', li.text))
                    ptext += ' '.join(item_text)

                # add in the copyright and citation info to description
                ptext = \
                    ' '.join(
                        (ptext,
                         '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                         nbk_id+']'))

                model.addDefinition(nbk_id, ptext.strip())

            # get the pubs
            pmid_set = set()
            pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
            if pub_div is not None:
                ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
                for r in ref_list:
                    for a in r.find_all('a',
                                        attrs={'href': re.compile(r"pubmed")}):
                        if re.match(r'PubMed:', a.text):
                            pmnum = re.sub(r'PubMed:\s*', '', a.text)
                        else:
                            pmnum = \
                                re.search(
                                    r'\/pubmed\/(\d+)$', a['href']).group(1)
                        if pmnum is not None:
                            pmid = 'PMID:' + str(pmnum)
                            self.graph.addTriple(
                                pmid, model.object_properties['is_about'],
                                nbk_id)
                            pmid_set.add(pmnum)
                            reference = Reference(
                                self.graph, pmid,
                                Reference.ref_types['journal_article'])
                            reference.addRefToGraph()

            # TODO add author history, copyright, license to dataset

            # TODO get PMID-NBKID equivalence (near foot of page),
            # and make it "is about" link
            # self.gu.addTriple(
            #   self.graph, pmid,
            #   self.gu.object_properties['is_about'], nbk_id)
            # for example: NBK1191 PMID:20301370

            # add the book to the dataset
            self.dataset.setFileAccessUrl(book_item['url'])

            if limit is not None and c > limit:
                break

            # finish looping through books

        num_not_found = len(books_not_found)
        if num_not_found > 0:
            if num_not_found > 100:
                logger.warning("There were %d books not found.", num_not_found)
            else:
                logger.warning(
                    "The following %d books were not found locally: %s",
                    num_not_found, str(books_not_found))
        logger.info("Finished processing %d books for clinical descriptions",
                    c - num_not_found)

        return

    def getTestSuite(self):
        import unittest
        from tests.test_genereviews import GeneReviewsTestCase

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(GeneReviewsTestCase)

        return test_suite
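
A minimal usage sketch for the source above (a hypothetical driver; the constructor argument values are assumptions, only the signatures come from the class):

# hypothetical driver for the GeneReviews source
gr = GeneReviews(graph_type='rdf_graph',      # 'rdf_graph' is an assumed value
                 are_bnodes_skolemized=True)
gr.fetch()            # downloads NBKid_shortname_OMIM.txt and GRtitle_shortname_NBKid.txt
gr.parse(limit=None)  # also parses any chapter html already placed under raw/.../books/
suite = gr.getTestSuite()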
Example #3
class Coriell(Source):
    """
    The Coriell Catalog provided to Monarch includes metadata and descriptions
    of NIGMS, NINDS, NHGRI, and NIA cell lines.  These lines are made available
    for research purposes. Here, we create annotations for the cell lines as
    models of the diseases from which they originate.

    We create a handle for a patient from which the given cell line is derived
    (since there may be multiple cell lines created from a given patient).
    A genotype is assembled for a patient, which includes a karyotype
    (if specified) and/or a collection of variants.
    Both the genotype (has_genotype) and disease are linked to the patient
    (has_phenotype), and the cell line is listed as derived from the patient.
    The cell line is classified by its
    [CLO cell type](http://www.ontobee.org/browser/index.php?o=clo),
    which itself is linked to a tissue of origin.

    Unfortunately, the omim numbers listed in this file are both for genes
    & diseases; we have no way of knowing a priori if a designated omim number
    is a gene or disease; so we presently link the patient to any omim id via
    the has_phenotype relationship.

    Notice: The Coriell catalog is delivered to Monarch in a specific format,
    and requires ssh rsa fingerprint identification.  Other groups wishing to
    get this data in its raw form will need to contact Coriell for credentials.
    These credentials need to be placed into your configuration file for the
    fetch to work.

    """

    terms = {
        'cell_line_repository': 'CLO:0000008',
        'race': 'SIO:001015',
        'ethnic_group': 'EFO:0001799',
        'age': 'EFO:0000246',
        'sampling_time': 'EFO:0000689',
        'collection': 'ERO:0002190'
    }

    files = {
        'NINDS': {
            'file': 'NINDS.csv',
            'id': 'NINDS',
            'label': 'NINDS Human Genetics DNA and Cell line Repository',
            'page': 'https://catalog.coriell.org/1/NINDS'},
        'NIGMS': {
            'file': 'NIGMS.csv',
            'id': 'NIGMS',
            'label': 'NIGMS Human Genetic Cell Repository',
            'page': 'https://catalog.coriell.org/1/NIGMS'},
        'NIA': {
            'file': 'NIA.csv',
            'id': 'NIA',
            'label': 'NIA Aging Cell Repository',
            'page': 'https://catalog.coriell.org/1/NIA'},
        'NHGRI': {
            'file': 'NHGRI.csv',
            'id': 'NHGRI',
            'label': 'NHGRI Sample Repository for Human Genetic Research',
            'page': 'https://catalog.coriell.org/1/NHGRI'}
    }

    # the following will house the specific cell lines to use for test output
    test_lines = [
        'ND02380', 'ND02381', 'ND02383', 'ND02384', 'GM17897', 'GM17898',
        'GM17896', 'GM17944', 'GM17945', 'ND00055', 'ND00094', 'ND00136',
        'GM17940', 'GM17939', 'GM20567', 'AG02506', 'AG04407', 'AG07602',
        'AG07601', 'GM19700', 'GM19701', 'GM19702', 'GM00324', 'GM00325',
        'GM00142', 'NA17944', 'AG02505', 'GM01602', 'GM02455', 'AG00364',
        'GM13707', 'AG00780']

    def __init__(self):
        Source.__init__(self, 'coriell')

        self.load_bindings()

        self.dataset = Dataset(
            'coriell', 'Coriell', 'http://ccr.coriell.org/', None)

        # data-source specific warnings
        # (will be removed when issues are cleared)

        logger.warning(
            'We assume that if a species is not provided, '
            'that it is a Human-derived cell line')
        logger.warning(
            'We map all omim ids as a disease/phenotype entity, '
            'but should be fixed in the future')

        # check that the config has Coriell credentials; if not, log an error for the user
        if 'dbauth' not in config.get_config() or \
                'coriell' not in config.get_config()['dbauth']:
            logger.error("not configured with FTP user/password.")

        return

    def fetch(self, is_dl_forced=False):
        """
        Here we connect to the coriell sftp server using private connection
        details.  They dump bi-weekly files with a timestamp in the filename.
        For each catalog, we poll the remote site and pull the most-recently
        updated file, saving it locally under the catalog's filename
        (e.g. NINDS.csv).

        Be sure to have the Coriell sftp connection details in your conf.json
        file, like:
        dbauth : {
            "coriell" : {
                "user" : "<username>", "password" : "<password>",
                "host" : "<host>", "private_key" : "<path/to/rsa_key>"
            }
        }

        :param is_dl_forced:
        :return:

        """
        host = config.get_config()['dbauth']['coriell']['host']
        user = config.get_config()['dbauth']['coriell']['user']
        passwd = config.get_config()['dbauth']['coriell']['password']
        key = config.get_config()['dbauth']['coriell']['private_key']

        with pysftp.Connection(
                host, username=user, password=passwd, private_key=key) as sftp:
            # check to make sure each file is in there
            # get the remote files
            remote_files = sftp.listdir_attr()
            files_by_repo = {}
            for attr in remote_files:
                # for each catalog, get the most-recent filename
                m = re.match('(NIGMS|NIA|NHGRI|NINDS)', attr.filename)
                if m is not None and len(m.groups()) > 0:
                    # there should just be one now
                    files_by_repo[m.group(1)] = attr
            # sort each array in hash,
            # & get the name and time of the most-recent file for each catalog
            for r in self.files:
                logger.info("Checking on %s catalog file", r)
                fname = self.files[r]['file']
                remotef = files_by_repo[r]
                target_name = '/'.join((self.rawdir, fname))
                # check if the local file is out of date, if so, download.
                # otherwise, skip.
                # we rename (for simplicity) the original file
                st = None
                if os.path.exists(target_name):
                    st = os.stat(target_name)
                    logger.info(
                        "Local file date: %s",
                        datetime.utcfromtimestamp(st[stat.ST_CTIME]))
                if st is None or remotef.st_mtime > st[stat.ST_CTIME]:
                    if st is None:
                        logger.info(
                            "File does not exist locally; downloading...")
                    else:
                        logger.info(
                            "There's a new version of %s catalog available; "
                            "downloading...", r)
                    sftp.get(remotef.filename, target_name)
                    logger.info(
                        "Fetched remote %s -> %s",
                        remotef.filename, target_name)
                    st = os.stat(target_name)
                    filedate = \
                        datetime.utcfromtimestamp(
                            remotef.st_mtime).strftime("%Y-%m-%d")
                    logger.info(
                        "New file date: %s",
                        datetime.utcfromtimestamp(st[stat.ST_CTIME]))

                else:
                    logger.info("File %s exists; using local copy", fname)
                    filedate = \
                        datetime.utcfromtimestamp(
                            st[stat.ST_CTIME]).strftime("%Y-%m-%d")

                self.dataset.setFileAccessUrl(remotef.filename)
                self.dataset.setVersion(filedate)
        return

    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        for f in self.files:
            file = '/'.join((self.rawdir, self.files[f]['file']))
            self._process_collection(
                self.files[f]['id'],
                self.files[f]['label'],
                self.files[f]['page'])
            self._process_data(file, limit)

        logger.info("Finished parsing.")

        self.load_bindings()

        logger.info("Found %d nodes in graph", len(self.graph))
        logger.info("Found %d nodes in testgraph", len(self.testgraph))

        return

    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

            line_id a CL_0000057,  #fibroblast line
                derives_from patient_id
                part_of :NIGMSrepository
                RO:model_of OMIM:disease_id

            patient id a foaf:person,
                label: "fibroblast from patient 12345 with disease X"
                member_of family_id  #what is the right thing here?
                SIO:race EFO:caucasian  #subclass of EFO:0001799
                in_taxon NCBITaxon:9606
                dc:description Literal(remark)
                RO:has_phenotype OMIM:disease_id
                GENO:has_genotype genotype_id

            family_id a owl:NamedIndividual
                foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

            genotype_id a intrinsic_genotype
                GENO:has_alternate_part allelic_variant_id
                we don't necessarily know much about the genotype,
                other than the allelic variant. also there's the sex here

            pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:
        """
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    pass
                else:
                    line_counter += 1

                    (catalog_id, description, omim_number, sample_type,
                     cell_line_available, dna_in_stock, dna_ref, gender, age,
                     race, ethnicity, affected, karyotype, relprob, mutation,
                     gene, family_id, collection, url, cat_remark, pubmed_ids,
                     family_member, variant_id, dbsnp_id, species) = row

                    # example:
                    # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                    # parent,,,39,NIGMS Human Genetic Cell Repository,
                    # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                    # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                    # 2,,18343,Homo sapiens

                    if self.testMode and catalog_id not in self.test_lines:
                        # skip rows not in our test lines, when in test mode
                        continue

                    # ###########    BUILD REQUIRED VARIABLES    ###########

                    # Make the cell line ID
                    cell_line_id = 'Coriell:'+catalog_id.strip()

                    # Map the cell/sample type
                    cell_type = self._map_cell_type(sample_type)

                    # Make a cell line label
                    line_label = \
                        collection.partition(' ')[0]+'-'+catalog_id.strip()

                    # Map the repository/collection
                    repository = self._map_collection(collection)

                    # patients are uniquely identified by one of:
                    # dbsnp id (which is == an individual haplotype)
                    # family id + family member (if present) OR
                    # probands are usually family member zero
                    # cell line id
                    # since some patients have >1 cell line derived from them,
                    # we must make sure that the genotype is attached to
                    # the patient, and can be inferred to the cell line
                    # examples of repeated patients are:
                    #   famid=1159, member=1; fam=152,member=1

                    # Make the patient ID

                    # make an anonymous patient
                    patient_id = '_person'
                    if self.nobnodes:
                        patient_id = ':'+patient_id
                    if family_id != '':
                        patient_id = \
                            '-'.join((patient_id, family_id, family_member))
                    else:
                        # make an anonymous patient
                        patient_id = '-'.join((patient_id, catalog_id.strip()))

                    # properties of the individual patients:  sex, family id,
                    # member/relproband, description descriptions are
                    # really long and ugly SCREAMING text, so need to clean up
                    # the control cases are so odd with this labeling scheme;
                    # but we'll deal with it as-is for now.
                    short_desc = (description.split(';')[0]).capitalize()
                    if affected == 'Yes':
                        affected = 'affected'
                    elif affected == 'No':
                        affected = 'unaffected'
                    gender = gender.lower()
                    patient_label = ' '.join((affected, gender, relprob))
                    if relprob == 'proband':
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'with', short_desc))
                    else:
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'of proband with',
                                 short_desc))

                    # #############    BUILD THE CELL LINE    #############

                    # Adding the cell line as a typed individual.
                    cell_line_reagent_id = 'CLO:0000031'

                    gu.addIndividualToGraph(
                        g, cell_line_id, line_label, cell_line_reagent_id)

                    # add the equivalent id == dna_ref
                    if dna_ref != '' and dna_ref != catalog_id:
                        equiv_cell_line = 'Coriell:'+dna_ref
                        # some of the equivalent ids are not defined
                        # in the source data; so add them
                        gu.addIndividualToGraph(
                            g, equiv_cell_line, None, cell_line_reagent_id)
                        gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

                    # Cell line derives from patient
                    geno.addDerivesFrom(cell_line_id, patient_id)
                    geno.addDerivesFrom(cell_line_id, cell_type)

                    # Cell line a member of repository
                    gu.addMember(g, repository, cell_line_id)

                    if cat_remark != '':
                        gu.addDescription(g, cell_line_id, cat_remark)

                    # Cell age_at_sampling
                    # TODO add the age nodes when modeled properly in #78
                    # if (age != ''):
                        # this would give a BNode that is an instance of Age.
                        # but i don't know how to connect
                        # the age node to the cell line? we need to ask @mbrush
                        # age_id = '_'+re.sub('\s+','_',age)
                        # gu.addIndividualToGraph(
                        #   g,age_id,age,self.terms['age'])
                        # gu.addTriple(
                        #   g,age_id,self.properties['has_measurement'],age,
                        #   True)

                    # #############    BUILD THE PATIENT    #############

                    # Add the patient ID as an individual.
                    gu.addPerson(g, patient_id, patient_label)
                    # TODO map relationship to proband as a class
                    # (what ontology?)

                    # Add race of patient
                    # FIXME: Adjust for subcategories based on ethnicity field
                    # EDIT: There are 743 different entries for ethnicity...
                    # Too many to map?
                    # Add ethnicity as literal in addition to the mapped race?
                    # Adjust the ethnicity txt (if using)
                    # to initial capitalization to remove ALLCAPS

                    # TODO race should go into the individual's background
                    # and abstracted out to the Genotype class punting for now.
                    # if race != '':
                    #    mapped_race = self._map_race(race)
                    #    if mapped_race is not None:
                    #        gu.addTriple(
                    #           g,patient_id,self.terms['race'],mapped_race)
                    #        gu.addSubclass(
                    #           g,self.terms['ethnic_group'],mapped_race)

                    # #############    BUILD THE FAMILY    #############

                    # Add triples for family_id, if present.
                    if family_id != '':
                        family_comp_id = 'CoriellFamily:'+family_id

                        family_label = \
                            ' '.join(('Family of proband with', short_desc))

                        # Add the family ID as a named individual
                        gu.addIndividualToGraph(
                            g, family_comp_id, family_label,
                            geno.genoparts['family'])

                        # Add the patient as a member of the family
                        gu.addMemberOf(g, patient_id, family_comp_id)

                    # #############    BUILD THE GENOTYPE   #############

                    # the important things to pay attention to here are:
                    # karyotype = chr rearrangements  (somatic?)
                    # mutation = protein-level mutation as a label,
                    # often from omim
                    # gene = gene symbol - TODO get id
                    # variant_id = omim variant ids (; delimited)
                    # dbsnp_id = snp individual ids = full genotype?

                    # note GM00633 is a good example of chromosomal variation
                    # - do we have enough to capture this?
                    # GM00325 has both abnormal karyotype and variation

                    # make an assumption that if the taxon is blank,
                    # that it is human!
                    if species is None or species == '':
                        species = 'Homo sapiens'
                    taxon = self._map_species(species)

                    # if there's a dbSNP id,
                    # this is actually the individual's genotype
                    genotype_id = None
                    genotype_label = None
                    if dbsnp_id != '':
                        genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

                    omim_map = {}
                    gvc_id = None

                    # some of the karyotypes are encoded
                    # with terrible hidden codes. remove them here
                    # i've seen a <98> character
                    karyotype = du.remove_control_characters(karyotype)
                    karyotype_id = None
                    if karyotype.strip() != '':
                        karyotype_id = \
                            '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                        if self.nobnodes:
                            karyotype_id = ':'+karyotype_id
                        # add karyotype as karyotype_variation_complement
                        gu.addIndividualToGraph(
                            g, karyotype_id, karyotype,
                            geno.genoparts['karyotype_variation_complement'])
                        # TODO break down the karyotype into parts
                        # and map into GENO. depends on #77

                        # place the karyotype in a location(s).
                        karyo_chrs = \
                            self._get_affected_chromosomes_from_karyotype(
                                karyotype)
                        for c in karyo_chrs:
                            chr_id = makeChromID(c, taxon, 'CHR')
                            # add an anonymous sequence feature,
                            # each located on chr
                            karyotype_feature_id = '-'.join((karyotype_id, c))
                            karyotype_feature_label = \
                                'some karyotype alteration on chr'+str(c)
                            f = Feature(
                                karyotype_feature_id, karyotype_feature_label,
                                geno.genoparts['sequence_alteration'])
                            f.addFeatureStartLocation(None, chr_id)
                            f.addFeatureToGraph(g)
                            f.loadAllProperties(g)
                            geno.addParts(
                                karyotype_feature_id, karyotype_id,
                                geno.object_properties['has_alternate_part'])

                    # reset per row so a stale label never carries over
                    vl = None
                    if gene != '':
                        vl = gene + '(' + mutation + ')'

                    # fix the variant_id so it's always in the same order
                    vids = variant_id.split(';')
                    variant_id = ';'.join(sorted(list(set(vids))))

                    if karyotype.strip() != '' \
                            and not self._is_normal_karyotype(karyotype):
                        mutation = mutation.strip()
                        gvc_id = karyotype_id
                        if variant_id != '':
                            gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                                    + re.sub(r'\w*:', '', karyotype_id)
                        if mutation.strip() != '':
                            gvc_label = '; '.join((vl, karyotype))
                        else:
                            gvc_label = karyotype
                    elif variant_id.strip() != '':
                        gvc_id = '_' + variant_id.replace(';', '-')
                        gvc_label = vl
                    else:
                        # wildtype?
                        pass

                    if gvc_id is not None and gvc_id != karyotype_id \
                            and self.nobnodes:
                        gvc_id = ':'+gvc_id

                    # add the karyotype to the gvc.
                    # use reference if normal karyotype
                    karyo_rel = geno.object_properties['has_alternate_part']
                    if self._is_normal_karyotype(karyotype):
                        karyo_rel = \
                            geno.object_properties['has_reference_part']
                    if karyotype_id is not None \
                            and not self._is_normal_karyotype(karyotype) \
                            and gvc_id is not None and karyotype_id != gvc_id:
                        geno.addParts(karyotype_id, gvc_id, karyo_rel)

                    if variant_id.strip() != '':
                        # split the variants & add them as part of the genotype
                        # we don't necessarily know their zygosity,
                        # just that they are part of the genotype variant ids
                        # are from OMIM, so prefix as such we assume that the
                        # sequence alts will be defined in OMIM not here
                        # TODO sort the variant_id list, if the omim prefix is
                        # the same, then assume it's the locus make a hashmap
                        # of the omim id to variant id list;
                        # then build the genotype hashmap is also useful for
                        # removing the "genes" from the list of "phenotypes"

                        # will hold gene/locus id to variant list
                        omim_map = {}

                        locus_num = None
                        for v in variant_id.split(';'):
                            # handle omim-style and odd var ids
                            # like 610661.p.R401X
                            m = re.match(r'(\d+)\.+(.*)', v.strip())
                            if m is None or len(m.groups()) != 2:
                                # skip ids we cannot parse as locus.variant
                                continue
                            (locus_num, var_num) = m.groups()

                            if locus_num not in omim_map:
                                omim_map[locus_num] = [var_num]
                            else:
                                omim_map[locus_num] += [var_num]

                        for o in omim_map:
                            # gene_id = 'OMIM:' + o  # TODO unused
                            vslc_id = \
                                '_' + '-'.join(
                                    [o + '.' + a for a in omim_map.get(o)])
                            if self.nobnodes:
                                vslc_id = ':'+vslc_id
                            vslc_label = vl
                            # we don't really know the zygosity of
                            # the alleles at all.
                            # so the vslcs are just a pot of them
                            gu.addIndividualToGraph(
                                g, vslc_id, vslc_label,
                                geno.genoparts[
                                    'variant_single_locus_complement'])
                            for v in omim_map.get(o):
                                # this is actually a sequence alt
                                allele1_id = 'OMIM:'+o+'.'+v
                                geno.addSequenceAlteration(allele1_id, None)

                                # assume that the sa -> var_loc -> gene
                                # is taken care of in OMIM
                                geno.addPartsToVSLC(
                                    vslc_id, allele1_id, None,
                                    geno.zygosity['indeterminate'],
                                    geno.object_properties[
                                        'has_alternate_part'])

                            if vslc_id != gvc_id:
                                geno.addVSLCtoParent(vslc_id, gvc_id)

                    if affected == 'unaffected':
                        # let's just say that this person is wildtype
                        gu.addType(g, patient_id, geno.genoparts['wildtype'])
                    elif genotype_id is None:
                        # make an anonymous genotype id
                        genotype_id = '_geno'+catalog_id.strip()
                        if self.nobnodes:
                            genotype_id = ':'+genotype_id

                    # add the gvc
                    if gvc_id is not None:
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])

                        # add the gvc to the genotype
                        if genotype_id is not None:
                            if affected == 'unaffected':
                                rel = \
                                    geno.object_properties[
                                        'has_reference_part']
                            else:
                                rel = \
                                    geno.object_properties[
                                        'has_alternate_part']
                            geno.addParts(gvc_id, genotype_id, rel)
                        if karyotype_id is not None \
                                and self._is_normal_karyotype(karyotype):
                            if gvc_label is not None and gvc_label != '':
                                genotype_label = \
                                    '; '.join((gvc_label, karyotype))
                            else:
                                genotype_label = karyotype
                            if genotype_id is None:
                                genotype_id = karyotype_id
                            else:
                                geno.addParts(
                                    karyotype_id, genotype_id,
                                    geno.object_properties[
                                        'has_reference_part'])
                        else:
                            genotype_label = gvc_label
                            # use the catalog id as the background
                        genotype_label += ' ['+catalog_id.strip()+']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(
                            genotype_id, genotype_label,
                            geno.genoparts['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        gu.addTriple(
                            g, patient_id,
                            geno.properties['has_genotype'], genotype_id)
                    else:
                        geno.addTaxon(taxon, patient_id)

                    # TODO: Add sex/gender  (as part of the karyotype?)

                    # #############    DEAL WITH THE DISEASES   #############

                    # we associate the disease to the patient
                    if affected == 'affected':
                        if omim_number != '':
                            for d in omim_number.split(';'):
                                if d is not None and d != '':
                                    # if the omim number is in omim_map,
                                    # then it is a gene not a pheno
                                    if d not in omim_map:
                                        disease_id = 'OMIM:'+d.strip()
                                        # assume the label is taken care of
                                        gu.addClassToGraph(g, disease_id, None)

                                        # add the association:
                                        #   the patient has the disease
                                        assoc = G2PAssoc(
                                            self.name, patient_id, disease_id)
                                        assoc.add_association_to_graph(g)

                                        # this line is a model of this disease
                                        # TODO abstract out model into
                                        # it's own association class?
                                        gu.addTriple(
                                            g, cell_line_id,
                                            gu.properties['model_of'],
                                            disease_id)
                                    else:
                                        logger.info(
                                            'removing %s from disease list ' +
                                            'since it is a gene', d)

                    # #############    ADD PUBLICATIONS   #############

                    if pubmed_ids != '':
                        for s in pubmed_ids.split(';'):
                            pubmed_id = 'PMID:'+s.strip()
                            ref = Reference(pubmed_id)
                            ref.setType(Reference.ref_types['journal_article'])
                            ref.addRefToGraph(g)
                            gu.addTriple(
                                g, pubmed_id, gu.properties['mentions'],
                                cell_line_id)

                    if not self.testMode \
                            and (limit is not None and line_counter > limit):
                        break

            Assoc(self.name).load_all_properties(g)

        return

    def _process_collection(self, collection_id, label, page):
        """
        This function will process the data supplied internally
        about the repository from Coriell.

        Triples:
            Repository a ERO:collection
            rdf:label Literal(label)
            foaf:page Literal(page)

        :param collection_id:
        :param label:
        :param page:
        :return:
        """
        # #############    BUILD THE CELL LINE REPOSITORY    #############
        for g in [self.graph, self.testgraph]:
            # FIXME: How to devise a label for each repository?
            gu = GraphUtils(curie_map.get())
            repo_id = 'CoriellCollection:'+collection_id
            repo_label = label
            repo_page = page

            gu.addIndividualToGraph(
                g, repo_id, repo_label, self.terms['collection'])
            gu.addPage(g, repo_id, repo_page)

        return

    @staticmethod
    def _map_cell_type(sample_type):
        ctype = None
        type_map = {
            # FIXME: mesenchymal stem cell of adipose
            'Adipose stromal cell': 'CL:0002570',
            # FIXME: amniocyte?
            'Amniotic fluid-derived cell line': 'CL:0002323',
            # B cell
            'B-Lymphocyte': 'CL:0000236',
            # FIXME: No Match
            'Chorionic villus-derived cell line': 'CL:0000000',
            # endothelial cell
            'Endothelial': 'CL:0000115',
            # epithelial cell
            'Epithelial': 'CL:0000066',
            # FIXME: No Match. "Abnormal precursor (virally transformed)
            # of mouse erythrocytes that can be grown in culture and
            # induced to differentiate by treatment with, for example, DMSO."
            'Erythroleukemic cell line': 'CL:0000000',

            'Fibroblast': 'CL:0000057',         # fibroblast
            'Keratinocyte': 'CL:0000312',       # keratinocyte
            'Melanocyte': 'CL:0000148',         # melanocyte
            'Mesothelial': 'CL:0000077',
            'Microcell hybrid': 'CL:0000000',   # FIXME: No Match
            'Myoblast': 'CL:0000056',           # myoblast
            'Smooth muscle': 'CL:0000192',      # smooth muscle cell
            'Stem cell': 'CL:0000034',          # stem cell
            'T-Lymphocyte': 'CL:0000084',       # T cell
            # FIXME: No Match. "Cells isolated from a mass of neoplastic cells,
            # i.e., a growth formed by abnormal cellular proliferation."
            # Oncocyte? CL:0002198
            'Tumor-derived cell line': 'CL:0002198'
        }
        if sample_type.strip() in type_map:
            ctype = type_map.get(sample_type.strip())
        else:
            logger.error("Cell type not mapped: %s", sample_type)

        return ctype

    @staticmethod
    def _map_race(race):
        rtype = None
        type_map = {
            'African American': 'EFO:0003150',
            # 'American Indian': 'EFO',
            'Asian': 'EFO:0003152',
            # FIXME: Asian?
            'Asian; Other': 'EFO:0003152',
            # Asian Indian
            'Asiatic Indian': 'EFO:0003153',
            # FIXME: African American? There is also African.
            'Black': 'EFO:0003150',
            'Caucasian': 'EFO:0003156',
            'Chinese': 'EFO:0003157',
            'East Indian': 'EFO:0003158',  # Eastern Indian
            'Filipino': 'EFO:0003160',
            # Hispanic: EFO:0003169, Latino: EFO:0003166 see next
            'Hispanic/Latino': 'EFO:0003169',
            'Japanese': 'EFO:0003164',
            'Korean': 'EFO:0003165',
            # 'More than one race': 'EFO',
            # 'Not Reported': 'EFO',
            # 'Other': 'EFO',
            # Asian/Pacific Islander
            'Pacific Islander': 'EFO:0003154',
            # Asian/Pacific Islander
            'Polynesian': 'EFO:0003154',
            # 'Unknown': 'EFO',
            # Asian
            'Vietnamese': 'EFO:0003152',
        }
        if race.strip() in type_map:
            rtype = type_map.get(race.strip())
        else:
            logger.warning("Race type not mapped: %s", race)

        return rtype

    @staticmethod
    def _map_species(species):
        tax = None
        type_map = {
            'Mus musculus': 'NCBITaxon:10090',
            'Peromyscus peromyscus californicus': 'NCBITaxon:42520',
            'Peromyscus peromyscus maniculatus': 'NCBITaxon:10042',
            'Peromyscus peromyscus leucopus': 'NCBITaxon:10041',
            'Peromyscus peromyscus polionotus': 'NCBITaxon:42413',
            'Macaca fascicularis': 'NCBITaxon:9541',
            'Rattus norvegicus': 'NCBITaxon:10116',
            'Papio anubis': 'NCBITaxon:9555',
            'Cricetulus griseus': 'NCBITaxon:10029',
            'Geochelone elephantopus': 'NCBITaxon:66189',
            'Muntiacus muntjak': 'NCBITaxon:9888',
            'Ailurus fulgens': 'NCBITaxon:9649',
            'Sus scrofa': 'NCBITaxon:9823',
            'Bos taurus': 'NCBITaxon:9913',
            'Oryctolagus cuniculus': 'NCBITaxon:9986',
            'Macaca nemestrina': 'NCBITaxon:9545',
            'Canis familiaris': 'NCBITaxon:9615',
            'Equus caballus': 'NCBITaxon:9796',
            'Macaca mulatta': 'NCBITaxon:9544',
            'Mesocricetus auratus': 'NCBITaxon:10036',
            'Macaca nigra': 'NCBITaxon:54600',
            'Erythrocebus patas': 'NCBITaxon:9538',
            'Pongo pygmaeus': 'NCBITaxon:9600',
            'Callicebus moloch': 'NCBITaxon:9523',
            'Lagothrix lagotricha': 'NCBITaxon:9519',
            'Saguinus fuscicollis': 'NCBITaxon:9487',
            'Saimiri sciureus': 'NCBITaxon:9521',
            'Saguinus labiatus': 'NCBITaxon:78454',
            'Pan paniscus': 'NCBITaxon:9597',
            'Ovis aries': 'NCBITaxon:9940',
            'Felis catus': 'NCBITaxon:9685',
            'Homo sapiens': 'NCBITaxon:9606'
        }
        if species.strip() in type_map:
            tax = type_map.get(species.strip())
        else:
            logger.warning("Species type not mapped: %s", species)

        return tax

    @staticmethod
    def _map_collection(collection):
        ctype = None
        type_map = {
            'NINDS Repository':
                'CoriellCollection:NINDS',
            'NIGMS Human Genetic Cell Repository':
                'CoriellCollection:NIGMS',
            'NIA Aging Cell Culture Repository':
                'CoriellCollection:NIA',
            'NHGRI Sample Repository for Human Genetic Research':
                'CoriellCollection:NHGRI'
        }
        if collection.strip() in type_map:
            ctype = type_map.get(collection.strip())
        else:
            logger.warning("Collection type not mapped: %s", collection)

        return ctype

    @staticmethod
    def _get_affected_chromosomes_from_karyotype(karyotype):

        affected_chromosomes = set()
        chr_regex = r'(\d+|X|Y|M|\?);?'
        aberration_regex = r'(?:add|del|der|i|idic|inv|r|rec|t)\([\w;]+\)'
        sex_regex = r'(?:;)(X{2,}Y+|X?Y{2,}|X{3,}|X|Y)(?:;|$)'

        # first fetch the set of aberrations
        aberrations = re.findall(aberration_regex, karyotype)

        # iterate over them to get the chromosomes
        for a in aberrations:
            chrs = re.findall(chr_regex, a)
            affected_chromosomes = affected_chromosomes.union(set(chrs))

        # remove the ? as a chromosome, since it isn't valid
        if '?' in affected_chromosomes:
            affected_chromosomes.remove('?')

        # check to see if there are any abnormal sex chromosomes
        m = re.search(sex_regex, karyotype)
        if m is not None:
            if re.search(r'X?Y{2,}', m.group(1)):
                # this is the only case where there is an extra Y chromosome
                affected_chromosomes.add('Y')
            else:
                affected_chromosomes.add('X')

        return affected_chromosomes
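    # A worked illustration (the karyotype string below is hypothetical,
    # not taken from Coriell data):
    #   '46;XY;del(7);t(2;13)'
    #   -> aberrations found: ['del(7)', 't(2;13)']
    #   -> affected_chromosomes: {'2', '7', '13'}
    # The sex-chromosome pattern only fires on abnormal counts
    # (e.g. ';XXY' adds 'X', ';XYY' adds 'Y'); a plain ';XY' adds nothing.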

    @staticmethod
    def _is_normal_karyotype(karyotype):
        """
        This will default to true if no karyotype is provided.
        This is assuming human karyotypes.
        :param karyotype:
        :return:
        """

        is_normal = True
        if karyotype is not None:
            karyotype = karyotype.strip()
            if karyotype not in ['46;XX', '46;XY', '']:
                is_normal = False

        return is_normal
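    # Illustrative calls (inputs are hypothetical):
    #   _is_normal_karyotype('46;XX')  -> True
    #   _is_normal_karyotype('45;X')   -> False  (anything not 46;XX / 46;XY / '')
    #   _is_normal_karyotype(None)     -> True   (no karyotype defaults to normal)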

    def getTestSuite(self):
        import unittest
        from tests.test_coriell import CoriellTestCase
        # TODO add G2PAssoc, Genotype tests

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(CoriellTestCase)

        return test_suite
Example #4
class EOM(PostgreSQLSource):
    """
    Elements of Morphology is a resource from NHGRI that has definitions of
    morphological abnormalities, together with image depictions.
    We pull those relationships, as well as our local mapping of equivalences
    between EOM and HP terminologies.

    The website is crawled monthly by NIF's DISCO crawler system,
    which we utilize here.
    Be sure to have pg user/password connection details in your conf.json file,
    like:
    dbauth : {'disco' : {'user' : '<username>', 'password' : '<password>'}}

    Monarch-curated data for the HP to EOM mapping is stored at
    https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/src/mappings/hp-to-eom-mapping.tsv

    Since this resource is so small, the entirety of it is the "test" set.

    """

    # we are using the production view here; should we be using services?
    tables = ['dvp.pr_nlx_157874_1']

    files = {
        'map': {
            'file':
            'hp-to-eom-mapping.tsv',
            'url':
            'https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/src/mappings/hp-to-eom-mapping.tsv'
        }
    }

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'eom')

        # update the dataset object with details about this resource
        # TODO put this into a conf file?
        self.dataset = Dataset(
            'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
            'http://www.genome.gov/copyright.cfm',
            'https://creativecommons.org/publicdomain/mark/1.0/')

        # check if config exists; if it doesn't, error out and let user know
        if 'dbauth' not in config.get_config() or \
                'disco' not in config.get_config()['dbauth']:
            logger.error("not configured with PG user/password.")

        # source-specific warnings.  will be cleared when resolved.

        return

    def fetch(self, is_dl_forced=False):
        '''create the connection details for DISCO'''

        cxn = config.get_config()['dbauth']['disco']
        cxn.update({
            'host': 'nif-db.crbs.ucsd.edu',
            'database': 'disco_crawler',
            'port': 5432
        })

        self.dataset.setFileAccessUrl(''.join(
            ('jdbc:postgresql://', cxn['host'], ':', str(cxn['port']), '/',
             cxn['database'])),
                                      is_object_literal=True)

        # process the tables
        # self.fetch_from_pgdb(self.tables,cxn,100)  #for testing
        self.fetch_from_pgdb(self.tables, cxn)

        self.get_files(is_dl_forced)

        # FIXME: Everything needed for data provenance?
        st = os.stat('/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')))
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        self.dataset.setVersion(filedate)

        return

    def parse(self, limit=None):
        '''
            Override Source.parse inherited via PostgreSQLSource
        '''

        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        if self.testOnly:
            self.testMode = True

        logger.info("Parsing files...")

        self._process_nlx_157874_1_view(
            '/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')), limit)
        self._map_eom_terms('/'.join((self.rawdir, self.files['map']['file'])),
                            limit)

        logger.info("Finished parsing.")

        # since it's so small,
        # we default to copying the entire graph to the test set
        self.testgraph = self.graph

        return

    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        This table contains the Elements of Morphology data that has been
        screen-scraped into DISCO.
        Note that foaf:depiction is the inverse of the foaf:depicts relationship.

        Since it is bad form to have two definitions,
        we concatenate the two into one string.

        Triples:
            <eom id> a owl:Class
                rdf:label Literal(eom label)
                OIO:hasRelatedSynonym Literal(synonym list)
                IAO:definition Literal(objective_def. subjective def)
                foaf:depiction Literal(small_image_url),
                               Literal(large_image_url)
                foaf:page Literal(page_url)
                rdfs:comment Literal(long commented text)


        :param raw:
        :param limit:
        :return:
        """

        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
            for line in filereader:
                line_counter += 1
                (morphology_term_id, morphology_term_num,
                 morphology_term_label, morphology_term_url,
                 terminology_category_label, terminology_category_url,
                 subcategory, objective_definition, subjective_definition,
                 comments, synonyms, replaces, small_figure_url,
                 large_figure_url, e_uid, v_uid, v_uuid, v_last_modified,
                 v_status, v_lastmodified_epoch) = line

                # note:
                # e_uid v_uuid v_last_modified terminology_category_url
                # subcategory v_uid morphology_term_num
                # terminology_category_label hp_label notes
                # are currently unused.

                # Add morphology term to graph as a class
                # with label, type, and description.
                model.addClassToGraph(morphology_term_id,
                                      morphology_term_label)

                # Assemble the description text

                if subjective_definition != '' and not (re.match(
                        r'.+\.$', subjective_definition)):
                    # add a trailing period.
                    subjective_definition = subjective_definition.strip() + '.'
                if objective_definition != '' and not (re.match(
                        r'.+\.$', objective_definition)):
                    # add a trailing period.
                    objective_definition = objective_definition.strip() + '.'

                definition = \
                    '  '.join(
                        (objective_definition, subjective_definition)).strip()

                model.addDefinition(morphology_term_id, definition)

                # <term id> FOAF:depicted_by literal url
                # <url> type foaf:depiction

                # do we want both images?
                # morphology_term_id has depiction small_figure_url
                if small_figure_url != '':
                    model.addDepiction(morphology_term_id, small_figure_url)

                # morphology_term_id has depiction large_figure_url
                if large_figure_url != '':
                    model.addDepiction(morphology_term_id, large_figure_url)

                # morphology_term_id has comment comments
                if comments != '':
                    model.addComment(morphology_term_id, comments.strip())

                if synonyms != '':
                    for s in synonyms.split(';'):
                        model.addSynonym(
                            morphology_term_id, s.strip(),
                            model.annotation_properties['hasExactSynonym'])

                # morphology_term_id hasRelatedSynonym replaces (; delimited)
                if replaces != '' and replaces != synonyms:
                    for s in replaces.split(';'):
                        model.addSynonym(
                            morphology_term_id, s.strip(),
                            model.annotation_properties['hasRelatedSynonym'])

                # morphology_term_id has page morphology_term_url
                reference = Reference(self.graph)
                reference.addPage(morphology_term_id, morphology_term_url)

                if limit is not None and line_counter > limit:
                    break
        return
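    # A sketch of the triples one row produces, following the pattern in the
    # docstring above (identifiers and urls are placeholders, not real data):
    #   <morphology term id> a owl:Class ;
    #       rdfs:label "<morphology term label>" ;
    #       IAO:definition "<objective definition.  subjective definition.>" ;
    #       foaf:depiction <small_figure_url>, <large_figure_url> ;
    #       foaf:page <morphology_term_url> .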

    def _map_eom_terms(self, raw, limit=None):
        """
        This table contains the HP ID mappings from the local tsv file.
        Triples:
            <eom id> owl:equivalentClass <hp id>
        :param raw:
        :param limit:
        :return:
        """

        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            for line in f1:
                line_counter += 1

                (morphology_term_id, morphology_term_label, hp_id, hp_label,
                 notes) = line.split('\t')

                # Sub out the underscores for colons.
                hp_id = re.sub('_', ':', hp_id)
                if re.match(".*HP:.*", hp_id):
                    # add the HP term as a class
                    model.addClassToGraph(hp_id, None)
                    # Add the HP ID as an equivalent class
                    model.addEquivalentClass(morphology_term_id, hp_id)
                else:
                    logger.warning('No matching HP term for %s',
                                   morphology_term_label)

                if limit is not None and line_counter > limit:
                    break

        return
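    # An illustrative row (hypothetical ids): an input hp_id of 'HP_0001234'
    # becomes 'HP:0001234', which is added as a class and asserted as an
    # owl:equivalentClass of the EOM morphology term id; rows without a
    # matching HP id are only logged.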

    def getTestSuite(self):
        import unittest
        # TODO PYLINT: Unable to import 'tests.test_eom'
        from tests.test_eom import EOMTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(EOMTestCase)

        return test_suite
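# A minimal usage sketch (not part of the source); the argument values and
# the inherited write() call are assumptions based on the Source base class
# shown in a later example:
#   eom = EOM('rdf_graph', True)   # graph type, skolemize blank nodes
#   eom.fetch()                    # pulls the DISCO view and the HP mapping tsv
#   eom.parse()                    # builds the graph; test graph == full graph
#   eom.write(fmt='turtle')        # serializes to the configured output dir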
Example #5
class GeneReviews(Source):
    """
    Here we process the GeneReviews mappings to OMIM,
    plus inspect the GeneReviews (html) books to pull the clinical descriptions
    in order to populate the definitions of the terms in the ontology.
    We define the GeneReviews items as classes that are either grouping classes
    over OMIM disease ids (gene ids are filtered out),
    or are made as subclasses of DOID:4 (generic disease).

    Note that GeneReviews
    [copyright policy](http://www.ncbi.nlm.nih.gov/books/NBK138602/)
    (as of 2015.11.20) says:

    GeneReviews® chapters are owned by the University of Washington, Seattle,
    © 1993-2015. Permission is hereby granted to reproduce, distribute,
    and translate copies of content materials provided that
    (i) credit for source (www.ncbi.nlm.nih.gov/books/NBK1116/)
        and copyright (University of Washington, Seattle)
        are included with each copy;
    (ii) a link to the original material is provided whenever the material is
        published elsewhere on the Web; and
    (iii) reproducers, distributors, and/or translators comply with this
        copyright notice and the GeneReviews Usage Disclaimer.

    This script doesn't pull the GeneReviews books from the NCBI Bookshelf
    directly; scripting this task is expressly prohibited by
    [NCBIBookshelf policy](http://www.ncbi.nlm.nih.gov/books/NBK45311/).
    However, assuming you have acquired the books (in html format) via
    permissible means, a parser for those books is provided here to extract
    the clinical descriptions to define the NBK identified classes.

    """

    files = {
        'idmap': {'file': 'NBKid_shortname_OMIM.txt',
                  'url': GRDL + '/NBKid_shortname_OMIM.txt'},
        'titles': {'file': 'GRtitle_shortname_NBKid.txt',
                   'url': GRDL + '/GRtitle_shortname_NBKid.txt'}
        }

    def __init__(self):
        Source.__init__(self, 'genereviews')

        self.load_bindings()

        self.dataset = Dataset(
            'genereviews', 'Gene Reviews', 'http://genereviews.org/',
            None, 'http://www.ncbi.nlm.nih.gov/books/NBK138602/')
        self.dataset.set_citation('GeneReviews:NBK1116')

        self.gu = GraphUtils(curie_map.get())

        self.book_ids = set()
        self.all_books = {}

        if 'test_ids' not in config.get_config() or\
                'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_ids = list()
        else:
            # select only those test ids that are OMIM ids.
            self.test_ids = config.get_config()['test_ids']['disease']

        return

    def fetch(self, is_dl_forced=False):
        """
        We fetch GeneReviews id-label map and id-omim mapping files from NCBI.
        :return: None
        """

        self.get_files(is_dl_forced)

        return

    def parse(self, limit=None):
        """
        :return: None
        """

        if self.testOnly:
            self.testMode = True

        self._get_titles(limit)
        self._get_equivids(limit)

        self.create_books()
        self.process_nbk_html(limit)

        self.load_bindings()

        # no test subset for now; test == full graph
        self.testgraph = self.graph

        logger.info("Found %d nodes", len(self.graph))

        return

    def _get_equivids(self, limit):
        """
        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Where each of the rows represents a mapping between
        a gr id and an omim id. This is a 1:many relationship,
        and some of the omim ids are genes (not diseases).
        Therefore, we need to create a loose coupling here.
        We make the assumption that these NBKs are generally higher-level
        grouping classes; therefore the OMIM ids are treated as subclasses.
        (This assumption is poor for those omims that are actually genes,
        but we have no way of knowing what those are here...
        we will just have to deal with that for now.)
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files['idmap']['file']))
        gu = GraphUtils(curie_map.get())
        line_counter = 0

        # we look some stuff up in OMIM, so initialize here
        omim = OMIM()
        id_map = {}
        allomimids = set()
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter == 1:  # skip header
                    continue
                (nbk_num, shortname, omim_num) = row
                gr_id = 'GeneReviews:'+nbk_num
                omim_id = 'OMIM:'+omim_num
                if not (
                        (self.testMode and
                         len(self.test_ids) > 0 and
                         omim_id in self.test_ids) or not
                        self.testMode):
                    continue

                # sometimes there are badly formatted omim nums
                if len(omim_num) > 6:
                    logger.warning(
                        "OMIM number incorrectly formatted " +
                        "in row %d; skipping:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # build up a hashmap of the mappings; then process later
                if nbk_num not in id_map:
                    id_map[nbk_num] = set()
                id_map[nbk_num].add(omim_num)

                # add the class along with the shortname
                gu.addClassToGraph(self.graph, gr_id, None)
                gu.addSynonym(self.graph, gr_id, shortname)

                allomimids.add(omim_num)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

            # end looping through file

        # get the omim ids that are not genes
        entries_that_are_phenotypes = \
            omim.process_entries(
                list(allomimids), filter_keep_phenotype_entry_ids,
                None, None, limit)

        logger.info("Filtered out %d/%d entries that are genes or features",
                    len(allomimids)-len(entries_that_are_phenotypes),
                    len(allomimids))

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:'+nbk_num
            if nbk_num in id_map:
                omim_ids = id_map.get(nbk_num)
                for omim_num in omim_ids:
                    omim_id = 'OMIM:'+omim_num
                    # add the gene reviews as a superclass to the omim id,
                    # but only if the omim id is not a gene
                    if omim_id in entries_that_are_phenotypes:
                        gu.addClassToGraph(self.graph, omim_id, None)
                        gu.addSubclass(self.graph, gr_id, omim_id)
            # add this as a generic subclass of DOID:4
            gu.addSubclass(self.graph, 'DOID:4', gr_id)

        return
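    # Using the example rows from the docstring above (NBK1103 maps to
    # 136132 and 602079), and assuming both OMIM entries survive the
    # phenotype filter, the resulting structure is roughly:
    #   OMIM:136132 rdfs:subClassOf GeneReviews:NBK1103
    #   OMIM:602079 rdfs:subClassOf GeneReviews:NBK1103
    #   GeneReviews:NBK1103 rdfs:subClassOf DOID:4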

    def _get_titles(self, limit):
        """
        The file processed here maps GeneReviews book short names and titles
        to their NBK ids, one row per book with columns
        (shortname, title, NBK id).
        Each NBK id is added to the graph as a class labeled with its title,
        and the short name is attached as a synonym.
        :param limit:
        :return:
        """
        raw = '/'.join((self.rawdir, self.files['titles']['file']))
        gu = GraphUtils(curie_map.get())
        line_counter = 0
        with open(raw, 'r', encoding='latin-1') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter == 1:  # skip header
                    continue
                (shortname, title, nbk_num) = row
                gr_id = 'GeneReviews:'+nbk_num

                self.book_ids.add(nbk_num)  # a global set of the book nums

                if limit is None or line_counter < limit:
                    gu.addClassToGraph(self.graph, gr_id, title)
                    gu.addSynonym(self.graph, gr_id, shortname)

        return

    def create_books(self):

        # note that although we put in the url to the book,
        # NCBI Bookshelf does not allow robots to download content
        book_item = {'file': 'books/',
                     'url': ''}

        for nbk in self.book_ids:
            b = book_item.copy()
            b['file'] = '/'.join(('books', nbk+'.html'))
            b['url'] = 'http://www.ncbi.nlm.nih.gov/books/'+nbk
            self.all_books[nbk] = b

        return

    def process_nbk_html(self, limit):
        """
        Here we process the gene reviews books to fetch
        the clinical descriptions to include in the ontology.
        We only use books that have been acquired manually,
        as NCBI Bookshelf does not permit automated downloads.
        This parser will only process the books that are found in
        the ```raw/genereviews/books``` directory,
        permitting partial completion.

        :param limit:
        :return:
        """
        c = 0
        books_not_found = set()
        for nbk in self.book_ids:
            c += 1
            nbk_id = 'GeneReviews:'+nbk
            book_item = self.all_books.get(nbk)
            url = '/'.join((self.rawdir, book_item['file']))

            # figure out if the book is there; if so, process, otherwise skip
            book_dir = '/'.join((self.rawdir, 'books'))
            book_files = os.listdir(book_dir)
            if ''.join((nbk, '.html')) not in book_files:
                # logger.warning("No book found locally for %s; skipping", nbk)
                books_not_found.add(nbk)
                continue
            logger.info("Processing %s", nbk)

            page = open(url)
            soup = BeautifulSoup(page.read())

            # sec0 == clinical description
            clin_summary = \
                soup.find(
                    'div', id=re.compile(".*Summary.sec0"))
            if clin_summary is not None:
                p = clin_summary.find('p')
                ptext = p.text
                ptext = re.sub(r'\s+', ' ', ptext)

                ul = clin_summary.find('ul')
                if ul is not None:
                    item_text = list()
                    for li in ul.find_all('li'):
                        item_text.append(re.sub(r'\s+', ' ', li.text))
                    ptext += ' '.join(item_text)

                # add in the copyright and citation info to description
                ptext = \
                    ' '.join(
                        (ptext,
                         '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                         nbk_id+']'))

                self.gu.addDefinition(self.graph, nbk_id, ptext.strip())

            # get the pubs
            pmid_set = set()
            pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
            if pub_div is not None:
                ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
                for r in ref_list:
                    for a in r.find_all(
                            'a', attrs={'href': re.compile(r"pubmed")}):
                        if re.match(r'PubMed:', a.text):
                            pmnum = re.sub(r'PubMed:\s*', '', a.text)
                        else:
                            pmnum = \
                                re.search(
                                    r'\/pubmed\/(\d+)$', a['href']).group(1)
                        if pmnum is not None:
                            pmid = 'PMID:'+str(pmnum)
                            self.gu.addTriple(
                                self.graph, pmid,
                                self.gu.object_properties['is_about'],
                                nbk_id)
                            pmid_set.add(pmnum)
                            r = Reference(
                                pmid, Reference.ref_types['journal_article'])
                            r.addRefToGraph(self.graph)

            # TODO add author history, copyright, license to dataset

            # TODO get PMID-NBKID equivalence (near foot of page),
            # and make it "is about" link
            # self.gu.addTriple(
            #   self.graph, pmid,
            #   self.gu.object_properties['is_about'], nbk_id)
            # for example: NBK1191 PMID:20301370

            # add the book to the dataset
            self.dataset.setFileAccessUrl(book_item['url'])

            if limit is not None and c > limit:
                break

            # finish looping through books

        num_not_found = len(books_not_found)
        if num_not_found > 0:
            if num_not_found > 100:
                logger.warning("There were %d books not found.", num_not_found)
            else:
                logger.warning(
                    "The following %d books were not found locally: %s",
                    num_not_found, str(books_not_found))
        logger.info(
            "Finished processing %d books for clinical descriptions",
            c - num_not_found)

        return

    def getTestSuite(self):
        import unittest
        from tests.test_genereviews import GeneReviewsTestCase

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(GeneReviewsTestCase)

        return test_suite
Example #6
class Source:
    """
    Abstract class for any data sources that we'll import and process.
    Each of the subclasses will fetch() the data, scrub() it as necessary,
    then parse() it into a graph.  The graph will then be written out to
    a single self.name().<dest_fmt>  file.

    Also provides a means to marshal metadata in a consistent fashion

    Houses the global translation table (from ontology label to ontology term)
    so it may as well be used everywhere.

    """

    namespaces = {}
    files = {}

    def __init__(
            self,
            graph_type='rdf_graph',  # or streamed_graph
            are_bnodes_skized=False,  # typically True
            name=None,  # identifier; make an IRI for nquads
            ingest_title=None,
            ingest_url=None,
            license_url=None,  # only if it is _our_ lic
            data_rights=None,  # external page that points to their current lic
            file_handle=None):

        # pull in the common test identifiers
        self.all_test_ids = self.open_and_parse_yaml(
            '../../resources/test_ids.yaml')

        self.graph_type = graph_type
        self.are_bnodes_skized = are_bnodes_skized
        self.ingest_url = ingest_url
        self.ingest_title = ingest_title
        self.localtt = self.load_local_translationtable(name)

        if name is not None:
            self.name = name.lower()
        elif self.whoami() is not None:
            self.name = self.whoami().lower()

        LOG.info("Processing Source \"%s\"", self.name)
        self.test_only = False
        self.path = ""
        # to be used to store a subset of data for testing downstream.
        self.triple_count = 0
        self.outdir = 'out'
        self.testdir = 'tests'
        self.rawdir = 'raw'
        self.rawdir = '/'.join((self.rawdir, self.name))
        self.testname = name + "_test"
        self.testfile = '/'.join((self.outdir, self.testname + ".ttl"))
        self.datasetfile = None

        # still need to pull in file suffix -- this is a curie, not a url
        self.archive_url = 'MonarchArchive:' + 'ttl/' + self.name + '.ttl'

        # if raw data dir doesn't exist, create it
        if not os.path.exists(self.rawdir):
            os.makedirs(self.rawdir)
            pth = os.path.abspath(self.rawdir)
            LOG.info("creating raw directory for %s at %s", self.name, pth)

        # if output dir doesn't exist, create it
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)
            pth = os.path.abspath(self.outdir)
            LOG.info("created output directory %s", pth)

        LOG.info("Creating Test graph %s", self.testname)
        # note: tools such as Protégé need skolemized blank nodes
        self.testgraph = RDFGraph(True, self.testname)

        if graph_type == 'rdf_graph':
            graph_id = ':MONARCH_' + str(self.name) + "_" + \
                datetime.now().isoformat(' ').split()[0]

            LOG.info("Creating graph  %s", graph_id)
            self.graph = RDFGraph(are_bnodes_skized, graph_id)

        elif graph_type == 'streamed_graph':
            # need to expand on export formats
            dest_file = open(pth + '/' + name + '.nt',
                             'w')  # where is the close?
            self.graph = StreamedGraph(are_bnodes_skized, dest_file)
            # leave test files as turtle (better human readability)
        else:
            LOG.error(
                "%s graph type not supported\n"
                "valid types: rdf_graph, streamed_graph", graph_type)

        # pull in global ontology mapping datastructures
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid

        self.curie_map = self.graph.curie_map
        # self.prefix_base = {v: k for k, v in self.curie_map.items()}

        # will be set to True if the intention is
        # to only process and write the test data
        self.test_only = False
        self.test_mode = False

        # this may eventually support Bagits
        self.dataset = Dataset(
            self.archive_url,
            self.ingest_title,
            self.ingest_url,
            None,  # description
            license_url,  # only _OUR_ lic
            data_rights,  # tries to point to others lics
            graph_type,
            file_handle)

        for graph in [self.graph, self.testgraph]:
            self.declareAsOntology(graph)

    def fetch(self, is_dl_forced=False):
        """
        abstract method to fetch all data from an external resource.
        this should be overridden by subclasses
        :return: None

        """
        raise NotImplementedError

    def parse(self, limit):
        """
        abstract method to parse all data from an external resource,
        that was fetched in fetch() this should be overridden by subclasses
        :return: None

        """
        raise NotImplementedError

    def write(self, fmt='turtle', stream=None):
        """
        This convenience method will write out all of the graphs
        associated with the source.
        Right now these are hardcoded to be a single "graph",
        a "src_dataset.ttl", and a "src_test.ttl".
        If you do not supply stream='stdout',
        these are written to files by default.

        In addition, if the version number isn't yet set in the dataset,
        it will be set to the date on file.
        :return: None

        """
        fmt_ext = {
            'rdfxml': 'xml',
            'turtle': 'ttl',
            'nt': 'nt',  # ntriples
            'nquads': 'nq',
            'n3': 'n3'  # notation3
        }

        # make the regular graph output file
        dest = None
        if self.name is not None:
            dest = '/'.join((self.outdir, self.name))
            if fmt in fmt_ext:
                dest = '.'.join((dest, fmt_ext.get(fmt)))
            else:
                dest = '.'.join((dest, fmt))
            LOG.info("Setting outfile to %s", dest)

            # make the dataset_file name, always format as turtle
            self.datasetfile = '/'.join(
                (self.outdir, self.name + '_dataset.ttl'))
            LOG.info("Setting dataset file to %s", self.datasetfile)

            if self.dataset is not None and self.dataset.version is None:
                self.dataset.set_version_by_date()
                LOG.info("No version for %s setting to date issued.",
                         self.name)
        else:
            LOG.warning("No output file set. Using stdout")
            stream = 'stdout'

        gu = GraphUtils(None)

        # the  _dataset description is always turtle
        gu.write(self.dataset.getGraph(), 'turtle', filename=self.datasetfile)

        if self.test_mode:
            # unless we stop hardcoding, the test dataset is always turtle
            LOG.info("Setting testfile to %s", self.testfile)
            gu.write(self.testgraph, 'turtle', filename=self.testfile)

        # print graph out
        if stream is None:
            outfile = dest
        elif stream.lower().strip() == 'stdout':
            outfile = None
        else:
            LOG.error("I don't understand our stream.")
            return
        gu.write(self.graph, fmt, filename=outfile)

    def whoami(self):
        '''
            pointless convenience
        '''
        LOG.info("Ingest is %s", self.name)

    @staticmethod
    def make_id(long_string, prefix='MONARCH'):
        """
        a method to create DETERMINISTIC identifiers
        based on a string's digest. currently implemented with sha1
        :param long_string:
        :return:

        """
        return ':'.join((prefix, Source.hash_id(long_string)))

    @staticmethod
    def hash_id(wordage):  # same as graph/GraphUtils.digest_id(wordage)
        """
        prepend 'b' to avoid leading with digit
        truncate to a 20 char sized word with a leading 'b'
        return truncated sha1 hash of string.

        by the birthday paradox;
            expect 50% chance of collision after 69 billion invocations
            however these are only hoped to be unique within a single file

        Consider reducing to 17 hex chars to fit in a 64 bit word
        16 discounting a leading constant
        gives a 50% chance of collision at about 4.3b billion unique input strings
        (currently _many_ orders of magnitude below that)

        :param long_string: str string to be hashed
        :return: str hash of id
        """
        return 'b' + hashlib.sha1(wordage.encode('utf-8')).hexdigest()[1:20]
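    # Illustrative use (the input string is hypothetical):
    #   Source.make_id('OMIM:123456-HP:0001250-PMID:1234567')
    # yields 'MONARCH:' + 'b' + 19 hex chars of the sha1 digest, so the same
    # input string always maps to the same identifier.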

    def checkIfRemoteIsNewer(self, remote, local, headers):
        """
        Given a remote file location, and the corresponding local file
        this will check the datetime stamp on the files to see if the remote
        one is newer.
        This is a convenience method to be used so that we don't have to
        re-fetch files that we already have saved locally
        :param remote: URL of file to fetch from remote server
        :param local: pathname to save file to locally
        :return: True if the remote file is newer and should be downloaded

        """
        LOG.info("Checking if remote file is newer than local \n(%s)", local)

        # check if local file exists
        # if no local file, then remote is newer
        if os.path.exists(local):
            LOG.info("Local File exists as %s", local)
        else:
            LOG.info("Local File does NOT exist as %s", local)
            return True

        # get remote file details
        if headers is None:
            headers = self._get_default_request_headers()

        req = urllib.request.Request(remote, headers=headers)
        LOG.info("Request header: %s", str(req.header_items()))

        response = urllib.request.urlopen(req)

        try:
            resp_headers = response.info()
            size = resp_headers.get('Content-Length')
            last_modified = resp_headers.get('Last-Modified')
        except urllib.error.URLError as err:
            resp_headers = None
            size = 0
            last_modified = None
            LOG.error(err)

        if size is not None and size != '':
            size = int(size)
        else:
            size = 0

        fstat = os.stat(local)
        LOG.info("Local File date: %s",
                 datetime.utcfromtimestamp(fstat[ST_CTIME]))

        if last_modified is not None:
            # Thu, 07 Aug 2008 16:20:19 GMT
            dt_obj = datetime.strptime(last_modified,
                                       "%a, %d %b %Y %H:%M:%S %Z")
            # get local file details

            # check date on local vs remote file
            if dt_obj > datetime.utcfromtimestamp(fstat[ST_CTIME]):
                # check if file size is different
                if fstat[ST_SIZE] < size:
                    LOG.info("New Remote File exists")
                    return True
                if fstat[ST_SIZE] > size:
                    LOG.warning("New Remote File exists but it is SMALLER")
                    return True
                # filesize is a fairly imperfect metric here
                LOG.info(
                    "New Remote fFle has same filesize--will not download")
        elif fstat[ST_SIZE] != size:
            LOG.info("Remote File is %i  \t Local File is %i", size,
                     fstat[ST_SIZE])
            return True

        return False

    def get_files(self, is_dl_forced, files=None):
        """
        Given a set of files for this source, it will go fetch them, and
        set a default version by date.  If you need to set the version number
        by another method, then it can be set again.
        :param is_dl_forced - boolean
        :param files dict - override instance files dict
        :return: None
        """

        fstat = None
        if files is None:
            files = self.files
        for fname in files:
            headers = None
            filesource = files[fname]
            if 'headers' in filesource:
                headers = filesource['headers']
            LOG.info("Getting %s", fname)
            # if the key 'clean' exists in the source's `files` dict
            # expose that instead of the longer url
            if 'clean' in filesource and filesource['clean'] is not None:
                self.dataset.setFileAccessUrl(filesource['clean'])
            else:
                self.dataset.setFileAccessUrl(filesource['url'])
                LOG.info('Fetching %s', filesource['url'])

            self.fetch_from_url(filesource['url'], '/'.join(
                (self.rawdir, filesource['file'])), is_dl_forced, headers)

            fstat = os.stat('/'.join((self.rawdir, filesource['file'])))

        # only keeping the date from the last file
        filedate = datetime.utcfromtimestamp(
            fstat[ST_CTIME]).strftime("%Y-%m-%d")

        # FIXME
        # change this so the date is attached only to each file, not the entire dataset
        self.dataset.set_date_issued(filedate)

    def fetch_from_url(self,
                       remotefile,
                       localfile=None,
                       is_dl_forced=False,
                       headers=None):
        """
        Given a remote url and a local filename, attempt to determine
        if the remote file is newer; if it is,
        fetch the remote file and save it to the specified localfile,
        reporting the basic file information once it is downloaded
        :param remotefile: URL of remote file to fetch
        :param localfile: pathname of file to save locally
        :return: None

        """

        response = None
        if ((is_dl_forced is True) or localfile is None or
            (self.checkIfRemoteIsNewer(remotefile, localfile, headers))):
            # TODO url verification, etc
            if headers is None:
                headers = self._get_default_request_headers()

            request = urllib.request.Request(remotefile, headers=headers)
            response = urllib.request.urlopen(request)

            if localfile is not None:
                with open(localfile, 'wb') as binwrite:
                    while True:
                        chunk = response.read(CHUNK)
                        if not chunk:
                            break
                        binwrite.write(chunk)

                LOG.info("Finished.  Wrote file to %s", localfile)
                if self.compare_local_remote_bytes(remotefile, localfile,
                                                   headers):
                    LOG.debug(
                        "local file is same size as remote after download")
                else:
                    raise Exception(
                        "Error downloading file: local file size  != remote file size"
                    )

                fstat = os.stat(localfile)
                LOG.info("file size: %s", fstat[ST_SIZE])
                LOG.info("file created: %s",
                         time.asctime(time.localtime(fstat[ST_CTIME])))
            else:
                LOG.error('Local filename is required')
                exit(-1)
        else:
            LOG.info("Using existing file %s", localfile)

        return response

    # TODO: rephrase as mysql-dump-xml specific format
    def process_xml_table(self, elem, table_name, processing_function, limit):
        """
        This is a convenience function to process the elements of an xml dump of
        a mysql relational database.
        The "elem" is akin to a mysql table, with it's name of ```table_name```.
        It will process each ```row``` given the ```processing_function``` supplied.
        :param elem: The element data
        :param table_name: The name of the table to process
        :param processing_function: The row processing function
        :param limit:

        Appears to be making calls to the elementTree library,
        although it is not explicitly imported here.

        :return:

        """

        line_counter = 0
        table_data = elem.find("[@name='" + table_name + "']")
        if table_data is not None:
            LOG.info("Processing " + table_name)
            row = {}
            for line in table_data.findall('row'):
                for field in line.findall('field'):
                    atts = dict(field.attrib)
                    row[atts['name']] = field.text
                processing_function(row)
                line_counter += 1
                if self.test_mode and limit is not None and line_counter > limit:
                    continue

            elem.clear()  # discard the element
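    # A sketch of the mysqldump-style xml this expects
    # (table and field names here are hypothetical):
    #   <table_data name="phenotype">
    #     <row>
    #       <field name="id">1</field>
    #       <field name="label">short stature</field>
    #     </row>
    #   </table_data>
    # Each <row> is flattened to {'id': '1', 'label': 'short stature'} and
    # handed to processing_function.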

    @staticmethod
    def _check_list_len(row, length):
        """
        Sanity check for csv parser
        :param row
        :param length
        :return:None
        """
        if len(row) != length:
            raise Exception("row length does not match expected length of " +
                            str(length) + "\nrow: " + str(row))

    @staticmethod
    def get_file_md5(directory, filename, blocksize=2**20):
        # reference:
        # http://stackoverflow.com/questions/1131220/get-md5-hash-of-big-files-in-python

        md5 = hashlib.md5()
        with open(os.path.join(directory, filename), "rb") as bin_reader:
            while True:
                buff = bin_reader.read(blocksize)
                if not buff:
                    break
                md5.update(buff)

        return md5.hexdigest()

    def get_remote_content_len(self, remote, headers=None):
        """
        :param remote:
        :return: size of remote file
        """

        if headers is None:
            headers = self._get_default_request_headers()

        req = urllib.request.Request(remote, headers=headers)

        try:
            response = urllib.request.urlopen(req)
            resp_header = response.info()
            byte_size = resp_header.get('Content-length')
        except OSError as err:
            byte_size = None
            LOG.error(err)

        return byte_size

    @staticmethod
    def get_local_file_size(localfile):
        """
        :param localfile:
        :return: size of file
        """
        byte_size = os.stat(localfile)
        return byte_size[ST_SIZE]

    def compare_local_remote_bytes(self,
                                   remotefile,
                                   localfile,
                                   remote_headers=None):
        """
        test to see if fetched file is the same size as the remote file
        using information in the content-length field in the HTTP header
        :return: True or False
        """
        is_equal = True
        remote_size = self.get_remote_content_len(remotefile, remote_headers)
        local_size = self.get_local_file_size(localfile)
        if remote_size is not None and local_size != int(remote_size):
            is_equal = False
            LOG.error(
                'local file and remote file different sizes\n'
                '%s has size %s, %s has size %s', localfile, local_size,
                remotefile, remote_size)
        return is_equal

    @staticmethod
    def file_len(fname):
        with open(fname) as lines:
            length = sum(1 for line in lines)
        return length

    @staticmethod
    def get_eco_map(url):
        """
        To convert the three column file to
        a hashmap we join primary and secondary keys,
        for example
        IEA	GO_REF:0000002	ECO:0000256
        IEA	GO_REF:0000003	ECO:0000501
        IEA	Default	ECO:0000501

        becomes
        IEA-GO_REF:0000002: ECO:0000256
        IEA-GO_REF:0000003: ECO:0000501
        IEA: ECO:0000501

        :return: dict
        """
        # this would go in a translation table but it is generated dynamically
        # maybe when we move to a make driven system
        eco_map = {}
        request = urllib.request.Request(url)
        response = urllib.request.urlopen(request)

        for line in response:
            line = line.decode('utf-8').rstrip()
            if re.match(r'^#', line):
                continue
            (code, go_ref, eco_curie) = line.split('\t')
            if go_ref != 'Default':
                eco_map["{}-{}".format(code, go_ref)] = eco_curie
            else:
                eco_map[code] = eco_curie

        return eco_map
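    # A usage sketch (the url below is an assumption, not taken from this
    # source); the lookups follow the example rows in the docstring above:
    #   eco_map = Source.get_eco_map('http://example.org/gaf-eco-mapping.txt')
    #   eco_map['IEA-GO_REF:0000002']  -> 'ECO:0000256'
    #   eco_map['IEA']                 -> 'ECO:0000501'  # from the 'Default' row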

    def settestonly(self, testonly):
        """
        Set that this source should only be processed in testMode
        :param testOnly:
        :return: None
        """

        self.test_only = testonly

    def settestmode(self, mode):
        """
        Set testMode to (mode).
        - True: run the Source in testMode;
        - False: run it in full mode
        :param mode:
        :return: None

        """

        self.test_mode = mode

    def getTestSuite(self):
        """
        An abstract method that should be overwritten with
        tests appropriate for the specific source.
        :return:

        """
        return None

    # TODO: parameterising the release date

    def declareAsOntology(self, graph):
        """
        The file we output needs to be declared as an ontology,
        including its version information.

        TEC: I am not convinced dipper reformatting external data as RDF triples
        makes an OWL ontology (nor that it should be considered a goal).

        Proper ontologies are built by ontologists. Dipper reformats data
        and annotates/decorates it with a minimal set of carefully arranged
        terms drawn from multiple proper ontologies.
        Which allows the whole (dipper's RDF triples and parent ontologies)
        to function as a single ontology we can reason over when combined
        in a store such as SciGraph.

        Including more than the minimal ontological terms in dipper's RDF
        output constitutes a liability as it allows greater divergence
        between dipper artifacts and the proper ontologies.

        Further information will be augmented in the dataset object.
        :param graph:
        :return:

        """

        # <http://data.monarchinitiative.org/ttl/biogrid.ttl> a owl:Ontology ;
        # owl:versionInfo
        # <https://archive.monarchinitiative.org/YYYYMM/ttl/biogrid.ttl>

        model = Model(graph)

        # is self.outfile suffix set yet???
        ontology_file_id = 'MonarchData:' + self.name + ".ttl"
        model.addOntologyDeclaration(ontology_file_id)

        # add timestamp as version info

        cur_time = datetime.now()
        t_string = cur_time.strftime("%Y-%m-%d")
        ontology_version = t_string
        # TEC this means the MonarchArchive IRI needs the release updated
        # maybe extract the version info from there

        # should not hardcode the suffix as it may change
        archive_url = 'MonarchArchive:' + 'ttl/' + self.name + '.ttl'
        model.addOWLVersionIRI(ontology_file_id, archive_url)
        model.addOWLVersionInfo(ontology_file_id, ontology_version)
        # TODO make sure this is synced with the Dataset class

    @staticmethod
    def remove_backslash_r(filename, encoding):
        """
        A helpful utility to remove Carriage Return from any file.
        This will read a file into memory,
        and overwrite the contents of the original file.

        TODO: This function may be a liability

        :param filename:

        :return:

        """

        with open(filename, 'r', encoding=encoding,
                  newline='\n') as filereader:
            contents = filereader.read()
        contents = re.sub(r'\r', '', contents)
        with open(filename, "w") as filewriter:
            filewriter.truncate()
            filewriter.write(contents)

    @staticmethod
    def open_and_parse_yaml(yamlfile):
        """
        :param yamlfile: String, path (relative to this module) of a yaml
                         file to parse
        :return: dict parsed from the yaml file (empty if the file is missing)
        """

        # ??? what if the yaml file does not contain a dict datastructure?
        mapping = dict()
        if os.path.exists(os.path.join(os.path.dirname(__file__), yamlfile)):
            map_file = open(os.path.join(os.path.dirname(__file__), yamlfile),
                            'r')
            mapping = yaml.safe_load(map_file)
            map_file.close()
        else:
            LOG.warning("file: %s not found", yamlfile)

        return mapping
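
        # Illustrative only (not from the source): a mapping file such as
        #   ---
        #   "increased body weight": "HP:0004324"
        #   "decreased body weight": "HP:0004325"
        # (labels/ids here are examples) loads into
        #   {'increased body weight': 'HP:0004324', 'decreased body weight': 'HP:0004325'}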

    @staticmethod
    def parse_mapping_file(file):
        """
        :param file: String, path to file containing label-id mappings
                in the first two columns of each row
        :return: dict where keys are labels and values are ids
        """
        id_map = {}
        if os.path.exists(os.path.join(os.path.dirname(__file__), file)):
            with open(os.path.join(os.path.dirname(__file__),
                                   file)) as tsvfile:
                reader = csv.reader(tsvfile, delimiter="\t")
                for row in reader:
                    key = row[0]
                    value = row[1]
                    id_map[key] = value

        return id_map

    @staticmethod
    def _get_default_request_headers():
        return {'User-Agent': USER_AGENT}

    # @staticmethod
    # def getTestSuite(ingest):  # WIP
    #    '''
    #    try to avoid having one of these per ingest
    #    '''
    #    import unittest
    #    testcase = ingest + 'TestCase'
    #    # construct import names ... how
    #    from tests.test_ + ingest import testcase
    #    return unittest.TestLoader().loadTestsFromTestCase(testcase)

    def load_local_translationtable(self, name):
        '''
        Load "ingest specific" translation from whatever they called something
        to the ontology label we need to map it to.
        To facilitate seeing more ontology labels in dipper ingests
        a reverse mapping from ontology labels to external strings is also generated
        and available as a dict localtcid

        '---\n# %s.yaml\n"": ""  # example'
        '''

        localtt_file = '../../translationtable/' + name + '.yaml'

        try:
            with open(os.path.join(os.path.dirname(__file__), localtt_file)):
                pass
        except IOError:
            # write a stub file as a place holder if none exists
            with open(os.path.join(os.path.dirname(__file__), localtt_file),
                      'w') as write_yaml:
                print('---\n# %s.yaml\n"": ""  # example' % name,
                      file=write_yaml)
        finally:
            with open(os.path.join(os.path.dirname(__file__), localtt_file),
                      'r') as read_yaml:
                localtt = yaml.safe_load(read_yaml)

        # inverse local translation.
        # note: keeping this invertible will take work.
        # Useful to not litter an ingest with external syntax
        self.localtcid = {v: k for k, v in localtt.items()}

        return localtt

    def resolve(self, word, mandatory=True, default=None):
        '''
        composite mapping
        given f(x) and g(x)
        here: localtt & globaltt respectively
        return g(f(x)) | g(x) | f(x) | x in order of preference
        returns x on fall through if finding a mapping
        is not mandatory (by default finding is mandatory).

        This may be specialized further from any mapping
        to a global mapping only; if need be.

        :param word:  the string to find as a key in translation tables
        :param  mandatory: boolean to cause failure when no key exists

        :return
            value from global translation table,
            or value from local translation table,
            or the query key if finding a value is not mandatory (in this order)

        '''

        assert word is not None

        # we may not agree with a remote source's use of a global term we have
        # this provides opportunity for us to override
        if word in self.localtt:
            label = self.localtt[word]
            if label in self.globaltt:
                term_id = self.globaltt[label]
            else:
                logging.info(
                    "Translated to '%s' but no global term_id for: '%s'",
                    label, word)
                term_id = label
        elif word in self.globaltt:
            term_id = self.globaltt[word]
        else:
            if mandatory:
                raise KeyError("Mapping required for: ", word)
            logging.warning("We have no translation for: '%s'", word)

            if default is not None:
                term_id = default
            else:
                term_id = word
        return term_id
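
        # Illustrative only: with localtt = {'M': 'male'} and
        # globaltt = {'male': 'PATO:0000384'} (example entries, not the real
        # tables), the preference order above works out as
        #   resolve('M')                    -> 'PATO:0000384'   # g(f(x))
        #   resolve('male')                 -> 'PATO:0000384'   # g(x)
        #   resolve('n/a', mandatory=False) -> 'n/a'            # fall through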

    @staticmethod
    def check_fileheader(expected, received):
        '''
        Compare file headers received versus file headers expected
        if the expected headers are a subset (proper or not)
        of received headers, report success (warn if a proper subset)

            param:  expected  list
            param:  received  list

            return: truthyness
        '''
        exp = set(expected)
        got = set(received)
        if expected != received:
            LOG.error('\nExpected header: %s\nReceived header: %s', expected,
                      received)

            # pass reordering and adding new columns (after protesting)
            # hard fail on missing expected columns (temper with mandatory cols?)
            if exp - got != set():
                LOG.error('Missing: %s', exp - got)
                raise AssertionError(
                    'Incoming headers are missing expected column.')

            if got - exp != set():
                LOG.warning('Additional new columns: %s', got - exp)
            else:
                LOG.warning('Check columns order')

        return (exp ^ got) & exp == set()
Example #7
class Source:
    """
    Abstract class for any data sources that we'll import and process.
    Each of the subclasses will fetch() the data, scrub() it as necessary,
    then parse() it into a graph.  The graph will then be written out to
    a single <self.name>.<dest_fmt> file.

    Also provides a means to marshal metadata in a consistent fashion

    Houses the global translation table (from ontology label to ontology term)
    so it may as well be used everywhere.

    """

    namespaces = {}
    files = {}

    def __init__(
            self,
            graph_type='rdf_graph',     # or streamed_graph
            are_bnodes_skized=False,    # typically True
            name=None,                  # identifier; make an IRI for nquads
            ingest_title=None,
            ingest_url=None,
            license_url=None,           # only if it is _our_ lic
            data_rights=None,           # external page that points to their current lic
            file_handle=None
    ):

        # pull in the common test identifiers
        self.all_test_ids = self.open_and_parse_yaml('../../resources/test_ids.yaml')

        self.graph_type = graph_type
        self.are_bnodes_skized = are_bnodes_skized
        self.ingest_url = ingest_url
        self.ingest_title = ingest_title
        self.localtt = self.load_local_translationtable(name)

        if name is not None:
            self.name = name.lower()
        elif self.whoami() is not None:
            self.name = self.whoami().lower()

        LOG.info("Processing Source \"%s\"", self.name)
        self.test_only = False
        self.path = ""
        # to be used to store a subset of data for testing downstream.
        self.triple_count = 0
        self.outdir = 'out'
        self.testdir = 'tests'
        self.rawdir = 'raw'
        self.rawdir = '/'.join((self.rawdir, self.name))
        self.testname = name + "_test"
        self.testfile = '/'.join((self.outdir, self.testname + ".ttl"))
        self.datasetfile = None

        # still need to pull in file suffix -- this is a curie, not a url
        self.archive_url = 'MonarchArchive:' + 'ttl/' + self.name + '.ttl'

        # if raw data dir doesn't exist, create it
        if not os.path.exists(self.rawdir):
            os.makedirs(self.rawdir)
            pth = os.path.abspath(self.rawdir)
            LOG.info("creating raw directory for %s at %s", self.name, pth)

        # if output dir doesn't exist, create it
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)
            pth = os.path.abspath(self.outdir)
            LOG.info("created output directory %s", pth)

        LOG.info("Creating Test graph %s", self.testname)
        # note: tools such as Protégé need skolemized blank nodes
        self.testgraph = RDFGraph(True, self.testname)

        if graph_type == 'rdf_graph':
            graph_id = ':MONARCH_' + str(self.name) + "_" + \
                datetime.now().isoformat(' ').split()[0]

            LOG.info("Creating graph  %s", graph_id)
            self.graph = RDFGraph(are_bnodes_skized, graph_id)

        elif graph_type == 'streamed_graph':
            # need to expand on export formats
            # note: `pth` above is only bound when a directory was just created,
            # so build the path from self.outdir; the handle is never closed (TODO)
            dest_file = open('/'.join((self.outdir, name + '.nt')), 'w')
            self.graph = StreamedGraph(are_bnodes_skized, dest_file)
            # leave test files as turtle (better human readability)
        else:
            LOG.error(
                "%s graph type not supported\n"
                "valid types: rdf_graph, streamed_graph", graph_type)

        # pull in global ontology mapping datastructures
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid

        self.curie_map = self.graph.curie_map
        # self.prefix_base = {v: k for k, v in self.curie_map.items()}

        # will be set to True if the intention is
        # to only process and write the test data
        self.test_only = False
        self.test_mode = False

        # this may eventually support Bagits
        self.dataset = Dataset(
            self.archive_url,
            self.ingest_title,
            self.ingest_url,
            None,           # description
            license_url,    # only _OUR_ lic
            data_rights,    # tries to point to others lics
            graph_type,
            file_handle
        )

        for graph in [self.graph, self.testgraph]:
            self.declareAsOntology(graph)
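
        # Illustrative only: a concrete ingest typically initializes itself
        # with something like (the names below are hypothetical):
        #   super().__init__(
        #       graph_type, are_bnodes_skized, name='myingest',
        #       ingest_title='My Ingest', ingest_url='https://example.org')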

    def fetch(self, is_dl_forced=False):
        """
        abstract method to fetch all data from an external resource.
        this should be overridden by subclasses
        :return: None

        """
        raise NotImplementedError

    def parse(self, limit):
        """
        abstract method to parse all data from an external resource,
        that was fetched in fetch() this should be overridden by subclasses
        :return: None

        """
        raise NotImplementedError

    def write(self, fmt='turtle', stream=None):
        """
        This convenience method will write out all of the graphs
        associated with the source.
        Right now these are hardcoded to be a single "graph"
        and a "src_dataset.ttl" and a "src_test.ttl"
        If you do not supply stream='stdout'
        it will default write these to files.

        In addition, if the version number isn't yet set in the dataset,
        it will be set to the date on file.
        :return: None

        """
        fmt_ext = {
            'rdfxml': 'xml',
            'turtle': 'ttl',
            'nt': 'nt',         # ntriples
            'nquads':  'nq',
            'n3': 'n3'          # notation3
        }

        # make the regular graph output file
        dest = None
        if self.name is not None:
            dest = '/'.join((self.outdir, self.name))
            if fmt in fmt_ext:
                dest = '.'.join((dest, fmt_ext.get(fmt)))
            else:
                dest = '.'.join((dest, fmt))
            LOG.info("Setting outfile to %s", dest)

            # make the dataset_file name, always format as turtle
            self.datasetfile = '/'.join(
                (self.outdir, self.name + '_dataset.ttl'))
            LOG.info("Setting dataset file to %s", self.datasetfile)

            if self.dataset is not None and self.dataset.version is None:
                self.dataset.set_version_by_date()
                LOG.info("No version for %s setting to date issued.", self.name)
        else:
            LOG.warning("No output file set. Using stdout")
            stream = 'stdout'

        gu = GraphUtils(None)

        # the  _dataset description is always turtle
        gu.write(self.dataset.getGraph(), 'turtle', filename=self.datasetfile)

        if self.test_mode:
            # unless we stop hardcoding, the test dataset is always turtle
            LOG.info("Setting testfile to %s", self.testfile)
            gu.write(self.testgraph, 'turtle', filename=self.testfile)

        # print graph out
        if stream is None:
            outfile = dest
        elif stream.lower().strip() == 'stdout':
            outfile = None
        else:
            LOG.error("I don't understand our stream.")
            return
        gu.write(self.graph, fmt, filename=outfile)

    def whoami(self):
        '''
            pointless convenience
        '''
        LOG.info("Ingest is %s", self.name)

    @staticmethod
    def make_id(long_string, prefix='MONARCH'):
        """
        a method to create DETERMINISTIC identifiers
        based on a string's digest. currently implemented with sha1
        :param long_string:
        :return:

        """
        return ':'.join((prefix, Source.hash_id(long_string)))

    @staticmethod
    def hash_id(wordage):  # same as graph/GraphUtils.digest_id(wordage)
        """
        prepend 'b' to avoid leading with digit
        truncate to a 20 char sized word with a leading 'b'
        return truncated sha1 hash of string.

        by the birthday paradox;
            expect 50% chance of collision after 69 billion invocations
            however these are only hoped to be unique within a single file

        Consider reducing to 17 hex chars to fit in a 64 bit word
        16 discounting a leading constant
        gives a 50% chance of collision at about 4.3 billion unique input strings
        (currently _many_ orders of magnitude below that)

        :param long_string: str string to be hashed
        :return: str hash of id
        """
        return 'b' + hashlib.sha1(wordage.encode('utf-8')).hexdigest()[1:20]
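
        # Illustrative only: both helpers are deterministic, so the same input
        # always yields the same identifier, e.g.
        #   Source.hash_id('NIGMS-GM00003')  -> 'b' + 19 hex chars of the sha1
        #   Source.make_id('NIGMS-GM00003')  -> 'MONARCH:b' + those same 19 hex chars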

    def checkIfRemoteIsNewer(self, remote, local, headers):
        """
        Given a remote file location, and the corresponding local file
        this will check the datetime stamp on the files to see if the remote
        one is newer.
        This is a convenience method to be used so that we don't have to
        re-fetch files that we already have saved locally
        :param remote: URL of file to fetch from remote server
        :param local: pathname to save file to locally
        :return: True if the remote file is newer and should be downloaded

        """
        LOG.info("Checking if remote file is newer than local \n(%s)", local)

        # check if local file exists
        # if no local file, then remote is newer
        if os.path.exists(local):
            LOG.info("Local File exists as %s", local)
        else:
            LOG.info("Local File does NOT exist as %s", local)
            return True

        # get remote file details
        if headers is None:
            headers = self._get_default_request_headers()

        req = urllib.request.Request(remote, headers=headers)
        LOG.info("Request header: %s", str(req.header_items()))

        response = urllib.request.urlopen(req)

        try:
            resp_headers = response.info()
            size = resp_headers.get('Content-Length')
            last_modified = resp_headers.get('Last-Modified')
        except urllib.error.URLError as err:
            resp_headers = None
            size = 0
            last_modified = None
            LOG.error(err)

        if size is not None and size != '':
            size = int(size)
        else:
            size = 0

        fstat = os.stat(local)
        LOG.info(
            "Local File date: %s",
            datetime.utcfromtimestamp(fstat[ST_CTIME]))

        if last_modified is not None:
            # Thu, 07 Aug 2008 16:20:19 GMT
            dt_obj = datetime.strptime(
                last_modified, "%a, %d %b %Y %H:%M:%S %Z")
            # get local file details

            # check date on local vs remote file
            if dt_obj > datetime.utcfromtimestamp(fstat[ST_CTIME]):
                # check if file size is different
                if fstat[ST_SIZE] < size:
                    LOG.info("New Remote File exists")
                    return True
                if fstat[ST_SIZE] > size:
                    LOG.warning("New Remote File exists but it is SMALLER")
                    return True
                # filesize is a fairly imperfect metric here
                LOG.info("New Remote fFle has same filesize--will not download")
        elif fstat[ST_SIZE] != size:
            LOG.info(
                "Remote File is %i  \t Local File is %i", size, fstat[ST_SIZE])
            return True

        return False

    def get_files(self, is_dl_forced, files=None):
        """
        Given a set of files for this source, it will go fetch them, and
        set a default version by date.  If you need to set the version number
        by another method, then it can be set again.
        :param is_dl_forced - boolean
        :param files dict - override instance files dict
        :return: None
        """

        fstat = None
        if files is None:
            files = self.files
        for fname in files:
            headers = None
            filesource = files[fname]
            if 'headers' in filesource:
                headers = filesource['headers']
            LOG.info("Getting %s", fname)
            # if the key 'clean' exists in the sources `files` dict
            # expose that instead of the longer url
            if 'clean' in filesource and filesource['clean'] is not None:
                self.dataset.setFileAccessUrl(filesource['clean'])
            else:
                self.dataset.setFileAccessUrl(filesource['url'])
                LOG.info('Fetching %s', filesource['url'])

            self.fetch_from_url(
                filesource['url'], '/'.join((self.rawdir, filesource['file'])),
                is_dl_forced, headers)

            fstat = os.stat('/'.join((self.rawdir, filesource['file'])))

        # only keeping the date from the last file
        filedate = datetime.utcfromtimestamp(fstat[ST_CTIME]).strftime("%Y-%m-%d")

        # FIXME
        # change this so the date is attached only to each file, not the entire dataset
        self.dataset.set_date_issued(filedate)

    def fetch_from_url(
            self, remotefile, localfile=None, is_dl_forced=False, headers=None):
        """
        Given a remote url and a local filename, attempt to determine
        if the remote file is newer; if it is,
        fetch the remote file and save it to the specified localfile,
        reporting the basic file information once it is downloaded
        :param remotefile: URL of remote file to fetch
        :param localfile: pathname of file to save locally
        :return: None

        """

        response = None
        if ((is_dl_forced is True) or localfile is None or
                (self.checkIfRemoteIsNewer(remotefile, localfile, headers))):
            # TODO url verification, etc
            if headers is None:
                headers = self._get_default_request_headers()

            request = urllib.request.Request(remotefile, headers=headers)
            response = urllib.request.urlopen(request)

            if localfile is not None:
                with open(localfile, 'wb') as binwrite:
                    while True:
                        chunk = response.read(CHUNK)
                        if not chunk:
                            break
                        binwrite.write(chunk)

                LOG.info("Finished.  Wrote file to %s", localfile)
                if self.compare_local_remote_bytes(remotefile, localfile, headers):
                    LOG.debug("local file is same size as remote after download")
                else:
                    raise Exception(
                        "Error downloading file: local file size  != remote file size")

                fstat = os.stat(localfile)
                LOG.info("file size: %s", fstat[ST_SIZE])
                LOG.info(
                    "file created: %s", time.asctime(time.localtime(fstat[ST_CTIME])))
            else:
                LOG.error('Local filename is required')
                exit(-1)
        else:
            LOG.info("Using existing file %s", localfile)

        return response
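
        # Illustrative only: a subclass fetch() usually delegates here, e.g.
        # (the URL and filename below are hypothetical):
        #   self.fetch_from_url(
        #       'https://example.org/data.tsv',
        #       '/'.join((self.rawdir, 'data.tsv')),
        #       is_dl_forced)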

    # TODO: rephrase as mysql-dump-xml specific format
    def process_xml_table(self, elem, table_name, processing_function, limit):
        """
        This is a convenience function to process the elements of an xml dump of
        a mysql relational database.
        The "elem" is akin to a mysql table, with it's name of ```table_name```.
        It will process each ```row``` given the ```processing_function``` supplied.
        :param elem: The element data
        :param table_name: The name of the table to process
        :param processing_function: The row processing function
        :param limit:

        Appears to be making calls to the ElementTree library,
        although it is not explicitly imported here.

        :return:

        """

        line_counter = 0
        table_data = elem.find("[@name='" + table_name + "']")
        if table_data is not None:
            LOG.info("Processing " + table_name)
            row = {}
            for line in table_data.findall('row'):
                for field in line.findall('field'):
                    atts = dict(field.attrib)
                    row[atts['name']] = field.text
                processing_function(row)
                line_counter += 1
                if self.test_mode and limit is not None and line_counter > limit:
                    continue

            elem.clear()  # discard the element

    @staticmethod
    def _check_list_len(row, length):
        """
        Sanity check for csv parser
        :param row
        :param length
        :return:None
        """
        if len(row) != length:
            raise Exception(
                "row length does not match expected length of " +
                str(length) + "\nrow: " + str(row))

    @staticmethod
    def get_file_md5(directory, filename, blocksize=2**20):
        # reference:
        # http://stackoverflow.com/questions/1131220/get-md5-hash-of-big-files-in-python

        md5 = hashlib.md5()
        with open(os.path.join(directory, filename), "rb") as bin_reader:
            while True:
                buff = bin_reader.read(blocksize)
                if not buff:
                    break
                md5.update(buff)

        return md5.hexdigest()

    def get_remote_content_len(self, remote, headers=None):
        """
        :param remote:
        :return: size of remote file
        """

        if headers is None:
            headers = self._get_default_request_headers()

        req = urllib.request.Request(remote, headers=headers)

        try:
            response = urllib.request.urlopen(req)
            resp_header = response.info()
            byte_size = resp_header.get('Content-length')
        except OSError as err:
            byte_size = None
            LOG.error(err)

        return byte_size

    @staticmethod
    def get_local_file_size(localfile):
        """
        :param localfile:
        :return: size of file
        """
        byte_size = os.stat(localfile)
        return byte_size[ST_SIZE]

    def compare_local_remote_bytes(self, remotefile, localfile, remote_headers=None):
        """
        test to see if fetched file is the same size as the remote file
        using information in the content-length field in the HTTP header
        :return: True or False
        """
        is_equal = True
        remote_size = self.get_remote_content_len(remotefile, remote_headers)
        local_size = self.get_local_file_size(localfile)
        if remote_size is not None and local_size != int(remote_size):
            is_equal = False
            LOG.error(
                'local file and remote file different sizes\n'
                '%s has size %s, %s has size %s',
                localfile, local_size, remotefile, remote_size)
        return is_equal

    @staticmethod
    def file_len(fname):
        with open(fname) as lines:
            length = sum(1 for line in lines)
        return length

    @staticmethod
    def get_eco_map(url):
        """
        To convert the three column file to
        a hashmap we join primary and secondary keys,
        for example
        IEA	GO_REF:0000002	ECO:0000256
        IEA	GO_REF:0000003	ECO:0000501
        IEA	Default	ECO:0000501

        becomes
        IEA-GO_REF:0000002: ECO:0000256
        IEA-GO_REF:0000003: ECO:0000501
        IEA: ECO:0000501

        :return: dict
        """
        # this would go in a translation table but it is generated dynamically
        # maybe when we move to a make driven system
        eco_map = {}
        request = urllib.request.Request(url)
        response = urllib.request.urlopen(request)

        for line in response:
            line = line.decode('utf-8').rstrip()
            if re.match(r'^#', line):
                continue
            (code, go_ref, eco_curie) = line.split('\t')
            if go_ref != 'Default':
                eco_map["{}-{}".format(code, go_ref)] = eco_curie
            else:
                eco_map[code] = eco_curie

        return eco_map
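
        # Illustrative only: with the example rows from the docstring above,
        # the returned dict can be queried like
        #   eco = Source.get_eco_map(url)
        #   eco.get('IEA-GO_REF:0000002')  # -> 'ECO:0000256'
        #   eco.get('IEA')                 # -> 'ECO:0000501' (the Default row)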

    def settestonly(self, testonly):
        """
        Set that this source should only be processed in testMode
        :param testonly:
        :return: None
        """

        self.test_only = testonly

    def settestmode(self, mode):
        """
        Set testMode to (mode).
        - True: run the Source in testMode;
        - False: run it in full mode
        :param mode:
        :return: None

        """

        self.test_mode = mode

    def getTestSuite(self):
        """
        An abstract method that should be overwritten with
        tests appropriate for the specific source.
        :return:

        """
        return None

    # TODO: parameterizing the release date

    def declareAsOntology(self, graph):
        """
        The file we output needs to be declared as an ontology,
        including its version information.

        TEC: I am not convinced dipper reformatting external data as RDF triples
        makes an OWL ontology (nor that it should be considered a goal).

        Proper ontologies are built by ontologists. Dipper reformats data
        and annotates/decorates it with a minimal set of carefully arranged
        terms drawn from multiple proper ontologies.
        Which allows the whole (dipper's RDF triples and parent ontologies)
        to function as a single ontology we can reason over when combined
        in a store such as SciGraph.

        Including more than the minimal ontological terms in dipper's RDF
        output constitutes a liability as it allows greater divergence
        between dipper artifacts and the proper ontologies.

        Further information will be augmented in the dataset object.
        :param graph:
        :return:

        """

        # <http://data.monarchinitiative.org/ttl/biogrid.ttl> a owl:Ontology ;
        # owl:versionInfo
        # <https://archive.monarchinitiative.org/YYYYMM/ttl/biogrid.ttl>

        model = Model(graph)

        # is self.outfile suffix set yet???
        ontology_file_id = 'MonarchData:' + self.name + ".ttl"
        model.addOntologyDeclaration(ontology_file_id)

        # add timestamp as version info

        cur_time = datetime.now()
        t_string = cur_time.strftime("%Y-%m-%d")
        ontology_version = t_string
        # TEC this means the MonarchArchive IRI needs the release updated
        # maybe extract the version info from there

        # should not hardcode the suffix as it may change
        archive_url = 'MonarchArchive:' + 'ttl/' + self.name + '.ttl'
        model.addOWLVersionIRI(ontology_file_id, archive_url)
        model.addOWLVersionInfo(ontology_file_id, ontology_version)
        # TODO make sure this is synced with the Dataset class

    @staticmethod
    def remove_backslash_r(filename, encoding):
        """
        A helpful utility to remove Carriage Return from any file.
        This will read a file into memory,
        and overwrite the contents of the original file.

        TODO: This function may be a liability

        :param filename:

        :return:

        """

        # newline='\n' rather than the raw string r'\n', which open() rejects
        with open(filename, 'r', encoding=encoding, newline='\n') as filereader:
            contents = filereader.read()
        contents = re.sub(r'\r', '', contents)
        with open(filename, "w") as filewriter:
            filewriter.truncate()
            filewriter.write(contents)

    @staticmethod
    def open_and_parse_yaml(yamlfile):
        """
        :param yamlfile: String, path to a YAML file containing
                         a label-to-id mapping
        :return: dict where keys are labels and values are ids
        """

        # ??? what if the yaml file does not contain a dict datastructure?
        mapping = dict()
        if os.path.exists(os.path.join(os.path.dirname(__file__), yamlfile)):
            map_file = open(os.path.join(os.path.dirname(__file__), yamlfile), 'r')
            mapping = yaml.safe_load(map_file)
            map_file.close()
        else:
            LOG.warning("file: %s not found", yamlfile)

        return mapping

    @staticmethod
    def parse_mapping_file(file):
        """
        :param file: String, path to file containing label-id mappings
                in the first two columns of each row
        :return: dict where keys are labels and values are ids
        """
        id_map = {}
        if os.path.exists(os.path.join(os.path.dirname(__file__), file)):
            with open(os.path.join(os.path.dirname(__file__), file)) as tsvfile:
                reader = csv.reader(tsvfile, delimiter="\t")
                for row in reader:
                    key = row[0]
                    value = row[1]
                    id_map[key] = value

        return id_map
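
        # Illustrative only (not from the source): a two-column TSV such as
        #   HURLER SYNDROME<TAB>OMIM:607014
        #   MARFAN SYNDROME<TAB>OMIM:154700
        # would be parsed into
        #   {'HURLER SYNDROME': 'OMIM:607014', 'MARFAN SYNDROME': 'OMIM:154700'}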

    @staticmethod
    def _get_default_request_headers():
        return {
            'User-Agent': USER_AGENT
        }

    # @staticmethod
    # def getTestSuite(ingest):  # WIP
    #    '''
    #    try to avoid having one of these per ingest
    #    '''
    #    import unittest
    #    testcase = ingest + 'TestCase'
    #    # construct import names ... how
    #    from tests.test_ + ingest import testcase
    #    return unittest.TestLoader().loadTestsFromTestCase(testcase)

    def load_local_translationtable(self, name):
        '''
        Load "ingest specific" translation from whatever they called something
        to the ontology label we need to map it to.
        To facilitate seeing more ontology labels in dipper ingests
        a reverse mapping from ontology labels to external strings is also generated
        and available as a dict localtcid

        '---\n# %s.yaml\n"": ""  # example'
        '''

        localtt_file = 'translationtable/' + name + '.yaml'

        try:
            with open(localtt_file):
                pass
        except IOError:
            # write a stub file as a place holder if none exists
            with open(localtt_file, 'w') as write_yaml:
                print('---\n# %s.yaml\n"": ""  # example' % name, file=write_yaml)
        finally:
            with open(localtt_file, 'r') as read_yaml:
                localtt = yaml.safe_load(read_yaml)

        # inverse local translation.
        # note: keeping this invertible will take work.
        # Useful to not litter an ingest with external syntax
        self.localtcid = {v: k for k, v in localtt.items()}

        return localtt
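
        # Illustrative only: for name='myingest' (hypothetical), the stub
        # written above is translationtable/myingest.yaml containing
        #   ---
        #   # myingest.yaml
        #   "": ""  # example
        # and a populated table such as {'FEMALE': 'female'} would give
        #   self.localtcid == {'female': 'FEMALE'}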

    def resolve(self, word, mandatory=True, default=None):
        '''
        composite mapping
        given f(x) and g(x)
        here: localtt & globaltt respectively
        return g(f(x)) | g(x) | f(x) | x in order of preference
        returns x on fall through if finding a mapping
        is not mandatory (by default finding is mandatory).

        This may be specialized further from any mapping
        to a global mapping only; if need be.

        :param word:  the string to find as a key in translation tables
        :param  mandatory: boolean to cause failure when no key exists

        :return
            value from global translation table,
            or value from local translation table,
            or the query key if finding a value is not mandatory (in this order)

        '''

        assert word is not None

        # we may not agree with a remote source's use of a global term we have
        # this provides opportunity for us to override
        if word in self.localtt:
            label = self.localtt[word]
            if label in self.globaltt:
                term_id = self.globaltt[label]
            else:
                logging.info(
                    "Translated to '%s' but no global term_id for: '%s'", label, word)
                term_id = label
        elif word in self.globaltt:
            term_id = self.globaltt[word]
        else:
            if mandatory:
                raise KeyError("Mapping required for: ", word)
            logging.warning("We have no translation for: '%s'", word)

            if default is not None:
                term_id = default
            else:
                term_id = word
        return term_id

    @staticmethod
    def check_fileheader(expected, received):
        '''
        Compare file headers received versus file headers expected
        if the expected headers are a subset (proper or not)
        of received headers, report success (warn if a proper subset)

            param:  expected  list
            param:  received  list

            return: truthyness
        '''
        exp = set(expected)
        got = set(received)
        if expected != received:
            LOG.error('\nExpected header: %s\nReceived header: %s', expected, received)

            # pass reordering and adding new columns (after protesting)
            # hard fail on missing expected columns (temper with mandatory cols?)
            if exp - got != set():
                LOG.error('Missing: %s', exp - got)
                raise AssertionError('Incoming headers are missing expected column.')

            if got - exp != set():
                LOG.warning('Additional new columns: %s', got - exp)
            else:
                LOG.warning('Check columns order')

        return (exp ^ got) & exp == set()
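
    # Illustrative only: with expected = ['id', 'label'] the checks above give
    #   check_fileheader(['id', 'label'], ['id', 'label', 'extra'])  # True, warns on new column
    #   check_fileheader(['id', 'label'], ['label', 'id'])           # True, warns on column order
    #   check_fileheader(['id', 'label'], ['id'])                    # raises AssertionError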
Example #8
class Coriell(Source):
    """
    The Coriell Catalog provided to Monarch includes metadata and descriptions
    of NIGMS, NINDS, NHGRI, and NIA cell lines.  These lines are made available
    for research purposes. Here, we create annotations for the cell lines as
    models of the diseases from which they originate.

    We create a handle for a patient from which the given cell line is derived
    (since there may be multiple cell lines created from a given patient).
    A genotype is assembled for a patient, which includes a karyotype
    (if specified) and/or a collection of variants.
    Both the genotype (has_genotype) and disease are linked to the patient
    (has_phenotype), and the cell line is listed as derived from the patient.
    The cell line is classified by its
    [CLO cell type](http://www.ontobee.org/browser/index.php?o=clo),
    which itself is linked to a tissue of origin.

    Unfortunately, the omim numbers listed in this file are both for genes
    & diseases; we have no way of knowing a priori if a designated omim number
    is a gene or disease; so we presently link the patient to any omim id via
    the has_phenotype relationship.

    Notice: The Coriell catalog is delivered to Monarch in a specific format,
    and requires ssh rsa fingerprint identification.  Other groups wishing to
    get this data in its raw form will need to contact Coriell for credentials.
    These need to be placed into your configuration file for it to work.

    """

    terms = {
        'cell_line_repository': 'CLO:0000008',
        'race': 'SIO:001015',
        'ethnic_group': 'EFO:0001799',
        'age': 'EFO:0000246',
        'sampling_time': 'EFO:0000689',
        'collection': 'ERO:0002190'
    }

    files = {
        'NINDS': {
            'file': 'NINDS.csv',
            'id': 'NINDS',
            'label': 'NINDS Human Genetics DNA and Cell line Repository',
            'page': 'https://catalog.coriell.org/1/NINDS'
        },
        'NIGMS': {
            'file': 'NIGMS.csv',
            'id': 'NIGMS',
            'label': 'NIGMS Human Genetic Cell Repository',
            'page': 'https://catalog.coriell.org/1/NIGMS'
        },
        'NIA': {
            'file': 'NIA.csv',
            'id': 'NIA',
            'label': 'NIA Aging Cell Repository',
            'page': 'https://catalog.coriell.org/1/NIA'
        },
        'NHGRI': {
            'file': 'NHGRI.csv',
            'id': 'NHGRI',
            'label': 'NHGRI Sample Repository for Human Genetic Research',
            'page': 'https://catalog.coriell.org/1/NHGRI'
        }
    }

    # the following will house the specific cell lines to use for test output
    test_lines = [
        'ND02380', 'ND02381', 'ND02383', 'ND02384', 'GM17897', 'GM17898',
        'GM17896', 'GM17944', 'GM17945', 'ND00055', 'ND00094', 'ND00136',
        'GM17940', 'GM17939', 'GM20567', 'AG02506', 'AG04407', 'AG07602',
        'AG07601', 'GM19700', 'GM19701', 'GM19702', 'GM00324', 'GM00325',
        'GM00142', 'NA17944', 'AG02505', 'GM01602', 'GM02455', 'AG00364',
        'GM13707', 'AG00780'
    ]

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'coriell')

        self.dataset = Dataset('coriell', 'Coriell', 'http://ccr.coriell.org/',
                               None)

        # data-source specific warnings
        # (will be removed when issues are cleared)

        logger.warning('We assume that if a species is not provided, '
                       'that it is a Human-derived cell line')
        logger.warning('We map all omim ids as a disease/phenotype entity, '
                       'but should be fixed in the future')  # TODO

        # check if config exists; if it doesn't, error out and let user know
        if 'dbauth' not in config.get_config() or \
                'coriell' not in config.get_config()['dbauth']:
            logger.error("not configured with FTP user/password.")

        return

    def fetch(self, is_dl_forced=False):
        """
        Here we connect to the coriell sftp server using private connection
        details.  They dump bi-weekly files with a timestamp in the filename.
        For each catalog, we poll the remote site and pull the most-recently
        updated file, renaming it to our local  latest.csv.

        Be sure to have the sftp connection details in your conf.json
        file, like:
        dbauth : {"coriell" : {
            "user" : "<username>", "password" : "<password>",
            "host" : "<host>", "private_key" : "path/to/rsa_key"}
        }

        :param is_dl_forced:
        :return:

        """

        host = config.get_config()['dbauth']['coriell']['host']
        user = config.get_config()['dbauth']['coriell']['user']
        passwd = config.get_config()['dbauth']['coriell']['password']
        key = config.get_config()['dbauth']['coriell']['private_key']

        with pysftp.Connection(host,
                               username=user,
                               password=passwd,
                               private_key=key) as sftp:
            # check to make sure each file is in there
            # get the remote files
            remote_files = sftp.listdir_attr()
            files_by_repo = {}
            for attr in remote_files:
                # for each catalog, get the most-recent filename
                m = re.match('(NIGMS|NIA|NHGRI|NINDS)', attr.filename)
                if m is not None and len(m.groups()) > 0:
                    # there should just be one now
                    files_by_repo[m.group(1)] = attr
            # sort each array in hash,
            # & get the name and time of the most-recent file for each catalog
            for r in self.files:
                logger.info("Checking on %s catalog file", r)
                fname = self.files[r]['file']
                remotef = files_by_repo[r]
                target_name = '/'.join((self.rawdir, fname))
                # check if the local file is out of date, if so, download.
                # otherwise, skip.
                # we rename (for simplicity) the original file
                st = None
                if os.path.exists(target_name):
                    st = os.stat(target_name)
                    logger.info("Local file date: %s",
                                datetime.utcfromtimestamp(st[stat.ST_CTIME]))
                if st is None or remotef.st_mtime > st[stat.ST_CTIME]:
                    if st is None:
                        logger.info(
                            "File does not exist locally; downloading...")
                    else:
                        logger.info(
                            "There's a new version of %s catalog available; "
                            "downloading...", r)
                    sftp.get(remotef.filename, target_name)
                    logger.info("Fetched remote %s -> %s", remotef.filename,
                                target_name)
                    st = os.stat(target_name)
                    filedate = \
                        datetime.utcfromtimestamp(
                            remotef.st_mtime).strftime("%Y-%m-%d")
                    logger.info("New file date: %s",
                                datetime.utcfromtimestamp(st[stat.ST_CTIME]))

                else:
                    logger.info("File %s exists; using local copy", fname)
                    filedate = \
                        datetime.utcfromtimestamp(
                            st[stat.ST_CTIME]).strftime("%Y-%m-%d")

                self.dataset.setFileAccessUrl(remotef.filename, True)
                self.dataset.setVersion(filedate)
        return

    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        for f in self.files:
            file = '/'.join((self.rawdir, self.files[f]['file']))
            self._process_collection(self.files[f]['id'],
                                     self.files[f]['label'],
                                     self.files[f]['page'])
            self._process_data(file, limit)

        logger.info("Finished parsing.")
        return

    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:

        """

        logger.info("Processing Data from %s", raw)

        if self.testMode:  # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        family = Family(g)
        model = Model(g)

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    pass
                else:
                    line_counter += 1

                    (catalog_id, description, omim_number, sample_type,
                     cell_line_available, dna_in_stock, dna_ref, gender, age,
                     race, ethnicity, affected, karyotype, relprob, mutation,
                     gene, family_id, collection, url, cat_remark, pubmed_ids,
                     family_member, variant_id, dbsnp_id, species) = row

                    # example:
                    # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                    # parent,,,39,NIGMS Human Genetic Cell Repository,
                    # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                    # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                    # 2,,18343,Homo sapiens

                    if self.testMode and catalog_id not in self.test_lines:
                        # skip rows not in our test lines, when in test mode
                        continue

                    # ###########    BUILD REQUIRED VARIABLES    ###########

                    # Make the cell line ID
                    cell_line_id = 'Coriell:' + catalog_id.strip()

                    # Map the cell/sample type
                    cell_type = self._map_cell_type(sample_type)

                    # Make a cell line label
                    line_label = \
                        collection.partition(' ')[0]+'-'+catalog_id.strip()

                    # Map the repository/collection
                    repository = self._map_collection(collection)

                    # patients are uniquely identified by one of:
                    # dbsnp id (which is == an individual haplotype)
                    # family id + family member (if present) OR
                    # probands are usually family member zero
                    # cell line id
                    # since some patients have >1 cell line derived from them,
                    # we must make sure that the genotype is attached to
                    # the patient, and can be inferred to the cell line
                    # examples of repeated patients are:
                    #   famid=1159, member=1; fam=152,member=1

                    # Make the patient ID

                    # make an anonymous patient
                    patient_id = '_:person'
                    if family_id != '':
                        patient_id = \
                            '-'.join((patient_id, family_id, family_member))
                    else:
                        # make an anonymous patient
                        patient_id = '-'.join((patient_id, catalog_id.strip()))
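
                    # Illustrative only: the two branches above yield bnode-style
                    # ids such as '_:person-1159-1' (family 1159, member 1) or
                    # '_:person-GM00003' when no family id is present.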

                    # properties of the individual patients:  sex, family id,
                    # member/relproband, description descriptions are
                    # really long and ugly SCREAMING text, so need to clean up
                    # the control cases are so odd with this labeling scheme;
                    # but we'll deal with it as-is for now.
                    short_desc = (description.split(';')[0]).capitalize()
                    if affected == 'Yes':
                        affected = 'affected'
                    elif affected == 'No':
                        affected = 'unaffected'
                    gender = gender.lower()
                    patient_label = ' '.join((affected, gender, relprob))
                    if relprob == 'proband':
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'with', short_desc))
                    else:
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'of proband with',
                                 short_desc))

                    # #############    BUILD THE CELL LINE    #############

                    # Adding the cell line as a typed individual.
                    cell_line_reagent_id = 'CLO:0000031'

                    model.addIndividualToGraph(cell_line_id, line_label,
                                               cell_line_reagent_id)

                    # add the equivalent id == dna_ref
                    if dna_ref != '' and dna_ref != catalog_id:
                        equiv_cell_line = 'Coriell:' + dna_ref
                        # some of the equivalent ids are not defined
                        # in the source data; so add them
                        model.addIndividualToGraph(equiv_cell_line, None,
                                                   cell_line_reagent_id)
                        model.addSameIndividual(cell_line_id, equiv_cell_line)

                    # Cell line derives from patient
                    geno.addDerivesFrom(cell_line_id, patient_id)
                    geno.addDerivesFrom(cell_line_id, cell_type)

                    # Cell line a member of repository
                    family.addMember(repository, cell_line_id)

                    if cat_remark != '':
                        model.addDescription(cell_line_id, cat_remark)

                    # Cell age_at_sampling
                    # TODO add the age nodes when modeled properly in #78
                    # if (age != ''):
                    # this would give a BNode that is an instance of Age.
                    # but i don't know how to connect
                    # the age node to the cell line? we need to ask @mbrush
                    # age_id = '_'+re.sub('\s+','_',age)
                    # gu.addIndividualToGraph(
                    #   g,age_id,age,self.terms['age'])
                    # gu.addTriple(
                    #   g,age_id,self.properties['has_measurement'],age,
                    #   True)

                    # #############    BUILD THE PATIENT    #############

                    # Add the patient ID as an individual.
                    model.addPerson(patient_id, patient_label)
                    # TODO map relationship to proband as a class
                    # (what ontology?)

                    # Add race of patient
                    # FIXME: Adjust for subcategories based on ethnicity field
                    # EDIT: There are 743 different entries for ethnicity...
                    # Too many to map?
                    # Add ethnicity as literal in addition to the mapped race?
                    # Adjust the ethnicity txt (if using)
                    # to initial capitalization to remove ALLCAPS

                    # TODO race should go into the individual's background
                    # and abstracted out to the Genotype class punting for now.
                    # if race != '':
                    #    mapped_race = self._map_race(race)
                    #    if mapped_race is not None:
                    #        gu.addTriple(
                    #           g,patient_id,self.terms['race'],mapped_race)
                    #        model.addSubClass(
                    #           mapped_race,self.terms['ethnic_group'])

                    # #############    BUILD THE FAMILY    #############

                    # Add triples for family_id, if present.
                    if family_id != '':
                        family_comp_id = 'CoriellFamily:' + family_id

                        family_label = \
                            ' '.join(('Family of proband with', short_desc))

                        # Add the family ID as a named individual
                        model.addIndividualToGraph(family_comp_id,
                                                   family_label,
                                                   geno.genoparts['family'])

                        # Add the patient as a member of the family
                        family.addMemberOf(patient_id, family_comp_id)

                    # #############    BUILD THE GENOTYPE   #############

                    # the important things to pay attention to here are:
                    # karyotype = chr rearrangements  (somatic?)
                    # mutation = protein-level mutation as a label,
                    # often from omim
                    # gene = gene symbol - TODO get id
                    # variant_id = omim variant ids (; delimited)
                    # dbsnp_id = snp individual ids = full genotype?

                    # note GM00633 is a good example of chromosomal variation
                    # - do we have enough to capture this?
                    # GM00325 has both abnormal karyotype and variation

                    # make an assumption that if the taxon is blank,
                    # that it is human!
                    if species is None or species == '':
                        species = 'Homo sapiens'
                    taxon = self._map_species(species)

                    # if there's a dbSNP id,
                    # this is actually the individual's genotype
                    genotype_id = None
                    genotype_label = None
                    if dbsnp_id != '':
                        genotype_id = 'dbSNPIndividual:' + dbsnp_id.strip()

                    omim_map = {}
                    gvc_id = None

                    # some of the karyotypes are encoded
                    # with terrible hidden codes. remove them here
                    # i've seen a <98> character
                    karyotype = du.remove_control_characters(karyotype)
                    karyotype_id = None
                    if karyotype.strip() != '':
                        karyotype_id = \
                            '_:'+re.sub(
                                'MONARCH:', '', self.make_id(karyotype))
                        # add karyotype as karyotype_variation_complement
                        model.addIndividualToGraph(
                            karyotype_id, karyotype,
                            geno.genoparts['karyotype_variation_complement'])
                        # TODO break down the karyotype into parts
                        # and map into GENO. depends on #77

                        # place the karyotype in a location(s).
                        karyo_chrs = \
                            self._get_affected_chromosomes_from_karyotype(
                                karyotype)
                        for c in karyo_chrs:
                            chr_id = makeChromID(c, taxon, 'CHR')
                            # add an anonymous sequence feature,
                            # each located on chr
                            karyotype_feature_id = '-'.join((karyotype_id, c))
                            karyotype_feature_label = \
                                'some karyotype alteration on chr'+str(c)
                            f = Feature(g, karyotype_feature_id,
                                        karyotype_feature_label,
                                        geno.genoparts['sequence_alteration'])
                            f.addFeatureStartLocation(None, chr_id)
                            f.addFeatureToGraph()
                            geno.addParts(
                                karyotype_feature_id, karyotype_id,
                                geno.object_properties['has_alternate_part'])

                    # default vl so it is always defined for this row
                    vl = mutation.strip()
                    if gene != '':
                        vl = gene + '(' + mutation + ')'

                    # fix the variant_id so it's always in the same order
                    vids = variant_id.split(';')
                    variant_id = ';'.join(sorted(list(set(vids))))
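                    # e.g. '605543.0002;605543.0001' becomes
                    # '605543.0001;605543.0002' (duplicates dropped, sorted)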

                    if karyotype.strip() != '' \
                            and not self._is_normal_karyotype(karyotype):
                        mutation = mutation.strip()
                        gvc_id = karyotype_id
                        if variant_id != '':
                            gvc_id = \
                                '_:' + variant_id.replace(';', '-') + '-' \
                                + re.sub(r'\w*:', '', karyotype_id)
                        if mutation.strip() != '':
                            gvc_label = '; '.join((vl, karyotype))
                        else:
                            gvc_label = karyotype
                    elif variant_id.strip() != '':
                        gvc_id = '_:' + variant_id.replace(';', '-')
                        gvc_label = vl
                    else:
                        # wildtype?
                        pass
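                    # sketch of the branches above: with variant_id
                    # '610661.0001' alone, gvc_id is '_:610661.0001'; with an
                    # abnormal karyotype as well, the karyotype id (minus its
                    # '_:' prefix) is appended to that identifier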

                    # add the karyotype to the gvc.
                    # use reference if normal karyotype
                    karyo_rel = geno.object_properties['has_alternate_part']
                    if self._is_normal_karyotype(karyotype):
                        karyo_rel = \
                            geno.object_properties['has_reference_part']
                    if karyotype_id is not None \
                            and not self._is_normal_karyotype(karyotype) \
                            and gvc_id is not None and karyotype_id != gvc_id:
                        geno.addParts(karyotype_id, gvc_id, karyo_rel)

                    if variant_id.strip() != '':
                        # split the variants & add them as part of the
                        # genotype. we don't necessarily know their zygosity,
                        # just that they are part of the genotype. variant
                        # ids are from OMIM, so prefix them as such; we
                        # assume the sequence alterations are defined in
                        # OMIM, not here.
                        # TODO sort the variant_id list; if the omim prefix
                        # is the same, assume it is the same locus. build a
                        # hashmap of omim id -> variant id list, then build
                        # the genotype from it. the hashmap is also useful
                        # for removing the "genes" from the list of
                        # "phenotypes".

                        # will hold gene/locus id to variant list
                        omim_map = {}

                        locus_num = None
                        for v in variant_id.split(';'):
                            # handle omim-style and odd var ids
                            # like 610661.p.R401X
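                            # ('610661.p.R401X' splits into locus '610661'
                            # and variant 'p.R401X')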
                            m = re.match(r'(\d+)\.+(.*)', v.strip())
                            if m is not None and len(m.groups()) == 2:
                                (locus_num, var_num) = m.groups()

                            # only record the variant when a locus number
                            # has been parsed
                            if locus_num is not None:
                                if locus_num not in omim_map:
                                    omim_map[locus_num] = [var_num]
                                else:
                                    omim_map[locus_num] += [var_num]

                        for o in omim_map:
                            # gene_id = 'OMIM:' + o  # TODO unused
                            vslc_id = \
                                '_:' + '-'.join(
                                    [o + '.' + a for a in omim_map.get(o)])
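                            # e.g. omim_map {'610661': ['p.R401X']} gives
                            # vslc_id '_:610661.p.R401X'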
                            vslc_label = vl
                            # we don't really know the zygosity of
                            # the alleles at all.
                            # so the vslcs are just a pot of them
                            model.addIndividualToGraph(
                                vslc_id, vslc_label,
                                geno.genoparts[
                                    'variant_single_locus_complement'])
                            for v in omim_map.get(o):
                                # this is actually a sequence alt
                                allele1_id = 'OMIM:' + o + '.' + v
                                geno.addSequenceAlteration(allele1_id, None)

                                # assume that the sa -> var_loc -> gene
                                # is taken care of in OMIM
                                geno.addPartsToVSLC(
                                    vslc_id, allele1_id, None,
                                    geno.zygosity['indeterminate'],
                                    geno.object_properties[
                                        'has_alternate_part'])

                            if vslc_id != gvc_id:
                                geno.addVSLCtoParent(vslc_id, gvc_id)

                    if affected == 'unaffected':
                        # let's just say that this person is wildtype
                        model.addType(patient_id, geno.genoparts['wildtype'])
                    elif genotype_id is None:
                        # make an anonymous genotype id
                        genotype_id = '_:geno' + catalog_id.strip()

                    # add the gvc
                    if gvc_id is not None:
                        model.addIndividualToGraph(
                            gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])

                        # add the gvc to the genotype
                        if genotype_id is not None:
                            if affected == 'unaffected':
                                rel = \
                                    geno.object_properties[
                                        'has_reference_part']
                            else:
                                rel = \
                                    geno.object_properties[
                                        'has_alternate_part']
                            geno.addParts(gvc_id, genotype_id, rel)
                        if karyotype_id is not None \
                                and self._is_normal_karyotype(karyotype):
                            if gvc_label is not None and gvc_label != '':
                                genotype_label = \
                                    '; '.join((gvc_label, karyotype))
                            else:
                                genotype_label = karyotype
                            if genotype_id is None:
                                genotype_id = karyotype_id
                            else:
                                geno.addParts(
                                    karyotype_id, genotype_id,
                                    geno.object_properties[
                                        'has_reference_part'])
                        else:
                            genotype_label = gvc_label
                        # use the catalog id as the genotype background
                        genotype_label += ' [' + catalog_id.strip() + ']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(genotype_id, genotype_label,
                                         geno.genoparts['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        g.addTriple(patient_id,
                                    geno.properties['has_genotype'],
                                    genotype_id)
                    else:
                        geno.addTaxon(taxon, patient_id)

                    # TODO: Add sex/gender  (as part of the karyotype?)

                    # #############    DEAL WITH THE DISEASES   #############

                    # we associate the disease to the patient
                    if affected == 'affected':
                        if omim_number != '':
                            for d in omim_number.split(';'):
                                if d is not None and d != '':
                                    # if the omim number is in omim_map,
                                    # then it is a gene not a pheno
                                    if d not in omim_map:
                                        disease_id = 'OMIM:' + d.strip()
                                        # assume the label is taken care of
                                        model.addClassToGraph(disease_id, None)

                                        # add the association:
                                        #   the patient has the disease
                                        assoc = G2PAssoc(
                                            g, self.name, patient_id,
                                            disease_id)
                                        assoc.add_association_to_graph()

                                        # this cell line is a model
                                        # of this disease
                                        # TODO abstract the model-of link
                                        # into its own association class?
                                        g.addTriple(
                                            cell_line_id,
                                            model.object_properties[
                                                'model_of'],
                                            disease_id)
                                    else:
                                        logger.info(
                                            'removing %s from disease list '
                                            'since it is a gene', d)

                    # #############    ADD PUBLICATIONS   #############

                    if pubmed_ids != '':
                        for s in pubmed_ids.split(';'):
                            pubmed_id = 'PMID:' + s.strip()
                            ref = Reference(g, pubmed_id)
                            ref.setType(Reference.ref_types['journal_article'])
                            ref.addRefToGraph()
                            g.addTriple(pubmed_id,
                                        model.object_properties['mentions'],
                                        cell_line_id)

                    if not self.testMode \
                            and (limit is not None and line_counter > limit):
                        break
        return

    def _process_collection(self, collection_id, label, page):
        """
        This function will process the data supplied internally
        about the repository from Coriell.

        Triples:
            Repository a ERO:collection
            rdf:label Literal(label)
            foaf:page Literal(page)

        :param collection_id:
        :param label:
        :param page:
        :return:
        """
        # #############    BUILD THE CELL LINE REPOSITORY    #############
        for graph in [self.graph, self.testgraph]:
            # TODO: How to devise a label for each repository?
            model = Model(graph)
            reference = Reference(graph)
            repo_id = 'CoriellCollection:' + collection_id
            repo_label = label
            repo_page = page

            model.addIndividualToGraph(repo_id, repo_label,
                                       self.terms['collection'])
            reference.addPage(repo_id, repo_page)

        return

    @staticmethod
    def _map_cell_type(sample_type):
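        """
        Map a Coriell sample/cell type label to a Cell Ontology (CL) or
        CLO class, e.g. 'Fibroblast' -> 'CL:0000057'.
        Unmapped types are logged as an error and None is returned.
        """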
        ctype = None
        type_map = {
            # FIXME: mesenchymal stem cell of adipose
            'Adipose stromal cell': 'CL:0002570',
            # FIXME: amniocyte?
            'Amniotic fluid-derived cell line': 'CL:0002323',
            # B cell
            'B-Lymphocyte': 'CL:0000236',
            # FIXME: No Match
            'Chorionic villus-derived cell line': 'CL:0000000',
            # endothelial cell
            'Endothelial': 'CL:0000115',
            # epithelial cell
            'Epithelial': 'CL:0000066',
            # FIXME: No Match. "Abnormal precursor (virally transformed)
            # of mouse erythrocytes that can be grown in culture and
            # induced to differentiate by treatment with, for example, DMSO."
            'Erythroleukemic cell line': 'CL:0000000',
            'Fibroblast': 'CL:0000057',  # fibroblast
            'Keratinocyte': 'CL:0000312',  # keratinocyte
            'Melanocyte': 'CL:0000148',  # melanocyte
            'Mesothelial': 'CL:0000077',
            'Microcell hybrid': 'CL:0000000',  # FIXME: No Match
            'Myoblast': 'CL:0000056',  # myoblast
            'Smooth muscle': 'CL:0000192',  # smooth muscle cell
            'Stem cell': 'CL:0000034',  # stem cell
            'T-Lymphocyte': 'CL:0000084',  # T cell
            # FIXME: No Match. "Cells isolated from a mass of neoplastic cells,
            # i.e., a growth formed by abnormal cellular proliferation."
            # Oncocyte? CL:0002198
            'Tumor-derived cell line': 'CL:0002198',
            'Kidney-derived cell line': 'CLO:0000220'
        }
        if sample_type.strip() in type_map:
            ctype = type_map.get(sample_type)
        else:
            logger.error("Cell type not mapped: %s", sample_type)

        return ctype

    @staticmethod
    def _map_race(race):
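        """
        Map a Coriell race/ethnicity label to an EFO ancestry category,
        e.g. 'Caucasian' -> 'EFO:0003156'.
        Unmapped values are logged as a warning and None is returned.
        """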
        rtype = None
        type_map = {
            'African American': 'EFO:0003150',
            # 'American Indian': 'EFO',
            'Asian': 'EFO:0003152',
            # FIXME: Asian?
            'Asian; Other': 'EFO:0003152',
            # Asian Indian
            'Asiatic Indian': 'EFO:0003153',
            # FIXME: African American? There is also African.
            'Black': 'EFO:0003150',
            'Caucasian': 'EFO:0003156',
            'Chinese': 'EFO:0003157',
            'East Indian': 'EFO:0003158',  # Eastern Indian
            'Filipino': 'EFO:0003160',
            # Hispanic: EFO:0003169, Latino: EFO:0003166 see next
            'Hispanic/Latino': 'EFO:0003169',
            'Japanese': 'EFO:0003164',
            'Korean': 'EFO:0003165',
            # 'More than one race': 'EFO',
            # 'Not Reported': 'EFO',
            # 'Other': 'EFO',
            # Asian/Pacific Islander
            'Pacific Islander': 'EFO:0003154',
            # Asian/Pacific Islander
            'Polynesian': 'EFO:0003154',
            # 'Unknown': 'EFO',
            # Asian
            'Vietnamese': 'EFO:0003152',
        }
        if race.strip() in type_map:
            rtype = type_map.get(race)
        else:
            logger.warning("Race type not mapped: %s", race)

        return rtype

    @staticmethod
    def _map_species(species):
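        """
        Map a species name to an NCBI Taxonomy CURIE,
        e.g. 'Mus musculus' -> 'NCBITaxon:10090'.
        Unmapped species are logged as a warning and None is returned.
        """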
        tax = None
        type_map = {
            'Mus musculus': 'NCBITaxon:10090',
            'Peromyscus peromyscus californicus': 'NCBITaxon:42520',
            'Peromyscus peromyscus maniculatus': 'NCBITaxon:10042',
            'Peromyscus peromyscus leucopus': 'NCBITaxon:10041',
            'Peromyscus peromyscus polionotus': 'NCBITaxon:42413',
            'Macaca fascicularis': 'NCBITaxon:9541',
            'Rattus norvegicus': 'NCBITaxon:10116',
            'Papio anubis': 'NCBITaxon:9555',
            'Cricetulus griseus': 'NCBITaxon:10029',
            'Geochelone elephantopus': 'NCBITaxon:66189',
            'Muntiacus muntjak': 'NCBITaxon:9888',
            'Ailurus fulgens': 'NCBITaxon:9649',
            'Sus scrofa': 'NCBITaxon:9823',
            'Bos taurus': 'NCBITaxon:9913',
            'Oryctolagus cuniculus': 'NCBITaxon:9986',
            'Macaca nemestrina': 'NCBITaxon:9545',
            'Canis familiaris': 'NCBITaxon:9615',
            'Equus caballus': 'NCBITaxon:9796',
            'Macaca mulatta': 'NCBITaxon:9544',
            'Mesocricetus auratus': 'NCBITaxon:10036',
            'Macaca nigra': 'NCBITaxon:54600',
            'Erythrocebus patas': 'NCBITaxon:9538',
            'Pongo pygmaeus': 'NCBITaxon:9600',
            'Callicebus moloch': 'NCBITaxon:9523',
            'Lagothrix lagotricha': 'NCBITaxon:9519',
            'Saguinus fuscicollis': 'NCBITaxon:9487',
            'Saimiri sciureus': 'NCBITaxon:9521',
            'Saguinus labiatus': 'NCBITaxon:78454',
            'Pan paniscus': 'NCBITaxon:9597',
            'Ovis aries': 'NCBITaxon:9940',
            'Felis catus': 'NCBITaxon:9685',
            'Homo sapiens': 'NCBITaxon:9606',
            'Gorilla gorilla': 'NCBITaxon:9593',
            'Peromyscus maniculatus': 'NCBITaxon:10042'
        }
        if species.strip() in type_map:
            tax = type_map.get(species)
        else:
            logger.warning("Species type not mapped: %s", species)

        return tax

    @staticmethod
    def _map_collection(collection):
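        """
        Map a Coriell repository name to its CoriellCollection CURIE,
        e.g. 'NINDS Repository' -> 'CoriellCollection:NINDS'.
        Unmapped names are logged as a warning and None is returned.
        """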
        ctype = None
        type_map = {
            'NINDS Repository':
            'CoriellCollection:NINDS',
            'NIGMS Human Genetic Cell Repository':
            'CoriellCollection:NIGMS',
            'NIA Aging Cell Culture Repository':
            'CoriellCollection:NIA',
            'NHGRI Sample Repository for Human Genetic Research':
            'CoriellCollection:NHGRI'
        }
        if collection.strip() in type_map:
            ctype = type_map.get(collection)
        else:
            logger.warning("ERROR: Collection type not mapped: %s", collection)

        return ctype

    @staticmethod
    def _get_affected_chromosomes_from_karyotype(karyotype):
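        """
        Parse a Coriell-style karyotype string and return the set of
        chromosomes affected by the listed aberrations, adding X or Y when
        an abnormal sex-chromosome complement is present.
        For example, '46;XY;del(7)(q11.23)' should yield {'7'}.

        :param karyotype:
        :return: set of affected chromosome strings
        """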

        affected_chromosomes = set()
        chr_regex = r'(\d+|X|Y|M|\?);?'
        aberration_regex = r'(?:add|del|der|i|idic|inv|r|rec|t)\([\w;]+\)'
        sex_regex = r'(?:;)(X{2,}Y+|X?Y{2,}|X{3,}|X|Y)(?:;|$)'

        # first fetch the set of aberrations
        aberrations = re.findall(aberration_regex, karyotype)

        # iterate over them to get the chromosomes
        for a in aberrations:
            chrs = re.findall(chr_regex, a)
            affected_chromosomes = affected_chromosomes.union(set(chrs))

        # remove the ? as a chromosome, since it isn't valid
        if '?' in affected_chromosomes:
            affected_chromosomes.remove('?')

        # check to see if there are any abnormal sex chromosomes
        m = re.search(sex_regex, karyotype)
        if m is not None:
            if re.search(r'X?Y{2,}', m.group(1)):
                # this is the only case where there is an extra Y chromosome
                affected_chromosomes.add('Y')
            else:
                affected_chromosomes.add('X')

        return affected_chromosomes

    @staticmethod
    def _is_normal_karyotype(karyotype):
        """
        This will default to true if no karyotype is provided.
        This is assuming human karyotypes.
        :param karyotype:
        :return:
        """

        is_normal = True
        if karyotype is not None:
            karyotype = karyotype.strip()
            if karyotype not in ['46;XX', '46;XY', '']:
                is_normal = False

        return is_normal

    def getTestSuite(self):
        import unittest
        from tests.test_coriell import CoriellTestCase
        # TODO add G2PAssoc, Genotype tests

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(CoriellTestCase)

        return test_suite