Example #1
 def _process_query_dict(self, query_dict, valid_keys, kwargs):
     if query_dict is None:
         query_dict = {}
     for k, v in kwargs.items():
         if k in valid_keys:
             query_dict[k] = v
         else:
             query_dict['ot:' + k] = v
     nq = len(query_dict)
     if nq == 0:
         if self.use_v1:
             raise ValueError(
                 'The property/value pairs for the query should be passed in as keyword arguments'
             )
         return None
     if nq > 1:
         raise NotImplementedError(
             'Currently only searches for one property/value pair are supported'
         )
     k = list(query_dict.keys())[0]
     if k not in valid_keys:
         m = '"{k}" is not a valid search term. Expecting it to be one of the following: {kl}'
         m = m.format(k=k, kl=repr(valid_keys))
         raise ValueError(m)
     v = query_dict[k]
     if not is_str_type(v):
         v = UNICODE(v)
     if k == 'ot:studyPublication':
         v = doi2url(v)
     return (k, v)
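For context, here is a standalone sketch of the behaviour the method above implements: unprefixed keyword arguments are namespaced with 'ot:', exactly one property/value pair is accepted, and an 'ot:studyPublication' value is run through doi2url. Both the doi2url stand-in and the process_query_dict mirror below are illustrative assumptions, not peyotl's actual code.

def doi2url(v):
    # Assumed normalization: full URLs pass through, a leading 'doi:' is stripped,
    # and bare DOIs get a resolver prefix. The real peyotl helper may differ.
    if v.startswith('http'):
        return v
    if v.startswith('doi:'):
        v = v[len('doi:'):]
    return 'http://dx.doi.org/' + v

def process_query_dict(query_dict, valid_keys, use_v1=False, **kwargs):
    # Standalone mirror of the method above, usable without an OTI wrapper instance.
    if query_dict is None:
        query_dict = {}
    for k, v in kwargs.items():
        query_dict[k if k in valid_keys else 'ot:' + k] = v
    if not query_dict:
        if use_v1:
            raise ValueError('pass the property/value pair as keyword arguments')
        return None
    if len(query_dict) > 1:
        raise NotImplementedError('only one property/value pair is supported')
    k, v = next(iter(query_dict.items()))
    if k not in valid_keys:
        raise ValueError('"{}" is not a valid search term; expected one of {}'.format(k, valid_keys))
    if not isinstance(v, str):
        v = str(v)
    if k == 'ot:studyPublication':
        v = doi2url(v)
    return k, v

print(process_query_dict(None, ('ot:studyPublication', 'ot:studyId'),
                         studyPublication='10.1000/xyz123'))
# -> ('ot:studyPublication', 'http://dx.doi.org/10.1000/xyz123')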
Example #2
File: oti.py Project: pombredanne/peyotl
 def _process_query_dict(self, query_dict, valid_keys, kwargs):
     if query_dict is None:
         query_dict = {}
     for k, v in kwargs.items():
         if k in valid_keys:
             query_dict[k] = v
         else:
             query_dict['ot:' + k] = v
     nq = len(query_dict)
     if nq == 0:
         if self.use_v1:
             raise ValueError('The property/value pairs for the query should be passed in as keyword arguments')
         return None
     if nq > 1:
         raise NotImplementedError('Currently only searches for one property/value pair are supported')
     k = list(query_dict.keys())[0]
     if k not in valid_keys:
         m = '"{k}" is not a valid search term. Expecting it to be one of the following: {kl}'
         m = m.format(k=k, kl=repr(valid_keys))
         raise ValueError(m)
     v = query_dict[k]
     if not is_str_type(v):
         v = UNICODE(v)
     if k == 'ot:studyPublication':
         v = doi2url(v)
     return (k, v)
Example #3
 def testTreeBaseImport(self):
     fp = pathmap.nexml_source_path('S15515.xml')
     n = get_ot_study_info_from_treebase_nexml(src=fp,
                                               merge_blocks=True,
                                               sort_arbitrary=True)
     # did we successfully coerce its DOI to the required URL form?
     self.assertTrue('@href' in n['nexml']['^ot:studyPublication'])
     test_doi = n['nexml']['^ot:studyPublication']['@href']
     self.assertTrue(test_doi == doi2url(test_doi))
     # furthermore, the output should exactly match our test file
     expected = pathmap.nexson_obj('S15515.json')
     equal_blob_check(self, 'S15515', n, expected)
     self.assertTrue(expected == n)
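The assertions above check that the imported study exposes its publication DOI in URL form (so that applying doi2url to it again is a no-op) and that the whole blob matches a stored expectation. A hypothetical fragment of the structure being inspected, with invented values, looks like this:

# invented values; only the key layout mirrors the normalization code in the later examples
n = {
    'nexml': {
        '^ot:dataDeposit': {'@href': 'http://purl.org/phylo/treebase/phylows/study/TB2:S15515'},
        '^ot:studyPublication': {'@href': 'http://dx.doi.org/10.1234/example'},
        '^ot:studyPublicationReference': 'Author, A. 2010. An example citation.',
    }
}
assert '@href' in n['nexml']['^ot:studyPublication']
assert n['nexml']['^ot:studyPublication']['@href'].startswith('http')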
Example #4
File: external.py Project: mtholder/peyotl
def get_ot_study_info_from_treebase_nexml(
        src=None,
        nexml_content=None,
        encoding=u'utf8',
        nexson_syntax_version=DEFAULT_NEXSON_VERSION,
        merge_blocks=True,
        sort_arbitrary=False):
    """Normalize treebase-specific metadata into the locations where
    open tree of life software that expects it.

    See get_ot_study_info_from_nexml for the explanation of the src,
    nexml_content, encoding, and nexson_syntax_version arguments
    If merge_blocks is True then peyotl.manip.merge_otus_and_trees

    Actions to "normalize" TreeBase objects to ot Nexson
        1. the meta id for any meta item that has only a value and an id
        2. throw away rdfs:isDefinedBy
        3. otu @label -> otu ^ot:originalLabel
        4. ^tb:indentifier.taxon, ^tb:indentifier.taxonVariant and some skos:closeMatch
            fields to ^ot:taxonLink
        5. remove "@xml:base"
        6. coerce edge lengths to native types
    """
    # pylint: disable=R0915
    raw = get_ot_study_info_from_nexml(
        src=src,
        nexml_content=nexml_content,
        encoding=encoding,
        nexson_syntax_version=BY_ID_HONEY_BADGERFISH)
    nexml = raw['nexml']
    SKOS_ALT_LABEL = '^skos:altLabel'
    SKOS_CLOSE_MATCH = '^skos:closeMatch'
    strippable_pre = {
        'http://www.ubio.org/authority/metadata.php?lsid=urn:lsid:ubio.org:namebank:':
        '@ubio',
        'http://purl.uniprot.org/taxonomy/':
        '@uniprot',
    }
    moveable2taxon_link = {
        "^tb:identifier.taxon": '@tb:identifier.taxon',
        "^tb:identifier.taxonVariant": '@tb:identifier.taxonVariant',
    }
    to_del = ['^rdfs:isDefinedBy', '@xml:base']
    for tag in to_del:
        if tag in nexml:
            del nexml[tag]
    _simplify_all_meta_by_id_del(nexml)
    _otu2label = {}
    prefix_map = {}
    # compose dataDeposit
    nexid = nexml['@id']
    tb_url = 'http://purl.org/phylo/treebase/phylows/study/TB2:' + nexid
    nexml['^ot:dataDeposit'] = {'@href': tb_url}
    # compose studyPublicationReference and studyPublication
    bd = nexml.get("^dcterms:bibliographicCitation")
    if bd:
        nexml['^ot:studyPublicationReference'] = bd
    doi = nexml.get('^prism:doi')
    if doi:
        doi = doi2url(doi)
        nexml['^ot:studyPublication'] = {'@href': doi}
    year = nexml.get('^prism:publicationDate')
    if year:
        try:
            nexml['^ot:studyYear'] = int(year)
        except:
            pass
    #
    for otus in nexml['otusById'].values():
        for tag in to_del:
            if tag in otus:
                del otus[tag]
        _simplify_all_meta_by_id_del(otus)
        for oid, otu in otus['otuById'].items():
            for tag in to_del:
                if tag in otu:
                    del otu[tag]
            _simplify_all_meta_by_id_del(otu)
            label = otu['@label']
            _otu2label[oid] = label
            otu['^ot:originalLabel'] = label
            del otu['@label']
            al = otu.get(SKOS_ALT_LABEL)
            if al is not None:
                if otu.get('^ot:altLabel') is None:
                    otu['^ot:altLabel'] = al
                del otu[SKOS_ALT_LABEL]
            tl = {}
            scm = otu.get(SKOS_CLOSE_MATCH)
            # _LOG.debug('scm = ' + str(scm))
            if scm:
                if isinstance(scm, dict):
                    h = scm.get('@href')
                    if h:
                        try:
                            for p, t in strippable_pre.items():
                                if h.startswith(p):
                                    ident = h[len(p):]
                                    tl[t] = ident
                                    del otu[SKOS_CLOSE_MATCH]
                                    prefix_map[t] = p
                        except:
                            pass
                else:
                    nm = []
                    try:
                        for el in scm:
                            h = el.get('@href')
                            if h:
                                found = False
                                for p, t in strippable_pre.items():
                                    if h.startswith(p):
                                        ident = h[len(p):]
                                        tl[t] = ident
                                        found = True
                                        prefix_map[t] = p
                                        break
                                if not found:
                                    nm.append(el)
                    except:
                        pass
                    if len(nm) < len(scm):
                        if len(nm) > 1:
                            otu[SKOS_CLOSE_MATCH] = nm
                        elif len(nm) == 1:
                            otu[SKOS_CLOSE_MATCH] = nm[0]
                        else:
                            del otu[SKOS_CLOSE_MATCH]
            # _LOG.debug('tl =' + str(tl))
            for k, t in moveable2taxon_link.items():
                al = otu.get(k)
                if al:
                    tl[t] = al
                    del otu[k]
            if tl:
                otu['^ot:taxonLink'] = tl
    for trees in nexml['treesById'].values():
        for tag in to_del:
            if tag in trees:
                del trees[tag]
        _simplify_all_meta_by_id_del(trees)
        for tree in trees['treeById'].values():
            for tag in to_del:
                if tag in tree:
                    del tree[tag]
            _simplify_all_meta_by_id_del(tree)
            tt = tree.get('@xsi:type', 'nex:FloatTree')
            if tt.lower() == 'nex:inttree':
                e_len_coerce = int
            else:
                e_len_coerce = float
            for edge_d in tree['edgeBySourceId'].values():
                for edge in edge_d.values():
                    try:
                        x = e_len_coerce(edge['@length'])
                        edge['@length'] = x
                    except:
                        pass
            for node in tree['nodeById'].values():
                nl = node.get('@label')
                if nl:
                    no = node.get('@otu')
                    if no and _otu2label[no] == nl:
                        del node['@label']

    if prefix_map:
        nexml['^ot:taxonLinkPrefixes'] = prefix_map
    if merge_blocks:
        from peyotl.manip import merge_otus_and_trees
        merge_otus_and_trees(raw)
    if nexson_syntax_version != BY_ID_HONEY_BADGERFISH:
        convert_nexson_format(raw,
                              nexson_syntax_version,
                              current_format=BY_ID_HONEY_BADGERFISH,
                              sort_arbitrary=sort_arbitrary)
    elif sort_arbitrary:
        sort_arbitrarily_ordered_nexson(raw)
    return raw
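To make action 4 of the docstring concrete, the sketch below shows, in isolation, how a skos:closeMatch href whose prefix appears in strippable_pre is rewritten as a ^ot:taxonLink entry, with the prefix recorded so the full URL can be rebuilt later. The otu dict is invented; the logic mirrors the loop above.

strippable_pre = {
    'http://purl.uniprot.org/taxonomy/': '@uniprot',
}
otu = {'^skos:closeMatch': {'@href': 'http://purl.uniprot.org/taxonomy/9606'}}
prefix_map, tl = {}, {}
href = otu['^skos:closeMatch']['@href']
for prefix, tag in strippable_pre.items():
    if href.startswith(prefix):
        tl[tag] = href[len(prefix):]    # keep only the bare identifier
        prefix_map[tag] = prefix        # remember how to rebuild the full URL
        del otu['^skos:closeMatch']
        break
if tl:
    otu['^ot:taxonLink'] = tl
print(otu)         # {'^ot:taxonLink': {'@uniprot': '9606'}}
print(prefix_map)  # {'@uniprot': 'http://purl.uniprot.org/taxonomy/'}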
Example #5
def get_ot_study_info_from_treebase_nexml(src=None,
                                          nexml_content=None,
                                          encoding=u'utf8',
                                          nexson_syntax_version=DEFAULT_NEXSON_VERSION,
                                          merge_blocks=True,
                                          sort_arbitrary=False):
    """Normalize treebase-specific metadata into the locations where
    open tree of life software that expects it.

    See get_ot_study_info_from_nexml for the explanation of the src,
    nexml_content, encoding, and nexson_syntax_version arguments
    If merge_blocks is True then peyotl.manip.merge_otus_and_trees

    Actions to "normalize" TreeBase objects to ot Nexson
        1. the meta id for any meta item that has only a value and an id
        2. throw away rdfs:isDefinedBy
        3. otu @label -> otu ^ot:originalLabel
        4. ^tb:indentifier.taxon, ^tb:indentifier.taxonVariant and some skos:closeMatch
            fields to ^ot:taxonLink
        5. remove "@xml:base"
        6. coerce edge lengths to native types
    """
    # pylint: disable=R0915
    raw = get_ot_study_info_from_nexml(src=src,
                                       nexml_content=nexml_content,
                                       encoding=encoding,
                                       nexson_syntax_version=BY_ID_HONEY_BADGERFISH)
    nexml = raw['nexml']
    SKOS_ALT_LABEL = '^skos:altLabel'
    SKOS_CLOSE_MATCH = '^skos:closeMatch'
    strippable_pre = {
        'http://www.ubio.org/authority/metadata.php?lsid=urn:lsid:ubio.org:namebank:': '@ubio',
        'http://purl.uniprot.org/taxonomy/': '@uniprot',
    }
    moveable2taxon_link = {"^tb:identifier.taxon": '@tb:identifier.taxon',
                           "^tb:identifier.taxonVariant": '@tb:identifier.taxonVariant', }
    to_del = ['^rdfs:isDefinedBy', '@xml:base']
    for tag in to_del:
        if tag in nexml:
            del nexml[tag]
    _simplify_all_meta_by_id_del(nexml)
    _otu2label = {}
    prefix_map = {}
    # compose dataDeposit
    nexid = nexml['@id']
    tb_url = 'http://purl.org/phylo/treebase/phylows/study/TB2:' + nexid
    nexml['^ot:dataDeposit'] = {'@href': tb_url}
    # compose studyPublicationReference and studyPublication
    bd = nexml.get("^dcterms:bibliographicCitation")
    if bd:
        nexml['^ot:studyPublicationReference'] = bd
    doi = nexml.get('^prism:doi')
    if doi:
        doi = doi2url(doi)
        nexml['^ot:studyPublication'] = {'@href': doi}
    year = nexml.get('^prism:publicationDate')
    if year:
        try:
            nexml['^ot:studyYear'] = int(year)
        except:
            pass
    #
    for otus in nexml['otusById'].values():
        for tag in to_del:
            if tag in otus:
                del otus[tag]
        _simplify_all_meta_by_id_del(otus)
        for oid, otu in otus['otuById'].items():
            for tag in to_del:
                if tag in otu:
                    del otu[tag]
            _simplify_all_meta_by_id_del(otu)
            label = otu['@label']
            _otu2label[oid] = label
            otu['^ot:originalLabel'] = label
            del otu['@label']
            al = otu.get(SKOS_ALT_LABEL)
            if al is not None:
                if otu.get('^ot:altLabel') is None:
                    otu['^ot:altLabel'] = al
                del otu[SKOS_ALT_LABEL]
            tl = {}
            scm = otu.get(SKOS_CLOSE_MATCH)
            # _LOG.debug('scm = ' + str(scm))
            if scm:
                if isinstance(scm, dict):
                    h = scm.get('@href')
                    if h:
                        try:
                            for p, t in strippable_pre.items():
                                if h.startswith(p):
                                    ident = h[len(p):]
                                    tl[t] = ident
                                    del otu[SKOS_CLOSE_MATCH]
                                    prefix_map[t] = p
                        except:
                            pass
                else:
                    nm = []
                    try:
                        for el in scm:
                            h = el.get('@href')
                            if h:
                                found = False
                                for p, t in strippable_pre.items():
                                    if h.startswith(p):
                                        ident = h[len(p):]
                                        tl[t] = ident
                                        found = True
                                        prefix_map[t] = p
                                        break
                                if not found:
                                    nm.append(el)
                    except:
                        pass
                    if len(nm) < len(scm):
                        if len(nm) > 1:
                            otu[SKOS_CLOSE_MATCH] = nm
                        elif len(nm) == 1:
                            otu[SKOS_CLOSE_MATCH] = nm[0]
                        else:
                            del otu[SKOS_CLOSE_MATCH]
            # _LOG.debug('tl =' + str(tl))
            for k, t in moveable2taxon_link.items():
                al = otu.get(k)
                if al:
                    tl[t] = al
                    del otu[k]
            if tl:
                otu['^ot:taxonLink'] = tl
    for trees in nexml['treesById'].values():
        for tag in to_del:
            if tag in trees:
                del trees[tag]
        _simplify_all_meta_by_id_del(trees)
        for tree in trees['treeById'].values():
            for tag in to_del:
                if tag in tree:
                    del tree[tag]
            _simplify_all_meta_by_id_del(tree)
            tt = tree.get('@xsi:type', 'nex:FloatTree')
            if tt.lower() == 'nex:inttree':
                e_len_coerce = int
            else:
                e_len_coerce = float
            for edge_d in tree['edgeBySourceId'].values():
                for edge in edge_d.values():
                    try:
                        x = e_len_coerce(edge['@length'])
                        edge['@length'] = x
                    except:
                        pass
            for node in tree['nodeById'].values():
                nl = node.get('@label')
                if nl:
                    no = node.get('@otu')
                    if no and _otu2label[no] == nl:
                        del node['@label']

    if prefix_map:
        nexml['^ot:taxonLinkPrefixes'] = prefix_map
    if merge_blocks:
        from peyotl.manip import merge_otus_and_trees
        merge_otus_and_trees(raw)
    if nexson_syntax_version != BY_ID_HONEY_BADGERFISH:
        convert_nexson_format(raw,
                              nexson_syntax_version,
                              current_format=BY_ID_HONEY_BADGERFISH,
                              sort_arbitrary=sort_arbitrary)
    elif sort_arbitrary:
        sort_arbitrarily_ordered_nexson(raw)
    return raw
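Action 6 (edge-length coercion) can also be shown in isolation. The tree dict below is invented and only mimics the relevant slice of the by-id NexSON structure: lengths arrive as strings and are coerced to int for nex:IntTree and to float otherwise, with unparseable values left untouched.

tree = {
    '@xsi:type': 'nex:FloatTree',
    'edgeBySourceId': {'node1': {'edge1': {'@length': '0.37'},
                                 'edge2': {'@length': 'n/a'}}},
}
e_len_coerce = int if tree.get('@xsi:type', 'nex:FloatTree').lower() == 'nex:inttree' else float
for edge_d in tree['edgeBySourceId'].values():
    for edge in edge_d.values():
        try:
            edge['@length'] = e_len_coerce(edge['@length'])
        except ValueError:
            pass  # keep the original string if it cannot be parsed
print(tree['edgeBySourceId']['node1'])
# {'edge1': {'@length': 0.37}, 'edge2': {'@length': 'n/a'}}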
Example #6
    def __init__(self, obj, errors=None, **kwargs):
        if errors is None:
            errors = []
        try:
            # Python 2.x
            string_types = (str, unicode)
        except NameError:
            # Python 3
            string_types = (str, )
        self.required_toplevel_elements = {
            # N.B. anyjson might parse a text element as str or unicode,
            # depending on its value. Either is fine here.
            'curator': dict,
            'date_created': string_types,
            'taxa': list,
            'user_agent': string_types,
        }
        self.optional_toplevel_elements = {
            'id': string_types,  # not present in initial request
            'study_id': string_types,
            'new_ottids_required': int,  # provided by some agents
        }
        # track unknown keys in top-level object
        uk = None
        for k in obj.keys():
            if (k not in self.required_toplevel_elements.keys()
                    and k not in self.optional_toplevel_elements.keys()):
                if uk is None:
                    uk = []
                uk.append(k)
        if uk:
            uk.sort()
            # self._warn_event(_NEXEL.TOP_LEVEL,
            #                  obj=obj,
            #                  err_type=gen_UnrecognizedKeyWarning,
            #                  anc=_EMPTY_TUPLE,
            #                  obj_nex_id=None,
            #                  key_list=uk)
            errors.append(
                "Found these unexpected top-level properties: {k}".format(
                    k=uk))

        # test for existence and types of all required elements
        for el_key, el_type in self.required_toplevel_elements.items():
            test_el = obj.get(el_key, None)
            try:
                assert test_el is not None
            except:
                errors.append("Property '{p}' not found!".format(p=el_key))
            try:
                assert isinstance(test_el, el_type)
            except:
                errors.append(
                    "Property '{p}' should be one of these: {t}".format(
                        p=el_key, t=el_type))

        # test a non-empty id against our expected pattern
        self._id = obj.get('id')
        if self._id and isinstance(self._id, string_types):
            try:
                from peyotl.amendments import AMENDMENT_ID_PATTERN
                assert bool(AMENDMENT_ID_PATTERN.match(self._id))
            except:
                errors.append(
                    "The top-level amendment 'id' provided is not valid")

        # test a non-empty curator for expected 'login' and 'name' fields
        self._curator = obj.get('curator')
        if isinstance(self._curator, dict):
            for k in self._curator.keys():
                try:
                    assert k in [
                        'login',
                        'name',
                        'email',
                    ]
                except:
                    errors.append(
                        "Unexpected key '{k}' found in curator".format(k=k))
            if 'login' in self._curator:
                try:
                    assert isinstance(self._curator.get('name'), string_types)
                except:
                    errors.append("Curator 'name' should be a string")
            if 'name' in self._curator:
                try:
                    assert isinstance(self._curator.get('login'), string_types)
                except:
                    errors.append("Curator 'login' should be a string")
            if 'email' in self._curator:
                try:
                    assert isinstance(self._curator.get('email'), string_types)
                except:
                    # TODO: Attempt to validate as an email address?
                    errors.append(
                        "Curator 'email' should be a string (a valid email address)"
                    )

        # test for a valid date_created (should be valid ISO 8601)
        self._date_created = obj.get('date_created')
        import dateutil.parser
        try:
            dateutil.parser.parse(self._date_created)
        except:
            errors.append("Property 'date_created' is not a valid ISO date")

        # test for a valid study_id (if it's not an empty string)
        self._study_id = obj.get('study_id')
        if self._study_id and isinstance(self._study_id, string_types):
            from peyotl.phylesystem import STUDY_ID_PATTERN
            try:
                assert bool(STUDY_ID_PATTERN.match(self._study_id))
            except:
                errors.append("The 'study_id' provided is not valid")

        # test taxa for required properties, valid types+values, etc.
        self._taxa = obj.get('taxa')
        if isinstance(self._taxa, list):
            # N.B. required property cannot be empty!
            self.required_toplevel_taxon_elements = {
                'name': string_types,
                'name_derivation': string_types,  # from controlled vocabulary
                'sources': list,
            }
            self.optional_toplevel_taxon_elements = {
                'comment': string_types,
                'rank': string_types,  # can be 'no rank'
                'original_label': string_types,
                'adjusted_label': string_types,
                'parent': int,  # the parent taxon's OTT id
                'parent_tag': string_types,
                'tag': object,  # can be anything (int, string, ...)
                'ott_id': int  # if already assigned
            }

            # N.B. we should reject any unknown keys (not listed above)!
            uk = None
            for taxon in self._taxa:
                for k in taxon.keys():
                    if (k not in self.required_toplevel_taxon_elements.keys()
                            and k not in
                            self.optional_toplevel_taxon_elements.keys()):
                        if uk is None:
                            uk = []
                        uk.append(k)

                for el_key, el_type in self.required_toplevel_taxon_elements.items(
                ):
                    test_el = taxon.get(el_key, None)
                    try:
                        assert test_el is not None
                    except:
                        errors.append(
                            "Required taxon property '{p}' not found!".format(
                                p=el_key))
                    try:
                        assert isinstance(test_el, el_type)
                    except:
                        errors.append(
                            "Taxon property '{p}' should be one of these: {t}".
                            format(p=el_key, t=el_type))

                # TODO: name_derivation should be one of a limited set of values

                # any optional properties found should also be of the required type(s)
                for el_key, el_type in self.optional_toplevel_taxon_elements.items(
                ):
                    if el_key in taxon:
                        test_el = taxon.get(el_key, None)
                        try:
                            assert isinstance(test_el, el_type)
                        except:
                            errors.append(
                                "Taxon property '{p}' should be one of these: {t}"
                                .format(p=el_key, t=el_type))

                # each taxon must have either 'parent' or 'parent_tag'!
                try:
                    assert ('parent' in taxon) or ('parent_tag' in taxon)
                except:
                    errors.append(
                        "Taxon has neither 'parent' nor 'parent_tag'!")

                # we need at least one source with type and (sometimes) non-empty value
                self.source_types_requiring_value = [
                    'Link to online taxonomy',
                    'Link (DOI) to publication',
                    'Other',
                ]
                self.source_types_not_requiring_value = [
                    'The taxon is described in this study',
                ]
                self.source_types_requiring_URL = [
                    'Link to online taxonomy',
                    'Link (DOI) to publication',
                ]
                valid_source_found = False
                if len(taxon.get('sources')) > 0:
                    for s in taxon.get('sources'):
                        s_type = s.get('source_type', None)
                        try:
                            assert (s_type in self.source_types_requiring_value
                                    or s_type
                                    in self.source_types_not_requiring_value)
                            if s_type in self.source_types_requiring_value:
                                try:
                                    # the 'source' (value) field should be a non-empty string
                                    assert s.get('source', None)
                                    valid_source_found = True
                                except:
                                    errors.append(
                                        "Missing value for taxon source of type '{t}'!"
                                        .format(t=s_type))
                            else:
                                valid_source_found = True
                            if s_type in self.source_types_requiring_URL:
                                try:
                                    # its value should contain a URL (ie, conversion does nothing)
                                    s_val = s.get('source')
                                    assert s_val == doi2url(s_val)
                                except:
                                    errors.append(
                                        "Source '{s}' (of type '{t}') should be a URL!"
                                        .format(s=s_val, t=s_type))
                        except:
                            errors.append(
                                "Unknown taxon source type '{t}'!".format(
                                    t=s_type))

                if not valid_source_found:
                    errors.append(
                        "Taxon must have at least one valid source (none found)!"
                    )

            if uk:
                uk.sort()
                errors.append(
                    "Found these unexpected taxon properties: {k}".format(
                        k=uk))
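For reference, here is a hypothetical amendment blob that would satisfy the checks above without appending any errors, assuming doi2url returns an http URL unchanged. All values are invented; the controlled vocabularies and ID patterns live elsewhere in peyotl.

amendment = {
    'curator': {'login': 'examplecurator', 'name': 'Example Curator'},
    'date_created': '2021-06-01T12:00:00',     # must parse as an ISO 8601 date
    'user_agent': 'example-agent/0.1',
    'taxa': [
        {
            'name': 'Newtaxon exampli',
            'name_derivation': 'Original label',   # assumed vocabulary value (not checked above)
            'parent': 1000001,                     # invented OTT id for the parent taxon
            'sources': [
                {'source_type': 'Link to online taxonomy',
                 'source': 'http://example.org/taxonomy/42'},   # must already be URL-shaped
            ],
        },
    ],
}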
Example #7
    def __init__(self, obj, errors=None, **kwargs):
        if errors is None:
            errors = []
        try:
            # Python 2.x
            string_types = (str, unicode)
        except NameError:
            # Python 3
            string_types = (str,)
        self.required_toplevel_elements = {
            # N.B. anyjson might parse a text element as str or unicode,
            # depending on its value. Either is fine here.
            'curator': dict,
            'date_created': string_types,
            'taxa': list,
            'user_agent': string_types,
        }
        self.optional_toplevel_elements = {
            'id': string_types,  # not present in initial request
            'study_id': string_types,
            'new_ottids_required': int,  # provided by some agents
        }
        # track unknown keys in top-level object
        uk = None
        for k in obj.keys():
            if (k not in self.required_toplevel_elements.keys() and
                        k not in self.optional_toplevel_elements.keys()):
                if uk is None:
                    uk = []
                uk.append(k)
        if uk:
            uk.sort()
            # self._warn_event(_NEXEL.TOP_LEVEL,
            #                  obj=obj,
            #                  err_type=gen_UnrecognizedKeyWarning,
            #                  anc=_EMPTY_TUPLE,
            #                  obj_nex_id=None,
            #                  key_list=uk)
            errors.append("Found these unexpected top-level properties: {k}".format(k=uk))

        # test for existence and types of all required elements
        for el_key, el_type in self.required_toplevel_elements.items():
            test_el = obj.get(el_key, None)
            try:
                assert test_el is not None
            except:
                errors.append("Property '{p}' not found!".format(p=el_key))
            try:
                assert isinstance(test_el, el_type)
            except:
                errors.append("Property '{p}' should be one of these: {t}".format(p=el_key, t=el_type))

        # test a non-empty id against our expected pattern
        self._id = obj.get('id')
        if self._id and isinstance(self._id, string_types):
            try:
                from peyotl.amendments import AMENDMENT_ID_PATTERN
                assert bool(AMENDMENT_ID_PATTERN.match(self._id))
            except:
                errors.append("The top-level amendment 'id' provided is not valid")

        # test a non-empty curator for expected 'login' and 'name' fields
        self._curator = obj.get('curator')
        if isinstance(self._curator, dict):
            for k in self._curator.keys():
                try:
                    assert k in ['login', 'name', 'email', ]
                except:
                    errors.append("Unexpected key '{k}' found in curator".format(k=k))
            if 'login' in self._curator:
                try:
                    assert isinstance(self._curator.get('name'), string_types)
                except:
                    errors.append("Curator 'name' should be a string")
            if 'name' in self._curator:
                try:
                    assert isinstance(self._curator.get('login'), string_types)
                except:
                    errors.append("Curator 'login' should be a string")
            if 'email' in self._curator:
                try:
                    assert isinstance(self._curator.get('email'), string_types)
                except:
                    # TODO: Attempt to validate as an email address?
                    errors.append("Curator 'email' should be a string (a valid email address)")

        # test for a valid date_created (should be valid ISO 8601)
        self._date_created = obj.get('date_created')
        import dateutil.parser
        try:
            dateutil.parser.parse(self._date_created)
        except:
            errors.append("Property 'date_created' is not a valid ISO date")

        # test for a valid study_id (if it's not an empty string)
        self._study_id = obj.get('study_id')
        if self._study_id and isinstance(self._study_id, string_types):
            from peyotl.phylesystem import STUDY_ID_PATTERN
            try:
                assert bool(STUDY_ID_PATTERN.match(self._study_id))
            except:
                errors.append("The 'study_id' provided is not valid")

        # test taxa for required properties, valid types+values, etc.
        self._taxa = obj.get('taxa')
        if isinstance(self._taxa, list):
            # N.B. required property cannot be empty!
            self.required_toplevel_taxon_elements = {
                'name': string_types,
                'name_derivation': string_types,  # from controlled vocabulary
                'sources': list,
            }
            self.optional_toplevel_taxon_elements = {
                'comment': string_types,
                'rank': string_types,  # can be 'no rank'
                'original_label': string_types,
                'adjusted_label': string_types,
                'parent': int,  # the parent taxon's OTT id
                'parent_tag': string_types,
                'tag': object,  # can be anything (int, string, ...)
                'ott_id': int  # if already assigned
            }

            # N.B. we should reject any unknown keys (not listed above)!
            uk = None
            for taxon in self._taxa:
                for k in taxon.keys():
                    if (k not in self.required_toplevel_taxon_elements.keys() and
                        k not in self.optional_toplevel_taxon_elements.keys()):
                        if uk is None:
                            uk = []
                        uk.append(k)

                for el_key, el_type in self.required_toplevel_taxon_elements.items():
                    test_el = taxon.get(el_key, None)
                    try:
                        assert test_el is not None
                    except:
                        errors.append("Required taxon property '{p}' not found!".format(p=el_key))
                    try:
                        assert isinstance(test_el, el_type)
                    except:
                        errors.append("Taxon property '{p}' should be one of these: {t}".format(p=el_key, t=el_type))

                # TODO: name_derivation should be one of a limited set of values

                # any optional properties found should also be of the required type(s)
                for el_key, el_type in self.optional_toplevel_taxon_elements.items():
                    if el_key in taxon:
                        test_el = taxon.get(el_key, None)
                        try:
                            assert isinstance(test_el, el_type)
                        except:
                            errors.append(
                                "Taxon property '{p}' should be one of these: {t}".format(p=el_key, t=el_type))

                # each taxon must have either 'parent' or 'parent_tag'!
                try:
                    assert ('parent' in taxon) or ('parent_tag' in taxon)
                except:
                    errors.append("Taxon has neither 'parent' nor 'parent_tag'!")

                # we need at least one source with type and (sometimes) non-empty value
                self.source_types_requiring_value = [
                    'Link to online taxonomy',
                    'Link (DOI) to publication',
                    'Other',
                ]
                self.source_types_not_requiring_value = [
                    'The taxon is described in this study',
                ]
                self.source_types_requiring_URL = [
                    'Link to online taxonomy',
                    'Link (DOI) to publication',
                ]
                valid_source_found = False
                if len(taxon.get('sources')) > 0:
                    for s in taxon.get('sources'):
                        s_type = s.get('source_type', None)
                        try:
                            assert (s_type in self.source_types_requiring_value or
                                    s_type in self.source_types_not_requiring_value)
                            if s_type in self.source_types_requiring_value:
                                try:
                                    # the 'source' (value) field should be a non-empty string
                                    assert s.get('source', None)
                                    valid_source_found = True
                                except:
                                    errors.append("Missing value for taxon source of type '{t}'!".format(t=s_type))
                            else:
                                valid_source_found = True
                            if s_type in self.source_types_requiring_URL:
                                try:
                                    # its value should contain a URL (ie, conversion does nothing)
                                    s_val = s.get('source')
                                    assert s_val == doi2url(s_val)
                                except:
                                    errors.append("Source '{s}' (of type '{t}') should be a URL!".format(s=s_val, t=s_type))
                        except:
                            errors.append("Unknown taxon source type '{t}'!".format(t=s_type))

                if not valid_source_found:
                    errors.append("Taxon must have at least one valid source (none found)!")

            if uk:
                uk.sort()
                errors.append("Found these unexpected taxon properties: {k}".format(k=uk))