def testCanConvert(self):
    # Checks that merge_otus_and_trees turns the merge input fixture into
    # the expected (already merged) fixture.
    merged = pathmap.nexson_obj('merge/merge-input.v1.2.json')
    reference = sort_arbitrarily_ordered_nexson(
        pathmap.nexson_obj('merge/merge-expected.v1.2.json'))
    merged = sort_arbitrarily_ordered_nexson(merged)
    # Before merging, the two fixtures must differ; otherwise the final
    # comparison would pass without the merge doing anything.
    self.assertNotEqual(merged, reference)
    # The merge result is read back from `merged`; its return value is unused.
    merge_otus_and_trees(merged)
    equal_blob_check(self, '', merged, reference)
def testConvertHBF1_2toHBF1_0(self):
    # For every directory in RT_DIRS, convert the v1.2 fixture to
    # DIRECT_HONEY_BADGERFISH and compare it with the stored v1.0 fixture.
    for rt_dir in RT_DIRS:
        source, wanted = _get_pair(rt_dir, 'v1.2.json', 'v1.0.json')
        # _get_pair yields None as the first item when there is nothing
        # to test for this directory.
        if source is not None:
            converted = convert_nexson_format(source, DIRECT_HONEY_BADGERFISH)
            # Sort both blobs so that arbitrary ordering does not cause
            # spurious differences.
            sort_arbitrarily_ordered_nexson(wanted)
            sort_arbitrarily_ordered_nexson(converted)
            equal_blob_check(self, '', converted, wanted)
def testConvertHBF1_2toBF(self):
    # For every directory in RT_DIRS, convert the v1.2 fixture to
    # BADGER_FISH_NEXSON_VERSION and compare it with the stored v0.0 fixture.
    for rt_dir in RT_DIRS:
        source, wanted = _get_pair(rt_dir, 'v1.2.json', 'v0.0.json')
        # _get_pair yields None as the first item when there is nothing
        # to test for this directory.
        if source is not None:
            converted = convert_nexson_format(source, BADGER_FISH_NEXSON_VERSION)
            # Sort both blobs so that arbitrary ordering does not cause
            # spurious differences.
            sort_arbitrarily_ordered_nexson(wanted)
            sort_arbitrarily_ordered_nexson(converted)
            equal_blob_check(self, '', converted, wanted)
def _treebase_strip_tags(obj, tags):
    '''Delete every key in `tags` that is present in `obj`, then call
    _simplify_all_meta_by_id_del on `obj`.
    '''
    for tag in tags:
        if tag in obj:
            del obj[tag]
    _simplify_all_meta_by_id_del(obj)


def _treebase_move_close_matches(otu, strippable_pre, taxon_link, prefix_map):
    '''Move recognized ^skos:closeMatch hrefs of `otu` into `taxon_link`.

    `strippable_pre` maps a URL prefix to the taxonLink key to use. For each
    href that starts with a known prefix, the remainder of the href is stored
    in `taxon_link` under that key and the prefix is recorded in `prefix_map`.
    Matched entries are removed from the otu's ^skos:closeMatch.
    '''
    close_match_key = '^skos:closeMatch'
    scm = otu.get(close_match_key)
    if not scm:
        return
    if isinstance(scm, dict):
        href = scm.get('@href')
        if href:
            try:
                for prefix, link_key in strippable_pre.items():
                    if href.startswith(prefix):
                        taxon_link[link_key] = href[len(prefix):]
                        del otu[close_match_key]
                        prefix_map[link_key] = prefix
                        # The closeMatch is gone; nothing left to compare.
                        break
            except (AttributeError, TypeError):
                # best effort: an href that is not string-like is left alone
                pass
        return
    unmatched = []
    try:
        for el in scm:
            href = el.get('@href')
            # NOTE(review): elements without an @href are neither matched nor
            # kept in `unmatched`, so they are dropped when the closeMatch is
            # rewritten below. This mirrors the long-standing behavior;
            # confirm that it is intended.
            if href:
                for prefix, link_key in strippable_pre.items():
                    if href.startswith(prefix):
                        taxon_link[link_key] = href[len(prefix):]
                        prefix_map[link_key] = prefix
                        break
                else:
                    unmatched.append(el)
    except (AttributeError, TypeError):
        # best effort: stop at the first element that is not dict-like
        pass
    if len(unmatched) < len(scm):
        if len(unmatched) > 1:
            otu[close_match_key] = unmatched
        elif len(unmatched) == 1:
            # a single remaining match is stored as an object, not a list
            otu[close_match_key] = unmatched[0]
        else:
            del otu[close_match_key]


def _treebase_coerce_edge_lengths(tree):
    '''Convert each edge's @length in `tree` to int (for nex:IntTree) or to
    float (any other @xsi:type). Edges whose length is missing or cannot be
    converted are left untouched.
    '''
    tree_type = tree.get('@xsi:type', 'nex:FloatTree')
    coerce = int if tree_type.lower() == 'nex:inttree' else float
    for edges_from_source in tree['edgeBySourceId'].values():
        for edge in edges_from_source.values():
            try:
                edge['@length'] = coerce(edge['@length'])
            except (KeyError, TypeError, ValueError, OverflowError):
                pass


def get_ot_study_info_from_treebase_nexml(src=None,
                                          nexml_content=None,
                                          encoding=u'utf8',
                                          nexson_syntax_version=DEFAULT_NEXSON_VERSION,
                                          merge_blocks=True,
                                          sort_arbitrary=False):
    '''Normalize treebase-specific metadata into the locations where
    open tree of life software expects it.

    See get_ot_study_info_from_nexml for the explanation of the src,
    nexml_content, and encoding arguments. The NeXML is always read as
    BY_ID_HONEY_BADGERFISH; `nexson_syntax_version` is the syntax of the
    returned blob (a conversion is performed at the end if it differs).

    If merge_blocks is True then peyotl.manip.merge_otus_and_trees is applied
    to the blob before any conversion.
    If sort_arbitrary is True then the arbitrarily ordered items of the
    returned blob are sorted.

    Actions to "normalize" TreeBase objects to ot Nexson
        1. simplify meta items with _simplify_all_meta_by_id_del (the meta id
            for any meta item that has only a value and an id)
        2. throw away rdfs:isDefinedBy
        3. otu @label -> otu ^ot:originalLabel
        4. ^tb:identifier.taxon, ^tb:identifier.taxonVariant and some
            skos:closeMatch fields to ^ot:taxonLink (the stripped URL
            prefixes are recorded in ^ot:taxonLinkPrefixes)
        5. remove "@xml:base"
        6. coerce edge lengths to native types
        7. compose ^ot:dataDeposit from the nexml @id, and copy the
            bibliographic citation, DOI and publication year to
            ^ot:studyPublicationReference, ^ot:studyPublication and
            ^ot:studyYear
        8. otu ^skos:altLabel -> ^ot:altLabel (unless one is already set)
        9. remove a node @label that merely repeats the label of its otu

    Returns the normalized NexSON blob.
    '''
    #pylint: disable=R0915
    raw = get_ot_study_info_from_nexml(src=src,
                                       nexml_content=nexml_content,
                                       encoding=encoding,
                                       nexson_syntax_version=BY_ID_HONEY_BADGERFISH)
    nexml = raw['nexml']
    SKOS_ALT_LABEL = '^skos:altLabel'
    strippable_pre = {
        'http://www.ubio.org/authority/metadata.php?lsid=urn:lsid:ubio.org:namebank:': '@ubio',
        'http://purl.uniprot.org/taxonomy/': '@uniprot',
    }
    moveable2taxon_link = {"^tb:identifier.taxon": '@tb:identifier.taxon',
                           "^tb:identifier.taxonVariant": '@tb:identifier.taxonVariant', }
    to_del = ['^rdfs:isDefinedBy', '@xml:base']
    _treebase_strip_tags(nexml, to_del)
    _otu2label = {}
    prefix_map = {}
    # compose dataDeposit
    nexid = nexml['@id']
    tb_url = 'http://purl.org/phylo/treebase/phylows/study/TB2:' + nexid
    nexml['^ot:dataDeposit'] = {'@href': tb_url}
    # compose the publication fields
    bd = nexml.get("^dcterms:bibliographicCitation")
    if bd:
        nexml['^ot:studyPublicationReference'] = bd
    doi = nexml.get('^prism:doi')
    if doi:
        nexml['^ot:studyPublication'] = {'@href': doi}
    year = nexml.get('^prism:publicationDate')
    if year:
        try:
            nexml['^ot:studyYear'] = int(year)
        except (TypeError, ValueError):
            # a publication date that is not a plain integer year is skipped
            pass
    # normalize the otus
    for otus in nexml['otusById'].values():
        _treebase_strip_tags(otus, to_del)
        for oid, otu in otus['otuById'].items():
            _treebase_strip_tags(otu, to_del)
            label = otu['@label']
            _otu2label[oid] = label
            otu['^ot:originalLabel'] = label
            del otu['@label']
            al = otu.get(SKOS_ALT_LABEL)
            if al is not None:
                if otu.get('^ot:altLabel') is None:
                    otu['^ot:altLabel'] = al
                del otu[SKOS_ALT_LABEL]
            tl = {}
            _treebase_move_close_matches(otu, strippable_pre, tl, prefix_map)
            for k, t in moveable2taxon_link.items():
                al = otu.get(k)
                if al:
                    tl[t] = al
                    del otu[k]
            if tl:
                otu['^ot:taxonLink'] = tl
    # normalize the trees
    for trees in nexml['treesById'].values():
        _treebase_strip_tags(trees, to_del)
        for tree in trees['treeById'].values():
            _treebase_strip_tags(tree, to_del)
            _treebase_coerce_edge_lengths(tree)
            for node in tree['nodeById'].values():
                nl = node.get('@label')
                if nl:
                    no = node.get('@otu')
                    # a node label identical to its otu's label is redundant
                    if no and _otu2label[no] == nl:
                        del node['@label']
    if prefix_map:
        nexml['^ot:taxonLinkPrefixes'] = prefix_map
    if merge_blocks:
        from peyotl.manip import merge_otus_and_trees
        merge_otus_and_trees(raw)
    if nexson_syntax_version != BY_ID_HONEY_BADGERFISH:
        # Keep the converter's return value (as the other callers of
        # convert_nexson_format do) so that the converted blob is what is
        # returned even if the converter does not work purely in place.
        raw = convert_nexson_format(raw,
                                    nexson_syntax_version,
                                    current_format=BY_ID_HONEY_BADGERFISH,
                                    sort_arbitrary=sort_arbitrary)
    elif sort_arbitrary:
        sort_arbitrarily_ordered_nexson(raw)
    return raw
def _main():
    '''Command-line entry point of the NeXML/NexSON converter.

    Parses sys.argv, reads the input file (UTF-8), determines the input and
    output formats (explicitly from --mode/--export, or by looking at the
    first non-whitespace character of the input), and writes the converted
    document to --output or to standard output.

    Internally `mode` is a two-character {input}{output} code using
    x (NeXML), j (NexSON), b (BadgerFish); '*' stands for "JSON of a kind not
    yet determined" while auto-detecting.

    Exits through sys.exit with a message when the options clash, a file
    cannot be opened, or the input does not look like NeXML/NexSON.
    '''
    import argparse
    import codecs
    import json
    import os
    import sys
    _HELP_MESSAGE = '''NeXML/NexSON converter'''
    _EPILOG = '''UTF-8 encoding is used (for input and output).

Environmental variables used:
    NEXSON_INDENTATION_SETTING indentation in NexSON (default 0)
    NEXML_INDENTATION_SETTING indentation in NeXML (default is 0).
    NEXSON_LOGGING_LEVEL logging setting: NotSet, Debug, Warn, Info, Error
    NEXSON_LOGGING_FORMAT format string for logging messages.
'''
    badgerfish_version = str(BADGER_FISH_NEXSON_VERSION)
    direct_hbf_version = str(DIRECT_HONEY_BADGERFISH)
    by_id_hbf_version = str(BY_ID_HONEY_BADGERFISH)

    parser = argparse.ArgumentParser(description=_HELP_MESSAGE,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=_EPILOG)
    parser.add_argument("input", help="filepath to input")
    parser.add_argument("-o", "--output",
                        metavar="FILE",
                        required=False,
                        help="output filepath. Standard output is used if omitted.")
    parser.add_argument("-s", "--sort",
                        action="store_true",
                        default=False,
                        help="If specified, the arbitrarily ordered items will be sorted.")
    e_choices = ["nexml",
                 badgerfish_version,
                 direct_hbf_version,
                 by_id_hbf_version,
                 "0.0",
                 "1.0",
                 "1.2",
                 "badgerfish"]
    e_choices.sort()
    # argparse re-wraps argument help, so the pieces only need to be
    # separated by single spaces.
    e_help = ('output format. Valid choices are: "{c}". '
              'With "0.0" and "badgerfish" as aliases for "0.0.0", and '
              '"1.2" being an alias for the most recent version of honeybadgerfish '
              '(1.2.0). The verions "1.0.0" and its alias "1.0" refer to a '
              'version that uses the honeybadgefish syntax for meta elements, '
              'but maintained the direct object-mapping from NeXML of the '
              'badgerfish form of NexSON').format(c='", "'.join(e_choices))
    parser.add_argument("-e", "--export",
                        metavar="FMT",
                        required=False,
                        choices=e_choices,
                        help=e_help)
    codes = 'xjb'
    parser.add_argument("-m", "--mode",
                        metavar="MODE",
                        required=False,
                        choices=[i + j for i in codes for j in codes],
                        help="A less precise way to specify a mapping. The "
                             "m option is a two-letter code for {input}{output} "
                             "The letters are x for NeXML, j for NexSON, "
                             "and b for BadgerFish JSON version of NexML. "
                             "The default behavior is to autodetect the format "
                             "and convert JSON to NeXML or NeXML to NexSON.")
    args = parser.parse_args()
    inpfn = args.input
    outfn = args.output
    mode = args.mode
    export_format = args.export
    # Resolve the export aliases to full version strings.
    if export_format:
        if export_format.lower() in ["badgerfish", "0.0"]:
            export_format = badgerfish_version
        elif export_format.lower() == "1.0":
            export_format = direct_hbf_version
        elif export_format.lower() == "1.2":
            export_format = by_id_hbf_version
    if export_format is not None and mode is not None:
        # The output letter of the mode must agree with the export format:
        # b -> BadgerFish, x -> NeXML, j -> one of the honeybadgerfish forms.
        out_code = mode[-1]
        if out_code == 'b':
            compatible = (export_format == badgerfish_version)
        elif out_code == 'x':
            compatible = (export_format.lower() == "nexml")
        else:
            compatible = export_format in (direct_hbf_version, by_id_hbf_version)
        if not compatible:
            sys.exit('export format {e} clashes with mode {m}. The mode option is not neeeded if the export option is used.'.format(e=export_format, m=mode))
    try:
        # No 'U' flag: it was removed from open() in Python 3.11 and had no
        # effect on the binary stream that codecs.open uses anyway.
        inp = codecs.open(inpfn, mode='r', encoding='utf-8')
    except (IOError, OSError):
        sys.exit('nexson_nexml: Could not open file "{fn}"\n'.format(fn=inpfn))
    out = None
    try:
        if mode is None:
            # Auto-detect from the first non-whitespace character.
            try:
                while True:
                    next_char = inp.read(1)
                    if not next_char:
                        # end of input without any graphical character
                        raise ValueError('Expecting input to start with <, {, or [')
                    first_graph_char = next_char.strip()
                    if not first_graph_char:
                        continue  # skip leading whitespace
                    if first_graph_char == '<':
                        mode = 'x*'
                        break
                    if first_graph_char in '{[':
                        mode = '*x'
                        break
                    raise ValueError('Expecting input to start with <, {, or [')
            except (ValueError, IOError, OSError):
                # ValueError also covers UnicodeDecodeError from the reader.
                # The literal "{" must be doubled for str.format.
                sys.exit('nexson_nexml: First character of "{fn}" was not <, {{, or [\nInput does not appear to be NeXML or NexSON\n'.format(fn=inpfn))
            if export_format is None:
                if mode.endswith('*'):
                    export_format = direct_hbf_version
                else:
                    export_format = "nexml"
            inp.seek(0)
        elif export_format is None:
            if mode.endswith('j'):
                export_format = direct_hbf_version
            elif mode.endswith('b'):
                export_format = badgerfish_version
            else:
                assert mode.endswith('x')
                export_format = "nexml"

        if export_format == "nexml":
            indentation = int(os.environ.get('NEXML_INDENTATION_SETTING', 0))
        else:
            indentation = int(os.environ.get('NEXSON_INDENTATION_SETTING', 0))

        if outfn is not None:
            try:
                out = codecs.open(outfn, mode='w', encoding='utf-8')
            except (IOError, OSError):
                sys.exit('nexson_nexml: Could not open output filepath "{fn}"\n'.format(fn=outfn))
        else:
            # The codecs writer emits bytes; on Python 3 those must go to
            # the underlying binary buffer of sys.stdout.
            out = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))

        if mode.startswith('x'):
            blob = get_ot_study_info_from_nexml(inp,
                                                nexson_syntax_version=export_format)
        else:
            blob = json.load(inp)
            if mode.startswith('*'):
                try:
                    n = get_nexml_el(blob)
                except Exception:  # any failure means "not a NexSON document"
                    n = None
                if not n or (not isinstance(n, dict)):
                    sys.exit('No top level "nex:nexml" element found. Document does not appear to be a JSON version of NeXML\n')
                mode = 'j' + mode[1]
        if args.sort:
            sort_arbitrarily_ordered_nexson(blob)
        if export_format == "nexml":
            if indentation > 0:
                indent = ' '*indentation
            else:
                indent = ''
            newline = '\n'
            write_obj_as_nexml(blob,
                               out,
                               addindent=indent,
                               newl=newline)
        else:
            if not mode.startswith('x'):
                # NOTE(review): sorting is requested here regardless of
                # --sort; kept as is, confirm whether that is intended.
                blob = convert_nexson_format(blob, export_format, sort_arbitrary=True)
            write_as_json(blob, out, indent=indentation)
    finally:
        inp.close()
        if out is not None:
            if outfn is not None:
                out.close()
            else:
                out.flush()