Exemplo n.º 1
0
def iter_otus(nexson, nexson_version=None):
    """Generate every otu found in every otus group of `nexson`.

    Each yielded item is a 3-tuple:
        otus group ID,
        otu ID,
        the otu obj

    `nexson_version` is detected from `nexson` when it is None. If the
    version is not the by-id HoneyBadgerFish form, `nexson` is converted
    to that form in place before iteration.

    Groups are visited in the order given by '^ot:otusElementOrder'; when
    that list has fewer entries than there are groups, the sorted group IDs
    are used instead.
    """
    version = nexson_version
    if version is None:
        version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(version):
        convert_nexson_format(
            nexson, BY_ID_HONEY_BADGERFISH)  # TODO shouldn't modify...
    nexml_el = get_nexml_el(nexson)
    groups = nexml_el['otusById']
    ordered_group_ids = nexml_el.get('^ot:otusElementOrder', [])
    if len(ordered_group_ids) < len(groups):
        # The recorded order does not cover every group: use sorted IDs.
        ordered_group_ids = sorted(groups.keys())
    for group_id in ordered_group_ids:
        otu_map = groups[group_id]['otuById']
        # Snapshot the keys so the set of IDs visited is fixed up front.
        for otu_key in list(otu_map.keys()):
            yield group_id, otu_key, otu_map[otu_key]
Exemplo n.º 2
0
 def testConvertBFtoHBF1_0(self):
     # Convert each 'v0.0.json' input to DIRECT_HONEY_BADGERFISH and compare
     # the result with the matching 'v1.0.json' blob.
     for test_dir in RT_DIRS:
         source_blob, expected = _get_pair(test_dir, 'v0.0.json', 'v1.0.json')
         if source_blob is not None:
             converted = convert_nexson_format(source_blob, DIRECT_HONEY_BADGERFISH)
             equal_blob_check(self, '', converted, expected)
Exemplo n.º 3
0
 def __init__(self, filepath='', nexson=None):
     # Wrap a NexSON blob, taken from `nexson` when given, otherwise read
     # as JSON from `filepath`. Raises ValueError when neither is provided.
     self.filepath = filepath
     if nexson is not None:
         self._nexson = nexson
     elif filepath:
         self._nexson = read_as_json(self.filepath)
     else:
         raise ValueError('Either a filepath or nexson argument must be provided')
     # The proxy works on the by-id HoneyBadgerFish form; convert if needed.
     if detect_nexson_version(self._nexson) != BY_ID_HONEY_BADGERFISH:
         _LOG.debug('NexsonProxy converting to hbf1.2')
         convert_nexson_format(self._nexson, BY_ID_HONEY_BADGERFISH)
     self._nexml_el = get_nexml_el(self._nexson)
     # Both caches start out empty.
     self._otu_cache = {}
     self._tree_cache = {}
     self._wr = None
Exemplo n.º 4
0
 def testConvertHBF1_0toHBF1_2(self):
     # Convert each 'v1.0.json' input to BY_ID_HONEY_BADGERFISH and compare
     # the result with the matching 'v1.2.json' blob.
     for test_dir in RT_DIRS:
         source_blob, expected = _get_pair(test_dir, 'v1.0.json', 'v1.2.json')
         if source_blob is not None:
             converted = convert_nexson_format(source_blob, BY_ID_HONEY_BADGERFISH)
             equal_blob_check(self, '', converted, expected)
Exemplo n.º 5
0
 def testConvertBFtoHBF1_0(self):
     # For every round-trip directory, the 'v0.0.json' input converted to
     # DIRECT_HONEY_BADGERFISH must equal the 'v1.0.json' blob.
     for rt_dir in RT_DIRS:
         pair = _get_pair(rt_dir, 'v0.0.json', 'v1.0.json')
         input_blob, expected_blob = pair
         if input_blob is None:
             # No input object for this directory; nothing to check.
             continue
         result = convert_nexson_format(input_blob, DIRECT_HONEY_BADGERFISH)
         equal_blob_check(self, '', result, expected_blob)
Exemplo n.º 6
0
def validate_and_convert_nexson(nexson, output_version, allow_invalid):
    '''Runs the nexson validator and then converts the nexson.

    `nexson` is the nexson dict.
    `output_version` is the version of nexson syntax to be used after validation.
    if `allow_invalid` is False, and the nexson validation has errors, then
        a GitWorkflowError will be generated before conversion.

    Returns a 4-tuple:
        nexson, annotation, validation_log, nexson_adaptor
    where `nexson` is the value returned by convert_nexson_format and the
    other three items are the results of ot_validate.

    Raises GitWorkflowError if validation (or the trace-file writing done
    around it) raises an exception, or if the validation log reports an
    error while `allow_invalid` is False.
    '''
    try:
        if TRACE_FILES:
            _write_to_next_free('input', nexson)
        annotation, validation_log, nexson_adaptor = ot_validate(nexson)
        if TRACE_FILES:
            _write_to_next_free('annotation', annotation)
    except Exception:
        # Only ordinary errors are wrapped. KeyboardInterrupt and SystemExit
        # derive from BaseException and must propagate unchanged rather than
        # being reported as a validation failure.
        msg = 'exception in ot_validate: ' + traceback.format_exc()
        raise GitWorkflowError(msg)
    if (not allow_invalid) and validation_log.has_error():
        raise GitWorkflowError('ot_validation failed: ' +
                               json.dumps(annotation))
    nexson = convert_nexson_format(nexson, output_version)
    if TRACE_FILES:
        _write_to_next_free('converted', nexson)
    return nexson, annotation, validation_log, nexson_adaptor
Exemplo n.º 7
0
 def testConvertHBF1_0toHBF1_2(self):
     # For every round-trip directory, the 'v1.0.json' input converted to
     # BY_ID_HONEY_BADGERFISH must equal the 'v1.2.json' blob.
     for rt_dir in RT_DIRS:
         pair = _get_pair(rt_dir, 'v1.0.json', 'v1.2.json')
         input_blob, expected_blob = pair
         if input_blob is None:
             # No input object for this directory; nothing to check.
             continue
         result = convert_nexson_format(input_blob, BY_ID_HONEY_BADGERFISH)
         equal_blob_check(self, '', result, expected_blob)
Exemplo n.º 8
0
 def testConvertHBF1_2toHBF1_0(self):
     # Convert each 'v1.2.json' input to DIRECT_HONEY_BADGERFISH and compare
     # it with the 'v1.0.json' blob. Both blobs are passed through
     # sort_arbitrarily_ordered_nexson before the comparison.
     for test_dir in RT_DIRS:
         source_blob, expected = _get_pair(test_dir, 'v1.2.json', 'v1.0.json')
         if source_blob is not None:
             converted = convert_nexson_format(source_blob, DIRECT_HONEY_BADGERFISH)
             for blob in (expected, converted):
                 sort_arbitrarily_ordered_nexson(blob)
             equal_blob_check(self, '', converted, expected)
Exemplo n.º 9
0
 def testConvertHBF1_2toBF(self):
     # Convert each 'v1.2.json' input to BADGER_FISH_NEXSON_VERSION and
     # compare it with the 'v0.0.json' blob. Both blobs are passed through
     # sort_arbitrarily_ordered_nexson before the comparison.
     for test_dir in RT_DIRS:
         source_blob, expected = _get_pair(test_dir, 'v1.2.json', 'v0.0.json')
         if source_blob is not None:
             converted = convert_nexson_format(source_blob, BADGER_FISH_NEXSON_VERSION)
             for blob in (expected, converted):
                 sort_arbitrarily_ordered_nexson(blob)
             equal_blob_check(self, '', converted, expected)
Exemplo n.º 10
0
 def testConvertHBF1_0toBF(self):
     # Convert each 'v1.0.json' input to BADGER_FISH_NEXSON_VERSION and
     # compare it with the 'v0.0.json' blob. Both blobs are passed through
     # sort_meta_elements before the comparison.
     for test_dir in RT_DIRS:
         source_blob, expected = _get_pair(test_dir, 'v1.0.json', 'v0.0.json')
         if source_blob is not None:
             converted = convert_nexson_format(source_blob, BADGER_FISH_NEXSON_VERSION)
             for blob in (expected, converted):
                 sort_meta_elements(blob)
             equal_blob_check(self, '', converted, expected)
Exemplo n.º 11
0
 def testConvertHBF1_2toHBF1_0(self):
     # For every round-trip directory, the 'v1.2.json' input converted to
     # DIRECT_HONEY_BADGERFISH must equal the 'v1.0.json' blob once both
     # have been passed through sort_arbitrarily_ordered_nexson.
     for rt_dir in RT_DIRS:
         pair = _get_pair(rt_dir, 'v1.2.json', 'v1.0.json')
         input_blob, expected_blob = pair
         if input_blob is None:
             # No input object for this directory; nothing to check.
             continue
         result = convert_nexson_format(input_blob, DIRECT_HONEY_BADGERFISH)
         sort_arbitrarily_ordered_nexson(expected_blob)
         sort_arbitrarily_ordered_nexson(result)
         equal_blob_check(self, '', result, expected_blob)
Exemplo n.º 12
0
 def testConvertHBF1_0toBF(self):
     # For every round-trip directory, the 'v1.0.json' input converted to
     # BADGER_FISH_NEXSON_VERSION must equal the 'v0.0.json' blob once both
     # have been passed through sort_meta_elements.
     for rt_dir in RT_DIRS:
         pair = _get_pair(rt_dir, 'v1.0.json', 'v0.0.json')
         input_blob, expected_blob = pair
         if input_blob is None:
             # No input object for this directory; nothing to check.
             continue
         result = convert_nexson_format(input_blob, BADGER_FISH_NEXSON_VERSION)
         sort_meta_elements(expected_blob)
         sort_meta_elements(result)
         equal_blob_check(self, '', result, expected_blob)
Exemplo n.º 13
0
 def testConvertHBF1_2toBF(self):
     # For every round-trip directory, the 'v1.2.json' input converted to
     # BADGER_FISH_NEXSON_VERSION must equal the 'v0.0.json' blob once both
     # have been passed through sort_arbitrarily_ordered_nexson.
     for rt_dir in RT_DIRS:
         pair = _get_pair(rt_dir, 'v1.2.json', 'v0.0.json')
         input_blob, expected_blob = pair
         if input_blob is None:
             # No input object for this directory; nothing to check.
             continue
         result = convert_nexson_format(input_blob, BADGER_FISH_NEXSON_VERSION)
         sort_arbitrarily_ordered_nexson(expected_blob)
         sort_arbitrarily_ordered_nexson(result)
         equal_blob_check(self, '', result, expected_blob)
Exemplo n.º 14
0
def iter_otus(nexson, nexson_version=None):
    '''generator over all otus in all otus group elements.
    yields a tuple of 3 items:
        otus group ID,
        otu ID,
        the otu obj

    `nexson_version` is detected from `nexson` when it is None. If the
    version is not the by-id HoneyBadgerFish form, `nexson` is converted
    to that form in place before iteration.

    Groups are visited in the order given by '^ot:otusElementOrder'; when
    that list has fewer entries than there are groups, the sorted group IDs
    are used instead.
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(nexson_version):
        # Convert the argument itself; the previous code named an undefined
        # variable here and raised NameError for every non-by-id input.
        convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)  # TODO shouldn't modify...
    # Fetch the nexml element only after conversion so that the by-id keys
    # read below are looked up on the converted structure.
    nex = get_nexml_el(nexson)
    otus_group_by_id = nex['otusById']
    group_order = nex.get('^ot:otusElementOrder', [])
    if len(group_order) < len(otus_group_by_id):
        # The recorded order does not cover every group: use sorted IDs.
        group_order = sorted(otus_group_by_id.keys())
    for otus_group_id in group_order:
        otu_by_id = otus_group_by_id[otus_group_id]['otuById']
        # Snapshot the keys so the set of IDs visited is fixed up front.
        for otu_id in list(otu_by_id.keys()):
            yield otus_group_id, otu_id, otu_by_id[otu_id]
Exemplo n.º 15
0
def validate_and_convert_nexson(nexson, output_version, allow_invalid, **kwargs):
    '''Runs the nexson validator and then converts the nexson.

    `nexson` is the nexson dict.
    `output_version` is the version of nexson syntax to be used after validation.
    if `allow_invalid` is False, and the nexson validation has errors, then
        a GitWorkflowError will be generated before conversion.
    `kwargs` are passed through unchanged to ot_validate.

    Returns a 4-tuple:
        nexson, annotation, validation_log, nexson_adaptor
    where `nexson` is the value returned by convert_nexson_format and the
    other three items are the results of ot_validate.

    Raises GitWorkflowError if validation (or the trace-file writing done
    around it) raises an exception, or if the validation log reports an
    error while `allow_invalid` is False.
    '''
    try:
        if TRACE_FILES:
            _write_to_next_free('input', nexson)
        annotation, validation_log, nexson_adaptor = ot_validate(nexson, **kwargs)
        if TRACE_FILES:
            _write_to_next_free('annotation', annotation)
    except Exception:
        # Only ordinary errors are wrapped. KeyboardInterrupt and SystemExit
        # derive from BaseException and must propagate unchanged rather than
        # being reported as a validation failure.
        msg = 'exception in ot_validate: ' + traceback.format_exc()
        raise GitWorkflowError(msg)
    if (not allow_invalid) and validation_log.has_error():
        raise GitWorkflowError('ot_validation failed: ' + json.dumps(annotation))
    nexson = convert_nexson_format(nexson, output_version)
    if TRACE_FILES:
        _write_to_next_free('converted', nexson)
    return nexson, annotation, validation_log, nexson_adaptor
Exemplo n.º 16
0
def workaround_phylografter_nexson(obj):
    '''Patch up `obj` in three steps: move the otu @label properties,
    convert to the by-id HoneyBadgerFish syntax (with arbitrarily ordered
    items sorted), then add defaults.

    The conversion step also coerces `obj` to the by-id NexSON form.
    '''
    target_format = BY_ID_HONEY_BADGERFISH
    # Order matters: the label move runs before conversion, defaults after.
    _move_otu_at_label_properties(obj)
    convert_nexson_format(obj, target_format, sort_arbitrary=True)
    _add_defaults(obj)
Exemplo n.º 17
0
def merge_otus_and_trees(nexson_blob):
    '''Takes a nexson object:
        1. merges trees elements 2 - # trees into the first trees element.,
        2. merges otus elements 2 - # otus into the first otus element.
        3. if there is no ot:originalLabel field for any otu,
            it sets that field based on @label and deletes @label
        4. merges an otu elements using the rule:
              A. treat (ottId, originalLabel) as a key
              B. If otu objects in subsequent trees match originalLabel and
                have a matching or absent ot:ottId, then they are merged into
                the same OTUs (however see C)
              C. No two leaves of a tree may share an otu (though otu should
                be shared across different trees). It is important that
                each leaf node be mapped to a distinct OTU. Otherwise there
                will be no way of separating them during OTU mapping. we
                do this indirectly by assuring to no two otu objects in the
                same otus object get merged with each other (or to a common
                object)

        5. correct object references to deleted entities.

    This function is used to patch up NexSONs created by multiple imports, hence the
    substitution of '@label' for 'ot:originalLabel'. Ids are arbitrary for imports from
    non-nexml tools, so matching is done based on names. This should mimic the behavior
    of the analysis tools that produced the trees (for most/all such tools unique names
    constitute unique OTUs).

    The blob is converted to the by-id HoneyBadgerFish form for the merge and
    converted back to its original version at the end. Returns `nexson_blob`.
    '''
    id_to_replace_id = {}
    orig_version = detect_nexson_version(nexson_blob)
    convert_nexson_format(nexson_blob, BY_ID_HONEY_BADGERFISH)
    nexson = get_nexml_el(nexson_blob)
    otus_group_order = nexson.get('^ot:otusElementOrder', [])
    # Stays None when there are no otus groups, so the trees section below
    # can tell that there is no retained otus group to point '@otus' at.
    retained_ogi = None
    # (ott, orig) -> list of (otu id, otu element) pairs
    retained_mapped2otu = {}
    # orig -> list of (otu id, otu element) pairs
    retained_orig2otu = {}
    # For the first (entirely retained) group of otus:
    #   1. assure that originalLabel is filled in
    #   2. register the otu in retained_mapped2otu and retained_orig2otu
    # otu elements that have no label, originalLabel or ottId will not
    #   be registered, so they'll never be matched.
    if otus_group_order:
        otus_group_by_id = nexson['otusById']
        retained_ogi = otus_group_order[0]
        retained_og = otus_group_by_id[retained_ogi]
        retained_og_otu = retained_og.setdefault('otuById', {})
        label_to_original_label_otu_by_id(retained_og_otu)
        for oid, otu in retained_og_otu.items():
            ottid = otu.get('^ot:ottId')
            orig = otu.get('^ot:originalLabel')
            key = (ottid, orig)
            if key != (None, None):
                t = (oid, otu)
                retained_mapped2otu.setdefault(key, []).append(t)
                if orig is not None:
                    retained_orig2otu.setdefault(orig, []).append(t)
        # For each of the other otus elements, we:
        #   1. assure that originalLabel is filled in
        #   2. decide (for each otu) whether it will
        #       be added to retained_og or merged with
        #       an otu already in retained_og. In the
        #       case of the latter, we record the mapping
        #       old otu id -> retained otu id in id_to_replace_id
        for ogi in otus_group_order[1:]:
            og = otus_group_by_id[ogi]
            del otus_group_by_id[ogi]
            otu_by_id = og.get('otuById', {})
            label_to_original_label_otu_by_id(otu_by_id)
            # Retained otu ids already matched by an otu of *this* group;
            # each may be matched at most once per group (rule C above).
            # NOTE(review): otus of this group that are newly added to the
            # retained group are not put in used_matches, so a later otu of
            # the same group with the same key can still merge into them --
            # confirm whether that is intended under rule C.
            used_matches = set()
            id_to_replace_id[ogi] = retained_ogi
            for oid, otu in otu_by_id.items():
                ottid = otu.get('^ot:ottId')
                orig = otu.get('^ot:originalLabel')
                key = (ottid, orig)
                if key == (None, None):
                    # Nothing to match on: keep the otu as-is. It must go in
                    # the group's 'otuById' map; storing it on the group
                    # object itself would leave a stray non-otu key there.
                    retained_og_otu[oid] = otu
                else:
                    match_otu = None
                    # First preference: exact (ottId, originalLabel) match.
                    mlist = retained_mapped2otu.get(key)
                    if mlist is not None:
                        for m in mlist:
                            if m[0] not in used_matches:
                                match_otu = m
                                break
                    # Fallback: match on originalLabel alone.
                    if match_otu is None:
                        mlist = retained_orig2otu.get(orig, [])
                        for m in mlist:
                            if m[0] not in used_matches:
                                match_otu = m
                                break
                    if match_otu is not None:
                        id_to_replace_id[oid] = match_otu[0]
                        used_matches.add(match_otu[0])
                        _merge_otu_do_not_fix_references(otu, match_otu[1])
                    else:
                        assert oid not in retained_og_otu
                        retained_og_otu[oid] = otu
                        t = (oid, otu)
                        retained_mapped2otu.setdefault(key, []).append(t)
                        if orig is not None:
                            retained_orig2otu.setdefault(orig, []).append(t)
        nexson['^ot:otusElementOrder'] = [retained_ogi]
    # Move all of the tree elements to the first trees group.
    trees_group_order = nexson.get('^ot:treesElementOrder', [])
    if trees_group_order:
        trees_group_by_id = nexson['treesById']
        retained_tgi = trees_group_order[0]
        retained_tg = trees_group_by_id[retained_tgi]
        if retained_ogi is not None:
            # Only re-point '@otus' when an otus group was actually retained;
            # with no otus groups there is nothing to refer to.
            retained_tg['@otus'] = retained_ogi
        # setdefault (not get) so that trees merged into a retained group
        # that had no 'treeById' are stored on the group, not discarded.
        retained_tg_tree_obj = retained_tg.setdefault('treeById', {})
        for tgi in trees_group_order[1:]:
            tg = trees_group_by_id[tgi]
            del trees_group_by_id[tgi]
            id_to_replace_id[tgi] = retained_tgi
            retained_tg['^ot:treeElementOrder'].extend(
                tg['^ot:treeElementOrder'])
            for tid, tree_obj in tg.get('treeById', {}).items():
                retained_tg_tree_obj[tid] = tree_obj
        # Re-point every node whose otu was merged away.
        for tree_obj in retained_tg_tree_obj.values():
            for node in tree_obj.get('nodeById', {}).values():
                o = node.get('@otu')
                if o is not None:
                    r = id_to_replace_id.get(o)
                    if r is not None:
                        node['@otu'] = r
        nexson['^ot:treesElementOrder'] = [retained_tgi]

    replace_entity_references_in_meta_and_annotations(nexson, id_to_replace_id)
    convert_nexson_format(nexson_blob, orig_version)
    return nexson_blob
Exemplo n.º 18
0
def get_ot_study_info_from_treebase_nexml(src=None,
                                          nexml_content=None,
                                          encoding=u'utf8',
                                          nexson_syntax_version=DEFAULT_NEXSON_VERSION,
                                          merge_blocks=True,
                                          sort_arbitrary=False):
    '''Normalize treebase-specific metadata into the locations where
    open tree of life software expects it.

    See get_ot_study_info_from_nexml for the explanation of the src,
    nexml_content, encoding, and nexson_syntax_version arguments.
    If merge_blocks is True then peyotl.manip.merge_otus_and_trees is
    called on the result before any final conversion.
    If sort_arbitrary is True, the arbitrarily ordered items of the result
    are sorted (during the final conversion, or by
    sort_arbitrarily_ordered_nexson when no conversion is needed).

    The input is always read in the by-id HoneyBadgerFish syntax, modified
    in that form, and converted to nexson_syntax_version at the end only if
    that differs. Returns the resulting nexson blob.

    Actions to "normalize" TreeBase objects to ot Nexson
        1. simplify meta items via _simplify_all_meta_by_id_del (presumably
            dropping the meta id for any meta item that has only a value
            and an id -- confirm against that helper)
        2. throw away rdfs:isDefinedBy
        3. otu @label -> otu ^ot:originalLabel
        4. ^tb:identifier.taxon, ^tb:identifier.taxonVariant and some skos:closeMatch
            fields to ^ot:taxonLink
        5. remove "@xml:base"
        6. coerce edge lengths to native types
    In addition: ^ot:dataDeposit is composed from the study @id, publication
    fields are copied from the dcterms/prism metadata when present, and node
    @label values that merely repeat the label of the node's otu are removed.
    '''
    #pylint: disable=R0915
    raw = get_ot_study_info_from_nexml(src=src,
                                       nexml_content=nexml_content,
                                       encoding=encoding,
                                       nexson_syntax_version=BY_ID_HONEY_BADGERFISH)
    nexml = raw['nexml']
    SKOS_ALT_LABEL = '^skos:altLabel'
    SKOS_CLOSE_MATCH = '^skos:closeMatch'
    # URL prefix -> key used inside ^ot:taxonLink for the stripped identifier
    strippable_pre = {
        'http://www.ubio.org/authority/metadata.php?lsid=urn:lsid:ubio.org:namebank:': '@ubio',
        'http://purl.uniprot.org/taxonomy/': '@uniprot',
    }
    # otu property -> key it is moved to inside ^ot:taxonLink
    moveable2taxon_link = {"^tb:identifier.taxon": '@tb:identifier.taxon',
                           "^tb:identifier.taxonVariant": '@tb:identifier.taxonVariant', }
    # properties removed from the nexml element and from every otus, otu,
    # trees and tree element
    to_del = ['^rdfs:isDefinedBy', '@xml:base']
    for tag in to_del:
        if tag in nexml:
            del nexml[tag]
    _simplify_all_meta_by_id_del(nexml)
    # otu id -> original @label; used below to drop redundant node labels
    _otu2label = {}
    # taxonLink key -> URL prefix, only for the prefixes actually encountered
    prefix_map = {}
    # compose dataDeposit
    nexid = nexml['@id']
    tb_url = 'http://purl.org/phylo/treebase/phylows/study/TB2:' + nexid
    nexml['^ot:dataDeposit'] = {'@href': tb_url}
    # copy publication metadata (citation, DOI, year) into the ot: fields
    bd = nexml.get("^dcterms:bibliographicCitation")
    if bd:
        nexml['^ot:studyPublicationReference'] = bd
    doi = nexml.get('^prism:doi')
    if doi:
        nexml['^ot:studyPublication'] = {'@href': doi}
    year = nexml.get('^prism:publicationDate')
    if year:
        # best effort: a publication date that int() rejects is ignored
        try:
            nexml['^ot:studyYear'] = int(year)
        except:
            pass
    #
    for otus in nexml['otusById'].values():
        for tag in to_del:
            if tag in otus:
                del otus[tag]
        _simplify_all_meta_by_id_del(otus)
        for oid, otu in otus['otuById'].items():
            for tag in to_del:
                if tag in otu:
                    del otu[tag]
            _simplify_all_meta_by_id_del(otu)
            # every otu is required to have an @label here (KeyError otherwise)
            label = otu['@label']
            _otu2label[oid] = label
            otu['^ot:originalLabel'] = label
            del otu['@label']
            # skos:altLabel is moved to ^ot:altLabel unless one is already set;
            # the skos property is removed either way
            al = otu.get(SKOS_ALT_LABEL)
            if al is not None:
                if otu.get('^ot:altLabel') is None:
                    otu['^ot:altLabel'] = al
                del otu[SKOS_ALT_LABEL]
            # taxonLink contents collected for this otu
            tl = {}
            scm = otu.get(SKOS_CLOSE_MATCH)
            #_LOG.debug('scm = ' + str(scm))
            if scm:
                if isinstance(scm, dict):
                    # single closeMatch: move it to taxonLink if its href
                    # starts with a known prefix
                    h = scm.get('@href')
                    if h:
                        try:
                            for p, t in strippable_pre.items():
                                if h.startswith(p):
                                    ident = h[len(p):]
                                    tl[t] = ident
                                    del otu[SKOS_CLOSE_MATCH]
                                    prefix_map[t] = p
                        except:
                            pass
                else:
                    # sequence of closeMatch elements: nm collects the ones
                    # that were not moved to taxonLink
                    nm = []
                    try:
                        for el in scm:
                            h = el.get('@href')
                            if h:
                                found = False
                                for p, t in strippable_pre.items():
                                    if h.startswith(p):
                                        ident = h[len(p):]
                                        tl[t] = ident
                                        found = True
                                        prefix_map[t] = p
                                        break
                                if not found:
                                    nm.append(el)
                    except:
                        pass
                    # rewrite the property only if something was not kept:
                    # a list for several leftovers, a bare element for one,
                    # and no property at all for none
                    if len(nm) < len(scm):
                        if len(nm) > 1:
                            otu[SKOS_CLOSE_MATCH] = nm
                        elif len(nm) == 1:
                            otu[SKOS_CLOSE_MATCH] = nm[0]
                        else:
                            del otu[SKOS_CLOSE_MATCH]
            #_LOG.debug('tl =' + str(tl))
            # move the treebase identifier properties into taxonLink
            for k, t in moveable2taxon_link.items():
                al = otu.get(k)
                if al:
                    tl[t] = al
                    del otu[k]
            if tl:
                otu['^ot:taxonLink'] = tl
    for trees in nexml['treesById'].values():
        for tag in to_del:
            if tag in trees:
                del trees[tag]
        _simplify_all_meta_by_id_del(trees)
        for tree in trees['treeById'].values():
            for tag in to_del:
                if tag in tree:
                    del tree[tag]
            _simplify_all_meta_by_id_del(tree)
            # edge lengths become int for nex:IntTree (case-insensitive match)
            # and float for every other tree type, including a missing type
            tt = tree.get('@xsi:type', 'nex:FloatTree')
            if tt.lower() == 'nex:inttree':
                e_len_coerce = int
            else:
                e_len_coerce = float
            for edge_d in tree['edgeBySourceId'].values():
                for edge in edge_d.values():
                    # best effort: a missing or unparsable @length is left alone
                    try:
                        x = e_len_coerce(edge['@length'])
                        edge['@length'] = x
                    except:
                        pass
            for node in tree['nodeById'].values():
                nl = node.get('@label')
                if nl:
                    # drop a node label that just repeats its otu's label;
                    # the node's @otu must be a known otu id (KeyError otherwise)
                    no = node.get('@otu')
                    if no and _otu2label[no] == nl:
                        del node['@label']

    if prefix_map:
        nexml['^ot:taxonLinkPrefixes'] = prefix_map
    if merge_blocks:
        # function-scope import of the merge helper
        from peyotl.manip import merge_otus_and_trees
        merge_otus_and_trees(raw)
    if nexson_syntax_version != BY_ID_HONEY_BADGERFISH:
        convert_nexson_format(raw,
                              nexson_syntax_version,
                              current_format=BY_ID_HONEY_BADGERFISH,
                              sort_arbitrary=sort_arbitrary)
    elif sort_arbitrary:
        sort_arbitrarily_ordered_nexson(raw)
    return raw
Exemplo n.º 19
0
            out = codecs.open(outfn, mode='w', encoding='utf-8')
        except:
            sys.exit('validate_ot_nexson: Could not open output filepath "{fn}"\n'.format(fn=outfn))
    else:
        out = codecs.getwriter('utf-8')(sys.stdout)
    try:
        nexson = read_as_json(inp_filepath)
    except ValueError as vx:
        _LOG.error('Not valid JSON.')
        if args.verbose:
            raise vx
        else:
            sys.exit(1)
    except Exception as nx:
        _LOG.error(nx.value)
        sys.exit(1)
    convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)
    trees = extract_tree_nexson(nexson, tree_id=args.tree_id)
    if len(trees) == 0:
        trees = extract_tree_nexson(nexson, tree_id=None)
        if trees:
            v = '", "'.join([i[0] for i in trees])
            sys.exit('Tree ID {i} not found. Valid IDs for this file are "{l}"\n'.format(i=args.tree_id, l=v))
        else:
            sys.exit('This NexSON has not trees.\n')
    ott = OTT()
    for tree_id, tree, otus in trees:
        tree_proxy = NexsonTreeProxy(tree=tree, tree_id=tree_id, otus=otus)
        evaluate_tree_rooting(nexson, ott, tree_proxy)

Exemplo n.º 20
0
def merge_otus_and_trees(nexson_blob):
    '''Takes a nexson object:
        1. merges trees elements 2 - # trees into the first trees element.,
        2. merges otus elements 2 - # otus into the first otus element.
        3. if there is no ot:originalLabel field for any otu,
            it sets that field based on @label and deletes @label
        4. merges an otu elements using the rule:
              A. treat (ottId, originalLabel) as a key
              B. If otu objects in subsequent trees match originalLabel and
                have a matching or absent ot:ottId, then they are merged into
                the same OTUs (however see C)
              C. No two leaves of a tree may share an otu (though otu should
                be shared across different trees). It is important that
                each leaf node be mapped to a distinct OTU. Otherwise there
                will be no way of separating them during OTU mapping. we
                do this indirectly by assuring to no two otu objects in the
                same otus object get merged with each other (or to a common
                object)

        5. correct object references to deleted entities.

    This function is used to patch up NexSONs created by multiple imports, hence the
    substitution of '@label' for 'ot:originalLabel'. Ids are arbitrary for imports from
    non-nexml tools, so matching is done based on names. This should mimic the behavior
    of the analysis tools that produced the trees (for most/all such tools unique names
    constitute unique OTUs).

    The blob is converted to the by-id HoneyBadgerFish form for the merge and
    converted back to its original version at the end. Returns `nexson_blob`.
    '''
    id_to_replace_id = {}
    orig_version = detect_nexson_version(nexson_blob)
    convert_nexson_format(nexson_blob, BY_ID_HONEY_BADGERFISH)
    nexson = get_nexml_el(nexson_blob)
    otus_group_order = nexson.get('^ot:otusElementOrder', [])
    # Stays None when there are no otus groups, so the trees section below
    # can tell that there is no retained otus group to point '@otus' at.
    retained_ogi = None
    # (ott, orig) -> list of (otu id, otu element) pairs
    retained_mapped2otu = {}
    # orig -> list of (otu id, otu element) pairs
    retained_orig2otu = {}
    # For the first (entirely retained) group of otus:
    #   1. assure that originalLabel is filled in
    #   2. register the otu in retained_mapped2otu and retained_orig2otu
    # otu elements that have no label, originalLabel or ottId will not
    #   be registered, so they'll never be matched.
    if otus_group_order:
        otus_group_by_id = nexson['otusById']
        retained_ogi = otus_group_order[0]
        retained_og = otus_group_by_id[retained_ogi]
        retained_og_otu = retained_og.setdefault('otuById', {})
        label_to_original_label_otu_by_id(retained_og_otu)
        for oid, otu in retained_og_otu.items():
            ottid = otu.get('^ot:ottId')
            orig = otu.get('^ot:originalLabel')
            key = (ottid, orig)
            if key != (None, None):
                t = (oid, otu)
                retained_mapped2otu.setdefault(key, []).append(t)
                if orig is not None:
                    retained_orig2otu.setdefault(orig, []).append(t)
        # For each of the other otus elements, we:
        #   1. assure that originalLabel is filled in
        #   2. decide (for each otu) whether it will
        #       be added to retained_og or merged with
        #       an otu already in retained_og. In the
        #       case of the latter, we record the mapping
        #       old otu id -> retained otu id in id_to_replace_id
        for ogi in otus_group_order[1:]:
            og = otus_group_by_id[ogi]
            del otus_group_by_id[ogi]
            otu_by_id = og.get('otuById', {})
            label_to_original_label_otu_by_id(otu_by_id)
            # Retained otu ids already matched by an otu of *this* group;
            # each may be matched at most once per group (rule C above).
            # NOTE(review): otus of this group that are newly added to the
            # retained group are not put in used_matches, so a later otu of
            # the same group with the same key can still merge into them --
            # confirm whether that is intended under rule C.
            used_matches = set()
            id_to_replace_id[ogi] = retained_ogi
            for oid, otu in otu_by_id.items():
                ottid = otu.get('^ot:ottId')
                orig = otu.get('^ot:originalLabel')
                key = (ottid, orig)
                if key == (None, None):
                    # Nothing to match on: keep the otu as-is. It must go in
                    # the group's 'otuById' map; storing it on the group
                    # object itself would leave a stray non-otu key there.
                    retained_og_otu[oid] = otu
                else:
                    match_otu = None
                    # First preference: exact (ottId, originalLabel) match.
                    mlist = retained_mapped2otu.get(key)
                    if mlist is not None:
                        for m in mlist:
                            if m[0] not in used_matches:
                                match_otu = m
                                break
                    # Fallback: match on originalLabel alone.
                    if match_otu is None:
                        mlist = retained_orig2otu.get(orig, [])
                        for m in mlist:
                            if m[0] not in used_matches:
                                match_otu = m
                                break
                    if match_otu is not None:
                        id_to_replace_id[oid] = match_otu[0]
                        used_matches.add(match_otu[0])
                        _merge_otu_do_not_fix_references(otu, match_otu[1])
                    else:
                        assert oid not in retained_og_otu
                        retained_og_otu[oid] = otu
                        t = (oid, otu)
                        retained_mapped2otu.setdefault(key, []).append(t)
                        if orig is not None:
                            retained_orig2otu.setdefault(orig, []).append(t)
        nexson['^ot:otusElementOrder'] = [retained_ogi]
    # Move all of the tree elements to the first trees group.
    trees_group_order = nexson.get('^ot:treesElementOrder', [])
    if trees_group_order:
        trees_group_by_id = nexson['treesById']
        retained_tgi = trees_group_order[0]
        retained_tg = trees_group_by_id[retained_tgi]
        if retained_ogi is not None:
            # Only re-point '@otus' when an otus group was actually retained;
            # with no otus groups there is nothing to refer to.
            retained_tg['@otus'] = retained_ogi
        # setdefault (not get) so that trees merged into a retained group
        # that had no 'treeById' are stored on the group, not discarded.
        retained_tg_tree_obj = retained_tg.setdefault('treeById', {})
        for tgi in trees_group_order[1:]:
            tg = trees_group_by_id[tgi]
            del trees_group_by_id[tgi]
            id_to_replace_id[tgi] = retained_tgi
            retained_tg['^ot:treeElementOrder'].extend(tg['^ot:treeElementOrder'])
            for tid, tree_obj in tg.get('treeById', {}).items():
                retained_tg_tree_obj[tid] = tree_obj
        # Re-point every node whose otu was merged away.
        for tree_obj in retained_tg_tree_obj.values():
            for node in tree_obj.get('nodeById', {}).values():
                o = node.get('@otu')
                if o is not None:
                    r = id_to_replace_id.get(o)
                    if r is not None:
                        node['@otu'] = r
        nexson['^ot:treesElementOrder'] = [retained_tgi]

    replace_entity_references_in_meta_and_annotations(nexson, id_to_replace_id)
    convert_nexson_format(nexson_blob, orig_version)
    return nexson_blob
Exemplo n.º 21
0
def _main():
    """Command-line entry point: convert a file between NeXML and NexSON.

    Parses ``sys.argv`` with argparse, reads the input file as UTF-8, and
    writes the converted document to ``--output`` (or standard output when
    that option is omitted).

    The output format is chosen from ``--export`` if given, otherwise from
    the second letter of ``--mode``, otherwise by sniffing the first
    non-whitespace character of the input: ``<`` is treated as NeXML (and
    exported as NexSON), ``{`` or ``[`` as JSON (and exported as NeXML).

    Environment variables NEXML_INDENTATION_SETTING and
    NEXSON_INDENTATION_SETTING supply the indentation width (default 0).

    Exits through ``sys.exit`` with a message when the options clash, a file
    cannot be opened, or the input is not recognisable.
    """
    import sys
    import codecs
    import json
    import os
    import argparse
    _HELP_MESSAGE = '''NeXML/NexSON converter'''
    _EPILOG = '''UTF-8 encoding is used (for input and output).

Environmental variables used:
    NEXSON_INDENTATION_SETTING indentation in NexSON (default 0)
    NEXML_INDENTATION_SETTING indentation in NeXML (default is 0).
    NEXSON_LOGGING_LEVEL logging setting: NotSet, Debug, Warn, Info, Error
    NEXSON_LOGGING_FORMAT format string for logging messages.
'''
    parser = argparse.ArgumentParser(description=_HELP_MESSAGE,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=_EPILOG)
    parser.add_argument("input", help="filepath to input")
    parser.add_argument("-o", "--output",
                        metavar="FILE",
                        required=False,
                        help="output filepath. Standard output is used if omitted.")
    parser.add_argument("-s", "--sort",
                        action="store_true",
                        default=False,
                        help="If specified, the arbitrarily ordered items will be sorted.")
    e_choices = ["nexml",
                 str(BADGER_FISH_NEXSON_VERSION),
                 str(DIRECT_HONEY_BADGERFISH),
                 str(BY_ID_HONEY_BADGERFISH),
                 "0.0",
                 "1.0",
                 "1.2",
                 "badgerfish"]
    e_choices.sort()
    e_help = 'output format. Valid choices are: "{c}". \
With "0.0" and "badgerfish" as aliases for "0.0.0", and \
"1.2" being an alias for the most recent version of honeybadgerfish \
(1.2.0). The verions "1.0.0" and its alias "1.0" refer to a \
version that uses the honeybadgefish syntax for meta elements, \
but maintained the direct object-mapping from NeXML of the \
badgerfish form of NexSON'.format(c='", "'.join(e_choices))
    parser.add_argument("-e", "--export",
                        metavar="FMT",
                        required=False,
                        choices=e_choices,
                        help=e_help)
    codes = 'xjb'
    parser.add_argument("-m", "--mode",
                        metavar="MODE",
                        required=False,
                        choices=[i + j for i in codes for j in codes],
                        help="A less precise way to specify a mapping. The \
                               m option is a two-letter code for {input}{output} \
                               The letters are x for NeXML, j for NexSON, \
                               and b for BadgerFish JSON version of NexML. \
                               The default behavior is to autodetect the format \
                               and convert JSON to NeXML or NeXML to NexSON.")
    args = parser.parse_args()
    inpfn = args.input
    outfn = args.output
    mode = args.mode
    export_format = args.export
    # Expand the short aliases to the full version strings.
    if export_format:
        if export_format.lower() in ["badgerfish", "0.0"]:
            export_format = str(BADGER_FISH_NEXSON_VERSION)
        elif export_format.lower() == "1.0":
            export_format = str(DIRECT_HONEY_BADGERFISH)
        elif export_format.lower() == "1.2":
            export_format = str(BY_ID_HONEY_BADGERFISH)
    if export_format is not None and mode is not None:
        # The output letter of the mode must agree with the export format:
        # b -> badgerfish, x -> nexml, j -> one of the honeybadgerfish forms.
        # (The 'j' test previously re-tested 'x', which made every "*x" mode
        # clash with "-e nexml".)
        lowered = export_format.lower()
        nexson_formats = [str(DIRECT_HONEY_BADGERFISH), str(BY_ID_HONEY_BADGERFISH)]
        if (mode.endswith('b') and (export_format != str(BADGER_FISH_NEXSON_VERSION))) \
           or (mode.endswith('x') and (lowered != "nexml")) \
           or (mode.endswith('j') and (lowered not in nexson_formats)):
            sys.exit('export format {e} clashes with mode {m}. The mode option is not neeeded if the export option is used.'.format(e=export_format, m=mode))
    try:
        # Plain 'r': the 'U' flag was removed in Python 3.11, and codecs.open
        # always opens the underlying file in binary mode anyway.
        inp = codecs.open(inpfn, mode='r', encoding='utf-8')
    except (IOError, OSError):
        sys.exit('nexson_nexml: Could not open file "{fn}"\n'.format(fn=inpfn))
    out = None
    try:
        if mode is None:
            # Sniff the first non-whitespace character to guess the input type.
            # '*' is a placeholder for the side of the mapping not yet known.
            try:
                while True:
                    raw_char = inp.read(1)
                    if not raw_char:
                        # EOF before any graphical character was seen.
                        raise ValueError('Expecting input to start with <, {, or [')
                    first_graph_char = raw_char.strip()
                    if not first_graph_char:
                        # Whitespace: keep scanning. ('' in '{[' is True, so
                        # this must be checked before the membership test.)
                        continue
                    if first_graph_char == '<':
                        mode = 'x*'
                        break
                    if first_graph_char in '{[':
                        mode = '*x'
                        break
                    raise ValueError('Expecting input to start with <, {, or [')
            except (ValueError, IOError, OSError):
                # The literal '{' is doubled so that str.format does not treat
                # it as the start of a replacement field.
                sys.exit('nexson_nexml: First character of "{fn}" was not <, {{, or [\nInput does not appear to be NeXML or NexSON\n'.format(fn=inpfn))
            if export_format is None:
                if mode.endswith('*'):
                    export_format = str(DIRECT_HONEY_BADGERFISH)
                else:
                    export_format = "nexml"
            inp.seek(0)
        elif export_format is None:
            if mode.endswith('j'):
                export_format = str(DIRECT_HONEY_BADGERFISH)
            elif mode.endswith('b'):
                export_format = str(BADGER_FISH_NEXSON_VERSION)
            else:
                assert mode.endswith('x')
                export_format = "nexml"

        if export_format == "nexml":
            indentation = int(os.environ.get('NEXML_INDENTATION_SETTING', 0))
        else:
            indentation = int(os.environ.get('NEXSON_INDENTATION_SETTING', 0))

        if outfn is not None:
            try:
                out = codecs.open(outfn, mode='w', encoding='utf-8')
            except (IOError, OSError):
                sys.exit('nexson_nexml: Could not open output filepath "{fn}"\n'.format(fn=outfn))
        else:
            # On Python 3 the encoded bytes must go to the binary buffer that
            # underlies sys.stdout; Python 2's sys.stdout has no such attribute.
            out = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))

        if mode.startswith('x'):
            blob = get_ot_study_info_from_nexml(inp,
                                                nexson_syntax_version=export_format)
        else:
            blob = json.load(inp)
            if mode.startswith('*'):
                # Auto-detected JSON: confirm that it holds a nexml element.
                try:
                    n = get_nexml_el(blob)
                except Exception:
                    n = None
                if not n or (not isinstance(n, dict)):
                    sys.exit('No top level "nex:nexml" element found. Document does not appear to be a JSON version of NeXML\n')
                mode = 'j' + mode[1]
        if args.sort:
            sort_arbitrarily_ordered_nexson(blob)
        if export_format == "nexml":
            if indentation > 0:
                indent = ' '*indentation
            else:
                indent = ''
            newline = '\n'
            write_obj_as_nexml(blob,
                               out,
                               addindent=indent,
                               newl=newline)
        else:
            # Blobs parsed from NeXML were already built in export_format.
            if not mode.startswith('x'):
                blob = convert_nexson_format(blob, export_format, sort_arbitrary=True)
            write_as_json(blob, out, indent=indentation)
    finally:
        inp.close()
        # Only close a stream this function opened; never close stdout.
        if out is not None and outfn is not None:
            out.close()