Exemplo n.º 1
0
def convert_nexson_format(blob,
                          out_nexson_format,
                          current_format=None,
                          remove_old_structs=True,
                          pristine_if_invalid=False,
                          sort_arbitrary=False):
    '''Convert the dict form of a NexSON `blob` to `out_nexson_format`.

    `out_nexson_format` is normalized with `resolve_nexson_format`.
    `current_format` is detected from `blob` when it is not supplied.
    When the blob is already in the requested format no conversion is done.

    `remove_old_structs` and `pristine_if_invalid` are handed to the
        converter through its ConversionConfig. Per the original notes:
        with `remove_old_structs` False a conversion between
        honeybadgerfish varieties leaves a "fat" blob holding both kinds
        of lookup structures, and with `pristine_if_invalid` False an
        invalid input may be left partially converted if an exception
        is raised (in exchange for a faster translation).
    If `sort_arbitrary` is True, `sort_arbitrarily_ordered_nexson` is
        applied to the result before it is returned.

    Returns the converted blob.
    Raises NotImplementedError for an unsupported pair of formats.
    '''
    if not current_format:
        current_format = detect_nexson_version(blob)
    out_nexson_format = resolve_nexson_format(out_nexson_format)

    if current_format != out_nexson_format:
        badgerfish_to_by_id = (_is_by_id_hbf(out_nexson_format)
                               and _is_badgerfish_version(current_format))
        by_id_to_badgerfish = (_is_by_id_hbf(current_format)
                               and _is_badgerfish_version(out_nexson_format))
        if badgerfish_to_by_id or by_id_to_badgerfish:
            # No direct converter exists between badgerfish and the by-id
            # form, so hop through the direct honeybadgerfish form first.
            blob = convert_nexson_format(
                blob,
                DIRECT_HONEY_BADGERFISH,
                current_format=current_format,
                remove_old_structs=remove_old_structs,
                pristine_if_invalid=pristine_if_invalid)
            current_format = DIRECT_HONEY_BADGERFISH

        settings = ConversionConfig({
            'output_format': out_nexson_format,
            'input_format': current_format,
            'remove_old_structs': remove_old_structs,
            'pristine_if_invalid': pristine_if_invalid,
        })

        # Choose the converter class for this (input, output) pair.
        if _is_badgerfish_version(current_format):
            converter_cls = Badgerfish2DirectNexson
        elif _is_badgerfish_version(out_nexson_format):
            assert _is_direct_hbf(current_format)
            converter_cls = Direct2BadgerfishNexson
        elif _is_direct_hbf(current_format) and (out_nexson_format == BY_ID_HONEY_BADGERFISH):
            converter_cls = Direct2OptimalNexson
        elif _is_direct_hbf(out_nexson_format) and (current_format == BY_ID_HONEY_BADGERFISH):
            converter_cls = Optimal2DirectNexson
        else:
            raise NotImplementedError('Conversion from {i} to {o}'.format(i=current_format, o=out_nexson_format))
        blob = converter_cls(settings).convert(blob)

    if sort_arbitrary:
        sort_arbitrarily_ordered_nexson(blob)
    return blob
Exemplo n.º 2
0
def convert_nexson_format(blob,
                          out_nexson_format,
                          current_format=None,
                          remove_old_structs=True,
                          pristine_if_invalid=False,
                          sort_arbitrary=False):
    '''Translate a NexSON dict into the data structures of another
    NexSON syntax version and return it.

    The source format is `current_format` when given, otherwise it is
    inferred with `detect_nexson_version`. The target format is
    `out_nexson_format` after `resolve_nexson_format`.

    `remove_old_structs` and `pristine_if_invalid` are forwarded to the
        converter configuration (the original notes say that leaving
        old structs in place yields a "fat" blob with both lookup
        structures, and that a non-pristine conversion is faster but may
        leave an invalid blob partially converted on error).
    `sort_arbitrary` requests a final `sort_arbitrarily_ordered_nexson`
        pass on the returned blob.

    Raises NotImplementedError if no converter handles the format pair.
    '''
    source_format = current_format or detect_nexson_version(blob)
    target_format = resolve_nexson_format(out_nexson_format)

    # Nothing to translate: optionally sort and hand the blob back.
    if source_format == target_format:
        if sort_arbitrary:
            sort_arbitrarily_ordered_nexson(blob)
        return blob

    to_by_id_from_bf = _is_by_id_hbf(target_format) and _is_badgerfish_version(source_format)
    to_bf_from_by_id = _is_by_id_hbf(source_format) and _is_badgerfish_version(target_format)
    if to_by_id_from_bf or to_bf_from_by_id:
        # Two-step conversion: first reach the direct honeybadgerfish
        # form, then continue below from there to the real target.
        blob = convert_nexson_format(blob,
                                     DIRECT_HONEY_BADGERFISH,
                                     current_format=source_format,
                                     remove_old_structs=remove_old_structs,
                                     pristine_if_invalid=pristine_if_invalid)
        source_format = DIRECT_HONEY_BADGERFISH

    config_values = {'output_format': target_format,
                     'input_format': source_format,
                     'remove_old_structs': remove_old_structs,
                     'pristine_if_invalid': pristine_if_invalid}
    config = ConversionConfig(config_values)

    if _is_badgerfish_version(source_format):
        translator = Badgerfish2DirectNexson(config)
    elif _is_badgerfish_version(target_format):
        assert _is_direct_hbf(source_format)
        translator = Direct2BadgerfishNexson(config)
    elif _is_direct_hbf(source_format) and (target_format == BY_ID_HONEY_BADGERFISH):
        translator = Direct2OptimalNexson(config)
    elif _is_direct_hbf(target_format) and (source_format == BY_ID_HONEY_BADGERFISH):
        translator = Optimal2DirectNexson(config)
    else:
        message = 'Conversion from {i} to {o}'.format(i=source_format, o=target_format)
        raise NotImplementedError(message)

    converted = translator.convert(blob)
    if sort_arbitrary:
        sort_arbitrarily_ordered_nexson(converted)
    return converted
Exemplo n.º 3
0
def get_ot_study_info_from_nexml(src=None,
                                 nexml_content=None,
                                 encoding=u'utf8',
                                 nexson_syntax_version=DEFAULT_NEXSON_VERSION):
    '''Parse NeXML and return it as a NexSON dict in `nexson_syntax_version`.

    Input selection:
        * `nexml_content`, when not None, is parsed as the NeXML document.
        * otherwise `src` is used. A string `src` starting with http://
          or https:// is fetched with peyotl.utility.download; any other
          string is opened as a file path using `encoding`; a non-string
          `src` is treated as a file object and read.

    The XML is converted with Nexml2Nexson. When a by-id syntax version
    is requested the conversion targets DIRECT_HONEY_BADGERFISH first and
    the result is then converted to BY_ID_HONEY_BADGERFISH. A top-level
    'nex:nexml' key in the result is renamed to 'nexml'.

    See https://github.com/OpenTreeOfLife/api.opentreeoflife.org/wiki/HoneyBadgerFish
    NOTE(review): the original notes say the conversion drops
        nexml/characters (with a TODO to leave a URI pointing at the
        removed data); that happens inside the converter, not here.
    '''
    # The XML converter is never asked for the by-id form directly.
    nsv = DIRECT_HONEY_BADGERFISH if _is_by_id_hbf(nexson_syntax_version) else nexson_syntax_version

    if nexml_content is None:
        if not is_str_type(src):
            nexml_content = src.read().encode('utf-8')
        elif src.startswith(('http://', 'https://')):
            from peyotl.utility import download
            nexml_content = download(url=src, encoding=encoding)
        else:
            with codecs.open(src, 'r', encoding=encoding) as nexml_file:
                nexml_content = nexml_file.read().encode('utf-8')

    doc_root = xml.dom.minidom.parseString(nexml_content).documentElement

    ccfg = ConversionConfig(output_format=nsv,
                            input_format=NEXML_NEXSON_VERSION)
    study = Nexml2Nexson(ccfg).convert(doc_root)
    if _is_by_id_hbf(nexson_syntax_version):
        study = convert_nexson_format(study,
                                      BY_ID_HONEY_BADGERFISH,
                                      current_format=nsv)
    if 'nex:nexml' in study:
        # Re-key the namespaced root element under the plain name.
        root_el = study['nex:nexml']
        del study['nex:nexml']
        study['nexml'] = root_el
    return study
Exemplo n.º 4
0
def get_ot_study_info_from_nexml(src=None,
                                 nexml_content=None,
                                 encoding=u'utf8',
                                 nexson_syntax_version=DEFAULT_NEXSON_VERSION):
    '''Convert a NeXML document into a NexSON dictionary.

    `nexml_content` takes precedence: when it is provided it is parsed
    directly. When it is None the document comes from `src`, which may be
        * a file object (its `read()` result is used), or
        * a string: a URL beginning with http:// or https:// is
          downloaded via peyotl.utility.download, anything else is opened
          as a file path with the given `encoding`.

    The returned dictionary follows `nexson_syntax_version`; by-id
    versions are produced by converting to DIRECT_HONEY_BADGERFISH and
    then on to BY_ID_HONEY_BADGERFISH. The root key 'nex:nexml', if
    present, is replaced by 'nexml'.

    See https://github.com/OpenTreeOfLife/api.opentreeoflife.org/wiki/HoneyBadgerFish
    NOTE(review): per the original documentation nexml/characters is
        removed during conversion; confirm against Nexml2Nexson.
    '''
    if _is_by_id_hbf(nexson_syntax_version):
        intermediate_version = DIRECT_HONEY_BADGERFISH
    else:
        intermediate_version = nexson_syntax_version

    if nexml_content is None:
        src_is_string = is_str_type(src)
        if src_is_string and (src.startswith('http://') or src.startswith('https://')):
            from peyotl.utility import download
            nexml_content = download(url=src, encoding=encoding)
        elif src_is_string:
            with codecs.open(src, 'r', encoding=encoding) as handle:
                nexml_content = handle.read().encode('utf-8')
        else:
            nexml_content = src.read().encode('utf-8')

    document = xml.dom.minidom.parseString(nexml_content)
    root_element = document.documentElement

    config = ConversionConfig(output_format=intermediate_version, input_format=NEXML_NEXSON_VERSION)
    result = Nexml2Nexson(config).convert(root_element)
    if _is_by_id_hbf(nexson_syntax_version):
        result = convert_nexson_format(result, BY_ID_HONEY_BADGERFISH, current_format=intermediate_version)

    if 'nex:nexml' not in result:
        return result
    # Move the namespaced root under the un-prefixed key.
    nexml_value = result['nex:nexml']
    del result['nex:nexml']
    result['nexml'] = nexml_value
    return result
Exemplo n.º 5
0
def extract_tree_nexson(nexson, tree_id, curr_version=None):
    '''Returns a list of (id, tree, otus_group) tuples for the
    specified tree_id (all trees if tree_id is None or empty).

    `nexson` is converted to the by-id HoneyBadgerFish form first when
    `curr_version` (detected from `nexson` if None) is not already by-id.
    For each matching tree the tuple holds the tree id, the tree object
    and the 'otuById' mapping of the otus group named by the tree
    group's '@otus' attribute.

    When a specific `tree_id` is requested the list has at most one
    element: the search stops at the first trees group containing it.
    '''
    if curr_version is None:
        curr_version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(curr_version):
        nexson = convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)

    nexml_el = get_nexml_el(nexson)
    tree_groups = nexml_el['treesById']
    # One flag drives both the lookup and the early return. Previously the
    # lookup used truthiness while the early return used `is not None`, so
    # an empty-string tree_id iterated all trees but returned only the first.
    single_tree = bool(tree_id)
    tree_obj_otus_group_list = []
    for tree_group in tree_groups.values():
        trees_by_id = tree_group['treeById']
        if single_tree:
            tree_list = [(tree_id, trees_by_id.get(tree_id))]
        else:
            tree_list = trees_by_id.items()
        for tid, tree in tree_list:
            if tree is None:
                continue
            otu_group = nexml_el['otusById'][tree_group['@otus']]['otuById']
            tree_obj_otus_group_list.append((tid, tree, otu_group))
            if single_tree:
                return tree_obj_otus_group_list
    return tree_obj_otus_group_list
Exemplo n.º 6
0
def strip_to_meta_only(blob, nexson_version):
    '''Strip the otu and tree content out of `blob` in place, keeping
    the surrounding group structure.

    `nexson_version` is detected from `blob` when it is None.

    By-id form: 'otuById' is deleted from every otus group and every
        entry of each trees group's 'treeById' is set to None.
    Other forms: 'otu' is deleted from every otus group and each trees
        group's 'tree' is replaced by a list of {'id': <tree @id>} stubs.
        A trees group with no 'tree' entry is left untouched.

    Returns None.
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(blob)
    nex = get_nexml_el(blob)
    if _is_by_id_hbf(nexson_version):
        for otus_group in nex.get('otusById', {}).values():
            if 'otuById' in otus_group:
                del otus_group['otuById']
        for trees_group in nex.get('treesById', {}).values():
            tree_group = trees_group['treeById']
            # Snapshot the keys so the dict is not iterated while assigned.
            for k in list(tree_group):
                tree_group[k] = None
    else:
        otus = nex['otus']
        if not isinstance(otus, list):
            otus = [otus]
        for otus_group in otus:
            if 'otu' in otus_group:
                del otus_group['otu']
        trees = nex['trees']
        if not isinstance(trees, list):
            trees = [trees]
        for trees_group in trees:
            tree_list = trees_group.get('tree')
            if tree_list is None:
                # Previously a missing 'tree' became [None] and crashed
                # with AttributeError on None.get; nothing to strip here.
                continue
            if not isinstance(tree_list, list):
                tree_list = [tree_list]
            trees_group['tree'] = [{'id': i.get('@id')} for i in tree_list]
Exemplo n.º 7
0
def extract_tree_nexson(nexson, tree_id, curr_version=None):
    '''Collect (tree id, tree, otu-by-id mapping) tuples from a NexSON.

    If `tree_id` is None (or empty) every tree in every trees group is
    returned. Otherwise only the tree with that id is returned, and the
    search ends as soon as it has been found, so the result has zero or
    one element.

    The NexSON is converted to BY_ID_HONEY_BADGERFISH when its version
    (`curr_version`, detected if None) is not a by-id version. The otu
    mapping in each tuple is the 'otuById' of the otus group referenced
    by the trees group's '@otus'.
    '''
    if curr_version is None:
        curr_version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(curr_version):
        nexson = convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)

    nexml_el = get_nexml_el(nexson)
    results = []
    # Decide once whether this is a single-tree lookup. The original mixed
    # `if tree_id:` with `if tree_id is not None:`, which made tree_id=''
    # walk all trees yet bail out after the first one.
    lookup_one = bool(tree_id)
    for group in nexml_el['treesById'].values():
        if lookup_one:
            candidates = [(tree_id, group['treeById'].get(tree_id))]
        else:
            candidates = group['treeById'].items()
        for found_id, found_tree in candidates:
            if found_tree is not None:
                all_otus_groups = nexml_el['otusById']
                otus_group_id = group['@otus']
                otu_by_id = all_otus_groups[otus_group_id]['otuById']
                results.append((found_id, found_tree, otu_by_id))
                if lookup_one:
                    return results
    return results
Exemplo n.º 8
0
def cull_nonmatching_trees(nexson, tree_id, curr_version=None):
    '''Remove every tree other than `tree_id` from `nexson` and return it.

    The NexSON is first converted to BY_ID_HONEY_BADGERFISH unless
    `curr_version` (detected when None) is already a by-id version.
    Trees groups that do not contain `tree_id` are removed entirely;
    the '^ot:treeElementOrder' and '^ot:treesElementOrder' lists are
    updated to match.

    Only trees and trees groups are dropped: references to them from
    elsewhere in the NexSON are not cleaned up, so the result may
    contain broken references.
    '''
    if curr_version is None:
        curr_version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(curr_version):
        nexson = convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)

    nexml_el = get_nexml_el(nexson)
    tree_groups = nexml_el['treesById']
    doomed_group_ids = []
    for group_id, group in tree_groups.items():
        trees = group['treeById']
        if tree_id not in trees:
            # Deletion is deferred: the dict is being iterated right now.
            doomed_group_ids.append(group_id)
            continue
        for other_id in [t for t in trees if t != tree_id]:
            group['^ot:treeElementOrder'].remove(other_id)
            del trees[other_id]
    for group_id in doomed_group_ids:
        nexml_el['^ot:treesElementOrder'].remove(group_id)
        del tree_groups[group_id]
    return nexson
Exemplo n.º 9
0
def cull_nonmatching_trees(nexson, tree_id, curr_version=None):
    '''Prune `nexson` down to the single tree `tree_id`.

    Returns the NexSON in by-id HoneyBadgerFish form (it is converted
    when `curr_version`, or the detected version, is not by-id). Within
    a trees group holding `tree_id` all sibling trees are deleted; trees
    groups without it are deleted as a whole. The element-order lists
    ('^ot:treeElementOrder', '^ot:treesElementOrder') are kept in sync.

    Note: nothing else in the NexSON is searched for references to the
    deleted trees, so dangling references may remain.
    '''
    if curr_version is None:
        curr_version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(curr_version):
        nexson = convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)

    nexml_el = get_nexml_el(nexson)
    groups_by_id = nexml_el['treesById']
    empty_group_ids = []
    for gid, trees_group in groups_by_id.items():
        tree_by_id = trees_group['treeById']
        if tree_id in tree_by_id:
            unwanted = [key for key in tree_by_id.keys() if key != tree_id]
            for unwanted_id in unwanted:
                trees_group['^ot:treeElementOrder'].remove(unwanted_id)
                del tree_by_id[unwanted_id]
        else:
            empty_group_ids.append(gid)
    # Groups are removed only after the iteration over them has finished.
    for gid in empty_group_ids:
        nexml_el['^ot:treesElementOrder'].remove(gid)
        del groups_by_id[gid]
    return nexson
Exemplo n.º 10
0
def strip_to_meta_only(blob, nexson_version):
    '''Reduce `blob` (in place) to its metadata by removing otu and
    tree payloads.

    If `nexson_version` is None it is detected from `blob`.

    For by-id NexSON each otus group loses its 'otuById' mapping and
    each value in a trees group's 'treeById' becomes None. For the other
    syntaxes each otus group loses its 'otu' entry and each trees
    group's 'tree' becomes a list of {'id': ...} stubs built from the
    trees' '@id' values; trees groups that have no 'tree' entry are
    skipped.

    Returns None.
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(blob)
    nex = get_nexml_el(blob)

    if _is_by_id_hbf(nexson_version):
        for otus_group in nex.get('otusById', {}).values():
            if 'otuById' in otus_group:
                del otus_group['otuById']
        for trees_group in nex.get('treesById', {}).values():
            tree_by_id = trees_group['treeById']
            # Keep the ids as keys but blank out the tree bodies.
            for tree_key in list(tree_by_id):
                tree_by_id[tree_key] = None
        return

    otus = nex['otus']
    if not isinstance(otus, list):
        otus = [otus]
    for otus_group in otus:
        if 'otu' in otus_group:
            del otus_group['otu']

    trees = nex['trees']
    if not isinstance(trees, list):
        trees = [trees]
    for trees_group in trees:
        tree_list = trees_group.get('tree')
        if tree_list is None:
            # Guard: wrapping None as [None] used to raise AttributeError
            # in the stub construction below.
            continue
        if not isinstance(tree_list, list):
            tree_list = [tree_list]
        trees_group['tree'] = [{'id': tree.get('@id')} for tree in tree_list]
Exemplo n.º 11
0
def iter_otu(nexson, nexson_version=None):
    '''Generator yielding (otu_id, otu) pairs for every otu in every
    otus group of a by-id NexSON.

    `nexson_version` is detected from `nexson` when None. Because this
    is a generator, the NotImplementedError for non-by-id input is
    raised when iteration starts, not when the function is called.
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(nexson_version):
        raise NotImplementedError('iter_otu is only supported for nexson 1.2 at this point')
    otus_groups = get_nexml_el(nexson).get('otusById', {})
    for otus_group in otus_groups.values():
        otu_by_id = otus_group.get('otuById', {})
        for otu_id in otu_by_id:
            yield otu_id, otu_by_id[otu_id]
Exemplo n.º 12
0
def iter_otu(nexson, nexson_version=None):
    '''Iterate over the otus of a by-id NexSON as (otu_id, otu) tuples.

    The version is taken from `nexson_version`, or detected from
    `nexson` if that is None. Only by-id NexSON is supported; for any
    other version NotImplementedError is raised on the first step of
    the iteration (this function is a generator).
    '''
    version = detect_nexson_version(nexson) if nexson_version is None else nexson_version
    supported = _is_by_id_hbf(version)
    if not supported:
        raise NotImplementedError('iter_otu is only supported for nexson 1.2 at this point')
    nexml_el = get_nexml_el(nexson)
    # Missing 'otusById' / 'otuById' entries simply contribute no otus.
    for group in nexml_el.get('otusById', {}).values():
        for id_and_otu in group.get('otuById', {}).items():
            key, value = id_and_otu
            yield key, value
Exemplo n.º 13
0
def create_validation_adaptor(obj, logger, **kwargs):
    '''Return the validation adaptor matching the NexSON version of `obj`.

    `obj`, `logger` and any extra keyword arguments are passed straight
    to the adaptor constructor:
        * by-id HoneyBadgerFish -> ByIdHBFValidationAdaptor
        * BadgerFish            -> BadgerFishValidationAdaptor
        * direct HoneyBadgerFish -> DirectHBFValidationAdaptor

    If the version cannot be detected (detection raises), a
    BadgerFishValidationAdaptor is returned as a best-effort fallback.

    Raises NotImplementedError for a detected version that matches none
    of the variants above.
    '''
    try:
        nexson_version = detect_nexson_version(obj)
    except Exception:
        # Deliberate best-effort fallback. `Exception` rather than a bare
        # `except:` so KeyboardInterrupt/SystemExit still propagate.
        return BadgerFishValidationAdaptor(obj, logger, **kwargs)
    if _is_by_id_hbf(nexson_version):
        return ByIdHBFValidationAdaptor(obj, logger, **kwargs)
    elif _is_badgerfish_version(nexson_version):
        return BadgerFishValidationAdaptor(obj, logger, **kwargs)
    elif _is_direct_hbf(nexson_version):
        return DirectHBFValidationAdaptor(obj, logger, **kwargs)
    raise NotImplementedError('nexml2json version {v}'.format(v=nexson_version))
Exemplo n.º 14
0
def add_schema_attributes(container, nexson_version):
    """Attach the schema attributes for `nexson_version` to `container`.

    The work is delegated to the variant-specific helper (by-id,
    badgerfish or direct honeybadgerfish). Per the original notes the
    attributes added include:
    _using_hbf_meta  - boolean. True for HoneyBadgerFish v1-style meta
                elements ('^prop': value rather than 'meta': {'$':value})
    and _SchemaFragment instances such as _NexmlEl_Schema.

    Raises NotImplementedError when the version matches no known variant.
    """
    if _is_by_id_hbf(nexson_version):
        attach = _add_by_id_nexson_schema_attributes
    elif _is_badgerfish_version(nexson_version):
        attach = _add_badgerfish_nexson_schema_attributes
    elif _is_direct_hbf(nexson_version):
        attach = _add_direct_nexson_schema_attributes
    else:
        raise NotImplementedError('unrecognized nexson variant {}'.format(nexson_version))
    attach(container)
Exemplo n.º 15
0
def extract_supporting_file_messages(nexson):
    '''Gather the supporting-file messages found anywhere in `nexson`.

    The NexSON is converted to BY_ID_HONEY_BADGERFISH if its detected
    version is not by-id. Messages are then collected, in this order,
    from the nexml element, from each otus group followed by its otus,
    and from each trees group followed by its trees.

    Returns the messages as one flat list.
    '''
    if not _is_by_id_hbf(detect_nexson_version(nexson)):
        nexson = convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)
    nex = nexson['nexml']

    messages = []
    messages += _get_supporting_file_messages_for_this_obj(nex)
    for otus_group in nex.get('otusById', {}).values():
        messages += _get_supporting_file_messages_for_this_obj(otus_group)
        for otu in otus_group.get('otuById', {}).values():
            messages += _get_supporting_file_messages_for_this_obj(otu)
    for trees_group in nex.get('treesById', {}).values():
        messages += _get_supporting_file_messages_for_this_obj(trees_group)
        for tree in trees_group.get('treeById', {}).values():
            messages += _get_supporting_file_messages_for_this_obj(tree)
    return messages
Exemplo n.º 16
0
def extract_supporting_file_messages(nexson):
    '''Return a list of all supporting-file messages in `nexson`.

    Works on the by-id form (converting first when the detected version
    is something else) and visits: the 'nexml' element itself, every
    otus group and each otu inside it, then every trees group and each
    tree inside it.
    '''
    curr_version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(curr_version):
        nexson = convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)
    nexml_el = nexson['nexml']
    collected = []

    def harvest(element):
        # Append whatever messages are attached to this one element.
        collected.extend(_get_supporting_file_messages_for_this_obj(element))

    harvest(nexml_el)
    # (group container key, member container key) for otus and for trees.
    for groups_key, members_key in (('otusById', 'otuById'), ('treesById', 'treeById')):
        for group in nexml_el.get(groups_key, {}).values():
            harvest(group)
            for member in group.get(members_key, {}).values():
                harvest(member)
    return collected
Exemplo n.º 17
0
def sort_arbitrarily_ordered_nexson(blob):
    '''Put `blob` into a canonical order (handy for clean diffs in tests)
    and return it.

    By-id NexSON is returned unchanged. For other versions
    sort_meta_elements is applied, then every 'otu' list and every
    tree's 'node' and 'edge' lists are sorted in place by id.
    '''
    version = detect_nexson_version(blob)
    nex = get_nexml_el(blob)
    if not _is_by_id_hbf(version):
        # Only the list-based syntaxes are sorted here.
        sort_meta_elements(blob)
        for otus_group in _get_index_list_of_values(nex, 'otus'):
            _inplace_sort_by_id(otus_group.get('otu', []))
        for trees_group in _get_index_list_of_values(nex, 'trees'):
            for tree in _get_index_list_of_values(trees_group, 'tree'):
                _inplace_sort_by_id(tree.get('node', []))
                _inplace_sort_by_id(tree.get('edge', []))
    return blob
Exemplo n.º 18
0
def sort_arbitrarily_ordered_nexson(blob):
    '''Sort the order-insensitive parts of `blob` in place and return it.

    Mainly useful in tests, where a stable ordering gives readable
    diffs. A by-id NexSON is handed back untouched; otherwise the meta
    elements are sorted (sort_meta_elements) and the otu, node and edge
    lists are each sorted by id.
    '''
    detected = detect_nexson_version(blob)
    nexml_el = get_nexml_el(blob)
    if _is_by_id_hbf(detected):
        return blob

    sort_meta_elements(blob)
    for otus in _get_index_list_of_values(nexml_el, 'otus'):
        otu_list = otus.get('otu', [])
        _inplace_sort_by_id(otu_list)
    for trees in _get_index_list_of_values(nexml_el, 'trees'):
        for tree in _get_index_list_of_values(trees, 'tree'):
            # Nodes first, then edges.
            for list_key in ('node', 'edge'):
                _inplace_sort_by_id(tree.get(list_key, []))
    return blob
Exemplo n.º 19
0
def nexml_el_of_by_id(nexson, curr_version=None):
    '''Return the nexml element of `nexson` in by-id form.

    `curr_version` is detected from `nexson` when None; if it is not a
    by-id version the NexSON is converted to BY_ID_HONEY_BADGERFISH
    before the element is looked up.
    '''
    version = detect_nexson_version(nexson) if curr_version is None else curr_version
    if _is_by_id_hbf(version):
        return get_nexml_el(nexson)
    return get_nexml_el(convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH))
Exemplo n.º 20
0
def nexml_el_of_by_id(nexson, curr_version=None):
    '''Fetch the nexml element, converting `nexson` to the by-id
    HoneyBadgerFish form first if it is in another syntax version.

    The version is `curr_version` when supplied, otherwise it is
    detected from `nexson`.
    '''
    if curr_version is None:
        curr_version = detect_nexson_version(nexson)
    already_by_id = _is_by_id_hbf(curr_version)
    by_id_nexson = nexson if already_by_id else convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)
    return get_nexml_el(by_id_nexson)