def convert_otus(self, otus_list):
    """Re-key a list of "otus" groups, and the OTUs inside each, by "@id".

    Every group in `otus_list` gains an "otuById" dict that maps the "@id"
    of each of its OTUs to the OTU object. When `self.remove_old_structs`
    is true, the "otu" entry of each group and the "@id" entries of the
    groups and of their OTUs are deleted. The objects are modified in place.

    Returns a pair: a dict mapping group "@id" to the group object, and the
    list of group ids in the order in which they occur in `otus_list`.
    """
    group_by_id = {}
    group_order = []
    for group in otus_list:
        gid = group['@id']
        group_by_id[gid] = group
        group_order.append(gid)

    # Build every per-group index before touching the inputs, so that a
    # missing "otu" or "@id" raises while the objects are still unmodified.
    otu_index_by_group = {}
    for gid, group in group_by_id.items():
        otu_index = {}
        for otu in _index_list_of_values(group, 'otu'):
            otu_index[otu['@id']] = otu
        otu_index_by_group[gid] = otu_index

    # If all that succeeded, attach the new indices (a temporarily fat structure).
    for gid, otu_index in otu_index_by_group.items():
        group_by_id[gid]['otuById'] = otu_index

    if not self.remove_old_structs:
        return group_by_id, group_order

    # Make the struct leaner: the ids now live in the dict keys.
    for group in group_by_id.values():
        del group['@id']
    for gid, otu_index in otu_index_by_group.items():
        del group_by_id[gid]['otu']
        for otu in otu_index.values():
            # NOTE: moving "@label" to "^ot:manualLabel" (when it differs from
            # the OTT taxon name / original label) is deliberately not done here.
            del otu['@id']
    return group_by_id, group_order
def convert_tree(self, tree):
    """Convert one tree object to the by-id layout, in place.

    Adds three entries to `tree`:
      * "nodeById": maps each node's "@id" to the node object;
      * "edgeBySourceId": maps each edge's "@source" to a dict that maps the
        edge's "@id" to the edge object;
      * "^ot:rootNodeId": the "@id" of the node flagged with "@root".

    The "@id" entry of every edge is always removed (the id becomes a key).
    When `self.remove_old_structs` is true, the "@id", "node" and "edge"
    entries of the tree and the "@id" / "^ot:isLeaf" entries of each node
    are removed as well.

    Returns the pair (tree_id, tree).

    Raises AssertionError unless exactly one node is flagged as the root.
    """
    nodeById = {}
    root_node = None
    node_list = _index_list_of_values(tree, 'node')
    for node in node_list:
        nodeById[node['@id']] = node
        # @TEMP accepting true or "true"
        if node.get('@root') in (True, 'true'):
            assert root_node is None
            root_node = node
    assert root_node is not None

    edgeBySourceId = {}
    # The tolerant getter is used because a tree may have no "edge" entry.
    edge_list = _get_index_list_of_values(tree, 'edge')
    for edge in edge_list:
        sourceId = edge['@source']
        eid = edge['@id']
        del edge['@id']
        edgeBySourceId.setdefault(sourceId, {})[eid] = edge

    # If all that succeeds, add the new object to the dict, creating a fat structure
    tree['nodeById'] = nodeById
    tree['edgeBySourceId'] = edgeBySourceId
    tree['^ot:rootNodeId'] = root_node['@id']

    # Make the struct leaner
    tid = tree['@id']
    if self.remove_old_structs:
        del tree['@id']
        del tree['node']
        # "edge" was read tolerantly above, so it may be absent; a plain
        # `del` would raise KeyError after the tree was already rewritten.
        tree.pop('edge', None)
        for node in node_list:
            node.pop('^ot:isLeaf', None)
            del node['@id']
    return tid, tree
def convert_tree(self, tree):
    """Rewrite a single tree object into the by-id form and return it.

    The tree (modified in place) gains:
      * "nodeById": node "@id" -> node object;
      * "edgeBySourceId": edge "@source" -> {edge "@id" -> edge object};
      * "^ot:rootNodeId": "@id" of the node whose "@root" is true.

    Each edge loses its "@id" entry unconditionally. If
    `self.remove_old_structs` is true, the tree also loses "@id", "node"
    and "edge", and each node loses "@id" and (if present) "^ot:isLeaf".

    Returns (tree_id, tree).

    Raises AssertionError if no node, or more than one node, is the root.
    """
    nodeById = {}
    root_node = None
    node_list = _index_list_of_values(tree, 'node')
    for node in node_list:
        nodeById[node['@id']] = node
        r = node.get('@root')
        if r in (True, 'true'):  # @TEMP accepting true or "true"
            assert root_node is None
            root_node = node
    assert root_node is not None

    edgeBySourceId = {}
    # A tree without an "edge" entry yields an empty list here.
    edge_list = _get_index_list_of_values(tree, 'edge')
    for edge in edge_list:
        sourceId = edge['@source']
        eid = edge['@id']
        del edge['@id']
        byso = edgeBySourceId.setdefault(sourceId, {})
        byso[eid] = edge

    # If all that succeeds, add the new object to the dict, creating a fat structure
    tree['nodeById'] = nodeById
    tree['edgeBySourceId'] = edgeBySourceId
    tree['^ot:rootNodeId'] = root_node['@id']

    # Make the struct leaner
    tid = tree['@id']
    if self.remove_old_structs:
        del tree['@id']
        del tree['node']
        # Use pop rather than del: the "edge" key is optional (see the
        # tolerant read above), and del would raise KeyError for an
        # edgeless tree after it had already been partly rewritten.
        tree.pop('edge', None)
        for node in node_list:
            if '^ot:isLeaf' in node:
                del node['^ot:isLeaf']
            del node['@id']
    return tid, tree
def convert_otus(self, otus_list):
    """Index the "otus" groups in `otus_list`, and their OTUs, by "@id".

    Each group object receives an "otuById" entry (OTU "@id" -> OTU object).
    If `self.remove_old_structs` is true, the now-redundant "otu" list of
    each group and the "@id" entries of groups and OTUs are deleted.
    All changes are made in place on the objects of `otus_list`.

    Returns (groups keyed by "@id", list of group ids in input order).
    """
    ordered_ids = [grp['@id'] for grp in otus_list]
    groups = {grp['@id']: grp for grp in otus_list}

    # First pass only reads: a missing "otu" or "@id" raises before any
    # of the input objects has been changed.
    otus_of_group = {}
    for group_id, grp in groups.items():
        members = _index_list_of_values(grp, 'otu')
        otus_of_group[group_id] = {member['@id']: member for member in members}

    # If all that succeeded, add the new dicts, creating a fat structure.
    for group_id, by_id in otus_of_group.items():
        groups[group_id]['otuById'] = by_id

    # Make the struct leaner.
    if self.remove_old_structs:
        for grp in groups.values():
            del grp['@id']
        for group_id, by_id in otus_of_group.items():
            grp = groups[group_id]
            del grp['otu']
            for member in by_id.values():
                # NOTE: relabeling ("@label" -> "^ot:manualLabel") is
                # intentionally not performed in this step.
                del member['@id']
    return groups, ordered_ids
def _add_dict_of_subtree_to_xml_doc(self, doc, parent, children_dict, key_order=None):
    """Write every entry of `children_dict` under `parent` via
    _add_subtree_list_to_xml_doc.

    `key_order`, if given, is a sequence of (key, nested_key_order) pairs;
    nested_key_order must be None or a tuple. Keys named there that occur
    in `children_dict` are written first, in that order, with their nested
    key order passed along. All remaining keys are then written in sorted
    order with no nested key order.
    """
    already_written = set()
    for name, nested_order in (key_order or ()):
        assert nested_order is None or isinstance(nested_order, tuple)
        if name not in children_dict:
            continue
        child_list = _index_list_of_values(children_dict, name)
        already_written.add(name)
        self._add_subtree_list_to_xml_doc(doc, parent, child_list, name, nested_order)

    for name in sorted(children_dict.keys()):
        child_list = _index_list_of_values(children_dict, name)
        if name in already_written:
            continue
        self._add_subtree_list_to_xml_doc(doc, parent, child_list, name, None)
def _add_dict_of_subtree_to_xml_doc(self, doc, parent, children_dict, key_order=None):
    """Emit the values of `children_dict` as children of `parent`.

    Keys listed in `key_order` (pairs of key and a nested key order that is
    None or a tuple) are emitted first, in the listed order, if they are
    present in `children_dict`. Keys not emitted that way follow in sorted
    order. Each emission is delegated to _add_subtree_list_to_xml_doc.
    """
    done = set()
    if key_order:
        for pair in key_order:
            key, sub_order = pair
            assert sub_order is None or isinstance(sub_order, tuple)
            if key in children_dict:
                values = _index_list_of_values(children_dict, key)
                done.add(key)
                self._add_subtree_list_to_xml_doc(doc, parent, values, key, sub_order)

    remaining = sorted(children_dict.keys())
    for key in remaining:
        values = _index_list_of_values(children_dict, key)
        if key not in done:
            self._add_subtree_list_to_xml_doc(doc, parent, values, key, None)
def convert_tree(self, tree):
    """Return (tree_id, tree) or None (if the tree has no edges).

    The tree is converted in place: it gains "nodeById" (node "@id" -> node),
    "edgeBySourceId" (edge "@source" -> {edge "@id" -> edge}) and
    "^ot:rootNodeId". The "@id" entry of every edge is always removed.
    When `self.remove_old_structs` is true, the tree's "@id", "node" and
    "edge" entries and each node's "@id" / "^ot:isLeaf" entries are removed.

    None is returned only when `self.remove_old_structs` is true and the
    tree has no "edge" entry; the tree has already been partly rewritten
    at that point.

    Raises AssertionError unless exactly one node is flagged as the root.
    """
    nodeById = {}
    root_node = None
    node_list = _index_list_of_values(tree, 'node')
    for node in node_list:
        nodeById[node['@id']] = node
        r = node.get('@root')
        if r in (True, 'true'):  # @TEMP accepting true or "true"
            assert root_node is None
            root_node = node
    assert root_node is not None

    edgeBySourceId = {}
    edge_list = _get_index_list_of_values(tree, 'edge')
    for edge in edge_list:
        sourceId = edge['@source']
        eid = edge['@id']
        del edge['@id']
        byso = edgeBySourceId.setdefault(sourceId, {})
        byso[eid] = edge

    # If all that succeeds, add the new object to the dict, creating a fat structure
    tree['nodeById'] = nodeById
    tree['edgeBySourceId'] = edgeBySourceId
    tree['^ot:rootNodeId'] = root_node['@id']

    # Make the struct leaner
    tid = tree['@id']
    if self.remove_old_structs:
        del tree['@id']
        del tree['node']
        try:
            del tree['edge']
        except KeyError:
            # Only a missing "edge" key is expected here; anything else
            # (including KeyboardInterrupt) must propagate.
            #
            # Tree Tr75035 in http://treebase.org/treebase-web/search/study/summary.html?id=14763
            # is empty. In NeXML that shows up as a tree with a node but no edges.
            # See https://github.com/OpenTreeOfLife/opentree/issues/641
            # TODO: returning None seems safest, but could cull trees with just metadata.
            #   Creating a fake tree for metadata is ugly, so this is not supported.
            # NOTE(review): if _LOG is a stdlib logger, `warn` is a deprecated
            #   alias of `warning` - confirm and switch.
            _LOG.warn(
                'Tree with ID "{}" is being dropped because it has no edges'
                .format(tid))
            assert not edge_list
            return None
        for node in node_list:
            if '^ot:isLeaf' in node:
                del node['^ot:isLeaf']
            del node['@id']
    return tid, tree
def _add_meta_dict_to_xml(self, doc, parent, meta_dict):
    '''Add each value of `meta_dict` under `parent`, visiting keys in
    sorted order, by calling _add_meta_value_to_xml_doc.

    Values in the meta element dict are converted to a BadgerFish-style
    encoding (see _convert_hbf_meta_val_for_xml), so they are treated as
    BadgerFish regardless of input_format.

    Does nothing when `meta_dict` is empty or None.
    '''
    if not meta_dict:
        return
    for meta_key in sorted(meta_dict.keys()):
        for meta_el in _index_list_of_values(meta_dict, meta_key):
            self._add_meta_value_to_xml_doc(doc, parent, meta_el)
def _add_meta_dict_to_xml(self, doc, parent, meta_dict):
    '''Write the meta values held in `meta_dict` as children of `parent`.

    Keys are processed in sorted order; each value under a key is passed to
    _add_meta_value_to_xml_doc. The values are in a BadgerFish-style
    encoding (see _convert_hbf_meta_val_for_xml), so regardless of
    input_format they are handled as if they were BadgerFish.

    A falsy `meta_dict` results in no output.
    '''
    if not meta_dict:
        return
    ordered_keys = sorted(meta_dict.keys())
    for name in ordered_keys:
        entries = _index_list_of_values(meta_dict, name)
        for entry in entries:
            self._add_meta_value_to_xml_doc(doc, parent, entry)
def convert_tree(self, tree):
    '''Return (tree_id, tree) or None (if the tree has no edges).

    `tree` is rewritten in place. It receives "nodeById" (node "@id" ->
    node), "edgeBySourceId" (edge "@source" -> {edge "@id" -> edge}) and
    "^ot:rootNodeId". Every edge loses its "@id" entry. If
    `self.remove_old_structs` is true, the tree's "@id", "node" and "edge"
    entries and the nodes' "@id" / "^ot:isLeaf" entries are deleted.

    None is returned only when `self.remove_old_structs` is true and the
    tree has no "edge" entry (the tree has been partly rewritten by then).

    Raises AssertionError unless exactly one node is flagged as the root.
    '''
    nodeById = {}
    root_node = None
    node_list = _index_list_of_values(tree, 'node')
    for node in node_list:
        nodeById[node['@id']] = node
        r = node.get('@root')
        if r in (True, 'true'):  # @TEMP accepting true or "true"
            assert root_node is None
            root_node = node
    assert root_node is not None

    edgeBySourceId = {}
    edge_list = _get_index_list_of_values(tree, 'edge')
    for edge in edge_list:
        sourceId = edge['@source']
        eid = edge['@id']
        del edge['@id']
        byso = edgeBySourceId.setdefault(sourceId, {})
        byso[eid] = edge

    # If all that succeeds, add the new object to the dict, creating a fat structure
    tree['nodeById'] = nodeById
    tree['edgeBySourceId'] = edgeBySourceId
    tree['^ot:rootNodeId'] = root_node['@id']

    # Make the struct leaner
    tid = tree['@id']
    if self.remove_old_structs:
        del tree['@id']
        del tree['node']
        try:
            del tree['edge']
        except KeyError:
            # Catch only the missing-key case; a bare `except` would also
            # hide unrelated failures and interrupts.
            #
            # Tree Tr75035 in http://treebase.org/treebase-web/search/study/summary.html?id=14763
            # is empty. In NeXML that shows up as a tree with a node but no edges.
            # See https://github.com/OpenTreeOfLife/opentree/issues/641
            # TODO: returning None seems safest, but could cull trees with just metadata.
            #   Creating a fake tree for metadata is ugly, so this is not supported.
            # NOTE(review): if _LOG is a stdlib logger, `warn` is a deprecated
            #   alias of `warning` - confirm and switch.
            _LOG.warn('Tree with ID "{}" is being dropped because it has no edges'.format(tid))
            assert not edge_list
            return None
        for node in node_list:
            if '^ot:isLeaf' in node:
                del node['^ot:isLeaf']
            del node['@id']
    return tid, tree
def _hbf_handle_child_elements(self, obj, ntl):
    '''Fold the child XML nodes in `ntl` into the dict `obj`; return `obj`.

    Indirect recursion through _gen_hbf_el.
    '''
    # Group the children by tag name, remembering the order in which each
    # tag was first seen. A repeated tag accumulates several XML elements.
    elements_by_tag = {}
    tag_order = []
    seen_tags = set()
    for child in ntl:
        tag = child.nodeName
        if tag == 'meta' and not self._badgerfish_style_conversion:
            # "meta" children are transformed to a key/value pair and
            # stored on obj directly instead of being recursed into.
            meta_key, meta_val = self._transform_meta_key_value(child)
            if meta_key is not None:
                _add_value_to_dict_bf(obj, meta_key, meta_val)
            continue
        if tag not in seen_tags:
            tag_order.append(tag)
            seen_tags.add(tag)
        _add_value_to_dict_bf(elements_by_tag, tag, child)

    # Convert the grouped XML elements to dicts by recursion and store
    # each group on obj as a list.
    for tag in tag_order:
        converted_tag = None
        converted = []
        for xml_el in _index_list_of_values(elements_by_tag, tag):
            converted_tag, as_dict = self._gen_hbf_el(xml_el)
            converted.append(as_dict)
        # This assertion will trip if the hacky stripping of namespaces
        # results in a name clash among the tags of the children.
        assert converted_tag not in obj
        obj[converted_tag] = converted

    # Delete redundant about attributes that are used in XML, but not in
    # JSON (last rule of HoneyBadgerFish).
    _cull_redundant_about(obj)
    return obj
def _hbf_handle_child_elements(self, obj, ntl):
    """Add the contents of the child nodes `ntl` to `obj` and return `obj`.

    Indirect recursion through _gen_hbf_el.
    """
    # Phase 1: bucket the children by node name. `names_in_order` keeps the
    # first-seen order of the names; a name that repeats maps to several
    # XML elements in `bucket`.
    bucket = {}
    names_in_order = []
    known_names = set()
    for node in ntl:
        name = node.nodeName
        handle_as_meta = (name == 'meta') and not self._badgerfish_style_conversion
        if not handle_as_meta:
            if name not in known_names:
                names_in_order.append(name)
                known_names.add(name)
            _add_value_to_dict_bf(bucket, name, node)
        else:
            key, value = self._transform_meta_key_value(node)
            if key is not None:
                _add_value_to_dict_bf(obj, key, value)

    # Phase 2: recurse into each bucket and attach the resulting dicts.
    for name in names_in_order:
        out_tag = None
        out_list = []
        for element in _index_list_of_values(bucket, name):
            out_tag, out_dict = self._gen_hbf_el(element)
            out_list.append(out_dict)
        # Trips if the hacky stripping of namespaces results in a name
        # clash among the tags of the children.
        assert out_tag not in obj
        obj[out_tag] = out_list

    # Last rule of HoneyBadgerFish: drop redundant about attributes that
    # are used in XML, but not in JSON.
    _cull_redundant_about(obj)
    return obj
def convert(self, obj):
    '''Takes a dict corresponding to the honeybadgerfish JSON blob of the 1.0.* type and
    converts it to BY_ID_HONEY_BADGERFISH version. The object is modified in place
    and returned.

    Trees for which convert_tree returns None are skipped.

    Raises NotImplementedError if `self.pristine_if_invalid` is set, and
    NexsonError if a "trees" element id or a "tree" element id is repeated.
    '''
    if self.pristine_if_invalid:
        raise NotImplementedError(
            'pristine_if_invalid option is not supported yet')
    nex = get_nexml_el(obj)
    assert nex
    # Create the new top-level containers as locals; they are attached to
    # `nex` only after every group has been processed. (The individual
    # tree groups are already modified inside the loop below.)
    otus = _index_list_of_values(nex, 'otus')
    otusById, otusElementOrder = self.convert_otus(otus)
    trees = _get_index_list_of_values(nex, 'trees')
    treesById = dict((i['@id'], i) for i in trees)
    treesElementOrder = [i['@id'] for i in trees]
    if len(treesById) != len(treesElementOrder):
        # Fewer keys than ids: some id is repeated. Find it for the message.
        trees_id_set = set()
        for tgid in treesElementOrder:
            if tgid in trees_id_set:
                raise NexsonError(
                    'Repeated trees element id "{}"'.format(tgid))
            trees_id_set.add(tgid)
    tree_id_set = set()  # tree ids must be unique across all groups
    treeContainingObjByTreesId = {}
    for tree_group in trees:
        treeById = {}
        treeElementOrder = []
        tree_array = _get_index_list_of_values(tree_group, 'tree')
        for tree in tree_array:
            t_t = self.convert_tree(tree)
            if t_t is None:
                # convert_tree may decline a tree (e.g. one without edges)
                # by returning None; unpacking it would raise TypeError.
                continue
            tid, tree_alias = t_t
            if tid in tree_id_set:
                raise NexsonError(
                    'Repeated tree element id "{}"'.format(tid))
            tree_id_set.add(tid)
            assert tree_alias is tree
            treeById[tid] = tree
            treeElementOrder.append(tid)
        treeContainingObjByTreesId[tree_group['@id']] = treeById
        tree_group['^ot:treeElementOrder'] = treeElementOrder
    # If all that succeeds, add the new object to the dict, creating a fat structure
    nex['otusById'] = otusById
    nex['^ot:otusElementOrder'] = otusElementOrder
    nex['treesById'] = treesById
    nex['^ot:treesElementOrder'] = treesElementOrder
    for k, v in treeContainingObjByTreesId.items():
        treesById[k]['treeById'] = v
    nex['@nexml2json'] = str(BY_ID_HONEY_BADGERFISH)
    # Make the struct leaner
    if self.remove_old_structs:
        del nex['otus']
        # "trees" was read tolerantly above and may be absent, so a plain
        # `del` could raise KeyError after `nex` was already rewritten.
        nex.pop('trees', None)
        for v in treesById.values():
            if 'tree' in v:
                del v['tree']
            del v['@id']
    return obj
def convert(self, obj):
    '''Takes a dict corresponding to the honeybadgerfish JSON blob of the 1.0.* type and
    converts it to BY_ID_HONEY_BADGERFISH version. The object is modified in place
    and returned.

    Trees for which convert_tree returns None are left out of the result.

    Raises NotImplementedError if `self.pristine_if_invalid` is set, and
    NexsonError if a "trees" element id or a "tree" element id is repeated.
    '''
    if self.pristine_if_invalid:
        raise NotImplementedError('pristine_if_invalid option is not supported yet')
    nex = get_nexml_el(obj)
    assert nex
    # Create the new top-level containers as locals; they are attached to
    # `nex` only after every group has been processed. (The individual
    # tree groups are already modified inside the loop below.)
    otus = _index_list_of_values(nex, 'otus')
    otusById, otusElementOrder = self.convert_otus(otus)
    trees = _get_index_list_of_values(nex, 'trees')
    treesById = dict((i['@id'], i) for i in trees)
    treesElementOrder = [i['@id'] for i in trees]
    if len(treesById) != len(treesElementOrder):
        # Fewer keys than ids: some id is repeated. Find it for the message.
        trees_id_set = set()
        for tgid in treesElementOrder:
            if tgid in trees_id_set:
                raise NexsonError('Repeated trees element id "{}"'.format(tgid))
            trees_id_set.add(tgid)
    tree_id_set = set()  # tree ids must be unique across all groups
    treeContainingObjByTreesId = {}
    for tree_group in trees:
        treeById = {}
        treeElementOrder = []
        tree_array = _get_index_list_of_values(tree_group, 'tree')
        for tree in tree_array:
            t_t = self.convert_tree(tree)
            if t_t is None:
                # The tree was dropped by convert_tree.
                continue
            tid, tree_alias = t_t
            if tid in tree_id_set:
                raise NexsonError('Repeated tree element id "{}"'.format(tid))
            tree_id_set.add(tid)
            assert tree_alias is tree
            treeById[tid] = tree
            treeElementOrder.append(tid)
        treeContainingObjByTreesId[tree_group['@id']] = treeById
        tree_group['^ot:treeElementOrder'] = treeElementOrder
    # If all that succeeds, add the new object to the dict, creating a fat structure
    nex['otusById'] = otusById
    nex['^ot:otusElementOrder'] = otusElementOrder
    nex['treesById'] = treesById
    nex['^ot:treesElementOrder'] = treesElementOrder
    for k, v in treeContainingObjByTreesId.items():
        treesById[k]['treeById'] = v
    nex['@nexml2json'] = str(BY_ID_HONEY_BADGERFISH)
    # Make the struct leaner
    if self.remove_old_structs:
        del nex['otus']
        # "trees" was read tolerantly above and may be absent, so a plain
        # `del` could raise KeyError after `nex` was already rewritten.
        nex.pop('trees', None)
        for v in treesById.values():
            if 'tree' in v:
                del v['tree']
            del v['@id']
    return obj