def iter_trees(nexson, nexson_version=None):
    """Generator over every tree in every trees element.

    Yields a tuple of 3 items: (trees element ID, tree ID, tree object).
    `nexson_version` is detected from the blob when not supplied.
    """
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    nexml = get_nexml_el(nexson)
    if _is_by_id_hbf(nexson_version):
        # By-id (HBF 1.2) form: honor the recorded element order when it
        # covers every group/tree; otherwise fall back to sorted dict keys.
        group_dict = nexml['treesById']
        ordered_group_ids = nexml.get('^ot:treesElementOrder', [])
        if len(ordered_group_ids) < len(group_dict):
            ordered_group_ids = sorted(group_dict.keys())
        for group_id in ordered_group_ids:
            group = group_dict[group_id]
            tree_dict = group['treeById']
            ordered_tree_ids = group.get('^ot:treeElementOrder', [])
            if len(ordered_tree_ids) < len(tree_dict):
                ordered_tree_ids = sorted(tree_dict.keys())
            for t_id in ordered_tree_ids:
                yield group_id, t_id, tree_dict[t_id]
    else:
        # Direct (list-based) form: walk the nested group/tree lists.
        for group in nexml.get('trees', []):
            g_id = group['@id']
            for tree in group.get('tree', []):
                yield g_id, tree['@id'], tree
def iter_otus(nexson, nexson_version=None):
    """Generator over every otu in every otus group element.

    Yields a tuple of 3 items: (otus group ID, otu ID, otu object).
    If the blob is not already in the by-id (HBF 1.2) form, it is
    converted *in place* first (see TODO below).
    """
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(nexson_version):
        # TODO shouldn't modify the caller's blob...
        convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)
    nexml = get_nexml_el(nexson)
    group_dict = nexml['otusById']
    ordered_group_ids = nexml.get('^ot:otusElementOrder', [])
    if len(ordered_group_ids) < len(group_dict):
        # Recorded order is incomplete; fall back to sorted keys.
        ordered_group_ids = sorted(group_dict.keys())
    for group_id in ordered_group_ids:
        otu_dict = group_dict[group_id]['otuById']
        # Snapshot the keys so the caller may mutate the dict mid-iteration.
        for o_id in list(otu_dict.keys()):
            yield group_id, o_id, otu_dict[o_id]
def __init__(self, filepath='', nexson=None):
    """Wrap a NexSON blob given directly, or read from ``filepath``.

    Raises ValueError when neither argument is supplied. The blob is
    converted in place to the by-id (hbf 1.2) form when it is not
    already in that syntax.
    """
    self.filepath = filepath
    if nexson is not None:
        self._nexson = nexson
    else:
        if not filepath:
            raise ValueError('Either a filepath or nexson argument must be provided')
        self._nexson = read_as_json(self.filepath)
    detected = detect_nexson_version(self._nexson)
    if detected != BY_ID_HONEY_BADGERFISH:
        _LOG.debug('NexsonProxy converting to hbf1.2')
        convert_nexson_format(self._nexson, BY_ID_HONEY_BADGERFISH)
    self._nexml_el = get_nexml_el(self._nexson)
    # Placeholders initialized here; presumably filled lazily by other
    # methods of the class (not visible in this chunk).
    self._otu_cache = {}
    self._tree_cache = {}
    self._wr = None
def count_num_trees(nexson, nexson_version=None):
    """Return the total number of trees summed across all tree groups."""
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    nexml = get_nexml_el(nexson)
    total = 0
    if _is_by_id_hbf(nexson_version):
        # by-id form: one 'treeById' dict per trees group
        for group in nexml.get('treesById', {}).values():
            total += len(group.get('treeById', {}))
    else:
        # direct form: 'trees' may be a single dict or a list of groups;
        # a group whose 'tree' slot is not a list counts as one tree.
        groups = nexml.get('trees', [])
        if isinstance(groups, dict):
            groups = [groups]
        for group in groups:
            t = group.get('tree')
            total += len(t) if isinstance(t, list) else 1
    return total
def count_num_trees(nexson, nexson_version=None):
    '''Returns the number of trees summed across all tree groups.
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    nexml = get_nexml_el(nexson)
    if _is_by_id_hbf(nexson_version):
        # by-id (HBF 1.2) form: count the entries of each group's
        # 'treeById' dict.
        return sum(len(g.get('treeById', {}))
                   for g in nexml.get('treesById', {}).values())
    # direct form: 'trees' may be a bare dict or a list of groups; a
    # group whose 'tree' slot is not a list contributes a count of 1.
    groups = nexml.get('trees', [])
    if isinstance(groups, dict):
        groups = [groups]
    per_group = []
    for g in groups:
        t = g.get('tree')
        per_group.append(len(t) if isinstance(t, list) else 1)
    return sum(per_group)
def iter_otus(nexson, nexson_version=None):
    '''generator over all otus in all otus group elements.
    yields a tuple of 3 items:
        otus group ID,
        otu ID,
        the otu obj
    '''
    if nexson_version is None:
        nexson_version = detect_nexson_version(nexson)
    if not _is_by_id_hbf(nexson_version):
        # BUG FIX: previously referenced the undefined name `nexson_blob`
        # (NameError on any non-1.2 input); the parameter is `nexson`.
        convert_nexson_format(nexson, BY_ID_HONEY_BADGERFISH)  # TODO shouldn't modify...
    # Fetch the nexml element only AFTER the in-place conversion, so we
    # are guaranteed to be looking at the by-id structure (matches the
    # other iter_otus implementation in this file).
    nex = get_nexml_el(nexson)
    otus_group_by_id = nex['otusById']
    group_order = nex.get('^ot:otusElementOrder', [])
    if len(group_order) < len(otus_group_by_id):
        # Recorded order is incomplete; fall back to sorted keys.
        group_order = list(otus_group_by_id.keys())
        group_order.sort()
    for otus_group_id in group_order:
        otus_group = otus_group_by_id[otus_group_id]
        otu_by_id = otus_group['otuById']
        # Snapshot keys so callers may mutate the dict during iteration.
        ti_order = list(otu_by_id.keys())
        for otu_id in ti_order:
            otu = otu_by_id[otu_id]
            yield otus_group_id, otu_id, otu
#!/usr/bin/env python
"""Repair each local study file whose ^ot:studyId disagrees with the
phylesystem study ID under which the file is stored."""
from peyotl.api import APIWrapper
from peyotl.utility.input_output import read_as_json, write_as_json
from peyotl.nexson_syntax import get_nexml_el

wrapper = APIWrapper(phylesystem_api_kwargs={'get_from': 'local'})
phylesystem = wrapper.phylesystem_api.phylesystem_obj
for study_id, path in phylesystem.iter_study_filepaths():
    study_blob = read_as_json(path)
    nexml = get_nexml_el(study_blob)
    recorded = nexml.get('^ot:studyId')
    if recorded == study_id:
        continue
    # Mismatch: overwrite the recorded ID, persist, and report.
    nexml['^ot:studyId'] = study_id
    write_as_json(study_blob, path)
    print(recorded, study_id)
def merge_otus_and_trees(nexson_blob):
    '''Takes a nexson object:
    1. merges trees elements 2 - # trees into the first trees element.,
    2. merges otus elements 2 - # otus into the first otus element.
    3. if there is no ot:originalLabel field for any otu, it sets that
        field based on @label and deletes @label
    4. merges an otu elements using the rule:
        A. treat (ottId, originalLabel) as a key
        B. If otu objects in subsequent trees match originalLabel and
            have a matching or absent ot:ottId, then they are merged into
            the same OTUs (however see C)
        C. No two leaves of a tree may share an otu (though otu should be
            shared across different trees). It is important that each leaf
            node be mapped to a distinct OTU. Otherwise there will be no
            way of separating them during OTU mapping. we do this
            indirectly by assuring to no two otu objects in the same otus
            object get merged with each other (or to a common object)
    5. correct object references to deleted entities.

    This function is used to patch up NexSONs created by multiple imports,
    hence the substitution of '@label' for 'ot:originalLabel'. Ids are
    arbitrary for imports from non-nexml tools, so matching is done based
    on names. This should mimic the behavior of the analysis tools that
    produced the trees (for most/all such tools unique names constitute
    unique OTUs).
    '''
    # Maps each deleted entity's ID to the ID of the entity replacing it;
    # used at the end to patch up dangling references.
    id_to_replace_id = {}
    # Work in the by-id (HBF 1.2) form; the caller's original syntax
    # version is restored just before returning.
    orig_version = detect_nexson_version(nexson_blob)
    convert_nexson_format(nexson_blob, BY_ID_HONEY_BADGERFISH)
    nexson = get_nexml_el(nexson_blob)
    otus_group_order = nexson.get('^ot:otusElementOrder', [])
    # (ott, orig) -> list of otu elements
    retained_mapped2otu = {}
    # orig -> list of otu elements
    retained_orig2otu = {}
    # For the first (entirely retained) group of otus:
    #   1. assure that originalLabel is filled in
    #   2. register the otu in retained_mapped2otu and retained_orig2otu
    # otu elements that have no label, originalLabel or ottId will not
    # be registered, so they'll never be matched.
    if len(otus_group_order) > 0:
        otus_group_by_id = nexson['otusById']
        retained_ogi = otus_group_order[0]
        retained_og = otus_group_by_id[retained_ogi]
        retained_og_otu = retained_og.setdefault('otuById', {})
        label_to_original_label_otu_by_id(retained_og_otu)
        for oid, otu in retained_og_otu.items():
            ottid = otu.get('^ot:ottId')
            orig = otu.get('^ot:originalLabel')
            key = (ottid, orig)
            if key != (None, None):
                m = retained_mapped2otu.setdefault(key, [])
                t = (oid, otu)
                m.append(t)
                if orig is not None:
                    m = retained_orig2otu.setdefault(orig, [])
                    m.append(t)
        # For each of the other otus elements, we:
        #   1. assure that originalLabel is filled in
        #   2. decide (for each otu) whether it will be added to
        #      retained_og or merged with an otu already in retained_og.
        #      In the case of the latter, we add to the replaced_otu dict
        #      (old oid as key, new otu as value)
        for ogi in otus_group_order[1:]:
            #_LOG.debug('retained_mapped2otu = {r}'.format(r=retained_mapped2otu))
            og = otus_group_by_id[ogi]
            del otus_group_by_id[ogi]
            otu_by_id = og.get('otuById', {})
            label_to_original_label_otu_by_id(otu_by_id)
            # IDs already claimed as a merge target within THIS group —
            # enforces rule C (no two otus of one group merge to the
            # same retained otu).
            used_matches = set()
            id_to_replace_id[ogi] = retained_ogi
            for oid, otu in otu_by_id.items():
                ottid = otu.get('^ot:ottId')
                orig = otu.get('^ot:originalLabel')
                key = (ottid, orig)
                if key == (None, None):
                    # Unmatchable otu (no label or ott ID): carried over
                    # unmerged.
                    # NOTE(review): this writes into retained_og (the otus
                    # *group* object) rather than retained_og_otu (its
                    # 'otuById' dict) — looks suspicious; confirm intent.
                    retained_og[oid] = otu
                else:
                    match_otu = None
                    # First try an exact (ottId, originalLabel) match...
                    mlist = retained_mapped2otu.get(key)
                    if mlist is not None:
                        for m in mlist:
                            if m[0] not in used_matches:
                                # _LOG.debug('Matching {k} to {m}'.format(k=repr(key), m=repr(m)))
                                match_otu = m
                                break
                            #else:
                            #    _LOG.debug('{k} already in {m}'.format(k=repr(m[0]), m=repr(used_matches)))
                    if match_otu is None:
                        #_LOG.debug('New el: {k} mlist = {m}'.format(k=repr(key), m=repr(mlist)))
                        # ...then fall back to an originalLabel-only match.
                        mlist = retained_orig2otu.get(orig, [])
                        for m in mlist:
                            if m[0] not in used_matches:
                                match_otu = m
                                break
                    if match_otu is not None:
                        # Merge into the retained otu and record the ID
                        # substitution for later reference patch-up.
                        id_to_replace_id[oid] = match_otu[0]
                        used_matches.add(match_otu[0])
                        _merge_otu_do_not_fix_references(otu, match_otu[1])
                    else:
                        # No merge target: keep as a new otu in the
                        # retained group and register it for future merges.
                        assert oid not in retained_og_otu
                        retained_og_otu[oid] = otu
                        m = retained_mapped2otu.setdefault(key, [])
                        t = (oid, otu)
                        m.append(t)
                        if orig is not None:
                            m = retained_orig2otu.setdefault(orig, [])
                            m.append(t)
        nexson['^ot:otusElementOrder'] = [retained_ogi]
        # Move all of the tree elements to the first trees group.
        trees_group_order = nexson.get('^ot:treesElementOrder', [])
        if len(trees_group_order) > 0:
            trees_group_by_id = nexson['treesById']
            retained_tgi = trees_group_order[0]
            retained_tg = trees_group_by_id[retained_tgi]
            # Retained trees group now points at the sole retained otus
            # group.
            retained_tg['@otus'] = retained_ogi
            retained_tg_tree_obj = retained_tg.get('treeById', {})
            for tgi in trees_group_order[1:]:
                tg = trees_group_by_id[tgi]
                del trees_group_by_id[tgi]
                id_to_replace_id[tgi] = retained_tgi
                retained_tg['^ot:treeElementOrder'].extend(tg['^ot:treeElementOrder'])
                for tid, tree_obj in tg.get('treeById', {}).items():
                    retained_tg_tree_obj[tid] = tree_obj
            # Re-point every node's @otu reference at the surviving otu ID.
            for tree_obj in retained_tg_tree_obj.values():
                for node in tree_obj.get('nodeById', {}).values():
                    o = node.get('@otu')
                    if o is not None:
                        r = id_to_replace_id.get(o)
                        if r is not None:
                            node['@otu'] = r
            nexson['^ot:treesElementOrder'] = [retained_tgi]
    # Fix any remaining references in meta/annotations, then restore the
    # caller's original NexSON syntax version.
    replace_entity_references_in_meta_and_annotations(nexson, id_to_replace_id)
    convert_nexson_format(nexson_blob, orig_version)
    return nexson_blob
# --- Fragment: tally or report a study/tree property across all studies.
# `report_ids`, `summarize_as_set`, `check_trees`, `study_prop`, `phy`
# and `out` are defined outside this visible chunk.
if report_ids:
    # property value -> list of id strings where it was seen
    v_dict = {}
else:
    # property value -> occurrence count
    v_dict = defaultdict(int)

def process_val(v, id_str):
    # Record (or immediately print) one property value.
    # NOTE(review): the non-report, non-summarize branch formats
    # `study_id` (a closure over the loop variable below) rather than
    # `id_str` — confirm that is intended.
    if v is not None:
        if report_ids:
            v_dict.setdefault(v, []).append(id_str)
        elif summarize_as_set:
            v_dict[v] += 1
        else:
            out.write(u'{i}: {v}\n'.format(i=study_id, v=v))

for study_id, n in phy.iter_study_objs():
    nexml = get_nexml_el(n)
    if check_trees:
        # Property lives on each tree element.
        for trees_group_id, tree_id, tree in iter_trees(n):
            id_str = 'study: {s} tree: {t}'.format(s=study_id, t=tree_id)
            process_val(tree.get(study_prop), id_str)
    else:
        # Property lives on the study's nexml element.
        process_val(nexml.get(study_prop), study_id)
if report_ids:
    # Most frequently seen values first, with the IDs that carried them.
    as_list = [(len(v), k, v) for k, v in v_dict.items()]
    as_list.sort(reverse=True)
    for n, k, v in as_list:
        out.write(u'{k}\tseen {n:d} times\t{v}\n'.format(k=k, n=n, v='\t'.join(v)))
elif summarize_as_set:
    # (count, value) pairs sorted by descending count; output of this
    # branch continues past the end of this chunk.
    as_list = [(v, k) for k, v in v_dict.items()]
    as_list.sort(reverse=True)
    # (tail of a `report_properties` list whose opening bracket is
    # outside this visible chunk)
    'nominated_study_unique_OTU_count',
    'nominated_study_unmapped_OTU_count',
    'run_time'
]
# Initialize one module-level counter per report property.
# NOTE(review): assigning through locals() only works at module scope
# (where locals() is globals()); inside a function it would silently do
# nothing — confirm this runs at module level.
for prop in report_properties:
    locals()[prop] = 0
# end locals-punching
#################################################
ott_id_set = set()
nominated_ott_id_set = set()
for study_id, n in phy.iter_study_objs():
    reported_study_count += 1
    otu_dict = gen_otu_dict(n)
    # Skip studies with no otus at all.
    if not bool(otu_dict):
        continue
    nex_obj = get_nexml_el(n)
    study_count += 1
    # A study is "nominated" (intended for synthesis) unless it is
    # explicitly flagged otherwise.
    not_intended_for_synth = nex_obj.get('^ot:notIntendedForSynthesis')
    intended_for_synth = (not_intended_for_synth is None) or (not_intended_for_synth is False)
    if intended_for_synth:
        nominated_study_count += 1
        nominated_study_OTU_count += len(otu_dict)
    OTU_count += len(otu_dict)
    for oid, o in otu_dict.items():
        ott_id = o.get('^ot:ottId')
        if ott_id is None:
            # Unmapped otu (no OTT taxon assigned).
            unmapped_OTU_count += 1
            if intended_for_synth:
                nominated_study_unmapped_OTU_count += 1
# --- Fragment: counters for the OTU mapping census, then the main
# accumulation loop (continues past the end of this chunk — it is cut
# off at the dangling `else:`).
unmapped_OTU_count = 0
unique_OTU_count = 0
nominated_study_count = 0
nominated_study_OTU_count = 0
nominated_study_unique_OTU_count = 0
nominated_study_unmapped_OTU_count = 0
run_time = 0
ott_id_set = set()
nominated_ott_id_set = set()
for study_id, n in phy.iter_study_objs():
    reported_study_count += 1
    otu_dict = gen_otu_dict(n)
    # Skip studies with no otus at all.
    if not bool(otu_dict):
        continue
    nex_obj = get_nexml_el(n)
    study_count += 1
    # A study is "nominated" (intended for synthesis) unless explicitly
    # flagged otherwise.
    not_intended_for_synth = nex_obj.get('^ot:notIntendedForSynthesis')
    intended_for_synth = (not_intended_for_synth is None) or (not_intended_for_synth is False)
    if intended_for_synth:
        nominated_study_count += 1
        nominated_study_OTU_count += len(otu_dict)
    OTU_count += len(otu_dict)
    for oid, o in otu_dict.items():
        ott_id = o.get('^ot:ottId')
        if ott_id is None:
            # Unmapped otu (no OTT taxon assigned).
            unmapped_OTU_count += 1
            if intended_for_synth:
                nominated_study_unmapped_OTU_count += 1
        else:
def merge_otus_and_trees(nexson_blob):
    '''Takes a nexson object:
    1. merges trees elements 2 - # trees into the first trees element.,
    2. merges otus elements 2 - # otus into the first otus element.
    3. if there is no ot:originalLabel field for any otu, it sets that
        field based on @label and deletes @label
    4. merges an otu elements using the rule:
        A. treat (ottId, originalLabel) as a key
        B. If otu objects in subsequent trees match originalLabel and
            have a matching or absent ot:ottId, then they are merged into
            the same OTUs (however see C)
        C. No two leaves of a tree may share an otu (though otu should be
            shared across different trees). It is important that each leaf
            node be mapped to a distinct OTU. Otherwise there will be no
            way of separating them during OTU mapping. we do this
            indirectly by assuring to no two otu objects in the same otus
            object get merged with each other (or to a common object)
    5. correct object references to deleted entities.

    This function is used to patch up NexSONs created by multiple imports,
    hence the substitution of '@label' for 'ot:originalLabel'. Ids are
    arbitrary for imports from non-nexml tools, so matching is done based
    on names. This should mimic the behavior of the analysis tools that
    produced the trees (for most/all such tools unique names constitute
    unique OTUs).
    '''
    # Maps each deleted entity's ID to the ID of the entity replacing it;
    # used at the end to patch up dangling references.
    id_to_replace_id = {}
    # Work in the by-id (HBF 1.2) form; the caller's original syntax
    # version is restored just before returning.
    orig_version = detect_nexson_version(nexson_blob)
    convert_nexson_format(nexson_blob, BY_ID_HONEY_BADGERFISH)
    nexson = get_nexml_el(nexson_blob)
    otus_group_order = nexson.get('^ot:otusElementOrder', [])
    # (ott, orig) -> list of otu elements
    retained_mapped2otu = {}
    # orig -> list of otu elements
    retained_orig2otu = {}
    # For the first (entirely retained) group of otus:
    #   1. assure that originalLabel is filled in
    #   2. register the otu in retained_mapped2otu and retained_orig2otu
    # otu elements that have no label, originalLabel or ottId will not
    # be registered, so they'll never be matched.
    if len(otus_group_order) > 0:
        otus_group_by_id = nexson['otusById']
        retained_ogi = otus_group_order[0]
        retained_og = otus_group_by_id[retained_ogi]
        retained_og_otu = retained_og.setdefault('otuById', {})
        label_to_original_label_otu_by_id(retained_og_otu)
        for oid, otu in retained_og_otu.items():
            ottid = otu.get('^ot:ottId')
            orig = otu.get('^ot:originalLabel')
            key = (ottid, orig)
            if key != (None, None):
                m = retained_mapped2otu.setdefault(key, [])
                t = (oid, otu)
                m.append(t)
                if orig is not None:
                    m = retained_orig2otu.setdefault(orig, [])
                    m.append(t)
        # For each of the other otus elements, we:
        #   1. assure that originalLabel is filled in
        #   2. decide (for each otu) whether it will be added to
        #      retained_og or merged with an otu already in retained_og.
        #      In the case of the latter, we add to the replaced_otu dict
        #      (old oid as key, new otu as value)
        for ogi in otus_group_order[1:]:
            #_LOG.debug('retained_mapped2otu = {r}'.format(r=retained_mapped2otu))
            og = otus_group_by_id[ogi]
            del otus_group_by_id[ogi]
            otu_by_id = og.get('otuById', {})
            label_to_original_label_otu_by_id(otu_by_id)
            # IDs already claimed as a merge target within THIS group —
            # enforces rule C from the docstring.
            used_matches = set()
            id_to_replace_id[ogi] = retained_ogi
            for oid, otu in otu_by_id.items():
                ottid = otu.get('^ot:ottId')
                orig = otu.get('^ot:originalLabel')
                key = (ottid, orig)
                if key == (None, None):
                    # Unmatchable otu (no label or ott ID): carried over
                    # unmerged.
                    # NOTE(review): writes into retained_og (the otus
                    # *group* object) rather than retained_og_otu (its
                    # 'otuById' dict) — looks suspicious; confirm intent.
                    retained_og[oid] = otu
                else:
                    match_otu = None
                    # First try an exact (ottId, originalLabel) match...
                    mlist = retained_mapped2otu.get(key)
                    if mlist is not None:
                        for m in mlist:
                            if m[0] not in used_matches:
                                # _LOG.debug('Matching {k} to {m}'.format(k=repr(key), m=repr(m)))
                                match_otu = m
                                break
                            #else:
                            #    _LOG.debug('{k} already in {m}'.format(k=repr(m[0]), m=repr(used_matches)))
                    if match_otu is None:
                        #_LOG.debug('New el: {k} mlist = {m}'.format(k=repr(key), m=repr(mlist)))
                        # ...then fall back to an originalLabel-only match.
                        mlist = retained_orig2otu.get(orig, [])
                        for m in mlist:
                            if m[0] not in used_matches:
                                match_otu = m
                                break
                    if match_otu is not None:
                        # Merge into the retained otu and record the ID
                        # substitution for later reference patch-up.
                        id_to_replace_id[oid] = match_otu[0]
                        used_matches.add(match_otu[0])
                        _merge_otu_do_not_fix_references(otu, match_otu[1])
                    else:
                        # No merge target: keep as a new otu in the
                        # retained group and register it for future merges.
                        assert oid not in retained_og_otu
                        retained_og_otu[oid] = otu
                        m = retained_mapped2otu.setdefault(key, [])
                        t = (oid, otu)
                        m.append(t)
                        if orig is not None:
                            m = retained_orig2otu.setdefault(orig, [])
                            m.append(t)
        nexson['^ot:otusElementOrder'] = [retained_ogi]
        # Move all of the tree elements to the first trees group.
        trees_group_order = nexson.get('^ot:treesElementOrder', [])
        if len(trees_group_order) > 0:
            trees_group_by_id = nexson['treesById']
            retained_tgi = trees_group_order[0]
            retained_tg = trees_group_by_id[retained_tgi]
            # Retained trees group now points at the sole retained otus
            # group.
            retained_tg['@otus'] = retained_ogi
            retained_tg_tree_obj = retained_tg.get('treeById', {})
            for tgi in trees_group_order[1:]:
                tg = trees_group_by_id[tgi]
                del trees_group_by_id[tgi]
                id_to_replace_id[tgi] = retained_tgi
                retained_tg['^ot:treeElementOrder'].extend(tg['^ot:treeElementOrder'])
                for tid, tree_obj in tg.get('treeById', {}).items():
                    retained_tg_tree_obj[tid] = tree_obj
            # Re-point every node's @otu reference at the surviving otu ID.
            for tree_obj in retained_tg_tree_obj.values():
                for node in tree_obj.get('nodeById', {}).values():
                    o = node.get('@otu')
                    if o is not None:
                        r = id_to_replace_id.get(o)
                        if r is not None:
                            node['@otu'] = r
            nexson['^ot:treesElementOrder'] = [retained_tgi]
    # Fix any remaining references in meta/annotations, then restore the
    # caller's original NexSON syntax version.
    replace_entity_references_in_meta_and_annotations(nexson, id_to_replace_id)
    convert_nexson_format(nexson_blob, orig_version)
    return nexson_blob
    # (tail of a loop whose header is outside this chunk; `study` and
    # `tree` are raw study/tree identifier strings, `study2tree` the
    # accumulator dict)
    if len(study) == 1:
        # zero-pad single-digit study numbers before building the pg_ ID
        study = '0' + study
    study2tree.setdefault('pg_' + study, []).append('tree' + tree)
# Flag the collected trees as candidates for synthesis in the local
# phylesystem checkout.
pa = PhylesystemAPI(get_from='local')
raw_phylsys = pa.phylesystem_obj
nexson_version = raw_phylsys.repo_nexml2json
for study_id, tree_list in study2tree.items():
    if verbose:
        sys.stderr.write('treelist={t} for study {s}.\n'.format(t=str(tree_list), s=study_id))
    try:
        fp = raw_phylsys.get_filepath_for_study(study_id)
        blob = read_as_json(fp)
        nex = get_nexml_el(blob)
        # Add each tree to the study's candidate list (idempotently).
        prev = nex.setdefault('^ot:candidateTreeForSynthesis', [])
        for tree_id in tree_list:
            if tree_id not in prev:
                prev.append(tree_id)
            i_t_o_list = extract_tree_nexson(blob, tree_id, nexson_version)
            if not i_t_o_list:
                sys.stderr.write('tree {t} of study {s} not found !!!\n'.format(t=tree_id, s=study_id))
            # Mark each found tree as rooted at its recorded root node.
            for tid, tree, otus_group in i_t_o_list:
                tree['^ot:unrootedTree'] = False
                tree['^ot:specifiedRoot'] = tree['^ot:rootNodeId']
        if not dry_run:
            write_as_json(blob, fp)
    except KeyError:
        sys.stderr.write('study {} not found !!!\n'.format(study_id))
#!/usr/bin/env python """Examines the tags (ot:tag) study. Prints out a list of each unique tag used in the studies """ from peyotl.manip import iter_trees from peyotl.phylesystem.phylesystem_umbrella import Phylesystem from peyotl.nexson_syntax import get_nexml_el from collections import defaultdict import codecs import sys phy = Phylesystem() study_dict = defaultdict(int) tree_dict = defaultdict(int) out = codecs.getwriter("utf-8")(sys.stdout) for study_id, n in phy.iter_study_objs(): nexml = get_nexml_el(n) t = nexml.get("^ot:tag") if t: # print study_id, t if isinstance(t, list): for tag in t: study_dict[tag] += 1 else: study_dict[t] += 1 for trees_group_id, tree_id, tree in iter_trees(n): t = tree.get("^ot:tag") if t: # print study_id, tree_id, t if isinstance(t, list): for tag in t: study_dict[tag] += 1
def _main():
    # CLI entry point: converts between NeXML and the NexSON variants
    # (badgerfish 0.0, direct honeybadgerfish 1.0, by-id 1.2).
    import sys, codecs, json, os
    import argparse
    _HELP_MESSAGE = '''NeXML/NexSON converter'''
    _EPILOG = '''UTF-8 encoding is used (for input and output).

Environmental variables used:
    NEXSON_INDENTATION_SETTING indentation in NexSON (default 0)
    NEXML_INDENTATION_SETTING indentation in NeXML (default is 0).
    NEXSON_LOGGING_LEVEL logging setting: NotSet, Debug, Warn, Info, Error
    NEXSON_LOGGING_FORMAT format string for logging messages.
'''
    parser = argparse.ArgumentParser(description=_HELP_MESSAGE,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=_EPILOG)
    parser.add_argument("input", help="filepath to input")
    parser.add_argument("-o", "--output", metavar="FILE", required=False,
                        help="output filepath. Standard output is used if omitted.")
    parser.add_argument("-s", "--sort", action="store_true", default=False,
                        help="If specified, the arbitrarily ordered items will be sorted.")
    # Accept both the canonical version strings and their short aliases.
    e_choices = ["nexml",
                 str(BADGER_FISH_NEXSON_VERSION),
                 str(DIRECT_HONEY_BADGERFISH),
                 str(BY_ID_HONEY_BADGERFISH),
                 "0.0", "1.0", "1.2",
                 "badgerfish"]
    e_choices.sort()
    e_help = 'output format. Valid choices are: "{c}". \
With "0.0" and "badgerfish" as aliases for "0.0.0", and \
"1.2" being an alias for the most recent version of honeybadgerfish \
(1.2.0). The verions "1.0.0" and its alias "1.0" refer to a \
version that uses the honeybadgefish syntax for meta elements, \
but maintained the direct object-mapping from NeXML of the \
badgerfish form of NexSON'.format(c='", "'.join(e_choices))
    parser.add_argument("-e", "--export", metavar="FMT", required=False,
                        choices=e_choices,
                        help=e_help)
    # Two-letter {input}{output} codes: x=NeXML, j=NexSON, b=BadgerFish.
    codes = 'xjb'
    parser.add_argument("-m", "--mode", metavar="MODE", required=False,
                        choices=[i + j for i in codes for j in codes],
                        help="A less precise way to specify a mapping. The \
m option is a two-letter code for {input}{output} \
The letters are x for NeXML, j for NexSON, \
and b for BadgerFish JSON version of NexML. \
The default behavior is to autodetect the format \
and convert JSON to NeXML or NeXML to NexSON.")
    args = parser.parse_args()
    inpfn = args.input
    outfn = args.output
    mode = args.mode
    export_format = args.export
    # Normalize the export aliases to canonical version strings.
    if export_format:
        if export_format.lower() in ["badgerfish", "0.0"]:
            export_format = str(BADGER_FISH_NEXSON_VERSION)
        elif export_format.lower() == "1.0":
            export_format = str(DIRECT_HONEY_BADGERFISH)
        elif export_format.lower() == "1.2":
            export_format = str(BY_ID_HONEY_BADGERFISH)
    # Reject contradictory --mode/--export combinations.
    if export_format is not None and mode is not None:
        if (mode.endswith('b') and (export_format != str(BADGER_FISH_NEXSON_VERSION))) \
           or (mode.endswith('x') and (export_format.lower() != "nexml")) \
           or (mode.endswith('x') and (export_format.lower() not in [str(DIRECT_HONEY_BADGERFISH)])):
            sys.exit('export format {e} clashes with mode {m}. The mode option is not neeeded if the export option is used.'.format(e=export_format, m=mode))
    try:
        # NOTE(review): mode 'rU' is deprecated in Python 3 — confirm the
        # targeted interpreter version.
        inp = codecs.open(inpfn, mode='rU', encoding='utf-8')
    except:
        sys.exit('nexson_nexml: Could not open file "{fn}"\n'.format(fn=inpfn))
    if mode is None:
        # Autodetect: peek at the first non-whitespace character.
        # '<' => XML input; '{' or '[' => JSON input.
        try:
            while True:
                first_graph_char = inp.read(1).strip()
                if first_graph_char == '<':
                    mode = 'x*'
                    break
                elif first_graph_char in '{[':
                    mode = '*x'
                    break
                elif first_graph_char:
                    raise ValueError('Expecting input to start with <, {, or [')
        except:
            # NOTE(review): this message contains a bare '{' and is passed
            # through str.format — that raises ValueError at runtime;
            # latent bug, confirm.
            sys.exit('nexson_nexml: First character of "{fn}" was not <, {, or [\nInput does not appear to be NeXML or NexSON\n'.format(fn=inpfn))
        if export_format is None:
            if mode.endswith('*'):
                export_format = str(DIRECT_HONEY_BADGERFISH)
            else:
                export_format = "nexml"
        inp.seek(0)
    elif export_format is None:
        # Mode given but no export format: infer it from the output code.
        if mode.endswith('j'):
            export_format = str(DIRECT_HONEY_BADGERFISH)
        elif mode.endswith('b'):
            export_format = str(BADGER_FISH_NEXSON_VERSION)
        else:
            assert mode.endswith('x')
            export_format = "nexml"
    # Indentation comes from the environment; separate knobs for the two
    # output syntaxes.
    if export_format == "nexml":
        indentation = int(os.environ.get('NEXML_INDENTATION_SETTING', 0))
    else:
        indentation = int(os.environ.get('NEXSON_INDENTATION_SETTING', 0))
    if outfn is not None:
        try:
            out = codecs.open(outfn, mode='w', encoding='utf-8')
        except:
            sys.exit('nexson_nexml: Could not open output filepath "{fn}"\n'.format(fn=outfn))
    else:
        out = codecs.getwriter('utf-8')(sys.stdout)
    if mode.startswith('x'):
        # XML input: parse NeXML straight into the requested NexSON form.
        blob = get_ot_study_info_from_nexml(inp, nexson_syntax_version=export_format)
    else:
        blob = json.load(inp)
        if mode.startswith('*'):
            # Input format was autodetected as JSON; verify it really is
            # a JSON rendering of NeXML before committing to mode 'j?'.
            try:
                n = get_nexml_el(blob)
            except:
                n = None
            if not n or (not isinstance(n, dict)):
                sys.exit('No top level "nex:nexml" element found. Document does not appear to be a JSON version of NeXML\n')
            if n:
                mode = 'j' + mode[1]
    if args.sort:
        sort_arbitrarily_ordered_nexson(blob)
    if export_format == "nexml":
        if indentation > 0:
            indent = ' '*indentation
        else:
            indent = ''
        newline = '\n'
        write_obj_as_nexml(blob, out, addindent=indent, newl=newline)
    else:
        # JSON output: convert between NexSON flavors unless the blob was
        # already produced in the target syntax from XML input.
        if not mode.startswith('x'):
            blob = convert_nexson_format(blob, export_format, sort_arbitrary=True)
        write_as_json(blob, out, indent=indentation)
def addStudy(session, study_id):
    # Load one study from the local phylesystem and insert it (study,
    # curators, trees, otu/taxonomy links) into the database via the
    # given SQLAlchemy `session`.  (Python 2 code: print statements,
    # basestring.)
    # get latest version of nexson
    print "adding study {s}".format(s=study_id)
    phy = PhylesystemAPI(get_from="local")
    studyobj = phy.get_study(study_id)["data"]
    nexml = get_nexml_el(studyobj)
    year = nexml.get("^ot:studyYear")
    proposedTrees = nexml.get("^ot:candidateTreeForSynthesis")
    if proposedTrees is None:
        proposedTrees = []
    # create a new Study object
    new_study = Study(id=study_id, year=year)
    session.add(new_study)
    # session.commit()
    # get curator(s), noting that ot:curators might be a
    # string or a list
    c = nexml.get("^ot:curatorName")
    print " ot:curatorName: ", c
    # create list of curator objects
    curator_list = []
    if isinstance(c, basestring):
        curator_list.append(c)
    else:
        # NOTE(review): when ^ot:curatorName is absent this leaves
        # curator_list = None and the loop below raises — confirm.
        curator_list = c
    for curator in curator_list:
        # Reuse an existing Curator row when one with this name exists.
        test_c = session.query(Curator).filter(Curator.name == curator).first()
        if test_c:
            print "curator {c} already exists".format(c=curator)
            # session.add(curator)
            new_study.curators.append(test_c)
        else:
            print "curator {c} does no exist".format(c=curator)
            new_study.curators.append(Curator(name=curator))
    # mapped otus in this study
    otu_dict = gen_otu_dict(studyobj)
    # iterate over the OTUs in the study, collecting the mapped
    # ones (oid to ott_id mapping held at the study level)
    mapped_otus = {}
    for oid, o in otu_dict.items():
        ottID = o.get("^ot:ottId")
        if ottID is not None:
            mapped_otus[oid] = ottID
    # iterate over trees and insert tree data
    for trees_group_id, tree_id, tree in iter_trees(studyobj):
        print " tree :", tree_id
        proposedForSynth = False
        if tree_id in proposedTrees:
            proposedForSynth = True
        treejson = json.dumps(tree)
        new_tree = Tree(tree_id=tree_id, study_id=study_id, proposed=proposedForSynth, data=treejson)
        # get otus
        ottIDs = set()  # ott ids for this tree
        ntips = 0
        for node_id, node in iter_node(tree):
            oid = node.get("@otu")
            # no @otu property on internal nodes
            if oid is not None:
                ntips += 1
                # ottID = mapped_otus[oid]
                if oid in mapped_otus:
                    ottID = mapped_otus[oid]
                    # check that this exists in the taxonomy
                    # (it might not, if the ID has been deprecated)
                    taxon = session.query(Taxonomy).filter(Taxonomy.id == ottID).first()
                    if taxon:
                        new_tree.otus.append(taxon)
                        ottIDs.add(ottID)
        new_tree.ntips = ntips
        # need to write function for recursive query of Taxonomy table
        # ottIDs = parent_closure(ottIDs,taxonomy)
        # update with treebase id, if exists
        datadeposit = nexml.get("^ot:dataDeposit")
        if datadeposit:
            url = datadeposit["@href"]
            # TreeBASE deposits look like ".../TB2:<id>"; capture the id.
            pattern = re.compile(u".+TB2:(.+)$")
            matchobj = re.match(pattern, url)
            if matchobj:
                tb_id = matchobj.group(1)
                new_tree.treebase_id = tb_id
        session.add(new_tree)
    # now that we have added the tree info, update the study record
    # with the json data (minus the tree info)
    del nexml["treesById"]
    studyjson = json.dumps(nexml)
    new_study.data = studyjson
    session.commit()
def add_study(study_id):
    # Load one study from phylesystem and persist it (study, curators,
    # trees, otu->taxonomy lineage links) through the global DBSession.
    # Raises HTTPNotFound when the study is absent.  (Python 2 code:
    # uses basestring.)
    _LOG.debug('adding study {s}'.format(s=study_id))
    # get latest version of nexson
    # location of repo (test vs dev) dependent on peyotl config
    phy = create_phylesystem_obj()
    try:
        studyobj = phy.get_study(study_id)['data']
    except:
        _LOG.debug('did not find study {s} in phylesystem'.format(s=study_id))
        raise HTTPNotFound("Study {s} not found in phylesystem".format(s=study_id))
    nexml = get_nexml_el(studyobj)
    proposedTrees = nexml.get('^ot:candidateTreeForSynthesis')
    if proposedTrees is None:
        proposedTrees = []
    # create a new Study object
    new_study = Study(id=study_id)
    DBSession.add(new_study)
    # update with treebase id, if exists
    datadeposit = nexml.get('^ot:dataDeposit')
    if (datadeposit):
        url = datadeposit['@href']
        if (url):
            # TreeBASE deposits look like ".../TB2:<id>"; capture the id.
            pattern = re.compile(u'.+TB2:(.+)$')
            matchobj = re.match(pattern,url)
            if (matchobj):
                tb_id = matchobj.group(1)
                new_study.treebase_id=tb_id
    # get curator(s), noting that ot:curators might be a
    # string or a list
    c = nexml.get('^ot:curatorName')
    # create list of curator objects
    curator_list=[]
    if (isinstance(c,basestring)):
        curator_list.append(c)
    else:
        # NOTE(review): when ^ot:curatorName is absent this leaves
        # curator_list = None and the loop below raises — confirm.
        curator_list = c
    for curator in curator_list:
        # Reuse an existing Curator row when one with this name exists.
        test_c = DBSession.query(Curator).filter(Curator.name==curator).first()
        if test_c:
            _LOG.debug("curator {c} already exists".format(c=curator))
            #DBSession.add(curator)
            new_study.curators.append(test_c)
        else:
            _LOG.debug("curator {c} does not yet exist".format(c=curator))
            new_study.curators.append(Curator(name=curator))
    # mapped otus in this study
    otu_dict = gen_otu_dict(studyobj)
    # iterate over the OTUs in the study, collecting the mapped
    # ones (oid to ott_id mapping held at the study level)
    mapped_otus = {}
    for oid, o in otu_dict.items():
        ottID = o.get('^ot:ottId')
        if ottID is not None:
            mapped_otus[oid]=ottID
    # iterate over trees and insert tree data
    ntrees = 0
    for trees_group_id, tree_id, tree in iter_trees(studyobj):
        _LOG.debug(' tree : {t}'.format(t=tree_id))
        ntrees+=1
        proposedForSynth = False
        if (tree_id in proposedTrees):
            proposedForSynth = True
        treejson = json.dumps(tree)
        new_tree = Tree(
            tree_id=tree_id,
            study_id=study_id,
            proposed=proposedForSynth,
            data=treejson
        )
        # get otus
        ottIDs = set()  # ott ids for this tree
        ntips=0
        for node_id, node in iter_node(tree):
            oid = node.get('@otu')
            # no @otu property on internal nodes
            if oid is not None:
                ntips+=1
                #ottID = mapped_otus[oid]
                if oid in mapped_otus:
                    ottID = mapped_otus[oid]
                    # _LOG.debug(' mapped ottID: {m}'.format(m=ottID))
                    # check that this exists in the taxonomy
                    # (it might not, if the ID has been deprecated)
                    taxon = DBSession.query(Taxonomy).filter(
                        Taxonomy.id==ottID
                    ).first()
                    if taxon:
                        # Record the full lineage, not just the tip taxon.
                        lineage = get_lineage(ottID)
                        _LOG.debug(' lineage of {m} = {l}'.format(m=ottID,l=lineage))
                        for t in lineage:
                            ottIDs.add(t)
        new_tree.ntips = ntips
        # Link every collected lineage taxon to the tree row.
        for t in ottIDs:
            taxon = DBSession.query(Taxonomy).filter(
                Taxonomy.id==t
            ).first()
            # _LOG.debug(' adding {t},{n} to tree {tid}'.format(
            #     t=t,
            #     n=taxon.name,
            #     tid=tree_id)
            # )
            new_tree.otus.append(taxon)
        # add the tree
        DBSession.add(new_tree)
    # now that we have added the tree info, update the study record
    # with the json data (minus the tree info)
    del nexml['treesById']
    studyjson = json.dumps(nexml)
    new_study.data=studyjson
    new_study.ntrees = ntrees
def load_nexsons(connection,cursor,phy,config_obj,nstudies=None):
    # Bulk-load studies from phylesystem into PostgreSQL via psycopg2.
    # Inserts study rows, curator links, and tree rows (one commit per
    # statement), and finally records the union of study/tree property
    # names. `nstudies` optionally caps how many studies are loaded.
    # (Python 2 code: print statements, basestring.)
    counter = 0
    study_properties = set()
    tree_properties = set()
    for study_id, studyobj in phy.iter_study_objs():
        nexml = get_nexml_el(studyobj)
        #print 'STUDY: ',study_id
        study_properties.update(nexml.keys())
        # study data for study table
        STUDYTABLE = config_obj.get('database_tables','studytable')
        # NOTE(review): `year` is never used in the visible body — confirm
        # whether it should be inserted with the study row.
        year = nexml.get('^ot:studyYear')
        proposedTrees = nexml.get('^ot:candidateTreeForSynthesis')
        if proposedTrees is None:
            proposedTrees = []
        # must insert study before trees
        sqlstring = ("INSERT INTO {tablename} (id) "
                     "VALUES (%s);"
                     .format(tablename=STUDYTABLE)
                     )
        data = (study_id,)
        #print '  SQL: ',cursor.mogrify(sqlstring)
        cursor.execute(sqlstring,data)
        connection.commit()
        # update with treebase id, if exists
        datadeposit = nexml.get('^ot:dataDeposit')
        if (datadeposit):
            url = datadeposit['@href']
            # TreeBASE deposits look like ".../TB2:<id>"; capture the id.
            pattern = re.compile(u'.+TB2:(.+)$')
            matchobj = re.match(pattern,url)
            if (matchobj):
                tb_id = matchobj.group(1)
                sqlstring = ("UPDATE {tablename} "
                             "SET treebase_id=%s "
                             "WHERE id=%s;"
                             .format(tablename=STUDYTABLE)
                             )
                data = (tb_id,study_id)
                #print '  SQL: ',cursor.mogrify(sqlstring,data)
                cursor.execute(sqlstring,data)
                connection.commit()
        # get curator(s), noting that ot:curators might be a
        # string or a list
        c = nexml.get('^ot:curatorName')
        #print ' ot:curatorName: ',c
        curators=[]
        if (isinstance(c,basestring)):
            curators.append(c)
        else:
            curators=c
        # remove duplicates
        curators = list(set(curators))
        insert_curators(connection,cursor,config_obj,study_id,curators)
        # iterate over trees and insert tree data
        # note that OTU data done separately as COPY
        # due to size of table (see script <scriptname>)
        TREETABLE = config_obj.get('database_tables','treetable')
        ntrees = 0
        try:
            for trees_group_id, tree_id, tree in iter_trees(studyobj):
                #print ' tree :' ,tree_id
                ntrees += 1
                proposedForSynth = False
                tree_properties.update(tree.keys())
                if (tree_id in proposedTrees):
                    proposedForSynth = True
                treejson = json.dumps(tree)
                # Count tips: only leaf nodes carry an @otu reference.
                ntips = 0
                for node_id, node in iter_node(tree):
                    oid = node.get('@otu')
                    # no @otu property on internal nodes
                    if oid is not None:
                        ntips+=1
                sqlstring = ("INSERT INTO {tablename} "
                             "(tree_id,study_id,ntips,proposed,data) "
                             "VALUES (%s,%s,%s,%s,%s);"
                             .format(tablename=TREETABLE)
                             )
                data = (tree_id,study_id,ntips,proposedForSynth,treejson)
                #print '  SQL: ',cursor.mogrify(sqlstring,data)
                cursor.execute(sqlstring,data)
                connection.commit()
        except psy.Error as e:
            print e.pgerror
        # now that we have added the tree info, update the study record
        # with the json data (minus the tree info) and ntrees
        del nexml['treesById']
        studyjson = json.dumps(nexml)
        sqlstring = ("UPDATE {tablename} "
                     "SET data=%s,ntrees=%s "
                     "WHERE id=%s;"
                     .format(tablename=STUDYTABLE)
                     )
        data = (studyjson,ntrees,study_id)
        cursor.execute(sqlstring,data)
        connection.commit()
        counter+=1
        # Progress reporting / optional early stop.
        if (counter%500 == 0):
            print "loaded {n} studies".format(n=counter)
        if (nstudies and counter>=nstudies):
            print "finished inserting",nstudies,"studies"
            break
    # load the tree and study properties
    PROPERTYTABLE = config_obj.get('database_tables','propertytable')
    load_properties(
        connection,
        cursor,
        PROPERTYTABLE,
        study_properties,
        tree_properties)