def write_newick_ott(out, ott, ott_id2children, root_ott_id, label_style, prune_flags):
    """Write the subtree rooted at `root_ott_id` to `out` as a newick string.

    `out` is an output stream
    `ott` is an OTT instance used for translating labels
    `ott_id2children` is a dict mapping an OTT ID to the IDs of its children
    `root_ott_id` is the root of the subtree to write.
    `label_style` is a facet of OTULabelStyleEnum
    `prune_flags` is a set of strings (flags) or OTTFlagUnion instance or None

    Nothing at all is written if the root itself carries one of the flags to
    prune. Otherwise the whole subtree is emitted inside one extra enclosing
    pair of parentheses, followed by ';'.
    """
    if prune_flags is not None:
        if not isinstance(prune_flags, OTTFlagUnion):
            prune_flags = ott.convert_flag_string_set_to_union(prune_flags)
        if ott.has_flag_set_key_intersection(root_ott_id, prune_flags):
            return
    # `pending` holds bare IDs for nodes still to be entered and 1-tuples
    # `(ott_id,)` for internal nodes whose children have all been queued.
    pending = [root_ott_id]
    # IDs that start a sibling group (no leading comma) ...
    opens_group = {root_ott_id}
    # ... and IDs that end one (a ')' follows their label).
    closes_group = {root_ott_id}
    out.write('(')
    while pending:
        item = pending.pop()
        entering = not isinstance(item, tuple)
        node_id = item if entering else item[0]
        if entering:
            kept = ott_id2children[node_id]
            if prune_flags is not None:
                kept = [c for c in kept
                        if not ott.has_flag_set_key_intersection(c, prune_flags)]
            if node_id in opens_group:
                opens_group.remove(node_id)
            else:
                out.write(',')
            if kept:
                out.write('(')
                opens_group.add(kept[0])
                closes_group.add(kept[-1])
                # a tuple will signal exiting a node...
                pending.append((node_id,))
                # reversed so that the first child is popped first
                pending.extend(reversed(kept))
                continue
        # Reached for tips on entry and for internal nodes on exit.
        out.write(quote_newick_name(ott.get_label(node_id, label_style)))
        if node_id in closes_group:
            out.write(')')
            closes_group.remove(node_id)
    out.write(';')
def write_newick_ott(out, ott, ott_id2children, root_ott_id, label_style, prune_flags):
    """Serialize the taxonomy below `root_ott_id` in newick format.

    `out` is an output stream
    `ott` is an OTT instance used for translating labels
    `ott_id2children` is a dict mapping an OTT ID to the IDs of its children
    `root_ott_id` is the root of the subtree to write.
    `label_style` is a facet of OTULabelStyleEnum
    `prune_flags` is a set of strings (flags) or OTTFlagUnion instance or None

    Returns without writing anything when the root has a flag in
    `prune_flags`. In every other case the output is '(' + subtree + ')' + ';'
    (the root is wrapped in one additional pair of parentheses).
    """
    pruning = prune_flags is not None
    if pruning:
        if not isinstance(prune_flags, OTTFlagUnion):
            prune_flags = ott.convert_flag_string_set_to_union(prune_flags)
        if ott.has_flag_set_key_intersection(root_ott_id, prune_flags):
            return

    def surviving_children(parent_id):
        # Children of `parent_id`, minus any that carry a flag to prune.
        kids = ott_id2children[parent_id]
        if not pruning:
            return kids
        return [k for k in kids
                if not ott.has_flag_set_key_intersection(k, prune_flags)]

    emit = out.write
    todo = [root_ott_id]
    leftmost = set(todo)   # nodes that are not preceded by a comma
    rightmost = set(todo)  # nodes whose label is followed by ')'
    emit('(')
    while todo:
        current = todo.pop()
        if isinstance(current, tuple):
            # Exit marker: every child has been handled, only the label of
            # the internal node is left to write.
            current = current[0]
        else:
            kids = surviving_children(current)
            try:
                leftmost.remove(current)
            except KeyError:
                # not the first of its siblings
                emit(',')
            if kids:
                emit('(')
                leftmost.add(kids[0])
                rightmost.add(kids[-1])
                todo.append((current,))  # a tuple will signal exiting a node...
                todo.extend(reversed(kids))
                continue
        label = ott.get_label(current, label_style)
        emit(quote_newick_name(label))
        if current in rightmost:
            rightmost.remove(current)
            emit(')')
    emit(';')
def write_newick_ott(out, ott, ott_id2children, root_ott_id, label_style, prune_flags, create_log_dict=False):
    """Write the subtree rooted at `root_ott_id` to `out` in newick format.

    `out` is an output stream
    `ott` is an OTT instance used for translating labels
    `ott_id2children` is a dict mapping an OTT ID to the IDs of its children
    `root_ott_id` is the root of the subtree to write.
    `label_style` is a facet of OTULabelStyleEnum
    `prune_flags` is a set of strings (flags) or OTTFlagUnion instance or None
    if `create_log_dict` is True, a dict will be returned that contains
        statistics about the pruning; otherwise None is returned.

    If the root itself carries a flag to prune, nothing is written to `out`.

    The log dict has the keys 'version', 'flags_to_prune', 'pruned',
    'num_tips', 'num_pruned_anc_nodes', 'num_nodes', 'num_non_leaf_nodes',
    'num_non_leaf_nodes_with_multiple_children' and 'num_monotypic_nodes'.
    'pruned' maps a comma-joined, sorted list of the requested flags that
    caused a prune to a dict holding those flags ('flags_causing_prune') and
    the IDs of the nodes that were cut. Flag-set keys that boil down to the
    same list of requested flags are merged into one entry rather than
    overwriting each other.
    """
    # create to_prune_fsi_set a set of flag set indices to prune...
    if prune_flags:
        flags_to_prune_list = list(prune_flags)
        to_prune_fsi_set = ott.convert_flag_string_set_to_union(flags_to_prune_list)
    else:
        flags_to_prune_list = []
        to_prune_fsi_set = None
    log_dict = None
    # pfd: flag set index -> sorted list of the requested flags found in it
    pfd = {}
    # pruned_dict: flag set index -> {list name: [pruned OTT IDs]}
    pruned_dict = {}
    if create_log_dict:
        log_dict = {'version': ott.version, 'flags_to_prune': flags_to_prune_list}
        if to_prune_fsi_set:
            flags_to_prune_set = frozenset(flags_to_prune_list)
            fsi_to_flags = dict(ott.flag_set_id_to_flag_set)
            for fsi in to_prune_fsi_set.keys():
                pfd[fsi] = sorted(flags_to_prune_set.intersection(fsi_to_flags[fsi]))
    num_tips = 0
    num_pruned_anc_nodes = 0
    num_nodes = 0
    num_monotypic_nodes = 0
    if to_prune_fsi_set and ott.has_flag_set_key_intersection(root_ott_id, to_prune_fsi_set):
        # entire taxonomy is pruned off
        if log_dict is not None:
            fsi = ott.get_flag_set_key(root_ott_id)
            pruned_dict[fsi] = {'': [root_ott_id]}
        num_pruned_anc_nodes += 1
    else:
        # The stack holds bare IDs (nodes to enter) and 1-tuples (internal
        # nodes to exit once all of their children have been written).
        stack = [root_ott_id]
        first_children = set(stack)
        last_children = set()
        while stack:
            ott_id = stack.pop()
            if isinstance(ott_id, tuple):
                ott_id = ott_id[0]
            else:
                num_nodes += 1
                children = ott_id2children[ott_id]
                if to_prune_fsi_set is not None:
                    c = []
                    for child_id in children:
                        if ott.has_flag_set_key_intersection(child_id, to_prune_fsi_set):
                            if log_dict is not None:
                                fsi = ott.get_flag_set_key(child_id)
                                fd = pruned_dict.get(fsi)
                                if fd is None:
                                    pruned_dict[fsi] = {'anc_ott_id_pruned': [child_id]}
                                else:
                                    fd['anc_ott_id_pruned'].append(child_id)
                            num_pruned_anc_nodes += 1
                        else:
                            c.append(child_id)
                    children = c
                # Tallied for every node entered, with or without pruning, so
                # the statistics are also meaningful when nothing is pruned.
                nc = len(children) if children else 0
                if nc == 1:
                    num_monotypic_nodes += 1
                elif nc == 0:
                    num_tips += 1
                if ott_id not in first_children:
                    out.write(',')
                else:
                    first_children.remove(ott_id)
                if children:
                    out.write('(')
                    first_children.add(children[0])
                    last_children.add(children[-1])
                    stack.append((ott_id,))  # a tuple will signal exiting a node...
                    stack.extend(reversed(children))
                    continue
            n = ott.get_label(ott_id, label_style)
            n = quote_newick_name(n)
            out.write(n)
            if ott_id in last_children:
                out.write(')')
                last_children.remove(ott_id)
        out.write(';')
    if log_dict is not None:
        pruned_log = {}
        for fsi, obj in pruned_dict.items():
            f = pfd[fsi]
            nk = ','.join(f)
            merged = pruned_log.get(nk)
            if merged is None:
                obj['flags_causing_prune'] = f
                pruned_log[nk] = obj
            else:
                # A different flag set index produced the same key: fold its
                # ID lists into the existing entry instead of replacing it.
                for list_name, id_list in obj.items():
                    merged.setdefault(list_name, []).extend(id_list)
        log_dict['pruned'] = pruned_log
        log_dict['num_tips'] = num_tips
        log_dict['num_pruned_anc_nodes'] = num_pruned_anc_nodes
        log_dict['num_nodes'] = num_nodes
        log_dict['num_non_leaf_nodes'] = num_nodes - num_tips
        log_dict['num_non_leaf_nodes_with_multiple_children'] = num_nodes - num_tips - num_monotypic_nodes
        log_dict['num_monotypic_nodes'] = num_monotypic_nodes
    return log_dict
# Read the pruned, labelled synthesis tree and write a copy to
# 'ottnamelabelledtree.tre' in which every `ott<ID>` label that directly
# follows '(', ',' or ')' is replaced by the name OTT has for that ID
# (followed by ' ott<ID>' when args.keep_ottids is set).
ottpattern = re.compile(r"([(,)])(ott)(\d+)")
# NOTE(review): mrcapattern is not used in this section; it is kept in case
# code outside this view refers to it.
mrcapattern = re.compile(r"([(,)])mrcaott(\d+)ott(\d+)")
pos = 0
ott = OTT(ott_dir=args.ott_dir)
# load up the OTT dictionary...
d = ott.ott_id_to_names
# Read the input before opening the output so that a missing or unreadable
# input file does not leave a truncated output file behind.
with open(args.newick_file, 'r') as f:
    newick = f.read()
# The `with` block closes the output file even if a lookup or write fails.
with codecs.open('ottnamelabelledtree.tre', 'w', encoding='utf-8') as outfile:
    for m in ottpattern.finditer(newick):
        ottid = int(m.group(3))
        ottresults = d[ottid]
        ottname = ottresults
        if isinstance(ottresults, tuple):
            # several names are stored for this ID: use the first one
            ottname = ottresults[0]
        # Single-argument call form: prints the same text on Python 2 and 3.
        print('%s %s' % (m.group(3), ottname))
        # copy everything between the previous match and this one verbatim
        outfile.write(newick[pos:m.start()])
        outfile.write(m.group(1))
        if args.keep_ottids:
            label = quote_newick_name('{} ott{}'.format(ottname, ottid))
        else:
            label = quote_newick_name('{}'.format(ottname))
        outfile.write(label)
        pos = m.end()
    # whatever follows the last match
    outfile.write(newick[pos:])
def write_newick_ott(out, ott, ott_id2children, root_ott_id, label_style, prune_flags, create_log_dict=False):
    """Emit the taxonomy below `root_ott_id` as newick, optionally pruning.

    `out` is an output stream
    `ott` is an OTT instance used for translating labels
    `ott_id2children` is a dict mapping an OTT ID to the IDs of its children
    `root_ott_id` is the root of the subtree to write.
    `label_style` is a facet of OTULabelStyleEnum
    `prune_flags` is a set of strings (flags) or OTTFlagUnion instance or None
    if `create_log_dict` is True, a dict will be returned that contains
        statistics about the pruning. Without it the return value is None.

    A node is dropped (together with everything below it) when
    `ott.has_flag_set_key_intersection` reports a hit against the flags to
    prune. If that is true of the root, `out` is left untouched.

    In the returned log, 'pruned' is keyed by the comma-joined sorted flags
    responsible for a prune. Entries from distinct flag sets that share the
    same responsible flags are combined, so no pruned ID is lost from the log.
    """
    # create to_prune_fsi_set a set of flag set indices to prune...
    flags_to_prune_list = list(prune_flags) if prune_flags else []
    to_prune_fsi_set = None
    if prune_flags:
        to_prune_fsi_set = ott.convert_flag_string_set_to_union(flags_to_prune_list)

    logging_on = bool(create_log_dict)
    log_dict = None
    pfd = {}          # flag set index -> sorted requested flags it contains
    pruned_dict = {}  # flag set index -> {list name: [pruned OTT IDs]}
    if logging_on:
        log_dict = {'version': ott.version, 'flags_to_prune': flags_to_prune_list}
        if to_prune_fsi_set:
            requested = frozenset(flags_to_prune_list)
            flag_sets_by_index = dict(ott.flag_set_id_to_flag_set)
            for fsi in to_prune_fsi_set.keys():
                pfd[fsi] = sorted(requested.intersection(flag_sets_by_index[fsi]))

    def record_pruned(node_id, list_name):
        # File `node_id` under the flag set index that got it pruned.
        fsi = ott.get_flag_set_key(node_id)
        pruned_dict.setdefault(fsi, {}).setdefault(list_name, []).append(node_id)

    num_tips = 0
    num_pruned_anc_nodes = 0
    num_nodes = 0
    num_monotypic_nodes = 0
    root_is_pruned = bool(to_prune_fsi_set) and ott.has_flag_set_key_intersection(
        root_ott_id, to_prune_fsi_set)
    if root_is_pruned:
        # entire taxonomy is pruned off
        if logging_on:
            record_pruned(root_ott_id, '')
        num_pruned_anc_nodes += 1
    else:
        stack = [root_ott_id]
        first_children = {root_ott_id}
        last_children = set()
        while stack:
            ott_id = stack.pop()
            if isinstance(ott_id, tuple):
                # exit marker pushed when the node's children were queued
                ott_id = ott_id[0]
            else:
                num_nodes += 1
                children = ott_id2children[ott_id]
                if to_prune_fsi_set is not None:
                    retained = []
                    for child_id in children:
                        if ott.has_flag_set_key_intersection(child_id, to_prune_fsi_set):
                            if logging_on:
                                record_pruned(child_id, 'anc_ott_id_pruned')
                            num_pruned_anc_nodes += 1
                        else:
                            retained.append(child_id)
                    children = retained
                # Count tips and monotypic nodes for every node entered, so
                # the totals are right whether or not pruning is active.
                nc = len(children) if children else 0
                if nc == 0:
                    num_tips += 1
                elif nc == 1:
                    num_monotypic_nodes += 1
                if ott_id in first_children:
                    first_children.remove(ott_id)
                else:
                    out.write(',')
                if nc:
                    out.write('(')
                    first_children.add(children[0])
                    last_children.add(children[-1])
                    stack.append((ott_id,))  # a tuple will signal exiting a node...
                    stack.extend(reversed(children))
                    continue
            out.write(quote_newick_name(ott.get_label(ott_id, label_style)))
            if ott_id in last_children:
                out.write(')')
                last_children.remove(ott_id)
        out.write(';')

    if not logging_on:
        return None
    by_flags = {}
    for fsi, obj in pruned_dict.items():
        flags = pfd[fsi]
        nk = ','.join(flags)
        if nk not in by_flags:
            obj['flags_causing_prune'] = flags
            by_flags[nk] = obj
            continue
        # Same responsible flags as an earlier flag set index: merge the ID
        # lists instead of overwriting the earlier entry.
        target = by_flags[nk]
        for list_name, id_list in obj.items():
            target.setdefault(list_name, []).extend(id_list)
    log_dict['pruned'] = by_flags
    log_dict['num_tips'] = num_tips
    log_dict['num_pruned_anc_nodes'] = num_pruned_anc_nodes
    log_dict['num_nodes'] = num_nodes
    log_dict['num_non_leaf_nodes'] = num_nodes - num_tips
    log_dict['num_non_leaf_nodes_with_multiple_children'] = num_nodes - num_tips - num_monotypic_nodes
    log_dict['num_monotypic_nodes'] = num_monotypic_nodes
    return log_dict