def read_treemix_file2(filename_treeout, filename_vertices, filename_edges, outgroup=None):
    """Build a tree from a treeout file, a vertices file and an edges file.

    Any of the three paths ending in '.gz' is first passed through ``unzip``
    and the returned path is used instead.

    Args:
        filename_treeout: file whose first line is skipped and whose
            remaining lines (right-stripped) go to ``parse_admixtures2``.
        filename_vertices: file handed to ``read_vertices``.
        filename_edges: file handed to ``get_edge_lengths2``.
        outgroup: if not None, the tree is passed through
            ``rearrange_root(tree, outgroup)`` before being returned.

    Returns:
        The tree produced by ``make_Rtree`` (re-rooted when ``outgroup`` is
        given).

    Side effects:
        Prints the vertex dictionary and the pretty-printed tree to stdout
        (and the re-rooted tree as well when ``outgroup`` is given).
    """
    # Decompress in the order treeout, vertices, edges.
    filename_treeout, filename_vertices, filename_edges = [
        unzip(path) if path.endswith('.gz') else path
        for path in (filename_treeout, filename_vertices, filename_edges)
    ]

    with open(filename_treeout, 'r') as treeout:
        # The first line is consumed but not needed by this reader.
        treeout.readline()
        admixture_lines = [line.rstrip() for line in treeout.readlines()]
        admixtures = parse_admixtures2(admixture_lines)

    vertices = read_vertices(filename_vertices)
    print(vertices)
    edge_lengths = get_edge_lengths2(filename_edges)

    result = make_Rtree(edge_lengths, vertices, admixtures)
    print(pretty_string(result))

    if outgroup is None:
        return result

    result = rearrange_root(result, outgroup)
    print('after rearrangement')
    print(pretty_string(result))
    return result
def make_single_files(filename, blocksize, no_blocks, prefix='', verbose_level='normal'):
    """Split a file into consecutive blocks of lines, one gzipped file per block.

    The first line of ``filename`` is copied to the top of every output file;
    the remaining lines are partitioned by ``get_partitions``.

    Args:
        filename: input path; if it ends in '.gz' it is passed through
            ``unzip`` first and the returned path is read.
        blocksize: number of lines per block. Ignored when ``no_blocks`` is
            given.
        no_blocks: number of blocks. When not None, the block size is
            ``len(lines) // no_blocks``.
        prefix: string prepended (plain concatenation, not a path join) to
            the base name of ``filename`` to form the output file names
            ``<prefix><basename>boot.<i>``.
        verbose_level: any value other than 'silent' prints a short summary
            to stdout.

    Returns:
        A tuple ``(filenames, header_fields)`` where ``filenames`` holds the
        value returned by ``gzip`` for each block file (presumably the
        compressed path -- confirm against ``gzip``) and ``header_fields`` is
        the first line split on whitespace.

    Raises:
        AssertionError: if both ``blocksize`` and ``no_blocks`` are None.
    """
    assert (blocksize is not None) or (no_blocks is not None), \
        'Has to specify either block size or number of blocks'

    if filename.endswith('.gz'):
        filename = unzip(filename)
    filename_reduced = prefix + os.path.basename(filename) + 'boot.'

    with open(filename, 'r') as f:
        first_line = f.readline()
        lines = f.readlines()

    n = len(lines)
    if no_blocks is not None:
        # Floor division keeps the block size an integer regardless of the
        # interpreter's default division semantics.
        blocksize = n // no_blocks
    line_sets = get_partitions(lines, blocksize)

    if verbose_level != 'silent':
        # Single-string prints produce the same output on Python 2 and 3.
        print('total SNPs= %s' % (n,))
        print('total blocksize %s' % (blocksize,))
        print('len(line_sets) %s' % (len(line_sets),))

    filenames = []
    for i, block_lines in enumerate(line_sets):
        new_filename = filename_reduced + str(i)
        with open(new_filename, 'w') as g:
            g.write(first_line)
            g.writelines(block_lines)
        # Compress only after the file has been closed (and thus flushed).
        filenames.append(gzip(new_filename, overwrite=True))
    return filenames, first_line.split()
def read_treemix_file(filename_treeout, filename_vertices, filename_edges, outgroup=None):
    """Build a tree from a treeout file, a vertices file and an edges file.

    Any of the three paths ending in '.gz' is first passed through ``unzip``
    and the returned path is used instead.

    Args:
        filename_treeout: file whose first line is the newick tree and whose
            remaining lines (right-stripped) go to ``parse_admixtures``.
        filename_vertices: file handed to ``match_vertices``.
        filename_edges: file handed to ``get_edge_lengths``.
        outgroup: if not None, the tree is passed through
            ``rearrange_root(tree, outgroup)`` before being returned.

    Returns:
        The tree returned by ``add_admixtures`` (re-rooted when ``outgroup``
        is given).
    """
    if filename_treeout.endswith('.gz'):
        filename_treeout = unzip(filename_treeout)
    if filename_vertices.endswith('.gz'):
        filename_vertices = unzip(filename_vertices)
    if filename_edges.endswith('.gz'):
        filename_edges = unzip(filename_edges)

    with open(filename_treeout, 'r') as f:
        newick_tree = f.readline().rstrip()
        admixtures = parse_admixtures([line.rstrip() for line in f.readlines()])

    tree, translates = parse_newick_tree(newick_tree)

    # Register every (key, value) pair of the newick translation table in the
    # vertex dictionary under the 'AdmB' / 'Treemix_N' naming schemes.
    vd = vertice_dictionary()
    for adm_key, treemix_N_key in translates.items():
        vd.insert_mapping(adm_key, treemix_N_key, 'AdmB', 'Treemix_N')
    vd, adm_vertices = match_vertices(filename_vertices, vd)

    # The edges file is parsed exactly once, by the parser whose result
    # add_admixtures consumes.
    edges = get_edge_lengths(filename_edges)

    tree = insert_children_in_tree(tree)
    tree = add_admixtures(tree, vd, adm_vertices, edges, admixtures)
    if outgroup is not None:
        tree = rearrange_root(tree, outgroup)
    return tree
def make_bootstrap_files(filename, blocksize=None, no_blocks=None, bootstrap_samples=None, prefix='',
                         verbose_level='normal'):
    """Write gzipped bootstrap replicates assembled from blocks of lines.

    The first line of ``filename`` is copied to the top of every replicate.
    The remaining lines are partitioned by ``get_partitions``; each replicate
    is the concatenation of the blocks selected by
    ``bootstrap_indices(len(line_sets))`` (presumably indices drawn with
    replacement -- confirm against ``bootstrap_indices``).

    Args:
        filename: input path; if it ends in '.gz' it is passed through
            ``unzip`` first and the returned path is read.
        blocksize: number of lines per block. Ignored when ``no_blocks`` is
            given.
        no_blocks: number of blocks. When not None, the block size is
            ``len(lines) // no_blocks``.
        bootstrap_samples: number of replicate files to write; defaults to
            the number of blocks.
        prefix: directory/prefix joined with ``os.path.join`` to the base
            name of ``filename``; outputs are named ``<basename>boot.<i>``
            under it.
        verbose_level: any value other than 'silent' prints a short summary
            to stdout (the default keeps the summary on).

    Returns:
        A tuple ``(filenames, header_fields)`` where ``filenames`` holds the
        value returned by ``gzip`` for each replicate (presumably the
        compressed path -- confirm against ``gzip``) and ``header_fields`` is
        the first line split on whitespace.

    Raises:
        AssertionError: if both ``blocksize`` and ``no_blocks`` are None.
    """
    assert (blocksize is not None) or (no_blocks is not None), \
        'Has to specify either block size or number of blocks'

    if filename.endswith('.gz'):
        filename = unzip(filename)
    filename_reduced = os.path.join(prefix, os.path.basename(filename) + 'boot.')

    with open(filename, 'r') as f:
        first_line = f.readline()
        lines = f.readlines()

    n = len(lines)
    if no_blocks is not None:
        # Floor division keeps the block size an integer regardless of the
        # interpreter's default division semantics.
        blocksize = n // no_blocks
    line_sets = get_partitions(lines, blocksize)

    if verbose_level != 'silent':
        # Single-string prints produce the same output on Python 2 and 3.
        print('total SNPs= %s' % (n,))
        print('total blocksize %s' % (blocksize,))
        print('no_blocks %s' % (no_blocks,))
        print('bootstrap_samples %s' % (bootstrap_samples,))
        print('len(line_sets) %s' % (len(line_sets),))

    if bootstrap_samples is None:
        bootstrap_samples = len(line_sets)

    filenames = []
    for sample_index in range(bootstrap_samples):
        new_filename = filename_reduced + str(sample_index)
        with open(new_filename, 'w') as g:
            g.write(first_line)
            # A distinct name for the block index avoids shadowing the
            # replicate counter of the enclosing loop.
            for block_index in bootstrap_indices(len(line_sets)):
                g.writelines(line_sets[block_index])
        # Compress only after the file has been closed (and thus flushed).
        filenames.append(gzip(new_filename, overwrite=True))
    return filenames, first_line.split()
# Reduce the input to the requested populations. The two branches are
# independent `if`s (not if/elif), each keyed on options.input_type.
if options.input_type == 'tree':
    tree = identifier_file_to_tree_clean(options.input_file)
    if options.input_add:
        # options.input_add names a file whose first line is parsed as a float.
        with open(options.input_add, 'r') as f:
            add = float(f.readline())
        # Attach an outgroup: the parsed value becomes to_new_root_length and
        # the outgroup branch itself gets length 0.
        tree = add_outgroup(tree, inner_node_name='new_node', to_new_root_length=float(add), to_outgroup_length=0, outgroup_name=options.outgroup_name)
    nodes = get_leaf_keys(tree)
    # Every requested population has to be among the tree's leaf keys.
    assert all((a in nodes for a in options.populations )), 'Requested population was not found in the tree'
    subtree = get_subtree(tree, options.populations)
    if not options.output_file:
        # Default output name: the input path immediately followed by the
        # population names joined with '_' (no separator in between).
        options.output_file = options.input_file + '_'.join( options.populations)
    with open(options.output_file, 'w') as f:
        # Line 1: the sorted population names, space separated.
        f.write(' '.join(sorted(options.populations)) + '\n')
        # Then whatever unique_identifier_and_branch_lengths returns for the subtree.
        f.write(unique_identifier_and_branch_lengths(subtree))
if options.input_type == 'snps':
    if options.input_file.endswith('.gz'):
        # From here on options.input_file is the path returned by unzip.
        options.input_file = unzip(options.input_file, overwrite=False)
    # Load only the requested population columns of the space-separated table.
    # NOTE(review): `pd` is presumably pandas -- the import is outside this view.
    df = pd.read_csv(options.input_file, usecols=options.populations, sep=' ')
    if not options.output_file:
        # Same default naming as the tree branch; built from the (possibly
        # unzipped) input path.
        options.output_file = options.input_file + '_'.join( options.populations)
    df.to_csv(options.output_file, sep=' ', index=False)
    # Hand the written file to the project's gzip helper.
    gzip(options.output_file, overwrite=True)
def read_one_line(filename):
    """Return the whitespace-separated fields of the first line of a file.

    A path ending in '.gz' is first passed through ``unzip`` and the returned
    path is read instead. An empty file yields an empty list.
    """
    path = unzip(filename) if filename.endswith('.gz') else filename
    with open(path, 'r') as handle:
        header = handle.readline()
    return header.rstrip().split()