name), metadata=True) else: tree = Tree() tree.read_from_file('{0}/trees/{1}.nwk'.format(working_dir, name)) dv_matrix_strip_header = '\n'.join(dv_matrix.split('\n' )[2:]).rstrip() labels_strip_header = labels.split('\n')[1].rstrip() record = TCSeqRec() record.dv = [(dv_matrix_strip_header, labels_strip_header)] record.tree = tree record.name = name record.headers = labels_strip_header.split() record.sequences = ['' for _ in record.headers] record._update() records.append(record) collection = SequenceCollection(records=records, get_distances=False, gtp_path=os.environ['GTP_PATH']) collection.put_distance_matrices('rf') T = \ collection.Clustering.run_spectral_rotate(collection.distance_matrices['rf' ]) collection.partitions[T] = Partition(T) collection.clusters_to_partitions[('rf', 'spectral_rotate', max(T))] = T collection.concatenate_records() cluster_recs = collection.get_cluster_records() number_of_clusters = len(cluster_recs) for j in range(number_of_clusters):
def simulate_from_record_WAG(
    cls,
    record,
    output_dir,
    name='tempsim',
    tmpdir='/tmp',
    allow_nonsense=False,
    split_lengths=None,
    gene_names=None,
):
    """Simulate an alignment along ``record.tree`` with ALF (WAG model).

    The first parameter is ``cls`` and is called as a constructor, so this
    is presumably a classmethod of the ALF wrapper class (the decorator is
    not visible in this part of the file -- confirm).

    Parameters:
        record: object providing ``seqlength`` and ``tree``; the tree must
            offer ``extract_gamma_parameter()``, ``pam2sps()`` and
            ``newick``.
        output_dir: existing directory that receives the ``.phy`` file(s);
            one line is appended to ``<output_dir>/trees.txt``.
        name: simulation name; also the output file stem when the result
            is not split.
        tmpdir: scratch location. ``alf_parameter_dir`` and
            ``alf_working_dir`` are created inside it and removed again
            before returning, whether or not the simulation succeeded.
        allow_nonsense: not used by this function; kept so existing
            callers keep working.
        split_lengths, gene_names: when both are truthy the simulated
            record is split with ``split_by_lengths`` and each part is
            written to ``<output_dir>/<part name>.phy``; otherwise the
            whole record is written to ``<output_dir>/<name>.phy``.

    Returns:
        None. Results are written to disk; the simulated sequence length
        is printed to stdout.

    Raises:
        IndexError: if ALF produced no ``*aa.fa`` alignment.
        KeyError / ValueError: if a simulated header cannot be mapped back
            to a label of the input tree.
    """
    length = record.seqlength
    tree = record.tree
    # NOTE(review): presumably aborts when tmpdir does not exist -- the
    # helper is defined elsewhere.
    directorycheck_and_quit(tmpdir)
    gamma = tree.extract_gamma_parameter()

    param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
    working_dir = '{0}/alf_working_dir'.format(tmpdir)
    treefile = '{0}/treefile.nwk'.format(tmpdir)
    directorycheck_and_make(param_dir, verbose=False)
    directorycheck_and_make(working_dir, verbose=False)

    # Everything that uses the scratch directories runs under try/finally:
    # a failed run must not leave old MSA files behind, because a later run
    # with the same name would pick them up through the glob below.
    try:
        # NOTE: treefile lives directly in tmpdir and is not removed here.
        tree.pam2sps('sps2pam').write_to_file(treefile)

        sim = cls(simulation_name=name, working_directory=working_dir,
                  outfile_path=param_dir, unit_is_pam=True)
        sim.indels()
        sim.rate_variation(gamma)
        sim.root_genome(number_of_genes=1, min_length=length)
        sim.one_word_model('WAG')
        sim.custom_tree(treefile)
        params = sim.write_parameters()
        sim.runALF(params, quiet=True)

        # Pair the labels of ALF's output tree with the labels of the
        # input tree by position, so the simulated headers can be renamed
        # back. Assumes both newick strings list their labels in the same
        # order -- TODO confirm.
        tree_newick = tree.newick
        with open('{0}/{1}/RealTree.nwk'.format(working_dir, name)) as nwk:
            alf_newick = nwk.read()
        label_pattern = r'(\w+)(?=:)'
        replacement_dict = dict(zip(re.findall(label_pattern, alf_newick),
                                    re.findall(label_pattern, tree_newick)))

        alignment = glob.glob(
            '{0}/{1}/MSA/*aa.fa'.format(working_dir, name))[0]
        new_record = TCSeqRec(alignment)
        # Trim every sequence to the length of the source record.
        new_record.sequences = [seq[:length] for seq in new_record.sequences]
        new_record._update()
        print(new_record.seqlength)
        # Headers are looked up by the part before their last '/'.
        new_record.headers = [replacement_dict[header[:header.rindex('/')]]
                              for header in new_record.headers]
        new_record._update()
        new_record.sort_by_name()

        split_output = bool(split_lengths and gene_names)
        if split_output:
            tree_label = '-'.join(gene_names)
        else:
            tree_label = new_record.name
        with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
            trf.write('{0}\t{1}\n'.format(tree_label, tree.newick))

        if split_output:
            for rec in new_record.split_by_lengths(split_lengths,
                                                   gene_names):
                rec.write_phylip('{0}/{1}.phy'.format(output_dir, rec.name))
        else:
            new_record.write_phylip('{0}/{1}.phy'.format(output_dir, name))
    finally:
        shutil.rmtree(param_dir)
        shutil.rmtree(working_dir)
def simulate_from_record_WAG(
    cls,
    record,
    output_dir,
    name='tempsim',
    tmpdir='/tmp',
    allow_nonsense=False,
    split_lengths=None,
    gene_names=None,
):
    """Simulate an alignment along ``record.tree`` with ALF (WAG model).

    The first parameter is ``cls`` and is called as a constructor, so this
    is presumably a classmethod of the ALF wrapper class (the decorator is
    not visible in this part of the file -- confirm).

    Parameters:
        record: object providing ``seqlength`` and ``tree``; the tree must
            offer ``extract_gamma_parameter()``, ``pam2sps()`` and
            ``newick``.
        output_dir: existing directory that receives the ``.phy`` file(s);
            one line is appended to ``<output_dir>/trees.txt``.
        name: simulation name; also the output file stem when the result
            is not split.
        tmpdir: scratch location. ``alf_parameter_dir`` and
            ``alf_working_dir`` are created inside it and removed again
            before returning, whether or not the simulation succeeded.
        allow_nonsense: not used by this function; kept so existing
            callers keep working.
        split_lengths, gene_names: when both are truthy the simulated
            record is split with ``split_by_lengths`` and each part is
            written to ``<output_dir>/<part name>.phy``; otherwise the
            whole record is written to ``<output_dir>/<name>.phy``.

    Returns:
        None. Results are written to disk; the simulated sequence length
        is printed to stdout.

    Raises:
        IndexError: if ALF produced no ``*aa.fa`` alignment.
        KeyError / ValueError: if a simulated header cannot be mapped back
            to a label of the input tree.
    """
    length = record.seqlength
    tree = record.tree
    # NOTE(review): presumably aborts when tmpdir does not exist -- the
    # helper is defined elsewhere.
    directorycheck_and_quit(tmpdir)
    gamma = tree.extract_gamma_parameter()

    param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
    working_dir = '{0}/alf_working_dir'.format(tmpdir)
    treefile = '{0}/treefile.nwk'.format(tmpdir)
    directorycheck_and_make(param_dir, verbose=False)
    directorycheck_and_make(working_dir, verbose=False)

    # Everything that uses the scratch directories runs under try/finally:
    # a failed run must not leave old MSA files behind, because a later run
    # with the same name would pick them up through the glob below.
    try:
        # NOTE: treefile lives directly in tmpdir and is not removed here.
        tree.pam2sps('sps2pam').write_to_file(treefile)

        sim = cls(simulation_name=name, working_directory=working_dir,
                  outfile_path=param_dir, unit_is_pam=True)
        sim.indels()
        sim.rate_variation(gamma)
        sim.root_genome(number_of_genes=1, min_length=length)
        sim.one_word_model('WAG')
        sim.custom_tree(treefile)
        params = sim.write_parameters()
        sim.runALF(params, quiet=True)

        # Pair the labels of ALF's output tree with the labels of the
        # input tree by position, so the simulated headers can be renamed
        # back. Assumes both newick strings list their labels in the same
        # order -- TODO confirm.
        tree_newick = tree.newick
        with open('{0}/{1}/RealTree.nwk'.format(working_dir, name)) as nwk:
            alf_newick = nwk.read()
        label_pattern = r'(\w+)(?=:)'
        replacement_dict = dict(zip(re.findall(label_pattern, alf_newick),
                                    re.findall(label_pattern, tree_newick)))

        alignment = glob.glob(
            '{0}/{1}/MSA/*aa.fa'.format(working_dir, name))[0]
        new_record = TCSeqRec(alignment)
        # Trim every sequence to the length of the source record.
        new_record.sequences = [seq[:length] for seq in new_record.sequences]
        new_record._update()
        print(new_record.seqlength)
        # Headers are looked up by the part before their last '/'.
        new_record.headers = [replacement_dict[header[:header.rindex('/')]]
                              for header in new_record.headers]
        new_record._update()
        new_record.sort_by_name()

        split_output = bool(split_lengths and gene_names)
        if split_output:
            tree_label = '-'.join(gene_names)
        else:
            tree_label = new_record.name
        with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
            trf.write('{0}\t{1}\n'.format(tree_label, tree.newick))

        if split_output:
            for rec in new_record.split_by_lengths(split_lengths,
                                                   gene_names):
                rec.write_phylip('{0}/{1}.phy'.format(output_dir, rec.name))
        else:
            new_record.write_phylip('{0}/{1}.phy'.format(output_dir, name))
    finally:
        shutil.rmtree(param_dir)
        shutil.rmtree(working_dir)