Exemplo n.º 1
0
        tree.write_to_file('{0}/trees/{1}.nwk'.format(working_dir,
                           name), metadata=True)
    else:
        tree = Tree()
        tree.read_from_file('{0}/trees/{1}.nwk'.format(working_dir,
                            name))

    dv_matrix_strip_header = '\n'.join(dv_matrix.split('\n'
            )[2:]).rstrip()
    labels_strip_header = labels.split('\n')[1].rstrip()
    record = TCSeqRec()
    record.dv = [(dv_matrix_strip_header, labels_strip_header)]
    record.tree = tree
    record.name = name
    record.headers = labels_strip_header.split()
    record.sequences = ['' for _ in record.headers]
    record._update()
    records.append(record)

# Wrap the prepared records in a collection.  Distances are not requested
# at construction time (get_distances=False); they are computed explicitly
# on the next line.  os.environ[...] raises KeyError when GTP_PATH is unset.
collection = SequenceCollection(
    records=records,
    get_distances=False,
    gtp_path=os.environ['GTP_PATH'],
)
collection.put_distance_matrices('rf')

# Cluster on the 'rf' distance matrix (presumably Robinson-Foulds -- confirm
# against SequenceCollection) and register the result.  T is used as a
# dictionary key and passed to max(), so it is assumed to be a hashable
# sequence of cluster labels.
T = collection.Clustering.run_spectral_rotate(
    collection.distance_matrices['rf'])
collection.partitions[T] = Partition(T)
collection.clusters_to_partitions['rf', 'spectral_rotate', max(T)] = T

# Concatenate the records, then fetch the per-cluster records.
collection.concatenate_records()
cluster_recs = collection.get_cluster_records()

number_of_clusters = len(cluster_recs)
Exemplo n.º 2
0
    def simulate_from_record_WAG(
        cls,
        record,
        output_dir,
        name='tempsim',
        tmpdir='/tmp',
        allow_nonsense=False,
        split_lengths=None,
        gene_names=None,
        ):
        """Simulate an alignment with ALF under the WAG model.

        The simulation uses ``record.tree`` as a custom tree and
        ``record.seqlength`` as the minimum root-genome length.  The
        simulated alignment is truncated to ``record.seqlength``, relabelled
        with the labels of ``record.tree``, sorted by name and written in
        phylip format to ``output_dir``.

        Parameters
        ----------
        record : object exposing ``seqlength`` and ``tree``.
        output_dir : directory receiving the ``.phy`` file(s); a line
            ``<name(s)>\\t<newick>`` is also appended to
            ``<output_dir>/trees.txt``.
        name : ALF simulation name; also the stem of the phylip file when
            the result is not split.
        tmpdir : scratch location, checked with ``directorycheck_and_quit``.
            ``alf_parameter_dir`` and ``alf_working_dir`` are created
            beneath it and removed again before returning, including when
            an error occurs.  ``<tmpdir>/treefile.nwk`` is written and left
            in place.
        allow_nonsense : accepted for interface compatibility; not used by
            this method.
        split_lengths, gene_names : when both are truthy the simulated
            record is split with ``split_by_lengths`` and each part is
            written to ``<output_dir>/<part name>.phy``.

        Returns
        -------
        None.  The simulated sequence length is printed to stdout.
        """

        length = record.seqlength
        tree = record.tree
        directorycheck_and_quit(tmpdir)
        gamma = tree.extract_gamma_parameter()
        param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
        working_dir = '{0}/alf_working_dir'.format(tmpdir)
        directorycheck_and_make(param_dir, verbose=False)
        directorycheck_and_make(working_dir, verbose=False)
        treefile = '{0}/treefile.nwk'.format(tmpdir)
        # ALF writes its results below <working_dir>/<simulation name>.
        run_dir = '{0}/{1}'.format(working_dir, name)

        # Everything after the scratch directories exist runs under
        # try/finally so a failed simulation does not leave them behind.
        try:
            tree.pam2sps('sps2pam').write_to_file(treefile)

            # NOTE(review): these repeat the two calls above without
            # verbose=False.  They look redundant but are kept because the
            # helper's default verbosity is not visible from here.
            directorycheck_and_make(param_dir)
            directorycheck_and_make(working_dir)

            sim = cls(simulation_name=name, working_directory=working_dir,
                      outfile_path=param_dir, unit_is_pam=True)

            sim.indels()
            sim.rate_variation(gamma)
            sim.root_genome(number_of_genes=1, min_length=length)
            sim.one_word_model('WAG')
            sim.custom_tree(treefile)
            params = sim.write_parameters()
            sim.runALF(params, quiet=True)

            tree_newick = tree.newick
            # Read ALF's tree through a context manager so the handle is
            # closed promptly instead of being left to the garbage collector.
            with open('{0}/RealTree.nwk'.format(run_dir)) as alf_tree_file:
                alf_newick = alf_tree_file.read()

            # Pair every label followed by ':' in ALF's tree with the label
            # at the same position in the input tree.  This assumes both
            # Newick strings list their labels in the same order.
            label_pattern = r'(\w+)(?=:)'
            replacement_dict = dict(zip(re.findall(label_pattern, alf_newick),
                                        re.findall(label_pattern,
                                                   tree_newick)))

            # Only one gene is simulated, so the first match is used; an
            # IndexError here means ALF produced no '*aa.fa' alignment.
            alignment = glob.glob('{0}/MSA/*aa.fa'.format(run_dir))[0]

            new_record = TCSeqRec(alignment)
            # `length` was passed to ALF as min_length, so trim to exactly
            # that many columns.
            new_record.sequences = [seq[:length] for seq in
                                    new_record.sequences]
            new_record._update()

            # Parenthesised so the line parses as both the Python 2 print
            # statement and the Python 3 print function; output is unchanged.
            print(new_record.seqlength)
            # Drop everything from the last '/' of each simulated header and
            # translate the remainder back to the input tree's label.
            new_record.headers = [replacement_dict[x[:x.rindex('/')]]
                                  for x in new_record.headers]
            new_record._update()
            new_record.sort_by_name()
            if split_lengths and gene_names:
                with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
                    trf.write('{0}\t{1}\n'.format('-'.join(gene_names),
                                                  tree.newick))
                for rec in new_record.split_by_lengths(split_lengths,
                        gene_names):
                    rec.write_phylip('{0}/{1}.phy'.format(output_dir,
                                     rec.name))
            else:
                with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
                    trf.write('{0}\t{1}\n'.format(new_record.name,
                                                  tree.newick))
                new_record.write_phylip('{0}/{1}.phy'.format(output_dir,
                                        name))
        finally:
            shutil.rmtree(param_dir)
            shutil.rmtree(working_dir)
Exemplo n.º 3
0
    def simulate_from_record_WAG(
        cls,
        record,
        output_dir,
        name='tempsim',
        tmpdir='/tmp',
        allow_nonsense=False,
        split_lengths=None,
        gene_names=None,
    ):
        """Simulate an alignment with ALF under the WAG model.

        The simulation uses ``record.tree`` as a custom tree and
        ``record.seqlength`` as the minimum root-genome length.  The
        simulated alignment is truncated to ``record.seqlength``, relabelled
        with the labels of ``record.tree``, sorted by name and written in
        phylip format to ``output_dir``.

        Parameters
        ----------
        record : object exposing ``seqlength`` and ``tree``.
        output_dir : directory receiving the ``.phy`` file(s); a line
            ``<name(s)>\\t<newick>`` is also appended to
            ``<output_dir>/trees.txt``.
        name : ALF simulation name; also the stem of the phylip file when
            the result is not split.
        tmpdir : scratch location, checked with ``directorycheck_and_quit``.
            ``alf_parameter_dir`` and ``alf_working_dir`` are created
            beneath it and removed again before returning, including when
            an error occurs.  ``<tmpdir>/treefile.nwk`` is written and left
            in place.
        allow_nonsense : accepted for interface compatibility; not used by
            this method.
        split_lengths, gene_names : when both are truthy the simulated
            record is split with ``split_by_lengths`` and each part is
            written to ``<output_dir>/<part name>.phy``.

        Returns
        -------
        None.  The simulated sequence length is printed to stdout.
        """

        length = record.seqlength
        tree = record.tree
        directorycheck_and_quit(tmpdir)
        gamma = tree.extract_gamma_parameter()
        param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
        working_dir = '{0}/alf_working_dir'.format(tmpdir)
        directorycheck_and_make(param_dir, verbose=False)
        directorycheck_and_make(working_dir, verbose=False)
        treefile = '{0}/treefile.nwk'.format(tmpdir)
        # ALF writes its results below <working_dir>/<simulation name>.
        run_dir = '{0}/{1}'.format(working_dir, name)

        # Everything after the scratch directories exist runs under
        # try/finally so a failed simulation does not leave them behind.
        try:
            tree.pam2sps('sps2pam').write_to_file(treefile)

            # NOTE(review): these repeat the two calls above without
            # verbose=False.  They look redundant but are kept because the
            # helper's default verbosity is not visible from here.
            directorycheck_and_make(param_dir)
            directorycheck_and_make(working_dir)

            sim = cls(simulation_name=name,
                      working_directory=working_dir,
                      outfile_path=param_dir,
                      unit_is_pam=True)

            sim.indels()
            sim.rate_variation(gamma)
            sim.root_genome(number_of_genes=1, min_length=length)
            sim.one_word_model('WAG')
            sim.custom_tree(treefile)
            params = sim.write_parameters()
            sim.runALF(params, quiet=True)

            tree_newick = tree.newick
            # Read ALF's tree through a context manager so the handle is
            # closed promptly instead of being left to the garbage collector.
            with open('{0}/RealTree.nwk'.format(run_dir)) as alf_tree_file:
                alf_newick = alf_tree_file.read()

            # Pair every label followed by ':' in ALF's tree with the label
            # at the same position in the input tree.  This assumes both
            # Newick strings list their labels in the same order.
            label_pattern = r'(\w+)(?=:)'
            replacement_dict = dict(
                zip(re.findall(label_pattern, alf_newick),
                    re.findall(label_pattern, tree_newick)))

            # Only one gene is simulated, so the first match is used; an
            # IndexError here means ALF produced no '*aa.fa' alignment.
            alignment = glob.glob('{0}/MSA/*aa.fa'.format(run_dir))[0]

            new_record = TCSeqRec(alignment)
            # `length` was passed to ALF as min_length, so trim to exactly
            # that many columns.
            new_record.sequences = [
                seq[:length] for seq in new_record.sequences
            ]
            new_record._update()

            # Parenthesised so the line parses as both the Python 2 print
            # statement and the Python 3 print function; output is unchanged.
            print(new_record.seqlength)
            # Drop everything from the last '/' of each simulated header and
            # translate the remainder back to the input tree's label.
            new_record.headers = [
                replacement_dict[x[:x.rindex('/')]]
                for x in new_record.headers
            ]
            new_record._update()
            new_record.sort_by_name()
            if split_lengths and gene_names:
                with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
                    trf.write('{0}\t{1}\n'.format('-'.join(gene_names),
                                                  tree.newick))
                for rec in new_record.split_by_lengths(split_lengths,
                                                       gene_names):
                    rec.write_phylip(
                        '{0}/{1}.phy'.format(output_dir, rec.name))
            else:
                with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
                    trf.write('{0}\t{1}\n'.format(new_record.name,
                                                  tree.newick))
                new_record.write_phylip(
                    '{0}/{1}.phy'.format(output_dir, name))
        finally:
            shutil.rmtree(param_dir)
            shutil.rmtree(working_dir)