def dump_records(
        self,
        output_dir,
        records=None,
        file_format='phylip',
        use_hashname=True,
    ):
        """
        Dump sequence alignment records to an output directory.

        Each record is written by its own `_write_temp_phylip` method, and a
        pickled dict mapping `str(rec.name)` to the value returned by
        `_write_temp_phylip` is saved as `<output_dir>/hash_translation.pkl`.

        output_dir   -- directory to write into; passed through
                        `directorycheck_and_make` first
        records      -- records to dump; when falsy (None or empty) the
                        result of `self.get_records()` is used instead
        file_format  -- accepted for interface compatibility but not used
                        here: records are always written with
                        `_write_temp_phylip`
        use_hashname -- forwarded unchanged to `_write_temp_phylip`
        """

        directorycheck_and_make(output_dir)

        if not records:
            records = self.get_records()

        hash_translation = {}
        for rec in records:
            filename = rec._write_temp_phylip(output_dir,
                                              use_hashname=use_hashname)
            try:
                hash_translation[str(rec.name)] = filename
            except TypeError:
                # Best-effort diagnostics: report the offending values and
                # carry on with the remaining records.
                print('%s %s %s %s' % (type(rec.name), rec.name,
                                       type(filename), filename))

        # Write the translation table inside a context manager so the file
        # is flushed and closed even if pickling raises.
        pickle_path = '{0}/hash_translation.pkl'.format(output_dir)
        with open(pickle_path, 'w') as pickle_handle:
            cPickle.dump(hash_translation, pickle_handle)
    def dump_records(self, output_dir, records=None, file_format="phylip", use_hashname=True):
        """
        Dump sequence alignment records to an output directory.

        Each record is written by its own `_write_temp_phylip` method, and a
        pickled dict mapping `str(rec.name)` to the value returned by
        `_write_temp_phylip` is saved as `<output_dir>/hash_translation.pkl`.

        output_dir   -- directory to write into; passed through
                        `directorycheck_and_make` first
        records      -- records to dump; when falsy (None or empty) the
                        result of `self.get_records()` is used instead
        file_format  -- accepted for interface compatibility but not used
                        here: records are always written with
                        `_write_temp_phylip`
        use_hashname -- forwarded unchanged to `_write_temp_phylip`
        """

        directorycheck_and_make(output_dir)

        if not records:
            records = self.get_records()

        hash_translation = {}
        for rec in records:
            filename = rec._write_temp_phylip(output_dir, use_hashname=use_hashname)
            try:
                hash_translation[str(rec.name)] = filename
            except TypeError:
                # Best-effort diagnostics: report the offending values and
                # carry on with the remaining records.
                print("%s %s %s %s" % (type(rec.name), rec.name, type(filename), filename))

        # Write the translation table inside a context manager so the file
        # is flushed and closed even if pickling raises.
        pickle_path = "{0}/hash_translation.pkl".format(output_dir)
        with open(pickle_path, "w") as pickle_handle:
            cPickle.dump(hash_translation, pickle_handle)
# Boolean switch: stored as True when '-score' is given, False otherwise.
parser.add_argument('-score', action='store_true', dest='score')

# Parsed options are handled as a plain dict from here on.
args = vars(parser.parse_args())

# Directory options lose any trailing '/' characters.
input_dir = args['input'].rstrip('/')
output = args['output']
tmpdir = args['tmpdir'].rstrip('/')

# Remaining options are copied into module-level names unchanged.
min_clusters, max_clusters = args['min_clusters'], args['max_clusters']
distance, method = args['distance'], args['cluster_method']
delete, datatype, score = args['delete'], args['datatype'], args['score']

directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

# Both environment variables are mandatory: a missing one raises KeyError.
gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

from sequence_collection import SequenceCollection

# Build the collection from the phylip files found in input_dir.
sc = SequenceCollection(input_dir, file_format='phylip', datatype=datatype,
                        helper=helper, gtp_path=gtp_path, tmpdir=tmpdir,
                        overwrite=True)
# Program name shown in the usage message: the first run of characters in
# sys.argv[0] that matches the class below.
# NOTE(review): inside the class, `.-_` is a character *range* (0x2E-0x5F),
# not the three literals '.', '-' and '_'. It therefore admits '/' and
# other punctuation but excludes '-'. Left unchanged; confirm the intent.
progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()
desc = 'Read in a SequenceCollection from disk and dump records'
input_help = 'Filepath+name of gzipped SequenceCollection object'
output_help = 'Directory to dump files in'
choice_help = \
    '\n'.join(['Choose to dump post-clustering concatenated records',
              'instead of pre-clustering single records'])
parser = argparse.ArgumentParser(prog=progname, description=desc)
parser.add_argument('-i', dest='input_file', help=input_help, type=str)
parser.add_argument('-o', dest='output_dir', help=output_help, type=str)
# choice_help was previously built but never attached to the option.
parser.add_argument('-c', dest='cluster_recs', action='store_true',
                    help=choice_help)

args = parser.parse_args()
input_file = args.input_file
output_dir = args.output_dir.rstrip('/')  # drop trailing '/' characters
cluster_recs = args.cluster_recs

filecheck_and_quit(input_file)
directorycheck_and_make(output_dir)

from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)

# Pick the record set once, then dump it. The records are always passed
# explicitly rather than relying on dump_records' own default.
if cluster_recs:
    records = sc.get_cluster_records()
else:
    records = sc.get_records()
sc.dump_records(output_dir, records)
# Example #5
# 0
    type=str,
    choices=valid_methods,
    default='spectral',
)

# Parsed options are handled as a plain dict.
args = vars(parser.parse_args())
input_dir = args['input'].rstrip('/')  # drop trailing '/' characters
tmpdir = args['tmpdir'].rstrip('/')
min_clusters = args['min_clusters']
max_clusters = args['max_clusters']
distance = args['distance']
method = args['cluster_method']
# NOTE: this name shadows the stdlib `pickle` module for the rest of the
# script; it holds the path of the pickled collection.
pickle = '{0}/scrand.pkl'.format(input_dir)

directorycheck_and_raise(input_dir)
directorycheck_and_make(tmpdir)
filecheck_and_quit(pickle)

# Load inside a context manager so the file handle is closed promptly.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(pickle) as pickle_handle:
    sc = cPickle.load(pickle_handle)
sc.tmpdir = tmpdir
print('Loading phyml results...')
sc.load_phyml_results(input_dir, use_hashname=True)
print('Autotuning...')
sc.autotune(distance, max_groups=max_clusters)
print('Clustering...')
# range() stops before max_clusters, so that value itself is not requested.
sc.put_partitions(distance, method, range(min_clusters, max_clusters))
sc.concatenate_records()
sc.put_cluster_trees(program='bionj')
# Order the scores by the first element of each entry.
scores = sorted(sc.get_scores(), key=lambda x: x[0])
print('Scores:')
for score in scores:
# Program name shown in the usage message: the first run of characters in
# sys.argv[0] that matches the class below.
# NOTE(review): inside the class, `.-_` is a character *range* (0x2E-0x5F),
# not the three literals '.', '-' and '_'. It therefore admits '/' and
# other punctuation but excludes '-'. Left unchanged; confirm the intent.
progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()
desc = 'Read in a SequenceCollection from disk and dump records'
input_help = 'Filepath+name of gzipped SequenceCollection object'
output_help = 'Directory to dump files in'
choice_help = \
    '\n'.join(['Choose to dump post-clustering concatenated records',
              'instead of pre-clustering single records'])
parser = argparse.ArgumentParser(prog=progname, description=desc)
parser.add_argument('-i', dest='input_file', help=input_help, type=str)
parser.add_argument('-o', dest='output_dir', help=output_help, type=str)
# choice_help was previously built but never attached to the option.
parser.add_argument('-c', dest='cluster_recs', action='store_true',
                    help=choice_help)

args = parser.parse_args()
input_file = args.input_file
output_dir = args.output_dir.rstrip('/')  # drop trailing '/' characters
cluster_recs = args.cluster_recs

filecheck_and_quit(input_file)
directorycheck_and_make(output_dir)

from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)

# Pick the record set once, then dump it. The records are always passed
# explicitly rather than relying on dump_records' own default.
if cluster_recs:
    records = sc.get_cluster_records()
else:
    records = sc.get_records()
sc.dump_records(output_dir, records)
# Example #7
# 0
    def simulate_from_record_WAG(
        cls,
        record,
        output_dir,
        name='tempsim',
        tmpdir='/tmp',
        allow_nonsense=False,
        split_lengths=None,
        gene_names=None,
        ):
        """
        Simulate an alignment from `record` using the 'WAG' model and write
        the result to `output_dir` in phylip format.

        A simulator instance (`cls(...)`) is configured from the record's
        sequence length, its tree and the tree's gamma parameter, then run
        via `runALF`. The first `*aa.fa` alignment it produces is read
        back, truncated to the original sequence length, relabelled with
        the names of the input tree and sorted by name.

        record        -- source record; `record.seqlength` and `record.tree`
                         are read
        output_dir    -- where the .phy file(s) are written and where
                         `trees.txt` is appended to
        name          -- simulation name; also the output file stem when the
                         result is not split
        tmpdir        -- existing directory for scratch files; checked with
                         `directorycheck_and_quit`
        allow_nonsense -- accepted but not used by this method
        split_lengths, gene_names
                      -- when both are truthy, the simulated record is split
                         with `split_by_lengths` and every part is written
                         to `<output_dir>/<part name>.phy`

        Returns None. The two ALF scratch directories are removed on
        success; `<tmpdir>/treefile.nwk` is left in place.
        Raises IndexError if no alignment file is found, and ValueError if
        a simulated header contains no '/'.
        """

        length = record.seqlength
        tree = record.tree
        directorycheck_and_quit(tmpdir)
        gamma = tree.extract_gamma_parameter()
        param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
        working_dir = '{0}/alf_working_dir'.format(tmpdir)
        directorycheck_and_make(param_dir, verbose=False)
        directorycheck_and_make(working_dir, verbose=False)
        treefile = '{0}/treefile.nwk'.format(tmpdir)

        # NOTE(review): presumably a unit conversion of branch lengths
        # before the tree is handed to the simulator -- confirm.
        tree.pam2sps('sps2pam').write_to_file(treefile)

        # NOTE(review): repeats the two checks above, this time with the
        # helper's default verbosity. Kept so console output is unchanged.
        directorycheck_and_make(param_dir)
        directorycheck_and_make(working_dir)

        sim = cls(simulation_name=name, working_directory=working_dir,
                  outfile_path=param_dir, unit_is_pam=True)

        sim.indels()
        sim.rate_variation(gamma)
        sim.root_genome(number_of_genes=1, min_length=length)
        sim.one_word_model('WAG')
        sim.custom_tree(treefile)
        params = sim.write_parameters()
        sim.runALF(params, quiet=True)

        # Results are read from <working_dir>/<name>/.
        run_dir = '{0}/{1}'.format(working_dir, name)

        tree_newick = tree.newick
        # Read the simulator's tree inside a context manager so the file
        # handle is closed instead of being left to the garbage collector.
        with open('{0}/RealTree.nwk'.format(run_dir)) as newick_handle:
            alf_newick = newick_handle.read()

        # Pair up the labels (word characters directly before a ':') of the
        # two newick strings by position, mapping simulator names back to
        # the input tree's names.
        label_pattern = r'(\w+)(?=:)'
        replacement_dict = dict(zip(re.findall(label_pattern, alf_newick),
                                    re.findall(label_pattern, tree_newick)))

        alignment = glob.glob('{0}/MSA/*aa.fa'.format(run_dir))[0]

        new_record = TCSeqRec(alignment)
        # Trim every simulated sequence to the input alignment's length.
        new_record.sequences = [seq[:length] for seq in
                                new_record.sequences]
        new_record._update()

        print(new_record.seqlength)
        # Headers are looked up by the part before their last '/'.
        new_record.headers = [replacement_dict[x[:x.rindex('/')]]
                              for x in new_record.headers]
        new_record._update()
        new_record.sort_by_name()

        split_output = split_lengths and gene_names
        if split_output:
            tree_label = '-'.join(gene_names)
        else:
            tree_label = new_record.name

        # Log the generating tree first, then write the alignment(s).
        with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
            trf.write('{0}\t{1}\n'.format(tree_label, tree.newick))

        if split_output:
            for rec in new_record.split_by_lengths(split_lengths,
                                                   gene_names):
                rec.write_phylip('{0}/{1}.phy'.format(output_dir,
                                                      rec.name))
        else:
            new_record.write_phylip('{0}/{1}.phy'.format(output_dir,
                                                         name))

        shutil.rmtree(param_dir)
        shutil.rmtree(working_dir)
import argparse
import re

# Program name shown in the usage message: the first run of characters in
# sys.argv[0] that matches the class below.
# NOTE(review): inside the class, `.-_` is a character *range* (0x2E-0x5F),
# not the three literals '.', '-' and '_'. Left unchanged; confirm intent.
progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()

desc = '\n'.join(['Read a SequenceCollection from pickle,',
                 'make a randomised copy,', 'dump records'])
input_help = 'Path+Filename for the input pickle file'
output_help = \
    'Path to output directory. Will be created if doesn\'t exist'
# RawTextHelpFormatter keeps the explicit newlines in the description.
parser = argparse.ArgumentParser(prog=progname, description=desc,
                                 formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('-i', '--input', help=input_help, type=str)
parser.add_argument('-o', '--output', help=output_help, type=str)

args = vars(parser.parse_args())

# NOTE: this name shadows the stdlib `pickle` module for the rest of the
# script; it holds the path of the input pickle file.
pickle = args['input']
output_dir = args['output']

filecheck_and_quit(pickle)           # can't find file -> quit
directorycheck_and_make(output_dir)  # can't find directory -> create it

# Load inside a context manager so the handle is closed promptly.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(pickle) as pickle_in:
    sc = cPickle.load(pickle_in)

scrand = sc.make_randomised_copy()

scrand.dump_records(output_dir)

# Closing the output file explicitly guarantees the pickle is flushed.
with open('{0}/scrand.pkl'.format(output_dir), 'w') as pickle_out:
    cPickle.dump(scrand, pickle_out)
    default='spectral',
    )

# Copy the parsed options into module-level names.
args = parser.parse_args()
input_file = args.input_file
output_dir = args.output_dir
tmpdir = args.tmpdir
min_clusters = args.min_clusters
max_clusters = args.max_clusters
distance = args.distance
method = args.cluster_method
ind = args.ind
tree_method = args.tree_method

filecheck_and_quit(input_file)
directorycheck_and_make(output_dir)
directorycheck_and_make(tmpdir)

################################################################################
# Main
################################################################################

from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)

# Every requested cluster count must already have a stored partition for
# this (distance, method) pair. On the first missing one, print its cluster
# count and stop. An explicit test is used instead of `assert` so the check
# still runs under `python -O`, which strips assert statements.
for c in range(min_clusters, max_clusters):
    if (distance, method, c) not in sc.clusters_to_partitions:
        print(c)
        sys.exit()
from errors import filecheck_and_quit, directorycheck_and_make
import cPickle
import sys
import argparse
import re

# Program name shown in the usage message: the first run of characters in
# sys.argv[0] that matches the class below.
# NOTE(review): inside the class, `.-_` is a character *range* (0x2E-0x5F),
# not the three literals '.', '-' and '_'. Left unchanged; confirm intent.
progname = re.compile("[A-Za-z0-9.-_]+").search(sys.argv[0]).group()

desc = "\n".join(["Read a SequenceCollection from pickle,", "make a randomised copy,", "dump records"])
input_help = "Path+Filename for the input pickle file"
output_help = "Path to output directory. Will be created if doesn't exist"
# RawTextHelpFormatter keeps the explicit newlines in the description.
parser = argparse.ArgumentParser(prog=progname, description=desc, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("-i", "--input", help=input_help, type=str)
parser.add_argument("-o", "--output", help=output_help, type=str)

args = vars(parser.parse_args())

# NOTE: this name shadows the stdlib `pickle` module for the rest of the
# script; it holds the path of the input pickle file.
pickle = args["input"]
output_dir = args["output"]

filecheck_and_quit(pickle)  # can't find file -> quit
directorycheck_and_make(output_dir)  # can't find directory -> create it

# Load inside a context manager so the handle is closed promptly.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(pickle) as pickle_in:
    sc = cPickle.load(pickle_in)

scrand = sc.make_randomised_copy()

scrand.dump_records(output_dir)

# Closing the output file explicitly guarantees the pickle is flushed.
with open("{0}/scrand.pkl".format(output_dir), "w") as pickle_out:
    cPickle.dump(scrand, pickle_out)
# Example #11
# 0
    def simulate_from_record_WAG(
        cls,
        record,
        output_dir,
        name='tempsim',
        tmpdir='/tmp',
        allow_nonsense=False,
        split_lengths=None,
        gene_names=None,
    ):
        """
        Simulate an alignment from `record` using the 'WAG' model and write
        the result to `output_dir` in phylip format.

        A simulator instance (`cls(...)`) is configured from the record's
        sequence length, its tree and the tree's gamma parameter, then run
        via `runALF`. The first `*aa.fa` alignment it produces is read
        back, truncated to the original sequence length, relabelled with
        the names of the input tree and sorted by name.

        record        -- source record; `record.seqlength` and `record.tree`
                         are read
        output_dir    -- where the .phy file(s) are written and where
                         `trees.txt` is appended to
        name          -- simulation name; also the output file stem when the
                         result is not split
        tmpdir        -- existing directory for scratch files; checked with
                         `directorycheck_and_quit`
        allow_nonsense -- accepted but not used by this method
        split_lengths, gene_names
                      -- when both are truthy, the simulated record is split
                         with `split_by_lengths` and every part is written
                         to `<output_dir>/<part name>.phy`

        Returns None. The two ALF scratch directories are removed on
        success; `<tmpdir>/treefile.nwk` is left in place.
        Raises IndexError if no alignment file is found, and ValueError if
        a simulated header contains no '/'.
        """

        length = record.seqlength
        tree = record.tree
        directorycheck_and_quit(tmpdir)
        gamma = tree.extract_gamma_parameter()
        param_dir = '{0}/alf_parameter_dir'.format(tmpdir)
        working_dir = '{0}/alf_working_dir'.format(tmpdir)
        directorycheck_and_make(param_dir, verbose=False)
        directorycheck_and_make(working_dir, verbose=False)
        treefile = '{0}/treefile.nwk'.format(tmpdir)

        # NOTE(review): presumably a unit conversion of branch lengths
        # before the tree is handed to the simulator -- confirm.
        tree.pam2sps('sps2pam').write_to_file(treefile)

        # NOTE(review): repeats the two checks above, this time with the
        # helper's default verbosity. Kept so console output is unchanged.
        directorycheck_and_make(param_dir)
        directorycheck_and_make(working_dir)

        sim = cls(simulation_name=name,
                  working_directory=working_dir,
                  outfile_path=param_dir,
                  unit_is_pam=True)

        sim.indels()
        sim.rate_variation(gamma)
        sim.root_genome(number_of_genes=1, min_length=length)
        sim.one_word_model('WAG')
        sim.custom_tree(treefile)
        params = sim.write_parameters()
        sim.runALF(params, quiet=True)

        # Results are read from <working_dir>/<name>/.
        run_dir = '{0}/{1}'.format(working_dir, name)

        tree_newick = tree.newick
        # Read the simulator's tree inside a context manager so the file
        # handle is closed instead of being left to the garbage collector.
        with open('{0}/RealTree.nwk'.format(run_dir)) as newick_handle:
            alf_newick = newick_handle.read()

        # Pair up the labels (word characters directly before a ':') of the
        # two newick strings by position, mapping simulator names back to
        # the input tree's names.
        label_pattern = r'(\w+)(?=:)'
        replacement_dict = dict(
            zip(re.findall(label_pattern, alf_newick),
                re.findall(label_pattern, tree_newick)))

        alignment = glob.glob('{0}/MSA/*aa.fa'.format(run_dir))[0]

        new_record = TCSeqRec(alignment)
        # Trim every simulated sequence to the input alignment's length.
        new_record.sequences = [seq[:length] for seq in new_record.sequences]
        new_record._update()

        print(new_record.seqlength)
        # Headers are looked up by the part before their last '/'.
        new_record.headers = [
            replacement_dict[x[:x.rindex('/')]] for x in new_record.headers
        ]
        new_record._update()
        new_record.sort_by_name()

        split_output = split_lengths and gene_names
        if split_output:
            tree_label = '-'.join(gene_names)
        else:
            tree_label = new_record.name

        # Log the generating tree first, then write the alignment(s).
        with open('{0}/trees.txt'.format(output_dir), 'a') as trf:
            trf.write('{0}\t{1}\n'.format(tree_label, tree.newick))

        if split_output:
            for rec in new_record.split_by_lengths(split_lengths, gene_names):
                rec.write_phylip('{0}/{1}.phy'.format(output_dir, rec.name))
        else:
            new_record.write_phylip('{0}/{1}.phy'.format(output_dir, name))

        shutil.rmtree(param_dir)
        shutil.rmtree(working_dir)