except AssertionError: print 'Can\'t get partitions to match' return -1 return partition_list sort_key = lambda item: tuple((int(num) if num else alpha) for (num,alpha) in re.findall(r'(\d+)|(\D+)', item.name)) print 'loading recs...' recs = sorted(load_records(sys.argv[1], sys.argv[2]), key=sort_key) print 'loading crecs...' crecs = load_records(sys.argv[3]) d = make_target_dict(crecs) print 'done.' t = [1]*15+[2]*15+[3]*15+[4]*15 c = SequenceCollection() c.records = recs for metric in ['sym', 'euc', 'geo']: c.put_distance_matrices(metric, gtp_path = '/homes/kgori/research/clustering_project/class_files') for method in ['single', 'complete', 'average', 'ward', 'MDS', 'spectral', 'kmedoids']: p = order(rebuild_partitions(recs, d, metric, method)) c.clustering.partitions[(metric, method, 4)] = p c.clustering.partitions['true'] = t c.put_clusters() print '(Done).' for rec in c.get_cluster_records(): try: rec.tree = d[rec.name].tree except KeyError:
5) Compare scores derived from clusters to random permutation of the original data either by making a copy of the SequenceCollection object, with clusters made up of the same number of genes with the same number of characters, or by randomising the alignments and performing hierarchical clustering on the randomised data if the former, do rand1 = col.make_randomised_copy() if the latter, do rand2 = SequenceCollection(records=col.get_randomised_alignments(), datatype = 'protein') """ # indir = '/Users/kgori/git/kevin/yeast_data/MSA' indir = '/Users/kgori/git/kevin/data/simulated_data/eight/MSA' col = SequenceCollection(indir, datatype='protein') ran = SequenceCollection(records=col.get_randomised_alignments(), datatype='protein') col.put_trees_parallel() ran.put_trees_parallel() col.put_partitions(metrics=['euc','rf','sym'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10]) ran.put_partitions(metrics=['euc','rf','sym'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10]) col.put_clusters() col.put_cluster_trees_parallel() ran.put_clusters() ran.put_cluster_trees_parallel() rn2 = col.make_randomised_copy() r1 = ran.get_clusters() r2 = rn2.get_clusters() cl = col.get_clusters()
'\t')[1]) for rec in phymlrecords: rec.datatype = 'dna' for rec in bionjrecords: rec.datatype = 'dna' try: assert len(phymlrecords) == len(bionjrecords) == 60 except: print 'Missing records in {0}'.format(indir) sys.exit(1) phyml_sc = SequenceCollection(records=phymlrecords, datatype='dna', helper=os.environ['DARWINHELPER'], tmpdir=tmpdir, get_distances=False) bionj_sc = SequenceCollection(records=bionjrecords, datatype='dna', helper=os.environ['DARWINHELPER'], tmpdir=tmpdir, get_distances=False) phyml_sc.put_partitions( ['geo', 'euc', 'sym'], ['single', 'complete', 'average', 'ward', 'kmedoids', 'MDS', 'spectral'], 4, gtp_path= '/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files', tmpdir=tmpdir)
# Command-line front end: load a gzipped SequenceCollection from disk and
# dump its records into an output directory.  With -c the concatenated
# per-cluster records are dumped instead of the individual records.
import os

# Use the script's file name as the program name.  The previous pattern
# '[A-Za-z0-9.-_]+' contained an accidental character range ('.' to '_'):
# it let '/' through but excluded '-', so any hyphen in the script path
# silently truncated the reported program name.
progname = os.path.basename(sys.argv[0])
desc = 'Read in a SequenceCollection from disk and dump records'
input_help = 'Filepath+name of gzipped SequenceCollection object'
output_help = 'Directory to dump files in'
choice_help = \
    '\n'.join(['Choose to dump post-clustering concatenated records',
               'instead of pre-clustering single records'])

parser = argparse.ArgumentParser(prog=progname, description=desc)
# Both paths are required: without -o the script used to die with an
# AttributeError on None.rstrip('/') instead of a usage message.
parser.add_argument('-i', dest='input_file', help=input_help, type=str,
                    required=True)
parser.add_argument('-o', dest='output_dir', help=output_help, type=str,
                    required=True)
# choice_help was built above but never attached to the flag it describes.
parser.add_argument('-c', dest='cluster_recs', action='store_true',
                    help=choice_help)
args = parser.parse_args()

input_file = args.input_file
output_dir = args.output_dir.rstrip('/')
cluster_recs = args.cluster_recs

filecheck_and_quit(input_file)
directorycheck_and_make(output_dir)

from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)
if cluster_recs:
    records = sc.get_cluster_records()
else:
    # should be default anyway, but explicit is better than implicit
    records = sc.get_records()
sc.dump_records(output_dir, records)
]) else: helper = os.environ['DARWINHELPER'] try: TMPDIR = os.environ['TEMPORARY_DIRECTORY'] except: TMPDIR = '/tmp' ### MAIN if treeprog == 'treecollection': get_distances = True else: get_distances = False sc = SequenceCollection(indir, file_format=file_format, gtp_path=gtp_path, datatype=datatype, get_distances=get_distances, tmpdir=TMPDIR, helper=helper) sc.put_trees(program=treeprog) sc.put_partitions(['geo', 'euc', 'rf'], [ 'average', 'complete', 'kmedoids', 'MDS', 'single', 'spectral00', 'spectral01', 'spectral10', 'spectral11', 'ward', ], nclasses, recalculate=True)
'average', 'ward', 'kmedoids', 'spectral', 'MDS', ] calc_varinf = False if os.path.isfile(outf): sys.exit(1) sc = SequenceCollection( seqdir, tmpdir=TMPDIR, gtp_path=GTP_PATH, helper=HELPER, file_format=format, datatype=datatype, parallel_load=False, get_distances=False, ) sc.put_trees(program='bionj', model='GTR', tmpdir=TMPDIR, ncat=4, datatype='nt') # sc.put_distance_matrices(['rf']) # print sc.get_distance_matrices()['rf'] # sys.exit() sc.put_partitions('rf', methods, nclasses, recalculate=True) sc.put_partitions('euc', methods, nclasses, recalculate=True) sc.put_partitions('geo', methods, nclasses, recalculate=True)
#!/usr/bin/python # -*- coding: utf-8 -*- from sequence_collection import SequenceCollection import cPickle import time indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA' print 'test directory = ', indir load_start = time.time() print 'loading sequences (parallel)' col = SequenceCollection(indir, datatype='protein') print col load_end = time.time() tcpar_start = time.time() print 'putting TC trees (parallel)' col.put_trees_parallel(program='treecollection', tmpdir='/tmp') for rec in col.records: print rec.name print rec.tree tcpar_end = time.time() par_start = time.time() print 'Putting partitions' col.put_partitions(metrics=['sym', 'euc'], linkages=['ward', 'single'], nclasses=[3, 4, 5, 6]) print col.get_partitions()
from pylab import * print 'done.' def print_dict(d): for k in sorted(d): print d np.set_printoptions(linewidth=200, precision=3) sc = SequenceCollection( '/Users/kgori/scratch/chk/aa_alignments/', get_distances=False, file_format='phylip', helper='/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw' , parallel_load=True, gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/', tmpdir='/tmp', datatype='protein', ) sc_yeast = SequenceCollection( '/Users/kgori/scratch/yeast_MSA', get_distances=False, file_format='phylip', helper='/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw' , parallel_load=True, gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/', tmpdir='/tmp/yeast',
from tree import Tree from clustering import Clustering import cPickle import time import os import copy import numpy as np np.set_printoptions(precision=2, linewidth=200) indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA' print 'test directory = ', indir load_start = time.time() print 'loading sequences (parallel)' col = SequenceCollection(indir, datatype='protein') print col load_end = time.time() tcseq_start = time.time() print 'getting TC trees (sequential)' col.get_trees(program='treecollection', tmpdir='/tmp') for rec in col.records: print rec.name print rec.tree tcseq_end = time.time() tcpar_start = time.time() print 'getting TC trees (parallel)' col.get_trees_parallel(program='treecollection', tmpdir='/tmp') for rec in col.records:
from pylab import * print 'done.' def print_dict(d): for k in sorted(d): print d np.set_printoptions(linewidth=200, precision=3) sc = SequenceCollection( '/Users/kgori/scratch/chk/aa_alignments/', get_distances=False, file_format='phylip', helper= '/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw', parallel_load=True, gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/', tmpdir='/tmp', datatype='protein', ) sc_yeast = SequenceCollection( '/Users/kgori/scratch/yeast_MSA', get_distances=False, file_format='phylip', helper= '/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw', parallel_load=True, gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/', tmpdir='/tmp/yeast',
#!/usr/bin/python # -*- coding: utf-8 -*- from sequence_collection import SequenceCollection import cPickle import time indir = "/Users/kgori/git/kevin/data/simulated_data/small/MSA" print "test directory = ", indir load_start = time.time() print "loading sequences (parallel)" col = SequenceCollection(indir, datatype="protein") print col load_end = time.time() tcpar_start = time.time() print "putting TC trees (parallel)" col.put_trees_parallel(program="treecollection", tmpdir="/tmp") for rec in col.records: print rec.name print rec.tree tcpar_end = time.time() par_start = time.time() print "Putting partitions" col.put_partitions(metrics=["sym", "euc"], linkages=["ward", "single"], nclasses=[3, 4, 5, 6]) print col.get_partitions() par_end = time.time()
type=fpath, default='/tmp') parser.add_argument('-data', '--datatype', help='datatype', default=None) args = vars(parser.parse_args()) outdir = args['directory'] program = args['program'] model = args['model'] ncat = args['ncat'] datatype = args['datatype'] gtp_path = os.environ['GTP_PATH'] tmpdir = os.environ['TEMPORARY_DIRECTORY'] tmpdir = args['tmpdir'] print 'Reading alignments into SequenceRecord object' seq = SequenceCollection('{0}/dna_alignments'.format(outdir), datatype='dna', tmpdir=tmpdir, helper=os.environ['DARWINHELPER']) print 'Calculating trees' print program, model, datatype, ncat, tmpdir seq.put_trees(program=program, model=model, datatype=datatype, ncat=ncat, tmpdir=tmpdir) print 'doing geodesic distance matrices' seq.put_distance_matrices('geo', gtp_path=gtp_path, tmpdir=tmpdir) print 'doing euc distance matrices' seq.put_distance_matrices('euc') print 'doing sym distance matrices' seq.put_distance_matrices('sym')
tree.read_from_file('{0}/trees/{1}.nwk'.format(working_dir, name)) dv_matrix_strip_header = '\n'.join(dv_matrix.split('\n' )[2:]).rstrip() labels_strip_header = labels.split('\n')[1].rstrip() record = TCSeqRec() record.dv = [(dv_matrix_strip_header, labels_strip_header)] record.tree = tree record.name = name record.headers = labels_strip_header.split() record.sequences = ['' for _ in record.headers] record._update() records.append(record) collection = SequenceCollection(records=records, get_distances=False, gtp_path=os.environ['GTP_PATH']) collection.put_distance_matrices('rf') T = \ collection.Clustering.run_spectral_rotate(collection.distance_matrices['rf' ]) collection.partitions[T] = Partition(T) collection.clusters_to_partitions[('rf', 'spectral_rotate', max(T))] = T collection.concatenate_records() cluster_recs = collection.get_cluster_records() number_of_clusters = len(cluster_recs) for j in range(number_of_clusters): record = cluster_recs[j] record_dv = record.dv[0] labels = record.dv[1]
# Pull the remaining options out of the parsed arguments and make sure the
# input and temporary directories are usable before doing any work.
datatype = args['datatype']
score = args['score']
directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

# External tool locations come from the environment (KeyError if unset).
gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

from sequence_collection import SequenceCollection

sc = SequenceCollection(input_dir, file_format='phylip', datatype=datatype,
                        helper=helper, gtp_path=gtp_path, tmpdir=tmpdir,
                        overwrite=True)
sc.load_phyml_results(input_dir, program=None)

# Record the second element of autotune()'s result for every distance.
sc.quality_scores = {}
for dist in distance:
    _, qs = sc.autotune(dist, max_groups=max_clusters,
                        min_groups=min_clusters)
    sc.quality_scores[dist] = qs

# Cluster counts min..max inclusive, with 1 put in front when the requested
# minimum is above 1 (Python 2: range() returns a list).
cluster_range = range(min_clusters, max_clusters + 1)
if min_clusters > 1:
    cluster_range = [1] + cluster_range
(simdir, 'bionj_clustering'))), key=sort_key) else: records = sorted(load_records('/'.join((simdir, record_dir)), '*.ml.pickle'), key=sort_key) cluster_records = sorted(load_records('/'.join( (simdir, 'phyml_clustering'))), key=sort_key) cluster_dic = make_target_dict(cluster_records) print '(Done).' #rebuild sequenceCollection object sc = SequenceCollection() sc.records = records print 'Generating distance matrices...' for metric in ['euc', 'sym', 'geo']: sc.put_distance_matrices( metric, gtp_path='/homes/kgori/research/clustering_project/class_files', tmpdir=tmpdir) for method in [ 'single', 'complete', 'ward', 'average', 'spectral', 'MDS', 'kmedoids' ]: partition = rebuild_partitions(records, cluster_dic, metric=metric,
'average', 'ward', 'kmedoids', 'spectral', 'MDS', ] calc_varinf = False if os.path.isfile(outf): sys.exit(1) sc = SequenceCollection( seqdir, tmpdir=TMPDIR, gtp_path=GTP_PATH, helper=HELPER, file_format=format, datatype=datatype, parallel_load=False, get_distances=False, ) sc.put_trees(program='bionj', model='GTR', tmpdir=TMPDIR, ncat=4, datatype='nt') # sc.put_distance_matrices(['rf']) # print sc.get_distance_matrices()['rf'] # sys.exit()
parser.add_argument('-m', '--model', help='which model to use in phylogenetic inference', default=None) parser.add_argument('-n', '--ncat', help='number of categories for gamma distributed rates', default=1) parser.add_argument('-t', '--tmpdir', help='temporary directory', type=fpath, default='/tmp') parser.add_argument('-data', '--datatype', help='datatype', default=None) args = vars(parser.parse_args()) outdir = args['directory'] program = args['program'] model = args['model'] ncat = args['ncat'] datatype = args['datatype'] gtp_path = os.environ['GTP_PATH'] tmpdir = os.environ['TEMPORARY_DIRECTORY'] tmpdir = args['tmpdir'] print 'Reading alignments into SequenceRecord object' seq = SequenceCollection('{0}/dna_alignments'.format(outdir), datatype='dna', tmpdir=tmpdir, helper=os.environ['DARWINHELPER']) print 'Calculating trees' print program,model,datatype,ncat,tmpdir seq.put_trees(program=program, model=model, datatype=datatype, ncat=ncat, tmpdir=tmpdir) print 'doing geodesic distance matrices' seq.put_distance_matrices('geo', gtp_path=gtp_path, tmpdir=tmpdir) print 'doing euc distance matrices' seq.put_distance_matrices('euc') print 'doing sym distance matrices' seq.put_distance_matrices('sym') print 'getting score for true clustering' with open('{0}/treedistances.txt'.format(outdir)) as file: T = file.readline().rstrip().split('\t')[1][1:-1].split(', ') seq.clustering.partitions['true'] = T seq.put_clusters()
bionjrecords = sorted([cPickle.load(file(x)) for x in bionjpickles], key=lambda x:sort_key(x.name)) true = eval(open('{0}/treedistances.txt'.format(indir)).read().split('\n')[0].split('\t')[1]) for rec in phymlrecords: rec.datatype='dna' for rec in bionjrecords: rec.datatype='dna' try: assert len(phymlrecords) == len(bionjrecords) == 60 except: print 'Missing records in {0}'.format(indir) sys.exit(1) phyml_sc = SequenceCollection(records=phymlrecords, datatype='dna', helper=os.environ['DARWINHELPER'],tmpdir=tmpdir, get_distances=False) bionj_sc = SequenceCollection(records=bionjrecords, datatype='dna', helper=os.environ['DARWINHELPER'],tmpdir=tmpdir, get_distances=False) phyml_sc.put_partitions(['geo','euc','sym'],['single','complete','average','ward','kmedoids','MDS','spectral'], 4, gtp_path='/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files', tmpdir=tmpdir) bionj_sc.put_partitions(['geo','euc','sym'],['single','complete','average','ward','kmedoids','MDS','spectral'], 4, gtp_path='/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files', tmpdir=tmpdir) phyml_sc.clustering.partitions['true']=true bionj_sc.clustering.partitions['true']=true phyml_sc.put_clusters() bionj_sc.put_clusters() if not os.path.isdir('{0}/phyml_clustering'.format(indir)): os.mkdir('{0}/phyml_clustering'.format(indir)) if not os.path.isdir('{0}/bionj_clustering'.format(indir)): os.mkdir('{0}/bionj_clustering'.format(indir))
col.put_cluster_trees_parallel() 5) Compare scores derived from clusters to random permutation of the original data either by making a copy of the SequenceCollection object, with clusters made up of the same number of genes with the same number of characters, or by randomising the alignments and performing hierarchical clustering on the randomised data if the former, do rand1 = col.make_randomised_copy() if the latter, do rand2 = SequenceCollection(records=col.get_randomised_alignments(), datatype = 'protein') """ # indir = '/Users/kgori/git/kevin/yeast_data/MSA' indir = '/Users/kgori/git/kevin/data/simulated_data/eight/MSA' col = SequenceCollection(indir, datatype='protein') ran = SequenceCollection(records=col.get_randomised_alignments(), datatype='protein') col.put_trees_parallel() ran.put_trees_parallel() col.put_partitions(metrics=['euc', 'rf', 'sym'], linkages=['ward'], nclasses=[2, 3, 4, 5, 6, 7, 8, 9, 10]) ran.put_partitions(metrics=['euc', 'rf', 'sym'], linkages=['ward'], nclasses=[2, 3, 4, 5, 6, 7, 8, 9, 10]) col.put_clusters() col.put_cluster_trees_parallel() ran.put_clusters() ran.put_cluster_trees_parallel() rn2 = col.make_randomised_copy()
""" print 'loading sequences...' col = SequenceCollection(indir, helper=helper, tmpdir=tmpdir) print 'getting trees...' col.put_trees_parallel(program='phyml',tmpdir=tmpdir) print 'getting partitions...' col.put_partitions(metrics=['sym'],linkages=['ward'],nclasses=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]) #print 'randomizing bytes...' col.put_clusters() #print 'immanentizing the eschaton...' col.put_cluster_trees_parallel(program='phyml',tmpdir=tmpdir) """ plottable = [] #plottable.append(add_to_plot(col,'sym','ward')) col = cPickle.load(file('col.pickle')) #print 'whipping into frenzy...' for i in range(1): r = SequenceCollection(records=col.get_randomised_alignments(), helper=helper, tmpdir=tmpdir) r.put_trees_parallel(program='phyml',tmpdir=tmpdir) r.put_partitions(metrics=['sym'],linkages=['ward'],nclasses=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]) r.put_clusters() r.put_cluster_trees_parallel(program='phyml',tmpdir=tmpdir) plottable.append(add_to_plot(r, 'sym', 'ward')) cPickle.dump(plottable, file('plottable{0}.pickle'.format(index),'w')) #cPickle.dump(col, file('col.pickle','w'))
#!/usr/bin/python # -*- coding: utf-8 -*- from sequence_collection import SequenceCollection import time indir = '/Users/kgori/git/kevin/data/real_data/yeast_data/MSA' print 'test directory = ', indir load_start = time.time() print 'loading sequences (parallel)' col = SequenceCollection(indir, datatype='dna') print col load_end = time.time() col.put_trees_parallel() col.put_partitions(metrics=['sym','geodesic'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]) col.put_clusters() col.put_cluster_trees_parallel() timings = [ load_end - load_start ] print 'time = {0:.3f}'.format(*timings)
# Pull the remaining options out of the parsed arguments and make sure the
# input and temporary directories are usable before doing any work.
datatype = args['datatype']
score = args['score']
directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

# External tool locations come from the environment (KeyError if unset).
gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

from sequence_collection import SequenceCollection

sc = SequenceCollection(input_dir, file_format='phylip', datatype=datatype,
                        helper=helper, gtp_path=gtp_path, tmpdir=tmpdir,
                        overwrite=True)
sc.load_phyml_results(input_dir, program=None)

# Record the second element of autotune()'s result for every distance.
sc.quality_scores = {}
for dist in distance:
    _, qs = sc.autotune(dist, max_groups=max_clusters,
                        min_groups=min_clusters)
    sc.quality_scores[dist] = qs

# Cluster counts min..max inclusive, with 1 put in front when the requested
# minimum is above 1 (Python 2: range() returns a list).
cluster_range = range(min_clusters, max_clusters + 1)
if min_clusters > 1:
    cluster_range = [1] + cluster_range
sc.put_partitions(distance, method, cluster_range)
print 'Reading records...' # some initialisation if program == 'bionj': records = sorted(load_records('/'.join((simdir, record_dir)), '*.nj.pickle'), key=sort_key) cluster_records = sorted(load_records('/'.join((simdir, 'bionj_clustering'))), key=sort_key) else: records = sorted(load_records('/'.join((simdir, record_dir)), '*.ml.pickle'), key=sort_key) cluster_records = sorted(load_records('/'.join((simdir, 'phyml_clustering'))), key=sort_key) cluster_dic = make_target_dict(cluster_records) print '(Done).' #rebuild sequenceCollection object sc = SequenceCollection() sc.records = records print 'Generating distance matrices...' for metric in ['euc', 'sym', 'geo']: sc.put_distance_matrices(metric, gtp_path = '/homes/kgori/research/clustering_project/class_files', tmpdir=tmpdir) for method in ['single', 'complete', 'ward', 'average', 'spectral', 'MDS', 'kmedoids']: partition = rebuild_partitions(records, cluster_dic, metric=metric, method=method) sc.clustering.partitions[(metric, method, 4)] = partition sc.clustering.partitions['true'] = [1]*15 + [2]*15 + [3]*15 + [4]*15 sc.put_clusters() print '(Done).' for rec in sc.get_cluster_records():
# Command-line front end: load a gzipped SequenceCollection, attach phyml
# results from a directory to its cluster records, update the scores and
# write the collection back to the same gzipped file.
import argparse
import os
import re
import sys
from errors import filecheck_and_quit, directorycheck_and_quit

# Use the script's file name as the program name.  The previous pattern
# '[A-Za-z0-9.-_]+' contained an accidental character range ('.' to '_'):
# it let '/' through but excluded '-', so any hyphen in the script path
# silently truncated the reported program name.
progname = os.path.basename(sys.argv[0])
desc = 'Read in a SequenceCollection from disk and print scores'
input_help = 'Filepath+name of gzipped SequenceCollection object'
# -t previously reused input_help, so --help described the phyml results
# directory as a gzipped SequenceCollection file.
phyml_help = 'Directory containing the phyml results to load'
category_choices = ['Observed', 'Randomised', 'Simulated', 'NA']

parser = argparse.ArgumentParser(prog=progname, description=desc)
# Both options are required: without -t the script used to die with an
# AttributeError on None.rstrip('/') instead of a usage message.
parser.add_argument('-i', dest='input_file', help=input_help, type=str,
                    required=True)
parser.add_argument('-t', dest='phyml_dir', help=phyml_help, type=str,
                    required=True)
args = parser.parse_args()

input_file = args.input_file
phyml_dir = args.phyml_dir.rstrip('/')

filecheck_and_quit(input_file)
directorycheck_and_quit(phyml_dir)

from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)
cluster_records = sc.get_cluster_records()
sc.load_phyml_results(phyml_dir, records=cluster_records, use_hashname=True)
sc.update_scores()
# The updated collection overwrites the input file.
sc.gzip(input_file)
print indir rob.r('library(ape)') rob.r('library(phangorn)') print 'r libraries loaded' alignments_dir = '{0}/dna_alignments'.format(indir) if os.path.isfile('{0}/tmpPickle.pkl'.format(indir)): sc = cPickle.load(file('{0}/tmpPickle.pkl'.format(indir))) else: sc = SequenceCollection( alignments_dir, file_format='fasta', datatype='dna', gtp_path=gtp_path, helper=helper, tmpdir=tmpdir, ) sc.put_trees(program='bionj') sc.put_partitions('geo', 'spectral', 4) sc.concatenate_records() sc.put_cluster_trees(program='bionj') cPickle.dump(sc, open('{0}/tmpPickle.pkl'.format(indir), 'w')) print 'SC object available' # Plot the heatmap of the distance matrix dm = sc.get_distance_matrices()['geo'] p = sc.partitions[sc.clusters_to_partitions[('geo', 'spectral', 4)]]