Python SequenceCollection примеры, sequence_collection.SequenceCollection Python примеры использования

Пример #1

0

Показать файл

    except AssertionError:
        print 'Can\'t get partitions to match'
        return -1
    return partition_list

sort_key = lambda item: tuple((int(num) if num else alpha) for (num,alpha) in re.findall(r'(\d+)|(\D+)', item.name))

print 'loading recs...'
recs = sorted(load_records(sys.argv[1], sys.argv[2]), key=sort_key)
print 'loading crecs...'
crecs = load_records(sys.argv[3])
d = make_target_dict(crecs)
print 'done.'

t = [1]*15+[2]*15+[3]*15+[4]*15
c = SequenceCollection()
c.records = recs

for metric in ['sym', 'euc', 'geo']:
    c.put_distance_matrices(metric, gtp_path = '/homes/kgori/research/clustering_project/class_files')
    for method in ['single', 'complete', 'average', 'ward', 'MDS', 'spectral', 'kmedoids']:
        p = order(rebuild_partitions(recs, d, metric, method))
        c.clustering.partitions[(metric, method, 4)] = p
c.clustering.partitions['true'] = t
c.put_clusters()
print '(Done).'

for rec in c.get_cluster_records():
    try:
        rec.tree = d[rec.name].tree
    except KeyError:

Пример #2

0

Показать файл

Файл: test_code.py Проект: kgori/clustering_project

    
5)  Compare scores derived from clusters to random permutation of the original data
    either by making a copy of the SequenceCollection object, with clusters made up
    of the same number of genes with the same number of characters, or by randomising
    the alignments and performing hierarchical clustering on the randomised data

    if the former, do rand1 = col.make_randomised_copy
    if the latterm do rand2 = SequenceCollection(records=col.get_randomised_alignments(),
        datatype = 'protein')
"""


# indir = '/Users/kgori/git/kevin/yeast_data/MSA'
indir = '/Users/kgori/git/kevin/data/simulated_data/eight/MSA'

col = SequenceCollection(indir, datatype='protein')
ran = SequenceCollection(records=col.get_randomised_alignments(), datatype='protein')
col.put_trees_parallel()
ran.put_trees_parallel()
col.put_partitions(metrics=['euc','rf','sym'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10])
ran.put_partitions(metrics=['euc','rf','sym'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10])
col.put_clusters()
col.put_cluster_trees_parallel()
ran.put_clusters()
ran.put_cluster_trees_parallel()
rn2 = col.make_randomised_copy()

r1 = ran.get_clusters()
r2 = rn2.get_clusters()
cl = col.get_clusters()

Пример #3

0

Показать файл

Файл: doclustering.py Проект: haehn/clustering_project

        '\t')[1])

for rec in phymlrecords:
    rec.datatype = 'dna'
for rec in bionjrecords:
    rec.datatype = 'dna'

try:
    assert len(phymlrecords) == len(bionjrecords) == 60
except:
    print 'Missing records in {0}'.format(indir)
    sys.exit(1)

phyml_sc = SequenceCollection(records=phymlrecords,
                              datatype='dna',
                              helper=os.environ['DARWINHELPER'],
                              tmpdir=tmpdir,
                              get_distances=False)
bionj_sc = SequenceCollection(records=bionjrecords,
                              datatype='dna',
                              helper=os.environ['DARWINHELPER'],
                              tmpdir=tmpdir,
                              get_distances=False)

phyml_sc.put_partitions(
    ['geo', 'euc', 'sym'],
    ['single', 'complete', 'average', 'ward', 'kmedoids', 'MDS', 'spectral'],
    4,
    gtp_path=
    '/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files',
    tmpdir=tmpdir)

Пример #4

0

Показать файл

Файл: load_and_dump_records.py Проект: haehn/clustering_project

progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()
desc = 'Read in a SequenceCollection from disk and dump records'
input_help = 'Filepath+name of gzipped SequenceCollection object'
output_help = 'Directory to dump files in'
choice_help = \
    '\n'.join(['Choose to dump post-clustering concatenated records',
              'instead of pre-clustering single records'])
parser = argparse.ArgumentParser(prog=progname, description=desc)
parser.add_argument('-i', dest='input_file', help=input_help, type=str)
parser.add_argument('-o', dest='output_dir', help=output_help, type=str)
parser.add_argument('-c', dest='cluster_recs', action='store_true')

args = parser.parse_args()
input_file = args.input_file
output_dir = args.output_dir.rstrip('/')
cluster_recs = args.cluster_recs

filecheck_and_quit(input_file)
directorycheck_and_make(output_dir)

from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)
if cluster_recs:
    records = sc.get_cluster_records()
    sc.dump_records(output_dir, records)
else:
    records = sc.get_records()  # should be default anyway, but explicit
    sc.dump_records(output_dir,
                    records)  # is better than implicit, and all that

Пример #5

0

Показать файл

        ])
else:
    helper = os.environ['DARWINHELPER']

try:
    TMPDIR = os.environ['TEMPORARY_DIRECTORY']
except:
    TMPDIR = '/tmp'

### MAIN
if treeprog == 'treecollection':
    get_distances = True
else:
    get_distances = False
sc = SequenceCollection(indir, file_format=file_format,
                        gtp_path=gtp_path, datatype=datatype,
                        get_distances=get_distances, tmpdir=TMPDIR, helper=helper)

sc.put_trees(program=treeprog)
sc.put_partitions(['geo', 'euc', 'rf'], [
    'average',
    'complete',
    'kmedoids',
    'MDS',
    'single',
    'spectral00',
    'spectral01',
    'spectral10',
    'spectral11',
    'ward',
    ], nclasses, recalculate=True)

Пример #6

0

Показать файл

Файл: calc_scores.py Проект: kgori/clustering_project

    'average',
    'ward',
    'kmedoids',
    'spectral',
    'MDS',
    ]
calc_varinf = False

if os.path.isfile(outf):
    sys.exit(1)

sc = SequenceCollection(
    seqdir,
    tmpdir=TMPDIR,
    gtp_path=GTP_PATH,
    helper=HELPER,
    file_format=format,
    datatype=datatype,
    parallel_load=False,
    get_distances=False,
    )

sc.put_trees(program='bionj', model='GTR', tmpdir=TMPDIR, ncat=4,
             datatype='nt')

# sc.put_distance_matrices(['rf'])
# print sc.get_distance_matrices()['rf']
# sys.exit()

sc.put_partitions('rf', methods, nclasses, recalculate=True)
sc.put_partitions('euc', methods, nclasses, recalculate=True)
sc.put_partitions('geo', methods, nclasses, recalculate=True)

Пример #7

0

Показать файл

Файл: test_get_cluster_trees.py Проект: haehn/clustering_project

#!/usr/bin/python
# -*- coding: utf-8 -*-

from sequence_collection import SequenceCollection
import cPickle
import time

indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA'

print 'test directory = ', indir

load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='protein')
print col
load_end = time.time()

tcpar_start = time.time()
print 'putting TC trees (parallel)'
col.put_trees_parallel(program='treecollection', tmpdir='/tmp')
for rec in col.records:
    print rec.name
    print rec.tree
tcpar_end = time.time()

par_start = time.time()
print 'Putting partitions'
col.put_partitions(metrics=['sym', 'euc'],
                   linkages=['ward', 'single'],
                   nclasses=[3, 4, 5, 6])
print col.get_partitions()

Пример #8

0

Показать файл

Файл: test_new_dictionary.py Проект: kgori/clustering_project

from pylab import *
print 'done.'


def print_dict(d):
    for k in sorted(d):
        print d


np.set_printoptions(linewidth=200, precision=3)
sc = SequenceCollection(
    '/Users/kgori/scratch/chk/aa_alignments/',
    get_distances=False,
    file_format='phylip',
    helper='/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw'
        ,
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp',
    datatype='protein',
    )

sc_yeast = SequenceCollection(
    '/Users/kgori/scratch/yeast_MSA',
    get_distances=False,
    file_format='phylip',
    helper='/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw'
        ,
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp/yeast',

Пример #9

0

Показать файл

Файл: test_scoll.py Проект: haehn/clustering_project

from tree import Tree
from clustering import Clustering
import cPickle
import time
import os
import copy
import numpy as np
np.set_printoptions(precision=2, linewidth=200)

indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA'

print 'test directory = ', indir

load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='protein')
print col
load_end = time.time()

tcseq_start = time.time()
print 'getting TC trees (sequential)'
col.get_trees(program='treecollection', tmpdir='/tmp')
for rec in col.records:
    print rec.name
    print rec.tree
tcseq_end = time.time()

tcpar_start = time.time()
print 'getting TC trees (parallel)'
col.get_trees_parallel(program='treecollection', tmpdir='/tmp')
for rec in col.records:

Пример #10

0

Показать файл

from pylab import *
print 'done.'


def print_dict(d):
    for k in sorted(d):
        print d


np.set_printoptions(linewidth=200, precision=3)
sc = SequenceCollection(
    '/Users/kgori/scratch/chk/aa_alignments/',
    get_distances=False,
    file_format='phylip',
    helper=
    '/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw',
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp',
    datatype='protein',
)

sc_yeast = SequenceCollection(
    '/Users/kgori/scratch/yeast_MSA',
    get_distances=False,
    file_format='phylip',
    helper=
    '/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw',
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp/yeast',

Пример #11

0

Показать файл

Файл: test_get_cluster_trees.py Проект: kgori/clustering_project

#!/usr/bin/python
# -*- coding: utf-8 -*-

from sequence_collection import SequenceCollection
import cPickle
import time

indir = "/Users/kgori/git/kevin/data/simulated_data/small/MSA"

print "test directory = ", indir

load_start = time.time()
print "loading sequences (parallel)"
col = SequenceCollection(indir, datatype="protein")
print col
load_end = time.time()

tcpar_start = time.time()
print "putting TC trees (parallel)"
col.put_trees_parallel(program="treecollection", tmpdir="/tmp")
for rec in col.records:
    print rec.name
    print rec.tree
tcpar_end = time.time()

par_start = time.time()
print "Putting partitions"
col.put_partitions(metrics=["sym", "euc"], linkages=["ward", "single"], nclasses=[3, 4, 5, 6])
print col.get_partitions()
par_end = time.time()

Пример #12

0

Показать файл

Файл: pickle_simulation.py Проект: haehn/clustering_project

                    type=fpath,
                    default='/tmp')
parser.add_argument('-data', '--datatype', help='datatype', default=None)

args = vars(parser.parse_args())
outdir = args['directory']
program = args['program']
model = args['model']
ncat = args['ncat']
datatype = args['datatype']
gtp_path = os.environ['GTP_PATH']
tmpdir = os.environ['TEMPORARY_DIRECTORY']
tmpdir = args['tmpdir']
print 'Reading alignments into SequenceRecord object'
seq = SequenceCollection('{0}/dna_alignments'.format(outdir),
                         datatype='dna',
                         tmpdir=tmpdir,
                         helper=os.environ['DARWINHELPER'])
print 'Calculating trees'
print program, model, datatype, ncat, tmpdir
seq.put_trees(program=program,
              model=model,
              datatype=datatype,
              ncat=ncat,
              tmpdir=tmpdir)
print 'doing geodesic distance matrices'
seq.put_distance_matrices('geo', gtp_path=gtp_path, tmpdir=tmpdir)
print 'doing euc distance matrices'
seq.put_distance_matrices('euc')
print 'doing sym distance matrices'
seq.put_distance_matrices('sym')

Пример #13

0

Показать файл

Файл: cluster_TC_input.py Проект: haehn/clustering_project

        tree.read_from_file('{0}/trees/{1}.nwk'.format(working_dir,
                            name))

    dv_matrix_strip_header = '\n'.join(dv_matrix.split('\n'
            )[2:]).rstrip()
    labels_strip_header = labels.split('\n')[1].rstrip()
    record = TCSeqRec()
    record.dv = [(dv_matrix_strip_header, labels_strip_header)]
    record.tree = tree
    record.name = name
    record.headers = labels_strip_header.split()
    record.sequences = ['' for _ in record.headers]
    record._update()
    records.append(record)

collection = SequenceCollection(records=records, get_distances=False,
                                gtp_path=os.environ['GTP_PATH'])
collection.put_distance_matrices('rf')
T = \
    collection.Clustering.run_spectral_rotate(collection.distance_matrices['rf'
        ])
collection.partitions[T] = Partition(T)
collection.clusters_to_partitions[('rf', 'spectral_rotate', max(T))] = T
collection.concatenate_records()
cluster_recs = collection.get_cluster_records()

number_of_clusters = len(cluster_recs)
for j in range(number_of_clusters):
    record = cluster_recs[j]
    record_dv = record.dv[0]
    labels = record.dv[1]

Пример #14

0

Показать файл

Файл: read_alignments_and_cluster.py Проект: haehn/clustering_project

datatype = args['datatype']
score = args['score']

directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

from sequence_collection import SequenceCollection

sc = SequenceCollection(
    input_dir,
    file_format='phylip',
    datatype=datatype,
    helper=helper,
    gtp_path=gtp_path,
    tmpdir=tmpdir,
    overwrite=True,
)

sc.load_phyml_results(input_dir, program=None)
sc.quality_scores = {}
for dist in distance:
    (_, qs) = sc.autotune(dist,
                          max_groups=max_clusters,
                          min_groups=min_clusters)
    sc.quality_scores[dist] = qs
cluster_range = range(min_clusters, max_clusters + 1)
if min_clusters > 1:
    cluster_range.insert(0, 1)

Пример #15

0

Показать файл

        (simdir, 'bionj_clustering'))),
                             key=sort_key)
else:
    records = sorted(load_records('/'.join((simdir, record_dir)),
                                  '*.ml.pickle'),
                     key=sort_key)
    cluster_records = sorted(load_records('/'.join(
        (simdir, 'phyml_clustering'))),
                             key=sort_key)

cluster_dic = make_target_dict(cluster_records)
print '(Done).'

#rebuild sequenceCollection object

sc = SequenceCollection()
sc.records = records

print 'Generating distance matrices...'
for metric in ['euc', 'sym', 'geo']:
    sc.put_distance_matrices(
        metric,
        gtp_path='/homes/kgori/research/clustering_project/class_files',
        tmpdir=tmpdir)
    for method in [
            'single', 'complete', 'ward', 'average', 'spectral', 'MDS',
            'kmedoids'
    ]:
        partition = rebuild_partitions(records,
                                       cluster_dic,
                                       metric=metric,

Пример #16

0

Показать файл

Файл: calc_scores.py Проект: haehn/clustering_project

    'average',
    'ward',
    'kmedoids',
    'spectral',
    'MDS',
]
calc_varinf = False

if os.path.isfile(outf):
    sys.exit(1)

sc = SequenceCollection(
    seqdir,
    tmpdir=TMPDIR,
    gtp_path=GTP_PATH,
    helper=HELPER,
    file_format=format,
    datatype=datatype,
    parallel_load=False,
    get_distances=False,
)

sc.put_trees(program='bionj',
             model='GTR',
             tmpdir=TMPDIR,
             ncat=4,
             datatype='nt')

# sc.put_distance_matrices(['rf'])
# print sc.get_distance_matrices()['rf']
# sys.exit()

Пример #17

0

Показать файл

Файл: pickle_simulation.py Проект: kgori/clustering_project

parser.add_argument('-m', '--model', help='which model to use in phylogenetic inference', default=None)
parser.add_argument('-n', '--ncat', help='number of categories for gamma distributed rates', default=1)
parser.add_argument('-t', '--tmpdir', help='temporary directory', type=fpath, default='/tmp')
parser.add_argument('-data', '--datatype', help='datatype', default=None)

args = vars(parser.parse_args())
outdir = args['directory']
program = args['program']
model = args['model']
ncat = args['ncat']
datatype = args['datatype']
gtp_path = os.environ['GTP_PATH']
tmpdir = os.environ['TEMPORARY_DIRECTORY']
tmpdir = args['tmpdir']
print 'Reading alignments into SequenceRecord object'
seq = SequenceCollection('{0}/dna_alignments'.format(outdir), datatype='dna', tmpdir=tmpdir, helper=os.environ['DARWINHELPER'])
print 'Calculating trees'
print program,model,datatype,ncat,tmpdir
seq.put_trees(program=program, model=model, datatype=datatype, ncat=ncat, tmpdir=tmpdir)
print 'doing geodesic distance matrices'
seq.put_distance_matrices('geo', gtp_path=gtp_path, tmpdir=tmpdir)
print 'doing euc distance matrices'
seq.put_distance_matrices('euc')
print 'doing sym distance matrices'
seq.put_distance_matrices('sym')

print 'getting score for true clustering'
with open('{0}/treedistances.txt'.format(outdir)) as file:
	T = file.readline().rstrip().split('\t')[1][1:-1].split(', ')
seq.clustering.partitions['true'] = T
seq.put_clusters()

Пример #18

0

Показать файл

Файл: doclustering.py Проект: kgori/clustering_project

bionjrecords = sorted([cPickle.load(file(x)) for x in bionjpickles], key=lambda x:sort_key(x.name))

true = eval(open('{0}/treedistances.txt'.format(indir)).read().split('\n')[0].split('\t')[1])

for rec in phymlrecords:
    rec.datatype='dna'
for rec in bionjrecords:
    rec.datatype='dna'

try: 
    assert len(phymlrecords) == len(bionjrecords) == 60
except: 
    print 'Missing records in {0}'.format(indir)
    sys.exit(1)

phyml_sc = SequenceCollection(records=phymlrecords, datatype='dna', helper=os.environ['DARWINHELPER'],tmpdir=tmpdir, get_distances=False)
bionj_sc = SequenceCollection(records=bionjrecords, datatype='dna', helper=os.environ['DARWINHELPER'],tmpdir=tmpdir, get_distances=False)

phyml_sc.put_partitions(['geo','euc','sym'],['single','complete','average','ward','kmedoids','MDS','spectral'], 4, gtp_path='/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files', tmpdir=tmpdir)
bionj_sc.put_partitions(['geo','euc','sym'],['single','complete','average','ward','kmedoids','MDS','spectral'], 4, gtp_path='/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files', tmpdir=tmpdir)
phyml_sc.clustering.partitions['true']=true
bionj_sc.clustering.partitions['true']=true

phyml_sc.put_clusters()
bionj_sc.put_clusters()

if not os.path.isdir('{0}/phyml_clustering'.format(indir)):
    os.mkdir('{0}/phyml_clustering'.format(indir))
if not os.path.isdir('{0}/bionj_clustering'.format(indir)):
    os.mkdir('{0}/bionj_clustering'.format(indir))

Пример #19

0

Показать файл

Файл: test_code.py Проект: haehn/clustering_project

    col.put_cluster_trees_parallel()
    
5)  Compare scores derived from clusters to random permutation of the original data
    either by making a copy of the SequenceCollection object, with clusters made up
    of the same number of genes with the same number of characters, or by randomising
    the alignments and performing hierarchical clustering on the randomised data

    if the former, do rand1 = col.make_randomised_copy
    if the latterm do rand2 = SequenceCollection(records=col.get_randomised_alignments(),
        datatype = 'protein')
"""

# indir = '/Users/kgori/git/kevin/yeast_data/MSA'
indir = '/Users/kgori/git/kevin/data/simulated_data/eight/MSA'

col = SequenceCollection(indir, datatype='protein')
ran = SequenceCollection(records=col.get_randomised_alignments(),
                         datatype='protein')
col.put_trees_parallel()
ran.put_trees_parallel()
col.put_partitions(metrics=['euc', 'rf', 'sym'],
                   linkages=['ward'],
                   nclasses=[2, 3, 4, 5, 6, 7, 8, 9, 10])
ran.put_partitions(metrics=['euc', 'rf', 'sym'],
                   linkages=['ward'],
                   nclasses=[2, 3, 4, 5, 6, 7, 8, 9, 10])
col.put_clusters()
col.put_cluster_trees_parallel()
ran.put_clusters()
ran.put_cluster_trees_parallel()
rn2 = col.make_randomised_copy()

Пример #20

0

Показать файл

Файл: test_randomiser.py Проект: haehn/clustering_project

"""
print 'loading sequences...'
col = SequenceCollection(indir, helper=helper, tmpdir=tmpdir)

print 'getting trees...'
col.put_trees_parallel(program='phyml',tmpdir=tmpdir)

print 'getting partitions...'
col.put_partitions(metrics=['sym'],linkages=['ward'],nclasses=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17])
#print 'randomizing bytes...'
col.put_clusters()

#print 'immanentizing the eschaton...'
col.put_cluster_trees_parallel(program='phyml',tmpdir=tmpdir)
"""
plottable = []
#plottable.append(add_to_plot(col,'sym','ward'))
col = cPickle.load(file('col.pickle'))
#print 'whipping into frenzy...'
for i in range(1):
    r = SequenceCollection(records=col.get_randomised_alignments(), helper=helper, tmpdir=tmpdir)
    r.put_trees_parallel(program='phyml',tmpdir=tmpdir)
    r.put_partitions(metrics=['sym'],linkages=['ward'],nclasses=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17])
    r.put_clusters()
    r.put_cluster_trees_parallel(program='phyml',tmpdir=tmpdir)   
    plottable.append(add_to_plot(r, 'sym', 'ward'))
    
cPickle.dump(plottable, file('plottable{0}.pickle'.format(index),'w'))
#cPickle.dump(col, file('col.pickle','w'))

Пример #21

0

Показать файл

Файл: test_get_trees.py Проект: kgori/clustering_project

#!/usr/bin/python
# -*- coding: utf-8 -*-


from sequence_collection import SequenceCollection
import time

indir = '/Users/kgori/git/kevin/data/real_data/yeast_data/MSA'

print 'test directory = ', indir

load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='dna')
print col
load_end = time.time()

col.put_trees_parallel()
col.put_partitions(metrics=['sym','geodesic'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16])
col.put_clusters()
col.put_cluster_trees_parallel()

timings = [
load_end - load_start
]

print 'time = {0:.3f}'.format(*timings)

Пример #22

0

Показать файл

Файл: test_scoll.py Проект: kgori/clustering_project

from tree import Tree
from clustering import Clustering
import cPickle
import time
import os
import copy
import numpy as np
np.set_printoptions(precision=2, linewidth=200)

indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA'

print 'test directory = ', indir

load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='protein')
print col
load_end = time.time()

tcseq_start = time.time()
print 'getting TC trees (sequential)'
col.get_trees(program='treecollection', tmpdir='/tmp')
for rec in col.records:
    print rec.name
    print rec.tree
tcseq_end = time.time()

tcpar_start = time.time()
print 'getting TC trees (parallel)'
col.get_trees_parallel(program='treecollection', tmpdir='/tmp')
for rec in col.records:

Пример #23

0

Показать файл

Файл: read_alignments_and_cluster.py Проект: kgori/clustering_project

datatype = args['datatype']
score = args['score']

directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

from sequence_collection import SequenceCollection

sc = SequenceCollection(
    input_dir,
    file_format='phylip',
    datatype=datatype,
    helper=helper,
    gtp_path=gtp_path,
    tmpdir=tmpdir,
    overwrite=True,
    )

sc.load_phyml_results(input_dir, program=None)
sc.quality_scores = {}
for dist in distance:
    (_, qs) = sc.autotune(dist, max_groups=max_clusters,
                          min_groups=min_clusters)
    sc.quality_scores[dist] = qs
cluster_range = range(min_clusters, max_clusters+1)
if min_clusters > 1:
    cluster_range.insert(0, 1)
sc.put_partitions(distance, method, cluster_range)

Пример #24

0

Показать файл

Файл: findings.py Проект: kgori/clustering_project

print 'Reading records...'

# some initialisation
if program == 'bionj':
    records = sorted(load_records('/'.join((simdir, record_dir)), '*.nj.pickle'), key=sort_key)
    cluster_records = sorted(load_records('/'.join((simdir, 'bionj_clustering'))), key=sort_key)
else:
    records = sorted(load_records('/'.join((simdir, record_dir)), '*.ml.pickle'), key=sort_key)
    cluster_records = sorted(load_records('/'.join((simdir, 'phyml_clustering'))), key=sort_key)

cluster_dic = make_target_dict(cluster_records)
print '(Done).'

#rebuild sequenceCollection object

sc = SequenceCollection()
sc.records = records

print 'Generating distance matrices...'
for metric in ['euc', 'sym', 'geo']:
    sc.put_distance_matrices(metric, gtp_path = '/homes/kgori/research/clustering_project/class_files', tmpdir=tmpdir)
    for method in ['single', 'complete', 'ward', 'average', 'spectral', 'MDS', 'kmedoids']:
        partition = rebuild_partitions(records, cluster_dic, metric=metric, method=method)
        sc.clustering.partitions[(metric, method, 4)] = partition

sc.clustering.partitions['true'] = [1]*15 + [2]*15 + [3]*15 + [4]*15

sc.put_clusters()
print '(Done).'

for rec in sc.get_cluster_records():

Пример #25

0

Показать файл

Файл: load_final_results.py Проект: kgori/clustering_project

import argparse
import re
import sys
from errors import filecheck_and_quit, directorycheck_and_quit

progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()
desc = 'Read in a SequenceCollection from disk and print scores'
input_help = 'Filepath+name of gzipped SequenceCollection object'
category_choices = ['Observed', 'Randomised', 'Simulated', 'NA']

parser = argparse.ArgumentParser(prog=progname, description=desc)
parser.add_argument('-i', dest='input_file', help=input_help, type=str)
parser.add_argument('-t', dest='phyml_dir', help=input_help, type=str)

args = parser.parse_args()
input_file = args.input_file
phyml_dir = args.phyml_dir.rstrip('/')

filecheck_and_quit(input_file)
directorycheck_and_quit(phyml_dir)

from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)
cluster_records = sc.get_cluster_records()
sc.load_phyml_results(phyml_dir, records=cluster_records,
                      use_hashname=True)
sc.update_scores()
sc.gzip(input_file)

Пример #26

0

Показать файл

print indir

rob.r('library(ape)')
rob.r('library(phangorn)')
print 'r libraries loaded'

alignments_dir = '{0}/dna_alignments'.format(indir)

if os.path.isfile('{0}/tmpPickle.pkl'.format(indir)):
    sc = cPickle.load(file('{0}/tmpPickle.pkl'.format(indir)))
else:

    sc = SequenceCollection(
        alignments_dir,
        file_format='fasta',
        datatype='dna',
        gtp_path=gtp_path,
        helper=helper,
        tmpdir=tmpdir,
        )

    sc.put_trees(program='bionj')
    sc.put_partitions('geo', 'spectral', 4)
    sc.concatenate_records()
    sc.put_cluster_trees(program='bionj')
    cPickle.dump(sc, open('{0}/tmpPickle.pkl'.format(indir), 'w'))
print 'SC object available'

# Plot the heatmap of the distance matrix

dm = sc.get_distance_matrices()['geo']
p = sc.partitions[sc.clusters_to_partitions[('geo', 'spectral', 4)]]

Python SequenceCollection примеры использования