#!/usr/bin/env python
# Example script: read a partis annotation csv and a partis partition csv.
# For each annotation row that has a v gene, utils.add_implicit_info() is called on it;
# the partition file is then loaded into a ClusterPath.
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')  # must happen before the partis imports below
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

# NOTE single-argument, parenthesised prints behave identically under the python 2 print statement
print('first parse an annotation csv file:')
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue  # skip the failed row
        utils.add_implicit_info(glfo, line)

print('then parse a partition csv file:')
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
# Example #2
def run_bios2mds(n_components, n_clusters, seqfos, base_workdir, seed, aligned=False, reco_info=None, region=None,
                 max_runs=100, max_iterations=1000, method='euclidean',
                 plotdir=None, plotname='mds', queries_to_include=None, color_scale_vals=None, labels=None, title=None, remove_duplicates=False, debug=False):
    """Run the R package bios2mds (mds, plus optionally k-means) on <seqfos>.

    Writes an msa fasta file to <base_workdir>/mds, builds a list of R commands, runs them with
    utils.run_r(), then reads back the component file (and the cluster file if <n_clusters> is set)
    and optionally makes plots.

    Args:
        n_components: number of components passed to mmds() as pc; None isn't implemented.
        n_clusters: number of clusters passed to kmeans.run(), or None to skip the k-means step.
        seqfos: list of dicts, each with (at least) 'name' and 'seq' keys. The caller's list isn't modified.
        base_workdir: dir under which the 'mds' work dir is created (if it doesn't exist).
        seed: integer passed to R's set.seed().
        aligned: if True, write <seqfos> to the msa file as they are, otherwise align them
            with utils.align_many_seqs().
        reco_info, region: if <reco_info> is set (and <plotdir> is set), also make a 'true-genes' plot
            labeled with reco_info[uid][region + '_gene'].
        max_runs, max_iterations, method: passed to kmeans.run() as nb.run, iter.max, and method.
        plotdir: dir passed to plot_mds(); if None, no plots are made.
        plotname, queries_to_include, color_scale_vals, labels, title: passed through to plot_mds().
        remove_duplicates: if True, for each duplicated sequence keep only the first seqfo (in input
            order); otherwise duplicate sequences raise an exception.
        debug: if True print the k-means partition, otherwise print the R and plotting times.

    Returns:
        The result of read_kmeans_clusterfile(), or None if <n_clusters> is None.

    Raises:
        Exception: if there are duplicate sequences and <remove_duplicates> isn't set, or if
            <n_components> is None.
    """
    workdir = base_workdir + '/mds'
    msafname = workdir + '/msa.fa'
    mdsfname = workdir + '/components.txt'
    clusterfname = workdir + '/clusters.txt'
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    if len({sfo['seq'] for sfo in seqfos}) < len(seqfos):  # it'll just crash when it's running mds later, but this is faster
        if not remove_duplicates:
            raise Exception('duplicate sequences in seqfos')
        seq_groups = [list(group) for _, group in itertools.groupby(sorted(seqfos, key=lambda x: x['seq']), key=lambda x: x['seq'])]
        seqs_to_remove = set()  # names of every seqfo after the first one in each group of identical sequences
        for sgroup in seq_groups:
            seqs_to_remove.update(sfo['name'] for sfo in sgroup[1:])
        seqfos = [sfo for sfo in seqfos if sfo['name'] not in seqs_to_remove]  # rebinds the local name only

    if aligned:  # NOTE unlike the sklearn version below, this doesn't modify <seqfos>
        with open(msafname, 'w') as fastafile:
            for sfo in seqfos:
                fastafile.write('>%s\n%s\n' % (sfo['name'], sfo['seq']))
    else:
        utils.align_many_seqs(seqfos, outfname=msafname)

    # build the R cmd file
    cmdlines = [
        'require(bios2mds, quietly=TRUE)',
        'set.seed(%d)' % seed,
        'human <- import.fasta("%s")' % msafname,
        'active <- mat.dif(human, human)',  # mat.dif or mat.dis?
    ]

    if n_components is not None:
        cmdlines += ['mmds_active <- mmds(active, pc=%d)' % n_components]
        cmdlines += ['capture.output(mmds_active$coord, file="%s")' % mdsfname]
    else:
        raise Exception('need to implement')

    if n_clusters is not None:
        cmdlines += [
            'kmeans.run1 <- kmeans.run(mmds_active$coord, nb.clus=%d, nb.run=%d, iter.max=%d, method="%s")' % (n_clusters, max_runs, max_iterations, method),
            # 'kmeans.run1$clusters',
            # 'kmeans.run1$elements',
            'capture.output(kmeans.run1$clusters, file="%s")' % clusterfname,
            # sil.score(mat, nb.clus = c(2:13), nb.run = 100, iter.max = 1000,  # run for every possible number of clusters (?)
            #               method = "euclidean")
        ]

    rstart = time.time()
    try:
        utils.run_r(cmdlines, workdir)  #, print_time='kmeans')
    except subprocess.CalledProcessError as e:  # typically happens because of complex eigenvalues
        print(e)
        print('   mds failed on cluster')  # NOTE will still crash in read_kmeans_clusterfile(), but I'm not using that a.t.m.
        title = (title if title is not None else '') + ' mds failed'
    pcvals = read_component_file(mdsfname, n_components, seqfos)
    partition = read_kmeans_clusterfile(clusterfname, seqfos) if n_clusters is not None else None
    rstop = time.time()
    if debug and partition is not None:
        print('  kmeans partition:')
        cp = ClusterPath(partition=partition)
        # NOTE(review): <cp> was built but never used, so only the header above got printed -- presumably the
        # print call was lost; confirm that print_partitions() is ok being called with only <abbreviate>
        cp.print_partitions(abbreviate=False)

    plotstart = time.time()
    if plotdir is not None:
        # utils.prep_dir(plotdir, wildlings=['*.svg'])
        plot_mds(n_components, pcvals, plotdir, plotname, partition=partition if n_clusters is not None else None, queries_to_include=queries_to_include, color_scale_vals=color_scale_vals, labels=labels, title=title)
        if reco_info is not None:
            labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=labels, queries_to_include=queries_to_include, color_scale_vals=color_scale_vals, title=title)
    if not debug:  # this isn't a great way to do this, but I don't want to deal with finding all the calling functions, I just want to add some debug printing to this fcn
        # the trailing comma is the python 2 print-statement form that suppresses the newline
        print('    %5.1f  %5.1f' % (rstop - rstart, time.time() - plotstart)),

    return partition
# Example #3
#!/usr/bin/env python
# Example script: read a partition file into a ClusterPath and print its partitions.
# If --simfname is given, the info from get_seqfile_info() is passed to the printing as <reco_info>.
import sys
sys.path.insert(1, './python')  # must happen before the partis imports below
import csv
csv.field_size_limit(sys.maxsize)  # make sure we can write very large csv fields
import argparse

from clusterpath import ClusterPath
from seqfileopener import get_seqfile_info
import utils

parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True, help='Partition file to read')
parser.add_argument('--dont-abbreviate', action='store_true', help='Print full seq IDs (otherwise just prints an \'o\')')
parser.add_argument('--n-to-print', type=int, help='How many partitions to print (centered on the best partition)')
parser.add_argument('--datadir', default='data/imgt')
# args.simfname is read below, so it has to be declared (otherwise it's an AttributeError)
parser.add_argument('--simfname', help='Optional sequence file from which to read <reco_info>')
parser.add_argument('--is-data', action='store_true')
args = parser.parse_args()

glfo = utils.read_germline_set(args.datadir)

reco_info = None
if args.simfname is not None:
    _, reco_info = get_seqfile_info(args.simfname, args.is_data, glfo=glfo)  # the first returned value isn't needed here

cp = ClusterPath()
cp.readfile(args.infname)  # without this the required --infname was never read, so there was nothing to print
cp.print_partitions(abbreviate=(not args.dont_abbreviate), n_to_print=args.n_to_print, reco_info=reco_info)
