Exemplo n.º 1
0
    def read_partition_performance(self, version_stype, input_stype, debug=False):
        """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """
        def do_this_test(pt):
            if 'partition' not in pt:
                return False
            if input_stype not in pt:
                return False
            if args.quick and pt not in self.quick_tests:
                return False
            return True

        ptest_list = [k for k in self.tests.keys() if do_this_test(k)]
        if len(ptest_list) == 0:
            return
        if debug:
            print '  version %s input %s partitioning' % (version_stype, input_stype)
            print '  precision      sensitivity        test                    description'
        for ptest in ptest_list:
            cp = ClusterPath(-1)
            cp.readfile(self.dirs[version_stype] + '/' + ptest + '.csv')
            ccfs = cp.ccfs[cp.i_best]
            if None in ccfs:
                raise Exception('none type ccf read from %s' % self.dirs[version_stype] + '/' + ptest + '.csv')
            self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'] = ccfs
            if debug:
                print '    %5.2f          %5.2f      %-28s   to true partition' % (self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'], ptest)
Exemplo n.º 2
0
 def read_partition_performance(self, version_stype, input_stype, debug=False):
     """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """
     ptest = "partition-" + input_stype + "-simu"
     if args.quick and ptest not in self.quick_tests:
         return
     if debug:
         print "  version %s input %s partitioning" % (version_stype, input_stype)
         print "    adj mi   ccf under/over        test                    description"
     for ptest in [k for k in self.tests.keys() if "partition" in k and input_stype in k]:
         if args.quick and ptest not in self.quick_tests:
             continue
         cp = ClusterPath(-1)
         cp.readfile(self.dirs[version_stype] + "/" + ptest + ".csv")
         if "data" in ptest:
             raise Exception("needs fixing")
             ref_cp = ClusterPath(-1)
             ref_cp.readfile(self.dirs["xxxref"] + "/" + ptest + ".csv")
             self.perf_info["xxx"][ptest] = utils.adjusted_mutual_information(
                 cp.partitions[cp.i_best], ref_cp.partitions[ref_cp.i_best]
             )  # adj mi between the reference and the new data partitions
             if debug:
                 print "    %5.2f   %-28s   to reference partition" % (self.perf_info["xxx"][ptest], ptest)
         else:
             self.perf_info[version_stype][ptest + "-adj_mi"] = cp.adj_mis[cp.i_best]  # adj mi to true partition
             self.perf_info[version_stype][ptest + "-ccf_under"], self.perf_info[version_stype][
                 ptest + "-ccf_over"
             ] = cp.ccfs[cp.i_best]
             if debug:
                 print "    %5.2f    %5.2f %5.2f      %-28s   to true partition" % (
                     self.perf_info[version_stype][ptest + "-adj_mi"],
                     self.perf_info[version_stype][ptest + "-ccf_under"],
                     self.perf_info[version_stype][ptest + "-ccf_over"],
                     ptest,
                 )
Exemplo n.º 3
0
    def make_cluster_size_distribution(self,
                                       base_plotdir,
                                       partition=None,
                                       infiles=None):
        """
        Plot the cluster size histogram to '<plotdir>/cluster-sizes.svg', where <subd> and <plotdir> are the
        two values returned by self.init_subd('sizes', base_plotdir).

        If <partition> is set, the histogram is made from that partition. Otherwise, if <infiles> is set, the
        best partition is read from each file, each file's histogram is written to
        '<plotdir>/subset-<i>-cluster-sizes.csv', and the mean of these histograms is plotted.
        One of the two has to be set.

        Returns [['<subd>/cluster-sizes.svg']].
        """
        subd, plotdir = self.init_subd('sizes', base_plotdir)

        def size_hist_from_file(fname):
            # cluster size hist for the best partition in <fname>
            cpath = ClusterPath()
            cpath.readfile(fname)
            return self.plotting.get_cluster_size_hist(
                cpath.partitions[cpath.i_best])

        if partition is not None:  # one partition
            single_hist = self.plotting.get_cluster_size_hist(partition)
            csize_hists = {'best': single_hist}
        elif infiles is not None:  # plot the mean of a partition from each file
            per_file_hists = [size_hist_from_file(fn) for fn in infiles]
            csize_hists = {'best': self.plotting.make_mean_hist(per_file_hists)}
            for ifile, fhist in enumerate(per_file_hists):
                fhist.write(plotdir +
                            ('/subset-%d-cluster-sizes.csv' % ifile))
        else:
            assert False

        svg_name = '/cluster-sizes.svg'
        self.plotting.plot_cluster_size_hists(plotdir + svg_name,
                                              csize_hists,
                                              title='',
                                              log='x')
        return [[subd + svg_name]]
Exemplo n.º 4
0
    def read_partition_performance(self, version_stype, input_stype, debug=False):
        """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """
        def do_this_test(pt):
            if 'partition' not in pt:
                return False
            if input_stype not in pt:
                return False
            if args.quick and pt not in self.quick_tests:
                return False
            return True

        ptest_list = [k for k in self.tests.keys() if do_this_test(k)]
        if len(ptest_list) == 0:
            return
        if debug:
            print '  version %s input %s partitioning' % (version_stype, input_stype)
            print '  purity         completeness        test                    description'
        for ptest in ptest_list:
            cp = ClusterPath(-1)
            cp.readfile(self.dirs[version_stype] + '/' + ptest + '.csv')
            ccfs = cp.ccfs[cp.i_best]
            if None in ccfs:
                raise Exception('none type ccf read from %s' % self.dirs[version_stype] + '/' + ptest + '.csv')
            self.perf_info[version_stype][ptest + '-purity'], self.perf_info[version_stype][ptest + '-completeness'] = ccfs
            if debug:
                print '    %5.2f          %5.2f      %-28s   to true partition' % (self.perf_info[version_stype][ptest + '-purity'], self.perf_info[version_stype][ptest + '-completeness'], ptest)
Exemplo n.º 5
0
    def plot(self, plotdir, partition=None, infiles=None, annotations=None, only_csv=None):
        print '  plotting partitions'
        sys.stdout.flush()
        start = time.time()
        for subdir in self.subplotdirs:
            utils.prep_dir(plotdir + '/' + subdir, wildlings=['*.csv', '*.svg'])

        if partition is not None:  # one partition
            assert infiles is None
            assert annotations is not None
            csize_hists = {'best' : plotting.get_cluster_size_hist(partition)}
            self.plot_within_vs_between_hists(partition, annotations, plotdir)
        elif infiles is not None:  # plot the mean of a partition from each file
            subset_hists = []
            for fname in infiles:
                cp = ClusterPath()
                cp.readfile(fname)
                subset_hists.append(plotting.get_cluster_size_hist(cp.partitions[cp.i_best]))
            csize_hists = {'best' : plotting.make_mean_hist(subset_hists)}
            for ih in range(len(subset_hists)):
                subset_hists[ih].write(plotdir + ('/subset-%d-cluster-sizes.csv' % ih))
        else:
            assert False

        plotting.plot_cluster_size_hists(plotdir + '/overall/cluster-sizes.svg', csize_hists, title='', log='x')

        if not only_csv:
            for subdir in self.subplotdirs:
                plotting.make_html(plotdir + '/' + subdir)

        print '(%.1f sec)' % (time.time()-start)
Exemplo n.º 6
0
    def plot(self,
             plotdir,
             partition=None,
             infiles=None,
             annotations=None,
             only_csv=None):
        import plotting
        print '  plotting partitions'
        sys.stdout.flush()
        start = time.time()
        for subdir in self.subplotdirs:
            utils.prep_dir(plotdir + '/' + subdir,
                           wildlings=['*.csv', '*.svg'])

        fnames = []

        if partition is not None:  # one partition
            assert infiles is None
            assert annotations is not None
            csize_hists = {'best': plotting.get_cluster_size_hist(partition)}
            # self.plot_within_vs_between_hists(partition, annotations, plotdir)
            fnames += self.plot_size_vs_shm(partition, annotations, plotdir)
        elif infiles is not None:  # plot the mean of a partition from each file
            subset_hists = []
            for fname in infiles:
                cp = ClusterPath()
                cp.readfile(fname)
                subset_hists.append(
                    plotting.get_cluster_size_hist(cp.partitions[cp.i_best]))
            csize_hists = {'best': plotting.make_mean_hist(subset_hists)}
            for ih in range(len(subset_hists)):
                subset_hists[ih].write(plotdir +
                                       ('/subset-%d-cluster-sizes.csv' % ih))
        else:
            assert False

        plotting.plot_cluster_size_hists(plotdir +
                                         '/overall/cluster-sizes.svg',
                                         csize_hists,
                                         title='',
                                         log='x')
        fnames.append(['cluster-sizes.svg'])

        if not only_csv:
            for subdir in self.subplotdirs:
                plotting.make_html(plotdir + '/' + subdir,
                                   fnames=fnames,
                                   new_table_each_row=True)

        print '(%.1f sec)' % (time.time() - start)
Exemplo n.º 7
0
 def read_partition_performance(self, version_stype, input_stype, debug=False):
     """ Read new partitions from self.dirs['new'], and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data). """
     ptest = 'partition-' + input_stype + '-simu'
     if args.quick and ptest not in self.quick_tests:
         return
     if debug:
         print '  version %s input %s partitioning' % (version_stype, input_stype)
         print '  precision      sensitivity        test                    description'
     for ptest in [k for k in self.tests.keys() if 'partition' in k and input_stype in k]:
         if args.quick and ptest not in self.quick_tests:
             continue
         cp = ClusterPath(-1)
         cp.readfile(self.dirs[version_stype] + '/' + ptest + '.csv')
         if 'data' in ptest:
             raise Exception('needs fixing')
             # ref_cp = ClusterPath(-1)
             # ref_cp.readfile(self.dirs['xxxref'] + '/' + ptest + '.csv')
             # self.perf_info['xxx'][ptest] = utils.adjusted_mutual_information(cp.partitions[cp.i_best], ref_cp.partitions[ref_cp.i_best])  # adj mi between the reference and the new data partitions
             # if debug:
             #     print '    %5.2f   %-28s   to reference partition' % (self.perf_info['xxx'][ptest], ptest)
         else:
             self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'] = cp.ccfs[cp.i_best]
             if debug:
                 print '    %5.2f          %5.2f      %-28s   to true partition' % (self.perf_info[version_stype][ptest + '-precision'], self.perf_info[version_stype][ptest + '-sensitivity'], ptest)
#!/usr/bin/env python
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(line)
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
Exemplo n.º 9
0
#!/usr/bin/env python
import sys
sys.path.insert(1, './python')
import csv
csv.field_size_limit(sys.maxsize)  # make sure we can write very large csv fields
import argparse

from clusterpath import ClusterPath
from seqfileopener import get_seqfile_info
import utils

# command line arguments
parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument('--dont-abbreviate', action='store_true', help='Print full seq IDs (otherwise just prints an \'o\')')
parser.add_argument('--n-to-print', type=int, help='How many partitions to print (centered on the best partition)')
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--simfname')
parser.add_argument('--is-data', action='store_true')
args = parser.parse_args()

glfo = utils.read_germline_set(args.datadir)

# reco info is only read if a simulation file was specified
if args.simfname is None:
    reco_info = None
else:
    input_info, reco_info = get_seqfile_info(args.simfname, args.is_data, glfo=glfo)

# read the partitions from the input file, then print them
cp = ClusterPath()
cp.readfile(args.infname)
abbreviate = not args.dont_abbreviate
cp.print_partitions(abbreviate=abbreviate, n_to_print=args.n_to_print, reco_info=reco_info)
parser = argparse.ArgumentParser()
parser.add_argument('--infile')
parser.add_argument('--locus')
parser.add_argument('--param')
parser.add_argument('--nclust')
args = parser.parse_args()

glfo = glutils.read_glfo(args.param + '/hmm/germline-sets', locus=args.locus)

print(sys.argv)
print 'infile =', args.infile
print 'param =', args.param

cp = ClusterPath()
cp.readfile(args.infile)
best_partition = cp.partitions[cp.i_best]
# sorted_clusters = sorted(best_partition, key=len, reverse=True)  # sort by size

# clonal family attributes to print
print '''

score = interest score, indicating interesting attributes: size, SHM, SFS, bnAb VH usage

Size & SHM:
4 points for rank in top 25
3 points for rank 25-50
2 points for rank 50-75
1 point for rank 75-100

SFS (Fay Wu H) scores earning 4-1 points: < -20, -10, 0, 10
Exemplo n.º 11
0
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(glfo['seqs'], line)
        cdr3_bounds = (line['codon_positions']['v'], line['codon_positions']['j'] + 3)
        print ''
        print '  should match the above:'
        print '    %s naive cdr3' % line['naive_seq'][cdr3_bounds[0] : cdr3_bounds[1]]
        print '    %s mature' % line['indel_reversed_seqs'][0][cdr3_bounds[0] : cdr3_bounds[1]]
        print ''
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)