示例#1
0
 def print_true_partition(self):
     """Print the true partition from self.reco_info, one cluster per line with a clonal flag."""
     print '  true partition'
     print '   clonal?   ids'
     true_partition = utils.get_true_partition(self.reco_info)
     for cluster in true_partition:
         # clonal flag is whatever utils.from_same_event() returns for the whole cluster
         print '     %d    %s' % (utils.from_same_event(self.reco_info, cluster),
                                  ':'.join([str(uid) for uid in cluster]))
示例#2
0
        def get_bad_clusters(part):
            """Return the clusters in <part> that aren't exactly one complete true clonal family."""
            bad_clusters = []  # inferred clusters that aren't really all from the same event
            for cluster in part:
                # a cluster is "good" only if every sequence came from the same
                # rearrangement event *and* it contains the entire true family
                if utils.from_same_event(reco_info, cluster):
                    reco_id = reco_info[cluster[0]]['reco_id']  # same event, so any uid's reco_id will do
                    if not any(uid not in cluster for uid in true_partition[reco_id]):
                        continue  # good cluster: complete true family
                bad_clusters.append(':'.join(cluster))

            if len(bad_clusters) > 25:
                bad_clusters = ['too', 'long']  # keep the output a reasonable size

            return bad_clusters
示例#3
0
    def print_partition(self, ip, reco_info=None, extrastr='', abbreviate=True, highlight_cluster_indices=None, print_cluster_indices=False, right_extrastr=''):  # NOTE <highlight_cluster_indices> and <print_cluster_indices> are quite different despite sounding similar, but I can't think of something else to call the latter that makes more sense
        #  NOTE it's nicer to *not* sort by cluster size here, since preserving the order tends to frequently make it obvious which clusters are merging as your eye scans downwards through the output
        if ip > 0:  # delta between this logprob and the previous one
            delta_str = '%.1f' % (self.logprobs[ip] - self.logprobs[ip-1])
        else:
            delta_str = ''
        print '      %s  %-12.2f%-7s   %s%-5d  %4d' % (extrastr, self.logprobs[ip], delta_str, ('%-5d  ' % ip) if print_cluster_indices else '', len(self.partitions[ip]), self.n_procs[ip]),

        print '    ' + self.get_ccf_str(ip),

        # clusters
        sorted_clusters = sorted(self.partitions[ip], key=lambda c: len(c), reverse=True)
        for iclust in range(len(sorted_clusters)):
            cluster = sorted_clusters[iclust]
            if abbreviate:
                cluster_str = ':'.join(['o' if len(uid) > 3 else uid for uid in cluster])
            else:
                # cluster_str = ':'.join(sorted([str(uid) for uid in cluster]))
                cluster_str = ':'.join([str(uid) for uid in cluster])

            if reco_info is not None and not utils.from_same_event(reco_info, cluster):
                cluster_str = utils.color('red', cluster_str)

            if self.seed_unique_id is not None and self.seed_unique_id in cluster:
                cluster_str = utils.color('reverse_video', cluster_str)

            if highlight_cluster_indices is not None and iclust in highlight_cluster_indices:
                cluster_str = utils.color('red', cluster_str)
            
            if abbreviate:
                print ' %s' % cluster_str,
            else:
                print '   %s' % cluster_str,
        print '%s' % right_extrastr,
        print ''
示例#4
0
    def get_bad_clusters(self, partition, reco_info, true_partition):
        """Return inferred clusters that don't correspond exactly to a complete true clonal family."""
        bad_clusters = []  # inferred clusters that aren't really all from the same event
        for cluster in partition:
            same_event = utils.from_same_event(reco_info, cluster)  # are all the sequences from the same event?
            entire_cluster = False  # ... and if so, does the cluster contain the entire true family?
            if same_event:
                reco_id = reco_info[cluster[0]]['reco_id']  # they've all got the same reco_id, so pick an arbitrary one
                matches = [tclust for tclust in true_partition if reco_info[tclust[0]]['reco_id'] == reco_id]  # NOTE I think this doesn't work right with shm indels in the cdr3
                assert len(matches) == 1
                entire_cluster = all(uid in cluster for uid in matches[0])
            if not (same_event and entire_cluster):
                bad_clusters.append(':'.join(cluster))

        if len(bad_clusters) > 25:
            bad_clusters = ['too', 'long']  # keep the output a reasonable size

        return bad_clusters
示例#5
0
 def print_true_partition(self):
     """Print the true partition, one cluster per line, with a clonal flag for each."""
     print '  true partition'
     print '   clonal?   ids'
     true_partition = utils.get_true_partition(self.reco_info)
     for cluster in true_partition:
         # clonal flag from utils.from_same_event(), then colon-joined uids
         print '     %d    %s' % (utils.from_same_event(
             self.reco_info, cluster), ':'.join(
                 [str(uid) for uid in cluster]))
    def print_partition(self, ip, reco_info=None, extrastr='', one_line=True, abbreviate=True):
        """Print partition <ip>, either as a compact one-line summary or a multi-line cluster listing.

        If <reco_info> is given, also prints adjusted MI and marks non-clonal clusters.
        """
        if one_line:
            if ip > 0:  # delta between this logprob and the previous one
                delta_str = '%.1f' % (self.logprobs[ip] - self.logprobs[ip-1])
            else:
                delta_str = ''
            print '      %5s  %-12.2f%-7s   %-5d  %5d' % (extrastr, self.logprobs[ip], delta_str, len(self.partitions[ip]), self.n_procs[ip]),

            # logweight (and inverse of number of potential parents)
            if self.logweights[ip] is not None:
                way_str, logweight_str = '', ''
                expon = math.exp(self.logweights[ip])
                n_ways = 0 if expon == 0. else 1. / expon  # inverse of the (exponentiated) logweight
                way_str = ('%.1f' % n_ways) if n_ways < 1e7 else ('%8.1e' % n_ways)
                logweight_str = '%8.3f' % self.logweights[ip]

            # adj mi
            if reco_info is not None:
                adj_mi_str = ''
                if self.adj_mis[ip] is None:
                    adj_mi_str = 'na'
                else:
                    if self.adj_mis[ip] > 1e-3:
                        adj_mi_str = '%-8.3f' % self.adj_mis[ip]
                    else:
                        adj_mi_str = '%-8.0e' % self.adj_mis[ip]  # too small for fixed point, use scientific notation
                print '      %8s   ' % (adj_mi_str),
            if self.logweights[ip] is not None:
                print '   %10s    %8s   ' % (way_str, logweight_str),
        else:
            print '  %5s partition   %-15.2f' % (extrastr, self.logprobs[ip]),
            if reco_info is not None:
                print '    %-8.2f' % (self.adj_mis[ip]),
            print ''
            print '   clonal?   ids'

        # clusters
        for cluster in self.partitions[ip]:
            same_event = utils.from_same_event(reco_info is None, reco_info, cluster)  # NOTE(review): three-arg call (first arg looks like an is_data flag) -- presumably an older utils signature; confirm
            if same_event is None:
                same_event = -1  # truth info unavailable

            if abbreviate:
                cluster_str = ':'.join(['o' for uid in cluster])
            else:
                cluster_str = ':'.join([str(uid) for uid in cluster])
            if not same_event:
                cluster_str = utils.color('red', cluster_str)  # cluster mixes sequences from different events

            if one_line:
                if abbreviate:
                    print ' %s' % cluster_str,
                else:
                    print '   %s' % cluster_str,
            else:
                print '     %d    %s' % (int(same_event), cluster_str)
        if one_line:
            print ''
 def print_true_partition(self):
     """Print the true partition from self.reco_info, flagging each cluster's clonality."""
     print '  true partition'
     print '   clonal?   ids'
     true_partition = utils.get_true_partition(self.reco_info)
     for cluster in true_partition.values():  # NOTE(review): here get_true_partition() appears to return a dict -- confirm against utils
         same_event = utils.from_same_event(self.reco_info is None, self.reco_info, cluster)
         if same_event is None:
             same_event = -1  # truth info unavailable
         print '     %d    %s' % (int(same_event), ':'.join([str(uid) for uid in cluster]))
示例#8
0
def peruse_forward_scores():
    """Study how hmm forward log probs scale with the number of simultaneous seqs.

    Reads per-event forward scores from <outputdir>/<n_set>-forward.csv for each
    n_set in n_set_list, applies two candidate 1/n-style correction factors, and
    plots the deviations relative to the baseline (last) n_set.  Script-style:
    relies on module-level globals (simfname, outputdir, n_set_list, baseplotdir,
    get_deviations).
    """
    _, reco_info = seqfileopener.get_seqfile_info(simfname, is_data=False)  #, n_max_queries=10000)
    logprobs, partialcorr_logprobs, corr_logprobs = OrderedDict(), OrderedDict(), OrderedDict()
    for n_set in n_set_list:
        print n_set
        # if n_set != 5:
        #     continue
        logprobs[n_set], partialcorr_logprobs[n_set], corr_logprobs[n_set] = OrderedDict(), OrderedDict(), OrderedDict()
        with open(outputdir + '/' + str(n_set) + '-forward.csv') as csvfile:
            reader = csv.DictReader(csvfile)
            for line in reader:
                uidlist = line['unique_ids'].split(':')
                assert utils.from_same_event(reco_info, uidlist)
                reco_id = reco_info[uidlist[0]]['reco_id']
                if reco_id in logprobs[n_set]:
                    raise Exception('already had %s' % reco_id)

                logprobs[n_set][reco_id] = float(line['logprob'])

                factor = 1. / n_set  # naive correction: just divide by the number of simultaneous seqs
                partialcorr_logprobs[n_set][reco_id] = factor * float(line['logprob'])

                factor = (1. - 0.24 / pow(float(n_set), 0.9)) / n_set  # empirically-fitted correction -- constants presumably from fit_stuff(); confirm
                # factor = 1. / (0.77547824*n_set + 0.20327936)
                corr_logprobs[n_set][reco_id] = factor * float(line['logprob'])


    i_baseline = -1  # compare everything to the last (largest) n_set
    deviations = get_deviations(logprobs, i_baseline)
    # fit_stuff(n_set_list, deviations)
    partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline)
    signed_partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline, signed=True)
    corr_deviations = get_deviations(corr_logprobs, i_baseline)
    signed_corr_deviations = get_deviations(corr_logprobs, i_baseline, signed=True)

    import plotting
    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, deviations, marker='.')
    plotting.mpl_finish(ax, baseplotdir, 'forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    # fig, ax = plotting.mpl_init()
    # ax.plot(n_set_list, partialcorr_deviations, marker='.')
    # ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    # plotting.mpl_finish(ax, baseplotdir, 'partially-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02))

    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, partialcorr_deviations, marker='.', label='1/n (abs)')
    ax.plot(n_set_list, signed_partialcorr_deviations, marker='.', label='1/n')
    ax.plot(n_set_list, corr_deviations, marker='.', label='1/crap (abs)')
    ax.plot(n_set_list, signed_corr_deviations, marker='.', label='1/crap')
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    plotting.mpl_finish(ax, baseplotdir, 'corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02))

    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, signed_corr_deviations, marker='.')
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    plotting.mpl_finish(ax, baseplotdir, 'signed-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02))
    def single_link(self, input_scores=None, infname=None, debug=False, reco_info=None, outfile=None):
        if infname is None:
            assert input_scores is not None
        else:
            assert input_scores is None  # should only specify <input_scores> *or* <infname>
            input_scores = []
            with opener('r')(infname) as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    input_scores.append(line)
        sorted_lines = sorted(input_scores, key=lambda k: float(k['logprob']))
        for line in sorted_lines:
            a_name = line['id_a']
            b_name = line['id_b']
            score = float(line['logprob'])
            dbg_str_list = ['%22s %22s   %8.3f' % (a_name, b_name, score), ]
            if reco_info is None:
                dbg_str_list[-1] += '   %s' % ('-')
            else:
                from_same_event = utils.from_same_event(reco_info, [a_name, b_name])
                dbg_str_list[-1] += '   %d' % (from_same_event)
            self.incorporate_into_clusters(a_name, b_name, score, dbg_str_list)
            self.pairscores[(utils.get_key((a_name, b_name)))] = score
            self.plotscores['all'].append(score)
            if reco_info is not None:
                if from_same_event:
                    self.plotscores['same'].append(score)
                else:
                    self.plotscores['diff'].append(score)
            if debug:
                outstr = ''.join(dbg_str_list)
                if outfile == None:
                    print outstr
                else:
                    outfile.write(outstr + '\n')

        for query, cluster_id in self.query_clusters.iteritems():
            if cluster_id not in self.id_clusters:
                self.id_clusters[cluster_id] = []
            self.id_clusters[cluster_id].append(query)
        for cluster_id, queries in self.id_clusters.items():
            if len(queries) == 1:
                self.singletons.append(queries[0])

        # print 'nearest',self.nearest_true_mate
        out_str_list = ['  %d clusters:\n'%len(self.id_clusters), ]
        for cluster_id in self.id_clusters:
            out_str_list.append('   ' + ' '.join([str(x) for x in self.id_clusters[cluster_id]]) + '\n')
        if outfile == None:
            print ''.join(out_str_list)
        else:
            outfile.write(''.join(out_str_list))
示例#10
0
    def print_partition(self,
                        ip,
                        reco_info=None,
                        extrastr='',
                        abbreviate=True,
                        smc_print=False):
        #  NOTE it's nicer to *not* sort by cluster size here, since preserving the order tends to frequently make it obvious which clusters are merging as your eye scans downwards through the output
        if ip > 0:  # delta between this logprob and the previous one
            delta_str = '%.1f' % (self.logprobs[ip] - self.logprobs[ip - 1])
        else:
            delta_str = ''
        print '      %s  %-12.2f%-7s   %-5d  %4d' % (
            extrastr, self.logprobs[ip], delta_str, len(
                self.partitions[ip]), self.n_procs[ip]),

        # logweight (and inverse of number of potential parents)
        if self.logweights[ip] is not None and smc_print:
            way_str, logweight_str = '', ''
            expon = math.exp(self.logweights[ip])
            n_ways = 0 if expon == 0. else 1. / expon
            way_str = ('%.1f' % n_ways) if n_ways < 1e7 else ('%8.1e' % n_ways)
            logweight_str = '%8.3f' % self.logweights[ip]

        print '    ' + self.get_ccf_str(ip),

        if self.logweights[ip] is not None and smc_print:
            print '   %10s    %8s   ' % (way_str, logweight_str),

        # clusters
        for cluster in sorted(self.partitions[ip],
                              key=lambda c: len(c),
                              reverse=True):
            if abbreviate:
                cluster_str = ':'.join(
                    ['o' if len(uid) > 3 else uid for uid in cluster])
            else:
                # cluster_str = ':'.join(sorted([str(uid) for uid in cluster]))
                cluster_str = ':'.join([str(uid) for uid in cluster])

            if reco_info is not None and not utils.from_same_event(
                    reco_info, cluster):
                cluster_str = utils.color('red', cluster_str)

            if self.seed_unique_id is not None and self.seed_unique_id in cluster:
                cluster_str = utils.color('reverse_video', cluster_str)

            if abbreviate:
                print ' %s' % cluster_str,
            else:
                print '   %s' % cluster_str,
        print ''
示例#11
0
    def print_partition(self, ip, reco_info=None, extrastr="", abbreviate=True, smc_print=False):
        """Print a one-line summary of partition <ip>, followed by its clusters (in stored order)."""
        #  NOTE it's nicer to *not* sort by cluster size here, since preserving the order tends to frequently make it obvious which clusters are merging as your eye scans downwards through the output
        if ip > 0:  # delta between this logprob and the previous one
            delta_str = "%.1f" % (self.logprobs[ip] - self.logprobs[ip - 1])
        else:
            delta_str = ""
        print "      %s  %-12.2f%-7s   %-5d  %4d" % (
            extrastr,
            self.logprobs[ip],
            delta_str,
            len(self.partitions[ip]),
            self.n_procs[ip],
        ),

        # logweight (and inverse of number of potential parents)
        if self.logweights[ip] is not None and smc_print:
            way_str, logweight_str = "", ""
            expon = math.exp(self.logweights[ip])
            n_ways = 0 if expon == 0.0 else 1.0 / expon  # inverse of the (exponentiated) logweight
            way_str = ("%.1f" % n_ways) if n_ways < 1e7 else ("%8.1e" % n_ways)
            logweight_str = "%8.3f" % self.logweights[ip]

        print "    " + self.get_ccf_str(ip),

        if self.logweights[ip] is not None and smc_print:
            print "   %10s    %8s   " % (way_str, logweight_str),

        # clusters
        for cluster in self.partitions[ip]:
            if abbreviate:
                cluster_str = ":".join(["o" if len(uid) > 3 else uid for uid in cluster])
            else:
                # cluster_str = ':'.join(sorted([str(uid) for uid in cluster]))
                cluster_str = ":".join([str(uid) for uid in cluster])

            if reco_info is not None and not utils.from_same_event(reco_info, cluster):
                cluster_str = utils.color("red", cluster_str)  # cluster mixes sequences from different events

            if self.seed_unique_id is not None and self.seed_unique_id in cluster:
                cluster_str = utils.color("reverse_video", cluster_str)  # make the seed cluster stand out

            if abbreviate:
                print " %s" % cluster_str,
            else:
                print "   %s" % cluster_str,
        print ""
    def write_partitions(self, writer, is_data, reco_info, true_partition, smc_particles, path_index, n_to_write=None, calc_adj_mi=None):
        """Write a subset of the stored partitions as csv rows via <writer>.

        For simulation (not is_data), also records adjusted MI, the number of true
        clusters, and a semicolon-joined list of "bad" clusters (inferred clusters
        that aren't exactly one complete true clonal family).
        <calc_adj_mi>: None to never calculate it here, or 'best' to calculate it only for the best partition.
        """
        for ipart in self.get_partition_subset(n_partitions=n_to_write):
            part = self.partitions[ipart]
            cluster_str = ''
            bad_clusters = []  # inferred clusters that aren't really all from the same event
            for ic in range(len(part)):
                if ic > 0:
                    cluster_str += ';'
                cluster_str += ':'.join(part[ic])
                if not is_data:
                    same_event = utils.from_same_event(is_data, reco_info, part[ic])  # are all the sequences from the same event?
                    entire_cluster = True  # ... and if so, are they the entire true cluster?
                    if same_event:
                        reco_id = reco_info[part[ic][0]]['reco_id']  # they've all got the same reco_id then, so pick an arbitrary one
                        true_cluster = true_partition[reco_id]
                        for uid in true_cluster:
                            if uid not in part[ic]:
                                entire_cluster = False
                                break
                    else:
                        entire_cluster = False
                    if not same_event or not entire_cluster:
                        bad_clusters.append(':'.join(part[ic]))

            if len(bad_clusters) > 25:
                bad_clusters = ['too', 'long']  # keep the csv field a reasonable size
            row = {'logprob' : self.logprobs[ipart],
                   'n_clusters' : len(part),
                   'n_procs' : self.n_procs[ipart],
                   'clusters' : cluster_str}
            if smc_particles > 1:
                row['path_index'] = path_index
                row['logweight'] = self.logweights[ipart]
            if not is_data:
                if calc_adj_mi is None or self.adj_mis[ipart] != -1:  # if we don't want to write any adj mis, or if we already calculated it
                    row['adj_mi'] = self.adj_mis[ipart]
                else:
                    if calc_adj_mi == 'best' and ipart == self.i_best:  # only calculate adj_mi for the best partition
                        row['adj_mi'] = utils.mutual_information(part, reco_info)
                    else:
                        row['adj_mi'] = self.adj_mis[ipart]
                row['n_true_clusters'] = len(true_partition)
                row['bad_clusters'] = ';'.join(bad_clusters)
            writer.writerow(row)
示例#13
0
        def get_bad_clusters(part):
            """Return the clusters in <part> that aren't exactly one complete true clonal family."""
            bad_clusters = []  # inferred clusters that aren't really all from the same event
            for cluster in part:
                same_event = utils.from_same_event(reco_info, cluster)  # are all the sequences from the same event?
                if same_event:
                    reco_id = reco_info[cluster[0]]['reco_id']  # same event, so any uid's reco_id will do
                    entire_cluster = set(true_partition[reco_id]) <= set(cluster)  # ... and does it contain the entire true family?
                else:
                    entire_cluster = False
                if not (same_event and entire_cluster):
                    bad_clusters.append(':'.join(cluster))

            if len(bad_clusters) > 25:
                bad_clusters = ['too', 'long']  # keep the output a reasonable size

            return bad_clusters
示例#14
0
    def print_partition(self, ip, reco_info=None, extrastr='', abbreviate=True, smc_print=False):
        """Print a one-line summary of partition <ip>, followed by its clusters (in stored order)."""
        if ip > 0:  # delta between this logprob and the previous one
            delta_str = '%.1f' % (self.logprobs[ip] - self.logprobs[ip-1])
        else:
            delta_str = ''
        print '      %s  %-12.2f%-7s   %-5d  %4d' % (extrastr, self.logprobs[ip], delta_str, len(self.partitions[ip]), self.n_procs[ip]),

        # logweight (and inverse of number of potential parents)
        if self.logweights[ip] is not None and smc_print:
            way_str, logweight_str = '', ''
            expon = math.exp(self.logweights[ip])
            n_ways = 0 if expon == 0. else 1. / expon  # inverse of the (exponentiated) logweight
            way_str = ('%.1f' % n_ways) if n_ways < 1e7 else ('%8.1e' % n_ways)
            logweight_str = '%8.3f' % self.logweights[ip]

        print '    ' + self.get_ccf_str(ip),

        if self.logweights[ip] is not None and smc_print:
            print '   %10s    %8s   ' % (way_str, logweight_str),

        # clusters
        for cluster in self.partitions[ip]:
            if abbreviate:
                cluster_str = ':'.join(['o' if len(uid) > 3 else uid for uid in cluster])
            else:
                # cluster_str = ':'.join(sorted([str(uid) for uid in cluster]))
                cluster_str = ':'.join([str(uid) for uid in cluster])

            if reco_info is not None and not utils.from_same_event(reco_info, cluster):
                cluster_str = utils.color('red', cluster_str)  # cluster mixes sequences from different events

            if self.seed_unique_id is not None and self.seed_unique_id in cluster:
                cluster_str = utils.color('reverse_video', cluster_str)  # make the seed cluster stand out

            if abbreviate:
                print ' %s' % cluster_str,
            else:
                print '   %s' % cluster_str,
        print ''
示例#15
0
def peruse_forward_scores():
    """Study how hmm forward log probs scale with the number of simultaneous seqs.

    Reads per-event forward scores from <outputdir>/<n_set>-forward.csv for each
    n_set in n_set_list, applies two candidate 1/n-style correction factors, and
    plots the deviations relative to the baseline (last) n_set.  Script-style:
    relies on module-level globals (simfname, outputdir, n_set_list, baseplotdir,
    get_deviations).
    """
    _, reco_info = seqfileopener.get_seqfile_info(
        simfname, is_data=False)  #, n_max_queries=10000)
    logprobs, partialcorr_logprobs, corr_logprobs = OrderedDict(), OrderedDict(
    ), OrderedDict()
    for n_set in n_set_list:
        print n_set
        # if n_set != 5:
        #     continue
        logprobs[n_set], partialcorr_logprobs[n_set], corr_logprobs[
            n_set] = OrderedDict(), OrderedDict(), OrderedDict()
        with open(outputdir + '/' + str(n_set) + '-forward.csv') as csvfile:
            reader = csv.DictReader(csvfile)
            for line in reader:
                uidlist = line['unique_ids'].split(':')
                assert utils.from_same_event(reco_info, uidlist)
                reco_id = reco_info[uidlist[0]]['reco_id']
                if reco_id in logprobs[n_set]:
                    raise Exception('already had %s' % reco_id)

                logprobs[n_set][reco_id] = float(line['logprob'])

                factor = 1. / n_set  # naive correction: just divide by the number of simultaneous seqs
                partialcorr_logprobs[n_set][reco_id] = factor * float(
                    line['logprob'])

                factor = (1. - 0.24 / pow(float(n_set), 0.9)) / n_set  # empirically-fitted correction -- constants presumably from an earlier fit; confirm
                # factor = 1. / (0.77547824*n_set + 0.20327936)
                corr_logprobs[n_set][reco_id] = factor * float(line['logprob'])

    i_baseline = -1  # compare everything to the last (largest) n_set
    deviations = get_deviations(logprobs, i_baseline)
    # fit_stuff(n_set_list, deviations)
    partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline)
    signed_partialcorr_deviations = get_deviations(partialcorr_logprobs,
                                                   i_baseline,
                                                   signed=True)
    corr_deviations = get_deviations(corr_logprobs, i_baseline)
    signed_corr_deviations = get_deviations(corr_logprobs,
                                            i_baseline,
                                            signed=True)

    import plotting
    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, deviations, marker='.')
    plotting.mpl_finish(ax,
                        baseplotdir,
                        'forwards',
                        xlabel='N simultaneous seqs',
                        ylabel='log prob deviation to ' +
                        str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    # fig, ax = plotting.mpl_init()
    # ax.plot(n_set_list, partialcorr_deviations, marker='.')
    # ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    # plotting.mpl_finish(ax, baseplotdir, 'partially-corrected-forwards', xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline])) #, ybounds=(-0.02, 0.02))

    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, partialcorr_deviations, marker='.', label='1/n (abs)')
    ax.plot(n_set_list, signed_partialcorr_deviations, marker='.', label='1/n')
    ax.plot(n_set_list, corr_deviations, marker='.', label='1/crap (abs)')
    ax.plot(n_set_list, signed_corr_deviations, marker='.', label='1/crap')
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    plotting.mpl_finish(ax,
                        baseplotdir,
                        'corrected-forwards',
                        xlabel='N simultaneous seqs',
                        ylabel='log prob deviation to ' +
                        str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    fig, ax = plotting.mpl_init()
    ax.plot(n_set_list, signed_corr_deviations, marker='.')
    ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
    plotting.mpl_finish(ax,
                        baseplotdir,
                        'signed-corrected-forwards',
                        xlabel='N simultaneous seqs',
                        ylabel='log prob deviation to ' +
                        str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))
示例#16
0
    def single_link(self,
                    input_scores=None,
                    infname=None,
                    debug=False,
                    reco_info=None,
                    outfile=None):
        if infname is None:
            assert input_scores is not None
        else:
            assert input_scores is None  # should only specify <input_scores> *or* <infname>
            input_scores = []
            with open(infname, 'r') as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    input_scores.append(line)
        sorted_lines = sorted(input_scores, key=lambda k: float(k['logprob']))
        for line in sorted_lines:
            a_name = line['id_a']
            b_name = line['id_b']
            score = float(line['logprob'])
            dbg_str_list = [
                '%22s %22s   %8.3f' % (a_name, b_name, score),
            ]
            if reco_info is None:
                dbg_str_list[-1] += '   %s' % ('-')
            else:
                from_same_event = utils.from_same_event(
                    reco_info, [a_name, b_name])
                dbg_str_list[-1] += '   %d' % (from_same_event)
            self.incorporate_into_clusters(a_name, b_name, score, dbg_str_list)
            self.pairscores[(utils.get_key((a_name, b_name)))] = score
            self.plotscores['all'].append(score)
            if reco_info is not None:
                if from_same_event:
                    self.plotscores['same'].append(score)
                else:
                    self.plotscores['diff'].append(score)
            if debug:
                outstr = ''.join(dbg_str_list)
                if outfile == None:
                    print outstr
                else:
                    outfile.write(outstr + '\n')

        for query, cluster_id in self.query_clusters.iteritems():
            if cluster_id not in self.id_clusters:
                self.id_clusters[cluster_id] = []
            self.id_clusters[cluster_id].append(query)
        for cluster_id, queries in self.id_clusters.items():
            if len(queries) == 1:
                self.singletons.append(queries[0])

        # print 'nearest',self.nearest_true_mate
        out_str_list = [
            '  %d clusters:\n' % len(self.id_clusters),
        ]
        for cluster_id in self.id_clusters:
            out_str_list.append(
                '   ' +
                ' '.join([str(x)
                          for x in self.id_clusters[cluster_id]]) + '\n')
        if outfile == None:
            print ''.join(out_str_list)
        else:
            outfile.write(''.join(out_str_list))