示例#1
0
def makeboxplot(filteredclusts, dblibrary, figname, pool=False):
    '''Save a boxplot of per-individual cluster counts, grouped by lane or pool.

    filteredclusts -- dict mapping each kept cluster to a dict keyed by
                      individual; individuals are looked up below as
                      (flowcell, lane, index, sampleid) tuples.  The inner
                      values (reads) are not used here.
    dblibrary      -- table name handed to gdata_tools.get_table_as_dict
                      (per the original comment, a Google spreadsheet).
    figname        -- path passed to savefig for the finished figure.
    pool           -- if true, one box per (flowcell, lane, index, pool);
                      otherwise one box per (flowcell, lane, index).

    Only table rows that carry a 'pool' key and whose individual appears in
    filteredclusts contribute to the plot.
    '''
    # Number of kept clusters in which each individual appears.
    indiv_cluster_count = defaultdict(int)
    for inddict in filteredclusts.values():
        for ind in inddict:
            indiv_cluster_count[ind] += 1

    t = gdata_tools.get_table_as_dict(dblibrary)

    # The per-sampleid tally that used to be built here was never read, and
    # building it raised IndexError with fewer than four individuals and
    # KeyError on rows without a 'sampleid', so it has been dropped.

    indiv_by_group = defaultdict(list)
    for d in t:
        if 'pool' not in d:
            continue
        indkey = (d.get('flowcell'), d.get('lane'),
                  d.get('index'), d.get('sampleid'))
        # Plain membership test: does not insert into the defaultdict.
        if indkey not in indiv_cluster_count:
            continue
        group = (d['flowcell'], d['lane'], d.get('index'))
        if pool:
            group += (d['pool'],)
        indiv_by_group[group].append(indiv_cluster_count[indkey])

    # Keep boxes and tick labels in the same order.
    labels = list(indiv_by_group)
    boxes = [indiv_by_group[group] for group in labels]

    # NOTE(review): figure number 1 is requested on every call, so repeated
    # calls in one process appear to draw onto the same figure - confirm
    # whether a fresh figure per call is wanted.
    boxplt = pylab.figure(1)
    pylab.boxplot(boxes)
    # Boxes sit at x = 1..len(labels); label each with its group tuple.
    pylab.xticks(arange(1, len(labels) + 1), labels, fontsize='small')
    boxplt.savefig(figname)
示例#2
0
def makeboxplot(filteredclusts, dblibrary, figname, pool=False):
    '''Save a boxplot of per-individual cluster counts, grouped by lane or pool.

    filteredclusts -- dict mapping each kept cluster to a dict keyed by
                      individual; individuals are looked up below as
                      (flowcell, lane, index, sampleid) tuples.  The inner
                      values (reads) are not used here.
    dblibrary      -- table name handed to gdata_tools.get_table_as_dict
                      (per the original comment, a Google spreadsheet).
    figname        -- path passed to savefig for the finished figure.
    pool           -- if true, one box per (flowcell, lane, index, pool);
                      otherwise one box per (flowcell, lane, index).

    Only table rows that carry a 'pool' key and whose individual appears in
    filteredclusts contribute to the plot.
    '''
    # Number of kept clusters in which each individual appears.
    indiv_cluster_count = defaultdict(int)
    for inddict in filteredclusts.values():
        for ind in inddict:
            indiv_cluster_count[ind] += 1

    t = gdata_tools.get_table_as_dict(dblibrary)

    # The per-sampleid tally that used to be built here was never read, and
    # building it raised IndexError with fewer than four individuals and
    # KeyError on rows without a 'sampleid', so it has been dropped.

    indiv_by_group = defaultdict(list)
    for d in t:
        if 'pool' not in d:
            continue
        indkey = (d.get('flowcell'), d.get('lane'),
                  d.get('index'), d.get('sampleid'))
        # Plain membership test: does not insert into the defaultdict.
        if indkey not in indiv_cluster_count:
            continue
        group = (d['flowcell'], d['lane'], d.get('index'))
        if pool:
            group += (d['pool'],)
        indiv_by_group[group].append(indiv_cluster_count[indkey])

    # Keep boxes and tick labels in the same order.
    labels = list(indiv_by_group)
    boxes = [indiv_by_group[group] for group in labels]

    # NOTE(review): figure number 1 is requested on every call, so repeated
    # calls in one process appear to draw onto the same figure - confirm
    # whether a fresh figure per call is wanted.
    boxplt = pylab.figure(1)
    pylab.boxplot(boxes)
    # Boxes sit at x = 1..len(labels); label each with its group tuple.
    pylab.xticks(arange(1, len(labels) + 1), labels, fontsize='small')
    boxplt.savefig(figname)
示例#3
0
                counts_by_pool[pool][ind] += ct

    return counts_by_pool
            
def get_uniqued_info(uniqued):
    '''Parse the underscore-separated fields out of a uniqued file name.

    The base name of `uniqued` (directory and final extension removed) is
    split from the right on underscores:

    * if the base name contains 'index', into four fields, and the first
      five characters of the last field are dropped;
    * otherwise into three fields, with None appended as the fourth.

    In both cases the first four characters of the second field are
    dropped.  Presumably names look like
    <flowcell>_lane<N>_<field>[_index<M>].<ext> - TODO confirm.

    Returns the list of four fields (the last may be None).  Raises
    IndexError when the base name has too few underscores.
    '''
    stem = os.path.splitext(os.path.basename(uniqued))[0]

    # Test the base name only: a directory component containing 'index'
    # must not push an index-less file name down the four-field branch.
    if 'index' in stem:
        ufields = stem.rsplit('_', 3)
        ufields[3] = ufields[3][5:]   # drop the 5-character prefix
    else:
        ufields = stem.rsplit('_', 2)
        ufields.append(None)

    ufields[1] = ufields[1][4:]       # drop the 4-character prefix

    return ufields


if __name__ == "__main__":
    # Script entry point: the first command-line argument is the uniqued
    # file to summarise.  Prints one tab-separated line per pool.

    db = gdata_tools.get_table_as_dict('DB_library_data')
    uniqued = sys.argv[1]

    # Four fields parsed from the file name by get_uniqued_info.
    ufields = get_uniqued_info(uniqued)

    counts_by_pool = get_counts_by_pool(uniqued, db)

    for k, v in counts_by_pool.items():
        # Materialise once: numpy needs a real sequence, not a dict view.
        cts = list(v.values())
        # Columns: the four file-name fields, pool key, total count,
        # number of individuals, mean count, median count.
        # Single parenthesised argument, so this prints the same text
        # whether print is a statement or a function.
        print('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%0.1f\t%d' % (
            ufields[0], ufields[1], ufields[2], ufields[3], k,
            sum(cts), len(v), numpy.mean(cts), numpy.median(cts)))