# Command line: allele-hit table, optional gene filters and pooling settings.
parser = argparse.ArgumentParser()
parser.add_argument('-a', '--allelehits', required=True)
parser.add_argument('-gi', '--allowedgenes')
parser.add_argument('-ge', '--disallowedgenes', nargs='+', default=[])
parser.add_argument('-R', '--random_dots', type=int, default=1)
parser.add_argument('-s', '--samplelist', required=True, nargs='+')
parser.add_argument('-n', default=4, type=int)
parser.add_argument('-o', '--figure', default='poolN.pdf')
parser.add_argument('-S', '--subtract_allelerand', action='store_true')
parser.add_argument('-r', '--allelerand_skew', action='store_true')
o = parser.parse_args()

expra = dr_tools.loadexpr(o.allelehits, True)

# None means "no whitelist"; the blacklist is the union of all -ge files.
allowed_genes = set(dr_tools.loadlist(o.allowedgenes)) if o.allowedgenes else None
disallowed_genes = set()
for filename in o.disallowedgenes:
    disallowed_genes.update(dr_tools.loadlist(filename))

# Fixed seed so the random draws are the same on every run.
random.seed(0)
# sample-list file -> o.random_dots independent draws of o.n samples from it
samples_n = {
    samplelist: [
        random.sample(dr_tools.loadlist(samplelist, ignore='#'), o.n)
        for di in range(o.random_dots)
    ]
    for samplelist in o.samplelist
}

# Every second column name with its '_c57only' part cut off.
# NOTE(review): assumes the columns alternate c57only/castonly - confirm.
samples_all = [sa.split('_c57only')[0] for sa in expra.samples[::2]]
allelerand_skew = {
    gi: ratio(expra, gi, samples_all)
    for gi in range(len(expra['symbols']))
}
n_cells = o.n
# Remaining command-line options, then: load the allele-hit table, build the optional
# include/exclude gene sets, record for each kept gene index a pair of allele suffixes
# (the allele seen with count > 0 in fewer samples comes first), and finally pick the
# give_sharing implementation named by --metric.
# NOTE(review): all_samples takes every second column name and cuts len('_c57only')
# characters off its end - presumably columns alternate c57only/castonly; confirm.
# NOTE(review): this span is whitespace-collapsed, so it cannot be told from here whether
# included_genes.add(sym) belongs inside the `if countsum1 and countsum2:` guard - confirm
# against the original layout before restructuring.
parser.add_argument('-gi', '--genelist_include') parser.add_argument('-ge', '--genelist_exclude', nargs='+') parser.add_argument('-M', '--metric', default='v1', choices=['v0', 'v1', 'v3', '90pwM', '2wM']) parser.add_argument('-m', '--minreads', type=int, default=1) parser.add_argument('-o', '--figure', required=True) parser.add_argument('--all_clonal') parser.add_argument('-d', '--divide_by_total', action='store_true') o = parser.parse_args() expra = dr_tools.loadexpr(o.allelehits, True) a1 = '_c57only' a2 = '_castonly' all_samples = [s[:-len(a1)] for s in expra.samples[::2]] samples_to_randomly_pick_from = all_samples if o.all_clonal is None else set(all_samples)-set(dr_tools.loadlist(o.all_clonal)) genelist_include = None if o.genelist_include is None else set(dr_tools.loadlist(o.genelist_include)) genelist_exclude = None if o.genelist_exclude is None else set.union(*(set(dr_tools.loadlist(filename)) for filename in o.genelist_exclude)) minor_alleles = dict() included_genes = set() for ai, sym in enumerate(expra['symbols']): if genelist_include is not None and sym not in genelist_include: continue if genelist_exclude is not None and sym in genelist_exclude: continue countsum1 = sum(expra[s+a1][ai]>0 for s in all_samples) countsum2 = sum(expra[s+a2][ai]>0 for s in all_samples) if countsum1 and countsum2: minor_alleles[ai] = (a2,a1) if countsum1 >= countsum2 else (a1,a2) included_genes.add(sym) give_sharing = {'v0':give_sharing_v0, 'v1': give_sharing_v1, 'v3':give_sharing_v3, '90pwM':give_sharing_90pwMajor, '2wM':give_sharing_min2wMajor}[o.metric]
import argparse, dr_tools

if '__main__' == __name__:
    # Keep the sample names from `infile` that contain `samplepart` and write
    # them to "<prefix>.txt", where <prefix> is the first kept name cut right
    # after `samplepart`.
    parser = argparse.ArgumentParser()
    parser.add_argument('infile')
    parser.add_argument('samplepart')
    o = parser.parse_args()

    samples = [s for s in dr_tools.loadlist(o.infile) if o.samplepart in s]
    if not samples:
        # Without this guard samples[0] below fails with a bare IndexError
        # that does not say which input was at fault.
        parser.error('no sample name in %s contains %r' % (o.infile, o.samplepart))

    # rsplit() without a maxsplit cuts at every occurrence, so element [0] is
    # the text before the first occurrence of samplepart.
    prefix = samples[0].rsplit(o.samplepart)[0] + o.samplepart
    dr_tools.printlist(prefix + '.txt', samples)
from __future__ import division import argparse, dr_tools, numpy, pylab, random if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-r', '--rpkms', required=True) parser.add_argument('-m', '--minrpkm', default=20, type=float) parser.add_argument('-M', '--maxrpkm', type=float) parser.add_argument('-gi', '--allowedgenes') parser.add_argument('-ge', '--disallowedgenes', nargs='+') o = parser.parse_args() exprt = dr_tools.loadexpr(o.rpkms) allowedgenes = set(dr_tools.loadlist( o.allowedgenes)) if o.allowedgenes else None if o.disallowedgenes: disallowedgenes = set() for filename in o.disallowedgenes: disallowedgenes.update(set(dr_tools.loadlist(filename))) else: disallowedgenes = None samples = exprt.samples for ti, sym in enumerate(exprt['symbols']): meanexpr = numpy.mean([exprt[s][ti] for s in samples]) if meanexpr < o.minrpkm: continue if o.maxrpkm is not None and meanexpr >= o.maxrpkm: continue if disallowedgenes and sym in disallowedgenes: continue if allowedgenes and sym not in allowedgenes: continue print sym
parser = argparse.ArgumentParser() parser.add_argument('-a', '--allelehits', required=True) parser.add_argument('-r', '--rpkms', required=True) parser.add_argument('-m', '--minrpkm', default=20, type=float) parser.add_argument('-M', '--maxrpkm', type=float) parser.add_argument('-gi', '--allowedgenes') parser.add_argument('--random_seed', type=int) parser.add_argument('-om', '--outputmode', choices=['stats', 'c57', 'cast', 'bi'], default='stats') o = parser.parse_args() if o.random_seed is not None: random.seed(o.random_seed) allowedgenes = set(dr_tools.loadlist( o.allowedgenes)) if o.allowedgenes else None expra = dr_tools.loadexpr(o.allelehits, True) exprt = dr_tools.loadexpr(o.rpkms, False) c57mono, castmono, bi = list_mono(exprt.samples) if o.outputmode == 'stats': tot_genes = len(bi) + len(c57mono) + len(castmono) print len(castmono), len(c57mono), (len(c57mono) + len(castmono)) / tot_genes c57mono, castmono, bi = list_mono(exprt.samples, True) tot_genes = len(bi) + len(c57mono) + len(castmono) print len(castmono), len(c57mono), (len(c57mono) + len(castmono)) / tot_genes else: for sym in {'c57': c57mono, 'cast': castmono, 'bi': bi}[o.outputmode]:
# load expression data
expr_alleles = dr_tools.loadexpr([args.rpkmf_alleles], counts=True)
samples_alleles = sorted(
    e for e in expr_alleles
    if e not in ('IDs', 'symbols')
    and (args.filter is None or any(part in e for part in args.filter)))

# take the sample order from the '#samples' line of the file instead
for p in dr_tools.splitlines(args.rpkmf_alleles):
    if p[0] == '#samples':
        samples = p[1:]
        break
samples_alleles = [
    e for e in samples
    if (args.filter is None or any(part in e for part in args.filter))
]

# sort the genes by position

# only include transcripts which are the first ID in the entry of the rpkm file
if args.allowedgenes is None and args.disallowedgenes is None:
    allowed_IDs = {IDs.split('+')[0] for IDs in expr_alleles['IDs']}
else:
    if args.allowedgenes:
        allowed_set = set(dr_tools.loadlist(args.allowedgenes))
    if args.disallowedgenes:
        disallowed_set = set(dr_tools.loadlist(args.disallowedgenes))
    # An entry is kept when one of its IDs/symbols is whitelisted (if there is
    # a whitelist) and none of them is blacklisted (if there is a blacklist).
    allowed_IDs = {
        IDs.split('+')[0]
        for IDs, symbols in zip(expr_alleles['IDs'], expr_alleles['symbols'])
        if (args.allowedgenes is None
            or any(identifier in allowed_set
                   for identifier in (IDs.split('+') + symbols.split('+'))))
        and (args.disallowedgenes is None
             or not any(identifier in disallowed_set
                        for identifier in (IDs.split('+') + symbols.split('+'))))
    }

genes_per_chr = dict()
ID_to_gene = dict()
for ID in expr_alleles['IDs']:
    if ID not in allowed_IDs:
        continue
    # the chromosome is the text before the first ':', the coordinate the
    # integer between that ':' and the following '|'
    chromosome = ID.split(':')[0]
    coord = int(ID.split(':')[1].split('|')[0])
    if args.chromosome == 'any':
        # place them together
        chromosome = 'any'
    elif chromosome != args.chromosome:
        continue
    genes_per_chr.setdefault(chromosome, [])
    if args.mincoord and coord < args.mincoord:
        continue
set2 = set(entries2) set1_unique_c = len(set(entries1[sym] for sym in (set1-set2))) set2_unique_c = len(set(entries2[sym] for sym in (set2-set1))) common_c = len(set(entries1[sym] for sym in (set2&set1))) common_c2 = len(set(entries2[sym] for sym in (set2&set1))) if not common_c == common_c2: raise Exception saygenes = [] for genes in set(entries2[sym] for sym in (set2&set1)): saygenes.append(';'.join(list(genes))) return set1_unique_c, common_c, set2_unique_c, ', '.join(saygenes) if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-A', '--annotationfile', default='/mnt/crick/danielr/Xandclones_BR/BR_fibroblasts/snp-call/more_formats/mm9_ensembl_refseq_norandom_11Apr2012_genesymbols.txt') parser.add_argument('-a', '--set1', required=True) parser.add_argument('-b', '--set2', required=True) parser.add_argument('-ge', '--disallowedgenes', nargs='+') o = parser.parse_args() if o.disallowedgenes: disallowedgenes = set() for filename in o.disallowedgenes: disallowedgenes.update(set(dr_tools.loadlist(filename))) else: disallowedgenes = None ID_to_symbol = dict((p[1], p[12]) for p in dr_tools.splitlines(o.annotationfile) if disallowedgenes is None or p[12] not in disallowedgenes) print dr_tools.join(overlap_of_2(load_geneset(ID_to_symbol, o.set1), load_geneset(ID_to_symbol, o.set2))) print len(set(ID_to_symbol.values()))
# Command line: expression table, allele-hit table, gene lists and one or more
# sample lists (one per clone).
parser = argparse.ArgumentParser()
parser.add_argument('rpkmfile')
parser.add_argument('allelehits')
parser.add_argument('-X', '--chrX_genes', required=True)
parser.add_argument('-I', '--imprinted_genes', required=True)
parser.add_argument('-A', '--autosomal_genes_all', required=True)
parser.add_argument('-S', '--autosomal_genes_selection')
parser.add_argument('-m', '--minrpkm', type=float, default=20)
parser.add_argument('-s', '--samplelist_clone', nargs='+', required=True)
o = parser.parse_args()
o.maxexpr = None
# Without -S the selection is the full autosomal gene list.
if not o.autosomal_genes_selection:
    o.autosomal_genes_selection = o.autosomal_genes_all

expra = dr_tools.loadexpr(o.allelehits, True)
# The positional argument is called 'rpkmfile'; the namespace has no 'rpkms'
# attribute, so reading o.rpkms raised AttributeError before anything ran.
exprt = dr_tools.loadexpr(o.rpkmfile, False)

# Selected autosomal genes minus the imprinted ones.
allowedgenes = set(dr_tools.loadlist(o.autosomal_genes_selection)) - set(dr_tools.loadlist(o.imprinted_genes))

for samplelistfile in o.samplelist_clone:
    samples = dr_tools.loadlist(samplelistfile)
    c57mono, castmono, clonal_bi, ti_used = list_mono(samples)
    clonal_mono = c57mono | castmono

    # Ten repeated list_mono calls with the extra arguments; ti_used is fed
    # back in on every round.
    # NOTE(review): the second argument (True) presumably switches on allele
    # randomisation - confirm against list_mono.
    num_random_mono = []
    for ri in range(10):
        c57mono, castmono, bi, ti_used = list_mono(samples, True, ti_used)
        num_random_mono.append(len(c57mono) + len(castmono))
    exp_random_mono = numpy.mean(num_random_mono)
# Command line: allele-hit table, the clone's sample list, number of
# randomisations and optional gene/sample restrictions.
parser = argparse.ArgumentParser()
parser.add_argument('-a', '--allelehits', required=True)
parser.add_argument('-s', '--samplelist', required=True)
parser.add_argument('-r', '--randomisations', type=int, default=100)
parser.add_argument('-gi', '--genelist_include')
parser.add_argument('-ge', '--genelist_exclude')
parser.add_argument('--all_clonal')
o = parser.parse_args()

expra = dr_tools.loadexpr(o.allelehits, True)
a1 = '_c57only'
a2 = '_castonly'
# Every second column name with len(a1) characters cut off its end.
all_samples = [s[:-len(a1)] for s in expra.samples[::2]]
clone_samples = dr_tools.loadlist(o.samplelist)

# With --all_clonal the listed samples are removed from the random pool.
if o.all_clonal is None:
    samples_to_randomly_pick_from = all_samples
else:
    samples_to_randomly_pick_from = set(all_samples) - set(dr_tools.loadlist(o.all_clonal))

genelist_include = None
if o.genelist_include is not None:
    genelist_include = set(dr_tools.loadlist(o.genelist_include))
genelist_exclude = None
if o.genelist_exclude is not None:
    genelist_exclude = set(dr_tools.loadlist(o.genelist_exclude))

# gene index -> (suffix of the allele seen in fewer samples, suffix of the other)
minor_alleles = dict()
for ai, sym in enumerate(expra['symbols']):
    included = genelist_include is None or sym in genelist_include
    excluded = genelist_exclude is not None and sym in genelist_exclude
    if excluded or not included:
        continue
    countsum1 = sum(expra[s + a1][ai] > 0 for s in all_samples)
    countsum2 = sum(expra[s + a2][ai] > 0 for s in all_samples)
    if countsum1 >= countsum2:
        minor_alleles[ai] = (a2, a1)
    else:
        minor_alleles[ai] = (a1, a2)
import argparse, dr_tools if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument( 'snp2genes', help= '/mnt/kauffman/danielr/crick/Xandclones_BR/snp-validation/snp2genes/ensembl__nooverlap_ra5val_3percent.txt' ) parser.add_argument('-g', '--genelist') o = parser.parse_args() if o.genelist: allowedgenes = set(dr_tools.loadlist(o.genelist)) num_snps = 0 num_genes = 0 for p in dr_tools.splitlines(o.snp2genes): if o.genelist and p[0] not in allowedgenes: continue num_snps += int(p[1]) num_genes += 1 print 'snps:', num_snps print 'genes:', num_genes
# load expression data
expr_alleles = dr_tools.loadexpr([args.rpkmf_alleles], counts=True)
samples_alleles = sorted(
    e for e in expr_alleles
    if e not in ('IDs', 'symbols')
    and (args.filter is None or any(part in e for part in args.filter)))

# take the sample order from the '#samples' line of the file instead
for p in dr_tools.splitlines(args.rpkmf_alleles):
    if p[0] == '#samples':
        samples = p[1:]
        break
samples_alleles = [
    e for e in samples
    if (args.filter is None or any(part in e for part in args.filter))
]

# sort the genes by position

# only include transcripts which are the first ID in the entry of the rpkm file
if 0:#args.allowedgenes is None and args.disallowedgenes in None:
    allowed_IDs = {IDs.split('+')[0] for IDs in expr_alleles['IDs']}
else:
    if args.allowedgenes:
        allowed_set = set(dr_tools.loadlist(args.allowedgenes))
    if args.disallowedgenes:
        disallowed_set = set(dr_tools.loadlist(args.disallowedgenes))
    # An entry is kept when one of its IDs/symbols is whitelisted (if there is
    # a whitelist) and none of them is blacklisted (if there is a blacklist).
    allowed_IDs = {
        IDs.split('+')[0]
        for IDs, symbols in zip(expr_alleles['IDs'], expr_alleles['symbols'])
        if (args.allowedgenes is None
            or any(identifier in allowed_set
                   for identifier in (IDs.split('+') + symbols.split('+'))))
        and (args.disallowedgenes is None
             or not any(identifier in disallowed_set
                        for identifier in (IDs.split('+') + symbols.split('+'))))
    }

genes_per_chr = dict()
ID_to_gene = dict()
for p in dr_tools.splitlines(args.genePred):
    ID = p[1]
    if ID in allowed_IDs:
        chromosome = p[2]
        if args.chromosome == 'any':
            # place them together
            chromosome = 'any'
        elif chromosome != args.chromosome:
            continue
        genes_per_chr.setdefault(chromosome, [])
        # column 4 on the '+' strand, column 5 otherwise
        if p[3] == '+':
            coord = int(p[4])
        else:
            coord = int(p[5])
V = twovalues.split() return V[1] + '\t' + V[0] if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('inf') parser.add_argument('outf') parser.add_argument('-r', '--rpkmf_getID') parser.add_argument('--noNA', action='store_true') parser.add_argument('-s', '--samplenames') parser.add_argument('--rpkmf_genes', action='store_true') parser.add_argument('--exclude_genes') o = parser.parse_args() exclude_genes = set(dr_tools.loadlist( o.exclude_genes)) if o.exclude_genes else set() # load columns column_to_name = dict() symbols = [] IDs = [] sample_values = {} with open(o.inf, 'r') as infh: for li, line in enumerate(infh): p = line.rstrip('\r\n').split('\t') if li == 0: for ci, name in enumerate(p): if name in ('name', 'transcript', 'nbSNPs', 'SNPlocations', 'c57:cast', 'chrom', 'pos', 'genexcells'): continue newname = rename(name)
import dr_tools, argparse, pylab, numpy, math

if '__main__' == __name__:
    # Histogram (10 bins) of log2 mean expression for the genes named in the
    # given gene-list files; genes missing from the table are skipped.
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--rpkmfile', required=True)
    parser.add_argument('-g', '--genelist', required=True, nargs='+')
    parser.add_argument('-o', '--figure', default='rpkm_distribution.pdf')
    o = parser.parse_args()

    expr = dr_tools.loadexpr(o.rpkmfile)

    # union of all gene-list files
    genes = set()
    for filename in o.genelist:
        genes.update(dr_tools.loadlist(filename))

    # means below this floor are plotted at log2 = -10
    floor = 2**-10
    rpkms_genelist = []
    for sym in genes:
        try:
            ti = expr.symbol_to_index[sym]
        except KeyError:
            continue
        else:
            meanrpkm = numpy.mean([expr[s][ti] for s in expr.samples])
            rpkms_genelist.append(math.log(max(floor, meanrpkm), 2))

    pylab.hist(rpkms_genelist, 10)
    pylab.xlabel('log2 mean rpkm (-10 = not expressed)')
    pylab.ylabel('# genes')
    pylab.savefig(o.figure)
from __future__ import division import argparse, dr_tools, numpy, pylab, random if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-r', '--rpkms', required=True) parser.add_argument('-m', '--minrpkm', default=20, type=float) parser.add_argument('-M', '--maxrpkm', type=float) parser.add_argument('-gi', '--allowedgenes') parser.add_argument('-ge', '--disallowedgenes', nargs='+') o = parser.parse_args() exprt = dr_tools.loadexpr(o.rpkms) allowedgenes = set(dr_tools.loadlist(o.allowedgenes)) if o.allowedgenes else None if o.disallowedgenes: disallowedgenes = set() for filename in o.disallowedgenes: disallowedgenes.update(set(dr_tools.loadlist(filename))) else: disallowedgenes = None samples = exprt.samples for ti, sym in enumerate(exprt['symbols']): meanexpr = numpy.mean([exprt[s][ti] for s in samples]) if meanexpr < o.minrpkm: continue if o.maxrpkm is not None and meanexpr >= o.maxrpkm: continue if disallowedgenes and sym in disallowedgenes: continue if allowedgenes and sym not in allowedgenes: continue print sym
def gene_i_by_listf(genelistf_arr, expr):
    """Return the set of row indices whose symbol is in any of the list files.

    genelistf_arr -- iterable of gene-list file names, each read with
                     dr_tools.loadlist
    expr          -- object whose 'symbols' item is the sequence of gene symbols
    """
    wanted = set()
    for listfile in genelistf_arr:
        wanted.update(dr_tools.loadlist(listfile))
    return {idx for idx, symbol in enumerate(expr['symbols']) if symbol in wanted}
import argparse, dr_tools if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('snp2genes', help='/mnt/kauffman/danielr/crick/Xandclones_BR/snp-validation/snp2genes/ensembl__nooverlap_ra5val_3percent.txt') parser.add_argument('-g', '--genelist') o = parser.parse_args() if o.genelist: allowedgenes = set(dr_tools.loadlist(o.genelist)) num_snps = 0 num_genes = 0 for p in dr_tools.splitlines(o.snp2genes): if o.genelist and p[0] not in allowedgenes: continue num_snps += int(p[1]) num_genes += 1 print 'snps:', num_snps print 'genes:', num_genes
parser.add_argument('-n', '--names', action='append') parser.add_argument('-m', '--maxpergroup', type=int, default=300000000) o = parser.parse_args() expr = dr_tools.loadexpr(o.rpkmfile) boxplot_values = [] labels = [] for samplelistgroup, name in itertools.izip_longest(o.samplelist, o.names): if samplelistgroup is None: raise Exception if name is None: label = '' else: label = name + '\n' rho_values = [] samples_used = set() possible_pairs = 0 for samplelistfile in samplelistgroup: samples = set(dr_tools.loadlist(samplelistfile)) rho_values.extend([stats.spearmanr(expr[s1], expr[s2])[0] for s1, s2 in maxpairs(samples, o.maxpergroup)]) samples_used.update(samples) possible_pairs += len(samples) * (len(samples)-1) // 2 boxplot_values.append(rho_values) print name, 'samples=%d correlations=%d genes=%d possible_pairs=%d'%(len(samples_used),len(rho_values), len(expr[s1]), possible_pairs) label += 'n=%d'%len(rho_values) labels.append(label) pylab.ylim(0,1) dr_tools.violin_plot(pylab.axes(), boxplot_values, range(len(boxplot_values)), bp=True) pylab.xticks(range(len(boxplot_values)), labels, rotation=90) pylab.subplots_adjust(bottom=0.3) pylab.savefig(o.figure) # example command: # python correlation_boxplot.py -r ../bjorn_reinius_complete_set/rpkmforgenes/star_merged_cast1_mm9/ensembl/rpkms_counts_rmnameoverlap.txt -s sample_lists/excltechrepl/* -s sample_lists/splitcell_big/* -s sample_lists/splitcell_small/* -s sample_lists/samples_big.txt -s sample_lists/samples_small.txt -s sample_lists/fibroblast_samples_ERCC.txt -s sample_lists/livercellsamples.txt -s sample_lists/brain_samples.txt -n tech_repl -n big_splits -n small_splits -n big_fibroblasts -n small_fibroblasts -n fibroblasts_other -n liver -n brain
# Command line: allele-hit table, read threshold, optional gene restrictions.
parser = argparse.ArgumentParser()
parser.add_argument('-a', '--allelehits', required=True)
parser.add_argument('-m', '--minreads', type=int, default=2)
parser.add_argument('-gi', '--genelist_include')
parser.add_argument('-ge', '--genelist_exclude', nargs='+')
parser.add_argument('--figure')
o = parser.parse_args()

expra = dr_tools.loadexpr(o.allelehits, True)
a1 = '_c57only'
a2 = '_castonly'
# Every second column name with len(a1) characters cut off its end.
all_samples = [s[:-len(a1)] for s in expra.samples[::2]]

if o.genelist_include is None:
    genelist_include = None
else:
    genelist_include = set(dr_tools.loadlist(o.genelist_include))

if o.genelist_exclude is None:
    genelist_exclude = None
else:
    # union over all -ge files
    genelist_exclude = set().union(*(dr_tools.loadlist(filename) for filename in o.genelist_exclude))

# gene index -> (suffix of the allele reaching minreads in fewer samples, other suffix)
minor_alleles = dict()
for ai, sym in enumerate(expra['symbols']):
    if genelist_include is not None and sym not in genelist_include:
        continue
    if genelist_exclude is not None and sym in genelist_exclude:
        continue
    countsum1 = sum(expra[s + a1][ai] >= o.minreads for s in all_samples)
    countsum2 = sum(expra[s + a2][ai] >= o.minreads for s in all_samples)
    if countsum1 >= countsum2:
        minor_alleles[ai] = (a2, a1)
    else:
        minor_alleles[ai] = (a1, a2)

fractions = []
for sample in all_samples:
    f = estimated_monoallelic_fraction(sample, expra, minor_alleles, o.minreads)
import dr_tools, argparse

if '__main__' == __name__:
    # Copy the table, prefixing every header column that names a listed sample
    # (optionally followed by '_c57only' / '_castonly') with "<clone>-", where
    # <clone> is the sample list's file name without directory and '.txt'.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--rpkmf_in', required=True)
    parser.add_argument('-o', '--rpkmf_out', required=True)
    parser.add_argument('-s', '--sample_lists', nargs='+', required=True)
    o = parser.parse_args()

    with open(o.rpkmf_out, 'w') as outfh:
        with open(o.rpkmf_in, 'r') as infh:
            for li, line in enumerate(infh):
                if li == 0:
                    p = line.rstrip('\r\n').split('\t')
                    # sample name -> path of the list file it came from
                    sample_to_clone = dict(
                        (sample, filename)
                        for filename in o.sample_lists
                        for sample in dr_tools.loadlist(filename))
                    for i, name in enumerate(p):
                        if i == 0:
                            continue
                        for suffix in ('', '_c57only', '_castonly'):
                            # name[:-len(suffix)] is name[:-0] == '' for the
                            # empty suffix, so unsuffixed columns never
                            # matched; slice with an explicit end index.
                            base = name[:len(name) - len(suffix)]
                            if name.endswith(suffix) and base in sample_to_clone:
                                clone_name = sample_to_clone[base].split('/')[-1].split('.txt')[0]
                                p[i] = clone_name + '-' + name
                    print >> outfh, dr_tools.join(p)
                else:
                    # data rows pass through unchanged
                    outfh.write(line)
from __future__ import division import argparse, dr_tools if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-a', '--genePred', required=True) parser.add_argument('-o', '--genelist_out', help="gives genes on -i chromosomes but not on -e", required=True) parser.add_argument('-i', '--chrom_include', nargs='+', help="'rest' matches all not in -i or -e, 'random' matches e.g. chr1_random", required=True) parser.add_argument('-e', '--chrom_exclude', nargs='+', help="'rest' matches all not in -i or -e, 'random' matches e.g. chr1_random", default=[]) o = parser.parse_args() all_chromosomes = set(dr_tools.loadlist(o.genePred, 2)) include = set(c for c in all_chromosomes if c in o.chrom_include) exclude = set(c for c in all_chromosomes if c in o.chrom_exclude) if 'random' in o.chrom_include: include.update(set(c for c in all_chromosomes if '_random' in c and c not in exclude)) if 'random' in o.chrom_exclude: exclude.update(set(c for c in all_chromosomes if '_random' in c and c not in include)) if 'rest' in o.chrom_include: include.update(all_chromosomes-exclude) if 'rest' in o.chrom_exclude: exclude.update(all_chromosomes-include) genes_incl = set() genes_excl = set() for p in dr_tools.splitlines(o.genePred): symbol = p[12] chromosome = p[2]
parser.add_argument('-n', '--names', action='append') parser.add_argument('-m', '--maxpergroup', type=int, default=300000000) o = parser.parse_args() expr = dr_tools.loadexpr(o.rpkmfile) boxplot_values = [] labels = [] for samplelistgroup, name in itertools.izip_longest(o.samplelist, o.names): if samplelistgroup is None: raise Exception if name is None: label = '' else: label = name + '\n' rho_values = [] samples_used = set() possible_pairs = 0 for samplelistfile in samplelistgroup: samples = set(dr_tools.loadlist(samplelistfile)) rho_values.extend([ stats.spearmanr(expr[s1], expr[s2])[0] for s1, s2 in maxpairs(samples, o.maxpergroup) ]) samples_used.update(samples) possible_pairs += len(samples) * (len(samples) - 1) // 2 boxplot_values.append(rho_values) print name, 'samples=%d correlations=%d genes=%d possible_pairs=%d' % ( len(samples_used), len(rho_values), len(expr[s1]), possible_pairs) label += 'n=%d' % len(rho_values) labels.append(label) pylab.ylim(0, 1) dr_tools.violin_plot(pylab.axes(), boxplot_values, range(len(boxplot_values)),
if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-a', '--allelehits', required=True) parser.add_argument('-r', '--rpkms', required=True) parser.add_argument('-m', '--minrpkm', default=20, type=float) parser.add_argument('-M', '--maxrpkm', type=float) parser.add_argument('-c', '--clonal_groups', default='clonal_groups.txt') parser.add_argument('-gi', '--allowedgenes') parser.add_argument('-ge', '--disallowedgenes') parser.add_argument('--random_seed', type=int) o = parser.parse_args() if o.random_seed is not None: random.seed(o.random_seed) allowedgenes = set(dr_tools.loadlist( o.allowedgenes)) if o.allowedgenes else None disallowedgenes = set(dr_tools.loadlist( o.disallowedgenes)) if o.disallowedgenes else None expra = dr_tools.loadexpr(o.allelehits, True) exprt = dr_tools.loadexpr(o.rpkms, False) randomC = Genecat('random') obsC = Genecat('observed') for clonal_group in dr_tools.loadlist(o.clonal_groups): samples = [ s for s in exprt.samples if any( s.startswith(clonal_group_start) or s.startswith('pool.' + clonal_group_start) for clonal_group_start in clonal_group.split('\t'))
import dr_tools, argparse

if '__main__' == __name__:
    # Rewrite the header row so that columns belonging to a listed sample get
    # "<clone>-" in front; <clone> is the sample list's file name stripped of
    # its directory and '.txt'. All other rows are copied as they are.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--rpkmf_in', required=True)
    parser.add_argument('-o', '--rpkmf_out', required=True)
    parser.add_argument('-s', '--sample_lists', nargs='+', required=True)
    o = parser.parse_args()

    with open(o.rpkmf_out, 'w') as outfh:
        with open(o.rpkmf_in, 'r') as infh:
            for li, line in enumerate(infh):
                if li == 0:
                    p = line.rstrip('\r\n').split('\t')
                    # sample name -> path of the list file that mentions it
                    sample_to_clone = dict((sample, filename) for filename in o.sample_lists for sample in dr_tools.loadlist(filename))
                    for i, name in enumerate(p):
                        if i==0: continue
                        for suffix in ('', '_c57only', '_castonly'):
                            if not name.endswith(suffix): continue
                            # A negative slice cannot express "strip nothing":
                            # name[:-0] is the empty string, which made the ''
                            # suffix (plain sample columns) never match.
                            sample = name[:len(name)-len(suffix)]
                            if sample in sample_to_clone:
                                clone_name = sample_to_clone[sample].split('/')[-1].split('.txt')[0]
                                p[i] = clone_name + '-' + name
                    print >>outfh, dr_tools.join(p)
                else:
                    outfh.write(line)
# Command line: allele-hit table, gene filters, one sample list and the pool
# sizes to draw from it.
parser = argparse.ArgumentParser()
parser.add_argument('-a', '--allelehits', required=True)
parser.add_argument('-gi', '--allowedgenes')
parser.add_argument('-ge', '--disallowedgenes', nargs='+', default=[])
parser.add_argument('-R', '--random_dots', type=int, default=1)
parser.add_argument('-s', '--samplelist', required=True)
parser.add_argument('-n', default=[5,15], type=int, nargs='+')
parser.add_argument('-o', '--figure', default='poolN.pdf')
parser.add_argument('--skip_multi', action='store_true')
parser.add_argument('-r', '--allelerand_skew', action='store_true')
o = parser.parse_args()

expra = dr_tools.loadexpr(o.allelehits, True)

# None means "no whitelist"; the blacklist is the union of all -ge files.
allowed_genes = set(dr_tools.loadlist(o.allowedgenes)) if o.allowedgenes else None
disallowed_genes = set()
for filename in o.disallowedgenes:
    disallowed_genes.update(dr_tools.loadlist(filename))

poolable_samples = dr_tools.loadlist(o.samplelist, ignore='#')

# Fixed seed so the random draws are the same on every run.
random.seed(0)
# pool size -> o.random_dots independent draws of that many poolable samples
samples_n = {
    n_cells: [random.sample(poolable_samples, n_cells) for di in range(o.random_dots)]
    for n_cells in o.n
}

# Every second column name cut at '_c57only', then without the poolable samples.
samples_all = [sa.split('_c57only')[0] for sa in expra.samples[::2]]
samples_all = [s for s in samples_all if s not in poolable_samples]
allelerand_skew = {
    gi: ratio(expra, gi, samples_all)
    for gi in range(len(expra['symbols']))
}
def swap_order(twovalues):
    """Return the first two whitespace-separated fields swapped, tab-joined.

    Fields beyond the second are ignored; fewer than two raise IndexError.
    """
    fields = twovalues.split()
    return '\t'.join((fields[1], fields[0]))


if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('inf')
    parser.add_argument('outf')
    parser.add_argument('-r', '--rpkmf_getID')
    parser.add_argument('--noNA', action='store_true')
    parser.add_argument('-s', '--samplenames')
    parser.add_argument('--rpkmf_genes', action='store_true')
    parser.add_argument('--exclude_genes')
    o = parser.parse_args()

    # no --exclude_genes file means nothing is excluded
    if o.exclude_genes:
        exclude_genes = set(dr_tools.loadlist(o.exclude_genes))
    else:
        exclude_genes = set()

    # load columns
    column_to_name = dict()
    symbols = []
    IDs = []
    sample_values = {}
    # header names that are annotation columns rather than samples
    annotation_columns = ('name', 'transcript', 'nbSNPs', 'SNPlocations',
                          'c57:cast', 'chrom', 'pos', 'genexcells')
    with open(o.inf, 'r') as infh:
        for li, line in enumerate(infh):
            p = line.rstrip('\r\n').split('\t')
            if li == 0:
                for ci, name in enumerate(p):
                    if name in annotation_columns:
                        continue
                    newname = rename(name)
                    sample_values[newname] = []
import argparse, dr_tools if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('tcr_summary') parser.add_argument('clone_list', nargs='+') o = parser.parse_args() summary_lines = dict() with open(o.tcr_summary, 'rU') as infh: for line in infh: sample = line.split('\t', 1)[0] summary_lines[sample] = line for clonelistfile in o.clone_list: samples = dr_tools.loadlist(clonelistfile) print '\n#', clonelistfile for sample in samples: print summary_lines[sample].rstrip('\r\n')
parser.add_argument('-o', '--figure', required=True)
parser.add_argument('-m', '--minrpkm', default=20, type=float)
parser.add_argument('-M', '--maxrpkm', type=float)
parser.add_argument('-c', '--clonal_groups', default='clonal_groups.txt')
parser.add_argument('-gi', '--allowedgenes')
parser.add_argument('-ge', '--disallowedgenes', nargs='+')
parser.add_argument('--addgreen', action='store_true')
parser.add_argument('--genecount', action='store_true')
parser.add_argument('--random_seed', type=int)
parser.add_argument('-R', '--random_bars', type=int, default=1)
parser.add_argument('-f', '--min_cell_fraction', type=float, default=1.0)
o = parser.parse_args()

# Seed only on request, so runs are reproducible when --random_seed is given.
if o.random_seed is not None:
    random.seed(o.random_seed)

allowedgenes = None
if o.allowedgenes:
    allowedgenes = set(dr_tools.loadlist(o.allowedgenes))

disallowedgenes = None
if o.disallowedgenes:
    disallowedgenes = set()
    for filename in o.disallowedgenes:
        disallowedgenes.update(dr_tools.loadlist(filename))

expra = dr_tools.loadexpr(o.allelehits, True)
exprt = dr_tools.loadexpr(o.rpkms, False)

bars = Bars()
for clonal_group in dr_tools.loadlist(o.clonal_groups):
    # Each line of the clonal-groups file holds tab-separated name prefixes; a
    # sample belongs to the group if it starts with one of them, with or
    # without a leading 'pool.'.
    group_prefixes = tuple(
        prefix
        for clonal_group_start in clonal_group.split('\t')
        for prefix in (clonal_group_start, 'pool.'+clonal_group_start))
    samples = [s for s in exprt.samples if s.startswith(group_prefixes)]
    merged_output = count_mono(samples, False, None)
    use_ti = merged_output[-1]
if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('infile')
    parser.add_argument('genelist_out')
    parser.add_argument('-g', '--genelist_in')
    parser.add_argument('-q', '--maxq', default=0.05, type=float)
    parser.add_argument('--top', action='store_true')
    parser.add_argument('--bottom', action='store_true')
    parser.add_argument('--oneID', action='store_true')
    o = parser.parse_args()

    # Neither end requested means both ends.
    if not (o.top or o.bottom):
        o.top = o.bottom = True

    at_list_top = True
    if o.genelist_in:
        allowedgenes = set(dr_tools.loadlist(o.genelist_in))
    last_q = 0
    genes_top = []
    genes_bottom = []
    for li, p in enumerate(dr_tools.splitlines(o.infile)):
        if li == 0:
            # header row: locate the columns used below
            q_i = p.index('FDR')
            sym_i = 0
            if o.oneID:
                ID_i = p.index('IDs')
        else:
            qval = float(p[q_i])
            # the first time the FDR drops below the previous row's value,
            # the top part of the list is over
            at_list_top = at_list_top and not qval < last_q
            sym = p[sym_i]
            if o.oneID:
                sym_out = p[ID_i].split('+')[0]
            else:
                sym_out = sym
            last_q = qval
            if o.genelist_in and sym not in allowedgenes:
                continue
if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--allelehits', required=True)
    parser.add_argument('-s', '--samplelist', required=True)
    parser.add_argument('-r', '--randomisations', type=int, default=100)
    parser.add_argument('-gi', '--genelist_include')
    parser.add_argument('-ge', '--genelist_exclude')
    o = parser.parse_args()

    expra = dr_tools.loadexpr(o.allelehits, True)
    a1 = '_c57only'
    a2 = '_castonly'
    # Every second column name with len(a1) characters cut off its end.
    all_samples = [s[:-len(a1)] for s in expra.samples[::2]]
    clone_samples = dr_tools.loadlist(o.samplelist)

    genelist_include = None
    if o.genelist_include is not None:
        genelist_include = set(dr_tools.loadlist(o.genelist_include))
    genelist_exclude = None
    if o.genelist_exclude is not None:
        genelist_exclude = set(dr_tools.loadlist(o.genelist_exclude))

    # gene index -> (suffix of the allele seen in fewer samples, other suffix)
    minor_alleles = dict()
    for ai, sym in enumerate(expra['symbols']):
        included = genelist_include is None or sym in genelist_include
        excluded = genelist_exclude is not None and sym in genelist_exclude
        if not included or excluded:
            continue
        countsum1 = sum(expra[s + a1][ai] > 0 for s in all_samples)
        countsum2 = sum(expra[s + a2][ai] > 0 for s in all_samples)
        if countsum1 >= countsum2:
            minor_alleles[ai] = (a2, a1)
        else:
            minor_alleles[ai] = (a1, a2)

    clone_shared = give_sharing(clone_samples, minor_alleles, expra)
cast += expra[s+'_c57only'][ai] else: cast += expra[s+'_castonly'][ai] c57 += expra[s+'_c57only'][ai] if c57 and cast: pass elif c57: c57mono_genes.add(sym) elif cast: castmono_genes.add(sym) return c57mono_genes, castmono_genes if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-a', '--allelehits', required=True) parser.add_argument('-r', '--rpkms', required=True) parser.add_argument('-m', '--minrpkm', default=20, type=float) parser.add_argument('-gi', '--allowedgenes') parser.add_argument('-rg', '--randomize_per_gene', action='store_true') parser.add_argument('-G', '--genotype', choices=['cast', 'c57'], required=True) o = parser.parse_args() random.seed(20) allowedgenes = set(dr_tools.loadlist(o.allowedgenes)) if o.allowedgenes else None expra = dr_tools.loadexpr(o.allelehits, True) exprt = dr_tools.loadexpr(o.rpkms, False) c57mono_genes, castmono_genes = count_mono(exprt.samples, o.randomize_per_gene) genes = c57mono_genes if o.genotype == 'c57' else castmono_genes for genesym in genes: print genesym
def gene_i_by_listf(genelistf_arr, expr):
    """Return the indices of the genes named in any of the given list files.

    genelistf_arr -- iterable of file names, each read with dr_tools.loadlist
    expr          -- object whose 'symbols' entry is the sequence of gene symbols

    The result is a set of positions into expr['symbols'].
    """
    wanted_symbols = set()
    for list_path in genelistf_arr:
        wanted_symbols.update(dr_tools.loadlist(list_path))
    return {index for index, symbol in enumerate(expr['symbols'])
            if symbol in wanted_symbols}
# Continuation of a command-line script: the parser object and the options
# read below as o.allowedgenesA, o.allelehits, o.rpkms and o.clonal_groups
# are defined before this excerpt.
parser.add_argument('-gX', '--allowedgenesX', required=True)
parser.add_argument('-ge', '--disallowedgenes', nargs='+')
parser.add_argument('--addgreen', action='store_true')
parser.add_argument('--genecount', action='store_true')
parser.add_argument('--random_seed', type=int)
parser.add_argument('-R', '--random_bars', type=int, default=1)
parser.add_argument('-f', '--min_cell_fraction', type=float, default=1.0)
parser.add_argument('--cellXbar', action='store_true')
parser.add_argument('--sharedXbar', action='store_true')
parser.add_argument('-I', '--individualcutoff', action='store_true')
parser.add_argument('--vocal', action='store_true')
o = parser.parse_args()

# Seed the random generator only when a seed was given explicitly.
if o.random_seed is not None: random.seed(o.random_seed)

# Gene whitelists loaded from list files; None when the option is empty.
allowedgenesA = set(dr_tools.loadlist(o.allowedgenesA)) if o.allowedgenesA else None
allowedgenesX = set(dr_tools.loadlist(o.allowedgenesX)) if o.allowedgenesX else None

# Union of all exclusion list files, or None when none were given.
if o.disallowedgenes:
    disallowedgenes = set()
    for filename in o.disallowedgenes:
        disallowedgenes.update(set(dr_tools.loadlist(filename)))
else:
    disallowedgenes = None

expra = dr_tools.loadexpr(o.allelehits, True)
exprt = dr_tools.loadexpr(o.rpkms, False)
bars = Bars()

# Each entry of the clonal-groups file is a tab-separated set of name
# prefixes; a sample belongs to the group when its name starts with one of
# them, optionally preceded by 'pool.'.
for clonal_group in dr_tools.loadlist(o.clonal_groups):
    samples = [s for s in exprt.samples if any(s.startswith(clonal_group_start) or s.startswith('pool.'+clonal_group_start) for clonal_group_start in clonal_group.split('\t'))]
return sum(num_shared) if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-a', '--allelehits', required=True) parser.add_argument('-s', '--samplelist', required=True) parser.add_argument('-r', '--randomisations', type=int, default=100) o = parser.parse_args() expra = dr_tools.loadexpr(o.allelehits, True) a1 = '_c57only' a2 = '_castonly' all_samples = [s[:-len(a1)] for s in expra.samples[::2]] clone_samples = dr_tools.loadlist(o.samplelist) minor_alleles = dict() for ai, ID in enumerate(expra['IDs']): countsum1 = sum(expra[s+a1][ai]>0 for s in all_samples) countsum2 = sum(expra[s+a2][ai]>0 for s in all_samples) minor_alleles[ai] = (a2,a1) if countsum1 >= countsum2 else (a1,a2) clone_shared = give_sharing(clone_samples, minor_alleles, expra) random_shared = [give_sharing(random.sample(all_samples, len(clone_samples)), minor_alleles, expra) for i in range(o.randomisations)] p_crude = max(1, sum(r>=clone_shared for r in random_shared))/len(random_shared) p_normal = stats.ttest_1samp(random_shared, clone_shared)[1] excess_genes = clone_shared - numpy.mean(random_shared) print p_crude, p_normal, excess_genes, random_shared[:10], clone_shared
# Continuation of a command-line script; the parser object is created
# before this excerpt.
parser.add_argument('infile')
parser.add_argument('outfile')
parser.add_argument('-m', '--maxgenes', type=int)
parser.add_argument('-S', '--maxgeneselection', choices=['max', 'mean', 'random'], default='max')
parser.add_argument('-t', '--transform', choices=['none', 'log10+0.3'], default='none')
parser.add_argument('-c', '--centering', choices=['none', 'mean'], default='none')
parser.add_argument('-s', '--samplelist')
parser.add_argument('-e', '--excludesample', nargs='+')
o = parser.parse_args()

# Load the input expression table.
expr = dr_tools.loadexpr(o.infile)

# Select samples: those named in the sample list file if one was given,
# otherwise all samples of the table; then drop any explicitly excluded.
if o.samplelist is not None:
    samples = dr_tools.loadlist(o.samplelist)
else:
    samples = expr.samples
if o.excludesample:
    samples = [s for s in samples if s not in o.excludesample]

# Select genes: start from every row index and, when --maxgenes is set,
# keep only the o.maxgenes rows with the highest selection value.
genes_i = range(len(expr['symbols']))
if o.maxgenes is not None:
    # 'random' ignores the values and gives each gene a random sort key.
    select_fn = {'max':max, 'mean':numpy.mean, 'random': (lambda v: random.random())}[o.maxgeneselection]
    sort_list = [(select_fn([expr[s][i] for s in samples]), i) for i in genes_i]
    sort_list.sort(reverse=True)
    genes_i = [i for sort_val, i in sort_list[:o.maxgenes]]

# Build (symbol, index) pairs used to sort the genes alphabetically.
sort_list = [(expr['symbols'][i], i) for i in genes_i]
# Command-line options: allele-hit table, clone sample list, number of
# random draws, optional gene include/exclude list files, and an optional
# file of samples to keep out of the random draws.
parser = argparse.ArgumentParser()
parser.add_argument('-a', '--allelehits', required=True)
parser.add_argument('-s', '--samplelist', required=True)
parser.add_argument('-r', '--randomisations', type=int, default=100)
parser.add_argument('-gi', '--genelist_include')
parser.add_argument('-ge', '--genelist_exclude')
parser.add_argument('--all_clonal')
o = parser.parse_args()

expra = dr_tools.loadexpr(o.allelehits, True)
a1 = '_c57only'
a2 = '_castonly'
# Every second column name with len(a1) trailing characters cut off.
all_samples = [s[:-len(a1)] for s in expra.samples[::2]]
clone_samples = dr_tools.loadlist(o.samplelist)

# Pool for the random draws: all samples, or (with --all_clonal) all samples
# not listed in that file.
# NOTE(review): in the second case this is a set, which random.sample()
# rejects from Python 3.11 on -- convert to a list if the script is ported.
samples_to_randomly_pick_from = all_samples if o.all_clonal is None else set(all_samples)-set(dr_tools.loadlist(o.all_clonal))

# Gene filters are optional; None means that filter is switched off.
genelist_include = None if o.genelist_include is None else set(dr_tools.loadlist(o.genelist_include))
genelist_exclude = None if o.genelist_exclude is None else set(dr_tools.loadlist(o.genelist_exclude))

# Per retained gene index: the two allele suffixes, ordered so that the one
# detected (count > 0) in fewer samples comes first; a2 first on a tie.
minor_alleles = dict()
for ai, sym in enumerate(expra['symbols']):
    if genelist_include is not None and sym not in genelist_include:
        continue
    if genelist_exclude is not None and sym in genelist_exclude:
        continue
    countsum1 = sum(expra[s+a1][ai]>0 for s in all_samples)
    countsum2 = sum(expra[s+a2][ai]>0 for s in all_samples)
    minor_alleles[ai] = (a2,a1) if countsum1 >= countsum2 else (a1,a2)

# Sharing score of the clone versus equally sized random sample sets.
clone_shared = give_sharing(clone_samples, minor_alleles, expra)
random_shared = [give_sharing(random.sample(samples_to_randomly_pick_from, len(clone_samples)), minor_alleles, expra) for i in range(o.randomisations)]

# Fraction of random sets that share at least as much as the clone, with the
# numerator floored at one so the estimate is never zero. float() forces
# true division: under Python 2 without `from __future__ import division`
# the int/int quotient would be truncated to 0 or 1.
p_crude = max(1, sum(r>=clone_shared for r in random_shared))/float(len(random_shared))
if '__main__' == __name__:
    # Positional inputs (RPKM table, a table named 'nondiatable', allele-hit
    # table, gene list) followed by optional output and tuning parameters.
    parser = argparse.ArgumentParser()
    parser.add_argument('rpkmfile')
    parser.add_argument('nondiatable')
    parser.add_argument('allelehits')
    parser.add_argument('genelist')
    parser.add_argument('-o', '--pdfout', default='ERCCsum_vs_biallelic_nonDia_variableminrpkm_v11.pdf')
    parser.add_argument('--RNAamountfactor', default=1.0, type=float)
    parser.add_argument('--lineparams', nargs=2, type=float)
    parser.add_argument('--xfor20rpkm', default=21, type=float)
    o = parser.parse_args()

    # NOTE(review): presumably the spike-in volume in microlitres after
    # dilution -- confirm where the constant 0.1/40000 comes from.
    ERCCvol_ul = 0.1/40000
    # Value returned for 'ERCC.txt' at that volume, scaled by the
    # user-supplied --RNAamountfactor.
    ERCC_moleculenumber = calc_ERCC_moleculenumber('ERCC.txt', ERCCvol_ul) * o.RNAamountfactor
    genelist_first = set(dr_tools.loadlist(o.genelist))
    expra = dr_tools.loadexpr(o.allelehits, True)
    expr = dr_tools.loadexpr(o.rpkmfile, False)
    # Split the row indices by whether the ID contains 'ERCC'.
    spikes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' in ID]
    genes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' not in ID]

    # Pass 1: group the sample names yielded by table_loader() by cell source.
    cells_per_source = defaultdict(list)
    for p, sample, sample_i, cellsource in table_loader():
        cells_per_source[cellsource].append(sample)

    # Middle step: get gene lists per cell source at the RPKM cutoff.
    genelist_sources = defaultdict(dict)
    for source, samples_source in cells_per_source.items():
        # Earlier variant, kept for reference (comment in the original:
        # "new in v10 from v9"):
        #samples = set.union(*map(set, cells_per_source.values()))
        # Use only the samples of the current source.
        samples = samples_source
if '__main__' == __name__:
    # Command-line options: allele-hit table, minimum read count for an
    # allele to count as detected, optional gene include/exclude list files,
    # and a --figure option.
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--allelehits', required=True)
    parser.add_argument('-m', '--minreads', type=int, default=2)
    parser.add_argument('-gi', '--genelist_include')
    parser.add_argument('-ge', '--genelist_exclude', nargs='+')
    parser.add_argument('--figure')
    o = parser.parse_args()

    expra = dr_tools.loadexpr(o.allelehits, True)
    a1 = '_c57only'
    a2 = '_castonly'
    # Every second column name with len(a1) trailing characters cut off.
    all_samples = [s[:-len(a1)] for s in expra.samples[::2]]

    # Gene filters are optional; None means that filter is switched off.
    # The exclude filter is the union of all files given to -ge.
    genelist_include = None if o.genelist_include is None else set(dr_tools.loadlist(o.genelist_include))
    genelist_exclude = None if o.genelist_exclude is None else set.union(*(set(dr_tools.loadlist(filename)) for filename in o.genelist_exclude))

    # Per retained gene index: the two allele suffixes, ordered so that the
    # one reaching o.minreads in fewer samples comes first; a2 first on a tie.
    minor_alleles = dict()
    for ai, sym in enumerate(expra['symbols']):
        if genelist_include is not None and sym not in genelist_include:
            continue
        if genelist_exclude is not None and sym in genelist_exclude:
            continue
        countsum1 = sum(expra[s+a1][ai]>=o.minreads for s in all_samples)
        countsum2 = sum(expra[s+a2][ai]>=o.minreads for s in all_samples)
        minor_alleles[ai] = (a2,a1) if countsum1 >= countsum2 else (a1,a2)

    # Report the estimate for every sample and average the non-NaN ones.
    # The output lines use %-formatting so they parse under both Python 2
    # and 3 and print the same space-separated text as the print statement.
    fractions = []
    for sample in all_samples:
        f = estimated_monoallelic_fraction(sample, expra, minor_alleles, o.minreads)
        print('%s %s' % (sample, f))
        if not math.isnan(f):
            fractions.append(f)
    print('average %s' % (numpy.mean(fractions),))