def make_snp2gene_file(genepred, snptable, outfile_mpileup, outfile_snp2genes, include_overlap=False):
    """Assign single-nucleotide SNPs to the genes whose exons contain them.

    Every exon listed in the annotation file `genepred` is registered as a
    `dr_tools.Cregion` window tagged with its gene name.  Each row of
    `snptable` whose column 11 equals 'single' is then looked up by its
    0-based position (column 2) on its chromosome (column 1).

    Two files are written:
      * `outfile_snp2genes`: one row per gene, built with `dr_tools.join`
        from the gene name, the number of SNPs and the sorted SNPs joined
        by ';'.  A SNP is formatted as 'col1:col3|col9'.
      * `outfile_mpileup`: one 'col1<TAB>col3' row per accepted SNP.

    With `include_overlap` false (the default) a SNP is accepted only when it
    hits exactly one gene; otherwise every SNP row passes and is credited to
    each gene it overlaps.
    """
    # Build the exon lookup structure from the annotation.
    with open(genepred) as annotation_fh:
        for annotation_line in annotation_fh:
            if annotation_line.startswith('#'):
                continue
            (chrom, _strand, _cdsstart, _cdsend, exonstarts, exonends,
             genename, _transcript_id, _inferred_strand) = fromannotationline(annotation_line)
            for exon_start, exon_end in zip(exonstarts, exonends):
                exon = dr_tools.Cregion(chrom, exon_start, exon_end)
                exon.gene = genename
                exon.addtowindows()

    snps_per_gene = defaultdict(list)
    snp_positions = []
    for cols in dr_tools.splitlines(snptable):
        # e.g. 585 chr1 10019 10020 rs376643643 0 + A A -/A genomic deletion unknown 0 0 near-gene-5 exact 1 1 SSMP, 0
        if cols[11] != 'single':
            continue  # ignore non-SNPs
        chrom = cols[1]
        position = int(cols[2])  # 0-based
        hits = dr_tools.Cregion.overlappingpoint(chrom, position)
        genes = set(hit.gene for hit in hits)
        if not include_overlap and len(genes) != 1:
            continue
        snp_label = '%s:%s|%s' % (cols[1], cols[3], cols[9])
        for gene in genes:
            snps_per_gene[gene].append(snp_label)
        # recorded once per accepted SNP row
        snp_positions.append('%s\t%s' % (cols[1], cols[3]))

    with open(outfile_snp2genes, 'w') as outfh:
        for gene in snps_per_gene:
            snps = snps_per_gene[gene]
            outfh.write(dr_tools.join(gene, len(snps), ';'.join(sorted(snps))) + '\n')

    with open(outfile_mpileup, 'w') as outfh:
        for snpline in snp_positions:
            outfh.write(snpline + '\n')
def _parse_allele_counts(field):
    """Parse 'a,b;c,d;...' into [(a, b), (c, d), ...] with integer members."""
    pairs = []
    for item in field.split(';'):
        parts = item.split(',')
        pairs.append((int(parts[0]), int(parts[1])))
    return pairs


def _accumulate_pairs(table, gene, increments):
    """Add `increments` element-wise onto the running per-SNP pairs of `gene`."""
    previous = table.get(gene)
    if previous is None:
        table[gene] = list(increments)
    else:
        table[gene] = [(previous[i][0] + inc[0], previous[i][1] + inc[1])
                       for i, inc in enumerate(increments)]


def _pair_at(pairs, index):
    """Return pairs[index], or (0, 0) when nothing was accumulated there."""
    if pairs is None or index >= len(pairs):
        return (0, 0)
    return pairs[index]


def find_heterozygous(hitcounts_files, snp2gene_file_in, snp2gene_file_out, o):
    """Write the SNPs of each gene that pass the heterozygosity filters.

    Parameters:
      hitcounts_files   -- gzip-compressed, tab-separated count files; column 0
                           is the gene, column 1 holds one 'n0,n1' pair per SNP,
                           separated by ';'.  The sample name is the file name
                           up to '.counts'.
      snp2gene_file_in  -- tab-separated; column 0 is the gene, column 2 the
                           ';'-separated SNPs of that gene.
      snp2gene_file_out -- output path; one row per input gene (in input
                           order) with the gene, the number of kept SNPs and
                           the kept SNPs joined by ';'.
      o                 -- options object; the attributes read are minreadsH,
                           samplesH, minothersamplesH, minothersamplereadsH and
                           maxratioH.

    Samples listed in o.samplesH (all samples when it is empty) are "primary";
    the remaining ones are "secondary".  A SNP is kept when, summed over the
    primary samples, both alleles have at least o.minreadsH reads and neither
    allele outnumbers the other by more than o.maxratioH, and when, over the
    secondary samples, each allele was seen in at least o.minothersamplesH
    samples with at least o.minothersamplereadsH reads in total.
    """
    minreads_allele = o.minreadsH
    samples_prim = set(o.samplesH) if o.samplesH else set()
    # secondary samples only need to be read when a secondary threshold is set
    need_secondary = not (o.minothersamplesH == 0 and o.minothersamplereadsH == 0)

    snps_per_gene_in = dict()
    gene_order = []
    with open(snp2gene_file_in, 'r') as infh:
        for line in infh:
            p = line.rstrip('\r\n').split('\t')
            snps_per_gene_in[p[0]] = p[2].split(';')
            gene_order.append(p[0])

    reads_per_gene = {}       # gene -> per-SNP (allele0, allele1) reads, primary samples
    sec_reads_per_gene = {}   # same, summed over secondary samples
    sec_samples_count = {}    # gene -> per-SNP number of secondary samples seeing each allele

    for inf in hitcounts_files:
        sample = inf.split('/')[-1].split('.counts')[0]
        is_primary = (not samples_prim) or sample in samples_prim
        if not is_primary and not need_secondary:
            continue
        with gzip.open(inf, 'r') as infh:
            for line in infh:
                p = line.rstrip('\r\n').split('\t')
                gene = p[0]
                pairs = _parse_allele_counts(p[1])
                if is_primary:
                    _accumulate_pairs(reads_per_gene, gene, pairs)
                else:
                    # each sample contributes at most 1 per allele to the sample count
                    _accumulate_pairs(sec_samples_count, gene,
                                      [(int(n0 >= 1), int(n1 >= 1)) for n0, n1 in pairs])
                    _accumulate_pairs(sec_reads_per_gene, gene, pairs)

    with open(snp2gene_file_out, 'w') as outfh:
        for gene in gene_order:
            prim_pairs = reads_per_gene.get(gene, [])
            sec_counts = sec_samples_count.get(gene)
            sec_reads = sec_reads_per_gene.get(gene)
            ok_snps = []
            for i, (snpinfo, reads) in enumerate(zip(snps_per_gene_in[gene], prim_pairs)):
                # missing secondary data counts as zero instead of dropping the SNP
                sec_s_count = _pair_at(sec_counts, i)
                reads_sec = _pair_at(sec_reads, i)
                enough_reads = reads[0] >= minreads_allele and reads[1] >= minreads_allele
                # float() keeps the ratio exact when true division is not in effect
                balanced = ((reads[1] == 0 or float(reads[0]) / reads[1] <= o.maxratioH) and
                            (reads[0] == 0 or float(reads[1]) / reads[0] <= o.maxratioH))
                seen_elsewhere = (sec_s_count[0] >= o.minothersamplesH and
                                  sec_s_count[1] >= o.minothersamplesH and
                                  reads_sec[0] >= o.minothersamplereadsH and
                                  reads_sec[1] >= o.minothersamplereadsH)
                if enough_reads and balanced and seen_elsewhere:
                    ok_snps.append(snpinfo)
            outfh.write(dr_tools.join(gene, len(ok_snps), ';'.join(ok_snps)) + '\n')
def counts_to_SNPallelehits(counts_file, samplename, snp2gene_file_in, outputpath):
    """Convert one sample's per-SNP allele counts into an allele-hits table.

    `snp2gene_file_in` is tab-separated with the gene in column 0 and its
    ';'-separated SNPs in column 2.  `counts_file` is gzip-compressed and
    tab-separated, with the gene in column 0 and one 'n0,n1' pair per SNP
    (';'-separated, empty items ignored) in column 1.

    `outputpath` receives four header rows ('#samples', '#allmappedreads',
    '#normalizationreads', '#arguments') followed by one row per
    (gene, SNP) holding the gene, the SNP, two zeros and the count pair.
    Rows are formatted with `dr_tools.join`.
    """
    snps_per_gene_in = {}
    gene_order = []
    with open(snp2gene_file_in, 'r') as snp_fh:
        for raw in snp_fh:
            cols = raw.rstrip('\r\n').split('\t')
            snps_per_gene_in[cols[0]] = cols[2].split(';')
            gene_order.append(cols[0])

    reads_per_gene = {}
    with gzip.open(counts_file, 'r') as counts_fh:
        for raw in counts_fh:
            cols = raw.rstrip('\r\n').split('\t')
            pairs = []
            for item in cols[1].split(';'):
                if not item:
                    continue
                parts = item.split(',')
                pairs.append((int(parts[0]), int(parts[1])))
            reads_per_gene[cols[0]] = pairs

    with open(outputpath, 'w') as outfh:
        outfh.write(dr_tools.join('#samples', samplename+'_c57only', samplename+'_castonly') + '\n')
        outfh.write(dr_tools.join('#allmappedreads', 0, 0) + '\n')
        outfh.write(dr_tools.join('#normalizationreads', 0, 0) + '\n')
        outfh.write(dr_tools.join('#arguments', ' '.join(sys.argv), 'time: '+time.asctime()) + '\n')
        for gene in gene_order:
            gene_snps = snps_per_gene_in[gene]
            gene_reads = reads_per_gene[gene]
            for snpinfo, reads in zip(gene_snps, gene_reads):
                outfh.write(dr_tools.join(gene, snpinfo, 0, 0, reads) + '\n')
def collapse_mirnas(molc_file):
    """Sum the rows of a molecule-count table that share an 'hsa...' gene name.

    `molc_file` is read as a tab-separated table: column 0 is the gene name,
    column 1 the transcript id(s) and the remaining columns are numeric
    counts.  Lines starting with '#' and rows whose gene name does not start
    with "hsa" are copied to the output unchanged.  Rows whose name starts
    with "hsa" are summed column-wise per gene name and written after all
    other lines, with the transcript id(s) of the last such row and the sums
    rounded to 2 decimals.

    The output path is taken from the module-level options object
    (`o.out_molc_files`), not from a parameter.
    """
    gene2molc = {}
    gene2trid = {}
    with open(o.out_molc_files, 'w') as outfh:
        with open(molc_file, 'r') as infh:
            for line in infh:
                # rstrip instead of line[:-1]: a final line without a newline
                # must not lose its last character
                row = line.rstrip('\n')
                if row.startswith('#'):
                    outfh.write(row + '\n')
                    continue
                p = row.split('\t')
                trans_ids = p[1]
                genename = p[0]
                gene2trid[genename] = trans_ids
                if genename.startswith("hsa"):  # selects only mirbase mirnas from the expression table
                    molc_counts = [float(v) for v in p[2:]]
                    zeros = [0] * len(molc_counts)
                    gene2molc[genename] = [i + j for i, j in zip(gene2molc.get(genename, zeros), molc_counts)]
                else:
                    outfh.write(row + '\n')
        for gene in gene2molc:
            outfh.write(dr_tools.join(gene, gene2trid[gene], [round(m, 2) for m in gene2molc[gene]]) + '\n')
if o.rpkmf_genes: symbols_set = dict((s,i) for i,s in enumerate(symbols)) new_sample_values = dict() for name in sample_order: new_sample_values[name] = [] for i, symbol in enumerate(expr['symbols']): if symbol in symbols_set: new_sample_values[name].append(sample_values[name][symbols_set[symbol]]) else: new_sample_values[name].append('0 0') sample_values = new_sample_values symbols = expr['symbols'] IDs = expr['IDs'] elif o.rpkmf_genes: raise Exception # write to file with open(o.outf, 'w') as outfh: print >>outfh, dr_tools.join('#samples', ['%s_c57only\t%s_castonly'%(s,s) for s in sample_order]) print >>outfh, dr_tools.join('#allmappedreads', ['0\t0' for s in sample_order]) print >>outfh, dr_tools.join('#normalizationreads', ['0\t0' for s in sample_order]) print >>outfh, dr_tools.join('#arguments', ' '.join(sys.argv), 'time: '+time.asctime()) for i in range(len(symbols)): #if IDs[i] == '0 0': # print symbols[i] # IDs[i] = 'NA' if o.noNA and IDs[i] == 'NA': continue try: print >>outfh, dr_tools.join(symbols[i], IDs[i], ['0\t0' for name in sample_order], [swap_order(sample_values[name][i]) for name in sample_order]) except: print symbols[i], sample_values[name][i] raise
import dr_tools, argparse


def _with_clone_prefix(name, sample_to_clone):
    """Return `name` prefixed with its clone label, or `name` itself if unknown.

    The sample is looked up both as-is and with a trailing '_c57only' or
    '_castonly' removed.  The clone label is the base name of the list file
    that contains the sample, cut at '.txt'.
    """
    renamed = name
    for suffix in ('', '_c57only', '_castonly'):
        if not name.endswith(suffix):
            continue
        # name[:-len(suffix)] would be '' for the empty suffix, so slice by length
        sample = name[:len(name) - len(suffix)]
        if sample in sample_to_clone:
            clone_name = sample_to_clone[sample].split('/')[-1].split('.txt')[0]
            renamed = clone_name + '-' + name
    return renamed


if '__main__' == __name__:
    # Copies an expression table, prefixing the sample names in its first line
    # with the name of the sample list (clone) each sample belongs to.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--rpkmf_in', required=True)
    parser.add_argument('-o', '--rpkmf_out', required=True)
    parser.add_argument('-s', '--sample_lists', nargs='+', required=True)
    o = parser.parse_args()

    with open(o.rpkmf_out, 'w') as outfh:
        with open(o.rpkmf_in, 'r') as infh:
            for li, line in enumerate(infh):
                if li != 0:
                    # everything after the first line is copied verbatim
                    outfh.write(line)
                    continue
                p = line.rstrip('\r\n').split('\t')
                sample_to_clone = dict((sample, filename)
                                       for filename in o.sample_lists
                                       for sample in dr_tools.loadlist(filename))
                for i, name in enumerate(p):
                    if i == 0:
                        continue  # first column is the row label, not a sample
                    p[i] = _with_clone_prefix(name, sample_to_clone)
                outfh.write(dr_tools.join(p) + '\n')
args.addminreads) + subtract( expr2[s2][gene_i], args.F2, expr2[s1][gene_i], 0, args.round) ''' s1e = subtract(expr1[s1][gene_i], args.F1, expr1[s2][gene_i], args.addminreads) s2e = subtract(expr1[s2][gene_i], args.F1, expr1[s1][gene_i], args.addminreads) if args.inf2 is not None: s1e = max(s1e, subtract(expr2[s1][gene_i], args.F2, expr2[s2][gene_i], 0)) s2e = max(s2e, subtract(expr2[s2][gene_i], args.F2, expr2[s1][gene_i], 0)) ''' gene_counts_out[gene_i].extend([s1e, s2e]) with open(args.outf, 'w') as outfh: for i, p in enumerate(dr_tools.splitlines(args.inf1)): if i < 3: print >> outfh, dr_tools.join(p) elif i == 3: assert p[0] == '#arguments' print >> outfh, dr_tools.join(p, ' '.join(sys.argv), 'time: ' + time.asctime()) else: gene_i = i - 4 # replace the expression values according to the change in read counts new_expressions = [ old_rpkm if old_rpkm <= 0 else 0.0 if old_count == 0 else new_count / old_count * old_rpkm for new_count, old_count, old_rpkm in zip( gene_counts_out[gene_i], map(float, p[2:2 + len(samples)]), map(float, p[2 + len(samples):2 + 2 * len(samples)])) ]
import dr_tools, argparse


def _with_clone_prefix(name, sample_to_clone):
    """Return `name` prefixed with its clone label, or `name` itself if unknown.

    The sample is looked up both as-is and with a trailing '_c57only' or
    '_castonly' removed.  The clone label is the base name of the list file
    that contains the sample, cut at '.txt'.
    """
    renamed = name
    for suffix in ('', '_c57only', '_castonly'):
        if not name.endswith(suffix):
            continue
        # name[:-len(suffix)] would be '' for the empty suffix, so slice by length
        sample = name[:len(name) - len(suffix)]
        if sample in sample_to_clone:
            clone_name = sample_to_clone[sample].split('/')[-1].split('.txt')[0]
            renamed = clone_name + '-' + name
    return renamed


if '__main__' == __name__:
    # Copies an expression table, prefixing the sample names in its first line
    # with the name of the sample list (clone) each sample belongs to.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--rpkmf_in', required=True)
    parser.add_argument('-o', '--rpkmf_out', required=True)
    parser.add_argument('-s', '--sample_lists', nargs='+', required=True)
    o = parser.parse_args()

    with open(o.rpkmf_out, 'w') as outfh:
        with open(o.rpkmf_in, 'r') as infh:
            for li, line in enumerate(infh):
                if li != 0:
                    # everything after the first line is copied verbatim
                    outfh.write(line)
                    continue
                p = line.rstrip('\r\n').split('\t')
                sample_to_clone = dict((sample, filename)
                                       for filename in o.sample_lists
                                       for sample in dr_tools.loadlist(filename))
                for i, name in enumerate(p):
                    if i == 0:
                        continue  # first column is the row label, not a sample
                    p[i] = _with_clone_prefix(name, sample_to_clone)
                outfh.write(dr_tools.join(p) + '\n')
s2e = subtract(expr1[s2][gene_i]-expr2[s2][gene_i], args.F1, expr1[s1][gene_i]-expr2[s1][gene_i], args.addminreads) + subtract(expr2[s2][gene_i], args.F2, expr2[s1][gene_i], 0, args.round) ''' s1e = subtract(expr1[s1][gene_i], args.F1, expr1[s2][gene_i], args.addminreads) s2e = subtract(expr1[s2][gene_i], args.F1, expr1[s1][gene_i], args.addminreads) if args.inf2 is not None: s1e = max(s1e, subtract(expr2[s1][gene_i], args.F2, expr2[s2][gene_i], 0)) s2e = max(s2e, subtract(expr2[s2][gene_i], args.F2, expr2[s1][gene_i], 0)) ''' gene_counts_out[gene_i].extend([s1e, s2e]) with open(args.outf, 'w') as outfh: for i, p in enumerate(dr_tools.splitlines(args.inf1)): if i < 3: print >>outfh, dr_tools.join(p) elif i == 3: assert p[0] == '#arguments' print >>outfh, dr_tools.join(p, ' '.join(sys.argv), 'time: '+time.asctime()) else: gene_i = i - 4 # replace the expression values according to the change in read counts new_expressions = [old_rpkm if old_rpkm <= 0 else 0.0 if old_count == 0 else new_count/old_count*old_rpkm for new_count, old_count, old_rpkm in zip(gene_counts_out[gene_i], map(float, p[2:2+len(samples)]), map(float, p[2+len(samples):2+2*len(samples)]))] print >>outfh, dr_tools.join(p[0], p[1], new_expressions, ['%.12g'%c for c in gene_counts_out[gene_i]]) ''' Results: danielr@rna ~/casthybrid/one_chr_reads $ python threshold_strainspec.py -i1 oorefv15_trainingvals.txt -i2 oorefv13_trainingvals.txt -F1 0.0 -F2 0.001 --addminreads 1 |python ../maternal_tx/maternal_fraction.py /dev/stdin 8cell_8-1 0.0732625841351 0.0
parser.add_argument('rpkmf_total') o = parser.parse_args() exprt = dr_tools.loadexpr(o.rpkmf_total, counts=False) counts = dr_tools.loadexpr(o.rpkmf_total, counts=True) if o.rpkmf_alleles: expra = dr_tools.loadexpr(o.rpkmf_alleles, counts=True) AiD = dict((ti, expra.ID_to_index[ID]) for ti, ID in enumerate(exprt['IDs']) if ID in expra.ID_to_index) for s in exprt.samples: if s+'_castonly' not in expra.samples: continue with open(s + '_expression.txt', 'w') as outfh: print >>outfh, dr_tools.join('#Gene_symbol', 'Refseq_IDs', 'RPKM', 'reads', 'CAST_hits', 'C57_hits') for ti in range(len(exprt['IDs'])): if ti in AiD: ai = AiD[ti] cast = int(expra[s+'_castonly'][ai]) c57 = int(expra[s+'_c57only'][ai]) else: cast = 0 c57 = 0 rpkm = exprt[s][ti] reads = int(round(counts[s][ti])) symbol = exprt['symbols'][ti].replace('+','|') ID = exprt['IDs'][ti].replace('+','|') print >>outfh, dr_tools.join(symbol, ID, rpkm, reads, cast, c57) else: for s in exprt.samples:
if coord not in database_snps: # strange, it should be (in database_snps) # unless o.snp_validatedbefore was used continue snp = database_snps[coord] snp.c57count += int(p[snp.c57index]) snp.castcount += int(p[snp.castindex]) ratios = [] with open(o.outfile, 'w') as outfh: for coord, snpinfo in database_snps.items(): reads = snpinfo.c57count + snpinfo.castcount if reads == 0: ratio = 0 else: ratio = snpinfo.c57count / reads if o.minratio <= ratio <= ( 1 - o.minratio) and reads >= o.minreads_sum and min( snpinfo.c57count, snpinfo.castcount) >= o.minreads_allele: print >> outfh, dr_tools.join(coord, snpinfo.bases, '0', '1.00', '1.00', '1.00', '1.00', '1.00') ratios.append(ratio) if o.figure: import pylab step = 0.005 xarr, yarr = dr_tools.bin(ratios, -step, 1 + step, step, 1) #yarr = [y/len(ratios) for y in yarr] pylab.plot(xarr, yarr, 'k-') pylab.savefig(o.figure)
# Scale every sample by a trimmed-mean factor relative to the reference
# (Y_r, N_r) and write the rescaled table together with the factors.
expr_out = dr_tools.Parsed_rpkms([], False)
normalization_factors = []
n_genes = len(expr_in['symbols'])
for s in expr_in.samples:
    Y_k = expr_in[s]
    N_k = sum(Y_k)
    # only genes with a positive value in both the sample and the reference are used
    nonzero = [gi for gi in range(n_genes) if Y_k[gi] > 0 and Y_r[gi] > 0]
    A_distr = sorted((A(gi, Y_k, Y_r, N_k, N_r), gi) for gi in nonzero)
    M_distr = sorted((M(gi, Y_k, Y_r, N_k, N_r), gi) for gi in nonzero)
    # Drop 5% at each end of the A ranking and 30% at each end of the M ranking.
    # The upper bound is written as len - k: with few genes k is 0, and a slice
    # ending at -0 would be empty instead of untrimmed.
    A_trim = int(0.05 * len(A_distr))
    M_trim = int(0.3 * len(M_distr))
    Gstar = (set(gi for A_val, gi in A_distr[A_trim:len(A_distr) - A_trim]) &
             set(gi for M_val, gi in M_distr[M_trim:len(M_distr) - M_trim]))
    if not Gstar:
        # nothing to compare against the reference: leave the sample unscaled
        f_k = 1
    else:
        weights = dict((gi, w(gi, Y_k, Y_r, N_k, N_r)) for gi in Gstar)
        log2TMM = sum(weights[gi] * M(gi, Y_k, Y_r, N_k, N_r) for gi in Gstar) / sum(weights[gi] for gi in Gstar)
        f_k = 2**log2TMM  # multiply non-reference by this value
    expr_out[s] = [Y_k[gi] * f_k for gi in range(n_genes)]
    normalization_factors.append(f_k)

# carry the metadata of the input table over unchanged
expr_out.allmappedreads = expr_in.allmappedreads
expr_out.normalizationreads = expr_in.normalizationreads
expr_out.samples = expr_in.samples
expr_out['symbols'] = expr_in['symbols']
expr_out['IDs'] = expr_in['IDs']
dr_tools.writeexpr(
    o.outfile,
    expr_out,
    counts_expr=(dr_tools.loadexpr(o.infile, counts=True) if o.copy_counts else None),
    extra_comment_lines=[dr_tools.join('#TMM_normalization_factors', normalization_factors)])
exprt = dr_tools.loadexpr(o.rpkmf_total, counts=False) counts = dr_tools.loadexpr(o.rpkmf_total, counts=True) if o.rpkmf_alleles: expra = dr_tools.loadexpr(o.rpkmf_alleles, counts=True) AiD = dict((ti, expra.ID_to_index[ID]) for ti, ID in enumerate(exprt['IDs']) if ID in expra.ID_to_index) for s in exprt.samples: if s + '_castonly' not in expra.samples: continue with open(s + '_expression.txt', 'w') as outfh: print >> outfh, dr_tools.join('#Gene_symbol', 'Refseq_IDs', 'RPKM', 'reads', 'CAST_hits', 'C57_hits') for ti in range(len(exprt['IDs'])): if ti in AiD: ai = AiD[ti] cast = int(expra[s + '_castonly'][ai]) c57 = int(expra[s + '_c57only'][ai]) else: cast = 0 c57 = 0 rpkm = exprt[s][ti] reads = int(round(counts[s][ti])) symbol = exprt['symbols'][ti].replace('+', '|') ID = exprt['IDs'][ti].replace('+', '|') print >> outfh, dr_tools.join(symbol, ID, rpkm, reads, cast, c57)
# keep only the markers that some gene maps to (set built once for O(1) lookups)
mapped_markers = set(gene_to_marker.values())
marker_order = [m for m in marker_order if m in mapped_markers]

# per population: its marker values in marker_order, optionally shuffled
pop_cytof_pattern = dict()
for popi, pop in enumerate(header):
    pattern = [markers[m][popi] for m in marker_order]
    if o.shuffle_patterns:
        # shuffle works in place and returns None, so it must not be used as the value
        random.shuffle(pattern)
    pop_cytof_pattern[pop] = pattern

exprt = dr_tools.loadexpr(o.rpkmfile)
random.seed(0)

# marker -> (mean expression over all samples, index of the gene in exprt)
midexpr_symi_all_D = dict()
for symi, sym in enumerate(exprt['symbols']):
    if sym not in gene_to_marker:
        raise Exception(dr_tools.join(sym, 'sym'))
    if gene_to_marker[sym] not in markers:
        raise Exception(dr_tools.join(gene_to_marker[sym], 'cytof'))
    midexpr_symi_all_D[gene_to_marker[sym]] = (numpy.mean(
        [exprt[s][symi] for s in exprt.samples]), symi)
midexpr_symi_all = [midexpr_symi_all_D[m] for m in marker_order]
sym_order = [midexpr_symi_all_D[m][1] for m in marker_order]

pop_counts = dict((pop, 0) for pop in pop_cytof_pattern)
pop_samples = defaultdict(list)
for sample in exprt.samples:
    # expression of each marker gene relative to its mean over all samples
    relexpr = [
        exprt[sample][symi] / midall for midall, symi in midexpr_symi_all
    ]
for i, symbol in enumerate(expr['symbols']): if symbol in symbols_set: new_sample_values[name].append( sample_values[name][symbols_set[symbol]]) else: new_sample_values[name].append('0 0') sample_values = new_sample_values symbols = expr['symbols'] IDs = expr['IDs'] elif o.rpkmf_genes: raise Exception # write to file with open(o.outf, 'w') as outfh: print >> outfh, dr_tools.join( '#samples', ['%s_c57only\t%s_castonly' % (s, s) for s in sample_order]) print >> outfh, dr_tools.join('#allmappedreads', ['0\t0' for s in sample_order]) print >> outfh, dr_tools.join('#normalizationreads', ['0\t0' for s in sample_order]) print >> outfh, dr_tools.join('#arguments', ' '.join(sys.argv), 'time: ' + time.asctime()) for i in range(len(symbols)): #if IDs[i] == '0 0': # print symbols[i] # IDs[i] = 'NA' if o.noNA and IDs[i] == 'NA': continue try: print >> outfh, dr_tools.join( symbols[i], IDs[i], ['0\t0' for name in sample_order], [
# Filters a SNPs-per-gene table down to the SNPs whose second-to-last column
# in the SNP statistics file is below 0.9.
file1 = '/mnt/crick/rickards/projects/hsa_snp_calling/snp_stats_ac2.txt'
file2 = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/SNPs_per_gene.txt' # created by make_allelecalls.py -P using the -a and -s arguments
output = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/heterozygous_SNPs_per_gene.txt'

import dr_tools

# allowed SNPs, keyed as 'column0:column1' of the statistics file
positions = set()
for p in dr_tools.splitlines(file1):  # for each SNP line in the file
    if float(p[-2]) < 0.9:  # if second last column's value is <0.9
        positions.add('%s:%s' % (p[0], p[1]))  # add to allowed SNP list
print(len(positions))

c = 0  # total number of SNPs written
# the with-block closes the output even if a malformed row raises
with open(output, 'w') as outfh:
    for p in dr_tools.splitlines(file2):  # for each gene
        # keep the SNPs whose part before '|' is on the allowed list
        snps = [snpinfo for snpinfo in p[2].split(';')
                if snpinfo.split('|')[0] in positions]
        c += len(snps)
        outfh.write(dr_tools.join(p[0], len(snps), ';'.join(snps)) + '\n')  # output the SNPs for the gene
print(c)
# Filters a SNPs-per-gene table down to the SNPs whose second-to-last column
# in the SNP statistics file is below 0.9.
file1 = '/mnt/crick/rickards/projects/hsa_snp_calling/snp_stats_ac2.txt'
file2 = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/SNPs_per_gene.txt' # created by make_allelecalls.py -P using the -a and -s arguments
output = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/heterozygous_SNPs_per_gene.txt'

import dr_tools

# allowed SNPs, keyed as 'column0:column1' of the statistics file
positions = set()
for p in dr_tools.splitlines(file1):  # for each SNP line in the file
    if float(p[-2]) < 0.9:  # if second last column's value is <0.9
        positions.add('%s:%s' % (p[0], p[1]))  # add to allowed SNP list
print(len(positions))

c = 0  # total number of SNPs written
# the with-block closes the output even if a malformed row raises
with open(output, 'w') as outfh:
    for p in dr_tools.splitlines(file2):  # for each gene
        # keep the SNPs whose part before '|' is on the allowed list
        snps = [snpinfo for snpinfo in p[2].split(';')
                if snpinfo.split('|')[0] in positions]
        c += len(snps)
        outfh.write(dr_tools.join(p[0], len(snps), ';'.join(snps)) + '\n')  # output the SNPs for the gene
print(c)
o = parser.parse_args()

# one Sample per row of the cell-number table: running number (from 1),
# column 0, and column 1 as an int (None when that column is empty)
samples = []
for li, p in enumerate(dr_tools.splitlines(o.cellnumbertable)):
    cell_number = int(p[1]) if p[1] else None
    samples.append(Sample(str(li + 1), p[0], cell_number))

# attach the all-cells counts; the sample name is the file name cut at '_1.' and '.txt'
for csv_path in o.allcells_csv:
    filename = csv_path.split('/')[-1]
    name = filename.split('_1.')[0].split('.txt')[0]
    matching_samples = [s for s in samples if name in s.names]
    if len(matching_samples) > 1:
        raise Exception
    if not matching_samples:
        if o.vocal:
            print('%s %s' % (name, 'A'))
        continue
    matching_samples[0].cells_cytof_all = count_cells(csv_path)

# attach the B-cell counts; the sample name is the file name up to the first '.'
for csv_path in o.Bcells_csv:
    name = csv_path.split('/')[-1].split('.')[0]
    matching_samples = [s for s in samples if name in s.names]
    if len(matching_samples) > 1:
        raise Exception
    if not matching_samples:
        print('%s %s' % (name, 'B'))
        continue
    matching_samples[0].cells_cytof_B = count_cells(csv_path)

# report the estimate for every sample that has all the information it needs
for sample in samples:
    if sample.has_all_info():
        print(dr_tools.join(sample.names[0], sample.est_Bcells()))
    elif o.vocal:
        print('%s %s' % (sample.names, 'C'))
# Collect, per gene, the SNPs of o.snplist that fall inside one of its
# registered exon regions, then write one row per gene.
snps_per_gene = defaultdict(list)
for p in dr_tools.splitlines(o.snplist):
    # e.g. chr11 117883408 C A 0 1.00 -1.00 0.90 0.10 0.71 0.29
    if ',' in p[2] or ',' in p[3]:
        continue  # added 18 Dec, since snp_stats2.py -S removes these SNPs anyway
    chromosome = p[0]
    position = int(p[1]) - 1  # the region lookup is done one below the listed position
    overlapping = dr_tools.Cregion.overlappingpoint(chromosome, position)
    genes = set(exon.gene for exon in overlapping)
    snp_label = '%s:%s' % (p[0], p[1])
    if o.include_overlap:
        for gene in genes:
            snps_per_gene[gene].append(snp_label)
    elif len(genes) == 1:  # don't allow overlapping genes, exclude those SNPs
        gene = next(iter(genes))
        try:
            snps_per_gene[gene].append(snp_label)
        except:
            # show the offending row before propagating the error
            print(p)
            raise

with open(o.outfile, 'w') as outfh:
    for gene in snps_per_gene:
        snps = snps_per_gene[gene]
        outfh.write(dr_tools.join(gene, len(snps), ';'.join(sorted(snps))) + '\n')
def bam_to_windows(inbam):
    """Count the reads of one BAM file per gene and write the count tables.

    Each alignment is assigned to the annotated regions overlapping its
    midpoint on its strand (`betweenRE.overlappingpoint`).  Two files are
    written next to each other, their paths derived from `inbam` with the
    third-last path component replaced by `o.outdir`:
      * '<prefix>_tmpCount.txt': one row per alignment with read name, gene
        id ('NA' if unannotated), number of alignments of the read and
        number of those alignments that hit an annotated region.
      * '<prefix>_Count.txt': header rows '#samples', '#unannotatedmolc' and
        '#annotatedmolc' followed by one row per gene of `geneidlist` with
        its name, id and count.

    The sample name is the second-last path component of `inbam`.  Relies on
    the module-level objects o, betweenRE, coord2geneid, geneidlist and
    geneid2name.
    """
    p = inbam.split("/")
    samplename = p[-2]
    outbamTmp = "/".join(p[:-3] + [o.outdir] + p[-2:])
    count_prefix = ".".join(outbamTmp.split(".")[:-1])
    tempCountfile = count_prefix + "_tmpCount.txt"
    finalCountfile = count_prefix + "_Count.txt"

    # read name -> one list of overlapping regions per alignment of that read
    read2overlapCoords = defaultdict(list)
    inbamPysamObj = pysam.Samfile(inbam, "rb")
    try:
        for read in inbamPysamObj:
            readchr = inbamPysamObj.getrname(read.rname)
            strand = "-" if read.is_reverse else "+"
            midpos = (int(read.pos) + read.aend) // 2
            # retrieve list of overlapping coordinates
            overlap_list = betweenRE.overlappingpoint(readchr, midpos, strand)
            read2overlapCoords[read.qname].append(overlap_list)
    finally:
        inbamPysamObj.close()

    with open(tempCountfile, "w") as outfh:
        for read in read2overlapCoords:
            coordsList = read2overlapCoords[read]
            readCount = len(coordsList)  # never zero: every key has at least one alignment
            annotatedCount = readCount - coordsList.count([])
            for coord in coordsList:
                if len(coord) == 0:
                    geneid = "NA"
                else:
                    # Only the first overlapping region is used; counting
                    # further ones would double-count the alignment.
                    # coord2geneid is keyed by the string form of the region;
                    # looking up the region object itself gave a KeyError.
                    geneid = coord2geneid.get(str(coord[0]), 'NA')
                outfh.write(dr_tools.join(read, geneid, readCount, annotatedCount) + '\n')

    ## readCount, annotatedCount scenarios
    # 1, 1  unique map, annotated to single gene, counts as 1
    # 2, 1  multi map, annotated to single gene, count as 1, discard other alignment
    # n, n  multi map, annotated to two genes, count as 1/n
    # k, m  where k>m and m>1, multi map, annotated to multi genes, count 1/m, discard other alignment
    #
    # formula is always: count = 1/annotatedCount
    geneid2counts = {}
    unannotated_reads = set()
    with open(tempCountfile, "r") as infh:
        for line in infh:
            read, geneid, readCount, annotatedCount = line.split()
            # both counts are compared numerically below, so convert both
            readCount = int(readCount)
            annotatedCount = int(annotatedCount)
            if geneid not in geneid2counts:
                geneid2counts[geneid] = 0
            if annotatedCount > 0:
                # 1.0 keeps the fraction when true division is not in effect
                geneid2counts[geneid] += 1.0 / annotatedCount
            if annotatedCount < readCount and annotatedCount == 0:
                unannotated_reads.add(read)

    num_unannot = len(unannotated_reads)
    num_annot = 0
    for geneid in geneid2counts:
        if "P-cel" in geneid:
            continue
        if geneid == "NA":
            continue
        num_annot += geneid2counts[geneid]

    with open(finalCountfile, "w") as outfh2:
        outfh2.write(dr_tools.join("#samples", samplename) + '\n')
        outfh2.write(dr_tools.join("#unannotatedmolc", num_unannot) + '\n')
        outfh2.write(dr_tools.join("#annotatedmolc", num_annot) + '\n')
        for geneid in geneidlist:
            outfh2.write(dr_tools.join(geneid2name[geneid], geneid, geneid2counts.get(geneid, "0")) + '\n')
database_snps[coord] = SNPinfo(c57base, castbase) for filepath in o.cellsums_files: for p in dr_tools.splitlines(filepath): coord = '%s\t%s'%(p[0], p[1]) if coord not in database_snps: # strange, it should be (in database_snps) # unless o.snp_validatedbefore was used continue snp = database_snps[coord] snp.c57count += int(p[snp.c57index]) snp.castcount += int(p[snp.castindex]) ratios = [] with open(o.outfile, 'w') as outfh: for coord, snpinfo in database_snps.items(): reads = snpinfo.c57count+snpinfo.castcount if reads == 0: ratio = 0 else: ratio = snpinfo.c57count/reads if o.minratio <= ratio <= (1-o.minratio) and reads >= o.minreads_sum and min(snpinfo.c57count, snpinfo.castcount) >= o.minreads_allele: print >>outfh, dr_tools.join(coord, snpinfo.bases, '0', '1.00', '1.00', '1.00', '1.00', '1.00') ratios.append(ratio) if o.figure: import pylab step = 0.005 xarr, yarr = dr_tools.bin(ratios, -step, 1+step, step, 1) #yarr = [y/len(ratios) for y in yarr] pylab.plot(xarr, yarr, 'k-') pylab.savefig(o.figure)
set2 = set(entries2) set1_unique_c = len(set(entries1[sym] for sym in (set1-set2))) set2_unique_c = len(set(entries2[sym] for sym in (set2-set1))) common_c = len(set(entries1[sym] for sym in (set2&set1))) common_c2 = len(set(entries2[sym] for sym in (set2&set1))) if not common_c == common_c2: raise Exception saygenes = [] for genes in set(entries2[sym] for sym in (set2&set1)): saygenes.append(';'.join(list(genes))) return set1_unique_c, common_c, set2_unique_c, ', '.join(saygenes) if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-A', '--annotationfile', default='/mnt/crick/danielr/Xandclones_BR/BR_fibroblasts/snp-call/more_formats/mm9_ensembl_refseq_norandom_11Apr2012_genesymbols.txt') parser.add_argument('-a', '--set1', required=True) parser.add_argument('-b', '--set2', required=True) parser.add_argument('-ge', '--disallowedgenes', nargs='+') o = parser.parse_args() if o.disallowedgenes: disallowedgenes = set() for filename in o.disallowedgenes: disallowedgenes.update(set(dr_tools.loadlist(filename))) else: disallowedgenes = None ID_to_symbol = dict((p[1], p[12]) for p in dr_tools.splitlines(o.annotationfile) if disallowedgenes is None or p[12] not in disallowedgenes) print dr_tools.join(overlap_of_2(load_geneset(ID_to_symbol, o.set1), load_geneset(ID_to_symbol, o.set2))) print len(set(ID_to_symbol.values()))
# comes when len(V1)!=len(V2), maybe because some samples have zero cells meeting the filter requirements? print >>sys.stderr, combo stat = float('nan') p = float('nan') try:stat = float(stat) # to deal with 1-element array values except TypeError: pass n = len(V1) if o.output_summary_values: n = repr(zip(V1, V2)) else: transformed_x, clean_y = zip(*[(distfunc(x),y) for x,y in zip(locations_x, locations_y) if not math.isnan(x) and not math.isnan(distfunc(x))]) stat, p = correlation(transformed_x, clean_y) n = len(clean_y) if str(p) == 'nan': #print combo, len(V1), len(V2), V1[:10], V2[:10] continue pvals.append(PVal(p, dr_tools.join(comboname, distname, str(stat), abundance, n))) # different from _v4: n if isinstance(stat, tuple): pvals[-1].r = numpy.mean(stat) else: pvals[-1].r = stat # false discovery rate and output for test_inst, q in zip(pvals, dr_tools.globalFDR([test_inst.p for test_inst in pvals])): test_inst.q = q for test_inst in sorted(pvals, key=lambda obj: (obj.p, -abs(obj.r)), reverse=False): if test_inst.q < o.maxq or o.maxq >= 1: print test_inst if o.saycombinedP: print 'combined P:', dr_tools.combinedP([test_inst.p for test_inst in pvals])
yarr = [y - mid_y for y in yarr] locations_x.extend(xarr) locations_y.extend(yarr) patient_by_sample.extend([patient for x in xarr]) if o.shuffle: random.shuffle(locations_y) table[comboname] = locations_y table['time_from_baseline_months'] = locations_x table['CMM_ID'] = patient_by_sample if o.shuffle_name: random.shuffle(table['CMM_ID']) column_order.append(comboname) print dr_tools.join(column_order) transposed_table = zip(*[table[c] for c in column_order]) last_patient = '' for patient in set_order_patients: if patient == last_patient: print dr_tools.join(patient) else: for row in transposed_table: if row[0] == patient: print dr_tools.join(row) break else: print patient last_patient = patient