def parse_gene_ref(ref_gene):
    """Load a tab-delimited gene reference table, grouped by chromosome.

    ref_gene may be either a path to the table or any iterable of lines
    (for example an open file).  A path is opened here and closed again
    before returning; an iterable supplied by the caller is left open.

    Returns a defaultdict(list) mapping each 'chrom' value to the list of
    row dictionaries for that chromosome.  Every column is passed through
    parse_number; the 'exonStarts' and 'exonEnds' columns become lists of
    parsed values.
    """
    #FIXME - maybe, if galaxy doesn't work out, figure out how to deal with
    # multiple RefGene mapping formats?
    fieldnames = ['geneName', 'name', 'chrom', 'strand', 'txStart', 'txEnd',
                  'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds']
    list_fields = ('exonStarts', 'exonEnds')

    # DictReader needs an iterable of lines; handing it a bare path string
    # would make it iterate over the characters of the path instead
    opened_here = isinstance(ref_gene, (str, type(u'')))
    if opened_here:
        lines = open(ref_gene)
    else:
        lines = ref_gene

    gene_ref = dd(list)
    try:
        reader = DictReader(lines, fieldnames=fieldnames, delimiter='\t')
        for ref_dict in reader:
            # iterate over a snapshot because the values are replaced in place
            for k, v in list(ref_dict.items()):
                if k in list_fields:
                    # turn 'x,x,x,...' into a list.  The raw text is split
                    # before any numeric coercion so that a lone value such
                    # as '100' is still treated as a one-item list, and the
                    # empty items (e.g. from a trailing comma) are dropped.
                    # A missing column (None on short rows) gives [].
                    ref_dict[k] = [parse_number(x)
                                   for x in (v or '').split(',')
                                   if x != '']
                else:
                    # coerce numbers where possible
                    ref_dict[k] = parse_number(v)

            gene_ref[ref_dict['chrom']].append(ref_dict)
    finally:
        if opened_here:
            lines.close()

    return gene_ref
extrasaction='ignore', lineterminator='\n') peaks_writer.writerow(dict([(k, k) for k in output_fields])) unique_genes = set() map_stats = dd(int) for peak in peaks_reader: # if this is a comment or header line get skip it if peak[fieldnames[0]].startswith('#') or \ peak[fieldnames[0]] == fieldnames[0] or \ peak[fieldnames[0]].startswith('track') : continue # coerce values to numeric if possible for k, v in peak.items(): peak[k] = parse_number(v) # MACS output gives us summit if opts.peaks_fmt == 'MACS': peak_loc = peak[start_field] + peak['summit'] else: # peak assumed to be in the middle of the reported peak range peak_loc = (peak[start_field] + peak[end_field]) / 2 chrom_genes = gene_ref[peak[chr_field]] if len(chrom_genes) == 0: sys.stderr.write( 'WARNING: peak chromosome %s not found in gene reference, skipping: %s\n' % (peak[chr_field], peak)) continue
symbol_xref_map[rec['kgID']] = rec output_fields = ['knownGeneID','geneSymbol']+fieldnames peaks_writer = DictWriter(peak_output,output_fields,delimiter='\t',extrasaction='ignore',lineterminator='\n') peaks_writer.writerow(dict([(k,k) for k in output_fields])) unique_genes = set() map_stats = dd(int) for peak in peaks_reader : # if this is a comment or header line get skip it if peak[fieldnames[0]].startswith('#') or \ peak[fieldnames[0]] == fieldnames[0] or \ peak[fieldnames[0]].startswith('track') : continue # coerce values to numeric if possible for k,v in peak.items() : peak[k] = parse_number(v) # MACS output gives us summit if opts.peaks_fmt == 'MACS' : peak_loc = peak[start_field]+peak['summit'] else : # peak assumed to be in the middle of the reported peak range peak_loc = (peak[start_field]+peak[end_field])/2 chrom_genes = gene_ref[peak[chr_field]] if len(chrom_genes) == 0 : sys.stderr.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak)) continue mapped = False
print '\nParsing %d rows from peak file and will provide update every %d rows'%(totalrows,interval) for peak in peaks_reader : rowcount+=1 if rowcount % interval ==0: print 'Processing row %d out of %d...'%(rowcount,totalrows) # if this is a comment or header line get skip it #removed 'startswith' call so that this can work with tuples if peak[fieldnames[0]][0]=='#' or \ peak[fieldnames[0]] == fieldnames[0] or \ peak[fieldnames[0]][0]=='track' : continue # coerce values to numeric if possible for k,v in peak.items() : peak[k] = parse_number(v) # MACS output gives us summit if opts.peaks_fmt == 'MACS' : peak_loc = int(peak[start_field])+int(peak['summit']) elif opts.peaks_fmt == 'GPS' : #get position and also add in a real window ch,mid,tot = peak['Position']##reader already parses this into tuple peak['Position']=tot peak_loc = int(mid) peak[chr_field] = ch peak[start_field] = peak_loc-125 peak[end_field] = peak_loc+125 else : # peak assumed to be in the middle of the reported peak range peak_loc = (int(peak[start_field])+int(peak[end_field]))/2
def main():
    """Map every peak in a peaks file onto the known genes it falls in or near.

    The three positional command-line arguments are, in order, the gene
    reference file, the kgXref gene-symbol cross-reference file and the peaks
    file.  For each peak a single location is computed (start + summit for
    MACS, otherwise the midpoint of the reported range) and compared with the
    promoter window, gene body and downstream window of every gene on the
    same chromosome.  One row is written per (peak, gene) match; when
    opts.intergenic is set, a peak that matches no gene is written once with
    map type 'intergenic'.  Rows go to opts.peak_output if given, otherwise
    to stdout.
    """
    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) < 3:
        parser.error('Must provide three filename arguments')

    gene_ref = parse_gene_ref(args[0])
    xref_fn = args[1]
    peaks_fn = args[2]

    if opts.peaks_fmt == 'MACS':
        peaks_reader_cls = MACSFile
        chr_field, start_field, end_field = 'chr', 'start', 'end'
    elif opts.peaks_fmt == 'BED':
        peaks_reader_cls = BEDFile
        chr_field, start_field, end_field = 'chrom', 'chromStart', 'chromEnd'
    else:
        # should never happen; report it as a usage error rather than
        # failing below on an unbound peaks_reader_cls
        parser.error('Unsupported peaks format: %s' % opts.peaks_fmt)

    peaks_reader = peaks_reader_cls(peaks_fn)

    # work on a copy: extending FIELD_NAMES itself would change the list
    # shared with the reader (and with any later reader of the same class)
    fieldnames = list(peaks_reader.FIELD_NAMES)
    if opts.detail:
        fieldnames += ["peak loc", "dist from feature", "map type", "map subtype"]
    output_fields = ['knownGeneID'] + fieldnames

    # see if the user wants gene symbols too
    # TODO - actually make this an option, or make it required
    opts.symbol_xref = xref_fn
    if opts.symbol_xref:
        kgXref_fieldnames = ['kgID', 'mRNA', 'spID', 'spDisplayID',
                             'geneSymbol', 'refseq', 'protAcc', 'description']
        symbol_xref_map = {}
        with open(opts.symbol_xref) as xref_file:
            symbol_xref_reader = DictReader(xref_file,
                                            fieldnames=kgXref_fieldnames,
                                            delimiter='\t')
            for rec in symbol_xref_reader:
                symbol_xref_map[rec['kgID']] = rec
        output_fields = ['knownGeneID', 'geneSymbol'] + fieldnames

    # default output format:
    if opts.peak_output:
        peak_output = open(opts.peak_output, 'w')
    else:
        peak_output = sys.stdout

    try:
        peaks_writer = DictWriter(peak_output, output_fields, delimiter='\t',
                                  extrasaction='ignore', lineterminator='\n')
        # header row
        peaks_writer.writerow(dict(zip(output_fields, output_fields)))

        # per-category tallies; collected here but not reported by this function
        map_stats = dd(int)

        for peak in peaks_reader:

            # if this is a comment or header line skip it
            first_value = peak[fieldnames[0]]
            if first_value.startswith(('#', 'track')) or \
               first_value == fieldnames[0]:
                continue

            # coerce values to numeric if possible
            for k, v in list(peak.items()):
                peak[k] = parse_number(v)

            # MACS output gives us summit
            if opts.peaks_fmt == 'MACS':
                peak_loc = peak[start_field] + peak['summit']
            else:
                # peak assumed to be in the middle of the reported peak range
                # NOTE(review): '/' truncates for ints under Python 2 only
                peak_loc = (peak[start_field] + peak[end_field]) / 2

            chrom_genes = gene_ref[peak[chr_field]]

            if not chrom_genes:
                # warnings go to stderr so they never end up inside the
                # tab-delimited output when that is written to stdout
                sys.stderr.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n' % (peak[chr_field], peak))
                continue

            mapped = False

            # walk through the genes for this chromosome
            for gene in chrom_genes:

                # output row: every column defaults to 0, then takes the
                # peak's own values
                out_d = dict.fromkeys(output_fields, 0)
                out_d.update(peak)
                out_d['map type'] = ''
                out_d['chromo'] = peak[chr_field]
                out_d['peak loc'] = peak_loc

                tx_start, tx_end = gene['txStart'], gene['txEnd']
                plus_strand = gene['strand'] == '+'

                # determine intervals for promoter, gene, and downstream
                # NOTE(review): the opts.tss windows are reproduced exactly
                # as found (including upst_win on the minus-strand gene
                # window) - confirm they are intended
                if plus_strand:
                    promoter_coords = max(tx_start - 1 - opts.upst_win, 0), tx_start - 1
                    if opts.tss:
                        gene_coords = tx_start, min(tx_end, tx_start + opts.dnst_win)
                        downstream_coords = tx_end + 1, tx_start + opts.dnst_win
                    else:
                        gene_coords = tx_start, tx_end
                        downstream_coords = tx_end + 1, tx_end + 1 + opts.dnst_win
                else:
                    # +1 because we're using 1 based indexing
                    promoter_coords = tx_end + 1, tx_end + 1 + opts.upst_win
                    if opts.tss:
                        gene_coords = max(tx_start, tx_end - opts.upst_win), tx_end
                        # -1 because we're using 1 based indexing
                        downstream_coords = tx_end - 1 - opts.dnst_win, tx_start - 1
                    else:
                        gene_coords = tx_start, tx_end
                        # -1 because we're using 1 based indexing
                        downstream_coords = tx_start - 1 - opts.dnst_win, tx_start - 1

                # check for promoter
                if promoter_coords[0] <= peak_loc <= promoter_coords[1]:
                    out_d['map type'] = 'promoter'
                    if plus_strand:
                        out_d['dist from feature'] = peak_loc - promoter_coords[1]
                    else:
                        out_d['dist from feature'] = promoter_coords[0] - peak_loc

                # check for gene
                elif gene_coords[0] <= peak_loc <= gene_coords[1]:
                    # check for intron/exon
                    in_exon = any(st <= peak_loc <= en
                                  for st, en in zip(gene['exonStarts'], gene['exonEnds']))
                    out_d['map type'] = 'gene'
                    out_d['map subtype'] = 'exon' if in_exon else 'intron'

                    # The fractional position-in-gene score was dropped to
                    # keep the score reported in the bed file - AJD 7/29/14

                    # distance is measured from the promoter-side edge, i.e.
                    # relative to the start of the gene
                    if plus_strand:
                        out_d['dist from feature'] = peak_loc - promoter_coords[1]
                    else:
                        out_d['dist from feature'] = promoter_coords[0] - peak_loc

                    map_stats[out_d['map subtype']] += 1

                # check for downstream
                elif downstream_coords[0] <= peak_loc <= downstream_coords[1]:
                    out_d['map type'] = 'after'
                    if opts.tss:
                        if plus_strand:
                            out_d['dist from feature'] = peak_loc - gene_coords[0]
                        else:
                            out_d['dist from feature'] = gene_coords[1] - peak_loc
                    else:
                        if plus_strand:
                            out_d['dist from feature'] = peak_loc - downstream_coords[0]
                        else:
                            out_d['dist from feature'] = downstream_coords[1] - peak_loc

                # map type is not blank if we mapped to something
                if out_d['map type'] != '':
                    out_d['knownGeneID'] = gene['name']
                    if opts.symbol_xref:
                        out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol']
                    peaks_writer.writerow(out_d)
                    mapped = True

            if not mapped:
                if opts.intergenic:
                    # reuse the row built for the last gene examined; none of
                    # its mapping columns were filled in because nothing matched
                    out_d['knownGeneID'] = 'None'
                    out_d['geneSymbol'] = 'None'
                    out_d['map type'] = 'intergenic'
                    peaks_writer.writerow(out_d)
                map_stats['intergenic'] += 1
    finally:
        # close the output even if a row fails part-way through
        if peak_output is not sys.stdout:
            peak_output.close()