Exemplo n.º 1
0
def parse_gene_ref(ref_gene) :
    """Read a tab-delimited gene reference and group its records by chromosome.

    ref_gene may be an iterable of lines (e.g. an open file), which is handed
    straight to csv.DictReader, or a filename, which is opened for reading and
    closed again once parsing is done.

    Each row is turned into a dictionary keyed by the column names listed
    below.  Scalar columns are passed through parse_number; the comma
    separated 'exonStarts' and 'exonEnds' columns become lists with one
    parse_number result per non-empty entry.

    Returns a defaultdict(list) mapping each 'chrom' value to the list of
    record dictionaries for that chromosome, in file order.
    """
    #FIXME - maybe, if galaxy doesn't work out, figure out how to deal with multiple RefGene mapping formats?

    # a filename would otherwise be iterated character by character by
    # DictReader, so open it here and parse the resulting file object
    if isinstance(ref_gene, (str, type(u''))) :
        with open(ref_gene) as ref_gene_file :
            return parse_gene_ref(ref_gene_file)

    fieldnames = ['geneName','name','chrom','strand','txStart','txEnd','cdsStart','cdsEnd','exonCount','exonStarts','exonEnds']
    list_fields = ('exonStarts','exonEnds')
    reader = DictReader(ref_gene,fieldnames=fieldnames,delimiter='\t')
    gene_ref = dd(list)
    for ref_dict in reader :
        # iterate over a snapshot because the values are replaced in place
        for k,v in list(ref_dict.items()) :
            if k in list_fields :
                # turn 'x,x,x,...' into a list; split the raw text *before*
                # any numeric coercion so a single value without a trailing
                # comma is still treated as a one element list, and drop the
                # empty entries a trailing comma produces
                ref_dict[k] = [parse_number(x) for x in v.split(',') if x != '']
            else :
                # coerce numbers where possible
                ref_dict[k] = parse_number(v)

        gene_ref[ref_dict['chrom']].append(ref_dict)

    return gene_ref
Exemplo n.º 2
0
                              extrasaction='ignore',
                              lineterminator='\n')
    peaks_writer.writerow(dict([(k, k) for k in output_fields]))
    unique_genes = set()
    map_stats = dd(int)
    for peak in peaks_reader:

        # if this is a comment or header line get skip it
        if peak[fieldnames[0]].startswith('#') or \
           peak[fieldnames[0]] == fieldnames[0] or \
           peak[fieldnames[0]].startswith('track') :
            continue

        # coerce values to numeric if possible
        for k, v in peak.items():
            peak[k] = parse_number(v)

        # MACS output gives us summit
        if opts.peaks_fmt == 'MACS':
            peak_loc = peak[start_field] + peak['summit']
        else:  # peak assumed to be in the middle of the reported peak range
            peak_loc = (peak[start_field] + peak[end_field]) / 2

        chrom_genes = gene_ref[peak[chr_field]]

        if len(chrom_genes) == 0:
            sys.stderr.write(
                'WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'
                % (peak[chr_field], peak))
            continue
            symbol_xref_map[rec['kgID']] = rec
        output_fields = ['knownGeneID','geneSymbol']+fieldnames

    peaks_writer = DictWriter(peak_output,output_fields,delimiter='\t',extrasaction='ignore',lineterminator='\n')
    peaks_writer.writerow(dict([(k,k) for k in output_fields]))
    unique_genes = set()
    map_stats = dd(int)
    for peak in peaks_reader :

        # if this is a comment or header line get skip it
        if peak[fieldnames[0]].startswith('#') or \
           peak[fieldnames[0]] == fieldnames[0] or \
           peak[fieldnames[0]].startswith('track') : continue

        # coerce values to numeric if possible
        for k,v in peak.items() : peak[k] = parse_number(v)

        # MACS output gives us summit
        if opts.peaks_fmt == 'MACS' :
            peak_loc = peak[start_field]+peak['summit']
        else : # peak assumed to be in the middle of the reported peak range
            peak_loc = (peak[start_field]+peak[end_field])/2

        chrom_genes = gene_ref[peak[chr_field]]

        if len(chrom_genes) == 0 :
            sys.stderr.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n'%(peak[chr_field],peak))
            continue

        mapped = False
Exemplo n.º 4
0
    print '\nParsing %d rows from peak file and will provide update every %d rows'%(totalrows,interval)
    
    for peak in peaks_reader :
        rowcount+=1
        if rowcount % interval ==0:
            print 'Processing row %d out of %d...'%(rowcount,totalrows)
                                                
        # if this is a comment or header line get skip it
        #removed 'startswith' call so that this can work with tuples
        if peak[fieldnames[0]][0]=='#' or \
           peak[fieldnames[0]] == fieldnames[0] or \
           peak[fieldnames[0]][0]=='track' : continue

        # coerce values to numeric if possible
        for k,v in peak.items() : peak[k] = parse_number(v)

        # MACS output gives us summit
        if opts.peaks_fmt == 'MACS' :
            peak_loc = int(peak[start_field])+int(peak['summit'])
        elif opts.peaks_fmt == 'GPS' :
            #get position and also add in a real window
            ch,mid,tot = peak['Position']##reader already parses this into tuple
            peak['Position']=tot
            peak_loc = int(mid)
            peak[chr_field] = ch
            peak[start_field] = peak_loc-125
            peak[end_field] = peak_loc+125
        else : # peak assumed to be in the middle of the reported peak range
            peak_loc = (int(peak[start_field])+int(peak[end_field]))/2
def _feature_windows(gene, opts):
    """Return the (promoter, gene, downstream) coordinate windows for a gene.

    Each window is a (low, high) tuple computed from the gene's 'strand',
    'txStart' and 'txEnd' together with opts.upst_win, opts.dnst_win and
    opts.tss.
    """
    if gene['strand'] == '+':
        promoter_coords = max(gene['txStart']-1-opts.upst_win, 0), gene['txStart']-1
        if opts.tss:
            gene_coords = gene['txStart'], min(gene['txEnd'], gene['txStart']+opts.dnst_win)
            downstream_coords = gene['txEnd']+1, gene['txStart']+opts.dnst_win
        else:
            gene_coords = gene['txStart'], gene['txEnd']
            downstream_coords = gene['txEnd']+1, gene['txEnd']+1+opts.dnst_win
    else:
        # +1 / -1 below because we're using 1 based indexing
        promoter_coords = gene['txEnd']+1, gene['txEnd']+1+opts.upst_win
        if opts.tss:
            # NOTE(review): this branch is not a mirror image of the '+' strand
            # TSS branch (it uses upst_win for the gene window and starts the
            # downstream window at txEnd-1-dnst_win); kept exactly as it was -
            # confirm that this is intended.
            gene_coords = max(gene['txStart'], gene['txEnd']-opts.upst_win), gene['txEnd']
            downstream_coords = gene['txEnd']-1-opts.dnst_win, gene['txStart']-1
        else:
            gene_coords = gene['txStart'], gene['txEnd']
            downstream_coords = gene['txStart']-1-opts.dnst_win, gene['txStart']-1
    return promoter_coords, gene_coords, downstream_coords


def main():
    """Map every peak in a peaks file onto the genes of a gene reference.

    Command line arguments (after the options handled by the module level
    parser): gene reference filename, kgXref filename, peaks filename.

    For each peak a location is chosen (start + summit for MACS input, the
    midpoint of the reported range otherwise) and compared against the
    promoter, gene and downstream windows of every gene on the same
    chromosome.  One tab-delimited row is written per peak/gene hit; peaks
    that hit nothing are written as 'intergenic' rows when opts.intergenic is
    set.  Rows go to opts.peak_output if given, otherwise to standard output.
    """
    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) < 3:
        parser.error('Must provide three filename arguments')

    gene_ref = parse_gene_ref(args[0])
    xref_fn = args[1]
    peaks_fn = args[2]

    if opts.peaks_fmt == 'MACS':
        peaks_reader_cls = MACSFile
        chr_field, start_field, end_field = 'chr', 'start', 'end'
    elif opts.peaks_fmt == 'BED':
        peaks_reader_cls = BEDFile
        chr_field, start_field, end_field = 'chrom', 'chromStart', 'chromEnd'
    else:
        # should never happen, but stop with a usage error rather than with a
        # NameError on the reader class further down
        parser.error('Unsupported peaks format: %s' % opts.peaks_fmt)

    peaks_reader = peaks_reader_cls(peaks_fn)

    # work on a copy so the reader's FIELD_NAMES list is not extended in place
    fieldnames = list(peaks_reader.FIELD_NAMES)
    if opts.detail:
        fieldnames += ["peak loc", "dist from feature", "map type", "map subtype"]
    output_fields = ['knownGeneID'] + fieldnames

    # see if the user wants gene symbols too
    # TODO - actually make this an option, or make it required
    opts.symbol_xref = xref_fn
    if opts.symbol_xref:
        kgXref_fieldnames = ['kgID', 'mRNA', 'spID', 'spDisplayID', 'geneSymbol', 'refseq', 'protAcc', 'description']
        symbol_xref_map = {}
        with open(opts.symbol_xref) as xref_file:
            for rec in DictReader(xref_file, fieldnames=kgXref_fieldnames, delimiter='\t'):
                symbol_xref_map[rec['kgID']] = rec
        output_fields = ['knownGeneID', 'geneSymbol'] + fieldnames

    # default output format:
    if opts.peak_output:
        peak_output = open(opts.peak_output, 'w')
    else:
        peak_output = sys.stdout

    try:
        peaks_writer = DictWriter(peak_output, output_fields, delimiter='\t',
                                  extrasaction='ignore', lineterminator='\n')
        # header row
        peaks_writer.writerow(dict((k, k) for k in output_fields))

        # bookkeeping only; neither is read anywhere in the visible function
        unique_genes = set()
        map_stats = dd(int)

        for peak in peaks_reader:

            # if this is a comment or header line skip it
            first_value = peak[fieldnames[0]]
            if first_value.startswith(('#', 'track')) or first_value == fieldnames[0]:
                continue

            # coerce values to numeric if possible (snapshot of the items
            # because the values are replaced while looping)
            for k, v in list(peak.items()):
                peak[k] = parse_number(v)

            # MACS output gives us summit
            if opts.peaks_fmt == 'MACS':
                peak_loc = peak[start_field] + peak['summit']
            else:
                # peak assumed to be in the middle of the reported peak range;
                # floor division keeps the location an integer on Python 3 too
                peak_loc = (peak[start_field] + peak[end_field]) // 2

            chrom_genes = gene_ref[peak[chr_field]]

            if not chrom_genes:
                # warnings go to stderr: stdout may be carrying the data rows
                sys.stderr.write('WARNING: peak chromosome %s not found in gene reference, skipping: %s\n' % (peak[chr_field], peak))
                continue

            # row template shared by every output row for this peak
            base_row = dict.fromkeys(output_fields, 0)
            base_row.update(peak)
            base_row['map type'] = ''
            base_row['chromo'] = peak[chr_field]
            base_row['peak loc'] = peak_loc

            mapped = False

            # walk through the genes for this chromosome
            for gene in chrom_genes:
                out_d = dict(base_row)
                plus_strand = gene['strand'] == '+'

                # determine intervals for promoter, gene, and downstream
                promoter_coords, gene_coords, downstream_coords = _feature_windows(gene, opts)

                # check for promoter
                if promoter_coords[0] <= peak_loc <= promoter_coords[1]:
                    out_d['map type'] = 'promoter'
                    out_d['dist from feature'] = peak_loc - promoter_coords[1] if plus_strand else promoter_coords[0] - peak_loc

                # check for gene
                elif gene_coords[0] <= peak_loc <= gene_coords[1]:
                    # check for intron/exon
                    in_exon = any(st <= peak_loc <= en
                                  for st, en in zip(gene['exonStarts'], gene['exonEnds']))
                    out_d['map type'] = 'gene'
                    out_d['map subtype'] = 'exon' if in_exon else 'intron'

                    # distance calculated from start of gene
                    out_d['dist from feature'] = peak_loc - promoter_coords[1] if plus_strand else promoter_coords[0] - peak_loc

                    map_stats[out_d['map subtype']] += 1

                # check for downstream
                elif downstream_coords[0] <= peak_loc <= downstream_coords[1]:
                    out_d['map type'] = 'after'
                    if opts.tss:
                        out_d['dist from feature'] = peak_loc - gene_coords[0] if plus_strand else gene_coords[1] - peak_loc
                    else:
                        out_d['dist from feature'] = peak_loc - downstream_coords[0] if plus_strand else downstream_coords[1] - peak_loc

                # does not map to this gene
                else:
                    continue

                out_d['knownGeneID'] = gene['name']
                if opts.symbol_xref:
                    out_d['geneSymbol'] = symbol_xref_map[gene['name']]['geneSymbol']
                peaks_writer.writerow(out_d)
                mapped = True

            if not mapped:
                if opts.intergenic:
                    out_d = dict(base_row)
                    out_d['knownGeneID'] = 'None'
                    out_d['geneSymbol'] = 'None'
                    out_d['map type'] = 'intergenic'
                    peaks_writer.writerow(out_d)
                map_stats['intergenic'] += 1
    finally:
        # close the output even when mapping fails part way through
        if peak_output is not sys.stdout:
            peak_output.close()