示例#1
0
def _write_group_profile(group_name, profile, table_out, script_out):
    """Write one feature group's profile to the table and to the R script.

    *profile* is indexed by relative position; positions are emitted in
    sorted order. One tab-separated row per position goes to *table_out* and
    a single ``<group_name>_y=c(...)`` R vector goes to *script_out*.

    Returns the number of positions written.
    """
    positions = sorted(profile)
    for pos in positions:
        print('\t'.join([group_name, str(pos), str(profile[pos])]),
              file=table_out)
    print('%s_y=c(%s)' %
          (group_name, ','.join([str(profile[pos]) for pos in positions])),
          file=script_out)
    return len(positions)


def _run_rscript(script_path):
    """Run ``Rscript`` on *script_path* and report any failure on stderr.

    The command is passed as an argument list (no shell), so a path that
    contains spaces or shell metacharacters is handed to Rscript verbatim.
    Failure to render the plot is best-effort: it is reported, not raised.
    """
    try:
        status = subprocess.call(["Rscript", script_path])
    except OSError as err:
        # Raised when the Rscript executable cannot be started at all.
        print("Cannot generate pdf file from " + script_path +
              " (%s)" % err,
              file=sys.stderr)
        return
    if status != 0:
        print("Cannot generate pdf file from " + script_path +
              " (Rscript exit status %d)" % status,
              file=sys.stderr)


def main():
    """Command-line entry point: profile CpG signal across gene features.

    Parses ``-i/-r/-d/-u/-o``; exits with status 101, 102 or 103 (after
    printing the module docstring and the option help) when the input file,
    the gene model or the output prefix is missing.

    For each of eight feature groups (upstream, 5' UTR, coding exons, first /
    internal / last introns, 3' UTR, downstream) the regions are taken from
    the gene model and passed to ``coverage_over_range`` together with the
    CpG data. The resulting profiles are written to ``<prefix>.txt`` and to
    an R script ``<prefix>.r``, which is then run with ``Rscript`` to draw
    ``<prefix>.pdf``.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input-file",
        action="store",
        type="string",
        dest="input_file",
        help=
        "BED6+ file specifying the C position. This BED file should have at least 6 columns (Chrom, ChromStart, ChromeEnd, Name, Beta_value, Strand). BED6+ file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url."
    )
    parser.add_option(
        "-r",
        "--refgene",
        action="store",
        type="string",
        dest="gene_file",
        help=
        "Reference gene model in standard BED12 format (https://genome.ucsc.edu/FAQ/FAQformat.html#format1). \"Strand\" column must exist in order to decide 5' and 3' UTRs, up- and down-stream intergenic regions."
    )
    parser.add_option(
        "-d",
        "--downstream",
        action="store",
        type="int",
        dest="downstream_size",
        default=2000,
        help=
        "Size of down-stream genomic region added to gene. default=%default (bp)"
    )
    parser.add_option(
        "-u",
        "--upstream",
        action="store",
        type="int",
        dest="upstream_size",
        default=2000,
        help=
        "Size of up-stream genomic region added to gene. default=%default (bp)"
    )
    parser.add_option("-o",
                      "--output",
                      action="store",
                      type='string',
                      dest="out_file",
                      help="Prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    # Each mandatory option has its own exit status.
    required = ((options.input_file, 101), (options.gene_file, 102),
                (options.out_file, 103))
    for value, exit_code in required:
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    table_path = options.out_file + '.txt'
    script_path = options.out_file + '.r'

    # The context manager guarantees both handles are closed even when a
    # reader or the gene-model parser raises half-way through.
    with open(table_path, 'w') as FOUT, open(script_path, 'w') as ROUT:
        print("\t".join(
            ["Group", "Relative_position(5'->3')", "Average_beta"]),
              file=FOUT)

        #step1: read CpG file
        printlog("Reading CpG file: \"%s\"" % (options.input_file))
        cpg_ranges = read_CpG_bed(options.input_file)

        #step2: read gene file
        printlog("Reading reference gene model: \"%s\"" % (options.gene_file))
        ref_gene = BED.ParseBED(options.gene_file)

        # (group name, progress message, region extractor), in the 5'->3'
        # order used on the x axis of the plot. Extractors are deferred so
        # each region set is built right after its progress message.
        groups = (
            ('Upstream_intergenic', "Process upstream regions ...",
             lambda: ref_gene.getIntergenic(direction='up',
                                            size=options.upstream_size)),
            ('Five_prime_UTR', "Process 5' UTR exons ...",
             lambda: ref_gene.getUTRs(utr=5)),
            ('Coding_exon', "Process Coding exons ...",
             lambda: ref_gene.getCDSExons()),
            ('First_intron', "Process first introns ...",
             lambda: ref_gene.getIntrons(itype='first')),
            ('Internal_intron', "Process internal introns ...",
             lambda: ref_gene.getIntrons(itype='internal')),
            ('Last_intron', "Process last introns ...",
             lambda: ref_gene.getIntrons(itype='last')),
            ('Three_prime_UTR', "Process 3' UTR exons ...",
             lambda: ref_gene.getUTRs(utr=3)),
            ('Downstream_intergenic', "Process downstream regions ...",
             lambda: ref_gene.getIntergenic(direction='down',
                                            size=options.downstream_size)),
        )

        group_sizes = []  #number of datapoints in each group
        for group_name, message, extract_regions in groups:
            printlog(message)
            profile = coverage_over_range(extract_regions(), cpg_ranges)
            group_sizes.append(
                _write_group_profile(group_name, profile, FOUT, ROUT))

        # Cumulative sizes of the first 1..7 groups: the x positions where
        # one group ends and the next begins.
        boundaries = [
            sum(group_sizes[:k]) for k in range(1, len(group_sizes))
        ]

        print('\n')
        print('pdf(file=\"%s\", width=10, height=5)' %
              (options.out_file + '.pdf'),
              file=ROUT)
        print(
            'plot(1:%d, c(%s),ylim=c(0,1), xaxt="n",xlab="", ylab="Average methylation", type="l", col="red")'
            % (sum(group_sizes),
               ', '.join([group[0] + '_y' for group in groups])),
            file=ROUT)
        # NOTE(review): these fixed separators only line up when every group
        # has about 101 points; the computed separators on the next line look
        # like their replacement. Kept so the plot is unchanged -- confirm
        # before removing.
        print(
            'abline(v = c(100,201,302,403,504,605,706),col="blue", lty="dashed")',
            file=ROUT)
        print('abline(v = c(%s),col="blue", lty="dashed")' %
              ','.join([str(b) for b in boundaries]),
              file=ROUT)
        print('abline(h = 0.5,col="grey", lty="dashed")', file=ROUT)
        # Every label uses an escaped "\\n" so the R source gets the two
        # characters backslash-n rather than a raw line break inside the
        # string literal.
        print(
            'text(x=c(%d,%d,%d,%d,%d,%d,%d, %d)+50, y=0.9, cex=0.7, labels=c("Upstream\\n(5\'->3\')", "5\'UTR exon\\n(5\'->3\')","Coding exon\\n(5\'->3\')","First intron\\n(5\'->3\')","Internal intron\\n(5\'->3\')","Last intron\\n(5\'->3\')", "3\'UTR exon\\n(5\'->3\')","Downstream\\n(5\'->3\')"))'
            % tuple([0] + boundaries),
            file=ROUT)
        print('dev.off()', file=ROUT)

    # Both files are closed (and flushed) here, before R reads the script.
    _run_rscript(script_path)
示例#2
0
def _cpg_per_kb(count, size):
    """Return the CpG count per kilobase, or 0.0 when *size* is zero.

    A region class can end up with no bases at all (nothing extracted, or
    everything removed by the priority subtraction); dividing by its size
    would otherwise abort the whole run with ZeroDivisionError.
    """
    if not size:
        return 0.0
    return count * 1000.0 / size


def _subtract_all(regions, higher_priority):
    """Subtract each region set in *higher_priority* from *regions*, in order."""
    for other in higher_priority:
        regions = BED.subtractBed3(regions, other)
    return regions


def _run_rscript(script_path):
    """Run ``Rscript`` on *script_path* and report any failure on stderr.

    The command is passed as an argument list (no shell), so a path that
    contains spaces or shell metacharacters is handed to Rscript verbatim.
    Failure to render the plot is best-effort: it is reported, not raised.
    """
    try:
        status = subprocess.call(["Rscript", script_path])
    except OSError as err:
        # Raised when the Rscript executable cannot be started at all.
        print("Cannot generate pdf file from " + script_path +
              " (%s)" % err,
              file=sys.stderr)
        return
    if status != 0:
        print("Cannot generate pdf file from " + script_path +
              " (Rscript exit status %d)" % status,
              file=sys.stderr)


def main():
    """Command-line entry point: CpG density per gene-feature class.

    Parses ``-i/-r/-d/-u/-o``; exits with status 101, 102 or 103 (after
    printing the module docstring and the option help) when the CpG file,
    the gene model or the output prefix is missing.

    Five region classes are taken from the gene model in priority order:
    coding exons, UTR exons, introns, upstream and downstream regions. Each
    class is merged, has every higher-priority class subtracted from it, and
    is passed to ``count_over_range``. One row per class is written to
    ``<prefix>.tsv`` and an R script ``<prefix>.r`` is generated and run with
    ``Rscript`` to draw ``<prefix>.pdf``.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="BED file specifying the C position. This BED file should have at least three columns (Chrom, ChromStart, ChromeEnd).  Note: the first base in a chromosome is numbered 0. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="gene_file", help="Reference gene model in standard BED-12 format (https://genome.ucsc.edu/FAQ/FAQformat.html#format1). ")
    parser.add_option("-d", "--downstream", action="store", type="int", dest="downstream_size", default=2000, help="Size of down-stream intergenic region w.r.t. TES (transcription end site). default=%default (bp)")
    parser.add_option("-u", "--upstream", action="store", type="int", dest="upstream_size", default=2000, help="Size of up-stream intergenic region w.r.t. TSS (transcription start site). default=%default (bp)")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()

    # Each mandatory option has its own exit status.
    required = ((options.input_file, 101), (options.gene_file, 102),
                (options.out_file, 103))
    for value, exit_code in required:
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    table_path = options.out_file + '.tsv'
    script_path = options.out_file + '.r'

    # The context manager guarantees both handles are closed even when a
    # reader or one of the BED operations raises half-way through.
    with open(table_path, 'w') as FOUT, open(script_path, 'w') as ROUT:
        #step1: read CpG file
        printlog("Reading CpG file: \"%s\"" % (options.input_file))
        cpg_ranges = read_CpG_bed(options.input_file)

        #step2: read gene file
        printlog("Reading reference gene model: \"%s\"" % (options.gene_file))
        ref_gene = BED.ParseBED(options.gene_file)

        # First entry is the header row; data rows follow in priority order.
        result = [("Priority_order", "Name", "Number_of_regions", "Size_of_regions(bp)", "CpG_raw_count", "CpG_count_per_KB")]

        #priority order: #1
        printlog("Extract Coding exons ...")
        cds_exons = ref_gene.getCDSExons(stranded=False)
        printlog("Merge Coding exons ...")
        cds_exons = BED.unionBed3(cds_exons)
        printlog("Count CpGs in Coding exons ...")
        (size, count) = count_over_range(cds_exons, cpg_ranges)
        result.append(['0', 'Coding exons', len(cds_exons), size, count, _cpg_per_kb(count, size)])

        #priority order: #2
        printlog("Extract UTR exons ...")
        utr_exons = ref_gene.getUTRs(utr=35, uniquify=True, stranded=False)
        printlog("Merge UTR exons ...")
        utr_exons = BED.unionBed3(utr_exons)
        printlog("Subtract regions with higher priority from UTR exons ...")
        # Bases of UTR exons that overlap coding exons are removed.
        utr_exons = _subtract_all(utr_exons, [cds_exons])
        printlog("Count CpGs in UTR exons ...")
        (size, count) = count_over_range(utr_exons, cpg_ranges)
        result.append(['1', 'UTR exons', len(utr_exons), size, count, _cpg_per_kb(count, size)])

        #priority order: #3
        printlog("Extract introns ...")
        introns = ref_gene.getIntrons(itype='all', uniquify=True, stranded=False)
        printlog("Merge introns ...")
        introns = BED.unionBed3(introns)
        printlog("Subtract regions with higher priority from introns ...")
        introns = _subtract_all(introns, [cds_exons, utr_exons])
        printlog("Count CpGs in introns ...")
        (size, count) = count_over_range(introns, cpg_ranges)
        result.append(['2', 'Introns', len(introns), size, count, _cpg_per_kb(count, size)])

        #priority order: #4
        printlog("Extract upstream intergenic regions ...")
        upstream = ref_gene.getIntergenic(direction='up', size=options.upstream_size, uniquify=True, stranded=False)
        printlog("Merge upstream intergenic regions ...")
        upstream = BED.unionBed3(upstream)
        printlog("Subtract regions with higher priority from upstream intergenic regions...")
        upstream = _subtract_all(upstream, [cds_exons, utr_exons, introns])
        printlog("Count CpGs in upstream regions ...")
        (size, count) = count_over_range(upstream, cpg_ranges)
        result.append(['3', 'Upstream of TSS', len(upstream), size, count, _cpg_per_kb(count, size)])

        #priority order: #5
        printlog("Extract downstream intergenic regions ...")
        downstream = ref_gene.getIntergenic(direction='down', size=options.downstream_size, uniquify=True, stranded=False)
        printlog("Merge downstream intergenic regions ...")
        downstream = BED.unionBed3(downstream)
        printlog("Subtract regions with higher priority from downstream intergenic regions...")
        downstream = _subtract_all(downstream, [cds_exons, utr_exons, introns, upstream])
        printlog("Count CpGs in downstream regions ...")
        (size, count) = count_over_range(downstream, cpg_ranges)
        result.append(['4', 'Downstream of TES', len(downstream), size, count, _cpg_per_kb(count, size)])

        print('\n')
        for row in result:
            print('\t'.join([str(field) for field in row]), file=FOUT)

        # Everything after the header row feeds the R script.
        classes = result[1:]
        print("name = c(%s)" % ','.join(['"' + row[0] + '"' for row in classes]), file=ROUT)
        print("values = c(%s)" % ','.join([str(row[5]) for row in classes]), file=ROUT)
        print('pdf("%s", width=8, height=6)' % (options.out_file + '.pdf'), file=ROUT)
        print('layout(matrix(c(1,1,2,1,1,2), nrow=2, byrow=TRUE))', file=ROUT)
        print('barplot(values,names.arg=name,col=c(%s),ylab="CpG per Kb")' % ','.join(colors(5)), file=ROUT)
        print("plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')", file=ROUT)
        # Legend panel: one "<priority> = <class name>" line per class,
        # stacked downwards from the top of the empty plot.
        for row in classes:
            name, label = row[0], row[1]
            x_pos = 0.0
            y_pos = 1 - (int(name) * 20.0 + 5) / 100
            print("text(x=%f, y=%f, labels=c(\"%s = %s\"),adj=c(0,0))" % (x_pos, y_pos, name, label), file=ROUT)
        print('dev.off()', file=ROUT)

    # Both files are closed (and flushed) here, before R reads the script.
    printlog("Running R script ...")
    _run_rscript(script_path)
示例#3
0
def _cpg_per_kb(count, size):
    """Return the CpG count per kilobase, or 0.0 when *size* is zero.

    A BED file can end up with no bases at all (empty file, or everything
    removed by the priority subtraction); dividing by its size would
    otherwise abort the whole run with ZeroDivisionError.
    """
    if not size:
        return 0.0
    return count * 1000.0 / size


def _run_rscript(script_path):
    """Run ``Rscript`` on *script_path* and report any failure on stderr.

    The command is passed as an argument list (no shell), so a path that
    contains spaces or shell metacharacters is handed to Rscript verbatim.
    Failure to render the plot is best-effort: it is reported, not raised.
    """
    try:
        status = subprocess.call(["Rscript", script_path])
    except OSError as err:
        # Raised when the Rscript executable cannot be started at all.
        print("Cannot generate pdf file from " + script_path +
              " (%s)" % err,
              file=sys.stderr)
        return
    if status != 0:
        print("Cannot generate pdf file from " + script_path +
              " (Rscript exit status %d)" % status,
              file=sys.stderr)


def main():
    """Command-line entry point: CpG density over user-supplied BED files.

    Parses ``-i/-b/-o``. Exits (after printing the module docstring and the
    option help) with status 101 when the CpG file or the BED list is
    missing and 102 when the output prefix is missing; exits with 103 when a
    listed BED file does not exist.

    The comma-separated BED list is processed in the order given, which is
    also the priority order: each file is merged, has every earlier file's
    regions subtracted from it, and is passed to ``count_over_range``. One
    row per file is written to ``<prefix>.txt`` and an R script
    ``<prefix>.r`` is generated and run with ``Rscript`` to draw
    ``<prefix>.pdf``.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--cpg",
        action="store",
        type="string",
        dest="cpg_file",
        help=
        "BED file specifying the C position. This BED file should have at least three columns (Chrom, ChromStart, ChromeEnd).  Note: the first base in a chromosome is numbered 0. This file can be a regular text file or compressed file (.gz, .bz2)."
    )
    parser.add_option("-b",
                      "--bed",
                      action="store",
                      type="string",
                      dest="bed_files",
                      help="List of BED files specifying the genomic regions.")
    parser.add_option("-o",
                      "--output",
                      action="store",
                      type='string',
                      dest="out_file",
                      help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()

    # Exit statuses are kept exactly as they were (note that the first two
    # options share 101) so existing wrappers keep working.
    required = ((options.cpg_file, 101), (options.bed_files, 101),
                (options.out_file, 102))
    for value, exit_code in required:
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    # Validate the BED list before creating any output or reading the CpG
    # file, so a mistyped path fails immediately and leaves nothing behind.
    printlog("Checking BED files: \"%s\"" % (options.bed_files))
    input_bed_files = options.bed_files.replace(' ', '').split(',')
    for bed_path in input_bed_files:
        if os.path.exists(bed_path):
            print("\t%s" % bed_path, file=sys.stderr)
        else:
            print("\"%s\" does not exist!" % bed_path, file=sys.stderr)
            sys.exit(103)

    table_path = options.out_file + '.txt'
    script_path = options.out_file + '.r'

    # The context manager guarantees both handles are closed even when a
    # reader or one of the BED operations raises half-way through.
    with open(table_path, 'w') as FOUT, open(script_path, 'w') as ROUT:
        printlog("Reading CpG file: \"%s\"" % (options.cpg_file))
        cpg_ranges = read_CpG_bed(options.cpg_file)

        # First entry is the header row; data rows follow in priority order.
        result = [("Priority_order", "Name", "Number_of_regions",
                   "Size_of_regions(bp)", "CpG_raw_count", "CpG_count_per_KB")]

        # Region sets of the files handled so far, each already reduced by
        # the files before it. For the first file there is nothing to
        # subtract, so it needs no special case.
        processed = []
        for priority, bed_path in enumerate(input_bed_files):
            printlog("Reading BED file: \"%s\"" % (bed_path))
            raw_regions = read_bed_as_list(bed_path)
            printlog("Merging overlap entries in BED file: \"%s\"" %
                     (bed_path))
            regions = BED.unionBed3(raw_regions)

            for earlier_path, earlier_regions in zip(input_bed_files,
                                                     processed):
                printlog("Subtract \"%s\" from \"%s\"" %
                         (earlier_path, bed_path))
                regions = BED.subtractBed3(regions, earlier_regions)
            processed.append(regions)

            printlog("Counting CpGs ...")
            (size, count) = count_over_range(regions, cpg_ranges)
            #Class, number_of_region, size_of_region, CpG_raw_count, CpG_count_perKb
            result.append([
                str(priority),
                os.path.basename(bed_path),
                len(regions), size, count,
                _cpg_per_kb(count, size)
            ])

        print('\n')
        for row in result:
            print('\t'.join([str(field) for field in row]), file=FOUT)

        # Everything after the header row feeds the R script.
        classes = result[1:]
        print("name = c(%s)" %
              ','.join(['"' + row[0] + '"' for row in classes]),
              file=ROUT)
        print("values = c(%s)" % ','.join([str(row[5]) for row in classes]),
              file=ROUT)
        print('pdf("%s", width=8, height=6)' % (options.out_file + '.pdf'),
              file=ROUT)
        print('layout(matrix(c(1,1,2,1,1,2), nrow=2, byrow=TRUE))', file=ROUT)
        print('barplot(values,names.arg=name,col=c(%s),ylab="CpG per Kb")' %
              ','.join(colors(len(input_bed_files))),
              file=ROUT)
        print(
            "plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')",
            file=ROUT)
        # Legend panel: one "<priority> = <file name>" line per BED file,
        # stacked downwards from the top of the empty plot.
        for row in classes:
            name, label = row[0], row[1]
            x_pos = 0.0
            y_pos = 1 - (int(name) * 20.0 + 5) / 200
            print("text(x=%f, y=%f, labels=c(\"%s = %s\"),adj=c(0,0))" %
                  (x_pos, y_pos, name, label),
                  file=ROUT)
        print('dev.off()', file=ROUT)

    # Both files are closed (and flushed) here, before R reads the script.
    printlog("Running R script ...")
    _run_rscript(script_path)