def main(argv):
    """Command-line entry point that compares two summary graph files.

    Parses the options, splits both summary graph files per chromosome with
    SeparateByChrom.separateByChrom, hands the split files to
    generate_all_functions_2 and finally removes the split files again.

    Args:
        argv: list of command-line tokens passed straight to
            OptionParser.parse_args.

    Exits with status 1 (after printing the usage text) when any of the six
    options is missing, and with status 1 when the species is not a key of
    species_chroms.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a", "--summary_graph_file1", action="store",
                      type="string", dest="bedfile1", metavar="<file>",
                      help="summary graph file 1 in bed format")
    parser.add_option("-b", "--summary_graph_file2", action="store",
                      type="string", dest="bedfile2", metavar="<file>",
                      help="summary graph file 2 in bed format")
    parser.add_option("-i", "--windows_size", action="store", type="int",
                      dest="window_size", metavar="<int>",
                      help="window size in summary graph file")
    parser.add_option("-d", "--data_resolution", action="store", type="int",
                      dest="step", metavar="<int>",
                      help="distance between data points, must be integer times of window size")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="output file extension")

    (opt, args) = parser.parse_args(argv)

    # Every option is consumed by the calls below, so all six must be present.
    # Counting argv tokens (the former "len(argv) < 8" test) accepted calls
    # that supplied only four of the six options; checking the parsed values
    # catches any missing option regardless of how it was spelled.
    required = (opt.species, opt.bedfile1, opt.bedfile2,
                opt.window_size, opt.step, opt.out_file)
    if any(value is None for value in required):
        parser.print_help()
        sys.exit(1)

    if opt.species not in species_chroms:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chrom_lengths = species_chrom_lengths[opt.species]

    # NOTE(review): only 'chr1' is processed although the species defines a
    # full chromosome list; this is kept exactly as found -- confirm whether
    # the restriction is intentional before widening it.
    chroms = ['chr1']
    SeparateByChrom.separateByChrom(chroms, opt.bedfile1, '.bed1')
    SeparateByChrom.separateByChrom(chroms, opt.bedfile2, '.bed2')
    generate_all_functions_2('.bed1', '.bed2', chroms, opt.window_size,
                             opt.step, opt.out_file, chrom_lengths)
    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')
def main(argv):
    """
    Command-line driver: split a raw BED file by chromosome, run
    strand_broken_remove on every chromosome that produced a '.bed1' file,
    and merge the per-chromosome '.bed2' files (presumably written by
    strand_broken_remove) into the requested output file.

    argv is the list of command-line tokens given to OptionParser.parse_args.
    Prints the usage text and exits with status 1 when fewer than 8 tokens
    are supplied; exits with status 1 when the species is not listed in
    GenomeData.species_chroms.
    """
    # (flags, keyword settings) for each option, registered in this order.
    option_table = [
        (("-s", "--species"),
         dict(action="store", type="string", dest="species",
              help="species under consideration", metavar="<str>")),
        (("-b", "--raw_bed_file"),
         dict(action="store", type="string", dest="bed_file",
              help="raw bed file", metavar="<file>")),
        (("-t", "--threshold"),
         dict(action="store", type="int", dest="threshold",
              help="threshold for copy number", metavar="<int>")),
        (("-o", "--output_file_name"),
         dict(action="store", type="string", dest="out_file",
              help="output file name", metavar="<file>")),
    ]
    parser = OptionParser()
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    opt, _positional = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    if opt.species not in GenomeData.species_chroms:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = GenomeData.species_chroms[opt.species]

    SeparateByChrom.separateByChrom(chroms, opt.bed_file, '.bed1')

    for chrom in chroms:
        # Chromosomes without reads have no split file; nothing to do.
        if not Utility.fileExists(chrom + ".bed1"):
            continue
        strand_broken_remove(chrom, opt.threshold)

    SeparateByChrom.combineAllGraphFiles(chroms, '.bed2', opt.out_file)
    for temp_extension in ('.bed1', '.bed2'):
        SeparateByChrom.cleanup(chroms, temp_extension)
def main(argv):
    """
    Entry point for filtering a raw BED file chromosome by chromosome.

    The BED file is split per chromosome into '<chrom>.bed1' files,
    strand_broken_remove is invoked with the copy-number threshold for each
    chromosome whose split file exists, the '<chrom>.bed2' files are combined
    into the output file, and both sets of intermediate files are removed.

    argv: command-line tokens forwarded to OptionParser.parse_args. The
    process exits with status 1 when fewer than 8 tokens are given or when
    the species is not a key of GenomeData.species_chroms.
    """
    split_extension = '.bed1'
    result_extension = '.bed2'

    parser = OptionParser()
    parser.add_option("-s", "--species",
                      action="store", type="string", dest="species",
                      help="species under consideration", metavar="<str>")
    parser.add_option("-b", "--raw_bed_file",
                      action="store", type="string", dest="bed_file",
                      help="raw bed file", metavar="<file>")
    parser.add_option("-t", "--threshold",
                      action="store", type="int", dest="threshold",
                      help="threshold for copy number", metavar="<int>")
    parser.add_option("-o", "--output_file_name",
                      action="store", type="string", dest="out_file",
                      help="output file name", metavar="<file>")

    options, _extra = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    known_species = GenomeData.species_chroms
    if opt_species_missing(options.species, known_species) if False else options.species not in known_species:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = known_species[options.species]

    SeparateByChrom.separateByChrom(chroms, options.bed_file, split_extension)

    for chrom in chroms:
        has_reads = Utility.fileExists(chrom + ".bed1")
        if has_reads:
            strand_broken_remove(chrom, options.threshold)

    SeparateByChrom.combineAllGraphFiles(chroms, result_extension,
                                         options.out_file)
    SeparateByChrom.cleanup(chroms, split_extension)
    SeparateByChrom.cleanup(chroms, result_extension)
def find_windows_on_islands(species, summary_graph_file, islands_file, window_size, out_file, window_read_count_threshold=0):
    """Collect, per chromosome, the summary-graph windows selected by
    filter_out_uncovered_windows for the given islands.

    Both input files are split per chromosome; for every chromosome that has
    both a summary-graph file and an island file, the two are loaded with
    BED.BED (format "BED_GRAPH") and passed to filter_out_uncovered_windows.

    Args:
        species: key into species_chroms; also forwarded to BED.BED.
        summary_graph_file: path of the summary graph file.
        islands_file: path of the islands file.
        window_size: forwarded to filter_out_uncovered_windows.
        out_file: path of the tab-separated output file; nothing is written
            when it is "" (or None).
        window_read_count_threshold: only items whose ``value`` is at least
            this are written to out_file. The returned dict is NOT filtered
            by this threshold.

    Returns:
        dict mapping chromosome name to whatever
        filter_out_uncovered_windows returned for it.
    """
    summary_graph_extension = ".summarygraph"
    island_extension = ".islands"
    chroms = species_chroms[species]
    SeparateByChrom.separateByChrom(chroms, summary_graph_file, summary_graph_extension)
    SeparateByChrom.separateByChrom(chroms, islands_file, island_extension)

    windows_on_island = {}
    try:
        for chrom in chroms:
            if Utility.fileExists(chrom + summary_graph_extension) and Utility.fileExists(chrom + island_extension):
                summary = BED.BED(species, chrom + summary_graph_extension, "BED_GRAPH", 0)
                islands = BED.BED(species, chrom + island_extension, "BED_GRAPH", 0)
                windows_on_island[chrom] = filter_out_uncovered_windows(islands[chrom], summary[chrom], window_size)

        # A falsy out_file ("" as before, and now also None) means the caller
        # only wants the returned dict.
        if out_file:
            # The context manager closes the file even if a write fails.
            with open(out_file, 'w') as f:
                for chrom in chroms:
                    if chrom not in windows_on_island:
                        continue
                    for item in windows_on_island[chrom]:
                        if item.value >= window_read_count_threshold:
                            f.write(item.chrom + '\t' + str(item.start) + '\t' + str(item.end) + '\t' + str(item.value) + '\n')
    finally:
        # Remove the per-chromosome split files even when processing failed,
        # so a crash does not leave them in the working directory.
        SeparateByChrom.cleanup(chroms, summary_graph_extension)
        SeparateByChrom.cleanup(chroms, island_extension)
    return windows_on_island
def main(argv):
    """
    Build a summary graph file from a BED file of reads.

    The BED file is split per chromosome ('.bed'), makeGraphFile is run on
    the split files, the per-chromosome '.graph' files are combined into the
    output file and the intermediate files are removed.

    Note: -w/--window_size and -i/--fragment_size are declared with
    type="int", so optparse hands them over as integers.

    argv: command-line tokens forwarded to OptionParser.parse_args. With
    fewer than 10 tokens the usage text is printed and the process exits with
    status 1. An unknown species only prints a message; no exit status is set.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="mm8,hg18,dm2,etc",
                      metavar="<str>")
    parser.add_option("-b", "--bed_file", action="store", type="string",
                      dest="bedfile", help="bed file to make graph file of",
                      metavar="<file>")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size", help="window size",
                      metavar="<int>")
    parser.add_option("-i", "--fragment_size", action="store", type="int",
                      dest="fragment_size",
                      help="size of fragments after CHIP experiment",
                      metavar="<int>")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", help="output bed summary file name",
                      metavar="<file>")

    opt, _positional = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    # Guard clause: report an unknown species and fall out of the function.
    if opt.species not in GenomeData.species_chroms:
        print(opt.species + " is not in the species list ")
        return

    chroms = GenomeData.species_chroms[opt.species]
    chrom_lengths = GenomeData.species_chrom_lengths[opt.species]

    SeparateByChrom.separateByChrom(chroms, opt.bedfile, ".bed")
    makeGraphFile(chroms, chrom_lengths, opt.window_size, opt.fragment_size)
    SeparateByChrom.combineAllGraphFiles(chroms, ".graph", opt.outfile)

    for temp_extension in (".bed", ".graph"):
        SeparateByChrom.cleanup(chroms, temp_extension)
def main(argv):
    """
    Build a summary graph file from a BAM file.

    The chromosome mapping comes from SeparateByChrom.getChromsFromBam, the
    BAM file is split into per-chromosome '.bed' files, makeGraphFile is run
    on them, the '.graph' files are combined into the output file and the
    intermediate files are removed.

    -w/--window_size and -i/--fragment_size are declared with type="int", so
    optparse delivers them as integers. -s/--species is still accepted but is
    not read here: there is no argument-count check and no species lookup in
    GenomeData, because the chromosomes are taken from the BAM file instead.

    argv: command-line tokens forwarded to OptionParser.parse_args.
    """
    # (flags, keyword settings) for each option, registered in this order.
    option_table = (
        (("-s", "--species"),
         dict(action="store", type="string", dest="species",
              help="mm8,hg18,dm2,etc", metavar="<str>")),
        (("-b", "--bed_file"),
         dict(action="store", type="string", dest="bamfile",
              help="bed file to make graph file of", metavar="<file>")),
        (("-w", "--window_size"),
         dict(action="store", type="int", dest="window_size",
              help="window size", metavar="<int>")),
        (("-i", "--fragment_size"),
         dict(action="store", type="int", dest="fragment_size",
              help="size of fragments after CHIP experiment",
              metavar="<int>")),
        (("-o", "--outfile"),
         dict(action="store", type="string", dest="outfile",
              help="output bed summary file name", metavar="<file>")),
    )
    parser = OptionParser()
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    opt, _positional = parser.parse_args(argv)

    # The mapping itself is handed to makeGraphFile in the position where the
    # BED-based driver passes chromosome lengths -- presumably name -> length.
    chrom_info = SeparateByChrom.getChromsFromBam(opt.bamfile)

    SeparateByChrom.separateByChromBamToBed(chrom_info.keys(), opt.bamfile,
                                            '.bed')
    makeGraphFile(chrom_info.keys(), chrom_info, opt.window_size,
                  opt.fragment_size)
    SeparateByChrom.combineAllGraphFiles(chrom_info.keys(), ".graph",
                                         opt.outfile)

    for temp_extension in (".bed", ".graph"):
        SeparateByChrom.cleanup(chrom_info.keys(), temp_extension)
示例#7
0
def find_windows_on_islands(species,
                            summary_graph_file,
                            islands_file,
                            window_size,
                            out_file,
                            window_read_count_threshold=0):
    """Collect, per chromosome, the summary-graph windows selected by
    filter_out_uncovered_windows for the given islands.

    Both input files are split per chromosome; for every chromosome that has
    both a summary-graph file and an island file, the two are loaded with
    BED.BED (format "BED_GRAPH") and passed to filter_out_uncovered_windows.

    Args:
        species: key into species_chroms; also forwarded to BED.BED.
        summary_graph_file: path of the summary graph file.
        islands_file: path of the islands file.
        window_size: forwarded to filter_out_uncovered_windows.
        out_file: path of the tab-separated output file; nothing is written
            when it is "" (or None).
        window_read_count_threshold: only items whose ``value`` is at least
            this are written to out_file. The returned dict is NOT filtered
            by this threshold.

    Returns:
        dict mapping chromosome name to whatever
        filter_out_uncovered_windows returned for it.
    """
    summary_graph_extension = ".summarygraph"
    island_extension = ".islands"
    chroms = species_chroms[species]
    SeparateByChrom.separateByChrom(chroms, summary_graph_file,
                                    summary_graph_extension)
    SeparateByChrom.separateByChrom(chroms, islands_file, island_extension)

    windows_on_island = {}
    try:
        for chrom in chroms:
            summary_path = chrom + summary_graph_extension
            island_path = chrom + island_extension
            if Utility.fileExists(summary_path) and Utility.fileExists(
                    island_path):
                summary = BED.BED(species, summary_path, "BED_GRAPH", 0)
                islands = BED.BED(species, island_path, "BED_GRAPH", 0)
                windows_on_island[chrom] = filter_out_uncovered_windows(
                    islands[chrom], summary[chrom], window_size)

        # A falsy out_file ("" as before, and now also None) means the caller
        # only wants the returned dict.
        if out_file:
            # The context manager closes the file even if a write fails.
            with open(out_file, 'w') as f:
                for chrom in chroms:
                    if chrom not in windows_on_island:
                        continue
                    for item in windows_on_island[chrom]:
                        if item.value >= window_read_count_threshold:
                            f.write(item.chrom + '\t' + str(item.start) +
                                    '\t' + str(item.end) + '\t' +
                                    str(item.value) + '\n')
    finally:
        # Remove the per-chromosome split files even when processing failed,
        # so a crash does not leave them in the working directory.
        SeparateByChrom.cleanup(chroms, summary_graph_extension)
        SeparateByChrom.cleanup(chroms, island_extension)
    return windows_on_island
def main(argv):
    """Command-line entry point: write the summary-graph windows selected by
    filter_out_uncovered_windows for the given islands.

    The summary graph file and the island file are split per chromosome. For
    each chromosome that has both split files, they are loaded with BED.BED
    (format "BED_GRAPH") and passed to filter_out_uncovered_windows; every
    returned item is written to the output file as a tab-separated line of
    chrom, start, end and value.

    Args:
        argv: list of command-line tokens passed to OptionParser.parse_args.

    Exits with status 1 (after printing the usage text) when fewer than 10
    tokens are given, and with status 1 when the species is not a key of
    species_chroms.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a", "--summarygraphfile", action="store",
                      type="string", dest="bedfile", metavar="<file>",
                      help="summary graph file")
    parser.add_option("-b", "--islandfile", action="store", type="string",
                      dest="islandbedfile", metavar="<file>",
                      help="island file")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="filtered summary graph file")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size",
                      help="window size of summary graph", metavar="<int>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species not in species_chroms:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = species_chroms[opt.species]

    summary_graph_extension = ".summarygraph"
    island_extension = ".islands"
    SeparateByChrom.separateByChrom(chroms, opt.bedfile, summary_graph_extension)
    SeparateByChrom.separateByChrom(chroms, opt.islandbedfile, island_extension)

    try:
        # The context manager closes the output file even if loading or
        # filtering a chromosome raises part-way through.
        with open(opt.out_file, 'w') as f:
            for chrom in chroms:
                if Utility.fileExists(chrom + summary_graph_extension) and Utility.fileExists(chrom + island_extension):
                    summary = BED.BED(opt.species, chrom + summary_graph_extension, "BED_GRAPH", 0)
                    islands = BED.BED(opt.species, chrom + island_extension, "BED_GRAPH", 0)
                    result = filter_out_uncovered_windows(islands[chrom], summary[chrom], opt.window_size)
                    for item in result:
                        f.write(item.chrom + '\t' + str(item.start) + '\t' + str(item.end) + '\t' + str(item.value) + '\n')
    finally:
        # Always remove the per-chromosome split files, including on failure.
        SeparateByChrom.cleanup(chroms, summary_graph_extension)
        SeparateByChrom.cleanup(chroms, island_extension)
def main(argv):
    """
    Command-line driver that filters a raw BED file of tags by islands.

    The island file is loaded with BED.BED (format "BED3"), the raw BED file
    is split per chromosome into '.bed1' files, filter_tags_by_islands is run
    with the fragment size, the per-chromosome '_filtered.bed1' files
    (presumably written by filter_tags_by_islands) are combined into the
    output file, and both sets of intermediate files are removed.

    argv: command-line tokens forwarded to OptionParser.parse_args. The
    process exits with status 1 when fewer than 10 tokens are given or when
    the species is not a key of species_chroms.
    """
    # (flags, keyword settings) for each option, registered in this order.
    option_table = [
        (("-s", "--species"),
         dict(action="store", type="string", dest="species",
              help="species, mm8, hg18", metavar="<str>")),
        (("-a", "--rawbedfile"),
         dict(action="store", type="string", dest="bedfile",
              metavar="<file>", help="raw data file in bed format")),
        (("-i", "--fragment_size"),
         dict(action="store", type="int", dest="fragment_size",
              metavar="<int>",
              help="average size of a fragment after CHIP experiment")),
        (("-b", "--islandfile"),
         dict(action="store", type="string", dest="islandbedfile",
              metavar="<file>", help="island file")),
        (("-o", "--outfile"),
         dict(action="store", type="string", dest="out_file",
              metavar="<file>", help="filtered raw bed file")),
    ]
    parser = OptionParser()
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    opt, _positional = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species not in species_chroms:
        print("This species is not recognized, exiting")
        sys.exit(1)
    chroms = species_chroms[opt.species]

    split_extension = '.bed1'
    filtered_extension = '_filtered.bed1'

    islands = BED.BED(opt.species, opt.islandbedfile, "BED3", 0)
    SeparateByChrom.separateByChrom(chroms, opt.bedfile, split_extension)
    filter_tags_by_islands(chroms, islands, opt.fragment_size)
    SeparateByChrom.combineAllGraphFiles(chroms, filtered_extension,
                                         opt.out_file)
    SeparateByChrom.cleanup(chroms, split_extension)
    SeparateByChrom.cleanup(chroms, filtered_extension)
def main(argv):
    """
    Build a summary graph file from a BED file of reads.

    The BED file is split per chromosome ('.bed'), makeGraphFile is run on
    the split files, the per-chromosome '.graph' files are combined into the
    output file and the intermediate files are removed.

    Note: -w/--window_size and -i/--fragment_size are declared with
    type="int", so optparse hands them over as integers (not strings).

    Args:
        argv: list of command-line tokens passed to OptionParser.parse_args.

    With fewer than 10 tokens the usage text is printed and the process exits
    with status 1. An unknown species only prints a message; no exit status
    is set.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="mm8,hg18,dm2,etc", metavar="<str>")
    parser.add_option("-b", "--bed_file", action="store", type="string",
                      dest="bedfile", help="bed file to make graph file of",
                      metavar="<file>")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size", help="window size", metavar="<int>")
    parser.add_option("-i", "--fragment_size", action="store", type="int",
                      dest="fragment_size",
                      help="size of fragments after CHIP experiment",
                      metavar="<int>")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", help="output bed summary file name",
                      metavar="<file>")
    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    # The body below is indented with spaces only. Three statements used to
    # start with a tab among space-indented siblings, which only lines up
    # when a tab is read as 8 columns and is rejected by "python -tt" and by
    # Python 3.
    if opt.species in GenomeData.species_chroms:
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
        SeparateByChrom.separateByChrom(chroms, opt.bedfile, ".bed")
        makeGraphFile(chroms, chrom_lengths, opt.window_size, opt.fragment_size)
        SeparateByChrom.combineAllGraphFiles(chroms, ".graph", opt.outfile)
        SeparateByChrom.cleanup(chroms, ".bed")
        SeparateByChrom.cleanup(chroms, ".graph")
    else:
        print(opt.species + " is not in the species list ")
示例#11
0
def main(argv):
    """
    Build a summary graph file from a BAM file.

    Chromosome information is obtained from SeparateByChrom.getChromsFromBam,
    the BAM file is split into per-chromosome '.bed' files, makeGraphFile is
    run on them, the '.graph' files are merged into the output file and the
    intermediate files are deleted.

    -w/--window_size and -i/--fragment_size are declared with type="int", so
    optparse delivers them as integers. -s/--species is accepted but never
    read here; no argument-count check or GenomeData species lookup is
    performed because the chromosomes are taken from the BAM file.

    argv: command-line tokens forwarded to OptionParser.parse_args.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="mm8,hg18,dm2,etc",
                      metavar="<str>")
    parser.add_option("-b", "--bed_file", action="store", type="string",
                      dest="bamfile", help="bed file to make graph file of",
                      metavar="<file>")
    parser.add_option("-w", "--window_size", action="store", type="int",
                      dest="window_size", help="window size",
                      metavar="<int>")
    parser.add_option("-i", "--fragment_size", action="store", type="int",
                      dest="fragment_size",
                      help="size of fragments after CHIP experiment",
                      metavar="<int>")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", help="output bed summary file name",
                      metavar="<file>")

    options, _extra = parser.parse_args(argv)

    bed_extension = ".bed"
    graph_extension = ".graph"

    # The mapping is also passed whole to makeGraphFile, in the slot where the
    # BED-based driver passes chromosome lengths -- presumably name -> length.
    bam_chroms = SeparateByChrom.getChromsFromBam(options.bamfile)

    SeparateByChrom.separateByChromBamToBed(bam_chroms.keys(),
                                            options.bamfile, '.bed')
    makeGraphFile(bam_chroms.keys(), bam_chroms, options.window_size,
                  options.fragment_size)
    SeparateByChrom.combineAllGraphFiles(bam_chroms.keys(), graph_extension,
                                         options.outfile)
    SeparateByChrom.cleanup(bam_chroms.keys(), bed_extension)
    SeparateByChrom.cleanup(bam_chroms.keys(), graph_extension)
示例#12
0
def calculate_non_strandspecific_rc_on_ExonIntrons(entrez_genes, bedfile,
                                                   chroms, fragment_size):
    """
    Compute read counts on exons/introns/transcripts without regard to
    strand, by running the strand-specific calculation once on the positive
    reads and once on the negative reads and combining the two results.

    entrez_genes is an EntrezGenes class object. Its core,
    entrez_genes.entrez_genes, is a dict (keyed by entrez_id) of lists of
    EntrezGene objects.
    bedfile is the path of the read file; chroms and fragment_size are
    forwarded to calculateExonIntrons.

    return:
    all_reads_on_shared_exons = {} # {entrezID:[((start, end), read_count)]}
    all_reads_on_shared_introns = {} # {entrezID:[((start, end), read_count)]}
    all_reads_on_merged_transcripts = {} #{entrezID:[((start, end), read_count)]}
    all_summary = {} # {entrezID:{attribute:value}}
        (summary[entrez_id])["merged_exons_rc"] = merged_exons_rc
        (summary[entrez_id])["merged_exon_RPKM"] = merged_exon_RPKM
        (summary[entrez_id])["merged_exons_total_length"] = merged_exons_total_length
        (summary[entrez_id])["shared_exons_rc"] = shared_exons_rc
        (summary[entrez_id])["shared_exon_RPKM"] = shared_exon_RPKM
        (summary[entrez_id])["shared_exons_total_length"] = shared_exons_total_length
        (summary[entrez_id])["shared_introns_rc"] = shared_introns_rc
        (summary[entrez_id])["shared_intron_RPKM"] = shared_intron_RPKM
        (summary[entrez_id])["shared_introns_total_length"] = shared_introns_total_length
        (summary[entrez_id])["merged_transcript_rc"] = merged_transcript_rc
        (summary[entrez_id])["merged_transcript_RPKM"] = merged_transcript_RPKM
        (summary[entrez_id])["merged_transcript_length"] = merged_transcript_length

    When bedfile does not exist, the four returned dicts are empty.
    """
    lib_name = (bedfile).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"

    # Bind the results up front so the return statement is valid even when
    # the bed file is missing; they used to be bound only inside the branch
    # below, which made the missing-file case fail with UnboundLocalError.
    all_reads_on_shared_exons = {}  # {entrezID:[((start, end), read_count)]}
    all_reads_on_shared_introns = {}  # {entrezID:[((start, end), read_count)]}
    all_reads_on_merged_transcripts = {}  # {entrezID:[((start, end), read_count)]}
    all_summary = {}  # {entrezID:{attributes}}

    if Utility_extended.fileExists(bedfile):
        p_file_name = bedfile + "_P"
        n_file_name = bedfile + "_n"
        # Partition the bed file into reads on the positive strand and reads
        # on the negative strand.
        Utility_extended.separate_by_strand(bedfile, p_file_name, n_file_name)

        # The column numbers are 1 based instead of 0 based!
        start_index_P = 2  # for the positive strand
        start_index_N = 3  # for the negative strand

        p_totalcount = get_total_tag_counts.get_total_tag_counts(p_file_name)
        (
            forward_reads_on_shared_exons, forward_reads_on_shared_introns,
            forward_reads_on_merged_transcripts, forward_summary
        ) = get_strandspecific_read_count_on_ExonsIntrons.calculateExonIntrons(
            entrez_genes, p_file_name, start_index_P, chroms, fragment_size,
            p_totalcount, None)

        n_totalcount = get_total_tag_counts.get_total_tag_counts(n_file_name)
        (
            reverse_reads_on_shared_exons, reverse_reads_on_shared_introns,
            reverse_reads_on_merged_transcripts, reverse_summary
        ) = get_strandspecific_read_count_on_ExonsIntrons.calculateExonIntrons(
            entrez_genes, n_file_name, start_index_N, chroms, fragment_size,
            n_totalcount, None)

        all_reads_on_shared_exons = combine_rc(forward_reads_on_shared_exons,
                                               reverse_reads_on_shared_exons)
        all_reads_on_shared_introns = combine_rc(
            forward_reads_on_shared_introns, reverse_reads_on_shared_introns)
        all_reads_on_merged_transcripts = combine_rc(
            forward_reads_on_merged_transcripts,
            reverse_reads_on_merged_transcripts)
        all_summary = combine_summary(forward_summary, reverse_summary,
                                      p_totalcount, n_totalcount)

    SeparateByChrom.cleanup(chroms, extension)
    return (all_reads_on_shared_exons, all_reads_on_shared_introns,
            all_reads_on_merged_transcripts, all_summary)
示例#13
0
def main(argv):
    """
    Command-line driver that filters the reads of a BAM file per chromosome.

    The chromosomes come from SeparateByChrom.getChromsFromBam; the BAM file
    is split into per-chromosome '.bed1' files with the optional flag/mapq
    filters applied. When the threshold is greater than 0,
    strand_broken_remove is run on every chromosome whose '.bed1' file exists
    and the '.bed2' files are combined into the output; otherwise the '.bed1'
    files are combined directly. Both sets of intermediate files are removed
    at the end.

    -s/--species is accepted but not read here, since no GenomeData species
    lookup is performed.

    argv: command-line tokens forwarded to OptionParser.parse_args. With
    fewer than 8 tokens the usage text is printed and the process exits with
    status 1.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species under consideration",
                      metavar="<str>")
    parser.add_option("-b", "--raw_bam_file", action="store", type="string",
                      dest="bam_file", help="raw bam file", metavar="<file>")
    parser.add_option("-t", "--threshold", action="store", type="int",
                      dest="threshold", help="threshold for copy number",
                      metavar="<int>")
    parser.add_option("-o", "--output_file_name", action="store",
                      type="string", dest="out_file", help="output file name",
                      metavar="<file>")
    # Read filters; optparse derives dest from the long option name.
    parser.add_option("-f", "--requiredFlag", type='int',
                      help="Required bit in sam flag. Same as samtools view -f")
    parser.add_option("-F", "--filterFlag", type='int',
                      help="Filter out bit in sam flag, Same as samtools view -F")
    parser.add_option("-q", "--mapq", type='int',
                      help="minimum mapq for a read to be kept")

    options, _extra = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    chroms = SeparateByChrom.getChromsFromBam(options.bam_file)

    SeparateByChrom.separateByChromBamToBed(
        chroms, options.bam_file, '.bed1',
        requiredFlag=options.requiredFlag,
        filterFlag=options.filterFlag,
        mapq=options.mapq)

    # Pick which per-chromosome files get merged: the filtered '.bed2' set
    # when a positive threshold was requested, else the unfiltered '.bed1'.
    if options.threshold > 0:
        for chrom in chroms:
            if not Utility.fileExists(chrom + ".bed1"):
                continue
            strand_broken_remove(chrom, options.threshold)
        merge_extension = '.bed2'
    else:
        merge_extension = '.bed1'

    SeparateByChrom.combineAllGraphFilesBedToBam(chroms, merge_extension,
                                                 options.bam_file,
                                                 options.out_file)
    for temp_extension in ('.bed1', '.bed2'):
        SeparateByChrom.cleanup(chroms, temp_extension)
def main(argv):
    parser = OptionParser()
    parser.add_option("-k",
                      "--known_gene_file",
                      action="store",
                      type="string",
                      dest="genefile",
                      help="file with known gene info",
                      metavar="<file>")
    parser.add_option("-b",
                      "--bedfile",
                      action="store",
                      type="string",
                      dest="bedfile",
                      help="file with tags in bed format",
                      metavar="<file>")
    parser.add_option("-c",
                      "--TypeOfSites",
                      action="store",
                      type="string",
                      dest="type",
                      help="GENE, ISLAND",
                      metavar="<str>")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      help="outfile name",
                      metavar="<file>")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species",
                      metavar="<str>")
    parser.add_option("-u",
                      "--UpstreamExtension",
                      action="store",
                      type="int",
                      dest="upstreamExtension",
                      help="UpstreamExtension",
                      metavar="<int>")
    parser.add_option("-d",
                      "--DownstreamExtension",
                      action="store",
                      type="int",
                      dest="downstreamExtension",
                      help="DownstreamExtension",
                      metavar="<int>")
    parser.add_option(
        "-r",
        "--resolution",
        action="store",
        type="int",
        dest="resolution",
        help="resolution of the upstream and downstream profile, eg, 5",
        metavar="<int>")
    parser.add_option(
        "-w",
        "--WindowSize",
        action="store",
        type="int",
        dest="window_size",
        help=
        "window size for averaging for the upstream and downstream profile. When window size > resolution, there is smoothing",
        metavar="<int>")
    parser.add_option("-g",
                      "--genicPartition",
                      action="store",
                      type="int",
                      dest="genicPartition",
                      help="genicPartition, eg, 20",
                      metavar="<int>")
    parser.add_option("-p",
                      "--plusReadShift",
                      action="store",
                      type="int",
                      dest="pshift",
                      help="plusReadShift",
                      metavar="<int>")
    parser.add_option("-m",
                      "--minusReadShift",
                      action="store",
                      type="int",
                      dest="mshift",
                      help="minusReadShift",
                      metavar="<int>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 24:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    #t0 = time.time()
    libName = (opt.bedfile).split('/')[-1]
    libName = libName.split('.')[0]
    extension = "-" + libName + '.bed1'
    SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension)
    num_genes = 0
    num_tags = 0

    if (opt.upstreamExtension % opt.resolution != 0):
        print "Please choose the resolution commensurate with the length of the upstream region"
        sys.exit(1)
    if (opt.downstreamExtension % opt.resolution != 0):
        print "Please choose the resolution commensurate with the length of the downstream region"
        sys.exit(1)

    upstreamNumPoints = opt.upstreamExtension / opt.resolution
    all_gene_scores = {}

    print "Species: ", opt.species
    print "Upstream extension: ", opt.upstreamExtension
    print "Downstream extension: ", opt.downstreamExtension
    print "Upstream and Downstream resolution:", opt.resolution
    print "Upstream and Downstream Scanning window size: ", opt.window_size
    print "Genic partition: ", opt.genicPartition
    print "Plus reads shift: ", opt.pshift
    print "Minus reads shift: ", opt.mshift

    if opt.type == "GENE":
        coords = UCSC.KnownGenes(opt.genefile)
    elif opt.type == "ISLAND":
        # Build coords in the mode of a pseudo ucsc file, all pseudo genes are in the positive direction
        # Here we are assuming that the file has the format chrom start end + .....for each line
        # chrom is sline[0], start is sline[1], end is sline[2]

        strand = '+'
        coords = {}
        index = 0

        infile = open(opt.genefile, 'r')
        for line in infile:
            """ check to make sure not a header line """
            if not re.match("track", line):
                index += 1
                line = line.strip()
                sline = line.split()
                if sline[0] not in coords.keys():
                    coords[sline[0]] = []
                name = "Island" + str(index)
                chrom = sline[0]
                txStart = atoi(sline[1])
                txEnd = atoi(sline[2])
                # (name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds)
                mycoord = UCSC.UCSC(name, chrom, strand, txStart, txEnd,
                                    txStart, txEnd, 0, '0', '0')
                coords[chrom].append(mycoord)
        infile.close()
    else:
        print "Only two types of locations are allowed: GENE, ISLAND"
        sys.exit(1)

    normalization = num_tags / 1000000.0

    minimum_genic_resolution = 10
    old_num_genes = getNumGenes(coords)
    coords = findEligibleGenes(
        coords, chroms, opt.genicPartition,
        minimum_genic_resolution)  # no longer a knowngene object
    num_genes = getNumGenes(coords)
    print num_genes - new_num_genes, " genes whose length does not support minimal genic resolution of ", minimum_genic_resolution, " or are on exotic chroms, are discarded"
    print "Number of " + opt.type + ": ", num_genes
    print "Number of reads: ", num_tags

    for chrom in chroms:
        chrombed = chrom + extension
        if Utility.fileExists(chrombed):
            bed_vals = {}
            bed_vals = BED_revised.BED(opt.species, chrombed, "BED2")
            num_tags += bed_vals.getNumVals()
            if (chrom in coords.keys()):
                if (len(coords[chrom]) > 0):
                    mycoords = {}
                    scoredic_upstream = {}
                    scoredic_downstream = {}
                    scoredic_genebody = {}

                    mycoords[chrom] = coords[chrom]
                    scoredic_upstream = GenerateProfileMatrixAroundLocations.getTSSPMProfileMatrix(
                        mycoords, opt.upstreamExtension, 0, opt.resolution,
                        opt.window_size, opt.pshift, opt.mshift, bed_vals)
                    scoredic_downstream = GenerateProfileMatrixAroundLocations.getTESPMProfileMatrix(
                        mycoords, 0, opt.downstreamExtension, opt.resolution,
                        opt.window_size, opt.pshift, opt.mshift, bed_vals)
                    scoredic_genebody = getGeneBodyProfileMatrix(
                        mycoords, opt.genicPartition, opt.pshift, opt.mshift,
                        bed_vals, minimum_genic_resolution)

                    myid_set = list(
                        set(scoredic_upstream.keys())
                        & set(scoredic_downstream.keys())
                        & set(scoredic_genebody.keys()))
                    chrom_gene_scores = {}
                    count_normalization = float(opt.window_size) / 1000.0
                    for myid in myid_set:
                        chrom_gene_scores[myid] = [
                            item / count_normalization
                            for item in scoredic_upstream[myid]
                        ] + scoredic_genebody.keys[mykey] + [
                            item / count_normalization
                            for item in scoredic_downstream[mykey]
                        ]
    all_genes_scores.update(chrom_gene_scores)
    # Save in a file
    outFile = open(opt.outfile, 'w')
    for mykey in xrange(len(all_genes_scores.keys())):
        outline = mykey + "\t" + "\t".join(
            [str(item / normalization)
             for item in all_genes_scores[mykey]]) + '\n'
        outFile.write(outline)
    outFile.close()

    #test
    totalPoints = upstreamNumPoints + opt.genicPartition + downstreamNumPoints
    half_partition = int(opt.resolution / 2.0)

    upstreamXcoordinates = [0.0] * upstreamNumPoints
    for i in xrange(upstreamNumPoints):
        upstreamXcoordinates[
            i] = -1.0 * opt.upstreamExtension + half_partition + i * opt.resolution
    downstreamXcoordinates = [0.0] * downstreamNumPoints
    for i in xrange(downstreamNumPoints):
        downstreamXcoordinates[i] = half_partition + i * opt.resolution
    genebodyXcoordinates = [0.0] * opt.genicPartition
    for i in xrange(opt.genicPartition):
        genebodyXcoordinates[i] = float((i + 1)) / opt.genicPartition

    overallXcoordinates = upstreamXcoordinates + genebodyXcoordinates + downstreamXcoordinates
    overall_score_profile = [0] * overallXcoordinates
    for mykey in xrange(len(all_genes_scores.keys())):
        assert (len(overallXcoordinates) == len(all_genes_scores[mykey]))
        for i in xrange(overallXcoordinates):
            overall_score_profile[i] += (all_genes_scores[mykey])[i]

    #Plot it out
    xcords = [0] * len(overallXcoordinates)
    for i in xrange(len(xcords)):
        xcords[i] = i

    libName = (opt.bedfile).split('/')[-1]
    libName = libName.split('.')[0]
    annotationName = (opt.genefile).split('/')[-1]
    annotationName = annotationName.split('.')[0]
    title = libName + " on " + annotationName
    legend = ""
    GenerateAroundRegions.plot_profile(opt.upstreamExtension,
                                       downstreamExtension, opt.resolution,
                                       opt.genicPartition, xcords,
                                       overall_score_profile, 0, title, legend,
                                       opt.outfile + '_plot.eps')

    SeparateByChrom.cleanup(chroms, extension)
示例#15
0
def main(argv):
    parser = OptionParser()

    parser.add_option("-b",
                      "--bedfile",
                      action="store",
                      type="string",
                      dest="bedfile",
                      metavar="<file>",
                      help="ChIP seq read file")
    parser.add_option(
        "-f",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        help=
        "fragment_size determins the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option("-t",
                      "--RE_tree_pickle_file",
                      action="store",
                      type="string",
                      dest="RE_Tree",
                      metavar="<file>",
                      help="file with RE tree in pickle format")
    parser.add_option(
        "-l",
        "--RE_annotation_file_location",
        action="store",
        type="string",
        dest="RE_file_location",
        metavar="<file>",
        help="location of RE files named in repClass_repFamily_repName.txt")
    parser.add_option("-u",
                      "--upstream_extension",
                      action="store",
                      type="int",
                      dest="upstream_extension",
                      help="upstream extension from start",
                      metavar="<int>")
    parser.add_option("-d",
                      "--downstream_extension",
                      action="store",
                      type="int",
                      dest="downstream_extension",
                      help="downstream extension from end",
                      metavar="<int>")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-n",
                      "--feature_name",
                      action="store",
                      type="string",
                      dest="feature_name",
                      help="name of the library",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 16:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()
    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    total_count = get_total_tag_counts.get_total_tag_counts(opt.bedfile)

    #Separate_by_chrom on bedfile
    lib_name = (opt.bedfile).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"
    if Utility_extended.fileExists(opt.bedfile):
        if Utility_extended.chrom_files_exist(chroms, extension) != 1:
            SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension)
    else:
        print bedfile, " is not found"
        sys.exit(1)

    #load the RE tree to get the RE file names
    re_tree = pickle.load(open(opt.RE_Tree, 'rb'))
    (numb_classes, numb_families, numb_names) = numbers(re_tree)
    print "There are %d classes, %d family, and %d names." % (
        numb_classes, numb_families, numb_names)

    #Prepare the summary
    read_counts = {}
    for reClass in re_tree.keys():
        read_counts[reClass] = {}
        for reFamily in re_tree[reClass].keys():
            read_counts[reClass][reFamily] = {}
            for reName in re_tree[reClass][reFamily]:
                read_counts[reClass][reFamily][reName] = {}

    #cycle through chrom
    for chrom in chroms:
        print chrom
        chrom_length = chrom_lengths[chrom]
        chrombed = chrom + extension
        if Utility_extended.fileExists(chrombed):
            # load in each read and shift
            tag_position_list = []
            inf = open(chrombed, 'r')
            for line in inf:
                if not re.match("#", line):
                    line = line.strip()
                    sline = line.split()
                    tag_position_list.append(
                        associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size))
            inf.close()
            if not Utility_extended.is_list_sorted(tag_position_list):
                tag_position_list.sort()  #[tag_positions]

        min_re_length = 10
        for reClass in re_tree.keys():
            for reFamily in re_tree[reClass].keys():
                for reName in re_tree[reClass][reFamily]:
                    re_file_name = "_".join([reClass, reFamily, reName
                                             ]) + ".txt"
                    #{id:{feature_name:value}}
                    rc_dic = get_read_count(
                        opt.RE_file_location, re_file_name, opt.feature_name,
                        chrom, chrom_length, tag_position_list, total_count,
                        opt.upstream_extension, opt.downstream_extension,
                        min_re_length)
                    # id is unique and updated only once, so this should be ok
                    read_counts[reClass][reFamily][reName].update(rc_dic)

    #{reClass:{reFamily:{reName:{id:feature_name, value}}}}
    #feature_name include: feature_name + "_rc", feature_name + "_rpkm"
    #output_file_name = feature_name + "_on_" + "mm9_rmsk.pkl"
    #output = open(output_file_name, 'wb')
    #pickle.dump(read_counts, output)
    #output.close()

    #instead of outputing a huge one, let's output many small pieces
    breakdown_and_output(read_counts, opt.feature_name)

    repClass = 'LTR'
    repFamily = 'ERV1'
    repName = 'RLTR4_Mm'
    outfile_name = lib_name + "_on_" + "_".join([repClass, repFamily, repName
                                                 ]) + ".dat"
    test(read_counts, repClass, repFamily, repName, outfile_name)

    SeparateByChrom.cleanup(chroms, extension)

    print "it took", time.time() - startTime, "seconds."
示例#16
0
def main(argv):
    parser = OptionParser()
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a",
                      "--rawchipreadfile",
                      action="store",
                      type="string",
                      dest="chipreadfile",
                      metavar="<file>",
                      help="raw read file from chip in bed format")
    parser.add_option("-b",
                      "--rawcontrolreadfile",
                      action="store",
                      type="string",
                      dest="controlreadfile",
                      metavar="<file>",
                      help="raw read file from control in bed format")
    parser.add_option("-f",
                      "--fragment_size",
                      action="store",
                      type="int",
                      dest="fragment_size",
                      metavar="<int>",
                      help="average size of a fragment after CHIP experiment")
    parser.add_option("-d",
                      "--islandfile",
                      action="store",
                      type="string",
                      dest="islandfile",
                      metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="island read count summary file")
    parser.add_option("-t",
                      "--mappable_fraction_of_genome_size ",
                      action="store",
                      type="float",
                      dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        genomesize = sum(
            GenomeData.species_chrom_lengths[opt.species].values())
        genomesize = opt.fraction * genomesize
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    chip_library_size = get_total_tag_counts.get_total_tag_counts(
        opt.chipreadfile)
    control_library_size = get_total_tag_counts.get_total_tag_counts(
        opt.controlreadfile)
    print "chip library size  ", chip_library_size
    print "control library size  ", control_library_size

    totalchip = 0
    totalcontrol = 0

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)

    # separate by chrom the chip library
    if Utility.fileExists(opt.chipreadfile):
        SeparateByChrom.separateByChrom(chroms, opt.chipreadfile, '.bed1')
    else:
        print opt.chipreadfile, " not found"
        sys.exit(1)
    # separate by chrom the control library
    if Utility.fileExists(opt.controlreadfile):
        SeparateByChrom.separateByChrom(chroms, opt.controlreadfile, '.bed2')
    else:
        print opt.controlreadfile, " not found"
        sys.exit(1)

    island_chip_readcount = {}
    island_control_readcount = {}

    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                if Utility.is_bed_sorted(island_list) == 0:
                    island_list.sort(key=operator.attrgetter('start'))

                island_start_list = []
                island_end_list = []
                for item in island_list:
                    island_start_list.append(item.start)
                    island_end_list.append(item.end)

                island_chip_readcount_list = [0] * len(island_list)
                read_file = chrom + ".bed1"
                f = open(read_file, 'r')
                for line in f:
                    if not re.match("#", line):
                        line = line.strip()
                        sline = line.split()
                        position = associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(
                            island_start_list, island_end_list, position)
                        if index >= 0:
                            island_chip_readcount_list[index] += 1
                            totalchip += 1
                f.close()
                island_chip_readcount[chrom] = island_chip_readcount_list

                island_control_readcount_list = [0] * len(island_list)
                read_file = chrom + ".bed2"
                f = open(read_file, 'r')
                for line in f:
                    if not re.match("#", line):
                        line = line.strip()
                        sline = line.split()
                        position = associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(
                            island_start_list, island_end_list, position)
                        if index >= 0:
                            island_control_readcount_list[index] += 1
                            totalcontrol += 1
                f.close()

                island_control_readcount[chrom] = island_control_readcount_list

    chip_background_read = chip_library_size - totalchip
    control_background_read = control_library_size - totalcontrol
    #scaling_factor = chip_background_read*1.0/control_background_read;
    scaling_factor = chip_library_size * 1.0 / control_library_size

    print "Total number of chip reads on islands is: ", totalchip
    print "Total number of control reads on islands is: ", totalcontrol

    #print "chip_background_read   ", chip_background_read
    #print "control_background_read   ", control_background_read

    out = open(opt.out_file, 'w')
    pvalue_list = []
    result_list = []
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                for index in xrange(len(island_list)):
                    item = island_list[index]
                    observation = (island_chip_readcount[chrom])[index]
                    control_tag = (island_control_readcount[chrom])[index]
                    if (island_control_readcount[chrom])[index] > 0:
                        #average = (island_control_readcount[chrom])[index] * scaling_factor;
                        average = control_tag * scaling_factor
                        fc = float(observation) / float(average)
                    else:
                        length = item.end - item.start + 1
                        average = length * control_library_size * 1.0 / genomesize
                        average = min(0.25, average) * scaling_factor
                        fc = float(observation) / float(average)
                    if observation > average:
                        pvalue = scipy.stats.poisson.sf(
                            (island_chip_readcount[chrom])[index], average)[()]
                    else:
                        pvalue = 1
                    pvalue_list.append(pvalue)
                    item_dic = {}
                    item_dic['chrom'] = item.chrom
                    item_dic['start'] = item.start
                    item_dic['end'] = item.end
                    item_dic['chip'] = observation
                    item_dic['control'] = control_tag
                    item_dic['pvalue'] = pvalue
                    item_dic['fc'] = fc
                    result_list.append(item_dic)

    pvaluearray = scipy.array(pvalue_list)
    pvaluerankarray = scipy.stats.rankdata(pvaluearray)
    totalnumber = len(result_list)
    for i in range(totalnumber):
        item = result_list[i]
        alpha = pvalue_list[i] * totalnumber / pvaluerankarray[i]
        if alpha > 1:
            alpha = 1
        outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(
            item['end']) + "\t" + str(item['chip']) + "\t" + str(
                item['control']) + "\t" + str(item['pvalue']) + "\t" + str(
                    item['fc']) + "\t" + str(alpha) + "\n"
        out.write(outline)

    #pvalue_list.sort()
    #for item in result_list:
    #pvalue = float(item['pvalue'])
    #alpha = pvalue * len(result_list) / (pvalue_list.index(pvalue) + 1)
    #if alpha > 1:
    #alpha = 1;
    #outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(item['end']) + "\t" + str(item['chip']) + "\t" + str(item['control']) + "\t" + str(item['pvalue']) + "\t" + str(item['fc']) + "\t" + str(alpha) + "\n";
    #out.write(outline);
    out.close()

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')
def main(argv):
    """Filter the reads of a BAM file chromosome by chromosome.

    The BAM file is split into per-chromosome '.bed1' files (optionally
    filtered by sam flag bits and mapq). When the threshold is positive,
    strand_broken_remove() is run on every chromosome that has a '.bed1' file
    and the '.bed2' files are merged into the output; otherwise the '.bed1'
    files are merged unchanged. Both sets of temporary files are then cleaned
    up.

    argv is the command line argument list; fewer than 8 entries prints the
    help text and exits with status 1.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species",
                      action="store", type="string", dest="species",
                      help="species under consideration", metavar="<str>")
    parser.add_option("-b", "--raw_bam_file",
                      action="store", type="string", dest="bam_file",
                      help="raw bam file", metavar="<file>")
    parser.add_option("-t", "--threshold",
                      action="store", type="int", dest="threshold",
                      help="threshold for copy number", metavar="<int>")
    parser.add_option("-o", "--output_file_name",
                      action="store", type="string", dest="out_file",
                      help="output file name", metavar="<file>")
    # Options to filter reads
    parser.add_option("-f", "--requiredFlag", type='int',
                      help="Required bit in sam flag. Same as samtools view -f")
    parser.add_option("-F", "--filterFlag", type='int',
                      help="Filter out bit in sam flag, Same as samtools view -F")
    parser.add_option("-q", "--mapq", type='int',
                      help="minimum mapq for a read to be kept")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    # The chromosome list is taken from the BAM file itself; the species
    # option is parsed but not looked up in GenomeData.
    chroms = SeparateByChrom.getChromsFromBam(opt.bam_file)

    SeparateByChrom.separateByChromBamToBed(chroms,
                                            opt.bam_file,
                                            '.bed1',
                                            requiredFlag=opt.requiredFlag,
                                            filterFlag=opt.filterFlag,
                                            mapq=opt.mapq)

    if opt.threshold > 0:
        merged_suffix = '.bed2'
        for chrom in chroms:
            if not Utility.fileExists(chrom + ".bed1"):
                continue
            strand_broken_remove(chrom, opt.threshold)
    else:
        merged_suffix = '.bed1'

    SeparateByChrom.combineAllGraphFilesBedToBam(chroms, merged_suffix,
                                                 opt.bam_file, opt.out_file)

    for temp_suffix in ('.bed1', '.bed2'):
        SeparateByChrom.cleanup(chroms, temp_suffix)
示例#18
0
def main(argv):
	'''
	Coarse graining test chr1, input must only have chr1
	
	'''
	parser = OptionParser()
	parser.add_option("-s", "--species", action="store", type="string", dest="species", help="mm8, hg18, background, etc", metavar="<str>")
	parser.add_option("-b", "--summarygraph", action="store",type="string", dest="summarygraph", help="summarygraph", metavar="<file>")
	parser.add_option("-w", "--window_size(bp)", action="store", type="int", dest="window_size", help="window_size(in bps)", metavar="<int>")
	parser.add_option("-g", "--graining_size", action="store", type="int",  dest="step", help="graining unit size (>0)", metavar="<int>")
	parser.add_option("-e", "--score", action="store", type="int", dest="score", help="graining criterion, 0<score<=graining_size", metavar="<int>")
	parser.add_option("-t", "--mappable_faction_of_genome_size", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>")
	parser.add_option("-f", "--output_file", action="store", type="string", dest="out_file", help="output file name", metavar="<file>")
	
	(opt, args) = parser.parse_args(argv)
	if len(argv) < 14:
        	parser.print_help()
        	sys.exit(1)

	print "Coarse-graining approach to identify ChIP-Seq enriched domains:"
	if opt.species in  GenomeData.species_chroms.keys():
		print "Species: ", opt.species;
		print "Window_size: ", opt.window_size;
		print "Coarse graining step: ", opt.step;
		print "Coarse graining score:", opt.score;
		chroms = GenomeData.species_chroms[opt.species]
		total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(opt.summarygraph);
		print "Total read count:", total_read_count
		genome_length = sum (GenomeData.species_chrom_lengths[opt.species].values());
		genome_length = int(opt.fraction * genome_length);

		average = float(total_read_count) * opt.window_size/genome_length; 
		print "Effective genome length: ", genome_length;
		print "window average:", average;
		
		min_tags_in_window = int(average) + 1
		print "Minimum read count in a qualified window: ", min_tags_in_window
		
		print "Generate preprocessed data list"; 
		#read in the summary graph file
		bed_val = BED.BED(opt.species, opt.summarygraph, "BED_GRAPH");
		#generate the probscore summary graph file, only care about enrichment
		for chrom in chroms: 
			if chrom in bed_val.keys() and len(bed_val[chrom]) > 0:
				chrom_length = GenomeData.species_chrom_lengths[opt.species][chrom]
				eligible_start_list = []
				for index in xrange(len(bed_val[chrom])):
					read_count = bed_val[chrom][index].value;
					if read_count >= min_tags_in_window:
						eligible_start_list.append(bed_val[chrom][index].start)
				print "Coarse graining:";
				(result_list, island_list) = coarsegraining(eligible_start_list, opt.window_size, opt.step, opt.score, chrom_length)
				print "Trace back...", len(island_list)
				islands = traceback(island_list, opt.window_size, opt.step, 0, chrom_length, chrom)
				print len(islands), "islands found in", chrom
				f = open(chrom + ".islandstemp", 'w')
				for i in range(0, len(islands)):
					f.write(chrom + '\t' + str(int(islands[i].start)) + '\t' + str(int(islands[i].end)) + '\t1\n')
				f.close()
		o = open(opt.out_file, 'w')
		o.write('track type=bedGraph name=' + opt.out_file + '\n')
		o.close()
		SeparateByChrom.combineAllGraphFiles(chroms, ".islandstemp", opt.out_file)
		SeparateByChrom.cleanup(chroms, ".islandstemp")
		#else: 
			#print "input data error!"
	else:
		print "This species is not in my list!"; 
示例#19
0
def calculateExonIntrons(entrez_genes,
                         bedfile,
                         column_index,
                         chroms,
                         fragment_size,
                         totalcount,
                         out_file=None):
    """
	entrez_genes is a EntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
	
	return:
	all_reads_on_shared_exons = {} # {entrezID:[((start, end), read_count)]}
	all_reads_on_shared_introns = {} # {entrezID:[((start, end), read_count)]}
	all_reads_on_merged_transcripts = {} #{entrezID:[((start, end), read_count)]}
	all_summary = {} # {entrezID:{attribute:value}}
		(summary[entrez_id])["merged_exons_rc"] = merged_exons_rc
		(summary[entrez_id])["merged_exon_RPKM"] = merged_exon_RPKM
		(summary[entrez_id])["merged_exons_total_length"] = merged_exons_total_length
		(summary[entrez_id])["shared_exons_rc"] = shared_exons_rc
		(summary[entrez_id])["shared_exon_RPKM"] = shared_exon_RPKM
		(summary[entrez_id])["shared_exons_total_length"] = shared_exons_total_length
		(summary[entrez_id])["shared_introns_rc"] = shared_introns_rc
		(summary[entrez_id])["shared_intron_RPKM"] = shared_intron_RPKM
		(summary[entrez_id])["shared_introns_total_length"] = shared_introns_total_length
		(summary[entrez_id])["merged_transcript_rc"] = merged_transcript_rc
		(summary[entrez_id])["merged_transcript_RPKM"] = merged_transcript_RPKM
		(summary[entrez_id])["merged_transcript_length"] = merged_transcript_length
	"""
    lib_name = (bedfile).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"
    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, extension) != 1:
            # Separate by chrom and sort by start
            print chroms, extension, " files do not exist, separate by chroms. "
            Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension,
                                                    [column_index])
    else:
        print bedfile, " is not found"
        sys.exit(1)

    all_reads_on_shared_exons = {}  # {entrezID:[((start, end), read_count)]}
    all_reads_on_shared_introns = {}  # {entrezID:[((start, end), read_count)]}
    all_reads_on_merged_transcripts = {
    }  #{entrezID:[((start, end), read_count)]}
    all_summary = {}  # {entrezID:{attributes}}

    for chrom in chroms:
        chrombed = chrom + extension
        if chrom in entrez_genes.chroms:
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))
            (reads_on_shared_exons, reads_on_shared_introns,
             reads_on_merged_transcripts,
             summary) = calculateExonIntrons_by_chrom(entrez_genes_by_chrom,
                                                      chrombed, fragment_size,
                                                      totalcount, out_file)
            #if chrom == chroms[0]:
            #myid = reads_on_shared_exons.keys()[0]
            #test(entrez_genes_by_chrom, reads_on_shared_introns, myid)
            all_reads_on_shared_exons.update(reads_on_shared_exons)
            all_reads_on_shared_introns.update(reads_on_shared_introns)
            all_reads_on_merged_transcripts.update(reads_on_merged_transcripts)
            all_summary.update(summary)
            print len(all_summary.keys())

    SeparateByChrom.cleanup(chroms, extension)
    return (all_reads_on_shared_exons, all_reads_on_shared_introns,
            all_reads_on_merged_transcripts, all_summary)
def main(argv):
	parser = OptionParser()
	parser.add_option("-a", "--islandfile1", action="store", type="string", dest="islandfile1", metavar="<file>", help="file 1 with islands info to be compared")
	parser.add_option("-b", "--islandfile2", action="store", type="string", dest="islandfile2", metavar="<file>", help="file 2 with islands info to be compared")
	parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8 or hg18", metavar="<str>")
	parser.add_option("-p", "--overlapin1", action="store", type="string", dest="overlapin1", metavar="<file>", help="file for islands in 1 overlapping with islands in 2")
	parser.add_option("-q", "--nonoverlapin1", action="store", type="string", dest="nonoverlapin1", help="file for islands in 1 not overlapping with islands in 2 ", metavar="<file>")

	(opt, args) = parser.parse_args(argv)
	if len(argv) < 10:
        	parser.print_help()
        	sys.exit(1)
	
	if opt.species in species_chroms.keys():
		chroms = species_chroms[opt.species];
	else:
		print "This species is not recognized, exiting";
		sys.exit(1);
	
	
	total_overlap_number_1 = 0
	total_islands_1 = 0
	
	SeparateByChrom.separateByChrom(chroms, opt.islandfile1, '.island1')
	SeparateByChrom.separateByChrom(chroms, opt.islandfile2, '.island2')
	
	for chrom in chroms: 
		f = open(chrom + '.1in2', 'w')
		g = open(chrom + '.1notin2', 'w')
		bed_vals_2 = BED.BED(opt.species, chrom+'.island2', "BED3", 0)
		if Utility.fileExists(chrom+'.island1') and len(bed_vals_2[chrom])>0:
			islandlist2 = bed_vals_2[chrom];
			if (are_islands_sorted(islandlist2) != 1):
				islandlist2.sort(key=operator.attrgetter('start'));
			(island2_start_list, island2_end_list) = union_islands(islandlist2)
			islands1 = open(chrom+'.island1', 'r')
			for line in islands1:
				if not re.match("#", line):
					total_islands_1 += 1
					line = line.strip()
					sline = line.split()
					start = int(sline[1])
					end = int(sline[2])
					if (region_overlap(start, end, island2_start_list, island2_end_list) == 1):
						f.write('\t'.join(sline) + '\n')
						total_overlap_number_1 += 1;
					else:
						g.write('\t'.join(sline) + '\n');
		elif Utility.fileExists(chrom+'.island1') and (len(bed_vals_2[chrom])==0):
			islands1 = open(chrom+'.island1', 'r')
			for line in islands1:
				if not re.match("#", line):
					total_islands_1 += 1
					line = line.strip()
					sline = line.split()
					g.write('\t'.join(sline) + '\n');
		f.close()
		g.close()
	
	print "total number of island in "+opt.islandfile1+":     ", total_islands_1;
	print "total number of island in "+opt.overlapin1+":     ", total_overlap_number_1;
	
	SeparateByChrom.combineAllGraphFiles(chroms, '.1in2', opt.overlapin1);
	SeparateByChrom.combineAllGraphFiles(chroms, '.1notin2', opt.nonoverlapin1);
	
	SeparateByChrom.cleanup(chroms, '.1in2')
	SeparateByChrom.cleanup(chroms, '.1notin2')
	SeparateByChrom.cleanup(chroms, '.island1')
	SeparateByChrom.cleanup(chroms, '.island2')
def get_read_count_on_genes(rawreadfile, fragment_size, knowngenefile,
                            regiontype, promoter_upstream_extension,
                            promoter_downstream_extension):
    """
	This one provides an integrated module, where the chrom separation etc is done inside. 
	
	Promoter and GeneBody are mutually exclusive. 
	Promoter: TSS-upstreamextention, TSS+downstreamextension
	GeneBody: TSS+downstreamextension, TES
	ExonicRegion: exons of a gene taken together
	PromoterGenebody: Promoter + gene body.
	
	Return: a dictionary with key of gene name and value of read count
	"""
    knowngenes = UCSC_revised.KnownGenes(knowngenefile)
    chroms = knowngenes.keys()
    allowed_region_type = [
        'Promoter', 'GeneBody', 'PromoterGenebody', 'ExonicRegion'
    ]
    if regiontype not in allowed_region_type:
        print " The allowed region types are Promoter, GeneBody,  PromoterGenebody and ExonicRegion. The region type is not recognized, exiting"
        sys.exit(1)
    if regiontype == 'Promoter':
        region_dic = knowngenes.getPromoters(promoter_upstream_extension,
                                             promoter_downstream_extension)
    if regiontype == 'GeneBody':
        region_dic = knowngenes.getGenebodys(promoter_downstream_extension)
    if regiontype == 'PromoterGenebody':
        region_dic = knowngenes.getPromotergenebodys(
            promoter_upstream_extension)

    libName = (rawreadfile).split('/')[-1]
    libName = libName.split('.')[0]
    extension = "-" + libName + ".bed1"
    if Utility_extended.fileExists(rawreadfile):
        SeparateByChrom.separateByChrom(chroms, rawreadfile, extension)
    else:
        print rawreadfile, " not found"
        sys.exit(1)

    genes = {}
    #for output

    for chrom in chroms:
        chrombed = chrom + extension
        if Utility_extended.fileExists(chrombed):
            gene_coords = knowngenes[chrom]
            if len(gene_coords) > 0:
                if regiontype == 'ExonicRegion':
                    (gene_name_list, region_length_list,
                     read_count_list) = get_read_count_on_exons(
                         gene_coords, chrombed, fragment_size)
                else:
                    (gene_name_list, region_length_list,
                     read_count_list) = get_read_count_on_genic_regions(
                         region_dic[chrom], chrombed, fragment_size)
                assert len(gene_name_list) == len(region_length_list)
                assert len(gene_name_list) == len(read_count_list)
                #RPKM = [0] * len(gene_name_list)
                for i in xrange(len(gene_name_list)):
                    #if region_length_list[i] > 0:
                    #	RPKM[i] = read_count_list[i] / (region_length_list[i]/1000.0) / (totalcount/1000000.0)
                    genes[gene_name_list[i]] = read_count_list[i]
                    #outline = gene_name_list[i] + '\t' + str(read_count_list[i]) + '\t' + str(RPKM[i]) + '\n'
                    #f.write(outline)

    SeparateByChrom.cleanup(chroms, extension)

    return genes
def main(argv):
    parser = OptionParser()

    parser.add_option("-b",
                      "--bedfile",
                      action="store",
                      type="string",
                      dest="bedFile",
                      metavar="<file>",
                      help="ChIP seq read file")
    parser.add_option(
        "-f",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        help=
        "fragment_size determins the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option("-g",
                      "--known_genes_file",
                      action="store",
                      type="string",
                      dest="known_genes",
                      metavar="<file>",
                      help="file with known genes in UCSC format")
    parser.add_option(
        "-r",
        "--'Promoter' or 'GeneBody' or 'PromoterGenebody' or 'ExonicRegion'",
        action="store",
        type="string",
        dest="region_type",
        metavar="<str>",
        help="region to count tags in")
    parser.add_option("-u",
                      "--promoter_upstream_extension",
                      action="store",
                      type="int",
                      dest="promoter_upstream_extension",
                      help="upstream extension of promoter region from TSS",
                      metavar="<int>")
    parser.add_option("-d",
                      "--promoter_downstream_extension",
                      action="store",
                      type="int",
                      dest="promoter_downstream_extension",
                      help="downstream extension of promoter region from TSS",
                      metavar="<int>")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    known_genes = UCSC_revised.KnownGenes(opt.known_genes)
    chroms = known_genes.keys()
    #Promoter and GeneBody are mutually exclusive.
    #Promoter: TSS-upstreamextention, TSS+downstreamextension
    #GeneBody: TSS+downstreamextension, TES
    #PromoterGenebody: TSS-upstreamextention,  TES.
    allowed_region_type = [
        'Promoter', 'GeneBody', 'PromoterGenebody', 'ExonicRegion'
    ]
    if opt.region_type not in allowed_region_type:
        print " The allowed region types are Promoter, GeneBody, PromoterGenebody and ExonicRegion. The region type is not recognized, exiting"
        sys.exit(1)

    if opt.region_type == 'Promoter':
        region_dic = known_genes.getPromoters(
            opt.promoter_upstream_extension, opt.promoter_downstream_extension)
    elif opt.region_type == 'GeneBody':
        region_dic = known_genes.getGenebodys(
            opt.promoter_downstream_extension)
    elif opt.region_type == 'PromoterGenebody':
        region_dic = known_genes.getPromotergenebodys(
            opt.promoter_upstream_extension)

    libName = (opt.bedFile).split('/')[-1]
    libName = libName.split('.')[0]
    extension = "-" + libName + '.bed1'
    if Utility_extended.fileExists(opt.bedFile):
        SeparateByChrom.separateByChrom(chroms, opt.bedFile, extension)
    else:
        print opt.bedFile, " not found"
        sys.exit(1)

    totalcount = get_total_tag_counts.get_total_tag_counts(opt.bedFile)

    f = open(opt.out_file, 'w')
    outline = "# GeneName" + '\t' + "Read Count" + '\t' + "RPKM" + '\n'
    f.write(outline)

    for chrom in chroms:
        chrombed = chrom + extension
        if Utility_extended.fileExists(chrombed):
            gene_coords = known_genes[chrom]
            if len(gene_coords) > 0:
                if opt.region_type == 'ExonicRegion':
                    (gene_name_list, region_length_list,
                     read_count_list) = get_read_count_on_exons(
                         gene_coords, chrombed, opt.fragment_size)
                else:
                    (gene_name_list, region_length_list,
                     read_count_list) = get_read_count_on_genic_regions(
                         region_dic[chrom], chrombed, opt.fragment_size)
                    #test_get_read_count_on_genic_regions("AAAS", gene_name_list, region_length_list, read_count_list)
                    #test_get_read_count_on_genic_regions("AACS", gene_name_list, region_length_list, read_count_list)
                assert len(gene_name_list) == len(region_length_list)
                assert len(gene_name_list) == len(read_count_list)
                RPKM = [0] * len(gene_name_list)
                for i in xrange(len(gene_name_list)):
                    if region_length_list[i] > 0:
                        RPKM[i] = read_count_list[i] / (region_length_list[i] /
                                                        1000.0) / (totalcount /
                                                                   1000000.0)
                        outline = gene_name_list[i] + '\t' + str(
                            read_count_list[i]) + '\t' + str(RPKM[i]) + '\n'
                        f.write(outline)
    f.close()

    SeparateByChrom.cleanup(chroms, extension)

    print "it took", time.time() - startTime, "seconds."
# Example #23
0
def main(argv):
	parser = OptionParser()
	parser.add_option("-k", "--known_gene_file", action="store", type="string",
			dest="genefile", help="file with known gene info", metavar="<file>")
	parser.add_option("-b", "--bedfile", action="store", type="string",
			dest="bedfile", help="file with tags in bed format", metavar="<file>")
	parser.add_option("-n", "--name", action="store", type="string",
			dest="name", help="name for plotting", metavar="<str>")
	parser.add_option("-s", "--species", action="store", type="string",
			dest="species", help="species", metavar="<str>")
	parser.add_option("-c", "--number", action="store", type="int",
			dest="exonnumber", help="number of exons", metavar="<int>")
	parser.add_option("-f", "--fragmentsize", action="store", type="int",
			dest="fragment_size", help="fragment size", metavar="<int>")
	
	(opt, args) = parser.parse_args(argv)
	if len(argv) < 12:
		parser.print_help()
		sys.exit(1)
	
	if opt.species in GenomeData.species_chroms.keys():
		chroms = GenomeData.species_chroms[opt.species];
		chrom_length = GenomeData.species_chrom_lengths[opt.species];
		
	else:
		print "This species is not recognized, exiting";
		sys.exit(1);
	

    	gene_coords = UCSC.KnownGenes(opt.genefile);
	
	#separate by chrom
	if Utility.fileExists(opt.bedfile):
		SeparateByChrom.separateByChrom(chroms, opt.bedfile, '.bed1');
	else:
		print opt.beddfile, " not found";
		sys.exit(1)
	
	total_exon_sums = [0]*opt.exonnumber;
	total_intron_sums = [0]*opt.exonnumber;
	total_exon_sizes = [0]*opt.exonnumber;
	total_intron_sizes = [0]*opt.exonnumber;
	total_num_tags = 0;
	
	for chrom in chroms:
		read_file = chrom + ".bed1";
		bed_vals = BED.BED(opt.species, read_file, "BED6", 0);
    		total_num_tags += bed_vals.getNumVals();
		(exon_counts, intron_counts, exon_seq_sizes, intron_seq_sizes) = getExonIntronDensities(gene_coords, bed_vals, opt.exonnumber, opt.fragment_size);
  		for j in range(opt.exonnumber):
			total_exon_sums[j] += exon_counts[j];
			total_intron_sums[j] += intron_counts[j];
			total_exon_sizes[j] += exon_seq_sizes[j];
			total_intron_sizes[j] += intron_seq_sizes[j];

	""" print everything out to a file """
    	outfilename = '%s-exon-intron-scores' % opt.name;
    	outFile = open(outfilename, 'w');    	
	for j in range(len(exon_counts)):
		exon_density = float(total_exon_sums[j]) / float(total_exon_sizes[j]);
		intron_density = float(total_intron_sums[j]) / float(total_intron_sizes[j]);
		exon_density /= float(total_num_tags);
		intron_density  /= float(total_num_tags);
		outline = str(j+1) + " " + str(exon_density) + " " + str(intron_density) + "\n";
		outFile.write(outline);
	outFile.close();

	SeparateByChrom.cleanup(chroms,'.bed1');
def main(argv):
	parser = OptionParser()
	parser.add_option("-b", "--readfile", action="store", type="string", dest="readFile", metavar="<file>", help="raw read file in bed format")
	parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18", metavar="<str>")
	parser.add_option("-i", "--islands", action="store", type="string", dest="islandFile", metavar="<file>", help="island File in chrom start end ... format")
	parser.add_option("-w", "--binSize", action="store", type="int", dest="binSize", metavar="<int>", help="bin size for resolution")
	parser.add_option("-m", "--minimum-number-of-points-per-island", action="store", type="int", dest="minimumRequiredPoints", metavar="<int>", help="minimum-number-of-data-points-needed-per-island")
	parser.add_option("-n", "--maxdistance", action="store", type="int", dest="maxDistance", metavar="<int>", help=" max distance for correlation, in terms of bin size")
	parser.add_option("-r", "--resolution", action="store", type="int", dest="resolution", metavar="<int>", help=" resolution in distance in terms of bin size")
	parser.add_option("-t", "--type", action="store", type="string", dest="type", metavar="<str>", help=" type of correlation, +auto, -auto, cross")
	parser.add_option("-f", "--shift", action="store", type="int", dest="shift", metavar="<int>", help=" shift of reads, only useful when calculate cross-correlation or combining the plus and minus reads")
	parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="output file")
	
	(opt, args) = parser.parse_args(argv)
	if len(argv) < 20:
        	parser.print_help()
        	sys.exit(1)
	
	if opt.species in GenomeData.species_chroms.keys():
		chroms = GenomeData.species_chroms[opt.species];
		chrom_lengths = GenomeData.species_chrom_lengths[opt.species];
	else:
		print "This species is not recognized, exiting";
		sys.exit(1);
	
	
	#t0 = time.time()
	if Utility.fileExists(opt.readFile) == 0:
		print opt.readFile, " does not exist"
		exit(1)
	
	libName = (opt.readFile).split('/')[-1]
	libName = libName.split('.')[0]
	extension = "-" + libName +'.bed1'
	SeparateByChrom.separateByChrom(chroms, opt.readFile,  extension)
	
	print "Species: ", opt.species
	print "Read File: ", opt.readFile
	print "Island File: ", opt.islandFile
	print "Bin Size: ", opt.binSize, "bp"
	print "Resolution: ", opt.resolution, " bins"
	print "Max distance: ", opt.maxDistance, " bins"
	assert (opt.type == "+auto" or opt.type == "-auto" or opt.type == "cross")
	print "Type of correlation: ", opt.type
	print "Reads shift: ", opt.shift

	# Here we are assuming that the file has the format chrom start end + .....for each line
	# chrom is sline[0], start is sline[1], end is sline[2]	
	if Utility.fileExists(opt.islandFile):	
		islandDic = BED_revised.BED(opt.species, opt.islandFile, "BED3")
	
	num_islands = 0
	for chrom in islandDic.keys():
		num_islands += len(islandDic[chrom]);
		# Clean up potential island-specific read files
		filter_raw_tags_by_islands_dev.cleanup_files(islandDic[chrom], extension)
	
	total = (int) (opt.maxDistance/opt.resolution) + 1
	distances = [0] * total
	numberOfPointsCollector = [0] * total
	correlationCollector = [0] * total
	
	totalReadCount = 0
	
	for chrom in chroms:
		chrombed = chrom + extension;
		if Utility.fileExists(chrombed):
			if (chrom in islandDic.keys()):
				if (len(islandDic[chrom]) > 0):
					# First find out all the reads that lands on islands and save them on island-specific temporary files.Then use only the read file specific to that island to do binning.  
					currentReadCount = filter_raw_tags_by_islands_dev.find_reads_on_each_island(chrombed, islandDic[chrom], opt.shift, extension)
					totalReadCount += currentReadCount
					for island in islandDic[chrom]:
						assert (island.start >= 0)
						assert (island.end <= chrom_lengths[chrom])
						islandReadFile = island.chrom + "-" + str(island.start) + "-" + str(island.end) + extension
						numberOfPoints = [0] * total
						correlations = [0] * total
						if opt.type == "+auto":
							readCountVector = generateReadCountVector(island.chrom, island.start, island.end, opt.binSize, "+", opt.shift, islandReadFile)
							(numberOfPoints, correlations) = autoCorrelations(readCountVector, opt.resolution, opt.maxDistance, opt.minimumRequiredPoints)
						elif opt.type == "-auto":
							readCountVector = generateReadCountVector(island.chrom, island.start, island.end, opt.binSize, "-", opt.shift, islandReadFile)
							(numberOfPoints, correlations) = autoCorrelations(readCountVector,opt.resolution, opt.maxDistance, opt.minimumRequiredPoints)
						elif opt.type == "cross":
							plusReadCountVector = generateReadCountVector(island.chrom, island.start, island.end, opt.binSize, "+", opt.shift, islandReadFile)
							minusReadCountVector = generateReadCountVector(island.chrom, island.start, island.end, opt.binSize, "-", opt.shift, islandReadFile)
							(numberOfPoints, correlations) = crossCorrelations(plusReadCountVector, minusReadCountVector, opt.resolution, opt.maxDistance, opt.minimumRequiredPoints)
						assert (len(numberOfPoints) == total)
						assert (len(correlations) == total)
						print chrom, island.start, island.end
						for i in xrange(total):
							if numberOfPoints[i] == 0:
								distances[i] = i * opt.resolution * opt.binSize
								print distances[i], "\t", numberOfPointsCollector[i], "\t", correlationCollector[i]
							numberOfPointsCollector[i] += numberOfPoints[i]
							correlationCollector[i] += correlations[i]
	
	# Normalization and output
	f = open(opt.out_file, 'w')
	for i in xrange(total):
		distances[i] = i * opt.resolution * opt.binSize
		correlationCollector[i] /= numberOfPointsCollector[i] #normalize by the number of points 
		correlationCollector[i] /= (totalReadCount/1000000.0)*(totalReadCount/1000000.0)
		print distances[i], "\t", numberOfPointsCollector[i], "\t", correlationCollector[i]
		outline = str(distances[i]) + "\t" + str(correlationCollector[i]) + "\n"
		f.write(outline)
	f.close()

	SeparateByChrom.cleanup(chroms, extension)
		
	#Plot it out
	title = libName + " " +  opt.type + "  correlation"
	legend = "B" + str(opt.binSize) + " S" + str(opt.shift)
	plot_profile(distances[1:], correlationCollector[1:], 0,  title, legend, opt.out_file + '.eps')
def main(argv):
    parser = OptionParser()
    parser.add_option("-a",
                      "--islandfile1",
                      action="store",
                      type="string",
                      dest="islandfile1",
                      metavar="<file>",
                      help="file 1 with islands info to be compared")
    parser.add_option("-b",
                      "--islandfile2",
                      action="store",
                      type="string",
                      dest="islandfile2",
                      metavar="<file>",
                      help="file 2 with islands info to be compared")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8 or hg18",
                      metavar="<str>")
    parser.add_option(
        "-p",
        "--overlapin1",
        action="store",
        type="string",
        dest="overlapin1",
        metavar="<file>",
        help="file for islands in 1 overlapping with islands in 2")
    parser.add_option(
        "-q",
        "--nonoverlapin1",
        action="store",
        type="string",
        dest="nonoverlapin1",
        help="file for islands in 1 not overlapping with islands in 2 ",
        metavar="<file>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species in species_chroms.keys():
        chroms = species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    total_overlap_number_1 = 0
    total_islands_1 = 0

    SeparateByChrom.separateByChrom(chroms, opt.islandfile1, '.island1')
    SeparateByChrom.separateByChrom(chroms, opt.islandfile2, '.island2')

    for chrom in chroms:
        f = open(chrom + '.1in2', 'w')
        g = open(chrom + '.1notin2', 'w')
        bed_vals_1 = BED.BED(opt.species, chrom + '.island1', "BED_GRAPH", 0)
        bed_vals_2 = BED.BED(opt.species, chrom + '.island2', "BED_GRAPH", 0)
        if len(bed_vals_1[chrom]) > 0 and len(bed_vals_2[chrom]) > 0:
            islandlist1 = bed_vals_1[chrom]
            islandlist2 = bed_vals_2[chrom]
            total_islands_1 += len(bed_vals_1[chrom])
            for islandlist1_item in islandlist1:
                start = islandlist1_item.start
                end = islandlist1_item.end
                if (region_overlap(start, end, islandlist2) == 1):
                    write(islandlist1_item, f)
                    total_overlap_number_1 += 1
                else:
                    write(islandlist1_item, g)
        elif (len(bed_vals_1[chrom]) > 0) and (len(bed_vals_2[chrom]) == 0):
            total_islands_1 += len(bed_vals_1[chrom])
            for islandlist1_item in bed_vals_1[chrom]:
                write(islandlist1_item, g)
        f.close()
        g.close()

    print "total number of island in " + opt.islandfile1 + ":     ", total_islands_1
    print "total number of island in " + opt.overlapin1 + ":     ", total_overlap_number_1

    SeparateByChrom.combineAllGraphFiles(chroms, '.1in2', opt.overlapin1)
    SeparateByChrom.combineAllGraphFiles(chroms, '.1notin2', opt.nonoverlapin1)

    SeparateByChrom.cleanup(chroms, '.1in2')
    SeparateByChrom.cleanup(chroms, '.1notin2')
    SeparateByChrom.cleanup(chroms, '.island1')
    SeparateByChrom.cleanup(chroms, '.island2')
# Example #26
0
def main(argv):
    parser = OptionParser()
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-a",
                      "--rawreadfileA",
                      action="store",
                      type="string",
                      dest="readfileA",
                      metavar="<file>",
                      help="raw read file A in bed format")
    parser.add_option("-b",
                      "--rawreadfileB",
                      action="store",
                      type="string",
                      dest="readfileB",
                      metavar="<file>",
                      help="raw read file B in bed format")
    parser.add_option("-f",
                      "--fragment_size",
                      action="store",
                      type="int",
                      dest="fragment_size",
                      metavar="<int>",
                      help="average size of a fragment after A experiment")
    parser.add_option("-d",
                      "--islandfile",
                      action="store",
                      type="string",
                      dest="islandfile",
                      metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="island read count summary file")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    if not Utility.fileExists(opt.readfileA):
        print opt.readfileA, " not found"
        sys.exit(1)
    if not Utility.fileExists(opt.readfileB):
        print opt.readfileB, " not found"
        sys.exit(1)

    A_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileA)
    B_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileB)
    print "Library size of ", opt.readfileA, ":  ", A_library_size
    print "Library size of ", opt.readfileB, ":  ", B_library_size

    totalA = 0
    totalB = 0

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)

    # separate by chrom the A library
    SeparateByChrom.separateByChrom(chroms, opt.readfileA, '.bed1')
    # separate by chrom the B library
    SeparateByChrom.separateByChrom(chroms, opt.readfileB, '.bed2')

    island_A_readcount = {}
    island_B_readcount = {}

    #Find read counts on the islands
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                if Utility.is_bed_sorted(island_list) == 0:
                    island_list.sort(key=operator.attrgetter('start'))

                island_start_list = []
                island_end_list = []
                for item in island_list:
                    island_start_list.append(item.start)
                    island_end_list.append(item.end)

                island_A_readcount_list = [0] * len(island_list)
                read_file = chrom + ".bed1"
                f = open(read_file, 'r')
                for line in f:
                    if not re.match("#", line):
                        line = line.strip()
                        sline = line.split()
                        position = associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(
                            island_start_list, island_end_list, position)
                        if index >= 0:
                            island_A_readcount_list[index] += 1
                            totalA += 1
                f.close()
                island_A_readcount[chrom] = island_A_readcount_list

                island_B_readcount_list = [0] * len(island_list)
                read_file = chrom + ".bed2"
                f = open(read_file, 'r')
                for line in f:
                    if not re.match("#", line):
                        line = line.strip()
                        sline = line.split()
                        position = associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(
                            island_start_list, island_end_list, position)
                        if index >= 0:
                            island_B_readcount_list[index] += 1
                            totalB += 1
                f.close()
                island_B_readcount[chrom] = island_B_readcount_list

    #A_background_read = A_library_size - totalA;
    #B_background_read = B_library_size - totalB;

    print "Total number of A reads on islands is: ", totalA
    print "Total number of B reads on islands is: ", totalB

    # Calculate the p value.
    library_scaling_factor = A_library_size * 1.0 / B_library_size
    #A vs B
    pseudo_count = 1
    pvalue_A_vs_B_list = []
    pvalue_B_vs_A_list = []
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                for index in xrange(len(island_list)):
                    item = island_list[index]
                    Acount = (island_A_readcount[chrom])[index]
                    Bcount = (island_B_readcount[chrom])[index]
                    pvalue_A_vs_B = pvaule(Acount, Bcount,
                                           library_scaling_factor,
                                           pseudo_count)
                    pvalue_A_vs_B_list.append(pvalue_A_vs_B)
                    pvalue_B_vs_A = pvaule(Bcount, Acount,
                                           1 / library_scaling_factor,
                                           pseudo_count)
                    pvalue_B_vs_A_list.append(pvalue_B_vs_A)
    #Calculate the FDR
    fdr_A_vs_B_list = fdr(pvalue_A_vs_B_list)
    fdr_B_vs_A_list = fdr(pvalue_B_vs_A_list)

    #Output the islands read counts, normalized read counts, fc, pvalue both ways
    scaling_factor = 1000000
    out = open(opt.out_file, 'w')
    outline = '#chrom' + "\t" + 'start' + "\t" + 'end' + "\t" + "Readcount_A" + "\t" + 'Normalized_Readcount_A' + "\t" + 'ReadcountB' + "\t" + 'Normalized_Readcount_B' + "\t" + "Fc_A_vs_B" + "\t" + "pvalue_A_vs_B" + "\t" + "FDR_A_vs_B" + "\t" + "Fc_B_vs_A" + "\t" + "pvalue_B_vs_A" + "\t" + "FDR_B_vs_A" + "\n"
    out.write(outline)
    ii = 0
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                for index in xrange(len(island_list)):
                    item = island_list[index]
                    Acount = (island_A_readcount[chrom])[index]
                    Bcount = (island_B_readcount[chrom])[index]
                    normalized_A = Acount / float(
                        A_library_size) * scaling_factor
                    normalized_B = Bcount / float(
                        B_library_size) * scaling_factor
                    fc_A_vs_B = (
                        (Acount + pseudo_count) * 1.0 /
                        (Bcount + pseudo_count)) / library_scaling_factor
                    fc_B_vs_A = (
                        (Bcount + pseudo_count) * 1.0 /
                        (Acount + pseudo_count)) * library_scaling_factor
                    print("Acount", Acount, "Bcount", Bcount, "pseudo_count",
                          pseudo_count, "library_scaling_factor",
                          library_scaling_factor, "fc_A_vs_B", fc_A_vs_B,
                          "fc_B_vs_A", fc_B_vs_A)
                    outline = item.chrom + "\t" + str(item.start) + "\t" + str(
                        item.end) + "\t" + str(Acount) + "\t" + str(
                            normalized_A) + "\t" + str(Bcount) + "\t" + str(
                                normalized_B
                            ) + "\t" + str(fc_A_vs_B) + "\t" + str(
                                pvalue_A_vs_B_list[ii]) + "\t" + str(
                                    fdr_A_vs_B_list[ii]
                                ) + "\t" + str(fc_B_vs_A) + "\t" + str(
                                    pvalue_B_vs_A_list[ii]) + "\t" + str(
                                        fdr_B_vs_A_list[ii]) + "\n"
                    out.write(outline)
                    ii += 1
    out.close()

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')

    # Calculate the correlations using normalized read counts
    A_array = ()
    B_array = ()
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                temp_array = scipy.array(island_A_readcount[chrom])
                A_array = scipy.concatenate((temp_array, A_array))
                temp_array = scipy.array(island_B_readcount[chrom])
                B_array = scipy.concatenate((temp_array, B_array))
    #Normalization to reads per million
    A_array = A_array / float(A_library_size) * scaling_factor
    B_array = B_array / float(B_library_size) * scaling_factor
    pearson = scipy.stats.pearsonr(A_array, B_array)
    print "Pearson's correlation is: ", pearson[0], " with p-value ", pearson[
        1]
    spearman = scipy.stats.spearmanr(A_array, B_array)
    print "Spearman's correlation is: ", spearman[
        0], " with p-value ", spearman[1]
def main(argv):
	parser = OptionParser()
	parser.add_option("-s", "--species", action="store",
			  type="string", dest="species",
			  help="species, mm8, hg18, etc", metavar="<str>")
	parser.add_option("-a", "--rawbedfile", action="store",
			  type="string", dest="bedfile",
			  metavar="<file>",
			  help="raw data file in bed format")
	parser.add_option("-i", "--shift", action="store",
			  type="int", dest="shift",
			  metavar="<int>",
			  help="shift for finding the center of DNA fragment represented by the read")
	parser.add_option("-b", "--islandfile", action="store", type="string",
			  dest="islandbedfile", metavar="<file>",
			  help="island file")
	parser.add_option("-o", "--outfile", action="store", type="string",
			  dest="out_file", metavar="<file>",
			  help="filtered raw bed file")
	
	(opt, args) = parser.parse_args(argv)
	if len(argv) < 10:
        	parser.print_help()
        	sys.exit(1)
	
	if opt.species in species_chroms.keys():
		chroms = species_chroms[opt.species];
	else:
		print "This species is not recognized, exiting";
		sys.exit(1);
	
	islands = BED_revised.BED_annotated(opt.species, opt.islandbedfile, "BED3", 0);
	
	libName = (opt.bedfile).split('/')[-1] #remove directories
	libName = libName.split('.')[0] #remove .bed
	extension = "-" + libName +'.bed1'
	SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension)
	for chrom in chroms:
		if chrom in islands.keys():
			outfile = chrom + "-filtered" + extension 
			# [(island, rc)]
			island_rc = filter_tags_by_islands(chrom + extension, islands[chrom], opt.shift, outfile, boundary_extension=0)
			output_island_with_rc(island_rc, chrom + "-islands" + extension)
			
	SeparateByChrom.combineAllGraphFiles(chroms, "-islands" + extension, "islands" + extension);
	SeparateByChrom.combineAllGraphFiles(chroms, "-filtered" + extension, opt.out_file);
	
	SeparateByChrom.cleanup(chroms, extension);
	SeparateByChrom.cleanup(chroms, "-filtered" + extension);
	SeparateByChrom.cleanup(chroms, "-islands" + extension);
# Example #28
# 0
def main(argv):
    parser = OptionParser()
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a",
                      "--rawreadfile",
                      action="store",
                      type="string",
                      dest="readfile",
                      metavar="<file>",
                      help="raw read file in bed format")
    parser.add_option("-f",
                      "--fragment_size",
                      action="store",
                      type="int",
                      dest="fragment_size",
                      metavar="<int>",
                      help="average size of a fragment after CHIP experiment")
    parser.add_option("-b",
                      "--islandfile",
                      action="store",
                      type="string",
                      dest="islandfile",
                      metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="island read count file")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)
    if Utility.fileExists(opt.readfile):
        SeparateByChrom.separateByChrom(chroms, opt.readfile, '.bed1')
    else:
        print opt.readfile, " not found"
        sys.exit(1)

    total = 0
    library_size = get_total_tag_counts.get_total_tag_counts(opt.readfile)

    scaling_factor = 1000000
    out = open(opt.out_file, 'w')
    for chrom in chroms:
        if chrom in islands.keys():
            island_list = islands[chrom]
            island_readcount_list = [0] * len(island_list)

            if Utility.is_bed_sorted(island_list) == 0:
                island_list.sort(key=operator.attrgetter('start'))

            island_start_list = []
            island_end_list = []
            for item in island_list:
                island_start_list.append(item.start)
                island_end_list.append(item.end)

            read_file = chrom + ".bed1"
            f = open(read_file, 'r')
            for line in f:
                if not re.match("#", line):
                    line = line.strip()
                    sline = line.split()
                    position = tag_position(sline, opt.fragment_size)
                    index = find_readcount_on_islands(island_start_list,
                                                      island_end_list,
                                                      position)
                    if index >= 0:
                        island_readcount_list[index] += 1
                        total += 1
            f.close()

            for index in xrange(len(island_list)):
                item = island_list[index]
                normalized_read_count = island_readcount_list[index] / float(
                    library_size) * scaling_factor
                outline = item.chrom + "\t" + str(item.start) + "\t" + str(
                    item.end) + "\t" + str(
                        island_readcount_list[index]) + "\t" + str(
                            normalized_read_count) + "\n"
                out.write(outline)

    SeparateByChrom.cleanup(chroms, '.bed1')
    out.close()
    print "Total number of reads on islands are: ", total
# Example #29
# 0
def main(argv):
    parser = OptionParser()
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a",
                      "--summary_graph_file1",
                      action="store",
                      type="string",
                      dest="bedfile1",
                      metavar="<file>",
                      help="summary graph file 1 in bed format")
    parser.add_option("-b",
                      "--summary_graph_file2",
                      action="store",
                      type="string",
                      dest="bedfile2",
                      metavar="<file>",
                      help="summary graph file 2 in bed format")
    parser.add_option("-i",
                      "--windows_size",
                      action="store",
                      type="int",
                      dest="window_size",
                      metavar="<int>",
                      help="window size in summary graph file")
    parser.add_option(
        "-d",
        "--data_resolution",
        action="store",
        type="int",
        dest="step",
        metavar="<int>",
        help=
        "distance between data points, must be integer times of window size")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="output file extension")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    if opt.species in species_chroms.keys():
        chroms = species_chroms[opt.species]
        chrom_lengths = species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    SeparateByChrom.separateByChrom(['chr1'], opt.bedfile1, '.bed1')
    SeparateByChrom.separateByChrom(['chr1'], opt.bedfile2, '.bed2')
    generate_all_functions_2('.bed1', '.bed2', ['chr1'], opt.window_size,
                             opt.step, opt.out_file, chrom_lengths)
    SeparateByChrom.cleanup(['chr1'], '.bed1')
    SeparateByChrom.cleanup(['chr1'], '.bed2')
def main(argv):
    parser = OptionParser()
    parser.add_option(
        "-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18", metavar="<str>"
    )
    parser.add_option(
        "-a",
        "--rawreadfile",
        action="store",
        type="string",
        dest="readfile",
        metavar="<file>",
        help="raw read file in bed format",
    )
    parser.add_option(
        "-f",
        "--fragment_size",
        action="store",
        type="int",
        dest="fragment_size",
        metavar="<int>",
        help="average size of a fragment after CHIP experiment",
    )
    parser.add_option(
        "-b",
        "--islandfile",
        action="store",
        type="string",
        dest="islandfile",
        metavar="<file>",
        help="island file in BED format",
    )
    parser.add_option(
        "-o",
        "--outfile",
        action="store",
        type="string",
        dest="out_file",
        metavar="<file>",
        help="island read count file",
    )

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)
    if Utility.fileExists(opt.readfile):
        SeparateByChrom.separateByChrom(chroms, opt.readfile, ".bed1")
    else:
        print opt.readfile, " not found"
        sys.exit(1)

    total = 0
    library_size = get_total_tag_counts.get_total_tag_counts(opt.readfile)

    scaling_factor = 1000000
    out = open(opt.out_file, "w")
    for chrom in chroms:
        if chrom in islands.keys():
            island_list = islands[chrom]
            island_readcount_list = [0] * len(island_list)

            if Utility.is_bed_sorted(island_list) == 0:
                island_list.sort(key=operator.attrgetter("start"))

            island_start_list = []
            island_end_list = []
            for item in island_list:
                island_start_list.append(item.start)
                island_end_list.append(item.end)

            read_file = chrom + ".bed1"
            f = open(read_file, "r")
            for line in f:
                if not re.match("#", line):
                    line = line.strip()
                    sline = line.split()
                    position = tag_position(sline, opt.fragment_size)
                    index = find_readcount_on_islands(island_start_list, island_end_list, position)
                    if index >= 0:
                        island_readcount_list[index] += 1
                        total += 1
            f.close()

            for index in xrange(len(island_list)):
                item = island_list[index]
                normalized_read_count = island_readcount_list[index] / float(library_size) * scaling_factor
                outline = (
                    item.chrom
                    + "\t"
                    + str(item.start)
                    + "\t"
                    + str(item.end)
                    + "\t"
                    + str(island_readcount_list[index])
                    + "\t"
                    + str(normalized_read_count)
                    + "\n"
                )
                out.write(outline)

    SeparateByChrom.cleanup(chroms, ".bed1")
    out.close()
    print "Total number of reads on islands are: ", total
def main(argv):
    parser = OptionParser()
    parser.add_option("-a",
                      "--islandfile1",
                      action="store",
                      type="string",
                      dest="islandfile1",
                      metavar="<file>",
                      help="file 1 with islands info to be compared")
    parser.add_option("-b",
                      "--islandfile2",
                      action="store",
                      type="string",
                      dest="islandfile2",
                      metavar="<file>",
                      help="file 2 with islands info to be compared")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8 or hg18",
                      metavar="<str>")
    parser.add_option(
        "-p",
        "--overlapin1",
        action="store",
        type="string",
        dest="overlapin1",
        metavar="<file>",
        help="file for islands in 1 overlapping with islands in 2")
    parser.add_option(
        "-q",
        "--nonoverlapin1",
        action="store",
        type="string",
        dest="nonoverlapin1",
        help="file for islands in 1 not overlapping with islands in 2 ",
        metavar="<file>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species in species_chroms.keys():
        chroms = species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    total_overlap_number_1 = 0
    total_islands_1 = 0

    SeparateByChrom.separateByChrom(chroms, opt.islandfile1, '.island1')
    SeparateByChrom.separateByChrom(chroms, opt.islandfile2, '.island2')

    for chrom in chroms:
        f = open(chrom + '.1in2', 'w')
        g = open(chrom + '.1notin2', 'w')
        bed_vals_2 = BED.BED(opt.species, chrom + '.island2', "BED3", 0)
        if Utility.fileExists(chrom +
                              '.island1') and len(bed_vals_2[chrom]) > 0:
            islandlist2 = bed_vals_2[chrom]
            if (are_islands_sorted(islandlist2) != 1):
                islandlist2.sort(key=operator.attrgetter('start'))
            (island2_start_list, island2_end_list) = union_islands(islandlist2)
            islands1 = open(chrom + '.island1', 'r')
            for line in islands1:
                if not re.match("#", line):
                    total_islands_1 += 1
                    line = line.strip()
                    sline = line.split()
                    start = int(sline[1])
                    end = int(sline[2])
                    if (region_overlap(start, end, island2_start_list,
                                       island2_end_list) == 1):
                        f.write('\t'.join(sline) + '\n')
                        total_overlap_number_1 += 1
                    else:
                        g.write('\t'.join(sline) + '\n')
        elif Utility.fileExists(chrom + '.island1') and (len(bed_vals_2[chrom])
                                                         == 0):
            islands1 = open(chrom + '.island1', 'r')
            for line in islands1:
                if not re.match("#", line):
                    total_islands_1 += 1
                    line = line.strip()
                    sline = line.split()
                    g.write('\t'.join(sline) + '\n')
        f.close()
        g.close()

    print "total number of island in " + opt.islandfile1 + ":     ", total_islands_1
    print "total number of island in " + opt.overlapin1 + ":     ", total_overlap_number_1

    SeparateByChrom.combineAllGraphFiles(chroms, '.1in2', opt.overlapin1)
    SeparateByChrom.combineAllGraphFiles(chroms, '.1notin2', opt.nonoverlapin1)

    SeparateByChrom.cleanup(chroms, '.1in2')
    SeparateByChrom.cleanup(chroms, '.1notin2')
    SeparateByChrom.cleanup(chroms, '.island1')
    SeparateByChrom.cleanup(chroms, '.island2')
# Example #32
# 0
def main(argv):
    """
    Probability scoring with random background model.

    Reads a summary graph, replaces each window's read count by a score
    (-log of ``poisson(read_count, average)``, capped at 1000 when the
    probability is below 1e-250, and -1 for windows with fewer reads than
    ``background.min_tags_in_window``), keeps the windows with a positive
    score, merges them with ``combine_proximal_islands`` and writes the
    regions returned by ``find_region_above_threshold`` to the output
    island file as tab-separated ``chrom, start, end, value`` lines.

    The chromosome table comes from ``SeparateByChrom.getChromsFromBam``:
    its values are summed to obtain the genome length and its keys are
    handed to ``BED.BED``.  Progress information is written to stderr.
    """
    parser = OptionParser()

    # NOTE(review): some long option strings contain "(bp)" or a trailing
    # space; they are kept exactly as they were because existing command
    # lines may depend on them.
    parser.add_option("-B", "--bam", action="store", type="string", dest="bam", help="Any suitable bam file that can be used to extrcat chroms from header", metavar="<str>")
    parser.add_option("-b", "--summarygraph", action="store",type="string", dest="summarygraph", help="summarygraph", metavar="<file>")
    parser.add_option("-w", "--window_size(bp)", action="store", type="int", dest="window_size", help="window_size(in bps)", metavar="<int>")
    parser.add_option("-g", "--gap_size(bp)", action="store", type="int",  dest="gap", help="gap size (in bps)", metavar="<int>")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>")
    parser.add_option("-e", "--evalue ", action="store", type="float", dest="evalue", help="evalue that determines score threshold for significant islands", metavar="<float>")
    parser.add_option("-f", "--out_island_file", action="store", type="string", dest="out_island_file", help="output island file name", metavar="<file>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    chromsDict = SeparateByChrom.getChromsFromBam(opt.bam)

    sys.stderr.write("Window_size: %s\n" % (opt.window_size))
    sys.stderr.write("Gap size: %s\n" % (opt.gap))
    sys.stderr.write("E value is: %s\n" % (opt.evalue))

    total_read_count = get_total_tag_counts.get_total_tag_counts_bed_graph(opt.summarygraph)
    sys.stderr.write("Total read count: %s\n" % (total_read_count))
    genome_length = sum(chromsDict.values())
    sys.stderr.write("Genome Length: %s\n" % (genome_length))
    # From here on genome_length is the effective (mappable) length.
    genome_length = int(opt.fraction * genome_length)

    # Expected number of reads per window under a uniform background.
    average = float(total_read_count) * opt.window_size / genome_length
    sys.stderr.write("Effective genome Length: %s\n" % (genome_length))
    sys.stderr.write("Window average: %s\n" % (average))

    window_pvalue = 0.20
    bin_size = 0.001
    sys.stderr.write("Window pvalue: %s\n" % (window_pvalue))
    background = Background_island_probscore_statistics.Background_island_probscore_statistics(total_read_count, opt.window_size, opt.gap, window_pvalue, genome_length, bin_size)
    min_tags_in_window = background.min_tags_in_window
    sys.stderr.write("Minimum num of tags in a qualified window: %s\n" % (min_tags_in_window))

    sys.stderr.write("Generate the enriched probscore summary graph and filter the summary graph to get rid of ineligible windows\n")
    # Read in the summary graph file.
    bed_val = BED.BED(chromsDict.keys(), opt.summarygraph, "BED_GRAPH")

    # Turn read counts into scores (only enrichment matters) and keep the
    # windows whose score is positive.
    filtered_bed_val = {}
    for chrom in bed_val.keys():
        windows = bed_val[chrom]
        if len(windows) == 0:
            continue
        kept_windows = []
        filtered_bed_val[chrom] = kept_windows
        for window in windows:
            read_count = window.value
            if read_count < min_tags_in_window:
                score = -1
            else:
                prob = poisson(read_count, average)
                if prob < 1e-250:
                    score = 1000  # outside of the scale, take an arbitrary number.
                else:
                    score = -log(prob)
            # The window object is updated in place, so the kept windows
            # carry the score rather than the read count.
            window.value = score
            if score > 0:
                kept_windows.append(window)

    sys.stderr.write("Determine the score threshold from random background\n")
    score_threshold = background.find_island_threshold(opt.evalue)
    sys.stderr.write("The score threshold is: %s\n" % (score_threshold))

    sys.stderr.write("Make and write islands\n")
    total_number_islands = 0
    # "with" closes the island file even if island construction fails.
    with open(opt.out_island_file, 'w') as outputfile:
        for chrom in filtered_bed_val.keys():
            if len(filtered_bed_val[chrom]) == 0:
                continue
            islands = combine_proximal_islands(filtered_bed_val[chrom], opt.gap, 2)
            islands = find_region_above_threshold(islands, score_threshold)
            total_number_islands += len(islands)
            if len(islands) > 0:
                for island in islands:
                    outputfile.write(chrom + "\t" + str(island.start) + "\t" + str(island.end) + "\t" + str(island.value) + "\n")
            else:
                sys.stderr.write("\t" + chrom + " does not have any islands meeting the required significance\n")
    sys.stderr.write("Total number of islands: %s\n" % (total_number_islands))
# Example #33
# 0
def main(argv):
    parser = OptionParser()
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a",
                      "--summarygraphfile",
                      action="store",
                      type="string",
                      dest="bedfile",
                      metavar="<file>",
                      help="summary graph file")
    parser.add_option("-b",
                      "--islandfile",
                      action="store",
                      type="string",
                      dest="islandbedfile",
                      metavar="<file>",
                      help="island file")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      metavar="<file>",
                      help="filtered summary graph file")
    parser.add_option("-w",
                      "--window_size",
                      action="store",
                      type="int",
                      dest="window_size",
                      help="window size of summary graph",
                      metavar="<int>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species in species_chroms.keys():
        chroms = species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    summary_graph_extension = ".summarygraph"
    island_extension = ".islands"
    SeparateByChrom.separateByChrom(chroms, opt.bedfile,
                                    summary_graph_extension)
    SeparateByChrom.separateByChrom(chroms, opt.islandbedfile,
                                    island_extension)

    f = open(opt.out_file, 'w')
    for chrom in chroms:
        if Utility.fileExists(chrom +
                              summary_graph_extension) and Utility.fileExists(
                                  chrom + island_extension):
            summary = BED.BED(opt.species, chrom + summary_graph_extension,
                              "BED_GRAPH", 0)
            islands = BED.BED(opt.species, chrom + island_extension,
                              "BED_GRAPH", 0)
            result = filter_out_uncovered_windows(islands[chrom],
                                                  summary[chrom],
                                                  opt.window_size)
            for item in result:
                f.write(item.chrom + '\t' + str(item.start) + '\t' +
                        str(item.end) + '\t' + str(item.value) + '\n')
    f.close()

    SeparateByChrom.cleanup(chroms, summary_graph_extension)
    SeparateByChrom.cleanup(chroms, island_extension)
def main(argv):
	parser = OptionParser()
	parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18, etc", metavar="<str>")
	parser.add_option("-a", "--rawreadfileA", action="store", type="string", dest="readfileA", metavar="<file>", help="raw read file A in bed format")
	parser.add_option("-b", "--rawreadfileB", action="store", type="string", dest="readfileB", metavar="<file>", help="raw read file B in bed format")
	parser.add_option("-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>", help="average size of a fragment after A experiment")
	parser.add_option("-d", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>", help="island file in BED format")
	parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="island read count summary file")
	
	(opt, args) = parser.parse_args(argv)
	if len(argv) < 12:
        	parser.print_help()
        	sys.exit(1)
		
	if opt.species in GenomeData.species_chroms.keys():
		chroms = GenomeData.species_chroms[opt.species];
	else:
		print "This species is not recognized, exiting";
		sys.exit(1);
	
	if not Utility.fileExists(opt.readfileA):
		print opt.readfileA, " not found";
		sys.exit(1)
	if not Utility.fileExists(opt.readfileB):
		print opt.readfileB, " not found";
		sys.exit(1)	
	
	A_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileA);
	B_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileB);
	print "Library size of ", opt.readfileA, ":  ", A_library_size
	print "Library size of ", opt.readfileB, ":  ", B_library_size
	
	totalA = 0;
	totalB = 0;
	
	islands = BED.BED(opt.species, opt.islandfile, "BED3", 0);
	
	# separate by chrom the A library
	SeparateByChrom.separateByChrom(chroms, opt.readfileA, '.bed1');
	# separate by chrom the B library
	SeparateByChrom.separateByChrom(chroms, opt.readfileB, '.bed2');
	
	
	island_A_readcount = {};
	island_B_readcount = {};
	
	#Find read counts on the islands
	for chrom in chroms:
		if chrom in islands.keys():
			if len(islands[chrom]) != 0:
				island_list = islands[chrom];
				if Utility.is_bed_sorted(island_list) == 0:
					island_list.sort(key=operator.attrgetter('start'));
					
				island_start_list = []
				island_end_list = []
				for item in island_list:
					island_start_list.append(item.start)
					island_end_list.append(item.end)
	
				island_A_readcount_list=[0]*len(island_list);
				read_file = chrom + ".bed1";
				f = open(read_file,'r')
				for line in f:
					if not re.match("#", line):
						line = line.strip()
						sline = line.split()
						position = associate_tags_with_regions.tag_position(sline, opt.fragment_size)
						index =associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position);
						if index >= 0:
							island_A_readcount_list[index] += 1;
							totalA += 1;
				f.close();
				island_A_readcount[chrom] = island_A_readcount_list;
							
				island_B_readcount_list=[0]*len(island_list);
				read_file = chrom + ".bed2";
				f = open(read_file,'r')
				for line in f:
					if not re.match("#", line):
						line = line.strip()
						sline = line.split()
						position = associate_tags_with_regions.tag_position(sline, opt.fragment_size)
						index = associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position);
						if index >= 0:
							island_B_readcount_list[index] += 1;
							totalB += 1;
				f.close();		
				island_B_readcount[chrom] = island_B_readcount_list;			
						
	#A_background_read = A_library_size - totalA;
	#B_background_read = B_library_size - totalB;
	
	print "Total number of A reads on islands is: ", totalA; 
	print "Total number of B reads on islands is: ", totalB; 

	# Calculate the p value.
	library_scaling_factor = A_library_size*1.0/B_library_size; #A vs B
	pseudo_count = 1; 
	pvalue_A_vs_B_list = [];
	pvalue_B_vs_A_list = [];
	for chrom in chroms:
		if chrom in islands.keys():
			if len(islands[chrom]) != 0:
				island_list = islands[chrom];
				for index in xrange(len(island_list)):
					item = island_list[index];
					Acount = (island_A_readcount[chrom])[index]; 
					Bcount = (island_B_readcount[chrom])[index];
					pvalue_A_vs_B = pvaule (Acount, Bcount, library_scaling_factor, pseudo_count);
					pvalue_A_vs_B_list.append(pvalue_A_vs_B);
					pvalue_B_vs_A = pvaule (Bcount, Acount, 1/library_scaling_factor, pseudo_count);
					pvalue_B_vs_A_list.append(pvalue_B_vs_A);
	#Calculate the FDR
	fdr_A_vs_B_list = fdr(pvalue_A_vs_B_list);
	fdr_B_vs_A_list = fdr(pvalue_B_vs_A_list);


	#Output the islands read counts, normalized read counts, fc, pvalue both ways
	scaling_factor = 1000000; 
	out = open(opt.out_file, 'w');
	outline = '#chrom' + "\t" + 'start' + "\t" + 'end' + "\t" + "Readcount_A" + "\t" + 'Normalized_Readcount_A' + "\t" + 'ReadcountB' + "\t" + 'Normalized_Readcount_B' + "\t" + "Fc_A_vs_B" + "\t" + "pvalue_A_vs_B" + "\t" + "FDR_A_vs_B" + "\t" + "Fc_B_vs_A" + "\t" + "pvalue_B_vs_A" + "\t" + "FDR_B_vs_A"  + "\n"; 	
	out.write(outline);
	ii=0;
	for chrom in chroms:
		if chrom in islands.keys():
			if len(islands[chrom]) != 0:
				island_list = islands[chrom];
				for index in xrange(len(island_list)):
					item = island_list[index];
					Acount = (island_A_readcount[chrom])[index]; 
					Bcount = (island_B_readcount[chrom])[index];
					normalized_A = Acount/ float(A_library_size) * scaling_factor;
					normalized_B = Bcount/ float(B_library_size) * scaling_factor;
					fc_A_vs_B = ((Acount + pseudo_count)*1.0/(Bcount + pseudo_count))/library_scaling_factor;
					fc_B_vs_A = ((Bcount + pseudo_count)*1.0/(Acount + pseudo_count)) * library_scaling_factor;
					outline = item.chrom + "\t" + str(item.start) + "\t" + str(item.end) + "\t" + str(Acount) + "\t"  +  str(normalized_A) + "\t"  +  str(Bcount) + "\t" + str(normalized_B) + "\t" +  str(fc_A_vs_B) + "\t" + str(pvalue_A_vs_B_list[ii]) + "\t" + str(fdr_A_vs_B_list[ii]) + "\t" + str(fc_B_vs_A) + "\t" + str(pvalue_B_vs_A_list[ii]) + "\t" + str(fdr_B_vs_A_list[ii]) + "\n";	
					out.write(outline);
					ii += 1;		
	out.close();

	SeparateByChrom.cleanup(chroms, '.bed1');
	SeparateByChrom.cleanup(chroms, '.bed2');


	# Calculate the correlations using normalized read counts
	A_array=();
	B_array=();
	for chrom in chroms:
		if chrom in islands.keys():
			if len(islands[chrom]) != 0:
				temp_array= scipy.array(island_A_readcount[chrom]);
				A_array=scipy.concatenate((temp_array, A_array));
				temp_array= scipy.array(island_B_readcount[chrom]);
				B_array=scipy.concatenate((temp_array, B_array));
	#Normalization to reads per million
	A_array = A_array/float(A_library_size) * scaling_factor;
	B_array = B_array/float(B_library_size) * scaling_factor;
	pearson=scipy.stats.pearsonr(A_array, B_array);
	print "Pearson's correlation is: ", pearson[0], " with p-value ",  pearson[1];
	spearman = scipy.stats.spearmanr(A_array, B_array);
	print "Spearman's correlation is: ", spearman[0], " with p-value ",  spearman[1];
# Example #35
# 0
def main(argv):
    desc = """This is a template for the analysis of aggretated tag distribution with respect to a set of points, such as the TSSs of known genes, with one profile from each strand."""
    parser = OptionParser(description=desc)
    parser.add_option("-k",
                      "--known_genes_file",
                      action="store",
                      type="string",
                      dest="known_file",
                      help="file with known genes",
                      metavar="<file>")
    parser.add_option("-b",
                      "--bedfile",
                      action="store",
                      type="string",
                      dest="bedfile",
                      help="file with tags in bed format",
                      metavar="<file>")
    parser.add_option("-c",
                      "--TypeOfSites",
                      action="store",
                      type="string",
                      dest="type",
                      help="TSS, TES, TFBS",
                      metavar="<str>")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      help="outfile name",
                      metavar="<file>")
    parser.add_option(
        "-n",
        "--normalization",
        action="store",
        type="float",
        dest="norm",
        help=
        "additional normalization in addition to number of sites, number of reads per million and window_size per 1K"
    )
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species",
                      metavar="<str>")
    parser.add_option("-u",
                      "--UpstreamExtension",
                      action="store",
                      type="int",
                      dest="upstreamExtension",
                      help="UpstreamExtension",
                      metavar="<int>")
    parser.add_option("-d",
                      "--DownstreamExtension",
                      action="store",
                      type="int",
                      dest="downstreamExtension",
                      help="DownstreamExtension",
                      metavar="<int>")
    parser.add_option("-r",
                      "--resolution",
                      action="store",
                      type="int",
                      dest="resolution",
                      help="resolution of the profile, eg, 5",
                      metavar="<int>")
    parser.add_option(
        "-w",
        "--WindowSize",
        action="store",
        type="int",
        dest="window_size",
        help=
        "window size for averaging. When window size > resolution, there is smoothing",
        metavar="<int>")
    (opt, args) = parser.parse_args(argv)
    if len(argv) < 20:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    #t0 = time.time()
    libName = (opt.bedfile).split('/')[-1]
    libName = libName.split('.')[0]
    extension = "-" + libName + '.bed1'
    SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension)
    num_genes = 0
    num_tags = 0
    profiles = {}
    numPoints = float(opt.upstreamExtension + opt.downstreamExtension) / float(
        opt.resolution)
    print "Upstream extension: ", opt.upstreamExtension
    print "Downstream extension: ", opt.downstreamExtension
    print "Resolution:", opt.resolution
    print "Scanning window size: ", opt.window_size
    print "Number of Points", numPoints
    plus_score_profile = [0] * int(numPoints)
    minus_score_profile = [0] * int(numPoints)

    if (opt.type == "TSS"):
        coords = UCSC.KnownGenes(opt.known_file)
        for chrom in chroms:
            mycoords = {}
            chrombed = chrom + extension
            if Utility.fileExists(chrombed):
                bed_vals = {}
                bed_vals = BED.BED(opt.species, chrombed, "BED2")
                num_tags += bed_vals.getNumVals()
                if (chrom in coords.keys()):
                    if (len(coords[chrom]) > 0):
                        num_genes += len(coords[chrom])
                        mycoords[chrom] = coords[chrom]
                        profiles[chrom] = getTSSProfile(
                            mycoords, opt.upstreamExtension,
                            opt.downstreamExtension, opt.resolution,
                            opt.window_size, 75, 75, bed_vals)
    elif (opt.type == "TES"):
        coords = UCSC.KnownGenes(opt.known_file)
        for chrom in chroms:
            mycoords = {}
            chrombed = chrom + extension
            if Utility.fileExists(chrombed):
                bed_vals = {}
                bed_vals = BED.BED(opt.species, chrombed, "BED2")
                num_tags += bed_vals.getNumVals()
                if (chrom in coords.keys()):
                    if (len(coords[chrom]) > 0):
                        num_genes += len(coords[chrom])
                        mycoords[chrom] = coords[chrom]
                        profiles[chrom] = getTESProfile(
                            mycoords, opt.upstreamExtension,
                            opt.downstreamExtension, opt.resolution,
                            opt.window_size, 75, 75, bed_vals)
    elif (opt.type == "TFBS"):
        # Build coords
        # Here we are assuming that the file has the format chrom location + .....for each line
        # chrom is sline[0], location is sline[1]
        coords = {}
        if (opt.known_file):
            infile = open(opt.known_file, 'r')
            for line in infile:
                """ check to make sure not a header line """
                if not re.match("track", line):
                    line = line.strip()
                    sline = line.split()
                    if sline[0] not in coords.keys():
                        coords[sline[0]] = []
                    coords[sline[0]].append(atoi(sline[1]))
            infile.close()

        for chrom in chroms:
            mycoords = {}
            chrombed = chrom + extension
            if Utility.fileExists(chrombed):
                bed_vals = {}
                bed_vals = BED.BED(opt.species, chrombed, "BED2")
                num_tags += bed_vals.getNumVals()
                if chrom in coords.keys():
                    if len(coords[chrom]) > 0:
                        num_genes += len(coords[chrom])
                        mycoords[chrom] = coords[chrom]
                        profiles[chrom] = getTFBSProfile(
                            mycoords, opt.upstreamExtension,
                            opt.downstreamExtension, opt.resolution,
                            opt.window_size, 75, 75, bed_vals)
    else:
        print "Only three types of locations are allowed: TSS, TES, TFBS"
        sys.exit(1)

    for chrom in profiles.keys():
        (plus_scores, minus_scores) = profiles[chrom]
        assert (int(numPoints) == len(plus_scores))
        assert (int(numPoints) == len(minus_scores))
        for i in xrange(int(numPoints)):
            plus_score_profile[i] += plus_scores[i]
            minus_score_profile[i] += minus_scores[i]

    SeparateByChrom.cleanup(chroms, extension)
    normalization = num_tags / 1000000.0
    normalization *= num_genes
    normalization *= opt.window_size / 1000.0
    normalization *= opt.norm
    print "Number of locations: ", num_genes
    print "Number of reads: ", num_tags
    print "Normalization is by total number of reads per million. normalization = ", normalization

    xValues = output(opt.upstreamExtension, opt.resolution, plus_score_profile,
                     minus_score_profile, normalization, opt.outfile)
示例#36
0
def main(argv):
	parser = OptionParser()
	parser.add_option("-a", "--islandfile1", action="store", type="string", dest="islandfile1", metavar="<file>", help="file 1 with islands info to be unioned")
	parser.add_option("-b", "--islandfile2", action="store", type="string", dest="islandfile2", metavar="<file>", help="file 2 with islands info to be unioned; if no, type in any word")
	parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8 or hg18", metavar="<str>")
	parser.add_option("-o", "--outputfile", action="store", type="string", dest="outfile", metavar="<file>", help="output file name")

	(opt, args) = parser.parse_args(argv)
	if len(argv) < 8:
        	parser.print_help()
        	sys.exit(1)
	
	if opt.species in species_chroms.keys():
		chroms = species_chroms[opt.species];
	else:
		print "This species is not recognized, exiting";
		sys.exit(1);
	
	
	SeparateByChrom.separateByChrom(chroms, opt.islandfile1, '.island1')
	
	if Utility.fileExists(opt.islandfile2):
		SeparateByChrom.separateByChrom(chroms, opt.islandfile2, '.island2')
		
		for chrom in chroms: 
			f = open(chrom + '.output', 'w')
			bed_vals_1 = BED.BED(opt.species, chrom+'.island1', "BED3", 0)
			bed_vals_2 = BED.BED(opt.species, chrom+'.island2', "BED3", 0)
			if len(bed_vals_1[chrom]) > 0 or len(bed_vals_2[chrom]) > 0:
				islandlist = bed_vals_1[chrom] + bed_vals_2[chrom];
				union_islands_to_file(islandlist, f)
			f.close()
		SeparateByChrom.cleanup(chroms, '.island2')
	else:
		for chrom in chroms: 
			f = open(chrom + '.output', 'w')
			bed_vals_1 = BED.BED(opt.species, chrom+'.island1', "BED3", 0)
			if len(bed_vals_1[chrom]) > 0:
				islandlist = bed_vals_1[chrom]
				union_islands_to_file(islandlist, f)
			f.close()

	SeparateByChrom.combineAllGraphFiles(chroms, '.output', opt.outfile);
	
	SeparateByChrom.cleanup(chroms, '.output')
	SeparateByChrom.cleanup(chroms, '.island1')
def main(argv):
    parser = OptionParser()
    parser.add_option("-k",
                      "--known_genes_file",
                      action="store",
                      type="string",
                      dest="known_file",
                      help="file with known genes",
                      metavar="<file>")
    parser.add_option("-b",
                      "--bedfile",
                      action="store",
                      type="string",
                      dest="bedfile",
                      help="file with tags in bed format",
                      metavar="<file>")
    parser.add_option("-c",
                      "--TypeOfSites",
                      action="store",
                      type="string",
                      dest="type",
                      help="TSS, TES, TFBS",
                      metavar="<str>")
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      help="outfile name",
                      metavar="<file>")
    parser.add_option(
        "-n",
        "--normalization",
        action="store",
        type="float",
        dest="norm",
        help=
        "additional normalization in addition to number of reads per million and window_size per 1K"
    )
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species",
                      metavar="<str>")
    parser.add_option("-u",
                      "--UpstreamExtension",
                      action="store",
                      type="int",
                      dest="upstreamExtension",
                      help="UpstreamExtension",
                      metavar="<int>")
    parser.add_option("-d",
                      "--DownstreamExtension",
                      action="store",
                      type="int",
                      dest="downstreamExtension",
                      help="DownstreamExtension",
                      metavar="<int>")
    parser.add_option("-r",
                      "--resolution",
                      action="store",
                      type="int",
                      dest="resolution",
                      help="resolution of the profile, eg, 5",
                      metavar="<int>")
    parser.add_option(
        "-w",
        "--WindowSize",
        action="store",
        type="int",
        dest="window_size",
        help=
        "window size for averaging. When window size > resolution, there is smoothing",
        metavar="<int>")
    parser.add_option("-p",
                      "--plusReadShift",
                      action="store",
                      type="int",
                      dest="pshift",
                      help="plusReadShift",
                      metavar="<int>")
    parser.add_option("-m",
                      "--minusReadShift",
                      action="store",
                      type="int",
                      dest="mshift",
                      help="minusReadShift",
                      metavar="<int>")
    (opt, args) = parser.parse_args(argv)
    if len(argv) < 24:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    #t0 = time.time()
    libName = (opt.bedfile).split('/')[-1]
    libName = libName.split('.')[0]
    extension = "-" + libName + '.bed1'
    SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension)

    num_genes = 0
    num_tags = 0
    numPoints = float(opt.upstreamExtension + opt.downstreamExtension) / float(
        opt.resolution)
    print "Upstream extension: ", opt.upstreamExtension
    print "Downstream extension: ", opt.downstreamExtension
    print "Resolution:", opt.resolution
    print "Scanning window size: ", opt.window_size
    print "Number of Points", numPoints

    all_genes_scores = {}  #{name:[]}

    if (opt.type == "TSS"):
        coords = UCSC.KnownGenes(opt.known_file)
        for chrom in chroms:
            chrombed = chrom + extension
            if Utility.fileExists(chrombed):
                scoredic = {}
                mycoords = {}
                bed_vals = {}
                bed_vals = BED.BED(opt.species, chrombed, "BED2")
                num_tags += bed_vals.getNumVals()
                if (chrom in coords.keys()) and (len(coords[chrom]) > 0):
                    num_genes += len(coords[chrom])
                    mycoords[chrom] = coords[chrom]
                    scoredic = getTSSPMProfileMatrix(
                        mycoords, opt.upstreamExtension,
                        opt.downstreamExtension, opt.resolution,
                        opt.window_size, opt.pshift, opt.mshift, bed_vals)
                    all_genes_scores.update(scoredic)
                    #print annotations
                    #print scoreMatrix
                    #print score_profiles
    elif (opt.type == "TES"):
        coords = UCSC.KnownGenes(opt.known_file)
        for chrom in chroms:
            chrombed = chrom + extension
            if Utility.fileExists(chrombed):
                scoredic = {}
                mycoords = {}
                bed_vals = {}
                bed_vals = BED.BED(opt.species, chrombed, "BED2")
                num_tags += bed_vals.getNumVals()
                if (chrom in coords.keys()) and (len(coords[chrom]) > 0):
                    num_genes += len(coords[chrom])
                    mycoords[chrom] = coords[chrom]
                    scoredic = getTESPMProfileMatrix(
                        mycoords, opt.upstreamExtension,
                        opt.downstreamExtension, opt.resolution,
                        opt.window_size, opt.pshift, opt.mshift, bed_vals)
                    all_genes_scores.update(scoredic)
    elif (opt.type == "TFBS"):
        # Build coords
        # Here we are assuming that the file has the format chrom location + .....for each line
        # chrom is sline[0], location is sline[1]
        coords = {}
        if (opt.known_file):
            infile = open(opt.known_file, 'r')
            for line in infile:
                """ check to make sure not a header line """
                if not re.match("track", line):
                    line = line.strip()
                    sline = line.split()
                    if sline[0] not in coords.keys():
                        coords[sline[0]] = []
                    coords[sline[0]].append(atoi(sline[1]))
            infile.close()

        for chrom in chroms:
            chrombed = chrom + extension
            if Utility.fileExists(chrombed):
                scoredic = {}
                mycoords = {}
                bed_vals = {}
                bed_vals = BED.BED(opt.species, chrombed, "BED2")
                num_tags += bed_vals.getNumVals()
                if chrom in coords.keys() and len(coords[chrom]) > 0:
                    num_genes += len(coords[chrom])
                    mycoords[chrom] = coords[chrom]
                    scoredic = getTFBSPMProfileMatrix(
                        mycoords, opt.upstreamExtension,
                        opt.downstreamExtension, opt.resolution,
                        opt.window_size, opt.pshift, opt.mshift, bed_vals)
                    all_genes_scores.update(scoredic)
    else:
        print "Only three types of locations are allowed: TSS, TES, TFBS"
        sys.exit(1)

    SeparateByChrom.cleanup(chroms, extension)

    normalization = num_tags / 1000000.0
    normalization *= opt.window_size / 1000.0
    normalization *= opt.norm

    outFile = open(opt.outfile, 'w')
    # export the normalized result to a file.
    for mykey in all_genes_scores.keys():
        outline = str(mykey) + "\t" + "\t".join(
            [str(item / normalization)
             for item in all_genes_scores[mykey]]) + '\n'
        outFile.write(outline)
    outFile.close()

    print "Number of locations: ", num_genes
    print "Number of reads: ", num_tags
    print "normalization = ", normalization

    # Testing
    overall_profile = [0] * int(numPoints)
    for mykey in all_genes_scores.keys():
        assert (len(all_genes_scores[mykey]) == int(numPoints))
        for j in xrange(int(numPoints)):
            overall_profile[j] += (all_genes_scores[mykey])[j] / normalization
    overall_profile = [item / float(num_genes) for item in overall_profile]
    #for item in overall_profile:
    #	print item;
    pylab.clf()
    pylab.plot(overall_profile, "b")
    pylab.savefig("Overall_profile.png", format='png')
def main(argv):
	parser = OptionParser()
	
	parser.add_option("-b", "--bedfile", action="store", type="string", dest="bedfile", metavar="<file>", help="island bed file")
	parser.add_option("-t", "--RE_tree_pickle_file", action="store", type="string", dest="RE_Tree", metavar="<file>", help="file with RE tree in pickle format")
	parser.add_option("-l", "--RE_annotation_file_location", action="store", type="string", dest="RE_file_location", metavar="<file>", help="location of RE files named in repClass_repFamily_repName.txt")
	parser.add_option("-u", "--upstream_extension", action="store", type="int", dest="upstream_extension", help="upstream extension from start", metavar="<int>")
	parser.add_option("-d", "--downstream_extension", action="store", type="int", dest="downstream_extension", help="downstream extension from end",  metavar="<int>")
	parser.add_option("-s", "--species", action="store", type="string", dest="species",help="species, mm8, hg18, etc", metavar="<str>")
	
	(opt, args) = parser.parse_args(argv)
	if len(argv) < 12:
		parser.print_help()
		sys.exit(1)
	
	if opt.species in GenomeData.species_chroms.keys():
		chroms = GenomeData.species_chroms[opt.species]
		chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
	else:
		print "This species is not recognized, exiting";
		sys.exit(1);
	
	#Separate_by_chrom on bedfile
	lib_name = (opt.bedfile).split('/')[-1] # remove directory
	suffix = lib_name.split('.')[-1] # txt
	lib_name = lib_name.split('.')[0] 
	extension = "-" + lib_name +'.' + suffix +"1"
	if Utility_extended.fileExists(opt.bedfile):
		if Utility_extended.chrom_files_exist(chroms, extension) != 1:
			SeparateByChrom.separateByChrom(chroms, opt.bedfile,  extension)
	else:
		print bedfile, " is not found";
		sys.exit(1)
	
	
	
	print "\nLoad the RE tree to get the RE file names"
	re_tree = pickle.load(open(opt.RE_Tree, 'rb'))
	(numb_classes, numb_families, numb_names) = get_read_count_on_REs.numbers(re_tree)
	print "There are %d classes, %d family, and %d names." %(numb_classes, numb_families, numb_names)
	
	total_num_islands = 0
	total_num_RE_islands = 0
	
	#cycle through chrom
	for chrom in chroms:
		# Get the islands
		island_list = []
		print chrom
		chrom_length = chrom_lengths[chrom]
		chrombed = chrom + extension
		if Utility_extended.fileExists(chrombed):		
			# load in each island
			inf = open(chrombed,'r')
			for line in inf:
				if not re.match("#", line):
					line = line.strip()
					sline = line.split()
					start = int(sline[1])
					end = int(sline[2])
					island_list.append( (start, end) )
			inf.close()	
			if Utility_extended.is_tuplelist_sorted(island_list, 0) != 1:
				island_list.sort(key = itemgetter[0]) # sort by start, assume non-overlapping
		else:
			print "%s can not be found" %chrombed
			
		island_flags = [0 for island in island_list]
		
		min_re_length = 10
		for reClass in re_tree.keys():
			for reFamily in re_tree[reClass].keys():
				for reName in re_tree[reClass][reFamily]:
					re_file_name = "_".join([reClass, reFamily, reName]) + ".txt"
					#print re_file_name
					this_island_flags = assign_islands_to_REs(opt.RE_file_location, re_file_name, chrom, chrom_length, island_list,  opt.upstream_extension, opt.downstream_extension, min_re_length) 
					#Collect the results into island_flags
					for i in xrange(len(this_island_flags)):
						if this_island_flags[i] == 1:
							island_flags[i] = 1
	
		print "There are %d island on %s" %(len(island_list),chrom)
		print "There are %d RE islands" %(sum(island_flags))
		total_num_islands += len(island_list)
		total_num_RE_islands += sum(island_flags)
	
	SeparateByChrom.cleanup(chroms, extension)
	print "There are %d islands" %(total_num_islands)
	print "There are %d RE islands" %(total_num_RE_islands)
def main(argv):
	parser = OptionParser()
	parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18", metavar="<str>")
	parser.add_option("-a", "--rawchipreadfile", action="store", type="string", dest="chipreadfile", metavar="<file>", help="raw read file from chip in bed format")
	parser.add_option("-b", "--rawcontrolreadfile", action="store", type="string", dest="controlreadfile", metavar="<file>", help="raw read file from control in BAM format")
	parser.add_option("-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>", help="average size of a fragment after CHIP experiment")
	parser.add_option("-d", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>", help="island file in BED format")
	parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="island read count summary file")
	parser.add_option("-t", "--mappable_fraction_of_genome_size ", action="store", type="float", dest="fraction", help="mapable fraction of genome size", metavar="<float>")
	
	(opt, args) = parser.parse_args(argv)
	if len(argv) < 14:
        	parser.print_help()
        	sys.exit(1)
		
	if opt.species in GenomeData.species_chroms.keys():
		chroms = GenomeData.species_chroms[opt.species];
		genomesize = sum (GenomeData.species_chrom_lengths[opt.species].values());
		genomesize = opt.fraction * genomesize;
	else:
		print "This species is not recognized, exiting";
		sys.exit(1);
	
	chip_library_size=get_total_tag_counts.get_total_tag_counts_bam(opt.chipreadfile);
	control_library_size=get_total_tag_counts.get_total_tag_counts_bam(opt.controlreadfile);
	print "chip library size  ", chip_library_size
	print "control library size  ", control_library_size
	
	totalchip = 0;
	totalcontrol = 0;
	
	islands = BED.BED(opt.species, opt.islandfile, "BED3", 0);
	
	# separate by chrom the chip library
	if Utility.fileExists(opt.chipreadfile):
		SeparateByChrom.separateByChromBamToBed(chroms, opt.chipreadfile, '.bed1');
	else:
		print opt.chipreadfile, " not found";
		sys.exit(1)
	# separate by chrom the control library
	if Utility.fileExists(opt.controlreadfile):
		SeparateByChrom.separateByChromBamToBed(chroms, opt.controlreadfile, '.bed2');
	else:
		print opt.controlreadfile, " not found";
		sys.exit(1)	
	
	island_chip_readcount = {};
	island_control_readcount = {};
	
	for chrom in chroms:
		if chrom in islands.keys():
			if len(islands[chrom]) != 0:
				island_list = islands[chrom];
				if Utility.is_bed_sorted(island_list) == 0:
					island_list.sort(key=operator.attrgetter('start'));
					
				island_start_list = []
				island_end_list = []
				for item in island_list:
					island_start_list.append(item.start)
					island_end_list.append(item.end)
	
				island_chip_readcount_list=[0]*len(island_list);
				read_file = chrom + ".bed1";
				f = open(read_file,'r')
				for line in f:
					if not re.match("#", line):
						line = line.strip()
						sline = line.split()
						position = associate_tags_with_regions.tag_position(sline, opt.fragment_size)
						index =associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position);
						if index >= 0:
							island_chip_readcount_list[index] += 1;
							totalchip += 1;
				f.close();
				island_chip_readcount[chrom] = island_chip_readcount_list;
							
				island_control_readcount_list=[0]*len(island_list);
				read_file = chrom + ".bed2";
				f = open(read_file,'r')
				for line in f:
					if not re.match("#", line):
						line = line.strip()
						sline = line.split()
						position = associate_tags_with_regions.tag_position(sline, opt.fragment_size)
						index = associate_tags_with_regions.find_readcount_on_islands(island_start_list, island_end_list, position);
						if index >= 0:
							island_control_readcount_list[index] += 1;
							totalcontrol += 1;
				f.close();
							
				island_control_readcount[chrom] = island_control_readcount_list;			
						
	chip_background_read = chip_library_size - totalchip;
	control_background_read = control_library_size - totalcontrol;
	#scaling_factor = chip_background_read*1.0/control_background_read;
	scaling_factor = chip_library_size*1.0/control_library_size;
	
	
	print "Total number of chip reads on islands is: ", totalchip; 
	print "Total number of control reads on islands is: ", totalcontrol; 

	#print "chip_background_read   ", chip_background_read
	#print "control_background_read   ", control_background_read

	out = open(opt.out_file, 'w');
	pvalue_list = [];
	result_list = [];
	for chrom in chroms:
		if chrom in islands.keys():
			if len(islands[chrom]) != 0:
				island_list = islands[chrom];
				for index in xrange(len(island_list)):
					item = island_list[index];
					observation = (island_chip_readcount[chrom])[index];
					control_tag = (island_control_readcount[chrom])[index];
					if (island_control_readcount[chrom])[index] > 0:
						#average = (island_control_readcount[chrom])[index] * scaling_factor;
						average = control_tag * scaling_factor
						fc = float(observation)/float(average);
					else:
						length = item.end - item.start + 1;
						average = length * control_library_size *1.0/genomesize;			
						average = min(0.25, average)* scaling_factor;
						fc = float(observation)/float(average);
					if observation > average:
						pvalue = scipy.stats.poisson.sf((island_chip_readcount[chrom])[index], average)[()]; 
					else:
						pvalue = 1;
					pvalue_list.append(pvalue);
					item_dic = {}
					item_dic['chrom'] = item.chrom
					item_dic['start'] = item.start
					item_dic['end'] = item.end
					item_dic['chip'] = observation
					item_dic['control'] = control_tag
					item_dic['pvalue'] = pvalue
					item_dic['fc'] = fc
					result_list.append(item_dic)
	
	pvaluearray=scipy.array(pvalue_list);
	pvaluerankarray=scipy.stats.rankdata(pvaluearray);
	totalnumber = len(result_list);
	for i in range(totalnumber):
		item = result_list[i];
		alpha = pvalue_list[i] * totalnumber/pvaluerankarray[i];
		if alpha > 1:
			alpha = 1;
		outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(item['end']) + "\t" + str(item['chip']) + "\t" + str(item['control']) + "\t" + str(item['pvalue']) + "\t" + str(item['fc']) + "\t" + str(alpha) + "\n";	
		out.write(outline);
					
	#pvalue_list.sort()
	#for item in result_list:
		#pvalue = float(item['pvalue'])
		#alpha = pvalue * len(result_list) / (pvalue_list.index(pvalue) + 1)
		#if alpha > 1:
			#alpha = 1;
		#outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(item['end']) + "\t" + str(item['chip']) + "\t" + str(item['control']) + "\t" + str(item['pvalue']) + "\t" + str(item['fc']) + "\t" + str(alpha) + "\n";	
		#out.write(outline);		
	out.close();
	
	
	SeparateByChrom.cleanup(chroms, '.bed1');
	SeparateByChrom.cleanup(chroms, '.bed2');
示例#40
0
def main(argv):
    parser = OptionParser()
    parser.add_option("-a",
                      "--islandfile1",
                      action="store",
                      type="string",
                      dest="islandfile1",
                      metavar="<file>",
                      help="file 1 with islands info to be unioned")
    parser.add_option(
        "-b",
        "--islandfile2",
        action="store",
        type="string",
        dest="islandfile2",
        metavar="<file>",
        help="file 2 with islands info to be unioned; if no, type in any word")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8 or hg18",
                      metavar="<str>")
    parser.add_option("-o",
                      "--outputfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      metavar="<file>",
                      help="output file name")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    if opt.species in species_chroms.keys():
        chroms = species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    SeparateByChrom.separateByChrom(chroms, opt.islandfile1, '.island1')

    if Utility.fileExists(opt.islandfile2):
        SeparateByChrom.separateByChrom(chroms, opt.islandfile2, '.island2')

        for chrom in chroms:
            f = open(chrom + '.output', 'w')
            bed_vals_1 = BED.BED(opt.species, chrom + '.island1', "BED3", 0)
            bed_vals_2 = BED.BED(opt.species, chrom + '.island2', "BED3", 0)
            if len(bed_vals_1[chrom]) > 0 or len(bed_vals_2[chrom]) > 0:
                islandlist = bed_vals_1[chrom] + bed_vals_2[chrom]
                union_islands_to_file(islandlist, f)
            f.close()
        SeparateByChrom.cleanup(chroms, '.island2')
    else:
        for chrom in chroms:
            f = open(chrom + '.output', 'w')
            bed_vals_1 = BED.BED(opt.species, chrom + '.island1', "BED3", 0)
            if len(bed_vals_1[chrom]) > 0:
                islandlist = bed_vals_1[chrom]
                union_islands_to_file(islandlist, f)
            f.close()

    SeparateByChrom.combineAllGraphFiles(chroms, '.output', opt.outfile)

    SeparateByChrom.cleanup(chroms, '.output')
    SeparateByChrom.cleanup(chroms, '.island1')
def main(argv):
    parser = OptionParser()
    parser.add_option(
        "-a",
        "--AluElementsFile",
        action="store",
        type="string",
        dest="Alus",
        help="input Alu annotation file for non-strand specific analysis",
        metavar="<file>")
    parser.add_option(
        "-u",
        "--entrez_genes_file",
        action="store",
        type="string",
        dest="entrez_collection",
        metavar="<file>",
        help=
        "file with curated known genes clustered by entrez ID in pickle format"
    )
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="outfile",
                      metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s",
                      "--species",
                      action="store",
                      type="string",
                      dest="species",
                      help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    # entrez_collection is a dic (keyed by entrez_id) of lists of EntrezGene object
    annotation = open(opt.entrez_collection, 'rb')
    temp = pickle.load(annotation)
    my_entrez_genes = Entrez.KnownEntrezGenes(chroms, temp)
    annotation.close()

    #test entrez, checks out
    #id = my_entrez_genes.entrez_genes.keys()[0]
    #print id
    #for i in my_entrez_genes.entrez_genes[id].transcripts:
    #print i.getAll()

    lib_name = (opt.Alus).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"
    if Utility_extended.fileExists(opt.Alus):
        if Utility_extended.chrom_files_exist(chroms, extension) != 1:
            # Separate by chrom and sort by start
            print chroms, extension, " files do not exist, separate by chroms. "
            SeparateByChrom.separateByChrom(chroms, opt.Alus, extension)
    else:
        print opt.Alus, " is not found"
        sys.exit(1)

    Alus_in_shared_intron = {}
    Alus_in_shared_exon = {}
    Alus_in_merged_transcript = {}

    for chrom in chroms:
        (shared_intron_Alus, shared_exon_Alus,
         merged_transcript_Alus) = assign_AluElements_to_intronexons_by_chrom(
             my_entrez_genes, chrom + extension, chrom)
        if chrom == chroms[0]:
            myid = shared_intron_Alus.keys()[0]
            test(my_entrez_genes, shared_intron_Alus, myid)
        Alus_in_shared_intron.update(shared_intron_Alus)
        Alus_in_shared_exon.update(shared_exon_Alus)
        Alus_in_merged_transcript.update(merged_transcript_Alus)

    #{entrezID:[(region=(start, end), Alu_count)]}
    Alus_in_shared_intron_dist = {}
    for myid in Alus_in_shared_intron.keys():
        shared_intronic_regions_on_this_gene = Alus_in_shared_intron[myid]
        Alus_on_shared_intronic_regions_on_this_gene = []
        for region in shared_intronic_regions_on_this_gene:
            region_coord, Alu_positions = region
            number_of_Alus = len(Alu_positions)
            Alus_on_shared_intronic_regions_on_this_gene.append(
                (region_coord, number_of_Alus))
        Alus_in_shared_intron_dist[
            myid] = Alus_on_shared_intronic_regions_on_this_gene
    outname = opt.outfile + "_Alu_distribution_in_shared_intron.pkl"
    output = open(outname, 'wb')
    pickle.dump(Alus_in_shared_intron_dist, output)
    print "The number of genes output to %s is %d " % (
        outname, len(Alus_in_shared_intron.keys()))
    output.close()

    #total_intronic_regions = 0
    #for myid in Alus_in_shared_intron.keys():
    #	total_intronic_regions += len(Alus_in_shared_intron[myid])
    #print "There are %d genes with %d shared intronic regions " % (len(Alus_in_shared_intron.keys()),  total_intronic_regions)

    #{entrezID:[(region, Alu_positions)]}
    outname = opt.outfile + "_Alus_in_shared_intron.pkl"
    output = open(outname, 'wb')
    pickle.dump(Alus_in_shared_intron, output)
    print "The number of genes output to %s is %d " % (
        outname, len(Alus_in_shared_intron.keys()))
    output.close()

    #{entrezID:[(region, Alu_positions)]}
    outname = opt.outfile + "_Alus_in_shared_exon.pkl"
    output = open(outname, 'wb')
    pickle.dump(Alus_in_shared_exon, output)
    print "The number of genes output to %s is %d " % (
        outname, len(Alus_in_shared_exon.keys()))
    output.close()

    #Though in this case the structure can be simpler: {entrezID:(region, Alu_count)}, it is better to make the interface uniform.{entrezID:[(region, Alu_count)]}
    Alus_in_merged_transcript_dist = {}
    for myid in Alus_in_merged_transcript.keys():
        assert len(Alus_in_merged_transcript[myid]) == 1
        region = (Alus_in_merged_transcript[myid])[0]
        region_coord, Alu_positions = region
        number_of_Alus = len(Alu_positions)
        Alus_in_merged_transcript_dist[myid] = [(region_coord, number_of_Alus)]
    outname = opt.outfile + "_Alu_distribution_in_merged_transcript.pkl"
    output = open(outname, 'wb')
    pickle.dump(Alus_in_merged_transcript_dist, output)
    print "The number of genes output to %s is %d " % (
        outname, len(Alus_in_merged_transcript.keys()))
    output.close()

    #{entrezID:[(region, Alu_positions)]}
    outname = opt.outfile + "_Alus_in_merged_transcript.pkl"
    output = open(outname, 'wb')
    pickle.dump(Alus_in_merged_transcript, output)
    print "The number of genes output to %s is %d " % (
        outname, len(Alus_in_merged_transcript.keys()))
    output.close()

    print "it took", time.time() - startTime, "seconds."
# Example #42
# 0
def AssignPeaksToEntrez3UTRs(entrez_genes, peakfile, chroms, chrom_lengths, peak_threshold, downstream_extension):
	"""
	Returns {entrez_id:(gene, ThreeUTR_length, peaks_on_3UTR)}
	gene:gene = entrez_genes_by_chrom.entrez_genes[entrez_id] 
	ThreeUTR_length: longest 3UTR length; length includes the downstream extension
	peaks_on_3UTR:[(location, read_count)]
	"""
	
	peaks_on_entrez_3UTRs = {} #store the peaks for each 3UTR of the entrez cluster. {Entrez_ID: (gene, ThreeUTR_length, peaks_on_3UTR)}
	
	if Utility_extended.fileExists(peakfile):
		# Read the peaks, which is assumed to have the pseudo ucsc format
		island_libName1 = (peakfile).split('/')[-1]
		island_suffix1 = island_libName1.split('.')[-1] 
		island_libName1 = island_libName1.split('.')[0]
		island_extension1 = "-" + island_libName1 + '.' + island_suffix1 + "1"
		SeparateByChrom.separateByChrom(chroms, peakfile, island_extension1)
	else:
		print peakfile, " is not found";
		sys.exit(1)
	
	for chrom in chroms: 
		if chrom in entrez_genes.chroms:
			entrez_genes_by_chrom =  Entrez.KnownEntrezGenes([chrom], entrez_genes.subset_by_chrom(chrom))
			this_chrom_length = chrom_lengths[chrom]
			
			# Load in the PA peak information 
			if Utility_extended.fileExists(chrom + island_extension1):
				inf = open(chrom + island_extension1, 'r')
				# Read in the peaks and separate the forward strand peaks and the reverse strand peaks
				five_peaks = [] # peaks on forward strand, element (location, read_count)
				three_peaks = [] # peaks on reverse strand, element (location, read_count)
				for line in inf:
					line = line.strip();
					sline = line.split();
					strand = sline[2]
					if plus.match(strand):
						if float(sline[10]) >= peak_threshold:
							five_peaks.append ((int(sline[3]), float(sline[10])))
					elif minus.match(strand):
						if float(sline[10]) >= peak_threshold:
							three_peaks.append ((int(sline[4]), float(sline[10])))
				five_peaks = sorted(five_peaks, key = itemgetter(0)) #sort according to location
				five_peaks_location = [item[0] for item in five_peaks]
				three_peaks = sorted(three_peaks, key = itemgetter(0))
				three_peaks_location = [item[0] for item in three_peaks]
				inf.close()
				
				for entrez_id in entrez_genes_by_chrom.entrez_ids:
					gene = entrez_genes_by_chrom.entrez_genes[entrez_id] # an EntrezGene class object
					
					# For the set of transcripts, use the longest 3UTR at the designated representative 3UTR
					transcript_with_longest_3UTR = gene.identify_transcript_with_longest_3UTR() # a UCSC class object
					
					if plus.match(transcript_with_longest_3UTR.strand):
						start = transcript_with_longest_3UTR.cdsEnd
						end = min(transcript_with_longest_3UTR.txEnd + downstream_extension, this_chrom_length)
						start_ind = bisect.bisect_left(five_peaks_location, start);
						end_ind = bisect.bisect_right(five_peaks_location, end);
						peaks_on_3UTR = five_peaks[start_ind: end_ind] #[(mode_location, readcount)]
					if minus.match(transcript_with_longest_3UTR.strand):
						start = max(transcript_with_longest_3UTR.txStart - downstream_extension, 0)
						end = transcript_with_longest_3UTR.cdsStart
						start_ind = bisect.bisect_left(three_peaks_location, start);
						end_ind = bisect.bisect_right(three_peaks_location, end);
						peaks_on_3UTR = three_peaks[start_ind: end_ind]
					ThreeUTR_length = end - start + 1 #length includes the downstream extension
					peaks_on_entrez_3UTRs[entrez_id] = (gene, ThreeUTR_length, peaks_on_3UTR)
				
	SeparateByChrom.cleanup(chroms, island_extension1)
	return peaks_on_entrez_3UTRs
# Example #43
# 0
def get_read_count_on_exons(genefile, bedfile, species, output):

    gene_coords = UCSC.KnownGenes(genefile)
    bed_vals = my_BED.Starts(species, bedfile)
    #print bed_vals.keys() --- only those chroms in species hg18 stored in my_BED.py
    num_tags = bed_vals.getNumVals()
    print num_tags
    if Utility.fileExists(bedfile):
        SeparateByChrom.separateByChrom(bed_vals.keys(), bedfile, '.bed1')
    else:
        print bedfile, " not found"
        sys.exit(1)

    outFile = open(output, 'w')
    tag_starts = []
    exon_sums = 0
    exon_sizes = 0
    exon_density = 0
    RPKM = 0
    for chrom in gene_coords.keys():
        if chrom in bed_vals.keys():
            #print chrom
            bed_vals = my_BED.Starts(species, chrom + '.bed1')
            tag_starts = bed_vals[chrom]
            #print len(tag_starts)
            tag_starts.sort()
            for g in gene_coords[chrom]:
                if len(tag_starts) > 0:
                    #print 'tag_start_length='+str(len(tag_starts))
                    #print 'exonCount='+ str(g.exonCount)
                    exon_Starts = g.exonStarts.split(',')
                    exon_Ends = g.exonEnds.split(',')
                    assert len(exon_Starts) == len(exon_Ends)
                    if g.exonCount > 0:
                        exon_sums = 0
                        exon_sizes = 0
                        if plus.match(g.strand):
                            for i in range(0, int(g.exonCount)):
                                exon_sums += countTagsInWindow(
                                    int(exon_Starts[i]), int(exon_Ends[i]),
                                    tag_starts)
                                exon_sizes += abs(
                                    int(exon_Ends[i]) - int(exon_Starts[i]))
        ## exon_density is per mappedreads(million) * exonlength(kb), edgeR will divide this by mappedreads
                            exon_density = (float(exon_sums) /
                                            float(exon_sizes))
                            RPKM = (exon_density /
                                    float(num_tags)) * 1000 * 1000000
#print exon_density
                        elif minus.match(g.strand):
                            for i in range(0, int(g.exonCount)):
                                exon_sums += countTagsInWindow(
                                    int(exon_Starts[-2 - i]),
                                    int(exon_Ends[-2 - i]), tag_starts)
                                exon_sizes += abs(
                                    int(exon_Ends[-2 - i]) -
                                    int(exon_Starts[-2 - i]))
                            exon_density = (float(exon_sums) /
                                            float(exon_sizes))
                            RPKM = (exon_density /
                                    float(num_tags)) * 1000 * 1000000

#print exon_density
                        print g.name, exon_sums, exon_sizes, exon_density, RPKM
                else:
                    print g.name, exon_sums, exon_sizes, exon_density, RPKM
                outline = str(g.name) + "\t" + str(RPKM) + "\n"
                #outline = str(g.name) + "\t" +  str(exon_sums) + "\t" + str(exon_sizes)+"\t" + str(exon_density)+"\t" + str(RPKM) + "\n"
                outFile.write(outline)
    outFile.close()

    SeparateByChrom.cleanup(bed_vals.keys(), '.bed1')

    return 0