def getCoverageAtDistance():
    """Group ribosome coverage around each verified gene by window offset.

    Reads at most NUM_GENES records from the verified-genes GFF file. For
    each record, the coverage values for the window
    [start - OFFSET, start + OFFSET] are fetched from the reads file that
    matches the record's strand, using geneTools.getIntervalCoverage. The
    values are then regrouped by their offset inside that window. A
    histogram of the values at offset +15 is displayed before returning.

    Returns:
        dict mapping every offset in -OFFSET..OFFSET to the list of
        coverage values collected at that offset (one value per gene that
        supplied one).
    """
    # first, get the ribosome occupancy data for our verified genes
    VERIFIED_FILE_NAME = "/storage/james/data/ecoli/verified.gff"
    POSITIVE_FILE_NAME = "/storage/james/data/ecoli/reads/2015study/positive.ribo"
    NEGATIVE_FILE_NAME = POSITIVE_FILE_NAME.replace("positive", "negative")
    OFFSET = 50
    NUM_GENES = 769
    PLOT_DISTANCE = 15  # offset whose distribution is shown as a histogram

    reads_file_by_strand = {"+": POSITIVE_FILE_NAME, "-": NEGATIVE_FILE_NAME}

    verified_coverage = {}
    with open(VERIFIED_FILE_NAME, 'r') as f:
        # parse coverage
        genes_seen = 0
        for line in f:
            # ">=" so that exactly NUM_GENES records are read, not one extra.
            if genes_seen >= NUM_GENES:
                break
            # GFF files may carry "#" directive/comment lines and blank
            # lines; they have no coordinate columns to parse.
            if not line.strip() or line.startswith('#'):
                continue
            vals = line.rstrip('\r\n').split('\t')
            start, stop, strand = int(vals[3]), int(vals[4]), vals[6]
            if genes_seen % 10 == 0:
                print(str(genes_seen) + " genes processed so far")
            print("processing gene: (" + str(start) + ", " + str(stop) + ")")
            # NOTE(review): the window is centred on the GFF start column for
            # both strands; for "-" records the start codon may sit at `stop`
            # instead -- confirm against the GFF convention in use.
            reads_file = reads_file_by_strand.get(strand)
            if reads_file is not None:
                verified_coverage[(start, stop)] = geneTools.getIntervalCoverage(
                    reads_file, start - OFFSET, start + OFFSET)
            # Records with any other strand value are counted but not stored.
            genes_seen += 1

    # dictionary: key is distance from the window centre, value is the list
    # of every y (coverage) value at that distance; each list can then be
    # turned into a histogram
    d_dict = dict((distance, []) for distance in range(-OFFSET, OFFSET + 1))
    for coverage_vals in verified_coverage.values():
        print("coverage vals length: " + str(len(coverage_vals)))
        # Assumes getIntervalCoverage returns at most 2 * OFFSET + 1 values
        # (TODO confirm); a longer list raises KeyError here.
        for index, value in enumerate(coverage_vals):
            d_dict[index - OFFSET].append(value)

    plt.figure()
    plt.hist(d_dict[PLOT_DISTANCE], bins=100)
    plt.show()
    return d_dict
def getCoverageAtDistance():
    """Group ribosome coverage around each verified gene by window offset.

    Reads at most NUM_GENES records from the verified-genes GFF file. For
    each record, the coverage values for the window
    [start - OFFSET, start + OFFSET] are fetched from the reads file that
    matches the record's strand, using geneTools.getIntervalCoverage. The
    values are then regrouped by their offset inside that window. A
    histogram of the values at offset +15 is displayed before returning.

    Returns:
        dict mapping every offset in -OFFSET..OFFSET to the list of
        coverage values collected at that offset (one value per gene that
        supplied one).
    """
    # first, get the ribosome occupancy data for our verified genes
    VERIFIED_FILE_NAME = "/storage/james/data/ecoli/verified.gff"
    POSITIVE_FILE_NAME = "/storage/james/data/ecoli/reads/2015study/positive.ribo"
    NEGATIVE_FILE_NAME = POSITIVE_FILE_NAME.replace("positive", "negative")
    OFFSET = 50
    NUM_GENES = 769
    PLOT_DISTANCE = 15  # offset whose distribution is shown as a histogram

    reads_file_by_strand = {"+": POSITIVE_FILE_NAME, "-": NEGATIVE_FILE_NAME}

    verified_coverage = {}
    with open(VERIFIED_FILE_NAME, 'r') as f:
        # parse coverage
        genes_seen = 0
        for line in f:
            # ">=" so that exactly NUM_GENES records are read, not one extra.
            if genes_seen >= NUM_GENES:
                break
            # GFF files may carry "#" directive/comment lines and blank
            # lines; they have no coordinate columns to parse.
            if not line.strip() or line.startswith('#'):
                continue
            vals = line.rstrip('\r\n').split('\t')
            start, stop, strand = int(vals[3]), int(vals[4]), vals[6]
            if genes_seen % 10 == 0:
                print(str(genes_seen) + " genes processed so far")
            print("processing gene: (" + str(start) + ", " + str(stop) + ")")
            # NOTE(review): the window is centred on the GFF start column for
            # both strands; for "-" records the start codon may sit at `stop`
            # instead -- confirm against the GFF convention in use.
            reads_file = reads_file_by_strand.get(strand)
            if reads_file is not None:
                verified_coverage[(start, stop)] = geneTools.getIntervalCoverage(
                    reads_file, start - OFFSET, start + OFFSET)
            # Records with any other strand value are counted but not stored.
            genes_seen += 1

    # dictionary: key is distance from the window centre, value is the list
    # of every y (coverage) value at that distance; each list can then be
    # turned into a histogram
    d_dict = dict((distance, []) for distance in range(-OFFSET, OFFSET + 1))
    for coverage_vals in verified_coverage.values():
        print("coverage vals length: " + str(len(coverage_vals)))
        # Assumes getIntervalCoverage returns at most 2 * OFFSET + 1 values
        # (TODO confirm); a longer list raises KeyError here.
        for index, value in enumerate(coverage_vals):
            d_dict[index - OFFSET].append(value)

    plt.figure()
    plt.hist(d_dict[PLOT_DISTANCE], bins=100)
    plt.show()
    return d_dict
import argparse

import matplotlib.pyplot as plt

from geneTools import getIntervalCoverage

# Command-line script: plot the coverage values that getIntervalCoverage
# returns for a coverage file over the positions START..END.
parser = argparse.ArgumentParser(description="foo")
parser.add_argument(
    "COVERAGE_FILE",
    help="link to fully specified coverage file you want to use")
# type=int makes argparse reject non-numeric positions with a usage error
# instead of letting a ValueError traceback escape later.
parser.add_argument("START", type=int, help="start position in the genome")
parser.add_argument("END", type=int, help="stop position in the genome")
args = vars(parser.parse_args())

start = args['START']
end = args['END']
vals = getIntervalCoverage(args['COVERAGE_FILE'], start, end)

plt.figure()
x = range(start, end)
print("x length: " + str(len(x)) + " vals length: " + str(len(vals)))
# Pad with zeros when fewer coverage values than positions came back, so
# that x and vals have the same length for plotting.
# NOTE(review): if vals can be longer than x (e.g. an inclusive END),
# plt.plot will raise on the length mismatch -- confirm what
# getIntervalCoverage returns.
if len(vals) < len(x):
    vals.extend([0] * (len(x) - len(vals)))
plt.plot(x, vals, 'ro')
# Mark the requested start position on the zero line.
plt.plot(start, 0, 'b*')
plt.xlabel("genome position")
plt.ylabel("coverage score")
plt.title("Coverage over the range specified from the given file")
plt.show()