def getCoverageAtDistance():
    """Group coverage values by their offset from each verified gene's start.

    Reads at most NUM_GENES annotation rows from the verified GFF file. For
    every row on the "+" or "-" strand it asks
    geneTools.getIntervalCoverage for the values between ``start - OFFSET``
    and ``start + OFFSET`` from the reads file of that strand, then files
    value number ``k`` of each result under distance ``k - OFFSET``.

    As a side effect, progress is printed and a histogram of the values at
    distance +15 is displayed with matplotlib.

    Returns:
        dict mapping every integer distance in -OFFSET..OFFSET to the list
        of coverage values collected at that distance.
    """
    VERIFIED_FILE_NAME = "/storage/james/data/ecoli/verified.gff"
    POSITIVE_FILE_NAME = "/storage/james/data/ecoli/reads/2015study/positive.ribo"
    NEGATIVE_FILE_NAME = POSITIVE_FILE_NAME.replace("positive", "negative")

    OFFSET = 50
    NUM_GENES = 769
    HIST_DISTANCE = 15

    reads_by_strand = {"+": POSITIVE_FILE_NAME, "-": NEGATIVE_FILE_NAME}

    # (start, stop) -> coverage values returned for the window around start
    verified_coverage = {}

    with open(VERIFIED_FILE_NAME, 'r') as f:
        genes_seen = 0
        for line in f:
            # ">=" so that exactly NUM_GENES rows are processed, not one more
            if genes_seen >= NUM_GENES:
                break
            # blank lines and "#" header/comment lines carry no coordinates
            if not line.strip() or line.startswith('#'):
                continue
            vals = line.split('\t')
            start, stop, strand = int(vals[3]), int(vals[4]), vals[6]
            # single-argument print(...) prints the same on Python 2 and 3
            if genes_seen % 10 == 0:
                print(str(genes_seen) + " genes processed so far")
                print("processing gene: (" + str(start) + ", " + str(stop) + ")")
            # NOTE(review): "-" strand rows are also windowed around the
            # start column (vals[3]); confirm that is the intended anchor.
            reads_file = reads_by_strand.get(strand)
            if reads_file is not None:
                verified_coverage[(start, stop)] = geneTools.getIntervalCoverage(
                    reads_file, start - OFFSET, start + OFFSET)
            genes_seen += 1

    # The annotation file is closed by now; only in-memory data is used below.
    # Key: distance from the start coordinate. Value: every coverage value
    # seen at that distance, ready to be turned into a histogram.
    d_dict = {distance: [] for distance in range(-OFFSET, OFFSET + 1)}
    for coverage_vals in verified_coverage.values():
        print("coverage vals length: " + str(len(coverage_vals)))
        for index, value in enumerate(coverage_vals):
            d_dict[index - OFFSET].append(value)

    plt.figure()
    plt.hist(d_dict[HIST_DISTANCE], bins=100)
    plt.show()

    return d_dict
def getCoverageAtDistance():
    """Group coverage values by their offset from each verified gene's start.

    Reads at most NUM_GENES annotation rows from the verified GFF file. For
    every row on the "+" or "-" strand it asks
    geneTools.getIntervalCoverage for the values between ``start - OFFSET``
    and ``start + OFFSET`` from the reads file of that strand, then files
    value number ``k`` of each result under distance ``k - OFFSET``.

    As a side effect, progress is printed and a histogram of the values at
    distance +15 is displayed with matplotlib.

    Returns:
        dict mapping every integer distance in -OFFSET..OFFSET to the list
        of coverage values collected at that distance.
    """
    VERIFIED_FILE_NAME = "/storage/james/data/ecoli/verified.gff"
    POSITIVE_FILE_NAME = "/storage/james/data/ecoli/reads/2015study/positive.ribo"
    NEGATIVE_FILE_NAME = POSITIVE_FILE_NAME.replace("positive", "negative")

    OFFSET = 50
    NUM_GENES = 769
    HIST_DISTANCE = 15

    reads_by_strand = {"+": POSITIVE_FILE_NAME, "-": NEGATIVE_FILE_NAME}

    # (start, stop) -> coverage values returned for the window around start
    verified_coverage = {}

    with open(VERIFIED_FILE_NAME, 'r') as f:
        genes_seen = 0
        for line in f:
            # ">=" so that exactly NUM_GENES rows are processed, not one more
            if genes_seen >= NUM_GENES:
                break
            # blank lines and "#" header/comment lines carry no coordinates
            if not line.strip() or line.startswith('#'):
                continue
            vals = line.split('\t')
            start, stop, strand = int(vals[3]), int(vals[4]), vals[6]
            # single-argument print(...) prints the same on Python 2 and 3
            if genes_seen % 10 == 0:
                print(str(genes_seen) + " genes processed so far")
                print("processing gene: (" + str(start) + ", " + str(stop) + ")")
            # NOTE(review): "-" strand rows are also windowed around the
            # start column (vals[3]); confirm that is the intended anchor.
            reads_file = reads_by_strand.get(strand)
            if reads_file is not None:
                verified_coverage[(start, stop)] = geneTools.getIntervalCoverage(
                    reads_file, start - OFFSET, start + OFFSET)
            genes_seen += 1

    # The annotation file is closed by now; only in-memory data is used below.
    # Key: distance from the start coordinate. Value: every coverage value
    # seen at that distance, ready to be turned into a histogram.
    d_dict = {distance: [] for distance in range(-OFFSET, OFFSET + 1)}
    for coverage_vals in verified_coverage.values():
        print("coverage vals length: " + str(len(coverage_vals)))
        for index, value in enumerate(coverage_vals):
            d_dict[index - OFFSET].append(value)

    plt.figure()
    plt.hist(d_dict[HIST_DISTANCE], bins=100)
    plt.show()

    return d_dict
# Example no. 3
# 0
import argparse
from geneTools import getIntervalCoverage
import matplotlib.pyplot as plt

# Command-line script: plot the coverage values that getIntervalCoverage
# returns for COVERAGE_FILE over the genome positions START..END.
parser = argparse.ArgumentParser(description="foo")
parser.add_argument(
    "COVERAGE_FILE",
    help="link to fully specified coverage file you want to use")
# type=int makes argparse reject non-numeric positions with a usage message
# instead of letting a bare ValueError traceback escape later on.
parser.add_argument("START", type=int, help="start position in the genome")
parser.add_argument("END", type=int, help="stop position in the genome")

args = vars(parser.parse_args())
start = args['START']
end = args['END']

vals = getIntervalCoverage(args['COVERAGE_FILE'], start, end)

plt.figure()
x = range(start, end)

# single-argument print(...) prints the same on Python 2 and 3
print("x length: " + str(len(x)) + " vals length: " + str(len(vals)))

# Pad with zeros when fewer values than positions came back, so that x and
# vals have the same length for plotting.
missing = len(x) - len(vals)
if missing > 0:
    vals.extend([0] * missing)

plt.plot(x, vals, 'ro')
# blue star marks the requested start position on the baseline
plt.plot(start, 0, 'b*')
plt.xlabel("genome position")
plt.ylabel("coverage score")
plt.title("Coverage over the range specified from the given file")
plt.show()