예제 #1
0
# Filter for coverage, logging how many hits exist before and after cleaning.
sys.stderr.write("Before cleaning there are " + str(len(hits)) + " hits\n")
hits = clean_hits()
sys.stderr.write("After cleaning there are " + str(len(hits)) + " hits\n")

# Calculate the median and the mean across all hits for each position.
sys.stderr.write("calculating medians\n")
median, mean = [], []
for i in range(longest_sequence):
    # Collect the value that every hit holds at position i.
    data = [hits[s][i] for s in hits]
    median.append(roblib.median(data))
    mean.append(roblib.mean(data))
median, mean = np.array(median), np.array(mean)

# Optionally subtract the per-position median and/or mean from every hit.
# Medians are applied first; the means were computed above, before any
# subtraction took place.
for offsets, enabled in ((median, subtract_medians), (mean, subtract_means)):
    if enabled:
        for s in hits:
            hits[s] = np.array(hits[s]) - offsets

maxy = 0
for s in hits:
    m = max(hits[s])
                elif int(headers[i]) <= 63:
                    cols['control'].append(i)
                else:
                    sys.stderr.write("Don't understand header {} at column {}\n".format(headers[i], i))
        else:
            data[p[0]] = map(int, p[1:])

# Process the contigs in sorted order. sorted() is used instead of calling
# .sort() on data.keys() because on Python 3 keys() returns a view object that
# has no .sort() method; sorted() yields the same list on Python 2 as well.
allcontigs = sorted(data)

# Calculate the mean and (doubled) stdev for each group and each contig.
means = {}
std = {}

for contig in allcontigs:
    means[contig] = {}
    std[contig] = {}
    for sample in cols:
        # NOTE(review): data[contig] must be an indexable sequence here; a
        # bare map() object (Python 3) would not be - confirm where rows are
        # stored.
        testdata = [data[contig][i] for i in cols[sample]]
        means[contig][sample] = roblib.mean(testdata)
        # std holds twice the standard deviation of the group.
        std[contig][sample] = 2 * roblib.stdev(testdata)

    # Report the contig only when plasma, buffy and csf are each clearly above
    # the control group: (mean - 2 * stdev) of the group must exceed
    # (mean + 2 * stdev) of the control.
    control_upper = means[contig]['control'] + std[contig]['control']
    if all(means[contig][group] - std[contig][group] > control_upper
           for group in ('plasma', 'buffy', 'csf')):
        print("\t".join(map(str, ["ALL", contig, means[contig]['plasma'], means[contig]['buffy'], means[contig]['csf'],
                                                      means[contig]['control']])))


예제 #3
0
        points = map(float, p[hcols:])
        nz = filter(notzero, points)
        psum = sum(points)
        total += psum
        data[p[0]] = [len(nz), psum]


# Now calculate the mean and stdev based on the beta distribution.
# betad caches one (mean, stdev) pair per distinct value of data[p][0]; each
# pair is estimated from 100000 random beta samples.
xvalues = set()
betad = {}
for p in data:
    nz_count = data[p][0]
    if nz_count in betad:
        continue
    # NOTE(review): the alpha/beta written to stderr are derived from
    # data[p][0] and ncols, whereas the samples drawn below are parameterised
    # by data[p][1] and total - confirm which parameterisation is intended.
    sys.stderr.write("alpha:" + str(nz_count) + " beta: " + str((ncols - nz_count) + 1) + "\n")
    row_sum = data[p][1]
    samples = np.random.beta(row_sum + 1, total - row_sum, 100000)
    betad[nz_count] = (roblib.mean(samples), roblib.stdev(samples))



# Emit one tab-separated line per entry: name, data[p][0] / ncols and
# data[p][1] / total. The cached beta mean and stdev are appended only the
# first time a given first-ratio value is seen.
seen = set()
for p in data:
    x = str(1.0 * data[p][0] / ncols)
    fields = [p, x, str(1.0 * data[p][1] / total)]
    if x not in seen:
        fields.append(str(betad[data[p][0]][0]))
        fields.append(str(betad[data[p][0]][1]))
    seen.add(x)
    sys.stdout.write("\t".join(fields) + "\n")



예제 #4
0

# Filter for coverage, logging how many hits exist before and after cleaning.
sys.stderr.write("Before cleaning there are " + str(len(hits)) + " hits\n")
hits = clean_hits()
sys.stderr.write("After cleaning there are " + str(len(hits)) + " hits\n")

# Calculate the median and the mean across all hits for each position.
sys.stderr.write("calculating medians\n")
median, mean = [], []
for i in range(longest_sequence):
    # Collect the value that every hit holds at position i.
    data = [hits[s][i] for s in hits]
    median.append(roblib.median(data))
    mean.append(roblib.mean(data))
median, mean = np.array(median), np.array(mean)

# Optionally subtract the per-position median and/or mean from every hit.
# Medians are applied first; the means were computed above, before any
# subtraction took place.
for offsets, enabled in ((median, subtract_medians), (mean, subtract_means)):
    if enabled:
        for s in hits:
            hits[s] = np.array(hits[s]) - offsets

maxy = 0
for s in hits:
    m = max(hits[s])
예제 #5
0
                    sys.stderr.write(
                        "Don't understand header {} at column {}\n".format(
                            headers[i], i))
        else:
            data[p[0]] = map(int, p[1:])

# Process the contigs in sorted order. sorted() is used instead of calling
# .sort() on data.keys() because on Python 3 keys() returns a view object that
# has no .sort() method; sorted() yields the same list on Python 2 as well.
allcontigs = sorted(data)

# Calculate the mean and (doubled) stdev for each group and each contig.
means = {}
std = {}

for contig in allcontigs:
    means[contig] = {}
    std[contig] = {}
    for sample in cols:
        # NOTE(review): data[contig] must be an indexable sequence here; a
        # bare map() object (Python 3) would not be - confirm where rows are
        # stored.
        testdata = [data[contig][i] for i in cols[sample]]
        means[contig][sample] = roblib.mean(testdata)
        # std holds twice the standard deviation of the group.
        std[contig][sample] = 2 * roblib.stdev(testdata)

    # Report the contig only when plasma, buffy and csf are each clearly above
    # the control group: (mean - 2 * stdev) of the group must exceed
    # (mean + 2 * stdev) of the control.
    control_upper = means[contig]['control'] + std[contig]['control']
    if all(means[contig][group] - std[contig][group] > control_upper
           for group in ('plasma', 'buffy', 'csf')):
        print("\t".join(
            map(str, [
                "ALL", contig, means[contig]['plasma'], means[contig]['buffy'],
                means[contig]['csf'], means[contig]['control']
            ])))
예제 #6
0
        except:
            sys.stderr.write("Can't add to position {}\n".format(
                pu.reference_pos))

    # here we trim the coverage array to make the math easier!
    start = 1
    end = args.l + 1
    if args.e:
        end = args.e + 1
        coverage = coverage[0:end]
    if args.s:
        start = args.s
        coverage = coverage[start:]

    # calculate the average over coverage
    av = mean(coverage)
    st = stdev(coverage)

    k = 0
    for i, j in enumerate(coverage):
        k += (j - av)**4

    k = k / (len(coverage) * (st**4))

    k -= 3

    if args.r:
        if args.c:
            print("Filename\tReference\tAverage\tStDev\tKurtosis")
        print(f"{args.f}\t{args.r}\t{av}\t{st}\t{k}")
    else:
예제 #7
0
        p = l.strip().split("\t")
        if len(p) > ncols:
            ncols = len(p)

        points = map(float, p[hcols:])
        nz = filter(notzero, points)
        psum = sum(points)
        total += psum
        data[p[0]] = [len(nz), psum]

# Now calculate the mean and stdev based on the beta distribution.
# betad caches one (mean, stdev) pair per distinct value of data[p][0]; each
# pair is estimated from 100000 random beta samples.
xvalues = set()
betad = {}
for p in data:
    nz_count = data[p][0]
    if nz_count in betad:
        continue
    # NOTE(review): the alpha/beta written to stderr are derived from
    # data[p][0] and ncols, whereas the samples drawn below are parameterised
    # by data[p][1] and total - confirm which parameterisation is intended.
    sys.stderr.write("alpha:" + str(nz_count) + " beta: " +
                     str((ncols - nz_count) + 1) + "\n")
    row_sum = data[p][1]
    samples = np.random.beta(row_sum + 1, total - row_sum, 100000)
    betad[nz_count] = (roblib.mean(samples), roblib.stdev(samples))

# Emit one tab-separated line per entry: name, data[p][0] / ncols and
# data[p][1] / total. The cached beta mean and stdev are appended only the
# first time a given first-ratio value is seen.
seen = set()
for p in data:
    x = str(1.0 * data[p][0] / ncols)
    fields = [p, x, str(1.0 * data[p][1] / total)]
    if x not in seen:
        fields.append(str(betad[data[p][0]][0]))
        fields.append(str(betad[data[p][0]][1]))
    seen.add(x)
    sys.stdout.write("\t".join(fields) + "\n")