/
perm_analysis.py
251 lines (203 loc) · 9.4 KB
/
perm_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 24 00:50:44 2013
@author: Talmo
@editor: David
This script does a permutation analysis on the scores of the LexA binding motif
in the MetaHit database in order to support the findings in Cornish et al.
(2013).
Dependencies:
- NumPy (http://www.numpy.org/)
- matplotlib (http://matplotlib.org/)
- gpu_pssm (https://github.com/ErillLab/gpu_pssm)
- MetaHit database (Qin et al., 2010; see comments below)
"""
import sys
import os.path
import glob
import numpy as np
import matplotlib.pyplot as plt
import time
import metagenomics as mg
# Import gpu_pssm from the parent directory
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
from gpu_pssm import gpu_pssm
def _plot_score_curves(bin_edges, patient_scores, cumulative, title, ylabel):
    """Draw the original and permuted score histograms on a new figure.

    Row 0 of patient_scores is drawn as the "Original" curve and every later
    row as a "Permutation" curve, all against bin_edges[1:]. When cumulative
    is true each row is passed through np.cumsum before plotting.
    """
    plt.figure()
    for perm_counts in patient_scores[1:]:
        curve = np.cumsum(perm_counts) if cumulative else perm_counts
        plt.plot(bin_edges[1:], curve, "D-r", alpha=0.5, label="Permutation")
    original_counts = patient_scores[0]
    curve = np.cumsum(original_counts) if cumulative else original_counts
    plt.plot(bin_edges[1:], curve, "D-b", lw=3, label="Original")
    plt.xlabel("Site score (bits)")
    plt.ylabel(ylabel)
    plt.title(title)
    # Every permutation curve carries the same label, so keep only the last
    # two legend entries: one permutation curve and the original.
    handles, labels = plt.gca().get_legend_handles_labels()
    plt.legend(handles[-2:], labels[-2:], loc="best")
    plt.grid()


def main():
    """Run the permutation analysis over the MetaHit patient files.

    Builds a PSSM from the configured binding-site collection, generates
    `permutations` permuted PSSMs with mg.permute_pssm, scores every patient
    file with the original and with each permuted PSSM, and accumulates the
    binned score counts across all files. It then plots the cumulative and
    per-bin counts, prints and plots, per score bin, the percentage
    real / (real + permuted), and prints timing and size totals.

    Raises:
        NotImplementedError: if equiprobable_nuc_freqs is set to False.
        IOError: if no input files are found.
    """
    ##### Parameters #####

    # Scan the pruned promoter-region files instead of the full patient files.
    promoter = True

    # Path to the MetaHit database.
    # The patient files can be downloaded from:
    #   http://www.bork.embl.de/~arumugam/Qin_et_al_2010/
    # Make sure these are extracted from their packages! (i.e.: "gunzip *.gz")
    # Note: After extraction, the 84 patient files occupy a total of 6.56 GB
    # on disk!
    # The original paper can be found at:
    #   http://www.nature.com/nature/journal/v464/n7285/full/nature08821.html
    metahit_path = "./MetaHit/Pruned" if promoter else "./MetaHit/Data"

    # The collection of binding sites to generate the PSSM from.
    # Alternatives used previously:
    #   "./LexA.seq.fa" (115 experimentally-determined binding sites reported
    #   in literature; see Table S1 in Cornish et al. (2013))
    #   "./LexA_Gamma_collection.fas"
    binding_sites_path = "./LexA_Grampos_collection.fas"

    # Number of permuted PSSMs to generate; each one is scored against every
    # patient file.
    permutations = 50

    # Scores below this number of bits will not be reported.
    # Lower values will give more (false-positive) results and also slow down
    # the execution of the program since more memory needs to be allocated to
    # store the score values.
    score_threshold = -50.0

    ### Parameters below this line should *probably* not be changed. ###

    # The background frequency of the bases. An equiprobable frequency
    # distribution assumes that each base has an equal probability of
    # occurring, that is: P(A) = P(C) = P(G) = P(T) = 0.25 => GC-content = 0.5
    # For better comparison across patients, this should be set to True.
    # Setting it to False is not supported (see the check further down).
    equiprobable_nuc_freqs = True

    # The ranges that the scores should be binned into.
    # Since no scores will be saved below the score_threshold, it serves as a
    # lower bound to the bins. A good upper bound is around ~30 bits since the
    # maximum theoretical score a sequence can have from a PSSM is 32.
    # Under equiprobable frequencies, the LexA consensus sequence has a score
    # of ~24 bits, so no sequence should score higher than that.
    # Materialised as a list so that slicing behaves the same on Python 2 and 3.
    bins = list(range(int(score_threshold), 32, 1))

    # If you'd like to compare the output of this program by scoring the
    # E. coli genome, set this to True.
    # Make sure NC_000913.fna is in the parent directory.
    # The genome sequence can be downloaded from:
    #   ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid57779/
    # Use this for comparison/debugging.
    score_ecoli_instead = False

    # Running totals of sites, bases and scaffolds scanned.
    total_num_sites = 0
    total_size = 0
    total_scaffold = 0

    ##########

    # Find patient files on disk (sorted so the log order is reproducible).
    if promoter:
        metahit_db = sorted(glob.glob(metahit_path + "/Pruned_MH[0-9]*.seq.fa"))
    else:
        metahit_db = sorted(glob.glob(metahit_path + "/MH[0-9]*.seq.fa"))
    if score_ecoli_instead:
        metahit_db = ["../NC_000913.fna"]  # E. coli genome (for debugging)
    # For debugging, truncate to a single file, e.g.:
    #   metahit_db = [metahit_db[0]]

    # Without input the analysis would only produce NaN confidences.
    if not metahit_db:
        raise IOError("No patient files found in %s" % metahit_path)

    # Gather total time.
    start = time.time()

    # Assume equiprobable mononucleotide frequencies.
    mg_frequencies = [0.25] * 4
    if not equiprobable_nuc_freqs:
        # The PSSM is built once, before any patient file has been loaded, so
        # there is no metagenome to derive a background frequency from here.
        raise NotImplementedError(
            "Background frequencies computed from the metagenome are not "
            "supported; set equiprobable_nuc_freqs = True."
        )

    # Calculate the original PSSM from binding sites.
    original_pssm = gpu_pssm.create_pssm(binding_sites_path, genome_frequencies=mg_frequencies)

    # Print the unpermuted PSSM.
    print("Unpermuted PSSM:")
    mg.print_pssm(original_pssm)

    # Generate every permuted PSSM up front so that all patient files are
    # scored against the same set of permutations.
    permute_pssm = np.empty(shape=(permutations, len(original_pssm)), dtype=object)
    for permutation in range(permutations):
        permute_pssm[permutation] = mg.permute_pssm(original_pssm)

    # Row 0 accumulates the original PSSM's histogram; row i + 1 accumulates
    # the histogram of permutation i.
    patient_scores = np.zeros(shape=(permutations + 1, len(bins) - 1))

    # Cycle through every patient file.
    for patient_file in metahit_db:
        # Status update.
        print("File:  %s" % (patient_file,))

        # Load the sequence into memory.
        metagenome, scaffolds = mg.load_scaffolds(patient_file)
        total_size += metagenome.size
        total_scaffold += scaffolds.size
        print("Genome size: %s | Scaffolds: %s" % (metagenome.size, scaffolds.size))

        # Score the metagenome using the original PSSM.
        print("Scoring without permuting...")
        original_scores, partial_num_sites = score_patient(
            metagenome, scaffolds, original_pssm, score_threshold, bins)
        patient_scores[0] += original_scores
        # Only the unpermuted run contributes to the site count.
        total_num_sites += partial_num_sites

        # Re-score with each permuted PSSM.
        for permutation in range(permutations):
            print("Permutation %d/%d..." % (permutation + 1, permutations))
            perm_scores, __ = score_patient(
                metagenome, scaffolds, permute_pssm[permutation], score_threshold, bins)
            patient_scores[permutation + 1] += perm_scores

    # Plot results.
    _plot_score_curves(bins, patient_scores, True,
                       "Cumulative Density Function", "# of Sites Found")
    _plot_score_curves(bins, patient_scores, False,
                       "Probability Density Function", "# of Sites Found ")

    # Calculate p-values.
    # Sum into a fresh array; accumulating into patient_scores[1] in place
    # would overwrite that permutation's own histogram.
    total_fake_patient_scores = patient_scores[1:].sum(axis=0)
    # Empty bins get a tiny pseudo-count so the denominator below is never 0.
    total_fake_patient_scores = np.where(
        total_fake_patient_scores > 0, total_fake_patient_scores, .000001)
    print("")
    real_prob = np.float64(patient_scores[0]) / total_num_sites
    fake_prob = total_fake_patient_scores / (total_num_sites * permutations)

    # Print out the p-values.
    print("Probability (True Matrix | Score):")
    matrix_prob = (real_prob / (fake_prob + real_prob)) * 100
    print("\n".join("%d:%.2f" % (s, p) for s, p in zip(bins, matrix_prob)))

    # Plot the probability values.
    plt.figure()
    plt.bar(bins[1:], matrix_prob)
    plt.xlabel("Site score (bits)")
    plt.ylabel("Confidence Level (%)")
    plt.grid()

    # Final diagnostics.
    end = time.time()
    print("Total time: %.2f seconds" % (end - start))
    print("Total Metagenome Size: %d bp" % (total_size))
    print("Total Scaffolds Scanned: %d" % (total_scaffold))

    # Display the figures; without this a plain script run exits before
    # anything is drawn.
    plt.show()
def score_patient(metagenome, scaffolds, pssm, score_threshold, bins):
    """Score one metagenome with a PSSM and bin the resulting scores.

    Args:
        metagenome: sequence handed unchanged to
            gpu_pssm.score_long_sequence.
        scaffolds: iterable of integer positions used as scaffold boundaries
            in the score array (presumably the offsets returned by
            mg.load_scaffolds -- confirm whether they are starts or ends).
        pssm: array-like with a .size attribute; pssm.size // 4 is taken as
            the motif width.
        score_threshold: scores below this value are dropped before binning.
        bins: bin edges passed to np.histogram.

    Returns:
        A tuple (score_hist, partial_num_sites): the per-bin counts of the
        scores >= score_threshold, and the number of positions whose score
        is still finite after cross-scaffold masking.
    """
    # Score the metagenome with the given PSSM.
    scores = gpu_pssm.score_long_sequence(metagenome, pssm, keep_strands=False)

    # Integer division keeps the width usable as a slice bound on Python 3.
    motif_width = pssm.size // 4

    # Invalidate cross-scaffold scores: the motif_width - 1 positions before
    # each boundary. The start is clamped at 0 so boundaries close to the
    # beginning are still masked and never produce a negative slice index.
    for pos in scaffolds:
        boundary = int(pos)
        first_invalid = max(boundary - motif_width + 1, 0)
        scores[first_invalid:boundary] = -np.inf

    partial_num_sites = int(np.count_nonzero(scores > -np.inf))

    print("Scores >= 20 bits: %d" % np.count_nonzero(scores >= 20))

    # Eliminate scores below threshold.
    scores = scores[scores >= score_threshold]

    # Bin the scores.
    score_hist, __ = np.histogram(scores, bins, density=False)
    return (score_hist, partial_num_sites)
# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()