/
run_all.py
200 lines (173 loc) · 10.6 KB
/
run_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import os
import argparse
from Bio import SeqIO
from Bio import Entrez
from Bio import SearchIO
from Bio.Blast import NCBIWWW
from Bio.Seq import Seq
# Working directory at the time the script starts. No code visible in this
# file reads this name; every step uses paths relative to the current directory.
path = os.getcwd()
def Setup(SRR):
    """Download one SRA run with wget and split it into paired FASTQ files.

    SRR is the run accession as a string; it is concatenated into both the
    download URL and the ``fastq-dump`` argument. Both tools are launched
    through ``os.system`` and their exit statuses are not inspected.
    """
    # Example of a URL built here:
    # https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-11/SRR5660030/SRR5660030.1
    run_file = SRR + '.1'
    base_url = 'https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-11/'
    download = 'wget ' + base_url + SRR + '/' + run_file + ' '
    # --split-files separates the downloaded run into its paired reads
    split_reads = 'fastq-dump -I --split-files ' + run_file
    os.system(download)
    os.system(split_reads)
def Transcriptome_index():
    """Fetch the HCMV reference (EF999921) and write its CDS transcriptome.

    Files produced in the current working directory:

    * ``EF999921.gb`` - GenBank record downloaded with curl.
    * ``EF999921.fasta`` - whole-genome FASTA fetched through Entrez.
    * ``CDS_EF999921.fasta`` - one FASTA entry per CDS feature of the GenBank
      record, named by the feature's ``protein_id`` qualifier.
    * ``MiniProject.log`` - one appended line reporting the number of CDS
      features.

    Raises:
        KeyError: if a CDS feature has no ``protein_id`` qualifier.
    """
    genbank_fetch = 'curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=EF999921&rettype=gb&retmode=txt">EF999921.gb'
    os.system(genbank_fetch)  # pull the genbank record for EF999921

    Entrez.email = "ecgeoffroy@gmail.com"
    # Obtain the FASTA record for the accession from NCBI's nucleotide database
    handle = Entrez.efetch(db="nucleotide", id=["EF999921"], rettype="fasta")
    try:
        records = list(SeqIO.parse(handle, "fasta"))
    finally:
        handle.close()
    # A single SeqIO.write is enough: the hand-written copy that used to
    # precede it lacked the '>' header and was overwritten by this call anyway.
    SeqIO.write(records[0], 'EF999921.fasta', 'fasta')

    cds_count = 0
    with open("CDS_EF999921.fasta", 'w') as outfile:
        for rec in SeqIO.parse("EF999921.gb", "genbank"):
            for feature in rec.features:
                if feature.type != "CDS":
                    continue
                # Qualifier values are lists; join them the way the old
                # str(list) + bracket/quote stripping rendered them.
                protein_id = ', '.join(feature.qualifiers['protein_id'])
                outfile.write('>' + protein_id + '\n' + str(feature.location.extract(rec).seq) + '\n')
                cds_count += 1

    with open('MiniProject.log', 'a') as output:
        # The message previously ended right after the number; name the unit.
        output.write('The HCMV genome (EF999921) has ' + str(cds_count) + ' CDS\n')
def run_kallisto(SRR):
    """Build the kallisto index and quantify one sample against it.

    SRR is the run accession string. Reads come from ``<SRR>.1_1.fastq`` and
    ``<SRR>.1_2.fastq``; quantification output goes to ``./<SRR>``. The index
    command is issued on every call, and both commands are prefixed with
    ``time``.
    """
    # Create the transcriptome index from the CDS FASTA
    os.system('time kallisto index -i HCMV_index.idx CDS_EF999921.fasta')
    # Quantify this sample (30 bootstraps, 4 threads)
    quant_template = 'time kallisto quant -i HCMV_index.idx -o ./{0} -b 30 -t 4 {0}.1_1.fastq {0}.1_2.fastq'
    os.system(quant_template.format(SRR))
def run_spades(SRR1, SRR2, SRR3, SRR4):
    """Assemble the four Bowtie2-filtered read sets with SPAdes.

    Each accession contributes one paired-end library read from
    ``EF999921_<SRR>.1.fastq`` / ``EF999921_<SRR>.2.fastq``. The exact command
    line is appended to ``MiniProject.log`` before it is run, and the assembly
    is written to ``Spades/``. ``--only-assembler`` is used to shorten the run.
    """
    library_args = []
    for lib_number, accession in enumerate((SRR1, SRR2, SRR3, SRR4), start=1):
        library_args.append(
            '--pe{0}-1 EF999921_{1}.1.fastq --pe{0}-2 EF999921_{1}.2.fastq'.format(lib_number, accession)
        )
    spades_command = (
        'spades -k 55,77,99,127 --only-assembler -t 2 '
        + ' '.join(library_args)
        + ' -o Spades/'
    )
    # Record the command first, then execute it
    with open('MiniProject.log', 'a') as log_file:
        log_file.write(spades_command + '\n')
    os.system(spades_command)
def bowtie_build(SRR):
    """Index the EF999921 genome with Bowtie2 and map one sample to it.

    SRR is the run accession string. The index named ``EF999921_1`` is built
    from ``./EF999921.fasta`` on every call. Mapping reads
    ``<SRR>.1_1.fastq`` / ``<SRR>.1_2.fastq`` and writes ``EF999921_<SRR>.sam``.
    """
    # Build the reference index from the whole-genome FASTA
    os.system('bowtie2-build ./EF999921.fasta EF999921_1')
    # Map the reads to that index, producing a SAM file; --al-conc is what
    # makes bowtie2 also emit FASTQ files of the aligned reads.
    mapping_template = (
        'bowtie2 --quiet --no-unal --al-conc EF999921_{0}.fastq -x EF999921_1 '
        '-1 {0}.1_1.fastq -2 {0}.1_2.fastq -S EF999921_{0}.sam'
    )
    os.system(mapping_template.format(SRR))
def _count_fastq_lines(filename):
    """Return the number of lines in *filename* without loading it into memory."""
    with open(filename) as handle:
        return sum(1 for _ in handle)


def Count_bowtie(SRR, number):
    """Log how many read pairs a sample had before and after Bowtie2 filtering.

    Args:
        SRR: run accession string used to build the FASTQ file names.
        number: 1-based position of the sample; 1-4 map to the donor/timepoint
            labels below, any other value yields an empty label.

    The filtered reads are counted from ``EF999921_<SRR>.1.fastq`` and
    ``EF999921_<SRR>.2.fastq``, the unfiltered reads from ``<SRR>.1_1.fastq``
    and ``<SRR>.1_2.fastq``. One line is appended to ``MiniProject.log``.
    """
    donor_labels = {
        1: 'Donor 1 (2dpi)',
        2: 'Donor 1 (6dpi)',
        3: 'Donor 3 (2dpi)',
        4: 'Donor 3 (6dpi)',
    }
    donor = donor_labels.get(number, '')

    # A FASTQ record is 4 lines and a pair spans both mate files, so the line
    # total of the two files divided by 8 is the number of read pairs. Floor
    # division keeps the logged counts as whole numbers instead of floats.
    len_bowtie = (
        _count_fastq_lines('EF999921_' + SRR + '.1.fastq')
        + _count_fastq_lines('EF999921_' + SRR + '.2.fastq')
    ) // 8
    original = (
        _count_fastq_lines(SRR + '.1_1.fastq')
        + _count_fastq_lines(SRR + '.1_2.fastq')
    ) // 8

    # write out to the log file
    with open('MiniProject.log', 'a') as output:
        output.write(donor + " had " + str(original) + ' read pairs before Bowtie2 filtering and ' + str(len_bowtie) + ' read pairs after \n')
def run_sleuth(SRR):
    """Run the two R scripts for sleuth and copy their result into the log.

    Args:
        SRR: sequence of run accession strings; the first four are passed to
            ``make_sample_covariates.R``.

    After ``sleuth.R`` has run, every line of ``sleuth_output.txt`` is appended
    to ``MiniProject.log``. Each line is followed by an extra newline, so rows
    are separated by a blank line in the log.

    Raises:
        IndexError: if SRR has fewer than four entries.
    """
    # Make the sample covariates file from the four accessions
    covariates_command = 'Rscript make_sample_covariates.R ' + ' '.join(
        [SRR[0], SRR[1], SRR[2], SRR[3]]
    )
    os.system(covariates_command)
    os.system('Rscript sleuth.R')  # run sleuth

    # Both files are closed by the with-statement, even if a write fails.
    with open('sleuth_output.txt') as results, open('MiniProject.log', 'a') as output:
        for line in results:
            output.write(line + '\n')  # print out the top sleuth hits
#Assemble contigs > 1000 bp
def concat_contigs():
    """Join the significant contigs into one sequence in ``assembly.fasta``.

    Reads the whitespace-separated contigs from
    ``./Spades/significant_contigs.txt`` and writes them as a single string in
    which every contig (including the last) is followed by a run of N's.
    ``assembly.fasta`` is overwritten, so running the pipeline again does not
    stack a second copy of the sequence onto the previous one.
    """
    spacer = 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'
    with open('./Spades/significant_contigs.txt') as contig_file:
        contigs = contig_file.read().split()
    # join() builds the result in one pass instead of repeated string +=
    final_seq = ''.join(contig + spacer for contig in contigs)
    with open('assembly.fasta', 'w') as assembly:
        assembly.write(final_seq)
def contigs_count():
    """Keep the SPAdes contigs longer than 1000 bp and report how many there are.

    Every sequence in ``./Spades/contigs.fasta`` is read; those longer than
    1000 characters are written, one per line, to
    ``./Spades/significant_contigs.txt``. The tally is appended to
    ``MiniProject.log``.

    Returns:
        The number of contigs longer than 1000 bp.
    """
    all_contigs = [
        str(record.seq)
        for record in SeqIO.parse("./Spades/contigs.fasta", "fasta")
    ]
    long_contigs = [seq for seq in all_contigs if len(seq) > 1000]

    with open("./Spades/significant_contigs.txt", "w") as outfile:
        outfile.writelines(seq + '\n' for seq in long_contigs)

    count = len(long_contigs)
    with open('MiniProject.log', 'a') as log_file:
        # Report the number of contigs above the length threshold
        log_file.write("There are " + str(count) + " contigs > 1000 in the assembly" + '\n')
    return count
def contigs_length_count():
    """Log the total number of base pairs across the significant contigs.

    Reads ``./Spades/significant_contigs.txt`` (one contig per line) and
    appends the summed length to ``MiniProject.log``. Returns nothing.
    """
    with open("./Spades/significant_contigs.txt") as contig_file:
        # Strip each line first: the line terminator is not a base, and
        # counting it inflated the total by one per contig.
        count = sum(len(line.strip()) for line in contig_file)
    with open('MiniProject.log', 'a') as output:
        output.write("There are " + str(count) + " bp in the assembly" + '\n')
def blast():
    """BLAST the assembly against Herpesviridae and log the top hits.

    The contents of ``assembly.fasta`` are submitted to NCBI ``blastn`` against
    ``nr``, restricted with the Entrez query ``"Herpesviridae"[organism]``. The
    XML reply is saved to ``blast.xml`` and parsed back from that file. A
    header line and up to ten hit rows are appended to ``MiniProject.log``.

    Each row holds, space-separated: the hit description, ``hit.seq_len``
    (under the ``align_len`` heading), the number of HSPs, and the identity
    count, gap count, bit score and e-value of the hit's first HSP.
    """
    max_hits = 10

    with open("assembly.fasta") as assembly:
        fasta = assembly.read()

    # run blast against the assembled sequence
    handle = NCBIWWW.qblast("blastn", "nr", fasta, entrez_query='"Herpesviridae"[organism]')
    try:
        with open("blast.xml", "w") as out_handle:
            out_handle.write(handle.read())
    finally:
        handle.close()

    blast_qresult = SearchIO.read("blast.xml", "blast-xml")
    # Slicing never over-runs, so fewer than ten results needs no special case
    top_hits = blast_qresult.hits[:max_hits]

    with open('MiniProject.log', 'a') as output:
        output.write('seq_title '+ 'align_len ' + 'number_HSPs '+ 'topHSP_ident '+ 'topHSP_gaps '+ 'topHSP_bits '+ 'topHSP_expect \n')
        for hit in top_hits:
            top_hsp = hit[0]
            fields = [
                str(hit.description),
                str(hit.seq_len),
                str(len(hit.hsps)),
                str(top_hsp.ident_num),
                str(top_hsp.gap_num),
                str(top_hsp.bitscore),
                str(top_hsp.evalue),
            ]
            output.write(' '.join(fields) + '\n')
# ----------------- Theoretical Main Function ------------------
# Command line: one positional list of SRR accessions plus an optional switch
# that turns on downloading of the raw SRA files.
parser = argparse.ArgumentParser(description='Process some SRRs.')
parser.add_argument('SRRs', metavar='N', type=str, nargs='+', help='SRR values')
parser.add_argument('--download_files', default='N', help='Pull SRA Files from the database and split the paired reads')
args = parser.parse_args()

# The sleuth and SPAdes steps below read args.SRRs[0] through args.SRRs[3].
# Reject a short list now instead of failing with an IndexError only after the
# download, indexing and quantification steps have already run.
if len(args.SRRs) < 4:
    parser.error('at least four SRR values are required')

# Call all of the functions using the user given SRRs
with open('MiniProject.log', 'a') as output:
    output.write('SRA values tested: ' + str(args.SRRs) + '\n')

# Any value other than the default 'N' enables the download step
if args.download_files != 'N':
    for i in args.SRRs:
        Setup(i)
        with open('MiniProject.log', 'a') as output:
            output.write(i + ' SRA file downloaded \n')

Transcriptome_index()

for i in args.SRRs:
    run_kallisto(i)
run_sleuth(args.SRRs)

# The 1-based position of each accession selects its donor label in the log
for number, i in enumerate(args.SRRs, start=1):
    bowtie_build(i)
    Count_bowtie(i, number)

run_spades(args.SRRs[0], args.SRRs[1], args.SRRs[2], args.SRRs[3])
count = contigs_count()
contigs_length_count()

if count > 1:  # if there is more than one significant contig, run the assembly
    concat_contigs()
else:  # otherwise just make that one contig the assembly
    with open("./Spades/significant_contigs.txt") as contig_file:
        file = contig_file.readlines()
    for i in file:
        with open('assembly.fasta', 'a') as outfile:
            outfile.write(i)

blast()