forked from wpwupingwp/python
/
portable.py
executable file
·487 lines (456 loc) · 18.1 KB
/
portable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
#!/usr/bin/python3
from datetime import datetime
from ftplib import FTP
import re
import sqlite3
import urllib.request
import warnings
from Bio import BiopythonDeprecationWarning
from Bio import Entrez
from Bio import SeqIO
from Bio.Seq import MutableSeq
from os import makedirs
from os.path import exists
from zipfile import ZipFile
warnings.simplefilter('ignore', BiopythonDeprecationWarning)
def _feature_operator(feature):
    """Return the location operator of *feature* (e.g. 'join'), or None.

    The attribute ``feature.location_operator`` is tried first; when it is
    missing or None the value is read from ``feature.location.operator``,
    which is where newer Biopython releases keep it.
    """
    operator = getattr(feature, 'location_operator', None)
    if operator is None:
        operator = getattr(feature.location, 'operator', None)
    return operator


def _joined_locations(feature):
    """Return the sub-locations of a joined feature, in the order they are listed.

    Uses ``feature.sub_features`` when present and non-empty, otherwise
    ``feature.location.parts``.
    """
    sub_features = getattr(feature, 'sub_features', None)
    if sub_features:
        return [sub.location for sub in sub_features]
    return list(feature.location.parts)


def _cut(raw_seq, location):
    """Return ``(begin, end, sequence)`` for *location* as slice coordinates."""
    begin = int(location.start)
    end = int(location.end)
    return begin, end, str(raw_seq.seq[begin:end])


def _numbered_name(qualifiers, kind):
    """Build 'gene_kind' or 'gene_kind_number' for exon and intron features."""
    pieces = [str(qualifiers['gene'][0]), kind]
    if 'number' in qualifiers:
        pieces.append(str(qualifiers['number'][0]))
    return '_'.join(pieces)


def parser(raw_seq, date):
    """Extract annotated fragments from one GenBank record.

    Every fragment becomes a 10-item list
    ``[taxon_id, organism, accession, name, type, begin, end, strand,
    sequence, date]`` and all of them are appended to the module-level list
    ``database``, which must exist before this function is called.

    Fragments produced:
      * one 'whole' record holding the complete sequence;
      * 'gene', 'cds', 'tRNA', 'rRNA', 'exon' and 'intron' records for
        features of those types (rRNA needs a 'product' qualifier, the
        others a 'gene' qualifier); features of any other type are skipped;
      * 'spacer' records for the sequence between consecutive entries of the
        internal ``gene`` list.

    Args:
        raw_seq: a record exposing ``features``, ``annotations``, ``seq`` and
            ``len()``. The first feature's first 'db_xref' value is assumed
            to look like 'taxon:<id>' (the first six characters are dropped).
        date: value stored unchanged in the last column of every record.
    """
    taxon_id = int(raw_seq.features[0].qualifiers['db_xref'][0][6:])
    organism = raw_seq.annotations['organism']
    accession = raw_seq.annotations['accessions'][0]

    def record(name, frag_type, begin, end, strand, sequence):
        return [taxon_id, organism, accession, name, frag_type,
                begin, end, strand, sequence, date]

    records = [record(organism, 'whole', 1, len(raw_seq), 1, str(raw_seq.seq))]
    gene = []
    for feature in raw_seq.features:
        kind = feature.type
        qualifiers = feature.qualifiers
        has_gene = 'gene' in qualifiers
        # Always stored as text; str() also copes with a strand of None.
        strand = str(feature.location.strand)
        if kind == 'gene' and has_gene:
            frag_type = 'gene'
            name = str(qualifiers['gene'][0])
            if _feature_operator(feature) == 'join':
                parts = _joined_locations(feature)
                first_begin, first_end, first_seq = _cut(raw_seq, parts[0])
                # The first part is kept (without sequence) as an anchor for
                # the spacer computation below.
                gene.append(record(name, frag_type, first_begin, first_end,
                                   strand, ''))
                # Only the first two parts are used; the stored coordinates
                # are those of the second part.
                begin, end, second_seq = _cut(raw_seq, parts[1])
                sequence = first_seq + second_seq
            else:
                begin, end, sequence = _cut(raw_seq, feature.location)
        elif kind == 'CDS' and has_gene:
            frag_type = 'cds'
            name = str(qualifiers['gene'][0]).replace(' ', '_')
            begin, end, sequence = _cut(raw_seq, feature.location)
        elif kind == 'tRNA' and has_gene:
            frag_type = 'tRNA'
            name = str(qualifiers['gene'][0]).replace(' ', '_')
            begin, end, sequence = _cut(raw_seq, feature.location)
            # tRNA spans of 100 bases or more are blanked out.
            if len(sequence) >= 100:
                sequence = ''
        elif kind == 'rRNA' and 'product' in qualifiers:
            frag_type = 'rRNA'
            name = str(qualifiers['product'][0]).replace(' ', '_')
            begin, end, sequence = _cut(raw_seq, feature.location)
        elif kind in ('exon', 'intron') and has_gene:
            frag_type = kind
            name = _numbered_name(qualifiers, kind)
            begin, end, sequence = _cut(raw_seq, feature.location)
        else:
            # Unhandled feature type: emit nothing instead of re-appending
            # the record of the previous feature.
            continue
        records.append(record(name, frag_type, begin, end, strand, sequence))

    # NOTE(review): only the first parts of joined genes are collected in
    # ``gene``, so spacers are computed between those entries only. This
    # matches the original behaviour; confirm whether every gene was meant
    # to take part.
    gene.sort(key=lambda rec: rec[5])
    for now, then in zip(gene, gene[1:]):
        # begin/end are slice coordinates, so the gap between two entries is
        # exactly [end of current, begin of next).
        spacer_begin = now[6]
        spacer_end = then[5]
        name = '_'.join(['-'.join([now[3], then[3]]), 'Spacer'])
        sequence = str(raw_seq.seq[spacer_begin:spacer_end])
        records.append(record(name, 'spacer', spacer_begin, spacer_end, 0,
                              sequence))
    records.extend(gene)
    database.extend(records)
def init_seq():
    """Store the collected sequence records in the SQLite file ./data/DB.

    Creates the ``main`` table when it is missing, then inserts every entry
    of the module-level ``database`` list whose sequence field (index 8) is
    not empty. Prints 'Done.' when finished.
    """
    connection = sqlite3.connect('./data/DB')
    cursor = connection.cursor()
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS main (Taxon INT, Organism TEXT, Accession TEXT, Name TEXT, Type TEXT, Head INT, Tail INT, Strand TEXT, Sequence TEXT, Date TEXT, ID INTEGER PRIMARY KEY);')
    insert_sql = 'INSERT INTO main (Taxon, Organism, Accession, Name, Type, Head, Tail, Strand, Sequence, Date) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);'
    # Records without a sequence are placeholders and are not stored.
    filled_rows = (tuple(entry[:10]) for entry in database if entry[8] != '')
    cursor.executemany(insert_sql, filled_rows)
    connection.commit()
    cursor.close()
    connection.close()
    print('Done.\n')
def seq_batch_query():
    """Export the stored fragments of every organism named in a list file.

    Asks for the name of a text file holding one organism name per line,
    selects all rows of ``main`` whose Organism equals one of those names and
    appends each fragment to ``./out/<fragment name>.fasta``. Fragments stored
    with strand '-1' are written reverse-complemented. Afterwards, when
    ``./out/rps12.fasta`` exists, its records shorter than 4000 bases are
    copied to ``./out/rps12short.fasta``.
    """
    list_file = input('list file name:\n')
    with open(list_file, 'r') as In:
        # Blank lines (including the one after a trailing newline) carry no name.
        organism_list = [line.strip() for line in In.read().split(sep='\n')
                         if line.strip()]

    con = sqlite3.connect('./data/DB')
    try:
        cur = con.cursor()
        cur.execute('CREATE TABLE IF NOT EXISTS tasklist (Name TEXT);')
        try:
            cur.executemany('INSERT INTO tasklist (Name) VALUES (?);',
                            ((organism,) for organism in organism_list))
            # The statement has no placeholders, so no parameters are passed.
            cur.execute(
                'SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main WHERE Organism IN (SELECT Name FROM tasklist) ORDER BY Head')
            result = cur.fetchall()
        finally:
            # The helper table is scratch space only; never leave it behind.
            cur.execute('DROP TABLE tasklist;')
        cur.close()
    finally:
        con.close()

    if not exists('./out'):
        makedirs('./out')
    for row in result:
        title = '|'.join([str(row[0]), row[1], row[2], row[3]])
        sequence = MutableSeq(row[5])
        if row[4] == '-1':
            # reverse_complement() either works in place (returning None or
            # the object itself) or returns a new object, depending on the
            # Biopython version; handle both.
            flipped = sequence.reverse_complement()
            if flipped is not None:
                sequence = flipped
        # '/' cannot be part of a file name; drop it as seq_query() does.
        filename = ''.join(['./out/', row[2].replace('/', ''), '.fasta'])
        with open(filename, 'a') as Fileout:
            Fileout.write('>%s\n%s\n' % (title, sequence))

    # rps12 output may contain very long fragments; keep a copy holding only
    # the records shorter than 4000 bases.
    rps12_path = './out/rps12.fasta'
    if exists(rps12_path):
        rps12short = [item for item in SeqIO.parse(rps12_path, 'fasta')
                      if len(item.seq) < 4000]
        SeqIO.write(rps12short, './out/rps12short.fasta', 'fasta')
    print('Done.\n')
def seq_query():
    """Interactively query the sequence table and write the hits as FASTA.

    Prompts for one of five query modes and for whether the output should be
    organised. Organised output appends each hit to
    ``output/<fragment name>.fasta``; otherwise all hits are written to a
    single ``<name>.fasta`` chosen by the user. Fragments stored with strand
    '-1' are written reverse-complemented.

    Raises:
        ValueError: if the selected query mode is not one of '1'..'5'.
    """
    query_type = input(
        '1.Specific fragment\n'
        '2.Specific Organism\n'
        '3.Specific gene\n'
        '4.All\n'
        '5.All cds\n'
    )
    organize = input('Organize output?(y/n)\n')
    if query_type not in ['1', '2', '3', '4', '5']:
        raise ValueError('wrong input!\n')

    con = sqlite3.connect('./data/DB')
    try:
        cur = con.cursor()
        if query_type == '1':
            organism = input('Organism:\n')
            gene = input('Gene:\n')
            frag_type = input('Fragment type(gene, cds, rRNA, tRNA, exon, intron, spacer):\n')
            cur.execute(
                'SELECT Taxon, Organism, Name, Type, Strand, Sequence FROM main WHERE Name LIKE ? AND Type = ? AND Organism=?',
                ('%' + gene + '%', frag_type, organism))
        elif query_type == '2':
            organism = input('Organism:\n')
            frag_type = input('Fragment type(gene, cds, rRNA, tRNA, exon, intron, spacer, whole, fragments):\n')
            if frag_type == 'fragments':
                # 'fragments' means every stored row of exactly this organism.
                cur.execute(
                    'SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main WHERE Organism = ? ORDER BY Head',
                    (organism,))
            else:
                cur.execute(
                    'SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main WHERE Organism LIKE ? AND Type = ? ORDER BY Head',
                    ('%' + organism + '%', frag_type))
        elif query_type == '3':
            gene = input('Gene:\n')
            frag_type = input('Fragment type(gene, cds, rRNA, tRNA, exon, intron, spacer):\n')
            cur.execute(
                'SELECT Taxon, Organism, Name, Type, Strand, Sequence FROM main WHERE Name LIKE ? AND Type = ? ORDER BY Taxon',
                ('%' + gene + '%', frag_type))
        elif query_type == '4':
            cur.execute('SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main ORDER BY Taxon')
        else:
            # Single quotes: in SQL a double-quoted token is an identifier.
            cur.execute(
                "SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main WHERE Type = 'cds' ORDER BY Taxon")
        result = cur.fetchall()
        cur.close()
    finally:
        con.close()

    query_result = []
    for row in result:
        title = '|'.join([str(row[0]), row[1], row[2], row[3]])
        sequence = MutableSeq(row[5])
        if row[4] == '-1':
            # reverse_complement() either works in place (returning None or
            # the object itself) or returns a new object, depending on the
            # Biopython version; handle both.
            flipped = sequence.reverse_complement()
            if flipped is not None:
                sequence = flipped
        query_result.append([title, row[2], sequence])

    if organize == 'y':
        if not exists('output'):
            makedirs('output')
        for title, gene, sequence in query_result:
            # '/' cannot be part of a file name, so it is dropped.
            file_name = ''.join(['output', '/', gene.replace('/', ''), '.fasta'])
            with open(file_name, 'a') as output_file:
                output_file.write('>%s\n%s\n' % (title, sequence))
    else:
        output = input('Enter output filename:\n')
        with open('.'.join([output, 'fasta']), 'w') as output_file:
            for title, _gene, sequence in query_result:
                output_file.write('>%s\n%s\n' % (title, sequence))
    print('Done.\n')
def update_seq_db_from_genbank(date):
    """Download plastid genomes from GenBank and load them into the database.

    Reads the NCBI plastid genome listing, collects every nine-digit id that
    follows 'nuccore/' in the page, fetches those records in GenBank format
    into a file called ``genbank`` in the working directory and passes that
    file to ``update_seq_from_file``.

    Args:
        date: value forwarded unchanged to ``update_seq_from_file``.
    """
    with urllib.request.urlopen(
            'http://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=2759&opt=plastid') as response:
        down = response.read().decode('utf-8')
    genome_list = re.findall('((?<=nuccore/)[0-9]{9})', down)

    user_email = input('Input your email address for downloading data or use default(by press enter):\n')
    # input() strips the newline, so pressing enter yields an empty string.
    if user_email.strip() == '':
        Entrez.email = 'wpwupingwp@outlook.com'
    else:
        Entrez.email = user_email.strip()

    # Post the id list first, then fetch the whole set through the returned
    # WebEnv/QueryKey pair.
    handle = Entrez.read(Entrez.epost(db='nuccore', id=', '.join(genome_list)))
    genome_content = Entrez.efetch(db='nuccore', webenv=handle['WebEnv'],
                                   query_key=handle['QueryKey'],
                                   rettype='gb', retmode='text')
    try:
        with open('genbank', 'w') as output:
            output.write(genome_content.read())
    finally:
        genome_content.close()
    update_seq_from_file('genbank', date)
def update_seq_from_file(genbank_file, date):
    """Load a local GenBank file into the sequence database.

    Resets the module-level ``database`` list, runs ``parser`` on every
    record of *genbank_file* and finally calls ``init_seq`` to store the
    collected records.
    """
    global database
    database = list()
    for genbank_record in SeqIO.parse(genbank_file, 'genbank'):
        parser(genbank_record, date)
    init_seq()
def _read_dump(path):
    """Yield the '|'-separated fields (tabs removed) of each non-blank line."""
    with open(path, 'r') as dumpfile:
        for line in dumpfile:
            line = line.rstrip('\n')
            if line:
                yield line.replace('\t', '').split(sep='|')


def init_taxon():
    """Build the ``taxon`` table of ./data/DB from the NCBI taxonomy dump.

    Downloads ``taxdmp.zip`` from ftp.ncbi.nih.gov/pub/taxonomy into ./data/
    when it is not there yet, extracts it and reads ``names.dmp`` and
    ``nodes.dmp``. For every taxon id found in names.dmp one row is stored:
    id, name (the 'scientific name' entry when there is one), rank, direct
    children seen on species lineages ("Son"), the ancestors of the taxon up
    to but excluding id '1' ("Parent") and the species below it
    ("GreatSon"). The same rows are kept, unjoined, in the module-level list
    ``taxon``. Prints 'Done.' when finished.
    """
    global taxon
    archive = './data/taxdmp.zip'
    if not exists(archive):
        # The context managers close the FTP session and flush the archive
        # to disk before it is opened for extraction.
        with FTP('ftp.ncbi.nih.gov') as ftp:
            ftp.login()
            ftp.cwd('pub/taxonomy')
            with open(archive, 'wb') as download:
                ftp.retrbinary('RETR taxdmp.zip', download.write)
    with ZipFile(archive, 'r') as dumpfile:
        dumpfile.extractall(path='./data/')

    # names.dmp fields: id | name | unique name | name class |
    # e.g. 1696063|Sarcocystis corvusi||scientific name|
    name = dict()
    for fields in _read_dump('./data/names.dmp'):
        if fields[0] not in name or fields[3] == 'scientific name':
            name[fields[0]] = fields[1]

    # nodes.dmp fields used here: id | parent id | rank | ...
    taxon_id = dict()  # id -> parent id
    rank = dict()
    species = list()
    for fields in _read_dump('./data/nodes.dmp'):
        taxon_id[fields[0]] = fields[1]
        rank[fields[0]] = fields[2]
        if fields[2] == 'species':
            species.append(fields[0])

    # One lineage per species: the species itself followed by its ancestors,
    # stopping before id '1'.
    lineages = list()
    for species_id in species:
        lineage = [species_id]
        current = species_id
        while taxon_id[current] != '1':
            current = taxon_id[current]
            lineage.append(current)
        lineages.append(lineage)

    son = dict()
    greatson = dict()
    parent = dict()
    for lineage in lineages:
        for n, node in enumerate(lineage):
            if node not in parent:
                parent[node] = lineage[(n + 1):]
            if n == 0:
                continue
            son.setdefault(node, set()).add(lineage[n - 1])
            greatson.setdefault(node, set()).add(lineage[0])

    taxon = list()
    for tax_id in name:
        taxon.append([tax_id, name[tax_id], rank[tax_id],
                      son.setdefault(tax_id, set()),
                      parent.setdefault(tax_id, list()),
                      greatson.setdefault(tax_id, set())])

    con = sqlite3.connect('./data/DB')
    try:
        cur = con.cursor()
        cur.execute(
            'CREATE TABLE IF NOT EXISTS taxon (Id TEXT, Name TEXT, Rank TEXT, Son TEXT, Parent TEXT, GreatSon TEXT);')
        # Sets are sorted so the stored text is the same on every run.
        cur.executemany(
            'INSERT INTO taxon (Id, Name, Rank, Son, Parent, GreatSon) VALUES (?, ?, ?, ?, ?, ?);',
            ((line[0], line[1], line[2], ' '.join(sorted(line[3])),
              ' '.join(line[4]), ' '.join(sorted(line[5])))
             for line in taxon))
        con.commit()
        cur.close()
    finally:
        con.close()
    print('Done.\n')
def taxon_query_auto(name):
    """Look up the Parent column of taxa whose name contains *name*.

    Unfinished helper intended for the sequence query: the statement is
    executed but its result is neither fetched nor returned, so the function
    always returns None.
    """
    con = sqlite3.connect('./data/DB')
    try:
        cur = con.cursor()
        cur.execute('SELECT Parent FROM taxon WHERE Name LIKE ?;', ('%' + name + '%',))
        # TODO: fetch the rows and turn them into a rank mapping (to be continued).
        cur.close()
    finally:
        # Release the database even though nothing is returned yet.
        con.close()
def taxon_query_no_auto():
    """Interactively look up taxa and append the findings to out.txt.

    Repeatedly asks whether to search by id or by exact name, runs the query
    against the ``taxon`` table of ./data/DB and appends, for every hit, its
    id, name, rank and the names of its parents, sons and greatsons to
    ``out.txt`` (UTF-8). Any answer other than '1' or '2' ends the loop.
    """
    while True:
        query_type = input(
            '1.by id\n'
            '2.by name\n'
        )
        if query_type not in ['1', '2']:
            return

        con = sqlite3.connect('./data/DB')
        try:
            cur = con.cursor()
            if query_type == '1':
                taxon_id = input('taxon id:\n')
                cur.execute('SELECT * FROM taxon WHERE Id = ?;', (taxon_id,))
            else:
                name = input('scientific name:\n')
                cur.execute('SELECT * FROM taxon WHERE Name = ?;', (name,))
            result = cur.fetchall()
            cur.execute('SELECT Id, Name FROM taxon;')
            name_dict = dict(cur.fetchall())
            cur.close()
        finally:
            con.close()
        # An empty Son/Parent/GreatSon column splits into [''].
        name_dict.setdefault('', '')

        def names_of(column):
            # Fall back to the raw id when it has no entry in the name table.
            return [name_dict.get(item, item) for item in column.split(sep=' ')]

        if not result:
            continue
        with open('out.txt', 'a', encoding='utf-8') as handle:
            for row in result:
                taxon_id, name, rank, son, parent, greatson = row[:6]
                handle.write(''.join(['id : ', taxon_id, '\n']))
                handle.write(''.join(['name : ', name, '\n']))
                handle.write(''.join(['rank : ', rank, '\n']))
                handle.write(''.join(['parent : ', '->'.join(names_of(parent)), '\n']))
                handle.write(''.join(['son : ', ', '.join(names_of(son)), '\n']))
                handle.write(''.join(['greatson: ', ', '.join(names_of(greatson)), '\n\n']))
# __main()__
def main():
    '''Entry point: show the menu and run the selected action.

    The current local time, formatted as YYYY-MM-DD-HH:MM:SS, is passed to
    the two update actions. Raises ValueError for any answer outside the
    menu.
    '''
    option = input(
        'Select:\n'
        '1.Update database from GenBank\n'
        '2.Add pvirate data\n'
        '3.Query\n'
        '4.Init Taxon\n'
        '5.Query Taxon\n'
        '6.Batch Query\n'
    )
    date = datetime.now().isoformat(sep='-')[:19]

    def add_private_data():
        genbank_file = input('Genbank format filename:\n')
        update_seq_from_file(genbank_file, date)

    actions = {
        '1': lambda: update_seq_db_from_genbank(date),
        '2': add_private_data,
        '3': seq_query,
        '4': init_taxon,
        '5': taxon_query_no_auto,
        '6': seq_batch_query,
    }
    if option not in actions:
        raise ValueError('Input error!\n')
    actions[option]()
# Every database and download path lives below ./data, so make sure the
# directory exists; exist_ok avoids the gap between checking and creating.
makedirs('data', exist_ok=True)
if __name__ == "__main__":
    main()