-
Notifications
You must be signed in to change notification settings - Fork 3
/
gene_fasta.py
48 lines (33 loc) · 1.43 KB
/
gene_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env python
import sys
from operator import itemgetter, attrgetter
from itertools import imap, ifilter, izip
from functools import partial
from urllib import unquote
from jbio.io.file import iterator_over_file
from jbio.gff import record_iterator as gff_iterator
from jbio.fasta import record_iterator as fasta_iterator
if len(sys.argv) != 3:
    # Usage goes to stderr so it can never be mistaken for FASTA output when
    # stdout is redirected to a file or piped into another tool.
    sys.stderr.write("gene_fasta.py input.fa input.gff\n")
    sys.exit(1)

# Attribute keys reported in every FASTA header as "[key=value]" tags.
# A wider selection, kept for reference:
#   ["ID", "Alias", "orf_classification", "gene", "Note"]
FIELDS = ["ID", "Note"]

# Positional arguments: the FASTA file first, then the GFF annotation file.
fa_fn, gff_fn = sys.argv[1:3]
#read fasta records into memory
def fasta_clean_getter(fasta_entry):
    """Return a ``(sequence_id, sequence)`` pair for one FASTA record.

    The id is the first whitespace-delimited token of ``fasta_entry.name``;
    any text following it in the name is dropped. ``fasta_entry.seq`` is
    passed through unchanged.

    Raises IndexError when the record name is empty or all whitespace.
    """
    seq_id = fasta_entry.name.split(None, 1)[0]
    return seq_id, fasta_entry.seq
# Load every FASTA record into memory, keyed by its cleaned sequence id.
fasta_records = dict(
    fasta_clean_getter(entry)
    for entry in fasta_iterator(iterator_over_file(fa_fn))
)

# Keep only the GFF rows whose feature type is "gene"; the filter is applied
# lazily as the rows are consumed.
gene_entries = (
    row
    for row in gff_iterator(iterator_over_file(gff_fn))
    if row.feature == "gene"
)
def _parse_gff_attributes(attribute_string):
    """Split a GFF attribute column into a ``{key: value}`` dict.

    Segments are separated by ";" and each segment is split on its first "="
    only, so a value that itself contains "=" stays intact. Blank segments
    (left behind by a trailing or doubled ";") are skipped. A segment with no
    "=" at all is stored with an empty-string value. When a key repeats, the
    last occurrence wins.
    """
    attrs = {}
    for segment in attribute_string.split(";"):
        if not segment.strip():
            continue
        key, _, value = segment.partition("=")
        # Tolerate whitespace after the ";" separator in front of the key.
        attrs[key.strip()] = value
    return attrs


# Emit one two-line FASTA entry (header, then sequence) per gene record.
for gene_record in gene_entries:
    attrs = _parse_gff_attributes(gene_record.attribute)

    # The header starts with the gene's Name attribute; a gene record that
    # lacks one raises KeyError here.
    header = ">" + attrs["Name"]

    tags = []
    for field in FIELDS:
        value = attrs.get(field, "None")
        if field == "Note":
            # Only Note is percent-decoded; other fields are emitted verbatim.
            value = unquote(value)
        tags.append("[%s=%s]" % (field, value))
    header += " " + " ".join(tags)

    # start/end are treated as 1-based inclusive coordinates, so the slice
    # [start - 1:end] covers the whole feature.
    # NOTE(review): the strand is never consulted, so a minus-strand gene is
    # written as the forward-strand sequence -- confirm this is intended.
    seq = fasta_records[gene_record.seqname][gene_record.start - 1:gene_record.end]

    # Single-argument print(...) gives the same output under the Python 2
    # print statement and the Python 3 print function.
    print(header)
    print(seq)