/
geneprotxml.py
183 lines (165 loc) · 9.66 KB
/
geneprotxml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# main program: samplespecificdbgenerator.py
__author__ = "anthony"
__date__ = "$Oct 27, 2015 2:41:36 PM$"
import sys
import os.path
import optparse
import refparse
import novelsplices
import variantcalls
import featureshare
import indel
from lxml import etree as et
import numpy as np
from Bio import SeqIO
import threading
# XML namespace URIs used for the UniProt-XML trees handled by this script.
HTML_NS = "http://uniprot.org/uniprot"
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"

# Prefix-to-URI mapping; the None key carries the default (un-prefixed) namespace.
NAMESPACE_MAP = {
    None: HTML_NS,
    "xsi": XSI_NS,
}

# "{namespace-uri}" prefix that is prepended to tag names, e.g. UP + 'sequence'.
UP = "{%s}" % HTML_NS
def _fail(message, exit_code):
    """Write *message* (plus a newline) to stderr and exit with *exit_code*."""
    sys.stderr.write(message + "\n")
    # sys.exit rather than the bare exit() helper, which is only injected by
    # the site module and is not meant for use in programs.
    sys.exit(exit_code)


def _count_lines(path):
    """Return the number of lines in the file at *path*, closing it afterwards."""
    with open(path, 'r') as handle:
        return sum(1 for _ in handle)


def _entries_by_sequence(root):
    """Map sequence text (with line breaks removed) to the child of *root* holding it.

    Children without a <sequence> element, or with an empty one, are skipped.
    When several children share a sequence, the last one wins.
    """
    entries = {}
    for entry in root:
        sequence = entry.find(UP+'sequence')
        if sequence is None or sequence.text is None:
            continue
        entries[sequence.text.replace('\n', '').replace('\r', '')] = entry
    return entries


def _transfer_uniprot_annotations(ensembl_root, uniprot_root):
    """Copy UniProt annotations onto the matching entries of *ensembl_root*.

    Entries with identical sequences share all features. The remaining
    sequences are compared pairwise: equal-length pairs differing at no more
    than three positions are passed to indel.append_seqvar, and pairs whose
    lengths differ by one to three residues are passed to indel.append_indel.
    """
    copyright_element = uniprot_root.find(UP+'copyright')
    if copyright_element is not None:  # remove(None) would raise
        uniprot_root.remove(copyright_element)

    ensembl_entries_by_seq = _entries_by_sequence(ensembl_root)
    uniprot_entries_by_seq = _entries_by_sequence(uniprot_root)
    common_seqs = set(ensembl_entries_by_seq) & set(uniprot_entries_by_seq)
    uniprot_only = list(set(uniprot_entries_by_seq) - common_seqs)
    ensembl_only = list(set(ensembl_entries_by_seq) - common_seqs)

    for seq in common_seqs:
        featureshare.share_all_features(uniprot_entries_by_seq[seq], ensembl_entries_by_seq[seq])

    aa_num_dict = featureshare.aa_num_dict()
    for ensembl_seq in ensembl_only:
        ensembl_entry = ensembl_entries_by_seq[ensembl_seq]
        # Converted lazily, at most once per Ensembl sequence, instead of once
        # per candidate pair. Assumes seq_to_num is a pure conversion.
        ensembl_num_seq = None
        for uniprot_seq in uniprot_only:
            length_difference = abs(len(ensembl_seq) - len(uniprot_seq))
            if length_difference == 0:
                if ensembl_num_seq is None:
                    ensembl_num_seq = featureshare.seq_to_num(ensembl_seq, aa_num_dict)
                uniprot_num_seq = featureshare.seq_to_num(uniprot_seq, aa_num_dict)
                comparison = uniprot_num_seq - ensembl_num_seq
                if np.count_nonzero(comparison) <= 3:
                    # NOTE(review): the UniProt *sequence string* is passed here, exactly as
                    # the original uniprot_only[j] did; the original comment called it a
                    # placeholder -- confirm whether the entry element was intended.
                    indel.append_seqvar(ensembl_entry, uniprot_seq, comparison)
            elif length_difference <= 3:
                result = indel.get_indel_locations(ensembl_seq, uniprot_seq)
                indel.append_indel(ensembl_entry, uniprot_seq, result)


def __main__():
    """Build a sample-specific protein database from the command-line inputs.

    Steps, in order: parse options; read the protein FASTA (once through
    refparse.read_protein_fasta and once into an XML tree); optionally parse
    the reference UniProt-XML; open the output files; load the GTF gene model;
    feed every VCF line to variantcalls.parse_vcf_line and every splice BED
    line to novelsplices.translate_bed_line; transfer UniProt annotations when
    a reference XML was given; write the XML and, when --output was given,
    the FASTA.

    Exit codes: 2 when the protein FASTA, reference XML, gene model or splice
    BED cannot be processed, 3 when the output files cannot be opened, and 1
    when VCF processing fails.

    Only syntax valid on both Python 2.7 and Python 3 is used here.
    """
    #Parse Command Line
    parser = optparse.OptionParser()
    #I/O
    parser.add_option( '-x', '--reference_xml', dest='reference_xml', help='Reference protein UniProt-XML file. Sequence variant peptide entries are appended to this database to generate the ouptut UniProt-XML protein database.' )
    parser.add_option( '-p', '--protein_fasta', dest='protein_fasta', help='Reference protein FASTA file. Used to generate SAV peptide entries. If no UniProt-XML is specified, SAV and NSJ entries will be appended to this database to generate an output database. By default, this output will be a UniProt-XML protein database without PTM annotations. If --output-fasta is selected, the output will be a protein FASTA.')
    parser.add_option( '-g', '--gene_model', dest='gene_model', default=None, help='GTF gene model file. Used to annotate NSJ peptide entries.')
    # NOTE(review): the help promises a stdin fallback, but a missing --snpeff_vcf
    # currently ends in "VCF processing failed" (exit 1).
    parser.add_option( '-v', '--snpeff_vcf', dest='snpeff_vcf', help='SnpEff VCF file with HGVS annotations (else read from stdin).' )
    # NOTE(review): options.cufflinks_bed is never read in this function.
    parser.add_option( '-c', '--cufflinks_bed', dest='cufflinks_bed', help='Reconstructed transcripts in bed format.' )
    parser.add_option( '-b', '--splice_bed', dest='splice_bed', help='BED file (tophat junctions.bed) with sequence column added.' )
    parser.add_option( '-o', '--output', dest='output', help='Output file path. Outputs UniProt-XML and FASTA format databases.' )
    # parser.add_option( '-z', '--output_fasta', dest='output_fasta', action='store_true', default=False, help='Output a FASTA-format database. Place path for output file after the --output flag.')
    # parser.add_option( '-m', '--threads', dest='threads', type='int', default=)
    #Peptide sequence construction
    parser.add_option( '-l', '--leading_aa_num', dest='leading_aa_num', type='int', default=33, help='Leading number of AAs to output for SAV peptides. Default: 33.' )
    parser.add_option( '-t', '--trailing_aa_num', dest='trailing_aa_num', type='int', default=33, help='Trailing number of AAs to output for SAV peptides. Default: 33.' )
    #Filtering parameters
    parser.add_option( '-D', '--nsj_depth_cutoff', dest='nsj_depth_cutoff', type='int', default=0, help='Keep only NSJs found with above this depth (BED score field). Default: 0.' )
    parser.add_option( '-E', '--snv_depth_cutoff', dest='snv_depth_cutoff', type='int', default=0, help='Keep only SNVs found with above this depth (DP=# field). Default: 0.' )
    parser.add_option( '-M', '--minimum_length', dest='minimum_length', type='int', default=0, help='Keep only sequence variant peptides with greater than or equal to this length. Default: 0.' )
    #Simple entry
    parser.add_option( '-Q', '--bed_score_name', dest='bed_score_name', default="depth", help='Include in the NSJ ID line score_name:score. Default: "depth."' )
    parser.add_option( '-R', '--reference', dest='reference', default="None", help='Genome Reference Name for NSJ ID location. Automatically pulled from genome_build header in GTF if present.' )
    (options, _args) = parser.parse_args()

    ##INPUTS##
    #Protein FASTA
    # Checked up front: everything below needs the FASTA, and a missing option
    # used to surface as an opaque error from os.path.abspath(None).
    if options.protein_fasta is None:
        _fail("failed: no Ensembl reference protein database specified", 2)
    fasta_path = os.path.abspath(options.protein_fasta)
    try:
        # Left open on purpose: it is not visible here whether
        # read_protein_fasta consumes the handle eagerly or lazily.
        fasta_handle = open(fasta_path, 'r')
        protein_fasta = refparse.read_protein_fasta(fasta_handle)
    except Exception as e:
        _fail("failed: %s" % e, 2)

    #Reference XML (optional)
    uniprot_root = None
    if options.reference_xml is not None:
        try:
            with open(os.path.abspath(options.reference_xml), 'r') as ref_xml:
                xml_parser = et.XMLParser(remove_blank_text=True) #required for pretty additions
                uniprot_root = et.parse(ref_xml, xml_parser).getroot()
            for entry in uniprot_root:
                refparse.condense_xml_entry(entry)
        except Exception as e:
            _fail("Parsing and/or condensing reference xml failed: %s" % e, 2)
    else:
        # Supported mode per the --protein_fasta help, so warn and carry on.
        sys.stderr.write("warning: no UniProt reference protein database specified; output will not contain PTM annotations\n")

    #Read the Ensembl fasta into an XML structure
    try:
        ensembl_root = et.Element(UP+'uniprot', nsmap=NAMESPACE_MAP)
        ensembl = et.ElementTree(ensembl_root)
        with open(fasta_path, 'r') as ref_fasta:
            refparse.read_fasta_to_xml(ensembl_root, ref_fasta) #TODO: singularize the entry here
    except Exception as e:
        _fail("Parsing reference fasta failed: %s" % e, 2)

    ##OUTPUT##
    out_xml, out_fasta = None, None
    if options.output is None:
        # NOTE(review): progress messages below are also printed to stdout, so
        # they precede the XML in this mode.
        out_xml = sys.stdout
    else:
        try:
            # NOTE(review): basename() drops any directory part of --output, so
            # both files are created in the current working directory.
            out_basename = os.path.basename(options.output)
            out_xml = open(out_basename + '.xml', 'w')
            out_fasta = open(out_basename + '.fasta', 'w')
        except Exception as e:
            _fail("Opening outfile failed: %s" % e, 3)

    #Process gene model
    try:
        gene_model_path = os.path.abspath(options.gene_model)
        linect = _count_lines(gene_model_path)
        gene_model = refparse.GeneModel()
        with open(gene_model_path, 'r') as gene_model_file:
            for i, gtf_line in enumerate(gene_model_file):
                if i % 20000 == 0:
                    print("gene_model line " + str(i) + " of " + str(linect))
                if gtf_line.startswith('#'):
                    # Header lines are never gene entries; one may name the genome build.
                    if 'genome_build' in gtf_line:
                        options.reference = gtf_line.split()[1]
                    continue
                gene_model.new_entry(gtf_line)
    except Exception as e:
        _fail("Parsing gene model failed: %s" % e, 2)

    #Process VCF
    try:
        vcf_path = os.path.abspath(options.snpeff_vcf)
        linect = _count_lines(vcf_path)
        with open(vcf_path, 'r') as snpeff_vcf:
            for i, vcf_line in enumerate(snpeff_vcf):
                if i % 1000 == 0:
                    print("snpeff_vcf line " + str(i) + " of " + str(linect))
                variantcalls.parse_vcf_line(ensembl_root, vcf_line, protein_fasta, options.snv_depth_cutoff, options.minimum_length, options.leading_aa_num, options.trailing_aa_num)
    except Exception as e:
        _fail("VCF processing failed: %s" % e, 1)

    #Process Splice BED
    try:
        bed_path = os.path.abspath(options.splice_bed)
        linect = _count_lines(bed_path)
        with open(bed_path, 'r') as splice_bed:
            for i, bed_line in enumerate(splice_bed):
                if i % 2000 == 0:
                    print("splice_bed line " + str(i) + " of " + str(linect))
                if bed_line.startswith('track'):
                    continue
                novelsplices.translate_bed_line(ensembl_root, gene_model, bed_line, options.nsj_depth_cutoff, options.minimum_length, options.reference, options.bed_score_name)
    except Exception as e:
        _fail("Splice BED processing failed: %s" % e, 2)

    #Enter PTM information from uniprot into the EnsemblXML
    # Skipped without a reference XML; this used to crash with a NameError.
    if uniprot_root is not None:
        _transfer_uniprot_annotations(ensembl_root, uniprot_root)

    #Write the sample-specific xml to outfile
    ensembl.write(out_xml, pretty_print=True)

    #Write the sample-specific fasta to outfile
    if out_fasta is not None:
        entryct = len(ensembl_root)
        for i, entry in enumerate(ensembl_root):
            if i % 1000 == 0:
                print("writing entry " + str(i) + " of " + str(entryct))
            fasta_record = refparse.xml_to_fasta(entry)
            if fasta_record is None:
                continue
            out_fasta.write(fasta_record[0] + '\n' + fasta_record[1] + '\n')
        out_fasta.close()  # was never closed before
        out_xml.close()
    else:
        sys.stderr.write("Writing to fasta failed: no out-file found\n")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    __main__()