/
tab_to_vcf.py
124 lines (94 loc) · 4.63 KB
/
tab_to_vcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/bin/env python
"""
Convert the given tab-delimited document into a VCF 4.1 document for annotation with Seattle Seq.
"""
import argparse
import csv
from fastahack import FastaHack
import vcf
from vcf.model import _Record
# Path of the VCF file whose header is reused for the generated output.
TEMPLATE_VCF_FILE = "template-4.1.vcf"

# Pairs of (fixed VCF column, column name in the tab-delimited input), listed
# in the positional order in which the values are handed to PyVCF's _Record.
VCF_TO_FIELDS = (
    ("#CHROM", "Chrom"),
    ("POS", "Pos(hg19)"),
    ("ID", "Unique id"),
    ("REF", "Ref"),
    ("ALT", "Allele"),
    ("QUAL", "QUAL"),
    ("FILTER", "FILTER"),
)

# Offsets into a row built from VCF_TO_FIELDS for the columns edited in place.
CHROMOSOME_INDEX = 0
POSITION_INDEX = 1
REF_INDEX = 3
ALT_INDEX = 4
def get_sequence(reference_dict, chrom, position):
    """
    Return the upper-cased reference sequence found at a single position.

    The lookup key is a region string of the form "chrom:position-position",
    with the position coerced to an integer first. `reference_dict` only has
    to support item access by that string.
    """
    coordinate = int(position)
    region = "%s:%s-%s" % (chrom, coordinate, coordinate)
    sequence = reference_dict[region]
    return sequence.upper()
def gatk_indel_to_vcf(vcf_row, reference_dict):
    """
    Convert the given indel from GATK format to VCF 4.1 standard. For example,
    the following lines should be converted from:

    2 60689253 1720 * +G . . . .
    21 38877833 1721 * -C . . . .
    21 47958429 1722 * +CTGGTCT . . . .

    to:

    2 60689253 1720 A AG . . . .
    21 38877833 1721 GC G . . . .
    21 47958429 1722 A ACTGGTCT . . . .

    `vcf_row` is a list laid out in the fixed VCF column order; it is modified
    in place and also returned. `reference_dict` is passed to get_sequence()
    to look up the reference base at the row's chromosome and position. A row
    whose alternate allele starts with neither "+" nor "-" is returned
    unchanged.

    >>> reference_dict = FastaHack("human_1kg_v37.fasta")
    >>> gatk_indel_to_vcf(['2', 60689253, '1720', '*', '+G', '.', '.', '.', '.'], reference_dict)
    ['2', 60689253, '1720', 'A', 'AG', '.', '.', '.', '.']
    >>> gatk_indel_to_vcf(['21', 38877833, '1721', '*', '-C', '.', '.', '.', '.'], reference_dict)
    ['21', 38877833, '1721', 'GC', 'G', '.', '.', '.', '.']
    >>> gatk_indel_to_vcf(['21', 47958429, '1722', '*', '+CTGGTCT', '.', '.', '.', '.'], reference_dict)
    ['21', 47958429, '1722', 'A', 'ACTGGTCT', '.', '.', '.', '.']
    """
    # Load the base at the given position.
    reference_base = get_sequence(reference_dict, vcf_row[CHROMOSOME_INDEX], vcf_row[POSITION_INDEX])
    indel = vcf_row[ALT_INDEX]

    if indel.startswith("+"):
        # Insertion: the reference allele is the position's base and the
        # alternate allele is that base followed by the inserted base(s).
        # Only the single leading "+" marker is dropped; str.replace() would
        # substitute the reference base for every "+" in the string.
        vcf_row[REF_INDEX] = reference_base
        vcf_row[ALT_INDEX] = reference_base + indel[1:]
    elif indel.startswith("-"):
        # Deletion: the reference allele is the position's base followed by
        # the deleted base(s) and the alternate allele is the position's base.
        vcf_row[REF_INDEX] = "%s%s" % (reference_base, indel.lstrip("-"))
        vcf_row[ALT_INDEX] = reference_base

    return vcf_row
def tab_to_vcf(input_file, output_file, reference_file):
    """
    Write the variants of a tab-delimited file out as a VCF file.

    The input is read with a header row; for every data row the columns named
    in VCF_TO_FIELDS are collected in fixed VCF order (#CHROM, POS, ID, REF,
    ALT, QUAL, FILTER), with "." substituted for any column the row lacks.
    The position is converted to an integer, GATK-style indels (alternate
    allele starting with "+" or "-" and containing no "/") are rewritten via
    gatk_indel_to_vcf(), and the alternate allele is wrapped in a list.

    PyVCF's _Record class requires the following arguments:

    CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sample_indexes

    so an empty INFO dict, a "." FORMAT and an empty sample index list are
    appended before each record is built. The output header comes from
    TEMPLATE_VCF_FILE, and `reference_file` is loaded with FastaHack for the
    indel reference-base lookups.
    """
    reference = FastaHack(reference_file)
    tab_columns = [tab_column for _vcf_column, tab_column in VCF_TO_FIELDS]

    # The files are opened one inside the other so that the output file is
    # only created once the template has been opened and handed to vcf.Reader.
    with open(input_file, "r") as tab_handle:
        tab_rows = csv.DictReader(tab_handle, delimiter="\t")

        with open(TEMPLATE_VCF_FILE, "r") as template_handle:
            template = vcf.Reader(template_handle)

            with open(output_file, "w") as vcf_handle:
                writer = vcf.Writer(vcf_handle, template, lineterminator='\n')

                for tab_row in tab_rows:
                    fields = [tab_row.get(column, ".") for column in tab_columns]

                    # PyVCF expects an integer position.
                    fields[POSITION_INDEX] = int(fields[POSITION_INDEX])

                    # Rewrite GATK-style indels into VCF alleles.
                    allele = fields[ALT_INDEX]
                    if allele.startswith(("+", "-")) and "/" not in allele:
                        fields = gatk_indel_to_vcf(fields, reference)

                    # The alternate allele is passed to _Record as a list.
                    fields[ALT_INDEX] = [fields[ALT_INDEX]]

                    # Empty INFO, FORMAT, and sample_indexes entries.
                    fields += [{}, ".", []]

                    writer.write_record(_Record(*fields))
if __name__ == "__main__":
    # Command-line entry point: three required positional paths, handed
    # straight to tab_to_vcf().
    parser = argparse.ArgumentParser()
    for argument_name, argument_help in (
        ("input_file", "tab-delimited input"),
        ("output_file", "VCF 4.1 output"),
        ("reference_file", "reference assembly for variants in a single FASTA file"),
    ):
        parser.add_argument(argument_name, help=argument_help)
    args = parser.parse_args()

    tab_to_vcf(
        input_file=args.input_file,
        output_file=args.output_file,
        reference_file=args.reference_file,
    )