-
Notifications
You must be signed in to change notification settings - Fork 0
/
file_writer.py
112 lines (97 loc) · 4.08 KB
/
file_writer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import vcfpy
import sys
def write_output(records, file_path, sample_name_to_header, chromosome_set):
    """
    Serialises the data into a VCF file.

    The records are sorted in place by (CHROM, POS) before being written, so the
    caller's list is reordered as a side effect.

    :param records: the list of records to serialise. No order is assumed.
    :param file_path: path to the output file, or '-' to write to standard output
    :param sample_name_to_header: a map from the sample names to the headers
    :param chromosome_set: the set of chromosomes selected for analysis
    :return: nothing
    """
    assert len(sample_name_to_header) > 0, "At least one sample is required"

    # Sort the output by chromosome and position. We do a sort-in-place to optimise the memory
    records.sort(key=lambda record: (record.CHROM, record.POS))

    # Calculate the header of the output file
    header = get_header(sample_name_to_header, chromosome_set)

    writes_to_stdout = file_path == '-'
    if writes_to_stdout:
        writer = vcfpy.Writer.from_stream(sys.stdout, header=header)
    else:
        writer = vcfpy.Writer.from_path(file_path, header=header)

    try:
        for output_record in records:
            writer.write_record(output_record)
    finally:
        # Only close the output if it's a file; the stdout-backed writer is left open.
        # Closing in `finally` releases the file even when writing a record fails.
        if not writes_to_stdout:
            writer.close()
def get_header(sample_name_to_header, chromosome_set):
    """
    Builds the header of the output VCF file.

    The header contains, in order: the fileformat line, the contig lines copied from
    the first sample's header (restricted to chromosome_set when it is not None),
    the INFO definitions, the FORMAT definitions, and the sorted sample names.

    :param sample_name_to_header: a dictionary from the sample names to the headers
    :param chromosome_set: the set of chromosomes selected for analysis, or None to keep every contig
    :return: a vcfpy.Header
    """
    # Every INFO/FORMAT definition below is an (ID, Number, Type, Description) tuple
    field_keys = ("ID", "Number", "Type", "Description")

    info_fields = (
        ("END", 1, "Integer", "Stop position of the interval"),
        ("SVTYPE", 1, "String", "Type of structural variant"),
        ("INSSEQ", 1, "String",
         "Insertion sequence of structural variant, not including sequence marked as duplication"),
        ("TRANCHE2", 1, "String",
         "Quality category of GRIDSS structural variant calls determined using FILTER,SRQ,AS,RAS. Values are LOW INTERMEDIATE HIGH"),
        ("BNDVAF", 1, "Float",
         "VAF of this gridss-called BND calculated as (SR+RP+IC+AS)/(REF+SR+RP+IC+AS)"),
    )

    format_fields = (
        ("GT", 1, "String", "Genotype"),
        ("TRANCHE2", 1, "String",
         "Quality category of GRIDSS structural variant calls determined using FILTER,SRQ,AS,RAS. Values are LOW INTERMEDIATE HIGH"),
        ("BNDVAF", 1, "Float",
         "VAF of this gridss-called BND calculated as (SR+RP+IC+AS)/(REFPAIR+SR+RP+IC+AS)"),
        ("VAF", 1, "Float",
         "VAF of this SV call, derived from BNDVAF values of BND calls used to call this SV"),
        ("INSSEQ", 1, "String",
         "Insertion sequence of structural variant, not including sequence marked as duplication"),
    )

    vcf_header = vcfpy.Header()
    vcf_header.add_line(vcfpy.HeaderLine(key="fileformat", value="VCFv4.2"))

    # CONTIG headers: copied from whichever sample header comes first in the map
    reference_header = next(iter(sample_name_to_header.values()))
    for line in reference_header.lines:
        if not isinstance(line, vcfpy.ContigHeaderLine):
            continue
        if chromosome_set is None or line.mapping["ID"] in chromosome_set:
            vcf_header.add_line(line)

    # INFO fields
    for spec in info_fields:
        vcf_header.add_info_line(vcfpy.OrderedDict(zip(field_keys, spec)))

    # FORMAT fields
    for spec in format_fields:
        vcf_header.add_format_line(vcfpy.OrderedDict(zip(field_keys, spec)))

    # Samples, sorted to ensure determinism
    vcf_header.samples = vcfpy.SamplesInfos(sorted(sample_name_to_header.keys()))
    return vcf_header