def process_vcf(self, cols):
    """Build the object from a vcf file.

    Reads ``self.name`` in two passes over one file handle: first the
    ``##`` meta-information lines up to the column header line, then the
    variant records that follow it.

    Args:
        cols: Names of the INFO/FORMAT columns to keep; forwarded to
            ``extract_cols``. When it contains ``'AF'`` and the vcf
            declares no INFO ``AF``, a header built from ``AF_LINE`` is
            added so the value can be calculated.

    Returns:
        ``self``, with ``caller``, ``meta_info``, ``header`` and
        ``variants`` updated.

    Raises:
        SystemExit: If ``self.caller`` is still unset after the
            meta-information lines were read (no ``##source=`` line).
    """
    info_dict, format_dict = {}, {}
    # Defaults so an empty file does not leave these names unbound.
    line, header_line_no = '', 0

    # The context manager closes the file on every exit path, including
    # the sys.exit() below.
    with open(self.name, 'r') as vcf:
        # Read the meta-information lines from the vcf
        for header_line_no, line in enumerate(vcf, start=1):
            if line.startswith('##FORMAT=<ID=AF'):
                # The AF will be calculated regardless, so the declared
                # FORMAT AF is dropped.
                continue
            if line.startswith('##FORMAT'):
                vcf_header = VcfHeader(line)
                format_dict[vcf_header.meta_id] = vcf_header
            elif line.startswith('##INFO'):
                vcf_header = VcfHeader(line)
                info_dict[vcf_header.meta_id] = vcf_header
            elif line.startswith('##'):
                # Keep other meta-info lines; ##source names the caller
                if line.startswith('##source='):
                    self.caller = line.replace('##source=', '').strip()
                self.meta_info.append(line)
            else:
                # First non-'##' line: the column header. `line` keeps it.
                break

        # Only extract the (filtered) DP in the format.
        # NOTE(review): this drops INFO DP whenever *any* FORMAT field
        # exists; possibly `"DP" in format_dict` was intended - confirm.
        if "DP" in info_dict and format_dict:
            info_dict.pop("DP", None)

        if not self.caller:
            sys.exit(("Cannot identify caller from file {}\n"
                      "Please add caller identify line "
                      "'##source=(caller name)' to vcf header")
                     .format(self.name))

        # When user specify the AF and vcf does not have, try to calculate
        # that for the user
        if 'AF' in cols and 'AF' not in info_dict:
            vcf_header = VcfHeader(AF_LINE)
            info_dict[vcf_header.meta_id] = vcf_header

        # Select the columns from INFO/FORMAT
        info_cols, format_cols = extract_cols(info_dict, format_dict, cols)

        # Add the INFO line (with caller) / FORMAT (unchanged) to header_list
        self.meta_info += [
            VcfHeader.write(VcfHeader.add_caller(header, self.caller))
            for header in info_cols.values()]
        self.meta_info += [VcfHeader.write(header)
                           for header in format_cols.values()]
        self.header = line

        # Continue to read the file, this time the variants. Numbering
        # resumes after the column header so warnings carry the 1-based
        # line number of the record in the file.
        for line_no, record in enumerate(vcf, start=header_line_no + 1):
            variant = Variant().process_variant(record, caller=self.caller)
            if variant.alt == '*':
                print("Warning: Vcf {} line {} has variant with alt=*"
                      .format(self.caller, line_no))
            cleaned_variant = Variant.select_info(variant, info_cols,
                                                  format_cols)
            # The dictionary is query by chr\tpos\tref\talt
            self.variants[cleaned_variant.variant_key] = cleaned_variant

    return self
def process_somatic_vcf(self, cols, nid, tid):
    """Process a somatic vcf, with normal and tumor sample ids provided.

    Reads ``self.name`` in two passes over one file handle: first the
    ``##`` meta-information lines up to the column header line, then the
    variant records that follow it.

    Args:
        cols: Names of the INFO/FORMAT columns to keep; forwarded to
            ``extract_cols_somatic``.
        nid: Normal sample id, matched against header columns 9/10
            (0-based). Not consulted when the caller is ``'strelka'``.
        tid: Tumor sample id, matched against header columns 9/10
            (0-based). Not consulted when the caller is ``'strelka'``.

    Returns:
        ``self``, with ``caller``, ``meta_info``, ``header`` and
        ``variants`` updated.

    Raises:
        SystemExit: If ``self.caller`` is still unset after the
            meta-information lines were read, if the header has fewer
            than two sample columns, or if ``nid``/``tid`` do not match
            the two sample columns.
        ValueError: For the ``'strelka'`` caller, if the header has no
            ``NORMAL`` or ``TUMOR`` column.
    """
    info_dict, format_dict = od(), od()
    # Defaults so an empty file does not leave these names unbound.
    line, header_line_no = '', 0

    # The context manager closes the file on every exit path, including
    # the sys.exit() calls below.
    with open(self.name, 'r') as vcf:
        # Read the meta-information lines from the vcf
        for header_line_no, line in enumerate(vcf, start=1):
            if line.startswith(('##INFO=<ID=DP', '##INFO=<ID=AF')):
                # Skip the AF and DP in INFO.
                # NOTE(review): prefix match, so ids that merely start
                # with DP/AF are skipped too - confirm that is intended.
                continue
            if line.startswith('##FORMAT'):
                vcf_header = VcfHeader(line)
                format_dict[vcf_header.meta_id] = vcf_header
            elif line.startswith('##INFO'):
                vcf_header = VcfHeader(line)
                info_dict[vcf_header.meta_id] = vcf_header
            elif line.startswith('##'):
                # Keep other meta-info lines; ##source names the caller
                if line.startswith('##source='):
                    self.caller = line.replace('##source=', '').strip()
                self.meta_info.append(line)
            else:
                # First non-'##' line: the column header. `line` keeps it.
                break

        if not self.caller:
            sys.exit(("Cannot locate caller from file {}\n"
                      "Please add caller identify line "
                      "'##source=(caller name)' to vcf header")
                     .format(self.name))

        # Select the columns from INFO/FORMAT
        info_cols, format_cols = extract_cols_somatic(info_dict, format_dict,
                                                      cols)

        # Add the INFO line (with caller) / FORMAT (unchanged) to header_list
        self.meta_info += [
            VcfHeader.write(VcfHeader.add_caller(header, self.caller))
            for header in info_cols.values()]
        self.meta_info += [VcfHeader.write(header)
                           for header in format_cols.values()]
        self.header = line

        # Find the location of normal / tumor column index
        columns = self.header.split()
        if self.caller == 'strelka':
            # For strelka the sample columns are looked up by these fixed
            # names rather than by the supplied ids.
            normal_index = columns.index('NORMAL')
            tumor_index = columns.index('TUMOR')
        elif len(columns) < 11:
            # Without this guard the lookups below raise a bare IndexError.
            sys.exit("Header of file {} has fewer than two sample columns; "
                     "cannot locate normal sample id [{}] and tumor sample "
                     "id [{}]".format(self.name, nid, tid))
        elif columns[9] == nid and columns[10] == tid:
            normal_index, tumor_index = 9, 10
        elif columns[10] == nid and columns[9] == tid:
            normal_index, tumor_index = 10, 9
        else:
            sys.exit("Normal sample id [{}] or tumor sample id [{}] didn't match with file {}: [{}], [{}]"
                     .format(nid, tid, self.name, columns[9], columns[10]))

        # Continue to read the file, this time the variants. Numbering
        # resumes after the column header so warnings carry the 1-based
        # line number of the record in the file.
        for line_no, record in enumerate(vcf, start=header_line_no + 1):
            variant = Variant().process_somatic_variant(
                record, self.caller, normal_index, tumor_index)
            if variant.alt == '*':
                print("Warning: Line {} contains variant with alt=*"
                      .format(line_no))
            cleaned_variant = Variant.select_info(variant, info_cols,
                                                  format_cols,
                                                  caller=self.caller,
                                                  somatic=True)
            # The dictionary is query by chr\tpos\tref\talt
            self.variants[cleaned_variant.variant_key] = cleaned_variant

    return self