def get_variants_as_records(varfile):
    """
    Arguments:
        varfile -- list of desired variants as CSV

    Returns:
        recordlist -- list of those variants in vcfpy.Record form
    """
    recordlist = []
    reader = csv.DictReader(varfile, delimiter='\t')
    for row in reader:
        ref, alt = row['REF'], row['ALT']
        if len(alt) == len(ref):
            alttype = vcfpy.SNV
        else:
            alttype = vcfpy.INDEL
        altrecord = vcfpy.Substitution(alttype, alt)
        record = vcfpy.Record(row['CHROM'], int(row['POS']), [], row['REF'], [altrecord],
                              None, ['PASS'], {}, None, None)
        recordlist.append(record)
    return recordlist
def main():
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("input", metavar='input.vcf', action='store',
                        help='vcf file.', type=str)
    parser.add_argument("output", metavar='output.vcf', action='store',
                        help='vcf file.', type=str)

    args = parser.parse_args()
    outvcf = args.output
    invcf = args.input

    #########################
    #                       #
    #  creating the header  #
    #                       #
    #########################

    # The header can contain some fixed-type lines (INFO, FORMAT, FILTER, etc.) and some general ones.
    # In this case, the header will contain a line storing the name of the program which generated
    # the file. We also add the name of the sample which has been analyzed.
    header = vcfpy.Header(lines=[vcfpy.HeaderLine(key="source", value=sys.argv[0]),
                                 vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
                                 vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y"))],
                          samples=vcfpy.SamplesInfos(["Sample1"]))

    # adding format lines
    header.add_format_line(OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                                        ("Description", "Genotype")]))
    header.add_format_line(OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                                        ("Description", "Filtered read depth (MAPQ > 30)")]))

    # read the input vcf
    with vcfpy.Reader.from_path(invcf) as reader:
        # get the FORMAT header lines of the input file
        # and convert them into INFO header lines of the output file
        format_ids = reader.header.format_ids()
        for format_id in format_ids:
            format_line = reader.header.get_format_field_info(format_id)
            '''
            output example:
            FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">',
                             {'ID': 'AD', 'Number': 'R', 'Type': 'Integer', 'Description': 'Allelic depths for the ref and alt alleles in the order listed'})
            key = 'FORMAT'
            value = '<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'
            '''
            header.add_info_line(str_to_mapping(format_line.value))
    #print(header)

    # write the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        # creating one record
        record = vcfpy.Record(CHROM="1", POS=1, ID=[], REF="C",
                              ALT=[vcfpy.Substitution(type_="SNV", value="G")],
                              QUAL=None, FILTER=[], INFO={}, FORMAT=["GT", "DP"],
                              calls=[vcfpy.Call("Sample1", OrderedDict([("GT", "0/1"), ("DP", "47")]))])
        #print(record)
        writer.write_record(record)
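# The helper `str_to_mapping` used above is assumed but not defined in this snippet.
# A minimal sketch of what it could look like, turning a structured header value such as
# '<ID=AD,Number=R,Type=Integer,Description="...">' into the mapping expected by
# header.add_info_line() -- the parsing here is deliberately simple and is an assumption,
# not the original implementation:
import re
from collections import OrderedDict

def str_to_mapping(value):
    """Parse a '<ID=...,Number=...,Type=...,Description="...">' header value into an OrderedDict."""
    inner = value.strip().lstrip("<").rstrip(">")
    mapping = OrderedDict()
    # match key=value pairs, keeping quoted descriptions (which may contain commas) intact
    for key, val in re.findall(r'(\w+)=("[^"]*"|[^,]+)', inner):
        mapping[key] = val.strip('"')
    return mapping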
def test_cell_vcf_mutation_counting(self):
    with io.StringIO() as vcf_stream:
        with vcfpy.Reader.from_path(self.template_vcf) as template_vcf_reader:
            vcf_writer = vcfpy.Writer.from_stream(vcf_stream,
                                                  header=template_vcf_reader.header)

        cosmic_subset = self.cosmic_df.loc[self.cosmic_df["Primary site"] == "lung"]

        # Write test VCF
        expected_gene_mut_counts = {}
        hgvs_parser = hgvs.parser.Parser()

        for _, row in cosmic_subset.iterrows():
            genome_pos = GenomePosition.from_str(str(row["Mutation genome position"]))
            if genome_pos is None:
                continue

            try:
                posedit = hgvs_parser.parse_c_posedit(row["Mutation CDS"][2:])  # pylint: disable=no-member
            except:
                continue

            record = vcfpy.Record(CHROM=genome_pos.chrom,
                                  POS=genome_pos.start + 1,
                                  ID='.',
                                  REF=posedit.edit.ref,
                                  ALT=[vcfpy.Substitution(None, posedit.edit.alt)],
                                  QUAL=0,
                                  FILTER='.',
                                  INFO={})
            vcf_writer.write_record(record)

            gene_name = row["Gene name"]
            # Remove any gene name suffixes
            gene_name = gene_name.split('_')[0]
            expected_gene_mut_counts[gene_name] = expected_gene_mut_counts.get(gene_name, 0) + 1

        # Test mutation counting

        # Reset the buffer's cursor position
        vcf_stream.seek(0)

        _, filtered_gene_mut_counts = self.mutation_counter.find_cell_gene_mut_counts(
            stream=vcf_stream)

        self.assertDictEqual(expected_gene_mut_counts, filtered_gene_mut_counts)
def main():
    if len(sys.argv) != 2:
        print("Usage: vcf_from_scratch.py OUTPUT.vcf", file=sys.stderr)
        return 1
    header = vcfpy.Header(samples=vcfpy.SamplesInfos([]))
    with vcfpy.Writer.from_path(sys.argv[1], header) as writer:
        record = vcfpy.Record(CHROM="1", POS=1, ID=[], REF="N", ALT=[],
                              QUAL=None, FILTER=[], INFO={}, FORMAT=[])
        writer.write_record(record)
def test_from_vcf_record(self):
    tests = [
        self.GPOS_A,
        self.GPOS_B,
        self.GPOS_C,
        self.GPOS_D,
        self.GPOS_E,
    ]

    tests = [(vcfpy.Record(CHROM=pos.chrom,
                           POS=pos.start + 1,
                           ID='.',
                           REF='.' * len(pos),
                           ALT=[],
                           QUAL=0,
                           FILTER='.',
                           INFO={}), pos)
             for pos in tests]

    for test, expected in tests:
        self.assertEqual(expected, GenomePosition.from_vcf_record(test))
def generate_sv_record(records, comparison_result, sample_names):
    """
    This method generates a single SV record after a call has been made over a set of input records.

    :param records: the input records involved in the SV call
    :param comparison_result:
    :param sample_names:
    :return:
    """
    # Build a map to easily find the records by sample name. It can be multi-valued.
    sample_names_to_records = group_by(records, lambda record: get_sample_name(record))

    # Generate calls for each sample in this group
    calls = [
        get_sample_call(sample_name, sample_names_to_records.get(sample_name, None))
        for sample_name in sample_names
    ]

    first_record_of_the_group = records[0]
    chrom = first_record_of_the_group.CHROM
    id_of_new_record = generate_id(chrom, comparison_result.initial_position)

    info = vcfpy.OrderedDict()
    info["SVTYPE"] = comparison_result.svtype
    info["END"] = comparison_result.final_position
    if comparison_result.insseq is not None:
        info["INSSEQ"] = comparison_result.insseq

    return vcfpy.Record(
        CHROM=chrom,  # by construction, all the grouped records have the same
        POS=comparison_result.initial_position,  # by construction, all the grouped records have the same
        ID=[id_of_new_record],
        REF=first_record_of_the_group.REF,  # by construction, all the grouped records have the same
        ALT=[vcfpy.Substitution(type_=comparison_result.svtype,
                                value='<{}>'.format(comparison_result.svtype))],
        QUAL=maximum_qual(records),
        FILTER=["PASS"],
        INFO=info,
        FORMAT=["GT", "TRANCHE2", "VAF"],
        calls=calls)
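# `group_by` and `get_sample_name` are project helpers assumed by this function rather than
# part of vcfpy. A minimal sketch of the grouping helper, under the assumption that it returns
# a dict mapping each key to the list of records sharing that key:
from collections import defaultdict

def group_by(items, key):
    """Group items into a dict of key -> list of items sharing that key."""
    grouped = defaultdict(list)
    for item in items:
        grouped[key(item)].append(item)
    return dict(grouped)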
def __call__(self, record: vcfpy.Record) -> Union[vcfpy.Record, None]:
    if not ("AO" in record.INFO and "DP" in record.INFO):
        return None

    # VCF records have 1 (or 0?) or more ALT records supported by calls from 1 or more samples,
    # and AO INFO fields with dimension matching the ALT dimensions.
    # This Transform-type Filter retains only those ALTs and corresponding INFO matching the
    # criteria of the filter.
    # It does not modify the calls, which might cause problems of calls not matching ALT.
    retain = []
    for i, alt in enumerate(record.ALT):
        alt_percentage = (float(record.INFO["AO"][i]) / float(record.INFO["DP"]) * 100.0)
        retain.append(not alt_percentage < self.min_percentage)
    if not any(retain):
        return None

    new_ALT = [alt for i, alt in enumerate(record.ALT) if retain[i]]

    new_INFO = OrderedDict()
    # these are produced by snpEff and keys occur once per implicated gene;
    # the simplest solution is to copy them all across
    snpeff_keys = set(["ANN", "LOF", "NMD"])
    for key in record.INFO:
        if type(record.INFO[key]) == list:
            new_INFO[key] = [
                # retain all snpEff records and only those other records that correspond to ALTs we retain
                el for i, el in enumerate(record.INFO[key])
                if key in snpeff_keys or retain[i]
            ]
        else:
            new_INFO[key] = record.INFO[key]

    new_record = vcfpy.Record(
        record.CHROM,
        record.POS,
        record.ID,
        record.REF,
        new_ALT,
        record.QUAL,
        record.FILTER,
        new_INFO,
        record.FORMAT,
        record.calls,
    )
    return new_record
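# A usage sketch for the filter above. The class name `AltFractionFilter` and its constructor
# are hypothetical (the surrounding class is not shown); only the __call__ contract -- returning
# a trimmed record or None -- is taken from the snippet:
import vcfpy

alt_filter = AltFractionFilter(min_percentage=5.0)  # hypothetical wiring

with vcfpy.Reader.from_path("input.vcf") as reader, \
        vcfpy.Writer.from_path("filtered.vcf", reader.header) as writer:
    for record in reader:
        filtered = alt_filter(record)  # returns None when no ALT passes the threshold
        if filtered is not None:
            writer.write_record(filtered)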
def _write_variants_data(self):
    for small_var in self._yield_smallvars():
        # Get variant type
        if len(small_var.reference) == 1 and len(small_var.alternative) == 1:
            var_type = vcfpy.SNV
        elif len(small_var.reference) == len(small_var.alternative):
            var_type = vcfpy.MNV
        else:
            var_type = vcfpy.INDEL

        # Build list of calls
        calls = [
            vcfpy.Call(
                member,
                {
                    key.upper(): f(small_var.genotype.get(member, {}).get(key, default_value))
                    for key, default_value, f in (
                        ("gt", "./.", lambda x: x),
                        ("gq", None, lambda x: x),
                        ("ad", None, lambda x: None if x is None else [x]),
                        ("dp", None, lambda x: x),
                    )
                },
            )
            for member in self.members
        ]

        # Construct and write out the VCF ``Record`` object
        self.vcf_writer.write_record(
            vcfpy.Record(
                small_var.chromosome,
                small_var.start,
                [],
                small_var.reference,
                [vcfpy.Substitution(var_type, small_var.alternative)],
                None,
                [],
                {},
                ["GT", "GQ", "AD", "DP"],
                calls,
            ))
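# The `self.vcf_writer` used above needs FORMAT header lines for GT, GQ, AD and DP and one
# sample per member. How it is created is not shown; a minimal sketch under that assumption
# (the function name, `out_path` and the Number/Description choices are illustrative):
from collections import OrderedDict
import vcfpy

def make_writer(out_path, members):
    header = vcfpy.Header(
        lines=[vcfpy.HeaderLine(key="fileformat", value="VCFv4.3")],
        samples=vcfpy.SamplesInfos(members))
    header.add_format_line(OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                                        ("Description", "Genotype")]))
    header.add_format_line(OrderedDict([("ID", "GQ"), ("Number", "1"), ("Type", "Integer"),
                                        ("Description", "Genotype quality")]))
    header.add_format_line(OrderedDict([("ID", "AD"), ("Number", "."), ("Type", "Integer"),
                                        ("Description", "Allelic depth")]))
    header.add_format_line(OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                                        ("Description", "Read depth")]))
    return vcfpy.Writer.from_path(out_path, header)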
def write_vcf(vcffilename, sample_name, records):
    """
    Generate a VCF with the given records and randomly generated genotypes

    Arguments:
        vcffilename - path to generated file
        sample_name - name of the single sample written to the VCF
        records - list of vcfpy.Record describing the variants
    """
    lengths = [249250621, 243199373, 198022430, 191154276, 180915260, 171115067,
               159138663, 146364022, 141213431, 135534747, 135006516, 133851895,
               115169878, 107349540, 102531392, 90354753, 81195210, 78077248,
               59128983, 63025520, 48129895, 51304566]
    samples = vcfpy.SamplesInfos([sample_name])
    header = vcfpy.Header(samples=samples)
    header.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.3"))
    header.add_line(vcfpy.HeaderLine("fileDate", "20200901"))
    for chrom, length in enumerate(lengths):
        header.add_contig_line({"ID": str(chrom), "assembly": "GRCh37", "length": length})
    header.add_format_line({"ID": "GT", "Number": 1, "Type": "String", "Description": "Genotype"})

    with open(vcffilename, 'wb') as vcffile:
        writer = vcfpy.Writer.from_stream(vcffile, header, samples, use_bgzf=True)
        for record in records:
            genotype = random.choice(['0/0', '0/1', '1/1'])
            newrecord = vcfpy.Record(record.CHROM, record.POS, record.ID, record.REF,
                                     record.ALT, record.QUAL, record.FILTER, record.INFO,
                                     ["GT"],
                                     calls=[vcfpy.record.Call(sample_name, {"GT": genotype})])
            writer.write_record(newrecord)
        writer.close()
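# A usage sketch, assuming this function lives alongside the get_variants_as_records helper
# shown earlier in this collection; the file names "variants.tsv" and "sample1.vcf.gz" and the
# sample name are illustrative:
with open("variants.tsv") as varfile:
    records = get_variants_as_records(varfile)

write_vcf("sample1.vcf.gz", "sample1", records)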
def extract_vcf_records(
        sample_name,
        # input paths
        alignments_path,
        contigs_path,
        ref_fasta_path,
        vcf_template_path,
        # output paths
        vcf_out_path,
        selected_contigs_path,
        flanked_contigs_path,
        flank_length,
        min_insert_size):

    n_records = 0

    ref_fasta = pysam.FastaFile(ref_fasta_path)
    contig_fasta = pysam.FastaFile(contigs_path)
    selected_contig_fasta = open(selected_contigs_path, "w")
    flanked_contig_fasta = open(flanked_contigs_path, "w")

    alns = pandas.read_csv(alignments_path, sep=" ")

    reader = vcfpy.Reader.from_path(vcf_template_path)
    reader.header.samples = vcfpy.SamplesInfos([sample_name])
    writer = vcfpy.Writer.from_path(vcf_out_path, reader.header)

    contig_loci = set()

    # parse each alignment and look for insertions above min_insert_size
    for r in alns.iterrows():
        # skip secondary alignments
        hit = r[1]["Hit"]
        if hit > 0:
            continue

        query_name = r[1]["QName"]

        # local alignment window in the reference
        ref_chrom, ref_start, ref_end, phase_set, phase, n = query_name.split("_")
        phase_set = phase_set[2:]
        phase = phase[2:]

        # convert to ints
        ref_start, ref_end = (int(ref_start), int(ref_end))

        # alignment start and end for reference sequence
        target_start = r[1]["TStart"]
        target_end = r[1]["TEnd"]

        # alignment start and end for query sequence
        query_start = r[1]["QStart"]
        query_end = r[1]["QEnd"]

        # strand-ness of the query sequence
        strand = r[1]["Strand"]

        # parse cigar for variant extraction
        cig = cigar.Cigar(r[1]["CIGAR"])
        ops = list(cig.items())

        # convert sequences to the positive strand
        query_seq = contig_fasta.fetch(query_name)
        if strand == "-":
            query_seq = str(Bio.Seq.Seq(query_seq).reverse_complement())

        ref_seq = ref_fasta.fetch(ref_chrom, ref_start, ref_end)

        # initialize iterators for the cigar string
        query_pos = query_start
        target_pos = target_start

        # we are looking to extract insertions larger than 50bp
        for op in ops:
            # skip matches
            if op[1] == 'M':
                query_pos += op[0]
                target_pos += op[0]
            # skip deletions in the query sequence
            elif op[1] == 'D':
                target_pos += op[0]
            # insertions in the query sequence
            elif op[1] == 'I':
                # only interested in large insertions
                if op[0] > min_insert_size:
                    # Generate pysam.VariantRecord
                    # need to check conversion from 0-based coordinates to 1-based
                    ref_allele = ref_seq[target_pos]
                    alt_allele = ref_allele + query_seq[query_pos:query_pos + op[0]]

                    gt = ""
                    if phase == "1":
                        gt = "1|0"
                    elif phase == "2":
                        gt = "0|1"
                    else:
                        gt = "0/1"

                    break_point = ref_start + target_pos

                    # output VCF record corresponding to the insertion
                    rec = vcfpy.Record(
                        CHROM=ref_chrom,
                        POS=break_point + 1,
                        ID=[query_name],
                        REF=ref_allele,
                        ALT=[vcfpy.Substitution("INS", alt_allele)],
                        QUAL=999,
                        FILTER=["PASS"],
                        INFO={},
                        FORMAT=["GT", "SVLEN", "PS", "HP", "CIGAR", "STRAND", "CONTIG_START"],
                        calls=[
                            vcfpy.Call(sample=sample_name,
                                       data=vcfpy.OrderedDict(
                                           GT=gt,
                                           SVLEN=op[0],
                                           PS=phase_set,
                                           HP=phase,
                                           CIGAR=str(cig),
                                           STRAND=strand,
                                           CONTIG_START=str(query_start)))
                        ])

                    n_records += 1

                    # output contig that contains this insertion
                    writer.write_record(rec)

                    contig_locus = ">" + query_name + "_" + sample_name
                    contig_hash = sha1("_{chrom}_{pos}_{alt}".format(
                        chrom=ref_chrom, pos=ref_start,
                        alt=alt_allele[1:]).encode()).hexdigest()
                    contig_name = contig_locus + "_" + contig_hash + "_" + str(op[0])

                    if contig_locus not in contig_loci:
                        selected_contig_fasta.writelines([contig_name + "\n", query_seq + "\n"])
                        contig_loci.add(contig_locus)

                    # output same insertion, but with flanking sequences
                    # note, the interval is [start, end[
                    if flank_length > 0:
                        left_flank = ref_fasta.fetch(ref_chrom,
                                                     break_point - flank_length,
                                                     break_point)
                        right_flank = ref_fasta.fetch(ref_chrom,
                                                      break_point,
                                                      break_point + flank_length)
                    else:
                        left_flank = ""
                        right_flank = ""

                    flanked_contig_fasta.writelines([
                        contig_name + "\n",
                        left_flank + alt_allele[1:] + right_flank + "\n"
                    ])

                query_pos += op[0]

    selected_contig_fasta.close()
    return n_records
def write_haplotype_to_vcf(self, fake_genome_mapping_filename, isoform_tally, output_prefix):
    """
    The following method must be called first:
    -- self.get_haplotype_vcf_assignment
    """
    if self.haplotype_vcf_index is None or self.alt_at_pos is None:
        raise Exception("Must call self.get_haplotype_vcf_assignment() first!")

    self.sanity_check()

    name_isoforms = list(isoform_tally.keys())
    name_isoforms.sort()

    # write a fake VCF example so we can read the headers in
    with open("template.vcf", "w") as f:
        f.write(__VCF_EXAMPLE__)
    reader = vcfpy.Reader(open("template.vcf"))
    reader.samples = name_isoforms
    f_vcf = vcfpy.Writer(f"{output_prefix}.vcf", reader)

    # human readable text:
    # first line: assoc VCF filename
    # second line: haplotype, list of sorted isoforms
    # third line onwards: haplotype and assoc count
    with open(f"{output_prefix}.human_readable.txt", "w") as f_human:
        f_human.write(f"Associated VCF file: {output_prefix}.vcf\n")
        f_human.write("haplotype\t{samples}\n".format(samples="\t".join(name_isoforms)))
        for hap_index, hap_str in enumerate(self.haplotypes):
            f_human.write(hap_str)
            for _iso in name_isoforms:
                if hap_index in isoform_tally[_iso]:
                    f_human.write(f"\t{isoform_tally[_iso][hap_index]}")
                else:
                    f_human.write("\t0")
            f_human.write("\n")

    # read fake genome mapping file
    fake_map = {}  # 0-based position on fake --> (ref chr, 0-based ref position)
    with open(fake_genome_mapping_filename) as f:
        for line in f:
            fake_pos, ref_chr, ref_pos = line.strip().split(",")
            fake_map[int(fake_pos)] = (ref_chr, int(ref_pos))

    # for each position, write out the ref and alt bases,
    # then fill in for each isoform (aka "sample"):
    # if this isoform only shows one allele, then it's just that allele (0 for ref, 1+ otherwise);
    # if this isoform shows 2+ alleles, then the first allele is indicated by self.haplotypes[0]
    for i, pos in enumerate(self.hap_var_positions):
        ref_chr, ref_pos = fake_map[pos]
        total_count = sum(self.count_of_vars_by_pos[pos].values())
        alt_freq = [
            f"{self.count_of_vars_by_pos[pos][b] * 1.0 / total_count:.2f}"
            for b in self.alt_at_pos[pos]
        ]
        rec = vcfpy.Record(
            CHROM=ref_chr,
            POS=ref_pos + 1,
            ID=".",
            REF=self.ref_at_pos[pos],
            ALT=[vcfpy.Substitution(b) for b in self.alt_at_pos[pos]],
            QUAL=".",
            FILTER="PASS",
            INFO={"AF": alt_freq, "DP": total_count},
            FORMAT="GT:HQ",
            sample_indexes=None,
        )
        rec.samples = []
        for _iso in name_isoforms:
            # isoform_tally[_iso] is a dict of haplotype index --> count
            # the index for the base at this pos would thus be haplotype_vcf_index[hap_index][i]
            # we always need to show the phases in sorted haplotype index order
            hap_indices = list(isoform_tally[_iso].keys())
            hap_indices.sort()
            genotype = "|".join(
                str(self.haplotype_vcf_index[hap_index][pos]) for hap_index in hap_indices)
            counts = ",".join(
                str(isoform_tally[_iso][hap_index]) for hap_index in hap_indices)
            rec.samples.append(
                vcfpy.Call(rec, _iso,
                           vcfpy.OrderedDict([("GT", genotype), ("HQ", counts)])))
        f_vcf.write_record(rec)
    f_vcf.close()
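# `__VCF_EXAMPLE__` is a module-level template string that is not shown here. A minimal stand-in
# that would satisfy the header read in this function might look like the following; the real
# template in the source very likely declares different or additional INFO/FORMAT fields, so this
# is only an assumption for illustration:
__VCF_EXAMPLE__ = """\
##fileformat=VCFv4.2
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=HQ,Number=.,Type=String,Description="Haplotype counts">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE
"""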
def main():
    parser = argparse.ArgumentParser(
        description="Looks for a given set of SNPs within a bam file.")
    parser.add_argument("bam", metavar='sample.bam', action='store',
                        help='BAM file.', type=str)
    parser.add_argument(
        "barcodes", metavar='barcodes.list', action='store',
        help="File containing cell barcodes (the same used in the alignment file to identify cell reads).",
        type=str)
    parser.add_argument("vcf", metavar='variants.vcf', action='store',
                        help="VCF file storing BULK SNPs.", type=str)
    parser.add_argument("sample_name", metavar='sample1', action='store',
                        help="Sample identifier.", type=str)
    parser.add_argument("out_prefix", metavar="outdir/sample", action="store",
                        help="Output VCF file prefix.", type=str)
    parser.add_argument(
        "--gt", metavar='1/1 (0/1)', choices=["0/0", "0/1", "1/1"], action='store',
        help="Genotype filter: considers only mutations with the specified GT in the original vcf file.",
        type=str)

    args = parser.parse_args()
    bam = args.bam
    barcodes = args.barcodes
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    if args.gt:
        gt_filter = True
        gt = args.gt
    else:
        gt_filter = False

    with open(barcodes, "r") as f:
        samples = f.read().splitlines()

    # read bam file
    samfile = pysam.AlignmentFile(bam, "rb")

    # build the header of the output vcf
    header_out = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y"))
    ], samples=vcfpy.SamplesInfos(samples))

    # sample header lines
    header_out.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", sample), ("Description", "Sample name")])))

    # filter header lines
    header_out.add_filter_line(
        OrderedDict([("ID", "1/1"), ("Number", "1"), ("Description", "Filtered on such GT")]))
    header_out.add_filter_line(
        OrderedDict([("ID", "0/1"), ("Number", "1"), ("Description", "Filtered on such GT")]))
    header_out.add_filter_line(
        OrderedDict([("ID", "0/0"), ("Number", "1"), ("Description", "Filtered on such GT")]))
    #header_out.add_info_line(OrderedDict([("ID", "MUT"), ("Number", "1"), ("Type", "Integer"), ("Description", "States if the record mutation is supported (1) or not (0).")]))

    # format header lines
    header_out.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(
        OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Reference allele read depth")]))
    header_out.add_format_line(
        OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(
        OrderedDict([("ID", "AF"), ("Number", "1"), ("Type", "Float"),
                     ("Description", "Allele frequency: AD/(RD+AD). Other alleles, in case of multi-allelic regions, are ignored.")]))

    # read input vcf
    reader = vcfpy.Reader.from_path(invcf)

    # info header lines
    # Use input FORMAT lines as output INFO lines
    header_out.add_info_line(
        OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Number of cells supporting the mutation.")]))

    format_ids = reader.header.format_ids()
    for format_id in format_ids:
        format_line = reader.header.get_format_field_info(format_id)
        '''
        output example:
        FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">',
                         {'ID': 'AD', 'Number': 'R', 'Type': 'Integer', 'Description': 'Allelic depths for the ref and alt alleles in the order listed'})
        key = 'FORMAT'
        value = '<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'
        '''
        mapping = str_to_mapping(format_line.value)
        mapping["Description"] = "(Info about bulk mutation)" + mapping["Description"]
        header_out.add_info_line(mapping)

    # open the output vcf
    writer = vcfpy.Writer.from_path(outvcf, header_out)

    # read bam file
    samfile = pysam.AlignmentFile(bam, "rb")

    # for each mutation in the vcf file
    for record_in in reader:
        d = samples_dict(samples)
        supp = 0

        # genotype filter: keep only records with the requested GT in the original vcf
        if gt_filter:
            if record_in.calls[0].data.get('GT') != gt:
                continue

        # filter out indels: only interested in snvs in this analysis phase
        if not record_in.is_snv():
            continue

        chrom = record_in.CHROM
        pos = record_in.POS - 1  # to correct on 1-based positions
        ref = record_in.REF
        alt = record_in.ALT[0].value  # record.ALT is a list by construction which contains only one value

        # if the mutation is a SNV
        #line += [call.data.get('GT') or './.' for call in record.calls]

        # look for the pileup in the samfile at position (chrom, pos)
        for pileupcolumn in samfile.pileup(chrom, pos, pos + 1, stepper='all',
                                           truncate=True, max_depth=10000):
            for base in pileupcolumn.pileups:
                # .is_del -> the base is a deletion?
                # .is_refskip -> the base is a N in the CIGAR string?
                if not base.is_del and not base.is_refskip and not base.alignment.mapping_quality < 30:
                    # iterate on cells
                    tags = list_to_dict(base.alignment.tags)
                    if "CB" not in tags.keys():
                        # reads with no error-corrected barcode are discarded
                        continue
                    elif tags["CB"].split("-")[0] not in samples:
                        # the barcode hasn't been labeled as belonging to a cell by cellranger (floating DNA)
                        continue
                    cb = tags["CB"].split("-")[0]  # 10x barcodes
                    #print("barcode {} is a cell barcode ".format(cb))
                    d[cb]['dp'] += 1  # update info for the sample identified by CB
                    if base.alignment.query_sequence[base.query_position] == alt:
                        d[cb]['ad'] += 1
                    elif base.alignment.query_sequence[base.query_position] == ref:
                        d[cb]['rd'] += 1

        for cb in d.keys():
            if d[cb]['ad'] > 0:
                supp += 1
                d[cb]['gt'] = "0/1"  # temporary, all the supported mutations are set to 0/1
                d[cb]['af'] = d[cb]['ad'] / (d[cb]['rd'] + d[cb]['ad'])

        # generate calls for each sample/cell
        calls = []
        for cb in d.keys():
            calls.append(
                vcfpy.Call(cb,
                           OrderedDict([("GT", d[cb]['gt']), ("DP", d[cb]['dp']),
                                        ("RD", d[cb]['rd']), ("AD", d[cb]['ad']),
                                        ("AF", d[cb]['af'])])))

        # create a mapping between each FORMAT entry and the
        # corresponding value, in the call, in the input vcf file;
        # note that the input vcf contains only one sample, so
        # the calls field of each record contains only one entry
        info_d = {}
        info_d['SUPP'] = supp
        for f in record_in.FORMAT:
            info_d[f] = record_in.calls[0].data.get(f)

        if gt_filter == True:
            filter_l = [gt]
        else:
            filter_l = []

        # build and write the output record
        record_out = vcfpy.Record(
            CHROM=chrom,
            POS=pos + 1,
            ID=[],
            REF=ref,
            ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
            QUAL=None,
            FILTER=filter_l,
            INFO=info_d,
            FORMAT=["GT", "DP", "RD", "AD", "AF"],
            calls=calls)
        writer.write_record(record_out)

    reader.close()
    writer.close()
    samfile.close()
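# Two helpers used above, `samples_dict` and `list_to_dict`, are not shown in this snippet.
# Minimal sketches consistent with how they are used (per-barcode counters with a default
# genotype, and a tag-list-to-dict conversion); the default values are assumptions:
def samples_dict(samples):
    """One counter entry per cell barcode; genotype defaults here to 0/0."""
    return {s: {'gt': "0/0", 'dp': 0, 'rd': 0, 'ad': 0, 'af': 0.0} for s in samples}

def list_to_dict(tags):
    """Convert pysam's list of (tag, value) tuples into a dict."""
    return dict(tags)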
            # logging.info("ref alleles assigned for %s at %s", acc, site)
            gt = str(allele) + "|" + str(allele) if args.diploid else str(allele)
            sampleCall = vcfpy.Call(
                sample=acc,
                data={'GT': gt},  # has to be string; diploid
                # data={'GT': str(allele)},  # has to be string
                site=site)
            genoCalls.append(sampleCall)

        record = vcfpy.Record(
            CHROM=refEPI,
            POS=site,
            ID=snpInfo[site]['varID'],
            REF=snpInfo[site]['refNT'],
            ALT=subs,
            QUAL=None,
            FILTER=[],  # PASS
            INFO={},  # consequence calls, locus, etc; a dict
            FORMAT=['GT'],  # a list
            calls=genoCalls)
        varCt += 1
        writer.write_record(record)

    logging.info("SNPs records written to file: n = %s at %s",
                 len(sitesSNPs), datetime.datetime.now())

    for change in indelDict:  # change is "cv-" varID
        geno = {}
        genoCalls = []
        refNT = indelInfo[change]['refNT']    # 'NAAAAA'
        altNTs = indelInfo[change]['altNT']   # [ 'N' ]
def main():
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("output", metavar='output.vcf', action='store',
                        help='vcf file.', type=str)
    args = parser.parse_args()
    outvcf = args.output

    #########################
    #                       #
    #  creating the header  #
    #                       #
    #########################

    # The header can contain some fixed-type lines (INFO, FORMAT, FILTER, etc.) and some general ones.
    # In this case, the header will contain a line storing the name of the program which generated
    # the file. We also add the names of the samples which have been analyzed.
    header = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y"))
    ], samples=vcfpy.SamplesInfos(["Sample1", "Sample2"]))

    # Tuples of valid entries -----------------------------------------------------
    # #: valid INFO value types
    # INFO_TYPES = ("Integer", "Float", "Flag", "Character", "String")
    # #: valid FORMAT value types
    # FORMAT_TYPES = ("Integer", "Float", "Character", "String")
    # #: valid values for "Number" entries, except for integers
    # VALID_NUMBERS = ("A", "R", "G", ".")
    # #: header lines that contain an "ID" entry
    # LINES_WITH_ID = ("ALT", "contig", "FILTER", "FORMAT", "INFO", "META", "PEDIGREE", "SAMPLE")

    # Constants for "Number" entries ----------------------------------------------
    # #: number of alleles excluding reference
    # HEADER_NUMBER_ALLELES = "A"
    # #: number of alleles including reference
    # HEADER_NUMBER_REF = "R"
    # #: number of genotypes
    # HEADER_NUMBER_GENOTYPES = "G"
    # #: unbounded number of values
    # HEADER_NUMBER_UNBOUNDED = "."

    # adding filter lines
    header.add_filter_line(
        OrderedDict([("ID", "PASS"), ("Description", "All filters passed")]))

    # adding info lines
    header.add_info_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Raw read depth (without mapping quality filters)")]))
    header.add_info_line(
        OrderedDict([("ID", "MUT"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "States if the record mutation is supported (1) or not (0).")]))

    # adding format lines
    header.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype")]))
    header.add_format_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Filtered read depth (MAPQ > 30)")]))
    #header.add_format_line(OrderedDict([vcfpy.header.RESERVED_FORMAT["GT"]]))

    # adding contig lines
    header.add_contig_line(
        OrderedDict([("ID", "chr1"), ("length", "248956422")]))

    # adding sample lines
    header.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", "Sample1"), ("Description", "Tumor")])))

    # writing the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        # creating one record
        calls = []
        calls.append(vcfpy.Call("Sample1", OrderedDict([("GT", "0/1"), ("DP", "47")])))
        calls.append(vcfpy.Call("Sample2", OrderedDict([("GT", "0/1"), ("DP", "31")])))
        record = vcfpy.Record(CHROM="1", POS=1, ID=[], REF="C",
                              ALT=[vcfpy.Substitution(type_="SNV", value="G")],
                              QUAL=None,
                              FILTER=["PASS"],
                              INFO={"DP": "50", "MUT": 0},
                              FORMAT=["GT", "DP"],
                              calls=calls)
        #record.add_format(key="GT")
        #record.calls.append(vcfpy.Call("Sample1", OrderedDict([("GT", "0|1")])))
        writer.write_record(record)
def write_snp_to_vcf(
    snp_filename: Path,
    vcf_filename: Path,
    genome_filename: Path,
    genome_d: LazyFastaReader = None,
) -> None:
    # read the genome if genome_d is not given
    if genome_d is None:
        genome_d = LazyFastaReader(genome_filename)

    # read the first SNP record so we know the query name
    snp_reader = SNPReader(snp_filename)
    snp_rec = next(snp_reader)
    sample_name = snp_rec.query_name
    cur_recs = [snp_rec]
    genome_rec = genome_d[snp_rec.ref_name]

    with open("template.vcf", "w+") as f:
        f.write(f"{__VCF_EXAMPLE__}\n")
        reader = vcfpy.Reader(f)
        reader.samples = [sample_name]
        f_vcf = vcfpy.Writer(vcf_filename, reader)

        for r1 in snp_reader:
            if r1.ref_pos == cur_recs[-1].ref_pos:  # multi-nt insertion, keep recording
                cur_recs.append(r1)
            elif r1.query_base == "." and cur_recs[-1].query_base == ".":  # multi-nt deletion, keep recording
                cur_recs.append(r1)
            else:
                # time to write out the current set of records
                # multiple records mean it could be:
                # 1. multi-nucleotide insertions
                # 2. multi-nucleotide deletions
                if (len(cur_recs) == 1 and cur_recs[0].ref_base != "."
                        and cur_recs[0].query_base != "."):
                    # just a SNP record
                    pos = cur_recs[0].ref_pos
                    ref_base = cur_recs[0].ref_base
                    alt_base = cur_recs[0].query_base
                elif cur_recs[0].ref_base == ".":
                    # is a single or multi-nt insertion, must retrieve ref base from genome
                    # ex: in out.snps_files it is . --> ATG
                    # in VCF it should be T --> TATG (meaning insertion of ATG)
                    pos = cur_recs[0].ref_pos
                    ref_base = genome_rec[cur_recs[0].ref_pos]
                    alt_base = ref_base + "".join(r.query_base for r in cur_recs)
                else:
                    # is a single or multi-nt deletion, we need to get one more ref base before the first deletion
                    # ex: in out.snps_files it is GGG --> deletion
                    # in VCF it should be TGGG --> T (meaning deletion of GGG)
                    pos = cur_recs[0].ref_pos - 1
                    ref_base_prev = genome_rec[pos]
                    ref_base = ref_base_prev + "".join(r.ref_base for r in cur_recs)
                    alt_base = ref_base_prev
                rec = vcfpy.Record(
                    CHROM=snp_rec.ref_name,
                    POS=pos + 1,
                    ID=".",
                    REF=ref_base,
                    ALT=[vcfpy.Substitution(alt_base)],
                    QUAL=".",
                    FILTER="PASS",
                    INFO={"AF": 0.5},
                    FORMAT="GT",
                    sample_indexes=None,
                )
                rec.samples.append(
                    vcfpy.Call(rec, sample_name, vcfpy.OrderedDict([("GT", "0|1")])))
                f_vcf.write_record(rec)
                if r1.ref_name != cur_recs[0].ref_name:
                    genome_rec = genome_d[r1.ref_name]
                cur_recs = [r1]
def main():
    parser = argparse.ArgumentParser(description="From single cell VCF to clones vcf.")
    parser.add_argument("input1", metavar="sample.muts.vcf", action="store",
                        help="Single cell VCF file.", type=str)
    parser.add_argument("input2", metavar="clusters.list", action="store",
                        help="Clusters list.", type=str)
    #parser.add_argument("input_type", choices=["gz", "vcf"], help="VCF input type (vcf/gz).", type=str)
    #parser.add_argument("sample", metavar="sample_name", action="store", help="Sample name", type=str)
    parser.add_argument("outprefix", metavar="out/path/prefix", action="store",
                        help="Output prefix", type=str)

    args = parser.parse_args()
    input1 = args.input1
    input2 = args.input2
    prefix = args.outprefix
    #sample = args.sample
    #input_type = args.input_type

    clusters_df = pd.read_csv(input2)
    #clusters_df['cluster'] = clusters_df['a'].apply(lambda x: "{}_{}".format(sample, x))
    clusters = [str(cluster) for cluster in clusters_df['cluster'].unique()]

    # Create out header
    header_out = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y"))
    ], samples=vcfpy.SamplesInfos(clusters))

    # format header lines
    header_out.add_format_line(OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                                            ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                                            ("Description", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
                                            ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
                                            ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AF"), ("Number", "1"), ("Type", "Float"),
                                            ("Description", "Allele frequency: AD/(RD+AD). Other alleles, in case of multi-allelic regions, are ignored.")]))

    # info header lines
    header_out.add_info_line(OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"),
                                          ("Description", "Whether the mutation is supported or not.")]))

    # read input vcf
    reader = vcfpy.Reader.from_path(input1)

    # open the output vcf
    writer = vcfpy.Writer.from_path(prefix + "_clusters.vcf", header_out)

    """
    snps = read_vcf(input1, input_type)

    #Filtering bulk mutations not supported by cells
    snps = snps[~snps['INFO'].str.startswith("SUPP=0")]

    #Create mutation id column and set it as index
    snps["mutid"] = snps["CHROM"] + "_" + snps["POS"].map(str) + "_" + snps["REF"] + "_" + snps["ALT"]
    snps = snps.set_index('mutid')
    """

    # for each record in the vcf file
    for record_in in reader:
        d = samples_dict(clusters_df['cluster'].unique())
        supp = 0
        chrom = record_in.CHROM
        pos = record_in.POS - 1  # to correct on 1-based positions
        ref = record_in.REF
        alt = record_in.ALT[0].value

        # for each cluster compute 'GT:DP:RD:AD:AF' to be provided as call argument
        for c in clusters_df['cluster'].unique():
            # retrieve cell columns for cells in current cluster
            cells = clusters_df['cellid'][clusters_df['cluster'] == c]
            # retrieve cell data
            calls = [record_in.call_for_sample[cell] for cell in cells]
            # sum total read count, alt read count and ref read count of cells in the cluster
            for call in calls:
                d[c]['dp'] = d[c]['dp'] + call.data.get('DP')
                d[c]['rd'] = d[c]['rd'] + call.data.get('RD')
                d[c]['ad'] = d[c]['ad'] + call.data.get('AD')
            if d[c]['ad'] > 0:
                d[c]['gt'] = "0/1"
                d[c]['af'] = d[c]['ad'] / (d[c]['rd'] + d[c]['ad'])
                supp = 1

        calls = []
        # create one call for each cluster
        for c in d.keys():
            calls.append(vcfpy.Call(str(c),
                                    OrderedDict([("GT", d[c]['gt']), ("DP", d[c]['dp']),
                                                 ("RD", d[c]['rd']), ("AD", d[c]['ad']),
                                                 ("AF", d[c]['af'])])))
        print(calls)

        # write new record
        record_out = vcfpy.Record(CHROM=chrom,
                                  POS=pos + 1,
                                  ID=[],
                                  REF=ref,
                                  ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                                  QUAL=None,
                                  FILTER=[],
                                  INFO={"SUPP": supp},
                                  FORMAT=["GT", "DP", "RD", "AD", "AF"],
                                  calls=calls)
        writer.write_record(record_out)

    reader.close()
    writer.close()
def generate_non_sv_records(colocated_records, sample_names):
    """
    This function processes records that have not been used to call a SV.

    :param colocated_records:
    :param sample_names:
    :return:
    """
    # The co-located records need to be re-grouped based not just on their true position (CHROM+POS)
    # but also on similarity
    subgrouping_function = lambda record: (record.CHROM, record.POS, record.REF, str(record.ALT),
                                           record.INFO.get("END", None),
                                           record.INFO.get("INSSEQ", None))
    records_grouped_by_all_coordinates = group_by(colocated_records, key=subgrouping_function)

    # Once the regrouping has happened, each group will generate exactly one line in the output. These lines
    # may be produced out-of-order, but we don't care because we will sort them later before generating the VCF.
    output = []
    for subkey, group in records_grouped_by_all_coordinates.items():
        # Build a map to easily find the records by the sample name
        sample_names_to_record = group_by(group, get_sample_name)

        # Generate calls for each sample in this group
        calls = [
            get_sample_call(sample_name, sample_names_to_record.get(sample_name, []))
            for sample_name in sample_names
        ]

        # Add a record to the output
        first_record_of_the_group = group[0]
        id_of_new_record = generate_id(first_record_of_the_group.CHROM,
                                       first_record_of_the_group.POS)
        info = vcfpy.OrderedDict()
        info["SVTYPE"] = "BND"
        info["TRANCHE2"] = maximum_tranche(group)
        info["BNDVAF"] = get_average_vaf(group)
        if "END" in first_record_of_the_group.INFO:
            # by construction, all the grouped records have the same
            info["END"] = first_record_of_the_group.INFO["END"]
        if "INSSEQ" in first_record_of_the_group.INFO:
            # by construction, all the grouped records have the same
            info["INSSEQ"] = first_record_of_the_group.INFO["INSSEQ"]
        output.append(
            vcfpy.Record(
                CHROM=first_record_of_the_group.CHROM,  # by construction, all the grouped records have the same
                POS=first_record_of_the_group.POS,  # by construction, all the grouped records have the same
                ID=[id_of_new_record],
                REF=first_record_of_the_group.REF,  # by construction, all the grouped records have the same
                ALT=first_record_of_the_group.ALT,  # by construction, all the grouped records have the same
                QUAL=maximum_qual(group),
                FILTER=["PASS"],
                INFO=info,
                FORMAT=["GT", "TRANCHE2", "VAF"],
                calls=calls))

    return output
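# `get_sample_call` is another helper assumed by the SV-merging functions in this collection.
# A minimal sketch consistent with the FORMAT fields they declare (GT, TRANCHE2, VAF); the
# defaults and where TRANCHE2/VAF are pulled from are assumptions, not the original code:
def get_sample_call(sample_name, sample_records):
    """Build a vcfpy.Call for one sample, defaulting to an uncalled genotype."""
    if not sample_records:
        data = vcfpy.OrderedDict(GT="./.", TRANCHE2=None, VAF=None)
    else:
        first = sample_records[0]
        data = vcfpy.OrderedDict(
            GT=first.calls[0].data.get("GT", "./.") if first.calls else "./.",
            TRANCHE2=first.INFO.get("TRANCHE2"),
            VAF=first.INFO.get("BNDVAF"))
    return vcfpy.Call(sample_name, data)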
        #***** FORMAT *****#
        lst_format_id = ['GT', 'DP', 'AF']
        # Merge GT field
        try:
            set_gt.remove('./.')
        except:
            pass
        if len(set_gt) == 1:
            field_gt = set_gt.pop()
        else:
            field_gt = "./."
        # Merge DP field
        field_dp = int(round(numpy.median(lst_dp), 0))
        # Merge AF field
        while lst_af.count(".") > 0:
            lst_af.remove(".")
        field_af = float(round(numpy.median(lst_af), 2))
        # Create call
        dico_calls = [vcfpy.Call(dico_vcf[var_id]["sample"],
                                 {'GT': field_gt, 'DP': field_dp, 'AF': [field_af]})]

        #***** WRITE VARIANT *****#
        new_record = vcfpy.Record(chrom, pos, ".", ref, dico_vcf[var_id]["ALT"], field_qual,
                                  [field_filter], dico_info, lst_format_id, dico_calls)
        writer.write_record(new_record)

    writer.close()

    #***** POST-PROCESSING *****#
    # Sort
    sortVCF(pathMergeUnsortedVCF, pathMergeVCF)
    # Validate
    boolvalid, lst_errors = validateVCF(path_vcfvalidator, pathMergeVCF)
    if boolvalid == False:
        exit("🅴 🆁 🆁 🅾 🆁\n[Nk_mergeVCF] Validate VCF `" + os.path.basename(pathMergeVCF) + "`\n    " +
             "\n    ".join(lst_errors))
    # bgzip
    cmd_bgzip = "bgzip -f " + pathMergeVCF
    process = subprocess.Popen([cmd_bgzip], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    out, err = process.communicate()
def block_to_records(block, prev_block):
    """Given an alignment block, yield the VCF records.

    NB: the first/last amino acids are stored for the exon with the major part of the codon.
    """
    logging.debug("Starting new block")
    meta = block.meta
    location = block.meta.location
    in_frame = block.meta.in_frame
    out_frame = block.meta.out_frame
    logging.debug("prev_block: %s", prev_block)
    logging.debug("block: %s", block)
    logging.debug("%s\n%s", meta, "\n".join(block.aa_seqs))

    # Special case handling for first codon.
    assert not in_frame or prev_block
    if in_frame == 1:
        # The alignment for this exon has the amino acid; yield last amino acid for previous exon.
        prev_location = prev_block.meta.location
        start, end = pos_magic(
            prev_location,
            prev_location[2] - prev_location[1] - 1,
            prev_location[2] - prev_location[1],
        )
        logging.debug("in_frame == 1, start, end = %d, %d", start, end)
        yield vcfpy.Record(
            CHROM=prev_block.meta.location[0],
            POS=start + 1,
            ID=[],
            REF="N",
            ALT=[],
            FILTER=[],
            QUAL=None,
            INFO={
                "END": end,
                "UCSC_GENE": prev_block.meta.ucsc_gene_id,
                "EXON": prev_block.meta.exon_idx,
                "EXON_COUNT": prev_block.meta.exon_count,
                "ALIGNMENT": "".join(seq[0] for seq in block.aa_seqs),
            },
            FORMAT={},
            calls=[],
        )
        # We start at the first codon of this exon
        starts = [0]
        ends = [2]
    elif in_frame == 2:
        # The alignment for the previous exon has the amino acid; yield first amino acid for this exon.
        start, end = pos_magic(location, 0, 1)
        logging.debug("in_frame == 2, start, end = %d, %d", start, end)
        yield vcfpy.Record(
            CHROM=location[0],
            POS=start + 1,
            ID=[],
            REF="N",
            ALT=[],
            FILTER=[],
            QUAL=None,
            INFO={
                "END": end,
                "UCSC_GENE": meta.ucsc_gene_id,
                "EXON": meta.exon_idx,
                "EXON_COUNT": meta.exon_count,
                "ALIGNMENT": "".join(seq[-1] for seq in prev_block.aa_seqs),
            },
            FORMAT={},
            calls=[],
        )
        # We start at the second codon of this exon
        starts = [1]
        ends = [4]
    else:
        # Start at codon border
        logging.debug("in_frame == 0, start, end = 0, 3")
        starts = [0]
        ends = [3]

    # Handle major part of exon.
    nts = location[2] - location[1]
    assert (nts - (3 - in_frame) - out_frame) % 3 == 0
    starts = starts + list(range(ends[0], nts, 3))
    ends = ends + list(range(ends[0] + 3, nts, 3))
    if ends[-1] != nts:
        ends.append(nts)
    logging.debug("nts=%s", nts)
    logging.debug("starts=%s", starts)
    logging.debug("ends=%s", ends)

    if out_frame == 2:
        # We have the amino acid for the last partial codon.
        # If out_frame == 1 then the next exon will take care of writing the record.
        starts += [ends[-1]]
        ends += [ends[-1] + 2]

    for i, (start, end) in enumerate(zip(starts, ends)):
        if i >= meta.exon_len:
            logging.debug("Too short AA seq found for %s, this happens...", meta.ucsc_gene_id)
            continue
        start, end = pos_magic(location, start, end)
        yield vcfpy.Record(
            CHROM=location[0],
            POS=start + 1,
            ID=[],
            REF="N",
            ALT=[],
            FILTER=[],
            QUAL=None,
            INFO={
                "END": end,
                "UCSC_GENE": meta.ucsc_gene_id,
                "EXON": meta.exon_idx,
                "EXON_COUNT": meta.exon_count,
                "ALIGNMENT": "".join(seq[i] for seq in block.aa_seqs),
            },
            FORMAT={},
            calls=[],
        )
def main():
    parser = argparse.ArgumentParser(
        description="Looks for a given set of SNPs within a bam file.")
    parser.add_argument("bam", metavar='sample.bam', action='store',
                        help='BAM file.', type=str)
    parser.add_argument("vcf", metavar='file.vcf', action='store',
                        help="VCF file storing SNPs.", type=str)
    parser.add_argument("sample_name", metavar='sample1', action='store',
                        help="Sample identifier.", type=str)
    parser.add_argument("out_prefix", metavar="outdir/sample", action="store",
                        help="Output VCF file prefix.", type=str)
    #parser.add_argument("--sample_name2", metavar='sample2', action='store',
    #                    help="Another sample name", type=str)

    args = parser.parse_args()
    bam = args.bam
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    '''
    if args.sample_name2:
        sample_name2 = args.sample_name2
    else:
        sample_name2 = null
    '''

    # read bam file
    samfile = pysam.AlignmentFile(bam, "rb")

    # build the header of the output vcf
    header_out = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y"))
    ], samples=vcfpy.SamplesInfos([sample]))

    # sample header lines
    header_out.add_line(vcfpy.HeaderLine(key="SampleName", value=sample))
    '''
    if sample_name2 is not null:
        header_out.add_line(vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", sample_name2), ("Description", "Second sample name")])))
    '''

    # info header lines
    header_out.add_info_line(
        OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "States if the mutation is supported (1) or not (0).")]))

    # adding format lines
    header_out.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(
        OrderedDict([("ID", "SDP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Samtools read depth (secondary alignments, PCR duplicates, unmapped reads and reads not passing vendor QC are filtered)")]))
    header_out.add_format_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(
        OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Reference allele read depth")]))
    header_out.add_format_line(
        OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(
        OrderedDict([("ID", "AF"), ("Number", "1"), ("Type", "Float"),
                     ("Description", "Allele frequency: AD/(RD+AD). Other alleles, in case of multi-allelic regions, are ignored.")]))

    # read input vcf
    reader = vcfpy.Reader.from_path(invcf)

    format_ids = reader.header.format_ids()
    for format_id in format_ids:
        format_line = reader.header.get_format_field_info(format_id)
        '''
        output example:
        FormatHeaderLine('FORMAT', '<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">',
                         {'ID': 'AD', 'Number': 'R', 'Type': 'Integer', 'Description': 'Allelic depths for the ref and alt alleles in the order listed'})
        key = 'FORMAT'
        value = '<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'
        '''
        mapping = str_to_mapping(format_line.value)
        mapping["Description"] = "(Info about mutation in the original vcf)" + mapping["Description"]
        header_out.add_info_line(mapping)

    # open the output vcf
    writer = vcfpy.Writer.from_path(outvcf, header_out)

    # read bam file
    samfile = pysam.AlignmentFile(bam, "rb")

    # for each mutation in the vcf file
    for record_in in reader:
        # filter out indels: only interested in snvs in this analysis phase
        if not record_in.is_snv():
            continue
        chrom = record_in.CHROM
        pos = record_in.POS - 1  # to correct on 1-based positions
        ref = record_in.REF
        alt = record_in.ALT[0].value  # record.ALT is a list by construction which contains only one value

        # if the mutation is a SNV
        #line += [call.data.get('GT') or './.' for call in record.calls]

        # look for the pileup in the samfile at position (chrom, pos)
        for pileupcolumn in samfile.pileup(chrom, pos, pos + 1, stepper='all',
                                           truncate=True, max_depth=10000):
            # number of reads at this position
            sdp = pileupcolumn.n
            # number of supporting reads for the alternate base
            ad = 0
            rd = 0
            dp = 0
            af = 0.0
            for base in pileupcolumn.pileups:
                # .is_del -> the base is a deletion?
                # .is_refskip -> the base is a N in the CIGAR string?
                if not base.is_del and not base.is_refskip and not base.alignment.mapping_quality < 30:
                    dp += 1
                    if base.alignment.query_sequence[base.query_position] == alt:
                        ad += 1
                    elif base.alignment.query_sequence[base.query_position] == ref:
                        rd += 1

            if ad > 0:
                af = ad / (rd + ad)
                supp = 1
                gt = "0/1"  # temporary, all the supported mutations are set to 0/1
            else:
                supp = 0
                gt = "0/0"
                #af = ad / (rd + ad)

            info_d = {}
            info_d['SUPP'] = supp
            for f in record_in.FORMAT:
                info_d[f] = record_in.calls[0].data.get(f)

            record_out = vcfpy.Record(
                CHROM=chrom,
                POS=pos + 1,
                ID=[],
                REF=ref,
                ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                QUAL=None,
                FILTER=[],
                INFO=info_d,
                FORMAT=["GT", "SDP", "DP", "RD", "AD", "AF"],
                calls=[vcfpy.Call(sample, OrderedDict([("GT", gt), ("SDP", sdp), ("DP", dp),
                                                       ("RD", rd), ("AD", ad), ("AF", af)]))])
            writer.write_record(record_out)

    reader.close()
    writer.close()
    samfile.close()
def extract_consensus_insertions(contig_path, cons_path, ref_fasta_path, vcf_out_path,
                                 vcf_template_path, min_insertion_size, flank_length,
                                 flanked_contigs_path):
    n_records = 0

    # open input sequences
    cons_fasta = pysam.FastaFile(cons_path)
    ref_fasta = pysam.FastaFile(ref_fasta_path)
    flanked_contig_fasta = open(flanked_contigs_path, "w")

    (samples, loci) = collect_genotypes(contig_path)
    print("Found", len(samples), "samples for", len(loci), "phased loci")

    reader = vcfpy.Reader.from_path(vcf_template_path)
    reader.header.samples = vcfpy.SamplesInfos(list(samples))
    writer = vcfpy.Writer.from_path(vcf_out_path, reader.header)

    for contig in cons_fasta.references:
        # parse coordinates
        (chrom, start, end) = contig.split("_")
        (start, end) = int(start), int(end)

        cons_seq = cons_fasta.fetch(contig)
        ref_seq = ref_fasta.fetch(chrom, start, end)

        aligner = mappy.Aligner(seq=ref_seq, preset=None, k=15, w=10, n_threads=1,
                                max_join_long=20000, max_join_short=10000,
                                min_join_flank_sc=10, min_join_flank_ratio=0.1,
                                max_gap=10000, bw=2000, end_bonus=10, zdrop=10000,
                                zdrop_inv=1000, scoring=(2, 4, 4, 10, 300, 0, 1),
                                extra_flags=0x1)

        alignments = list(aligner.map(cons_seq, seq2=None, cs=True, MD=False))
        if len(alignments) == 0:
            print("No hits in", contig)
            continue

        aln = max(alignments, key=lambda x: x.blen)

        cig = cigar.Cigar(aln.cigar_str)
        ops = list(cig.items())

        cons_pos = aln.q_st
        target_pos = aln.r_st

        strand = "+"
        if aln.strand == -1:
            cons_seq = str(Bio.Seq.Seq(cons_seq).reverse_complement())
            strand = "-"
        # print(contig)

        for op in ops:
            # skip matches
            if op[1] == 'M':
                cons_pos += op[0]
                target_pos += op[0]
            # skip deletions in the query sequence
            elif op[1] == 'D':
                target_pos += op[0]
            # insertions in the query sequence
            elif op[1] == 'I':
                # only interested in large insertions
                if op[0] > min_insertion_size:
                    # Generate pysam.VariantRecord
                    # need to check conversion from 0-based coordinates to 1-based
                    ref_allele = ref_seq[target_pos - 1]
                    alt_allele = cons_seq[cons_pos:cons_pos + op[0]]

                    break_point = start + target_pos

                    # output VCF record corresponding to the insertion
                    # print(break_point, (start + end) / 2)
                    # print(len(loci[contig]), "samples at", contig)

                    # build calls data structure
                    calls = []
                    for sample in samples:
                        sample_gt = "0/0"
                        ps = 0
                        if sample in loci[contig]:
                            sample_gt = loci[contig][sample]["1"] + "|" + loci[contig][sample]["2"]
                            ps = loci[contig][sample]["ps"]
                        sample_call = vcfpy.Call(sample=sample,
                                                 data=vcfpy.OrderedDict(GT=sample_gt, PS=ps))
                        # print(sample_call)
                        calls.append(sample_call)

                    rec = vcfpy.Record(
                        CHROM=chrom,
                        POS=break_point,
                        ID=[contig + "_" + str(cons_pos)],
                        REF=ref_allele,
                        ALT=[vcfpy.Substitution("INS", ref_allele + alt_allele)],
                        QUAL=999,
                        FILTER=["PASS"],
                        INFO=vcfpy.OrderedDict(SVLEN=op[0], CIGAR=[str(cig)], STRAND=strand,
                                               CONTIG_START=str(aln.q_st)),
                        FORMAT=["GT", "PS"],
                        calls=calls)

                    # output contig that contains this insertion
                    writer.write_record(rec)

                    # output same insertion, but with flanking sequences
                    # note, the interval is [start, end[
                    if flank_length > 0:
                        left_flank = ref_fasta.fetch(chrom, break_point - flank_length, break_point)
                        right_flank = ref_fasta.fetch(chrom, break_point, break_point + flank_length)
                    else:
                        left_flank = ""
                        right_flank = ""

                    flanked_contig_fasta.writelines([
                        ">" + contig + "_" + str(cons_pos) + "\n",
                        left_flank + alt_allele[1:] + right_flank + "\n"
                    ])

                    # output same contig, but with large flanking sequences
                    # note, the interval is [start, end[
                    n_records += 1

                cons_pos += op[0]

    flanked_contig_fasta.close()
    return n_records