def main():
    """Write a one-record demo VCF whose INFO header mirrors the input's FORMAT header.

    Command-line arguments:
        input:  path of an existing VCF file; only its header is read.
        output: path of the VCF file to create.

    The output header holds the ``fileformat``, ``source`` and ``fileDate``
    lines, two FORMAT lines (GT, DP) and one INFO line for every FORMAT id
    declared in the input header. A single hard-coded SNV record for
    ``Sample1`` is then written.
    """
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("input", metavar='input.vcf', action='store',
                        help='vcf file.', type=str)
    parser.add_argument("output", metavar='output.vcf', action='store',
                        help='vcf file.', type=str)
    args = parser.parse_args()

    outvcf = args.output
    invcf = args.input

    # --- creating the header -------------------------------------------------
    # The header can contain some fixed type lines (INFO, FORMAT, FILTER, etc.)
    # and some general ones. Here it stores the name of the program which
    # generated the file and the name of the analyzed sample.
    # The VCF specification requires "fileformat" to be the very first line,
    # so it is listed before "source" and "fileDate".
    header = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos(["Sample1"]))

    # adding format lines
    header.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype")]))
    header.add_format_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Filtered read depth (MAPQ > 30)")]))

    # read the input vcf: its FORMAT header lines become INFO header lines
    # of the output file
    with vcfpy.Reader.from_path(invcf) as reader:
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            # Example of a returned line:
            #   FormatHeaderLine('FORMAT',
            #       '<ID=AD,Number=R,Type=Integer,Description="Allelic depths
            #        for the ref and alt alleles in the order listed">', {...})
            #   key   = 'FORMAT'
            #   value = '<ID=AD,Number=R,Type=Integer,Description="...">'
            header.add_info_line(str_to_mapping(format_line.value))

    # write the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        # creating one record
        record = vcfpy.Record(
            CHROM="1",
            POS=1,
            ID=[],
            REF="C",
            ALT=[vcfpy.Substitution(type_="SNV", value="G")],
            QUAL=None,
            FILTER=[],
            INFO={},
            FORMAT=["GT", "DP"],
            calls=[
                vcfpy.Call("Sample1",
                           OrderedDict([("GT", "0/1"), ("DP", "47")]))
            ])
        writer.write_record(record)
def _open(self):
    """Build the output header and open a VCF writer on the temporary file.

    The header consists of the ``fileformat`` line, FORMAT lines for AD, DP,
    GQ and GT, and one contig line per entry of ``CONTIGS_GRCH37``; its
    samples are ``self.members``. The writer is stored in
    ``self.vcf_writer`` and targets ``self.tmp_file.name``.
    """
    # (ID, Number, Type, Description) of each FORMAT field, in output order.
    format_specs = (
        ("AD", "R", "Integer",
         "Allelic depths for the ref and alt alleles in the order listed"),
        ("DP", "1", "Integer", "Approximate read depth at the locus"),
        ("GQ", "1", "Integer", "Phred-scaled genotype quality"),
        ("GT", "1", "String", "Genotype"),
    )

    header_lines = [vcfpy.HeaderLine("fileformat", "VCFv4.2")]
    header_lines.extend(
        vcfpy.FormatHeaderLine.from_mapping({
            "ID": ident,
            "Number": number,
            "Type": kind,
            "Description": text,
        })
        for ident, number, kind, text in format_specs)

    # Contig header lines.
    # TODO: switch based on release in case
    header_lines.extend(
        vcfpy.ContigHeaderLine.from_mapping({"ID": name, "length": length})
        for name, length in CONTIGS_GRCH37)

    sample_infos = vcfpy.SamplesInfos(self.members)
    vcf_header = vcfpy.Header(lines=header_lines, samples=sample_infos)

    # Open VCF writer
    self.vcf_writer = vcfpy.Writer.from_path(self.tmp_file.name, vcf_header)
def write_vcf(vcffilename, sample_name, records):
    """
    Generate a BGZF-compressed VCF with the given records and randomly
    generated genotypes.

    Arguments:
        vcffilename - path to generated file
        sample_name - name of the single sample column of the output
        records - list of vcfpy.Record describing the variants; only the
                  site-level fields (CHROM, POS, ID, REF, ALT, QUAL, FILTER,
                  INFO) are copied, the calls are replaced
    """
    # GRCh37 lengths of chromosomes 1..22, in order.
    lengths = [249250621, 243199373, 198022430, 191154276, 180915260,
               171115067, 159138663, 146364022, 141213431, 135534747,
               135006516, 133851895, 115169878, 107349540, 102531392,
               90354753, 81195210, 78077248, 59128983, 63025520,
               48129895, 51304566]

    samples = vcfpy.SamplesInfos([sample_name])
    header = vcfpy.Header(samples=samples)
    header.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.3"))
    header.add_line(vcfpy.HeaderLine("fileDate", "20200901"))
    # Chromosome names are 1-based: the first length belongs to contig "1",
    # not "0".
    for chrom, length in enumerate(lengths, start=1):
        header.add_contig_line(
            {"ID": str(chrom), "assembly": "GRCh37", "length": length})
    header.add_format_line(
        {"ID": "GT", "Number": 1, "Type": "String",
         "Description": "Genotype"})

    with open(vcffilename, 'wb') as vcffile:
        # NOTE(review): the third positional argument is kept as in the
        # original call; confirm it matches the installed vcfpy signature.
        writer = vcfpy.Writer.from_stream(vcffile, header, samples,
                                          use_bgzf=True)
        try:
            for record in records:
                genotype = random.choice(['0/0', '0/1', '1/1'])
                newrecord = vcfpy.Record(
                    record.CHROM, record.POS, record.ID, record.REF,
                    record.ALT, record.QUAL, record.FILTER, record.INFO,
                    ["GT"],
                    calls=[vcfpy.record.Call(sample_name, {"GT": genotype})])
                writer.write_record(newrecord)
        finally:
            # Close the writer even when writing fails, so the compressed
            # stream is finalized before the underlying file is closed.
            writer.close()
def build_header(contigs, species):
    """Assemble the VCF header for the alignment output.

    :param contigs: iterable of ``(name, length)`` pairs, one contig line each
    :param species: iterable of strings, joined with ``,`` into the
        ``species`` header line
    :return: a ``vcfpy.Header`` with no samples, the ``fileformat`` line, the
        contig lines, the ``species`` line and five INFO lines
    """
    # (ID, Description, Type) of each INFO field, in output order.
    info_fields = (
        ("END", "End position of the alignment", "Integer"),
        ("UCSC_GENE", "UCSC gene ID", "String"),
        ("EXON", "Index of exon in transcript", "Integer"),
        ("EXON_COUNT", "Number of exons in transcript", "Integer"),
        ("ALIGNMENT", "Amino acid alignment at this location", "String"),
    )

    result = vcfpy.Header()
    result.samples = vcfpy.SamplesInfos([])
    result.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.2"))

    for contig_name, contig_length in contigs:
        result.add_contig_line({"ID": contig_name, "length": contig_length})

    species_value = ",".join(species)
    result.add_line(vcfpy.HeaderLine("species", species_value))

    for ident, text, kind in info_fields:
        result.add_info_line({
            "ID": ident,
            "Description": text,
            "Type": kind,
            "Number": 1,
        })

    return result
def get_process_header_line(self, existing_head):
    """
    Generates a new vcfProcess header line for this process.

    Uses the existing header to check whether an index and (UTC) date must
    be appended to the key.

    :param existing_head: header whose ``lines`` are searched for earlier
        process lines
    :return: a ``vcfpy.HeaderLine`` whose key is the base key, optionally
        followed by ``_YYYYMMDD``, then ``.<index>``; its value is
        ``base_vcf_process_log`` formatted with ``self.vcfinname``,
        ``self.run_script`` and ``self.arg_str``
    """
    head_key = base_vcf_process_key

    # Test for key without date - gives all lines
    index = get_last_vcf_process_index(existing_head.lines, head_key)
    if index is not None and index > 0:
        # Timezone-aware replacement for the deprecated
        # datetime.utcnow(); the formatted UTC date is the same.
        now = datetime.datetime.now(datetime.timezone.utc)
        head_key += "_{}".format(now.strftime("%Y%m%d"))
        # Test for key with current date to get precise index
        index = get_last_vcf_process_index(existing_head.lines, head_key)

    if index is None:
        # No earlier line matched: numbering starts at 1.
        index = 0
    head_key = head_key + "." + str(index + 1)

    return vcfpy.HeaderLine(
        key=head_key,
        value=base_vcf_process_log.format(self.vcfinname, self.run_script,
                                          self.arg_str),
    )
def get_header(sample_name_to_header, chromosome_set):
    """
    Build the header of the output VCF file.

    Contig lines are copied from the header of the first sample in
    ``sample_name_to_header``; INFO and FORMAT lines are fixed.

    :param sample_name_to_header: a dictionary from the sample names to the headers
    :param chromosome_set: the set of chromosomes selected for analysis
        (``None`` keeps every contig)
    :return: a vcfpy.Header
    """
    tranche_text = (
        "Quality category of GRIDSS structural variant calls determined "
        "using FILTER,SRQ,AS,RAS. Values are LOW INTERMEDIATE HIGH")
    insseq_text = (
        "Insertion sequence of structural variant, not including sequence "
        "marked as duplication")

    # (ID, Type, Description) in output order; Number is always 1.
    info_specs = (
        ("END", "Integer", "Stop position of the interval"),
        ("SVTYPE", "String", "Type of structural variant"),
        ("INSSEQ", "String", insseq_text),
        ("TRANCHE2", "String", tranche_text),
        ("BNDVAF", "Float",
         "VAF of this gridss-called BND calculated as "
         "(SR+RP+IC+AS)/(REF+SR+RP+IC+AS)"),
    )
    format_specs = (
        ("GT", "String", "Genotype"),
        ("TRANCHE2", "String", tranche_text),
        ("BNDVAF", "Float",
         "VAF of this gridss-called BND calculated as "
         "(SR+RP+IC+AS)/(REFPAIR+SR+RP+IC+AS)"),
        ("VAF", "Float",
         "VAF of this SV call, derived from BNDVAF values of BND calls "
         "used to call this SV"),
        ("INSSEQ", "String", insseq_text),
    )

    out_header = vcfpy.Header()
    out_header.add_line(vcfpy.HeaderLine(key="fileformat", value="VCFv4.2"))

    # CONTIG headers, taken from the first sample's header
    template_header = next(iter(sample_name_to_header.values()))
    for line in template_header.lines:
        if not isinstance(line, vcfpy.ContigHeaderLine):
            continue
        if chromosome_set is None or line.mapping["ID"] in chromosome_set:
            out_header.add_line(line)

    # INFO fields
    for ident, kind, text in info_specs:
        out_header.add_info_line(
            vcfpy.OrderedDict(ID=ident, Number=1, Type=kind,
                              Description=text))

    # FORMAT fields
    for ident, kind, text in format_specs:
        out_header.add_format_line(
            vcfpy.OrderedDict(ID=ident, Number=1, Type=kind,
                              Description=text))

    # Samples, sorted to ensure determinism
    ordered_names = sorted(sample_name_to_header.keys())
    out_header.samples = vcfpy.SamplesInfos(ordered_names)
    return out_header
process = subprocess.Popen([cmd_vt], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) out, err = process.communicate() if process.returncode!=0: exit("🅴 🆁 🆁 🅾 🆁\n[Nk_mergeVCF] Decompose & Normalize\n "+err.decode('utf-8')) #***** MERGE callers VCFs *****# lst_caller_name = [] lst_contig_line = [] dico_filter_line = {} dico_vcf = {} pathMergeVCF = sample+"_Nk.vcf" pathMergeUnsortedVCF = pathMergeVCF.replace(".vcf","_unsorted.vcf") #***** INIT new vcf header *****# new_header = vcfpy.Header(lines=None, samples=None) new_header.add_line(vcfpy.HeaderLine("fileformat","VCFv4.2")) new_header.add_line(vcfpy.HeaderLine("Nk_version",niourkVersion)) #***** BROWSE caller vcf *****# for path_vcf in lst_vcf_sample: caller_name = os.path.basename(path_vcf).split("_")[2].replace(".vcf","") lst_caller_name.append(caller_name) path_normalized_vcf = path_vcf.replace(".vcf","_normalize.vcf") vcf_tool_reader = vcfpy.Reader.from_path(path_normalized_vcf) vcf_header = vcf_tool_reader.header #***** READ HEADERS *****# # check header sample if new_header.samples==None: new_header.samples = vcf_header.samples # check header filters for filter_line in vcf_header.get_lines("FILTER"): if not filter_line.id in dico_filter_line: dico_filter_line[filter_line.id] = filter_line.description elif not filter_line.description in dico_filter_line[filter_line.id]: dico_filter_line[filter_line.id]+=", "+filter_line.description
def test_parse_bed_file(bed, alleles, exp_locs): """ Unit tests over the parse_bed_file method """ parser = VcfParse(input_vcf, output_vcf, run_script, arg_str) # Try no allele bed file # Try non gzipped file parser.bed = bed parser.alleles = alleles assert parser.parse_bed_file() == exp_locs @pytest.mark.parametrize( "in_head,key_prefix,exp_idx", [ ([vcfpy.HeaderLine("vcfProcessLog", "BLAH")], "vcfProcessLog", None), ( [ vcfpy.HeaderLine("vcfProcessLog_20160212.1", "BLAH1"), vcfpy.HeaderLine("vcfProcessLog_20160212.2", "BLAH2"), ], "vcfProcessLog", 2, ), ( [ vcfpy.HeaderLine("vcfProcessLog_20151218.1", "BLAH1"), vcfpy.HeaderLine("vcfProcessLog_20160212.1", "BLAH2"), ], "vcfProcessLog", 1,
def main():
    """Count, per cell barcode, the reads supporting each bulk SNV of a VCF.

    Command-line arguments:
        bam:         BAM file whose reads carry a ``CB`` (cell barcode) tag.
        barcodes:    text file, one cell barcode per line; each barcode
                     becomes a sample column of the output VCF.
        vcf:         input VCF with the bulk SNPs; only the first sample
                     column of each record is used.
        sample_name: identifier stored in a SAMPLE header line.
        out_prefix:  the output is written to ``<out_prefix>.snpseeker.vcf``.
        --gt:        optional genotype; records whose first call has a
                     different GT are skipped.

    For every SNV record, reads overlapping the position are piled up;
    deletions, reference skips and reads with mapping quality below 30 are
    ignored, as are reads without a ``CB`` tag or whose barcode is not
    listed. Per barcode the depth (DP), reference (RD) and alternate (AD)
    read counts are collected. The input FORMAT values are copied into the
    INFO column of the output record together with ``SUPP``, the number of
    barcodes with at least one alternate read.
    """
    parser = argparse.ArgumentParser(
        description="Looks for a given set of SNPs whithin a bam file.")
    parser.add_argument("bam", metavar='sample.bam', action='store',
                        help='BAM file.', type=str)
    parser.add_argument(
        "barcodes", metavar='barcodes.list', action='store',
        help="File containing cell barcodes (the same used in the alignment file to identify cell reads).",
        type=str)
    parser.add_argument("vcf", metavar='variants.vcf', action='store',
                        help="VCF file storing BULK SNPs.", type=str)
    parser.add_argument("sample_name", metavar='sample1', action='store',
                        help="Sample identifier.", type=str)
    parser.add_argument("out_prefix", metavar="outdir/sample",
                        action="store", help="Output VCF file prefix.",
                        type=str)
    parser.add_argument(
        "--gt", metavar='1/1 (0/1)', choices=["0/0", "0/1", "1/1"],
        action='store',
        help="Genotype filter: considers only mutations with the specified GT in the original vcf file.",
        type=str)
    args = parser.parse_args()

    bam = args.bam
    barcodes = args.barcodes
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"
    gt = args.gt
    gt_filter = bool(gt)

    with open(barcodes, "r") as f:
        samples = f.read().splitlines()
    # Set copy of the barcodes: membership is tested once per read.
    known_barcodes = set(samples)

    # The BAM file is opened exactly once; both input handles are released
    # even if processing fails.
    with pysam.AlignmentFile(bam, "rb") as samfile, \
            vcfpy.Reader.from_path(invcf) as reader:

        # build the header of the output vcf
        header_out = vcfpy.Header(
            lines=[
                vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
                vcfpy.HeaderLine(key="source", value=sys.argv[0]),
                vcfpy.HeaderLine(key="fileDate",
                                 value=date.today().strftime("%d/%m/%Y")),
            ],
            samples=vcfpy.SamplesInfos(samples))

        # sample header lines
        header_out.add_line(
            vcfpy.SampleHeaderLine.from_mapping(
                OrderedDict([("ID", sample),
                             ("Description", "Sample name")])))

        # filter header lines
        for filter_id in ("1/1", "0/1", "0/0"):
            header_out.add_filter_line(
                OrderedDict([("ID", filter_id), ("Number", "1"),
                             ("Description", "Filtered on such GT")]))

        # format header lines
        header_out.add_format_line(
            OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                         ("Description", "Genotype (0/1, 0/0)")]))
        header_out.add_format_line(
            OrderedDict([
                ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                ("Description",
                 "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")
            ]))
        header_out.add_format_line(
            OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
                         ("Description", "Reference allele read depth")]))
        header_out.add_format_line(
            OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
                         ("Description", "Alternate allele read depth")]))
        header_out.add_format_line(
            OrderedDict([
                ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
                ("Description",
                 "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")
            ]))

        # info header lines
        header_out.add_info_line(
            OrderedDict([("ID", "SUPP"), ("Number", "1"),
                         ("Type", "Integer"),
                         ("Description",
                          "Number of cells supporting the mutation.")]))
        # Use input FORMAT lines as output INFO lines.
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            # format_line.value looks like
            #   '<ID=AD,Number=R,Type=Integer,Description="Allelic depths
            #    for the ref and alt alleles in the order listed">'
            mapping = str_to_mapping(format_line.value)
            mapping["Description"] = ("(Info about bulk mutation)" +
                                      mapping["Description"])
            # Register the mapping carrying the prefixed description; the
            # original re-parsed the line and silently dropped the prefix.
            header_out.add_info_line(mapping)

        # open the output vcf
        with vcfpy.Writer.from_path(outvcf, header_out) as writer:
            # for each mutation in the vcf file
            for record_in in reader:
                d = samples_dict(samples)
                supp = 0

                if gt_filter and record_in.calls[0].data.get('GT') != gt:
                    continue
                # filter out indels: only interested in snvs in this
                # analysis phase
                if not record_in.is_snv():
                    continue

                chrom = record_in.CHROM
                pos = record_in.POS - 1  # pileup coordinates are 0-based
                ref = record_in.REF
                # record.ALT is a list by construction which contains only
                # one value if the mutation is a SNV
                alt = record_in.ALT[0].value

                # look for the pileup in the samfile at position (chrom, pos)
                for pileupcolumn in samfile.pileup(chrom, pos, pos + 1,
                                                   stepper='all',
                                                   truncate=True,
                                                   max_depth=10000):
                    for base in pileupcolumn.pileups:
                        # .is_del     -> the base is a deletion
                        # .is_refskip -> the base is a N in the CIGAR string
                        if (base.is_del or base.is_refskip
                                or base.alignment.mapping_quality < 30):
                            continue
                        tags = list_to_dict(base.alignment.tags)
                        if "CB" not in tags:
                            # reads with no error-corrected barcode are
                            # discarded
                            continue
                        cb = tags["CB"].split("-")[0]  # 10x barcodes
                        if cb not in known_barcodes:
                            # the barcode has not been labeled as belonging
                            # to a cell by cellranger (floating DNA)
                            continue

                        # update info for the sample identified by CB
                        d[cb]['dp'] += 1
                        observed = base.alignment.query_sequence[
                            base.query_position]
                        if observed == alt:
                            d[cb]['ad'] += 1
                        elif observed == ref:
                            d[cb]['rd'] += 1

                # generate calls for each sample/cell
                calls = []
                for cb in d:
                    if d[cb]['ad'] > 0:
                        supp += 1
                        # temporary, all the supported mutations are set
                        # to 0/1
                        d[cb]['gt'] = "0/1"
                        d[cb]['af'] = d[cb]['ad'] / (d[cb]['rd'] +
                                                     d[cb]['ad'])
                    calls.append(
                        vcfpy.Call(
                            cb,
                            OrderedDict([("GT", d[cb]['gt']),
                                         ("DP", d[cb]['dp']),
                                         ("RD", d[cb]['rd']),
                                         ("AD", d[cb]['ad']),
                                         ("AF", d[cb]['af'])])))

                # create a mapping between each FORMAT entry and the
                # corresponding value, in the call, in the input vcf file;
                # note that the input vcf contains only one sample, so the
                # calls field of each record contains only one entry
                info_d = {'SUPP': supp}
                bulk_call = record_in.calls[0].data
                for f in record_in.FORMAT:
                    info_d[f] = bulk_call.get(f)

                filter_l = [gt] if gt_filter else []

                # build and write the output record
                record_out = vcfpy.Record(
                    CHROM=chrom,
                    POS=pos + 1,
                    ID=[],
                    REF=ref,
                    ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                    QUAL=None,
                    FILTER=filter_l,
                    INFO=info_d,
                    FORMAT=["GT", "DP", "RD", "AD", "AF"],
                    calls=calls)
                writer.write_record(record_out)
def main():
    """Write a demo VCF with a fully populated header and one two-sample record.

    Command-line arguments:
        output: path of the VCF file to create.

    The header contains ``fileformat``, ``source`` and ``fileDate`` lines,
    one FILTER line, two INFO lines, two FORMAT lines, one contig line and
    one SAMPLE line; a single hard-coded SNV record with calls for
    ``Sample1`` and ``Sample2`` is then written.
    """
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("output", metavar='output.vcf', action='store',
                        help='vcf file.', type=str)
    args = parser.parse_args()

    outvcf = args.output

    # --- creating the header -------------------------------------------------
    # The header can contain some fixed type lines (INFO, FORMAT, FILTER, etc.)
    # and some general ones. Here it stores the name of the program which
    # generated the file and the names of the analyzed samples.
    # The VCF specification requires "fileformat" to be the very first line,
    # so it is listed before "source" and "fileDate".
    header = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos(["Sample1", "Sample2"]))

    # Reference notes on valid header entries:
    #   INFO value types:   Integer, Float, Flag, Character, String
    #   FORMAT value types: Integer, Float, Character, String
    #   non-integer "Number" values:
    #     "A"  number of alleles excluding reference
    #     "R"  number of alleles including reference
    #     "G"  number of genotypes
    #     "."  unbounded number of values
    #   header lines that contain an "ID" entry:
    #     ALT, contig, FILTER, FORMAT, INFO, META, PEDIGREE, SAMPLE

    # adding filter lines
    header.add_filter_line(
        OrderedDict([("ID", "PASS"),
                     ("Description", "All filters passed")]))

    # adding info lines
    header.add_info_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description",
                      "Raw read depth (without mapping quality filters)")]))
    header.add_info_line(
        OrderedDict([
            ("ID", "MUT"), ("Number", "1"), ("Type", "Integer"),
            ("Description",
             "States if the record mutation is supported (1) or not (0).")
        ]))

    # adding format lines
    header.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype")]))
    header.add_format_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Filtered read depth (MAPQ > 30)")]))

    # adding contig lines
    header.add_contig_line(
        OrderedDict([("ID", "chr1"), ("length", "248956422")]))

    # adding sample lines
    header.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", "Sample1"), ("Description", "Tumor")])))

    # writing the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        # creating one record
        calls = [
            vcfpy.Call("Sample1",
                       OrderedDict([("GT", "0/1"), ("DP", "47")])),
            vcfpy.Call("Sample2",
                       OrderedDict([("GT", "0/1"), ("DP", "31")])),
        ]
        record = vcfpy.Record(
            CHROM="1",
            POS=1,
            ID=[],
            REF="C",
            ALT=[vcfpy.Substitution(type_="SNV", value="G")],
            QUAL=None,
            FILTER=["PASS"],
            INFO={"DP": "50", "MUT": 0},
            FORMAT=["GT", "DP"],
            calls=calls)
        writer.write_record(record)
def main():
    """Check which SNVs of a VCF are supported by the reads of a BAM file.

    Command-line arguments:
        bam:         BAM file to inspect.
        vcf:         input VCF with the SNPs; only the first sample column
                     of each record is used.
        sample_name: name of the single sample column of the output.
        out_prefix:  the output is written to ``<out_prefix>.snpseeker.vcf``.

    For every SNV record the reads overlapping the position are piled up.
    Deletions, reference skips and reads with mapping quality below 30 are
    ignored when counting the filtered depth (DP), reference (RD) and
    alternate (AD) reads; SDP is the unfiltered pileup depth. One output
    record is written per pileup column, so a position without any pileup
    column yields no record. The input FORMAT values are copied into the
    INFO column together with ``SUPP`` (1 when AD > 0, else 0).
    """
    parser = argparse.ArgumentParser(
        description="Looks for a given set of SNPs whithin a bam file.")
    parser.add_argument("bam", metavar='sample.bam', action='store',
                        help='BAM file.', type=str)
    parser.add_argument("vcf", metavar='file.vcf', action='store',
                        help="VCF file storing SNPs.", type=str)
    parser.add_argument("sample_name", metavar='sample1', action='store',
                        help="Sample identifier.", type=str)
    parser.add_argument("out_prefix", metavar="outdir/sample",
                        action="store", help="Output VCF file prefix.",
                        type=str)
    args = parser.parse_args()

    bam = args.bam
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    # The BAM file is opened exactly once; both input handles are released
    # even if processing fails.
    with pysam.AlignmentFile(bam, "rb") as samfile, \
            vcfpy.Reader.from_path(invcf) as reader:

        # build the header of the output vcf
        header_out = vcfpy.Header(
            lines=[
                vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
                vcfpy.HeaderLine(key="source", value=sys.argv[0]),
                vcfpy.HeaderLine(key="fileDate",
                                 value=date.today().strftime("%d/%m/%Y")),
            ],
            samples=vcfpy.SamplesInfos([sample]))

        # sample header lines
        header_out.add_line(vcfpy.HeaderLine(key="SampleName", value=sample))

        # info header lines
        header_out.add_info_line(
            OrderedDict([("ID", "SUPP"), ("Number", "1"),
                         ("Type", "Integer"),
                         ("Description",
                          "States if the mutation is supported (1) or not (0).")]))

        # adding format lines
        header_out.add_format_line(
            OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                         ("Description", "Genotype (0/1, 0/0)")]))
        header_out.add_format_line(
            OrderedDict([
                ("ID", "SDP"), ("Number", "1"), ("Type", "Integer"),
                ("Description",
                 "Samtools read depth (secondary alignments, PCR duplicates, unppammed reads and reads not passing vendor QC are filtered)")
            ]))
        header_out.add_format_line(
            OrderedDict([
                ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                ("Description",
                 "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")
            ]))
        header_out.add_format_line(
            OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
                         ("Description", "Reference allele read depth")]))
        header_out.add_format_line(
            OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
                         ("Description", "Alternate allele read depth")]))
        header_out.add_format_line(
            OrderedDict([
                ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
                ("Description",
                 "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")
            ]))

        # Use input FORMAT lines as output INFO lines.
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            # format_line.value looks like
            #   '<ID=AD,Number=R,Type=Integer,Description="Allelic depths
            #    for the ref and alt alleles in the order listed">'
            mapping = str_to_mapping(format_line.value)
            mapping["Description"] = (
                "(Info about mutation in the original vcf)" +
                mapping["Description"])
            # Register the mapping carrying the prefixed description; the
            # original re-parsed the line and silently dropped the prefix.
            header_out.add_info_line(mapping)

        # open the output vcf
        with vcfpy.Writer.from_path(outvcf, header_out) as writer:
            # for each mutation in the vcf file
            for record_in in reader:
                # filter out indels: only interested in snvs in this
                # analysis phase
                if not record_in.is_snv():
                    continue

                chrom = record_in.CHROM
                pos = record_in.POS - 1  # pileup coordinates are 0-based
                ref = record_in.REF
                # record.ALT is a list by construction which contains only
                # one value if the mutation is a SNV
                alt = record_in.ALT[0].value

                # look for the pileup in the samfile at position (chrom, pos)
                for pileupcolumn in samfile.pileup(chrom, pos, pos + 1,
                                                   stepper='all',
                                                   truncate=True,
                                                   max_depth=10000):
                    # number of reads at this position
                    sdp = pileupcolumn.n
                    ad = 0  # reads supporting the alternate base
                    rd = 0  # reads supporting the reference base
                    dp = 0  # reads passing the filters below
                    af = 0.0

                    for base in pileupcolumn.pileups:
                        # .is_del     -> the base is a deletion
                        # .is_refskip -> the base is a N in the CIGAR string
                        if (base.is_del or base.is_refskip
                                or base.alignment.mapping_quality < 30):
                            continue
                        dp += 1
                        observed = base.alignment.query_sequence[
                            base.query_position]
                        if observed == alt:
                            ad += 1
                        elif observed == ref:
                            rd += 1

                    if ad > 0:
                        af = ad / (rd + ad)
                        supp = 1
                        # temporary, all the supported mutations are set
                        # to 0/1
                        gt = "0/1"
                    else:
                        supp = 0
                        gt = "0/0"

                    info_d = {'SUPP': supp}
                    original_call = record_in.calls[0].data
                    for f in record_in.FORMAT:
                        info_d[f] = original_call.get(f)

                    record_out = vcfpy.Record(
                        CHROM=chrom,
                        POS=pos + 1,
                        ID=[],
                        REF=ref,
                        ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                        QUAL=None,
                        FILTER=[],
                        INFO=info_d,
                        FORMAT=["GT", "SDP", "DP", "RD", "AD", "AF"],
                        calls=[
                            vcfpy.Call(
                                sample,
                                OrderedDict([("GT", gt), ("SDP", sdp),
                                             ("DP", dp), ("RD", rd),
                                             ("AD", ad), ("AF", af)]))
                        ])
                    writer.write_record(record_out)
def main():
    """Collapse a single-cell VCF into a VCF with one sample per cluster.

    Command-line arguments:
        input1:    single-cell VCF; its sample columns are cell ids and its
                   calls carry DP, RD and AD.
        input2:    CSV file read with pandas; the ``cellid`` and ``cluster``
                   columns are used.
        outprefix: the output is written to ``<outprefix>_clusters.vcf``.

    For every input record the DP, RD and AD values of the cells of each
    cluster are summed. A cluster with at least one alternate read gets
    GT ``0/1`` and AF = AD/(RD+AD); ``SUPP`` is 1 when any cluster has an
    alternate read, else 0.
    """
    parser = argparse.ArgumentParser(
        description="From single cell VCF to clones vcf.")
    parser.add_argument("input1", metavar="sample.muts.vcf", action="store",
                        help="Single cell VCF file.", type=str)
    parser.add_argument("input2", metavar="clusters.list", action="store",
                        help="Clusters list.", type=str)
    parser.add_argument("outprefix", metavar="out/path/prefix",
                        action="store", help="Output prefix", type=str)
    args = parser.parse_args()

    input1 = args.input1
    input2 = args.input2
    prefix = args.outprefix

    clusters_df = pd.read_csv(input2)
    # Cluster ids and the cells of each cluster do not depend on the record,
    # so they are computed once instead of once per record.
    cluster_ids = clusters_df['cluster'].unique()
    clusters = [str(cluster) for cluster in cluster_ids]
    cells_by_cluster = {
        c: list(clusters_df['cellid'][clusters_df['cluster'] == c])
        for c in cluster_ids
    }

    # Create out header
    header_out = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos(clusters))

    # format header lines
    header_out.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(
        OrderedDict([
            ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
            ("Description",
             "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")
        ]))
    header_out.add_format_line(
        OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Reference allele read depth")]))
    header_out.add_format_line(
        OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(
        OrderedDict([
            ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
            ("Description",
             "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")
        ]))

    # info header lines
    header_out.add_info_line(
        OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description",
                      "Whether the mutation is supported or not.")]))

    # Reader and writer are closed even if processing fails.
    with vcfpy.Reader.from_path(input1) as reader, \
            vcfpy.Writer.from_path(prefix + "_clusters.vcf",
                                   header_out) as writer:
        # for each record in the vcf file
        for record_in in reader:
            d = samples_dict(cluster_ids)
            supp = 0

            # for each cluster compute 'GT:DP:RD:AD:AF' to be provided as
            # call argument
            for c in cluster_ids:
                counts = d[c]
                # sum total read count, alt read count and ref read count
                # of the cells in the cluster
                for cell in cells_by_cluster[c]:
                    data = record_in.call_for_sample[cell].data
                    # A missing value (None) counts as zero reads.
                    counts['dp'] += data.get('DP') or 0
                    counts['rd'] += data.get('RD') or 0
                    counts['ad'] += data.get('AD') or 0
                if counts['ad'] > 0:
                    counts['gt'] = "0/1"
                    counts['af'] = counts['ad'] / (counts['rd'] +
                                                   counts['ad'])
                    supp = 1

            # create one call for each cluster
            calls = [
                vcfpy.Call(
                    str(c),
                    OrderedDict([("GT", d[c]['gt']), ("DP", d[c]['dp']),
                                 ("RD", d[c]['rd']), ("AD", d[c]['ad']),
                                 ("AF", d[c]['af'])]))
                for c in d
            ]
            print(calls)  # per-record trace, kept from the original

            # write new record
            record_out = vcfpy.Record(
                CHROM=record_in.CHROM,
                POS=record_in.POS,
                ID=[],
                REF=record_in.REF,
                ALT=[vcfpy.Substitution(type_="SNV",
                                        value=record_in.ALT[0].value)],
                QUAL=None,
                FILTER=[],
                INFO={"SUPP": supp},
                FORMAT=["GT", "DP", "RD", "AD", "AF"],
                calls=calls)
            writer.write_record(record_out)
def add_file_format(self):
    """Append the ``fileformat`` line (``VCFv4.2``) to ``self.write_header``."""
    fileformat_line = vcfpy.HeaderLine('fileformat', 'VCFv4.2')
    self.write_header.add_line(fileformat_line)
filteredEPIs = list(genoSample.keys()) filteredEPIs.sort() # filter out EPIs contains only low-freq variants #for acc in ['ISL_700228', 'ISL_539719', 'ISL_539706', 'ISL_539708']: # x = genoSample[acc] # print(acc, x) #sys.exit() ##################### # construct VCF records ############################# timePre = datetime.datetime.now() varCt = 0 header = vcfpy.Header( samples=vcfpy.SamplesInfos(filteredEPIs), lines=[ vcfpy.HeaderLine('fileformat', 'VCFv4.0'), vcfpy.HeaderLine('fileDate', str(datetime.datetime.now())), vcfpy.HeaderLine('source', parser.prog), vcfpy.ContigHeaderLine('contig', '<ID=String,Length=Integer>', { 'ID': 'EPI_ISL_406030', 'length': 29903 }), vcfpy.InfoHeaderLine( 'INFO', '<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">', { 'ID': 'NS', 'Number': 1, 'Type': 'Integer', 'Description': 'Number of Samples With Data' }),