def run(self):
    """Run the complete pipeline from barcode detection to chain-file output.

    The phases run in a fixed order and each one stores its working object
    on the instance (``self.bc``, ``self.consensus``, ``self.ovcf``,
    ``self.ochain``).  A progress line prefixed with ``self.get_time()`` is
    written to stderr after every step.
    """
    log = sys.stderr.write

    # Phase 1 - Detection of BarCode
    self.bc = BarCode(self.bam)
    log("[%s] Starting BarCode Analysis \n" % (self.get_time(),))
    self.bc.simple_approach()
    log("[%s] Analyzed BarCodes \n" % (self.get_time(),))
    self.bc.write_barcodes(self.barcodes)
    log("[%s] Wrote BarCodes\n" % (self.get_time(),))

    # Phase 2 - Rewrite BAM
    log("[%s] Starting Sort and Rewrite BAM\n" % (self.get_time(),))
    self.bc.load_barcodes(self.barcodes)
    log("[%s] Loaded BarCodes\n" % (self.get_time(),))
    # Phase 1 already iterated over the BAM, so rewind before the second pass.
    self.bc.bam.reset()
    self.bc.sort_and_rewrite_bam(self.rewritten_bam)
    # The third argument is an output prefix (the ".bam" suffix is removed).
    # NOTE(review): this is the legacy positional form of samtools sort;
    # confirm it matches the pysam version in use.
    pysam.sort("-n", self.rewritten_bam,
               self.rewritten_sorted_bam.replace(".bam", ""))
    log("[%s] Sort and Rewrite BAM\n" % (self.get_time(),))

    # Phase 3 - Build Consensus
    self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
    log("[%s] Starting Consensus Building\n" % (self.get_time(),))
    self.consensus.build()
    log("[%s] Built and Calculated Consensus\n" % (self.get_time(),))
    self.consensus.infer_consensus(self.consensus_reference)
    log("[%s] Inferred Consensus\n" % (self.get_time(),))

    # Phase 4 - Call Variants and Haplotypes
    self.consensus.output_consensus_genomes(self.consensus_genomes)
    log("[%s] Output Consensus Genomes\n" % (self.get_time(),))
    self.consensus.output_haplotype_distribution(self.haplotype_distribution)
    log("[%s] Output Haplotype Distribution\n" % (self.get_time(),))
    self.ovcf = VCF(self.vcf, crossmap=self.crossmap)
    self.ovcf.get_variants(self.ref.sequence, self.consensus.consensus_genomes)
    self.ovcf.output_vcf(self.ref.sequence)
    log("[%s] Output VCF\n" % (self.get_time(),))

    # Phase 5 - Summary Statistics and Chain Files
    # The context manager closes the summary file even if a writer raises.
    with open(self.out, "w") as f_out:
        self.consensus.output_consensus_coverage(f_out)
        self.ovcf.output_variants_distribution(f_out)
        self.bc.output_reads_in_barcode_distribution(f_out)
    log("[%s] Output Summary Statistics\n" % (self.get_time(),))
    self.ochain = Chain(self.chain)
    self.ochain.output_chain(self.ref, self.consensus.inferred_consensus,
                             self.consensus.inferred_structure)
    log("[%s] Output Chain File\n" % (self.get_time(),))
def as_indel(self, ref_fasta):
    """Format this event as one tab-separated VCF record with explicit alleles.

    Args:
        ref_fasta: object whose ``fetch(chrom, start, end)`` returns the
            reference sequence of that interval as a string.

    Returns:
        A string ``CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO``.
        QUAL and FILTER are always ``'.'``.
    """
    # Remove a literal "chr" prefix only.  str.lstrip('chr') would strip any
    # leading run of the characters 'c', 'h' and 'r', mangling other names.
    chrom = self.chroms[0]
    if chrom.startswith('chr'):
        chrom = chrom[3:]
    pos = self.breaks[0]

    if self.rearrangement == 'del':
        # REF spans the deleted interval; ALT is its first base only.
        ref = ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                              self.breaks[1] - 1).upper()
        alt = ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                              self.breaks[0]).upper()
    else:
        # Every other rearrangement: ALT is REF followed by the novel sequence.
        ref = ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                              self.breaks[1]).upper()
        alt = ref + self.novel_seq.upper()

    info = {'BKPTID': ','.join(self.contigs)}

    # read support
    if self.final_support is not None:
        info['SPANNING_READS'] = self.support['spanning']

    # somatic
    if self.somatic:
        info['SOMATIC'] = 'SOMATIC'

    # repeat contraction
    if self.rearrangement == 'del' and self.repeat_seq is not None:
        info['REPEAT_SEQ'] = self.repeat_seq
        if self.repeat_num is not None:
            info['REPEAT_NUM'] = self.repeat_num
        if self.repeat_num_change is not None:
            info['REPEAT_NUM_CHANGE'] = self.repeat_num_change

    fields = [chrom, pos, self.id, ref, alt, '.', '.',
              VCF.info_dict_to_str(info)]
    return '\t'.join(map(str, fields))
def main():
    """Entry point: convert the VCF to Plink format, then run Admixture.

    All settings are read from the command line through ``ComLine``.
    """
    cli = ComLine(sys.argv[1:])
    opts = cli.args

    vcf_file = VCF(opts.vcf, opts.thin, opts.maf, opts.indcov, opts.snpcov,
                   opts.bi)
    # The alternative vcf_file.convert_filter() path is currently disabled.
    # convert to Plink
    vcf_file.convert()
    populations = Popmap(opts.popmap)
    vcf_file.plink()
    vcf_file.print_populations(populations)

    admix_run = Admixture(vcf_file.prefix, opts.np, opts.minK, opts.maxK,
                          opts.rep, opts.cv)
    admix_run.admix()
    admix_run.create_zip()
    admix_run.loglik()
    admix_run.print_cv()
def as_indel(self, ref_fasta):
    """Format this event as one tab-separated VCF record with explicit alleles.

    Args:
        ref_fasta: object whose ``fetch(chrom, start, end)`` returns the
            reference sequence of that interval as a string.

    Returns:
        A string ``CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO``.
        QUAL and FILTER are always ``'.'``.
    """
    # Remove a literal "chr" prefix only.  str.lstrip('chr') would strip any
    # leading run of the characters 'c', 'h' and 'r', mangling other names.
    chrom = self.chroms[0]
    if chrom.startswith('chr'):
        chrom = chrom[3:]
    pos = self.breaks[0]

    if self.rearrangement == 'del':
        # REF spans the deleted interval; ALT is its first base only.
        ref = ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                              self.breaks[1] - 1).upper()
        alt = ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                              self.breaks[0]).upper()
    else:
        # Every other rearrangement: ALT is REF followed by the novel sequence.
        ref = ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                              self.breaks[1]).upper()
        alt = ref + self.novel_seq.upper()

    info = {'BKPTID': ','.join(self.contigs)}

    # read support
    if self.final_support is not None:
        info['SPANNING_READS'] = self.support['spanning']

    # somatic
    if self.somatic:
        info['SOMATIC'] = 'SOMATIC'

    # repeat contraction
    if self.rearrangement == 'del' and self.repeat_seq is not None:
        info['REPEAT_SEQ'] = self.repeat_seq
        if self.repeat_num is not None:
            info['REPEAT_NUM'] = self.repeat_num
        if self.repeat_num_change is not None:
            info['REPEAT_NUM_CHANGE'] = self.repeat_num_change

    fields = [chrom, pos, self.id, ref, alt, '.', '.',
              VCF.info_dict_to_str(info)]
    return '\t'.join(map(str, fields))
def assemble_consensus_genomes(self):
    """Build consensus genomes, then emit every downstream report.

    In order: consensus building and inference, consensus genomes and
    haplotype distribution, Quark analysis, VCF, summary statistics and the
    chain file.  A progress line prefixed with ``self.get_time()`` goes to
    stderr after each step.
    """
    emit = sys.stderr.write

    # build consensus
    self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
    emit("[%s] Starting Consensus Building\n" % (self.get_time(),))
    self.consensus.build(debug=self.debug)
    emit("[%s] Built and Calculated Consensus\n" % (self.get_time(),))
    self.consensus.infer_consensus(self.consensus_reference)
    emit("[%s] Inferred Consensus\n" % (self.get_time(),))

    self.consensus.output_consensus_genomes(self.consensus_genomes)
    emit("[%s] Output Consensus Genomes\n" % (self.get_time(),))
    self.consensus.output_haplotype_distribution(self.haplotype_distribution)
    emit("[%s] Output Haplotype Distribution\n" % (self.get_time(),))

    # Quark receives the (key, value) pairs ordered by descending value.
    self.quark = Quark(self.ref.sequence)
    by_frequency = sorted(self.consensus.freq_distribution.items(),
                          key=lambda entry: entry[1], reverse=True)
    self.quark.distance_matrix(by_frequency)
    self.quark.graph_it()
    self.quark.rank_it(self.rank)
    emit("[%s] Output Quark Analysis\n" % (self.get_time(),))

    self.ovcf = VCF(self.vcf, crossmap=self.crossmap)
    reference_sequence = self.ref.sequence
    self.ovcf.get_variants(reference_sequence, self.consensus.consensus_genomes)
    self.ovcf.output_vcf(self.ref.sequence)
    emit("[%s] Output VCF\n" % (self.get_time(),))

    self.summary_statistics()
    emit("[%s] Output Summary Statistics\n" % (self.get_time(),))

    self.ochain = Chain(self.chain)
    self.ochain.output_chain(self.ref, self.consensus.inferred_consensus,
                             self.consensus.inferred_structure)
    emit("[%s] Output Chain File\n" % (self.get_time(),))
import sys

import matplotlib.pyplot as plt
from matplotlib_venn import venn2

# The project-local vcf module lives outside the default import path, so the
# path must be extended before it is imported.
sys.path.append("/home/zhusitao/ngs-tools/format/")
from vcf import VCF

# Usage: <script> VCF1 TITLE1 VCF2 TITLE2
vcf1 = sys.argv[1]
title1 = sys.argv[2]
vcf2 = sys.argv[3]
title2 = sys.argv[4]

list1 = []
list2 = []

# One key per record.  The fields are joined with a tab so that distinct
# variants cannot collapse into the same key: with bare concatenation,
# chrom "1" + pos "23" and chrom "12" + pos "3" were indistinguishable.
v1 = VCF(vcf1)
for record in v1.readVCF():
    chrom, pos, ref, ale = record.CHROM, record.POS, record.REF, record.ALT
    list1.append("\t".join((chrom, pos, ref, ale)))

v2 = VCF(vcf2)
for record in v2.readVCF():
    chrom, pos, ref, ale = record.CHROM, record.POS, record.REF, record.ALT
    list2.append("\t".join((chrom, pos, ref, ale)))

# Distinct variants per file, and the variants shared by both.
set1 = set(list1)
set2 = set(list2)
inter = set1 & set2
inter_len = len(inter)
set1_len = len(set1)
set2_len = len(set2)
def main():
    """Entry point: load the popmap and VCF, then write both outputs.

    ``printFile`` and ``printPrivate`` each receive the ``out`` command-line
    value.
    """
    cli = ComLine(sys.argv[1:])
    popmap = Popmap(cli.args.popmap)
    vcf = VCF(cli.args.vcf, popmap)
    for writer in (vcf.printFile, vcf.printPrivate):
        writer(cli.args.out)
def main():
    """Parse the command line and dispatch to the requested VCF operation.

    Exactly one of three paths runs:

    * ``--list_chromosomes``: list the chromosomes of the input.
    * filter/merge (as decided by ``process_arguments``): read the family
      VCF files and filter or merge them.
    * otherwise: read a filtered VCF and subset it, or run the homozygous
      and/or phasing tests.

    Raises:
        ValueError: if ``--subset`` is given without ``--chromosome``.
    """
    # argument method
    parser = argparse.ArgumentParser()
    # required option
    parser.add_argument(
        '-d', '--directory', '--dir',
        help='parent directory of the samples i.e. Sample_054',
        required=True)
    # optional argument
    parser.add_argument(
        '-f', '--filter', action='store_true',
        help='use argument to keep the following columns: CHROM, POS, REF, ALT, genotype info. \n'
             'Note: if option selection, the -d can either be the vcf.gz or the parent directory. '
             # The trailing space was missing, rendering "filtering onall".
             'If vcf.gz, it will perform filtering on that file. Else, it will perform filtering on '
             'all the files in the hierarchical directory.')
    # optional argument
    parser.add_argument(
        '-m', '--merge', action='store_true',
        help='use argument to merge all filtered.vcf.gz files in the parent directory')
    # optional argument
    parser.add_argument(
        '-o', '--output',
        help='directory where to output the filtered or merged files')
    # optional argument
    parser.add_argument(
        '-ht', '--homozygous_test', action='store_true',
        help='use argument to collect homozygous statistics')
    # optional argument
    parser.add_argument(
        '-s', '--subset', action='store_true',
        help='use argument to subset the vcf file based on chromosomes')
    # optional argument
    parser.add_argument(
        '-c', '--chromosome',
        help='use argument to select the chromosome number on which to subset on')
    # optional argument
    parser.add_argument(
        '-n', '--number_sites',
        help='use argument to select the number of line on which to subset on',
        type=int)
    # optional argument
    parser.add_argument(
        '-p', '--phase', action='store_true',
        help='use argument to select the phasing test')
    parser.add_argument(
        '-lc', '--list_chromosomes', action='store_true',
        help='list all the chromosomes in a vcf.gz '
             'file')
    args = parser.parse_args()

    working_directory, output_directory, chromosome, filter_merge = process_arguments(
        c_directory=args.directory,
        o_directory=args.output,
        filter_flag=args.filter,
        merge_flag=args.merge,
        chrom=args.chromosome,
        arg_parser=parser)

    # create VCF object
    vcf = VCF()

    if args.list_chromosomes:
        vcf.read_files(c_dir=working_directory, vcf_filtered_file=False)
        vcf.list_chrom(output_dir=output_directory)
    # filtering and merging have to read all the vcf files on the
    # subdirectories of the families
    elif filter_merge:
        # read all the vcf files for that family
        vcf.read_files(c_dir=working_directory)
        # filter columns of the vcf files
        if args.filter:
            vcf.filter(output_dir=output_directory)
        else:
            vcf.merge(output_dir=output_directory)
    # subset, homozygous or phasing tests have to read a vcf file in the
    # parent directory
    elif args.subset:
        # For the subset the chromosome is not passed to read_files, but it
        # is required because the file is subsetted on it.
        if not args.chromosome:
            raise ValueError(
                'Chromosome number must be provided in order to perform subset')
        # read all the vcf files for that family
        vcf.read_files(c_dir=working_directory, vcf_filtered_file=True)
        vcf.subset(chrom=args.chromosome, output_dir=output_directory,
                   n_sites=args.number_sites)
    else:
        # read all the vcf files for that family
        vcf.read_files(c_dir=working_directory, vcf_filtered_file=True,
                       chrom=chromosome)
        if args.homozygous_test:
            # collect homozygous statistics
            vcf.tests(output_dir=output_directory, chrom=chromosome,
                      homozygous_test=True)
        if args.phase:
            vcf.tmp_test(output_dir=output_directory, chrom=chromosome)
def main():
    """Entry point: reconcile individuals, convert the VCF to Plink, run Admixture.

    All settings are read from the command line through ``ComLine``.
    """
    cli = ComLine(sys.argv[1:])
    opts = cli.args

    vcf_file = VCF(opts.vcf, opts.thin, opts.maf, opts.mac, opts.indcov,
                   opts.snpcov, opts.bi, opts.remove)

    # convert to Plink
    populations = Popmap(opts.popmap)
    vcf_file.compIndLists(populations)
    vcf_file.convert()
    vcf_file.plink()
    vcf_file.print_populations(populations)
    vcf_file.print_individuals(populations)

    admix_run = Admixture(vcf_file.prefix, opts.np, opts.minK, opts.maxK,
                          opts.rep, opts.cv)
    admix_run.admix()
    admix_run.create_zip()
    admix_run.loglik()
    admix_run.print_cv()
def as_sv(self, ref_fasta, id_ext=None, info_ext=None, chrom_ext=None, pos_ext=None):
    """Format this event as one VCF record with a symbolic ALT allele.

    Args:
        ref_fasta: object whose ``fetch(chrom, start, end)`` returns the
            reference sequence of that interval as a string.
        id_ext: value for the ID column; ``self.id`` when None.
        info_ext: optional mapping of extra INFO entries.  They override the
            computed ones, except that ``SVLEN == 'NA'`` is ignored.
        chrom_ext: value for the CHROM column; ``self.chroms[0]`` when None.
        pos_ext: value for the POS column; ``self.breaks[0]`` when None.

    Returns:
        The tab-separated record, or None when ``self.rearrangement`` is not
        one of 'del', 'dup', 'inv' or 'ins'.  (That case used to fail with
        an UnboundLocalError on ``sv_type``.)
    """
    # Symbolic ALT allele and SVTYPE for each supported rearrangement.
    sv_alleles = {
        'del': ('<DEL>', 'DEL'),
        'dup': ('<DUP:TANDEM>', 'DUP'),
        'inv': ('<INV>', 'INV'),
        'ins': ('<INS>', 'INS'),
    }
    if self.rearrangement not in sv_alleles:
        return None
    alt, sv_type = sv_alleles[self.rearrangement]

    chrom = self.chroms[0] if chrom_ext is None else chrom_ext
    # Remove a literal "chr" prefix only.  str.lstrip('chr') would strip any
    # leading run of the characters 'c', 'h' and 'r'.
    if chrom.startswith('chr'):
        chrom = chrom[3:]
    pos = self.breaks[0] if pos_ext is None else pos_ext

    # REF is always the single base at the first breakpoint of this event,
    # even when chrom_ext/pos_ext replace the reported coordinates.
    ref = ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                          self.breaks[0]).upper()

    sv_len = self.get_size()
    has_len = isinstance(sv_len, int)
    end = None
    if has_len:
        end = pos + sv_len
        if sv_type == 'DEL':
            # Deletions report a negative SVLEN; END is unaffected.
            sv_len = -sv_len
    if sv_type == 'INS':
        # An insertion occupies no reference span.
        end = pos

    info = {
        'SVTYPE': sv_type,
        'BKPTID': ','.join(self.contigs),
    }
    # END is only written when known; it was previously stored as None.
    if end is not None:
        info['END'] = end
    if has_len:
        info['SVLEN'] = sv_len

    if sv_type == 'DUP':
        if self.repeat_seq is not None:
            info['REPEAT_SEQ'] = self.repeat_seq
        if self.repeat_num is not None:
            info['REPEAT_NUM'] = self.repeat_num
        if self.repeat_num_change is not None:
            info['REPEAT_NUM_CHANGE'] = self.repeat_num_change

    # read support
    if self.final_support is not None:
        info['SPANNING_READS'] = self.support['spanning']
        if self.support['flanking'] is not None:
            info['FLANKING_PAIRS'] = self.support['flanking']

    # somatic
    if self.somatic:
        info['SOMATIC'] = 'SOMATIC'

    # microhomology
    if self.homol_seq and self.homol_seq[0] != '-':
        info['HOMSEQ'] = self.homol_seq[0].upper()
        homol_len = len(self.homol_seq[0])
        info['HOMLEN'] = homol_len
        contig_breaks = self.contig_breaks[0]
        # Adjacent contig breaks (b0 + 1 == b1, e.g. GMAP) carry no CIPOS;
        # overlapping ones (b0 >= b1, e.g. BWA-mem) do.  The first case can
        # never satisfy the second, so a single test is enough.
        if contig_breaks[0] >= contig_breaks[1]:
            info['CIPOS'] = '0,%d' % homol_len

    # external info - overrides given info
    if info_ext:
        # .items() works on both Python 2 and Python 3.
        for key, value in info_ext.items():
            if key == 'SVLEN' and value == 'NA':
                continue
            info[key] = value

    record_id = self.id if id_ext is None else id_ext
    fields = [chrom, pos, record_id, ref, alt, '.', '.',
              VCF.info_dict_to_str(info)]
    return '\t'.join(map(str, fields))
class BaseSeq(Helper):
    """Driver that wires the barcode, consensus, VCF and chain steps together.

    Every public method runs one stage (``run`` runs all of them) using the
    input and output paths stored on the instance by ``__init__``.
    """

    def __init__(self, bam, barcodes=None, out=None, ref=None, rewritten_bam=None,
                 consensus_reference=None, consensus_genomes=None,
                 haplotype_distribution=None, vcf=None, chain=None,
                 crossmap=None, export=None, rank=None, debug=None):
        """Store the pipeline configuration.

        ``ref`` is wrapped in a ``Reference`` object.
        ``rewritten_sorted_bam`` is derived from ``rewritten_bam`` by
        replacing ".bam" with ".sorted.bam", or is None when no rewritten
        BAM is given.  ``debug`` is stored as an int.
        """
        self.bam = bam
        self.barcodes = barcodes
        self.out = out
        self.ref = Reference(ref)
        self.rewritten_bam = rewritten_bam
        self.rewritten_sorted_bam = (
            rewritten_bam.replace(".bam", ".sorted.bam") if rewritten_bam else None)
        self.consensus_reference = consensus_reference
        self.consensus_genomes = consensus_genomes
        self.haplotype_distribution = haplotype_distribution
        self.vcf = vcf
        self.chain = chain
        self.crossmap = crossmap
        self.export = export
        self.rank = rank
        # int(None) raises TypeError, so the declared default could not be
        # used at all.  NOTE(review): 0 is assumed to mean "debugging off";
        # confirm against Consensus.build.
        self.debug = int(debug) if debug is not None else 0

    def get_barcodes(self):
        """Detect barcodes and write one ``barcode<TAB>read,read,...`` line each.

        Barcodes and the read ids within each line are written in sorted
        order to ``self.out``.
        """
        # simple approach - align, take soft-clipped, and use the arbitrary 20 bases
        # intermediate approach - use the seed and extend approach
        with open(self.out, "w") as out:
            self.bc = BarCode(self.bam)
            self.bc.simple_approach()
            for barcode, reads in sorted(self.bc.barcode_to_read.items()):
                out.write("%s\t%s\n" % (barcode, ",".join(sorted(reads))))

    def error_correction_barcodes(self):
        """Load the barcodes and cluster them, logging progress to stdout."""
        # start analysis
        self.bc = BarCode(self.bam)
        sys.stdout.write("[%s] Starting Error Correction Analysis\n" % (self.get_time(),))
        # load barcodes
        self.bc.load_barcodes(self.barcodes)
        sys.stdout.write("[%s] Loaded BarCodes\n" % (self.get_time(),))
        # cluster barcodes
        self.bc.cluster_barcodes()
        sys.stdout.write("[%s] Clustered BarCodes\n" % (self.get_time(),))

    def filter_barcodes(self, barcode, export="fastq"):
        """Export the reads that belong to ``barcode``.

        The barcode file is scanned for the first line whose first
        tab-separated column equals ``barcode``; its second column is the
        comma-separated list of read ids.  An empty list is exported when
        the barcode is not found.
        """
        list_of_ids = []
        with open(self.barcodes, "r") as f:
            for line in f:
                data = line.strip("\r\n").split("\t")
                if barcode == data[0]:
                    list_of_ids = data[1].split(",")
                    break
        self.bc = BarCode(self.bam)
        self.bc.filter_and_export(list_of_ids, self.out, export=export)

    def sort_and_rewrite_bam(self):
        """Rewrite the BAM using the loaded barcodes, then name-sort it."""
        self.bc = BarCode(self.bam)
        sys.stderr.write("[%s] Starting Sort and Rewrite BAM\n" % (self.get_time(),))
        self.bc.load_barcodes(self.barcodes)
        sys.stderr.write("[%s] Loaded BarCodes\n" % (self.get_time(),))
        self.bc.sort_and_rewrite_bam(self.rewritten_bam)
        # The third argument is an output prefix (the ".bam" suffix is removed).
        # NOTE(review): legacy positional form of samtools sort; confirm it
        # matches the pysam version in use.
        pysam.sort("-n", self.rewritten_bam,
                   self.rewritten_sorted_bam.replace(".bam", ""))
        sys.stderr.write("[%s] Sort and Rewrite BAM\n" % (self.get_time(),))

    def split_bam_by_barcode(self):
        """Split the BAM into per-barcode outputs."""
        self.bc = BarCode(self.bam)
        sys.stderr.write("[%s] Starting procedure to split BAM by barcode\n" % (self.get_time(),))
        self.bc.split_bam_into_barcodes(self.ref, self.out, self.export)
        sys.stderr.write("[%s] Finished splitting BAM by barcode id\n" % (self.get_time(),))

    def assemble_consensus_genomes(self):
        """Build consensus genomes, then emit every downstream report.

        In order: consensus building and inference, consensus genomes and
        haplotype distribution, Quark analysis, VCF, summary statistics and
        the chain file.
        """
        # build consensus
        self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
        sys.stderr.write("[%s] Starting Consensus Building\n" % (self.get_time(),))
        self.consensus.build(debug=self.debug)
        sys.stderr.write("[%s] Built and Calculated Consensus\n" % (self.get_time(),))
        self.consensus.infer_consensus(self.consensus_reference)
        sys.stderr.write("[%s] Inferred Consensus\n" % (self.get_time(),))
        self.consensus.output_consensus_genomes(self.consensus_genomes)
        sys.stderr.write("[%s] Output Consensus Genomes\n" % (self.get_time(),))
        self.consensus.output_haplotype_distribution(self.haplotype_distribution)
        sys.stderr.write("[%s] Output Haplotype Distribution\n" % (self.get_time(),))
        # Quark receives the (key, value) pairs ordered by descending value.
        self.quark = Quark(self.ref.sequence)
        self.quark.distance_matrix(sorted(self.consensus.freq_distribution.items(),
                                          key=lambda q: q[1], reverse=True))
        self.quark.graph_it()
        self.quark.rank_it(self.rank)
        sys.stderr.write("[%s] Output Quark Analysis\n" % (self.get_time(),))
        self.ovcf = VCF(self.vcf, crossmap=self.crossmap)
        self.ovcf.get_variants(self.ref.sequence, self.consensus.consensus_genomes)
        self.ovcf.output_vcf(self.ref.sequence)
        sys.stderr.write("[%s] Output VCF\n" % (self.get_time(),))
        self.summary_statistics()
        sys.stderr.write("[%s] Output Summary Statistics\n" % (self.get_time(),))
        self.ochain = Chain(self.chain)
        self.ochain.output_chain(self.ref, self.consensus.inferred_consensus,
                                 self.consensus.inferred_structure)
        sys.stderr.write("[%s] Output Chain File\n" % (self.get_time(),))

    def summary_statistics(self):
        """Write the coverage, variant and barcode distributions to ``self.out``.

        Requires ``self.consensus`` and ``self.ovcf`` to exist already.
        """
        # coverage per genome
        # variants per genome
        # estimate PCR and sequencing errors
        # barcode distribution
        # The context manager closes the file even if a writer raises.
        with open(self.out, "w") as f_out:
            self.bc = BarCode(self.bam)  # TEMP
            self.bc.load_barcodes(self.barcodes)  # TEMP
            self.consensus.output_consensus_coverage(f_out)
            self.ovcf.output_variants_distribution(f_out)
            self.bc.output_reads_in_barcode_distribution(f_out)

    def run(self):
        """Run the complete pipeline from barcode detection to chain-file output."""
        # Phase 1 - Detection of BarCode
        self.bc = BarCode(self.bam)
        sys.stderr.write("[%s] Starting BarCode Analysis \n" % (self.get_time(),))
        self.bc.simple_approach()
        sys.stderr.write("[%s] Analyzed BarCodes \n" % (self.get_time(),))
        self.bc.write_barcodes(self.barcodes)
        sys.stderr.write("[%s] Wrote BarCodes\n" % (self.get_time(),))

        # Phase 2 - Rewrite BAM
        sys.stderr.write("[%s] Starting Sort and Rewrite BAM\n" % (self.get_time(),))
        self.bc.load_barcodes(self.barcodes)
        sys.stderr.write("[%s] Loaded BarCodes\n" % (self.get_time(),))
        # Phase 1 already iterated over the BAM, so rewind before the second pass.
        self.bc.bam.reset()
        self.bc.sort_and_rewrite_bam(self.rewritten_bam)
        pysam.sort("-n", self.rewritten_bam,
                   self.rewritten_sorted_bam.replace(".bam", ""))
        sys.stderr.write("[%s] Sort and Rewrite BAM\n" % (self.get_time(),))

        # Phase 3 - Build Consensus
        self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
        sys.stderr.write("[%s] Starting Consensus Building\n" % (self.get_time(),))
        self.consensus.build()
        sys.stderr.write("[%s] Built and Calculated Consensus\n" % (self.get_time(),))
        self.consensus.infer_consensus(self.consensus_reference)
        sys.stderr.write("[%s] Inferred Consensus\n" % (self.get_time(),))

        # Phase 4 - Call Variants and Haplotypes
        self.consensus.output_consensus_genomes(self.consensus_genomes)
        sys.stderr.write("[%s] Output Consensus Genomes\n" % (self.get_time(),))
        self.consensus.output_haplotype_distribution(self.haplotype_distribution)
        sys.stderr.write("[%s] Output Haplotype Distribution\n" % (self.get_time(),))
        self.ovcf = VCF(self.vcf, crossmap=self.crossmap)
        self.ovcf.get_variants(self.ref.sequence, self.consensus.consensus_genomes)
        self.ovcf.output_vcf(self.ref.sequence)
        sys.stderr.write("[%s] Output VCF\n" % (self.get_time(),))

        # Phase 5 - Summary Statistics and Chain Files
        # The context manager closes the file even if a writer raises.
        with open(self.out, "w") as f_out:
            self.consensus.output_consensus_coverage(f_out)
            self.ovcf.output_variants_distribution(f_out)
            self.bc.output_reads_in_barcode_distribution(f_out)
        sys.stderr.write("[%s] Output Summary Statistics\n" % (self.get_time(),))
        self.ochain = Chain(self.chain)
        self.ochain.output_chain(self.ref, self.consensus.inferred_consensus,
                                 self.consensus.inferred_structure)
        sys.stderr.write("[%s] Output Chain File\n" % (self.get_time(),))

    def assemble_genomes(self):
        """Placeholder; not implemented."""
        pass

    def assemble_genomes_from_fastq(self):
        """Placeholder; not implemented."""
        pass
def as_sv(self, ref_fasta, id_ext=None, info_ext=None, chrom_ext=None, pos_ext=None):
    """Format this event as one VCF record with a symbolic ALT allele.

    Args:
        ref_fasta: object whose ``fetch(chrom, start, end)`` returns the
            reference sequence of that interval as a string.
        id_ext: value for the ID column; ``self.id`` when None.
        info_ext: optional mapping of extra INFO entries.  They override the
            computed ones, except that ``SVLEN == 'NA'`` is ignored.
        chrom_ext: value for the CHROM column; ``self.chroms[0]`` when None.
        pos_ext: value for the POS column; ``self.breaks[0]`` when None.

    Returns:
        The tab-separated record, or None when ``self.rearrangement`` is not
        one of 'del', 'dup', 'inv' or 'ins'.  (That case used to fail with
        an UnboundLocalError on ``sv_type``.)
    """
    # Symbolic ALT allele and SVTYPE for each supported rearrangement.
    sv_alleles = {
        'del': ('<DEL>', 'DEL'),
        'dup': ('<DUP:TANDEM>', 'DUP'),
        'inv': ('<INV>', 'INV'),
        'ins': ('<INS>', 'INS'),
    }
    if self.rearrangement not in sv_alleles:
        return None
    alt, sv_type = sv_alleles[self.rearrangement]

    chrom = self.chroms[0] if chrom_ext is None else chrom_ext
    # Remove a literal "chr" prefix only.  str.lstrip('chr') would strip any
    # leading run of the characters 'c', 'h' and 'r'.
    if chrom.startswith('chr'):
        chrom = chrom[3:]
    pos = self.breaks[0] if pos_ext is None else pos_ext

    # REF is always the single base at the first breakpoint of this event,
    # even when chrom_ext/pos_ext replace the reported coordinates.
    ref = ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                          self.breaks[0]).upper()

    sv_len = self.get_size()
    has_len = isinstance(sv_len, int)
    end = None
    if has_len:
        end = pos + sv_len
        if sv_type == 'DEL':
            # Deletions report a negative SVLEN; END is unaffected.
            sv_len = -sv_len
    if sv_type == 'INS':
        # An insertion occupies no reference span.
        end = pos

    info = {
        'SVTYPE': sv_type,
        'BKPTID': ','.join(self.contigs),
    }
    # END is only written when known; it was previously stored as None.
    if end is not None:
        info['END'] = end
    if has_len:
        info['SVLEN'] = sv_len

    if sv_type == 'DUP':
        if self.repeat_seq is not None:
            info['REPEAT_SEQ'] = self.repeat_seq
        if self.repeat_num is not None:
            info['REPEAT_NUM'] = self.repeat_num
        if self.repeat_num_change is not None:
            info['REPEAT_NUM_CHANGE'] = self.repeat_num_change

    # read support
    if self.final_support is not None:
        info['SPANNING_READS'] = self.support['spanning']
        if self.support['flanking'] is not None:
            info['FLANKING_PAIRS'] = self.support['flanking']

    # somatic
    if self.somatic:
        info['SOMATIC'] = 'SOMATIC'

    # microhomology
    if self.homol_seq and self.homol_seq[0] != '-':
        info['HOMSEQ'] = self.homol_seq[0].upper()
        homol_len = len(self.homol_seq[0])
        info['HOMLEN'] = homol_len
        contig_breaks = self.contig_breaks[0]
        # Adjacent contig breaks (b0 + 1 == b1, e.g. GMAP) carry no CIPOS;
        # overlapping ones (b0 >= b1, e.g. BWA-mem) do.  The first case can
        # never satisfy the second, so a single test is enough.
        if contig_breaks[0] >= contig_breaks[1]:
            info['CIPOS'] = '0,%d' % homol_len

    # external info - overrides given info
    if info_ext:
        # .items() works on both Python 2 and Python 3.
        for key, value in info_ext.items():
            if key == 'SVLEN' and value == 'NA':
                continue
            info[key] = value

    record_id = self.id if id_ext is None else id_ext
    fields = [chrom, pos, record_id, ref, alt, '.', '.',
              VCF.info_dict_to_str(info)]
    return '\t'.join(map(str, fields))
def as_breakends(self, ref_fasta, genomic=True, max_novel_seq_len=50, info_ext=None, parids=None, event=None):
    """Format this event as a pair of mated VCF breakend records.

    Args:
        ref_fasta: object whose ``fetch(chrom, start, end)`` returns the
            reference sequence of that interval as a string.
        genomic: SVTYPE is 'BND' when true, 'FND' otherwise.
        max_novel_seq_len: novel sequences longer than this are not inlined
            in ALT; the mate is instead written as a position on the contig
            (``<contig>:pos``).
        info_ext: optional mapping of key to a 2-item sequence; item 0 goes
            to the first record and item 1 to the second, overriding
            computed values.  Entries whose length is not 2 are ignored.
        parids: unused.
        event: unused.

    Returns:
        Two tab-separated VCF lines joined by a newline.
    """
    # Remove a literal "chr" prefix only (str.lstrip('chr') strips a
    # character set).  A list is needed because it is copied and indexed
    # below, which a Python 3 map object does not support.
    chroms = [c[3:] if c.startswith('chr') else c for c in self.chroms]
    alt_chroms = chroms[:]
    pos = list(self.breaks)
    alt_pos = pos[:]

    # inserted novel sequences
    inserted_seqs = ['', '']
    if self.novel_seq and self.novel_seq != 'NA' and self.novel_seq != '-':
        if len(self.novel_seq) > max_novel_seq_len:
            # Too long to inline: point both mates at the contig.
            alt_chroms[0] = '<%s>' % self.contigs[0]
            alt_chroms[1] = '<%s>' % self.contigs[0]
            alt_pos[1] = self.contig_breaks[0][0] + 1
            alt_pos[0] = self.contig_breaks[0][1] - 1
        else:
            # Orient the novel sequence by the strand of the alignment that
            # supports each side; one alignment serves both sides.
            aligns = self.aligns[0]
            sides = (aligns[0], aligns[0] if len(aligns) == 1 else aligns[1])
            for i, align in enumerate(sides):
                if align.strand == '+':
                    inserted_seqs[i] = self.novel_seq
                else:
                    inserted_seqs[i] = reverse_complement(self.novel_seq)

    # microhomology, cipos
    cipos = None
    homol_len = None
    homol_seq = None
    if self.homol_seq and self.homol_seq[0] != '-':
        homol_seq = self.homol_seq[0].upper()
        homol_len = len(self.homol_seq[0])
        first_breaks = self.contig_breaks[0]
        # Adjacent contig breaks (b0 + 1 == b1, e.g. GMAP) need no
        # adjustment; overlapping ones (b0 >= b1, e.g. BWA-mem) shift the
        # coordinates by the homology length.
        if first_breaks[0] >= first_breaks[1]:
            pos[0] -= homol_len
            alt_pos[1] += homol_len
            cipos = '0,%d' % homol_len

    refs = (ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1, self.breaks[0]).upper(),
            ref_fasta.fetch(self.chroms[1], self.breaks[1] - 1, self.breaks[1]).upper())

    ids = ('%s%s' % (self.id, 'a'), '%s%s' % (self.id, 'b'))

    svtype = 'BND' if genomic else 'FND'
    event_type = self.rearrangement.upper()
    infos = [{'SVTYPE': svtype, 'MATEID': ids[1], 'EVENTTYPE': event_type},
             {'SVTYPE': svtype, 'MATEID': ids[0], 'EVENTTYPE': event_type}]

    # Everything below except info_ext is identical for both mates.
    shared = {}
    if cipos is not None:
        shared['CIPOS'] = cipos
    if homol_len is not None:
        shared['HOMLEN'] = homol_len
    if homol_seq is not None:
        shared['HOMSEQ'] = homol_seq

    # read support
    if self.final_support is not None:
        shared['SPANNING_READS'] = self.support['spanning']
        # Was "is not NONE", an undefined name that raised NameError.
        if self.support['flanking'] is not None:
            shared['FLANKING_PAIRS'] = self.support['flanking']

    adj_size = self.get_size()
    if isinstance(adj_size, int):
        shared['SVLEN'] = adj_size

    # somatic
    if self.somatic:
        shared['SOMATIC'] = 'SOMATIC'

    # contig and contig breakpoints
    if self.contigs:
        shared['BKPTID'] = ','.join(self.contigs)

        if self.contig_breaks and len(self.contig_breaks) == len(self.contigs):
            break_strs = []
            for bk in self.contig_breaks:
                if len(bk) == 2:
                    break_strs.append('%s-%s' % (bk[0], bk[1]))
                else:
                    # Parenthesised so it parses on Python 2 and Python 3.
                    print('error')
            # CTG_BKS is only reported when every contig had a valid pair.
            if len(break_strs) == len(self.contigs):
                shared['CTG_BKS'] = ','.join(break_strs)

    for info in infos:
        info.update(shared)

    # external info - overrides given info
    if info_ext:
        # .items() works on both Python 2 and Python 3.
        for key, value in info_ext.items():
            if len(value) == 2:
                infos[0][key] = value[0]
                infos[1][key] = value[1]

    # Bracket direction and placement encode the orientation of each mate.
    # NOTE(review): the two 'R'-first cases use chroms rather than
    # alt_chroms, so the <contig> substitution for long novel sequences is
    # not applied there; kept as found - confirm whether that is intended.
    if self.orients[0] == 'L':
        # LL
        if self.orients[1] == 'L':
            alts = ('%s%s]%s:%s]' % (refs[0], inserted_seqs[0], alt_chroms[1], alt_pos[1]),
                    '%s%s]%s:%s]' % (refs[1], inserted_seqs[1], alt_chroms[0], alt_pos[0]))
        # LR
        else:
            alts = ('%s%s[%s:%s[' % (refs[0], inserted_seqs[0], alt_chroms[1], alt_pos[1]),
                    ']%s:%s]%s%s' % (alt_chroms[0], alt_pos[0], inserted_seqs[1], refs[1]))
    else:
        # RL
        if self.orients[1] == 'L':
            alts = (']%s:%s]%s%s' % (chroms[1], alt_pos[1], inserted_seqs[0], refs[0]),
                    '%s%s[%s:%s[' % (refs[1], inserted_seqs[1], chroms[0], alt_pos[0]))
        # RR
        else:
            alts = ('[%s:%s[%s%s' % (chroms[1], alt_pos[1], inserted_seqs[0], refs[0]),
                    '[%s:%s[%s%s' % (chroms[0], alt_pos[0], inserted_seqs[1], refs[1]))

    breakends = ['\t'.join([chroms[i], str(pos[i]), ids[i], refs[i], alts[i],
                            '.', '.', VCF.info_dict_to_str(infos[i])])
                 for i in range(2)]
    return '\n'.join(breakends)
##indel_exonicfunc.xls
# Header row of the per-sample indel exonic-function table.
title_indel_exonicfunc = ['Sample','frameshift_deletion','frameshift_insertion','nonframeshift_deletion','nonframeshift_insertion','stoploss','stopgain','unknown']
indel_exonicfunc.write('\t'.join(title_indel_exonicfunc)+'\n')

# `files` names a text file with one SNP VCF path per line; lines starting
# with '#' are skipped.
# NOTE(review): the handle returned by open() is never closed explicitly.
for file in open(files, 'r'):
    if file.startswith('#'):continue
    file = file.strip()
    # The sample name is the file name up to its first '.'.
    sample_name = os.path.basename(file)
    sample_name = sample_name.split('.')[0]
    myVCF = VCF(file)
    snp = myVCF.filter()
    # The indel VCF path is the SNP path with 'snp' replaced by 'indel'.
    indel_file = file.replace('snp','indel')
    myVCF = VCF(indel_file)
    indel = myVCF.filter()
    # Disabled per-chromosome statistics, kept as a string literal.  The
    # literal was never terminated; the closing quotes are added here.
    """
    ##chr.xls
    chr = myVCF.chr_stat(vcf)
    chromosome.write(sample_name)
    for i in [str(i) for i in range(1,23)]+['X','Y']:
        try:
            chromosome.write('\t'+str(chr[i]))
        except:
            chromosome.write('\t0')
    chromosome.write('\n')
    """
def as_breakends(self, ref_fasta, genomic=True, max_novel_seq_len=50, info_ext=None, parids=None, event=None):
    """Format this event as a pair of mated VCF breakend records.

    Args:
        ref_fasta: object whose ``fetch(chrom, start, end)`` returns the
            reference sequence of that interval as a string.
        genomic: SVTYPE is 'BND' when true, 'FND' otherwise.
        max_novel_seq_len: novel sequences longer than this are not inlined
            in ALT; the mate is instead written as a position on the contig
            (``<contig>:pos``).
        info_ext: optional mapping of key to a 2-item sequence; item 0 goes
            to the first record and item 1 to the second, overriding
            computed values.  Entries whose length is not 2 are ignored.
        parids: unused.
        event: unused.

    Returns:
        Two tab-separated VCF lines joined by a newline.
    """
    # Remove a literal "chr" prefix only (str.lstrip('chr') strips a
    # character set).  A list is needed because it is copied and indexed
    # below, which a Python 3 map object does not support.
    chroms = [c[3:] if c.startswith('chr') else c for c in self.chroms]
    alt_chroms = chroms[:]
    pos = list(self.breaks)
    alt_pos = pos[:]

    # inserted novel sequences
    inserted_seqs = ['', '']
    if self.novel_seq and self.novel_seq != 'NA' and self.novel_seq != '-':
        if len(self.novel_seq) > max_novel_seq_len:
            # Too long to inline: point both mates at the contig.
            alt_chroms[0] = '<%s>' % self.contigs[0]
            alt_chroms[1] = '<%s>' % self.contigs[0]
            alt_pos[1] = self.contig_breaks[0][0] + 1
            alt_pos[0] = self.contig_breaks[0][1] - 1
        else:
            # Orient the novel sequence by the strand of the alignment that
            # supports each side; one alignment serves both sides.
            aligns = self.aligns[0]
            sides = (aligns[0], aligns[0] if len(aligns) == 1 else aligns[1])
            for i, align in enumerate(sides):
                if align.strand == '+':
                    inserted_seqs[i] = self.novel_seq
                else:
                    inserted_seqs[i] = reverse_complement(self.novel_seq)

    # microhomology, cipos
    cipos = None
    homol_len = None
    homol_seq = None
    if self.homol_seq and self.homol_seq[0] != '-':
        homol_seq = self.homol_seq[0].upper()
        homol_len = len(self.homol_seq[0])
        first_breaks = self.contig_breaks[0]
        # Adjacent contig breaks (b0 + 1 == b1, e.g. GMAP) need no
        # adjustment; overlapping ones (b0 >= b1, e.g. BWA-mem) shift the
        # coordinates by the homology length.
        if first_breaks[0] >= first_breaks[1]:
            pos[0] -= homol_len
            alt_pos[1] += homol_len
            cipos = '0,%d' % homol_len

    refs = (ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1, self.breaks[0]).upper(),
            ref_fasta.fetch(self.chroms[1], self.breaks[1] - 1, self.breaks[1]).upper())

    ids = ('%s%s' % (self.id, 'a'), '%s%s' % (self.id, 'b'))

    svtype = 'BND' if genomic else 'FND'
    event_type = self.rearrangement.upper()
    infos = [{'SVTYPE': svtype, 'MATEID': ids[1], 'EVENTTYPE': event_type},
             {'SVTYPE': svtype, 'MATEID': ids[0], 'EVENTTYPE': event_type}]

    # Everything below except info_ext is identical for both mates.
    shared = {}
    if cipos is not None:
        shared['CIPOS'] = cipos
    if homol_len is not None:
        shared['HOMLEN'] = homol_len
    if homol_seq is not None:
        shared['HOMSEQ'] = homol_seq

    # read support
    if self.final_support is not None:
        shared['SPANNING_READS'] = self.support['spanning']
        # Was "is not NONE", an undefined name that raised NameError.
        if self.support['flanking'] is not None:
            shared['FLANKING_PAIRS'] = self.support['flanking']

    adj_size = self.get_size()
    if isinstance(adj_size, int):
        shared['SVLEN'] = adj_size

    # somatic
    if self.somatic:
        shared['SOMATIC'] = 'SOMATIC'

    # contig and contig breakpoints
    if self.contigs:
        shared['BKPTID'] = ','.join(self.contigs)

        if self.contig_breaks and len(self.contig_breaks) == len(self.contigs):
            break_strs = []
            for bk in self.contig_breaks:
                if len(bk) == 2:
                    break_strs.append('%s-%s' % (bk[0], bk[1]))
                else:
                    # Parenthesised so it parses on Python 2 and Python 3.
                    print('error')
            # CTG_BKS is only reported when every contig had a valid pair.
            if len(break_strs) == len(self.contigs):
                shared['CTG_BKS'] = ','.join(break_strs)

    for info in infos:
        info.update(shared)

    # external info - overrides given info
    if info_ext:
        # .items() works on both Python 2 and Python 3.
        for key, value in info_ext.items():
            if len(value) == 2:
                infos[0][key] = value[0]
                infos[1][key] = value[1]

    # Bracket direction and placement encode the orientation of each mate.
    # NOTE(review): the two 'R'-first cases use chroms rather than
    # alt_chroms, so the <contig> substitution for long novel sequences is
    # not applied there; kept as found - confirm whether that is intended.
    if self.orients[0] == 'L':
        # LL
        if self.orients[1] == 'L':
            alts = ('%s%s]%s:%s]' % (refs[0], inserted_seqs[0], alt_chroms[1], alt_pos[1]),
                    '%s%s]%s:%s]' % (refs[1], inserted_seqs[1], alt_chroms[0], alt_pos[0]))
        # LR
        else:
            alts = ('%s%s[%s:%s[' % (refs[0], inserted_seqs[0], alt_chroms[1], alt_pos[1]),
                    ']%s:%s]%s%s' % (alt_chroms[0], alt_pos[0], inserted_seqs[1], refs[1]))
    else:
        # RL
        if self.orients[1] == 'L':
            alts = (']%s:%s]%s%s' % (chroms[1], alt_pos[1], inserted_seqs[0], refs[0]),
                    '%s%s[%s:%s[' % (refs[1], inserted_seqs[1], chroms[0], alt_pos[0]))
        # RR
        else:
            alts = ('[%s:%s[%s%s' % (chroms[1], alt_pos[1], inserted_seqs[0], refs[0]),
                    '[%s:%s[%s%s' % (chroms[0], alt_pos[0], inserted_seqs[1], refs[1]))

    breakends = ['\t'.join([chroms[i], str(pos[i]), ids[i], refs[i], alts[i],
                            '.', '.', VCF.info_dict_to_str(infos[i])])
                 for i in range(2)]
    return '\n'.join(breakends)
def main():
    """Entry point: load the Phylip alignment and popmap, then construct the VCF.

    The ``VCF`` object is built from the alignment, the popmap and the
    ``out`` command-line value, and is not kept.
    """
    cli = ComLine(sys.argv[1:])
    alignment = Phylip(cli.args.phy)
    popmap = Popmap(cli.args.popmap)
    VCF(alignment, popmap, cli.args.out)
##indel_exonicfunc.xls title_indel_exonicfunc = [ 'Sample', 'frameshift_deletion', 'frameshift_insertion', 'nonframeshift_deletion', 'nonframeshift_insertion', 'stoploss', 'stopgain', 'unknown' ] indel_exonicfunc.write('\t'.join(title_indel_exonicfunc) + '\n') for file in open(files, 'r'): if file.startswith('#'): continue file = file.strip() sample_name = os.path.basename(file) sample_name = sample_name.split('.')[0] myVCF = VCF(file) snp = myVCF.filter() indel_file = file.replace('snp', 'indel') myVCF = VCF(indel_file) indel = myVCF.filter() """ ##chr.xls chr = myVCF.chr_stat(vcf) chromosome.write(sample_name) for i in [str(i) for i in range(1,23)]+['X','Y']: try: chromosome.write('\t'+str(chr[i])) except: chromosome.write('\t0') chromosome.write('\n') """