Example #1
def read_vep(in_vep_vcf, variants):
    with pysam.VariantFile(in_vep_vcf) as ifile:
        csq_meta = ifile.header.info.get('CSQ', None)
        if csq_meta is None:
            raise Exception(
                'No meta-information entry about CSQ INFO field found!')
        csq_header = csq_meta.description.split(':', 1)[1].strip().split('|')
        for record in ifile.fetch():
            # multi-allelic variants must be split into multiple bi-allelic VCF entries
            if len(record.alts) > 1:
                raise Exception(
                    'Multi-allelic VCF records are not supported. Multi-allelic variants must be split into multiple bi-allelic VCF entries.'
                )
            chrom = record.chrom[3:] if record.chrom.startswith('chr') else record.chrom
            variant_name = (chrom, record.pos, record.ref, record.alts[0])
            variant = variants.get(variant_name, None)
            if variant is None:
                continue
            variant_csqs = set()
            lof = False
            for csq in record.info['CSQ']:
                csq = dict(zip(csq_header, csq.split('|')))
                if csq['BIOTYPE'] != 'protein_coding':
                    continue
                if csq['LoF'] == 'HC':
                    lof = True
                csqs = csq['Consequence'].split('&')
                if any(x in csqs for x in cds_variant_types):
                    variant_csqs.update(csqs)
            if not variant_csqs:
                variants.pop(variant_name)
                continue
            variant_most_severe_csq = None
            if 'splice_acceptor_variant' in variant_csqs or 'splice_donor_variant' in variant_csqs:
                variant_most_severe_csq = 'splice'
            elif 'stop_gained' in variant_csqs:
                variant_most_severe_csq = 'stop_gained'
            elif 'stop_lost' in variant_csqs:
                variant_most_severe_csq = 'stop_lost'
            elif 'start_lost' in variant_csqs:
                variant_most_severe_csq = 'start_lost'
            elif 'frameshift_variant' in variant_csqs:
                variant_most_severe_csq = 'frameshift'
            elif 'inframe_insertion' in variant_csqs:
                variant_most_severe_csq = 'inframe_insertion'
            elif 'inframe_deletion' in variant_csqs:
                variant_most_severe_csq = 'inframe_deletion'
            elif 'missense_variant' in variant_csqs:
                variant_most_severe_csq = 'missense'
            elif 'synonymous_variant' in variant_csqs or 'stop_retained_variant' in variant_csqs or 'start_retained_variant' in variant_csqs:
                variant_most_severe_csq = 'synonymous'
            else:
                print(
                    f'WARNING (not in CDS): Variant {variant_name} ({variant_csqs}) will be omitted.'
                )
                variants.pop(variant_name)
                continue
            categories = ['ALL', variant_most_severe_csq]
            if lof:
                categories.append('LOF')
            length = len(record.ref) - len(record.alts[0])
            if length == 0:
                if len(record.ref) > 1:
                    categories.append('MNP')
                else:
                    categories.append('SNP')
            elif length > 0:
                categories.append('INDEL')
                categories.append('DEL')
                length = abs(length)
                if length < 4:
                    categories.append(f'DEL:{length}')
                elif length < 10:
                    categories.append('DEL:4-9')
                else:
                    categories.append('DEL:10+')
            elif length < 0:
                categories.append('INDEL')
                categories.append('INS')
                length = abs(length)
                if length < 4:
                    categories.append(f'INS:{length}')
                elif length < 10:
                    categories.append('INS:4-9')
                else:
                    categories.append('INS:10+')
            variant.ac = record.info['AC'][0]
            variant.an = record.info['AN']
            variant.cat = categories
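
A minimal usage sketch (the path is hypothetical; SimpleNamespace stands in for whatever variant objects the caller uses, which only need writable ac/an/cat attributes, and cds_variant_types must be defined at module level):

from types import SimpleNamespace

variants = {
    ('1', 12345, 'A', 'T'): SimpleNamespace(ac=None, an=None, cat=None),
}
read_vep('annotated.vep.vcf.gz', variants)  # annotates or drops entries in place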
Example #2
def count_sr(argv):
    parser = argparse.ArgumentParser(
        description="Count clipped reads at SV breakpoints. Unwindowed.",
        prog='svtk count-sr',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf',
                        help='VCF of variant calls. Standardized to include '
                        'CHR2, END, SVTYPE, STRANDS in INFO.')
    parser.add_argument('countfile',
                        help='Tabix indexed file of split counts.'
                        ' Columns: chrom,pos,clip,count,sample')
    parser.add_argument('fout', help='Output table of split read counts.')
    parser.add_argument('--common',
                        default=False,
                        action='store_true',
                        help='Ignore background for common AF')
    parser.add_argument('-s',
                        '--samples',
                        type=argparse.FileType('r'),
                        default=None,
                        help='Whitelist of samples to restrict testing to.')
    parser.add_argument(
        '--index',
        default=None,
        help='Tabix index of discordant pair file. Required if '
        'discordant pair file is hosted remotely.')
    # TODO: add normalization
    parser.add_argument('--medianfile',
                        default=None,
                        help='Median coverage statistics for each library '
                        '(optional). If provided, each sample\'s split '
                        'counts will be normalized accordingly. '
                        'Same format as RdTest, one column per sample.')
    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    vcf = pysam.VariantFile(args.vcf)

    if args.index is not None:
        countfile = pysam.TabixFile(args.countfile,
                                    index=args.index,
                                    parser=pysam.asTuple())
    else:
        if args.countfile.startswith('http'):
            raise Exception('Must provide tabix index with remote URL')
        countfile = pysam.TabixFile(args.countfile, parser=pysam.asTuple())

    if args.fout in '- stdout'.split():
        fout = sys.stdout
    else:
        fout = open(args.fout, 'w')

    header = 'name coord sample count'.split()
    fout.write('\t'.join(header) + '\n')

    if args.samples is not None:
        whitelist = [s.strip() for s in args.samples.readlines()]
    else:
        whitelist = [s for s in vcf.header.samples]

    if args.medianfile is not None:
        medians = pd.read_table(args.medianfile)
        medians = pd.melt(medians, var_name='sample', value_name='median_cov')
    else:
        medians = None
    srtest = SRTest(countfile, args.common, window=0, medians=medians)

    for record in vcf:
        for coord in 'start end'.split():
            if coord == 'start':
                pos, strand, chrom = record.pos, record.info['STRANDS'][0], record.chrom
            else:
                # TODO: With a properly formatted VCF, should be using END2 instead of END here
                pos, strand, chrom = record.stop, record.info['STRANDS'][1], record.info['CHR2']

            counts = srtest.load_counts(chrom, pos, strand)
            counts = srtest.normalize_counts(counts)
            counts = counts['sample count'.split()]
            counts = counts.set_index('sample')
            counts = counts.reindex(whitelist).fillna(0).astype(int)
            counts = counts.reset_index()
            counts['name'] = record.id
            counts['coord'] = coord

            for row in counts[header].values:
                fout.write('\t'.join([str(x) for x in row]) + '\n')
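
A hedged invocation sketch (filenames are hypothetical; '-' sends the output table to stdout, which the function supports):

count_sr(['calls.vcf.gz', 'split_counts.txt.gz', '-'])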
Example #3
def get_variants(path):
    records = []
    with pysam.VariantFile(path) as vcf:
        for rec in vcf:
            records.append(rec)
    return records
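
Note that this loads every record into memory, so it suits small or pre-filtered VCFs. A usage sketch with a hypothetical path:

records = get_variants('calls.vcf.gz')
print(f'{len(records)} records loaded')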
Example #4
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Input vcf (supports "stdin").')
    parser.add_argument('minGQtable',
                        help='Tab-delimited minGQ filtering lookup' +
                        ' table generated by create_minGQ_lookup_table.R.')
    parser.add_argument('fout', help='Output file (supports "stdout").')
    parser.add_argument('-m',
                        '--minGQ',
                        help='Global min GQ',
                        type=int,
                        default=0,
                        dest='globalMin')
    parser.add_argument('--multiallelics',
                        default=False,
                        action='store_true',
                        help='Also apply filtering to multiallelic sites ' +
                        '(default: skip multiallelics).')
    parser.add_argument(
        '--dropEmpties',
        default=False,
        action='store_true',
        help='After GT reassignments, drop any SV with no remaining ' +
        ' non-ref samples (default: keep all SV).')
    parser.add_argument(
        '--maxNCR',
        help='Max no-call rate among all ' +
        'samples before adding a flag to the record\'s FILTER field' +
        ' (default: 0.005)',
        type=float,
        default=0.005,
        dest='maxNCR')
    parser.add_argument(
        '--cleanAFinfo',
        help='Remove all AF-related terms from ' +
        ' the INFO field and VCF header (default: keep all terms).',
        default=False,
        action='store_true')
    parser.add_argument('--prefix',
                        help='Cohort label to append to NCR FILTER.',
                        default='COHORT',
                        dest='prefix')

    args = parser.parse_args()

    if args.vcf in '- stdin'.split():
        vcf = pysam.VariantFile(sys.stdin)
    else:
        vcf = pysam.VariantFile(args.vcf)

    #Add HIGH_NOCALL_RATE filter to vcf header
    NEW_FILTER = '##FILTER=<ID=HIGH_{0}_NOCALL_RATE,Description="More than '.format(args.prefix) + \
                 '{:.2%}'.format(args.maxNCR) + ' of {0} sample GTs were '.format(args.prefix) + \
                 'masked as no-call GTs due to low GQ. Indicates a possibly noisy locus ' + \
                 'in {0} samples.">'.format(args.prefix)
    header = vcf.header
    header.add_line(NEW_FILTER)
    filter_text = 'HIGH_{0}_NOCALL_RATE'.format(args.prefix)

    if args.fout in '- stdout'.split():
        fout = pysam.VariantFile(sys.stdout, 'w', header=vcf.header)
    else:
        fout = pysam.VariantFile(args.fout, 'w', header=vcf.header)

    #Make dummy lookup tables for SVLEN, AF, SVTYPE, FILTER, and EV
    SVLEN_table = _make_SVLEN_interval_dict(args.minGQtable)
    AF_table = _make_AF_interval_dict(args.minGQtable)
    SVTYPE_table = _make_SVTYPE_dict(args.minGQtable)
    FILTER_table = _make_FILTER_dict(args.minGQtable, vcf)
    EV_table = _make_EV_dict(args.minGQtable)

    #Make minGQ lookup table
    minGQ_dict = make_minGQ_dict(args.minGQtable, SVLEN_table, AF_table,
                                 SVTYPE_table, FILTER_table, EV_table)

    #Iterate over records in vcf and apply filter
    for record in vcf.fetch():
        #Do not process multiallelic variants, unless optioned
        if args.multiallelics or not _is_multiallelic(record):
            apply_minGQ_filter(record,
                               minGQ_dict,
                               SVLEN_table,
                               AF_table,
                               SVTYPE_table,
                               FILTER_table,
                               EV_table,
                               globalMin=args.globalMin,
                               maxNCR=args.maxNCR,
                               highNCR_filter=filter_text)

        if args.cleanAFinfo:
            for key in ('AN AC AF N_BI_GENOS N_HOMREF N_HET N_HOMALT '
                        'FREQ_HOMREF FREQ_HET FREQ_HOMALT').split():
                if key in record.info.keys():
                    record.info.pop(key)

        if args.dropEmpties:
            samps = svu.get_called_samples(record, include_null=False)
            if len(samps) > 0:
                fout.write(record)
        else:
            fout.write(record)

    fout.close()
Example #5
def sr_test(argv):
    parser = argparse.ArgumentParser(
        description="Calculate enrichment of clipped reads at SV breakpoints.",
        prog='svtk sr-test',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf',
                        help='VCF of variant calls. Standardized to include '
                        'CHR2, END, SVTYPE, STRANDS in INFO.')
    parser.add_argument('countfile',
                        help='Tabix indexed file of split counts.'
                        ' Columns: chrom,pos,clip,count,sample')
    parser.add_argument('fout',
                        help='Output table of most significant start/end '
                        'positions.')
    parser.add_argument('-w',
                        '--window',
                        type=int,
                        default=100,
                        help='Window around variant start/end to consider for '
                        'split read support. [100]')
    parser.add_argument('--common',
                        default=False,
                        action='store_true',
                        help='Ignore background for common AF')
    parser.add_argument('-b',
                        '--background',
                        type=int,
                        default=160,
                        help='Number of background samples to choose for '
                        'comparison in t-test. [160]')
    parser.add_argument('-s',
                        '--samples',
                        type=argparse.FileType('r'),
                        default=None,
                        help='Whitelist of samples to restrict testing to.')
    parser.add_argument(
        '--index',
        default=None,
        help='Tabix index of discordant pair file. Required if '
        'discordant pair file is hosted remotely.')
    # TODO: add normalization
    parser.add_argument('--medianfile',
                        default=None,
                        help='Median coverage statistics for each library '
                        '(optional). If provided, each sample\'s split '
                        'counts will be normalized accordingly. '
                        'Same format as RdTest, one column per sample.')
    parser.add_argument('--log',
                        action='store_true',
                        default=False,
                        help='Print progress log to stderr.')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    vcf = pysam.VariantFile(args.vcf)

    if args.index is not None:
        countfile = pysam.TabixFile(args.countfile,
                                    index=args.index,
                                    parser=pysam.asTuple())
    else:
        if args.countfile.startswith('http'):
            raise Exception('Must provide tabix index with remote URL')
        countfile = pysam.TabixFile(args.countfile, parser=pysam.asTuple())

    if args.fout in '- stdout'.split():
        fout = sys.stdout
    else:
        fout = open(args.fout, 'w')

    header = 'name coord pos log_pval called_median bg_median bg_frac'.split()
    fout.write('\t'.join(header) + '\n')

    if args.samples is not None:
        whitelist = [s.strip() for s in args.samples.readlines()]
    else:
        whitelist = None

    if args.medianfile is not None:
        medians = pd.read_table(args.medianfile)
        medians = pd.melt(medians, var_name='sample', value_name='median_cov')
    else:
        medians = None

    runner = SRTestRunner(vcf,
                          countfile,
                          fout,
                          args.background,
                          args.common,
                          args.window,
                          whitelist,
                          medians=medians,
                          log=args.log)
    runner.run()
Example #6
def load_vcf(vcf_file, info_keys=[], format_keys=[]):
    """Function to load VCF into gwas dataframe."""
    # Load VCF file using pysam
    reader = pysam.VariantFile(vcf_file)

    if "*" in info_keys:
        header_dict = dict(reader.header.info)
        new_keys = []
        for k in header_dict.keys():
            new_keys.append(k)
        info_keys = new_keys
    if "*" in format_keys:
        header_dict = dict(reader.header.formats)
        new_keys = []
        for k in header_dict.keys():
            new_keys.append(k)
        format_keys = new_keys

    print(info_keys)
    info_keys = set(info_keys)
    print(format_keys)
    format_keys = set(format_keys)

    df_dict = defaultdict(list)
    for record in reader:
        if len(record.alts) != 1:
            continue
        if record.ref not in nucleotide_dict or record.alts[0] not in nucleotide_dict:
            continue

        # Run through all variants and all their keys in format
        for sample in record.samples:
            format_dict = dict(record.samples[sample])
            for key, value in format_dict.items():
                if key not in format_keys:
                    continue
                # _add_basic_component(record, sample, df_dict)
                if key == "GT":
                    if None in list(value):
                        value = -1
                    else:
                        value = sum(list(value))
                _add_key_value(record, sample, f"call_{key}", value, df_dict)

            # Run through all variants and all their info keys
            info_dict = dict(record.info)
            for key, value in info_dict.items():
                if key not in info_keys:
                    continue
                # _add_basic_component(record, sample, df_dict)
                _add_key_value(record, sample, key, value, df_dict)

    df = pd.DataFrame.from_dict(df_dict)
    df, feature_mapping = _create_numerical_features(df)
    df = df.pivot_table(
        index=[
            "chrom", "pos", "ref", "alt", "sample", "quality", "feature_id"
        ],
        columns="key",
        values="value",
    ).reset_index()
    cuda_df = cudf.DataFrame(df)
    return cuda_df
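
A hedged call sketch (assumes a GPU with cudf available; the path is hypothetical, and nucleotide_dict, _add_key_value, and _create_numerical_features come from the surrounding module):

gwas_df = load_vcf('cohort.vcf.gz', info_keys=['*'], format_keys=['GT', 'DP'])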
Example #7
def annotate_vcf_with_inference(args):
	cnns = {}
	stats = Counter()
	vcf_reader = pysam.VariantFile(args.negative_vcf, 'rb')
	pyvcf_vcf_reader = vcf.Reader(open(args.negative_vcf, 'rb'))
	input_tensors = {}

	for a in args.architectures:	
		cnns[a] = models.set_args_and_get_model_from_semantics(args, a)
		print('Annotating with architecture:', a, 'sample name is', args.sample_name)		

		if score_key_from_json(a) not in vcf_reader.header.info:
			vcf_reader.header.info.add(score_key_from_json(a), '1', 'Float', 'Site-level score from Convolutional Neural Net named '+a+'.')
		if defines.annotations_from_args(args) is not None:
			input_tensors[args.annotation_set] = (len(args.annotations),)
		input_tensors[args.tensor_map] = defines.tensor_shape_from_args(args)

	vcf_writer = pysam.VariantFile(args.output_vcf, 'w', header=vcf_reader.header)
	print('got vcfs. input tensor shape mapping:', input_tensors)

	reference = SeqIO.to_dict(SeqIO.parse(args.reference_fasta, "fasta"))
	print('got ref.')

	samfile = pysam.AlignmentFile(args.bam_file, "rb")	
	print('got sam.')

	positions = []
	variant_batch = []
	time_batch = time.time()

	batch = {}
	for tm in input_tensors:
		batch[tm] = np.zeros(((args.batch_size,) + input_tensors[tm]))
	print('input tensors:', input_tensors)
	if args.chrom:
		print('iterate over region of vcf', args.chrom, args.start_pos, args.end_pos)
		variants = vcf_reader.fetch(args.chrom, args.start_pos, args.end_pos)
	else:
		print('iterate over vcf')
		variants = vcf_reader

	start_time = time.time()
	for variant in variants:
		idx_offset, ref_start, ref_end = get_variant_window(args, variant)
		args.chrom = variant.contig # In case chrom isn't set on command line we need it to fetch reads.
		contig = reference[variant.contig]	
		record = contig[ ref_start : ref_end ]
		v = pysam_variant_in_pyvcf(variant, pyvcf_vcf_reader)
		for tm in batch:
			batch_key = tm+'_in_batch'
			if tm in defines.annotations:
				args.annotation_set = tm
				annotation_data = td.get_annotation_data(args, v, stats)
				batch[tm][stats[batch_key]] = annotation_data
				stats[batch_key] += 1

			if 'read' in tm:
				args.tensor_map = tm
				if "read_tensor" == args.tensor_map:
					read_tensor = td.make_reference_and_reads_tensor(args, v, samfile, record.seq, ref_start, stats)
				elif "paired_reads" == args.tensor_map:	
					read_tensor = td.make_paired_read_tensor(args, v, samfile, record.seq, ref_start, ref_end, stats)
				else:
					raise ValueError("Unknown read tensor mapping."+tt)

				# check for a missing tensor before writing into the numpy batch,
				# since assigning None into a float array raises TypeError
				if read_tensor is None:
					print('got empty', args.tensor_map, 'tensor at:', v)
					read_tensor = np.zeros(input_tensors[tm])
				batch[tm][stats[batch_key]] = read_tensor
				stats[batch_key] += 1

			if 'reference' in tm:
				args.tensor_map = tm
				reference_tensor = td.make_reference_tensor(args, record.seq)
				batch[tm][stats[batch_key]] = reference_tensor
				stats[batch_key] += 1
			
		positions.append(variant.contig + '_' + str(variant.pos))
		variant_batch.append(variant)

		if stats[batch_key] == args.batch_size:
			apply_cnns_to_batch(args, cnns, batch, positions, variant_batch, vcf_writer, stats)
			
			# Reset the batch
			positions = []		
			variant_batch = []
			for tm in batch:
				batch_key = tm+'_in_batch'
				batch[tm] = np.zeros(((args.batch_size,) + input_tensors[tm]))
				stats[batch_key] = 0

			stats['batches processed'] += 1
			if stats['batches processed'] % 10 == 0:
				elapsed = time.time()-start_time
				v_per_minute = stats['batches processed']*args.batch_size / (elapsed/60)
				print('Variants per minute:', v_per_minute, 'Batches:', stats['batches processed'], 'batches.  Last variant:', variant)

		if stats['batches processed']*args.batch_size > args.samples:
			break

	if stats[batch_key] > 0:
		apply_cnns_to_batch(args, cnns, batch, positions, variant_batch, vcf_writer, stats)

	for s in stats.keys():
		print(s, 'has:', stats[s])	
Example #8
def main():
    
	# Exception handling for input files format.
	if args['f'].endswith(".fa") or args['f'].endswith(".fasta"):
		genome = pysam.FastaFile(args['f'])  # open fasta file
		print('[	  OK       ] Reading Fasta file is done.')
	else:
		raise FileFormatError("\n[	  ERROR    ] Input File is not in Fasta format.")

	if args['v'].endswith(".vcf"):
		vcf = pysam.VariantFile(args['v'])  # open vcf file
		print('[	  OK       ] Reading vcf file is done.')
	else:
		raise FileFormatError("\n[	  ERROR    ] Input File is not in VCF format.")
		
	k = args['k']  # Length of kmer

	# Handling different output formats.
	if args['outfmt'].upper() == 'TSV':
		with open(args['o']+'/'+args['outfile']+'.tsv','w') as fd:
			fd.write('## VMK-mer version: v1.0\n## Output file: {}\n## Reference fasta file: {}\n## VCF file: {}\n'.format(args['o']+'/'+args['outfile'],args['f'],args['v']))
		head_dict={'chr': 'Chr', 'pos': 'Pos', 'id': 'Mutation-ID', 'ref':'Ref-Allele' , 'alt':'Mut-Allele', 'refk':'Ref-Kmers', 'mutk':'Mut-Kmers'}
		write_in_tsv("Head",head_dict)
        
	elif args['outfmt'].upper() == 'XML':
		with open(args['o']+'/'+args['outfile']+'.xml','w') as fd:
			fd.write('## VMK-mer version: v1.0\n## Output file: {}\n## Reference fasta file: {}\n## VCF file: {}\n'.format(args['o']+'/'+args['outfile'],args['f'],args['v']))
	
	elif args['outfmt'].upper() == 'BOTH':
		with open(args['o']+'/'+args['outfile']+'.tsv','w') as fd:
			fd.write('## VMK-mer version: v1.0\n## Output file: {}\n## Reference fasta file: {}\n## VCF file: {}\n'.format(args['o']+'/'+args['outfile'],args['f'],args['v']))
		with open(args['o']+'/'+args['outfile']+'.xml','w') as fd:
			fd.write('## VMK-mer version: v1.0\n## Output file: {}\n## Reference fasta file: {}\n## VCF file: {}\n'.format(args['o']+'/'+args['outfile'],args['f'],args['v']))
	


	print('[	PROCESS    ] Extracting mutant kmers, please wait...')

	total = recordcount("v", vcf)
	iterations = 0
	for record in vcf:
        
		# Handle the mutation type given in the INFO field of some VCF files as "TSA".
		if 'TSA' in record.info.keys():
			
			mutation_type = str(record.info['TSA'])
			if mutation_type == "SNV":
				snp(record, genome, k)
				#pass
                
			elif mutation_type == "insertion":
				insertion(record, genome, k)
				#pass
                
			elif mutation_type == "deletion":
				deletion(record, genome, k)
				#pass

		# Handle the mutation type given in the INFO field of some VCF files as "VT".
		elif 'VT' in record.info.keys():

			mutation_type = str(record.info['VT'][0])
			if mutation_type == "SNP":
				snp(record, genome, k)
				#pass
                
			elif mutation_type == "INDEL":
				if len(record.alts[0]) > len(record.ref):
					insertion(record, genome, k)
					#pass
                    
				elif len(record.alts[0]) < len(record.ref):
					deletion(record, genome, k)
					#pass
		iterations += 1
		progress (iterations, total)
	print('\n[	  OK       ] All kmers have been extracted successfully.')
Example #9
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument(
        "-i", "--input-vcf", dest="input_vcf_file", type=str,
        help="input vcf file")

    parser.add_argument(
        "-t", "--truth-vcf", dest="truth_vcf_file", type=str,
        help="truth vcf file")

    parser.add_argument(
        "-f", "--input-fasta", dest="input_fasta_file", type=str,
        help="input fasta file. faidx indexed reference sequence file to "
        "determine INDEL context ")

    parser.add_argument(
        "-e", "--input-bed", dest="input_bed_file", type=str,
        help="input file with intervals. Tab-delimited file of intervals "
        "in bed format to restrict analysis to. ")

    parser.add_argument(
        "-m", "--method", dest="methods", action="append", type=str,
        choices=("mutational-signature", "kinship"),
        help="methods to apply ")

    parser.set_defaults(
        methods=[],
        input_vcf_file=None,
        input_bed_file=None,
        input_fasta_file=None,
        truth_vcf_file=None,
    )

    (args, unknown) = E.start(parser,
                              argv,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) == 1:
        args.input_vcf_file = unknown[0]

    if args.input_vcf_file is None:
        raise ValueError("please supply a VCF file")

    if args.truth_vcf_file is None:
        raise ValueError("please supply a VCF file with truth data")

    if args.input_fasta_file is None:
        raise ValueError("please supply a fasta file with the reference genome")

    if not os.path.exists(args.input_vcf_file):
        raise OSError("input vcf file {} does not exist".format(
            args.input_vcf_file))

    if not os.path.exists(args.input_vcf_file + ".tbi"):
        raise OSError("input vcf file {} needs to be indexed".format(
            args.input_vcf_file))

    if not os.path.exists(args.truth_vcf_file):
        raise OSError("truth vcf file {} does not exist".format(
            args.truth_vcf_file))

    if not os.path.exists(args.truth_vcf_file + ".tbi"):
        raise OSError("truth vcf file {} needs to be indexed".format(
            args.truth_vcf_file))

    if not os.path.exists(args.input_fasta_file):
        raise OSError("input fasta file {} does not exist".format(
            args.input_fasta_file))

    if not os.path.exists(args.input_fasta_file + ".fai"):
        raise OSError("input fasta file {} needs to be indexed".format(
            args.input_fasta_file))

    # update paths to absolute
    args.input_fasta_file = os.path.abspath(args.input_fasta_file)
    args.input_vcf_file = os.path.abspath(args.input_vcf_file)
    args.truth_vcf_file = os.path.abspath(args.truth_vcf_file)

    test_vcf = pysam.VariantFile(args.input_vcf_file)
    truth_vcf = pysam.VariantFile(args.truth_vcf_file)
    contigs = test_vcf.header.contigs
    truth_contigs = set(truth_vcf.header.contigs)

    test_vcf_samples = set(test_vcf.header.samples)
    truth_vcf_samples = set(truth_vcf.header.samples)

    common_samples = test_vcf_samples.intersection(truth_vcf_samples)
    if len(common_samples) == 0:
        raise ValueError("no common samples in test/truth VCFs")

    def pair_iterator(test_vcf, truth_vcf, contig):
        counter = E.Counter()
        test_iter = test_vcf.fetch(contig)
        truth_iter = truth_vcf.fetch(contig)

        try:
            test_record = next(test_iter)
            truth_record = next(truth_iter)
            while True:
                if test_record.pos < truth_record.pos:
                    test_record = next(test_iter)
                    continue

                elif test_record.pos > truth_record.pos:
                    truth_record = next(truth_iter)
                    continue

                elif len(test_record.alts) > 1:
                    counter.skip_multiallelic_test += 1
                    test_record = next(test_iter)
                    continue

                elif len(truth_record.alts) > 1:
                    counter.skip_multiallelic_truth += 1
                    truth_record = next(truth_iter)
                    continue

                elif test_record.alts != truth_record.alts:
                    counter.skip_genotype_difference += 1
                    test_record = next(test_iter)
                    truth_record = next(truth_iter)
                    continue

                if test_record.ref != truth_record.ref:
                    # todo: deal with indels
                    raise ValueError(
                        "mismatching reference bases at position "
                        "{}:{}".format(test_record.chrom, test_record.pos))

                yield test_record, truth_record
                test_record = next(test_iter)
                truth_record = next(truth_iter)

        except StopIteration:
            pass

        E.debug(str(counter))

    counters_per_contig = {}

    for contig in contigs:
        counter_contig = collections.defaultdict(E.Counter)
        counters_per_contig[contig] = counter_contig

        E.info("processing contig {}".format(contig))

        if contig not in truth_contigs:
            E.warn(
                "skipping contig {} as it is not in truth data".format(contig))
            continue

        switch = False
        last_is_unphased = True

        for test_record, truth_record in pair_iterator(test_vcf, truth_vcf, contig):

            for sample in common_samples:
                counter = counter_contig[sample]

                truth_phased = truth_record.samples[sample].phased
                test_phased = test_record.samples[sample].phased
                truth_genotype = truth_record.samples[sample]["GT"]
                test_genotype = test_record.samples[sample]["GT"]
                truth_alleles = set(truth_genotype)
                test_alleles = set(test_genotype)

                ignore = False
                if not truth_phased:
                    counter.truth_unphased += 1
                    ignore = True
                if not test_phased:
                    counter.test_unphased += 1
                    ignore = True
                    last_is_unphased = True
                else:
                    last_is_unphased = False

                if len(test_alleles) == 1:
                    counter.test_homozygous += 1
                    ignore = True
                else:
                    if not test_phased:
                        counter.test_unphased_hets += 1

                if len(truth_alleles) == 1:
                    counter.truth_homozygous += 1
                    ignore = True

                if ignore:
                    counter.ignore += 1
                    continue

                E.debug("comparing: {}:{} {} -> {}: {} {}".format(
                    test_record.chrom, test_record.pos,
                    test_record.ref, test_record.alts,
                    test_genotype,
                    truth_genotype))

                if switch:
                    truth_genotype = truth_genotype[::-1]

                counter.test_phased_hets += 1

                if truth_genotype != test_genotype:
                    if not last_is_unphased:
                        E.debug("SWITCH: {}".format(switch))
                        counter.switch += 1
                    switch = not switch

    outf = args.stdout
    outf.write("\t".join(("contig",
                          "sample",
                          "switch_error_percent",
                          "false_negative_rate",
                          "switches",
                          "test_phased_hets",
                          "test_unphased_hets",
                          "test_unphased",
                          "truth_unphased",
                          "test_homozygous",
                          "truth_homozygous")) + "\n")

    for contig, contig_dict in list(counters_per_contig.items()):
        for sample, c in list(contig_dict.items()):
            outf.write("\t".join(
                map(str, (
                    contig,
                    sample,
                    "{:6.4f}".format(100.0 * c.switch / (c.test_phased_hets + 1)),
                    "{:6.4f}".format(100.0 * c.test_unphased_hets /
                                     (c.test_phased_hets + c.test_unphased_hets)),
                    c.switch,
                    c.test_phased_hets,
                    c.test_unphased_hets,
                    c.test_unphased,
                    c.truth_unphased,
                    c.test_homozygous,
                    c.truth_homozygous))) + "\n")

    E.stop()
Example #10
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtools standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('bed',
                        type=argparse.FileType('r'),
                        help='RdTest-formatted bed file. '
                        '(chrom, start, end, name, samples, svtype)')
    parser.add_argument('samples',
                        help='List of all samples present in '
                        'variant callset.')
    parser.add_argument('fout',
                        help='Standardized VCF. Will be compressed '
                        'with bgzip and tabix indexed if filename ends with '
                        '.gz')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    # Get template header
    template = pkg_resources.resource_filename('svtools',
                                               'data/standard_template.vcf')
    template = pysam.VariantFile(template)
    header = template.header

    # Get list of samples
    with open(args.samples) as slist:
        samples = sorted([s.strip() for s in slist.readlines()])

    # Template header includes all necessary FILTER, INFO, and FORMAT fields
    # Just need to add list of samples
    for sample in samples:
        header.add_sample(sample)

    # Tag source in header
    meta = ('##FORMAT=<ID=depth,Number=1,Type=Integer,'
            'Description="Called by read-depth algorithms">')
    header.add_line(meta)
    header.add_line('##source=depth')

    if args.fout.endswith('.vcf.gz'):
        fname = os.path.splitext(args.fout)[0]
    elif args.fout.endswith('.vcf'):
        fname = args.fout
    else:
        msg = 'Invalid VCF filename; must end with .vcf or .vcf.gz: {0}'
        msg = msg.format(args.fout)
        raise ValueError(msg)

    fout = pysam.VariantFile(fname, mode='w', header=header)

    rdtest2vcf(args.bed, fout)

    # TODO: do this with subprocess so we don't have to write to disk twice
    if args.fout.endswith('.gz'):
        pysam.tabix_compress(fname, args.fout)
        pysam.tabix_index(args.fout, preset='vcf')
        os.remove(fname)
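
A hedged invocation sketch mirroring how a CLI wrapper might dispatch here (filenames are hypothetical; the template VCF resolves only when running inside the svtools package):

main(['depth_calls.bed', 'samples.list', 'standardized.vcf.gz'])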
Example #11
"""
Read the input vcf and use the model predictions to filter vcf by some threshold
"""
import sys
import pysam

vcf_file = sys.argv[1]
bed_file = sys.argv[2]
threshold = float(sys.argv[3])

predictions = {}
genotypes = {0: (0, 0), 1: (0, 1), 2: (1, 1)}

# go through the predictions bed file
# and get the probability distribution
for l in open(bed_file, 'r'):
    A = l.rstrip().split()
    key = '\t'.join(A[:3])
    predictions[key] = [float(x) for x in A[3:]]

# iterate over each record and keep only those whose predicted
# probability of being non-variant (genotype 0/0) is below the threshold
with pysam.VariantFile(vcf_file, 'rb') as VCF:
    print(str(VCF.header).strip())
    for variant in VCF:
        key = '\t'.join(
            [str(x) for x in [variant.contig, variant.pos, variant.stop]])
        if key in predictions:
            if predictions[key][0] < threshold:
                print(str(variant).rstrip())
Example #12
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-v', '--variants', required=True, help='Default VCF')
    parser.add_argument('-r', '--RDtest')
    parser.add_argument('-b', '--BAFtest')
    parser.add_argument('-s', '--SRtest')
    parser.add_argument('-p', '--PEtest')
    parser.add_argument('--batch-list', type=argparse.FileType('r'))
    parser.add_argument('--segdups', required=True)
    parser.add_argument('--rmsk', required=True)
    parser.add_argument('--fam')
    parser.add_argument('-d', '--bed', action='store_true', default=False)
    parser.add_argument('fout')
    args = parser.parse_args()

    if args.bed:
        # argparse always sets the attribute, so test for None rather than hasattr
        if args.batch_list is None:
            raise Exception('batch list must be specified when passing a bed')
        variants = open(args.variants)
        dtypes = 'RD BAF'.split()
    else:
        variants = pysam.VariantFile(args.variants)
        dtypes = 'PE SR RD BAF'.split()

    metadata = process_metadata(variants, args.bed, args.batch_list)

    # Calculate segdup coverage
    bt = pbt.BedTool.from_dataframe(metadata['chrom start end'.split()])
    segdups = pbt.BedTool(args.segdups)
    cov = bt.coverage(segdups).to_dataframe()
    metadata['poor_region_cov'] = cov.thickStart

    # Check if endpoints are in repeat-masked sequence
    starts = metadata['chrom start end name'.split()].copy()
    starts['end'] = starts['start'] + 1
    ends = metadata['chrom start end name'.split()].copy()
    ends['start'] = ends['end'] - 1
    endpoints = pd.concat([starts, ends])
    bt = pbt.BedTool.from_dataframe(endpoints)
    rmsk = pbt.BedTool(args.rmsk)
    sect = bt.intersect(rmsk, u=True)
    rmasked_names = [i.fields[3] for i in sect.intervals]
    metadata['rmsk'] = metadata.name.isin(rmasked_names)

    metadata = metadata.set_index('name')

    evidence = deque()

    for dtype in dtypes:
        dtable = getattr(args, dtype + 'test')
        if dtable is None:
            continue

        df = pd.read_table(dtable)

        df = preprocess(df, dtype)
        df = df.rename(columns=lambda c: dtype + '_' + c if c != 'name' else c)
        df = df.set_index('name')
        evidence.append(df)

    evidence = list(evidence)
    evidence = metadata.join(evidence, how='outer', sort=True)
    evidence = evidence.reset_index().rename(columns={'index': 'name'})

    has_petest = args.PEtest is not None
    has_srtest = args.SRtest is not None
    if not args.bed and has_petest and has_srtest:
        evidence = add_pesr(evidence)

    # Replace infinite log-pvals
    LOG_CEIL = 300
    evidence = evidence.replace(np.inf, LOG_CEIL)

    evidence = evidence.reindex(columns=make_columns())
    evidence.to_csv(args.fout, index=False, sep='\t', na_rep='NA')
Example #13
    def _parse_sam_file_and_vcf(cls, samfile, query_vcf_file, flank_length, allow_mismatches, exclude_regions=None, max_soft_clipped=3, number_ns=0):
        if exclude_regions is None:
            exclude_regions = {}

        found = []
        match_flag = []
        correct_allele = []
        gt_conf = []
        allele = []

        samfile_handle = pysam.AlignmentFile(samfile, "r")
        sam_previous_record_name = None
        for sam_record in samfile_handle.fetch(until_eof=True):
            if sam_record.query_name == sam_previous_record_name:
                continue
            sam_previous_record_name = sam_record.query_name
            found_conf = False
            found_allele = False

            # see if excluded region in bed file
            ref, start, ref_num, var_num, allele_num = sam_record.query_name.rsplit('.', maxsplit=4)
            start = int(start) + flank_length
            exclude = False
            for ref_name in exclude_regions.keys():
                end = start + 1
                interval = pyfastaq.intervals.Interval(start, end)
                # stop at the first excluded region the record falls into;
                # the original overwrote `exclude` on every iteration
                if EvaluateRecall._interval_intersects_an_interval_in_list(
                        interval, exclude_regions[ref_name]):
                    exclude = True
                    break
            if exclude:
                found.append('Exclude')
                gt_conf.append(0)
                allele.append('0')
                # keep the parallel lists aligned so the asserts below hold
                match_flag.append('Exclude')
                correct_allele.append('Exclude')
                continue

            match = EvaluateRecall._check_if_sam_match_is_good(sam_record,
                                                                    flank_length,
                                                                    query_sequence=sam_record.query_sequence,
                                                                    allow_mismatches=allow_mismatches,
                                                                    max_soft_clipped=max_soft_clipped)
            alignment_start = str(sam_record).split("\t")[3]
            match_flag.append(match)
            if match == 'Good':
                logging.debug('SAM record is a good match')
                logging.debug('SAM record reference is %s' %sam_record.reference_name)
                ref_name, expected_start, vcf_pos_index, vcf_record_index, allele_index = sam_record.reference_name.rsplit('.', maxsplit=4)

                vcf_reader = pysam.VariantFile(query_vcf_file)
                vcf_interval_start = int(expected_start) + int(alignment_start) + flank_length - 2 - number_ns
                vcf_interval_end = int(expected_start) + int(alignment_start) + flank_length - number_ns
                logging.debug('Find VCF records matching ref %s in interval [%i,%i]' %(ref_name, vcf_interval_start, vcf_interval_end))
                for i, vcf_record in enumerate(vcf_reader.fetch(ref_name, vcf_interval_start, vcf_interval_end)):
                    if i == int(vcf_pos_index):
                        sample_name = vcf_record.samples.keys()[0]
                        if 'GT' in vcf_record.format.keys() and len(set(vcf_record.samples[sample_name]['GT'])) == 1:
                            if int(allele_index) == int(vcf_record.samples[sample_name]['GT'][0]):
                                found.append('1')
                                allele.append(str(allele_index))
                                correct_allele.append('1')
                                found_allele = True
                                if 'GT_CONF' in vcf_record.format.keys():
                                    gt_conf.append(int(float(vcf_record.samples[sample_name]['GT_CONF'])))
                                    found_conf = True

            if not found_allele:
                found.append('0')
                allele.append('0')
                correct_allele.append('0')
            if not found_conf:
                gt_conf.append(0)
        assert len(found) == len(gt_conf)
        assert len(found) == len(allele)
        assert len(found) == len(match_flag)
        assert len(found) == len(correct_allele)
        return found, gt_conf, allele, match_flag, correct_allele
Example #14
if not os.path.isfile(args.vcf):
    print("Cannot find input file", args.vcf)
    sys.exit(1)
if not (os.path.isfile(args.vcf + ".tbi") or os.path.isfile(args.vcf + ".csi")):
    call(["bcftools", "index", args.vcf])

# Merge the file with ALT variants
alts='/home/mzarowiecki/scratch/REF/allASDPs.SNV.50_10.valid.vcf.gz'

call(["bcftools", "merge","--force-samples","-O","z","-o",args.vcf+".asdp.vcf.gz",args.vcf,alts])




# read the input file
myvcf = pysam.VariantFile(args.vcf + ".asdp.vcf.gz", "r")

# create the new review/VAF output files and open them to write data.
output = args.vcf + ".asdp.res.vcf.gz"
out = open(output + '.review', 'w')
vaf = open(output + '.vaf', 'w')

myvcf.header.info.add("ALT", "1", "String", "Is variant on ALT or primary")

# create an object for the new vcf file and open it to write data.
vcf_out = pysam.VariantFile(output, 'w', header=myvcf.header)


Example #15
def retrieve_entry_from_test_query_vcf(idx: int) -> pysam.VariantRecord:
    with pysam.VariantFile(TEST_QUERY_VCF) as vcf:
        for i, record in enumerate(vcf):
            if i == idx:
                return record
    raise IndexError("You asked for an index that is beyond the number in the test VCF")
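
Usage sketch (TEST_QUERY_VCF is assumed to be defined at module level; the index is zero-based):

rec = retrieve_entry_from_test_query_vcf(2)  # third record in the test VCF
print(rec.chrom, rec.pos, rec.ref, rec.alts)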
Example #16
def read_vcf(infile,
             sample_id=None,
             normal_id=None,
             min_depth=None,
             skip_reject=False,
             skip_somatic=False):
    """Read one tumor-normal pair or unmatched sample from a VCF file.

    By default, return the first tumor-normal pair or unmatched sample in the
    file.  If `sample_id` is a string identifier, return the (paired or single)
    sample matching that ID.  If `sample_id` is a positive integer, return the
    sample or pair at that index position, counting from 0.
    """
    # if isinstance(infile, basestring):
    #     vcf_reader = vcf.Reader(filename=infile)
    # else:
    #     vcf_reader = vcf.Reader(infile)
    try:
        vcf_reader = pysam.VariantFile(infile)
    except Exception as exc:
        raise ValueError("Must give a VCF filename, not open file handle: %s" %
                         exc)
    if vcf_reader.header.samples:
        sid, nid = _choose_samples(vcf_reader, sample_id, normal_id)
        logging.info("Selected test sample " + str(sid) +
                     (" and control sample %s" % nid if nid else ''))
        # NB: in-place
        vcf_reader.subset_samples(list(filter(None, (sid, nid))))
    else:
        logging.warn("VCF file %s has no sample genotypes", infile)
        sid = sample_id
        nid = None

    columns = [
        'chromosome', 'start', 'end', 'ref', 'alt', 'somatic', 'zygosity',
        'depth', 'alt_count'
    ]
    if nid:
        columns.extend(['n_zygosity', 'n_depth', 'n_alt_count'])

    rows = _parse_records(vcf_reader, sid, nid, skip_reject)
    table = pd.DataFrame.from_records(rows, columns=columns)
    table['alt_freq'] = table['alt_count'] / table['depth']
    if nid:
        table['n_alt_freq'] = table['n_alt_count'] / table['n_depth']
    table = table.fillna({col: 0.0 for col in table.columns[6:]})
    # Filter out records as requested
    cnt_depth = cnt_som = 0
    if min_depth:
        if table['depth'].any():
            dkey = 'n_depth' if 'n_depth' in table else 'depth'
            idx_depth = table[dkey] >= min_depth
            cnt_depth = (~idx_depth).sum()
            table = table[idx_depth]
        else:
            logging.warn("Depth info not available for filtering")
    if skip_somatic:
        idx_som = table['somatic']
        cnt_som = idx_som.sum()
        table = table[~idx_som]
    logging.info("Loaded %d records; skipped: %d somatic, %d depth",
                 len(table), cnt_som, cnt_depth)
    # return sid, nid, table
    return table
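
A hedged usage sketch (the path and sample names are hypothetical):

table = read_vcf('pair.vcf.gz',
                 sample_id='TUMOR',
                 normal_id='NORMAL',
                 min_depth=10,
                 skip_reject=True)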
Example #17
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s",
        "--sample-size",
        dest="sample_size",
        type="float",
        help="sample size. If less than 0, take a proportion of the "
        "chromosome size. If greater than 0, take a fixed number of "
        "variants [%default]")

    parser.add_option(
        "--input-filename-fasta",
        dest="input_filename_fasta",
        type="string",
        help="filename with reference sequence in fasta format [%default]")

    parser.add_option("--input-filename-bam",
                      dest="input_filename_bam",
                      type="string",
                      help="filename with aligned reads [%default]")

    parser.add_option("--no-vcf-columns",
                      dest="no_vcf_columns",
                      action="store_true",
                      help="do not output vcf columns")

    parser.add_option(
        "--counter",
        dest="counters",
        type="choice",
        action="append",
        choices=["context", "bam-indels", "bam-allelic-depth", "indel-type"],
        help="counters to apply [%default]")

    parser.set_defaults(
        input_filename_fasta=None,
        input_filename_bam=None,
        input_filename_vcf=None,
        sample_size=0.001,
        sample_name="NA12878",
        region_size=20,
        threshold_homopolymer=12,
        threshold_repeat=5,
        no_vcf_columns=False,
        counters=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) > 0:
        options.input_filename_vcf = args[0]

    vcf_in = pysam.VariantFile(options.input_filename_vcf)

    counters = []

    if options.input_filename_fasta:
        fasta = pysam.FastaFile(options.input_filename_fasta)
    else:
        fasta = None

    if options.input_filename_bam:
        bam = pysam.AlignmentFile(options.input_filename_bam)
    else:
        bam = None

    for counter in options.counters:
        if counter == "context":
            counters.append(CounterContext(fasta))
        elif counter == "bam-indels":
            counters.append(CounterBAMIndels(bam))
        elif counter == "bam-allelic-depth":
            counters.append(CounterBAMAllelicDepth(bam))
        elif counter == "indel-type":
            counters.append(CounterIndelType())

    outf = options.stdout
    if not options.no_vcf_columns:
        # the last header line of the VCF is the #CHROM line; drop the leading '#'
        header = str(vcf_in.header).strip().split("\n")[-1].strip()[1:].split("\t")
    else:
        header = ["chrom", "pos"]

    outf.write("\t".join(header))

    for counter in counters:
        outf.write("\t" + "\t".join(counter.header))

    outf.write("\n")
    for record in vcf_in:

        for counter in counters:
            counter.count(record)

        if not options.no_vcf_columns:
            outf.write("{}\t".format(str(record).strip()))
        else:
            outf.write("{}\t{}\t".format(record.chrom, record.pos))

        outf.write("\t".join(map(str, counters)) + "\n")

    E.stop()
Example #18
def recall_variants(args):

    variants, alignment_file_path, target_path, mode, germline_variants_path, germline_variants_sample, germline_bam_path, window_radius, MAX_REF_MOLECULES, max_buffer_size = args

    # NOTE: the two values below override whatever was unpacked from args above
    window_radius = 600
    MAX_REF_MOLECULES = 1_000  # Maximum number of reference molecules to process.
    # This is capped for regions to which many reads map (mapping artefact)

    variant_calls = dict()  # cell->(chrom,pos) +/- ?

    ### Set up molecule iterator (1/2)
    if mode == 'NLA':
        mc = NlaIIIMolecule
        fc = NLAIIIFragment
    else:
        mc = Molecule
        fc = Fragment

    ###
    locations_done = set()
    alignments = pysam.AlignmentFile(alignment_file_path, threads=4)
    if germline_bam_path is not None:
        germline_alignments = pysam.AlignmentFile(germline_bam_path, threads=4)

    for variant in variants:

        # Check if the variant is present in the germline bam file (if supplied)
        if germline_bam_path is not None and has_variant_reads(
                germline_alignments,
                variant.chrom,
                variant.pos - 1,
                variant.alts[0],
                min_reads=1,
                stepper='nofilter'):
            print(f'FOUND IN GERMLINE {variant}')
            continue

        #print(variant)
        overlap = False
        reference_start = max(0, variant.pos - window_radius)
        reference_end = variant.pos + window_radius
        contig = variant.contig

        variant_key = (contig, variant.pos, variant.ref, variant.alts[0])

        #print(contig,reference_start,reference_end,variant.alts[0],variant.ref)
        ### Set up allele resolver
        unphased_allele_resolver = singlecellmultiomics.alleleTools.AlleleResolver(
            use_cache=False, phased=False, verbose=True)

        if germline_variants_path is not None:
            with pysam.VariantFile(germline_variants_path) as germline:
                for i, ar_variant in enumerate(
                        germline.fetch(variant.chrom, reference_start,
                                       reference_end)):

                    if germline_variants_sample is None:
                        # If any of the samples is not heterozygous: continue
                        if any(
                                len(set(ar_variant.samples[sample].alleles)) != 2
                                for sample in ar_variant.samples):
                            continue
                    elif len(set(ar_variant.samples[germline_variants_sample].alleles)) != 2:
                        continue
                    unphased_allele_resolver.locationToAllele[
                        ar_variant.chrom][ar_variant.pos - 1] = {
                            ar_variant.alleles[0]: {'U'},
                            ar_variant.alleles[1]: {'V'}
                        }
        ####

        ref_phased = Counter()
        alt_phased = Counter()

        ### Set up molecule iterator (2/2)
        try:
            molecule_iter = MoleculeIterator(alignments,
                                             mc,
                                             fc,
                                             contig=contig,
                                             start=reference_start,
                                             end=reference_end,
                                             molecule_class_args={
                                                 'allele_resolver':
                                                 unphased_allele_resolver,
                                                 'max_associated_fragments':
                                                 20,
                                             },
                                             max_buffer_size=max_buffer_size)

            reference_called_molecules = []  # molecule, phase

            extracted_base_call_count = 0
            alt_call_count = 0
            for mi, molecule in enumerate(molecule_iter):
                base_call = get_molecule_base_calls(molecule, variant)
                if base_call is None:
                    continue
                extracted_base_call_count += 1
                base, quality = base_call
                call = None
                if base == variant.alts[0]:
                    call = 'A'
                    alt_call_count += 1
                    if molecule.sample not in variant_calls:
                        variant_calls[molecule.sample] = {}
                    variant_calls[molecule.sample][variant_key] = 1

                elif base == variant.ref:
                    call = 'R'

                if call is None:
                    continue

                # Obtain all germline variants which are phased :
                phased = get_phased_variants(molecule,
                                             unphased_allele_resolver)

                if call == 'R' and len(phased) > 0:
                    # If we can phase the alternative allele to a germline variant
                    # the reference calls can indicate absence
                    if len(reference_called_molecules) < MAX_REF_MOLECULES:
                        reference_called_molecules.append((molecule, phased))

                for chrom, pos, base in phased:
                    if call == 'A':
                        alt_phased[(chrom, pos, base)] += 1
                    elif call == 'R':
                        ref_phased[(chrom, pos, base)] += 1
        except MemoryError:
            print(f"Buffer exceeded for {variant.contig} {variant.pos}")
            continue

        #print(mi,extracted_base_call_count,alt_call_count)
        if len(alt_phased) > 0 and len(reference_called_molecules):
            # Clean the alt_phased variants for variants which are not >90% the same
            alt_phased_filtered = filter_alt_calls(alt_phased, 0.9)
            #print(alt_phased_filtered)
            for molecule, phased_gsnvs in reference_called_molecules:
                for p in phased_gsnvs:
                    if p in alt_phased_filtered:
                        if molecule.sample not in variant_calls:
                            variant_calls[molecule.sample] = {}
                        variant_calls[molecule.sample][variant_key] = 0
                        break
        locations_done.add(variant_key)
    alignments.close()
    return variant_calls, locations_done
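The worker returns variant_calls as {sample: {variant_key: 1 if a molecule supports the alternative allele, 0 if phased reference evidence indicates its absence}}. A minimal tabulation sketch (pandas assumed; 'worker' and 'job' are placeholders for names outside this excerpt):

import pandas as pd

# hypothetical caller: 'worker' stands for the function above
variant_calls, locations_done = worker(job)
calls = pd.DataFrame(variant_calls).T.fillna(-1)  # rows: samples, columns: variant keys; -1 = no call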
Example #19
0
get_conf_int.output_non_cov_call_info(output_dir, SV_positions_file,
                                      assem1_non_cov_regions_file,
                                      assem2_non_cov_regions_file)

#get filtered sv info, using results from get_conf_int.py
exclude_assem1_non_cover, exclude_assem2_non_cover = validate.get_filtered_sv_pos(
    output_dir + "exclude_assem1_non_cover.bed",
    output_dir + "exclude_assem2_non_cover.bed")

dict_centromere = validate.build_centro_dict(centromere_file)

##################################################################################
##################################################################################
#index SVs

f = pysam.VariantFile(vcf_file, 'r')
sv_list = []
for count, rec in enumerate(f.fetch()):
    #get sv_type
    try:
        sv_type = rec.info['SVTYPE']
    except KeyError:
        print("invalid SV type info")
        continue

    if first_filter(rec, sv_type):
        continue

    #get sv length
    if sv_type == 'INV':
        sv_len = abs(rec.stop - rec.pos + 1)
Example #20
0
def job_gen(induced_variants_path,
            germline_variants_path,
            germline_variants_sample,
            alignments_path,
            block_size=100,
            n=None,
            contig=None,
            completed=None,
            min_qual=None,
            germline_bam_path=None,
            MAX_REF_MOLECULES=1000,
            window_radius=600,
            max_buffer_size=100_000):
    """
    Job generator

    block_size(int) : variants per block
    n(int) : number of blocks to generate
    min_qual(float) : minimum quality score of variants to process
    contig : contig to generate jobs for
    completed(set) : set of (chrom, pos) locations to skip
    """

    i = 0
    with pysam.VariantFile(induced_variants_path,
                           ignore_truncation=True) as sc_calls:

        vlist = []
        for record in sc_calls:
            if contig is not None and record.chrom != contig:
                continue

            if completed is not None and (record.chrom,
                                          record.pos) in completed:
                continue

            if min_qual is not None and record.qual < min_qual:
                continue

            if len(record.alts[0]) != 1 or len(record.ref) != 1:
                continue

            vlist.append(VariantWrapper(record))

            if len(vlist) >= block_size:
                #f'./{extraction_folder}/variants_extracted_0_NLA_{i}.bam'
                yield (vlist, alignments_path, None, 'NLA',
                       germline_variants_path, germline_variants_sample,
                       germline_bam_path, window_radius, MAX_REF_MOLECULES,
                       max_buffer_size)

                vlist = []
                i += 1
                if n is not None and i >= n:
                    break
        if len(vlist):
            yield (vlist, alignments_path, None, 'NLA', germline_variants_path,
                   germline_variants_sample, germline_bam_path, window_radius,
                   MAX_REF_MOLECULES, max_buffer_size)
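A consumption sketch (paths and run_job are hypothetical, not from the original): each yielded tuple packs the arguments for one phasing job, so the blocks parallelize naturally.

from multiprocessing import Pool

jobs = job_gen('induced.vcf.gz', 'germline.vcf.gz', 'SAMPLE1',
               'cells.bam', block_size=100, min_qual=30)
with Pool(4) as pool:
    # run_job (hypothetical) would unpack one argument tuple, run the
    # phasing worker and return (variant_calls, locations_done)
    for variant_calls, locations_done in pool.imap_unordered(run_job, jobs):
        pass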
Example #21
0
introns = annot.features.introns()

features = {
    k: annot[annot.Feature == k].drop()
    for k in ['CDS', 'five_prime_utr', 'three_prime_utr']
}
features['intron'] = introns.drop()

# high confidence regions
highc = pyranges.read_bed(snakemake.input.hc_bed)

# exome sequencing target regions
exometarg = pyranges.read_bed(snakemake.input.es_bed)

# load the variants into a pyranges object
prpm = vcf_to_pyranges(pysam.VariantFile(snakemake.input.vcf),
                       tmpfile=snakemake.output.tsv + '_tmp.bed')
# count overlaps to different feature types
prpm = pyranges.count_overlaps(features, prpm)

# annotation by majority vote
main_anno = np.argmax(
    prpm.as_df()[['CDS', 'five_prime_utr', 'three_prime_utr',
                  'intron']].values,
    axis=1)
d = {
    i: k
    for i, k in enumerate(
        ['CDS', 'five_prime_utr', 'three_prime_utr', 'intron'])
}
main_anno = pd.Series(main_anno).map(d)
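As a toy illustration of the overlap-counting step (made-up coordinates; the older pyranges constructor API is assumed, matching the snippet above):

import pyranges

cds = pyranges.PyRanges(chromosomes=['chr1'], starts=[100], ends=[200])
snvs = pyranges.PyRanges(chromosomes=['chr1', 'chr1'],
                         starts=[150, 300], ends=[151, 301])
# the 'CDS' column is 1 for the first variant and 0 for the second
print(pyranges.count_overlaps({'CDS': cds}, snvs).as_df())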
Example #22
0
                    help="VCF file")

# Check for no input
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)

args = parser.parse_args()

# Check if input files exist
if not os.path.isfile(args.vcf):
    print("Cannot find input file ", args.vcf)
    sys.exit(1)

# read the input file
myvcf = pysam.VariantFile(args.vcf, "r")

# create an object of new bed file and open in to write data.
output = args.vcf + ".bed"
out = open(output, 'w')

for r in myvcf:

    #### FILTER OUT #####
    # Shared called total
    # Filter out sites which
    chrom = r.chrom
    pos = r.pos
    var_id = str(r.id)
    varID = ':'.join(var_id.split(':')[:2])
    #altb = r.ref
Example #23
0
def pe_test(argv):
    parser = argparse.ArgumentParser(
        description=
        "Calculate enrichment of discordant pairs at SV breakpoints.",
        prog='svtk pe-test',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Variants.')
    parser.add_argument('disc', help='Table of discordant pair coordinates.')
    parser.add_argument('fout',
                        type=argparse.FileType('w'),
                        help='Output table of PE counts.')
    parser.add_argument('-o',
                        '--window-out',
                        type=int,
                        default=500,
                        help='Window outside breakpoint to query for '
                        'discordant pairs. [500]')
    parser.add_argument('-i',
                        '--window-in',
                        type=int,
                        default=50,
                        help='Window inside breakpoint to query for '
                        'discordant pairs. [50]')
    parser.add_argument('-b',
                        '--background',
                        type=int,
                        default=160,
                        help='Number of background samples to sample for PE '
                        'evidence. [160]')
    parser.add_argument('--common',
                        default=False,
                        action='store_true',
                        help='Ignore background for common AF')
    parser.add_argument('-s',
                        '--samples',
                        type=argparse.FileType('r'),
                        default=None,
                        help='Whitelist of samples to restrict testing to.')
    parser.add_argument(
        '--index',
        default=None,
        help='Tabix index of discordant pair file. Required if '
        'discordant pair file is hosted remotely.')
    parser.add_argument('--medianfile',
                        default=None,
                        help='Median coverage statistics for each library '
                        '(optional). If provided, each sample\'s split '
                        'counts will be normalized accordingly. '
                        'Same format as RdTest, one column per sample.')
    parser.add_argument('--log',
                        action='store_true',
                        default=False,
                        help='Print progress log to stderr.')

    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if args.vcf in '- stdin'.split():
        vcf = pysam.VariantFile(sys.stdin)
    else:
        vcf = pysam.VariantFile(args.vcf)

    if args.fout in '- stdout'.split():
        fout = sys.stdout
    else:
        fout = args.fout

    header = 'name log_pval called_median bg_median bg_frac'.split()
    fout.write('\t'.join(header) + '\n')

    if args.samples is not None:
        whitelist = [s.strip() for s in args.samples.readlines()]
    else:
        whitelist = None

    if args.index is not None:
        discfile = pysam.TabixFile(args.disc, index=args.index)
    else:
        if args.disc.startswith('http'):
            raise Exception('Must provide tabix index with remote URL')
        discfile = pysam.TabixFile(args.disc)

    if args.medianfile is not None:
        medians = pd.read_table(args.medianfile)
        medians = pd.melt(medians, var_name='sample', value_name='median_cov')
    else:
        medians = None

    runner = PETestRunner(vcf,
                          discfile,
                          fout,
                          args.background,
                          args.common,
                          args.window_in,
                          args.window_out,
                          whitelist,
                          medians=medians,
                          log=args.log)

    runner.run()
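For reference, a typical invocation of this subcommand might look like (file names illustrative):

    svtk pe-test variants.vcf.gz discordant_pairs.txt.gz pe_counts.txt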
Example #24
0
def read_vcf_samples(vcf_filename):
    vcf = ps.VariantFile(str(vcf_filename))
    return vcf.header.samples
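A usage sketch (path illustrative; note the function requires pysam imported as ps):

import pysam as ps

for sample in read_vcf_samples('calls.vcf.gz'):
    print(sample)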
Example #25
0
def count_pe(argv):
    parser = argparse.ArgumentParser(
        description="Count discordant pairs supporting a SV breakpoints.",
        prog='svtk count-pe',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Variants.')
    parser.add_argument('disc', help='Table of discordant pair coordinates.')
    parser.add_argument('fout',
                        type=argparse.FileType('w'),
                        help='Output table of PE counts.')
    parser.add_argument('-o',
                        '--window-out',
                        type=int,
                        default=500,
                        help='Window outside breakpoint to query for '
                        'discordant pairs. [500]')
    parser.add_argument('-i',
                        '--window-in',
                        type=int,
                        default=50,
                        help='Window inside breakpoint to query for '
                        'discordant pairs. [50]')
    parser.add_argument('--common',
                        default=False,
                        action='store_true',
                        help='Ignore background for common AF')
    parser.add_argument('-s',
                        '--samples',
                        type=argparse.FileType('r'),
                        default=None,
                        help='Whitelist of samples to restrict testing to.')
    parser.add_argument(
        '--index',
        default=None,
        help='Tabix index of discordant pair file. Required if '
        'discordant pair file is hosted remotely.')
    parser.add_argument('--medianfile',
                        default=None,
                        help='Median coverage statistics for each library '
                        '(optional). If provided, each sample\'s split '
                        'counts will be normalized accordingly. '
                        'Same format as RdTest, one column per sample.')

    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if args.vcf in '- stdin'.split():
        vcf = pysam.VariantFile(sys.stdin)
    else:
        vcf = pysam.VariantFile(args.vcf)

    if args.fout in '- stdout'.split():
        fout = sys.stdout
    else:
        fout = args.fout

    header = 'name sample count'.split()
    fout.write('\t'.join(header) + '\n')

    if args.samples is not None:
        whitelist = [s.strip() for s in args.samples.readlines()]
    else:
        whitelist = [s for s in vcf.header.samples]

    if args.index is not None:
        discfile = pysam.TabixFile(args.disc, index=args.index)
    else:
        if args.disc.startswith('http'):
            raise Exception('Must provide tabix index with remote URL')
        discfile = pysam.TabixFile(args.disc)

    if args.medianfile is not None:
        medians = pd.read_table(args.medianfile)
        medians = pd.melt(medians, var_name='sample', value_name='median_cov')
    else:
        medians = None

    petest = PETest(discfile,
                    args.common,
                    args.window_in,
                    args.window_out,
                    medians=medians)

    for record in vcf:
        counts = petest.load_counts(record, args.window_in, args.window_out)
        counts = petest.normalize_counts(counts)
        counts = counts.set_index('sample')
        counts = counts.reindex(whitelist).fillna(0).astype(int)
        counts = counts.reset_index()
        counts['name'] = record.id
        cols = 'name sample count'.split()

        # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the replacement
        for row in counts[cols].to_numpy():
            fout.write('\t'.join([str(x) for x in row]) + '\n')
Example #26
0
def read_octopus_header_info(vcf_filename):
    vcf = ps.VariantFile(str(vcf_filename))
    for record in vcf.header.records:
        if record.key == "octopus":
            return dict(record)
    return None
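A usage sketch (path illustrative; the available keys depend on the octopus version that wrote the header):

info = read_octopus_header_info('octopus.vcf.gz')
if info is not None:
    print(info.get('version'))  # 'version' is an assumed key; inspect info.keys() in practice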
Example #27
0
def format_gdc_vcf(
    input_vcf: str,
    output_vcf: str,
    patient_barcode: str,
    case_id: str,
    tumor_barcode: str,
    tumor_aliquot_uuid: str,
    tumor_bam_uuid: str,
    normal_barcode: str,
    normal_aliquot_uuid: str,
    normal_bam_uuid: str,
    *,
    reference_name: str = "GRCh38.d1.vd1.fa",
) -> None:
    """
    Adds VCF header metadata specific to the GDC.

    :param input_vcf: The input VCF file to format.
    :param output_vcf: The output formatted VCF file to create. BGzip and tabix-index created if ends with '.gz'.
    :param patient_barcode: The case submitter id.
    :param case_id: The case uuid.
    :param tumor_barcode: The tumor aliquot submitter id.
    :param tumor_aliquot_uuid: The tumor aliquot uuid.
    :param tumor_bam_uuid: The tumor bam uuid.
    :param normal_barcode: The normal aliquot submitter id.
    :param normal_aliquot_uuid: The normal aliquot uuid.
    :param normal_bam_uuid: The normal bam uuid.
    :param reference_name: Reference name to use in header.
    """
    logger = Logger.get_logger("format_gdc_vcf")
    logger.info("Format GDC tumor/normal paired VCFs.")

    # setup
    reader = pysam.VariantFile(input_vcf)
    mode = get_pysam_outmode(output_vcf)

    # Load new header
    new_header = build_header(
        reader,
        patient_barcode,
        case_id,
        tumor_barcode,
        tumor_aliquot_uuid,
        tumor_bam_uuid,
        normal_barcode,
        normal_aliquot_uuid,
        normal_bam_uuid,
        reference_name,
    )

    writer = pysam.VariantFile(output_vcf, mode=mode, header=new_header)

    # Process
    try:
        for record in reader.fetch():
            writer.write(record)
    finally:
        reader.close()
        writer.close()

    if mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)
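A call sketch with placeholder identifiers (all values hypothetical):

format_gdc_vcf(
    input_vcf='raw.vcf.gz',
    output_vcf='gdc.vcf.gz',          # '.gz' suffix triggers bgzip + tabix indexing
    patient_barcode='TCGA-XX-0000',
    case_id='case-uuid',
    tumor_barcode='TCGA-XX-0000-01A',
    tumor_aliquot_uuid='tumor-aliquot-uuid',
    tumor_bam_uuid='tumor-bam-uuid',
    normal_barcode='TCGA-XX-0000-10A',
    normal_aliquot_uuid='normal-aliquot-uuid',
    normal_bam_uuid='normal-bam-uuid',
)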
Example #28
0
def main():

    description = 'Process a .gvcf file to create a file of consensus ' \
                  'variants, low-frequency variants and a coverage mask'
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('-m',
                        '--mask-output',
                        required=True,
                        help="The output file name for the coverage mask")

    parser.add_argument('-v',
                        '--variants-output',
                        required=True,
                        help="The output file name for variants "
                             "(non-reference gVCF records)")

    parser.add_argument('-c',
                        '--consensus-sites-output',
                        required=True,
                        help="The output file name for variants that will "
                             "be applied to generate the consensus sequence")

    parser.add_argument('-d',
                        '--min-depth',
                        type=int,
                        default=10,
                        help="Mask reference positions with depth less than "
                             "this threshold")

    parser.add_argument('-l',
                        '--lower-ambiguity-frequency',
                        type=float,
                        default=0.15,
                        help="Variants with frequency less than -l will be "
                             "discarded")

    parser.add_argument('-u',
                        '--upper-ambiguity-frequency',
                        type=float,
                        default=0.75,
                        help="Substitution variants with frequency less than "
                             "-u will be encoded with IUPAC ambiguity codes")

    parser.add_argument('file', action='store', nargs=1)

    args = parser.parse_args()
    vcf = pysam.VariantFile(args.file[0])

    # Initialize depth mask to all zeros for all contigs
    contig_depth = defaultdict(list)
    for r in vcf.header.records:
        if r.type == "CONTIG":
            contig_depth[r['ID']] = [0] * int(r['length'])

    out_header = vcf.header

    # open the output file with the filtered variant sites
    out_header.info.add("VAF",
                        number="A",
                        type='Float',
                        description="Variant allele fraction, called "
                        "from observed reference/alt "
                        "reads")
    variants_out = pysam.VariantFile(args.variants_output,
                                     'w',
                                     header=out_header)

    # open the output file with the changes to apply to the consensus
    # fasta this includes an additional tag in the VCF file
    out_header.info.add("ConsensusTag",
                        number=1,
                        type='String',
                        description="The type of base to be included "
                        "in the consensus sequence (IUPAC"
                        " or Fixed)")
    consensus_sites_out = pysam.VariantFile(args.consensus_sites_output,
                                            'w',
                                            header=out_header)

    for record in vcf:

        is_gvcf_ref = record.alts[0] == "<*>"

        # set depth for this part of the genome
        # this works for both gVCF blocks and regular variants
        # because pos/stop are set appropriately
        v_start = record.pos
        v_end = record.stop
        depth = record.info["DP"]

        # disallow gvcf records that are longer than a single base
        assert (not is_gvcf_ref or v_start == v_end)

        # update depth mask
        for i in range(v_start, v_end + 1):
            assert (i > 0)
            # VCF coordinates are 1-based, we record the depth vector
            # as 0-based to be consistent with artic-mask
            contig_depth[record.chrom][i - 1] = depth

        # do nothing else with ref records, or records that don't
        # meet our minimum depth
        if is_gvcf_ref or depth < args.min_depth:
            continue

        # determine if any allele in the variant is an indel
        has_indel = False
        for i in range(0, len(record.alts)):
            has_indel = has_indel or len(record.ref) != len(record.alts[i])

        # process the input variant record to handle multi-allelic
        # variants and MNPs
        out_records = list()
        if has_indel:
            # indels need to be handle specially as we can't apply
            # ambiguity codes
            out_records = handle_indel(out_header, record)
        else:
            out_records = handle_sub(out_header, record)

        # classify variants using VAF cutoffs for IUPAC ambiguity
        # codes, etc
        accept_variant = False
        for out_r in out_records:

            # at this point we should have resolved multi-allelic
            # variants
            assert (len(out_r.alts) == 1)

            vaf = out_r.info["VAF"][0]
            is_indel = len(out_r.ref) != len(out_r.alts[0])

            # discard low frequency variants
            if vaf < args.lower_ambiguity_frequency:
                continue

            # Write a tag describing what to do with the variant
            consensus_tag = "None"

            # high-frequency subs and indels are always applied
            # without ambiguity
            # we don't have to do an indel VAF check here as it is
            # dealt with in handle_indel
            if vaf > args.upper_ambiguity_frequency or is_indel:
                # always apply these to the consensus
                consensus_tag = "fixed"
            else:
                # record ambiguous SNPs in the consensus sequence
                # with IUPAC codes
                consensus_tag = "ambiguous"
            out_r.info["ConsensusTag"] = consensus_tag
            consensus_sites_out.write(out_r)
            accept_variant = True

        if accept_variant:
            record.info["VAF"] = calculate_vafs(record)
            variants_out.write(record)

    write_depth_mask(args.mask_output, contig_depth, args.min_depth)
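Assuming the module is saved as process_gvcf.py (the file name is hypothetical), a representative run:

    python process_gvcf.py -d 10 -l 0.15 -u 0.75 \
        -m coverage_mask.txt -v variants.vcf -c consensus_sites.vcf sample.gvcf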
Example #29
0
        return None, rec.info["DP"], af
    else:
        alt_counts = None
    if alt_counts is None or depth is None or depth == 0:
        return None, None, None
    else:
        freq = float(alt_counts) / float(depth)
        return alt_counts, depth, freq

def _cur_workdir(data):
    return utils.safe_makedir(os.path.join(data["dirs"]["work"], "heterogeneity",
                                           dd.get_sample_name(data), "bubbletree"))

if __name__ == "__main__":
    import sys
    bcf_in = pysam.VariantFile(sys.argv[1])
    somatic = collections.namedtuple("Somatic", "normal_name,tumor_name")
    params = {"min_freq": 0.4,
              "max_freq": 0.6,
              "min_depth": 15}
    for rec in bcf_in:
        if _is_possible_loh(rec, bcf_in, params, somatic(sys.argv[2], sys.argv[3])):
            print(rec.filter.keys(), len(rec.filter))

_script = """
.libPaths(c("{local_sitelib}"))
library(BubbleTree)
library(GenomicRanges)
library(ggplot2)

vc.df = read.csv("{vcf_csv}", header=T)
Example #30
0
    def openFile(self, filename):
        return pysam.VariantFile(filename)