Example #1
    def simulate_read(self):
        """Function that simulates perfect paired-end reads"""

        fastafile = ps.FastaFile(self.genome_fa)
        # left split read

        insert = int(
            np.random.normal(self.insert_size, self.insert_size / 12))
        start = int(
            np.random.randint(self.chr_pos_start, (self.chr_pos_end + 1)))
        left_end = start + self.read_length
        total_end = start + int(np.round(insert))
        right_start = total_end - self.read_length
        if total_end > self.chr_pos_end:
            # split read scenario or insert spanning split read scenario
            if left_end > self.chr_pos_end:
                # left read spanning split read scenario
                # left_read
                left_dntps = self.chr_pos_end - start
                right_dntps = self.read_length - left_dntps

                # the error could be here
                left_split_read = fastafile.fetch(self.chr, start,
                                                  self.chr_pos_end)
                right_split_read = fastafile.fetch(
                    self.chr, self.chr_pos_start,
                    (self.chr_pos_start + right_dntps))
                left_read = left_split_read + right_split_read

                # right_read
                right_start = self.chr_pos_start + int(
                    round(insert - left_dntps - self.read_length))
                right_read = fastafile.fetch(self.chr, right_start,
                                             (right_start + self.read_length))

                # assertion to check the error here

                common_id = "%s|%s|%s:%s-%s:%s|%s:%s|1|%s" % (
                    self.read_number, self.chr, start, self.chr_pos_end,
                    self.chr_pos_start,
                    (self.chr_pos_start + right_dntps), right_start,
                    (right_start + self.read_length), self.circle_id)

            else:
                if right_start > self.chr_pos_end:
                    # insert spanning split read scenario
                    left_read = fastafile.fetch(self.chr, start,
                                                (start + self.read_length))
                    right_start = self.chr_pos_start + (right_start -
                                                        self.chr_pos_end)
                    right_read = fastafile.fetch(
                        self.chr, right_start,
                        (right_start + self.read_length))
                    common_id = "%s|%s|%s:%s|%s:%s|3|%s" % (
                        self.read_number, self.chr, start,
                        (start + self.read_length), right_start,
                        (right_start + self.read_length), self.circle_id)
                else:
                    # right split read scenario
                    assert right_start <= self.chr_pos_end
                    assert (right_start + self.read_length) > self.chr_pos_end
                    left_read = fastafile.fetch(self.chr, start,
                                                (start + self.read_length))

                    # compute right dntps
                    left_dntps = self.chr_pos_end - right_start
                    right_dntps = self.read_length - left_dntps
                    left_split_read = fastafile.fetch(self.chr, right_start,
                                                      self.chr_pos_end)
                    right_split_read = fastafile.fetch(
                        self.chr, self.chr_pos_start,
                        (self.chr_pos_start + right_dntps))
                    right_read = left_split_read + right_split_read
                    common_id = "%s|%s|%s:%s|%s:%s-%s:%s|2|%s" % (
                        self.read_number, self.chr, start,
                        (start + self.read_length), right_start,
                        self.chr_pos_end, self.chr_pos_start,
                        (self.chr_pos_start + right_dntps), self.circle_id)

        else:
            # non split read scenario
            left_read = fastafile.fetch(self.chr, start,
                                        (start + self.read_length))
            # correct right read start
            right_read = fastafile.fetch(self.chr, right_start,
                                         (right_start + self.read_length))
            common_id = "%s|%s|%s:%s|%s:%s|0|%s" % (
                self.read_number, self.chr, start,
                (start + self.read_length), right_start,
                (right_start + self.read_length), self.circle_id)

        return (right_read, left_read, common_id)
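
Every branch above reduces to the same primitive: a fetch that runs past the circle's end and wraps back to its start. A minimal sketch of that idea, assuming an indexed FASTA; fetch_circular and the coordinates in the usage comment are illustrative, not part of the class above:

import pysam

def fetch_circular(fasta, chrom, circle_start, circle_end, start, length):
    """Fetch `length` bases from a circular template, wrapping at circle_end.

    circle_start/circle_end delimit the circle on the linear reference
    (0-based, end-exclusive), mirroring chr_pos_start/chr_pos_end above.
    """
    end = start + length
    if end <= circle_end:
        return fasta.fetch(chrom, start, end)
    # Split fetch: tail of the circle, then wrap around to its beginning.
    left = fasta.fetch(chrom, start, circle_end)
    right = fasta.fetch(chrom, circle_start, circle_start + (end - circle_end))
    return left + right

# Hypothetical usage: 50 of the 100 requested bases wrap past the circle end.
# fa = pysam.FastaFile("genome.fa")
# read_seq = fetch_circular(fa, "chr1", 1000, 2000, 1950, 100)
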
Example #2
        for i, ((chrom, snv_pos),
                gsnv_pos) in enumerate(probed_variants.items()):
            arguments = " ".join([
                x for x in sys.argv if x != '--cluster' and '.bed' not in x
                and '-ssnv' != x and '-gsnv' != x
            ])

            job_name = f'vstat_{i}'
            out_folder = './variantStats'
            if not os.path.exists(out_folder):
                os.makedirs(out_folder)
            print(
                'submission.py' +
                f' -y --py36 -time 50 -t 1 -m 50 -N {job_name} "{arguments} -ssnv {chrom}:{snv_pos} -gsnv {chrom}:{gsnv_pos} -prefix {out_folder}/{chrom}_{snv_pos}" '
            )
        exit()
    reference = pysamiterators.CachedFasta(pysam.FastaFile(args.reference))

    cell_obs = collections.defaultdict(
        lambda: collections.defaultdict(collections.Counter))
    statistics = collections.defaultdict(
        lambda: collections.defaultdict(collections.Counter))
    cell_call_data = collections.defaultdict(dict)  # location->cell->haplotype
    haplotype_scores = {}

    read_groups = set()  # Store unique read groups in this set
    with sorted_bam_file(f'{args.prefix}_evidence.bam',
                         origin_bam=pysam.AlignmentFile(
                             paths[0],
                             ignore_truncation=args.ignore_bam_issues),
                         read_groups=read_groups) as out:
Example #3
def run_tfbscan(args):

	###### Check input arguments ######
	check_required(args, ["motifs", "fasta"])				#Check input arguments
	check_files([args.motifs, args.fasta, args.regions]) 	#Check if files exist

	##Test input
	if args.outdir != None and args.outfile != None:								#Error - both set
		sys.exit("ERROR: Please choose either --outdir or --outfile")
	elif args.outfile == None: 	#Separate files (default or user-given --outdir)
		args.outdir = "tfbscan_output/" if args.outdir == None else args.outdir
		make_directory(args.outdir) #Check and create output directory
	elif args.outdir == None and args.outfile != None: 								#Joined file
		check_files([args.outfile], "w")


	###### Create logger and write argument overview ######
	logger = TobiasLogger("TFBScan", args.verbosity)
	logger.begin()
	parser = add_tfbscan_arguments(argparse.ArgumentParser())
	
	logger.arguments_overview(parser, args)
	
	if args.outfile != None:
		logger.output_files([args.outfile])

	######## Read sequences from file and estimate background gc ########
	
	logger.info("Handling input files")
	logger.info("Reading sequences from fasta")

	fastafile = pysam.FastaFile(args.fasta)
	fasta_chrom_info = dict(zip(fastafile.references, fastafile.lengths))
	fastafile.close()
	logger.stats("- Found {0} sequences in fasta".format(len(fasta_chrom_info)))
	
	#Create regions available in fasta 	
	logger.info("Setting up regions")
	fasta_regions = RegionList([OneRegion([header, 0, fasta_chrom_info[header]]) for header in fasta_chrom_info])

	#If subset, setup regions
	if args.regions:
		regions = RegionList().from_bed(args.regions)

	else:	#set up regions from fasta references
		regions = fasta_regions
		regions = regions.apply_method(OneRegion.split_region, 1000000)	
		regions = regions.apply_method(OneRegion.extend_reg, 50)		#extend to overlap at junctions	

	#Clip regions at chromosome boundaries
	regions = regions.apply_method(OneRegion.check_boundary, fasta_chrom_info, "cut")
	if len(regions) == 0:
		logger.error("No regions found.")
		sys.exit()
	logger.info("- Total of {0} regions (after splitting)".format(len(regions)))
	
	#Background gc
	if args.gc == None:
		logger.info("Estimating GC content from fasta (set --gc to skip this step)")
		args.gc = get_gc_content(regions, args.fasta)
		logger.info("- GC content: {0}".format(round(args.gc, 5)))
	
	bg = np.array([(1-args.gc)/2.0, args.gc/2.0, args.gc/2.0, (1-args.gc)/2.0])

	#Split regions
	region_chunks = regions.chunks(args.split)
	

	#################### Read motifs from file ####################

	logger.info("Reading motifs from file")

	motif_list = MotifList().from_file(args.motifs)
	logger.stats("- Found {0} motifs".format(len(motif_list)))
	
	logger.debug("Getting motifs ready")
	motif_list.bg = bg
	for motif in motif_list:
		motif.set_prefix(args.naming)
		motif.bg = bg
		motif.get_pssm()
	
	motif_names = list(set([motif.prefix for motif in motif_list]))

	#Calculate scanning-threshold for each motif
	pool = mp.Pool(processes=args.cores)
	outlist = pool.starmap(OneMotif.get_threshold, itertools.product(motif_list, [args.pvalue])) 
	motif_list = MotifList(outlist)	

	pool.close()
	pool.join()


	#################### Find TFBS in regions #####################

	logger.comment("")
	logger.info("Scanning for TFBS with all motifs")

	manager = mp.Manager()

	if args.outdir != None:
		writer_cores = max(1,int(args.cores*0.1))
		worker_cores = max(1,args.cores - writer_cores)

	elif args.outfile != None:	#Write to one file
		writer_cores = 1
		worker_cores = max(1,args.cores - writer_cores)

	#Setup pools
	logger.debug("Writer cores: {0}".format(writer_cores))
	logger.debug("Worker cores: {0}".format(worker_cores))
	worker_pool = mp.Pool(processes=worker_cores, maxtasksperchild=1)
	writer_pool = mp.Pool(processes=writer_cores)

	#Setup bed-writers based on --outdir or --outfile
	temp_files = []
	qs = {}
	TF_names_chunks = [motif_names[i::writer_cores] for i in range(writer_cores)]
	for TF_names_sub in TF_names_chunks:

		#Skip over any empty chunks
		if len(TF_names_sub) == 0:
			continue

		logger.debug("Creating writer queue for {0}".format(TF_names_sub))

		if args.outdir != None:
			files = [os.path.join(args.outdir, TF + ".tmp") for TF in TF_names_sub]
			temp_files.extend(files)
		elif args.outfile != None:
			files = [args.outfile + ".tmp" for TF in TF_names_sub]		#write to the same file for all
			temp_files.append(files[0])
			
		q = manager.Queue()

		TF2files = dict(zip(TF_names_sub, files))
		logger.debug("TF2files dict: {0}".format(TF2files))
		writer_pool.apply_async(file_writer, args=(q, TF2files, args)) 	#, callback = lambda x: finished.append(x) print("Writing time: {0}".format(x)))
		for TF in TF_names_sub:
			qs[TF] = q
	writer_pool.close() #no more jobs applied to writer_pool
	args.qs = qs 		#qs is a dict

	#Setup scanners pool
	task_list = [worker_pool.apply_async(motif_scanning, (chunk, args, motif_list)) for chunk in region_chunks]
	monitor_progress(task_list, logger)
	results = [task.get() for task in task_list]	#1s

	#Wait for files to write
	for TF in qs:
		qs[TF].put((None, None))

	writer_pool.join()

	#Process each file output and write out
	logger.comment("")
	logger.info("Processing results from scanning")
	logger.debug("Running processing for files: {0}".format(temp_files))
	task_list = [worker_pool.apply_async(process_TFBS, (file, args)) for file in temp_files]
	worker_pool.close()
	monitor_progress(task_list, logger)
	worker_pool.terminate()
	results = [task.get() for task in task_list]

	logger.debug("Joining multiprocessing pools")
	worker_pool.join()	
	writer_pool.join()

	logger.end()
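
The queue plumbing above (scanner workers push hits onto managed queues while a dedicated writer pool drains each queue to disk) is easier to see in isolation. A stripped-down sketch of the same pattern; the file name and the None end-of-stream sentinel are chosen here for illustration (the code above sends (None, None)):

import multiprocessing as mp

def file_writer_sketch(q, path):
    # Drain lines from the queue into one file; stop on the None sentinel.
    with open(path, "w") as fh:
        while True:
            line = q.get()
            if line is None:
                break
            fh.write(line)

def worker_sketch(q, chunk_id):
    # Stand-in for motif_scanning: push one result line per chunk.
    q.put("result for chunk {0}\n".format(chunk_id))

if __name__ == "__main__":
    manager = mp.Manager()
    q = manager.Queue()                      # proxy queue, shared across pools
    writer_pool = mp.Pool(1)
    writer_pool.apply_async(file_writer_sketch, (q, "scan_results.tmp"))
    with mp.Pool(3) as worker_pool:
        worker_pool.starmap(worker_sketch, [(q, i) for i in range(10)])
    q.put(None)                              # signal: no more results
    writer_pool.close()
    writer_pool.join()
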
Example #4
def count_transcripts(cargs):
    args, contig = cargs
    if args.alleles is not None:
        allele_resolver = alleleTools.AlleleResolver(
            args.alleles, lazyLoad=(not args.loadAllelesToMem))
    else:
        allele_resolver = None

    contig_mapping = None

    if args.contigmapping == 'danio':
        contig_mapping = {
            '1': 'CM002885.2',
            '2': 'CM002886.2',
            '3': 'CM002887.2',
            '4': 'CM002888.2',
            '5': 'CM002889.2',

            '6': 'CM002890.2',
            '7': 'CM002891.2',
            '8': 'CM002892.2',
            '9': 'CM002893.2',
            '10': 'CM002894.2',
            '11': 'CM002895.2',
            '12': 'CM002896.2',
            '13': 'CM002897.2',
            '14': 'CM002898.2',
            '15': 'CM002899.2',

            '16': 'CM002900.2',
            '17': 'CM002901.2',
            '18': 'CM002902.2',
            '19': 'CM002903.2',
            '20': 'CM002904.2',
            '21': 'CM002905.2',
            '22': 'CM002906.2',
            '23': 'CM002907.2',
            '24': 'CM002908.2',
            '25': 'CM002909.2',
        }

    # Load features
    #conversion_table = get_gene_id_to_gene_name_conversion_table(args.gtfexon)
    features = singlecellmultiomics.features.FeatureContainer()
    if contig_mapping is not None:
        features.remapKeys = contig_mapping
    features.loadGTF(
        args.gtfexon,
        select_feature_type=['exon'],
        identifierFields=(
            'exon_id',
            'transcript_id'),
        store_all=True,
        head=args.hf,
        contig=contig)
    features.loadGTF(
        args.gtfintron,
        select_feature_type=['intron'],
        identifierFields=['transcript_id'],
        store_all=True,
        head=args.hf,
        contig=contig)

    # What is used for assignment of molecules?
    if args.method == 'nla':
        molecule_class = singlecellmultiomics.molecule.AnnotatedNLAIIIMolecule
        fragment_class = singlecellmultiomics.fragment.NlaIIIFragment
        pooling_method = 1  # all data from the same cell can be dealt with separately
        stranded = None  # data is not stranded
    elif args.method == 'vasa' or args.method == 'cs':
        molecule_class = singlecellmultiomics.molecule.VASA
        fragment_class = singlecellmultiomics.fragment.SingleEndTranscript
        pooling_method = 1
        stranded = 1  # data is stranded, mapping to other strand
    else:
        raise ValueError("Supply a valid method")

    # COUNT:
    exon_counts_per_cell = collections.defaultdict(
        collections.Counter)  # cell->gene->umiCount
    intron_counts_per_cell = collections.defaultdict(
        collections.Counter)  # cell->gene->umiCount
    junction_counts_per_cell = collections.defaultdict(
        collections.Counter)  # cell->gene->umiCount
    gene_counts_per_cell = collections.defaultdict(
        collections.Counter)  # cell->gene->umiCount

    gene_set = set()
    sample_set = set()
    annotated_molecules = 0
    read_molecules = 0
    if args.producebam:
        bam_path_produced = f'{args.o}/output_bam_{contig}.unsorted.bam'
        with pysam.AlignmentFile(args.alignmentfiles[0]) as alignments:
            output_bam = pysam.AlignmentFile(
                bam_path_produced, "wb", header=alignments.header)

    ref = None
    if args.ref is not None:
        ref = pysamiterators.iterators.CachedFasta(pysam.FastaFile(args.ref))

    for alignmentfile_path in args.alignmentfiles:

        i = 0
        with pysam.AlignmentFile(alignmentfile_path) as alignments:
            molecule_iterator = MoleculeIterator(
                alignments=alignments,
                check_eject_every=5000,
                molecule_class=molecule_class,
                molecule_class_args={
                    'features': features,
                    'stranded': stranded,
                    'min_max_mapping_quality': args.minmq,
                    'reference': ref,
                    'allele_resolver': allele_resolver
                },

                fragment_class=fragment_class,
                fragment_class_args={
                    'umi_hamming_distance': args.umi_hamming_distance,
                    'R1_primer_length': 4,
                    'R2_primer_length': 6},
                perform_qflag=True,
                # when the reads have not been tagged yet, this flag is very
                # much required
                pooling_method=pooling_method,
                contig=contig
            )

            for i, molecule in enumerate(molecule_iterator):
                if not molecule.is_valid():
                    if args.producebam:
                        molecule.write_tags()
                        molecule.write_pysam(output_bam)
                    continue

                molecule.annotate(args.annotmethod)
                molecule.set_intron_exon_features()

                if args.producebam:
                    molecule.write_tags()
                    molecule.write_pysam(output_bam)

                allele = None
                if allele_resolver is not None:
                    allele = molecule.allele
                    if allele is None:
                        allele = 'noAllele'

                # Obtain total count introns/exons reduce it so the sum of the
                # count will be 1:
                # len(molecule.introns.union( molecule.exons).difference(molecule.junctions))+len(molecule.junctions)
                total_count_for_molecule = len(molecule.genes)
                if total_count_for_molecule == 0:
                    continue  # no gene counts found for this molecule

                # Distribute the count over the number of gene hits:
                count_to_add = 1 / total_count_for_molecule
                for gene in molecule.genes:
                    if allele is not None:
                        gene = f'{allele}_{gene}'
                    gene_counts_per_cell[molecule.sample][gene] += count_to_add
                    gene_set.add(gene)
                    sample_set.add(molecule.get_sample())

                # Obtain introns/exons/splice junction information:
                for intron in molecule.introns:
                    gene = intron
                    if allele is not None:
                        gene = f'{allele}_{intron}'
                    intron_counts_per_cell[molecule.sample][gene] += count_to_add
                    gene_set.add(gene)

                for exon in molecule.exons:
                    gene = exon
                    if allele is not None:
                        gene = f'{allele}_{exon}'
                    exon_counts_per_cell[molecule.sample][gene] += count_to_add
                    gene_set.add(gene)

                for junction in molecule.junctions:
                    gene = junction
                    if allele is not None:
                        gene = f'{allele}_{junction}'
                    junction_counts_per_cell[molecule.sample][gene] += count_to_add
                    gene_set.add(gene)

                annotated_molecules += 1
                if args.head and (i + 1) > args.head:
                    print(
                        f"-head was supplied, {i} molecules discovered, stopping")
                    break

        read_molecules += i

    if args.producebam:
        output_bam.close()
        final_bam_path = bam_path_produced.replace('.unsorted', '')
        sort_and_index(bam_path_produced, final_bam_path, remove_unsorted=True)

    return (
        gene_set,
        sample_set,
        gene_counts_per_cell,
        junction_counts_per_cell,
        exon_counts_per_cell,
        intron_counts_per_cell,
        annotated_molecules,
        read_molecules,
        contig

    )
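
The counting scheme in count_transcripts gives each valid molecule one unit of evidence, split evenly across every gene it overlaps, keyed by cell. A self-contained sketch of just that bookkeeping (the cell and gene names are made up):

import collections

# cell -> gene -> (possibly fractional) molecule count
gene_counts_per_cell = collections.defaultdict(collections.Counter)

def add_molecule(cell, genes):
    """Spread a single molecule's count of 1 across all overlapping genes."""
    if not genes:
        return  # no gene hit: the molecule contributes nothing
    count_to_add = 1 / len(genes)
    for gene in genes:
        gene_counts_per_cell[cell][gene] += count_to_add

add_molecule("cellA", {"geneX"})           # geneX += 1.0
add_molecule("cellA", {"geneX", "geneY"})  # geneX and geneY += 0.5 each
assert gene_counts_per_cell["cellA"]["geneX"] == 1.5
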
Example #5
def process_regions(ref_file, regions, out_dir, param_file):
    import shutil  # used to clean up the per-region temp dir below
    out_vcf_path = os.path.join(out_dir, "svteaser.sim.vcf")
    out_ref_fa_path = os.path.join(out_dir, "svteaser.ref.fa")
    out_altered_fa_path = os.path.join(out_dir, "svteaser.altered.fa")

    out_vcf_fh = None
    out_ref_fa_fh = open(out_ref_fa_path, "w+")
    out_altered_fa_fh = open(out_altered_fa_path, "w+")

    ref = pysam.FastaFile(ref_file)

    # Define padding in reference region where SVs are not to be inserted.
    padding = 800

    for i, (chrom, start, end) in enumerate(regions):
        # Track status.
        if (i + 1) % 50 == 0:
            logging.info("Processed {}/{} regions...".format(i + 1, len(regions)))

        # Temporary dir.
        temp_dir = os.path.join(out_dir, "temp")
        os.mkdir(temp_dir)

        # Extract ref sequence.
        name = "{}_{}_{}".format(chrom, start, end)
        ref_seq = ref.fetch(chrom, start, end)

        # Remove some buffer from beginning and ending,
        # so that the tails do not contain SVs. These will be added
        # back later on.
        ref_seq_surv = ref_seq[padding:len(ref_seq)-padding]
        # Write ref sequence to temporary fa file.
        temp_ref_fa = os.path.join(temp_dir, "temp_ref.fa")
        with open(temp_ref_fa, "w") as fh:
            add_fasta_entry(name, ref_seq_surv, fh)

        # Run SURVIVOR.
        prefix = os.path.join(temp_dir, "simulated")
        survivor_cmd = " ".join(["SURVIVOR",
                                 "simSV",
                                 temp_ref_fa,
                                 param_file,
                                 "0.0",
                                 "0",
                                 prefix])
        ret = cmd_exe(survivor_cmd)
        # should be checking here

        # Read output of SURVIVOR
        altered_fa_path = "{}.fasta".format(prefix)
        insertions_fa_path = "{}.insertions.fa".format(prefix)
        sim_vcf = "{}.vcf".format(prefix)
        # Update VCF
        temp_vcf = os.path.join(temp_dir, "temp.vcf")
        update_vcf(temp_ref_fa, insertions_fa_path, sim_vcf, temp_vcf, pos_padding=padding)

        # Merge seqs and variants entries into single FA/VCF files
        # Add the initial and last 800bp back to the altered fasta
        altered_seq = pysam.FastaFile(altered_fa_path).fetch(name)
        altered_seq = update_altered_fa(ref_seq, altered_seq, padding)
        add_fasta_entry(name, altered_seq, out_altered_fa_fh)

        add_fasta_entry(name, ref_seq, out_ref_fa_fh)

        vcf_reader = pysam.VariantFile(temp_vcf)
        header = vcf_reader.header
        if not out_vcf_fh:
            out_vcf_fh = pysam.VariantFile(out_vcf_path, 'w', header=header)

        for record in vcf_reader:
            out_vcf_fh.write(record)

        # Remove temporary files.
        shutil.rmtree(temp_dir)

    out_altered_fa_fh.close()
    out_ref_fa_fh.close()
    out_vcf_fh.close()
    vcf_compress(out_vcf_path)
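
The `# should be checking here` note above marks the missing exit-status check after SURVIVOR runs. The return value of the project's cmd_exe helper isn't shown, so here is a hedged sketch using subprocess directly; run_checked is illustrative, not an existing helper:

import logging
import subprocess

def run_checked(cmd):
    """Run a shell command and fail loudly on a non-zero exit status."""
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        logging.error("command failed (%d): %s\n%s", result.returncode, cmd, result.stderr)
        raise RuntimeError("command failed: {}".format(cmd))
    return result.stdout

# e.g. run_checked(survivor_cmd) in place of the unchecked cmd_exe(survivor_cmd)
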
Example #6
def make_sampledata(args):
    if isinstance(args, tuple):
        vcf_subset = args[2]
        args[0].output_file = str(args[1])
        args = args[0]
    else:
        vcf_subset = None
    try:
        git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"])
        git_provenance = {
            "repo": "git@github.com:mcveanlab/treeseq-inference.git",
            "hash": git_hash.decode().strip(),
            "dir": "human-data",
            "notes": ("Use the Makefile to download and process the "
                      "upstream data files"),
        }
    except FileNotFoundError:
        git_hash = "Git unavailable"
        git_provenance = "Git unavailable"
    data_provenance = {
        "ancestral_states_url": args.ancestral_states_url,
        "reference_name": args.reference_name,
    }

    # Get the ancestral states.
    fasta = pysam.FastaFile(args.ancestral_states_file)
    # NB! We put in an extra character at the start to convert to 1 based coords.
    ancestral_states = "X" + fasta.fetch(reference=fasta.references[0])
    # The largest possible site position is len(ancestral_states). Positions must
    # be strictly less than sequence_length, so we add 1.
    sequence_length = len(ancestral_states) + 1

    converter_class = {
        "1kg": ThousandGenomesConverter,
        "sgdp": SgdpConverter,
        "hgdp": HgdpConverter,
        "max-planck": MaxPlanckConverter,
        "afanasievo": AfanasievoConverter,
        "1240k": ReichConverter,
    }
    report = {"num_sites": 0}  # defined up front so the except-handler below can read it
    try:
        with tsinfer.SampleData(path=args.output_file,
                                num_flush_threads=1,
                                sequence_length=sequence_length) as samples:
            converter = converter_class[args.source](args.data_file,
                                                     ancestral_states, samples,
                                                     args.target_samples)
            if args.metadata_file:
                converter.process_metadata(args.metadata_file, args.progress)
            else:
                converter.process_metadata(args.progress)
            if vcf_subset is not None:
                report = converter.process_sites(
                    vcf_subset=vcf_subset,
                    show_progress=args.progress,
                    max_sites=args.max_variants,
                )
            else:
                report = converter.process_sites(show_progress=args.progress,
                                                 max_sites=args.max_variants)
            samples.record_provenance(
                command=sys.argv[0],
                args=sys.argv[1:],
                git=git_provenance,
                data=data_provenance,
            )
            assert np.all(np.diff(samples.sites_position[:]) > 0)
    except Exception as e:
        os.unlink(args.output_file)
        if report["num_sites"] == 0:
            return report
        raise e
    if report["num_sites"] == 0:
        os.unlink(args.output_file)
    return report
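
The "X" prefix above is a cheap shim that turns Python's 0-based string indexing into the 1-based site coordinates used downstream; a two-line illustration:

seq = "ACGT"           # 0-based: seq[0] == "A"
one_based = "X" + seq  # 1-based: one_based[1] == "A"
assert one_based[3] == "G"
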
Example #7
def openFile(self, dataFile):
    return pysam.FastaFile(dataFile)
Example #8
def test_open_file_with_explicit_index_succeeds(self):
    with pysam.FastaFile(self.filename,
                         filepath_index=self.filename + ".fai") as inf:
        self.assertEqual(len(inf), 2)
Example #9
def run(subcommand):
    args = get_args(subcommand)

    if subcommand == "reclassification":
        nl.filterate_by_panel(
            args.input_vcf,
            args.output_vcf,
            pysam.FastaFile(args.fasta),
            args.non_somatic_panel,
        )
        print("rnaindel reclassification completed successfully.",
              file=sys.stdout)
        sys.exit(0)

    data_dir = args.data_dir.rstrip("/")
    model_dir = "{}/models".format(data_dir)
    # database check
    path2cosmic = pathlib.Path("{}/cosmic".format(data_dir))

    if not path2cosmic.exists():
        print(
            "Please download the latest database: http://ftp.stjude.org/pub/software/RNAIndel/"
        )
        sys.exit(1)

    if subcommand == "nonsomatic" or subcommand == "recurrence":
        cosmic = pysam.TabixFile(
            "{}/cosmic/CosmicCodingMuts.indel.vcf.gz".format(data_dir))
        if subcommand == "nonsomatic":
            nl.make_non_somatic_panel(
                args.vcf_list,
                args.output_vcf,
                pysam.FastaFile(args.fasta),
                cosmic,
                args.count,
            )
            print("rnaindel nonsomaic completed successfully.",
                  file=sys.stdout)
            sys.exit(0)
        else:
            nl.annotate_recurrence(args.vcf_list, pysam.FastaFile(args.fasta),
                                   cosmic, args.out_dir)
            print("rnaindel recurrence completed successfully.",
                  file=sys.stdout)
            sys.exit(0)

    log_dir = args.log_dir.rstrip("/")

    if subcommand == "training":
        df = tl.input_validator(args.training_data, args.indel_class)

        # downsampling
        artifact_ratio, ds_f_beta, ds_precision = tl.downsampler(
            df,
            args.k_fold,
            args.indel_class,
            args.ds_beta,
            args.process_num,
            args.downsample_ratio,
        )

        # feature_selection
        selected_features, fs_f_beta, fs_precision = tl.selector(
            df,
            args.k_fold,
            args.indel_class,
            artifact_ratio,
            args.fs_beta,
            args.process_num,
            args.feature_names,
        )

        # parameter tuning
        feature_lst = selected_features.split(";")
        max_features, pt_f_beta, pt_precision = tl.tuner(
            df,
            args.k_fold,
            args.indel_class,
            artifact_ratio,
            feature_lst,
            args.pt_beta,
            args.process_num,
            args.auto_param,
        )

        # update models
        tl.updater(df, args.indel_class, artifact_ratio, feature_lst,
                   max_features, model_dir)

        # make report
        tl.reporter(
            args.indel_class,
            args.ds_beta,
            ds_f_beta,
            ds_precision,
            artifact_ratio,
            args.fs_beta,
            fs_f_beta,
            fs_precision,
            selected_features,
            args.pt_beta,
            pt_f_beta,
            pt_precision,
            max_features,
            args.log_dir,
        )

        msg = ("single-nucleotide indels"
               if args.indel_class == "s" else "multi-nucleotide indels")

        print("rnaindel training for " + msg + " completed successfully.",
              file=sys.stdout)
    else:
        create_logger(log_dir)

        alignments = pysam.AlignmentFile(args.bam)
        genome = pysam.FastaFile(args.fasta)
        refgene = "{}/refgene/refCodingExon.bed.gz".format(data_dir)
        exons = pysam.TabixFile(refgene)
        protein = "{}/protein/proteinConservedDomains.txt".format(data_dir)
        dbsnp = pysam.TabixFile("{}/dbsnp/dbsnp.indel.vcf.gz".format(data_dir))
        clinvar = pysam.TabixFile(
            "{}/clinvar/clinvar.indel.vcf.gz".format(data_dir))
        cosmic = pysam.TabixFile(
            "{}/cosmic/CosmicCodingMuts.indel.vcf.gz".format(data_dir))

        germline_db = pysam.TabixFile(
            args.germline_db) if args.germline_db else None

        # input validation
        rl.input_validator(alignments, genome, args.uniq_mapq)

        # region analysis
        region = args.region if subcommand == "analysis" else None

        # preprocessing
        # variant calling will be performed if no external VCF is supplied
        if not args.input_vcf:

            with tempfile.TemporaryDirectory() as tmp_dir:
                # indel calling
                bambino_output = os.path.join(tmp_dir, "bambino.txt")

                bl.bambino(args.bam, args.fasta, bambino_output,
                           args.heap_memory, region)

                # preprocess indels from the built-in caller
                df, chr_prefixed = rl.indel_preprocessor(
                    bambino_output, genome, alignments, exons)

                df = rl.indel_rescuer(df, args.fasta, args.bam, chr_prefixed,
                                      args.process_num)

        else:
            # preprocess indels from external VCF
            df, chr_prefixed = rl.indel_vcf_preprocessor(
                args.input_vcf, genome, alignments, exons, region)

            df = rl.indel_rescuer(
                df,
                args.fasta,
                args.bam,
                chr_prefixed,
                args.process_num,
                external_vcf=True,
            )

        # indel annotation
        df = rl.indel_annotator(df, genome, exons, chr_prefixed)

        # feature calculation
        if subcommand == "feature":
            df, df_filtered_premerge = rl.indel_sequence_processor(
                df,
                genome,
                alignments,
                args.uniq_mapq,
                chr_prefixed,
                softclip_analysis=args.softclip_analysis,
            )
        else:
            coverage_in_trainingset = "{}/models/coverage.txt".format(data_dir)
            downsample_thresholds = {}
            with open(coverage_in_trainingset) as f:
                for line in f:
                    if line.startswith("s"):
                        downsample_thresholds["single_nuleotide_indels"] = int(
                            line.rstrip().split("\t")[1])
                    else:
                        downsample_thresholds["multi_nuleotide_indels"] = int(
                            line.rstrip().split("\t")[1])

            df, df_filtered_premerge = rl.indel_sequence_processor(
                df,
                genome,
                alignments,
                args.uniq_mapq,
                chr_prefixed,
                softclip_analysis=args.softclip_analysis,
                downsample_thresholds=downsample_thresholds,
            )

        df = rl.indel_protein_processor(df, refgene, protein)

        # merging equivalent indels
        df, df_filtered_postmerge = rl.indel_equivalence_solver(
            df, genome, refgene, chr_prefixed)

        # SNP annotation
        df = rl.indel_snp_annotator(df, genome, dbsnp, clinvar, germline_db,
                                    chr_prefixed)

        # subcommand "feature" exits here
        if subcommand == "feature":
            df = rl.indel_feature_reporter(df, genome, args.output_tab,
                                           chr_prefixed)
            print("rnaindel feature completed successfully.", file=sys.stdout)
            sys.exit(0)

        # prediction
        df = rl.indel_classifier(df, model_dir, args.process_num)

        # concatenating invalid(filtered) entries
        df_filtered = pd.concat(
            [df_filtered_premerge, df_filtered_postmerge],
            axis=0,
            ignore_index=True,
            sort=True,
        )

        # panel of non somatic
        default_pons = pysam.TabixFile(
            os.path.join(args.data_dir, "non_somatic/non_somatic.vcf.gz"))
        user_pons = (pysam.TabixFile(args.non_somatic_panel)
                     if args.non_somatic_panel else None)

        df = rl.indel_reclassifier(df, genome, default_pons, user_pons, cosmic,
                                   chr_prefixed)

        # postProcessing & VCF formatting
        df, df_filtered = rl.indel_postprocessor(df, df_filtered, genome,
                                                 exons, chr_prefixed)
        rl.indel_vcf_writer(
            df,
            df_filtered,
            args.fasta,
            genome,
            alignments,
            chr_prefixed,
            args.output_vcf,
            model_dir,
            __version__,
        )

        print("rnaindel analysis completed successfully.", file=sys.stdout)
Example #10
def setUp(self):
    self.file = pysam.FastaFile(os.path.join(BAM_DATADIR, "ex1.fa"))
Example #11
def test_open_file_without_index_succeeds(self):
    with pysam.FastaFile(self.filename) as inf:
        self.assertEqual(len(inf), 2)
Example #12
def __call__(self):
    fastaFile = pysam.FastaFile(self.args.fastainput)
    bamFile = pysam.AlignmentFile(self.args.BAMinput, "rb")
    ssl_settings = {'ca': self.args.sslpath}
    con = MySQLdb.connect(self.args.server, self.args.user, self.args.password,
                          self.args.database, ssl=ssl_settings)
    with con:
        cur = con.cursor()
        cur.execute("USE " + self.args.database)

    def batch_gen(data, batch_size):
        for i in range(0, len(data), batch_size):
            yield data[i:i + batch_size]

    references = sorted(set(bamFile.getrname(read.tid) for read in bamFile.fetch()))
    referencesLeng = sorted(set(len(fastaFile.fetch(reference=str(item))) for item in references))
    for ref, leng in zip(references, referencesLeng):
        print(ref, leng)
        cur.execute('INSERT INTO templates(protein, length) VALUES(%s, %s)', (ref, leng))
    for reference in references:
        returned_position_lines = []
        refcodonpos = 0
        counter = 0
        for codon in batch_gen(fastaFile.fetch(reference=str(reference)), 3):
            referenceid = str(reference) + ' '
            refnucpos1 = 0 + (3 * refcodonpos)
            refnucpos2 = 1 + (3 * refcodonpos)
            refnucpos3 = 2 + (3 * refcodonpos)
            # pad single-digit positions so the columns line up
            if 1 <= (refcodonpos + 1) <= 9:
                refcodonposid = str(refcodonpos + 1) + " "
            else:
                refcodonposid = str(refcodonpos + 1)
            refAAid = str(Seq(codon).translate()[0])
            marker_list = []
            # restrict to reads mapped to this reference
            for read in bamFile.fetch(reference=str(reference)):
                read_codon = []
                for seq, pos in zip(read.seq, read.get_reference_positions()):
                    if pos in (refnucpos1, refnucpos2, refnucpos3):
                        read_codon.append(seq)
                if len(read_codon) == 3:
                    counter += 1
                    if ''.join(read_codon) == codon:
                        marker_list.append('.')
                    else:
                        marker_list.append(str(Seq("".join(read_codon)).translate()[0]))
            print(referenceid, refcodonposid, refAAid, counter,
                  ''.join(str(item) for item in marker_list))
            returned_position_lines.append(''.join(str(item) for item in marker_list))
            cur.execute("INSERT INTO sites(template_id, position, wild_type_AA) "
                        "VALUES((SELECT id from templates WHERE protein=%s), %s, %s)",
                        (reference, refcodonposid, refAAid))
            counter = 0
            refcodonpos += 1
        print(returned_position_lines)
        AAs = ('A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L',
               'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '*')
        for AA in AAs:
            position = 0
            for line in returned_position_lines:
                position += 1
                count = 0
                for readAA in line:
                    if readAA == AA:
                        count += 1
                if count >= 1:
                    print(count, AA, position)
                    cur.execute("INSERT INTO substitutions(site_id, substitution, count) "
                                "VALUES((SELECT id from sites WHERE position=%s AND "
                                "template_id=(SELECT id from templates WHERE protein=%s)), %s, %s)",
                                (position, reference, AA, count))
    con.commit()
    fastaFile.close()
    bamFile.close()
Example #13
def convert_bed_to_vcf(bed_filename, reference_filename, vcf_filename, sample,
                       variant_type):
    # Get variants.
    if variant_type == "sv":
        columns = (0, 1, 2, 3, 4, 5, 7, 9, 10, 12, 14)
        names = ("chrom", "start", "end", "sv_call", "event_size",
                 "sv_sequence", "contig", "contig_start", "contig_end",
                 "genotype", "repeat_type")
        fmt = ["GT"]
    elif variant_type == "indel":
        # chr1    94824   94827   3       Cttttcttttttttt 1       1       29.04   deletion
        columns = (0, 1, 2, 3, 4, 5, 6, 7, 8)
        names = ("chrom", "start", "end", "event_size", "sv_sequence",
                 "contig_support", "contig_depth", "depth", "sv_call")
        fmt = ["GT"]
    elif variant_type == "inversion":
        columns = (0, 1, 2, 3, 4, 5)
        names = ("chrom", "start", "end", "sv_call", "contig_support",
                 "contig_depth")
        fmt = [""]
    else:
        raise Exception("Unsupported variant type: %s" % variant_type)

    calls = pd.read_table(
        bed_filename,
        low_memory=False,
        keep_default_na=False,
        index_col=False,
        header=0)  #, header=None, usecols=columns, names=names)

    calls["sample_name"] = sample
    calls["call_id"] = "."
    calls["quality"] = "30"  #calls.apply(calculate_variant_quality, axis=1)
    calls["filter"] = "PASS"

    # Make sure the sv length and sv sequence agree

    #    calls["svLen"] = calls.apply(lambda row: len(GetSeq(row["svSeq"])), axis=1)

    pd.to_numeric(calls["tStart"])
    pd.to_numeric(calls["tEnd"])
    # Get the reference base at the position of the variant start.
    reference = pysam.FastaFile(reference_filename)
    calls["reference"] = calls.apply(lambda row: reference.fetch(
        row["#chrom"], row["tStart"], row["tStart"] + 1).upper(),
                                     axis=1)

    # Update start position to be 1-based.
    calls["origTStart"] = calls["tStart"]
    calls["CHROM"] = calls["#chrom"]

    calls["POS"] = calls.apply(lambda row: GetStart(row), axis=1)
    if args.addci is not None:
        calls["CIPOS"] = ["-{},{}".format(args.addci, args.addci)] * len(calls)
        calls["CIEND"] = ["-{},{}".format(args.addci, args.addci)] * len(calls)
    # Build an INFO field for each call.
    calls["svShort"] = calls.apply(lambda row: GetType(row["svType"]), axis=1)

    if variant_type == "sv":
        infoKeys = [("END", "tEnd"), ("SVTYPE", "svShort"), ("SVLEN", "svLen"),
                    ("CONTIG", "qName"), ("CONTIG_START", "qStart"),
                    ("CONTIG_END", "qEnd"), ("SEQ", "svSeq")]
        if "is_trf" in calls:
            infoKeys.append(("IS_TRF", "is_trf"))

        if args.addci is not None:
            infoKeys.append(("CIEND", "CIEND"))
            infoKeys.append(("CIPOS", "CIPOS"))

        if len(args.fields) > 0:
            extraKeys = [(args.fields[i], args.fields[i + 1])
                         for i in range(0, len(args.fields), 2)]
            infoKeys += extraKeys
        if args.seq:
            calls["reference"] = calls.apply(
                lambda row: GetRefSeq(row, reference), axis=1)
            calls["alt"] = calls.apply(lambda row: GetAltSeq(row, reference),
                                       axis=1)
        else:
            calls["reference"] = calls.apply(
                lambda row: GetRefSeq(row, reference, 1), axis=1)
            calls["alt"] = calls.apply(
                lambda row: "<%s>" % row.svType[:3].upper(), axis=1)

        calls["svLen"] = calls.apply(lambda row: GetSVLen(row), axis=1)

        calls["info"] = calls.apply(lambda row: ";".join([
            "=".join(map(str, (item[0], row[item[1]]))) for item in (infoKeys)
        ]),
                                    axis=1)
        calls["svLen"] = calls.apply(lambda row: ParseSVLen(row["svLen"]),
                                     axis=1)
        calls["format"] = ":".join(fmt)
        if "hap" in calls:
            calls["genotype"] = calls.apply(lambda row: GetGenotype(row.hap),
                                            axis=1)
        else:
            calls["genotype"] = ["./."] * len(calls["tEnd"])

    elif variant_type == "indel":
        calls["reference"] = calls.apply(lambda row: GetRefSeq(row, reference),
                                         axis=1)
        calls["alt"] = calls.apply(lambda row: GetAltSeq(row, reference),
                                   axis=1)
        calls["format"] = ":".join(fmt)

        if "hap" in calls:
            calls["genotype"] = calls.apply(lambda row: GetGenotype(row.hap),
                                            axis=1)
        else:
            calls["genotype"] = ["./."] * len(calls["tEnd"])
        calls["svLen"] = calls.apply(lambda row: GetSVLen(row), axis=1)
        calls["info"] = calls.apply(lambda row: ";".join([
            "=".join(map(str, item)) for item in
            (("END", row["tEnd"]), ("SVTYPE", row["svType"]),
             ("SVLEN", row["svLen"]), ("SAMPLES", row["sample_name"]),
             ("SEQ", row["svSeq"]))
        ]),
                                    axis=1)
    elif variant_type == "inversion":
        calls["alt"] = "<INV>"
        calls["info"] = calls.apply(lambda row: ";".join([
            "=".join(map(str, item)) for item in (
                ("END", row["tEnd"]),
                ("SVTYPE", row["svType"]),
                ("SVLEN", row["svLen"]),
                ("SAMPLES", row["sample_name"]),
            )
        ]),
                                    axis=1)
        # mirror the other branches so the column selection below finds format/genotype
        calls["format"] = ":".join(fmt)
        calls["genotype"] = ["./."] * len(calls["tEnd"])

    simple_calls = calls[[
        "#chrom", "POS", "call_id", "reference", "alt", "quality", "filter",
        "info", "format", "genotype"
    ]].rename(
        {
            "#chrom": "#CHROM",
            "reference": "REF",
            "call_id": "ID",
            "quality": "QUAL",
            "info": "INFO",
            "alt": "ALT",
            "filter": "FILTER",
            "format": "FORMAT",
            "genotype": sample
        },
        axis=1)

    fai = []
    with open(args.reference + ".fai") as faiFile:
        for line in faiFile:
            vals = line.split()
            fai.append([vals[0], vals[1]])

    # Save genotypes as tab-delimited file.
    with open(vcf_filename, "w") as vcf:
        vcf.write("##fileformat=VCFv4.2\n")
        vcf.write("##fileDate=%s\n" %
                  datetime.date.strftime(datetime.date.today(), "%Y%m%d"))
        vcf.write("##source={}\n".format(args.source))
        vcf.write(
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">'
            + "\n")
        vcf.write(
            '##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">'
            + "\n")
        vcf.write(
            '##INFO=<ID=END,Number=1,Type=Integer,Description="End coordinate of this variant">'
            + "\n")
        vcf.write(
            '##INFO=<ID=CONTIG,Number=1,Type=String,Description="Name of alternate assembly contig">'
            + "\n")
        vcf.write(
            '##INFO=<ID=CONTIG_START,Number=1,Type=Integer,Description="Start coordinate of this variant in the alternate assembly contig">'
            + "\n")
        vcf.write(
            '##INFO=<ID=CONTIG_END,Number=1,Type=Integer,Description="End coordinate of this variant in the alternate assembly contig">'
            + "\n")
        vcf.write(
            '##INFO=<ID=SEQ,Number=1,Type=String,Description="Sequence associated with variant">'
            + "\n")
        for contig_name, contig_length in fai:
            vcf.write("##contig=<ID={},length={}>\n".format(contig_name, contig_length))
        vcf.write("##SAMPLE=<ID={}>\n".format(args.sample))
        vcf.write(
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' +
            "\n")
        if args.info is not None:
            vcf.write("\n".join(args.info) + "\n")
        simple_calls.to_csv(vcf, sep="\t", index=False)
Example #14
def main():
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-fourfold', help='bed file of fourfold sites with final column listing gene name',
                        required=True)
    parser.add_argument('-ref', help='Reference genome', required=True)
    parser.add_argument('-gc_thres', help=argparse.SUPPRESS, type=float, default=72)
    args = parser.parse_args()

    ref_genome = pysam.FastaFile(args.ref)
    trans_data = pysam.TabixFile(args.fourfold)
    chromosomes = trans_data.contigs

    out_stem = args.fourfold.replace('.bed.gz', '')
    bed_out = '{}_maxgc{}.bed'.format(out_stem, args.gc_thres)
    gc_out = '{}_gc_content.txt'.format(out_stem)

    gene_dict = {}

    # loop through all chromosomes in the fourfold bed
    for chromo in chromosomes:

        ref_str = ref_genome.fetch(chromo)

        # process each bed chromosome
        for line in trans_data.fetch(chromo, parser=pysam.asTuple()):

            chromo, start, stop, trans_id = line[0], int(line[1]), int(line[2]), line[3]

            # add relevant keys
            if chromo not in gene_dict.keys():
                gene_dict[chromo] = {trans_id: [0, 0, 0]}
            if trans_id not in gene_dict[chromo].keys():
                gene_dict[chromo][trans_id] = [0, 0, 0]

            # get ref string for region
            ref_seq = ref_str[start: stop].upper()

            at = ref_seq.count('A') + ref_seq.count('T')
            gc = ref_seq.count('G') + ref_seq.count('C')

            gene_dict[chromo][trans_id][0] += gc
            gene_dict[chromo][trans_id][1] += at
            percent_gc = (gene_dict[chromo][trans_id][0] /
                          float(gene_dict[chromo][trans_id][0] + gene_dict[chromo][trans_id][1])) * 100.0
            gene_dict[chromo][trans_id][2] = percent_gc

    trans_data.close()

    # process gc content
    with open(gc_out, 'w') as gc_file:
        failing_trans = []
        print('transcript\tgc\tat\tpercent_gc', file=gc_file)
        for x in gene_dict.keys():
            for transcript in gene_dict[x].keys():
                gc_cont, at_cont, gc_percent = gene_dict[x][transcript]
                print(transcript, gc_cont, at_cont, gc_percent, sep='\t', file=gc_file)

                if gc_percent > args.gc_thres:
                    failing_trans.append(transcript)

    # filter bed
    with open(bed_out, 'w') as bed_file:
        for bed_line in gzip.open(args.fourfold, 'rt'):
            trans_id = bed_line.rstrip().split()[-1]
            if trans_id in failing_trans:
                continue
            else:
                print(bed_line.rstrip(), file=bed_file)

    # bgzip and tabix
    subprocess.call('bgzip {}'.format(bed_out), shell=True)
    subprocess.call('tabix -pbed {}.gz'.format(bed_out), shell=True)
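
Shelling out to bgzip/tabix works when both binaries are on PATH; pysam also exposes the same functionality as library calls, which drops the external dependency. A sketch of that alternative, with a hypothetical file name standing in for the bed_out built above:

import pysam

bed_out = "fourfold_maxgc72.bed"  # hypothetical; matches the bed_out naming above

# Compress to bgzip and build a .tbi index in-process.
pysam.tabix_compress(bed_out, bed_out + ".gz", force=True)
pysam.tabix_index(bed_out + ".gz", preset="bed", force=True)
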
Example #15
def main():
    
    description = """ SplitStrains detects minor/major strains and classify reads. In addition, it produces 2 plots: histogram and scatter plots for visual inspecting and parameter tunning (see figures in output dir). """
    parser = argparse.ArgumentParser(description=description, add_help=False)
    arg_required = parser.add_argument_group('required arguments')
    arg_required.add_argument(dest='bamFilePath', metavar='bamFilePath', help='Input bam file')
    arg_required.add_argument('-o', metavar='dir', required=True, dest='outputDir', help='Output directory.')
    arg_required.add_argument('-fd', metavar='n', required=True, default=75, dest='depthThreshold', type=int, help='Do not consider pileup columns with the depth percentage less than n percent. Setting this to 75 means ignore sites with depth coverage less than 75%% of the bam avg depth. Default=75.')

    arg_optional = parser.add_argument_group('optional arguments')
    arg_optional.add_argument("-h", "--help", action="help", help="show this help message and exit")   
    arg_optional.add_argument('-c','--classify', action='store_true', help='If this option is specified then the program will run reads classification, otherwise it will detect means and produce histogram png.')
    arg_optional.add_argument('-z','--reuse', action='store_true', help='If this flag is specified the program will reuse the csv file from the previous run.')
    arg_optional.add_argument('-mo', metavar='gmm/bmm', dest='model', type=str, help='Specify clustering model: GMM or BMM. Default GMM.', default='gmm')
    arg_optional.add_argument('-f', metavar="plotName", dest='plotName', default='plot', help='Name for the histogram figure.')
    arg_optional.add_argument('-s', metavar='n', dest='regionStart', type=int, help='Specify the start position on the genome. Default=0.')
    arg_optional.add_argument('-e', metavar='n', dest='regionEnd', type=int, help='Specify the end position on the genome. Default is the genome length.')
    arg_optional.add_argument('-r', metavar='ref', dest='ref', help='Genome reference. It is highly recommended to use the default reference file for compatibility with the GFF file.', default='refs/tuberculosis.fna')
    arg_optional.add_argument('-b', metavar='gff', dest='gff', help='Use gff file to process only gff regions. It is highly recommended to use the default GFF file as it takes care of problematic genomic regions.', default='refs/tuberculosis.filtered-intervals.gff')
    arg_optional.add_argument('-i', metavar='n', default=150, dest='step', type=int, help='Step for snp cluster detection. Default=150.')
    arg_optional.add_argument('-g', metavar='n', default=2, type=int, dest='components', help='GMM model components. Default=2.')
    arg_optional.add_argument('-ft', metavar='n', default=1, type=int, dest='proportion_count_threshold', help='Filter out proportions which have count less than n. Default=1.')
    arg_optional.add_argument('-fe', metavar='n', default=0, dest='entropy_thresh', help='Entropy filtering threshold. Set to 0 to turn off entropy filtering. Default=0.')
    arg_optional.add_argument('-a', metavar='n', default=0.05, dest='alpha_level', help='Significance level alpha. The probability of rejecting a single strain hypothesis when it is true. Default=0.05.')
    arg_optional.add_argument('-fes', metavar='n', type=int, default=70, dest='entropy_step', help='Entropy filtering step. Defines the step length on freqVec.csv for entropy filtering computation. Default=70.')
    arg_optional.add_argument('-u', metavar='n', type=int, default=90, dest='upperLimit', help='Do not consider proportion of bases beyond n value. Default=90.')
    arg_optional.add_argument('-l', metavar='n', type=int, default=10, dest='lowerLimit', help='Do not consider proportion of bases below n value. Default=10.')
    arg_optional.add_argument('-m', metavar='n', type=int, default=20, dest='mapQuality', help='Do not consider reads below n map quality. Default=20.')
    arg_optional.add_argument('-q', metavar='n', type=int, default=10, dest='baseQuality', help='Do not consider bases below n quality. Default=10.')

    args = parser.parse_args()

    components = args.components    # gmm components. For 2 strains 2 components.
    proportionCountThresh = args.proportion_count_threshold
    depthThreshold = args.depthThreshold      # pileup columns with depth less than filter value are skipped. Helps to reduce noise for gmm fitting
    lowerLimit = args.lowerLimit
    upperLimit = args.upperLimit
    regionStart = args.regionStart
    regionEnd = args.regionEnd
    step = args.step
    baseQuality = args.baseQuality    # samtools default mpileup quality filter is 13
    mapQuality = args.mapQuality
    outputDir = args.outputDir
    plotName = args.plotName
    refFastaPath = args.ref     # path to a ref fasta file
    bamFilePath = args.bamFilePath  # path to bam file
    gffFilePath = args.gff
    entropy_step = args.entropy_step
    ethreshold = float(args.entropy_thresh)
    useModel = args.model
    reuseFreqVec = args.reuse
    alpha_level = float(args.alpha_level)

    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO, stream=sys.stdout)

    # Ref path
    installed_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    if refFastaPath == "refs/tuberculosis.fna":
        refFastaPath = os.path.join(installed_path, refFastaPath)
    if gffFilePath == "refs/tuberculosis.filtered-intervals.gff":
        gffFilePath = os.path.join(installed_path, gffFilePath)

    try:
        samfile = pysam.AlignmentFile(bamFilePath, "rb" )     # read bam file
        refFile = pysam.FastaFile(refFastaPath)     # read reference fasta file
    except FileNotFoundError:
        logging.error(f'{bamFilePath} or {refFastaPath} is not found.')
        exit()

    logging.info('splitStrain.py has started.')

    refName = samfile.references[0]
    refLength = samfile.lengths[0]

    # Parsing interval
    if not regionStart:
        regionStart = 0

    if not regionEnd:
        regionEnd = refLength

    if (regionEnd > refLength):
        logging.warning('regionEnd > reference length.')

    interval = regionEnd - regionStart
    if interval < 1000000:
        logging.warning(f'the interval length {interval} is too small.')

    logging.info(f'sample name: {bamFilePath}')
    logging.info(f'reference name: {refName}, reference length: {refLength}')
    logging.info(f'regionStart: {regionStart}, regionEnd: {regionEnd}')
    logging.info(f'depth threshold percent: {depthThreshold}')
    logging.info(f'entropy threshold: {ethreshold}')

    intervals = []  # list of Interval objects. This will be populated if gff file is provided
    freqVec = []    # vector format [a prop, c prop, t prop, g prop, position, depth]
    freqVecCSV = 'freqVec.csv'

    # Create output directory
    os.makedirs(outputDir, exist_ok=True)

    # compute freqVec
    if not reuseFreqVec:

        # If gff file is provided, compute on regions specified in a gff file
        if gffFilePath != '':
            logging.info(f'using gff: {gffFilePath}')
            intervals = getIntervals(gffFilePath, regionStart, regionEnd)
            for interval in intervals:
                freqVec = computeDataFromSam(freqVec, samfile, refFile, baseQuality, mapQuality, interval.start, interval.end)


        else:
            freqVec = computeDataFromSam(freqVec, samfile, refFile, baseQuality, mapQuality, regionStart, regionEnd)

        freqVec = np.array(freqVec)

        # terminate if freqVec has less than 2 entries
        if freqVec.size < 2:
            logging.warning('No SNPs found on the given interval.')
            exit()

        # write freqVec to a file
        try:
            np.savetxt(f'{outputDir}/{freqVecCSV}', freqVec, delimiter=',')
            # np.savetxt(f'{outputDir}/{freqVecCSV}', freqVec, delimiter=',', fmt='%i')
        except IOError:
            logging.error(f'failed to save the csv {outputDir}/{freqVecCSV}.')
            exit()

    # if reuse is set then load freqVec
    else:

        try:
            logging.info(f'loading csv {outputDir}/{freqVecCSV} from the previous run')
            freqVec = np.loadtxt(open(f'{outputDir}/{freqVecCSV}', 'rb'), delimiter=',', dtype=float)
            assert len(freqVec) != 0, f'{freqVecCSV} is empty.'

        except IOError:
            logging.error(f'failed to load the csv {outputDir}/{freqVecCSV}. Please check if the file exists.')
            exit()

        except AssertionError as error:
            logging.error(error)
            exit()


    logging.debug('Starting filterVec()')
    originalFreqVec = freqVec.copy()

    # compute avg depth using freqVec
    avgDepth = freqVec[:,-1].mean()
    minDepth = avgDepth * depthThreshold / 100

    freqVec, entropyVec = filterVec(freqVec, minDepth, ethreshold, entropy_step, lowerLimit, upperLimit)
    plotScatter(outputDir, freqVec, originalFreqVec, plotName, entropyVec, regionStart, regionEnd, lowerLimit, upperLimit)


    num_iter = 20
    init_p = 0.7
    init_err = 0.001

    freqVec = freqVec[np.max(freqVec[:,:4], axis=1) < upperLimit]

    # call single strain if not enough variation is found
    if len(freqVec) < 5:
        logging.info('Not enough variant sites.')
        writeResult(bamFilePath, 0, 0, alpha_level, [1])
        exit()

    # test the null and alternative hypotheses
    thresh, LR = likelyhood_ratio_test(freqVec, alpha_level, upperLimit, num_iter, init_p, init_err)

    # if test calls single strain exit
    if LR < thresh:
        # logging.info(f'LR test result: {bamFilePath} Single strain.')
        writeResult(bamFilePath, LR, thresh, alpha_level, [1])
        exit()


    if components == 2:
        # consider reference base frequencies in the histogram and fitting
        freqVecFlat = np.absolute(freqVec[:,:-2].flatten())
    else:
        # do not consider reference base frequencies; they are negative and will be filtered out below
        freqVecFlat = freqVec[:,:-2].flatten()

    freqVecFlat = freqVecFlat[freqVecFlat > lowerLimit]
    freqVecFlat = freqVecFlat[freqVecFlat < upperLimit]

    # TODO change box size to a parameter
    freqVecFlat = convolveVec(freqVecFlat, proportionCountThresh, [1])

    if freqVecFlat.size < components:
        logging.info('Not enough SNP frequencies.')
        writeResult(bamFilePath, LR, thresh, alpha_level, [1])
        exit()

    # Fit data with Gaussian Mixture
    gmm = fitDataGMM(freqVecFlat, components)
    init_proportions = gmm.means_.flatten()/100

    for p in init_proportions:
        if np.isclose(p,0):
            logging.error('Unable to fit the data. Check if depth filtering, entropy filtering or intervals are reasonable.')
            exit()


    # specify which model to use
    if useModel == 'bmm':
        # Fit data with Binomial Mixture
        avgDepth = int(freqVec[:,-1].mean())
        bmm = fitDataBMM(freqVec, avgDepth, lowerLimit, upperLimit, init_proportions, components)
        bmm.set_prob(bmm.get_proportions()/np.sum(bmm.get_proportions()))
        model = Model(bmm)

    elif useModel == 'gmm':
        model = Model(gmm)

    else:
        logging.error('Wrong model name: Use either gmm or bmm.')
        exit()

    logging.info(f'using the model: {model}')


    means = model.get_strain_proportions()
    means = roundUP(means)

    if components == 2:
        if (means[0] > 50 and means[1] > 50) or (means[0] < 50 and means[1] < 50):
            logging.warning(f'result: Could not fit the data {bamFilePath}. Incorrect means:{means[0]}, {means[1]}. Possibly 50:50 split.')
            exit()

    writeResult(bamFilePath, LR, thresh, alpha_level, means/np.sum(means))

    originalFreqVecFlat = originalFreqVec[:,:-2].flatten()
    originalFreqVecFlat = originalFreqVecFlat[originalFreqVecFlat > 2]
    originalFreqVecFlat = originalFreqVecFlat[originalFreqVecFlat < 98]
    plotHist(outputDir, originalFreqVecFlat, freqVecFlat, gmm, plotName)

    if args.classify:
        logging.info('starting strain separation')
        result = bayesClassifyReads(outputDir, originalFreqVec, refName, samfile, refFile, model, components, baseQuality, mapQuality, step)

        if result == 0:
            logging.info('separation is complete.')
        else:
            logging.error('separation was not completed.')
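
fitDataGMM is defined elsewhere in this script; as a minimal sketch of the kind of one-dimensional fit it performs, assuming scikit-learn's GaussianMixture (the toy frequencies below are illustrative only):

import numpy as np
from sklearn.mixture import GaussianMixture

# toy SNP frequencies (%) clustering around a 30:70 strain mixture
toy_freqs = np.array([28.0, 31.0, 30.2, 69.5, 71.2, 68.8])
gmm = GaussianMixture(n_components=2).fit(toy_freqs.reshape(-1, 1))
print(gmm.means_.flatten() / 100)  # initial strain proportions, roughly 0.3 and 0.7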
Example #16
0
File: extract.py Project: mgem/mGEM2018
                        seq = refbase + "~" + seq
                    # pad to 3 digits b/c ref length is max 3 digits
                    mutation = format(pos + 1, "03") + "_" + seq
                    mutated_reads[mutation].append(name)
        samfile.close()
        with open(
                "./mutations/{}.mutated_reads.pkl".format(
                    os.path.basename(file)), "wb") as f:
            pickle.dump(mutated_reads, f)

    return mutated_reads


# get reference sequence
print("Getting reference sequence...")
reffile = pysam.FastaFile("ref.fa")
ref = reffile.fetch("ref", 0, 150)
reffile.close()

# get counter of all mutations
print("Counting all mutations...")
pathlib.Path('./mutations').mkdir(exist_ok=True)
mutation_counter = Counter()
for file in [f for f in os.listdir("./split") if f.endswith(".bam")]:
    mutation_counter += count_mutations_in_file(ref,
                                                os.path.join("./split", file))

# list 10 most common mutations
mutation_list = mutation_counter.most_common(10)
print("Most common mutations: " + str(mutation_list))
Example #17
0
    def data_processing(self):
        """
        Generate the consensus sequence and find indels.  Write the frequency file.  Called by pathos pool
        :return:
        """

        self.log.info("Begin Processing {}".format(self.index_name))
        """
        Summary_Data List: index_name, total aberrant, left deletions, right deletions, total deletions, left 
        insertions, right insertions, total insertions, microhomology, number filtered, target_name
        """
        target_name = self.index_dict[self.index_name][7]
        self.summary_data = [self.index_name, 0, 0, 0, 0, 0, [0, 0], [0, 0], 'junction data', target_name, [0, 0]]
        junction_type_data = [0, 0, 0, 0, 0]
        read_results_list = []
        results_freq_dict = collections.defaultdict(list)
        refseq = pysam.FastaFile(self.args.RefSeq)

        # Get the genomic 5' coordinate of the reference target region.
        try:
            start = int(self.target_dict[target_name][2])
        except IndexError:
            self.log.error("Target file incorrectly formatted for {}".format(target_name))
            return

        # Get the genomic 3' coordinate of the reference target region.
        stop = int(self.target_dict[target_name][3])

        chrm = self.target_dict[target_name][1]

        # Get the sequence of the sgRNA.
        sgrna = self.target_dict[target_name][4]

        # Get the Target Region.  This allows both types of genomic indices.
        try:
            refseq.fetch(chrm, start, stop)
        except KeyError:
            chrm = "chr{}".format(chrm)

        try:
            self.target_region = refseq.fetch(chrm, start, stop)
        except KeyError:
            self.target_region = str(pyfaidx.Fasta(self.args.RefSeq)[0]).upper()

        # Tool_Box.debug_messenger([target_name, self.target_region])
        self.cutsite_search(target_name, sgrna, chrm, start, stop)
        self.window_mapping()
        loop_count = 0
        start_time = time.time()
        split_time = start_time

        # Extract and process read 1 and read 2 from our list of sequences.
        for seq in self.sequence_list:
            loop_count += 1

            if loop_count % 5000 == 0:
                self.log.info("Processed {} reads of {} for {} in {} seconds. Elapsed time: {} seconds."
                              .format(loop_count, len(self.sequence_list), self.index_name, time.time() - split_time,
                                      time.time() - start_time))
                split_time = time.time()

            consensus_seq = seq

            # No need to attempt an analysis of bad data.
            if consensus_seq.count("N") / len(consensus_seq) > float(self.args.N_Limit):
                self.summary_data[7][0] += 1
                continue

            # No need to analyze sequences that are too short.
            if len(consensus_seq) <= int(self.args.Minimum_Length):
                self.summary_data[7][0] += 1
                continue

            '''
            The summary_data list contains information for a single library.  [0] index name; [1] reads passing all 
            filters; [2] left junction count; [3] right junction count; [4] insertion count; [5] microhomology count; 
            [6] [No junction count, no cut count]; [7] [consensus N + short filtered count, unused]; 
            [8] junction_type_data list; [9] target name; 10 [HR left junction count, HR right junction count]

            The junction_type_data list contains the repair type category counts.  [0] TMEJ, del_size >= 4 and 
            microhomology_size >= 2; [1] NHEJ, del_size < 4 and ins_size < 5; [2] insertions >= 5 
            [3] Junctions with scars not represented by the other categories; [4] Non-MH Deletions, del_size >= 4 and 
            microhomology_size < 2 and ins_size < 5
            '''
            # count reads that pass the read filters
            self.summary_data[1] += 1

            # The cutwindow is used to filter out false positives.
            cutwindow = self.target_region[self.cutsite-4:self.cutsite+4]

            sub_list, self.summary_data = \
                SlidingWindow.sliding_window(
                    consensus_seq, self.target_region, self.cutsite, self.target_length, self.lower_limit,
                    self.upper_limit, self.summary_data, self.left_target_windows, self.right_target_windows, cutwindow,
                    self.hr_donor)

            '''
            The sub_list holds the data for a single consensus read.  These data are [left deletion, right deletion, 
            insertion, microhomology, consensus sequence].  The list could be empty if nothing was found or the 
            consensus was too short.
            '''

            if sub_list:
                read_results_list.append(sub_list)
                freq_key = "{}|{}|{}|{}|{}".format(sub_list[0], sub_list[1], sub_list[2], sub_list[3], sub_list[9])

            else:
                continue

            if freq_key in results_freq_dict:
                results_freq_dict[freq_key][0] += 1
            else:
                results_freq_dict[freq_key] = [1, sub_list]

        self.log.info("Finished Processing {}".format(self.index_name))

        # Write frequency results file
        self.frequency_output(self.index_name, results_freq_dict, junction_type_data)

        # Format and output raw data if user has so chosen.
        if self.args.OutputRawData:
            self.raw_data_output(self.index_name, read_results_list)

        return self.summary_data
Example #18
0
#ctg2_left_rc    177718  A       ctg2_100x_PB_L_5164_1_0 None    *
#code based on: http://pysam.readthedocs.org/en/latest/
#argparse info: http://www.cyberciti.biz/faq/python-command-line-arguments-argv-example/
import pysam
import argparse
import csv
parser = argparse.ArgumentParser(
    description='usage: samtools_view.py --bam reads.bam --bed bed_file.bed')
parser.add_argument('--bam', help='Input bam file name', required=True)
parser.add_argument('--bed', help='Input bedfile name', required=True)
parser.add_argument('--fasta',
                    help='Input fasta reference file name',
                    required=True)
args = parser.parse_args()
bamfile = pysam.AlignmentFile(args.bam, "rb")
fastafile = pysam.FastaFile(args.fasta)
with open(args.bed) as bed:
    reader = csv.reader(bed, delimiter="\t")
    sites = list(reader)

for site in sites:
    start = int(site[1])
    end = int(site[2])
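    # stepper="all" applies samtools-style read filtering (unmapped,
    # secondary, QC-fail and duplicate reads are skipped); max_depth
    # raises the per-column read cap above pysam's default of 8000.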
    pileup = bamfile.pileup(site[0],
                            start,
                            end,
                            stepper="all",
                            max_depth=500000)
    for pileupColumn in pileup:
        for pileupRead in pileupColumn.pileups:
            if (pileupColumn.pos >= start) and (pileupColumn.pos < end) and (
Example #19
0
for annotation in annotations:
    if annotation.consequence:
        cons_annotations.append(annotation)
    else:
        nocons_annotations.append(annotation)

annotationFeatures = []
for annotation in annotations:
    if hasattr(annotation, 'features'):
        annotationFeatures.extend(annotation.features)
    else:
        annotationFeatures.append(annotation.name)
annotationFeatures = OrderedDict.fromkeys(
    annotationFeatures).keys()  # remove duplicates

genome_index = pysam.FastaFile(reference)

for annotation in annotations:
    if hasattr(annotation, 'load'):
        annotation.load(args)

info_columns = vcf_reader.infos['CSQ'].desc.split('Format: ')[1].split('|')

output_columns = ['Chrom', 'Pos', 'Ref', 'Alt', 'Type']
output_columns.extend(annotationFeatures)

if args.header:
    stdout.write('#' + '\t'.join(output_columns) + '\n')

# processing
for record in vcf_reader:
Example #20
0
    def _check(self, files, expected):
        for file, exp in zip(files, expected):
            with pysam.FastaFile(file.name) as fh:
                self.assertEqual(exp, fh.references)
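
FastaFile also works as a context manager, as the test above shows; a small sketch of the index attributes it exposes (the file name is an assumption):

import pysam

with pysam.FastaFile("example.fa") as fh:
    print(fh.references)   # list of sequence names in the FASTA
    print(fh.lengths)      # matching list of sequence lengths
    print(fh.nreferences)  # number of records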
Example #21
0
def main():
    parser = argparse.ArgumentParser(
        description="Script to convert VCF files into tsinfer input.")
    parser.add_argument("source",
                        choices=["1kg", "sgdp", "ukbb"],
                        help="The source of the input data.")
    parser.add_argument("data_file", help="The input data file pattern.")
    parser.add_argument("ancestral_states_file",
                        help="A vcf file containing ancestral allele states. ")
    parser.add_argument("output_file", help="The tsinfer output file")
    parser.add_argument(
        "-m",
        "--metadata_file",
        default=None,
        help="The metadata file containing population and sample data")
    parser.add_argument("-n",
                        "--max-variants",
                        default=None,
                        type=int,
                        help="Keep only the first n variants")
    parser.add_argument(
        "-p",
        "--progress",
        action="store_true",
        help="Show progress bars and output extra information when done")
    parser.add_argument(
        "--ancestral-states-url",
        default=None,
        help="The source of ancestral state information for provenance.")
    parser.add_argument("--reference-name",
                        default=None,
                        help="The name of the reference for provenance.")

    args = parser.parse_args()

    git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"])
    git_provenance = {
        "repo": "git@github.com:mcveanlab/treeseq-inference.git",
        "hash": git_hash.decode().strip(),
        "dir": "human-data",
        "notes": "Use the Makefile to download and process the upstream data files"
    }
    data_provenance = {
        "ancestral_states_url": args.ancestral_states_url,
        "reference_name": args.reference_name
    }

    # Get the ancestral states.
    fasta = pysam.FastaFile(args.ancestral_states_file)
    # NB! We put in an extra character at the start to convert to 1 based coords.
    ancestral_states = "X" + fasta.fetch(reference=fasta.references[0])
    # The largest possible site position is len(ancestral_states). Positions must
    # be strictly less than sequence_length, so we add 1.
    sequence_length = len(ancestral_states) + 1

    converter_class = {
        "1kg": ThousandGenomesConverter,
        "sgdp": SgdpConverter,
        "ukbb": UkbbConverter
    }

    try:
        with tsinfer.SampleData(path=args.output_file,
                                num_flush_threads=2,
                                sequence_length=sequence_length) as samples:
            converter = converter_class[args.source](args.data_file,
                                                     ancestral_states, samples)
            converter.process_metadata(args.metadata_file, args.progress)
            converter.process_sites(args.progress, args.max_variants)
            samples.record_provenance(command=sys.argv[0],
                                      args=sys.argv[1:],
                                      git=git_provenance,
                                      data=data_provenance)
    except Exception:
        os.unlink(args.output_file)
        raise
    print(samples)
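
The "X" padding noted above is a one-character offset trick: prefixing the fetched sequence lets 1-based variant positions index the string directly. A toy illustration:

fetched = "ACGT"                     # what fasta.fetch() returns (0-based)
ancestral_states = "X" + fetched
pos = 1                              # 1-based position of the first base
assert ancestral_states[pos] == "A"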
Example #22
0
def process_messages(processor_pipe, shared_all_loci):
    reads_to_save = []
    print("Starting Message Processing")
    if USE_LOCAL:
        in_file = open(
            "/Users/siakhnin/data/RMNISTHS_30xdownsample_9999999_11000000.mapped.sr.msgpack",
            "r")
        read_source = msgpack.load(in_file, encoding='utf-8')
        in_file.close()
    else:
        read_source = KafkaConsumer(
            'mapped_reads',
            group_id='rheos_common',
            bootstrap_servers=['localhost:9092'],
            value_deserializer=lambda m: json.loads(m.decode('utf-8')))

    start_time = time.time()
    updated_loci = {}

    reference_file = "/Users/siakhnin/data/reference/genome.fa"
    ref = pysam.FastaFile(reference_file).fetch(region="20")
    reads_list = []
    counter = 1
    saver_notified = False

    for message in read_source:
        # message value and key are raw bytes -- decode if necessary!
        # e.g., for unicode: `message.value.decode('utf-8')`
        #print ("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
        #                                      message.offset, message.key,
        #                                      message.value))

        if processor_pipe.poll():
            msg = processor_pipe.recv()
            if msg == SAVED_MSG:
                saver_notified = False

        if not saver_notified:
            processor_pipe.send(NEW_MSG)
            saver_notified = True
            print("Sending message to saver")

        if USE_LOCAL:
            my_read = message
        else:
            my_read = message.value

        updated_loci.update(
            process_read(my_read, ref, 0, len(ref), shared_all_loci,
                         reads_to_save))
        shared_all_loci.update(updated_loci)
        updated_loci = {}
        if counter % 1000 == 0:
            print("Processed {} messages. Updating shared dictionary".format(
                counter))
        #     ts = time.time()
        #     shared_all_loci.update(updated_loci)
        #     te = time.time()
        #     print("Took {} to update {} loci".format(te - ts, len(updated_loci)))
        #     updated_loci = {}
        #break
        counter += 1
        if counter > 20000:
            break
Example #23
0
def vcf2tsv(is_vcf=None,
            is_bed=None,
            is_pos=None,
            bam_fn=None,
            truth=None,
            cosmic=None,
            dbsnp=None,
            mutect=None,
            varscan=None,
            vardict=None,
            lofreq=None,
            scalpel=None,
            strelka=None,
            arbitrary_vcfs=None,
            dedup=True,
            min_mq=1,
            min_bq=5,
            min_caller=0,
            ref_fa=None,
            p_scale=None,
            outfile=None):

    if arbitrary_vcfs is None:
        arbitrary_vcfs = []

    # Convert contig_sequence to chrom_seq dict:
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not:
    if p_scale is None:
        logger.info('NO RE-SCALING')
    elif p_scale.lower() == 'phred':
        p_scale = 'phred'
    elif p_scale.lower() == 'fraction':
        p_scale = 'fraction'
    else:
        p_scale = None
        logger.info('NO RE-SCALING')

    # Define NaN and Inf:
    nan = float('nan')
    inf = float('inf')
    pattern_chr_position = genome.pattern_chr_position

    ## Running
    with genome.open_textfile(mysites) as my_sites, open(outfile,
                                                         'w') as outhandle:

        my_line = my_sites.readline().rstrip()

        bam = pysam.AlignmentFile(bam_fn, reference_filename=ref_fa)
        ref_fa = pysam.FastaFile(ref_fa)

        if truth:
            truth = genome.open_textfile(truth)
            truth_line = genome.skip_vcf_header(truth)

        if cosmic:
            cosmic = genome.open_textfile(cosmic)
            cosmic_line = genome.skip_vcf_header(cosmic)

        if dbsnp:
            dbsnp = genome.open_textfile(dbsnp)
            dbsnp_line = genome.skip_vcf_header(dbsnp)

        # Incorporate the six callers: get through the VCF header lines
        if mutect:
            mutect = genome.open_textfile(mutect)
            mutect_line = genome.skip_vcf_header(mutect)

        if varscan:
            varscan = genome.open_textfile(varscan)
            varscan_line = genome.skip_vcf_header(varscan)

        if vardict:
            vardict = genome.open_textfile(vardict)
            vardict_line = genome.skip_vcf_header(vardict)

        if lofreq:
            lofreq = genome.open_textfile(lofreq)
            lofreq_line = genome.skip_vcf_header(lofreq)

        if scalpel:
            scalpel = genome.open_textfile(scalpel)
            scalpel_line = genome.skip_vcf_header(scalpel)

        if strelka:
            strelka = genome.open_textfile(strelka)
            strelka_line = genome.skip_vcf_header(strelka)

        arbitrary_file_handle = {}
        arbitrary_line = {}
        for ith_arbi, arbitrary_vcf_i in enumerate(arbitrary_vcfs):
            arbitrary_file_handle[ith_arbi] = genome.open_textfile(
                arbitrary_vcf_i)
            arbitrary_line[ith_arbi] = genome.skip_vcf_header(
                arbitrary_file_handle[ith_arbi])

        # Get through all the headers:
        while my_line.startswith('#') or my_line.startswith('track='):
            my_line = my_sites.readline().rstrip()

        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match(genome.pattern_chr_position, my_line)
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # First line:
        header_part_1 = out_header.replace('{', '').replace('}', '')

        additional_arbi_caller_numbers = sorted(arbitrary_file_handle.keys())
        for arbi_caller_num in additional_arbi_caller_numbers:
            header_part_1 = header_part_1 + '\t' + 'if_Caller_{}'.format(
                arbi_caller_num)

        header_last_part = label_header.replace('{', '').replace('}', '')

        outhandle.write('\t'.join((header_part_1, header_last_part)) + '\n')

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:

                my_vcf = genome.Vcf_line(my_line)
                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                variants_at_my_coordinate = []

                alt_bases = my_vcf.altbase.split(',')
                for alt_i in alt_bases:
                    vcf_i = copy(my_vcf)
                    vcf_i.altbase = alt_i
                    variants_at_my_coordinate.append(vcf_i)

                # As long as the "coordinate" stays the same, it will keep reading until it's different.
                while my_coordinates[0] == (my_vcf.chromosome,
                                            my_vcf.position):

                    my_line = my_sites.readline().rstrip()
                    my_vcf = genome.Vcf_line(my_line)

                    ########## This block of code ensures the input VCF file is properly sorted ##########
                    coordinate_j = re.match(genome.pattern_chr_position,
                                            my_line)
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j,
                                          chrom_seq) == 1:
                        raise Exception(
                            '{} does not seem to be properly sorted.'.format(
                                mysites))

                    coordinate_i = coordinate_j
                    ###################################################################################

                    if my_coordinates[0] == (my_vcf.chromosome,
                                             my_vcf.position):

                        alt_bases = my_vcf.altbase.split(',')
                        for alt_i in alt_bases:

                            vcf_i = copy(my_vcf)
                            vcf_i.altbase = alt_i
                            variants_at_my_coordinate.append(vcf_i)

            elif is_bed:
                bed_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(bed_item[0],
                                                     int(bed_item[1]) + 1,
                                                     int(bed_item[2]))

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(pos_item[0],
                                                     int(pos_item[1]),
                                                     int(pos_item[1]))

            elif fai_file:
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(fai_item[0], 1,
                                                     int(fai_item[1]))

            ##### ##### ##### ##### ##### #####
            for my_coordinate in my_coordinates:

                ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                if is_vcf:

                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []

                    for variant_i in variants_at_my_coordinate:

                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]
                        indel_length = len(first_alt) - len(ref_base)

                        ref_bases.append(ref_base)
                        alt_bases.append(first_alt)
                        indel_lengths.append(indel_length)

                        # Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied.
                        if_dbsnp = 1 if re.search(r'rs[0-9]+',
                                                  variant_i.identifier) else 0
                        if_cosmic = 1 if re.search(r'COS[MN][0-9]+',
                                                   variant_i.identifier) else 0
                        if_common = 1 if variant_i.get_info_value(
                            'COMMON') == '1' else 0
                        num_cases = variant_i.get_info_value(
                            'CNT') if variant_i.get_info_value('CNT') else nan

                        if variant_i.identifier == '.':
                            my_identifier_i = set()
                        else:
                            my_identifier_i = variant_i.identifier.split(';')
                            my_identifier_i = set(my_identifier_i)

                        all_my_identifiers.append(my_identifier_i)

                ## If not, 1) get ref_base, first_alt from other VCF files.
                #          2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided)
                else:
                    variants_at_my_coordinate = [
                        None
                    ]  # Just to have something to iterate
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                #################################### Find the same coordinate in those VCF files ####################################
                if mutect:
                    got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(
                        my_coordinate, mutect_line, mutect, chrom_seq)
                if varscan:
                    got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(
                        my_coordinate, varscan_line, varscan, chrom_seq)
                if vardict:
                    got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(
                        my_coordinate, vardict_line, vardict, chrom_seq)
                if lofreq:
                    got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(
                        my_coordinate, lofreq_line, lofreq, chrom_seq)
                if scalpel:
                    got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(
                        my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:
                    got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(
                        my_coordinate, strelka_line, strelka, chrom_seq)
                if truth:
                    got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate(
                        my_coordinate, truth_line, truth, chrom_seq)
                if dbsnp:
                    got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(
                        my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                if cosmic:
                    got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(
                        my_coordinate, cosmic_line, cosmic, chrom_seq)

                got_arbitraries = {}
                arbitrary_variants = {}
                for ith_arbi in arbitrary_file_handle:
                    got_arbitraries[ith_arbi], arbitrary_variants[
                        ith_arbi], arbitrary_line[
                            ith_arbi] = genome.find_vcf_at_coordinate(
                                my_coordinate, arbitrary_line[ith_arbi],
                                arbitrary_file_handle[ith_arbi], chrom_seq)

                # Now, use pysam to look into the tBAM file(s), variant by variant from the input:
                for ith_call, my_call in enumerate(variants_at_my_coordinate):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ((my_call.chromosome, my_call.position),
                                      my_call.refbase, my_call.altbase)

                        ref_base = ref_bases[ith_call]
                        first_alt = alt_bases[ith_call]
                        indel_length = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]

                    else:
                        variant_id = ((my_coordinate[0], my_coordinate[1]),
                                      ref_base, first_alt)

                    # Reset num_caller to 0 for each variant in the same coordinate
                    num_callers = 0

                    #################### Collect Caller Vcf ####################:
                    if mutect:
                        mutect_classification, tlod, ecnt = annotate_caller.ssMuTect(
                            variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = tlod = ecnt = nan

                    if varscan:
                        varscan_classification, score_varscan2 = annotate_caller.ssVarScan(
                            variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = score_varscan2 = nan

                    if vardict:
                        vardict_classification, msi, msilen, shift3, t_pmean, t_pstd, t_qstd = annotate_caller.ssVarDict(
                            variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = t_pmean = t_pstd = t_qstd = nan

                    if lofreq:
                        lofreq_classification = annotate_caller.ssLoFreq(
                            variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan

                    if scalpel:
                        scalpel_classification = annotate_caller.ssScalpel(
                            variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan

                    if strelka:
                        strelka_classification = annotate_caller.ssStrelka(
                            variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = nan

                    arbitrary_classifications = {}
                    for ith_arbi_var in arbitrary_file_handle:
                        arbi_classification_i = annotate_caller.anyInputVcf(
                            variant_id, arbitrary_variants[ith_arbi_var])
                        arbitrary_classifications[
                            ith_arbi_var] = arbi_classification_i
                        num_callers += arbi_classification_i

                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        ########## Ground truth file ##########
                        if truth:
                            if variant_id in truth_variants.keys():
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan

                        ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(
                                variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add(ID_i)

                        ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(
                                variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add(ID_i)

                        ########## ######### INFO EXTRACTION FROM BAM FILES ########## #########
                        # Tumor tBAM file:
                        tBamFeatures = sequencing_features.from_bam(
                            bam, my_coordinate, ref_base, first_alt, min_mq,
                            min_bq)

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(
                            ref_fa, my_coordinate, ref_base, first_alt)

                        # Linguistic sequence complexity in a +/-80bp window, but substring calculation stops at 20-bp substring.
                        seq_span_80bp = ref_fa.fetch(
                            my_coordinate[0], max(0, my_coordinate[1] - 41),
                            my_coordinate[1] + 40)
                        seq_left_80bp = ref_fa.fetch(
                            my_coordinate[0], max(0, my_coordinate[1] - 81),
                            my_coordinate[1])
                        seq_right_80bp = ref_fa.fetch(my_coordinate[0],
                                                      my_coordinate[1],
                                                      my_coordinate[1] + 81)

                        if len(seq_span_80bp) > 20:
                            LC_spanning = sequencing_features.subLC(
                                seq_span_80bp, 20)
                        else:
                            LC_spanning = math.nan

                        if len(seq_left_80bp) > 20:
                            left_LC = sequencing_features.subLC(
                                seq_left_80bp, 20)
                        else:
                            left_LC = math.nan

                        if len(seq_right_80bp) > 20:
                            right_LC = sequencing_features.subLC(
                                seq_right_80bp, 20)
                        else:
                            right_LC = math.nan

                        LC_adjacent = min(left_LC, right_LC)

                        LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40)
                        LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40)

                        # Fill the ID field of the TSV/VCF
                        my_identifiers = ';'.join(
                            my_identifiers) if my_identifiers else '.'

                        ###
                        out_line_part_1 = out_header.format( \
                        CHROM                      = my_coordinate[0],                                                    \
                        POS                        = my_coordinate[1],                                                    \
                        ID                         = my_identifiers,                                                      \
                        REF                        = ref_base,                                                            \
                        ALT                        = first_alt,                                                           \
                        if_MuTect                  = mutect_classification,                                               \
                        if_Strelka                 = strelka_classification,                                              \
                        if_VarScan2                = varscan_classification,                                              \
                        if_VarDict                 = vardict_classification,                                              \
                        if_LoFreq                  = lofreq_classification,                                               \
                        if_Scalpel                 = scalpel_classification,                                              \
                        VarScan2_Score             = rescale(score_varscan2,      'phred', p_scale, 1001),                \
                        if_dbsnp                   = if_dbsnp,                                                            \
                        COMMON                     = if_common,                                                           \
                        if_COSMIC                  = if_cosmic,                                                           \
                        COSMIC_CNT                 = num_cases,                                                           \
                        Consistent_Mates           = tBamFeatures['consistent_mates'],                                    \
                        Inconsistent_Mates         = tBamFeatures['inconsistent_mates'],                                  \
                        Seq_Complexity_Span        = LC_spanning_phred,                                                   \
                        Seq_Complexity_Adj         = LC_adjacent_phred,                                                   \
                        M2_TLOD                    = tlod,                                                                \
                        M2_ECNT                    = ecnt,                                                                \
                        MSI                        = msi,                                                                 \
                        MSILEN                     = msilen,                                                              \
                        SHIFT3                     = shift3,                                                              \
                        MaxHomopolymer_Length      = homopolymer_length,                                                  \
                        SiteHomopolymer_Length     = site_homopolymer_length,                                             \
                        T_DP                       = tBamFeatures['dp'],                                                  \
                        tBAM_REF_MQ                = '%g' % tBamFeatures['ref_mq'],                                       \
                        tBAM_ALT_MQ                = '%g' % tBamFeatures['alt_mq'],                                       \
                        tBAM_p_MannWhitneyU_MQ     = '%g' % tBamFeatures['p_mannwhitneyu_mq'],                            \
                        tBAM_REF_BQ                = '%g' % tBamFeatures['ref_bq'],                                       \
                        tBAM_ALT_BQ                = '%g' % tBamFeatures['alt_bq'],                                       \
                        tBAM_p_MannWhitneyU_BQ     = '%g' % tBamFeatures['p_mannwhitneyu_bq'],                            \
                        tBAM_REF_NM                = '%g' % tBamFeatures['ref_NM'],                                       \
                        tBAM_ALT_NM                = '%g' % tBamFeatures['alt_NM'],                                       \
                        tBAM_NM_Diff               = '%g' % tBamFeatures['NM_Diff'],                                      \
                        tBAM_REF_Concordant        = tBamFeatures['ref_concordant_reads'],                                \
                        tBAM_REF_Discordant        = tBamFeatures['ref_discordant_reads'],                                \
                        tBAM_ALT_Concordant        = tBamFeatures['alt_concordant_reads'],                                \
                        tBAM_ALT_Discordant        = tBamFeatures['alt_discordant_reads'],                                \
                        tBAM_Concordance_FET       = rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \
                        T_REF_FOR                  = tBamFeatures['ref_for'],                                             \
                        T_REF_REV                  = tBamFeatures['ref_rev'],                                             \
                        T_ALT_FOR                  = tBamFeatures['alt_for'],                                             \
                        T_ALT_REV                  = tBamFeatures['alt_rev'],                                             \
                        tBAM_StrandBias_FET        = rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),  \
                        tBAM_p_MannWhitneyU_EndPos = '%g' % tBamFeatures['p_mannwhitneyu_endpos'],                        \
                        tBAM_REF_Clipped_Reads     = tBamFeatures['ref_SC_reads'],                                        \
                        tBAM_ALT_Clipped_Reads     = tBamFeatures['alt_SC_reads'],                                        \
                        tBAM_Clipping_FET          = rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),    \
                        tBAM_MQ0                   = tBamFeatures['MQ0'],                                                 \
                        tBAM_Other_Reads           = tBamFeatures['noise_read_count'],                                    \
                        tBAM_Poor_Reads            = tBamFeatures['poor_read_count'],                                     \
                        tBAM_REF_InDel_3bp         = tBamFeatures['ref_indel_3bp'],                                       \
                        tBAM_REF_InDel_2bp         = tBamFeatures['ref_indel_2bp'],                                       \
                        tBAM_REF_InDel_1bp         = tBamFeatures['ref_indel_1bp'],                                       \
                        tBAM_ALT_InDel_3bp         = tBamFeatures['alt_indel_3bp'],                                       \
                        tBAM_ALT_InDel_2bp         = tBamFeatures['alt_indel_2bp'],                                       \
                        tBAM_ALT_InDel_1bp         = tBamFeatures['alt_indel_1bp'],                                       \
                        InDel_Length               = indel_length)

                        additional_caller_columns = []
                        for arbi_key_i in additional_arbi_caller_numbers:
                            additional_caller_columns.append(
                                str(arbitrary_classifications[arbi_key_i]))
                        additional_caller_columns = '\t'.join(
                            additional_caller_columns)

                        label_column = label_header.format(
                            TrueVariant_or_False=judgement)

                        if len(additional_arbi_caller_numbers) > 0:
                            out_line = '\t'.join(
                                (out_line_part_1, additional_caller_columns,
                                 label_column))
                        else:
                            out_line = '\t'.join(
                                (out_line_part_1, label_column))

                        # Print it out to stdout:
                        outhandle.write(out_line + '\n')

            # Read into the next line:
            if not is_vcf:
                my_line = my_sites.readline().rstrip()

        ##########  Close all open files if they were opened  ##########
        opened_files = [
            ref_fa, bam, truth, cosmic, dbsnp, mutect, varscan, vardict,
            lofreq, scalpel, strelka
        ]
        opened_files.extend(arbitrary_file_handle.values())
        for opened_file in opened_files:
            if opened_file:
                opened_file.close()
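
A hypothetical invocation of vcf2tsv (every path below is a placeholder; callers that were not run can stay at their None defaults):

vcf2tsv(is_vcf="candidates.vcf", bam_fn="tumor.bam", ref_fa="ref.fa",
        mutect="mutect.vcf", outfile="features.tsv")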
Example #24
0
def find_saturation(bam, ref, start, end, chrom, rs, re, output):
	"""
	Reads the BAM file and counts each base at a specific aligned position.
	Compares those reads to the reference and calculates frequency of the SNPs at each
	position.
	:param string bam: BAM file pathway.
	:param string ref: Reference file pathway.
	:param int start: Start position.
	:param int end: End position.
	:param string chrom: Chromosome name.
	:param float rs: Range start number.
	:param float re: Range end number.
	:param string output: Output file pathway.
	:return: a dictionary of mutations at each position.
	"""

	# Read the BAM file.
	bamfile = pysam.AlignmentFile(bam, 'rb')

	# Read reference FASTA file.
	fastafile = pysam.FastaFile(ref)

	mutations = {}
	total_reads = 0

	# fetch() returns all reads overlapping a region sorted by the first aligned base in the reference
	# 	sequence. Note that it will also return reads that are only partially overlapping with 
	#	the region.
	# Create the dictionary of dictionaries for SNPs at each position.
	for read in bamfile.fetch(chrom, start, end):

		# read.positions gives an array of all the positions of each sequence.
		positions = read.get_reference_positions()
		sequence = read.query_alignment_sequence # Don't want soft clipped bases.
		quality = read.mapping_quality
		q_quality = read.query_alignment_qualities

		# Disregard any reads that don't have high mapping accuracy.
		if quality < 40:
			continue
		
		for i in range(len(positions) - 1):

			# Check the probability that the base at this position is wrong.
			if q_quality[i] < 30 or sequence[i] == 'N':
				continue

			# Make sure that we compute just the specified region.
			if positions[i] >= end or positions[i] < start:
				break

			# Positions start at index 0 which is fine since reference also starts at 0.
			# Position number will be incremented for VCF file creation. 
			if positions[i] not in mutations:
				atcg = {'A': 0, 'T':0, 'C':0, 'G':0}
				atcg[sequence[i]] += 1
				mutations[positions[i]] = atcg
			else:				
				mutations[positions[i]][sequence[i]] += 1

	bamfile.close()	

	# No-ref approach with separate mutation nucleotide fractions.
	#mutations = calculate_fractions(mutations, fastafile)

	# No-ref approach with collected non-ref mutation fractions.
	mutations, positions = calculate_fractions_overall(mutations, fastafile, chrom, re, rs, output)

	fastafile.close()

	return mutations, positions
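
A hypothetical invocation of the function above (every path and coordinate is a placeholder):

mutations, positions = find_saturation(
    bam="sample.bam", ref="ref.fa", start=0, end=10000,
    chrom="chr1", rs=0.0, re=1.0, output="out_prefix")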
Example #25
0
import pysam
import math
import statistics

bam = pysam.AlignmentFile("/home/minime/Scrivania/TEST/20161213_02_Conn.bam",
                          "rb")
fasta = pysam.FastaFile("/home/minime/NGS_TOOLS/hg19/ucsc.hg19.fasta")

chrom = 'chr2'
start = 21225013
stop = 21225014

#print bam.count(reference=chrom, start=start, end=stop, until_eof=False, read_callback='nofilter')
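# count_coverage returns four arrays of per-position counts over the
# half-open interval [start, stop), one array each for A, C, G and T.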
print(bam.count_coverage(reference=chrom, start=start, end=stop))
#print bam.parse_region(reference=chrom, start=start, end=stop, tid=None)

# for pc in bam.pileup(reference=chrom, start=start, end=stop):
# 	for reads in pc.pileups:
# 		print reads

QB = []
MQ = []
BQ = []

for pileupcolumn in bam.pileup(reference=chrom, start=start, end=stop):
    if pileupcolumn.reference_pos >= start and pileupcolumn.reference_pos < stop:
        for pileupread in pileupcolumn.pileups:
            QB += [
                pileupread.alignment.query_qualities[pileupread.query_position]
            ]
            MQ += [pileupread.alignment.mapping_quality]
Example #26
0
def main():
    # arguments
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-call_fa',
                        help='Callable sites fasta file',
                        required=True)
    parser.add_argument('-vcf',
                        help='Vcf file to extract site frequencies from',
                        required=True)
    parser.add_argument(
        '-cds_bed',
        help='Bed file of zerofold sites, in form chr\tstart\tstop\tgene_id',
        required=True)
    parser.add_argument('-out',
                        help='Output file location and name',
                        required=True)
    parser.add_argument('-sub',
                        help='If specified will submit script to cluster',
                        action='store_true',
                        default=False)
    parser.add_argument('-evolgen',
                        help='If specified will submit script to lab queue',
                        default=False,
                        action='store_true')
    args = parser.parse_args()

    # submission loop
    if args.sub is True:
        command_line = [
            ' '.join([x for x in sys.argv if x != '-sub' and x != '-evolgen'])
        ]
        q_sub(command_line,
              out=args.out.replace('.txt', '') + 'gene_pi0_pi4',
              evolgen=args.evolgen,
              t=48,
              mem=15,
              rmem=15)
        sys.exit()

    # variables
    call_fa = pysam.FastaFile(args.call_fa)
    vcf = pysam.VariantFile(args.vcf)
    gene_coords = bed_to_dict(args.cds_bed)
    number_samples = len(vcf.header.samples)
    out = open(args.out, 'w')

    # gene by gene calcs
    print('trans_id',
          'pi_indel',
          'theta_indel',
          'tajd_indel',
          sep='\t',
          file=out)

    for chromosome in gene_coords.keys():
        chr_string = call_fa.fetch(chromosome)
        for trans in gene_coords[chromosome].keys():

            call_sites = ''
            allele_freqs = []
            for pos in gene_coords[chromosome][trans]:

                # get callable site
                call_pos = chr_string[pos]
                call_sites += call_pos

                # get vcf site (try to)
                var_record = [x for x in vcf.fetch(chromosome, pos, pos + 1)]
                if len(var_record) == 1:
                    allele_freq = round(
                        var_record[0].info['AC'][0] /
                        float(number_samples * 2), 3)
                    allele_freqs.append(allele_freq)

            # count callable sites for transcript
            n_callable = call_sites.upper().count('K')

            # calc pi
            if len(allele_freqs) == 0:
                pie = 0
                theta = 0
                tajd = 0
            else:
                pie = pi(number_samples, allele_freqs)
                theta = theta_w(number_samples, len(allele_freqs))
                tajd = tajimas_d(number_samples, allele_freqs)

            if n_callable != 0:
                pie_per_site = pie / float(n_callable)
                theta_per_site = theta / float(n_callable)
            else:
                pie_per_site, theta_per_site = 0.0, 0.0

            print(trans,
                  pie_per_site,
                  theta_per_site,
                  tajd,
                  sep='\t',
                  file=out)

    out.close()
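
Like FastaFile.fetch, pysam.VariantFile.fetch takes 0-based, half-open coordinates, which is why the per-site lookup above asks for fetch(chromosome, pos, pos + 1). A minimal sketch (the file name is an assumption; the VCF must be bgzipped and indexed for fetch to work):

import pysam

vcf = pysam.VariantFile("variants.vcf.gz")
records = list(vcf.fetch("chr1", 999, 1000))  # records overlapping 1-based position 1000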
Example #27
0
	def __init__(self, snps, fastaFile, filepath_index):
		self.snps = snps
		self.faFile = pysam.FastaFile(fastaFile, filepath_index)
		self.chromosomesWithSNPs = {}
		self.dico = {'AG':'R', 'CT':'Y', 'GC':'S', 'AT':'W', 'GT':'K', 'AC':'M', 'CGT':'B', 'AGT':'D', 'ACT':'H', 'ACG':'V', 'ACTG':'N'}
		print('getTranscriptInformation OK')
Example #28
0
def annotate_vcf_n_reads(args):
    """Entry point to annotate a vcf with read depth and supporting reads."""
    ref_fasta = pysam.FastaFile(args.ref_fasta)

    vcf = VCFReader(args.vcf)
    chrom = None
    pref = 'Depth of reads '
    suff = ' by strand (fwd, rev)'
    g_open = 5
    g_ext = 3
    # use parasail.dnafull (match 5, mismatch -4)
    # change INFO below if you change this.
    matrix = parasail.dnafull
    # check it is indeed a symmetric
    match = matrix.matrix[0, 0]
    mismatch = matrix.matrix[0, 1]
    assert dict(
        zip(*np.unique(matrix.matrix[:4, :4], return_counts=True))) == {
            mismatch: 12,
            match: 4
        }
    assert np.unique(matrix.matrix.diagonal()[:4])[0] == match
    ann_meta = [
        ('INFO', 'DP', 1, 'Integer', pref + 'at pos'),
        ('INFO', 'DPS', 2, 'Integer', pref + 'at pos' + suff),
        ('INFO', 'DPSP', 1, 'Integer',
         pref + 'spanning pos +-{}'.format(args.pad)),
        ('INFO', 'SR', '.', 'Integer', 'Depth of spanning reads by strand ' +
         'which best align to each allele ' +
         '(ref fwd, ref rev, alt1 fwd, alt1 rev, etc.)'),
        ('INFO', 'AR', 2, 'Integer', 'Depth of ambiguous spanning reads by ' +
         'strand which align equally well to all alleles (fwd, rev)'),
        ('INFO', 'SC', '.', 'Integer', 'Total alignment score to each allele' +
         ' of spanning reads by strand ' +
         '(ref fwd, ref rev, alt1 fwd, alt1 rev, etc.) aligned with parasail' +
         ' match {}, mismatch {}, open {}, extend {}'.format(
             match, mismatch, g_open, g_ext)),
    ]
    meta_info = vcf.meta + [str(MetaInfo(*m)) for m in ann_meta]
    with VCFWriter(args.vcfout,
                   'w',
                   version='4.1',
                   contigs=vcf.chroms,
                   meta_info=meta_info) as vcf_writer:
        for v in vcf.fetch():
            if chrom is None or chrom != v.chrom:
                chrom = v.chrom
                ref_seq = ref_fasta.fetch(chrom)

            # get read depth by strand at the variant (without padding)
            depth_by_strand = collections.Counter()
            # medaka.features.get_trimmed_reads seems to behave oddly if the
            # region only spans 1 base, hence v.pos + 2
            var_reg = medaka.common.Region(chrom, v.pos, v.pos + 2)
            reads = get_trimmed_reads(args.bam,
                                      var_reg,
                                      partial=True,
                                      read_group=args.RG)
            for is_rev, _ in reads:
                depth_by_strand[is_rev] += 1
            v.info['DP'] = str(sum(depth_by_strand.values()))
            v.info['DPS'] = '{},{}'.format(depth_by_strand[False],
                                           depth_by_strand[True])

            # get read depth by strand at the variant (with padding)
            padded_haps, pad_reg = get_padded_haplotypes(v, ref_seq, args.pad)
            reads = get_trimmed_reads(args.bam,
                                      pad_reg,
                                      partial=False,
                                      read_group=args.RG)
            counts, scores = align_reads_to_haps(reads, padded_haps, g_open,
                                                 g_ext, matrix)
            v.info['DPSP'] = str(sum(counts.values()))  # stringify like the other INFO fields
            sr = []  # counts of supporting reads for each hap by strand
            sc = []  # total scores for each hap by strand
            haps = list(range(1 + len(v.alt)))  # ref and alts
            is_revs = [False, True]
            for hap in haps:
                for is_rev in is_revs:
                    sr.append(counts[(is_rev, hap)])
                    sc.append(scores[(is_rev, hap)])
            v.info['SR'] = ','.join(map(str, sr))
            v.info['SC'] = ','.join(map(str, sc))
            v.info['AR'] = '{},{}'.format(
                *[counts[(is_rev, None)] for is_rev in is_revs])
            vcf_writer.write_variant(v)
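get_trimmed_reads, get_padded_haplotypes and align_reads_to_haps come from elsewhere in the package. As a rough sketch of the idea behind get_padded_haplotypes (an assumption about its behaviour, not medaka's actual implementation), each allele is spliced into the reference with pad bases of flanking context, and the padded region is returned so reads can be trimmed to it:

def get_padded_haplotypes(v, ref_seq, pad):
    # splice each allele (ref first, then alts) into the reference,
    # keeping `pad` bases of flanking sequence on either side
    start = max(0, v.pos - pad)
    end = min(len(ref_seq), v.pos + len(v.ref) + pad)
    left = ref_seq[start:v.pos]
    right = ref_seq[v.pos + len(v.ref):end]
    haps = [left + allele + right for allele in [v.ref] + list(v.alt)]
    return haps, medaka.common.Region(v.chrom, start, end)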
Example #29
def variants_from_hdf(args):
    """Entry point for variant calling from HDF5 files.

    A `LabelScheme` read from HDF must define both a `decode_variants`
    and `decode_consensus` method. The latter is used with `join_samples`
    to detect multi-locus variants spanning `Sample` slice boundaries.

    """
    logger = medaka.common.get_named_logger('Variants')

    index = medaka.datastore.DataIndex(args.inputs)

    if args.regions is None:
        args.regions = index.regions

    # lookup LabelScheme stored in HDF5
    try:
        label_scheme = index.metadata['label_scheme']
    except KeyError:
        logger.debug("Could not find `label_scheme` metadata in input file, "
                     "assuming HaploidLabelScheme.")
        label_scheme = medaka.labels.HaploidLabelScheme()

    logger.debug("Label decoding is:\n{}".format('\n'.join(
        '{}: {}'.format(k, v) for k, v in label_scheme._decoding.items())))

    if not hasattr(label_scheme, 'decode_variants'):
        raise AttributeError(
            '{} does not support decoding of variants'.format(label_scheme))

    if not hasattr(label_scheme, 'decode_consensus'):
        raise AttributeError('{} does not support consensus decoding required '
                             'for variant calling.'.format(label_scheme))

    # tell label_scheme whether we want verbose info fields
    label_scheme.verbose = args.verbose

    meta_info = label_scheme.variant_metainfo

    with pysam.FastaFile(args.ref_fasta) as fa:
        lengths = dict(zip(fa.references, fa.lengths))

    with medaka.vcf.VCFWriter(args.output,
                              'w',
                              version='4.1',
                              contigs=[
                                  '{},length={}'.format(
                                      r.ref_name, lengths[r.ref_name])
                                  for r in args.regions
                              ],
                              meta_info=meta_info) as vcf_writer:
        for reg in args.regions:
            logger.info("Processing {}.".format(reg))
            with pysam.FastaFile(args.ref_fasta) as fa:
                ref_seq = fa.fetch(reference=reg.ref_name).upper()

            samples = index.yield_from_feature_files([reg])
            trimmed_samples = medaka.common.Sample.trim_samples(samples)
            joined_samples = join_samples(trimmed_samples, ref_seq,
                                          label_scheme)

            for sample in joined_samples:
                variants = label_scheme.decode_variants(
                    sample, ref_seq, ambig_ref=args.ambig_ref)
                vcf_writer.write_variants(variants, sort=True)
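The docstring above describes a duck-typed contract rather than a formal interface. A minimal illustrative stub satisfying it (names and bodies here are assumptions, not medaka.labels.HaploidLabelScheme):

class MinimalLabelScheme:
    # illustrative only: the real schemes live in medaka.labels
    _decoding = {0: '*', 1: 'A', 2: 'C', 3: 'G', 4: 'T'}
    verbose = False

    def decode_consensus(self, sample):
        # map per-position labels back to bases, dropping gap symbols
        return ''.join(self._decoding[label]
                       for label in sample.labels).replace('*', '')

    def decode_variants(self, sample, ref_seq, ambig_ref=False):
        # compare the decoded consensus against ref_seq and yield
        # Variant records; elided here
        raise NotImplementedError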
Example #30
def main():
    
    args = supply_args()

    handle_out_vcf = open(args.output, 'w')

    with open(args.input, 'r') as vcf:  # 'rU' mode was removed in Python 3.11
        for line in vcf:
            if not line.startswith('#'):
                new_line = line.rstrip('\n').split('\t')
                chrom = new_line[0]
                pos = new_line[1]
                rsid = new_line[2]
                ref = new_line[3]
                alts = new_line[4]
                qual = new_line[5]
                filt = new_line[6]
                info = new_line[7]
                samples = new_line[9:]

                if ',' in alts:
                    alt_allele = alts.split(',')
                    genos = geno_prob_parse(len(alt_allele))
                    for i in range(1, len(alt_allele)+1):
                        # Handle '*' (spanning-deletion) alleles by left-anchoring
                        # on the preceding reference base; use per-allele copies of
                        # pos/ref so the shift cannot leak into later alt alleles.
                        this_pos, this_ref = pos, ref
                        if alt_allele[i-1] == '*':
                            extra_base = pysam.FastaFile(args.ref).fetch(chrom, int(pos)-2, int(pos)-1)
                            this_pos = str(int(pos) - 1)
                            this_ref = extra_base + ref
                            alt_allele[i-1] = extra_base
                        to_write = [chrom, this_pos, rsid, this_ref]
                        gl_ind = include_gl(genos, i)
                        to_write.append(alt_allele[i-1])
                        to_write.extend([qual, filt])
                        to_write.append(info_break(info, i-1))

                        try:
                            fmt = new_line[8]
                            to_write.append(fmt)
                            for sample in samples:
                                broke_samp = sample_break(fmt, sample)
                                # Work up the SAMPLE section.
                                # GT:DP:AD:RO:QR:AO:QA:GL
                                # 0/1:1206:597,608:597:23045:608:23566:-1753.83,0,-1708.68:0.5045643:0.542
                                new_samp = []
                                for field in fmt.split(':'):
                                    if field == 'GT':
                                        if 'GL' in broke_samp:
                                            new_field = new_gt(broke_samp, gl_ind, 'GL')
                                        elif 'PL' in broke_samp:
                                            new_field = new_gt(broke_samp, gl_ind, 'PL')
                                        else:
                                            if broke_samp['GT'] != '1/1':
                                                new_field = '0/1'
                                            else:
                                                new_field = '1/1'
                                    elif field in ('AD', 'F1R2', 'F2R1', 'MBQ', 'MFRL'):
                                        this_samp_ad_ref = broke_samp[field].split(',')[0]
                                        this_samp_ad_alt = broke_samp[field].split(',')[i]
                                        new_field = ','.join([this_samp_ad_ref, this_samp_ad_alt])
                                    elif field in ('AO', 'QA', 'AF'):
                                        new_field = broke_samp[field].split(',')[i-1]
                                    elif field == 'GL' or field == 'PL':
                                        new_field = collect_gls(gl_ind, broke_samp, field)
                                    else:
                                        new_field = broke_samp[field]

                                    new_samp.append(new_field)
                                to_write.append(':'.join(new_samp))
                        except Exception as exc:
                            print("Something went wrong:", exc)
                        handle_out_vcf.write('\t'.join(to_write))
                        handle_out_vcf.write('\n')
                else:
                    handle_out_vcf.write(line)
            else:
                handle_out_vcf.write(line)

    handle_out_vcf.close()
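geno_prob_parse, include_gl and collect_gls are not shown; they evidently map the multi-allelic GL/PL field onto each biallelic record being written. VCF 4.x orders genotype likelihoods by index(j, k) = k*(k+1)/2 + j for allele indices j <= k, so the three entries to keep for alt allele i can be computed as below (a sketch of the ordering rule, not the script's actual helpers):

def gl_indices_for_alt(i):
    # positions of the 0/0, 0/i and i/i likelihoods in a multi-allelic
    # GL/PL field, per the VCF ordering index(j, k) = k*(k+1)//2 + j
    def idx(j, k):
        return k * (k + 1) // 2 + j
    return [idx(0, 0), idx(0, i), idx(i, i)]

# e.g. for the second alt allele (i = 2) of a tri-allelic site the
# biallelic GL triple comes from positions 0, 3 and 5
assert gl_indices_for_alt(2) == [0, 3, 5]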