def __init__(self,
             reference_file: str = None,
             sim_output: str = None,
             sequencing_error: float = 0.005,
             mutation_rate: float = 0.0010,
             mutation_indel_fraction: float = 0.15,
             indel_extension_probability: float = 0.15,
             random_seed: int = -1,
             paired_end: bool = False,
             read_length: int = 100,
             read_depth: int = 20,
             undirectional: bool = False,
             methylation_reference: str = None,
             cgmap: str = None,
             ambiguous_base_cutoff: float = 0.05,
             haplotype_mode: bool = False,
             pe_fragment_size: int = 400,
             insert_deviation: int = 25,
             mean_insert_size: int = 100,
             collect_ch_sites: bool = True,
             collect_sim_stats: bool = False,
             verbose: bool = True,
             overwrite_db: bool = False):
    """Assemble the wgsim command line and the simulation state.

    Args:
        reference_file: forwarded to SetCytosineMethylation as reference_file
        sim_output: stored on the instance and forwarded to
            SetCytosineMethylation as sim_dir
        sequencing_error: wgsim ``-e`` value
        mutation_rate: wgsim ``-r`` value
        mutation_indel_fraction: wgsim ``-R`` value
        indel_extension_probability: wgsim ``-X`` value
        random_seed: wgsim ``-S`` value
        paired_end: stored on the instance
        read_length: wgsim ``-1`` and ``-2`` value, also the first element
            of ``read_coverage``
        read_depth: second element of ``read_coverage``
        undirectional: stored on the instance
        methylation_reference: forwarded to SetCytosineMethylation
        cgmap: forwarded to SetCytosineMethylation
        ambiguous_base_cutoff: wgsim ``-A`` value
        haplotype_mode: when truthy, ``-h`` is appended to the command
        pe_fragment_size: wgsim ``-d`` value
        insert_deviation: wgsim ``-s`` value
        mean_insert_size: wgsim ``-I`` value
        collect_ch_sites: forwarded to SetCytosineMethylation
        collect_sim_stats: stored on the instance
        verbose: progress bars are disabled when this is falsy
        overwrite_db: forwarded to SetCytosineMethylation
    """
    # only the second of the three external paths (wgsim) is needed here
    _, wgsim_executable, _ = get_external_paths()

    # (flag, value) pairs in the order they appear on the command line
    wgsim_options = (
        ('-1', read_length),
        ('-2', read_length),
        ('-e', sequencing_error),
        ('-d', pe_fragment_size),
        ('-s', insert_deviation),
        ('-r', mutation_rate),
        ('-R', mutation_indel_fraction),
        ('-X', indel_extension_probability),
        ('-S', random_seed),
        ('-A', ambiguous_base_cutoff),
        ('-I', mean_insert_size),
    )
    command = [wgsim_executable]
    for flag, value in wgsim_options:
        command.extend((flag, str(value)))
    if haplotype_mode:
        command.append('-h')
    self.sim_command = command

    self.sim_db = SetCytosineMethylation(reference_file=reference_file,
                                         sim_dir=sim_output,
                                         methylation_reference=methylation_reference,
                                         cgmap=cgmap,
                                         collect_ch_sites=collect_ch_sites,
                                         overwrite_db=overwrite_db)

    # sim_output and paired_end are assigned before get_output_objects is read
    self.sim_output = sim_output
    self.paired_end = paired_end
    self.output_objects = self.get_output_objects
    self.undirectional = undirectional
    self.read_coverage = (read_length, read_depth)
    self.reference = self.sim_db.reference
    self.collect_sim_stats = collect_sim_stats
    # attribute spelling is left untouched; code outside this block may read it
    self.tqdm_disabe = not verbose

    # per-contig working state, filled in later
    self.current_contig = None
    self.contig_profile = None
    self.contig_values = {}
    self.variant_data = {}
def __init__(self, genome_database: str = None, block_size: int = None):
    """Prepare the index output directory and open the reference output file.

    Args:
        genome_database: database location, passed through
            ``generate_genome_directory`` before being stored
        block_size: stored unchanged on the instance
    """
    # only the first of the three external paths (bwa) is kept
    bwa_executable, _, _ = get_external_paths()

    # format genome_database path
    formatted_database = self.generate_genome_directory(genome_database)
    self.genome_database = formatted_database
    self.block_size = block_size
    self.bwa_path = bwa_executable

    # set output object; the handle stays open on the instance
    # NOTE(review): the file name is appended with no separator, so the
    # formatted path presumably ends with one - confirm
    reference_output_path = f'{self.genome_database}BSB_ref.fa'
    self.database_output = open(reference_output_path, 'w')
def __init__(self,
             reference_file: str = None,
             genome_database: str = None,
             lower_bound: int = 30,
             upper_bound: int = 500,
             cut_format: str = 'C-CGG',
             block_size: int = None,
             ignore_alt: bool = False):
    """Set up the inputs, outputs and bookkeeping for an RRBS index build.

    Args:
        reference_file: wrapped in an OpenFasta object
        genome_database: forwarded to IndexOutput
        lower_bound: stored unchanged on the instance
        upper_bound: stored unchanged on the instance
        cut_format: forwarded to ProcessCutSites
        block_size: forwarded to IndexOutput
        ignore_alt: stored unchanged on the instance
    """
    # external paths are still resolved first; none of the three is used here
    _, _, _ = get_external_paths()

    fasta_reader = OpenFasta(fasta=reference_file)
    output_writer = IndexOutput(genome_database=genome_database,
                                block_size=block_size)
    self.reference_file = fasta_reader
    self.index_output = output_writer

    self.lower_bound, self.upper_bound = lower_bound, upper_bound
    self.ignore_alt = ignore_alt
    self.cut_sites = ProcessCutSites(cut_format=cut_format)

    # containers start empty
    self.mappable_regions = list()
    self.contig_size_dict = dict()
def align_reads(self):
    """Launch the bwa alignment and pipe its output to a BAM file.

    Starts ``self.alignment_commands`` with stdout and stderr piped, feeds
    its stdout to the stream_bam executable (writing ``{self.output}.bam``
    with ``self.output_threads`` threads) and watches the aligner's stderr.
    Stderr lines that start with ``'BSStat '`` are parsed as
    ``category: count`` and added to ``self.mapping_statistics``; every
    other non-empty line is printed.

    Each stderr line is read exactly once, so no ``BSStat`` line is
    consumed without being counted.

    Raises:
        AssertionError: if ``self.output`` contains ``/`` and its parent
            path does not exist
    """
    _, _, stream_bam = get_external_paths()
    if '/' in self.output:
        assert os.path.exists('/'.join(self.output.split(
            '/')[0:-1])), f"output path {self.output} not valid"
    alignment_run = subprocess.Popen(self.alignment_commands,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     universal_newlines=True)
    bam_compression = subprocess.Popen([
        stream_bam, '-@',
        str(self.output_threads), '-o', f'{self.output}.bam'
    ], stdin=alignment_run.stdout)
    # release the parent's copy of the pipe so the aligner receives SIGPIPE
    # (instead of blocking forever) if the compressor exits early
    alignment_run.stdout.close()

    def record_stderr_line(raw_line):
        # route one stderr line: count BSStat entries, echo everything else
        alignment_info = raw_line.strip()
        if not alignment_info:
            return
        if alignment_info[0:7] == 'BSStat ':
            category, count = alignment_info.replace('BSStat ',
                                                     '').split(': ')
            self.mapping_statistics[category] += int(count)
        else:
            print(alignment_info)

    # watch alignment progress, output stderr and collect alignment stats
    while True:
        record_stderr_line(alignment_run.stderr.readline())
        if bam_compression.returncode:
            break
        elif alignment_run.returncode:
            break
        elif alignment_run.poll() is not None and bam_compression.poll(
        ) is not None:
            # both processes are done: consume whatever is still buffered
            # on stderr so trailing statistics are not lost
            for remaining_line in alignment_run.stderr:
                record_stderr_line(remaining_line)
            alignment_run.stdout.close()
            alignment_run.stderr.close()
            break
import copy
import unittest
from bsbolt.Simulate.SimulateMethylatedReads import SimulateMethylatedReads
from bsbolt.Utils.UtilityFunctions import reverse_complement, get_external_paths
from tests.TestHelpers import test_directory

# the third external path is not needed by this module
bwa_path, wgsim_path, _ = get_external_paths()


# hold read simulation data to test functions
class TestSimOut:
    """Stand-in output object whose write() hands the read back to the caller."""

    def __init__(self):
        pass

    @staticmethod
    def write(read):
        # return the read unchanged instead of writing it anywhere
        return read


# output prefix and reference genome, both under the shared test directory
sim_out = f'{test_directory}/TestSimulations/wgbs_pe'
test_genome = f'{test_directory}/TestData/BSB_test.fa'
# set test reference and reads with repeated sequence
# a 10-base motif repeated 40 times
test_reference = {'chr10': 'ATCGCATTAA' * 40}
test_read = { 1: { 'chrom': 'chr10', 'start': 0, 'end':
import datetime
import os
import time
from bsbolt.Align.AlignReads import BisulfiteAlignmentAndProcessing
from bsbolt.CallMethylation.ProcessMethylationContigs import ProcessContigs
from bsbolt.Impute.kNN_Impute import ImputeMissingValues
from bsbolt.Index.RRBSIndex import RRBSBuild
from bsbolt.Index.WholeGenomeIndex import WholeGenomeBuild
from bsbolt.Matrix.MatrixAggregator import AggregateMatrix
from bsbolt.Simulate import SimulateMethylatedReads
from bsbolt.Utils.UtilityFunctions import index_bam, get_external_paths, sort_bam

# external tool paths are resolved once, when this module is imported
bwa_path, wgsim_path, stream_bam = get_external_paths()


def launch_index(arguments):
    """Build an alignment index from an ``arguments`` object.

    When ``arguments.rrbs`` is truthy, a status line is printed and an
    RRBSBuild is created from ``arguments.G``, ``arguments.DB``,
    ``arguments.rrbs_cut_format``, ``arguments.rrbs_lower``,
    ``arguments.rrbs_upper``, ``arguments.B`` and ``arguments.IA``; its
    ``generate_rrbs_database()`` method is then called. Otherwise a WGBS
    status line is printed.

    NOTE(review): only the status message of the WGBS branch is visible
    here and WholeGenomeBuild is imported but unused in this excerpt -
    confirm whether the branch continues beyond this point.
    """
    if arguments.rrbs:
        print(
            f'Generating RRBS Database at {arguments.DB}: '
            f'lower bound {arguments.rrbs_lower}, upper bound {arguments.rrbs_upper}: '
            f'Cut Format {arguments.rrbs_cut_format}')
        index = RRBSBuild(reference_file=arguments.G,
                          genome_database=arguments.DB,
                          cut_format=arguments.rrbs_cut_format,
                          lower_bound=arguments.rrbs_lower,
                          upper_bound=arguments.rrbs_upper,
                          block_size=arguments.B,
                          ignore_alt=arguments.IA)
        index.generate_rrbs_database()
    else:
        print(f'Generating WGBS Database at {arguments.DB}')