def __init__(self,
              reference_file: str = None,
              sim_output: str = None,
              sequencing_error: float = 0.005,
              mutation_rate: float = 0.0010,
              mutation_indel_fraction: float = 0.15,
              indel_extension_probability: float = 0.15,
              random_seed: int = -1,
              paired_end: bool = False,
              read_length: int = 100,
              read_depth: int = 20,
              undirectional: bool = False,
              methylation_reference: str = None,
              cgmap: str = None,
              ambiguous_base_cutoff: float = 0.05,
              haplotype_mode: bool = False,
              pe_fragment_size: int = 400,
              insert_deviation: int = 25,
              mean_insert_size: int = 100,
              collect_ch_sites: bool = True,
              collect_sim_stats: bool = False,
              verbose: bool = True,
              overwrite_db: bool = False):
     """ Assemble the wgsim command line and set up the simulation state.

     Command line options (each value is passed through ``str``):
         read_length -> ``-1`` and ``-2``
         sequencing_error -> ``-e``
         pe_fragment_size -> ``-d``
         insert_deviation -> ``-s``
         mutation_rate -> ``-r``
         mutation_indel_fraction -> ``-R``
         indel_extension_probability -> ``-X``
         random_seed -> ``-S``
         ambiguous_base_cutoff -> ``-A``
         mean_insert_size -> ``-I``
         haplotype_mode -> appends ``-h`` when truthy

     Forwarded to ``SetCytosineMethylation``: reference_file, sim_output
     (as ``sim_dir``), methylation_reference, cgmap, collect_ch_sites and
     overwrite_db.

     Stored on the instance: sim_output, paired_end, undirectional,
     collect_sim_stats, ``(read_length, read_depth)`` as ``read_coverage``,
     and the negation of verbose as ``tqdm_disabe``.
     """
     _, wgsim_path, _ = get_external_paths()
     # flag / value pairs in the order they appear on the command line
     wgsim_options = (
         ('-1', read_length),
         ('-2', read_length),
         ('-e', sequencing_error),
         ('-d', pe_fragment_size),
         ('-s', insert_deviation),
         ('-r', mutation_rate),
         ('-R', mutation_indel_fraction),
         ('-X', indel_extension_probability),
         ('-S', random_seed),
         ('-A', ambiguous_base_cutoff),
         ('-I', mean_insert_size),
     )
     command = [wgsim_path]
     for flag, value in wgsim_options:
         command += [flag, str(value)]
     if haplotype_mode:
         command.append('-h')
     self.sim_command = command

     database_options = dict(reference_file=reference_file,
                             sim_dir=sim_output,
                             methylation_reference=methylation_reference,
                             cgmap=cgmap,
                             collect_ch_sites=collect_ch_sites,
                             overwrite_db=overwrite_db)
     self.sim_db = SetCytosineMethylation(**database_options)

     # sim_output and paired_end are assigned before output_objects, as in
     # the original ordering, in case get_output_objects depends on them
     self.sim_output = sim_output
     self.paired_end = paired_end
     self.output_objects = self.get_output_objects

     self.undirectional = undirectional
     self.read_coverage = (read_length, read_depth)
     self.reference = self.sim_db.reference
     self.collect_sim_stats = collect_sim_stats
     # attribute name kept as spelled; other code may read it
     self.tqdm_disabe = not verbose

     # per-contig working state, empty until a contig is processed
     self.current_contig = None
     self.contig_profile = None
     self.contig_values = {}
     self.variant_data = {}
示例#2
0
 def __init__(self, genome_database: str = None, block_size: int = None):
     """ Prepare the index output location and open the reference fasta.

     Keyword Arguments:
         genome_database (str): location handed to
             ``generate_genome_directory``; the returned value is used
             as the prefix of every output path
         block_size (int): stored unchanged on the instance
     """
     bwa_path, _, _ = get_external_paths()
     # the formatted path is concatenated directly with file names below
     formatted_database = self.generate_genome_directory(genome_database)
     self.genome_database = formatted_database
     self.block_size = block_size
     self.bwa_path = bwa_path
     # handle is opened in write mode and left open; it is not closed here
     reference_output_path = f'{self.genome_database}BSB_ref.fa'
     self.database_output = open(reference_output_path, 'w')
示例#3
0
 def __init__(self,
              reference_file: str = None,
              genome_database: str = None,
              lower_bound: int = 30,
              upper_bound: int = 500,
              cut_format: str = 'C-CGG',
              block_size: int = None,
              ignore_alt: bool = False):
     """ Set up the helpers and settings for building an RRBS index.

     Keyword Arguments:
         reference_file (str): passed to ``OpenFasta`` as ``fasta``
         genome_database (str): passed to ``IndexOutput``
         lower_bound (int): stored unchanged on the instance
         upper_bound (int): stored unchanged on the instance
         cut_format (str): passed to ``ProcessCutSites``
         block_size (int): passed to ``IndexOutput``
         ignore_alt (bool): stored unchanged on the instance
     """
     # NOTE(review): the returned paths are not used in this constructor;
     # the call is retained because its side effects are not visible here
     _bwa_path, _, _ = get_external_paths()

     # plain settings and empty accumulators
     self.lower_bound, self.upper_bound = lower_bound, upper_bound
     self.ignore_alt = ignore_alt
     self.mappable_regions = []
     self.contig_size_dict = {}

     # helper objects, constructed in the original order
     self.reference_file = OpenFasta(fasta=reference_file)
     index_options = dict(genome_database=genome_database,
                          block_size=block_size)
     self.index_output = IndexOutput(**index_options)
     self.cut_sites = ProcessCutSites(cut_format=cut_format)
示例#4
0
 def align_reads(self):
     """ Launch bwa alignment. Pipe output to BAM file

     Runs ``self.alignment_commands`` and streams its stdout into the
     external BAM writer, which writes ``{self.output}.bam`` using
     ``self.output_threads`` threads. Every stderr line of the aligner
     is handled exactly once: lines starting with ``'BSStat '`` are
     parsed as ``category: count`` and added to
     ``self.mapping_statistics``, every other non-empty line is printed.

     Stderr is consumed until end of file, so statistics emitted just
     before the aligner exits are counted, and both child processes are
     waited on before returning.

     Raises:
         AssertionError: if ``self.output`` contains a ``/`` and the
             part before the last ``/`` does not exist
     """
     _, _, stream_bam = get_external_paths()
     if '/' in self.output:
         # everything before the final '/' must already exist
         output_directory = self.output.rpartition('/')[0]
         assert os.path.exists(
             output_directory), f"output path {self.output} not valid"
     alignment_run = subprocess.Popen(self.alignment_commands,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      universal_newlines=True)
     bam_compression = subprocess.Popen([
         stream_bam, '-@',
         str(self.output_threads), '-o', f'{self.output}.bam'
     ],
                                        stdin=alignment_run.stdout)
     # the compressor now owns the read end of the pipe; dropping the
     # parent's copy lets the aligner receive SIGPIPE if the compressor
     # exits early instead of blocking on a full pipe
     alignment_run.stdout.close()
     # watch alignment progress, output stderr and collect alignment stats
     for stderr_line in iter(alignment_run.stderr.readline, ''):
         alignment_info = stderr_line.strip()
         if not alignment_info:
             continue
         if alignment_info.startswith('BSStat '):
             category, count = alignment_info.replace('BSStat ',
                                                      '').split(': ')
             self.mapping_statistics[category] += int(count)
         else:
             print(alignment_info)
     alignment_run.stderr.close()
     # reap both children so no zombie processes are left behind
     alignment_run.wait()
     bam_compression.wait()
import copy
import unittest
from bsbolt.Simulate.SimulateMethylatedReads import SimulateMethylatedReads
from bsbolt.Utils.UtilityFunctions import reverse_complement, get_external_paths
from tests.TestHelpers import test_directory

# unpack the first two values returned by get_external_paths(); the third is discarded
bwa_path, wgsim_path, _ = get_external_paths()
# helper object that holds read simulation data for the test functions


class TestSimOut:
    """Minimal object exposing a ``write`` that returns its argument.

    Presumably stands in for a simulation output handle in the tests, so
    that a "written" read can be inspected directly -- confirm against
    the test cases that use it.
    """

    def __init__(self):
        """Create an instance; there is no state to initialise."""

    @staticmethod
    def write(read):
        """Return *read* unchanged; nothing is written anywhere."""
        return read


# paths built from test_directory for the simulation tests
sim_out = f'{test_directory}/TestSimulations/wgbs_pe'
test_genome = f'{test_directory}/TestData/BSB_test.fa'

# set test reference and reads with repeated sequence
# chr10 is a 10 base motif repeated 40 times (400 bases)
test_reference = {'chr10': 'ATCGCATTAA' * 40}
test_read = {
    1: {
        'chrom':
        'chr10',
        'start':
        0,
        'end':
示例#6
0
import datetime
import os
import time
from bsbolt.Align.AlignReads import BisulfiteAlignmentAndProcessing
from bsbolt.CallMethylation.ProcessMethylationContigs import ProcessContigs
from bsbolt.Impute.kNN_Impute import ImputeMissingValues
from bsbolt.Index.RRBSIndex import RRBSBuild
from bsbolt.Index.WholeGenomeIndex import WholeGenomeBuild
from bsbolt.Matrix.MatrixAggregator import AggregateMatrix
from bsbolt.Simulate import SimulateMethylatedReads
from bsbolt.Utils.UtilityFunctions import index_bam, get_external_paths, sort_bam

# unpack the three values returned by get_external_paths() once at import time
bwa_path, wgsim_path, stream_bam = get_external_paths()


def launch_index(arguments):
    if arguments.rrbs:
        print(
            f'Generating RRBS Database at {arguments.DB}: '
            f'lower bound {arguments.rrbs_lower}, upper bound {arguments.rrbs_upper}: '
            f'Cut Format {arguments.rrbs_cut_format}')
        index = RRBSBuild(reference_file=arguments.G,
                          genome_database=arguments.DB,
                          cut_format=arguments.rrbs_cut_format,
                          lower_bound=arguments.rrbs_lower,
                          upper_bound=arguments.rrbs_upper,
                          block_size=arguments.B,
                          ignore_alt=arguments.IA)
        index.generate_rrbs_database()
    else:
        print(f'Generating WGBS Database at {arguments.DB}')