def pysam_install():
    '''
    Build pysam in the current directory and install it one level up.

    Runs ``setup.py build`` with the current interpreter, copies every
    compiled ``.so`` file from the build tree into the local ``pysam``
    directory, then moves that directory to ``../pysam``, replacing any
    directory already there.

    Must be called with the pysam source tree as the working directory.

    Raises
    ------
    subprocess.CalledProcessError
        If ``setup.py build`` exits with a non-zero status.
    OSError
        If the expected build directory or ``pysam`` directory is missing.
    '''
    # check_call (not call): a failed build must stop here, otherwise stale
    # extension modules left in build/ by an earlier run would be installed.
    _subprocess.check_call([_sys.executable, 'setup.py', 'build'])

    # NOTE: platform and Python version are hard-coded in the build path.
    build_dir = 'build/lib.linux-x86_64-2.7/pysam'
    for f in _os.listdir(build_dir):
        if f.endswith('.so'):
            #print(f)
            _shutil.copy('{}/{}'.format(build_dir, f), 'pysam/{}'.format(f))

    # Best effort: there may be no previous installation to remove.
    try:
        _shutil.rmtree('../pysam')
    except OSError:
        pass

    _os.rename('pysam', '../pysam')
def generateReads(self, path_to_exe=False, paths_to_genomes=False,
                  readcov=60, readlen=100, fraglen=350, sterrfraglen=20,
                  model=4, max_cpus=-1):
    '''
    Call GemSIM to generate reads

    Need to have written genome sequences to generate from, possibly with
    generated SNPs, small indels and large deletions.

    One GemSIM process is started per input genome, equivalent to putting
    GemSIM v1.6 into subfolder GemSIM_v1.6 and issuing e.g.:

        GemSIM_v1.6/GemReads.py -r LESB58_for_GemSim_01.fasta -n 1980527
            -l d -u 350 -s 20 -m GemSIM_v1.6/models/ill100v4_p.gzip
            -c -q 33 -p -o GemSimLESB58_01

    Raw output is written to the working directory, then moved to
    ``simulated_reads/<genome id>/`` with ``_fir``/``_sec`` renamed to
    ``_R1``/``_R2`` and compressed with gzip. Genomes whose two raw output
    files already exist are not regenerated.

    Parameters
    ----------
    path_to_exe : str or False
        Path to the GemSIM read generator. If false,
        ``_get_exe_path('gemsim')`` is used. Error models are expected in a
        ``models`` folder alongside the executable.
    paths_to_genomes : iterable of str or False
        FASTA files to simulate reads from. If false,
        ``self.written_genomes`` is used.
    readcov : number
        Coverage depth used to compute the number of read pairs.
    readlen : int
        Read length used to compute the number of read pairs (the reference
        genome ``self.genome.sequence`` provides the genome length).
    fraglen, sterrfraglen : int
        Passed to GemSIM as ``-u`` and ``-s`` respectively.
    model : int
        Error model: 4 (ill100v4_p.gzip) or 5 (ill100v5_p.gzip).
    max_cpus : int
        Passed to ``_decide_max_processes`` to limit concurrent processes.

    Raises
    ------
    ValueError
        If no genomes are available or ``model`` is not 4 or 5.
    AssertionError
        If any expected GemSIM output file is missing afterwards.
    '''
    import time

    if paths_to_genomes:
        use_genomes = sorted(paths_to_genomes)
    elif hasattr(self, 'written_genomes'):
        use_genomes = sorted(self.written_genomes)
    else:
        raise ValueError('provide either paths_to_genomes or generate some then .writeSequences()')

    # Reject unknown models up front rather than failing later on an
    # unbound path_to_model.
    model_files = {4: 'ill100v4_p.gzip', 5: 'ill100v5_p.gzip'}
    if model not in model_files:
        raise ValueError('model must be one of {}, got {!r}'.format(
            sorted(model_files), model))

    if not path_to_exe:
        path_to_exe = _get_exe_path('gemsim')

    path_to_model = _os.path.join(_os.path.dirname(path_to_exe),
                                  'models', model_files[model])

    # Always a whole number, whichever division semantics are in force.
    genome_len = len(self.genome.sequence)
    num_pairs = int(genome_len * readcov // (readlen * 2))

    print('Using error model: {}'.format(path_to_model))
    print('Generating {:,} {}bp read pairs for {}x coverage depth of a {}bp genome ({})'.format(
        num_pairs, readlen, readcov, genome_len, self.genome.id))

    processes = set()
    max_processes = _decide_max_processes(max_cpus)
    start = time.time()
    # (raw GemSIM output name, final file name) pairs
    outputs = []
    for i, genome_in in enumerate(use_genomes):
        # could use per genome length . . less consistent than using reference
        outprefix = 'GemSim_{}_{:02d}'.format(self.genome.id, i + 1)
        first = outprefix + '_fir.fastq'
        second = outprefix + '_sec.fastq'
        outputs += [(first, outprefix + '_R1.fastq'),
                    (second, outprefix + '_R2.fastq')]

        # this would be better to rename and compress all in one
        # maybe as a shell script? Then resuming (--force) would be easier.
        if _os.path.exists(first) and _os.path.exists(second):
            print('Found output for {}_fir.fastq (and sec), not regenerating, '
                  'delete these to start from scratch'.format(outprefix))
            continue

        # A real list (not a lazy map) so it can be both printed and executed.
        cmd = [str(arg) for arg in [
            path_to_exe,
            '-r', genome_in,
            '-n', num_pairs,
            '-l', 'd',
            '-u', fraglen,
            '-s', sterrfraglen,
            '-m', path_to_model,
            '-c',
            '-q', 33,
            '-p',
            '-o', outprefix]]
        print(' '.join(cmd))
        processes.add(_subprocess.Popen(cmd, shell=False))
        if len(processes) >= max_processes:
            # Block until any child exits, then drop all finished ones.
            _os.wait()
            processes.difference_update(
                [p for p in processes if p.poll() is not None])

    # Check if all the child processes were closed
    for p in processes:
        if p.poll() is None:
            p.wait()

    missing = [raw for raw, _final in outputs if not _os.path.exists(raw)]
    if missing:
        # Raised explicitly (same exception type as before) so the check is
        # not stripped when Python runs with -O.
        raise AssertionError('Could not find:\n{}'.format('\n'.join(missing)))

    print('all finished after {} minutes'.format(int(round((time.time() - start)/60.0))))

    outdir = _os.path.join('simulated_reads', self.genome.id)
    try:
        _os.makedirs(outdir)
    except OSError:
        # Fine if it already exists; anything else is a real error.
        if not _os.path.isdir(outdir):
            raise

    for raw, final in outputs:
        # Only the suffix is renamed, so a genome id containing 'fir' or
        # 'sec' is left intact.
        new = _os.path.join(outdir, final)
        print('{} ==> {}'.format(raw, new))
        _os.rename(raw, new)
        cmd = ['gzip', new]
        print(' '.join(cmd))
        _subprocess.call(cmd)
def generateReads(self, path_to_exe=False, paths_to_genomes=False,
                  readcov=60, readlen=100, fraglen=350, sterrfraglen=20,
                  model=4, max_cpus=-1):
    '''
    Call GemSIM to generate reads

    Need to have written genome sequences to generate from, possibly with
    generated SNPs, small indels and large deletions.

    One GemSIM process is started per input genome, equivalent to putting
    GemSIM v1.6 into subfolder GemSIM_v1.6 and issuing e.g.:

        GemSIM_v1.6/GemReads.py -r LESB58_for_GemSim_01.fasta -n 1980527
            -l d -u 350 -s 20 -m GemSIM_v1.6/models/ill100v4_p.gzip
            -c -q 33 -p -o GemSimLESB58_01

    Raw output is written to the working directory, then moved to
    ``simulated_reads/<genome id>/`` with ``_fir``/``_sec`` renamed to
    ``_R1``/``_R2`` and compressed with gzip. Genomes whose two raw output
    files already exist are not regenerated.

    Parameters
    ----------
    path_to_exe : str or False
        Path to the GemSIM read generator. If false,
        ``_get_exe_path('gemsim')`` is used. Error models are expected in a
        ``models`` folder alongside the executable.
    paths_to_genomes : iterable of str or False
        FASTA files to simulate reads from. If false,
        ``self.written_genomes`` is used.
    readcov : number
        Coverage depth used to compute the number of read pairs.
    readlen : int
        Read length used to compute the number of read pairs (the reference
        genome ``self.genome.sequence`` provides the genome length).
    fraglen, sterrfraglen : int
        Passed to GemSIM as ``-u`` and ``-s`` respectively.
    model : int
        Error model: 4 (ill100v4_p.gzip) or 5 (ill100v5_p.gzip).
    max_cpus : int
        Passed to ``_decide_max_processes`` to limit concurrent processes.

    Raises
    ------
    ValueError
        If no genomes are available or ``model`` is not 4 or 5.
    AssertionError
        If any expected GemSIM output file is missing afterwards.
    '''
    import time

    if paths_to_genomes:
        use_genomes = sorted(paths_to_genomes)
    elif hasattr(self, 'written_genomes'):
        use_genomes = sorted(self.written_genomes)
    else:
        raise ValueError(
            'provide either paths_to_genomes or generate some then .writeSequences()'
        )

    # Reject unknown models up front rather than failing later on an
    # unbound path_to_model.
    model_files = {4: 'ill100v4_p.gzip', 5: 'ill100v5_p.gzip'}
    if model not in model_files:
        raise ValueError('model must be one of {}, got {!r}'.format(
            sorted(model_files), model))

    if not path_to_exe:
        path_to_exe = _get_exe_path('gemsim')

    path_to_model = _os.path.join(
        _os.path.dirname(path_to_exe), 'models', model_files[model])

    # Always a whole number, whichever division semantics are in force.
    genome_len = len(self.genome.sequence)
    num_pairs = int(genome_len * readcov // (readlen * 2))

    print('Using error model: {}'.format(path_to_model))
    print(
        'Generating {:,} {}bp read pairs for {}x coverage depth of a {}bp genome ({})'
        .format(num_pairs, readlen, readcov, genome_len, self.genome.id))

    processes = set()
    max_processes = _decide_max_processes(max_cpus)
    start = time.time()
    # (raw GemSIM output name, final file name) pairs
    outputs = []
    for i, genome_in in enumerate(use_genomes):
        # could use per genome length . . less consistent than using reference
        outprefix = 'GemSim_{}_{:02d}'.format(self.genome.id, i + 1)
        first = outprefix + '_fir.fastq'
        second = outprefix + '_sec.fastq'
        outputs += [
            (first, outprefix + '_R1.fastq'),
            (second, outprefix + '_R2.fastq'),
        ]

        # this would be better to rename and compress all in one
        # maybe as a shell script? Then resuming (--force) would be easier.
        if _os.path.exists(first) and _os.path.exists(second):
            print('Found output for {}_fir.fastq (and sec), not regenerating, '
                  'delete these to start from scratch'.format(outprefix))
            continue

        # A real list (not a lazy map) so it can be both printed and executed.
        cmd = [
            str(arg) for arg in [
                path_to_exe, '-r', genome_in, '-n', num_pairs, '-l', 'd',
                '-u', fraglen, '-s', sterrfraglen, '-m', path_to_model,
                '-c', '-q', 33, '-p', '-o', outprefix
            ]
        ]
        print(' '.join(cmd))
        processes.add(_subprocess.Popen(cmd, shell=False))
        if len(processes) >= max_processes:
            # Block until any child exits, then drop all finished ones.
            _os.wait()
            processes.difference_update(
                [p for p in processes if p.poll() is not None])

    # Check if all the child processes were closed
    for p in processes:
        if p.poll() is None:
            p.wait()

    missing = [raw for raw, _final in outputs if not _os.path.exists(raw)]
    if missing:
        # Raised explicitly (same exception type as before) so the check is
        # not stripped when Python runs with -O.
        raise AssertionError('Could not find:\n{}'.format(
            '\n'.join(missing)))

    print('all finished after {} minutes'.format(
        int(round((time.time() - start) / 60.0))))

    outdir = _os.path.join('simulated_reads', self.genome.id)
    try:
        _os.makedirs(outdir)
    except OSError:
        # Fine if it already exists; anything else is a real error.
        if not _os.path.isdir(outdir):
            raise

    for raw, final in outputs:
        # Only the suffix is renamed, so a genome id containing 'fir' or
        # 'sec' is left intact.
        new = _os.path.join(outdir, final)
        print('{} ==> {}'.format(raw, new))
        _os.rename(raw, new)
        cmd = ['gzip', new]
        print(' '.join(cmd))
        _subprocess.call(cmd)