def simulate_instance(args):
    """Introduce gap-size errors into ``sequence_0`` and map reads to the result.

    The sequence with accession ``sequence_0`` is cut into 20000-character
    chunks.  For each of (at most) the first 100 chunks, the last
    ``gap + max(0, -error)`` characters are removed and
    ``max(0, gap + error)`` ``N`` characters are appended; one GFF record per
    modified chunk is written through ``to_GFF``.  Chunks beyond the first
    100 are not part of the modified sequence.  Every other sequence is
    copied unchanged.  All sequences are written to ``<output_path>/ctgs.fa``
    and ``align.bwa_mem`` is called on that file.

    Args:
        args: object providing ``output_path``, ``genome``, ``read1``,
            ``read2``, ``gapsize`` and ``error``; it is also forwarded
            unchanged to ``align.bwa_mem``.
    """
    print('Started modyfiyng genome')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = args.read1
    read2_path = args.read2
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    gap = args.gapsize
    error = args.error
    chunk_size = 20000
    max_samples = 100

    # These values do not depend on the chunk, so compute them once.
    n_run = 'N' * max(0, gap + error)
    cut_size = gap + max(0, -error)
    if error < 0:
        note = 'Note=Error:Contraction {0}bp'.format(abs(error))
        # NOTE(review): ``error`` is negative here, so the name reads
        # "..._minus-<n>".  Kept as is because other tools may rely on it.
        scaffold_name = 'scf_gap{0}_errorsize_minus{1}'.format(gap, error)
    else:
        note = 'Note=Error:Expansion {0}bp'.format(abs(error))
        scaffold_name = 'scf_gap{0}_errorsize{1}'.format(gap, error)

    modified_genome = {}
    with open(gff_path, 'w') as gff_file:
        with open(args.genome, 'r') as genome_file:
            # The meaning of the second argument (10) is defined by
            # ReadInContigseqs and is not visible from here.
            genome_seqs = ReadInContigseqs(genome_file, 10)
            for acc, seq in genome_seqs.items():
                if acc != 'sequence_0':
                    modified_genome[acc] = seq
                    continue

                pos = 0  # running length of the modified sequence
                modified_chunks = []
                # Slicing the start offsets avoids an IndexError when the
                # sequence has fewer than ``max_samples`` chunks.
                for start in range(0, len(seq), chunk_size)[:max_samples]:
                    chunk = seq[start:start + chunk_size]
                    # Clamp at 0 so a chunk shorter than the cut is emptied
                    # rather than sliced with a negative end index.
                    keep = max(0, len(chunk) - cut_size)
                    modified_chunk = chunk[:keep] + n_run
                    modified_chunks.append(modified_chunk)
                    pos += len(modified_chunk)

                    if gap + error > 0:
                        # The error lies anywhere in the introduced gap.
                        error_start = pos - len(n_run)
                        error_stop = pos
                    else:
                        # No gap left: the error is at the junction itself.
                        error_start = pos
                        error_stop = pos + 1

                    # NOTE(review): records use the original accession while
                    # the FASTA record below is renamed to ``scaffold_name``.
                    to_GFF(gff_file, '{0}'.format(acc), 'TRUTH', 'FCD',
                           error_start, error_stop, 1, '+', '.', note)

                modified_genome[scaffold_name] = ''.join(modified_chunks)

    # Write all sequences; the file is closed before the mapper reads it.
    with open(contig_path, 'w') as ctgs:
        for acc, seq in modified_genome.items():
            ctgs.write('>{0}\n{1}\n'.format(acc, seq))

    print('Started mapping')
    align.bwa_mem(read1_path, read2_path, contig_path, bam_path, args)
def simulate_instance(args):
    """Introduce gap-size errors into ``sequence_0`` and map reads to the result.

    The sequence with accession ``sequence_0`` is cut into 20000-character
    chunks.  For each of (at most) the first 100 chunks, the last
    ``gap + max(0, -error)`` characters are removed and
    ``max(0, gap + error)`` ``N`` characters are appended; one GFF record per
    modified chunk is written through ``to_GFF``.  Chunks beyond the first
    100 are not part of the modified sequence.  Every other sequence is
    copied unchanged.  All sequences are written to ``<output_path>/ctgs.fa``
    and ``align.bwa_mem`` is called on that file.

    Args:
        args: object providing ``output_path``, ``genome``, ``read1``,
            ``read2``, ``gapsize`` and ``error``; it is also forwarded
            unchanged to ``align.bwa_mem``.
    """
    print('Started modyfiyng genome')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = args.read1
    read2_path = args.read2
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    gap = args.gapsize
    error = args.error
    chunk_size = 20000
    max_samples = 100

    # These values do not depend on the chunk, so compute them once.
    n_run = 'N' * max(0, gap + error)
    cut_size = gap + max(0, -error)
    if error < 0:
        note = 'Note=Error:Contraction {0}bp'.format(abs(error))
        # NOTE(review): ``error`` is negative here, so the name reads
        # "..._minus-<n>".  Kept as is because other tools may rely on it.
        scaffold_name = 'scf_gap{0}_errorsize_minus{1}'.format(gap, error)
    else:
        note = 'Note=Error:Expansion {0}bp'.format(abs(error))
        scaffold_name = 'scf_gap{0}_errorsize{1}'.format(gap, error)

    modified_genome = {}
    with open(gff_path, 'w') as gff_file:
        with open(args.genome, 'r') as genome_file:
            # The meaning of the second argument (10) is defined by
            # ReadInContigseqs and is not visible from here.
            genome_seqs = ReadInContigseqs(genome_file, 10)
            for acc, seq in genome_seqs.items():
                if acc != 'sequence_0':
                    modified_genome[acc] = seq
                    continue

                pos = 0  # running length of the modified sequence
                modified_chunks = []
                # Slicing the start offsets avoids an IndexError when the
                # sequence has fewer than ``max_samples`` chunks.
                for start in range(0, len(seq), chunk_size)[:max_samples]:
                    chunk = seq[start:start + chunk_size]
                    # Clamp at 0 so a chunk shorter than the cut is emptied
                    # rather than sliced with a negative end index.
                    keep = max(0, len(chunk) - cut_size)
                    modified_chunk = chunk[:keep] + n_run
                    modified_chunks.append(modified_chunk)
                    pos += len(modified_chunk)

                    if gap + error > 0:
                        # The error lies anywhere in the introduced gap.
                        error_start = pos - len(n_run)
                        error_stop = pos
                    else:
                        # No gap left: the error is at the junction itself.
                        error_start = pos
                        error_stop = pos + 1

                    # NOTE(review): records use the original accession while
                    # the FASTA record below is renamed to ``scaffold_name``.
                    to_GFF(gff_file, '{0}'.format(acc), 'TRUTH', 'FCD',
                           error_start, error_stop, 1, '+', '.', note)

                modified_genome[scaffold_name] = ''.join(modified_chunks)

    # Write all sequences; the file is closed before the mapper reads it.
    with open(contig_path, 'w') as ctgs:
        for acc, seq in modified_genome.items():
            ctgs.write('>{0}\n{1}\n'.format(acc, seq))

    print('Started mapping')
    align.bwa_mem(read1_path, read2_path, contig_path, bam_path, args)
def _write_error_scaffolds(g, args, scafs, gff_file):
    """Write the burn-in scaffold plus one gapped scaffold per error size.

    For every value in ``args.errorsize`` a scaffold is assembled from
    ``args.nrgaps + 1`` slices of ``g.sequence`` taken every
    ``contiglen + gaplen`` positions.  When ``gaplen + error`` is positive the
    slices are ``contiglen`` long and joined by ``gaplen + error`` ``N``
    characters; otherwise each slice is ``contiglen + gaplen + error`` long
    and nothing is inserted.  A GFF record is emitted through ``to_GFF`` for
    each of the first ``nrgaps`` junctions when ``error`` is non-zero.
    """
    step = args.contiglen + args.gaplen
    scafs.write('>scf_burnin{0}\n{1}\n'.format(args.gaplen,
                                                g.sequence[0:args.burnin]))
    pos = args.burnin
    for error in args.errorsize:
        gap_len = args.gaplen + error
        if error < 0:
            name = 'scf_gap{0}_errorsize_minus{1}'.format(args.gaplen,
                                                          abs(error))
            note = 'Note=Error:Contraction {0}bp'.format(abs(error))
        else:
            name = 'scf_gap{0}_errorsize{1}'.format(args.gaplen, error)
            note = 'Note=Error:Expansion {0}bp'.format(abs(error))

        pieces = []       # joined once at the end instead of repeated +=
        scaffold_len = 0  # running length of the scaffold built so far
        for i, x in enumerate(range(pos, pos + (args.nrgaps + 1) * step,
                                    step)):
            has_gap = i < args.nrgaps  # the last slice has no trailing gap
            if gap_len > 0:
                piece = g.sequence[x:x + args.contiglen]
                if has_gap:
                    piece += 'N' * gap_len
                scaffold_len += len(piece)
                # The error lies anywhere in the introduced gap.
                error_start = scaffold_len - gap_len
                error_stop = scaffold_len
            else:
                piece = g.sequence[x:x + args.contiglen + gap_len]
                scaffold_len += len(piece)
                # No gap left: the error is at the junction itself.
                error_start = scaffold_len
                error_stop = scaffold_len + 1
            pieces.append(piece)

            if has_gap and error != 0:
                to_GFF(gff_file, name, 'TRUTH', 'FCD', error_start,
                       error_stop, 1, '+', '.', note)

        scafs.write('>{0}\n{1}\n'.format(name, ''.join(pieces)))
        pos = x + 2 * args.contiglen


def simulate_instance(args):
    """Generate a genome, contigs or scaffolds, and reads, then map the reads.

    Files written below ``args.output_path``: ``genome.fa``, ``ctgs.fa``,
    ``reads1.fa``, ``reads2.fa`` and ``true_error_pos.gff`` (left empty when
    ``args.scaffolds`` is false).  Every file is closed before
    ``align.bwa_mem`` is called so the mapper sees complete files.

    Args:
        args: object providing ``output_path``, ``burnin``, ``contiglen``,
            ``gaplen``, ``nrgaps``, ``errorsize``, ``scaffolds``, ``distr``,
            ``read_length``, ``coverage`` and, depending on ``distr``,
            ``mean``/``sd`` or ``min_size``/``max_size``.  It is also
            forwarded unchanged to ``align.bwa_mem``.

    Raises:
        ValueError: if ``args.distr`` is neither ``'normal'`` nor
            ``'uniform'``.
    """
    print('Started simulating')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    genome_path = os.path.join(args.output_path, 'genome.fa')
    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = os.path.join(args.output_path, 'reads1.fa')
    read2_path = os.path.join(args.output_path, 'reads2.fa')
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    # genome
    genomelen = args.burnin + (
        (args.contiglen + args.gaplen) * (args.nrgaps + 1) + args.contiglen
    ) * (len(args.errorsize) + 1)
    print(genomelen)
    # [0.25] * 4 is presumably one weight per nucleotide -- TODO confirm
    # against genome.Genome.
    g = genome.Genome([0.25] * 4, genomelen, 'genome1')
    g.genome()
    with open(genome_path, 'w') as genome_file:
        genome_file.write(str(g.genome_fasta_format()) + '\n')

    # contigs/scaffolds
    with open(gff_path, 'w') as gff_file:
        with open(contig_path, 'w') as contig_file:
            if args.scaffolds:
                _write_error_scaffolds(g, args, contig_file, gff_file)
            else:
                contig_file.write(
                    '>ctg0\n{0}\n'.format(g.sequence[0:args.burnin]))
                starts = range(args.burnin, genomelen,
                               args.contiglen + args.gaplen)
                for i, x in enumerate(starts):
                    contig_file.write('>ctg{0}\n{1}\n'.format(
                        i + 1, g.sequence[x:x + args.contiglen]))

    # reads
    if args.distr == 'normal':
        lib = reads.DNAseq(args.read_length, args.coverage,
                           distribution=args.distr,
                           mean=args.mean, stddev=args.sd)
    elif args.distr == 'uniform':
        lib = reads.DNAseq(args.read_length, args.coverage,
                           distribution=args.distr,
                           min_size=args.min_size, max_size=args.max_size)
    else:
        raise ValueError(
            'unsupported read distribution: {0!r}'.format(args.distr))
    lib.simulate_pe_reads(g)

    # Records alternate between the two files: even index -> reads1,
    # odd index -> reads2.
    with open(read1_path, 'w') as reads1:
        with open(read2_path, 'w') as reads2:
            for i, read in enumerate(lib.fasta_format()):
                if i % 2 == 0:
                    reads1.write(read)
                else:
                    reads2.write(read)

    print('Started mapping')
    align.bwa_mem(read1_path, read2_path, contig_path, bam_path, args)
def _write_error_scaffolds(g, args, scafs, gff_file):
    """Write the burn-in scaffold plus one gapped scaffold per error size.

    For every value in ``args.errorsize`` a scaffold is assembled from
    ``args.nrgaps + 1`` slices of ``g.sequence`` taken every
    ``contiglen + gaplen`` positions.  When ``gaplen + error`` is positive the
    slices are ``contiglen`` long and joined by ``gaplen + error`` ``N``
    characters; otherwise each slice is ``contiglen + gaplen + error`` long
    and nothing is inserted.  A GFF record is emitted through ``to_GFF`` for
    each of the first ``nrgaps`` junctions when ``error`` is non-zero.
    """
    step = args.contiglen + args.gaplen
    scafs.write('>scf_burnin{0}\n{1}\n'.format(args.gaplen,
                                                g.sequence[0:args.burnin]))
    pos = args.burnin
    for error in args.errorsize:
        gap_len = args.gaplen + error
        if error < 0:
            name = 'scf_gap{0}_errorsize_minus{1}'.format(args.gaplen,
                                                          abs(error))
            note = 'Note=Error:Contraction {0}bp'.format(abs(error))
        else:
            name = 'scf_gap{0}_errorsize{1}'.format(args.gaplen, error)
            note = 'Note=Error:Expansion {0}bp'.format(abs(error))

        pieces = []       # joined once at the end instead of repeated +=
        scaffold_len = 0  # running length of the scaffold built so far
        for i, x in enumerate(range(pos, pos + (args.nrgaps + 1) * step,
                                    step)):
            has_gap = i < args.nrgaps  # the last slice has no trailing gap
            if gap_len > 0:
                piece = g.sequence[x:x + args.contiglen]
                if has_gap:
                    piece += 'N' * gap_len
                scaffold_len += len(piece)
                # The error lies anywhere in the introduced gap.
                error_start = scaffold_len - gap_len
                error_stop = scaffold_len
            else:
                piece = g.sequence[x:x + args.contiglen + gap_len]
                scaffold_len += len(piece)
                # No gap left: the error is at the junction itself.
                error_start = scaffold_len
                error_stop = scaffold_len + 1
            pieces.append(piece)

            if has_gap and error != 0:
                to_GFF(gff_file, name, 'TRUTH', 'FCD', error_start,
                       error_stop, 1, '+', '.', note)

        scafs.write('>{0}\n{1}\n'.format(name, ''.join(pieces)))
        pos = x + 2 * args.contiglen


def simulate_instance(args):
    """Generate a genome, contigs or scaffolds, and reads, then map the reads.

    Files written below ``args.output_path``: ``genome.fa``, ``ctgs.fa``,
    ``reads1.fa``, ``reads2.fa`` and ``true_error_pos.gff`` (left empty when
    ``args.scaffolds`` is false).  Every file is closed before
    ``align.bwa_mem`` is called so the mapper sees complete files.

    Args:
        args: object providing ``output_path``, ``burnin``, ``contiglen``,
            ``gaplen``, ``nrgaps``, ``errorsize``, ``scaffolds``, ``distr``,
            ``read_length``, ``coverage`` and, depending on ``distr``,
            ``mean``/``sd`` or ``min_size``/``max_size``.  It is also
            forwarded unchanged to ``align.bwa_mem``.

    Raises:
        ValueError: if ``args.distr`` is neither ``'normal'`` nor
            ``'uniform'``.
    """
    print('Started simulating')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    genome_path = os.path.join(args.output_path, 'genome.fa')
    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = os.path.join(args.output_path, 'reads1.fa')
    read2_path = os.path.join(args.output_path, 'reads2.fa')
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    # genome
    genomelen = args.burnin + (
        (args.contiglen + args.gaplen) * (args.nrgaps + 1) + args.contiglen
    ) * (len(args.errorsize) + 1)
    print(genomelen)
    # [0.25] * 4 is presumably one weight per nucleotide -- TODO confirm
    # against genome.Genome.
    g = genome.Genome([0.25] * 4, genomelen, 'genome1')
    g.genome()
    with open(genome_path, 'w') as genome_file:
        genome_file.write(str(g.genome_fasta_format()) + '\n')

    # contigs/scaffolds
    with open(gff_path, 'w') as gff_file:
        with open(contig_path, 'w') as contig_file:
            if args.scaffolds:
                _write_error_scaffolds(g, args, contig_file, gff_file)
            else:
                contig_file.write(
                    '>ctg0\n{0}\n'.format(g.sequence[0:args.burnin]))
                starts = range(args.burnin, genomelen,
                               args.contiglen + args.gaplen)
                for i, x in enumerate(starts):
                    contig_file.write('>ctg{0}\n{1}\n'.format(
                        i + 1, g.sequence[x:x + args.contiglen]))

    # reads
    if args.distr == 'normal':
        lib = reads.DNAseq(args.read_length, args.coverage,
                           distribution=args.distr,
                           mean=args.mean, stddev=args.sd)
    elif args.distr == 'uniform':
        lib = reads.DNAseq(args.read_length, args.coverage,
                           distribution=args.distr,
                           min_size=args.min_size, max_size=args.max_size)
    else:
        raise ValueError(
            'unsupported read distribution: {0!r}'.format(args.distr))
    lib.simulate_pe_reads(g)

    # Records alternate between the two files: even index -> reads1,
    # odd index -> reads2.
    with open(read1_path, 'w') as reads1:
        with open(read2_path, 'w') as reads2:
            for i, read in enumerate(lib.fasta_format()):
                if i % 2 == 0:
                    reads1.write(read)
                else:
                    reads2.write(read)

    print('Started mapping')
    align.bwa_mem(read1_path, read2_path, contig_path, bam_path, args)
def simulate_instance(args):
    """Simulate paired reads from a genome and map them back to that genome.

    Without ``args.contigs`` a new ``genome.Genome`` is generated and written
    to ``<output_path>/genome.fa``, and shuffled contigs from
    ``contigs.generate_contigs`` are written to ``<output_path>/ctgs.fa``.
    With ``args.contigs`` the longest record of ``args.genome`` is used as
    the genome sequence instead.  Reads are written alternately to
    ``reads1.fa`` and ``reads2.fa`` and then ``align.bwa_mem`` is called with
    the genome path as reference.

    Args:
        args: object providing ``output_path``, ``contigs``, ``genome``,
            ``genomelen``, ``min_contig``, ``max_contig``, ``distr``,
            ``read_length``, ``coverage`` and, depending on ``distr``,
            ``mean``/``sd`` or ``min_size``/``max_size``.  It is also
            forwarded unchanged to ``align.bwa_mem``.

    Raises:
        ValueError: if ``args.distr`` is not ``'normal'``, ``'uniform'`` or
            ``'mix'``.
    """
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    read1_path = os.path.join(args.output_path, 'reads1.fa')
    read2_path = os.path.join(args.output_path, 'reads2.fa')
    bam_path = os.path.join(args.output_path, 'mapped')

    # [0.25] * 4 is presumably one weight per nucleotide -- TODO confirm
    # against genome.Genome.
    g = genome.Genome([0.25] * 4, args.genomelen, 'genome1')

    if not args.contigs:
        genome_path = os.path.join(args.output_path, 'genome.fa')
        contig_path = os.path.join(args.output_path, 'ctgs.fa')

        # genome
        g.genome()
        with open(genome_path, 'w') as genome_file:
            genome_file.write(str(g.genome_fasta_format()) + '\n')

        # contigs, written in random order
        ctg_list = list(contigs.generate_contigs(
            g.sequence, args.min_contig, args.max_contig, 0, 3000))
        random.shuffle(ctg_list)
        with open(contig_path, 'w') as ctgs:
            for ctg in ctg_list:
                ctgs.write(ctg)
    else:
        # Use the longest record of the supplied genome file.
        genome_path = args.genome
        longest_seq = 0
        with open(genome_path, 'r') as genome_file:
            for acc, seq in fasta.fasta_iter(genome_file):
                print('{0} {1}'.format(acc, len(seq)))
                if len(seq) > longest_seq:
                    g.sequence = seq
                    g.accession = acc
                    longest_seq = len(seq)
        print('chosen: {0}'.format(g.accession))

    # reads
    if args.distr == 'normal':
        lib = reads.DNAseq(args.read_length, args.coverage,
                           distribution=args.distr,
                           mean=args.mean, stddev=args.sd)
        lib.simulate_pe_reads(g)
    elif args.distr == 'uniform':
        lib = reads.DNAseq(args.read_length, args.coverage,
                           distribution=args.distr,
                           min_size=args.min_size, max_size=args.max_size)
        lib.simulate_pe_reads(g)
    elif args.distr == 'mix':
        # Half of the coverage from each distribution; the uniform part
        # spans mean +/- 4 standard deviations.
        lib_part1 = reads.DNAseq(args.read_length, args.coverage / 2,
                                 distribution='normal',
                                 mean=args.mean, stddev=args.sd)
        lib_part1.simulate_pe_reads(g)
        lib_part2 = reads.DNAseq(args.read_length, args.coverage / 2,
                                 distribution='uniform',
                                 min_size=(args.mean - 4 * args.sd),
                                 max_size=(args.mean + 4 * args.sd))
        lib_part2.simulate_pe_reads(g)
        # Concatenate the reads from each distribution.
        lib = reads.DNAseq(args.read_length, args.coverage,
                           distribution=args.distr,
                           mean=args.mean, stddev=args.sd)
        lib.reads = lib_part1.reads + lib_part2.reads
    else:
        raise ValueError(
            'unsupported read distribution: {0!r}'.format(args.distr))

    # Records alternate between the two files: even index -> reads1,
    # odd index -> reads2.  Both files are closed before mapping starts.
    with open(read1_path, 'w') as reads1:
        with open(read2_path, 'w') as reads2:
            for i, read in enumerate(lib.fasta_format()):
                if i % 2 == 0:
                    reads1.write(read)
                else:
                    reads2.write(read)

    # mapping (against the genome, not the contigs)
    align.bwa_mem(read1_path, read2_path, genome_path, bam_path, args)