def initialize_containers(args):
    """Create empty per-scaffold bookkeeping containers.

    Every accession yielded by ``fasta_iter`` for the file ``args.scafs``
    gets one entry in each of the two returned dictionaries.

    Args:
        args: object exposing ``scafs``, the path of the scaffold FASTA file.

    Returns:
        A ``(true_breakpoints, scaffold_tp_fp)`` tuple where
        ``true_breakpoints`` maps accession -> empty ``set`` and
        ``scaffold_tp_fp`` maps accession -> ``[0, 0]``
        (presumably [true positives, false positives] -- confirm with callers).
    """
    scaffold_tp_fp = {}
    true_breakpoints = {}
    # Use a context manager so the FASTA handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(args.scafs, 'r') as scaffold_file:
        # Only the accession is needed; the sequence itself is ignored.
        for acc, _seq in fasta_iter(scaffold_file):
            scaffold_tp_fp[acc] = [0, 0]
            true_breakpoints[acc] = set()
    return true_breakpoints, scaffold_tp_fp
def initialize_containers(args):
    """Create empty per-scaffold bookkeeping containers.

    Every accession yielded by ``fasta_iter`` for the file ``args.scafs``
    gets one entry in each of the two returned dictionaries.

    Args:
        args: object exposing ``scafs``, the path of the scaffold FASTA file.

    Returns:
        A ``(true_breakpoints, scaffold_tp_fp)`` tuple where
        ``true_breakpoints`` maps accession -> empty ``set`` and
        ``scaffold_tp_fp`` maps accession -> ``[0, 0]``
        (presumably [true positives, false positives] -- confirm with callers).
    """
    scaffold_tp_fp = {}
    true_breakpoints = {}
    # Use a context manager so the FASTA handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(args.scafs, 'r') as scaffold_file:
        # Only the accession is needed; the sequence itself is ignored.
        for acc, _seq in fasta_iter(scaffold_file):
            scaffold_tp_fp[acc] = [0, 0]
            true_breakpoints[acc] = set()
    return true_breakpoints, scaffold_tp_fp
def main(args):
    """Parse the first sequence of a genome FASTA file and plot the result.

    Creates ``args.folder_path`` if it does not exist, reads the first record
    of ``args.genome`` and hands its sequence, together with ``args.k``, to
    ``DistanceContainer.parse_genome`` before calling ``plot``.

    Args:
        args: object exposing ``folder_path``, ``genome`` (FASTA path) and
            ``k``.

    Raises:
        ValueError: if the genome file yields no records.
    """
    if not os.path.exists(args.folder_path):
        os.mkdir(args.folder_path)

    # Only the first record is used.  Read it inside the ``with`` block so the
    # handle is still open if ``fasta_iter`` is lazy, and closed right after.
    with open(args.genome, 'r') as genome_file:
        first_record = next(iter(fasta.fasta_iter(genome_file)), None)
    if first_record is None:
        # Previously this surfaced later as a NameError on ``seq``.
        raise ValueError('no sequences found in {0}'.format(args.genome))
    _acc, seq = first_record

    dist = DistanceContainer()
    dist.parse_genome(seq, args.k)
    dist.plot()
def main(args):
    """Parse the first sequence of a genome FASTA file and plot the result.

    Creates ``args.folder_path`` if it does not exist, reads the first record
    of ``args.genome`` and hands its sequence, together with ``args.k``, to
    ``DistanceContainer.parse_genome`` before calling ``plot``.

    Args:
        args: object exposing ``folder_path``, ``genome`` (FASTA path) and
            ``k``.

    Raises:
        ValueError: if the genome file yields no records.
    """
    if not os.path.exists(args.folder_path):
        os.mkdir(args.folder_path)

    # Only the first record is used.  Read it inside the ``with`` block so the
    # handle is still open if ``fasta_iter`` is lazy, and closed right after.
    with open(args.genome, 'r') as genome_file:
        first_record = next(iter(fasta.fasta_iter(genome_file)), None)
    if first_record is None:
        # Previously this surfaced later as a NameError on ``seq``.
        raise ValueError('no sequences found in {0}'.format(args.genome))
    _acc, seq = first_record

    dist = DistanceContainer()
    dist.parse_genome(seq, args.k)
    dist.plot()
def _build_read_library(args, g):
    """Simulate the paired-end library selected by ``args.distr`` on ``g``."""
    if args.distr == 'normal':
        lib = reads.DNAseq(args.read_length, args.coverage,
                           distribution=args.distr,
                           mean=args.mean, stddev=args.sd)
        lib.simulate_pe_reads(g)
        return lib

    if args.distr == 'uniform':
        lib = reads.DNAseq(args.read_length, args.coverage,
                           distribution=args.distr,
                           min_size=args.min_size, max_size=args.max_size)
        lib.simulate_pe_reads(g)
        return lib

    if args.distr == 'mix':
        # Half of the coverage from each distribution.
        # NOTE(review): under Python 2 this is floor division when
        # ``args.coverage`` is an int, so an odd coverage loses one unit.
        lib_part1 = reads.DNAseq(args.read_length, args.coverage / 2,
                                 distribution='normal',
                                 mean=args.mean, stddev=args.sd)
        lib_part1.simulate_pe_reads(g)
        lib_part2 = reads.DNAseq(args.read_length, args.coverage / 2,
                                 distribution='uniform',
                                 min_size=(args.mean - 4 * args.sd),
                                 max_size=(args.mean + 4 * args.sd))
        lib_part2.simulate_pe_reads(g)
        # Concatenate the reads from each distribution into one library.
        lib = reads.DNAseq(args.read_length, args.coverage,
                           distribution=args.distr,
                           mean=args.mean, stddev=args.sd)
        lib.reads = lib_part1.reads + lib_part2.reads
        return lib

    # Previously an unknown value fell through and failed later with a
    # NameError on ``lib``.
    raise ValueError(
        "unsupported read distribution: {0!r} "
        "(expected 'normal', 'uniform' or 'mix')".format(args.distr))


def simulate_instance(args):
    """Simulate a genome/contig/read instance and align the reads.

    When ``args.contigs`` is falsy, a random genome of ``args.genomelen``
    bases is generated and written to ``<output_path>/genome.fa``, and
    shuffled contigs derived from it are written to ``<output_path>/ctgs.fa``.
    Otherwise the longest sequence in ``args.genome`` is used as the genome.

    Paired-end reads are then simulated according to ``args.distr``, written
    alternately to ``reads1.fa`` / ``reads2.fa`` in ``args.output_path``, and
    aligned against the genome with ``align.bwa_mem`` (output prefix
    ``<output_path>/mapped``).

    Args:
        args: object exposing ``output_path``, ``contigs``, ``genome``,
            ``genomelen``, ``min_contig``, ``max_contig``, ``distr``,
            ``read_length``, ``coverage``, ``mean``, ``sd``, ``min_size`` and
            ``max_size``; it is also forwarded unchanged to ``align.bwa_mem``.

    Raises:
        ValueError: if ``args.distr`` is not 'normal', 'uniform' or 'mix'.
    """
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    read1_path = os.path.join(args.output_path, 'reads1.fa')
    read2_path = os.path.join(args.output_path, 'reads2.fa')
    bam_path = os.path.join(args.output_path, 'mapped')

    g = genome.Genome([0.25] * 4, args.genomelen, 'genome1')
    if not args.contigs:
        genome_path = os.path.join(args.output_path, 'genome.fa')
        contig_path = os.path.join(args.output_path, 'ctgs.fa')

        # Genome: generate and persist it (trailing newline kept, as the
        # original ``print >> file`` emitted one).
        g.genome()
        with open(genome_path, 'w') as genome_file:
            genome_file.write('{0}\n'.format(g.genome_fasta_format()))

        # Contigs: written in random order.
        # NOTE(review): meaning of the trailing ``0, 3000`` arguments is
        # defined by contigs.generate_contigs -- confirm there.
        ctg_list = list(contigs.generate_contigs(
            g.sequence, args.min_contig, args.max_contig, 0, 3000))
        random.shuffle(ctg_list)
        with open(contig_path, 'w') as ctgs:
            ctgs.writelines(ctg_list)
    else:
        genome_path = args.genome

        # Use the longest sequence of the supplied genome file.
        longest_seq = 0
        with open(genome_path, 'r') as genome_file:
            for acc, seq in fasta.fasta_iter(genome_file):
                print('{0} {1}'.format(acc, len(seq)))
                if len(seq) > longest_seq:
                    g.sequence = seq
                    g.accession = acc
                    longest_seq = len(seq)
        print('chosen: {0}'.format(g.accession))

    # Reads
    lib = _build_read_library(args, g)

    # Records alternate between the two mate files (even index -> reads1,
    # odd index -> reads2).  Both files must be closed, and therefore
    # flushed, before the aligner reads them from disk.
    with open(read1_path, 'w') as reads1, open(read2_path, 'w') as reads2:
        for index, read in enumerate(lib.fasta_format()):
            if index % 2 == 0:
                reads1.write(read)
            else:
                reads2.write(read)

    # Mapping: reads are aligned against the genome, not the contigs.
    align.bwa_mem(read1_path, read2_path, genome_path, bam_path, args)