Exemplo n.º 1
0
def simulate_instance(args):
    """Insert known gap-size errors into ``sequence_0`` and map reads to it.

    The sequences returned by ``ReadInContigseqs`` for ``args.genome`` are
    copied unchanged, except for the one named ``sequence_0``.  That sequence
    is cut into 9000-character chunks and, for every (gap, error) combination,
    ten consecutive chunks are modified: ``gap + max(0, -error)`` characters
    are trimmed from the end of the chunk and ``max(0, gap + error)`` ``N``
    characters are appended.  Only the modified chunks are joined into the new
    ``sequence_0``; chunks after the last modified one are left out.

    One record per modified chunk is written with ``to_GFF`` to
    ``true_error_pos.gff`` and all sequences are written in FASTA form to
    ``ctgs.fa`` (both inside ``args.output_path``, which is created when
    missing).  Finally ``align.map_paired_reads`` is called with
    ``args.read1``, ``args.read2``, the contig file and the path ``mapped``.

    Args:
        args: object providing ``output_path``, ``genome``, ``read1`` and
            ``read2``; it is also handed on to ``align.map_paired_reads``.

    Raises:
        IndexError: if ``sequence_0`` yields fewer than 320 chunks
            (4 gaps * 8 errors * 10 samples).
    """
    print('Started modyfiyng genome')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = args.read1
    read2_path = args.read2
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    chunk_size = 9000
    gap_sizes = [0, 1000, 2000, 3000]
    error_sizes = [-2000, -1500, -1000, -500, 500, 1000, 1500, 2000]
    samples_per_setting = 10

    modified_genome = {}
    # Both handles are closed on exit, also when an exception is raised.
    with open(args.genome, 'r') as genome_file, \
            open(gff_path, 'w') as gff_file:
        # NOTE(review): the meaning of the second argument (10) is defined
        # by ReadInContigseqs, which is not visible here.
        genome_seqs = ReadInContigseqs(genome_file, 10)
        for acc, seq in genome_seqs.iteritems():
            if acc != 'sequence_0':
                modified_genome[acc] = seq
                continue

            chunks = [seq[start:start + chunk_size]
                      for start in range(0, len(seq), chunk_size)]
            modified_chunks = []
            chunk_idx = 0
            pos = 0  # length of the modified sequence built so far
            for gap in gap_sizes:
                for error in error_sizes:
                    n_run = 'N' * max(0, gap + error)
                    cut_size = gap + max(0, -error)
                    if error < 0:
                        note = 'Note=Error:Contraction {0}bp'.format(
                            abs(error))
                    else:
                        note = 'Note=Error:Expansion {0}bp'.format(
                            abs(error))

                    for _ in range(samples_per_setting):
                        chunk = chunks[chunk_idx]
                        chunk_idx += 1
                        modified_chunk = chunk[:len(chunk) - cut_size] + n_run
                        modified_chunks.append(modified_chunk)
                        pos += len(modified_chunk)

                        if gap + error > 0:
                            # error is anywhere in the introduced run of N's
                            error_start = pos - len(n_run)
                            error_stop = pos
                        else:
                            # no N's were added: the error sits at the
                            # junction right after this chunk
                            error_start = pos
                            error_stop = pos + 1

                        to_GFF(gff_file, '{0}'.format(acc), 'TRUTH', 'FCD',
                               error_start, error_stop, 1, '+', '.', note)

            modified_genome[acc] = ''.join(modified_chunks)

    # The contig file is closed before the mapper is started on it.
    with open(contig_path, 'w') as ctgs:
        for acc, seq in modified_genome.iteritems():
            ctgs.write('>{0}\n{1}\n'.format(acc, seq))

    print('Started mapping')
    align.map_paired_reads(read1_path, read2_path, contig_path, bam_path, args)
Exemplo n.º 2
0
def simulate_instance(args):
    """Insert one kind of gap-size error into ``sequence_0`` and map reads.

    The sequences returned by ``ReadInContigseqs`` for ``args.genome`` are
    copied unchanged, except for the one named ``sequence_0``.  That sequence
    is cut into 20000-character chunks and the first 100 chunks are modified:
    ``gap + max(0, -error)`` characters are trimmed from the end of each chunk
    and ``max(0, gap + error)`` ``N`` characters are appended, where ``gap``
    is ``args.gapsize`` and ``error`` is ``args.error``.  Only the modified
    chunks are joined into the new sequence, which is stored under a
    ``scf_gap..._errorsize...`` name instead of ``sequence_0``.

    One record per modified chunk is written with ``to_GFF`` to
    ``true_error_pos.gff`` and all sequences are written in FASTA form to
    ``ctgs.fa`` (both inside ``args.output_path``, which is created when
    missing).  The contig file is written and ``align.bwa_mem`` is run once,
    after all input sequences have been processed.

    Args:
        args: object providing ``output_path``, ``genome``, ``read1``,
            ``read2``, ``gapsize`` and ``error``; it is also handed on to
            ``align.bwa_mem``.

    Raises:
        IndexError: if ``sequence_0`` yields fewer than 100 chunks.
    """
    print('Started modyfiyng genome')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = args.read1
    read2_path = args.read2
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    gap = args.gapsize
    error = args.error
    chunk_size = 20000
    nr_samples = 100

    # Everything below depends only on gap/error, so compute it once.
    n_run = 'N' * max(0, gap + error)
    cut_size = gap + max(0, -error)
    if error < 0:
        note = 'Note=Error:Contraction {0}bp'.format(abs(error))
        # NOTE(review): error is negative here, so the name reads
        # "..._minus-<n>"; kept as is because existing outputs may be
        # matched on this name.
        scaffold_name = 'scf_gap{0}_errorsize_minus{1}'.format(gap, error)
    else:
        note = 'Note=Error:Expansion {0}bp'.format(abs(error))
        scaffold_name = 'scf_gap{0}_errorsize{1}'.format(gap, error)

    modified_genome = {}
    # Both handles are closed on exit, also when an exception is raised.
    with open(args.genome, 'r') as genome_file, \
            open(gff_path, 'w') as gff_file:
        # NOTE(review): the meaning of the second argument (10) is defined
        # by ReadInContigseqs, which is not visible here.
        genome_seqs = ReadInContigseqs(genome_file, 10)
        for acc, seq in genome_seqs.iteritems():
            if acc != 'sequence_0':
                modified_genome[acc] = seq
                continue

            chunks = [seq[start:start + chunk_size]
                      for start in range(0, len(seq), chunk_size)]
            modified_chunks = []
            pos = 0  # length of the modified sequence built so far
            for chunk_idx in range(nr_samples):
                chunk = chunks[chunk_idx]
                modified_chunk = chunk[:len(chunk) - cut_size] + n_run
                modified_chunks.append(modified_chunk)
                pos += len(modified_chunk)

                if gap + error > 0:
                    # error is anywhere in the introduced run of N's
                    error_start = pos - len(n_run)
                    error_stop = pos
                else:
                    # no N's were added: the error sits at the junction
                    # right after this chunk
                    error_start = pos
                    error_stop = pos + 1

                # NOTE(review): the record uses the original accession as
                # sequence id although the sequence is written to the FASTA
                # file under scaffold_name -- confirm this is intended.
                to_GFF(gff_file, '{0}'.format(acc), 'TRUTH', 'FCD',
                       error_start, error_stop, 1, '+', '.', note)

            modified_genome[scaffold_name] = ''.join(modified_chunks)

    # Write the contigs and map once, after every sequence has been handled;
    # the file is closed before the mapper is started on it.
    with open(contig_path, 'w') as ctgs:
        for name, sequence in modified_genome.iteritems():
            ctgs.write('>{0}\n{1}\n'.format(name, sequence))

    print('Started mapping')
    align.bwa_mem(read1_path, read2_path, contig_path, bam_path, args)
Exemplo n.º 3
0
def simulate_instance(args):
    """Insert one kind of gap-size error into ``sequence_0`` and map reads.

    The sequences returned by ``ReadInContigseqs`` for ``args.genome`` are
    copied unchanged, except for the one named ``sequence_0``.  That sequence
    is cut into 20000-character chunks and the first 100 chunks are modified:
    ``gap + max(0, -error)`` characters are trimmed from the end of each chunk
    and ``max(0, gap + error)`` ``N`` characters are appended, where ``gap``
    is ``args.gapsize`` and ``error`` is ``args.error``.  Only the modified
    chunks are joined into the new sequence, which is stored under a
    ``scf_gap..._errorsize...`` name instead of ``sequence_0``.

    One record per modified chunk is written with ``to_GFF`` to
    ``true_error_pos.gff`` and all sequences are written in FASTA form to
    ``ctgs.fa`` (both inside ``args.output_path``, which is created when
    missing).  The contig file is written and ``align.bwa_mem`` is run once,
    after all input sequences have been processed.

    Args:
        args: object providing ``output_path``, ``genome``, ``read1``,
            ``read2``, ``gapsize`` and ``error``; it is also handed on to
            ``align.bwa_mem``.

    Raises:
        IndexError: if ``sequence_0`` yields fewer than 100 chunks.
    """
    print('Started modyfiyng genome')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = args.read1
    read2_path = args.read2
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    gap = args.gapsize
    error = args.error
    chunk_size = 20000
    nr_samples = 100

    # Values that are identical for every modified chunk.
    gap_with_error = gap + error
    n_run = 'N' * max(0, gap_with_error)
    cut_size = gap + max(0, -error)
    if error < 0:
        note = 'Note=Error:Contraction {0}bp'.format(abs(error))
        # NOTE(review): error is negative here, so the name reads
        # "..._minus-<n>"; kept as is because existing outputs may be
        # matched on this name.
        scaffold_name = 'scf_gap{0}_errorsize_minus{1}'.format(gap, error)
    else:
        note = 'Note=Error:Expansion {0}bp'.format(abs(error))
        scaffold_name = 'scf_gap{0}_errorsize{1}'.format(gap, error)

    modified_genome = {}
    # Both handles are closed on exit, also when an exception is raised.
    with open(args.genome, 'r') as genome_file, \
            open(gff_path, 'w') as gff_file:
        # NOTE(review): the meaning of the second argument (10) is defined
        # by ReadInContigseqs, which is not visible here.
        genome_seqs = ReadInContigseqs(genome_file, 10)
        for acc, seq in genome_seqs.iteritems():
            if acc == 'sequence_0':
                chunks = [
                    seq[start:start + chunk_size]
                    for start in range(0, len(seq), chunk_size)
                ]
                modified_chunks = []
                pos = 0  # length of the modified sequence built so far
                for chunk_idx in range(nr_samples):
                    chunk = chunks[chunk_idx]
                    modified_chunk = chunk[:len(chunk) - cut_size] + n_run
                    modified_chunks.append(modified_chunk)
                    pos += len(modified_chunk)

                    if gap_with_error > 0:
                        # error is anywhere in the introduced run of N's
                        error_start = pos - len(n_run)
                        error_stop = pos
                    else:
                        # no N's were added: the error sits at the junction
                        # right after this chunk
                        error_start = pos
                        error_stop = pos + 1

                    # NOTE(review): the record uses the original accession
                    # as sequence id although the sequence is written to the
                    # FASTA file under scaffold_name -- confirm intended.
                    to_GFF(gff_file, '{0}'.format(acc), 'TRUTH', 'FCD',
                           error_start, error_stop, 1, '+', '.', note)

                modified_genome[scaffold_name] = ''.join(modified_chunks)
            else:
                modified_genome[acc] = seq

    # Write the contigs and map once, after every sequence has been handled;
    # the file is closed before the mapper is started on it.
    with open(contig_path, 'w') as ctgs:
        for name, sequence in modified_genome.iteritems():
            ctgs.write('>{0}\n{1}\n'.format(name, sequence))

    print('Started mapping')
    align.bwa_mem(read1_path, read2_path, contig_path, bam_path, args)
Exemplo n.º 4
0
def simulate_instance(args):
    """Simulate a genome, contigs/scaffolds and paired reads, then map them.

    Steps, all output going to ``args.output_path`` (created when missing):

    1. A genome of length ``burnin + ((contiglen + gaplen) * (nrgaps + 1)
       + contiglen) * (len(errorsize) + 1)`` is built with ``genome.Genome``
       and written to ``genome.fa``.
    2. ``ctgs.fa`` is written.  With ``args.scaffolds`` set it holds a burn-in
       sequence plus one scaffold per entry of ``args.errorsize``; each
       scaffold consists of ``nrgaps + 1`` contigs.  When ``gaplen + error``
       is positive the contigs are joined by that many ``N`` characters,
       otherwise each contig slice is shortened by ``abs(gaplen + error)``
       and the pieces are joined directly.  For every non-zero error a record
       per gap is written with ``to_GFF`` to ``true_error_pos.gff``.  Without
       ``args.scaffolds`` the burn-in and the individual contigs are written
       as separate records and the GFF file stays empty.
    3. Paired reads are simulated with ``reads.DNAseq`` and the records
       yielded by ``fasta_format()`` are written alternately to ``reads1.fa``
       and ``reads2.fa``.
    4. ``align.bwa_mem`` is called on the read and contig files.

    Every output file is closed before ``align.bwa_mem`` is started.

    Args:
        args: object providing ``output_path``, ``burnin``, ``contiglen``,
            ``gaplen``, ``nrgaps``, ``errorsize``, ``scaffolds``, ``distr``,
            ``read_length``, ``coverage`` and, depending on ``distr``,
            ``mean``/``sd`` or ``min_size``/``max_size``; it is also handed
            on to ``align.bwa_mem``.

    Raises:
        ValueError: if ``args.distr`` is neither ``'normal'`` nor
            ``'uniform'``.
    """
    print('Started simulating')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)
    genome_path = os.path.join(args.output_path, 'genome.fa')
    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = os.path.join(args.output_path, 'reads1.fa')
    read2_path = os.path.join(args.output_path, 'reads2.fa')
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    # genome
    unit_len = args.contiglen + args.gaplen  # one contig plus its true gap
    genomelen = args.burnin + (
        unit_len * (args.nrgaps + 1) + args.contiglen) * (
            len(args.errorsize) + 1)
    print(genomelen)
    g = genome.Genome([0.25] * 4, genomelen, 'genome1')
    g.genome()
    with open(genome_path, 'w') as genome_file:
        genome_file.write('{0}\n'.format(g.genome_fasta_format()))

    # contigs/scaffolds
    with open(gff_path, 'w') as gff_file, open(contig_path, 'w') as ctg_file:
        if args.scaffolds:
            ctg_file.write('>scf_burnin{0}\n{1}\n'.format(
                args.gaplen, g.sequence[0:args.burnin]))
            pos = args.burnin

            for error in args.errorsize:
                gap_with_error = args.gaplen + error
                if error < 0:
                    scaf_name = 'scf_gap{0}_errorsize_minus{1}'.format(
                        args.gaplen, abs(error))
                    note = 'Note=Error:Contraction {0}bp'.format(abs(error))
                else:
                    scaf_name = 'scf_gap{0}_errorsize{1}'.format(
                        args.gaplen, error)
                    note = 'Note=Error:Expansion {0}bp'.format(abs(error))

                pieces = []
                scaffold_len = 0
                contig_starts = range(pos, pos + (args.nrgaps + 1) * unit_len,
                                      unit_len)
                for i, x in enumerate(contig_starts):
                    # the last contig of a scaffold has no gap after it
                    followed_by_gap = i < args.nrgaps
                    if gap_with_error > 0:
                        piece = g.sequence[x:x + args.contiglen]
                        if followed_by_gap:
                            piece += 'N' * gap_with_error
                        pieces.append(piece)
                        scaffold_len += len(piece)
                        # error is anywhere in the introduced run of N's
                        error_start = scaffold_len - gap_with_error
                        error_stop = scaffold_len
                    else:
                        piece = g.sequence[x:x + args.contiglen +
                                           gap_with_error]
                        pieces.append(piece)
                        scaffold_len += len(piece)
                        # no N's: the error sits at the junction right
                        # after this piece
                        error_start = scaffold_len
                        error_stop = scaffold_len + 1

                    if followed_by_gap and error != 0:
                        to_GFF(gff_file, scaf_name, 'TRUTH', 'FCD',
                               error_start, error_stop, 1, '+', '.', note)

                ctg_file.write('>{0}\n{1}\n'.format(scaf_name,
                                                    ''.join(pieces)))
                pos = x + 2 * args.contiglen
        else:
            ctg_file.write('>ctg0\n{0}\n'.format(g.sequence[0:args.burnin]))
            for i, x in enumerate(range(args.burnin, genomelen, unit_len)):
                ctg_file.write('>ctg{0}\n{1}\n'.format(
                    i + 1, g.sequence[x:x + args.contiglen]))

    # reads
    if args.distr == 'normal':
        lib = reads.DNAseq(args.read_length,
                           args.coverage,
                           distribution=args.distr,
                           mean=args.mean,
                           stddev=args.sd)
    elif args.distr == 'uniform':
        lib = reads.DNAseq(args.read_length,
                           args.coverage,
                           distribution=args.distr,
                           min_size=args.min_size,
                           max_size=args.max_size)
    else:
        raise ValueError(
            'Unsupported insert size distribution: {0!r}'.format(args.distr))
    lib.simulate_pe_reads(g)

    # records alternate between the first and the second read file
    with open(read1_path, 'w') as reads1, open(read2_path, 'w') as reads2:
        for idx, read in enumerate(lib.fasta_format()):
            if idx % 2 == 0:
                reads1.write(read)
            else:
                reads2.write(read)

    print('Started mapping')
    align.bwa_mem(read1_path, read2_path, contig_path, bam_path, args)
Exemplo n.º 5
0
def simulate_instance(args):
    """Insert known gap-size errors into ``sequence_0`` and map reads to it.

    The sequences returned by ``ReadInContigseqs`` for ``args.genome`` are
    copied unchanged, except for the one named ``sequence_0``.  That sequence
    is cut into 9000-character chunks and, for every (gap, error) combination,
    ten consecutive chunks are modified: ``gap + max(0, -error)`` characters
    are trimmed from the end of the chunk and ``max(0, gap + error)`` ``N``
    characters are appended.  Only the modified chunks are joined into the new
    ``sequence_0``; chunks after the last modified one are left out.

    One record per modified chunk is written with ``to_GFF`` to
    ``true_error_pos.gff`` and all sequences are written in FASTA form to
    ``ctgs.fa`` (both inside ``args.output_path``, which is created when
    missing).  Finally ``align.map_paired_reads`` is called with
    ``args.read1``, ``args.read2``, the contig file and the path ``mapped``.

    Args:
        args: object providing ``output_path``, ``genome``, ``read1`` and
            ``read2``; it is also handed on to ``align.map_paired_reads``.

    Raises:
        IndexError: if ``sequence_0`` yields fewer than 320 chunks
            (4 gaps * 8 errors * 10 samples).
    """
    print('Started modyfiyng genome')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = args.read1
    read2_path = args.read2
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    chunk_size = 9000
    gap_sizes = [0, 1000, 2000, 3000]
    error_sizes = [-2000, -1500, -1000, -500, 500, 1000, 1500, 2000]
    samples_per_setting = 10

    modified_genome = {}
    # Both handles are closed on exit, also when an exception is raised.
    with open(args.genome, 'r') as genome_file, \
            open(gff_path, 'w') as gff_file:
        # NOTE(review): the meaning of the second argument (10) is defined
        # by ReadInContigseqs, which is not visible here.
        genome_seqs = ReadInContigseqs(genome_file, 10)
        for acc, seq in genome_seqs.iteritems():
            if acc == 'sequence_0':
                chunks = [
                    seq[start:start + chunk_size]
                    for start in range(0, len(seq), chunk_size)
                ]
                modified_chunks = []
                chunk_idx = 0
                pos = 0  # length of the modified sequence built so far
                for gap in gap_sizes:
                    for error in error_sizes:
                        gap_with_error = gap + error
                        n_run = 'N' * max(0, gap_with_error)
                        cut_size = gap + max(0, -error)
                        if error < 0:
                            note = 'Note=Error:Contraction {0}bp'.format(
                                abs(error))
                        else:
                            note = 'Note=Error:Expansion {0}bp'.format(
                                abs(error))

                        for _ in range(samples_per_setting):
                            chunk = chunks[chunk_idx]
                            chunk_idx += 1
                            modified_chunk = (
                                chunk[:len(chunk) - cut_size] + n_run)
                            modified_chunks.append(modified_chunk)
                            pos += len(modified_chunk)

                            if gap_with_error > 0:
                                # error is anywhere in the introduced N's
                                error_start = pos - len(n_run)
                                error_stop = pos
                            else:
                                # no N's were added: the error sits at the
                                # junction right after this chunk
                                error_start = pos
                                error_stop = pos + 1

                            to_GFF(gff_file, '{0}'.format(acc), 'TRUTH',
                                   'FCD', error_start, error_stop, 1, '+',
                                   '.', note)

                modified_genome[acc] = ''.join(modified_chunks)
            else:
                modified_genome[acc] = seq

    # The contig file is closed before the mapper is started on it.
    with open(contig_path, 'w') as ctgs:
        for acc, seq in modified_genome.iteritems():
            ctgs.write('>{0}\n{1}\n'.format(acc, seq))

    print('Started mapping')
    align.map_paired_reads(read1_path, read2_path, contig_path, bam_path, args)
Exemplo n.º 6
0
def simulate_instance(args):
    """Simulate a genome, contigs/scaffolds and paired reads, then map them.

    Steps, all output going to ``args.output_path`` (created when missing):

    1. A genome of length ``burnin + ((contiglen + gaplen) * (nrgaps + 1)
       + contiglen) * (len(errorsize) + 1)`` is built with ``genome.Genome``
       and written to ``genome.fa``.
    2. ``ctgs.fa`` is written.  With ``args.scaffolds`` set it holds a burn-in
       sequence plus one scaffold per entry of ``args.errorsize``; each
       scaffold consists of ``nrgaps + 1`` contigs.  When ``gaplen + error``
       is positive the contigs are joined by that many ``N`` characters,
       otherwise each contig slice is shortened by ``abs(gaplen + error)``
       and the pieces are joined directly.  For every non-zero error a record
       per gap is written with ``to_GFF`` to ``true_error_pos.gff``.  Without
       ``args.scaffolds`` the burn-in and the individual contigs are written
       as separate records and the GFF file stays empty.
    3. Paired reads are simulated with ``reads.DNAseq`` and the records
       yielded by ``fasta_format()`` are written alternately to ``reads1.fa``
       and ``reads2.fa``.
    4. ``align.bwa_mem`` is called on the read and contig files.

    Every output file is closed before ``align.bwa_mem`` is started.

    Args:
        args: object providing ``output_path``, ``burnin``, ``contiglen``,
            ``gaplen``, ``nrgaps``, ``errorsize``, ``scaffolds``, ``distr``,
            ``read_length``, ``coverage`` and, depending on ``distr``,
            ``mean``/``sd`` or ``min_size``/``max_size``; it is also handed
            on to ``align.bwa_mem``.

    Raises:
        ValueError: if ``args.distr`` is neither ``'normal'`` nor
            ``'uniform'``.
    """
    print('Started simulating')
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)
    genome_path = os.path.join(args.output_path, 'genome.fa')
    contig_path = os.path.join(args.output_path, 'ctgs.fa')
    read1_path = os.path.join(args.output_path, 'reads1.fa')
    read2_path = os.path.join(args.output_path, 'reads2.fa')
    bam_path = os.path.join(args.output_path, 'mapped')
    gff_path = os.path.join(args.output_path, 'true_error_pos.gff')

    # genome
    stride = args.contiglen + args.gaplen  # one contig plus its true gap
    genomelen = args.burnin + (
        stride * (args.nrgaps + 1) + args.contiglen) * (
            len(args.errorsize) + 1)
    print(genomelen)
    g = genome.Genome([0.25] * 4, genomelen, 'genome1')
    g.genome()
    with open(genome_path, 'w') as genome_file:
        genome_file.write('{0}\n'.format(g.genome_fasta_format()))

    # contigs/scaffolds
    with open(gff_path, 'w') as gff_file, open(contig_path, 'w') as out:
        if args.scaffolds:
            out.write('>scf_burnin{0}\n{1}\n'.format(
                args.gaplen, g.sequence[0:args.burnin]))
            pos = args.burnin

            for error in args.errorsize:
                scaffold_gap = args.gaplen + error
                if error < 0:
                    scaf_name = 'scf_gap{0}_errorsize_minus{1}'.format(
                        args.gaplen, abs(error))
                    note = 'Note=Error:Contraction {0}bp'.format(abs(error))
                else:
                    scaf_name = 'scf_gap{0}_errorsize{1}'.format(
                        args.gaplen, error)
                    note = 'Note=Error:Expansion {0}bp'.format(abs(error))

                pieces = []
                scaffold_coord = 0  # length of the scaffold built so far
                for i, x in enumerate(
                        range(pos, pos + (args.nrgaps + 1) * stride, stride)):
                    # the last contig of a scaffold has no gap after it
                    has_gap = i < args.nrgaps
                    if scaffold_gap > 0:
                        piece = g.sequence[x:x + args.contiglen]
                        if has_gap:
                            piece += 'N' * scaffold_gap
                        pieces.append(piece)
                        scaffold_coord += len(piece)
                        # error is anywhere in the introduced run of N's
                        error_start = scaffold_coord - scaffold_gap
                        error_stop = scaffold_coord
                    else:
                        piece = g.sequence[x:x + args.contiglen +
                                           scaffold_gap]
                        pieces.append(piece)
                        scaffold_coord += len(piece)
                        # no N's: the error sits at the junction right
                        # after this piece
                        error_start = scaffold_coord
                        error_stop = scaffold_coord + 1

                    if has_gap and error != 0:
                        to_GFF(gff_file, scaf_name, 'TRUTH', 'FCD',
                               error_start, error_stop, 1, '+', '.', note)

                out.write('>{0}\n{1}\n'.format(scaf_name, ''.join(pieces)))
                pos = x + 2 * args.contiglen
        else:
            out.write('>ctg0\n{0}\n'.format(g.sequence[0:args.burnin]))
            for i, x in enumerate(range(args.burnin, genomelen, stride)):
                out.write('>ctg{0}\n{1}\n'.format(
                    i + 1, g.sequence[x:x + args.contiglen]))

    # reads
    if args.distr == 'normal':
        lib = reads.DNAseq(args.read_length, args.coverage,
                           distribution=args.distr,
                           mean=args.mean, stddev=args.sd)
    elif args.distr == 'uniform':
        lib = reads.DNAseq(args.read_length, args.coverage,
                           distribution=args.distr,
                           min_size=args.min_size, max_size=args.max_size)
    else:
        raise ValueError(
            'Unsupported insert size distribution: {0!r}'.format(args.distr))
    lib.simulate_pe_reads(g)

    # records alternate between the first and the second read file
    with open(read1_path, 'w') as reads1, open(read2_path, 'w') as reads2:
        for idx, read in enumerate(lib.fasta_format()):
            if idx % 2 == 0:
                reads1.write(read)
            else:
                reads2.write(read)

    print('Started mapping')
    align.bwa_mem(read1_path, read2_path, contig_path, bam_path, args)