def read_fastaq(reads_file):
    """Return a getreads parser for `reads_file`, choosing the format from its name.

    The lowercased `reads_file.name` is checked for a FASTA extension (.fa, .fasta) and then
    for a FASTQ extension (.fq, .fastq). Any other name is passed to detect_format(), whose
    result is used as the file type.
    """
    lowered_name = reads_file.name.lower()
    if lowered_name.endswith(('.fa', '.fasta')):
        filetype = 'fasta'
    elif lowered_name.endswith(('.fq', '.fastq')):
        filetype = 'fastq'
    else:
        filetype = detect_format(reads_file)
    return getreads.getparser(reads_file, filetype=filetype)
def read_fastqs(infileh1, infileh2, tag_len=12, check_ids=False):
    """Count the barcodes found at the start of each read pair in two FASTQ files.

    For every pair, the first `tag_len` bases of each mate are taken as tags. The two tags are
    concatenated with the smaller one first; the order is recorded as 'ab' when mate 1's tag
    sorts strictly first and 'ba' otherwise.

    Args:
        infileh1: Input for mate 1, handed to getreads.getparser() as FASTQ.
        infileh2: Input for mate 2, handed to getreads.getparser() as FASTQ.
        tag_len: Number of leading bases of each read to use as its tag.
        check_ids: If true, verify each pair's ids with read_ids_match().

    Returns:
        A collections.Counter keyed by (barcode, order) tuples.

    Raises:
        getreads.FormatError: If `check_ids` is set and a pair's ids do not match.
    """
    mates1 = getreads.getparser(infileh1, filetype='fastq').parser()
    mates2 = getreads.getparser(infileh2, filetype='fastq').parser()
    counts = collections.Counter()
    exhausted = object()
    # Reading stops as soon as either input runs out of reads.
    for mate1 in mates1:
        mate2 = next(mates2, exhausted)
        if mate2 is exhausted:
            break
        if check_ids and not read_ids_match(mate1.id, mate2.id):
            raise getreads.FormatError(
                'Read pair mismatch: "{}" and "{}"'.format(mate1.id, mate2.id)
            )
        tag1 = mate1.seq[:tag_len]
        tag2 = mate2.seq[:tag_len]
        if tag1 < tag2:
            key = (tag1 + tag2, 'ab')
        else:
            key = (tag2 + tag1, 'ba')
        counts[key] += 1
    return counts
def main(argv):
    """Simulate barcoded read families and write them as FASTA or FASTQ read pairs.

    Fragments are either generated by running wgsim on the reference or read from an existing
    --frag-file. Each fragment gets two random barcodes and the invariant linker, a PCR tree is
    simulated for it, PCR and sequencing errors are applied, and one read pair is written per
    final fragment.

    Args:
        argv: The command line, including the program name at index 0 (`argv[1:]` is parsed).

    Raises:
        AssertionError: If --fastq-qual is negative, longer than one character, or neither an
            integer nor a string.
    """
    # Parse and interpret arguments.
    parser = make_argparser()
    args = parser.parse_args(argv[1:])
    logging.basicConfig(stream=args.log, level=args.volume, format='%(message)s')
    tone_down_logger()

    if not (args.ref or args.frag_file):
        parser.print_usage()
        fail('You must provide either a reference or fragments file.')
    if args.ref:
        if not os.path.isfile(args.ref):
            fail('Error: reference file {!r} not found.'.format(args.ref))
        if not os.path.getsize(args.ref):
            fail('Error: reference file {!r} empty (0 bytes).'.format(args.ref))
    else:
        if not (args.reads1 and args.reads2):
            fail('Error: must provide output --reads1 and --reads2 files.')

    if args.seed is None:
        seed = random.randint(0, 2**31-1)
        logging.info('seed: {}\n'.format(seed))
    else:
        seed = args.seed
    random.seed(seed)

    if args.stdout:
        reads1 = sys.stdout
        reads2 = sys.stdout
    else:
        reads1 = args.reads1
        reads2 = args.reads2

    # An integer quality is a PHRED score and is converted with the Sanger offset (33).
    if isinstance(args.fastq_qual, numbers.Integral):
        assert args.fastq_qual >= 0, '--fastq-qual cannot be negative.'
        fastq_qual = chr(args.fastq_qual + 33)
    elif isinstance(args.fastq_qual, basestring):
        assert len(args.fastq_qual) == 1, '--fastq-qual cannot be more than a single character.'
        fastq_qual = args.fastq_qual
    else:
        raise AssertionError('--fastq-qual must be a positive integer or single character.')
    invariant_rc = get_revcomp(args.invariant)

    # Create a temporary file to hold the fragments. Then work inside a try so we can finally
    # remove the file no matter what exceptions are encountered.
    tmpfile = tempfile.NamedTemporaryFile(prefix='wgdsim.frags.', delete=False)
    tmpfile.close()
    try:
        # Step 1: Use wgsim to create fragments from the reference.
        if args.frag_file:
            frag_path = args.frag_file
        else:
            frag_path = tmpfile.name
        if args.ref:
            #TODO: Check exit status
            #TODO: Check for wgsim on the PATH.
            # Set error and mutation rates to 0 to just slice sequences out of the reference
            # without modification.
            run_command('wgsim', '-e', '0', '-r', '0', '-d', '0', '-R', args.indel_rate,
                        '-S', seed, '-N', args.n_frags, '-X', args.ext_rate,
                        '-1', args.frag_len, args.ref, frag_path, os.devnull)

        # NOTE: Coordinates here are 0-based (0 is the first base in the sequence).
        extended_dist = extend_dist(RAW_DISTRIBUTION)
        proportional_dist = compile_dist(extended_dist)
        n_frags = 0
        for raw_fragment in getreads.getparser(frag_path, filetype='fastq'):
            n_frags += 1
            if n_frags > args.n_frags:
                break
            chrom, id_num, start, stop = parse_read_id(raw_fragment.id)
            barcode1 = get_rand_seq(args.bar_len)
            barcode2 = get_rand_seq(args.bar_len)
            barcode2_rc = get_revcomp(barcode2)
            #TODO: Vary the size of the fragment.
            #      Could add ~100bp to frag_len arg to wgsim, then randomly select a subsequence
            #      here.
            raw_frag_full = (barcode1 + args.invariant + raw_fragment.seq
                             + invariant_rc + barcode2)

            # Step 2: Determine how many reads to produce from each fragment.
            # - Use random.random() and divide the range 0-1 into segments of sizes proportional
            #   to the likelihood of each family size.
            # bisect.bisect() finds where an element belongs in a sorted list, returning the
            # index. proportional_dist is just such a sorted list, with values from 0 to 1.
            n_reads = bisect.bisect(proportional_dist, random.random())

            # Step 3: Introduce PCR errors.
            # - Determine the mutations and their frequencies.
            #   - Could get frequency from the cycle of PCR it occurs in.
            #     - Important to have PCR errors shared between reads.
            # - For each read, determine which mutations it contains.
            #   - Use random.random() < mut_freq.
            tree = build_good_pcr_tree(args.cycles, n_reads, args.efficiency_decline, 1000)
            # Add errors to all children of original fragment.
            subtree1 = tree.child1
            subtree2 = tree.child2
            #TODO: Only simulate errors on portions of fragment that will become reads.
            add_pcr_errors(subtree1, '+', len(raw_frag_full), args.pcr_error, args.indel_rate,
                           args.ext_rate)
            add_pcr_errors(subtree2, '-', len(raw_frag_full), args.pcr_error, args.indel_rate,
                           args.ext_rate)
            apply_pcr_errors(tree, raw_frag_full)
            fragments = get_final_fragments(tree)
            add_mutation_lists(tree, fragments, [])

            # Step 4: Introduce sequencing errors.
            for fragment in fragments.values():
                for mutation in generate_mutations(args.read_len, args.seq_error,
                                                   args.indel_rate, args.ext_rate):
                    fragment['mutations'].append(mutation)
                    fragment['seq'] = apply_mutation(mutation, fragment['seq'])

            # Print barcodes to log file.
            if args.barcodes:
                args.barcodes.write('{}-{}\t{}\t{}\n'.format(chrom, id_num, barcode1,
                                                             barcode2_rc))
            # Print family.
            for frag_id in sorted(fragments.keys()):
                fragment = fragments[frag_id]
                read_id = '{}-{}-{}'.format(chrom, id_num, frag_id)
                # Print mutations to log file.
                if args.mutations:
                    read1_muts = get_mutations_subset(fragment['mutations'], 0, args.read_len)
                    read2_muts = get_mutations_subset(fragment['mutations'], 0, args.read_len,
                                                      revcomp=True, seqlen=len(fragment['seq']))
                    if fragment['strand'] == '-':
                        read1_muts, read2_muts = read2_muts, read1_muts
                    log_mutations(args.mutations, read1_muts, read_id+'/1', chrom, start, stop)
                    log_mutations(args.mutations, read2_muts, read_id+'/2', chrom, start, stop)
                frag_seq = fragment['seq']
                read1_seq = frag_seq[:args.read_len]
                read2_seq = get_revcomp(frag_seq[len(frag_seq)-args.read_len:])
                if fragment['strand'] == '-':
                    read1_seq, read2_seq = read2_seq, read1_seq
                if args.out_format == 'fasta':
                    reads1.write('>{}\n{}\n'.format(read_id, read1_seq))
                    reads2.write('>{}\n{}\n'.format(read_id, read2_seq))
                elif args.out_format == 'fastq':
                    # Size each quality line from the actual read: a fragment shorter than
                    # read_len yields a shorter read, and FASTQ requires the quality string
                    # to be exactly as long as the sequence.
                    reads1.write('@{}\n{}\n+\n{}\n'.format(read_id, read1_seq,
                                                           fastq_qual * len(read1_seq)))
                    reads2.write('@{}\n{}\n+\n{}\n'.format(read_id, read2_seq,
                                                           fastq_qual * len(read2_seq)))
    finally:
        # Best-effort cleanup: the temporary file may already be gone.
        try:
            os.remove(tmpfile.name)
        except OSError:
            pass
def main(argv):
    """Simulate barcoded read families and write them as FASTA or FASTQ read pairs.

    Fragments are either generated by running wgsim on the reference or read from an existing
    --frag-file. Each fragment gets two barcodes (drawn from --bar-list when given, otherwise
    random) and the invariant linker, a PCR tree is simulated for it, PCR and sequencing errors
    are applied, and one read pair is written per final fragment.

    Args:
        argv: The command line, including the program name at index 0 (`argv[1:]` is parsed).

    Raises:
        AssertionError: If --fastq-qual is negative, longer than one character, or neither an
            integer nor a string.
    """
    # Parse and interpret arguments.
    parser = make_argparser()
    args = parser.parse_args(argv[1:])
    logging.basicConfig(stream=args.log, level=args.volume, format='%(message)s')
    tone_down_logger()

    if not (args.ref or args.frag_file):
        parser.print_usage()
        fail('You must provide either a reference or fragments file.')

    barcodes = None
    if args.bar_list:
        bar_path = str(args.bar_list)
        # Check for the file before opening it, so a missing file gives our own error message.
        if not os.path.isfile(bar_path):
            fail('Error: barcode list file {!r} not found.'.format(bar_path))
        with open(bar_path, 'r') as bar_file:
            # Barcodes are whitespace-separated.
            barcodes = bar_file.read().split()
        if not barcodes:
            # random.choice() would otherwise raise IndexError on the first fragment.
            fail('Error: barcode list file {!r} contains no barcodes.'.format(bar_path))

    if args.ref:
        if not os.path.isfile(args.ref):
            fail('Error: reference file {!r} not found.'.format(args.ref))
        if not os.path.getsize(args.ref):
            fail('Error: reference file {!r} empty (0 bytes).'.format(args.ref))
    else:
        if not (args.reads1 and args.reads2):
            fail('Error: must provide output --reads1 and --reads2 files.')

    if args.seed is None:
        seed = random.randint(0, 2**31 - 1)
        logging.info('seed: {}\n'.format(seed))
    else:
        seed = args.seed
    random.seed(seed)

    if args.stdout:
        reads1 = sys.stdout
        reads2 = sys.stdout
    else:
        reads1 = args.reads1
        reads2 = args.reads2

    # An integer quality is a PHRED score and is converted with the Sanger offset (33).
    if isinstance(args.fastq_qual, numbers.Integral):
        assert args.fastq_qual >= 0, '--fastq-qual cannot be negative.'
        fastq_qual = chr(args.fastq_qual + 33)
    elif isinstance(args.fastq_qual, str):
        assert len(args.fastq_qual) == 1, '--fastq-qual cannot be more than a single character.'
        fastq_qual = args.fastq_qual
    else:
        raise AssertionError('--fastq-qual must be a positive integer or single character.')
    invariant_rc = pcr.get_revcomp(args.invariant)

    # Create a temporary file to hold the fragments. Then work inside a try so we can finally
    # remove the file no matter what exceptions are encountered.
    tmpfile = tempfile.NamedTemporaryFile(prefix='wgdsim.frags.', delete=False)
    tmpfile.close()
    try:
        # Step 1: Use wgsim to create fragments from the reference.
        if args.frag_file:
            frag_path = args.frag_file
        else:
            frag_path = tmpfile.name
        if args.ref:
            #TODO: Check exit status
            #TODO: Check for wgsim on the PATH.
            # Set error and mutation rates to 0 to just slice sequences out of the reference
            # without modification.
            run_command('wgsim', '-e', '0', '-r', '0', '-d', '0', '-R', args.indel_rate,
                        '-S', seed, '-N', args.n_frags, '-X', args.ext_rate,
                        '-1', args.frag_len, args.ref, frag_path, os.devnull)

        # NOTE: Coordinates here are 0-based (0 is the first base in the sequence).
        extended_dist = extend_dist(RAW_DISTRIBUTION)
        proportional_dist = compile_dist(extended_dist)
        n_frags = 0
        for raw_fragment in getreads.getparser(frag_path, filetype='fastq'):
            n_frags += 1
            if n_frags > args.n_frags:
                break
            chrom, id_num, start, stop = parse_read_id(raw_fragment.id)
            if args.bar_list:
                barcode1 = random.choice(barcodes)
                barcode2 = random.choice(barcodes)
            else:
                barcode1 = pcr.get_rand_seq(args.bar_len)
                barcode2 = pcr.get_rand_seq(args.bar_len)
            barcode2_rc = pcr.get_revcomp(barcode2)
            #TODO: Vary the size of the fragment.
            #      Could add ~100bp to frag_len arg to wgsim, then randomly select a subsequence
            #      here.
            raw_frag_full = (barcode1 + args.invariant + raw_fragment.seq
                             + invariant_rc + barcode2)

            # Step 2: Determine how many reads to produce from each fragment.
            # - Use random.random() and divide the range 0-1 into segments of sizes proportional
            #   to the likelihood of each family size.
            # bisect.bisect() finds where an element belongs in a sorted list, returning the
            # index. proportional_dist is just such a sorted list, with values from 0 to 1.
            n_reads = bisect.bisect(proportional_dist, random.random())

            # Step 3: Introduce PCR errors.
            # - Determine the mutations and their frequencies.
            #   - Could get frequency from the cycle of PCR it occurs in.
            #     - Important to have PCR errors shared between reads.
            # - For each read, determine which mutations it contains.
            #   - Use random.random() < mut_freq.
            tree = pcr.build_good_pcr_tree(args.cycles, n_reads, args.efficiency_decline, 1000)
            # Add errors to all children of original fragment.
            subtree1 = tree.child1
            subtree2 = tree.child2
            #TODO: Only simulate errors on portions of fragment that will become reads.
            frag_len = len(raw_frag_full)
            pcr.add_pcr_errors(subtree1, '+', frag_len, args.pcr_error, args.indel_rate,
                               args.ext_rate)
            pcr.add_pcr_errors(subtree2, '-', frag_len, args.pcr_error, args.indel_rate,
                               args.ext_rate)
            pcr.apply_pcr_errors(tree, raw_frag_full)
            fragments = pcr.get_final_fragments(tree)
            pcr.add_mutation_lists(tree, fragments, [])

            # Step 4: Introduce sequencing errors.
            for fragment in fragments.values():
                for mutation in pcr.generate_mutations(args.read_len, args.seq_error,
                                                       args.indel_rate, args.ext_rate):
                    fragment['mutations'].append(mutation)
                    fragment['seq'] = pcr.apply_mutation(mutation, fragment['seq'])

            # Print barcodes to log file.
            if args.barcodes:
                args.barcodes.write('{}-{}\t{}\t{}\n'.format(chrom, id_num, barcode1,
                                                             barcode2_rc))
            # Print family.
            for frag_id in sorted(fragments.keys()):
                fragment = fragments[frag_id]
                read_id = '{}-{}-{}'.format(chrom, id_num, frag_id)
                # Print mutations to log file.
                if args.mutations:
                    read1_muts = pcr.get_mutations_subset(fragment['mutations'], 0,
                                                          args.read_len)
                    read2_muts = pcr.get_mutations_subset(fragment['mutations'], 0,
                                                          args.read_len, revcomp=True,
                                                          seqlen=len(fragment['seq']))
                    if fragment['strand'] == '-':
                        read1_muts, read2_muts = read2_muts, read1_muts
                    pcr.log_mutations(args.mutations, read1_muts, read_id + '/1', chrom, start,
                                      stop)
                    pcr.log_mutations(args.mutations, read2_muts, read_id + '/2', chrom, start,
                                      stop)
                frag_seq = fragment['seq']
                read1_seq = frag_seq[:args.read_len]
                read2_seq = pcr.get_revcomp(frag_seq[len(frag_seq) - args.read_len:])
                if fragment['strand'] == '-':
                    read1_seq, read2_seq = read2_seq, read1_seq
                if args.out_format == 'fasta':
                    reads1.write('>{}\n{}\n'.format(read_id, read1_seq))
                    reads2.write('>{}\n{}\n'.format(read_id, read2_seq))
                elif args.out_format == 'fastq':
                    # Size each quality line from the actual read, in case the fragment is
                    # shorter than the read length.
                    reads1.write('@{}\n{}\n+\n{}\n'.format(read_id, read1_seq,
                                                           fastq_qual * len(read1_seq)))
                    reads2.write('@{}\n{}\n+\n{}\n'.format(read_id, read2_seq,
                                                           fastq_qual * len(read2_seq)))
    finally:
        # Best-effort cleanup: the temporary file may already be gone.
        try:
            os.remove(tmpfile.name)
        except OSError:
            pass
def main(argv):
    """Simulate barcoded read families and write them as FASTA or FASTQ read pairs.

    Builds the command-line parser, then obtains fragments either by running wgsim on the
    reference or by reading an existing --frag-file. Each fragment gets two random barcodes and
    the invariant linker, a PCR tree is simulated for it, PCR and sequencing errors are
    applied, and one read pair is written per final fragment.

    Args:
        argv: The command line, including the program name at index 0 (`argv[1:]` is parsed).

    Raises:
        AssertionError: If neither a reference nor --frag-file is given, or if --fastq-qual is
            negative, longer than one character, or neither an integer nor a string.
        SystemExit: Via parser.error() when the reference is missing or empty and no
            --frag-file was given to fall back on.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.set_defaults(**ARG_DEFAULTS)

    parser.add_argument(
        'ref', metavar='ref.fa', nargs='?',
        help='Reference sequence. Omit if giving --frag-file.')
    parser.add_argument(
        'out1', type=argparse.FileType('w'),
        help='Write final mate 1 reads to this file.')
    parser.add_argument(
        'out2', type=argparse.FileType('w'),
        help='Write final mate 2 reads to this file.')
    parser.add_argument('-o', '--out-format', choices=('fastq', 'fasta'))
    parser.add_argument(
        '--stdout', action='store_true',
        help='Print interleaved output reads to stdout.')
    parser.add_argument(
        '-m', '--mutations', type=argparse.FileType('w'),
        help='Write a log of the PCR and sequencing errors introduced to this file. Will overwrite any '
             'existing file at this path.')
    parser.add_argument(
        '-b', '--barcodes', type=argparse.FileType('w'),
        help='Write a log of which barcodes were ligated to which fragments. Will overwrite any '
             'existing file at this path.')
    parser.add_argument(
        '--frag-file',
        help='The path of the FASTQ file of fragments. If --ref is given, these will be generated with '
             'wgsim and kept (normally a temporary file is used, then deleted). Note: the file will be '
             'overwritten! If --ref is not given, then this should be a file of already generated '
             'fragments, and they will be used instead of generating new ones.')
    parser.add_argument(
        '-Q', '--fastq-qual',
        help='The quality score to assign to all bases in FASTQ output. Give a character or PHRED '
             'score (integer). A PHRED score will be converted using the Sanger offset (33). Default: '
             '"%(default)s"')
    # The seed is written to stderr below, so the help text says stderr.
    parser.add_argument(
        '-S', '--seed', type=int,
        help='Random number generator seed. By default, a random, 32-bit seed will be generated and '
             'logged to stderr.')
    params = parser.add_argument_group('simulation parameters')
    params.add_argument(
        '-n', '--n-frags', type=int,
        help='The number of original fragment molecules to simulate. The final number of reads will be '
             'this multiplied by the average number of reads per family. If you provide fragments with '
             '--frag-file, the script will still only read in the number specified here. Default: '
             '%(default)s')
    params.add_argument('-r', '--read-len', type=int, help='Default: %(default)s')
    params.add_argument('-f', '--frag-len', type=int, help='Default: %(default)s')
    params.add_argument(
        '-s', '--seq-error', type=float,
        help='Sequencing error rate per base (0-1 proportion, not percent). Default: %(default)s')
    params.add_argument(
        '-p', '--pcr-error', type=float,
        help='PCR error rate per base (0-1 proportion, not percent). Default: %(default)s')
    params.add_argument(
        '-c', '--cycles', type=int,
        help='Number of PCR cycles to simulate. Default: %(default)s')
    params.add_argument(
        '-i', '--indel-rate', type=float,
        help='Fraction of errors which are indels. Default: %(default)s')
    params.add_argument(
        '-E', '--extension-rate', dest='ext_rate', type=float,
        help='Probability an indel is extended. Default: %(default)s')
    params.add_argument(
        '-B', '--bar-len', type=int,
        help='Length of the barcodes to generate. Default: %(default)s')
    params.add_argument(
        '-I', '--invariant',
        help='The invariant linker sequence between the barcode and sample sequence in each read. '
             'Default: %(default)s')

    # Parse and interpret arguments.
    args = parser.parse_args(argv[1:])
    assert args.ref or args.frag_file, 'You must provide either a reference or fragments file.'
    # wgsim is only run on a reference that exists and is non-empty. If it is unusable and there
    # is no fragments file to fall back on, there is nothing to read, so stop with a clear error.
    ref_usable = bool(args.ref) and os.path.isfile(args.ref) and os.path.getsize(args.ref) > 0
    if not (ref_usable or args.frag_file):
        parser.error('reference file {!r} is missing or empty.'.format(args.ref))

    if args.seed is None:
        seed = random.randint(0, 2**31 - 1)
        sys.stderr.write('seed: {}\n'.format(seed))
    else:
        seed = args.seed
    random.seed(seed)

    if args.stdout:
        out1 = sys.stdout
        out2 = sys.stdout
    else:
        out1 = args.out1
        out2 = args.out2

    # An integer quality is a PHRED score and is converted with the Sanger offset (33).
    if isinstance(args.fastq_qual, numbers.Integral):
        assert args.fastq_qual >= 0, '--fastq-qual cannot be negative.'
        fastq_qual = chr(args.fastq_qual + 33)
    elif isinstance(args.fastq_qual, basestring):
        assert len(args.fastq_qual) == 1, '--fastq-qual cannot be more than a single character.'
        fastq_qual = args.fastq_qual
    else:
        raise AssertionError('--fastq-qual must be a positive integer or single character.')
    invariant_rc = get_revcomp(args.invariant)

    # Create a temporary file to hold the fragments. delete=False keeps the file on disk after
    # close(), so its name stays reserved until the finally block removes it. With the default
    # (delete=True) the file would vanish on close() and the name could be taken by another
    # process before wgsim writes to it.
    tmpfile = tempfile.NamedTemporaryFile(prefix='wgdsim.frags.', delete=False)
    tmpfile.close()
    try:
        # Step 1: Use wgsim to create fragments from the reference.
        if args.frag_file:
            frag_path = args.frag_file
        else:
            frag_path = tmpfile.name
        if ref_usable:
            #TODO: Check exit status
            #TODO: Check for wgsim on the PATH.
            # Set error and mutation rates to 0 to just slice sequences out of the reference
            # without modification.
            run_command('wgsim', '-e', '0', '-r', '0', '-d', '0', '-R', args.indel_rate,
                        '-S', seed, '-N', args.n_frags, '-X', args.ext_rate,
                        '-1', args.frag_len, args.ref, frag_path, os.devnull)

        # NOTE: Coordinates here are 0-based (0 is the first base in the sequence).
        extended_dist = extend_dist(RAW_DISTRIBUTION)
        proportional_dist = compile_dist(extended_dist)
        n_frags = 0
        for raw_fragment in getreads.getparser(frag_path, filetype='fastq'):
            n_frags += 1
            if n_frags > args.n_frags:
                break
            chrom, id_num, start, stop = parse_read_id(raw_fragment.id)
            barcode1 = get_rand_seq(args.bar_len)
            barcode2 = get_rand_seq(args.bar_len)
            barcode2_rc = get_revcomp(barcode2)
            raw_frag_full = (barcode1 + args.invariant + raw_fragment.seq
                             + invariant_rc + barcode2)

            # Step 2: Determine how many reads to produce from each fragment.
            # - Use random.random() and divide the range 0-1 into segments of sizes proportional
            #   to the likelihood of each family size.
            # bisect.bisect() finds where an element belongs in a sorted list, returning the
            # index. proportional_dist is just such a sorted list, with values from 0 to 1.
            n_reads = bisect.bisect(proportional_dist, random.random())

            # Step 3: Introduce PCR errors.
            # - Determine the mutations and their frequencies.
            #   - Could get frequency from the cycle of PCR it occurs in.
            #     - Important to have PCR errors shared between reads.
            # - For each read, determine which mutations it contains.
            #   - Use random.random() < mut_freq.
            tree = get_good_pcr_tree(n_reads, args.cycles, 1000, max_diff=1)
            # Add errors to all children of original fragment.
            subtree1 = tree.get('child1')
            subtree2 = tree.get('child2')
            #TODO: Only simulate errors on portions of fragment that will become reads.
            add_pcr_errors(subtree1, '+', len(raw_frag_full), args.pcr_error, args.indel_rate,
                           args.ext_rate)
            add_pcr_errors(subtree2, '-', len(raw_frag_full), args.pcr_error, args.indel_rate,
                           args.ext_rate)
            apply_pcr_errors(tree, raw_frag_full)
            fragments = get_final_fragments(tree)
            add_mutation_lists(tree, fragments, [])

            # Step 4: Introduce sequencing errors.
            for fragment in fragments.values():
                for mutation in generate_mutations(args.read_len, args.seq_error,
                                                   args.indel_rate, args.ext_rate):
                    fragment['mutations'].append(mutation)
                    fragment['seq'] = apply_mutation(mutation, fragment['seq'])

            # Print barcodes to log file.
            if args.barcodes:
                args.barcodes.write('{}-{}\t{}\t{}\n'.format(chrom, id_num, barcode1,
                                                             barcode2_rc))
            # Print family.
            for frag_id in sorted(fragments.keys()):
                fragment = fragments[frag_id]
                read_id = '{}-{}-{}'.format(chrom, id_num, frag_id)
                # Print mutations to log file.
                if args.mutations:
                    read1_muts = get_mutations_subset(fragment['mutations'], 0, args.read_len)
                    read2_muts = get_mutations_subset(fragment['mutations'], 0, args.read_len,
                                                      revcomp=True, seqlen=len(fragment['seq']))
                    if fragment['strand'] == '-':
                        read1_muts, read2_muts = read2_muts, read1_muts
                    log_mutations(args.mutations, read1_muts, read_id + '/1', chrom, start, stop)
                    log_mutations(args.mutations, read2_muts, read_id + '/2', chrom, start, stop)
                frag_seq = fragment['seq']
                read1_seq = frag_seq[:args.read_len]
                read2_seq = get_revcomp(frag_seq[len(frag_seq) - args.read_len:])
                if fragment['strand'] == '-':
                    read1_seq, read2_seq = read2_seq, read1_seq
                if args.out_format == 'fasta':
                    out1.write('>{}\n{}\n'.format(read_id, read1_seq))
                    out2.write('>{}\n{}\n'.format(read_id, read2_seq))
                elif args.out_format == 'fastq':
                    # Size each quality line from the actual read: a fragment shorter than
                    # read_len yields a shorter read, and FASTQ requires the quality string
                    # to be exactly as long as the sequence.
                    out1.write('@{}\n{}\n+\n{}\n'.format(read_id, read1_seq,
                                                         fastq_qual * len(read1_seq)))
                    out2.write('@{}\n{}\n+\n{}\n'.format(read_id, read2_seq,
                                                         fastq_qual * len(read2_seq)))
    finally:
        # Best-effort cleanup: the temporary file may already be gone.
        try:
            os.remove(tmpfile.name)
        except OSError:
            pass
def fasta_to_fastq(fasta_file, fastq_file, qual_char):
    """Write every read parsed from `fasta_file` to `fastq_file` as a FASTQ record.

    Each record uses the read's name and sequence, and a quality line made of `qual_char`
    repeated once per base.
    """
    record_format = '@{0}\n{1}\n+\n{2}\n'
    for record in getreads.getparser(fasta_file, filetype='fasta'):
        sequence = record.seq
        quality = qual_char * len(sequence)
        fastq_file.write(record_format.format(record.name, sequence, quality))
def find_and_write_chosen_reads(chosen_names, input_fastq, output_fastq):
    """Parse `input_fastq` as FASTQ, select reads, and write the selection to `output_fastq`.

    Selection is delegated to find_chosen_reads(), which receives the parsed reads and
    `chosen_names`; its result is passed to write_reads() together with `output_fastq`.
    """
    write_reads(
        find_chosen_reads(getreads.getparser(input_fastq, filetype='fastq'), chosen_names),
        output_fastq,
    )
def start_new_file(self, new_file):
    """Open `new_file`, remember it as `self.current_file`, and return a read parser for it.

    The parser is built by getreads.getparser() using `self.format` as the file type.
    """
    handle = open(new_file)
    self.current_file = handle
    reader = getreads.getparser(handle, self.format)
    return reader.parser()
def main(argv):
    """Simulate barcoded read families and write them as FASTA or FASTQ read pairs.

    Builds the command-line parser, then obtains fragments either by running wgsim on the
    reference or by reading an existing --frag-file. Each fragment gets two random barcodes and
    the invariant linker, a PCR tree is simulated for it, PCR and sequencing errors are
    applied, and one read pair is written per final fragment.

    Args:
        argv: The command line, including the program name at index 0 (`argv[1:]` is parsed).

    Raises:
        AssertionError: If neither a reference nor --frag-file is given, or if --fastq-qual is
            negative, longer than one character, or neither an integer nor a string.
        SystemExit: Via parser.error() when the reference is missing or empty and no
            --frag-file was given to fall back on.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.set_defaults(**ARG_DEFAULTS)

    parser.add_argument('ref', metavar='ref.fa', nargs='?',
        help='Reference sequence. Omit if giving --frag-file.')
    parser.add_argument('out1', type=argparse.FileType('w'),
        help='Write final mate 1 reads to this file.')
    parser.add_argument('out2', type=argparse.FileType('w'),
        help='Write final mate 2 reads to this file.')
    parser.add_argument('-o', '--out-format', choices=('fastq', 'fasta'))
    parser.add_argument('--stdout', action='store_true',
        help='Print interleaved output reads to stdout.')
    parser.add_argument('-m', '--mutations', type=argparse.FileType('w'),
        help='Write a log of the PCR and sequencing errors introduced to this file. Will overwrite any '
             'existing file at this path.')
    parser.add_argument('-b', '--barcodes', type=argparse.FileType('w'),
        help='Write a log of which barcodes were ligated to which fragments. Will overwrite any '
             'existing file at this path.')
    parser.add_argument('--frag-file',
        help='The path of the FASTQ file of fragments. If --ref is given, these will be generated with '
             'wgsim and kept (normally a temporary file is used, then deleted). Note: the file will be '
             'overwritten! If --ref is not given, then this should be a file of already generated '
             'fragments, and they will be used instead of generating new ones.')
    parser.add_argument('-Q', '--fastq-qual',
        help='The quality score to assign to all bases in FASTQ output. Give a character or PHRED '
             'score (integer). A PHRED score will be converted using the Sanger offset (33). Default: '
             '"%(default)s"')
    # The seed is written to stderr below, so the help text says stderr.
    parser.add_argument('-S', '--seed', type=int,
        help='Random number generator seed. By default, a random, 32-bit seed will be generated and '
             'logged to stderr.')
    params = parser.add_argument_group('simulation parameters')
    params.add_argument('-n', '--n-frags', type=int,
        help='The number of original fragment molecules to simulate. The final number of reads will be '
             'this multiplied by the average number of reads per family. If you provide fragments with '
             '--frag-file, the script will still only read in the number specified here. Default: '
             '%(default)s')
    params.add_argument('-r', '--read-len', type=int,
        help='Default: %(default)s')
    params.add_argument('-f', '--frag-len', type=int,
        help='Default: %(default)s')
    params.add_argument('-s', '--seq-error', type=float,
        help='Sequencing error rate per base (0-1 proportion, not percent). Default: %(default)s')
    params.add_argument('-p', '--pcr-error', type=float,
        help='PCR error rate per base (0-1 proportion, not percent). Default: %(default)s')
    params.add_argument('-c', '--cycles', type=int,
        help='Number of PCR cycles to simulate. Default: %(default)s')
    params.add_argument('-i', '--indel-rate', type=float,
        help='Fraction of errors which are indels. Default: %(default)s')
    params.add_argument('-E', '--extension-rate', dest='ext_rate', type=float,
        help='Probability an indel is extended. Default: %(default)s')
    params.add_argument('-B', '--bar-len', type=int,
        help='Length of the barcodes to generate. Default: %(default)s')
    params.add_argument('-I', '--invariant',
        help='The invariant linker sequence between the barcode and sample sequence in each read. '
             'Default: %(default)s')

    # Parse and interpret arguments.
    args = parser.parse_args(argv[1:])
    assert args.ref or args.frag_file, 'You must provide either a reference or fragments file.'
    # wgsim is only run on a reference that exists and is non-empty. If it is unusable and there
    # is no fragments file to fall back on, there is nothing to read, so stop with a clear error.
    ref_usable = bool(args.ref) and os.path.isfile(args.ref) and os.path.getsize(args.ref) > 0
    if not (ref_usable or args.frag_file):
        parser.error('reference file {!r} is missing or empty.'.format(args.ref))

    if args.seed is None:
        seed = random.randint(0, 2**31-1)
        sys.stderr.write('seed: {}\n'.format(seed))
    else:
        seed = args.seed
    random.seed(seed)

    if args.stdout:
        out1 = sys.stdout
        out2 = sys.stdout
    else:
        out1 = args.out1
        out2 = args.out2

    # An integer quality is a PHRED score and is converted with the Sanger offset (33).
    if isinstance(args.fastq_qual, numbers.Integral):
        assert args.fastq_qual >= 0, '--fastq-qual cannot be negative.'
        fastq_qual = chr(args.fastq_qual + 33)
    elif isinstance(args.fastq_qual, basestring):
        assert len(args.fastq_qual) == 1, '--fastq-qual cannot be more than a single character.'
        fastq_qual = args.fastq_qual
    else:
        raise AssertionError('--fastq-qual must be a positive integer or single character.')
    invariant_rc = get_revcomp(args.invariant)

    # Create a temporary file to hold the fragments. delete=False keeps the file on disk after
    # close(), so its name stays reserved until the finally block removes it. With the default
    # (delete=True) the file would vanish on close() and the name could be taken by another
    # process before wgsim writes to it.
    tmpfile = tempfile.NamedTemporaryFile(prefix='wgdsim.frags.', delete=False)
    tmpfile.close()
    try:
        # Step 1: Use wgsim to create fragments from the reference.
        if args.frag_file:
            frag_path = args.frag_file
        else:
            frag_path = tmpfile.name
        if ref_usable:
            #TODO: Check exit status
            #TODO: Check for wgsim on the PATH.
            # Set error and mutation rates to 0 to just slice sequences out of the reference
            # without modification.
            run_command('wgsim', '-e', '0', '-r', '0', '-d', '0', '-R', args.indel_rate,
                        '-S', seed, '-N', args.n_frags, '-X', args.ext_rate,
                        '-1', args.frag_len, args.ref, frag_path, os.devnull)

        # NOTE: Coordinates here are 0-based (0 is the first base in the sequence).
        extended_dist = extend_dist(RAW_DISTRIBUTION)
        proportional_dist = compile_dist(extended_dist)
        n_frags = 0
        for raw_fragment in getreads.getparser(frag_path, filetype='fastq'):
            n_frags += 1
            if n_frags > args.n_frags:
                break
            chrom, id_num, start, stop = parse_read_id(raw_fragment.id)
            barcode1 = get_rand_seq(args.bar_len)
            barcode2 = get_rand_seq(args.bar_len)
            barcode2_rc = get_revcomp(barcode2)
            raw_frag_full = (barcode1 + args.invariant + raw_fragment.seq
                             + invariant_rc + barcode2)

            # Step 2: Determine how many reads to produce from each fragment.
            # - Use random.random() and divide the range 0-1 into segments of sizes proportional
            #   to the likelihood of each family size.
            # bisect.bisect() finds where an element belongs in a sorted list, returning the
            # index. proportional_dist is just such a sorted list, with values from 0 to 1.
            n_reads = bisect.bisect(proportional_dist, random.random())

            # Step 3: Introduce PCR errors.
            # - Determine the mutations and their frequencies.
            #   - Could get frequency from the cycle of PCR it occurs in.
            #     - Important to have PCR errors shared between reads.
            # - For each read, determine which mutations it contains.
            #   - Use random.random() < mut_freq.
            tree = get_good_pcr_tree(n_reads, args.cycles, 1000, max_diff=1)
            # Add errors to all children of original fragment.
            subtree1 = tree.get('child1')
            subtree2 = tree.get('child2')
            #TODO: Only simulate errors on portions of fragment that will become reads.
            add_pcr_errors(subtree1, '+', len(raw_frag_full), args.pcr_error, args.indel_rate,
                           args.ext_rate)
            add_pcr_errors(subtree2, '-', len(raw_frag_full), args.pcr_error, args.indel_rate,
                           args.ext_rate)
            apply_pcr_errors(tree, raw_frag_full)
            fragments = get_final_fragments(tree)
            add_mutation_lists(tree, fragments, [])

            # Step 4: Introduce sequencing errors.
            for fragment in fragments.values():
                for mutation in generate_mutations(args.read_len, args.seq_error,
                                                   args.indel_rate, args.ext_rate):
                    fragment['mutations'].append(mutation)
                    fragment['seq'] = apply_mutation(mutation, fragment['seq'])

            # Print barcodes to log file.
            if args.barcodes:
                args.barcodes.write('{}-{}\t{}\t{}\n'.format(chrom, id_num, barcode1,
                                                             barcode2_rc))
            # Print family.
            for frag_id in sorted(fragments.keys()):
                fragment = fragments[frag_id]
                read_id = '{}-{}-{}'.format(chrom, id_num, frag_id)
                # Print mutations to log file.
                if args.mutations:
                    read1_muts = get_mutations_subset(fragment['mutations'], 0, args.read_len)
                    read2_muts = get_mutations_subset(fragment['mutations'], 0, args.read_len,
                                                      revcomp=True, seqlen=len(fragment['seq']))
                    if fragment['strand'] == '-':
                        read1_muts, read2_muts = read2_muts, read1_muts
                    log_mutations(args.mutations, read1_muts, read_id+'/1', chrom, start, stop)
                    log_mutations(args.mutations, read2_muts, read_id+'/2', chrom, start, stop)
                frag_seq = fragment['seq']
                read1_seq = frag_seq[:args.read_len]
                read2_seq = get_revcomp(frag_seq[len(frag_seq)-args.read_len:])
                if fragment['strand'] == '-':
                    read1_seq, read2_seq = read2_seq, read1_seq
                if args.out_format == 'fasta':
                    out1.write('>{}\n{}\n'.format(read_id, read1_seq))
                    out2.write('>{}\n{}\n'.format(read_id, read2_seq))
                elif args.out_format == 'fastq':
                    # Size each quality line from the actual read: a fragment shorter than
                    # read_len yields a shorter read, and FASTQ requires the quality string
                    # to be exactly as long as the sequence.
                    out1.write('@{}\n{}\n+\n{}\n'.format(read_id, read1_seq,
                                                         fastq_qual * len(read1_seq)))
                    out2.write('@{}\n{}\n+\n{}\n'.format(read_id, read2_seq,
                                                         fastq_qual * len(read2_seq)))
    finally:
        # Best-effort cleanup: the temporary file may already be gone.
        try:
            os.remove(tmpfile.name)
        except OSError:
            pass
def fastq_to_fasta(fastq_file, fasta_file):
    """Write every read parsed from `fastq_file` to `fasta_file` as a FASTA record.

    Only the read's name and sequence are written.
    """
    record_format = '>{0}\n{1}\n'
    for record in getreads.getparser(fastq_file, filetype='fastq'):
        fasta_record = record_format.format(record.name, record.seq)
        fasta_file.write(fasta_record)