Exemplo n.º 1
0
def read_fastaq(reads_file):
    """Return a getreads parser for `reads_file`, as FASTA or FASTQ.

    The file type is picked from the lowercased `reads_file.name`:
    '.fa'/'.fasta' means FASTA and '.fq'/'.fastq' means FASTQ. Any other
    name is handed to detect_format() to decide.
    """
    lowered_name = reads_file.name.lower()
    if lowered_name.endswith(('.fa', '.fasta')):
        filetype = 'fasta'
    elif lowered_name.endswith(('.fq', '.fastq')):
        filetype = 'fastq'
    else:
        # Extension is not recognized; defer to the detection helper.
        filetype = detect_format(reads_file)
    return getreads.getparser(reads_file, filetype=filetype)
Exemplo n.º 2
0
def read_fastqs(infileh1, infileh2, tag_len=12, check_ids=False):
  """Tally the barcodes seen across a pair of FASTQ files.

  Both files are read in lockstep; reading stops as soon as either one runs out of reads.
  For each pair, the first `tag_len` bases of each mate are concatenated with the smaller tag
  first. The order is recorded as 'ab' when mate 1's tag compares lower, 'ba' otherwise.

  Returns a collections.Counter keyed by (barcode, order).
  Raises getreads.FormatError when `check_ids` is true and read_ids_match() rejects a pair.
  """
  mates1 = getreads.getparser(infileh1, filetype='fastq').parser()
  mates2 = getreads.getparser(infileh2, filetype='fastq').parser()
  counts = collections.Counter()
  while True:
    # Advance both parsers together; either one being exhausted ends the loop.
    try:
      mate1 = next(mates1)
      mate2 = next(mates2)
    except StopIteration:
      break
    if check_ids and not read_ids_match(mate1.id, mate2.id):
      raise getreads.FormatError('Read pair mismatch: "{}" and "{}"'.format(mate1.id, mate2.id))
    tag1, tag2 = mate1.seq[:tag_len], mate2.seq[:tag_len]
    # Canonical form: smaller tag first, with the order remembering which mate it came from.
    key = (tag1 + tag2, 'ab') if tag1 < tag2 else (tag2 + tag1, 'ba')
    counts[key] += 1
  return counts
Exemplo n.º 3
0
def read_fastqs(infileh1, infileh2, tag_len=12, check_ids=False):
  """Count (barcode, order) occurrences over two paired FASTQ inputs.

  The inputs are consumed one read pair at a time until either parser is exhausted. The barcode
  for a pair is made from the leading `tag_len` bases of both mates, lower-sorting tag first.
  `order` is 'ab' if mate 1 supplied that first tag and 'ba' if not.

  Returns a collections.Counter mapping (barcode, order) to the number of pairs seen.
  Raises getreads.FormatError if `check_ids` is set and the ids of a pair fail read_ids_match().
  """
  parser1 = getreads.getparser(infileh1, filetype='fastq').parser()
  parser2 = getreads.getparser(infileh2, filetype='fastq').parser()
  tally = collections.Counter()
  while True:
    try:
      first = next(parser1)
      second = next(parser2)
    except StopIteration:
      # One of the files has no more reads.
      break
    if check_ids and not read_ids_match(first.id, second.id):
      raise getreads.FormatError('Read pair mismatch: "{}" and "{}"'.format(first.id, second.id))
    head1 = first.seq[:tag_len]
    head2 = second.seq[:tag_len]
    if head1 < head2:
      entry = (head1 + head2, 'ab')
    else:
      entry = (head2 + head1, 'ba')
    tally[entry] += 1
  return tally
Exemplo n.º 4
0
def main(argv):
  """Simulate read families from fragments and write them out as mate 1 / mate 2 reads.

  Fragments come from running `wgsim` on the reference (when args.ref is given) or from an
  existing --frag-file. Each fragment gets barcodes and the invariant linker attached, goes
  through the PCR-error and sequencing-error steps below, and every resulting family member is
  written as a read pair in FASTA or FASTQ format.

  argv: the full argument vector; argv[0] is skipped when parsing.

  NOTE(review): fail() is assumed not to return (the checks after it rely on that) - confirm.
  """
  # Parse and interpret arguments.
  parser = make_argparser()
  args = parser.parse_args(argv[1:])
  logging.basicConfig(stream=args.log, level=args.volume, format='%(message)s')
  tone_down_logger()
  if not (args.ref or args.frag_file):
    parser.print_usage()
    fail('You must provide either a reference or fragments file.')
  if args.ref:
    if not os.path.isfile(args.ref):
      fail('Error: reference file {!r} not found.'.format(args.ref))
    if not os.path.getsize(args.ref):
      fail('Error: reference file {!r} empty (0 bytes).'.format(args.ref))
  else:
    if not (args.reads1 and args.reads2):
      fail('Error: must provide output --reads1 and --reads2 files.')
  if args.seed is None:
    # No seed given: pick one and log it so the run can be reproduced.
    seed = random.randint(0, 2**31-1)
    logging.info('seed: {}\n'.format(seed))
  else:
    seed = args.seed
  random.seed(seed)
  if args.stdout:
    # Both mates go to stdout, so the output is interleaved.
    reads1 = sys.stdout
    reads2 = sys.stdout
  else:
    reads1 = args.reads1
    reads2 = args.reads2
  # Normalize --fastq-qual to a single quality character. An integer is treated as a PHRED score
  # and converted with an offset of 33.
  if isinstance(args.fastq_qual, numbers.Integral):
    assert args.fastq_qual >= 0, '--fastq-qual cannot be negative.'
    fastq_qual = chr(args.fastq_qual + 33)
  elif isinstance(args.fastq_qual, basestring):
    assert len(args.fastq_qual) == 1, '--fastq-qual cannot be more than a single character.'
    fastq_qual = args.fastq_qual
  else:
    raise AssertionError('--fastq-qual must be a positive integer or single character.')

  invariant_rc = get_revcomp(args.invariant)

  # Reserve a temporary file path for the wgsim fragments. Then work inside a try so we can
  # finally remove the file no matter what exceptions are encountered.
  tmpfile = tempfile.NamedTemporaryFile(prefix='wgdsim.frags.', delete=False)
  tmpfile.close()
  try:
    # Step 1: Use wgsim to create fragments from the reference.
    if args.frag_file:
      frag_path = args.frag_file
    else:
      frag_path = tmpfile.name
    if args.ref:
      #TODO: Check exit status
      #TODO: Check for wgsim on the PATH.
      # Set error and mutation rates to 0 to just slice sequences out of the reference without
      # modification.
      run_command('wgsim', '-e', '0', '-r', '0', '-d', '0', '-R', args.indel_rate, '-S', seed,
                  '-N', args.n_frags, '-X', args.ext_rate, '-1', args.frag_len,
                  args.ref, frag_path, os.devnull)

    # NOTE: Coordinates here are 0-based (0 is the first base in the sequence).
    extended_dist = extend_dist(RAW_DISTRIBUTION)
    proportional_dist = compile_dist(extended_dist)
    n_frags = 0
    for raw_fragment in getreads.getparser(frag_path, filetype='fastq'):
      n_frags += 1
      # Never use more than --n-frags fragments, even if the file holds more.
      if n_frags > args.n_frags:
        break
      chrom, id_num, start, stop = parse_read_id(raw_fragment.id)
      barcode1 = get_rand_seq(args.bar_len)
      barcode2 = get_rand_seq(args.bar_len)
      barcode2_rc = get_revcomp(barcode2)
      #TODO: Vary the size of the fragment.
      #      Could add ~100bp to frag_len arg to wgsim, then randomly select a subsequence here.
      raw_frag_full = barcode1 + args.invariant + raw_fragment.seq + invariant_rc + barcode2

      # Step 2: Determine how many reads to produce from each fragment.
      # - Use random.random() and divide the range 0-1 into segments of sizes proportional to
      #   the likelihood of each family size.
      # bisect.bisect() finds where an element belongs in a sorted list, returning the index.
      # proportional_dist is just such a sorted list, with values from 0 to 1.
      n_reads = bisect.bisect(proportional_dist, random.random())

      # Step 3: Introduce PCR errors.
      # - Determine the mutations and their frequencies.
      #   - Could get frequency from the cycle of PCR it occurs in.
      #     - Important to have PCR errors shared between reads.
      # - For each read, determine which mutations it contains.
      #   - Use random.random() < mut_freq.
      tree = build_good_pcr_tree(args.cycles, n_reads, args.efficiency_decline, 1000)
      # Add errors to all children of original fragment.
      subtree1 = tree.child1
      subtree2 = tree.child2
      #TODO: Only simulate errors on portions of fragment that will become reads.
      add_pcr_errors(subtree1, '+', len(raw_frag_full), args.pcr_error, args.indel_rate, args.ext_rate)
      add_pcr_errors(subtree2, '-', len(raw_frag_full), args.pcr_error, args.indel_rate, args.ext_rate)
      apply_pcr_errors(tree, raw_frag_full)
      fragments = get_final_fragments(tree)
      add_mutation_lists(tree, fragments, [])

      # Step 4: Introduce sequencing errors.
      for fragment in fragments.values():
        for mutation in generate_mutations(args.read_len, args.seq_error, args.indel_rate,
                                           args.ext_rate):
          fragment['mutations'].append(mutation)
          fragment['seq'] = apply_mutation(mutation, fragment['seq'])

      # Print barcodes to log file.
      if args.barcodes:
        args.barcodes.write('{}-{}\t{}\t{}\n'.format(chrom, id_num, barcode1, barcode2_rc))
      # Print family.
      for frag_id in sorted(fragments.keys()):
        fragment = fragments[frag_id]
        read_id = '{}-{}-{}'.format(chrom, id_num, frag_id)
        # Print mutations to log file.
        if args.mutations:
          read1_muts = get_mutations_subset(fragment['mutations'], 0, args.read_len)
          read2_muts = get_mutations_subset(fragment['mutations'], 0, args.read_len, revcomp=True,
                                            seqlen=len(fragment['seq']))
          if fragment['strand'] == '-':
            read1_muts, read2_muts = read2_muts, read1_muts
          log_mutations(args.mutations, read1_muts, read_id+'/1', chrom, start, stop)
          log_mutations(args.mutations, read2_muts, read_id+'/2', chrom, start, stop)
        frag_seq = fragment['seq']
        read1_seq = frag_seq[:args.read_len]
        # Clamp the start at 0: if the fragment is shorter than read_len, a negative start index
        # would count from the end and silently truncate mate 2 instead of using the whole
        # fragment.
        read2_start = max(0, len(frag_seq) - args.read_len)
        read2_seq = get_revcomp(frag_seq[read2_start:])
        if fragment['strand'] == '-':
          read1_seq, read2_seq = read2_seq, read1_seq
        if args.out_format == 'fasta':
          reads1.write('>{}\n{}\n'.format(read_id, read1_seq))
          reads2.write('>{}\n{}\n'.format(read_id, read2_seq))
        elif args.out_format == 'fastq':
          # FASTQ requires the quality line to be exactly as long as the sequence line, so size
          # it from each read rather than from --read-len (reads can be shorter than that).
          qual_line1 = fastq_qual * len(read1_seq)
          qual_line2 = fastq_qual * len(read2_seq)
          reads1.write('@{}\n{}\n+\n{}\n'.format(read_id, read1_seq, qual_line1))
          reads2.write('@{}\n{}\n+\n{}\n'.format(read_id, read2_seq, qual_line2))

  finally:
    # Best-effort cleanup: the temporary file may already be gone.
    try:
      os.remove(tmpfile.name)
    except OSError:
      pass
Exemplo n.º 5
0
def main(argv):
    """Simulate read families from fragments and write them out as mate 1 / mate 2 reads.

    Fragments come from running `wgsim` on the reference (when args.ref is given) or from an
    existing --frag-file. Each fragment gets two barcodes (drawn from --bar-list when given,
    otherwise generated) and the invariant linker attached, goes through the PCR-error and
    sequencing-error steps below, and every resulting family member is written as a read pair
    in FASTA or FASTQ format.

    argv: the full argument vector; argv[0] is skipped when parsing.

    NOTE(review): fail() is assumed not to return (the checks after it rely on that) - confirm.
    """
    # Parse and interpret arguments.
    parser = make_argparser()
    args = parser.parse_args(argv[1:])
    logging.basicConfig(stream=args.log,
                        level=args.volume,
                        format='%(message)s')
    tone_down_logger()
    if not (args.ref or args.frag_file):
        parser.print_usage()
        fail('You must provide either a reference or fragments file.')
    barcodes = None
    if args.bar_list:
        bar_list_path = str(args.bar_list)
        # Check for the file before opening it, so a missing file gives the friendly error
        # instead of an uncaught exception from open().
        if not os.path.isfile(bar_list_path):
            fail('Error: barcode list file {!r} not found.'.format(
                bar_list_path))
        # Barcodes are whitespace-separated; the context manager closes the file promptly.
        with open(bar_list_path, 'r') as bar_list_file:
            barcodes = bar_list_file.read().split()
        # random.choice() below cannot draw from an empty list.
        if not barcodes:
            fail('Error: barcode list file {!r} contains no barcodes.'.format(
                bar_list_path))
    if args.ref:
        if not os.path.isfile(args.ref):
            fail('Error: reference file {!r} not found.'.format(args.ref))
        if not os.path.getsize(args.ref):
            fail('Error: reference file {!r} empty (0 bytes).'.format(
                args.ref))
    else:
        if not (args.reads1 and args.reads2):
            fail('Error: must provide output --reads1 and --reads2 files.')
    if args.seed is None:
        # No seed given: pick one and log it so the run can be reproduced.
        seed = random.randint(0, 2**31 - 1)
        logging.info('seed: {}\n'.format(seed))
    else:
        seed = args.seed
    random.seed(seed)
    if args.stdout:
        # Both mates go to stdout, so the output is interleaved.
        reads1 = sys.stdout
        reads2 = sys.stdout
    else:
        reads1 = args.reads1
        reads2 = args.reads2
    # Normalize --fastq-qual to a single quality character. An integer is treated as a PHRED
    # score and converted with an offset of 33.
    if isinstance(args.fastq_qual, numbers.Integral):
        assert args.fastq_qual >= 0, '--fastq-qual cannot be negative.'
        fastq_qual = chr(args.fastq_qual + 33)
    elif isinstance(args.fastq_qual, str):
        assert len(
            args.fastq_qual
        ) == 1, '--fastq-qual cannot be more than a single character.'
        fastq_qual = args.fastq_qual
    else:
        raise AssertionError(
            '--fastq-qual must be a positive integer or single character.')

    invariant_rc = pcr.get_revcomp(args.invariant)

    # Reserve a temporary file path for the wgsim fragments. Then work inside a try so we can
    # finally remove the file no matter what exceptions are encountered.
    tmpfile = tempfile.NamedTemporaryFile(prefix='wgdsim.frags.', delete=False)
    tmpfile.close()
    try:
        # Step 1: Use wgsim to create fragments from the reference.
        if args.frag_file:
            frag_path = args.frag_file
        else:
            frag_path = tmpfile.name
        if args.ref:
            #TODO: Check exit status
            #TODO: Check for wgsim on the PATH.
            # Set error and mutation rates to 0 to just slice sequences out of the reference without
            # modification.
            run_command('wgsim', '-e', '0', '-r', '0', '-d', '0', '-R',
                        args.indel_rate, '-S', seed, '-N', args.n_frags, '-X',
                        args.ext_rate, '-1', args.frag_len, args.ref,
                        frag_path, os.devnull)

        # NOTE: Coordinates here are 0-based (0 is the first base in the sequence).
        extended_dist = extend_dist(RAW_DISTRIBUTION)
        proportional_dist = compile_dist(extended_dist)
        n_frags = 0
        for raw_fragment in getreads.getparser(frag_path, filetype='fastq'):
            n_frags += 1
            # Never use more than --n-frags fragments, even if the file holds more.
            if n_frags > args.n_frags:
                break
            chrom, id_num, start, stop = parse_read_id(raw_fragment.id)

            if args.bar_list:
                barcode1 = random.choice(barcodes)
                barcode2 = random.choice(barcodes)
            else:
                barcode1 = pcr.get_rand_seq(args.bar_len)
                barcode2 = pcr.get_rand_seq(args.bar_len)
            barcode2_rc = pcr.get_revcomp(barcode2)
            #TODO: Vary the size of the fragment.
            #      Could add ~100bp to frag_len arg to wgsim, then randomly select a subsequence here.
            raw_frag_full = barcode1 + args.invariant + raw_fragment.seq + invariant_rc + barcode2

            # Step 2: Determine how many reads to produce from each fragment.
            # - Use random.random() and divide the range 0-1 into segments of sizes proportional to
            #   the likelihood of each family size.
            # bisect.bisect() finds where an element belongs in a sorted list, returning the index.
            # proportional_dist is just such a sorted list, with values from 0 to 1.
            n_reads = bisect.bisect(proportional_dist, random.random())

            # Step 3: Introduce PCR errors.
            # - Determine the mutations and their frequencies.
            #   - Could get frequency from the cycle of PCR it occurs in.
            #     - Important to have PCR errors shared between reads.
            # - For each read, determine which mutations it contains.
            #   - Use random.random() < mut_freq.
            tree = pcr.build_good_pcr_tree(args.cycles, n_reads,
                                           args.efficiency_decline, 1000)
            # Add errors to all children of original fragment.
            subtree1 = tree.child1
            subtree2 = tree.child2
            #TODO: Only simulate errors on portions of fragment that will become reads.
            frag_len = len(raw_frag_full)
            pcr.add_pcr_errors(subtree1, '+', frag_len, args.pcr_error,
                               args.indel_rate, args.ext_rate)
            pcr.add_pcr_errors(subtree2, '-', frag_len, args.pcr_error,
                               args.indel_rate, args.ext_rate)
            pcr.apply_pcr_errors(tree, raw_frag_full)
            fragments = pcr.get_final_fragments(tree)
            pcr.add_mutation_lists(tree, fragments, [])

            # Step 4: Introduce sequencing errors.
            for fragment in fragments.values():
                for mutation in pcr.generate_mutations(args.read_len,
                                                       args.seq_error,
                                                       args.indel_rate,
                                                       args.ext_rate):
                    fragment['mutations'].append(mutation)
                    fragment['seq'] = pcr.apply_mutation(
                        mutation, fragment['seq'])

            # Print barcodes to log file.
            if args.barcodes:
                args.barcodes.write('{}-{}\t{}\t{}\n'.format(
                    chrom, id_num, barcode1, barcode2_rc))
            # Print family.
            for frag_id in sorted(fragments.keys()):
                fragment = fragments[frag_id]
                read_id = '{}-{}-{}'.format(chrom, id_num, frag_id)
                # Print mutations to log file.
                if args.mutations:
                    read1_muts = pcr.get_mutations_subset(
                        fragment['mutations'], 0, args.read_len)
                    read2_muts = pcr.get_mutations_subset(
                        fragment['mutations'],
                        0,
                        args.read_len,
                        revcomp=True,
                        seqlen=len(fragment['seq']))
                    if fragment['strand'] == '-':
                        read1_muts, read2_muts = read2_muts, read1_muts
                    pcr.log_mutations(args.mutations, read1_muts,
                                      read_id + '/1', chrom, start, stop)
                    pcr.log_mutations(args.mutations, read2_muts,
                                      read_id + '/2', chrom, start, stop)
                frag_seq = fragment['seq']
                read1_seq = frag_seq[:args.read_len]
                # Clamp the start at 0: if the fragment is shorter than read_len, a negative
                # start index would count from the end and silently truncate mate 2 instead of
                # using the whole fragment.
                read2_start = max(0, len(frag_seq) - args.read_len)
                read2_seq = pcr.get_revcomp(frag_seq[read2_start:])
                if fragment['strand'] == '-':
                    read1_seq, read2_seq = read2_seq, read1_seq
                if args.out_format == 'fasta':
                    reads1.write('>{}\n{}\n'.format(read_id, read1_seq))
                    reads2.write('>{}\n{}\n'.format(read_id, read2_seq))
                elif args.out_format == 'fastq':
                    # FASTQ requires the quality line to be exactly as long as the sequence
                    # line, so size it from each read separately (a fragment can be shorter
                    # than the read length).
                    qual_line1 = fastq_qual * len(read1_seq)
                    qual_line2 = fastq_qual * len(read2_seq)
                    reads1.write('@{}\n{}\n+\n{}\n'.format(
                        read_id, read1_seq, qual_line1))
                    reads2.write('@{}\n{}\n+\n{}\n'.format(
                        read_id, read2_seq, qual_line2))

    finally:
        # Best-effort cleanup: the temporary file may already be gone.
        try:
            os.remove(tmpfile.name)
        except OSError:
            pass
Exemplo n.º 6
0
def main(argv):
    """Parse the command line, then simulate read families and write them as read pairs.

    Fragments come from running `wgsim` on the reference, or from an existing --frag-file.
    Each fragment gets two generated barcodes and the invariant linker attached, goes through
    the PCR-error and sequencing-error steps below, and every resulting family member is
    written as a mate 1 / mate 2 pair in FASTA or FASTQ format.

    argv: the full argument vector; argv[0] is skipped when parsing.

    Raises AssertionError when neither a reference nor a fragments file is given, or when
    --fastq-qual is not a non-negative integer or a single character.
    """

    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.set_defaults(**ARG_DEFAULTS)

    parser.add_argument('ref',
                        metavar='ref.fa',
                        nargs='?',
                        help='Reference sequence. Omit if giving --frag-file.')
    parser.add_argument('out1',
                        type=argparse.FileType('w'),
                        help='Write final mate 1 reads to this file.')
    parser.add_argument('out2',
                        type=argparse.FileType('w'),
                        help='Write final mate 2 reads to this file.')
    parser.add_argument('-o', '--out-format', choices=('fastq', 'fasta'))
    parser.add_argument('--stdout',
                        action='store_true',
                        help='Print interleaved output reads to stdout.')
    parser.add_argument(
        '-m',
        '--mutations',
        type=argparse.FileType('w'),
        help=
        'Write a log of the PCR and sequencing errors introduced to this file. Will overwrite any '
        'existing file at this path.')
    parser.add_argument(
        '-b',
        '--barcodes',
        type=argparse.FileType('w'),
        help=
        'Write a log of which barcodes were ligated to which fragments. Will overwrite any '
        'existing file at this path.')
    parser.add_argument(
        '--frag-file',
        help=
        'The path of the FASTQ file of fragments. If --ref is given, these will be generated with '
        'wgsim and kept (normally a temporary file is used, then deleted). Note: the file will be '
        'overwritten! If --ref is not given, then this should be a file of already generated '
        'fragments, and they will be used instead of generating new ones.')
    parser.add_argument(
        '-Q',
        '--fastq-qual',
        help=
        'The quality score to assign to all bases in FASTQ output. Give a character or PHRED '
        'score (integer). A PHRED score will be converted using the Sanger offset (33). Default: '
        '"%(default)s"')
    parser.add_argument(
        '-S',
        '--seed',
        type=int,
        help=
        'Random number generator seed. By default, a random, 32-bit seed will be generated and '
        'logged to stdout.')
    params = parser.add_argument_group('simulation parameters')
    params.add_argument(
        '-n',
        '--n-frags',
        type=int,
        help=
        'The number of original fragment molecules to simulate. The final number of reads will be '
        'this multiplied by the average number of reads per family. If you provide fragments with '
        '--frag-file, the script will still only read in the number specified here. Default: '
        '%(default)s')
    params.add_argument('-r',
                        '--read-len',
                        type=int,
                        help='Default: %(default)s')
    params.add_argument('-f',
                        '--frag-len',
                        type=int,
                        help='Default: %(default)s')
    params.add_argument(
        '-s',
        '--seq-error',
        type=float,
        help=
        'Sequencing error rate per base (0-1 proportion, not percent). Default: %(default)s'
    )
    params.add_argument(
        '-p',
        '--pcr-error',
        type=float,
        help=
        'PCR error rate per base (0-1 proportion, not percent). Default: %(default)s'
    )
    params.add_argument(
        '-c',
        '--cycles',
        type=int,
        help='Number of PCR cycles to simulate. Default: %(default)s')
    params.add_argument(
        '-i',
        '--indel-rate',
        type=float,
        help='Fraction of errors which are indels. Default: %(default)s')
    params.add_argument(
        '-E',
        '--extension-rate',
        dest='ext_rate',
        type=float,
        help='Probability an indel is extended. Default: %(default)s')
    params.add_argument(
        '-B',
        '--bar-len',
        type=int,
        help='Length of the barcodes to generate. Default: %(default)s')
    params.add_argument(
        '-I',
        '--invariant',
        help=
        'The invariant linker sequence between the barcode and sample sequence in each read. '
        'Default: %(default)s')

    # Parse and interpret arguments.
    args = parser.parse_args(argv[1:])
    assert args.ref or args.frag_file, 'You must provide either a reference or fragments file.'
    if args.seed is None:
        # No seed given: pick one and report it on stderr so the run can be reproduced.
        seed = random.randint(0, 2**31 - 1)
        sys.stderr.write('seed: {}\n'.format(seed))
    else:
        seed = args.seed
    random.seed(seed)
    if args.stdout:
        # Both mates go to stdout, so the output is interleaved.
        out1 = sys.stdout
        out2 = sys.stdout
    else:
        out1 = args.out1
        out2 = args.out2
    # Normalize --fastq-qual to a single quality character. An integer is treated as a PHRED
    # score and converted with an offset of 33.
    if isinstance(args.fastq_qual, numbers.Integral):
        assert args.fastq_qual >= 0, '--fastq-qual cannot be negative.'
        fastq_qual = chr(args.fastq_qual + 33)
    elif isinstance(args.fastq_qual, basestring):
        assert len(
            args.fastq_qual
        ) == 1, '--fastq-qual cannot be more than a single character.'
        fastq_qual = args.fastq_qual
    else:
        raise AssertionError(
            '--fastq-qual must be a positive integer or single character.')

    invariant_rc = get_revcomp(args.invariant)

    # Reserve a temporary file path for the wgsim fragments. Then work inside a try so we can
    # finally remove the file no matter what exceptions are encountered.
    # NOTE(review): with the default delete=True, close() removes the file right away, so only
    # the name is reused below; nothing exists at that path until wgsim writes to it.
    tmpfile = tempfile.NamedTemporaryFile(prefix='wgdsim.frags.')
    tmpfile.close()
    try:
        # Step 1: Use wgsim to create fragments from the reference.
        if args.frag_file:
            frag_path = args.frag_file
        else:
            frag_path = tmpfile.name
        # wgsim is skipped without any message when the reference is missing or empty.
        if args.ref and os.path.isfile(args.ref) and os.path.getsize(args.ref):
            #TODO: Check exit status
            #TODO: Check for wgsim on the PATH.
            # Set error and mutation rates to 0 to just slice sequences out of the reference without
            # modification.
            run_command('wgsim', '-e', '0', '-r', '0', '-d', '0', '-R',
                        args.indel_rate, '-S', seed, '-N', args.n_frags, '-X',
                        args.ext_rate, '-1', args.frag_len, args.ref,
                        frag_path, os.devnull)

        # NOTE: Coordinates here are 0-based (0 is the first base in the sequence).
        extended_dist = extend_dist(RAW_DISTRIBUTION)
        proportional_dist = compile_dist(extended_dist)
        n_frags = 0
        for raw_fragment in getreads.getparser(frag_path, filetype='fastq'):
            n_frags += 1
            # Never use more than --n-frags fragments, even if the file holds more.
            if n_frags > args.n_frags:
                break
            chrom, id_num, start, stop = parse_read_id(raw_fragment.id)
            barcode1 = get_rand_seq(args.bar_len)
            barcode2 = get_rand_seq(args.bar_len)
            barcode2_rc = get_revcomp(barcode2)
            raw_frag_full = barcode1 + args.invariant + raw_fragment.seq + invariant_rc + barcode2

            # Step 2: Determine how many reads to produce from each fragment.
            # - Use random.random() and divide the range 0-1 into segments of sizes proportional to
            #   the likelihood of each family size.
            # bisect.bisect() finds where an element belongs in a sorted list, returning the index.
            # proportional_dist is just such a sorted list, with values from 0 to 1.
            n_reads = bisect.bisect(proportional_dist, random.random())

            # Step 3: Introduce PCR errors.
            # - Determine the mutations and their frequencies.
            #   - Could get frequency from the cycle of PCR it occurs in.
            #     - Important to have PCR errors shared between reads.
            # - For each read, determine which mutations it contains.
            #   - Use random.random() < mut_freq.
            tree = get_good_pcr_tree(n_reads, args.cycles, 1000, max_diff=1)
            # Add errors to all children of original fragment.
            subtree1 = tree.get('child1')
            subtree2 = tree.get('child2')
            #TODO: Only simulate errors on portions of fragment that will become reads.
            add_pcr_errors(subtree1, '+', len(raw_frag_full), args.pcr_error,
                           args.indel_rate, args.ext_rate)
            add_pcr_errors(subtree2, '-', len(raw_frag_full), args.pcr_error,
                           args.indel_rate, args.ext_rate)
            apply_pcr_errors(tree, raw_frag_full)
            fragments = get_final_fragments(tree)
            add_mutation_lists(tree, fragments, [])

            # Step 4: Introduce sequencing errors.
            for fragment in fragments.values():
                for mutation in generate_mutations(args.read_len,
                                                   args.seq_error,
                                                   args.indel_rate,
                                                   args.ext_rate):
                    fragment['mutations'].append(mutation)
                    fragment['seq'] = apply_mutation(mutation, fragment['seq'])

            # Print barcodes to log file.
            if args.barcodes:
                args.barcodes.write('{}-{}\t{}\t{}\n'.format(
                    chrom, id_num, barcode1, barcode2_rc))
            # Print family.
            for frag_id in sorted(fragments.keys()):
                fragment = fragments[frag_id]
                read_id = '{}-{}-{}'.format(chrom, id_num, frag_id)
                # Print mutations to log file.
                if args.mutations:
                    read1_muts = get_mutations_subset(fragment['mutations'], 0,
                                                      args.read_len)
                    read2_muts = get_mutations_subset(fragment['mutations'],
                                                      0,
                                                      args.read_len,
                                                      revcomp=True,
                                                      seqlen=len(
                                                          fragment['seq']))
                    if fragment['strand'] == '-':
                        read1_muts, read2_muts = read2_muts, read1_muts
                    log_mutations(args.mutations, read1_muts, read_id + '/1',
                                  chrom, start, stop)
                    log_mutations(args.mutations, read2_muts, read_id + '/2',
                                  chrom, start, stop)
                frag_seq = fragment['seq']
                read1_seq = frag_seq[:args.read_len]
                # Clamp the start at 0: if the fragment is shorter than read_len, a negative
                # start index would count from the end and silently truncate mate 2 instead of
                # using the whole fragment.
                read2_start = max(0, len(frag_seq) - args.read_len)
                read2_seq = get_revcomp(frag_seq[read2_start:])
                if fragment['strand'] == '-':
                    read1_seq, read2_seq = read2_seq, read1_seq
                if args.out_format == 'fasta':
                    out1.write('>{}\n{}\n'.format(read_id, read1_seq))
                    out2.write('>{}\n{}\n'.format(read_id, read2_seq))
                elif args.out_format == 'fastq':
                    # FASTQ requires the quality line to be exactly as long as the sequence
                    # line, so size it from each read rather than from --read-len (reads can
                    # be shorter than that).
                    qual_line1 = fastq_qual * len(read1_seq)
                    qual_line2 = fastq_qual * len(read2_seq)
                    out1.write('@{}\n{}\n+\n{}\n'.format(
                        read_id, read1_seq, qual_line1))
                    out2.write('@{}\n{}\n+\n{}\n'.format(
                        read_id, read2_seq, qual_line2))

    finally:
        # Best-effort cleanup: the temporary file may never have been created.
        try:
            os.remove(tmpfile.name)
        except OSError:
            pass
Exemplo n.º 7
0
def fasta_to_fastq(fasta_file, fastq_file, qual_char):
    """Write every read parsed from `fasta_file` to `fastq_file` as a FASTQ record.

    Each base of each read is given the same quality character, `qual_char`, so the quality
    line is `qual_char` repeated once per base in the sequence.
    """
    record_template = '@{0}\n{1}\n+\n{2}\n'
    fasta_reads = getreads.getparser(fasta_file, filetype='fasta')
    for read in fasta_reads:
        uniform_quals = qual_char * len(read.seq)
        record = record_template.format(read.name, read.seq, uniform_quals)
        fastq_file.write(record)
Exemplo n.º 8
0
def find_and_write_chosen_reads(chosen_names, input_fastq, output_fastq):
    """Parse `input_fastq` as FASTQ, select reads via find_chosen_reads(), and write them out.

    The parser and `chosen_names` are passed to find_chosen_reads(); whatever it returns is
    handed to write_reads() together with `output_fastq`.
    """
    write_reads(
        find_chosen_reads(
            getreads.getparser(input_fastq, filetype='fastq'),
            chosen_names,
        ),
        output_fastq,
    )
Exemplo n.º 9
0
 def start_new_file(self, new_file):
     """Open `new_file`, remember it as self.current_file, and return a parser over it.

     The parser is built by getreads.getparser() using self.format as the file type.
     NOTE(review): any file previously held in self.current_file is not closed here -
     confirm the caller takes care of that.
     """
     self.current_file = open(new_file)
     reader = getreads.getparser(self.current_file, self.format)
     return reader.parser()
Exemplo n.º 10
0
def fasta_to_fastq(fasta_file, fastq_file, qual_char):
  """Convert the reads in `fasta_file` to FASTQ records written to `fastq_file`.

  FASTA has no quality scores, so every base is assigned `qual_char`: the quality line is that
  character repeated to the length of the read's sequence.
  """
  fastq_template = '@{0}\n{1}\n+\n{2}\n'
  parsed_reads = getreads.getparser(fasta_file, filetype='fasta')
  for read in parsed_reads:
    qual_line = qual_char * len(read.seq)
    fastq_file.write(fastq_template.format(read.name, read.seq, qual_line))
Exemplo n.º 11
0
def main(argv):
  """Simulate families of barcoded, paired-end reads from fragments of a reference.

  Steps, as implemented below:
    1. If a usable reference file is given, run wgsim to slice fragments out of it (written to
       --frag-file if given, otherwise to a temporary path that is removed at the end).
    2. For each fragment (up to --n-frags), attach random barcodes and the invariant linker to
       both ends and draw a family size from RAW_DISTRIBUTION.
    3. Build a PCR tree for the family and apply PCR errors to it.
    4. Apply sequencing errors to each final fragment.
    5. Write each read pair to out1/out2 (or both to stdout with --stdout), and optionally log
       the barcodes and mutations.

  Args:
    argv: The full argument list, including the program name at index 0 (it is skipped).

  Raises:
    AssertionError: If neither a reference nor --frag-file is given, or --fastq-qual is not a
      non-negative integer or a single character.
  """

  parser = argparse.ArgumentParser(description=DESCRIPTION)
  parser.set_defaults(**ARG_DEFAULTS)

  parser.add_argument('ref', metavar='ref.fa', nargs='?',
    help='Reference sequence. Omit if giving --frag-file.')
  parser.add_argument('out1', type=argparse.FileType('w'),
    help='Write final mate 1 reads to this file.')
  parser.add_argument('out2', type=argparse.FileType('w'),
    help='Write final mate 2 reads to this file.')
  parser.add_argument('-o', '--out-format', choices=('fastq', 'fasta'))
  parser.add_argument('--stdout', action='store_true',
    help='Print interleaved output reads to stdout.')
  parser.add_argument('-m', '--mutations', type=argparse.FileType('w'),
    help='Write a log of the PCR and sequencing errors introduced to this file. Will overwrite any '
         'existing file at this path.')
  parser.add_argument('-b', '--barcodes', type=argparse.FileType('w'),
    help='Write a log of which barcodes were ligated to which fragments. Will overwrite any '
         'existing file at this path.')
  parser.add_argument('--frag-file',
    help='The path of the FASTQ file of fragments. If --ref is given, these will be generated with '
         'wgsim and kept (normally a temporary file is used, then deleted). Note: the file will be '
         'overwritten! If --ref is not given, then this should be a file of already generated '
         'fragments, and they will be used instead of generating new ones.')
  parser.add_argument('-Q', '--fastq-qual',
    help='The quality score to assign to all bases in FASTQ output. Give a character or PHRED '
         'score (integer). A PHRED score will be converted using the Sanger offset (33). Default: '
         '"%(default)s"')
  # The seed is written to stderr below, so the help text says stderr (it used to say stdout).
  parser.add_argument('-S', '--seed', type=int,
    help='Random number generator seed. By default, a random, 32-bit seed will be generated and '
         'logged to stderr.')
  params = parser.add_argument_group('simulation parameters')
  params.add_argument('-n', '--n-frags', type=int,
    help='The number of original fragment molecules to simulate. The final number of reads will be '
         'this multiplied by the average number of reads per family. If you provide fragments with '
         '--frag-file, the script will still only read in the number specified here. Default: '
         '%(default)s')
  params.add_argument('-r', '--read-len', type=int,
    help='Default: %(default)s')
  params.add_argument('-f', '--frag-len', type=int,
    help='Default: %(default)s')
  params.add_argument('-s', '--seq-error', type=float,
    help='Sequencing error rate per base (0-1 proportion, not percent). Default: %(default)s')
  params.add_argument('-p', '--pcr-error', type=float,
    help='PCR error rate per base (0-1 proportion, not percent). Default: %(default)s')
  params.add_argument('-c', '--cycles', type=int,
    help='Number of PCR cycles to simulate. Default: %(default)s')
  params.add_argument('-i', '--indel-rate', type=float,
    help='Fraction of errors which are indels. Default: %(default)s')
  params.add_argument('-E', '--extension-rate', dest='ext_rate', type=float,
    help='Probability an indel is extended. Default: %(default)s')
  params.add_argument('-B', '--bar-len', type=int,
    help='Length of the barcodes to generate. Default: %(default)s')
  params.add_argument('-I', '--invariant',
    help='The invariant linker sequence between the barcode and sample sequence in each read. '
         'Default: %(default)s')

  # Parse and interpret arguments.
  args = parser.parse_args(argv[1:])
  # Validation uses explicit raises instead of `assert` statements, which are stripped when
  # Python runs with -O. AssertionError is kept so the failure type is unchanged for callers.
  if not (args.ref or args.frag_file):
    raise AssertionError('You must provide either a reference or fragments file.')
  if args.seed is None:
    seed = random.randint(0, 2**31-1)
    sys.stderr.write('seed: {}\n'.format(seed))
  else:
    seed = args.seed
  random.seed(seed)
  if args.stdout:
    out1 = sys.stdout
    out2 = sys.stdout
  else:
    out1 = args.out1
    out2 = args.out2

  # --fastq-qual has no argparse `type`, so a value from the command line is always a string.
  # Interpret a multi-digit string (e.g. "40") as the PHRED score the help text promises. A
  # single character (including a single digit) is still used literally, as before.
  qual_arg = args.fastq_qual
  if not isinstance(qual_arg, numbers.Integral):
    if not isinstance(qual_arg, basestring):
      raise AssertionError('--fastq-qual must be a positive integer or single character.')
    if len(qual_arg) > 1 and qual_arg.isdigit():
      try:
        qual_arg = int(qual_arg)
      except ValueError:
        # Digit-like characters int() can't parse: fall through to the length check below.
        pass
  if isinstance(qual_arg, numbers.Integral):
    if qual_arg < 0:
      raise AssertionError('--fastq-qual cannot be negative.')
    # Sanger encoding: PHRED score + 33.
    fastq_qual = chr(qual_arg + 33)
  else:
    if len(qual_arg) != 1:
      raise AssertionError('--fastq-qual cannot be more than a single character.')
    fastq_qual = qual_arg

  invariant_rc = get_revcomp(args.invariant)

  # Reserve a temporary file name for the fragments. Closing the NamedTemporaryFile deletes the
  # file itself; only its name is reused below. Then work inside a try so we can finally remove
  # the file no matter what exceptions are encountered.
  # NOTE(review): reusing the name of an already-deleted temp file is open to a race with other
  # processes creating that path - consider delete=False if that matters here.
  tmpfile = tempfile.NamedTemporaryFile(prefix='wgdsim.frags.')
  tmpfile.close()
  try:
    # Step 1: Use wgsim to create fragments from the reference.
    if args.frag_file:
      frag_path = args.frag_file
    else:
      frag_path = tmpfile.name
    if args.ref and os.path.isfile(args.ref) and os.path.getsize(args.ref):
      #TODO: Check exit status
      #TODO: Check for wgsim on the PATH.
      # Set error and mutation rates to 0 to just slice sequences out of the reference without
      # modification.
      run_command('wgsim', '-e', '0', '-r', '0', '-d', '0', '-R', args.indel_rate, '-S', seed,
                  '-N', args.n_frags, '-X', args.ext_rate, '-1', args.frag_len,
                  args.ref, frag_path, os.devnull)

    # NOTE: Coordinates here are 0-based (0 is the first base in the sequence).
    extended_dist = extend_dist(RAW_DISTRIBUTION)
    proportional_dist = compile_dist(extended_dist)
    # NOTE(review): frag_path is a path string; confirm getreads.getparser() accepts a path as
    # well as an open file object.
    raw_fragments = getreads.getparser(frag_path, filetype='fastq')
    for n_frags, raw_fragment in enumerate(raw_fragments, 1):
      # Only use the first --n-frags fragments, even if the file holds more.
      if n_frags > args.n_frags:
        break
      chrom, id_num, start, stop = parse_read_id(raw_fragment.id)
      barcode1 = get_rand_seq(args.bar_len)
      barcode2 = get_rand_seq(args.bar_len)
      barcode2_rc = get_revcomp(barcode2)
      raw_frag_full = barcode1 + args.invariant + raw_fragment.seq + invariant_rc + barcode2

      # Step 2: Determine how many reads to produce from each fragment.
      # - Use random.random() and divide the range 0-1 into segments of sizes proportional to
      #   the likelihood of each family size.
      # bisect.bisect() finds where an element belongs in a sorted list, returning the index.
      # proportional_dist is just such a sorted list, with values from 0 to 1.
      n_reads = bisect.bisect(proportional_dist, random.random())

      # Step 3: Introduce PCR errors.
      # - Determine the mutations and their frequencies.
      #   - Could get frequency from the cycle of PCR it occurs in.
      #     - Important to have PCR errors shared between reads.
      # - For each read, determine which mutations it contains.
      #   - Use random.random() < mut_freq.
      tree = get_good_pcr_tree(n_reads, args.cycles, 1000, max_diff=1)
      # Add errors to all children of original fragment.
      subtree1 = tree.get('child1')
      subtree2 = tree.get('child2')
      #TODO: Only simulate errors on portions of fragment that will become reads.
      add_pcr_errors(subtree1, '+', len(raw_frag_full), args.pcr_error, args.indel_rate, args.ext_rate)
      add_pcr_errors(subtree2, '-', len(raw_frag_full), args.pcr_error, args.indel_rate, args.ext_rate)
      apply_pcr_errors(tree, raw_frag_full)
      fragments = get_final_fragments(tree)
      add_mutation_lists(tree, fragments, [])

      # Step 4: Introduce sequencing errors.
      for fragment in fragments.values():
        for mutation in generate_mutations(args.read_len, args.seq_error, args.indel_rate,
                                           args.ext_rate):
          fragment['mutations'].append(mutation)
          fragment['seq'] = apply_mutation(mutation, fragment['seq'])

      # Print barcodes to log file.
      if args.barcodes:
        args.barcodes.write('{}-{}\t{}\t{}\n'.format(chrom, id_num, barcode1, barcode2_rc))
      # Print family.
      for frag_id in sorted(fragments):
        fragment = fragments[frag_id]
        read_id = '{}-{}-{}'.format(chrom, id_num, frag_id)
        # Print mutations to log file.
        if args.mutations:
          read1_muts = get_mutations_subset(fragment['mutations'], 0, args.read_len)
          read2_muts = get_mutations_subset(fragment['mutations'], 0, args.read_len, revcomp=True,
                                            seqlen=len(fragment['seq']))
          if fragment['strand'] == '-':
            read1_muts, read2_muts = read2_muts, read1_muts
          log_mutations(args.mutations, read1_muts, read_id+'/1', chrom, start, stop)
          log_mutations(args.mutations, read2_muts, read_id+'/2', chrom, start, stop)
        frag_seq = fragment['seq']
        read1_seq = frag_seq[:args.read_len]
        # Clamp the start at 0. When the fragment is shorter than the read length, a negative
        # start would slice from the end and keep only a suffix instead of the whole fragment.
        read2_start = max(0, len(frag_seq) - args.read_len)
        read2_seq = get_revcomp(frag_seq[read2_start:])
        if fragment['strand'] == '-':
          read1_seq, read2_seq = read2_seq, read1_seq
        if args.out_format == 'fasta':
          out1.write('>{}\n{}\n'.format(read_id, read1_seq))
          out2.write('>{}\n{}\n'.format(read_id, read2_seq))
        elif args.out_format == 'fastq':
          # Size the quality line to the actual read, which can be shorter than --read-len, so
          # the sequence and quality lines of each record always have the same length.
          out1.write('@{}\n{}\n+\n{}\n'.format(read_id, read1_seq, fastq_qual * len(read1_seq)))
          out2.write('@{}\n{}\n+\n{}\n'.format(read_id, read2_seq, fastq_qual * len(read2_seq)))

  finally:
    # Best-effort cleanup: the temporary path may never have been created (e.g. --frag-file).
    try:
      os.remove(tmpfile.name)
    except OSError:
      pass
Exemplo n.º 12
0
def fastq_to_fasta(fastq_file, fasta_file):
    """Write every read parsed from `fastq_file` to `fasta_file` as FASTA.

    Only the name and sequence of each read are written; quality values are
    not carried over.
    """
    fastq_reads = getreads.getparser(fastq_file, filetype='fastq')
    for record in fastq_reads:
        fasta_record = '>{0}\n{1}\n'.format(record.name, record.seq)
        fasta_file.write(fasta_record)