예제 #1
0
def align_family(family, mate, stats):
    """Align one mate's reads across a family and apply the same gaps to their qualities.

    Args:
        family: Sequence of read-pair dicts; each must have a "qual" + mate key.
        mate: Which mate to align, 1 or 2 (int or str).
        stats: Mapping with "time", "pairs", and "runs" counters. It is updated
            in place, but only when the family has more than one read pair.

    Returns:
        None if make_msa() returns None. Otherwise a list of dicts, one per
        aligned read, with the keys "name", "seq", and "qual".
    """
    mate = str(mate)
    assert mate in ("1", "2")
    began = time.time()
    # Multiple sequence alignment of the reads themselves.
    msa = make_msa(family, mate)
    if msa is None:
        return None
    # Insert the alignment's gaps into the raw quality strings.
    gapped_seqs = [entry["seq"] for entry in msa]
    raw_quals = [read_pair["qual" + mate] for read_pair in family]
    gapped_quals = seqtools.transfer_gaps_multi(raw_quals, gapped_seqs, gap_char_out=" ")
    # Bundle each aligned read with its aligned quality string.
    alignment = [
        {"name": entry["name"], "seq": entry["seq"], "qual": gapped_qual}
        for entry, gapped_qual in zip(msa, gapped_quals)
    ]
    elapsed = time.time() - began
    num_pairs = len(family)
    logging.info("{} sec for {} read pairs.".format(elapsed, num_pairs))
    if num_pairs <= 1:
        # Families of at most one pair are not counted in the timing stats.
        return alignment
    stats["time"] += elapsed
    stats["pairs"] += num_pairs
    stats["runs"] += 1
    return alignment
예제 #2
0
def align_family(family, mate, aligner='mafft'):
    """Align one mate's reads across a family and apply the same gaps to their qualities.

    Args:
        family: Sequence of read-pair dicts with 'name', 'seq', and 'qual' keys
            suffixed by the mate number (e.g. 'seq1').
        mate: Which mate to align, 1 or 2 (int or str).
        aligner: Passed through to make_msa() as its `aligner` keyword.

    Returns:
        None for an empty family. Otherwise a list of dicts, one per read pair,
        with the keys 'name', 'seq', and 'qual'.
    """
    mate = str(mate)
    assert mate in ('1', '2')
    num_pairs = len(family)
    if num_pairs == 0:
        return None
    if num_pairs == 1:
        # A lone read pair needs no alignment (and MAFFT won't accept it).
        gapped_seqs = [family[0]['seq' + mate]]
    else:
        gapped_seqs = make_msa(family, mate, aligner=aligner)
    # Carry the alignment's gaps over to this mate's raw quality strings.
    raw_quals = [read_pair['qual' + mate] for read_pair in family]
    gapped_quals = seqtools.transfer_gaps_multi(raw_quals,
                                                gapped_seqs,
                                                gap_char_out=' ')
    # Bundle name, aligned sequence, and aligned quality for each pair.
    return [
        {
            'name': read_pair['name' + mate],
            'seq': gapped_seq,
            'qual': gapped_qual
        }
        for read_pair, gapped_seq, gapped_qual in zip(family, gapped_seqs,
                                                      gapped_quals)
    ]
예제 #3
0
파일: errstats.py 프로젝트: pughlab/dunovo
def realign_family_to_consensus(consensus, family, quals, validate=False):
  """Re-align a family's sequences together with their consensus using kalign.

  Existing '-' gap characters are stripped from each family sequence before
  alignment. The consensus is passed to the aligner as given.

  Args:
    consensus: The consensus sequence; it is placed first in the aligner input.
    family: Iterable of (possibly gapped) family sequences.
    quals: Quality strings for the family sequences; the gaps of the new
      alignment are transferred onto these, using ' ' as the gap character.
    validate: If true, check that each aligned sequence, with gaps removed,
      equals the input sequence at the same position.

  Returns:
    A tuple (aligned_consensus, aligned_family, aligned_quals).

  Raises:
    AssertionError: If `validate` is true and an output sequence does not match
      its input. This is raised explicitly instead of using an `assert`
      statement, so the check still runs under `python -O`.
  """
  unaligned_seqs = [consensus]
  unaligned_seqs.extend(seq.replace('-', '') for seq in family)
  aligned_seqs = kalign.align(unaligned_seqs)
  if validate:
    # Compare pairwise by position: a mismatch means the output order (or
    # content) differs from the input.
    for unaligned, aligned in zip(unaligned_seqs, aligned_seqs):
      if unaligned != aligned.replace('-', ''):
        raise AssertionError(
          "Kalign may have returned sequences in a different order than they were given. Failed on "
          "sequence: {}".format(unaligned)
        )
  # The consensus went in first, so it comes out first.
  aligned_consensus = aligned_seqs[0]
  aligned_family = aligned_seqs[1:]
  aligned_quals = seqtools.transfer_gaps_multi(quals, aligned_family, gap_char_out=' ')
  return aligned_consensus, aligned_family, aligned_quals
예제 #4
0
def align_family(family, mate):
  """Align one mate's reads across a family and apply the same gaps to their qualities.

  Args:
    family: Sequence of read-pair dicts; each must have a 'qual' + mate key.
    mate: Which mate to align, 1 or 2 (int or str).

  Returns:
    None if make_msa() returns None. Otherwise a list of dicts, one per aligned
    read, with the keys 'name', 'seq', and 'qual'.
  """
  mate = str(mate)
  assert mate in ('1', '2')
  # Multiple sequence alignment of the reads themselves.
  msa = make_msa(family, mate)
  if msa is None:
    return None
  # Pull the gapped sequences out of the alignment (mafft output) and the raw
  # quality strings out of the family, then copy the gaps across.
  gapped_seqs = [entry['seq'] for entry in msa]
  raw_quals = [read_pair['qual'+mate] for read_pair in family]
  gapped_quals = seqtools.transfer_gaps_multi(raw_quals, gapped_seqs, gap_char_out=' ')
  # Bundle each aligned read with its aligned quality string.
  return [
    {'name':entry['name'], 'seq':entry['seq'], 'qual':gapped_qual}
    for entry, gapped_qual in zip(msa, gapped_quals)
  ]
예제 #5
0
def align_family(family, mate, aligner='mafft'):
  """Align one mate's reads across a family and apply the same gaps to their qualities.

  Args:
    family: Sequence of read-pair dicts with 'name', 'seq', and 'qual' keys
      suffixed by the mate number (e.g. 'seq1').
    mate: Which mate to align, 1 or 2 (int or str).
    aligner: Passed through to make_msa() as its `aligner` keyword.

  Returns:
    None for an empty family. Otherwise a list of dicts, one per read pair,
    with the keys 'name', 'seq', and 'qual'.
  """
  mate = str(mate)
  assert mate in ('1', '2')
  num_pairs = len(family)
  if num_pairs == 0:
    return None
  if num_pairs == 1:
    # A lone read pair needs no alignment (and MAFFT won't accept it).
    gapped_seqs = [family[0]['seq'+mate]]
  else:
    gapped_seqs = make_msa(family, mate, aligner=aligner)
  # Carry the alignment's gaps over to this mate's raw quality strings.
  raw_quals = [read_pair['qual'+mate] for read_pair in family]
  gapped_quals = seqtools.transfer_gaps_multi(raw_quals, gapped_seqs, gap_char_out=' ')
  # Bundle name, aligned sequence, and aligned quality for each pair.
  return [
    {'name':read_pair['name'+mate], 'seq':gapped_seq, 'qual':gapped_qual}
    for read_pair, gapped_seq, gapped_qual in zip(family, gapped_seqs, gapped_quals)
  ]
예제 #6
0
def main(argv):
  """Command-line entry point: align the input sequences and print the formatted result.

  Sequences come either from positional arguments or from --input (a file, or
  "-" for stdin). They are aligned with make_msa(); if quality strings were
  read (duplex format), the alignment's gaps are transferred onto them. A
  consensus is then computed and the lines produced by format_alignment() are
  printed to stdout, one per line.

  Args:
    argv: The full argument vector; argv[0] is skipped when parsing.
  """

  parser = argparse.ArgumentParser(description=DESCRIPTION)
  parser.set_defaults(**OPT_DEFAULTS)

  parser.add_argument('seqs', metavar='sequence', nargs='*',
    help='The alignment.')
  parser.add_argument('-i', '--input',
    help='Provide the sequences in this input file instead of as command-line arguments. '
         'Give "-" to use stdin.')
  parser.add_argument('-f', '--format', choices=('plain', 'duplex'),
    help='Input format. "plain" is a simple list of the sequences, one on each line. "duplex" is '
         'the 8-column format of the family-sorted read data from the duplex pipeline. It must be '
         'the read pairs from a single alpha/beta barcode combination (both the alpha-beta and '
         'beta-alpha strands). If "duplex" is given, you must also specify which of the four '
         'possible alignments to output with --mate and --order.')
  parser.add_argument('-m', '--mate', type=int, choices=(1, 2))
  parser.add_argument('-o', '--order', choices=('ab', 'ba'))
  parser.add_argument('-F', '--qual-format', choices=('sanger',))
  parser.add_argument('-q', '--qual', type=int,
    help='Quality threshold: Default: %(default)s')

  args = parser.parse_args(argv[1:])

  # Convert the numeric quality threshold into its character encoding.
  qual_thres = ' '
  if args.qual_format == 'sanger':
    # This branch adds an offset of 33 to the numeric threshold.
    qual_thres = chr(args.qual + 33)
  else:
    fail('Error: Unsupported FASTQ quality format "{}".'.format(args.qual_format))
  # Check arguments.
  if not (args.seqs or args.input):
    fail('Error: You must provide sequences either in a file with --input or as arguments.')
  elif args.seqs and args.input:
    fail('Error: You cannot provide sequences in both a file and command-line arguments.')
  if args.format == 'duplex' and not (args.mate and args.order):
    fail('Error: If the --format is duplex, you must specify a --mate and --order.')

  # Read input.
  # NOTE(review): if --input is given and args.format is neither 'plain' nor
  # 'duplex' (e.g. no default is supplied by OPT_DEFAULTS), `seqs` is never
  # bound and make_msa(seqs) below raises NameError -- confirm the default.
  quals = []
  if args.input:
    if args.format == 'plain':
      if args.input == '-':
        seqs = [line.strip() for line in sys.stdin]
      else:
        with open(args.input) as infile:
          seqs = [line.strip() for line in infile]
    elif args.format == 'duplex':
      if args.input == '-':
        (seqs, quals) = parse_duplex(sys.stdin, args.mate, args.order)
      else:
        with open(args.input) as infile:
          (seqs, quals) = parse_duplex(infile, args.mate, args.order)
  else:
    seqs = args.seqs

  align = make_msa(seqs)
  if quals:
    # Only the duplex format supplies quality strings; gap them to match.
    quals = seqtools.transfer_gaps_multi(quals, align, gap_char_out=' ')
  cons = consensus.get_consensus(align, quals, qual_thres=qual_thres, gapped=True)

  output = format_alignment(cons, align, quals, qual_thres=ord(qual_thres))

  for seq in output:
    # Parenthesized single-argument form prints identically on Python 2 and
    # is valid syntax on Python 3 (the bare print statement is not).
    print(seq)