Exemplo n.º 1
0
    # Command-line interface: one or more adaptor sequences are given as
    # positional arguments.
    parser = argparse.ArgumentParser(
        description=('Print a (Levenshtein) distance matrix for a set of '
                     'known adaptors'))

    parser.add_argument(
        'adaptors', nargs='+', metavar='adaptor',
        help='the set of adaptors that were used in sequencing')

    args = parser.parse_args()

    adaptors = args.adaptors
    nAdaptors = len(adaptors)
    # The number of header rows is taken from the first adaptor only.
    # NOTE(review): adaptor[i] below raises IndexError for any adaptor that
    # is shorter than the first one - presumably all adaptors are expected to
    # have the same length; confirm.
    length = len(adaptors[0])
    # Left margin as wide as the first adaptor, so the header columns start
    # to the right of the row labels printed in the matrix below.
    spaces = ' ' * length

    # Column headers: each adaptor is written vertically, one character per
    # output row, with one column per adaptor.
    for i in range(length):
        print(spaces, end=' ')
        for adaptor in adaptors:
            print(adaptor[i], end=' ')
        print()

    # Matrix rows: each row is labelled with its adaptor. Only cells with
    # j >= i are filled in with a distance; cells with j < i are left blank.
    for i in range(nAdaptors):
        print(adaptors[i], end=' ')
        for j in range(nAdaptors):
            if j < i:
                print(' ', end=' ')
            else:
                print(levenshtein(adaptors[i], adaptors[j]), end=' ')
        print()
def splitFASTAByAdaptor(knownAdaptors, adaptorLen, adaptorOffset,
                        maximumDistance, outputPrefix, dryRun, verbose):
    """
    Read FASTA records from standard input and split them into one FASTA
    file per adaptor class.

    The adaptor of a read is the upper-cased C{adaptorLen} characters of its
    sequence starting at C{adaptorOffset}. An adaptor that is not in
    C{knownAdaptors} is assigned to its nearest known adaptor (by
    Levenshtein distance), unless that nearest adaptor is more than
    C{maximumDistance} away or is tied with another known adaptor, in which
    case the read is put into the C{UNKNOWN} class. Reads are written with
    everything up to and including the adaptor removed. If standard input
    contains no reads, nothing is printed or written.

    @param knownAdaptors: A C{set} of expected adaptor sequences.
    @param adaptorLen: The C{int} length of each adaptor sequence.
    @param adaptorOffset: The zero-based C{int} offset of the adaptor in
        each sequence.
    @param maximumDistance: The maximum distance an unknown adaptor will be
        mapped to in an attempt to find its nearest known adaptor.
    @param outputPrefix: A C{str} prefix that should be used in the file names
        that are written out.
    @param dryRun: A C{bool}, if C{True} only print what would be done, don't
        create any new FASTA files.
    @param verbose: A C{bool}, if C{True} output additional information about
        adaptor classes found and assigned.
    @return: C{None}.
    """
    def adaptorOf(record):
        # The (upper-cased) adaptor region of a read.
        return str(record.seq)[adaptorOffset:][:adaptorLen].upper()

    adaptors = defaultdict(int)
    classes = dict(zip(knownAdaptors, knownAdaptors))
    reads = []

    for seq in SeqIO.parse(sys.stdin, 'fasta'):
        reads.append(seq)
        adaptors[adaptorOf(seq)] += 1

    if not reads:
        # No input: there are no groups to measure or write. Without this
        # guard the max() over the group sizes below would raise ValueError.
        return

    # Process adaptors from most to least frequently seen.
    order = sorted(adaptors, key=adaptors.get, reverse=True)

    for adaptor in order:
        if adaptor in knownAdaptors:
            if verbose:
                print('%s: %s. Known adaptor' % (adaptor, adaptors[adaptor]))
            continue

        distances = sorted((levenshtein(adaptor, known), known)
                           for known in knownAdaptors)
        # Treat the read as unclassifiable if it's too far from its
        # nearest neighbor or if its nearest neighbor is ambiguous.
        nearest = distances[0][0]
        ambiguous = len(knownAdaptors) > 1 and nearest == distances[1][0]
        if nearest > maximumDistance or ambiguous:
            classes[adaptor] = UNKNOWN
            if verbose:
                print(
                    '%s: %s. Unknown, distances %r' %
                    (adaptor, adaptors[adaptor], [d[0] for d in distances]))
        else:
            correctedAdaptor = distances[0][1]
            classes[adaptor] = correctedAdaptor
            if verbose:
                print('%s: %s. Assigned to class %s, at dist %d' %
                      (adaptor, adaptors[adaptor], correctedAdaptor,
                       nearest))

    # Collect reads into classes, dropping the prefix and the adaptor.
    readGroups = defaultdict(list)
    for read in reads:
        readGroups[classes[adaptorOf(read)]].append(
            read[adaptorOffset + adaptorLen:])

    # The number of digits in the size of the biggest read group, so we can
    # nicely align the output. Counting the characters of the decimal
    # representation is exact, whereas ceil(log10(n)) is one too small when
    # n is an exact power of ten (e.g., 10 or 100).
    width = len(str(max(len(group) for group in readGroups.values())))

    # The width of the count of files we'll write, so file names have zero
    # padded numeric prefixes.
    filesWidth = len(str(len(readGroups)))

    # Write out the FASTA files for each adaptor class (this includes the
    # unclassifiable reads if any unknown adaptors were found).
    for count, adaptor in enumerate(sorted(readGroups), start=1):
        group = readGroups[adaptor]
        filename = '%s%0*d-%s.fasta' % (outputPrefix, filesWidth, count,
                                        adaptor)
        description = ('unrecognized adaptors'
                       if adaptor == UNKNOWN else 'adaptor %s' % adaptor)
        if dryRun:
            print('Would write %*d sequences for %s to %s' %
                  (width, len(group), description, filename))
        else:
            with open(filename, 'w') as fp:
                SeqIO.write(group, fp, 'fasta')
            print('Wrote %*d sequences for %s to %s' %
                  (width, len(group), description, filename))
Exemplo n.º 3
0
def splitFASTAByAdaptor(knownAdaptors, adaptorLen, adaptorOffset,
                        maximumDistance, outputPrefix, dryRun, verbose):
    """
    Read FASTA records from standard input and split them into one FASTA
    file per adaptor class.

    The adaptor of a read is the upper-cased C{adaptorLen} characters of its
    sequence starting at C{adaptorOffset}. An adaptor that is not in
    C{knownAdaptors} is assigned to its nearest known adaptor (by
    Levenshtein distance), unless that nearest adaptor is more than
    C{maximumDistance} away or is tied with another known adaptor, in which
    case the read is put into the C{UNKNOWN} class. Reads are written with
    everything up to and including the adaptor removed. If standard input
    contains no reads, nothing is printed or written.

    @param knownAdaptors: A C{set} of expected adaptor sequences.
    @param adaptorLen: The C{int} length of each adaptor sequence.
    @param adaptorOffset: The zero-based C{int} offset of the adaptor in
        each sequence.
    @param maximumDistance: The maximum distance an unknown adaptor will be
        mapped to in an attempt to find its nearest known adaptor.
    @param outputPrefix: A C{str} prefix that should be used in the file names
        that are written out.
    @param dryRun: A C{bool}, if C{True} only print what would be done, don't
        create any new FASTA files.
    @param verbose: A C{bool}, if C{True} output additional information about
        adaptor classes found and assigned.
    @return: C{None}.
    """
    def adaptorOf(record):
        # The (upper-cased) adaptor region of a read.
        return str(record.seq)[adaptorOffset:][:adaptorLen].upper()

    adaptors = defaultdict(int)
    classes = dict(zip(knownAdaptors, knownAdaptors))
    reads = []

    for seq in SeqIO.parse(sys.stdin, 'fasta'):
        reads.append(seq)
        adaptors[adaptorOf(seq)] += 1

    if not reads:
        # No input: there are no groups to measure or write. Without this
        # guard the max() over the group sizes below would raise ValueError.
        return

    # Process adaptors from most to least frequently seen.
    order = sorted(adaptors, key=adaptors.get, reverse=True)

    for adaptor in order:
        if adaptor in knownAdaptors:
            if verbose:
                print('%s: %s. Known adaptor' % (adaptor, adaptors[adaptor]))
            continue

        distances = sorted((levenshtein(adaptor, known), known) for
                           known in knownAdaptors)
        # Treat the read as unclassifiable if it's too far from its
        # nearest neighbor or if its nearest neighbor is ambiguous.
        nearest = distances[0][0]
        ambiguous = len(knownAdaptors) > 1 and nearest == distances[1][0]
        if nearest > maximumDistance or ambiguous:
            classes[adaptor] = UNKNOWN
            if verbose:
                print('%s: %s. Unknown, distances %r' % (
                    adaptor, adaptors[adaptor], [d[0] for d in distances]))
        else:
            correctedAdaptor = distances[0][1]
            classes[adaptor] = correctedAdaptor
            if verbose:
                print('%s: %s. Assigned to class %s, at dist %d' % (
                    adaptor, adaptors[adaptor], correctedAdaptor,
                    nearest))

    # Collect reads into classes, dropping the prefix and the adaptor.
    readGroups = defaultdict(list)
    for read in reads:
        readGroups[classes[adaptorOf(read)]].append(
            read[adaptorOffset + adaptorLen:])

    # The number of digits in the size of the biggest read group, so we can
    # nicely align the output. Counting the characters of the decimal
    # representation is exact, whereas ceil(log10(n)) is one too small when
    # n is an exact power of ten (e.g., 10 or 100).
    width = len(str(max(len(group) for group in readGroups.values())))

    # The width of the count of files we'll write, so file names have zero
    # padded numeric prefixes.
    filesWidth = len(str(len(readGroups)))

    # Write out the FASTA files for each adaptor class (this includes the
    # unclassifiable reads if any unknown adaptors were found).
    for count, adaptor in enumerate(sorted(readGroups), start=1):
        group = readGroups[adaptor]
        filename = '%s%0*d-%s.fasta' % (outputPrefix, filesWidth, count,
                                        adaptor)
        description = ('unrecognized adaptors' if adaptor == UNKNOWN
                       else 'adaptor %s' % adaptor)
        if dryRun:
            print('Would write %*d sequences for %s to %s' % (
                width, len(group), description, filename))
        else:
            with open(filename, 'w') as fp:
                SeqIO.write(group, fp, 'fasta')
            print('Wrote %*d sequences for %s to %s' % (
                width, len(group), description, filename))
Exemplo n.º 4
0
 def testIdentical(self):
     """
     The distance between a string and an identical copy of it is zero.
     """
     distance = levenshtein('BLAH', 'BLAH')
     self.assertEqual(0, distance)
    # Command-line interface: one or more adaptor sequences are given as
    # positional arguments.
    parser = argparse.ArgumentParser(
        description=('Print a (Levenshtein) distance matrix for a set of '
                     'known adaptors'))

    parser.add_argument(
        'adaptors', type=str, nargs='+', metavar='adaptor',
        help='the set of adaptors that were used in sequencing')

    args = parser.parse_args()

    adaptors = args.adaptors
    nAdaptors = len(adaptors)
    # The number of header rows is taken from the first adaptor only.
    # NOTE(review): adaptor[i] below raises IndexError for any adaptor that
    # is shorter than the first one - presumably all adaptors are expected to
    # have the same length; confirm.
    length = len(adaptors[0])
    # Left margin as wide as the first adaptor, so the header columns start
    # to the right of the row labels printed in the matrix below.
    spaces = ' ' * length

    # Column headers: each adaptor is written vertically, one character per
    # output row, with one column per adaptor. This snippet uses Python 2
    # print statements; the trailing comma suppresses the newline.
    for i in xrange(length):
        print spaces,
        for adaptor in adaptors:
            print adaptor[i],
        print

    # Matrix rows: each row is labelled with its adaptor. Only cells with
    # j >= i are filled in with a distance; cells with j < i are left blank.
    for i in xrange(nAdaptors):
        print adaptors[i],
        for j in xrange(nAdaptors):
            if j < i:
                print ' ',
            else:
                print levenshtein(adaptors[i], adaptors[j]),
        print
Exemplo n.º 6
0
 def testInsert(self):
     """
     Two strings that differ by an insertion must be at distance 2.
     """
     distance = levenshtein('AGTACACACTG', 'ACGTACACACT')
     self.assertEqual(2, distance)
Exemplo n.º 7
0
 def testMutation(self):
     """
     Two strings that differ in a single character must be at distance 1.
     """
     distance = levenshtein('ACGTACACACG', 'ACGTACACACT')
     self.assertEqual(1, distance)
Exemplo n.º 8
0
 def testInsert(self):
     """
     Two strings that differ by an insertion must be at distance 2.
     """
     original = 'AGTACACACTG'
     withInsertion = 'ACGTACACACT'
     self.assertEqual(2, levenshtein(original, withInsertion))
Exemplo n.º 9
0
 def testMutation(self):
     """
     Two strings that differ in a single character must be at distance 1.
     """
     original = 'ACGTACACACG'
     mutated = 'ACGTACACACT'
     self.assertEqual(1, levenshtein(original, mutated))
Exemplo n.º 10
0
 def testIdentical(self):
     """
     The distance between a string and an identical copy of it is zero.
     """
     first = 'BLAH'
     second = 'BLAH'
     self.assertEqual(0, levenshtein(first, second))