Python dedup 예제들, deduplicate.dedup Python 예제들

예제 #1

0

파일 보기

파일: test_dedup.py 프로젝트: nhoffman/deduplicate

    def test01(self):
        """Check that chunked and unchunked dedup select the same sequences.

        Reads whitespace-separated (name, sequence) pairs from the fixture
        file, runs ``dedup`` with and without ``chunksize``, and verifies
        that both results pick the same set of sequences and that the
        selected sequences are unique and stable under a second pass.
        """
        infile = 'testfiles/474_0_0.txt'
        # A context manager guarantees the fixture file handle is closed.
        with open(infile) as handle:
            _names, seqs = zip(
                *(line.split() for line in handle if line.strip()))

        d1 = dedup(seqs)
        d2 = dedup(seqs, chunksize=500)

        def as_strings(idx):
            """Map a collection of indices into ``seqs`` to a set of strings."""
            return set(seqs[i] for i in idx)

        # Compare by sequence content rather than by index: the two runs may
        # pick different indices for identical strings.
        self.assertEqual(as_strings(d1.keys()), as_strings(d2.keys()))

        parents = [seqs[i] for i in d1.keys()]
        # canonical sequences should all be unique
        self.assertEqual(len(parents), len(set(parents)))
        # deduplicating the canonical sequences again must not drop any
        dp = dedup(parents)
        self.assertEqual(len(dp), len(parents))

예제 #2

0

파일 보기

    def test01(self):
        """Verify that ``dedup`` gives equivalent results with and without chunking.

        The fixture file holds one whitespace-separated (name, sequence) pair
        per non-blank line. The sequences are deduplicated twice (default
        settings and ``chunksize=500``); both runs must select the same set
        of sequences, and the selected sequences must themselves be unique.
        """
        infile = 'testfiles/474_0_0.txt'
        # Close the fixture file deterministically instead of leaking the
        # handle until garbage collection.
        with open(infile) as fobj:
            rows = [line.split() for line in fobj if line.strip()]
        _names, seqs = zip(*rows)

        d1 = dedup(seqs)
        d2 = dedup(seqs, chunksize=500)

        def as_strings(idx):
            """Return the set of sequences addressed by the indices in ``idx``."""
            return set(seqs[i] for i in idx)

        # Indices can differ between runs when sequences are identical, so
        # the comparison is made on the strings they refer to.
        self.assertEqual(as_strings(d1.keys()), as_strings(d2.keys()))

        parents = [seqs[i] for i in d1.keys()]
        # canonical sequences should all be unique
        self.assertEqual(len(parents), len(set(parents)))
        # a second pass over the canonical sequences must keep all of them
        dp = dedup(parents)
        self.assertEqual(len(dp), len(parents))

예제 #3

0

파일 보기

def main():
    """Command-line entry point: deduplicate sequences from a fasta file.

    Parses command-line options, configures logging, reads the sequences
    named by -f/--fasta-file, runs ``deduplicate.dedup`` on them and logs
    the number of representative sequences together with the elapsed time.

    Exits with status 1 (after printing usage) when no input file is given.
    """

    usage = """%prog [options]

Read sequences in fasta format and identify a single sequence to
represent each set of substrings or identical sequences.
"""

    parser = OptionParser(usage=usage, version="$Id$")

    parser.set_defaults(
        infile=None,
        verbose=0,
        nchunks=0,
        chunksize=None,
        compare_type='contains',
        outfile=None,
        r_file=None,
        as_is=False,
        name_style='list',
    )

    parser.add_option("-f",
                      "--fasta-file",
                      dest="infile",
                      help='Input file containing sequences in fasta format',
                      metavar='FILE')

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      help='Output file in fasta format',
                      metavar='FILE')

    parser.add_option(
        "-r",
        "--r-output",
        dest="r_file",
        help='Output file containing an R-language list of character vectors',
        metavar='FILE')

    parser.add_option('-n',
                      '--nchunks',
                      dest='nchunks',
                      metavar='INT',
                      type='int',
                      help='Number of partitions [default: %default].')
    parser.add_option(
        '-c',
        '--chunksize',
        dest='chunksize',
        metavar='INT',
        type='int',
        help=
        'Number of strings in each partition (overrides --nchunks) [default: %default].'
    )
    parser.add_option(
        '-t',
        '--compare-type',
        dest='compare_type',
        metavar='VAL',
        type='choice',
        choices=['eq', 'contains'],
        help=('Type of comparison: "contains"'
              '(finds longest sequences containing sets of substrings)'
              ' or "eq" (selects single representatives from groups of'
              ' identical sequences) [default: %default].'))
    parser.add_option(
        "-a",
        "--as-is",
        action="store_true",
        dest="as_is",
        help='If True, do not degap input sequences [default: %default].')

    parser.add_option(
        '-N',
        '--name-style',
        dest='name_style',
        type='choice',
        choices=['list', 'count'],
        help=
        ('Naming style of sequences: '
         '"list" - name of canonical sequence representing each group followed by list of members,'
         ' or "count" - as above, with the count of group members appended to the seq name'
         ' [default: %default].'))

    parser.add_option(
        "-v",
        "--verbose",
        action="count",
        dest="verbose",
        help=
        "increase verbosity of screen output (eg, -v is verbose, -vv is more so)"
    )

    options, args = parser.parse_args()

    # Any verbosity above 2 falls back to DEBUG.
    loglevel = {
        0: logging.WARNING,
        1: logging.INFO,
        2: logging.DEBUG
    }.get(options.verbose, logging.DEBUG)

    verbose_format = '%(levelname)s %(funcName)s %(lineno)s %(message)s'
    logformat = {
        0: '%(message)s',
        1: verbose_format,
        2: verbose_format
    }.get(options.verbose, verbose_format)

    # __debug__ is True unless the interpreter runs with -O, so the verbose
    # format is what is normally used.
    if __debug__:
        logformat = verbose_format

    # set up logging; ``stream`` is the basicConfig keyword for directing
    # output to an open file object such as sys.stdout.
    logging.basicConfig(stream=sys.stdout, format=logformat, level=loglevel)

    comp = options.compare_type

    infile = options.infile
    if not infile:
        log.error('Please provide an input file using -f/--fasta-file\n')
        parser.print_usage()
        sys.exit(1)

    # Honour -a/--as-is: degap the input unless the user asked to keep it.
    seqs = readfasta(infile, degap=not options.as_is)

    strings = tuple(seq[1] for seq in seqs)
    nstrings = len(strings)
    log.warning('Input contains %s items', nstrings)

    start = time.time()

    # --chunksize takes precedence over --nchunks; with neither, everything
    # is processed as a single chunk.
    if options.chunksize:
        chunksize = options.chunksize
    elif options.nchunks and options.nchunks > 1:
        chunksize = nstrings // options.nchunks + 1
    else:
        chunksize = nstrings

    log.warning('chunksize = %s', chunksize)

    d = deduplicate.dedup(strings, comp, chunksize)

    # Avoid ZeroDivisionError when the input file contains no sequences.
    percent = 100 * (len(d) / float(nstrings)) if nstrings else 0.0
    log.warning(
        'Input data can be represented by %s superstrings (%.2f%% of the input number)',
        len(d), percent)
    log.warning('grand total is %.2f secs', time.time() - start)

예제 #4

0

파일 보기

파일: dedup.py 프로젝트: nhoffman/deduplicate

def main():
    """Run the dedup command-line tool.

    Builds the option parser, sets up logging according to -v/--verbose,
    reads sequences from the fasta file given with -f/--fasta-file, calls
    ``deduplicate.dedup`` and logs how many representative sequences were
    found and how long the run took.

    Prints usage and exits with status 1 if no input file is supplied.
    """

    usage = """%prog [options]

Read sequences in fasta format and identify a single sequence to
represent each set of substrings or identical sequences.
"""

    parser = OptionParser(usage=usage, version="$Id$")

    parser.set_defaults(
        infile=None,
        verbose=0,
        nchunks=0,
        chunksize=None,
        compare_type='contains',
        outfile=None,
        r_file=None,
        as_is=False,
        name_style='list',
    )

    parser.add_option("-f", "--fasta-file", dest="infile",
                      help='Input file containing sequences in fasta format',
                      metavar='FILE')

    parser.add_option("-o", "--outfile", dest="outfile",
                      help='Output file in fasta format',
                      metavar='FILE')

    parser.add_option("-r", "--r-output", dest="r_file",
                      help='Output file containing an R-language list of character vectors',
                      metavar='FILE')

    parser.add_option('-n', '--nchunks', dest='nchunks', metavar='INT', type='int',
                      help='Number of partitions [default: %default].')
    parser.add_option('-c', '--chunksize', dest='chunksize', metavar='INT',
                      type='int',
                      help='Number of strings in each partition (overrides --nchunks) [default: %default].')
    parser.add_option('-t', '--compare-type', dest='compare_type', metavar='VAL',
                      type='choice', choices=['eq', 'contains'],
                      help=('Type of comparison: "contains"'
                            '(finds longest sequences containing sets of substrings)'
                            ' or "eq" (selects single representatives from groups of'
                            ' identical sequences) [default: %default].'))
    parser.add_option("-a", "--as-is", action="store_true", dest="as_is",
                      help='If True, do not degap input sequences [default: %default].')

    parser.add_option('-N', '--name-style', dest='name_style',
                      type='choice', choices=['list', 'count'],
                      help=('Naming style of sequences: '
                            '"list" - name of canonical sequence representing each group followed by list of members,'
                            ' or "count" - as above, with the count of group members appended to the seq name'
                            ' [default: %default].'
                            ))

    parser.add_option("-v", "--verbose",
                      action="count", dest="verbose",
                      help="increase verbosity of screen output (eg, -v is verbose, -vv is more so)")

    options, args = parser.parse_args()

    # Verbosity counts above 2 are treated as DEBUG.
    loglevel = {0: logging.WARNING,
                1: logging.INFO,
                2: logging.DEBUG}.get(options.verbose, logging.DEBUG)

    verbose_format = '%(levelname)s %(funcName)s %(lineno)s %(message)s'
    logformat = {0: '%(message)s',
                 1: verbose_format,
                 2: verbose_format}.get(options.verbose, verbose_format)

    # __debug__ is only False under "python -O"; otherwise the verbose
    # format is always selected.
    if __debug__:
        logformat = verbose_format

    # set up logging; basicConfig takes the target file object as ``stream``.
    logging.basicConfig(stream=sys.stdout,
                        format=logformat,
                        level=loglevel)

    comp = options.compare_type

    infile = options.infile
    if not infile:
        log.error('Please provide an input file using -f/--fasta-file\n')
        parser.print_usage()
        sys.exit(1)

    # -a/--as-is disables degapping of the input sequences.
    seqs = readfasta(infile, degap=not options.as_is)

    strings = tuple(seq[1] for seq in seqs)
    nstrings = len(strings)
    log.warning('Input contains %s items', nstrings)

    start = time.time()

    # An explicit --chunksize wins over --nchunks; the default is one chunk
    # holding every string.
    if options.chunksize:
        chunksize = options.chunksize
    elif options.nchunks and options.nchunks > 1:
        chunksize = nstrings // options.nchunks + 1
    else:
        chunksize = nstrings

    log.warning('chunksize = %s', chunksize)

    d = deduplicate.dedup(strings, comp, chunksize)

    # Guard the ratio so an empty input does not raise ZeroDivisionError.
    percent = 100 * (len(d) / float(nstrings)) if nstrings else 0.0
    log.warning('Input data can be represented by %s superstrings (%.2f%% of the input number)',
                len(d), percent)
    log.warning('grand total is %.2f secs', time.time() - start)