Пример #1
0
    def test04(self):
        """
        Parse the bz2-compressed 'rle_100_left' ssearch output and check
        the number of distinct queries and the total number of alignments
        """
        with BZ2File(self.data('rle_100_left.ssearch.bz2')) as handle:
            records = list(sequtils.parse_ssearch36(handle))

            # 100 total sequences
            query_names = set(rec['q_name'] for rec in records)
            self.assertEqual(len(query_names), 100)

            # searched against 4 primers
            self.assertEqual(len(records), 400)
Пример #2
0
def action(args):
    """
    Filter parsed ssearch36 alignments and write them to ``args.out`` as CSV.

    Attributes read from ``args``:

    * ``alignments`` - input handed to ``parse_ssearch36``
    * ``limit`` - maximum number of parsed alignments to consider
    * ``min_zscore`` - if truthy, drop rows whose ``sw_zscore`` is lower
    * ``top_alignment`` - keep only the first row of each consecutive run
      of rows sharing a ``q_name``
    * ``rlefile`` - if set, an iterable of dicts used to decode ``t_seq``
      and ``q_seq`` via ``homodecodealignment``
    * ``print_one`` - pretty-print the first alignment (if any) and exit
    * ``with_diff`` - pass every row through ``add_diff``
    * ``fieldnames`` - output columns; taken from the first row if unset
    * ``extra_fields`` - parsed with ``parse_extras`` and added to every row
    * ``header`` - write a header row
    * ``out`` - writable file object receiving the CSV

    Raises SystemExit (via ``sys.exit``) when ``args.print_one`` is set.
    """
    extras = parse_extras(args.extra_fields) if args.extra_fields else {}

    aligns = islice(parse_ssearch36(args.alignments, False), args.limit)

    if args.min_zscore:
        aligns = (a for a in aligns
                  if float(a['sw_zscore']) >= args.min_zscore)
    aligns = groupby(aligns, key=itemgetter('q_name'))

    if args.top_alignment:
        # groupby never yields an empty group, so next() cannot be exhausted
        aligns = (next(a) for _, a in aligns)
    else:
        aligns = (a for _, i in aligns for a in i)  # flatten groupby iters

    if args.rlefile:
        decoding = {k: v for d in args.rlefile for k, v in d.items()}

        def decode(aligns):
            aligns['t_seq'], aligns['q_seq'] = homodecodealignment(
                aligns['t_seq'], from_ascii(decoding[aligns['t_name']]),
                aligns['q_seq'], from_ascii(decoding[aligns['q_name']]))
            return aligns

        # a generator expression is lazy like imap, without the py2-only name
        aligns = (decode(a) for a in aligns)

    if args.print_one:
        # builtin next() works on any iterator; the default avoids leaking
        # a StopIteration when there are no alignments at all
        first = next(aligns, None)
        if first is not None:
            pprint.pprint(first)
        sys.exit()

    if args.with_diff:
        aligns = (add_diff(a) for a in aligns)

    if args.fieldnames:
        # copy so that appending the extras below never mutates args
        fieldnames = list(args.fieldnames)
    else:
        # peek at first row fieldnames
        top = next(aligns, {})
        fieldnames = list(top)
        if top:
            # only put back a real row; chaining the empty placeholder
            # would emit a spurious blank line in the output
            aligns = chain([top], aligns)

    if extras:
        fieldnames = fieldnames + list(extras)
        aligns = (dict(d, **extras) for d in aligns)

    writer = csv.DictWriter(args.out,
                            extrasaction='ignore',
                            fieldnames=fieldnames)

    if args.header:
        writer.writeheader()

    for a in aligns:
        writer.writerow(a)
Пример #3
0
    def test03(self):
        """
        Without an output name the alignment file is removed on exit
        """

        query = self.data('two.fasta')
        target = self.data('ten.fasta')
        with sequtils.run_ssearch(query, target) as outfile:
            q_names = set()
            for row in sequtils.parse_ssearch36(outfile):
                q_names.add(row['q_name'])
            self.assertEqual({'H59735', 'T70875'}, q_names)

        # the output file must be gone once the context has exited
        self.assertFalse(path.exists(outfile.name))
Пример #4
0
    def test02(self):
        """
        Without an output name but with cleanup=False the file is kept
        """

        query = self.data('two.fasta')
        target = self.data('ten.fasta')
        with sequtils.run_ssearch(query, target, cleanup=False) as outfile:
            q_names = set()
            for row in sequtils.parse_ssearch36(outfile):
                q_names.add(row['q_name'])
            self.assertEqual({'H59735', 'T70875'}, q_names)

        # 'cleanup=False' was requested, so the file must survive the context
        self.assertTrue(path.exists(outfile.name))
Пример #5
0
    def test01(self):
        """
        With an explicit output name the file is written there and kept
        """

        query = self.data('two.fasta')
        target = self.data('ten.fasta')
        destination = path.join(self.mkoutdir(), 'aligns.ssearch')
        with sequtils.run_ssearch(query, target, destination) as outfile:
            self.assertEqual(destination, outfile.name)
            q_names = set()
            for row in sequtils.parse_ssearch36(outfile):
                q_names.add(row['q_name'])
            self.assertEqual({'H59735', 'T70875'}, q_names)

        # a caller-supplied output name is never cleaned up
        self.assertTrue(path.exists(destination))
Пример #6
0
def action(args):
    """
    Run ``ssearch36`` on ``args.query`` against ``args.library`` and write
    the parsed alignments, plus a ``qcovs`` column, to ``args.out`` as CSV.

    Attributes read from ``args``:

    * ``query``, ``library`` - input file paths; if either file is empty
      ssearch36 is not executed and at most a header row is written
    * ``statistical_calculation``, ``gap_extension_penalty``,
      ``gap_open_penalty``, ``threads`` - passed as ``-z``, ``-g``, ``-f``
      and ``-T`` respectively
    * ``strand`` - ``"forward"`` adds ``-3``
    * ``full_sequences`` - adds ``-a``
    * ``all_alignments`` - when false, adds ``-b 1 -d 1``
    * ``min_zscore`` - rows with a lower ``sw_zscore`` are dropped
    * ``decode`` - if set, an iterable of dicts used to decode ``t_seq``
      and ``q_seq`` via ``homodecodealignment``
    * ``fieldnames`` - output columns; taken from the first row if unset
    * ``header`` - write a header row
    * ``out`` - writable file object receiving the CSV

    Raises CalledProcessError if ssearch36 exits with a non-zero status.
    """
    # setup ssearch command and communicate
    command = ["ssearch36"]
    command += ["-m", "10"]  # 10 is parseable key:value output format
    command += ["-n"]  # DNA/RNA query
    command += ["-z", args.statistical_calculation]
    command += ["-g", args.gap_extension_penalty]
    command += ["-f", args.gap_open_penalty]
    command += ["-T", str(args.threads)]

    if args.strand == "forward":
        command += ["-3"]  # forward strand only

    if args.full_sequences:
        command += ["-a"]

    if not args.all_alignments:
        command += ["-b", "1"]
        command += ["-d", "1"]

    command += [args.query, args.library]

    # If query or library file is empty, don't bother executing ssearch.
    # Just print empty file
    # with a header and exit
    if os.stat(args.query).st_size == 0 or os.stat(args.library).st_size == 0:
        # Bind unconditionally: without explicit fieldnames there is no row
        # to infer columns from, so nothing at all is written.
        fieldnames = args.fieldnames
        if fieldnames:
            writer = DictWriter(args.out, extrasaction="ignore", fieldnames=fieldnames)
            if args.header:
                writer.writeheader()
        return

    log.info(" ".join(command))
    ssearch = Popen(command, stdout=PIPE, stderr=PIPE)

    # parse alignments
    aligns = parse_ssearch36(ssearch.stdout)
    aligns = (a for a in aligns if float(a["sw_zscore"]) >= args.min_zscore)
    aligns = groupby(aligns, key=itemgetter("q_name"))
    aligns = (a for _, i in aligns for a in i)  # flatten groupby iters

    # decode if appropriate
    if args.decode:
        decoding = {k: v for d in args.decode for k, v in d.items()}

        def decode(aligns):
            aligns["t_seq"], aligns["q_seq"] = homodecodealignment(
                aligns["t_seq"],
                from_ascii(decoding[aligns["t_name"]]),
                aligns["q_seq"],
                from_ascii(decoding[aligns["q_name"]]),
            )
            return aligns

        # a generator expression is lazy like imap, without the py2-only name
        aligns = (decode(a) for a in aligns)

    # calculate coverage for each item and repack into generator
    # coverage = |query alignment| / |query length|
    aligns = (
        dict(d, qcovs=str((float(d["q_al_stop"]) - float(d["q_al_start"])) / float(d["q_sq_len"]))) for d in aligns
    )

    # write results
    if args.fieldnames:
        fieldnames = args.fieldnames
    else:
        # peek at first row fieldnames
        top = next(aligns, {})
        fieldnames = list(top)
        if top:
            # put the peeked row back so it is still written
            aligns = chain([top], aligns)

    writer = DictWriter(args.out, extrasaction="ignore", fieldnames=fieldnames)

    if args.header:
        writer.writeheader()

    for a in aligns:
        writer.writerow(a)

    # NOTE(review): stderr is only drained after stdout has been consumed;
    # presumably a child that writes a lot to stderr could block - confirm.
    error = set(e.strip() for e in ssearch.stderr)
    error = ", ".join(error)

    if ssearch.wait() != 0:
        # the collected stderr text is passed in the ``cmd`` slot so that it
        # shows up in the exception message
        raise CalledProcessError(ssearch.returncode, error)

    if error:
        log.error(error)
Пример #7
0
def action(args):
    """
    Run ``fasta36`` on ``args.query`` against ``args.library`` and write
    the parsed alignments to ``args.out`` as CSV.

    Rows whose ``fa_zscore`` is below ``args.min_zscore`` are dropped and,
    when ``args.decode`` is set, ``t_seq``/``q_seq`` are decoded with
    ``homodecodealignment``.  Columns come from ``args.fieldnames`` or,
    failing that, from the first row; nothing is written if neither
    yields any column.

    Raises CalledProcessError if fasta36 exits with a non-zero status.
    """
    # fixed options first, then the user supplied penalties and threads
    command = [
        'fasta36',
        '-m', '10',
        '-3',
        '-n',
        '-g', args.gap_extension_penalty,
        '-f', args.gap_open_penalty,
        '-T', str(args.threads),
    ]
    if args.full_sequences:
        command.append('-a')
    if not args.all_alignments:
        command.extend(['-b', '1', '-d', '1'])
    command.extend([args.query, args.library])

    log.info(' '.join(command))

    pipe = Popen(command, stdout=PIPE, stderr=PIPE)

    # lazily parse, filter on z-score and walk the rows query by query
    parsed = parse_ssearch36(pipe.stdout)
    rows = (row for row in parsed
            if float(row['fa_zscore']) >= args.min_zscore)
    grouped = groupby(rows, key=itemgetter('q_name'))
    rows = (row for _, group in grouped for row in group)

    # decode if appropriate
    if args.decode:
        decoding = {}
        for mapping in args.decode:
            for name, code in mapping.items():
                decoding[name] = code

        def expand(row):
            row['t_seq'], row['q_seq'] = homodecodealignment(
                row['t_seq'], from_ascii(decoding[row['t_name']]),
                row['q_seq'], from_ascii(decoding[row['q_name']]))
            return row

        rows = (expand(row) for row in rows)

    # choose the output columns, peeking at the first row when necessary
    fieldnames = args.fieldnames
    if not fieldnames:
        first = next(rows, {})
        fieldnames = first.keys()
        if first:
            rows = chain([first], rows)

    if fieldnames:
        writer = DictWriter(
            args.out, fieldnames=fieldnames, extrasaction='ignore')

        if args.header:
            writer.writeheader()

        for row in rows:
            writer.writerow(row)

    # collapse duplicate stderr lines into a single message
    messages = set(line.strip() for line in pipe.stderr)
    error = ', '.join(messages)

    returncode = pipe.wait()
    if returncode != 0:
        raise CalledProcessError(returncode, error)
    if error:
        log.error(error)
Пример #8
0
def action(args):
    """
    Execute ``fasta36`` for ``args.query`` versus ``args.library`` and
    stream the parsed alignments to ``args.out`` in CSV form.

    Only rows with ``fa_zscore`` >= ``args.min_zscore`` are kept.  When
    ``args.decode`` is given, ``t_seq`` and ``q_seq`` of every row are
    replaced by the result of ``homodecodealignment``.  The columns are
    ``args.fieldnames`` if provided, otherwise the keys of the first row;
    with no columns at all nothing is written.

    Raises CalledProcessError if fasta36 exits with a non-zero status.
    """
    command = ['fasta36', '-m', '10', '-3', '-n']
    command.extend(['-g', args.gap_extension_penalty])
    command.extend(['-f', args.gap_open_penalty])
    command.extend(['-T', str(args.threads)])

    if args.full_sequences:
        command.append('-a')

    if not args.all_alignments:
        command.extend(['-b', '1'])
        command.extend(['-d', '1'])

    command.append(args.query)
    command.append(args.library)

    log.info(' '.join(command))

    process = Popen(command, stdout=PIPE, stderr=PIPE)

    # parse, filter by z-score, then flatten the per-query groups again
    hits = parse_ssearch36(process.stdout)
    hits = (hit for hit in hits
            if float(hit['fa_zscore']) >= args.min_zscore)
    hits = chain.from_iterable(
        members for _, members in groupby(hits, key=itemgetter('q_name')))

    if args.decode:
        # merge all decoding dicts into one lookup table
        lookup = dict(
            (name, code)
            for table in args.decode
            for name, code in table.items())

        def decoded(hit):
            hit['t_seq'], hit['q_seq'] = homodecodealignment(
                hit['t_seq'], from_ascii(lookup[hit['t_name']]),
                hit['q_seq'], from_ascii(lookup[hit['q_name']]))
            return hit

        hits = (decoded(hit) for hit in hits)

    if args.fieldnames:
        columns = args.fieldnames
    else:
        # no explicit columns: borrow them from the first hit, if any
        head = next(hits, {})
        columns = head.keys()
        if head:
            hits = chain([head], hits)

    if columns:
        writer = DictWriter(
            args.out, extrasaction='ignore', fieldnames=columns)

        if args.header:
            writer.writeheader()

        for hit in hits:
            writer.writerow(hit)

    # unique stderr lines, joined into one message
    error = ', '.join(set(line.strip() for line in process.stderr))

    if process.wait() != 0:
        raise CalledProcessError(process.returncode, error)
    if error:
        log.error(error)
Пример #9
0
def action(args):
    """
    Run ``ssearch36`` on ``args.query`` against ``args.library`` and write
    the parsed alignments, plus a ``qcovs`` column, to ``args.out`` as CSV.

    Attributes read from ``args``:

    * ``query``, ``library`` - input file paths; if either file is empty
      ssearch36 is not executed and at most a header row is written
    * ``statistical_calculation``, ``gap_extension_penalty``,
      ``gap_open_penalty``, ``threads`` - passed as ``-z``, ``-g``, ``-f``
      and ``-T`` respectively
    * ``strand`` - ``'forward'`` adds ``-3``
    * ``full_sequences`` - adds ``-a``
    * ``all_alignments`` - when false, adds ``-b 1 -d 1``
    * ``min_zscore`` - rows with a lower ``sw_zscore`` are dropped
    * ``decode`` - if set, an iterable of dicts used to decode ``t_seq``
      and ``q_seq`` via ``homodecodealignment``
    * ``fieldnames`` - output columns; taken from the first row if unset
    * ``header`` - write a header row
    * ``out`` - writable file object receiving the CSV

    Raises CalledProcessError if ssearch36 exits with a non-zero status.
    """
    # setup ssearch command and communicate
    command = ['ssearch36']
    command += ['-m', '10']  # 10 is parseable key:value output format
    command += ['-n']        # DNA/RNA query
    command += ['-z', args.statistical_calculation]
    command += ['-g', args.gap_extension_penalty]
    command += ['-f', args.gap_open_penalty]
    command += ['-T', str(args.threads)]

    if args.strand == 'forward':
        command += ['-3']        # forward strand only

    if args.full_sequences:
        command += ['-a']

    if not args.all_alignments:
        command += ['-b', '1']
        command += ['-d', '1']

    command += [args.query, args.library]

    # If query or library file is empty, don't bother executing ssearch.
    # Just print empty file
    # with a header and exit
    if os.stat(args.query).st_size == 0 or os.stat(args.library).st_size == 0:
        # Bind unconditionally: without explicit fieldnames there is no row
        # to infer columns from, so nothing at all is written.
        fieldnames = args.fieldnames
        if fieldnames:
            writer = DictWriter(args.out,
                                extrasaction='ignore',
                                fieldnames=fieldnames)
            if args.header:
                writer.writeheader()
        return

    log.info(' '.join(command))
    ssearch = Popen(command, stdout=PIPE, stderr=PIPE)

    # parse alignments
    aligns = parse_ssearch36(ssearch.stdout)
    aligns = (a for a in aligns if float(a['sw_zscore']) >= args.min_zscore)
    aligns = groupby(aligns, key=itemgetter('q_name'))
    aligns = (a for _, i in aligns for a in i)  # flatten groupby iters

    # decode if appropriate
    if args.decode:
        decoding = {k: v for d in args.decode for k, v in d.items()}

        def decode(aligns):
            aligns['t_seq'], aligns['q_seq'] = homodecodealignment(
                aligns['t_seq'], from_ascii(decoding[aligns['t_name']]),
                aligns['q_seq'], from_ascii(decoding[aligns['q_name']]))
            return aligns

        # a generator expression is lazy like imap, without the py2-only name
        aligns = (decode(a) for a in aligns)

    # calculate coverage for each item and repack into generator
    # coverage = |query alignment| / |query length|
    aligns = (dict(d, qcovs=str(
        (float(d['q_al_stop']) - float(d['q_al_start'])) /
        float(d['q_sq_len'])
    )) for d in aligns)

    # write results
    if args.fieldnames:
        fieldnames = args.fieldnames
    else:
        # peek at first row fieldnames
        top = next(aligns, {})
        fieldnames = list(top)
        if top:
            # put the peeked row back so it is still written
            aligns = chain([top], aligns)

    writer = DictWriter(args.out,
                        extrasaction='ignore',
                        fieldnames=fieldnames)

    if args.header:
        writer.writeheader()

    for a in aligns:
        writer.writerow(a)

    # NOTE(review): stderr is only drained after stdout has been consumed;
    # presumably a child that writes a lot to stderr could block - confirm.
    error = set(e.strip() for e in ssearch.stderr)
    error = ', '.join(error)

    if ssearch.wait() != 0:
        # the collected stderr text is passed in the ``cmd`` slot so that it
        # shows up in the exception message
        raise CalledProcessError(ssearch.returncode, error)

    if error:
        log.error(error)