Example #1
def action(args):

    seqs = fastalite(args.fasta)
    pairs = list(all_pairwise(seqs))

    if args.distance:
        pairs = [(q, t, 1 - i) for q, t, i in pairs]

    if args.split_info and args.matrix_out:
        primary, secondary = args.primary_group, args.secondary_group
        split_info = list(csv.DictReader(args.split_info))
        info = {r['seqname']: r for r in split_info if r['seqname']}
        tax = {r['tax_id']:r for r in split_info}

        pairs += map(itemgetter(1,0,2), pairs)

        def group(seqname):
            i = info[seqname]
            return (i[primary] or i[secondary]) if secondary else i[primary]

        pairs = ((group(left), group(right), score) for left,right,score in pairs)

        # sort and group rows
        pairs = list(groupbyl(pairs, key = itemgetter(0)))

        matrix_out = csv.writer(args.matrix_out)

        # this is the tax_id order we will be using for columns
        tax_ids = map(itemgetter(0), pairs)

        # get the species names to output as first row
        matrix_out.writerow([''] + [tax[t]['tax_name'] for t in tax_ids])

        # iterate through the sorted rows (pairs)
        for row_id, columns in pairs:
            # sort and group columns
            columns = dict(groupbyl(columns, key = itemgetter(1)))

            # get the species name
            row = [tax[row_id]['tax_name']]

            for t in tax_ids:
                # if t is not in columns, there is only one
                # sequence representing the group,
                # so the median distance is 0
                if t not in columns:
                    med = 0
                else:
                    col = columns[t]
                    med = median(map(itemgetter(2), col))
                    # percent and round
                    med = math.ceil(med * 100) / 100

                row.append(med)

            matrix_out.writerow(row)
    else:
        writer = csv.writer(args.out)
        writer.writerow(['query', 'target', 'identity'])
        writer.writerows(pairs)
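The groupbyl helper used throughout these examples is not defined here. Judging from how it is called (wrapped in list()/dict() and unpacked as (key, rows) pairs), it appears to sort by the key and then group; a minimal sketch under that assumption:

from itertools import groupby
from operator import itemgetter

def groupbyl(iterable, key=None):
    # sort first so that itertools.groupby sees each key exactly once,
    # then materialize every group as a list
    return [(k, list(g)) for k, g in groupby(sorted(iterable, key=key), key=key)]

# usage mirroring the calls above
rows = [('a', 1), ('b', 2), ('a', 3)]
assert groupbyl(rows, key=itemgetter(0)) == [('a', [('a', 1), ('a', 3)]), ('b', [('b', 2)])]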
Example #2
def condense(queries, floor_rank, max_size, ranks, rank_thresholds, target_rank = None):
    target_rank = target_rank or ranks[0]

    groups = list(groupbyl(queries, key = itemgetter(target_rank)))

    num_groups = len(groups)

    if rank_thresholds.get(target_rank, max_size) < num_groups:
        return queries

    # assign target_rank_ids where available;
    # groups without an 'i' value remain assigned at the previous (higher) rank
    for g in (g for i,g in groups if i):
        for q in g:
            q['target_rank_id'] = q[target_rank]

    # return if we hit the floor
    if target_rank == floor_rank:
        return queries

    # else move down a rank
    target_rank = ranks[ranks.index(target_rank) + 1]

    # recurse down the tax tree
    condensed = []
    for _,g in groups:
        c = condense(g, floor_rank, max_size, ranks, rank_thresholds, target_rank)
        condensed.extend(c)

    return condensed
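A hypothetical call to condense, with made-up rank names, queries, and thresholds (none of these values come from the source), to illustrate the recursion; it assumes the imports the example relies on (groupbyl, itemgetter) are in scope:

ranks = ['genus', 'species']
queries = [
    {'genus': 'g1', 'species': 's1'},
    {'genus': 'g1', 'species': 's2'},
    {'genus': 'g2', 'species': 's3'},
]
condensed = condense(queries,
                     floor_rank='species',
                     max_size=3,
                     ranks=ranks,
                     rank_thresholds={'genus': 2, 'species': 2})
# no rank's group count exceeds its threshold here, so every query ends up
# annotated at the floor rank, e.g. condensed[0]['target_rank_id'] == 's1'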
Example #3
def action(args):

    rows = csv.DictReader(args.clusters, delimiter='\t', fieldnames=UCLUST_HEADERS)
    grouped = groupbyl(get_mapping(rows), key=itemgetter(0))  # group by centroid
    clusters = {c: rows for c, rows in grouped if len(rows) >= args.min_clust_size}
    readmap = csv.writer(args.out)
    specimenmap = csv.writer(args.specimenmap) \
                 if (args.specimenmap and args.specimen) else None
    weights = csv.writer(args.weights) if args.weights else None

    # Calculate, for each group, the ratio of the smallest group's read
    # count to that group's read count.
    if args.groups:
        groups = dict(csv.reader(args.groups))
        most_common = Counter(groups.values()).most_common()
        _, least_common = most_common[-1]
        wdict = {k: float(least_common) / v for k, v in most_common}

    for centroid, cluster in clusters.iteritems():
        log.info('writing {}'.format(centroid))

        for _centroid, read in cluster:
            readmap.writerow([read, centroid])

        if specimenmap:
            specimenmap.writerow((centroid, args.specimen))

        if weights:
            if args.groups:
                # normalize weight of each cluster by contribution of
                # reads by each group defined in --groups
                weights.writerow((centroid, sum(wdict[groups[r]] for c, r in cluster)))
            else:
                weights.writerow((centroid, len(cluster)))

    # filter out non-centroid seqs
    if args.fasta_in and args.fasta_out:
        for c in (c for c in args.fasta_in if c.id in clusters):
            args.fasta_out.write('>{}\n{}\n'.format(c.description, c.seq))
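The --groups normalization above is the least obvious step; a toy walk-through with hypothetical read names shows the per-read weights it produces:

from collections import Counter

groups = {'r1': 'A', 'r2': 'A', 'r3': 'A', 'r4': 'B'}          # read -> group
most_common = Counter(groups.values()).most_common()           # [('A', 3), ('B', 1)]
_, least_common = most_common[-1]                              # size of the smallest group
wdict = {k: float(least_common) / v for k, v in most_common}   # {'A': 0.333..., 'B': 1.0}
# a cluster's weight is then the sum of these per-read factors over its
# members, so reads from over-represented groups count for less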
Example #4
def action(args):
    ### format blast data and add additional available information
    fieldnames = None if args.has_header else sequtils.BLAST_HEADER_DEFAULT
    blast_results = DictReader(args.blast_file, fieldnames=fieldnames)
    blast_results = list(blast_results)

    sseqids = set(s['sseqid'] for s in blast_results)
    qseqids = set(s['qseqid'] for s in blast_results)

    # load seq_info and map file
    mapfile = DictReader(args.map, fieldnames=['name', 'specimen'])
    mapfile = {
        m['name']: m['specimen']
        for m in mapfile if m['name'] in qseqids
    }

    seq_info = DictReader(args.seq_info)
    seq_info = {s['seqname']: s for s in seq_info if s['seqname'] in sseqids}

    # pident
    def pident(b):
        return dict(b, pident=float(b['pident'])) if b['sseqid'] else b

    blast_results = (pident(b) for b in blast_results)

    # coverage
    def cov(b):
        if b['sseqid'] and b['qcovs']:
            b['coverage'] = float(b['qcovs'])
            return b
        elif b['sseqid']:
            c = coverage(b['qstart'], b['qend'], b['qlen'])
            return dict(b, coverage=c)
        else:
            return b

    blast_results = (cov(b) for b in blast_results)

    # seq info
    def info(b):
        return dict(seq_info[b['sseqid']], **b) if b['sseqid'] else b

    blast_results = (info(b) for b in blast_results)

    # tax info
    def tax_info(b):
        return dict(args.taxonomy[b['tax_id']], **b) if b['sseqid'] else b

    blast_results = (tax_info(b) for b in blast_results)

    ### output file headers
    fieldnames = [
        'specimen', 'max_percent', 'min_percent', 'max_coverage',
        'min_coverage', 'assignment_id', 'assignment'
    ]

    if args.weights:
        weights = DictReader(args.weights, fieldnames=['name', 'weight'])
        weights = {
            d['name']: d['weight']
            for d in weights if d['name'] in qseqids
        }
        fieldnames += ['clusters', 'reads', 'pct_reads']
    else:
        weights = {}

    if args.copy_numbers:
        copy_numbers = DictReader(args.copy_numbers)
        copy_numbers = {d['tax_id']: float(d['median']) for d in copy_numbers}
        fieldnames += ['corrected', 'pct_corrected']
    else:
        copy_numbers = {}

    # TODO: take out target_rank, hi, low and provide in pipeline using csvmod
    # TODO: option to include tax_ids (default no)
    fieldnames += ['target_rank', 'hi', 'low', 'tax_ids']

    ### Columns
    out = DictWriter(args.out, extrasaction='ignore', fieldnames=fieldnames)
    out.writeheader()

    if args.out_detail:
        args.out_detail.writeheader()

    def blast_hit(hit, args):
        return hit['sseqid'] and \
               hit[args.target_rank] and \
               hit['coverage'] >= args.coverage and \
               float(weights.get(hit['qseqid'], 1)) >= args.min_cluster_size and \
               hit[args.target_rank] not in args.exclude_by_taxid and \
               hit['qseqid'] != hit['sseqid'] and \
               int(hit['ambig_count']) <= args.max_ambiguous

    ### Rows
    etc = '[no blast result]'  # this category will hold all unmatched hits

    # groups are prioritized by their position in this list
    groups = [
        ('> {}%'.format(args.max_identity),
         lambda h: blast_hit(h, args) and h['pident'] > args.max_identity),
        (None, lambda h: blast_hit(h, args) and
         args.max_identity >= h['pident'] > args.min_identity),
        ('<= {}%'.format(args.min_identity),
         lambda h: blast_hit(h, args) and h['pident'] <= args.min_identity),
    ]

    # used later for results output
    group_cats = map(itemgetter(0), groups)
    group_cats.append(etc)

    # assignment rank thresholds
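    # each item of args.group_def is expected to look like 'rank:threshold'
    # (for example 'species:5'), yielding {'species': 5}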
    rank_thresholds = (d.split(':') for d in args.group_def)
    rank_thresholds = dict((k, int(v)) for k, v in rank_thresholds)

    # rt = {k: int(v) for k, v in (d.split(':') for d in args.group_def)}

    # group by specimen
    if args.map:
        specimen_grouper = lambda s: mapfile[s['qseqid']]
    elif args.all_one_group:
        specimen_grouper = lambda s: args.group_label
    else:
        specimen_grouper = lambda s: s['qseqid']

    blast_results = groupbyl(blast_results, key=specimen_grouper)

    assignments = []  # assignment list for assignment ids

    for specimen, hits in blast_results:
        categories = defaultdict(list)

        # clusters will hold the query ids as hits are matched to categories
        clusters = set()

        # assign hits to categories by applying each filter in priority order
        for cat, fltr in groups:
            matches = filter(fltr, hits)

            if cat:
                categories[cat] = matches
            else:
                # create sets of tax_rank_id
                query_group = groupbyl(matches, key=itemgetter('qseqid'))

                target_cats = defaultdict(list)
                for _, queries in query_group:
                    queries = condense(queries, args.target_rank,
                                       args.target_max_group_size,
                                       sequtils.RANKS, rank_thresholds)
                    cat = map(itemgetter('target_rank_id'), queries)
                    cat = frozenset(cat)

                    target_cats[cat].extend(queries)

                categories = dict(categories, **target_cats)

            # add query ids that were matched to a filter
            clusters |= set(map(itemgetter('qseqid'), matches))

            # remove all hits corresponding to a matched query id (cluster)
            hits = filter(lambda h: h['qseqid'] not in clusters, hits)

        # remaining hits go in the etc ('no match') category
        categories[etc] = hits

        # calculate read counts
        read_counts = dict()
        for k, v in categories.items():
            qseqids = set(map(itemgetter('qseqid'), v))
            weight = sum(float(weights.get(q, 1)) for q in qseqids)
            read_counts[k] = weight

        taxids = set()
        for k, v in categories.items():
            if k is not etc:
                for h in v:
                    taxids.add(h['tax_id'])

        ### list of assigned ids for count corrections
        assigned_ids = dict()
        for k, v in categories.items():
            if k is not etc and v:
                assigned_ids[k] = set(map(itemgetter('tax_id'), v))

        # correction counts
        corrected_counts = dict()
        for k, v in categories.items():
            if k is not etc and v:
                av = mean(copy_numbers.get(t, 1) for t in assigned_ids[k])
                corrected_counts[k] = ceil(read_counts[k] / av)

        # finally take the root value for the etc category
        corrected_counts[etc] = ceil(read_counts[etc] /
                                     copy_numbers.get('1', 1))

        # totals for percent calculations later
        total_reads = sum(v for v in read_counts.values())
        total_corrected = sum(v for v in corrected_counts.values())

        # Print classifications per specimen sorted by # of reads in reverse (descending) order

        sort_by_reads_assign = lambda (c, h): corrected_counts.get(c, None)

        for cat, hits in sorted(categories.items(),
                                key=sort_by_reads_assign,
                                reverse=True):

            # proceed only if there are hits
            if hits:

                # for incrementing assignment ids
                if cat not in assignments:
                    assignments.append(cat)

                assignment_id = assignments.index(cat)

                reads = read_counts[cat]
                reads_corrected = corrected_counts[cat]

                clusters = set(map(itemgetter('qseqid'), hits))

                results = dict(
                    hi=args.max_identity,
                    low=args.min_identity,
                    target_rank=args.target_rank,
                    specimen=specimen,
                    assignment_id=assignment_id,
                    reads=int(reads),
                    pct_reads='{0:.2f}'.format(reads / total_reads * 100),
                    corrected=int(reads_corrected),
                    pct_corrected='{0:.2f}'.format(reads_corrected /
                                                   total_corrected * 100),
                    clusters=len(clusters))

                if cat is etc:
                    assignment = etc
                    results = dict(results, assignment=assignment)
                else:
                    taxids = set(map(itemgetter('tax_id'), hits))
                    coverages = set(map(itemgetter('coverage'), hits))
                    percents = set(map(itemgetter('pident'), hits))

                    if cat in group_cats:
                        assignment = cat
                    else:
                        names = [
                            args.taxonomy[h['target_rank_id']]['tax_name']
                            for h in hits
                        ]
                        selectors = [
                            h['pident'] >= args.asterisk for h in hits
                        ]
                        assignment = sequtils.format_taxonomy(
                            names, selectors, '*')

                    results = dict(
                        results,
                        assignment=assignment,
                        max_percent='{0:.2f}'.format(max(percents)),
                        min_percent='{0:.2f}'.format(min(percents)),
                        max_coverage='{0:.2f}'.format(max(coverages)),
                        min_coverage='{0:.2f}'.format(min(coverages)),
                        tax_ids=' '.join(taxids))

                out.writerow(results)

                if args.out_detail:
                    if not args.details_full:
                        # drop the no_hits
                        hits = [h for h in hits if 'tax_id' in h]
                        # only report heaviest centroid
                        clusters_and_sizes = [(float(weights.get(c, 1.0)), c)
                                              for c in clusters]
                        _, largest = max(clusters_and_sizes)
                        hits = (h for h in hits if h['qseqid'] == largest)

                    for h in hits:
                        args.out_detail.writerow(
                            dict(specimen=specimen,
                                 assignment=assignment,
                                 assignment_id=assignment_id,
                                 hi=args.max_identity,
                                 low=args.min_identity,
                                 target_rank=args.target_rank,
                                 **h))
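The three identity bands defined in groups drive the rest of the loop; a standalone sketch with hypothetical thresholds (the blast_hit gate is assumed to pass) shows how a hit's pident selects its band:

max_identity, min_identity = 99.0, 95.0

def band(pident):
    # mirrors the ordering of the groups list above: first matching predicate wins
    if pident > max_identity:
        return '> {}%'.format(max_identity)
    elif pident > min_identity:  # i.e. max_identity >= pident > min_identity
        return None              # falls through to target-rank condensation
    else:
        return '<= {}%'.format(min_identity)

for p in (99.5, 97.0, 90.0):
    print('{} -> {}'.format(p, band(p)))  # prints: 99.5 -> > 99.0%, 97.0 -> None, 90.0 -> <= 95.0%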