Example #1
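    # From the bioy test suite: self.data presumably resolves a path into
    # the package's test data, sequtils is the module under test, and
    # Counter comes from the collections module.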
    def test01(self):
        infile = self.data('clusters.uc')
        with open(infile) as f:
            cluster_ids, cluster_sizes = sequtils.parse_uc(f)

        counter = Counter()
        for cluster, count in cluster_sizes.items():
            counter[count] += 1

        # most of the clusters are singletons
        self.assertEqual(counter.most_common(1)[0][0], 1)
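
parse_uc itself is not shown on this page. The sketch below illustrates what such a parser might look like, assuming the tab-delimited USEARCH .uc format ('S' = seed record, 'H' = hit record, query label in column 9, target label in column 10); the actual bioy implementation may differ.

# Minimal sketch of a .uc parser (assumes the USEARCH .uc layout
# described above; not the actual bioy implementation).
from collections import Counter

def parse_uc_sketch(infile):
    """Return (cluster_ids, cluster_sizes) from an open .uc file.

    cluster_ids maps each read name to the name of its cluster seed;
    cluster_sizes maps each seed name to the number of member reads.
    """
    cluster_ids, cluster_sizes = {}, Counter()
    for line in infile:
        fields = line.rstrip('\n').split('\t')
        rectype, query = fields[0], fields[8]
        if rectype == 'S':    # seed read: starts its own cluster
            cluster_ids[query] = query
            cluster_sizes[query] += 1
        elif rectype == 'H':  # hit read: assigned to the target's cluster
            target = fields[9]
            cluster_ids[query] = target
            cluster_sizes[target] += 1
    return cluster_ids, dict(cluster_sizes)

Under that reading, the test asserts that a cluster size of 1 (a singleton) is the most common size in the fixture.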
Example #2
File: denoise.py Project: nhoffman/bioy
import csv
import os
from collections import Counter, defaultdict
from itertools import chain, groupby, islice
from multiprocessing import Pool
from operator import itemgetter

# project-local helpers, assumed to come from the bioy package:
# parse_uc, fastalite, ichunker, align_and_consensus, log

def action(args):

    if args.clusters:
        _, fileExt = os.path.basename(args.clusters.name).split('.')

        if fileExt == 'uc':
            clusters = parse_uc(args.clusters)[0]
        else:
            clusters = {seq: tag for seq, tag in csv.reader(args.clusters)}

        by_clusters = lambda s: clusters.get(s.id, s.id)
    else:
        by_clusters = lambda _: 'all one cluster'

    seqs = fastalite(args.fastafile)
    seqs = islice(seqs, args.limit)
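    # itertools.groupby only groups adjacent items, so sort by cluster first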
    seqs = sorted(seqs, key=by_clusters)
    grouped_seqs = groupby(seqs, key=by_clusters)

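    # ichunker is a bioy helper; it presumably yields the cluster groups
    # in work chunks bounded by min_clust_size/max_clust_size, expanding
    # run-length-encoded reads via rlefile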
    chunks = ichunker((group for _, group in grouped_seqs), args.rlefile,
                      args.min_clust_size, args.max_clust_size)

    # calculate consensus for each cluster, then accumulate names of
    # each set of identical consensus sequences in `exemplars` (keys
    # are the consensus sequences themselves).
    exemplars = defaultdict(list)
    # distribute the per-chunk alignment and consensus work across
    # args.threads worker processes
    pool = Pool(processes=args.threads)
    for cluster, cons in pool.imap(align_and_consensus, enumerate(chunks, start=1)):
        exemplars[cons].extend([c.id for c in cluster])

    # weight each read by (size of smallest group) / (size of its own
    # group) so that every group contributes equal total weight.
    # outseqs is a list of (weight, consensus, list_of_names) tuples.
    if args.groups and exemplars:
        groups = dict(csv.reader(args.groups))
        group_counts = Counter(
            groups[name] for name in chain.from_iterable(exemplars.values()))
        most_common = group_counts.most_common()
        _, least_common = most_common[-1]
        weights = {k: float(least_common) / v for k, v in most_common}
        outseqs = [(sum(weights[groups[n]] for n in names), cons, names)
                   for cons, names in exemplars.items()]
    else:
        outseqs = [(len(names), cons, names)
                   for cons, names in exemplars.items()]

    # write each consensus sequence in descending order of weight
    outseqs.sort(reverse=True, key=itemgetter(0))
    for i, (weight, cons, names) in enumerate(outseqs, start=1):

        name_elements = [
            args.name_prefix, 'cons{:04}'.format(i), '{:.0f}'.format(weight),
            args.name_suffix
        ]

        consname = args.name_delimiter.join([e for e in name_elements if e])

        log.debug('writing {}'.format(consname))

        args.outfile.write('>{}\n{}\n'.format(consname, cons))

        if args.readmap:
            args.readmap.writerows((name, consname) for name in names)

        if args.clustermap and args.specimen:
            args.clustermap.writerow((consname, args.specimen))

        if args.weights:
            args.weights.writerow((consname, weight))
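
The group-weighting branch above is easier to see on concrete data. Below is a self-contained sketch of the same computation with made-up read names and group labels (all data hypothetical): each read is weighted by the smallest group's size divided by its own group's size, so over-represented groups are down-weighted.

# Self-contained sketch of the group-weighting step, using made-up
# (hypothetical) read names and group labels.
from collections import Counter
from itertools import chain

# consensus sequence -> names of the reads that produced it
exemplars = {'ACGT': ['r1', 'r2', 'r3'], 'ACGA': ['r4']}
# read name -> group label; group 'a' is over-represented 3:1
groups = {'r1': 'a', 'r2': 'a', 'r3': 'a', 'r4': 'b'}

group_counts = Counter(
    groups[name] for name in chain.from_iterable(exemplars.values()))
most_common = group_counts.most_common()  # [('a', 3), ('b', 1)]
_, least_common = most_common[-1]         # size of the smallest group
weights = {k: float(least_common) / v for k, v in most_common}
# weights == {'a': 1/3.0, 'b': 1.0}: each group now sums to 1.0

outseqs = [(sum(weights[groups[n]] for n in names), cons, names)
           for cons, names in exemplars.items()]
# both consensus sequences get total weight 1.0 despite the 3:1
# difference in raw read counts
print(sorted(outseqs, reverse=True))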
Example #3
File: denoise.py Project: crosenth/bioy (the same action function as in Example #2, differing only in whitespace)