Example #1
    def preprocess(self) :
        if len(self.options['input-files']) == 0 :
            self.log.error("nothing to do")
            sys.exit(1)

        self.seqdb = SequenceDB(preprocessed=False)

        p = Progress("Preprocessing", len(self.options['input-files']))
        p.start()

        samples = []

        for f in self.__get_files(self.options['input-files']) :
            mid = self.__mid_fastq(f)

            sample = Sample(f, 
                        self.options['outdir'],
                        self.seqdb, 
                        self.__filters(mid),
                        chimeras=self.options['chimeras'])

            sample.print_sample()
            samples.append(sample)

            p.increment()

        p.end()

        rejected_reads = sum([ sum(s.filters.counts) for s in samples ])
        accepted_reads = sum([ len(s) for s in samples ])
        unique_seq = sum([ len(s.seqcounts) for s in samples ])

        print "processed %s reads, accepted %d (of which %d are unique)" % \
                (rejected_reads + accepted_reads, accepted_reads, unique_seq)

        if samples :
            summary_data = self.__build_summary(samples)

            summary_file = Summary(self.options['summary-file'])
            summary_file.update(summary_data)
            summary_file.write(self.options['summary-file'])


        if self.options['verbose'] :
            self.summary()

        return 0
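
A minimal usage sketch for this step, assuming WorkFlow is constructed from a plain options dict as in Example #3. The key names below are the ones preprocess() and its helpers read; the file names and values are purely illustrative, not project defaults.

# Hypothetical driver for the preprocessing step; option keys are taken from
# the code above, values are placeholders.
options = {
    'input-files'  : ['run1.sff', 'run2.fastq'],   # raw inputs, .sff or fastq
    'outdir'       : 'seance_out',                 # where .sample files are written
    'chimeras'     : False,                        # skip chimera detection
    'summary-file' : 'seance_out/summary.csv',
    'verbose'      : True,
    # ... plus the read-filtering keys consumed by __filters() and __get_files():
    # 'midlength', 'miderrors', 'forwardprimer', 'primererrors', 'clipprimers',
    # 'length', 'removeambiguous', 'denoise', 'maxhomopolymer',
    # 'quality-method', 'quality', 'windowlength'
}

wf = WorkFlow(options)
wf.preprocess()     # writes one .sample file per input plus the summary CSV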
Example #2
    def cluster(self) :
        # rebuild the database from preprocessed samples in outdir
        self.seqdb = SequenceDB(preprocessed=False) # setting this to True causes things to be overwritten if we merge multiple preprocessing steps
        samples = self.__preprocessed_samples()
        
        if self.seqdb.num_sequences() == 0 :
            self.log.error("no sequences loaded")
            sys.exit(1)

        # to prove the rebuild of the database is the same
        #for sample in samples :
        #    sample.print_sample(extension=".rebuild")
        
        # get keys of sequences we want to cluster
#        input_keys = self.__get_cluster_input(samples,
#                                self.options['total-duplicate-threshold'],
#                                self.options['sample-threshold'],
#                                self.options['duplicate-threshold'])

        input_keys, singleton_keys = self.__get_cluster_input2(samples,
                                    self.options['total-duplicate-threshold'],
                                    self.options['duplicate-threshold'])

        # get some info
#        num_reads = sum([ self.seqdb.get(i).duplicates for i in input_keys ])
        num_reads = sum([ self.seqdb.get(i).duplicates for i in input_keys.keys() ])
        self.log.info("clustering %d/%d (%.2f%%) sequences (%d/%d (%.2f%%) reads)" % \
                        (len(input_keys), self.seqdb.num_sequences(), \
                        len(input_keys) * 100 / float(self.seqdb.num_sequences()), \
                        num_reads, self.seqdb.num_reads(), \
                        num_reads * 100 / float(self.seqdb.num_reads())))

        # clustering
        c = Cluster(self.seqdb, self.options['otu-similarity'], self.options['verbose'])
#        c.create_clusters(keys=input_keys, homopolymer_correction=not self.options['no-homopolymer-correction'])
        c.create_clusters2(keys=input_keys, 
                           homopolymer_correction=not self.options['no-homopolymer-correction'], 
                           singletons=singleton_keys,
                           sample_threshold=self.options['sample-threshold'])

        # output centroids to file
        # output biom file
        # run blast if necessary
        centroid_fname = self.options['cluster-fasta']
        biom_fname = self.options['cluster-biom']
        otu_names = {}

        # write everything out anyway in case labelling fails or is killed
        self.__fasta(centroid_fname, c.centroids(), names=otu_names)
        self.__biom(biom_fname, samples, c, otu_names)

        # blast to get better names
        if self.options['labels'] :
            print "getting OTU names (this may take a while)..."
            otu_names = BlastN(self.options['verbose']).get_names(
                            centroid_fname,
                            self.options['labels'],
                            self.options['labels-similarity'],
                            self.options['labels_db'])

            if self.options['labels'] == 'blast' and self.options['merge-blast-hits'] :
                c.merge(otu_names)

            # write out results files
            self.__fasta(centroid_fname, c.centroids(), names=otu_names)
            self.__biom(biom_fname, samples, c, otu_names)
        
        return 0
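
A matching sketch for the clustering step, extending the options dict sketched after Example #1. Again the keys are the ones cluster() reads, directly or via __preprocessed_samples(); the threshold and similarity values are placeholders, not project defaults.

# Hypothetical options for the clustering step; keys come from cluster() above.
options.update({
    'metadata'                  : None,     # or a path read by MetadataReader
    'total-duplicate-threshold' : 2,        # min duplicates to enter clustering
    'duplicate-threshold'       : 2,        # per-sample contamination cut-off
    'sample-threshold'          : 2,        # min samples a cluster must span
    'otu-similarity'            : 0.97,     # OTU identity threshold
    'no-homopolymer-correction' : False,
    'cluster-fasta'             : 'seance_out/clusters.fasta',
    'cluster-biom'              : 'seance_out/clusters.biom',
    'labels'                    : None,     # e.g. 'blast' to label centroids
})

wf = WorkFlow(options)
wf.cluster()        # writes the centroid FASTA and the BIOM table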
Example #3
class WorkFlow(object) :
    def __init__(self, options) :
        self.options = options
        self.log = logging.getLogger('seance')
        self.seqdb = None

    def __filters(self, mid) :
        mf = MultiFilter()

        ## if we have trimmed the forward primer, then the mid will be trimmed 
        ## off with it and this check will always fail
        #if not self.options['clipprimers'] :
        #    mf.add(MidFilter(mid, self.options['miderrors']))


        # always remove the mid, because denoising now leaves it in place
        mf.add(MidFilter(mid, self.options['miderrors']))

        # check primer, remove if requested
        if self.options['forwardprimer'] is not None :
            mf.add(PrimerFilter(self.options['forwardprimer'], self.options['primererrors'], self.options['clipprimers']))

        if self.options['length'] is not None :
            mf.add(LengthFilter(self.options['length']))

        if self.options['removeambiguous'] :
            mf.add(AmbiguousFilter())

        # homopolymer and quality filtering are already handled by denoising
        if self.options['denoise'] :
            return mf

        if self.options['maxhomopolymer'] > 0 :
            mf.add(HomopolymerFilter(self.options['maxhomopolymer']))

        qual = self.options['quality-method']

        if qual == 'min' :
            mf.add(MinimumQualityFilter(self.options['quality']))

        elif qual == 'average' :
            mf.add(AverageQualityFilter(self.options['quality']))

        elif qual == 'window' :
            mf.add(WindowedQualityFilter(self.options['quality'],
                                         self.options['windowlength']))

        return mf

    def __mid_sff(self, sff) :
        fastq = Sff2Fastq().run(sff, self.options['outdir'])
        mid = self.__mid_fastq(fastq)
        os.remove(fastq.get_filename())
        return mid

    def __mid_fastq(self, fastq) :
        #return GetMID(self.options['midlength']).run(fastq.get_filename())
        return GetMID2(self.options['midlength']).run(fastq)

    def __get_files(self, file_names) :
        for fname in file_names :
            root,ext = splitext(basename(fname))

            self.log.info("current file = %s" % fname)

            if ext == '.sff' :
                if os.path.exists(join(self.options['outdir'], basename(fname)+'.fasta.sample')) :
                    self.log.info("skipping %s (already preprocessed)" % fname)
                    continue

                sff = SffFile(fname)

                # annoyingly, mid is figured out twice if we are
                # doing denoising
                if self.options['denoise'] :
                    yield AmpliconNoise().run(sff,
                            self.options['outdir'],
                            self.options['forwardprimer'],
                            self.options['primererrors'],
                            self.__mid_sff(sff),
                            self.options['miderrors'], 
                            self.options['maxhomopolymer']
                            )
                else :
                    yield Sff2Fastq().run(sff, 
                        self.options['outdir'])

            else :
                yield FastqFile(fname)

    def preprocess(self) :
        if len(self.options['input-files']) == 0 :
            self.log.error("nothing to do")
            sys.exit(1)

        self.seqdb = SequenceDB(preprocessed=False)

        p = Progress("Preprocessing", len(self.options['input-files']))
        p.start()

        samples = []

        for f in self.__get_files(self.options['input-files']) :
            mid = self.__mid_fastq(f)

            sample = Sample(f, 
                        self.options['outdir'],
                        self.seqdb, 
                        self.__filters(mid),
                        chimeras=self.options['chimeras'])

            sample.print_sample()
            samples.append(sample)

            p.increment()

        p.end()

        rejected_reads = sum([ sum(s.filters.counts) for s in samples ])
        accepted_reads = sum([ len(s) for s in samples ])
        unique_seq = sum([ len(s.seqcounts) for s in samples ])

        print "processed %s reads, accepted %d (of which %d are unique)" % \
                (rejected_reads + accepted_reads, accepted_reads, unique_seq)

        if samples :
            summary_data = self.__build_summary(samples)

            summary_file = Summary(self.options['summary-file'])
            summary_file.update(summary_data)
            summary_file.write(self.options['summary-file'])


        if self.options['verbose'] :
            self.summary()

        return 0

    def __preprocessed_samples(self) :
        if self.options['metadata'] is None :
            tmp = []
            for sample in glob(join(self.options['outdir'], '*.sample')) :
                md = SampleMetadata()
                md.defaults()
                s = basename(sample)
                md['file'] = s[:s.find('.')]
                tmp.append(MetadataSample(FastqFile(sample), self.options['outdir'], self.seqdb, md))
            return tmp

        mdr = MetadataReader(self.options['metadata'])
        mdr.process()

        tmp = []
        md_used = []

        for sample in glob(join(self.options['outdir'], '*.sample')) :
            md = mdr.get(basename(sample))

            if md is None :
                self.log.warn("metadata missing for %s ..." % basename(sample))
                md = SampleMetadata()
                md.defaults()
                s = basename(sample)
                md['file'] = s[:s.find('.')]
                #self.log.warn("skipping %s, metadata missing..." % basename(sample))
                #continue

            tmp.append(MetadataSample(FastqFile(sample), self.options['outdir'], self.seqdb, md))
            md_used.append(md['file'])

        # warn about metadata entries with no matching preprocessed file
        for i in mdr.metadata.keys() :
            if i not in md_used :
                self.log.warn("preprocessed file for %s not found" % i)

        return sorted(tmp)

    def __get_cluster_input(self, samples, duplicate_threshold, sample_threshold, contamination_threshold) :
        ref_count = collections.Counter()
        cluster_input_keys = set()

        for sample in samples :
            sample.remove_less_than(contamination_threshold)

        # first collect keys for all sequences that meet the duplicate threshold
        for sample in samples :
            for key,freq in sample.seqcounts.most_common() :
                if self.seqdb.get(key).duplicates >= duplicate_threshold :
                    ref_count[key] += 1

        # then for these keys see how many samples they occurred in
        for key,freq in ref_count.most_common() :
            if freq < sample_threshold :
                break

            cluster_input_keys.add(key)

        # some samples contain reads you would not necessarily expect to see 
        # in any other samples (control samples) so add these separately, but
        # still respect duplicate_threshold
        for sample in samples :
            if sample.metadata['allow-singletons'] :
                count = 0

                for key,freq in sample.seqcounts.most_common() :
                    if self.seqdb.get(key).duplicates >= duplicate_threshold :
                        if key not in cluster_input_keys :
                            cluster_input_keys.add(key)
                            count += 1

                self.log.info("allowing singletons for (%s) - added %d sequences" % \
                        (sample.description(), count))

        return list(cluster_input_keys)

    def __get_cluster_input2(self, samples, duplicate_threshold, contamination_threshold) :
        seq2samp = collections.defaultdict(list)
        singletons = []

        for sample in samples :
            sample.remove_less_than(contamination_threshold)

        # first collect keys for all sequences that meet the duplicate threshold
        for index, sample in enumerate(samples) :
            is_singleton = sample.metadata['allow-singletons']

            for key,freq in sample.seqcounts.most_common() :
                if is_singleton or (self.seqdb.get(key).duplicates >= duplicate_threshold) :
                    seq2samp[key].append(index)

                if is_singleton :
                    singletons.append(key)

        return seq2samp, singletons

    def cluster(self) :
        # rebuild the database from preprocessed samples in outdir
        self.seqdb = SequenceDB(preprocessed=False) # setting this to True causes things to be overwritten if we merge multiple preprocessing steps
        samples = self.__preprocessed_samples()
        
        if self.seqdb.num_sequences() == 0 :
            self.log.error("no sequences loaded")
            sys.exit(1)

        # to prove the rebuild of the database is the same
        #for sample in samples :
        #    sample.print_sample(extension=".rebuild")
        
        # get keys of sequences we want to cluster
#        input_keys = self.__get_cluster_input(samples,
#                                self.options['total-duplicate-threshold'],
#                                self.options['sample-threshold'],
#                                self.options['duplicate-threshold'])

        input_keys, singleton_keys = self.__get_cluster_input2(samples,
                                    self.options['total-duplicate-threshold'],
                                    self.options['duplicate-threshold'])

        # get some info
#        num_reads = sum([ self.seqdb.get(i).duplicates for i in input_keys ])
        num_reads = sum([ self.seqdb.get(i).duplicates for i in input_keys.keys() ])
        self.log.info("clustering %d/%d (%.2f%%) sequences (%d/%d (%.2f%%) reads)" % \
                        (len(input_keys), self.seqdb.num_sequences(), \
                        len(input_keys) * 100 / float(self.seqdb.num_sequences()), \
                        num_reads, self.seqdb.num_reads(), \
                        num_reads * 100 / float(self.seqdb.num_reads())))

        # clustering
        c = Cluster(self.seqdb, self.options['otu-similarity'], self.options['verbose'])
#        c.create_clusters(keys=input_keys, homopolymer_correction=not self.options['no-homopolymer-correction'])
        c.create_clusters2(keys=input_keys, 
                           homopolymer_correction=not self.options['no-homopolymer-correction'], 
                           singletons=singleton_keys,
                           sample_threshold=self.options['sample-threshold'])

        # output centroids to file
        # output biom file
        # run blast if necessary
        centroid_fname = self.options['cluster-fasta']
        biom_fname = self.options['cluster-biom']
        otu_names = {}

        # write everything out anyway in case labelling fails or is killed
        self.__fasta(centroid_fname, c.centroids(), names=otu_names)
        self.__biom(biom_fname, samples, c, otu_names)

        # blast to get better names
        if self.options['labels'] :
            print "getting OTU names (this may take a while)..."
            otu_names = BlastN(self.options['verbose']).get_names(
                            centroid_fname,
                            self.options['labels'],
                            self.options['labels-similarity'],
                            self.options['labels_db'])

            if self.options['labels'] == 'blast' and self.options['merge-blast-hits'] :
                c.merge(otu_names)

            # write out results files
            self.__fasta(centroid_fname, c.centroids(), names=otu_names)
            self.__biom(biom_fname, samples, c, otu_names)
        
        return 0

    def label(self) :
        self.seqdb = self.__read_fasta(self.options['cluster-fasta'])
        blast_fname = self.options['cluster-fasta']

        # if we are only going to label the clusters without labels
        # then we need to find the names of those clusters and write a 
        # fasta file containing only those sequences
        if self.options['label-missing'] :
            tmp = []
            biom = json.load(open(self.options['cluster-biom']))
            for r in biom['rows'] :
                if r['metadata']['label'] in ("", "unknown", "error", "cannot label (matches multiple domains!)") :
                    tmp.append(r['id'])

            if len(tmp) == 0 :
                self.log.error("there are no missing labels")
                exit(1)

            self.log.info("%d clusters missing labels" % len(tmp))
            blast_fname = self.__fasta(join(self.options['outdir'], 'missing.fasta'), tmp)


        print "getting OTU names (this may take a while)..." 
        otu_names = BlastN(self.options['verbose']).get_names(
                        blast_fname,
                        self.options['labels'],
                        self.options['labels-similarity'],
                        self.options['labels_db'])

        # rework the biom
        biom = BiomFile()
        biom.change_otu_names(self.options['cluster-biom'], otu_names)
        self.log.info("written %s" % self.options['cluster-biom'])

        # get the rest of the names and rewrite fasta
        otu_names = biom.get_label_mapping(self.options['cluster-biom'])
        self.__fasta(self.options['cluster-fasta'], self.seqdb.keys(), names=otu_names)

        return 0

    def __fasta(self, filename, keys, names=None) :
        f = open(filename, 'w')
        
        if names is None :
            for key in keys :
                print >> f, self.seqdb.get(key).fasta()
        else :
            for key in keys :
                s = self.seqdb.get(key)

                if isinstance(key, int) :
                    str_key = "seance%d" % key
                else :
                    str_key = str(key)

                print >> f, ">%s %s" % (str_key, names.get(str_key, "unknown"))
                print >> f, s.sequence

        f.close()
        self.log.info("written %s" % filename)

        return filename

    def __read_fasta(self, filename, include=None) :
        tmp = {}

        if include :
            include = include.lower()

        f = FastqFile(filename)
        f.open()

        for seq in f :
            if include :
                if include not in seq.id.lower() :
                    continue

            seq.id = seq.id.split()[0][1:]

            #if only_include :
            #    if seq.id not in only_include :
            #        continue
    
            tmp[seq.id] = seq

        f.close()

        self.log.info("read %d centroid sequences" % len(tmp))

        return tmp

    def __biom(self, filename, samples, clustering, cluster_names) :
        centroids = clustering.centroids()
        all_keys = clustering.all()

        output_clusters = clustering.clusters
        output_samples = [ s for s in samples if s.contains(all_keys) ]
        output_otus = [ ("seance" + str(k), cluster_names.get("seance" + str(k), "unknown")) for k in centroids ]

        #self.log.info("%d / %d samples have at least one sequence used in clustering" % \
        #        (len(output_samples), len(samples)))

        b = BiomFile()
        b.set_samples(output_samples)
        b.set_otus(output_otus)

        for sind,sample in enumerate(output_samples) :
            for cind,cluster in enumerate(output_clusters) :
                count = 0

                for read in cluster :
                    if read in sample :
                        count += sample.seqcounts[read]

                b.add_quantity(cind, sind, count)

        b.write_to(filename)
        self.log.info("written %s" % filename)

    def phylogeny(self) :
        num_sequences = self.__count(self.options['cluster-fasta'])
        p = Pagan()

        if self.options['subset'] :
            self.seqdb = self.__read_fasta(self.options['cluster-fasta'], include=self.options['subset'])
            self.options['cluster-fasta'] = self.__fasta(self.options['cluster-fasta'] + '.subset', self.seqdb.keys())

            self.log.info("read %d cluster centroids using subset(%s)" % (len(self.seqdb), self.options['subset']))

        if not self.options['silva-fasta'] :
            self.log.info("aligning %s sequences with PAGAN ..." % (num_sequences))
            alignment,tree,xmlfile = p.phylogenetic_alignment(self.options['cluster-fasta'])
        else :
            self.log.info("aligning %s sequences with PAGAN against SILVA ..." % (num_sequences))
            alignment,tree,xmlfile = p.silva_phylogenetic_alignment(self.options['silva-fasta'], 
                                                                    self.options['silva-tree'], 
                                                                    self.options['cluster-fasta'])

        os.rename(alignment, self.options['phylogeny-fasta'])
        os.rename(tree,      self.options['phylogeny-tree'])
        os.rename(xmlfile,   self.options['phylogeny-xml'])

        self.log.info("created %s" % self.options['phylogeny-fasta'])
        self.log.info("created %s" % self.options['phylogeny-tree'])
        
        if xmlfile :
            self.log.info("created %s" % self.options['phylogeny-xml'])

        return 0

    def __count(self, fasta) :
        fq = FastqFile(fasta)
        fq.open()

        count = 0
        for seq in fq :
            count += 1

        fq.close()
        return count

    def heatmap(self) :
        self.log.info("creating heatmap using %s and %s" % (self.options['cluster-biom'], self.options['phylogeny-tree']))

        if self.options['heatmap-no-tree'] :
            self.options['phylogeny-tree'] = None
        
        if self.options['phylogeny-tree'] is not None and not exists(self.options['phylogeny-tree']) :
            self.log.warn("%s does not exist, drawing heatmap without tree" % self.options['phylogeny-tree'])
            self.options['phylogeny-tree'] = None

        phylogenetic_heatmap(self.options['cluster-biom'], 
                             tree=self.options['phylogeny-tree'], 
                             output=self.options['heatmap-pdf'],
                             str_include=self.options['subset'],
                             count_include=self.options['min-bin-count'],
                             output_tree=self.options['heatmap-out-tree'],
                             flip_tree=self.options['heatmap-flip-tree'],
                             scale=self.options['heatmap-scale'],
                             tree_height_blocks=self.options['heatmap-tree-height'],
                             label_clips=self.options['heatmap-label-clip'],
                             label_tokens=self.options['heatmap-label-tokens'],
                             ladderise=self.options['heatmap-ladderise'])

        self.log.info("wrote %s" % self.options['heatmap-pdf'])
        print "wrote %s" % self.options['heatmap-pdf']
        return 0

    def wasabi(self) :
        return view_in_wasabi(self.options['phylogeny-xml'], 
                              basename(self.options['outdir']), 
                              self.options['wasabi-url'],
                              self.options['wasabi-user'])

    def __build_summary(self, samples) :
        data = collections.defaultdict(dict)

        for s in samples :
            filename = s.fastq.get_filename()
            filename = basename(filename[:filename.rfind(".")])

            for name,count in s.filters.filter_counts() :
                data[filename][name] = count

            if self.options['chimeras'] :
                data[filename]['Chimera'] = sum([ s.seqcounts[i] for i in s.chimeras ])
            
            data[filename]['Accepted'] = len(s)
            data[filename]['Unique'] = len([ i for i in s.seqcounts if i not in s.chimeras ])

        return data

    def summary(self) :
        header_constant = 4

        with open(self.options['summary-file']) as f :
            max_length = max([ len(line.split(',')[0]) for line in f ])

        with open(self.options['summary-file']) as f :
            header = f.readline().rstrip().split(',')
            field_lengths = [ len(i) + header_constant for i in header ]
            fmt_str = "%s"

            for i in field_lengths[1:] :
                fmt_str += ("%%%ds" % i)
            
            x = max_length - len("filename")
            print (" " * x) + (fmt_str % tuple(header))
            print ""

            for line in f :
                tmp = line.rstrip().split(',')

                if tmp[0] == 'Totals' :
                    print ""

                x = max_length - len(tmp[0])
                print (" " * x) + (fmt_str % tuple(tmp))
            
            print ""

        return 0

    def showcounts(self) :
        def get_ids(biom_obj, element) :
            return [ i['id'].encode('ascii', 'ignore') for i in biom_obj[element] ]

        delim = self.options['delimiter']
        
        # read in
        biom = json.load(open(self.options['cluster-biom']))
        rows = get_ids(biom, 'rows')
        cols = get_ids(biom, 'columns')

        data = dict([ ((int(r),int(c)),int(q)) for r,c,q in biom['data']])

        id2label = dict([ (i['id'], i['metadata']['label']) for i in biom['rows'] if i['metadata']['label'] ])

        # output
        print delim.join([""] + cols)

        for r_index,r_id in enumerate(rows) :
            tmp = [r_id + "_" + id2label[r_id]]

            for c_index,c_id in enumerate(cols) :
                tmp.append(data.get((r_index,c_index), 0))

            print delim.join([str(i) for i in tmp])

        return 0

    def showlabels(self) :
        delim = self.options['delimiter']
        
        #biom = json.load(open(self.options['cluster-biom']))
        #
        #for r in biom['rows'] :
        #    print delim.join([r['id'], r['metadata']['label']])

        biom = BiomFile()
        labels = biom.get_label_mapping(self.options['cluster-biom'])

        for x in labels.iteritems() :
            print delim.join(x)

        return 0
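
Taken together, the public methods form a small pipeline in which each stage reads the files the previous stage wrote. A plausible end-to-end driver, assuming an options dict populated with all of the keys referenced above (see the sketches after Examples #1 and #2); the ordering follows the data each stage consumes and produces.

# Hypothetical end-to-end run; each stage consumes the previous stage's output
# (.sample files, cluster FASTA/BIOM, phylogeny tree).
import logging
logging.basicConfig(level=logging.INFO)   # make the 'seance' logger visible

wf = WorkFlow(options)
wf.preprocess()     # raw reads     -> per-sample .sample files + summary CSV
wf.cluster()        # .sample files -> OTU centroid FASTA + BIOM table
wf.phylogeny()      # centroid FASTA -> alignment, tree and XML via PAGAN
wf.heatmap()        # BIOM + tree   -> phylogenetic heatmap PDF
# wf.label(), wf.wasabi(), wf.summary(), wf.showcounts() are optional extras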