Example #1
    def preprocess(self) :
        if len(self.options['input-files']) == 0 :
            self.log.error("nothing to do")
            sys.exit(1)

        self.seqdb = SequenceDB(preprocessed=False)

        p = Progress("Preprocessing", len(self.options['input-files']))
        p.start()

        samples = []

        for f in self.__get_files(self.options['input-files']) :
            mid = self.__mid_fastq(f)

            sample = Sample(f, 
                        self.options['outdir'],
                        self.seqdb, 
                        self.__filters(mid),
                        chimeras=self.options['chimeras'])

            sample.print_sample()
            samples.append(sample)

            p.increment()

        p.end()

        rejected_reads = sum([ sum(s.filters.counts) for s in samples ])
        accepted_reads = sum([ len(s) for s in samples ])
        unique_seq = sum([ len(s.seqcounts) for s in samples ])

        print "processed %s reads, accepted %d (of which %d are unique)" % \
                (rejected_reads + accepted_reads, accepted_reads, unique_seq)

        if samples :
            summary_data = self.__build_summary(samples)

            summary_file = Summary(self.options['summary-file'])
            summary_file.update(summary_data)
            summary_file.write(self.options['summary-file'])


        if self.options['verbose'] :
            self.summary()

        return 0
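
A minimal usage sketch for this step, assuming WorkFlow is constructed from a plain options dict as in Example #3. The key names below are the ones preprocess() and its helpers read; the file names and values are purely illustrative, not project defaults.

# Hypothetical driver for the preprocessing step; option keys are taken from
# the code above, values are placeholders.
options = {
    'input-files'  : ['run1.sff', 'run2.fastq'],   # raw inputs, .sff or fastq
    'outdir'       : 'seance_out',                 # where .sample files are written
    'chimeras'     : False,                        # skip chimera detection
    'summary-file' : 'seance_out/summary.csv',
    'verbose'      : True,
    # ... plus the read-filtering keys consumed by __filters() and __get_files():
    # 'midlength', 'miderrors', 'forwardprimer', 'primererrors', 'clipprimers',
    # 'length', 'removeambiguous', 'denoise', 'maxhomopolymer',
    # 'quality-method', 'quality', 'windowlength'
}

wf = WorkFlow(options)
wf.preprocess()     # writes one .sample file per input plus the summary CSV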
Example #2
    def cluster(self) :
        # rebuild the database from preprocessed samples in outdir
        self.seqdb = SequenceDB(preprocessed=False) # setting this to True causes things to be overwritten if we merge multiple preprocessing steps
        samples = self.__preprocessed_samples()
        
        if self.seqdb.num_sequences() == 0 :
            self.log.error("no sequences loaded")
            sys.exit(1)

        # to prove the rebuild of the database is the same
        #for sample in samples :
        #    sample.print_sample(extension=".rebuild")
        
        # get keys of sequences we want to cluster
#        input_keys = self.__get_cluster_input(samples,
#                                self.options['total-duplicate-threshold'],
#                                self.options['sample-threshold'],
#                                self.options['duplicate-threshold'])

        input_keys, singleton_keys = self.__get_cluster_input2(samples,
                                    self.options['total-duplicate-threshold'],
                                    self.options['duplicate-threshold'])

        # get some info
#        num_reads = sum([ self.seqdb.get(i).duplicates for i in input_keys ])
        num_reads = sum([ self.seqdb.get(i).duplicates for i in input_keys.keys() ])
        self.log.info("clustering %d/%d (%.2f%%) sequences (%d/%d (%.2f%%) reads)" % \
                        (len(input_keys), self.seqdb.num_sequences(), \
                        len(input_keys) * 100 / float(self.seqdb.num_sequences()), \
                        num_reads, self.seqdb.num_reads(), \
                        num_reads * 100 / float(self.seqdb.num_reads())))

        # clustering
        c = Cluster(self.seqdb, self.options['otu-similarity'], self.options['verbose'])
#        c.create_clusters(keys=input_keys, homopolymer_correction=not self.options['no-homopolymer-correction'])
        c.create_clusters2(keys=input_keys, 
                           homopolymer_correction=not self.options['no-homopolymer-correction'], 
                           singletons=singleton_keys,
                           sample_threshold=self.options['sample-threshold'])

        # output centroids to file
        # output biom file
        # run blast if necessary
        centroid_fname = self.options['cluster-fasta']
        biom_fname = self.options['cluster-biom']
        otu_names = {}

        # write everything out anyway in case labelling fails or is killed
        self.__fasta(centroid_fname, c.centroids(), names=otu_names)
        self.__biom(biom_fname, samples, c, otu_names)

        # blast to get better names
        if self.options['labels'] :
            print "getting OTU names (this may take a while)..."
            otu_names = BlastN(self.options['verbose']).get_names(
                            centroid_fname,
                            self.options['labels'],
                            self.options['labels-similarity'],
                            self.options['labels_db'])

            if self.options['labels'] == 'blast' and self.options['merge-blast-hits'] :
                c.merge(otu_names)

            # write out results files
            self.__fasta(centroid_fname, c.centroids(), names=otu_names)
            self.__biom(biom_fname, samples, c, otu_names)
        
        return 0
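
A matching sketch for the clustering step, extending the options dict sketched after Example #1. Again the keys are the ones cluster() reads, directly or via __preprocessed_samples(); the threshold and similarity values are placeholders, not project defaults.

# Hypothetical options for the clustering step; keys come from cluster() above.
options.update({
    'metadata'                  : None,     # or a path read by MetadataReader
    'total-duplicate-threshold' : 2,        # min duplicates to enter clustering
    'duplicate-threshold'       : 2,        # per-sample contamination cut-off
    'sample-threshold'          : 2,        # min samples a cluster must span
    'otu-similarity'            : 0.97,     # OTU identity threshold
    'no-homopolymer-correction' : False,
    'cluster-fasta'             : 'seance_out/clusters.fasta',
    'cluster-biom'              : 'seance_out/clusters.biom',
    'labels'                    : None,     # e.g. 'blast' to label centroids
})

wf = WorkFlow(options)
wf.cluster()        # writes the centroid FASTA and the BIOM table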
Example #3
class WorkFlow(object) :
    def __init__(self, options) :
        self.options = options
        self.log = logging.getLogger('seance')
        self.seqdb = None

    def __filters(self, mid) :
        mf = MultiFilter()

        ## if we have trimmed the forward primer, then the mid will be trimmed 
        ## off with it and this check will always fail
        #if not self.options['clipprimers'] :
        #    mf.add(MidFilter(mid, self.options['miderrors']))


        # always remove the mid, because denoising now leaves it in place
        mf.add(MidFilter(mid, self.options['miderrors']))

        # check primer, remove if requested
        if self.options['forwardprimer'] is not None :
            mf.add(PrimerFilter(self.options['forwardprimer'], self.options['primererrors'], self.options['clipprimers']))

        if self.options['length'] is not None :
            mf.add(LengthFilter(self.options['length']))

        if self.options['removeambiguous'] :
            mf.add(AmbiguousFilter())

        # homopolymer and quality filtering are already handled by denoising
        if self.options['denoise'] :
            return mf

        if self.options['maxhomopolymer'] > 0 :
            mf.add(HomopolymerFilter(self.options['maxhomopolymer']))

        qual = self.options['quality-method']

        if qual == 'min' :
            mf.add(MinimumQualityFilter(self.options['quality']))

        elif qual == 'average' :
            mf.add(AverageQualityFilter(self.options['quality']))

        elif qual == 'window' :
            mf.add(WindowedQualityFilter(self.options['quality'],
                                         self.options['windowlength']))

        return mf

    def __mid_sff(self, sff) :
        fastq = Sff2Fastq().run(sff, self.options['outdir'])
        mid = self.__mid_fastq(fastq)
        os.remove(fastq.get_filename())
        return mid

    def __mid_fastq(self, fastq) :
        #return GetMID(self.options['midlength']).run(fastq.get_filename())
        return GetMID2(self.options['midlength']).run(fastq)

    def __get_files(self, file_names) :
        for fname in file_names :
            root,ext = splitext(basename(fname))

            self.log.info("current file = %s" % fname)

            if ext == '.sff' :
                if os.path.exists(join(self.options['outdir'], basename(fname)+'.fasta.sample')) :
                    self.log.info("skipping %s (already preprocessed)" % fname)
                    continue

                sff = SffFile(fname)

                # annoyingly, mid is figured out twice if we are
                # doing denoising
                if self.options['denoise'] :
                    yield AmpliconNoise().run(sff,
                            self.options['outdir'],
                            self.options['forwardprimer'],
                            self.options['primererrors'],
                            self.__mid_sff(sff),
                            self.options['miderrors'], 
                            self.options['maxhomopolymer']
                            )
                else :
                    yield Sff2Fastq().run(sff, 
                        self.options['outdir'])

            else :
                yield FastqFile(fname)

    def preprocess(self) :
        if len(self.options['input-files']) == 0 :
            self.log.error("nothing to do")
            sys.exit(1)

        self.seqdb = SequenceDB(preprocessed=False)

        p = Progress("Preprocessing", len(self.options['input-files']))
        p.start()

        samples = []

        for f in self.__get_files(self.options['input-files']) :
            mid = self.__mid_fastq(f)

            sample = Sample(f, 
                        self.options['outdir'],
                        self.seqdb, 
                        self.__filters(mid),
                        chimeras=self.options['chimeras'])

            sample.print_sample()
            samples.append(sample)

            p.increment()

        p.end()

        rejected_reads = sum([ sum(s.filters.counts) for s in samples ])
        accepted_reads = sum([ len(s) for s in samples ])
        unique_seq = sum([ len(s.seqcounts) for s in samples ])

        print "processed %s reads, accepted %d (of which %d are unique)" % \
                (rejected_reads + accepted_reads, accepted_reads, unique_seq)

        if samples :
            summary_data = self.__build_summary(samples)

            summary_file = Summary(self.options['summary-file'])
            summary_file.update(summary_data)
            summary_file.write(self.options['summary-file'])


        if self.options['verbose'] :
            self.summary()

        return 0

    def __preprocessed_samples(self) :
        if self.options['metadata'] is None :
            tmp = []
            for sample in glob(join(self.options['outdir'], '*.sample')) :
                md = SampleMetadata()
                md.defaults()
                s = basename(sample)
                md['file'] = s[:s.find('.')]
                tmp.append(MetadataSample(FastqFile(sample), self.options['outdir'], self.seqdb, md))
            return tmp

        mdr = MetadataReader(self.options['metadata'])
        mdr.process()

        tmp = []
        md_used = []

        for sample in glob(join(self.options['outdir'], '*.sample')) :
            md = mdr.get(basename(sample))

            if md is None :
                self.log.warn("metadata missing for %s ..." % basename(sample))
                md = SampleMetadata()
                md.defaults()
                s = basename(sample)
                md['file'] = s[:s.find('.')]
                #self.log.warn("skipping %s, metadata missing..." % basename(sample))
                #continue

            tmp.append(MetadataSample(FastqFile(sample), self.options['outdir'], self.seqdb, md))
            md_used.append(md['file'])

        # warn about metadata entries with no matching preprocessed file
        for i in mdr.metadata.keys() :
            if i not in md_used :
                self.log.warn("preprocessed file for %s not found" % i)

        return sorted(tmp)

    def __get_cluster_input(self, samples, duplicate_threshold, sample_threshold, contamination_threshold) :
        ref_count = collections.Counter()
        cluster_input_keys = set()

        for sample in samples :
            sample.remove_less_than(contamination_threshold)

        # first collect keys for all sequences that meet the duplicate threshold
        for sample in samples :
            for key,freq in sample.seqcounts.most_common() :
                if self.seqdb.get(key).duplicates >= duplicate_threshold :
                    ref_count[key] += 1

        # then for these keys see how many samples they occurred in
        for key,freq in ref_count.most_common() :
            if freq < sample_threshold :
                break

            cluster_input_keys.add(key)

        # some samples contain reads you would not necessarily expect to see 
        # in any other samples (control samples) so add these separately, but
        # still respect duplicate_threshold
        for sample in samples :
            if sample.metadata['allow-singletons'] :
                count = 0

                for key,freq in sample.seqcounts.most_common() :
                    if self.seqdb.get(key).duplicates >= duplicate_threshold :
                        if key not in cluster_input_keys :
                            cluster_input_keys.add(key)
                            count += 1

                self.log.info("allowing singletons for (%s) - added %d sequences" % \
                        (sample.description(), count))

        return list(cluster_input_keys)

    def __get_cluster_input2(self, samples, duplicate_threshold, contamination_threshold) :
        seq2samp = collections.defaultdict(list)
        singletons = []

        for sample in samples :
            sample.remove_less_than(contamination_threshold)

        # first collect keys for all sequences that meet the duplicate threshold
        for index, sample in enumerate(samples) :
            is_singleton = sample.metadata['allow-singletons']

            for key,freq in sample.seqcounts.most_common() :
                if is_singleton or (self.seqdb.get(key).duplicates >= duplicate_threshold) :
                    seq2samp[key].append(index)

                if is_singleton :
                    singletons.append(key)

        return seq2samp, singletons

    def cluster(self) :
        # rebuild the database from preprocessed samples in outdir
        self.seqdb = SequenceDB(preprocessed=False) # setting this to True causes things to be overwritten if we merge multiple preprocessing steps
        samples = self.__preprocessed_samples()
        
        if self.seqdb.num_sequences() == 0 :
            self.log.error("no sequences loaded")
            sys.exit(1)

        # to prove the rebuild of the database is the same
        #for sample in samples :
        #    sample.print_sample(extension=".rebuild")
        
        # get keys of sequences we want to cluster
#        input_keys = self.__get_cluster_input(samples,
#                                self.options['total-duplicate-threshold'],
#                                self.options['sample-threshold'],
#                                self.options['duplicate-threshold'])

        input_keys, singleton_keys = self.__get_cluster_input2(samples,
                                    self.options['total-duplicate-threshold'],
                                    self.options['duplicate-threshold'])

        # get some info
#        num_reads = sum([ self.seqdb.get(i).duplicates for i in input_keys ])
        num_reads = sum([ self.seqdb.get(i).duplicates for i in input_keys.keys() ])
        self.log.info("clustering %d/%d (%.2f%%) sequences (%d/%d (%.2f%%) reads)" % \
                        (len(input_keys), self.seqdb.num_sequences(), \
                        len(input_keys) * 100 / float(self.seqdb.num_sequences()), \
                        num_reads, self.seqdb.num_reads(), \
                        num_reads * 100 / float(self.seqdb.num_reads())))

        # clustering
        c = Cluster(self.seqdb, self.options['otu-similarity'], self.options['verbose'])
#        c.create_clusters(keys=input_keys, homopolymer_correction=not self.options['no-homopolymer-correction'])
        c.create_clusters2(keys=input_keys, 
                           homopolymer_correction=not self.options['no-homopolymer-correction'], 
                           singletons=singleton_keys,
                           sample_threshold=self.options['sample-threshold'])

        # output centroids to file
        # output biom file
        # run blast if necessary
        centroid_fname = self.options['cluster-fasta']
        biom_fname = self.options['cluster-biom']
        otu_names = {}

        # write everything out anyway in case labelling fails or is killed
        self.__fasta(centroid_fname, c.centroids(), names=otu_names)
        self.__biom(biom_fname, samples, c, otu_names)

        # blast to get better names
        if self.options['labels'] :
            print "getting OTU names (this may take a while)..."
            otu_names = BlastN(self.options['verbose']).get_names(
                            centroid_fname,
                            self.options['labels'],
                            self.options['labels-similarity'],
                            self.options['labels_db'])

            if self.options['labels'] == 'blast' and self.options['merge-blast-hits'] :
                c.merge(otu_names)

            # write out results files
            self.__fasta(centroid_fname, c.centroids(), names=otu_names)
            self.__biom(biom_fname, samples, c, otu_names)
        
        return 0

    def label(self) :
        self.seqdb = self.__read_fasta(self.options['cluster-fasta'])
        blast_fname = self.options['cluster-fasta']

        # if we are only going to label the clusters without labels
        # then we need to find the names of those clusters and write a 
        # fasta file containing only those sequences
        if self.options['label-missing'] :
            tmp = []
            biom = json.load(open(self.options['cluster-biom']))
            for r in biom['rows'] :
                if r['metadata']['label'] in ("", "unknown", "error", "cannot label (matches multiple domains!)") :
                    tmp.append(r['id'])

            if len(tmp) == 0 :
                self.log.error("there are no missing labels")
                exit(1)

            self.log.info("%d clusters missing labels" % len(tmp))
            blast_fname = self.__fasta(join(self.options['outdir'], 'missing.fasta'), tmp)


        print "getting OTU names (this may take a while)..." 
        otu_names = BlastN(self.options['verbose']).get_names(
                        blast_fname,
                        self.options['labels'],
                        self.options['labels-similarity'],
                        self.options['labels_db'])

        # rework the biom
        biom = BiomFile()
        biom.change_otu_names(self.options['cluster-biom'], otu_names)
        self.log.info("written %s" % self.options['cluster-biom'])

        # get the rest of the names and rewrite fasta
        otu_names = biom.get_label_mapping(self.options['cluster-biom'])
        self.__fasta(self.options['cluster-fasta'], self.seqdb.keys(), names=otu_names)

        return 0

    def __fasta(self, filename, keys, names=None) :
        f = open(filename, 'w')
        
        if names is None :
            for key in keys :
                print >> f, self.seqdb.get(key).fasta()
        else :
            for key in keys :
                s = self.seqdb.get(key)

                if isinstance(key, int) :
                    str_key = "seance%d" % key
                else :
                    str_key = str(key)

                print >> f, ">%s %s" % (str_key, names.get(str_key, "unknown"))
                print >> f, s.sequence

        f.close()
        self.log.info("written %s" % filename)

        return filename

    def __read_fasta(self, filename, include=None) :
        tmp = {}

        if include :
            include = include.lower()

        f = FastqFile(filename)
        f.open()

        for seq in f :
            if include :
                if include not in seq.id.lower() :
                    continue

            seq.id = seq.id.split()[0][1:]

            #if only_include :
            #    if seq.id not in only_include :
            #        continue
    
            tmp[seq.id] = seq

        f.close()

        self.log.info("read %d centroid sequences" % len(tmp))

        return tmp

    def __biom(self, filename, samples, clustering, cluster_names) :
        centroids = clustering.centroids()
        all_keys = clustering.all()

        output_clusters = clustering.clusters
        output_samples = [ s for s in samples if s.contains(all_keys) ]
        output_otus = [ ("seance" + str(k), cluster_names.get("seance" + str(k), "unknown")) for k in centroids ]

        #self.log.info("%d / %d samples have at least one sequence used in clustering" % \
        #        (len(output_samples), len(samples)))

        b = BiomFile()
        b.set_samples(output_samples)
        b.set_otus(output_otus)

        for sind,sample in enumerate(output_samples) :
            for cind,cluster in enumerate(output_clusters) :
                count = 0

                for read in cluster :
                    if read in sample :
                        count += sample.seqcounts[read]

                b.add_quantity(cind, sind, count)

        b.write_to(filename)
        self.log.info("written %s" % filename)

    def phylogeny(self) :
        num_sequences = self.__count(self.options['cluster-fasta'])
        p = Pagan()

        if self.options['subset'] :
            self.seqdb = self.__read_fasta(self.options['cluster-fasta'], include=self.options['subset'])
            self.options['cluster-fasta'] = self.__fasta(self.options['cluster-fasta'] + '.subset', self.seqdb.keys())

            self.log.info("read %d cluster centroids using subset(%s)" % (len(self.seqdb), self.options['subset']))

        if not self.options['silva-fasta'] :
            self.log.info("aligning %s sequences with PAGAN ..." % (num_sequences))
            alignment,tree,xmlfile = p.phylogenetic_alignment(self.options['cluster-fasta'])
        else :
            self.log.info("aligning %s sequences with PAGAN against SILVA ..." % (num_sequences))
            alignment,tree,xmlfile = p.silva_phylogenetic_alignment(self.options['silva-fasta'], 
                                                                    self.options['silva-tree'], 
                                                                    self.options['cluster-fasta'])

        os.rename(alignment, self.options['phylogeny-fasta'])
        os.rename(tree,      self.options['phylogeny-tree'])
        os.rename(xmlfile,   self.options['phylogeny-xml'])

        self.log.info("created %s" % self.options['phylogeny-fasta'])
        self.log.info("created %s" % self.options['phylogeny-tree'])
        
        if xmlfile :
            self.log.info("created %s" % self.options['phylogeny-xml'])

        return 0

    def __count(self, fasta) :
        fq = FastqFile(fasta)
        fq.open()

        count = 0
        for seq in fq :
            count += 1

        fq.close()
        return count

    def heatmap(self) :
        self.log.info("creating heatmap using %s and %s" % (self.options['cluster-biom'], self.options['phylogeny-tree']))

        if self.options['heatmap-no-tree'] :
            self.options['phylogeny-tree'] = None
        
        if self.options['phylogeny-tree'] is not None and not exists(self.options['phylogeny-tree']) :
            self.log.warn("%s does not exist, drawing heatmap without tree" % self.options['phylogeny-tree'])
            self.options['phylogeny-tree'] = None

        phylogenetic_heatmap(self.options['cluster-biom'], 
                             tree=self.options['phylogeny-tree'], 
                             output=self.options['heatmap-pdf'],
                             str_include=self.options['subset'],
                             count_include=self.options['min-bin-count'],
                             output_tree=self.options['heatmap-out-tree'],
                             flip_tree=self.options['heatmap-flip-tree'],
                             scale=self.options['heatmap-scale'],
                             tree_height_blocks=self.options['heatmap-tree-height'],
                             label_clips=self.options['heatmap-label-clip'],
                             label_tokens=self.options['heatmap-label-tokens'],
                             ladderise=self.options['heatmap-ladderise'])

        self.log.info("wrote %s" % self.options['heatmap-pdf'])
        print "wrote %s" % self.options['heatmap-pdf']
        return 0

    def wasabi(self) :
        return view_in_wasabi(self.options['phylogeny-xml'], 
                              basename(self.options['outdir']), 
                              self.options['wasabi-url'],
                              self.options['wasabi-user'])

    def __build_summary(self, samples) :
        data = collections.defaultdict(dict)

        for s in samples :
            filename = s.fastq.get_filename()
            filename = basename(filename[:filename.rfind(".")])

            for name,count in s.filters.filter_counts() :
                data[filename][name] = count

            if self.options['chimeras'] :
                data[filename]['Chimera'] = sum([ s.seqcounts[i] for i in s.chimeras ])
            
            data[filename]['Accepted'] = len(s)
            data[filename]['Unique'] = len([ i for i in s.seqcounts if i not in s.chimeras ])

        return data

    def summary(self) :
        header_constant = 4

        with open(self.options['summary-file']) as f :
            max_length = max([ len(line.split(',')[0]) for line in f ])

        with open(self.options['summary-file']) as f :
            header = f.readline().rstrip().split(',')
            field_lengths = [ len(i) + header_constant for i in header ]
            fmt_str = "%s"

            for i in field_lengths[1:] :
                fmt_str += ("%%%ds" % i)
            
            x = max_length - len("filename")
            print (" " * x) + (fmt_str % tuple(header))
            print ""

            for line in f :
                tmp = line.rstrip().split(',')

                if tmp[0] == 'Totals' :
                    print ""

                x = max_length - len(tmp[0])
                print (" " * x) + (fmt_str % tuple(tmp))
            
            print ""

        return 0

    def showcounts(self) :
        def get_ids(biom_obj, element) :
            return [ i['id'].encode('ascii', 'ignore') for i in biom_obj[element] ]

        delim = self.options['delimiter']
        
        # read in
        biom = json.load(open(self.options['cluster-biom']))
        rows = get_ids(biom, 'rows')
        cols = get_ids(biom, 'columns')

        data = dict([ ((int(r),int(c)),int(q)) for r,c,q in biom['data']])

        id2label = dict([ (i['id'], i['metadata']['label']) for i in biom['rows'] if i['metadata']['label'] ])

        # output
        print delim.join([""] + cols)

        for r_index,r_id in enumerate(rows) :
            tmp = [r_id + "_" + id2label[r_id]]

            for c_index,c_id in enumerate(cols) :
                tmp.append(data.get((r_index,c_index), 0))

            print delim.join([str(i) for i in tmp])

        return 0

    def showlabels(self) :
        delim = self.options['delimiter']
        
        #biom = json.load(open(self.options['cluster-biom']))
        #
        #for r in biom['rows'] :
        #    print delim.join([r['id'], r['metadata']['label']])

        biom = BiomFile()
        labels = biom.get_label_mapping(self.options['cluster-biom'])

        for x in labels.iteritems() :
            print delim.join(x)

        return 0
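
Taken together, the public methods form a small pipeline in which each stage reads the files the previous stage wrote. A plausible end-to-end driver, assuming an options dict populated with all of the keys referenced above (see the sketches after Examples #1 and #2); the ordering follows the data each stage consumes and produces.

# Hypothetical end-to-end run; each stage consumes the previous stage's output
# (.sample files, cluster FASTA/BIOM, phylogeny tree).
import logging
logging.basicConfig(level=logging.INFO)   # make the 'seance' logger visible

wf = WorkFlow(options)
wf.preprocess()     # raw reads     -> per-sample .sample files + summary CSV
wf.cluster()        # .sample files -> OTU centroid FASTA + BIOM table
wf.phylogeny()      # centroid FASTA -> alignment, tree and XML via PAGAN
wf.heatmap()        # BIOM + tree   -> phylogenetic heatmap PDF
# wf.label(), wf.wasabi(), wf.summary(), wf.showcounts() are optional extras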