def label(self):
    """
    Label clusters via a BLAST name lookup and write the labels back to
    the cluster biom and cluster fasta files.

    Options read from ``self.options``: 'cluster-fasta', 'cluster-biom',
    'label-missing', 'outdir', 'verbose', 'labels', 'labels-similarity'
    and 'labels_db'.

    When 'label-missing' is set, only clusters whose current biom label
    is empty or one of the placeholder values are searched; their ids
    are written to ``<outdir>/missing.fasta`` through ``self.__fasta``
    and the value that call returns is used as the BLAST input
    (presumably the path of the file written -- confirm against
    ``__fasta``).

    Returns 0 on completion.  Raises SystemExit(1) when 'label-missing'
    is set but no cluster is missing a label.
    """
    self.seqdb = self.__read_fasta(self.options['cluster-fasta'])
    blast_fname = self.options['cluster-fasta']

    # if we are only going to label the clusters without labels
    # then we need to find the names of those clusters and write a
    # fasta file containing only those sequences
    if self.options['label-missing']:
        unlabelled = ("", "unknown", "error", "cannot label (matches multiple domains!)")

        # context manager so the biom file handle is closed as soon as
        # the JSON has been parsed
        with open(self.options['cluster-biom']) as handle:
            table = json.load(handle)

        missing = [row['id'] for row in table['rows']
                   if row['metadata']['label'] in unlabelled]

        if not missing:
            self.log.error("there are no missing labels")
            # same exception the site-provided exit() raises, without
            # depending on the site module being loaded
            raise SystemExit(1)

        self.log.info("%d clusters missing labels" % len(missing))
        blast_fname = self.__fasta(join(self.options['outdir'], 'missing.fasta'), missing)

    # single-argument parenthesised form prints identically on Python 2
    print("getting OTU names (this may take a while)...")
    otu_names = BlastN(self.options['verbose']).get_names(blast_fname,
                                                          self.options['labels'],
                                                          self.options['labels-similarity'],
                                                          self.options['labels_db'])

    # rework the biom
    biom = BiomFile()
    biom.change_otu_names(self.options['cluster-biom'], otu_names)
    self.log.info("written %s" % self.options['cluster-biom'])

    # get the rest of the names and rewrite fasta: the mapping is re-read
    # from the biom file so it is not limited to the names just looked up
    otu_names = biom.get_label_mapping(self.options['cluster-biom'])
    self.__fasta(self.options['cluster-fasta'], self.seqdb.keys(), names=otu_names)

    return 0
def __biom(self, filename, samples, clustering, cluster_names):
    """
    Build a BiomFile of per-sample, per-cluster read counts and write it
    to ``filename``.

    filename      -- destination passed to ``BiomFile.write_to``
    samples       -- iterable of sample objects; only those for which
                     ``sample.contains(clustering.all())`` is true are kept
    clustering    -- object providing ``centroids()``, ``all()`` and a
                     ``clusters`` attribute
    cluster_names -- mapping from "seance<key>" to a label; centroids
                     without an entry are labelled "unknown"
    """
    centroid_keys = clustering.centroids()
    clustered_reads = clustering.all()
    clusters = clustering.clusters

    kept_samples = [sample for sample in samples if sample.contains(clustered_reads)]

    # one (otu id, label) pair per centroid
    otus = []
    for key in centroid_keys:
        otu_id = "seance" + str(key)
        otus.append((otu_id, cluster_names.get(otu_id, "unknown")))

    table = BiomFile()
    table.set_samples(kept_samples)
    table.set_otus(otus)

    for sample_index, sample in enumerate(kept_samples):
        for cluster_index, cluster in enumerate(clusters):
            # total count, within this sample, of every read in the cluster
            abundance = sum(sample.seqcounts[read] for read in cluster if read in sample)
            table.add_quantity(cluster_index, sample_index, abundance)

    table.write_to(filename)
    self.log.info("written %s" % filename)
def showlabels(self):
    """
    Print the label mapping of the cluster biom file, one entry per line.

    Each line is the mapping's key and value joined by
    ``self.options['delimiter']``; the mapping comes from
    ``BiomFile.get_label_mapping`` on ``self.options['cluster-biom']``.

    Returns 0.
    """
    separator = self.options['delimiter']
    mapping = BiomFile().get_label_mapping(self.options['cluster-biom'])

    for pair in mapping.iteritems():
        # pair is a (key, value) tuple, joined directly as in a two-column row
        print(separator.join(pair))

    return 0