Пример #1
0
    def write(self,blastMap=None, transcriptMin=9, transcriptMax=1000,outFile="genesets.gmt"):
        """
        Write one gene set per cluster to a tab-delimited *.gmt file.

        Each row holds the set name ("gs-<cluster id>"), a description and the
        set members.  The description comes from self.get_description() when
        self.gene2go is non-empty, otherwise it is the literal "kegg pathway".
        Without a blastMap the members are the cluster's genes; with a blastMap
        they are the transcripts mapped to those genes, and a *.csv file with
        the gene -> transcript mapping is written next to the *.gmt file.

        blastMap: BlastMap returned after loading summary file in BlastMapper
        transcriptMin: minimum size for a gene set (inclusive)
        transcriptMax: maximum size for a gene set (inclusive)
        outFile: outfile path (*.gmt)

        Clusters whose member count falls outside [transcriptMin, transcriptMax]
        are skipped and their genes are counted as failing the threshold.
        """

        print("---------------------")
        if self.gene2go:
            print('There are %s genes with at least one annotation'%(len(self.gene2go.keys())))
        print('There are %s genes in the labels file'%(len(self.genes)))

        gene2transcript = None
        if blastMap:
            bm = BlastMapper()
            bmGenes = bm.print_summary(blastMap)
            gene2transcript = bm.get_gene_dict(blastMap)
            if self.gene2go:
                usableGenes = list(set(bmGenes).intersection(set(self.gene2go.keys())))
            else:
                usableGenes = bmGenes

            print('There are %s genes with at least one BLAST hit'%(len(bmGenes)))
            print('There are %s genes that have both a BLAST hit and an annotation'%(len(usableGenes)))

        ## strips one trailing ".<digit>" version suffix (e.g. ".1"); longer
        ## suffixes such as ".12" are left untouched, as before
        versionSuffix = re.compile(r"\.[0-9]$")

        failedThreshold = 0
        mapHandle = None

        ## prepare outfiles (closed in the finally clause, even on error)
        ## NOTE: on Python 3 the csv docs recommend newline=''; it is left out
        ## here because that keyword is not accepted by Python 2's open()
        gmtHandle = open(outFile,'w')
        try:
            writer = csv.writer(gmtHandle,delimiter="\t")

            if blastMap:
                outFileMap = re.sub(r"\.gmt",".csv",outFile)
                ## never let the mapping file overwrite the gene set file
                if outFileMap == outFile:
                    outFileMap = outFile + ".csv"
                mapHandle = open(outFileMap,'w')
                writerMap = csv.writer(mapHandle)
                writerMap.writerow(["gene_set","gene_id","mapped_transcripts"])

            ## save gene sets to file
            for _k in self.allClusters:
                clusterInds = np.where(self.labels==_k)[0]
                clusterGenes = self.genes[clusterInds]
                gsName = "gs-"+str(_k)
                if self.gene2go:
                    description = self.get_description(clusterGenes)
                else:
                    description = "kegg pathway"

                ## map the genes
                if blastMap:
                    mappedSet = set()
                    for gene in clusterGenes:
                        if gene not in gene2transcript:
                            continue
                        ## de-duplicate after dropping the version suffix;
                        ## sorted so the output is reproducible between runs
                        geneTranscripts = sorted(set(versionSuffix.sub("",g) for g in gene2transcript[gene]))
                        writerMap.writerow([gsName,gene,";".join(geneTranscripts)])
                        mappedSet.update(geneTranscripts)
                    mapped = sorted(mappedSet)
                elif isinstance(clusterGenes,np.ndarray):
                    ## list + ndarray would broadcast instead of concatenating
                    mapped = clusterGenes.tolist()
                else:
                    mapped = list(clusterGenes)

                if transcriptMin <= len(mapped) <= transcriptMax:
                    writer.writerow([gsName,description] + mapped)
                else:
                    failedThreshold+=clusterGenes.size
        finally:
            gmtHandle.close()
            if mapHandle is not None:
                mapHandle.close()

        print("-----------------")
        print('Total clusters: %s '%self.allClusters.size)
        totalGenes = self.genes.size
        passedGenes = totalGenes - failedThreshold
        ## guard against an empty labels file (no genes at all)
        percentAccepted = 100.0 * passedGenes / totalGenes if totalGenes else 0.0
        print("genes pass threshold %s/%s (%s%%)"%(passedGenes,totalGenes,round(percentAccepted,2)))
Пример #2
0
    def write(self,
              blastMap=None,
              transcriptMin=9,
              transcriptMax=1000,
              outFile="genesets.gmt"):
        """
        Write one gene set per cluster to a tab-delimited *.gmt file.

        Each row holds the set name ("gs-<cluster id>"), a description and the
        set members.  The description comes from self.get_description() when
        self.gene2go is non-empty, otherwise it is the literal "kegg pathway".
        Without a blastMap the members are the cluster's genes; with a blastMap
        they are the transcripts mapped to those genes, and a *.csv file with
        the gene -> transcript mapping is written next to the *.gmt file.

        blastMap: BlastMap returned after loading summary file in BlastMapper
        transcriptMin: minimum size for a gene set (inclusive)
        transcriptMax: maximum size for a gene set (inclusive)
        outFile: outfile path (*.gmt)

        Clusters whose member count falls outside [transcriptMin, transcriptMax]
        are skipped and their genes are counted as failing the threshold.
        """

        print("---------------------")
        if self.gene2go:
            print('There are %s genes with at least one annotation' %
                  (len(self.gene2go.keys())))
        print('There are %s genes in the labels file' % (len(self.genes)))

        gene2transcript = None
        if blastMap:
            bm = BlastMapper()
            bmGenes = bm.print_summary(blastMap)
            gene2transcript = bm.get_gene_dict(blastMap)
            if self.gene2go:
                usableGenes = list(
                    set(bmGenes).intersection(set(self.gene2go.keys())))
            else:
                usableGenes = bmGenes

            print('There are %s genes with at least one BLAST hit' %
                  (len(bmGenes)))
            print(
                'There are %s genes that have both a BLAST hit and an annotation'
                % (len(usableGenes)))

        ## strips one trailing ".<digit>" version suffix (e.g. ".1"); longer
        ## suffixes such as ".12" are left untouched, as before
        versionSuffix = re.compile(r"\.[0-9]$")

        failedThreshold = 0
        mapHandle = None

        ## prepare outfiles (closed in the finally clause, even on error)
        ## NOTE: on Python 3 the csv docs recommend newline=''; it is left out
        ## here because that keyword is not accepted by Python 2's open()
        gmtHandle = open(outFile, 'w')
        try:
            writer = csv.writer(gmtHandle, delimiter="\t")

            if blastMap:
                outFileMap = re.sub(r"\.gmt", ".csv", outFile)
                ## never let the mapping file overwrite the gene set file
                if outFileMap == outFile:
                    outFileMap = outFile + ".csv"
                mapHandle = open(outFileMap, 'w')
                writerMap = csv.writer(mapHandle)
                writerMap.writerow(
                    ["gene_set", "gene_id", "mapped_transcripts"])

            ## save gene sets to file
            for _k in self.allClusters:
                clusterInds = np.where(self.labels == _k)[0]
                clusterGenes = self.genes[clusterInds]
                gsName = "gs-" + str(_k)
                if self.gene2go:
                    description = self.get_description(clusterGenes)
                else:
                    description = "kegg pathway"

                ## map the genes
                if blastMap:
                    mappedSet = set()
                    for gene in clusterGenes:
                        if gene not in gene2transcript:
                            continue
                        ## de-duplicate after dropping the version suffix;
                        ## sorted so the output is reproducible between runs
                        geneTranscripts = sorted(
                            set(
                                versionSuffix.sub("", g)
                                for g in gene2transcript[gene]))
                        writerMap.writerow(
                            [gsName, gene, ";".join(geneTranscripts)])
                        mappedSet.update(geneTranscripts)
                    mapped = sorted(mappedSet)
                else:
                    mapped = clusterGenes

                ## list + ndarray would broadcast instead of concatenating
                if isinstance(mapped, np.ndarray):
                    mapped = mapped.tolist()

                if transcriptMin <= len(mapped) <= transcriptMax:
                    writer.writerow([gsName, description] + mapped)
                else:
                    failedThreshold += clusterGenes.size
        finally:
            gmtHandle.close()
            if mapHandle is not None:
                mapHandle.close()

        print("-----------------")
        print('Total clusters: %s ' % self.allClusters.size)
        totalGenes = self.genes.size
        passedGenes = totalGenes - failedThreshold
        ## guard against an empty labels file (no genes at all)
        if totalGenes:
            percentAccepted = 100.0 * passedGenes / totalGenes
        else:
            percentAccepted = 0.0
        print("genes pass threshold %s/%s (%s%%)" %
              (passedGenes, totalGenes, round(percentAccepted, 2)))