Example #1
0
    def getDomains(self, sparkContext):
        """Run Pfam domain prediction over the source sequences with Spark.

        Parameters
        ----------
        sparkContext : pyspark.SparkContext
            Context used to parallelize the source sequences.

        Returns
        -------
        dict
            Mapping of sequence identifier to the processed domain output,
            as produced by ``self.processDomainOutput``.

        Raises
        ------
        ValueError
            If ``self.source_type`` is neither fasta nor genbank.
        """
        # recover the species name for using in temp files
        self.species = Utils.getSpecies(self.source_path)
        domainFinder = DomainFinder.DomainFinder()

        # load source sequences into a single list of (id, sequence) entries
        if "fasta" in self.source_type:
            _, file_content = Parsers.parseFastaToList(self.source_path, "")
        elif "genbank" in self.source_type:
            # BUG FIX: the original assigned this to a throwaway name (`list`)
            # and left `file_content` undefined, so the parallelize() call
            # below raised NameError for genbank input.
            # NOTE(review): assumes genBankToAminoacid yields entries of the
            # same shape as parseFastaToList's file_content — TODO confirm.
            file_content = Parsers.genBankToAminoacid(self.source_path)
        else:
            # Fail fast instead of hitting a NameError further down.
            raise ValueError('Unsupported source type: %s' % self.source_type)

        print('Processing domains...')

        # create RDD with source sequences
        sourceRDD = sparkContext.parallelize(file_content, numSlices=2000)

        if "nucleotide" in self.source_type:
            # execute sixFrame translation for each sequence in RDD
            sourceRDD = sourceRDD.map(lambda x: SixFrameTranslator.main(x))

        # execute Pfam domain prediction for each sixFrame translation in RDD
        domainsRDD = sourceRDD.map(lambda x: domainFinder.main(x[0], x[1]))
        processedRDD = domainsRDD.map(
            lambda x: self.processDomainOutput(x[0], x[1]))

        # recover Pfam domain prediction results from RDD as {id: output}
        result = processedRDD.collectAsMap()

        print('Done!')

        return result
Example #2
0
    def createGoDataset(self):
        """Generate one ``.go`` file of GO terms per source FASTA file.

        Parses the FASTA files under ``self.source_path``, skips those whose
        ``.go`` result already exists in ``self.result_path``, runs a BLAST
        "goTerms" task over the remainder via Spark, and writes one result
        file per input that yielded GO terms. Prints a summary of how many
        files were written and how many had no GO terms.
        """
        source_type = self.config.get('dataPipeline', 'source.type')
        # NOTE(review): the original also read blaster/blastdb.path and
        # blastdb.name to build an unused `blastMapping` path; that dead
        # computation was removed.

        datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                             source_path=self.source_path,
                                             result_path=self.result_path)

        # load sequences, then drop entries whose .go result already exists
        _, file_content = Parsers.parseFastaToList(self.source_path, "")
        file_content = [
            content for content in file_content if not os.path.isfile(
                self.result_path +
                os.path.basename(content[0]).replace('.fasta', '.go'))
        ]

        sparkContext = SparkContext(conf=datapipe.initSpark("goDataset"))
        goterms = datapipe.getBLAST(file_content,
                                    sparkContext,
                                    blastTask="goTerms")

        count = 0
        notFound = 0
        for source_file, content in goterms.items():

            lines = content.split('\n')
            # a header line followed by an empty line means no GO terms found
            if len(lines) == 2 and not lines[1]:
                notFound += 1
            else:
                filename = os.path.basename(source_file)
                resultFile = self.result_path + filename
                resultFile = resultFile.replace('.fasta', '.go')
                Utils.writeFile(resultFile, content)
                count += 1

        print('Done generating',
              str(count), 'GO term files. \nNo GO terms found for',
              str(notFound), 'files.')