def getDomains(self, sparkContext):
    """Run Pfam domain prediction over the source sequences with Spark.

    Sets ``self.species`` from ``self.source_path``, loads the source
    entries, distributes them as an RDD, optionally applies the six-frame
    translation (when ``self.source_type`` contains ``"nucleotide"``), runs
    the domain finder on every entry and post-processes each output with
    ``self.processDomainOutput``.

    Args:
        sparkContext: the Spark context used to parallelize the entries.

    Returns:
        The mapping obtained from ``collectAsMap()`` on the RDD of
        ``processDomainOutput`` results.

    Raises:
        ValueError: if no entries could be loaded for ``self.source_type``
            (only sources whose type contains ``"fasta"`` provide the
            entries that are handed to Spark).
    """
    # recover the species name for using in temp files
    self.species = Utils.getSpecies(self.source_path)
    domainFinder = DomainFinder.DomainFinder()

    # load source sequences; start from None so a source type that does not
    # provide entries is reported clearly instead of failing later with an
    # unbound local variable
    file_content = None
    if "fasta" in self.source_type:
        _, file_content = Parsers.parseFastaToList(self.source_path, "")
    elif "genbank" in self.source_type:
        # NOTE(review): the conversion is still run as before, but its return
        # value has never been passed on to Spark. Whether it has the same
        # entry layout as the FASTA parser output needs confirming before it
        # can be used as RDD input.
        Parsers.genBankToAminoacid(self.source_path)

    if file_content is None:
        raise ValueError(
            "getDomains: no sequences available for source type %r; "
            "only 'fasta' sources provide the entries processed by Spark"
            % (self.source_type,))

    print('Processing domains...')

    # create RDD with source sequences
    sourceRDD = sparkContext.parallelize(file_content, numSlices=2000)

    if "nucleotide" in self.source_type:
        # execute sixFrame translation for each sequence in RDD
        sourceRDD = sourceRDD.map(lambda x: SixFrameTranslator.main(x))

    # execute Pfam domain prediction for each entry in RDD; each entry is
    # indexed as a pair (x[0], x[1])
    domainsRDD = sourceRDD.map(lambda x: domainFinder.main(x[0], x[1]))
    processedRDD = domainsRDD.map(
        lambda x: self.processDomainOutput(x[0], x[1]))

    # recover Pfam domain prediction results from RDD
    result = processedRDD.collectAsMap()

    print('Done!')
    return result
def createGoDataset(self):
    """Generate one ``.go`` file per source FASTA entry that has GO terms.

    Entries whose ``.go`` file already exists under ``self.result_path``
    are skipped. The remaining entries are sent through
    ``DataPipeline.getBLAST`` with ``blastTask="goTerms"``; every result
    that contains GO terms is written next to the other results, and a
    summary of written / empty results is printed.

    Returns:
        None. Output is written through ``Utils.writeFile`` and ``print``.
    """
    source_type = self.config.get('dataPipeline', 'source.type')
    datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                         source_path=self.source_path,
                                         result_path=self.result_path)

    def goPathFor(fastaPath):
        # Single place that derives the output path, so the "already done"
        # check and the write below can never disagree. Only the file name
        # is rewritten; the result directory is left untouched.
        goName = os.path.basename(fastaPath).replace('.fasta', '.go')
        return self.result_path + goName

    _, file_content = Parsers.parseFastaToList(self.source_path, "")
    # entry[0] is used as the path of the source FASTA file
    pending = [entry for entry in file_content
               if not os.path.isfile(goPathFor(entry[0]))]

    sparkContext = SparkContext(conf=datapipe.initSpark("goDataset"))
    try:
        goterms = datapipe.getBLAST(pending, sparkContext,
                                    blastTask="goTerms")
    finally:
        # Always release the context created here, even if BLAST fails.
        # The results are consumed below through .items(), i.e. as a
        # driver-side mapping, so Spark is no longer needed.
        sparkContext.stop()

    count = 0
    notFound = 0
    for fastaPath, content in goterms.items():
        lines = content.split('\n')
        # Exactly two lines with an empty second one means nothing followed
        # the first line, which is counted as "no GO terms found".
        if len(lines) == 2 and not lines[1]:
            notFound += 1
            continue
        Utils.writeFile(goPathFor(fastaPath), content)
        count += 1

    print('Done generating', str(count),
          'GO term files. \nNo GO terms found for', str(notFound), 'files.')