def getDomains(self, sparkContext):
    """Run the domain-prediction pipeline over the source sequences on Spark.

    The sequences found at ``self.source_path`` are loaded, distributed as an
    RDD, six-frame translated when ``self.source_type`` contains
    "nucleotide", passed through ``DomainFinder.main`` and finally through
    ``self.processDomainOutput``.

    Side effect: ``self.species`` is set from ``Utils.getSpecies``.

    Args:
        sparkContext: object whose ``parallelize`` method is used to build
            the source RDD.

    Returns:
        The value of ``collectAsMap()`` on the processed RDD, i.e. the
        results of ``self.processDomainOutput`` gathered on the driver.

    Raises:
        ValueError: if ``self.source_type`` mentions neither "fasta" nor
            "genbank" (previously this surfaced as an UnboundLocalError).
    """
    sourceType = self.source_type
    isFasta = "fasta" in sourceType
    # Fail fast, before any file or cluster work, when no loader applies.
    if not isFasta and "genbank" not in sourceType:
        raise ValueError(
            'Unsupported source type: {!r} '
            '(expected "fasta" or "genbank")'.format(sourceType))

    # recover the species name for using in temp files
    self.species = Utils.getSpecies(self.source_path)
    domainFinder = DomainFinder.DomainFinder()

    # load source sequences; both parsers hand back a pair and only the
    # second element is fed to Spark
    if isFasta:
        _, fileContent = Parsers.parseFastaToList(self.source_path, "")
    else:
        # NOTE(review): assumes the second value returned by
        # genBankToAminoacid has the same per-record shape as the FASTA
        # parser's second value - confirm against Parsers.
        _, fileContent = Parsers.genBankToAminoacid(self.source_path)

    print('Processing domains...')

    # create RDD with source sequences
    sourceRDD = sparkContext.parallelize(fileContent, numSlices=2000)

    if "nucleotide" in sourceType:
        # execute sixFrame translation for each sequence in RDD
        sourceRDD = sourceRDD.map(lambda x: SixFrameTranslator.main(x))

    # execute Pfam domain prediction for each entry in RDD, then
    # post-process each prediction
    domainsRDD = sourceRDD.map(lambda x: domainFinder.main(x[0], x[1]))
    processedRDD = domainsRDD.map(
        lambda x: self.processDomainOutput(x[0], x[1]))

    # recover Pfam domain prediction results from RDD
    result = processedRDD.collectAsMap()

    print('Done!')
    return result
def createNegShuffle(self, posPerc):
    """Write shuffled negative sequences derived from the positive files.

    The files listed by ``Utils.listFilesExt(self.source_path, self.ext)``
    are the positives. Enough shuffled copies are generated that positives
    make up ``posPerc`` percent of the combined corpus. Each copy is written
    to ``self.result_path`` as ``<basename>_<i>.shuffled.negative.fasta``
    with a ``>`` header line holding the source file name.

    The copies are spread over the files by cumulative integer
    apportionment, so the number written always equals the announced
    "Negative instances" figure.

    Args:
        posPerc: percentage (0 < posPerc <= 100) of the final corpus that
            the positive files should represent.

    Raises:
        ValueError: if ``posPerc`` is outside (0, 100], or if a sequence
            must be loaded and ``self.seqType`` contains neither "amino"
            nor "nuc".
    """
    # Guard first: posPerc == 0 would otherwise divide by zero below.
    if not 0 < posPerc <= 100:
        raise ValueError(
            'posPerc must be in the range (0, 100], got ' + str(posPerc))

    files = Utils.listFilesExt(self.source_path, self.ext)
    negPerc = 100 - posPerc
    positives = len(files)
    negativeSize = int((negPerc * positives) / posPerc)

    print('Negative percentage: ' + str(negPerc) + '% \n' +
          'Negative instances: ' + str(negativeSize) + '\n' +
          'Positive percentage: ' + str(posPerc) + '% \n' +
          'Positive instances: ' + str(positives) + '\n' +
          'Total corpus size: ' + str(negativeSize + positives))

    count = 0
    print('Generating...')
    for index, path in enumerate(files):
        # Share of negativeSize owed to this file: difference of the
        # running integer quotas, so fractional parts carry over exactly
        # and the grand total is negativeSize. (An empty file list never
        # reaches this line, so positives is non-zero here.)
        copies = (((index + 1) * negativeSize) // positives
                  - (index * negativeSize) // positives)
        if copies <= 0:
            continue

        name = os.path.basename(path)
        prefix = name.split('.')[0]

        # Parse the source once per file instead of once per copy.
        # "amino" is checked first because it took precedence originally.
        if 'amino' in self.seqType:
            _, sequence = Parsers.genBankToAminoacid(path)
        elif 'nuc' in self.seqType:
            sequence = Parsers.genBankToNucleotide(path)
        else:
            raise ValueError(
                'Unsupported seqType: {!r} '
                '(expected "nuc" or "amino")'.format(self.seqType))

        for i in range(copies):
            # second argument presumably the character n-gram size used
            # by the shuffle - confirm against Utils.charGramShuffle
            shuffled = Utils.charGramShuffle(sequence, 2)
            resultFile = prefix + '_' + str(i) + '.shuffled.negative.fasta'
            Utils.writeFile(self.result_path + resultFile,
                            '>' + name + '\n' + shuffled)
            count += 1

    print('Total generated: ' + str(count) + '. Done!')