def __init__(self):
    """Set up evaluation state: config values, Spark-backed helpers and gold-standard data."""
    self.config = Utils.loadConfig()
    self.task = self.config.get('eval', 'task')
    self.gold = self.config.get('eval', 'goldID.path')
    self.result = self.config.get('eval', 'result.path')
    self.threshold = float(self.config.get('eval', 'threshold'))
    # Spark context is shared with the Filter helper below
    self.sparkContext = SparkContext(conf=Utils.getSparkConf('filter'))
    self.Similarity = Similarity.Similarity(self.config)
    self.Filter = Filter.Filter(self.config, sparkContext=self.sparkContext)
    self.Merger = Merger.Merger(self.config)
    # drop the header line of the gold-ID file
    self.goldIDs = Utils.readFileLines(self.gold)[1:]
    self.resultFiles = Utils.listFilesExt(self.result, 'IDs.test')
    # total nb of gold genes
    self.nbGoldGenes = len(self.goldIDs)
    # total nb of gold clusters
    self.foldedGold = Utils.foldClusterData(self.goldIDs, 'gold', 0)
    # flatten the per-cluster gene lists into a single list of genes
    self.goldGenes = [g for cluster_genes in self.foldedGold.values() for g in cluster_genes]
    self.nbGoldClusters = len(self.foldedGold)
    self.outputheader = 'goldClusterID\tgoldGeneID\tpredictedClusterLabel\tpredictedClusterID\n'
    self.scoreheader = 'goldClusterID\tpredictedClusterID\tclusterScore\n'
def __init__(self):
    """Configure the Q-learning predictor: paths, hyperparameters and output file names."""
    self.config = Utils.loadConfig()
    self.path = self.config.get('prediction', 'source.path')
    self.path = Utils.normalizePath(self.path)
    self.trainPath = self.path + 'train/'
    self.testPath = self.path + 'test/'
    self.outputPath = self.path + 'metricsQLearner/models/'
    self.geneMapPath = self.config.get('eval', 'filter.map')
    self.geneMap = {}
    self.extractor = Extractor.Extractor(self.config, self.outputPath)
    self.rewardType = 'occ'
    self.rewardPath = self.outputPath + self.rewardType + 'PerDomains.feat'
    # pfam domain list (populated later)
    self.rewardList, self.rewardIDs, self.rewardLabels = '', '', ''
    self.actions = ['keep', 'skip']
    self.task = 'train'
    self.rewardTable, self.QTable = [], []
    self.episodes = int(self.config.get('prediction', 'episodes'))
    # hyperparams
    self.alpha = float(self.config.get('prediction', 'alpha'))  # learning rate
    self.gamma = float(self.config.get('prediction', 'gamma'))  # discount factor
    self.epsilon = float(self.config.get('prediction', 'epsilon'))  # exploration
    # negative rewards mean penalty
    self.penaltyThreshold = float(self.config.get('prediction', 'penalty.threshold'))
    # keep reward ratio wrt skip reward for domain to be kept
    self.keepskipThreshold = float(self.config.get('prediction', 'keepskip.threshold'))
    self.useSimilarityWeight = False
    self.useCompWeights = False
    self.useNeighborWeight = self.config.getboolean('prediction', 'neighbor.weight')
    self.useDryIslands = self.config.getboolean('prediction', 'dry.islands')
    self.useAvAction = self.config.getboolean('prediction', 'average.action')
    self.weightsPath = self.config.get('eval', 'weights')
    self.weights = Utils.readFileLines(self.weightsPath) if self.useCompWeights or self.useNeighborWeight else ''
    # encode the run configuration into the model/table file names
    self.params = (self.rewardType + '_keepgt' + str(self.keepskipThreshold) + 'skip'
                   + '_ep' + str(self.episodes) + '_alpha' + str(self.alpha)
                   + '_gamma' + str(self.gamma) + '_eps' + str(self.epsilon))
    # BUG FIX: the '_neighbor' suffix was previously gated on useCompWeights
    # (hard-coded False above), so neighbor-weighted runs produced the same
    # file names as unweighted ones; gate it on the neighbor.weight flag.
    self.params += '_neighbor' if self.useNeighborWeight else ''
    self.QTablePath = self.outputPath + 'Qtable_' + self.params + '.npy'
    self.rewardTablePath = self.outputPath + 'Rewards_' + self.params + '.npy'
    self.IDmapPath = self.outputPath + 'RewardIDsmap_' + self.params + '.map'
def __init__(self, blastTask):
    """Prepare BLAST configuration for the given task.

    :param blastTask: name of the blast task; if it contains 'goterm'
        (case-insensitive) the GO-term mapping file is loaded up front.
    """
    self.config = Utils.loadConfig()
    self.sourceType = self.config.get('dataPipeline', 'source.type')
    self.blastTask = blastTask
    self.blastdb = self.config.get('blaster', 'blastdb.path')
    self.blastdb = Utils.normalizePath(self.blastdb)
    self.blastdbName = self.config.get('blaster', 'blastdb.name')
    # make sure the database file name carries the .fasta extension
    if not self.blastdbName.endswith('fasta'):
        self.blastdbName += '.fasta'
    # idiom fix: direct boolean expression instead of `True if ... else False`
    self.goTerms = 'goterm' in blastTask.lower()
    # the mapping file lives next to the db: same name, .tab extension
    self.mappingFile = self.blastdb + self.blastdbName.replace('.fasta', '.tab')
    self.mapping = ''
    if self.goTerms:
        self.mapping = self.loadMapping()
def __init__(self):
    """Read corpus-preparation settings from the [corpusPrep] config section."""
    self.config = Utils.loadConfig()
    self.corpus_path = self.config.get('corpusPrep', 'corpus.home')
    self.source_path = self.config.get('corpusPrep', 'source.path')
    self.result_path = self.config.get('corpusPrep', 'result.path')
    self.task = self.config.get('corpusPrep', 'task')
    self.ext = self.config.get('corpusPrep', 'source.ext')
    self.seqType = self.config.get('corpusPrep', 'source.type')
    self.posPath = self.config.get('corpusPrep', 'pos.path')
    self.negPath = self.config.get('corpusPrep', 'neg.path')
    # NOTE(review): kept as a string (plain .get) as in the original —
    # confirm whether callers expect an int/float here
    self.validPerc = self.config.get('corpusPrep', 'valid.perc')
    self.posPerc = self.config.getint('corpusPrep', 'pos.perc')
    self.targetIds = self.config.get('corpusPrep', 'pos.ids')
    self.filterIds = self.config.get('corpusPrep', 'pos.filters')
    self.length = self.config.getint('corpusPrep', 'cluster.length')
    self.windowOverlap = self.config.getint('corpusPrep', 'window.overlap')
    # same config key as self.length; read once and alias instead of a
    # second getint call, so the two attributes cannot drift apart
    self.clusterLen = self.length
def __init__(self):
    """Set up classifier training/evaluation: paths, feature extraction and output file names."""
    # read application configuration props
    self.config = Utils.loadConfig()
    self.path = self.config.get('prediction', 'source.path')
    self.path = Utils.normalizePath(self.path)
    self.trainPath = self.path + 'train/'
    self.validPath = self.path + 'validation/'
    self.gridCVPath = self.path + 'train_validation/'
    self.testPath = self.path + 'test/'
    self.outputPath = self.path + 'metrics/cv_gridsearchparams/'
    self.task = self.config.get('prediction', 'task')
    self.posPerc = int(self.config.get('prediction', 'pos.perc'))
    self.classif = self.config.get('prediction', 'classifier')
    os.makedirs(os.path.dirname(self.outputPath), exist_ok=True)
    self.extractor = Extractor.Extractor(self.config, self.outputPath)
    self.loader = Loader.Loader(self.config, self.outputPath)
    self.dimHandler = DimensionHandler.DimensionHandler(self.config, self.outputPath)
    self.outFile = ''
    self.useEmbeddings = self.config.getboolean('prediction', 'use.embeddings')
    self.cv = self.config.getboolean('prediction', 'use.crossvalid')
    # cross-validation tasks always imply CV regardless of the config flag
    if 'cross' in self.task:
        self.cv = True
    # idiom fix: `'x' not in y` instead of `not 'x' in y`
    if 'none' not in self.dimHandler.name.lower():
        # a dimensionality-reduction handler is configured: it names the output
        self.outFile = self.dimHandler.getOutFile(self.classif)
        if self.useEmbeddings:
            self.outFile += '_embeddings'
    else:
        self.outFile = self.outputPath + self.classif + '_' + self.extractor.featType
        if 'kmers' in self.extractor.featType:
            # encode k-mer size and minimum occurrence in the file name
            kmerfeats = 'kmers' + str(self.extractor.size) + '_minOcc' + str(self.extractor.minOcc)
            self.outFile = self.outFile.replace('kmers', kmerfeats)
    # grid search uses the complete (train+validation) feature file.
    # Simplified from the original `if cross or grid or cv: ... if grid else
    # no-op` — the replacement only ever fired when 'grid' was in the task.
    if 'grid' in self.task:
        self.extractor.featFile = self.extractor.featFile.replace('.feat', '.complete.feat')
    if 'cross' in self.task or self.cv:
        self.outFile += '_cv05'
    self.modelFile = self.outFile + '.model.pkl'
    self.classifier = self.setUpClassifier()
def __init__(self, source_type=None, source_path=None, result_path=None):
    """Set up the data pipeline; explicit keyword arguments override config entries."""
    self.config = Utils.loadConfig()
    self.task = self.config.get('dataPipeline', 'task')
    # fall back to the config file for any argument the caller omitted
    if source_path is None:
        source_path = self.config.get('dataPipeline', 'source.path')
    if source_type is None:
        source_type = self.config.get('dataPipeline', 'source.type')
    if result_path is None:
        result_path = self.config.get('dataPipeline', 'result.path')
    self.source_path = source_path
    self.source_type = source_type
    self.result_path = Utils.normalizePath(result_path)
    # create if it doesnt exist
    os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
    # recover the species name for using in temp files
    self.species = Utils.getSpecies(self.source_path)
    # temp dir + file used by sub-pipelines
    self.path = os.path.dirname(os.path.realpath(__file__)) + '/temp/'
    os.makedirs(os.path.dirname(self.path), exist_ok=True)
def genBankToFasta():
    """Convert the GenBank records under the configured source dir to one FASTA file.

    Reads 'parsers/source.path' from the config, translates the records to
    amino-acid sequences via genBankToAminoacid, and writes them all to
    '<source>_fasta/fungi_complete.fasta'.
    """
    config = Utils.loadConfig()
    source = config.get('parsers', 'source.path')
    # Normalize: strip any trailing slash, derive the output dir from the bare
    # name, then re-append the slash for reading.
    # BUG FIX: the original branch did `source[len(source) - 1] = ''`, which
    # raises TypeError because Python strings are immutable.
    if str(source).endswith('/'):
        source = source[:-1]
    output = source + '_fasta/'
    source += '/'
    os.makedirs(os.path.dirname(output), exist_ok=True)
    # renamed from `list` to avoid shadowing the builtin
    sequences = genBankToAminoacid(source)
    # join instead of quadratic `+=` concatenation in a loop
    content = ''.join(item + '\n' for item in sequences)
    Utils.writeFile(output + 'fungi_complete' + '.fasta', content)
def __init__(self):
    """Load the application configuration."""
    self.config = Utils.loadConfig()