Пример #1
0
    def __init__(self):
        """Prepare an evaluation run from the 'eval' section of the config.

        Reads the task name, the gold-ID and result paths and the score
        threshold, creates the Spark context together with the Similarity,
        Filter and Merger helpers, then loads the gold IDs and the list of
        result files and derives the gold gene / cluster counts.
        """
        section = 'eval'
        cfg = Utils.loadConfig()
        self.config = cfg
        self.task = cfg.get(section, 'task')
        self.gold = cfg.get(section, 'goldID.path')
        self.result = cfg.get(section, 'result.path')
        self.threshold = float(cfg.get(section, 'threshold'))

        # the same Spark context is handed to the Filter helper below
        spark = SparkContext(conf=Utils.getSparkConf('filter'))
        self.sparkContext = spark

        self.Similarity = Similarity.Similarity(cfg)
        self.Filter = Filter.Filter(cfg, sparkContext=spark)
        self.Merger = Merger.Merger(cfg)

        # the first entry returned for the gold file is skipped
        # (presumably a header line -- confirm against the file format)
        goldLines = Utils.readFileLines(self.gold)
        self.goldIDs = goldLines[1:]
        self.resultFiles = Utils.listFilesExt(self.result, 'IDs.test')

        # total nb of gold genes
        self.nbGoldGenes = len(self.goldIDs)

        # gold entries grouped by Utils.foldClusterData, then flattened
        # back into a single list of genes
        folded = Utils.foldClusterData(self.goldIDs, 'gold', 0)
        self.foldedGold = folded
        flattened = []
        for genes in folded.values():
            flattened.extend(genes)
        self.goldGenes = flattened

        # total nb of gold clusters
        self.nbGoldClusters = len(folded)

        # tab-separated header lines
        self.outputheader = 'goldClusterID\tgoldGeneID\tpredictedClusterLabel\tpredictedClusterID\n'
        self.scoreheader = 'goldClusterID\tpredictedClusterID\tclusterScore\n'
Пример #2
0
 def __init__(self):
     """Configure the Q-learner from the 'prediction' and 'eval' sections.

     Sets the train / test / output paths below the normalized source
     path, creates the feature extractor, reads the learning
     hyper-parameters and weighting switches, and derives the file names
     of the Q table, reward table and reward-ID map from a tag built out
     of those parameters.
     """
     cfg = Utils.loadConfig()
     self.config = cfg

     root = Utils.normalizePath(cfg.get('prediction', 'source.path'))
     self.path = root
     self.trainPath = root + 'train/'
     self.testPath = root + 'test/'
     self.outputPath = root + 'metricsQLearner/models/'

     self.geneMapPath = cfg.get('eval', 'filter.map')
     self.geneMap = {}
     self.extractor = Extractor.Extractor(cfg, self.outputPath)

     self.rewardType = 'occ'
     # pfam domain list
     self.rewardPath = self.outputPath + self.rewardType + 'PerDomains.feat'
     self.rewardList = ''
     self.rewardIDs = ''
     self.rewardLabels = ''
     self.actions = ['keep', 'skip']
     self.task = 'train'
     self.rewardTable = []
     self.QTable = []
     self.episodes = int(cfg.get('prediction', 'episodes'))

     # hyperparams
     # learning rate
     self.alpha = float(cfg.get('prediction', 'alpha'))
     # discount factor
     self.gamma = float(cfg.get('prediction', 'gamma'))
     # exploration
     self.epsilon = float(cfg.get('prediction', 'epsilon'))
     # negative rewards mean penalty
     self.penaltyThreshold = float(
         cfg.get('prediction', 'penalty.threshold'))
     # keep reward ratio wrt skip reward for domain to be kept
     self.keepskipThreshold = float(
         cfg.get('prediction', 'keepskip.threshold'))

     # weighting switches; the first two are fixed, the rest configurable
     self.useSimilarityWeight = False
     self.useCompWeights = False
     self.useNeighborWeight = cfg.getboolean('prediction',
                                             'neighbor.weight')
     self.useDryIslands = cfg.getboolean('prediction', 'dry.islands')
     self.useAvAction = cfg.getboolean('prediction', 'average.action')

     # the weights file is only read when one of the weightings needs it
     self.weightsPath = cfg.get('eval', 'weights')
     if self.useCompWeights or self.useNeighborWeight:
         self.weights = Utils.readFileLines(self.weightsPath)
     else:
         self.weights = ''

     # parameter tag embedded in every output file name
     tagParts = [
         self.rewardType,
         '_keepgt', str(self.keepskipThreshold), 'skip',
         '_ep', str(self.episodes),
         '_alpha', str(self.alpha),
         '_gamma', str(self.gamma),
         '_eps', str(self.epsilon),
     ]
     # NOTE(review): the suffix depends on useCompWeights (fixed to False
     # above), not on useNeighborWeight -- confirm this is intended
     if self.useCompWeights:
         tagParts.append('_neighbor')
     self.params = ''.join(tagParts)

     self.QTablePath = self.outputPath + 'Qtable_' + self.params + '.npy'
     self.rewardTablePath = self.outputPath + 'Rewards_' + self.params + '.npy'
     self.IDmapPath = self.outputPath + 'RewardIDsmap_' + self.params + '.map'
Пример #3
0
    def __init__(self, blastTask):
        """Configure the blaster for the given task.

        Args:
            blastTask: name of the BLAST task; when it contains 'goterm'
                (case-insensitive) the mapping is loaded through
                ``self.loadMapping()``.
        """
        cfg = Utils.loadConfig()
        self.config = cfg
        self.sourceType = cfg.get('dataPipeline', 'source.type')
        self.blastTask = blastTask
        self.blastdb = Utils.normalizePath(cfg.get('blaster', 'blastdb.path'))

        # make sure the database name ends with 'fasta'
        dbName = cfg.get('blaster', 'blastdb.name')
        if not dbName.endswith('fasta'):
            dbName = dbName + '.fasta'
        self.blastdbName = dbName

        self.goTerms = 'goterm' in blastTask.lower()
        # mapping file: database name with '.fasta' replaced by '.tab'
        self.mappingFile = self.blastdb + dbName.replace('.fasta', '.tab')

        # default first, so the attribute exists while the mapping loads
        self.mapping = ''
        if self.goTerms:
            self.mapping = self.loadMapping()
Пример #4
0
 def __init__(self):
     """Load the corpus-preparation settings from the 'corpusPrep' section.

     Each option is stored on its own attribute: options read through
     ``getint`` are integers, all others hold whatever ``get`` returns.
     """
     section = 'corpusPrep'
     self.config = Utils.loadConfig()
     text = self.config.get
     integer = self.config.getint

     # (attribute, reader, option) -- processed in this exact order
     settings = (
         ('corpus_path', text, 'corpus.home'),
         ('source_path', text, 'source.path'),
         ('result_path', text, 'result.path'),
         ('task', text, 'task'),
         ('ext', text, 'source.ext'),
         ('seqType', text, 'source.type'),
         ('posPath', text, 'pos.path'),
         ('negPath', text, 'neg.path'),
         # validPerc keeps the raw value from get(), unlike posPerc
         ('validPerc', text, 'valid.perc'),
         ('posPerc', integer, 'pos.perc'),
         ('targetIds', text, 'pos.ids'),
         ('filterIds', text, 'pos.filters'),
         # NOTE(review): 'length' and 'clusterLen' both read
         # 'cluster.length' -- confirm the duplication is intended
         ('length', integer, 'cluster.length'),
         ('windowOverlap', integer, 'window.overlap'),
         ('clusterLen', integer, 'cluster.length'),
     )
     for attribute, reader, option in settings:
         setattr(self, attribute, reader(section, option))
Пример #5
0
    def __init__(self):
        """Set up paths, helpers and the classifier for a prediction run.

        Reads the 'prediction' section of the config, creates the output
        directory and the Extractor / Loader / DimensionHandler helpers,
        derives the output file prefix from the classifier name and the
        feature settings, and finally builds the classifier through
        ``self.setUpClassifier()``.
        """
        # read application configuration props
        cfg = Utils.loadConfig()
        self.config = cfg

        base = Utils.normalizePath(cfg.get('prediction', 'source.path'))
        self.path = base
        self.trainPath = base + 'train/'
        self.validPath = base + 'validation/'
        self.gridCVPath = base + 'train_validation/'
        self.testPath = base + 'test/'
        self.outputPath = base + 'metrics/cv_gridsearchparams/'

        self.task = cfg.get('prediction', 'task')
        self.posPerc = int(cfg.get('prediction', 'pos.perc'))
        self.classif = cfg.get('prediction', 'classifier')

        os.makedirs(os.path.dirname(self.outputPath), exist_ok=True)
        self.extractor = Extractor.Extractor(cfg, self.outputPath)
        self.loader = Loader.Loader(cfg, self.outputPath)
        self.dimHandler = DimensionHandler.DimensionHandler(
            cfg, self.outputPath)

        self.useEmbeddings = cfg.getboolean('prediction', 'use.embeddings')

        isCross = 'cross' in self.task
        isGrid = 'grid' in self.task
        # a 'cross' task forces cross-validation on, whatever the flag says
        self.cv = cfg.getboolean('prediction', 'use.crossvalid') or isCross

        if 'none' in self.dimHandler.name.lower():
            # no dimension handler: name the output after the feature type
            featType = self.extractor.featType
            outFile = self.outputPath + self.classif + '_' + featType
            if 'kmers' in featType:
                # expand 'kmers' with the k-mer size and min occurrence
                kmerfeats = ('kmers' + str(self.extractor.size)
                             + '_minOcc' + str(self.extractor.minOcc))
                outFile = outFile.replace('kmers', kmerfeats)
        else:
            outFile = self.dimHandler.getOutFile(self.classif)
            if self.useEmbeddings:
                outFile = outFile + '_embeddings'

        # grid tasks read the '.complete.feat' variant of the feature file
        if isGrid:
            self.extractor.featFile = self.extractor.featFile.replace(
                '.feat', '.complete.feat')
        if self.cv:
            outFile += '_cv05'

        self.outFile = outFile
        self.modelFile = outFile + '.model.pkl'
        self.classifier = self.setUpClassifier()
Пример #6
0
 def __init__(self, source_type=None, source_path=None, result_path=None):
     """Set up the data pipeline, preferring explicit arguments over config.

     Args:
         source_type: overrides 'source.type' of the 'dataPipeline'
             section when not None.
         source_path: overrides 'source.path' when not None.
         result_path: overrides 'result.path' when not None.

     The (normalized) result directory and a 'temp' directory next to
     this module are created when they do not exist.
     """
     cfg = Utils.loadConfig()
     self.config = cfg
     self.task = cfg.get('dataPipeline', 'task')

     # fall back to the configured value only when no argument was given
     if source_path is None:
         source_path = cfg.get('dataPipeline', 'source.path')
     self.source_path = source_path

     if source_type is None:
         source_type = cfg.get('dataPipeline', 'source.type')
     self.source_type = source_type

     if result_path is None:
         result_path = cfg.get('dataPipeline', 'result.path')
     self.result_path = Utils.normalizePath(result_path)

     # create if it doesnt exist
     os.makedirs(os.path.dirname(self.result_path), exist_ok=True)

     # recover the species name for using in temp files
     self.species = Utils.getSpecies(self.source_path)

     # temp dir + file used by sub-pipelines
     moduleDir = os.path.dirname(os.path.realpath(__file__))
     self.path = moduleDir + '/temp/'
     os.makedirs(os.path.dirname(self.path), exist_ok=True)
Пример #7
0
def genBankToFasta():
    """Convert the configured GenBank source into a single FASTA file.

    Reads 'source.path' from the 'parsers' section of the config, passes
    that path (with a trailing slash) to ``genBankToAminoacid`` and writes
    every returned entry, one per line, to
    ``<source>_fasta/fungi_complete.fasta``. The output directory is
    created when it does not exist.
    """
    config = Utils.loadConfig()
    source = config.get('parsers', 'source.path')

    # Strings are immutable, so the trailing slash cannot be dropped by
    # item assignment (the former else-branch raised TypeError for any
    # path ending in '/'). Strip it and derive both paths from the bare
    # name, which also makes the two branches identical.
    base = str(source).rstrip('/')
    output = base + '_fasta/'
    source = base + '/'

    os.makedirs(os.path.dirname(output), exist_ok=True)

    entries = genBankToAminoacid(source)
    # newline-terminate each entry; join avoids repeated string copies
    content = ''.join(entry + '\n' for entry in entries)

    Utils.writeFile(output + 'fungi_complete' + '.fasta', content)
Пример #8
0
 def __init__(self):
     self.config = Utils.loadConfig()