Example #1
    def summarize(self):
        metricFiles = Utils.listFilesExt(self.result, 'metrics')
        metricFiles = sorted(metricFiles)
        output, pos = "", ""
        outputFile = Utils.normalizePath(self.result) + "results.summary"
        if ("pos" in self.result):
            pos = self.result.split("pos")[1][0:2]

        for file in metricFiles:
            metrics = Utils.readFileLines(file)[2].replace("pos\t", "")
            filename = os.path.basename(file)
            classifier = filename.split("_")[0]
            feats = filename.split("_")[1] + "+" + filename.split("_")[2]
            seqLen = filename.split("len")[1].split("_")[0]  # avoid shadowing the len() builtin
            overlap = filename.split("overlap")[1].split("_")[0][0:2]

            evaltype = filename.split("IDs.test.")[1].replace("eval.metrics", "").replace(".", "")
            if (not evaltype):
                evaltype = "succ0"
            if ("similar" in evaltype):
                evaltype = evaltype.replace("similar", "sim")
            if ("merge" in evaltype):
                evaltype = evaltype.replace("succ", "")

            line = "\t".join([feats, classifier, pos, seqLen, overlap, evaltype, metrics]) + "\n"
            output += line
        Utils.writeFile(outputFile, output)
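
These examples lean on a project-specific Utils module that is not shown. A minimal sketch of the four helpers called here, with behavior inferred from the call sites (the real module may differ):

import os

def listFilesExt(path, ext):
    # walk 'path' and collect files whose name ends with 'ext'
    return [os.path.join(root, f)
            for root, _, files in os.walk(path)
            for f in files if f.endswith(ext)]

def normalizePath(path):
    # guarantee a trailing slash so 'path + name' concatenation works
    return path if path.endswith('/') else path + '/'

def readFileLines(path):
    with open(path) as handle:
        return handle.read().splitlines()

def writeFile(path, content):
    with open(path, 'w') as handle:
        handle.write(content)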
Example #2
    def __init__(self, config, outputPath):
        self.sourcePath = config.get('prediction', 'source.path')
        self.sourcePath = Utils.normalizePath(self.sourcePath)
        self.trainPath = self.sourcePath + 'train/'
        self.outputPath = self.sourcePath + 'metricsDL/'
        self.sourceType = config.get('prediction', 'source.type')
        self.useEmbeddings = config.getboolean('prediction', 'use.embeddings')
        self.embedPath = config.get('prediction', 'embed.path')
        self.embedPath = Utils.normalizePath(self.embedPath)

        if (self.useEmbeddings):
            self.featType = config.get('prediction', 'feat.type')
            self.featSize = config.get('prediction', 'feat.size')
            self.minOcc = config.get('prediction', 'feat.minOcc')
            self.embedSize = config.getint('prediction', 'embeddings.length')
            self.embeddingsName = (self.featType + self.featSize + 'minOcc' + str(self.minOcc)
                                   + str(self.embedSize) + 'd')

        self.dictionary = dict()
        self.extractor = Extractor.Extractor(config, outputPath)
        self.featType = config.get('prediction', 'feat.type')
        self.maxLength = 0
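
One detail worth isolating: bool() on a config string is truthy for any non-empty value, including 'False', which is why the constructor reads the flag with config.getboolean. A quick configparser demonstration:

import configparser

config = configparser.ConfigParser()
config.read_string('[prediction]\nuse.embeddings = False\n')

print(bool(config.get('prediction', 'use.embeddings')))   # True: non-empty string
print(config.getboolean('prediction', 'use.embeddings'))  # False: parsed as a boolean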
Example #3
 def __init__(self):
     self.config = Utils.loadConfig()
     self.path = self.config.get('prediction', 'source.path')
     self.path = Utils.normalizePath(self.path)
     self.trainPath = self.path + 'train/'
     self.testPath = self.path + 'test/'
     self.outputPath = self.path + 'metricsQLearner/models/'
     self.geneMapPath = self.config.get('eval', 'filter.map')
     self.geneMap = {}
     self.extractor = Extractor.Extractor(self.config, self.outputPath)
     self.rewardType = 'occ'
     self.rewardPath = self.outputPath + self.rewardType + 'PerDomains.feat'  # pfam domain list
     self.rewardList, self.rewardIDs, self.rewardLabels = '', '', ''
     self.actions = ['keep', 'skip']
     self.task = 'train'
     self.rewardTable, self.QTable = [], []
     self.episodes = int(self.config.get('prediction', 'episodes'))
     # hyperparams
     self.alpha = float(self.config.get('prediction', 'alpha'))  # learning rate
     self.gamma = float(self.config.get('prediction', 'gamma'))  # discount factor
     self.epsilon = float(self.config.get('prediction', 'epsilon'))  # exploration
     self.penaltyThreshold = float(self.config.get('prediction', 'penalty.threshold'))  # negative rewards mean penalty
     self.keepskipThreshold = float(self.config.get('prediction', 'keepskip.threshold'))  # keep/skip reward ratio for a domain to be kept
     self.useSimilarityWeight = False
     self.useCompWeights = False
     self.useNeighborWeight = self.config.getboolean('prediction', 'neighbor.weight')
     self.useDryIslands = self.config.getboolean('prediction', 'dry.islands')
     self.useAvAction = self.config.getboolean('prediction', 'average.action')
     self.weightsPath = self.config.get('eval', 'weights')
     self.weights = Utils.readFileLines(self.weightsPath) if self.useCompWeights or self.useNeighborWeight else ''
     self.params = (self.rewardType + '_keepgt' + str(self.keepskipThreshold) + 'skip'
                    + '_ep' + str(self.episodes) + '_alpha' + str(self.alpha)
                    + '_gamma' + str(self.gamma) + '_eps' + str(self.epsilon))
     self.params += '_neighbor' if self.useNeighborWeight else ''  # suffix tracks the neighbor-weight option
     self.QTablePath = self.outputPath + 'Qtable_' + self.params + '.npy'
     self.rewardTablePath = self.outputPath + 'Rewards_' + self.params + '.npy'
     self.IDmapPath = self.outputPath + 'RewardIDsmap_' + self.params + '.map'
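
For context on what alpha, gamma, and epsilon control, here is a minimal tabular Q-learning step with epsilon-greedy selection. This is a generic sketch of the standard algorithm over made-up states, not the project's actual keep/skip training loop:

import numpy as np

alpha, gamma, epsilon = 0.1, 0.9, 0.2
nStates, nActions = 10, 2            # actions: 0 = keep, 1 = skip
QTable = np.zeros((nStates, nActions))

def chooseAction(state):
    # explore with probability epsilon, otherwise exploit the best known action
    if np.random.rand() < epsilon:
        return np.random.randint(nActions)
    return int(np.argmax(QTable[state]))

def update(state, action, reward, nextState):
    # standard Q-learning: move Q(s,a) toward reward + gamma * max_a' Q(s',a')
    best = np.max(QTable[nextState])
    QTable[state, action] += alpha * (reward + gamma * best - QTable[state, action])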
Example #4
    def __init__(self, blastTask):
        self.config = Utils.loadConfig()
        self.sourceType = self.config.get('dataPipeline', 'source.type')
        self.blastTask = blastTask
        self.blastdb = self.config.get('blaster', 'blastdb.path')
        self.blastdb = Utils.normalizePath(self.blastdb)
        self.blastdbName = self.config.get('blaster', 'blastdb.name')
        if (not self.blastdbName.endswith('fasta')):
            self.blastdbName += '.fasta'

        self.goTerms = 'goterm' in blastTask.lower()
        self.mappingFile = self.blastdb + self.blastdbName.replace('.fasta', '.tab')
        self.mapping = ''
        if (self.goTerms):
            self.mapping = self.loadMapping()
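
loadMapping is not shown in this example. Since mappingFile is a .tab file, a plausible standalone sketch reads tab-separated ID-to-annotation pairs into a dict; the column layout is an assumption, and the real method takes no arguments and reads self.mappingFile:

def loadMapping(mappingFile):
    # hypothetical layout: one 'id<TAB>annotation' pair per line
    mapping = {}
    with open(mappingFile) as handle:
        for line in handle:
            fields = line.rstrip('\n').split('\t')
            if len(fields) >= 2:
                mapping[fields[0]] = fields[1]
    return mapping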
Example #5
    def __init__(self):
        # read application configuration props
        self.config = Utils.loadConfig()
        self.path = self.config.get('prediction', 'source.path')
        self.path = Utils.normalizePath(self.path)
        self.trainPath = self.path + 'train/'
        self.validPath = self.path + 'validation/'
        self.gridCVPath = self.path + 'train_validation/'
        self.testPath = self.path + 'test/'
        self.outputPath = self.path + 'metrics/cv_gridsearchparams/'
        self.task = self.config.get('prediction', 'task')
        self.posPerc = int(self.config.get('prediction', 'pos.perc'))
        self.classif = self.config.get('prediction', 'classifier')
        os.makedirs(os.path.dirname(self.outputPath), exist_ok=True)
        self.extractor = Extractor.Extractor(self.config, self.outputPath)
        self.loader = Loader.Loader(self.config, self.outputPath)
        self.dimHandler = DimensionHandler.DimensionHandler(
            self.config, self.outputPath)
        self.outFile = ''
        self.useEmbeddings = self.config.getboolean('prediction',
                                                    'use.embeddings')
        self.cv = self.config.getboolean('prediction', 'use.crossvalid')
        if ('cross' in self.task):
            self.cv = True
        if ('none' not in self.dimHandler.name.lower()):
            self.outFile = self.dimHandler.getOutFile(self.classif)
            self.outFile = self.outFile + '_embeddings' if self.useEmbeddings else self.outFile
        else:
            self.outFile = self.outputPath + self.classif + '_' + self.extractor.featType
            if ('kmers' in self.extractor.featType):
                kmerfeats = 'kmers' + str(self.extractor.size) + '_minOcc' + str(self.extractor.minOcc)
                self.outFile = self.outFile.replace('kmers', kmerfeats)
        if ('cross' in self.task or 'grid' in self.task or self.cv):
            if ('grid' in self.task):
                self.extractor.featFile = self.extractor.featFile.replace('.feat', '.complete.feat')
            if ('cross' in self.task or self.cv):
                self.outFile += '_cv05'

        self.modelFile = self.outFile + '.model.pkl'
        self.classifier = self.setUpClassifier()
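
An illustrative [prediction] section with the keys this constructor reads (values are made up), exercised through configparser:

import configparser

iniText = """
[prediction]
source.path = /data/corpus
source.type = fasta
task = crossvalid
pos.perc = 50
classifier = svm
use.embeddings = True
use.crossvalid = True
"""
config = configparser.ConfigParser()
config.read_string(iniText)
print(config.getboolean('prediction', 'use.embeddings'))  # True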
Example #6
 def __init__(self, config, outputPath):
     self.config = config
     self.dictPath = config.get('prediction', 'dict.path')
     self.featType = config.get('prediction', 'feat.type')
     self.nbFeatType = self.featType.count('-') + 1
     self.sourceType = config.get('prediction', 'source.type')
     self.size = config.get('prediction', 'feat.size')
     self.minOcc = config.get('prediction', 'feat.minOcc')
     outputPath = Utils.normalizePath(outputPath)
     self.featFile = outputPath + self.featType
     self.cv = self.config.getboolean('prediction', 'use.crossvalid')
     self.task = self.config.get('prediction', 'task')
     if ('cross' in self.task):
         self.cv = True
     if self.cv:
         self.featFile += '.cv'
     if ('kmers' in self.featType):
         kmerfeats = 'kmers' + str(self.size) + '_minOcc' + str(self.minOcc)
         self.featFile = self.featFile.replace('kmers', kmerfeats)
     self.featFile += '.feat'
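
A worked trace of the feature-file naming, assuming featType 'kmers', size 3, minOcc 2, cross-validation on, and an output path of '/out/' (all values illustrative):

featType, size, minOcc, cv = 'kmers', 3, 2, True
featFile = '/out/' + featType
if cv:
    featFile += '.cv'
if 'kmers' in featType:
    featFile = featFile.replace('kmers', 'kmers' + str(size) + '_minOcc' + str(minOcc))
featFile += '.feat'
print(featFile)  # /out/kmers3_minOcc2.cv.feat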
Example #7
 def __init__(self, source_type=None, source_path=None, result_path=None):
     self.config = Utils.loadConfig()
     self.task = self.config.get('dataPipeline', 'task')
     self.source_path = self.config.get(
         'dataPipeline',
         'source.path') if source_path is None else source_path
     self.source_type = self.config.get(
         'dataPipeline',
         'source.type') if source_type is None else source_type
     self.result_path = self.config.get(
         'dataPipeline',
         'result.path') if result_path is None else result_path
     self.result_path = Utils.normalizePath(self.result_path)
     # create it if it doesn't exist
     os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
     # recover the species name for use in temp files
     self.species = Utils.getSpecies(self.source_path)
     # temp dir + file used by sub-pipelines
     self.path = os.path.dirname(os.path.realpath(__file__))
     self.path += '/temp/'
     os.makedirs(os.path.dirname(self.path), exist_ok=True)
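
The os.makedirs(os.path.dirname(...), exist_ok=True) idiom works here only because the paths carry a trailing slash (via normalizePath and the explicit '/temp/'); otherwise dirname would strip the last component and create the parent instead. A quick check:

import os

print(os.path.dirname('/data/results/'))  # '/data/results' -> the directory itself
print(os.path.dirname('/data/results'))   # '/data'         -> one level up
# hence makedirs(dirname('/data/results/'), exist_ok=True) creates /data/results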
Example #8
    def createGoDataset(self):
        source_type = self.config.get('dataPipeline', 'source.type')
        blastPath = self.config.get('blaster', 'blastdb.path')
        blastPath = Utils.normalizePath(blastPath)
        blastName = self.config.get('blaster', 'blastdb.name')
        blastMapping = blastPath + blastName + '.tab'

        datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                             source_path=self.source_path,
                                             result_path=self.result_path)
        _, file_content = Parsers.parseFastaToList(self.source_path, "")  # first value unused; avoid shadowing list()
        file_content = [
            content for content in file_content if not os.path.isfile(
                self.result_path +
                os.path.basename(content[0]).replace('.fasta', '.go'))
        ]

        sparkContext = SparkContext(conf=datapipe.initSpark("goDataset"))
        goterms = datapipe.getBLAST(file_content,
                                    sparkContext,
                                    blastTask="goTerms")

        count = 0
        notFound = 0
        for file, content in goterms.items():
            lines = content.split('\n')
            if (len(lines) == 2 and not lines[1]):  # header line only: no GO terms found
                notFound += 1
            else:
                filename = os.path.basename(file)
                resultFile = self.result_path + filename
                resultFile = resultFile.replace('.fasta', '.go')
                Utils.writeFile(resultFile, content)
                count += 1

        print('Done generating', count, 'GO term files.\nNo GO terms found for',
              notFound, 'files.')
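
The not-found test depends on the shape of each result string: two lines where the second is empty means the query produced no GO terms. An illustration on made-up content (the exact BLAST output format is an assumption):

noHits = 'query1\n'             # splits into ['query1', '']
withHits = 'query1\nGO:0008152'

for content in (noHits, withHits):
    lines = content.split('\n')
    found = not (len(lines) == 2 and not lines[1])
    print(found)  # False, then True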
Example #9
    def main(self):
        if ('shuffle' not in self.task and 'selectvalid' not in self.task):
            self.result_path = Utils.normalizePath(self.result_path)
            os.makedirs(os.path.dirname(self.result_path), exist_ok=True)

        if ('split' in self.task):
            self.splitAsClusters()

        if ('shuffle' in self.task):
            posPerc = self.config.get('corpusPrep', 'pos.perc')
            posPerc = int(posPerc) if float(posPerc).is_integer() else float(posPerc)

            if (self.result_path.endswith('/')):
                self.result_path = self.result_path[:-1]
            self.result_path = self.result_path + '_pos' + str(posPerc) + '/'

            if (not os.path.isdir(self.result_path)):
                os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
                self.createNegShuffle(posPerc)
            else:
                print('Result path already exists.')

        if ('createdataset' in self.task):
            self.createDataset()

        if ('domain' in self.task):
            self.createDomainDataset()

        if ('goterms' in self.task):
            self.createGoDataset()

        if ('similarity' in self.task):
            self.createSimilarityMatrix()

        if ('pfamtsv' in self.task):
            self.createPfamTsv()
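
The pos.perc coercion keeps integral percentages as int so the '_pos' directory suffix reads '_pos10' rather than '_pos10.0', while still allowing fractional values. For example:

for raw in ('10', '12.5'):
    posPerc = int(raw) if float(raw).is_integer() else float(raw)
    print('_pos' + str(posPerc))  # _pos10, then _pos12.5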
Example #10
    def splitAsClusters(self):
        self.source_path = Utils.normalizePath(self.source_path)
        slimIDs = self.config.getboolean('corpusPrep', 'slim.id')
        files = Utils.listFilesExt(self.source_path, self.ext)

        overlap = int((self.windowOverlap / 100) * self.length)

        for file in files:
            fileName = os.path.basename(file).split('.')[0]

            # build a per-file output name; do not mutate result_path across iterations
            resultFile = (self.result_path + fileName + '_len' + str(self.length)
                          + '_overlap' + str(self.windowOverlap))
            if (slimIDs):
                resultFile += '_slimIDs'
            resultFile += '.fasta'

            if (os.path.isfile(resultFile)):
                print('File already exists: ' + resultFile + '.\nDone.')

            else:
                file = Parsers.sortFasta(file)
                sequences = Parsers.parseFasta(file)
                result = []  # chunks collected for this file only
                content, ids, entry, overlapIds = '', '', '', ''
                for fasta in sequences:
                    content += str(fasta.seq.upper())
                    ids += str(fasta.id) if not ids else '|' + str(fasta.id)

                    if (slimIDs):
                        allIds = ids.split('|')
                        ids = allIds[0] + '|to|' + allIds[-1]

                    while (len(content) > 0):
                        varSize = self.length - (len(entry))
                        if (varSize <= overlap):
                            overlapIds += str(fasta.id) if not overlapIds else '|' + str(fasta.id)
                        entry += content[0:varSize]

                        if (len(entry) == self.length):
                            # move cursor on real sequence according to variable length added
                            content = content[varSize:]
                            # add chunk to list
                            if (slimIDs):
                                allIds = ids.split("|")
                                ids = allIds[0] + '|to|' + allIds[-1]
                            result.append('>' + ids + '\n' + entry)
                            # make sure that entry contains overlap
                            entry = entry[len(entry) - overlap:]

                            if (len(content) > 0):
                                ids = overlapIds
                                overlapIds = ''
                            else:
                                ids = ''

                        elif (len(content) > 0 and len(entry) < self.length):
                            content = content[len(entry):]

                Utils.writeFile(resultFile, '\n'.join(result))

        print('Done.')
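
The windowing above is easier to see on a plain string. A minimal standalone sketch of fixed-length chunking with the same percentage-based overlap, minus the FASTA ID bookkeeping (assumes the sequences are already concatenated):

def splitWithOverlap(sequence, length, overlapPerc):
    # each window starts (length - overlap) characters after the previous one
    overlap = int((overlapPerc / 100) * length)
    step = length - overlap
    return [sequence[i:i + length]
            for i in range(0, max(len(sequence) - overlap, 1), step)]

print(splitWithOverlap('ABCDEFGHIJ', 4, 50))
# ['ABCD', 'CDEF', 'EFGH', 'GHIJ']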
Example #11
    def createDataset(self):
        neg_path = Utils.normalizePath(self.negPath)
        pos_path = Utils.normalizePath(self.posPath)
        negatives = Utils.listFilesExt(neg_path, self.ext)
        positives = Utils.listFilesExt(pos_path, self.ext)
        subject = Utils.normalizePath(self.result_path)

        negLen = len(negatives)
        posLen = len(positives)
        negPerc = 100 - self.posPerc
        negTotal = (posLen * negPerc) / self.posPerc

        if (negLen < negTotal):
            print("Not enough negative instances. Try another %")
            exit()
        else:
            if (not negatives or not positives):
                print('List of files was empty. '
                      'Please check \'neg.path\' and \'pos.path\' in the config file.')
                exit()

            subject += 'pos' + str(self.posPerc)

            if (len(subject) > 1):
                os.makedirs(subject, exist_ok=True)
                destTrain = subject + '/train/'
                destValid = subject + '/validation/'

                if (os.path.exists(destTrain) or os.path.exists(destValid)):
                    print('Dataset already split for train and validation. '
                          '\nRename ' + destTrain + ' or ' + destValid +
                          ' and try again.')
                else:
                    os.makedirs(destTrain, exist_ok=False)
                    os.makedirs(destValid, exist_ok=False)

                    perc = int(self.validPerc) / 100

                    readme = 'Source negative: ' + neg_path + \
                    '\nSource positive: ' + pos_path + \
                    '\n# negative files: ' + str(negLen) + \
                    '\n(final) # negative files: ' + str(negTotal) + \
                    '\n# positive files: ' + str(posLen) + \
                    '\nValidation data percentage (from total): ' + str(self.validPerc) + '%'

                    Utils.writeFile(subject + '/README.md', readme)

                    # select validation files
                    validNegatives = random.sample(negatives,
                                                   int(perc * negTotal))
                    validPositives = random.sample(positives,
                                                   int(perc * posLen))

                    # remove validation files from list
                    negatives = [
                        f for f in negatives if f not in validNegatives
                    ]
                    positives = [
                        f for f in positives if f not in validPositives
                    ]

                    # select randomly corresponding nb of negatives
                    negatives = random.sample(
                        negatives, int(negTotal - len(validNegatives)))

                    train = negatives + positives
                    validation = validPositives + validNegatives

                    for f in validation:
                        name = os.path.basename(f)
                        copy(f, destValid + name)

                    for f in train:
                        name = os.path.basename(f)
                        copy(f, destTrain + name)

                    print('Done splitting randomly ' + str(len(train)) +
                          ' train and ' + str(len(validation)) + ' files.')
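
A worked example of the split arithmetic above, assuming pos.perc = 10, 100 positive files, and a 20% validation share (all numbers illustrative):

posPerc, posLen, validPerc = 10, 100, 20
negPerc = 100 - posPerc                  # 90
negTotal = (posLen * negPerc) / posPerc  # 900.0 negatives to use
perc = validPerc / 100                   # 0.2

validNeg = int(perc * negTotal)          # 180 validation negatives
validPos = int(perc * posLen)            # 20 validation positives
trainNeg = int(negTotal - validNeg)      # 720 training negatives
trainPos = posLen - validPos             # 80 training positives
print(validNeg, validPos, trainNeg, trainPos)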