Example #1
import os

# Utils below is the project's own helper module (file listing / reading)
def parseDatasetContents(dataPath, featType, sourceType):
    files, result = [], []
    if ('domain' in featType or 'dictionary' in featType):
        domainFiles = Utils.listFilesExt(dataPath, 'domains')
        files += domainFiles
        if (len(domainFiles) < 1):
            print('No domains / dictionary files found in', dataPath)
            exit()

    if ('kmers' in featType or 'prot' in featType):
        fastaFiles = Utils.listFilesExt(dataPath, 'fasta')
        files += fastaFiles
        if (len(fastaFiles) < 1):
            print('No fasta files found in', dataPath)
            exit()

    if ('go' in featType):
        goTermFiles = Utils.listFilesExt(dataPath, 'go')
        files += goTermFiles
        if (len(goTermFiles) < 1):
            print('No GO term files found in', dataPath)
            exit()

    for file in files:
        ext = os.path.splitext(file)[1]
        lines = Utils.readFileLines(file)
        # strip '>' and drop trailing version numbers such as NRRL3_00129.1
        # note: replace('a', '') also removes every literal 'a' from the ID,
        # which looks dataset-specific
        id = lines[0].replace('>', '').replace('a', '').split('.')[0]
        if ('fasta' in ext):
            content = lines[1].upper()
            content = normalizeSequence(content, sourceType)
            if ('kmers' in featType):
                result.append(((file, content, id), 'kmers'))
            if ('prot' in featType):
                result.append(((file, content, id), 'protanalys'))

        elif ('domain' in ext):
            content = lines[1:]
            # keep only the Pfam accession; comment out the next line
            # to keep the full domain name
            content = [line.split('.')[0] for line in content]
            content = "\n".join(content)

            if ('pfam' in featType):
                temp = content.split('\n')
                for entry in temp:
                    result.append(((file, entry, id), 'domains'))
            else:
                if (content):
                    result.append(((file, content, id), 'domains'))

        elif ('go' in ext):
            content = lines[1:]
            content = "\n".join(content)
            result.append(((file, content, id), 'go'))

    return result
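
A minimal call sketch (hypothetical path and feature strings; Utils is the project's own helper module):

entries = parseDatasetContents('data/train', featType='kmers,domains', sourceType='genes')
for (path, content, seqId), featureKind in entries:
    print(seqId, featureKind, len(content))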
Example #2
import os

# note: 'filter' shadows the builtin; kept for signature compatibility
def parseFastaToList(path, filter):
    thislist, files, filterIDs, filename_content = [], [], [], []

    if (os.path.isfile(path)):
        files.append(path)
    else:
        files = Utils.listFilesExt(path, 'fasta')

    if (os.path.isfile(filter)):
        filterIDs = Utils.readFileLines(filter)
    else:
        filterIDs = filter.split('\n')

    for file in files:
        sequences = parseFasta(file)
        for fasta_record in sequences:
            output = '>' + str(fasta_record.id) + '\n' + str(fasta_record.seq)
            if (len(filterIDs) > 0):
                # exact membership test; the original compared against
                # str(filterIDs), allowing substring false positives
                if (str(fasta_record.id) not in filterIDs):
                    thislist.append(output)
                    filename_content.append((file, output))
            else:
                thislist.append(output)
                filename_content.append((file, output))

    return thislist, filename_content
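
A usage sketch: filter can be a file of IDs or a newline-separated string (values here are hypothetical):

records, per_file = parseFastaToList('proteins/', 'geneA\ngeneB')
# records holds '>id\nSEQ' strings for every record except geneA and geneB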
Example #3
    def summarize(self):
        metricFiles = Utils.listFilesExt(self.result, 'metrics')
        metricFiles = sorted(metricFiles)
        output, pos = "", ""
        outputFile = Utils.normalizePath(self.result) + "results.summary"
        if ("pos" in self.result):
            pos = self.result.split("pos")[1][0:2]

        for file in metricFiles:
            metrics = Utils.readFileLines(file)[2].replace("pos\t", "")
            filename = os.path.basename(file)
            classifier = filename.split("_")[0]
            feats = filename.split("_")[1] + "+" + filename.split("_")[2]
            # avoid shadowing the len() builtin
            seqLen = filename.split("len")[1].split("_")[0]
            overlap = filename.split("overlap")[1].split("_")[0][0:2]

            evaltype = filename.split("IDs.test.")[1].replace(
                "eval.metrics", "").replace(".", "")
            if (not evaltype):
                evaltype = "succ0"
            if ("similar" in evaltype):
                evaltype = evaltype.replace("similar", "sim")
            if ("merge" in evaltype):
                evaltype = evaltype.replace("succ", "")

            line = "\t".join([feats, classifier, pos, seqLen, overlap, evaltype, metrics]) + "\n"
            output += line
        Utils.writeFile(outputFile, output)
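
The filename parsing above assumes a pattern like the following (hypothetical name, traced through the splits):

# 'mlp_kmers_domains_len1024_overlap02_IDs.test.succ2.eval.metrics'
#   classifier -> 'mlp'
#   feats      -> 'kmers+domains'
#   seqLen     -> '1024'
#   overlap    -> '02'
#   evaltype   -> 'succ2'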
Example #4
    def createSimilarityMatrix(self):
        source_type = self.config.get('dataPipeline', 'source.type')
        fastaFiles = Utils.listFilesExt(self.source_path, "fasta")
        outputFile = self.result_path + '/similarity.blast'
        outputRedFile = self.result_path + '/similarity.blast.similarity'
        similarity = ""
        columns = [
            'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
            'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qcovs'
        ]

        if (not os.path.isfile(outputFile)):

            # generate all gene pairs within a genome
            allpairs = {(i, j) for i in fastaFiles for j in fastaFiles}
            # filter out duplicate pairs, e.g. (2,8) and (8,2)
            file_content = set(tuple(sorted(p)) for p in allpairs)

            datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                                 source_path=self.source_path,
                                                 result_path=self.result_path)

            sparkContext = SparkContext(
                conf=datapipe.initSpark("blastSimilarity"))
            similarity = datapipe.getBLAST(file_content,
                                           sparkContext,
                                           blastTask="similarity")

            result = ""
            for entry in similarity:
                if (entry[1]):
                    result += entry[1] + "\n"

            Utils.writeFile(outputFile, result)
            df = pandas.read_csv(StringIO(result),
                                 sep='\t',
                                 names=columns,
                                 index_col=False)

        else:
            df = pandas.read_csv(outputFile,
                                 sep='\t',
                                 names=columns,
                                 index_col=False)

        # generate leaner matrix with only selected columns,
        # output to new file
        if (not os.path.isfile(outputRedFile)):
            df = df[['qseqid', 'sseqid', 'pident', 'bitscore', 'qcovs']]
            df['id'] = df[['qseqid', 'sseqid']].agg('|'.join, axis=1)
            # select only the reduced columns (this also discards qseqid and
            # sseqid; the original df.drop(...) calls never assigned their
            # result, so they were no-ops)
            df = df[['id', 'pident', 'bitscore', 'qcovs']]
            df = df.sort_values('id')
            df.to_csv(sep='\t',
                      header=True,
                      path_or_buf=outputRedFile,
                      index=False)

        print('done!')
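
The reduced matrix can then be reloaded directly; a sketch (result_path stands in for the instance's result path):

df = pandas.read_csv(result_path + '/similarity.blast.similarity', sep='\t')
# columns: id ('qseqid|sseqid'), pident, bitscore, qcovs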
Example #5
    def __init__(self):
        self.config = Utils.loadConfig()
        self.task = self.config.get('eval', 'task')
        self.gold = self.config.get('eval', 'goldID.path')
        self.result = self.config.get('eval', 'result.path')
        self.threshold = float(self.config.get('eval', 'threshold'))
        self.sparkContext = SparkContext(conf=Utils.getSparkConf('filter'))

        self.Similarity = Similarity.Similarity(self.config)
        self.Filter = Filter.Filter(self.config,
                                    sparkContext=self.sparkContext)
        self.Merger = Merger.Merger(self.config)

        self.goldIDs = Utils.readFileLines(self.gold)[1:]
        self.resultFiles = Utils.listFilesExt(self.result, 'IDs.test')

        # total nb of gold genes
        self.nbGoldGenes = len(self.goldIDs)
        # total nb of gold clusters
        self.foldedGold = Utils.foldClusterData(self.goldIDs, 'gold', 0)
        self.goldGenes = [
            gene for genes in self.foldedGold.values() for gene in genes
        ]
        self.nbGoldClusters = len(self.foldedGold)
        self.outputheader = 'goldClusterID\tgoldGeneID\tpredictedClusterLabel\tpredictedClusterID\n'
        self.scoreheader = 'goldClusterID\tpredictedClusterID\tclusterScore\n'
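
The constructor assumes an [eval] section with these keys; a minimal config sketch (all values are placeholders):

[eval]
task = evaluate
goldID.path = /path/to/gold.clusterIDs
result.path = /path/to/results/
threshold = 0.5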
Example #6
    def createPfamTsv(self):
        listFiles = Utils.listFilesExt(self.source_path, 'domains')

        head = 'sequence_id\tprotein_id\tgene_start\tgene_end\tgene_strand\tpfam_id\tin_cluster\n'
        contentPos = ''
        contentNeg = ''

        for file in listFiles:
            fileContent = Utils.readFileLines(file)
            id = fileContent[0].replace('>', '')
            fileContent = fileContent[1:]
            inCluster = 1 if 'bgc' in os.path.basename(file).lower() else 0

            for line in fileContent:
                pfamId = line.split('|')[0]
                product = line.split('|')[1]
                currentLine = id + '\t' + product + '\t0\t0\t0\t' + pfamId + '\t' + str(
                    inCluster) + '\n'
                if (inCluster == 1):
                    contentPos += currentLine
                else:
                    contentNeg += currentLine

        contentPos = head + contentPos[:-1]
        contentNeg = head + contentNeg[:-1]

        # 'file' still holds the last path from the loop; all inputs are
        # assumed to share one parent folder
        folder = os.path.basename(os.path.dirname(file))
        resultPos = self.result_path + folder + '.positives.pfam.tsv'
        resultNeg = self.result_path + folder + '.negatives.pfam.tsv'

        Utils.writeFile(resultPos, contentPos)
        Utils.writeFile(resultNeg, contentNeg)
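
The .domains inputs are assumed to look like this, reconstructed from the parsing above (IDs made up):

# >NRRL3_00129
# PF00067|cytochrome_P450
# PF00083|sugar_transporter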
Example #7
    def getEmbeddings(self):
        matrix = np.zeros((self.dictLength(), self.embedSize))
        embfiles = Utils.listFilesExt(self.embedPath, 'w2v')
        for embFile in embfiles:
            if ('kmer' in embFile.lower() and 'kmer' in self.featType.lower()):
                matrix = self.mapEmbedWeights(embFile, 'kmer', matrix)
            elif ('domain' in embFile.lower() and 'domain' in self.featType.lower()):
                matrix = self.mapEmbedWeights(embFile, 'domain', matrix)
            elif ('go' in embFile.lower() and 'go' in self.featType.lower()):
                matrix = self.mapEmbedWeights(embFile, 'go', matrix)
        return matrix
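
A brief usage sketch (the receiver object and its dictLength/embedSize come from the surrounding class, assumed here):

weights = model.getEmbeddings()   # shape: (dictLength(), embedSize)
# entries never touched by mapEmbedWeights keep their zero initialization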
Example #8
import os

def genBankToAminoacid(path):
    entries = []
    # amino acid translations only
    translations = ''
    files = []
    if (os.path.isfile(path)):
        files.append(path)
    else:
        files = Utils.listFilesExt(path, 'gbk')

    for file in files:
        species = Utils.getSpecies(file)
        records = parseGenBank(file)

        for record in records:
            locus = record.id
            for feature in record.features:
                if feature.type == "CDS":
                    id, locus_tag, gene, protein_id, translation, \
                    product, function, description  = '','','','','','','',''

                    for key, value in feature.qualifiers.items():
                        # get rid of the quotes around the qualifier
                        # find entry ID
                        if key == "translation":
                            translation = value[0]
                        elif key == "gene":
                            gene = value[0]
                        elif key == "locus_tag":
                            locus_tag = value[0]
                        elif key == "protein_id":
                            protein_id = value[0]
                            protein_id = protein_id.replace('/', '')
                        elif key == "product":
                            product = value[0]
                        elif key == "function":
                            function = value[0]

                    #priority for gene ID
                    id = locus_tag if not id and len(locus_tag) > 1 else id
                    id = gene if not id and len(gene) > 1 else id
                    id = protein_id if not id and len(protein_id) > 1 else id

                    description = product if product.strip() else description
                    # guard the append: the original conditional expression
                    # doubled 'description' whenever 'function' was empty
                    if function.strip():
                        description += '|' + function

                    entry = '>' + locus + '|' + species + '|' + id + '|' + description + '\n' + translation
                    if (entry not in entries):
                        entries.append(entry)
                        translations += translation

    return entries, translations
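
A call sketch (hypothetical directory of .gbk files):

entries, translations = genBankToAminoacid('genomes/')
print(entries[0].splitlines()[0])   # '>locus|species|id|description'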
Example #9
    def createDomainDataset(self):
        useID = True
        files = Utils.listFilesExt(self.source_path, self.ext)
        files = [
            fileName for fileName in files if not os.path.isfile(
                self.result_path +
                os.path.basename(fileName).replace('.fasta', '.domains'))
        ]

        source_type = self.config.get('dataPipeline', 'source.type')
        count = 0
        countNone = 0
        datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                             source_path=self.source_path,
                                             result_path=self.result_path)

        sparkContext = SparkContext(conf=datapipe.initSpark("domainDataset"))
        pfamDomains = datapipe.getDomains(sparkContext)

        for file in files:
            fileName = os.path.basename(file)
            with open(file, 'r') as inF:
                IDs = inF.readline()

            resultFile = self.result_path + fileName.replace(
                '.fasta', '.domains')
            result = pfamDomains.get(file)

            if (result is not None):
                result = result.split('\n')
                outF = open(resultFile, 'w')
                outF.write(IDs)

                for line in result:
                    if (len(line.strip()) > 1):
                        items = line.split('\t')
                        domainID = items[5]
                        domain = items[6]
                        bitscore = items[11]
                        if (useID):
                            domain = domainID + '|' + domain

                        outF.write(domain + '\n')

                outF.close()
                count += 1
            else:
                print('None for file: ', file)
                countNone += 1

        print('Done generating', str(count),
              'domain files. \nNo domain found for', str(countNone), 'files.')
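
The tab-separated domain lines consumed above are assumed to be shaped like this (only the indexed fields are read; the rest are elided as '.'):

# '.\t.\t.\t.\t.\tPF00067\tp450\t.\t.\t.\t.\t12.3'
# items[5] = domain ID, items[6] = domain name, items[11] = bitscore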
Example #10
    def createNegShuffle(self, posPerc):
        files = Utils.listFilesExt(self.source_path, self.ext)
        negPerc = 100 - posPerc
        positives = len(files)
        negativeSize = int((negPerc * positives) / posPerc)
        print('Negative percentage: ' + str(negPerc) + '% \n' +
              'Negative instances: ' + str(negativeSize) + '\n' +
              'Positive percentage: ' + str(posPerc) + '% \n' +
              'Positive instances: ' + str(positives) + '\n' +
              'Total corpus size: ' + str(negativeSize + positives))

        thisDecRatio = 0.0
        count = 0
        ratio = (negativeSize / positives)
        decRatio = ratio - int(ratio)

        print('Generating...')
        for file in files:
            # add up the decimal ratio part
            thisDecRatio += round(decRatio, 2)
            # reset range
            ratioRange = int(negativeSize / positives)

            # when the accumulated decimal parts reach a whole extra
            # instance, emit one more shuffled negative this round
            if (thisDecRatio >= 1):
                ratioRange = int(ratio + thisDecRatio)
                thisDecRatio = 0

            for i in range(0, ratioRange):
                name = os.path.basename(file)
                result_file = name.split('.')[0] + '_' + str(
                    i) + '.shuffled.negative.fasta'

                # seqType is expected to contain 'nuc' or 'amino'
                if ('nuc' in self.seqType):
                    content = Parsers.genBankToNucleotide(file)
                elif ('amino' in self.seqType):
                    entries, content = Parsers.genBankToAminoacid(file)
                content = Utils.charGramShuffle(content, 2)
                content = '>' + name + '\n' + content

                count += 1

                Utils.writeFile(self.result_path + result_file, content)

        print('Total generated: ' + str(count) + '. Done!')
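
Worked numbers for the ratio logic (hypothetical counts):

# posPerc = 30, 120 positive files:
#   negativeSize = int((70 * 120) / 30) = 280
#   ratio = 280 / 120 ≈ 2.33 -> two shuffles per file, plus one
#   extra every third file as thisDecRatio accumulates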
Example #11
import os
import re
from Bio import SeqIO

def orthogroupSeqs(orthofile, seqpath, limit):
    orthodir = os.path.dirname(seqpath)
    ortholines = Utils.readFileLines(orthofile)[1:]
    seqfiles = Utils.listFilesExt(seqpath, "fasta")
    threshold = limit if (limit) else len(seqfiles)
    # split each line on tab/;/, and drop the leading group ID; the header
    # was already removed above, so slice from the first group (the original
    # sliced [1:threshold + 1], skipping the first orthogroup)
    orthogroups = [re.split('\t|;|,', item)[1:]
                   for item in ortholines][:threshold]
    sequences, output = dict(), dict()

    print('Loading files and seqIDs...')
    for seqfile in seqfiles:
        sequences.update(SeqIO.index(seqfile, "fasta"))

    orthodir = orthodir + '/orthologs_threshold' + str(threshold) + '/'

    if os.path.isdir(orthodir):
        print('Orthogroup path', orthodir, 'already exists.')
        exit()
    else:
        os.makedirs(orthodir)

        print('Loading sequences per IDs...')
        for group in orthogroups:
            for id in group:
                id = id.strip(' ')
                tempseq = sequences.get(id)

                if (tempseq is not None and len(tempseq) > 1):
                    thisseqfile = orthodir + tempseq.id + '.fasta'
                    content = '>' + tempseq.id + '\n' + str(tempseq.seq)
                    Utils.writeFile(thisseqfile, content)
                # else:
                #     print('ID not found', str(id))

    print('Done writing seqs for orthogroups.')
    # note: 'output' is never populated; callers receive an empty dict
    return output
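
A call sketch (hypothetical paths; the orthogroup file has one group per line, tab/semicolon/comma separated, with a header line):

orthogroupSeqs('Orthogroups.tsv', 'proteomes/', limit=50)
# writes proteomes/orthologs_threshold50/<seqID>.fasta for each member found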
Example #12
    def splitAsClusters(self):
        self.source_path = Utils.normalizePath(self.source_path)
        slimIDs = self.config.getboolean('corpusPrep', 'slim.id')
        files = Utils.listFilesExt(self.source_path, self.ext)
        result = []

        overlap = int((self.windowOverlap / 100) * self.length)

        for file in files:
            fileName = os.path.basename(file).split('.')[0]

            # build a per-file output path; the original appended to
            # self.result_path inside the loop, so the path grew with
            # every file processed
            resultFile = self.result_path + fileName + '_len' + str(
                self.length) + '_overlap' + str(self.windowOverlap)
            if (slimIDs):
                resultFile += '_slimIDs'
            resultFile += '.fasta'

            if (os.path.isfile(resultFile)):
                print('File already exists: ' + resultFile + '.\nDone.')

            else:
                file = Parsers.sortFasta(file)
                sequences = Parsers.parseFasta(file)
                content, ids, entry, overlapIds = '', '', '', ''
                for fasta in sequences:
                    content += str(fasta.seq.upper())
                    ids += str(fasta.id) if not ids else '|' + str(fasta.id)

                    if (slimIDs):
                        allIds = ids.split('|')
                        ids = allIds[0] + '|to|' + allIds[len(allIds) - 1]

                    while (len(content) > 0):
                        varSize = self.length - (len(entry))
                        if (varSize <= overlap):
                            overlapIds += str(
                                fasta.id) if not overlapIds else '|' + str(
                                    fasta.id)
                        entry += content[0:varSize]

                        if (len(entry) == self.length):
                            # move cursor on real sequence according to variable length added
                            content = content[varSize:]
                            # add chunk to list
                            if (slimIDs):
                                allIds = ids.split("|")
                                ids = allIds[0] + '|to|' + allIds[len(allIds) -
                                                                  1]
                            result.append('>' + ids + '\n' + entry)
                            # make sure that entry contains overlap
                            entry = entry[len(entry) - overlap:]

                            if (len(content) > 0):
                                ids = overlapIds
                                overlapIds = ''
                            else:
                                ids = ''

                        elif (len(content) > 0 and len(entry) < self.length):
                            content = content[len(entry):]

                # write this file's chunks and reset for the next input file
                # (the original joined 'result' into a string at loop level,
                # which broke the list appends on the next file)
                Utils.writeFile(resultFile, '\n'.join(result))
                result = []

        print('Done.')
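
Worked numbers for the windowing (hypothetical settings):

# length = 1000, windowOverlap = 10:
#   overlap = int((10 / 100) * 1000) = 100
#   each 1000-char chunk begins with the last 100 chars of the previous one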
Example #13
    def createDataset(self):
        neg_path = Utils.normalizePath(self.negPath)
        pos_path = Utils.normalizePath(self.posPath)
        negatives = Utils.listFilesExt(neg_path, self.ext)
        positives = Utils.listFilesExt(pos_path, self.ext)
        subject = Utils.normalizePath(self.result_path)

        negLen = len(negatives)
        posLen = len(positives)
        negPerc = 100 - self.posPerc
        negTotal = (posLen * negPerc) / self.posPerc

        if (negLen < negTotal):
            print("Not enough negative instances. Try another %")
            exit()
        else:
            if (not negatives or not positives):
                print('List of files was empty. '
                      'Please check \'neg.path\' and \'pos.path\' in the config file.')
                exit()

            subject += 'pos' + str(self.posPerc)

            if (len(subject) > 1):
                os.makedirs(subject, exist_ok=True)
                destTrain = subject + '/train/'
                destValid = subject + '/validation/'

                if (os.path.exists(destTrain) or os.path.exists(destValid)):
                    print('Dataset already split for train and validation. '
                          '\nRename ' + destTrain + ' or ' + destValid +
                          ' and try again.')
                else:
                    os.makedirs(destTrain, exist_ok=False)
                    os.makedirs(destValid, exist_ok=False)

                    perc = int(self.validPerc) / 100

                    readme = 'Source negative: ' + neg_path + \
                    '\nSource positive: ' + pos_path + \
                    '\n# negative files: ' + str(negLen) + \
                    '\n(final) # negative files: ' + str(negTotal) + \
                    '\n# positive files: ' + str(posLen) + \
                    '\nValidation data percentage (from total): ' + str(self.validPerc) + '%'

                    Utils.writeFile(subject + '/README.md', readme)

                    # select validation files
                    validNegatives = random.sample(negatives,
                                                   int(perc * negTotal))
                    validPositives = random.sample(positives,
                                                   int(perc * posLen))

                    # remove validation files from list
                    negatives = [
                        f for f in negatives if f not in validNegatives
                    ]
                    positives = [
                        f for f in positives if f not in validPositives
                    ]

                    # select randomly corresponding nb of negatives
                    negatives = random.sample(
                        negatives, int(negTotal - len(validNegatives)))

                    train = negatives + positives
                    validation = validPositives + validNegatives

                    for f in validation:
                        name = os.path.basename(f)
                        copy(f, destValid + name)

                    for f in train:
                        name = os.path.basename(f)
                        copy(f, destTrain + name)

                    print('Done splitting randomly ' + str(len(train)) +
                          ' train and ' + str(len(validation)) + ' files.')
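
Worked numbers for the split (hypothetical config values):

# posPerc = 25, 300 positive files:
#   negTotal = (300 * 75) / 25 = 900 negatives required
#   validPerc = 10 -> int(0.1 * 900) = 90 negatives and 30 positives
#   moved to validation/; the rest goes to train/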