def GetTotalNumbers(realKmerDict, posFile, negFile, pssmList, pwm, predictedMotifs, predictedKmers=None):
    """Return combined TP, FP and FN counts for a positive and a negative file.

    Both files are loaded with SeqGenUtils.fasta_read and each resulting
    sequence dictionary is scored by getTotalNumbersForSeqDict.

    Parameters:
        realKmerDict, pssmList, pwm, predictedMotifs, predictedKmers:
            forwarded unchanged to getTotalNumbersForSeqDict.
        posFile: file argument handed to SeqGenUtils.fasta_read for the
            positive set.
        negFile: file argument handed to SeqGenUtils.fasta_read for the
            negative set.

    Returns:
        A 3-tuple (TP, FP, FN) where each entry is the sum of the value
        reported for the positive file and the one for the negative file.
    """
    posSeqDict = SeqGenUtils.fasta_read(posFile)
    negSeqDict = SeqGenUtils.fasta_read(negFile)

    # The helper supplies all three counters, so no zero-initialisation is
    # needed before these assignments.
    numPosTP, numPosFP, numPosFN = getTotalNumbersForSeqDict(
        posSeqDict, realKmerDict, pssmList, pwm, predictedMotifs, predictedKmers)
    numNegTP, numNegFP, numNegFN = getTotalNumbersForSeqDict(
        negSeqDict, realKmerDict, pssmList, pwm, predictedMotifs, predictedKmers)

    return (numPosTP + numNegTP), (numPosFP + numNegFP), (numPosFN + numNegFN)
def GetKmersFromStructureFile(structureFile, numSeqsWithSignal):
    """Build the kmer collection derived from a structure file.

    The file is read with SeqGenUtils.fasta_read, the resulting dictionary
    is passed through SeqGenUtils.ChangeUsToTs, and the dictionary's values
    are handed to CreateLengthMatchingKmers together with numSeqsWithSignal.

    Returns:
        Whatever CreateLengthMatchingKmers returns for those values.
    """
    convertedDict = SeqGenUtils.ChangeUsToTs(SeqGenUtils.fasta_read(structureFile))
    return CreateLengthMatchingKmers(convertedDict.values(), numSeqsWithSignal)
def ComputeNgramFrequencyAndProbability(seqFile, nLen):
    """Compute n-gram frequencies and per-nucleotide probabilities for a file.

    Sequences are read with SeqGenUtils.fasta_read. For each sequence the
    n-gram list from getNgramListForSeq is collected; sequences that yield
    no n-grams are reported on stdout and excluded from the length total.

    Parameters:
        seqFile: file argument handed to SeqGenUtils.fasta_read.
        nLen: n-gram length, forwarded to getNgramListForSeq and
            getNGramFreqForCombinedList.

    Returns:
        A 2-tuple (ngram_freq, ngram_prob). ngram_freq is dict() of the
        result of getNGramFreqForCombinedList. ngram_prob maps each n-gram
        to its frequency divided by the summed length of the contributing
        sequences, rounded to 4 decimal places.
    """
    seqDict = SeqGenUtils.fasta_read(seqFile)

    nGramCombinedList = []
    total_seq_length = 0
    for header, sequence in seqDict.items():
        ngram_list = getNgramListForSeq(sequence, nLen)
        if len(ngram_list) > 0:
            nGramCombinedList.append(ngram_list)
            total_seq_length += len(sequence)
        else:
            # Single formatted argument prints the same text on Python 2 and 3.
            print("%s %s" % ("Found empty sequence for ", header))

    ngram_freq = dict(getNGramFreqForCombinedList(nGramCombinedList, nLen))

    ngram_prob = {}
    for ngram, frequency in ngram_freq.items():
        # Force true division: with two ints, Python 2's "/" floors the
        # quotient, which would turn every probability into 0.
        ngram_prob[ngram] = round(float(frequency) / total_seq_length, 4)

    return ngram_freq, ngram_prob
def test_DictFromFasta(self):
    """Check fasta_read and ChangeUsToTs against the RF000037 alignment.

    The file must load as 62 entries, none of whose values contain 'T'.
    After SeqGenUtils.ChangeUsToTs there must still be 62 entries and
    every value must contain 'T'.
    """
    alignmentPath = "/projects/bhandare/workspace/PySG/src/resources/RF000037.fa"

    # As loaded: expected entry count, no 'T' in any value.
    loadedDict = SeqGenUtils.fasta_read(alignmentPath)
    self.assertEqual(len(loadedDict), 62)
    for sequence in loadedDict.values():
        self.assertFalse('T' in sequence)

    # After conversion: same entry count, 'T' in every value.
    convertedDict = SeqGenUtils.ChangeUsToTs(loadedDict)
    self.assertEqual(len(convertedDict), 62)
    for sequence in convertedDict.values():
        self.assertTrue('T' in sequence)
def computeSequenceBasedDREMEResults(dremeFile, realCsvFile, posSeqFile, negSeqFile):
    """Score DREME predictions against the real kmers and report the metrics.

    The positive and negative sequence files are read with
    SeqGenUtils.fasta_read, the real kmers with parseRealKmers.GetRealKmerDict
    and the PSSMs with parseDreme.getPSSMListFromDremeFile. Each sequence set
    is scored by compareRealAndPredicted, and the summed counts are passed to
    the compareKmers metric helpers. The per-set counts and the four metrics
    are printed to stdout.

    Parameters:
        dremeFile: argument handed to parseDreme.getPSSMListFromDremeFile.
        realCsvFile: argument handed to parseRealKmers.GetRealKmerDict.
        posSeqFile: argument handed to SeqGenUtils.fasta_read (positive set).
        negSeqFile: argument handed to SeqGenUtils.fasta_read (negative set).

    Returns:
        A 2-tuple (sensitivity, ppv). Accuracy and specificity are only
        printed, not returned.
    """
    posSeqDict = SeqGenUtils.fasta_read(posSeqFile)
    negSeqDict = SeqGenUtils.fasta_read(negSeqFile)
    realKmerDict = parseRealKmers.GetRealKmerDict(realCsvFile)
    pssmList = parseDreme.getPSSMListFromDremeFile(dremeFile)

    numPosTP, numPosFP, numPosFN, numPosTN = compareRealAndPredicted(
        realKmerDict, posSeqDict, pssmList, positive=True)
    # NOTE(review): the flag is passed positionally here, exactly as before;
    # presumably it binds to the same `positive` parameter - confirm.
    numNegTP, numNegFP, numNegFN, numNegTN = compareRealAndPredicted(
        realKmerDict, negSeqDict, pssmList, False)

    # Space-joined items print the same text on Python 2 and Python 3.
    print(" ".join(str(item) for item in (
        "Positive: TP: ", numPosTP, ", FP: ", numPosFP,
        ", FN: ", numPosFN, ", TN: ", numPosTN)))
    print(" ".join(str(item) for item in (
        "Negative: TP: ", numNegTP, ", FP: ", numNegFP,
        ", FN: ", numNegFN, ", TN: ", numNegTN)))

    totalPos = len(posSeqDict)
    totalNeg = len(negSeqDict)

    # All metrics are computed from counts summed over both sequence sets.
    truePositives = numPosTP + numNegTP
    falsePositives = numPosFP + numNegFP
    falseNegatives = numPosFN + numNegFN
    trueNegatives = numPosTN + numNegTN

    sensitivity, ppv = compareKmers.GetSensitivityAndPPV(
        truePositives, falsePositives, falseNegatives)
    accuracy = compareKmers.GetAccuracy(
        truePositives, trueNegatives, totalPos + totalNeg)
    specificity = compareKmers.GetSpecificity(falsePositives, totalNeg)

    print(" ".join(str(item) for item in (
        "Sensitivity: ", sensitivity, ", PPV: ", ppv,
        ", Accuracy: ", accuracy, ", Specificity: ", specificity)))

    return sensitivity, ppv
motifCountDict[motif] = 1; if motifFound == False: totalMisMatches = totalMisMatches + 1; print "Total Mismatches: ", totalMisMatches; return motifMatchDict, motifCountDict; if __name__ == "__main__": import sys motifFile = sys.argv[1] faFile = sys.argv[2] motifList = getMotifList(motifFile) print motifList seqDict = SeqGenUtils.fasta_read(faFile) motifMatchDict, motifCountDict = findMotifInSequences(motifList, seqDict) singleMotif = 0 multipleMotif = 0; for header, value in motifMatchDict.iteritems(): if len(value) > 1: multipleMotif = multipleMotif + 1; if len(value) == 1: singleMotif = singleMotif + 1; print "Sequences that matches single motif: ", singleMotif; print "Sequences that matches multiple motifs: ", multipleMotif; print "Total Sequences: ", len(seqDict) # for item in sorted(motifMatchDict):