示例#1
0
def GetTotalNumbers(realKmerDict, posFile, negFile, pssmList, pwm, predictedMotifs, predictedKmers=None):
	"""Return combined TP, FP and FN counts for a positive and a negative FASTA file.

	Both files are loaded with ``SeqGenUtils.fasta_read`` and each result is
	handed to ``getTotalNumbersForSeqDict`` together with the remaining
	arguments, which are passed through unchanged.

	Returns a 3-tuple ``(TP, FP, FN)`` where each element is the sum of the
	positive-file count and the negative-file count.
	"""
	# Read both inputs up front so a missing file fails before any counting.
	posSeqDict = SeqGenUtils.fasta_read(posFile)
	negSeqDict = SeqGenUtils.fasta_read(negFile)

	# No zero-initialisation is needed: every counter is bound by the
	# unpacking below (the old initialiser also misspelled numNegFN).
	numPosTP, numPosFP, numPosFN = getTotalNumbersForSeqDict(
		posSeqDict, realKmerDict, pssmList, pwm, predictedMotifs, predictedKmers)
	numNegTP, numNegFP, numNegFN = getTotalNumbersForSeqDict(
		negSeqDict, realKmerDict, pssmList, pwm, predictedMotifs, predictedKmers)

	return (numPosTP + numNegTP), (numPosFP + numNegFP), (numPosFN + numNegFN)
示例#2
0
def GetDistribListForDirectory(directory, filesToFind):
	"""Return one dinucleotide distribution per matching file in ``directory``.

	Files are enumerated with ``SeqGenUtils.findFiles(directory, filesToFind)``.
	For each one the sequences come from ``shuffle_utils.get_seqs`` and are
	passed to ``shuffle_utils.compute_dinuc_distrib(seqs, True)``.
	The results are returned as a list in enumeration order.
	"""
	def _distribForFile(path):
		# get_seqs yields three values; only the sequences are used here.
		seqs, _gcList, _fgLengths = shuffle_utils.get_seqs(path)
		return shuffle_utils.compute_dinuc_distrib(seqs, True)

	return [_distribForFile(path)
			for path in SeqGenUtils.findFiles(directory, filesToFind)]
示例#3
0
def GetKmersFromStructureFile(structureFile, numSeqsWithSignal):
	"""Load k-mers from a structure FASTA file and length-match them.

	The file is read with ``SeqGenUtils.fasta_read`` and run through
	``SeqGenUtils.ChangeUsToTs``. The values of the resulting dict are then
	given to ``CreateLengthMatchingKmers`` along with ``numSeqsWithSignal``,
	and whatever that returns is the result.
	"""
	convertedDict = SeqGenUtils.ChangeUsToTs(SeqGenUtils.fasta_read(structureFile))
	return CreateLengthMatchingKmers(convertedDict.values(), numSeqsWithSignal)
示例#4
0
def ComputeNgramFrequencyAndProbability(seqFile, nLen):

    seqDict = SeqGenUtils.fasta_read(seqFile);
    nGramCombinedList = list();
    total_seq_length = 0;

    for header, sequence in seqDict.iteritems():
        ngram_list = getNgramListForSeq(sequence, nLen)
        if len(ngram_list) > 0:
            nGramCombinedList.append(ngram_list)
            total_seq_length = total_seq_length + len(sequence)
        else: 
            print "Found empty sequence for ", header;

    ngram_freq = getNGramFreqForCombinedList(nGramCombinedList, nLen);
    ngram_freq = dict(ngram_freq)
    ngram_prob = {}

    for ngram, frequency in ngram_freq.iteritems():
        ngram_prob[ngram] = round(frequency/total_seq_length, 4);

    # print str(nLen) + "-" + "gram frequences: \n", ngram_freq;
    #print str(nLen) + "-" + "gram probabilities: \n", ngram_prob;
    #print "Total Nucleotides: ", str(total_seq_length) 

    return ngram_freq, ngram_prob;
示例#5
0
 def GetRandomSequence(self, seqLen):
     """Return a string of ``seqLen`` symbols drawn one at a time.

     Each symbol is the result of one ``SeqGenUtils.weightedchoice`` call
     over the pairs ("C", cPercent), ("G", gPercent), ("A", aPercent) and
     ("T", tPercent) taken from this instance.
     """
     def _drawSymbol():
         # A fresh pair list per draw, exactly as many calls as seqLen.
         weightedLetters = [("C", self.cPercent),
                            ("G", self.gPercent),
                            ("A", self.aPercent),
                            ("T", self.tPercent)]
         return SeqGenUtils.weightedchoice(weightedLetters)

     return "".join(_drawSymbol() for _ in range(seqLen))
示例#6
0
	def test_DictFromFasta(self):
		"""Check fasta_read and ChangeUsToTs against the RF000037 alignment.

		The raw dict must hold 62 entries, none containing 'T'. After
		``SeqGenUtils.ChangeUsToTs`` it must still hold 62 entries, each
		containing 'T'.
		"""
		fastaPath = "/projects/bhandare/workspace/PySG/src/resources/RF000037.fa"

		rawDict = SeqGenUtils.fasta_read(fastaPath)
		self.assertEqual(len(rawDict), 62)
		for _name, sequence in rawDict.iteritems():
			self.assertFalse('T' in sequence)

		convertedDict = SeqGenUtils.ChangeUsToTs(rawDict)
		self.assertEqual(len(convertedDict), 62)
		for _name, sequence in convertedDict.iteritems():
			self.assertTrue('T' in sequence)
def computeSequenceBasedDREMEResults(dremeFile, realCsvFile, posSeqFile, negSeqFile):
	totalPosTP = totalPosFP = totalPosFN = totalPosTN = 0;
	totalNegTP = totalNegFP = totalNegFN = totalNegTN = 0;

	posSeqDict  = SeqGenUtils.fasta_read(posSeqFile);
	negSeqDict  = SeqGenUtils.fasta_read(negSeqFile);
	realKmerDict = parseRealKmers.GetRealKmerDict(realCsvFile);
	pssmList = parseDreme.getPSSMListFromDremeFile(dremeFile)

	numPosTP, numPosFP, numPosFN, numPosTN = compareRealAndPredicted(realKmerDict, posSeqDict, pssmList, positive=True)
	numNegTP, numNegFP, numNegFN, numNegTN = compareRealAndPredicted(realKmerDict, negSeqDict, pssmList, False)

	print "Positive: TP: ", numPosTP, ", FP: ", numPosFP, ", FN: ", numPosFN, ", TN: ", numPosTN
	print "Negative: TP: ", numNegTP, ", FP: ", numNegFP, ", FN: ", numNegFN, ", TN: ", numNegTN

	totalPos = len(posSeqDict)
	totalNeg = len(negSeqDict)

	sensitivity, ppv = compareKmers.GetSensitivityAndPPV((numPosTP + numNegTP) , (numPosFP + numNegFP), (numPosFN + numNegFN))
	accuracy = compareKmers.GetAccuracy( (numPosTP + numNegTP), (numPosTN + numNegTN),  (totalPos + totalNeg) )
	specificity = compareKmers.GetSpecificity( (numPosFP + numNegFP), totalNeg);

	print "Senitivity: ", sensitivity, ", PPV: ", ppv, ", Accuracy: ", accuracy, ", Specificity: ", specificity;
	return sensitivity, ppv;	
def parseSubDirectories(resultDir, level=1):
	gcContentMap = dict();
	resultFileName = resultDir + resultDir[:-1] + "_GC_Content.out"
	print "Result FileName: ", resultFileName
	for signalFile in SeqGenUtils.findFiles(resultDir, "Signal*.fa"):
		print "Signal File: ", signalFile;
		expt_name = os.path.dirname(signalFile).split("/")[2]
		gcContentValue, atContentValue = gcContent.getNucleotideComposition(signalFile)
		print gcContentValue, atContentValue;
		if expt_name in gcContentMap.keys():
			gcContentMap[str(expt_name)].append(gcContentValue);
		else:
			gcContentMap[str(expt_name)] = [gcContentValue];

	gcContentMeanStd = dict();
	for key, value in gcContentMap.iteritems():
		meanValue = np.mean(value);
		varianceValue = np.var(value);
		gcContentMeanStd[key] = [meanValue, varianceValue]

	writeDictToFile(gcContentMeanStd, resultFileName);
示例#9
0
					motifCountDict[motif] = 1;

		if motifFound == False:
			totalMisMatches = totalMisMatches + 1;

	print "Total Mismatches: ", totalMisMatches;
	return motifMatchDict, motifCountDict;

if __name__ == "__main__":	
	import sys
	motifFile = sys.argv[1]
	faFile = sys.argv[2]

	motifList = getMotifList(motifFile)
	print motifList
	seqDict  = SeqGenUtils.fasta_read(faFile)
	motifMatchDict, motifCountDict = findMotifInSequences(motifList, seqDict)

	singleMotif = 0
	multipleMotif = 0;
	for header, value in motifMatchDict.iteritems():
		if len(value) > 1:
			multipleMotif = multipleMotif + 1;
		if len(value) == 1:
			singleMotif = singleMotif + 1;

	print "Sequences that matches single motif: ", singleMotif;
	print "Sequences that matches multiple motifs: ", multipleMotif;
	print "Total Sequences: ", len(seqDict)

	# for item in sorted(motifMatchDict):
def writeFastaFileFromKmers(kmerList, prefix, outFileName):
	"""Write ``kmerList`` to ``outFileName`` as FASTA.

	The lines are produced by ``SeqGenUtils.createFastaFileFromKmers(kmerList,
	prefix)`` and written by ``SeqGenUtils.writeFastaLinesToFile``.
	"""
	SeqGenUtils.writeFastaLinesToFile(
		SeqGenUtils.createFastaFileFromKmers(kmerList, prefix),
		outFileName)
示例#11
0
文件: SeqGen.py 项目: ezubaric/PySG
	def GetRandomSequence(self, seqLen):
		"""Return a string of ``seqLen`` symbols, one weighted draw per position.

		Every position is filled by ``SeqGenUtils.weightedchoice`` over the
		letters C, G, A and T paired with this instance's ``cPercent``,
		``gPercent``, ``aPercent`` and ``tPercent``.
		"""
		symbols = []
		for _ in range(seqLen):
			weightedLetters = [
				("C", self.cPercent),
				("G", self.gPercent),
				("A", self.aPercent),
				("T", self.tPercent),
			]
			symbols.append(SeqGenUtils.weightedchoice(weightedLetters))
		return "".join(symbols)
示例#12
0
	kmerDict = parseKspectrum.FindKspectrumKmers(featureFile, numKmers)
	HuR_ReString = '[^-](\d+\.\d+)\,HuR_([ATGC]+)'
	TTP_ReString = '[^-](\d+\.\d+)\,TTP_([ATGC]+)'

	HuRKmerDict = parseKspectrum.FindRBPSpecificKmers(featureFile, HuR_ReString, numKmers);
	TTPKmerDict = parseKspectrum.FindRBPSpecificKmers(featureFile, TTP_ReString, numKmers);


	kmerFastaLines = addFastaHeaderToKmers(kmerDict, "FeatureKmer")
	HuRFastaLines = addFastaHeaderToKmers(HuRKmerDict, "HuR_Specific")
	TTPFastaLines = addFastaHeaderToKmers(TTPKmerDict, "TTP_Specific")

	filename_prefix = os.path.splitext(sequenceFile)[0];
	filename_ext = os.path.splitext(sequenceFile)[1];
	HuR_Filename = filename_prefix + "_HuR" + filename_ext;
	TTP_Filename = filename_prefix + "_TTP" + filename_ext;

	SeqGenUtils.writeFastaLinesToFile(kmerFastaLines, sequenceFile)
	SeqGenUtils.writeFastaLinesToFile(HuRFastaLines, HuR_Filename)
	SeqGenUtils.writeFastaLinesToFile(TTPFastaLines, TTP_Filename)
	# topKmerFile = sys.argv[1]
	# signalType = sys.argv[2] ## PWM or PFM
	# signalFile = sys.argv[3]
	# seqFile = sys.argv[4]
	# topKmers = int(sys.argv[5])

	# kmerList = GetTopXKmers(topKmerFile, topKmers);
	# motif = GetMotifForSignal(signalType, signalFile);
	# seqDict  = SeqGenUtils.fasta_read(seqFile);
	# FindBestScoreSeqForMotif(motif, seqDict, kmerList)