def func_deriv(self, v, sign=1.0):
    """ Derivative of the (penalized) objective function.

    Parameters:
        v    -- state vector; the slices used here are located with
                self.iShape, self.iMono, self.iL1Shape and self.iL1Mono.
        sign -- overall factor applied to the objective (default 1.0).

    Returns a numpy array of length self.iEnd.

    The L2 and L1 penalty terms are scaled by `sign`, exactly as func()
    scales them, so that the returned vector is the gradient of func(v, sign)
    for every value of `sign` (they were previously left unscaled, which made
    the gradient inconsistent with func() whenever sign != 1).
    """
    gradient = np.zeros(self.iEnd)
    #Number of leading entries (shape + mono + intercept) covered by the
    #non-penalized gradient
    nCore = self.k + 3 * self.L + 1

    #Compute gradient of the non-penalized objective function
    if self.objectiveFunction == 'affinityError':
        gradient[:nCore] = sign * self.computeAffinityErrorGradient(v)
    elif self.objectiveFunction == 'KLDivergence':
        gradient[:nCore] = sign * self.computeKLDivergenceGradient(v)
    else:
        sl.err('Invalid objective function')

    #Adding L2 contribution to gradient
    if self.lambdaL2Mono != 0:
        gradient[self.iMono:self.iMono + self.L * 3] += (
            sign * 2 * self.lambdaL2Mono * np.dot(
                self.vectorToMonoMatrix(v), self.xMono.transpose()).flatten())
    if self.lambdaL2Shape != 0:
        gradient[self.iShape:self.iShape + self.k] += (
            sign * 2 * self.lambdaL2Shape * np.array(
                v[self.iShape:self.iShape + self.k]))

    #Adding gradients of dual L1 variables (func() is linear in them)
    if self.useL1Mono:
        gradient[self.iL1Mono:self.iL1Mono + 4 * self.L] = (
            sign * self.lambdaL1Mono * np.ones(4 * self.L))
    if self.useL1Shape:
        gradient[self.iL1Shape:self.iL1Shape + self.k] = (
            sign * self.lambdaL1Shape * np.ones(self.k))
    return gradient
def func(self, v, sign=1.0):
    """ Evaluates the penalized objective function at state vector v.

    The value is the selected base objective (self.objectiveFunction) plus
    the enabled L2 and L1 penalty terms; every term is multiplied by `sign`.
    """
    total = 0.
    objective = self.objectiveFunction

    #Base (non-penalized) objective
    if objective == 'KLDivergence':
        total += sign * self.computeKLDivergence(v)
    elif objective == 'affinityError':
        total += sign * self.computeAffinityError(v)
    else:
        sl.err('Invalid objective function')

    #L2 penalties: squared 2-norms of the mono rows and of the shape slice
    if self.lambdaL2Mono != 0:
        monoNormSq = sum(
            [la.norm(row, 2)**2 for row in self.vectorToMonoMatrix(v)])
        total += sign * self.lambdaL2Mono * monoNormSq
    if self.lambdaL2Shape != 0:
        shapeEnd = self.iShape + self.k
        shapeNormSq = la.norm(v[self.iShape:shapeEnd], 2)**2
        total += sign * self.lambdaL2Shape * shapeNormSq

    #L1 penalties: plain sums over the dual-variable slices
    if self.useL1Mono:
        monoEnd = self.iL1Mono + 4 * self.L
        total += sign * self.lambdaL1Mono * np.sum(v[self.iL1Mono:monoEnd])
    if self.useL1Shape:
        dualEnd = self.iL1Shape + self.k
        total += sign * self.lambdaL1Shape * np.sum(v[self.iL1Shape:dualEnd])
    return total
def main():
    """Command-line entry point: writes a randomized k-mer table to STDOUT.

    Reads a k-mer table with sl.readKMerTable and produces one new value per
    k-mer, either by permuting the input values (--permute) or, by default,
    by fitting a random mono+di-nucleotide model whose conditional variances
    are matched to those of the input table. One "kmer,value" line is printed
    per k-mer.

    NOTE(review): relies on names defined outside this function (argparse,
    np, op, sl, sys, maketrans, nucl, scoreKMer). op is presumably
    scipy.optimize and nucl presumably the nucleotide alphabet in the order
    A, C, G, T -- confirm at module level.
    NOTE(review): uses Python 2 constructs (print statement, indexing the
    result of dict.keys(), a bare maketrans).
    """
    #Creating parser
    parser = argparse.ArgumentParser(
        description=
        'Generates a random k-mer table from an input table using either the "matched complexity" method (default) or by permuting the input table.'
    )
    parser.add_argument(
        'kmerFile',
        metavar='kmerTable.csv',
        help='Comma-separated kMer table. (COL 1) = kmer, (COL 2) = values')
    parser.add_argument("-s",
                        metavar='seed',
                        help="Seed for numpy.random",
                        default=None)
    parser.add_argument("--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    parser.add_argument("--symmetric",
                        help="k-mer table is reverse-complement symmetric.",
                        action="store_true")
    parser.add_argument("--header",
                        help="First line in kmer file is header.",
                        action="store_true")
    parser.add_argument("--permute",
                        help="Permuts the input table",
                        action="store_true")
    args = parser.parse_args()

    #Sets seed.
    if args.s is not None:
        np.random.seed(int(args.s))

    #Reads k-mer table
    (kMers, values, k, nCol, header) = sl.readKMerTable(args.kmerFile,
                                                        args.header)
    #Only a single value column is supported
    if len(values[0]) > 1:
        sl.err(
            "Current implementation can only generate a single random column")

    if args.permute:
        #Permutes k-mer table
        if args.symmetric:
            #Identifies reverse-complement pairs of sequences that should be
            #held together; one representative k-mer per pair is kept as key.
            kmerPairs = {}
            symValues = {}
            trantab = maketrans("ACGT", "TGCA")
            for i in range(len(kMers)):
                km = kMers[i]
                rcKm = km[::-1].translate(trantab)
                #Skips k-mers whose reverse complement is already registered
                if rcKm not in kmerPairs:
                    #NOTE(review): kMers.index() looks like a linear scan
                    #(kMers presumably a list), making this loop quadratic.
                    kmerPairs[kMers[i]] = (i, kMers.index(rcKm))
                    #Pair value = average of the two strands' values
                    symValues[kMers[i]] = (values[i] +
                                           values[kMers.index(rcKm)]) / 2
            #Creates list of initial and permuted kmers
            inKmers = kmerPairs.keys()
            pKmers = np.random.permutation(inKmers)
            newValues = [0.] * len(kMers)
            for i in range(len(inKmers)):
                km = inKmers[i]
                rcKm = km[::-1].translate(trantab)
                #Both members of a pair receive the same permuted value
                permutedValue = symValues[pKmers[i]]
                newValues[kMers.index(km)] = permutedValue
                newValues[kMers.index(rcKm)] = permutedValue
        else:
            #Plain permutation of the first (only) value column
            newValues = np.random.permutation(values[:, 0])
    else:
        #Generates random mono+di model with matched conditional variance
        # 1. Computes the expected conditional variance of 'true' k-mer table:
        #    entry [i1, i2] is the variance of the values over k-mers with
        #    fixed letters at positions i1 and i2, averaged over the letters
        #    (for i1 == i2 only n1 == n2 is used).
        conditionalVariance = np.array([[
            np.mean([
                np.var([
                    values[j][0] for j in range(len(kMers))
                    if kMers[j][i1] == nucl[n1] and kMers[j][i2] == nucl[n2]
                ]) for n1 in range(4) for n2 in range(4)
                if not (i1 == i2 and n1 != n2)
            ]) for i2 in range(k)
        ] for i1 in range(k)])

        # 2. Create a design matrix used to generate random k-mer table
        # 2.1 Creates (independent) mononucleotide matrices:
        sigList = []
        matrixList = []
        for i in range(k):
            #Generates new mononucleotide-matrix using uniform random numbers
            #(mean-centered, non-zero only in column i)
            rv = np.random.rand(4)
            newMonoMatrix = np.array([[0.] * (i) + [rv[y] - np.mean(rv)] +
                                      [0.] * (k - i - 1) for y in range(4)])
            #Generates a "signature" vector used to make sure we don't add the same degrees of freedom twice (for symmetric matrices)
            newMonoSignature = np.array([0] * (i) + [1] + [0] * (k - i - 1))
            #Symmetrizes matrix and signature if appropriate.
            if args.symmetric:
                newMonoMatrix = (newMonoMatrix + newMonoMatrix[::-1, ::-1]) / 2
                newMonoSignature = newMonoSignature + newMonoSignature[::-1]
            #Checks if the current signature already has been added.
            sig = "mono" + "".join(["%d" % si for si in newMonoSignature])
            #Saves the new matrix if an equivalent matrix has not been saved before
            if sig in sigList:
                continue
            else:
                sigList += [sig]
                #Stored as a (mono, di) pair with an all-zero di part
                matrixList += [(newMonoMatrix, np.zeros((16, k - 1)))]

        # 2.2 Creates (independent) dinucleotide matrices:
        #matrix used to reverse-complement a 16-entry dinucleotide vector
        rcDiMatrix = np.array([[
            int(j == 4 * (3 - n2) + (3 - n1)) for n1 in range(4)
            for n2 in range(4)
        ] for j in range(16)])
        for i in range(k - 1):
            #Generates new dinucleotide-matrix using uniform random numbers
            #(mean-centered, non-zero only in column i)
            rv = np.random.rand(16)
            newDiMatrix = np.array([[0.] * (i) + [rv[y] - np.mean(rv)] +
                                    [0.] * (k - i - 2) for y in range(16)])
            #Generates a "signature" vector used to make sure we don't add the same degrees of freedom twice
            newDiSignature = np.array([0] * (i) + [1] + [0] * (k - i - 2))
            #Symmetrizes matrix and signature if appropriate.
            if args.symmetric:
                newDiMatrix = (newDiMatrix +
                               rcDiMatrix.dot(newDiMatrix[:, ::-1])) / 2
                newDiSignature = newDiSignature + newDiSignature[::-1]
            #Checks if the current signature already has been added.
            sig = "di" + "".join(["%d" % si for si in newDiSignature])
            #Saves the new matrix if an equivalent matrix has not been saved before
            if sig in sigList:
                continue
            else:
                sigList += [sig]
                #Stored as a (mono, di) pair with an all-zero mono part
                matrixList += [(np.zeros((4, k)), newDiMatrix)]

        # 2.3 Computes design matrix by scoring each k-mer using each independent mono/di-nucleotide matrix
        #     (scoreKMer is defined elsewhere; presumably returns a scalar score -- TODO confirm)
        X = np.array([[scoreKMer(kMer, m) for m in matrixList]
                      for kMer in kMers])

        # 3. Computes the conditional covariance between pairs of matrix models
        #    (one len(matrixList) x len(matrixList) matrix per position pair,
        #    averaged over the conditioning letters as in step 1)
        d = [[
            np.array([[
                np.mean([
                    np.cov(
                        np.array([[X[j][a], X[j][b]]
                                  for j in range(len(kMers))
                                  if kMers[j][i1] == nucl[n1] and
                                  kMers[j][i2] == nucl[n2]]).transpose())[0, 1]
                    for n1 in range(4) for n2 in range(4)
                    if not (i1 == i2 and n1 != n2)
                ]) for a in range(len(matrixList))
            ] for b in range(len(matrixList))]) for i1 in range(k)
        ] for i2 in range(k)]

        # 4. Finds a combination of matrix models that minimizes the L2-error in the expected conditional variance.
        # 4.1 Loss function
        f = lambda v: np.sum([(v.dot(d[i1][i2]).dot(v) - conditionalVariance[
            i1, i2])**2 for i1 in range(k) for i2 in range(k)])
        # 4.2 Gradient of loss function
        df = lambda v: np.sum([
            4 * (v.dot(d[i1][i2]).dot(v) - conditionalVariance[i1, i2]) * v.
            dot(d[i1][i2]) for i1 in range(k) for i2 in range(k)
        ],
                              axis=0)
        # 4.3 Initial seed
        x0 = np.random.rand(len(matrixList))
        # 4.4 Minimizes the loss function using L-BFGS
        res = op.minimize(f,
                          x0,
                          args=(),
                          method='L-BFGS-B',
                          jac=df,
                          options={
                              'disp': False,
                              'maxiter': 1000
                          })
        # 4.5 Computes new values
        newValues = np.array([Xi.dot(res.x) for Xi in X])

        if args.verbose:
            # Writes conditional variance matrices (input table vs. random
            # model); conditionalVariance is only available in this branch.
            sl.disp("Conditional variance matrix:")
            sl.printMatrix(conditionalVariance, sys.stderr)
            sl.disp("Conditional variance in random model:")
            newConditionalVariance = np.array([[
                np.mean([
                    np.var([
                        newValues[j] for j in range(len(kMers))
                        if kMers[j][i1] == nucl[n1] and
                        kMers[j][i2] == nucl[n2]
                    ]) for n1 in range(4) for n2 in range(4)
                    if not (i1 == i2 and n1 != n2)
                ]) for i2 in range(k)
            ] for i1 in range(k)])
            sl.printMatrix(newConditionalVariance, sys.stderr)

    #Writes the random matrix to STDOUT
    for i in range(len(newValues)):
        print "%s,%f" % (kMers[i], newValues[i])
def main():
    """Command-line entry point: prints the mean k-mer value profile.

    Reads equal-length sequences (one per line, from a file or STDIN), looks
    up every k-mer window of each sequence in the k-mer table and prints, for
    each window position, the per-column mean over all sequences as one
    comma-separated line. With --header the header returned by
    sl.readKMerTable is printed first.

    Changes relative to the previous version:
      * the sequence file is closed once it has been read;
      * an input without any sequence is reported through sl.err instead of
        failing with a TypeError on `None - k`;
      * windows are cut from the stripped sequence rather than the raw line;
      * key snapshots and print calls work under both Python 2 and 3.

    NOTE(review): depends on module-level `sl` and `nucl`; nucl is presumably
    the nucleotide alphabet (A, C, G, T) -- confirm at module level.
    """
    #Creating parser
    parser = argparse.ArgumentParser(
        description=
        'Reads a list of sequences and computes the mean shape profile using a k-mer table. Outputs one position in the profile per line. The k-mer table can have multiple columns, leading to multiple columns of output.'
    )
    parser.add_argument(
        'seqFile',
        metavar='seq.lst',
        help='Text file containing one sequence per line ("-" gives STDIN)')
    parser.add_argument(
        'kmerFile',
        metavar='kmerValue.csv',
        help=
        'Comma-separated kMer file. The first column contains k-mers, the following columns contain the associated value(s).'
    )
    parser.add_argument("--scoreN",
                        help="Treats N as the average of A, C, G, and T.",
                        action="store_true")
    parser.add_argument("--header",
                        help="First line in k-mer file is header.",
                        action="store_true")
    #	parser.add_argument("--verbose", help="Increase output verbosity", action="store_true")
    args = parser.parse_args()

    #Parses kmer file
    (kMers, values, k, nCol, header) = sl.readKMerTable(args.kmerFile,
                                                        args.header)
    kMerTable = dict([(kMers[i], values[i]) for i in range(len(kMers))])

    if args.scoreN:
        #Adds wildcard "N" characters to the k-mer table by averaging over
        #"A", "C", "G", "T". Ns are added one position at a time, in
        #combination with the Ns added at earlier positions.
        for x in range(k):
            #Snapshot of the keys: the table grows inside the loop
            for key in list(kMerTable.keys()):
                newKey = key[:x] + "N" + key[x + 1:]
                if newKey in kMerTable:
                    continue
                kMerTable[newKey] = sum(
                    [kMerTable[key[:x] + n + key[x + 1:]]
                     for n in nucl]) / len(nucl)

    #Computes the mean value
    nSeq = 0
    seqSum = None
    L = None

    #Determines where to read the sequence file
    readsStdin = args.seqFile == "-"
    f = sys.stdin if readsStdin else open(args.seqFile)
    try:
        #Loops over sequences
        for l in f:
            seq = l.rstrip()
            #Makes sure the sequences have equal length (and sets up seqSum
            #in the first round)
            if L is None:
                L = len(seq)
                seqSum = [np.zeros(nCol) for i in range(L - k + 1)]
            elif L != len(seq):
                sl.err('All sequences must be of equal length.')
            nSeq += 1
            for i in range(L - k + 1):
                seqSum[i] += kMerTable[seq[i:i + k]]
    finally:
        #Never closes STDIN, only files opened here
        if not readsStdin:
            f.close()

    #Without sequences there is no profile (and no length) to report
    if nSeq == 0:
        sl.err('No sequences were read from the input.')
        return

    #Prints the mean profile
    if args.header:
        print(",".join(header))
    for i in range(L - k + 1):
        print(",".join(["%f" % di for di in (seqSum[i] / nSeq)]))
def main():
    """Command-line entry point: samples sequences uniformly within energy bins.

    Loads a mononucleotide scoring matrix, rescales it so that sequence
    scores fall in [-1, 0], samples sequences at a range of inverse
    temperatures with sampleSequence, down-samples them by rejection so that
    they are uniform within each energy bin, and prints up to -n sequences
    per bin as ">iE=<bin>,iSeq=<index>" header lines followed by the sequence.

    Changes relative to the previous version:
      * the energy-bin index is clamped to [0, nEnergyBins - 1]; a score that
        rounds to slightly below -1 used to give index -1, which silently
        filed the lowest-affinity sequence in the highest bin;
      * the key check on the scoring matrix compares a list, so it also works
        where dict.keys() returns a view;
      * print calls work under both Python 2 and 3.

    NOTE(review): depends on module-level `sl`, `sampleSequence` and `nucl`.
    Each sampled item s is used as (s[0] = numeric sequence, s[1] = score);
    nucl presumably maps 0..3 to A, C, G, T -- confirm at module level.
    """
    #Creating parser
    tDEFAULT = 13  # Number of temperatures
    eDEFAULT = 10  # Number of free-energy bins
    nDEFAULT = 1000  # Number of sampled sequences
    parser = argparse.ArgumentParser(
        description=
        'Samples random sequences from the uniform distribution and sorts them into -DDG/RT bins using a free-energy scoring matrix. The sequences are first sampled from the Boltzmann distribution e^{E/T} using Metropolis-Hastings sampling and then down-sampled to the uniform distribution using bin-specific rejection-sampling. Outputs the sampled sequences in FORMAT. The sequence identifiers contain energy-bin index iE, where 1 is the lowest-affinity bin, and a sequence index iSeq.'
    )
    parser.add_argument(
        'matrixFile',
        metavar='scoringMatrix.tsv',
        help=
        '(Mononucleotide) scoring matrix. (COL 1) = "A,C,G,T", (COL 2..) = -ddG/RT values'
    )
    parser.add_argument(
        '-t',
        metavar='nTemp',
        help=
        'Number of temperatures to use in the Metropolis-Hastings sampling (DEFAULT = %d)'
        % tDEFAULT,
        type=int,
        default=tDEFAULT)
    parser.add_argument('-e',
                        metavar='nEnergyBins',
                        help='Number of energy bins (DEFAULT = %d)' % eDEFAULT,
                        type=int,
                        default=eDEFAULT)
    parser.add_argument('-n',
                        metavar='nSample',
                        help='Number of sampled sequences (DEFAULT = %d)' %
                        nDEFAULT,
                        type=int,
                        default=nDEFAULT)
    #	parser.add_argument("--verbose", help="Increase output verbosity", action="store_true")
    args = parser.parse_args()

    #1. Reads and processes scoring matrix
    #1.1 - Reads matrix file
    rawScoringMatrix = sl.loadScoringMatrix(args.matrixFile)
    if list(rawScoringMatrix.keys()) != [0]:
        sl.err(
            "Model file should only contain scoring matrix (mononucleotides).")

    #1.2. Shifts and scales the scoring matrix so the sequence-score is in the interval [-1,0]
    scoringMatrix = np.array([i - max(i) for i in rawScoringMatrix[0]]) / sum(
        [max(i) - min(i) for i in rawScoringMatrix[0]])

    #2. Samples sequences at each temperature
    k = len(scoringMatrix)  # Width of scoring matrix
    dE = 1. / args.e  # Width of energy bins
    betaList = 2 * np.log(math.pow(4, k)) * np.linspace(
        -1, 1, args.t)  #List of inverse temperatures to loop over
    binnedSeq = [[[] for ei in range(args.e)]
                 for ti in range(args.t)]  # (Empty) table of energy-binned sequences
    for betaI in range(len(betaList)):  #Loops over inverse temperatures.
        #2.1 Samples sequences using Metropolis-Hastings sampling from distribution p[s] ~ e^(beta*E)
        beta = betaList[betaI]
        seqList = sampleSequence(scoringMatrix, beta, 3 * args.n)
        for s in seqList:
            #2.2 Identifies energy bin of sequence. Clamped on both sides:
            #    rounding can push the score marginally outside [-1, 0].
            energyI = int(np.floor((s[1] + 1) * args.e))
            energyI = max(0, min(energyI, args.e - 1))

            #2.3 Saves the sequence with probability proportional to
            #    e^(-beta*E). This downsamples to the uniform distribution
            #    within each bin. The reference energy is the bin edge that
            #    keeps the acceptance probability <= 1: the lower edge for
            #    beta > 0, the upper edge otherwise. Exactly one random
            #    number is drawn per sequence.
            if beta > 0:
                referenceE = -1 + energyI * dE
            else:
                referenceE = -1 + (energyI + 1) * dE
            if np.random.rand() < np.exp(-beta * (s[1] - referenceE)):
                binnedSeq[betaI][energyI] += [s]

    #3. Output
    #3.1 Converts from numeric to string representation, pooling all
    #    temperatures per energy bin and shuffling within the bin
    outSeqs = []
    for iE in range(args.e):
        outSeqs += [[]]
        for Ti in range(args.t):
            for s in binnedSeq[Ti][iE]:
                outSeqs[iE] += ["".join(nucl[n] for n in s[0])]
        outSeqs[iE] = np.random.permutation(outSeqs[iE])

    #3.2 Prints the sequences (at most args.n per bin).
    for iE in range(args.e):
        for iS in range(min(args.n, len(outSeqs[iE]))):
            print(">iE=%d,iSeq=%d" % (iE + 1, iS + 1))
            print(outSeqs[iE][iS])
def __init__(self,
             agnosticModel,
             shapeModel,
             lambdaL1Mono=0.,
             lambdaL1Shape=0.,
             lambdaL2Mono=0.,
             lambdaL2Shape=0.,
             objectiveFunction='KLDivergence',
             normalizeModel=True,
             edgeReadout=False):
    """ Stores the models and penalty settings and lays out the state vector.

    Parameters:
        agnosticModel     -- dict with keys 0 (mononucleotide values) and 1
                             (dinucleotide values, either a 3-dimensional
                             array or rows of 16 values that are reshaped to
                             4x4 here).
        shapeModel        -- dict with keys 0 and 1, passed on to
                             self.buildDiShapeDesignMatrix.
        lambdaL1Mono, lambdaL1Shape -- L1 penalty weights; a value of 0
                             disables the corresponding dual variables.
        lambdaL2Mono, lambdaL2Shape -- L2 penalty weights.
        objectiveFunction -- name of the objective to be used.
        normalizeModel    -- if true, shifts the mononucleotide values so
                             that the mean affinity is 1.0.
        edgeReadout       -- stored as self.edgeReadout.

    Invalid model dictionaries are reported through sl.err.

    Fix: when the mono L1 penalty is disabled, self.lambdaL1Mono is now set
    to None; a typo used to assign the misspelled attribute `lambaL1Mono`
    instead, leaving self.lambdaL1Mono undefined.
    """
    #Saves mechanism-agnostic (true) model
    if sorted(agnosticModel.keys()) != [0, 1]:
        sl.err(
            "Mechanism-agnostic model file must (only) have mono- and di-nucleotide values."
        )
    self.agnosticMono = np.copy(agnosticModel[0])  #L*4 array
    self.agnosticMonoAffinity = np.exp(self.agnosticMono)
    if len(agnosticModel[1].shape) == 3:
        self.agnosticDi = np.copy(agnosticModel[1])
    else:
        #Unpacks each 16-entry row into a 4x4 table indexed [n1][n2]
        self.agnosticDi = np.array([[[vi[4 * n1 + n2] for n2 in range(4)]
                                     for n1 in range(4)]
                                    for vi in agnosticModel[1]
                                    ])  #(L-1)*4*4 array
    self.agnosticDiAffinity = np.exp(self.agnosticDi)
    self.L = len(self.agnosticMono)
    self.edgeReadout = edgeReadout

    #Shifts the agnostic model so that the expected affinity is 1.0
    if normalizeModel:
        meanAffinity = self.computeSum(1, self.agnosticMonoAffinity,
                                       self.agnosticDiAffinity) / 4**self.L
        #The shift is spread evenly over the L positions
        self.agnosticMono -= np.log(meanAffinity) / self.L
        self.agnosticMonoAffinity = np.exp(self.agnosticMono)

    #Processes shape model
    if sorted(shapeModel.keys()) != [0, 1]:
        sl.err(
            "Shape model file must (only) have mono- and di-nucleotide values."
        )
    if len(shapeModel[0]) < 2:
        sl.err("Shape model must 2bp long or wider.")
    self.diDesignMatrix = self.buildDiShapeDesignMatrix(shapeModel, self.L)
    self.k = len(self.diDesignMatrix)

    # Saves the objective function choice to be used
    self.objectiveFunction = objectiveFunction

    # Setting up dual variables for L1 Penalization (a zero weight switches
    # the penalty off and leaves the stored weight as None)
    self.useL1Mono = lambdaL1Mono != 0.
    self.lambdaL1Mono = lambdaL1Mono if self.useL1Mono else None
    self.useL1Shape = lambdaL1Shape != 0.
    self.lambdaL1Shape = lambdaL1Shape if self.useL1Shape else None
    self.lambdaL2Mono = lambdaL2Mono
    self.lambdaL2Shape = lambdaL2Shape

    #Setting up indices for accessing entries in the state vector:
    #[shape (k) | mono (3*L) | intercept (1) | L1 shape duals (k, optional)
    # | L1 mono duals (4*L, optional)]
    self.iShape = 0
    self.iMono = self.iShape + self.k
    self.iIntercept = self.iMono + 3 * self.L
    self.iL1Shape = self.iIntercept + 1
    if self.useL1Shape:
        self.iL1Mono = self.iL1Shape + self.k
    else:
        self.iL1Mono = self.iL1Shape
    if self.useL1Mono:
        self.iEnd = self.iL1Mono + 4 * self.L
    else:
        self.iEnd = self.iL1Mono
    return