Example #1
    def func_deriv(self, v, sign=1.0):
        """ Derivative of objective function """

        gradient = np.zeros(self.iEnd)

        #Compute non-penalized objective function
        if self.objectiveFunction == 'affinityError':
            gradient[:self.k + 3 * self.L +
                     1] = sign * self.computeAffinityErrorGradient(v)
        elif self.objectiveFunction == 'KLDivergence':
            gradient[:self.k + 3 * self.L +
                     1] = sign * self.computeKLDivergenceGradient(v)
        else:
            sl.err('Invalid objective function')

        #Adding L2 contribution to gradient
        if self.lambdaL2Mono != 0:
            gradient[self.iMono:self.iMono +
                     self.L * 3] += 2 * self.lambdaL2Mono * np.dot(
                         self.vectorToMonoMatrix(v),
                         self.xMono.transpose()).flatten()
        if self.lambdaL2Shape != 0:
            gradient[self.iShape:self.iShape +
                     self.k] += 2 * self.lambdaL2Shape * np.array(
                         v[self.iShape:self.iShape + self.k])

        #Adding gradients of the dual L1 variables
        if self.useL1Mono:
            gradient[self.iL1Mono:self.iL1Mono +
                     4 * self.L] = np.ones(4 * self.L) * self.lambdaL1Mono
        if self.useL1Shape:
            gradient[self.iL1Shape:self.iL1Shape +
                     self.k] = np.ones(self.k) * self.lambdaL1Shape

        return gradient
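
The analytic gradient above pairs with the func method in Example #2 below and can be sanity-checked against a central finite-difference approximation of func. A minimal sketch, assuming a constructed model object exposing the func/func_deriv interface shown in these examples; the helper check_gradient is illustrative and not part of the original code:

import numpy as np

def check_gradient(model, v, eps=1e-6):
    """Returns the largest absolute difference between model.func_deriv(v)
    and a central finite-difference estimate built from model.func.
    Illustrative helper, not part of the original code."""
    analytic = model.func_deriv(v)
    numeric = np.zeros_like(analytic)
    for i in range(len(v)):
        vPlus = np.array(v, dtype=float)
        vMinus = np.array(v, dtype=float)
        vPlus[i] += eps
        vMinus[i] -= eps
        numeric[i] = (model.func(vPlus) - model.func(vMinus)) / (2 * eps)
    return np.max(np.abs(analytic - numeric))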
Example #2
    def func(self, v, sign=1.0):
        """ Computes objective function """

        out = 0.

        #Compute non-penalized objective function
        if self.objectiveFunction == 'affinityError':
            out += sign * self.computeAffinityError(v)
        elif self.objectiveFunction == 'KLDivergence':
            out += sign * self.computeKLDivergence(v)
        else:
            sl.err('Invalid objective function')

        #Adding L2 penalty:
        if self.lambdaL2Mono != 0:
            out += sign * self.lambdaL2Mono * sum(
                [la.norm(mi, 2)**2 for mi in self.vectorToMonoMatrix(v)])

        if self.lambdaL2Shape != 0:
            out += sign * self.lambdaL2Shape * la.norm(
                v[self.iShape:self.iShape + self.k], 2)**2

        #Adding L1 penalty:
        if self.useL1Mono:
            out += sign * self.lambdaL1Mono * np.sum(
                v[self.iL1Mono:self.iL1Mono + 4 * self.L])
        if self.useL1Shape:
            out += sign * self.lambdaL1Shape * np.sum(
                v[self.iL1Shape:self.iL1Shape + self.k])

        return out
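
Together, func and func_deriv form the objective/gradient pair that scipy's minimizers expect. A minimal usage sketch, assuming a constructed model object whose state vector has length model.iEnd; the setup below is illustrative, not taken from the original code:

import numpy as np
import scipy.optimize as op

x0 = np.zeros(model.iEnd)  # initial state vector (model is a hypothetical instance)
res = op.minimize(model.func,
                  x0,
                  method='L-BFGS-B',
                  jac=model.func_deriv,
                  options={'disp': False, 'maxiter': 1000})
vOpt = res.x  # fitted coefficients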
Example #3
def main():

    #Creating parser
    parser = argparse.ArgumentParser(
        description=
        'Generates a random k-mer table from an input table using either the "matched complexity" method (default) or by permuting the input table.'
    )
    parser.add_argument(
        'kmerFile',
        metavar='kmerTable.csv',
        help='Comma-separated kMer table. (COL 1) = kmer, (COL 2) = values')
    parser.add_argument("-s",
                        metavar='seed',
                        help="Seed for numpy.random",
                        default=None)
    parser.add_argument("--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    parser.add_argument("--symmetric",
                        help="k-mer table is reverse-complement symmetric.",
                        action="store_true")
    parser.add_argument("--header",
                        help="First line in kmer file is header.",
                        action="store_true")
    parser.add_argument("--permute",
                        help="Permuts the input table",
                        action="store_true")
    args = parser.parse_args()

    #Sets seed.
    if args.s is not None:
        np.random.seed(int(args.s))

    #Reads k-mer table
    (kMers, values, k, nCol,
     header) = sl.readKMerTable(args.kmerFile, args.header)
    if len(values[0]) > 1:
        sl.err(
            "Current implementation can only generate a single random column")

    if args.permute:  #Permutes k-mer table
        if args.symmetric:
            #Identifies reverse-complement pairs of sequences that should be held out
            kmerPairs = {}
            symValues = {}
            trantab = maketrans("ACGT", "TGCA")
            for i in range(len(kMers)):
                km = kMers[i]
                rcKm = km[::-1].translate(trantab)
                if rcKm not in kmerPairs:
                    kmerPairs[kMers[i]] = (i, kMers.index(rcKm))
                    symValues[kMers[i]] = (values[i] +
                                           values[kMers.index(rcKm)]) / 2

            #Creates list of initial and permuted kmers
            inKmers = kmerPairs.keys()
            pKmers = np.random.permutation(inKmers)
            newValues = [0.] * len(kMers)
            for i in range(len(inKmers)):
                km = inKmers[i]
                rcKm = km[::-1].translate(trantab)
                permutedValue = symValues[pKmers[i]]

                newValues[kMers.index(km)] = permutedValue
                newValues[kMers.index(rcKm)] = permutedValue

        else:
            newValues = np.random.permutation(values[:, 0])
    else:  #Generates random mono+di model with matched conditional variance

        # 1. Computes the expected conditional variance of 'true' k-mer table
        conditionalVariance = np.array([[
            np.mean([
                np.var([
                    values[j][0] for j in range(len(kMers))
                    if kMers[j][i1] == nucl[n1] and kMers[j][i2] == nucl[n2]
                ]) for n1 in range(4) for n2 in range(4)
                if not (i1 == i2 and n1 != n2)
            ]) for i2 in range(k)
        ] for i1 in range(k)])

        # 2. Create a design matrix used to generate random k-mer table
        # 2.1 Creates (independent) mononucleotide matrices:
        sigList = []
        matrixList = []
        for i in range(k):
            #Generates a new mononucleotide matrix using uniform random numbers
            rv = np.random.rand(4)
            newMonoMatrix = np.array([[0.] * (i) + [rv[y] - np.mean(rv)] +
                                      [0.] * (k - i - 1) for y in range(4)])
            #Generates a "signature" vector used to make sure we don't add the same degrees of freedom twice (for symmetric matrices)
            newMonoSignature = np.array([0] * (i) + [1] + [0] * (k - i - 1))

            #Symmetrizes the matrix and signature if appropriate.
            if args.symmetric:
                newMonoMatrix = (newMonoMatrix + newMonoMatrix[::-1, ::-1]) / 2
                newMonoSignature = newMonoSignature + newMonoSignature[::-1]

            #Checks if the current signature has already been added.
            sig = "mono" + "".join(["%d" % si for si in newMonoSignature])

            #Saves the new matrix if an equivalent matrix has not been saved before
            if sig in sigList:
                continue
            else:
                sigList += [sig]
                matrixList += [(newMonoMatrix, np.zeros((16, k - 1)))]

        # 2.2 Creates (independent) dinucleotide matrices:
        #matrix used to reverse-complement a 16-entry dinucleotide vector
        rcDiMatrix = np.array([[
            int(j == 4 * (3 - n2) + (3 - n1)) for n1 in range(4)
            for n2 in range(4)
        ] for j in range(16)])
        for i in range(k - 1):
            #Generates a new dinucleotide matrix using uniform random numbers
            rv = np.random.rand(16)
            newDiMatrix = np.array([[0.] * (i) + [rv[y] - np.mean(rv)] + [0.] *
                                    (k - i - 2) for y in range(16)])
            #Generates a "signature" vector used to make sure we don't add the same degrees of freedom twice
            newDiSignature = np.array([0] * (i) + [1] + [0] * (k - i - 2))

            #Symmetrizes the matrix and signature if appropriate.
            if args.symmetric:

                newDiMatrix = (newDiMatrix +
                               rcDiMatrix.dot(newDiMatrix[:, ::-1])) / 2
                newDiSignature = newDiSignature + newDiSignature[::-1]

            #Checks if the current signature has already been added.
            sig = "di" + "".join(["%d" % si for si in newDiSignature])

            #Saves the new matrix if an equivalent matrix has not been saved before
            if sig in sigList:
                continue
            else:
                sigList += [sig]
                matrixList += [(np.zeros((4, k)), newDiMatrix)]

        # 2.3 Computes design matrix by scoring each k-mer using each independent mono/di-nucleotide matrix
        X = np.array([[scoreKMer(kMer, m) for m in matrixList]
                      for kMer in kMers])

        # 3. Computes the conditional covariance between pairs of matrix models
        d = [[
            np.array([[
                np.mean([
                    np.cov(
                        np.array([[X[j][a], X[j][b]] for j in range(len(kMers))
                                  if kMers[j][i1] == nucl[n1]
                                  and kMers[j][i2] == nucl[n2]
                                  ]).transpose())[0, 1] for n1 in range(4)
                    for n2 in range(4) if not (i1 == i2 and n1 != n2)
                ]) for a in range(len(matrixList))
            ] for b in range(len(matrixList))]) for i1 in range(k)
        ] for i2 in range(k)]

        # 4. Finds a combination of matrix models that minimizes the L2-error in the expected conditional variance.
        # 4.1 Loss function
        f = lambda v: np.sum([(v.dot(d[i1][i2]).dot(v) - conditionalVariance[i1, i2])**2
                              for i1 in range(k) for i2 in range(k)])
        # 4.2 Gradient of loss function
        df = lambda v: np.sum([4 * (v.dot(d[i1][i2]).dot(v) - conditionalVariance[i1, i2])
                               * v.dot(d[i1][i2])
                               for i1 in range(k) for i2 in range(k)], axis=0)
        # 4.3 Initial seed
        x0 = np.random.rand(len(matrixList))
        # 4.4 Minimizes the loss function using L-BFGS
        res = op.minimize(f,
                          x0,
                          args=(),
                          method='L-BFGS-B',
                          jac=df,
                          options={
                              'disp': False,
                              'maxiter': 1000
                          })
        # 4.5 Computes new values
        newValues = np.array([Xi.dot(res.x) for Xi in X])

        if args.verbose:
            # Writes conditional variance matrices
            sl.disp("Conditional variance matrix:")
            sl.printMatrix(conditionalVariance, sys.stderr)

            sl.disp("Conditional variance in random model:")
            newConditionalVariance = np.array([[
                np.mean([
                    np.var([
                        newValues[j] for j in range(len(kMers)) if
                        kMers[j][i1] == nucl[n1] and kMers[j][i2] == nucl[n2]
                    ]) for n1 in range(4) for n2 in range(4)
                    if not (i1 == i2 and n1 != n2)
                ]) for i2 in range(k)
            ] for i1 in range(k)])
            sl.printMatrix(newConditionalVariance, sys.stderr)

    #Writes the random matrix to STDOUT
    for i in range(len(newValues)):
        print "%s,%f" % (kMers[i], newValues[i])
Example #4
def main():

    #Creating parser
    parser = argparse.ArgumentParser(
        description=
        'Reads a list of sequences and computes the mean shape profile using a k-mer table. Outputs one position in the profile per line. The k-mer table can have multiple columns, leading to multiple columns of output.'
    )
    parser.add_argument(
        'seqFile',
        metavar='seq.lst',
        help='Text file containing one sequence per line ("-" gives STDIN)')
    parser.add_argument(
        'kmerFile',
        metavar='kmerValue.csv',
        help=
        'Comma-separated kMer file. The first column contains k-mers, the following columns contain the associated value(s).'
    )
    parser.add_argument("--scoreN",
                        help="Treats N as the average of A, C, G, and T.",
                        action="store_true")
    parser.add_argument("--header",
                        help="First line in k-mer file is header.",
                        action="store_true")
    #	parser.add_argument("--verbose", help="Increase output verbosity", action="store_true")
    args = parser.parse_args()

    #Parses kmer file
    (kMers, values, k, nCol,
     header) = sl.readKMerTable(args.kmerFile, args.header)
    kMerTable = dict([(kMers[i], values[i]) for i in range(len(kMers))])

    if args.scoreN:  #Adds wildcard "N" characters to the k-mer table by averaging over "A", "C", "G", "T"
        #Successively adds "N" to each position (in combination with previously added Ns)
        for x in range(k):
            currentKeys = kMerTable.keys()
            for key in currentKeys:
                newKey = key[:x] + "N" + key[x + 1:]
                if newKey in kMerTable:
                    continue
                else:
                    kMerTable[newKey] = sum(
                        [kMerTable[key[:x] + n + key[x + 1:]]
                         for n in nucl]) / len(nucl)

    #Computes the mean value
    nSeq = 0
    seqSum = None
    L = None

    #Determines where to read the sequence file
    if args.seqFile == "-":
        f = sys.stdin
    else:
        f = open(args.seqFile)

    #Loops over sequences
    for l in f:
        #Makes sure the sequences have equal length (and sets up seqSum on the first round)
        if L is None:
            L = len(l.rstrip())
            seqSum = [np.zeros(nCol) for i in range(L - k + 1)]
        elif L != len(l.rstrip()):
            sl.err('All sequences must be of equal length.')

        nSeq += 1
        seq = l.rstrip()

        for i in range(L - k + 1):
            seqSum[i] += kMerTable[seq[i:i + k]]

    #Prints the mean profile
    if args.header:
        print ",".join(header)
    for i in range(L - k + 1):
        print ",".join(["%f" % di for di in (seqSum[i] / nSeq)])
Example #5
def main():

    #Creating parser
    tDEFAULT = 13  # Number of temperatures
    eDEFAULT = 10  # Number of free-energy bins
    nDEFAULT = 1000  # Number of sampled sequences
    parser = argparse.ArgumentParser(
        description=
        'Samples random sequences from the uniform distribution and sorts them into -DDG/RT bins using a free-energy scoring matrix. The sequences are first sampled from the Boltzmann distribution e^{E/T} using Metropolis-Hastings sampling and then down-sampled to the uniform distribution using bin-specific rejection sampling. Outputs the sampled sequences in FASTA format. The sequence identifiers contain the energy-bin index iE, where 1 is the lowest-affinity bin, and a sequence index iSeq.'
    )
    parser.add_argument(
        'matrixFile',
        metavar='scoringMatrix.tsv',
        help=
        '(Mononucleotide) scoring matrix. (COL 1) = "A,C,G,T", (COL 2..) = -ddG/RT values'
    )
    parser.add_argument(
        '-t',
        metavar='nTemp',
        help=
        'Number of temperatures to use in the Metropolis-Hastings sampling (DEFAULT = %d)'
        % tDEFAULT,
        type=int,
        default=tDEFAULT)
    parser.add_argument('-e',
                        metavar='nEnergyBins',
                        help='Number of energy bins (DEFAULT = %d)' % eDEFAULT,
                        type=int,
                        default=eDEFAULT)
    parser.add_argument('-n',
                        metavar='nSample',
                        help='Number of sampled sequences (DEFAULT = %d)' %
                        nDEFAULT,
                        type=int,
                        default=nDEFAULT)
    #	parser.add_argument("--verbose", help="Increase output verbosity", action="store_true")
    args = parser.parse_args()

    #1. Reads and processes scoring matrix
    #1.1 - Reads matrix file
    rawScoringMatrix = sl.loadScoringMatrix(args.matrixFile)
    if rawScoringMatrix.keys() != [0]:
        sl.err(
            "Model file should only contain scoring matrix (mononucleotides).")

    #1.2. Shifts and scales the scoring matrix so the sequence-score is in the interval [-1,0]
    scoringMatrix = np.array([i - max(i) for i in rawScoringMatrix[0]]) / sum(
        [max(i) - min(i) for i in rawScoringMatrix[0]])

    #2. Samples sequences at each temperature
    k = len(scoringMatrix)  # Width of scoring matrix
    dE = 1. / args.e  # Width of energy bins
    betaList = 2 * np.log(math.pow(4, k)) * np.linspace(
        -1, 1, args.t)  #List of inverse temperatures to loop over
    binnedSeq = [[[] for ei in range(args.e)]
                 for ti in range(args.t)]  # (Empty) table of energy-binned sequences

    for betaI in range(len(betaList)):  #Loops over inverse temperatures.
        #2.1 Samples sequences using Metropolis-Hastings sampling from the distribution p[s] ~ e^(beta*E)
        beta = betaList[betaI]
        seqList = sampleSequence(scoringMatrix, beta, 3 * args.n)

        for s in seqList:
            #2.2 Identifies energy bin of sequence
            energyI = min(int(np.floor((s[1] + 1) * args.e)), args.e - 1)
            #2.3 Saves the sequence with probability proportional to e^(-beta*E). This downsamples to the uniform distribution within each bin.
            if beta > 0:
                pKeep = np.exp(-beta * (s[1] - (-1 + energyI * dE)))
            else:
                pKeep = np.exp(-beta * (s[1] - (-1 + (energyI + 1) * dE)))
            if np.random.rand() < pKeep:
                binnedSeq[betaI][energyI] += [s]

    #3. Output
    #3.1 Converts from numeric to string representation
    outSeqs = []
    for iE in range(args.e):
        outSeqs += [[]]
        for Ti in range(args.t):
            for s in binnedSeq[Ti][iE]:
                outSeqs[iE] += ["".join(nucl[n] for n in s[0])]
        outSeqs[iE] = np.random.permutation(outSeqs[iE])

    #3.2 Prints the sequences.
    for iE in range(args.e):
        for iS in range(min(args.n, len(outSeqs[iE]))):
            print ">iE=%d,iSeq=%d" % (iE + 1, iS + 1)
            print outSeqs[iE][iS]
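
The sampling step relies on a helper sampleSequence that is not shown here. A plausible sketch of a Metropolis-Hastings sampler drawing from p[s] ~ e^(beta*E), where E is the shifted scoring-matrix energy in [-1, 0] and proposals are single-nucleotide mutations; the signature follows the call above, but the body is an assumption rather than the original implementation:

import numpy as np

def sampleSequence(scoringMatrix, beta, nSample):
    """Metropolis-Hastings sketch: returns a list of (sequence, energy)
    pairs, where sequence is a list of nucleotide indices and energy is
    the summed scoring-matrix value. Assumed, not the original code."""
    k = len(scoringMatrix)
    seq = list(np.random.randint(0, 4, k))            # random start sequence
    energy = sum(scoringMatrix[i][seq[i]] for i in range(k))
    samples = []
    for _ in range(nSample):
        pos = np.random.randint(k)                    # propose a single-nucleotide mutation
        newNucl = np.random.randint(4)
        newEnergy = energy + scoringMatrix[pos][newNucl] - scoringMatrix[pos][seq[pos]]
        # Accept with probability min(1, e^{beta * (Enew - Eold)})
        if np.random.rand() < np.exp(beta * (newEnergy - energy)):
            seq[pos] = newNucl
            energy = newEnergy
        samples.append((list(seq), energy))
    return samples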
Example #6
    def __init__(self,
                 agnosticModel,
                 shapeModel,
                 lambdaL1Mono=0.,
                 lambdaL1Shape=0.,
                 lambdaL2Mono=0.,
                 lambdaL2Shape=0.,
                 objectiveFunction='KLDivergence',
                 normalizeModel=True,
                 edgeReadout=False):

        #Saves mechanism-agnostic (true) model
        if sorted(agnosticModel.keys()) != [0, 1]:
            sl.err(
                "Mechanism-agnostic model file must (only) have mono- and di-nucleotide values."
            )
        self.agnosticMono = np.copy(agnosticModel[0])  #L*4 array
        self.agnosticMonoAffinity = np.exp(self.agnosticMono)
        if len(agnosticModel[1].shape) == 3:
            self.agnosticDi = np.copy(agnosticModel[1])
        else:
            self.agnosticDi = np.array([[[vi[4 * n1 + n2] for n2 in range(4)]
                                         for n1 in range(4)]
                                        for vi in agnosticModel[1]
                                        ])  #(L-1)*4*4 array

        self.agnosticDiAffinity = np.exp(self.agnosticDi)
        self.L = len(self.agnosticMono)
        self.edgeReadout = edgeReadout

        #Shifts the agnostic model so that the expected affinity is 1.0
        if normalizeModel:
            meanAffinity = self.computeSum(1, self.agnosticMonoAffinity,
                                           self.agnosticDiAffinity) / 4**self.L
            self.agnosticMono -= np.log(meanAffinity) / self.L
            self.agnosticMonoAffinity = np.exp(self.agnosticMono)

        #Processes shape model
        if sorted(shapeModel.keys()) != [0, 1]:
            sl.err(
                "Shape model file must (only) have mono- and di-nucleotide values."
            )
        if len(shapeModel[0]) < 2:
            sl.err("Shape model must 2bp long or wider.")
        self.diDesignMatrix = self.buildDiShapeDesignMatrix(shapeModel, self.L)
        self.k = len(self.diDesignMatrix)

        # Saves the objective function choice to be used
        self.objectiveFunction = objectiveFunction

        # Setting up dual variables for L1 Penalization
        if lambdaL1Mono == 0.:
            self.useL1Mono = False
            self.lambdaL1Mono = None
        else:
            self.useL1Mono = True
            self.lambdaL1Mono = lambdaL1Mono

        if lambdaL1Shape == 0.:
            self.useL1Shape = False
            self.lambdaL1Shape = None
        else:
            self.useL1Shape = True
            self.lambdaL1Shape = lambdaL1Shape

        self.lambdaL2Mono = lambdaL2Mono
        self.lambdaL2Shape = lambdaL2Shape

        #Setting up indices for accessing entries in the state vector
        self.iShape = 0
        self.iMono = self.iShape + self.k
        self.iIntercept = self.iMono + 3 * self.L
        self.iL1Shape = self.iIntercept + 1
        if self.useL1Shape:
            self.iL1Mono = self.iL1Shape + self.k
        else:
            self.iL1Mono = self.iL1Shape
        if self.useL1Mono:
            self.iEnd = self.iL1Mono + 4 * self.L
        else:
            self.iEnd = self.iL1Mono

        return
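
The index bookkeeping at the end of __init__ fixes the layout of the flat state vector used by func and func_deriv in Examples #1 and #2. A short sketch of the implied partition, assuming both L1 penalties are active; model is a hypothetical instance of this class and the slices below only restate the indices set above:

import numpy as np

# Layout of the state vector v (length model.iEnd), with useL1Shape and useL1Mono True:
#   v[iShape     : iShape + k]       shape coefficients       (k values)
#   v[iMono      : iMono + 3*L]      mononucleotide terms     (3*L values)
#   v[iIntercept]                    intercept                (1 value)
#   v[iL1Shape   : iL1Shape + k]     dual L1 shape variables  (k values)
#   v[iL1Mono    : iL1Mono + 4*L]    dual L1 mono variables   (4*L values)

v = np.zeros(model.iEnd)                               # hypothetical model instance
shapeCoeff = v[model.iShape:model.iShape + model.k]
monoCoeff = v[model.iMono:model.iMono + 3 * model.L]
intercept = v[model.iIntercept]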