示例#1
0
def getRandomizedSequenceCacheForVerticalPermutations(taxId):
    """Return the vertical-permutation randomization cache for a species.

    The cache is looked up in the module-level `_caches` dict under the key
    `(taxId, db.Sources.ShuffleCDS_vertical_permutation_1nt)`.  On a miss, all
    CDS sequences of the species whose length is a multiple of 3 are loaded,
    a new `VerticalRandomizationCache` is built around them, stored in
    `_caches` and returned.

    Args:
        taxId: Taxonomy identifier of the species.

    Returns:
        The (possibly newly created) `VerticalRandomizationCache` instance.
    """
    shuffleType = db.Sources.ShuffleCDS_vertical_permutation_1nt
    cacheKey = (taxId, shuffleType)

    # Fast path: a cache for this species was already built.
    if cacheKey in _caches:
        return _caches[cacheKey]

    # Read all native sequences, skipping those with an incomplete last codon.
    nativeSeqs = {}
    for protId in SpeciesCDSSource(taxId):
        helper = CDSHelper(taxId, protId)
        if helper.length() % 3 == 0:
            nativeSeqs[protId] = helper.sequence()

    geneticCode = getSpeciesTranslationTable(taxId)
    scpr = SynonymousCodonPermutingRandomization(geneticCode)

    def randomizer(cdss):
        return scpr.verticalPermutation(cdss)

    newCache = VerticalRandomizationCache(
        shuffleType=shuffleType,
        taxId=taxId,
        nativeSeqsMap=nativeSeqs,
        geneticCode=geneticCode,
        randomizer=randomizer)
    _caches[cacheKey] = newCache
    print(_caches.keys())

    return newCache
示例#2
0
def writeSequenceToTempFile(taxId):
    """Write all CDS sequences of a species, concatenated, to a temporary FASTA file.

    Sequences whose length is not a multiple of 3 are skipped.  The remaining
    sequences are joined into one record with id "allCDSs", which is written
    in FASTA format to a `NamedTemporaryFile`.  The file is deleted on close
    unless the module-level `debugMode` flag is truthy.

    Args:
        taxId: Taxonomy identifier of the species.

    Returns:
        A tuple `(numRecords, fout)` holding the number of records written
        and the still-open temporary file object.
    """
    print("Fetching sequence for taxid={}".format(taxId))

    collected = []
    for protId in SpeciesCDSSource(taxId):
        helper = CDSHelper(taxId, protId)

        # Only keep sequences made of whole codons.
        if helper.length() % 3 == 0:
            collected.append(helper.sequence())

            # Progress indicator.
            if len(collected) % 1000 == 999:
                print(".")

    joined = ''.join(collected)
    records = [
        SeqRecord(Seq(joined, NucleotideAlphabet),
                  id="allCDSs",
                  description="")
    ]

    fout = NamedTemporaryFile(mode="w", delete=(not debugMode))
    # Write the full sequences into the file (addressed by name, not handle).
    SeqIO.write(records, fout.name, "fasta")

    return (len(records), fout)
示例#3
0
    def calculateMissingWindowsForSequence(
            self,
            taxId,
            protId,
            seqIds,
            requestedShuffleIds,
            firstWindow,
            lastWindowStart,
            windowStep,
            reference="begin",
            shuffleType=db.Sources.ShuffleCDSv2_python):
        """Compute and store sliding-window profile values for one protein.

        Existing results for the requested shuffle-ids are read through
        `CDSHelper.getCalculationResult2`.  If at least one shuffle-id has no
        result or lacks some of the requested windows, every available result
        record is (re)computed for all requested windows, written as a JSON
        line to `self._logfile` and, unless `self._debugDoneWriteResults` is
        set, saved with `saveCalculationResult2` and committed.

        Args:
            taxId: Taxonomy identifier of the species.
            protId: Protein identifier of the CDS to process.
            seqIds: Sequence ids, parallel to `requestedShuffleIds`.
            requestedShuffleIds: Shuffle ids to process; a negative id selects
                the native sequence.
            firstWindow: Not used by this method.
            lastWindowStart: For `reference="begin"`, the last window start to
                include; for `reference="end"`, the lowest window start to
                include.
            windowStep: Distance between consecutive window starts.
            reference: Either "begin" or "end".
            shuffleType: Source identifier of the shuffle type.

        Returns:
            None.

        Raises:
            Exception: If the reference is unsupported, no temperature is
                available for a native-temperature series, the sequence is
                shorter than the window, no windows exist (reference="begin"),
                a sequence is empty or has an unexpected length.
            AssertionError: If one of the internal consistency checks fails.
        """
        timerForPreFolding.start()
        logging.warning("Parameters: %d %s %s %s %d %d %s %d" %
                        (taxId, protId, seqIds, requestedShuffleIds,
                         lastWindowStart, windowStep, reference, shuffleType))
        f = self._logfile

        assert (len(seqIds) > 0)
        assert (len(seqIds) == len(requestedShuffleIds))

        # The native-temperature series needs the species' growth temperature.
        optimalSpeciesGrowthTemperature = None
        if (self._seriesSourceNumber ==
                db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp):
            (numericalProp, _) = getSpeciesTemperatureInfo(taxId)
            optimalSpeciesGrowthTemperature = numericalProp[0]

            if optimalSpeciesGrowthTemperature is None:
                raise Exception(
                    "No temperature value for taxid={}, can't calculate native-temperature folding profile..."
                    .format(taxId))
            else:
                optimalSpeciesGrowthTemperature = float(
                    optimalSpeciesGrowthTemperature)
                assert (optimalSpeciesGrowthTemperature >= -30.0
                        and optimalSpeciesGrowthTemperature <= 150.0)

        if (reference != "begin" and reference != "end"):
            timerForPreFolding.stop()
            e = "Specificed profile reference '%s' is not supported! (" % reference
            logging.error(e)
            raise Exception(e)

        # We will process all listed shuffle-ids for the following protein record
        cds = CDSHelper(taxId, protId)

        if (cds.length() < self._windowWidth):
            # Fix: the original formatted an undefined name (itemToProcess),
            # which raised NameError instead of reporting this condition.
            itemDescription = "%s/%s" % (taxId, protId)
            e = "Refusing to process item %s because the sequence length (%d nt) is less than the window size (%d nt)\n" % (
                itemDescription, cds.length(), self._windowWidth)
            f.write(e)
            logging.error(e)
            timerForPreFolding.stop()
            raise Exception(e)

        # Create a list of the windows we need to calculate for this CDS
        if reference == "begin":
            requestedWindowStarts = frozenset(
                range(
                    0,
                    min(lastWindowStart + 1,
                        cds.length() - self._windowWidth - 1), windowStep))
            if (len(requestedWindowStarts) == 0):
                e = "No windows exist for calculation taxid=%d, protId=%s, CDS-length=%d, lastWindowStart=%d, windowStep=%d, windowWidth=%d - Skipping...\n" % (
                    taxId, protId, cds.length(), lastWindowStart, windowStep,
                    self._windowWidth)
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)
        elif reference == "end":
            # Window starts are aligned so that the last one is the last
            # possible start; lastWindowStart acts as a lower bound here.
            lastPossibleWindowStart = cds.length() - self._windowWidth
            requestedWindowStarts = frozenset(
                filter(
                    lambda x: x >= lastWindowStart,
                    range(lastPossibleWindowStart % windowStep,
                          lastPossibleWindowStart + 1, windowStep)))
            # NOTE(review): unlike the "begin" branch, an empty window set is
            # not rejected here; max() further below would then raise
            # ValueError - confirm the intended handling.
        else:
            assert (False)

        # First, read available results (for all shuffle-ids).
        # The result maps each requested shuffle-id to its record, or to None
        # when no results exist yet.
        logging.info("DEBUG: requestedShuffleIds (%d items): %s\n" %
                     (len(requestedShuffleIds), requestedShuffleIds))
        existingResults = cds.getCalculationResult2(self._seriesSourceNumber,
                                                    requestedShuffleIds,
                                                    True,
                                                    shuffleType=shuffleType)
        assert (len(existingResults) == len(requestedShuffleIds))
        logging.info("requestedShuffleIds: %s" % requestedShuffleIds)
        logging.info("existingResults.keys(): %s" % existingResults.keys())
        assert (frozenset(requestedShuffleIds) == frozenset(
            existingResults.keys()))
        logging.info("DEBUG: existingResults (%d items): %s\n" %
                     (len(existingResults), existingResults))

        # Check for which of the requested shuffle-ids there are values missing
        shuffleIdsToProcess = {}
        for shuffleId, r in existingResults.items():
            if r is None:
                # There are no existing results for shuffled-id n. If it was requested, it should be calculated now (including all windows)
                if shuffleId in requestedShuffleIds:
                    shuffleIdsToProcess[shuffleId] = list(
                        requestedWindowStarts)

                # NOTE(review): the timer is stopped here on every missing
                # record and again after the loop - confirm this is intended.
                timerForPreFolding.stop()

                # ------------------------------------------------------------------------------------
                continue  # TODO - verify this line; should we abort this sequence by throwing????
                # ------------------------------------------------------------------------------------

            logging.info("/// shuffleId r = %d %s" % (shuffleId, r))
            logging.info("r[MFE-profile] %s" % r["MFE-profile"])

            # Check the existing results for this shuffle
            alreadyProcessedWindowStarts = frozenset([
                i for i, x in enumerate(r["MFE-profile"]) if x is not None
            ])  # Get the indices (=window starts) of all non-None values
            missingWindows = requestedWindowStarts - alreadyProcessedWindowStarts  # Are there any requested windows that are not already computed?
            if (missingWindows):
                shuffleIdsToProcess[shuffleId] = missingWindows

        if (not shuffleIdsToProcess):
            e = "All requested shuffle-ids in (taxId: %d, protId: %s, seqs: %s) seem to have already been processed. Skipping...\n" % (
                taxId, protId, str(list(zip(seqIds, requestedShuffleIds))))
            logging.warning(e)
            timerForPreFolding.stop()
            return
        logging.info("DEBUG: shuffleIdsToProcess (%d items): %s\n" %
                     (len(shuffleIdsToProcess), shuffleIdsToProcess))

        logging.info("DEBUG: Before (%d items): %s\n" %
                     (len(existingResults), existingResults))
        # Initialize new results records
        for shuffleId in shuffleIdsToProcess.keys():
            if existingResults[shuffleId] is None:
                logging.info(seqIds)
                logging.info(requestedShuffleIds)
                logging.info(shuffleId)
                thisSeqId = seqIds[requestedShuffleIds.index(shuffleId)]

                existingResults[shuffleId] = {
                    "id":
                    "%s/%s/%d/%d" % (taxId, protId, thisSeqId, shuffleId),
                    "seq-crc": None,
                    "MFE-profile": [],
                    "MeanMFE": None,
                    "v": 2,
                    "shuffle-type": shuffleType
                }
        logging.info("DEBUG: existingResults (%d items): %s\n" %
                     (len(existingResults), existingResults))
        timerForPreFolding.stop()

        # Load the sequences of all shuffle-ids we need to work on.
        # Every non-empty record is processed, and all requested windows are
        # computed for it (not only the missing ones).
        # TODO - combine loading of multiple sequences into one DB operation
        for shuffleId, record in existingResults.items():
            if record is None:
                logging.info(
                    "DEBUG: skipping empty results record for shuffleId={}".
                    format(shuffleId))
                continue
            timerForPreFolding.start()

            seq = None
            annotatedSeqId = None
            # Get the sequence for this entry (negative ids denote the native sequence)
            if (shuffleId < 0):
                seq = cds.sequence()
                annotatedSeqId = cds.seqId()
            else:
                seq = cds.getShuffledSeq(shuffleId, shuffleType)
                annotatedSeqId = cds.getShuffledSeqId(shuffleId, shuffleType)

            if (seq is None or len(seq) == 0):
                # Collect diagnostics from the alternative access paths
                # before giving up on this sequence.
                seq2 = cds.getShuffledSeq2(annotatedSeqId)
                seq3 = cds._fetchSequence(annotatedSeqId)
                seq4 = cds._cache.get("%d:seq" % annotatedSeqId)
                if seq4 is not None:
                    del cds._cache["%d:seq" % annotatedSeqId]
                seq5 = cds.getShuffledSeq2(annotatedSeqId)
                e = "Got empty sequence for shuffleId=%d, seqId=%d, taxId=%d, protId=%s, numShuffled=%d, ids[%d:%d]=%s, len(seq2)=%d, len(seq3)=%d, len(seq4)=%d, len(seq5)=%d" % (
                    shuffleId, annotatedSeqId, taxId, protId,
                    len(cds.shuffledSeqIds()), shuffleId - 2, shuffleId + 2,
                    cds.shuffledSeqIds()[shuffleId - 2:shuffleId + 2],
                    len(seq2) if seq2 is not None else -1,
                    len(seq3) if seq3 is not None else -1,
                    len(seq4) if seq4 is not None else -1,
                    len(seq5) if seq5 is not None else -1)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            # The check that annotatedSeqId appears in seqIds is deliberately
            # disabled - calculation needn't include the native sequence.

            expectedSeqLength = cds.length()
            if (expectedSeqLength is not None):
                if (expectedSeqLength != len(seq)):
                    e = "Warning: taxid=%d, protid=%s, seqid=%d - unexpected length %d (expected: %d)\n" % (
                        taxId, protId, annotatedSeqId, len(seq),
                        expectedSeqLength)
                    f.write(e)
                    logging.error(e)
                    timerForPreFolding.stop()
                    raise Exception(e)

            if (len(seq) < self._windowWidth):
                # Sequence is shorter than required window; skip
                e = "Warning: skipping sequence because it is shorter than the requested window...\n"
                f.write(e)
                logging.error(e)
                timerForPreFolding.stop()
                raise Exception(e)

            logging.info(
                "DEBUG: Processing item taxId=%d, protId=%s, shuffle=%d (length=%d, %d windows)...\n"
                % (taxId, protId, shuffleId, len(seq),
                   len(requestedWindowStarts)))

            # TODO - Remove any old value stored in this key?

            logging.info(seq[:50])

            MFEprofile = record["MFE-profile"]

            # Make sure the profile array contains enough entries for all new windows (and possibly, if windows are non-contiguous, entries between them that we are not going to compute right now).
            # Fix: the profile is indexed by window start, so it must be
            # strictly longer than the highest start; the original "<" test
            # left it one entry short when len(MFEprofile) == max start.
            highestWindowStart = max(requestedWindowStarts)
            if (len(MFEprofile) <= highestWindowStart):
                entriesToAdd = highestWindowStart - len(MFEprofile) + 1
                MFEprofile.extend([None] * entriesToAdd)
            assert (len(MFEprofile) > highestWindowStart)

            # Seed the running statistics with the values already present.
            stats = RunningStats()
            stats.extend([x for x in MFEprofile if x is not None])

            timerForPreFolding.stop()
            timerForFolding.start()
            for start in requestedWindowStarts:
                fragment = seq[start:(start + self._windowWidth)]
                assert (len(fragment) == self._windowWidth)

                if self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2:
                    # Calculate the RNA folding energy. This is the computation-heavy part.
                    energy = RNAfold_direct(fragment)
                    assert (energy <= 0.0)

                elif self._seriesSourceNumber == db.Sources.RNAfoldEnergy_SlidingWindow40_v2_native_temp:
                    # Calculate the RNA folding energy. This is the computation-heavy part.
                    energy = RNAfold_direct(fragment,
                                            explicitCalculationTemperature=
                                            optimalSpeciesGrowthTemperature)
                    assert (energy <= 0.0)

                elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_BeginReferenced:
                    # Synthetic test series: value depends only on the window start.
                    if shuffleId < 0:
                        energy = 0
                    else:
                        energy = start % 50 - 20

                elif self._seriesSourceNumber == db.Sources.TEST_StepFunction_EndReferenced:
                    # Synthetic test series: value depends on the distance from the last window.
                    if shuffleId < 0:
                        energy = 0
                    else:
                        energy = (expectedSeqLength - self._windowWidth -
                                  start) % 50 - 20

                else:
                    logging.error(
                        "Received unknown seriesSourceNumber {}".format(
                            self._seriesSourceNumber))
                    assert (False)

                # Store the calculation result
                stats.push(energy)
                MFEprofile[start] = energy

            print(
                "///////////////////  shuffleId={} (len={}) //////////////////////////"
                .format(shuffleId, expectedSeqLength))
            prettyPrintProfile(MFEprofile)

            timerForFolding.stop()
            timerForPostFolding.start()

            # Format the record as JSON
            crc = calcCrc(seq)
            record["seq-crc"] = crc
            record["MFE-profile"] = [
                round4(x) for x in MFEprofile
            ]  # Round items down to save space (these are not exact numbers anyway)
            record["MeanMFE"] = stats.mean()
            result = json.dumps(record)

            f.write(result)
            f.write("\n")

            if (not self._debugDoneWriteResults):
                cds.saveCalculationResult2(self._seriesSourceNumber, result,
                                           annotatedSeqId, False)

            timerForPostFolding.stop()

        timerForPostFolding.start()

        if (not self._debugDoneWriteResults):
            cds.commitChanges()

        timerForPostFolding.stop()
# Script fragment: walks over all CDS records of the species `taxId` and checks
# each record's stored length and CRC against values recomputed from its sequence.
# `taxId` and `statsLength` are defined outside the visible part of the file.
statsShuffles = RunningStats()

# Counters for processed records and for length-mismatch warnings.
recordsCount = 0
warningsCount = 0

rl = RateLimit(30)  # presumably throttles progress output - TODO confirm

total = countSpeciesCDS(taxId)

for protId in SpeciesCDSSource(taxId):
    cds = CDSHelper(taxId, protId)
    recordsCount += 1

    statsLength.push(cds.length())

    # The recorded length must match the actual sequence length; a mismatch is
    # reported and counted but does not stop the run.
    if (len(cds.sequence()) != cds.length()):
        print(
            "WARNING: incorrect sequence length detected for record (taxid=%d, protId=%s); real-length=%d, recorded-length=%d."
            % (taxId, protId, len(cds.sequence()), cds.length()))
        warningsCount += 1

    # The stored CRC must equal the CRC recomputed from the sequence; unlike the
    # length check, a mismatch here aborts via the assert.
    recomputedCrc = calcCrc(cds.sequence())
    annotatedCrc = cds.crc()
    assert (recomputedCrc == annotatedCrc)
    print(cds.sequence()[:15])

    # CRC of the translated native sequence.
    seq1trans = Seq(cds.sequence(), generic_dna).translate()
    crc1 = calcCrc(seq1trans)

    # Ids of the shuffled sequences for this record and the number of distinct ids.
    shuffles = cds.shuffledSeqIds()
    unique = len(frozenset(shuffles))
示例#5
0
def testCDSand3UTRRandomizationIncludingNextCDS(
        taxId: int = 511145,
        geneticCode: int = 11,
        constantOverlaps: bool = False) -> int:
    """Exercise CDSand3UTRRandomizationIncludingNextCDS on every CDS of a genome.

    For each protein id returned by the cached genome model, the native
    sequence is loaded and randomized 20 times.  Each randomized sequence
    (element 2 of the result) must have the same length as the native one.
    Failures to load a sequence or to randomize it are counted and reported
    in a summary printed at the end.

    Args:
        taxId: Taxonomy identifier of the species to test.
        geneticCode: Genetic code passed to the synonymous codon randomizer.
        constantOverlaps: Forwarded to the randomizer under test.

    Returns:
        Always 0.

    Raises:
        AssertionError: If a randomized sequence differs in length from the
            native sequence.
    """
    from data_helpers import SpeciesCDSSource
    from genome_model import getGenomeModelFromCache

    randomizer = CDSand3UTRRandomizationIncludingNextCDS(
        SynonymousCodonPermutingRandomization(geneticCode=geneticCode),
        NucleotidePermutationRandomization(),
        taxId,
        constantOverlaps=constantOverlaps)

    numOK = 0
    numFailed = 0
    numRandomizeFailed = 0
    numSkipped = 0

    for protId in getGenomeModelFromCache(taxId).allCDSSource():
        # Records whose sequence cannot be loaded are counted and skipped.
        try:
            helper = CDSHelper(taxId, protId)
            nativeSeq = helper.sequence()
        except Exception:
            numFailed += 1
            continue

        for _ in range(20):
            try:
                result = randomizer.randomize(nativeSeq, protId)
            except Exception as err:
                print(
                    "Caught exception during call to randomize(), protId={}!".
                    format(protId))
                print(err)
                numFailed += 1
                numRandomizeFailed += 1
                continue

            if result[0] < 1e5:
                print(protId)

            if len(result[2]) != len(nativeSeq):
                # Dump the offending result and repeat the call before the
                # assertion below fails.
                print(result)
                randomizer.randomize(nativeSeq, protId)
            assert (len(result[2]) == len(nativeSeq))

        numOK += 1

    print("OK: {}, NotOK: {}, Skipped: {}, Total: {}".format(
        numOK, numFailed, numSkipped,
        numOK + numFailed + numSkipped))
    print("randomize exception: {}".format(numRandomizeFailed))

    return 0
        #fnormpval.write("%s,%s\n" % (protId, ','.join(map(str, normpval))))
        #fshapiro.write("%s,%s\n" % (protId, ','.join(map(str, shapiro))))
        #fshapiropval.write("%s,%s\n" % (protId, ','.join(map(str, shapiropval))))
        #fkurtosis.write("%s,%s\n" % (protId, ','.join(map(str, kurtosis))))
        #frefgamma.write("%s,%s\n" % (protId, ','.join(map(str, refgamma))))

        # Perform Wilcoxon signed-rank test
        #print("DF: ", df.shape)  # (20,151)
        #print("s: ", s.shape)    # (20,)
        #print("u: ", df.iloc[1,:].shape) # (151,)
        #print("mean(u): ", df.mean(axis=0).shape)
        #print(np.array(profile).shape) #(151,)


        # Update the GC profile: push the GC-content value of each profile
        # window into the running accumulator for that window position.
        cdsSequence = cds.sequence()
        gcContent = calcGCcontent(cdsSequence)
        #print(gcContent)
        # Only update when calcGCcontent() returned a non-empty result.
        if( len(gcContent) ):
            for i in range(profileSpec.numProfileWindows()):  # Limit to 150; TODO - treat this generically?
                GCProfile[i].push( gcContent[i] )
                # NOTE(review): this append runs once per window, so the same
                # median is added numProfileWindows() times per sequence -
                # confirm whether it was meant to sit outside the loop.
                medianGCContent.append( np.median( gcContent ) )
        del cds
        

        
        # Method 1 -- Wilcoxon, native vs. mean(shuffled), window step=1nt, per gene
        # Column-wise mean of `df` (axis=0); it must have the same shape as the
        # native profile array.
        meanOfShufflesProfileForW = df.mean(axis=0)
        nativeProfileForW = np.array(profile)
        assert(meanOfShufflesProfileForW.shape == nativeProfileForW.shape)
for taxId in species:
    proteinsDone = 0

    #nativeColumns = [[] for x in range(maxCodons)]
    #shuffledColumns = [[[] for x in range(maxCodons)] for y in range(maxShuffles)]
    allNativeSeqs = {}
    allShuffledSeqs = {}

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        warnings.update(("total-cds", ))

        allIds = cds.shuffledSeqIds(shuffleType=shuffleType)[:maxShuffles]

        nativeSeq = cds.sequence()
        if (len(nativeSeq) % 3 != 0):
            warnings.update(("has-broken-codons", ))
            continue

        nativeCodons = Counter(splitCodons(nativeSeq))

        hasMismatchedCodons = False
        allNativeSeqs[protId] = nativeSeq
        hashesForShuffles = set()

        #for i, c in enumerate(splitCodons(nativeSeq)[:maxCodons]):
        #    nativeColumns[i].append(c)

        shuffledSeqs = []