def getRandomizedSequenceCacheForVerticalPermutations(taxId):
    """Return the vertical-permutation randomization cache for a species.

    Caches are kept in the module-level `_caches` dict, keyed by
    (taxId, shuffle type). A cache is built the first time a species is
    requested and the same object is handed back on later requests.

    Args:
        taxId: taxonomy identifier of the species.

    Returns:
        The VerticalRandomizationCache stored for `taxId`.
    """
    global _caches

    shuffleType = db.Sources.ShuffleCDS_vertical_permutation_1nt
    cacheKey = (taxId, shuffleType)

    if cacheKey in _caches:
        return _caches[cacheKey]

    # Read all native sequences; CDSs whose length is not a multiple of 3
    # are left out.
    nativeSeqs = {}
    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        if cds.length() % 3 == 0:
            nativeSeqs[protId] = cds.sequence()

    geneticCode = getSpeciesTranslationTable(taxId)
    scpr = SynonymousCodonPermutingRandomization(geneticCode)

    cache = VerticalRandomizationCache(
        shuffleType=shuffleType,
        taxId=taxId,
        nativeSeqsMap=nativeSeqs,
        geneticCode=geneticCode,
        randomizer=lambda cdss: scpr.verticalPermutation(cdss),
    )
    _caches[cacheKey] = cache
    print(_caches.keys())
    return cache
def writeSequenceToTempFile(taxId):
    """Write all CDS sequences of a species, concatenated, to a temporary FASTA file.

    CDSs whose length is not a multiple of 3 are left out. The remaining
    sequences are joined into a single record with id "allCDSs".

    Args:
        taxId: taxonomy identifier of the species.

    Returns:
        A tuple (number of records written, the NamedTemporaryFile object).
        The file is created with delete=(not debugMode).
    """
    print("Fetching sequence for taxid={}".format(taxId))

    codingSeqs = []
    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        if cds.length() % 3 != 0:
            continue

        codingSeqs.append(cds.sequence())

        # Progress marker, printed as the 999th, 1999th, ... sequence is added
        if len(codingSeqs) % 1000 == 999:
            print(".")

    joined = Seq(''.join(codingSeqs), NucleotideAlphabet)
    records = [SeqRecord(joined, id="allCDSs", description="")]

    fout = NamedTemporaryFile(mode="w", delete=(not debugMode))
    # write the full sequences into the file (addressed by name, not by handle)
    SeqIO.write(records, fout.name, "fasta")

    return (len(records), fout)
def getIdentifiersConversionTableUsingGff3():
    """Return the table mapping gene identifiers (and their aliases) to protein ids.

    The table lives in the module-level `altIdentifiers` dict and is filled on
    the first call; later calls return the already-filled dict. The species is
    taken from the module-level `taxId` (it is not a parameter).

    For every protein of the species, the gene id reported by CDSHelper and
    every equivalent identifier reported by the genome model are mapped to
    that protein id.

    Returns:
        The `altIdentifiers` dict.
    """
    global altIdentifiers
    if altIdentifiers:
        return altIdentifiers

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        geneId = CDSHelper(taxId, protId).getGeneId()

        for alt in gm.findEquivalentIdentifiers(geneId):
            altIdentifiers[alt] = protId

        # Assigned after the aliases, so the gene id entry always points to this protein
        altIdentifiers[geneId] = protId

    # Return the freshly built table too, so the first call agrees with later
    # (cached) calls instead of yielding None.
    return altIdentifiers
def testSpecies(taxId):
    """Count how many proteins of a species have their gene id in the PaxDB data.

    Prints a one-line summary and returns the counts.

    Args:
        taxId: taxonomy identifier of the species.

    Returns:
        A tuple (countFound, countNotFound).
    """
    paData = getSpeciesPaxdbData(taxId)

    countFound = 0
    countNotFound = 0

    for protId in SpeciesCDSSource(taxId):
        geneId = CDSHelper(taxId=taxId, protId=protId).getGeneId()

        if geneId in paData:
            countFound += 1
        else:
            countNotFound += 1

    total = countFound + countNotFound
    # A species with no CDS entries would otherwise raise ZeroDivisionError.
    # float() keeps the value compatible with the '{:.3}' format spec even
    # where '/' is integer division.
    percentFound = float(countFound) / total * 100 if total else 0.0

    print("Species: {} -> Found: {} ({:.3}%) Not found: {}".format(taxId, countFound, percentFound, countNotFound))

    return (countFound, countNotFound)
# Counters and pending-work list initialised before the per-species loop.
# NOTE(review): `skipped` is incremented below but is not initialised in this
# fragment - presumably it is set earlier; confirm.
selected = 0
alreadyCompleted = 0
totalMissingResults = 0
queuedDelayedCalls = []

for taxIdForProcessing in species:
    print("Processing %d sequences for tax-id %d (%s)..." % (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing, getSpeciesName(taxIdForProcessing)))

    # Per-species tally of sequence outcomes, keyed by a descriptive label
    stats = Counter()

    # Iterate over all CDS entries for this species
    # TODO - preloading all sequences and results should optimize this
    for protId in SpeciesCDSSource(taxIdForProcessing):
        stats['all-sequences'] += 1
        #protId = codecs.decode(protId)

        # Filtering

        # Only process 1/N of the sequences, selected randomly (N=randomFraction)
        # (if randomFraction==1, all sequences will be processed)
        if (randint(1, randomFraction) != 1):
            skipped += 1
            stats['skipped-random-fraction'] += 1
            continue

        # ------------------------------------------------------------------------------------------
        # Exclude some sequences from the calculation
args = argsParser.parse_args() # Configuration taxId = args.taxId #statsShuffles = RunningStats() statsShuffles = OfflineStats() recordsCount = 0 warningsCount = 0 rl = RateLimit(30) total = countSpeciesCDS(taxId) for protId in SpeciesCDSSource(taxId): cds = CDSHelper(taxId, protId) statsShuffles.push( cds.dropShuffledSeqs(lastItemToKeep=args.keep_first_n_shuffles)) recordsCount += 1 if (rl()): print("processed %d records (%.2g%%)" % (recordsCount, float(recordsCount) / total * 100)) # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # #if( recordsCount > 20 ): # break # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY #
def getAllProteins(taxId):
    """Return a list holding every item yielded by SpeciesCDSSource(taxId)."""
    proteinIds = []
    proteinIds.extend(SpeciesCDSSource(taxId))
    return proteinIds
def plotStatistics():
    """Plot readthrough statistics and tag each CDS as positive/negative per experiment.

    Steps, in order:
      1. Load the metadata file and every measurement file with io.loadmat
         (paths are built from the module-level `data_path`, `metadata_file`
         and `measurement_files`).
      2. Call plotDatafileStatistics() for every measurement file.
      3. Stack item [5] ("RP ratios") and item [3] ("ORF reads") of
         readReadthroughData() across experiments and print a few summary values.
      4. Save a heatmap of the Spearman correlation between experiments to
         "RP_distribution_spearman.pdf".
      5. For experiment i, split the genes with a finite RP ratio into those
         above / not above the module-level `readthroughThreshold`, translate
         their identifiers through getIdentifiersMapping(), and store "1" / "0"
         in the CDS property "readthrough-v2.ex<i>" for the module-level `taxId`.

    Returns:
        None.
    """
    metadata = io.loadmat("{}{}".format(data_path, metadata_file))
    sourceIdentifiersTable = metadata["gene_id"]

    def getSourceGeneId(idx: int) -> str:
        # Each entry is nested two levels deep; unwrap it to the bare identifier
        return sourceIdentifiersTable[idx][0][0]

    idTable = getIdentifiersMapping()

    allData = [io.loadmat("{}{}".format(data_path, fn)) for fn in measurement_files]

    for data, fn in zip(allData, measurement_files):
        plotDatafileStatistics(data, fn)

    RPratios = np.stack([readReadthroughData(data)[5] for data in allData])
    ORFreads = np.stack([readReadthroughData(data)[3] for data in allData])
    ORFreads[np.isnan(ORFreads)] = 0.0
    print(ORFreads.shape)

    # Copy with NaN/inf zeroed, used only for the min/max report below
    RPratios_ = RPratios.copy()
    RPratios_[np.isnan(RPratios_)] = 0.0
    RPratios_[np.isinf(RPratios_)] = 0.0

    print("//")
    print(np.min(RPratios[~np.isnan(RPratios)]))
    print(np.max(RPratios[~np.isnan(RPratios)]))
    print(np.min(RPratios_))
    print(np.max(RPratios_))

    # Does the "RP ratios" metric correlate between the different experiments?
    rs = spearmanr(RPratios, axis=1, nan_policy="omit").correlation
    fig, ax1 = plt.subplots()
    sns.heatmap(rs, annot=True, ax=ax1)
    plt.savefig("RP_distribution_spearman.pdf")
    plt.close(fig)

    print(RPratios[0, :].shape)

    from data_helpers import SpeciesCDSSource, setCDSProperty

    for i, _ in enumerate(measurement_files):
        row = RPratios[i, :]
        isFinite = np.isfinite(row)

        # The selected indices are looked up in the unfiltered gene_id table, so
        # they must be positions in the full row. Taking np.nonzero() of the
        # already-filtered row (as was done before) yields positions in the
        # compressed array, which point at the wrong genes whenever the row
        # contains NaN/inf values.
        with np.errstate(invalid="ignore"):  # NaN comparisons are masked out by isFinite
            aboveThreshold = row > readthroughThreshold

        selectedPos = frozenset(np.nonzero(isFinite & aboveThreshold)[0])
        selectedNeg = frozenset(np.nonzero(isFinite & ~aboveThreshold)[0])

        print("///////////////////////")
        print(i)

        positiveIdentifiersSourceFmt = frozenset(getSourceGeneId(x) for x in selectedPos)
        negativeIdentifiersSourceFmt = frozenset(getSourceGeneId(x) for x in selectedNeg)
        assert (not positiveIdentifiersSourceFmt.intersection(negativeIdentifiersSourceFmt))

        # Sets rather than lists: membership is tested once per protein below.
        # Identifiers missing from idTable contribute None, as before.
        positiveIdentifiersNativeFmt = frozenset(idTable.get(x, None) for x in positiveIdentifiersSourceFmt)
        negativeIdentifiersNativeFmt = frozenset(idTable.get(x, None) for x in negativeIdentifiersSourceFmt)

        countMarkedPositive = 0
        countMarkedNegative = 0

        for protId in SpeciesCDSSource(taxId):
            valForProt = None

            # Positive takes precedence if a protein appears in both sets
            if protId in positiveIdentifiersNativeFmt:
                valForProt = "1"
                countMarkedPositive += 1
            elif protId in negativeIdentifiersNativeFmt:
                valForProt = "0"
                countMarkedNegative += 1

            if valForProt is not None:
                setCDSProperty(taxId, protId, "readthrough-v2.ex{}".format(i), valForProt, overwrite=True)

        print(countMarkedPositive)
        print(countMarkedNegative)
# Base address and request path (with a %s slot for the identifier) of the
# cross-references endpoint queried below
server = "http://rest.ensemblgenomes.org"
ext = "/xrefs/id/%s?content-type=application/json;all_levels=1"


def getRecord(protid):
    """Fetch the cross-reference record for one identifier and return the decoded JSON.

    A failed request raises through raise_for_status(). Each successful
    request is followed by a 0.5 second pause before the body is decoded.
    """
    url = server + ext % protid
    response = requests.get(url)  #headers={ "Content-Type" : "application/json"})

    if not response.ok:
        response.raise_for_status()
        sys.exit()

    # Rate-limit requests
    sleep(0.5)

    return response.json()


# Print the record of every protein of the species given by the module-level `taxid`
for protid in SpeciesCDSSource(taxid):
    record = getRecord(protid)
    print(record)

#with open(f, 'r') as csvfile:
#    for row in csv.reader(csvfile, delimiter='\t'):
#        #['3706992', '224308.Bsubs1_010100004063', '4.46']
#        paxId = row[1].split(".")[1]
#        pa = float(row[2])
def processGenome(args, taxId):
    """Extract the region around each CDS start and write it, with aSD energies, to a CSV file.

    For every protein of the species:
      * the matching feature is looked up in the genome model;
      * its 5' flanking region is located (proteins without one are skipped);
      * a short sequence window at the start of the feature is extracted, on
        the appropriate strand;
      * the protein is skipped (with a warning) unless that window ends in 'TG'.

    The windows of the retained proteins, minus their last 3 nucleotides, are
    passed to calculateaSDEnergies(). One line per retained protein is then
    written to outputData.format(taxId):
        protId,geneId,strand,flankingRegionLength,windowSequence,aSDvalue

    Args:
        args: passed through unchanged to calculateaSDEnergies().
        taxId: taxonomy identifier of the species.

    Returns:
        None.
    """
    # NOTE(review): the local names say "3'UTR" but the region comes from
    # find5PrimeFlankingRegion() and is tested for a start codon - the names
    # look like a leftover; they are kept to limit the change.
    totalProteinsProcessed = 0
    totalSkipped = 0
    seqsForWriting = []
    recordsForWriting = {}

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        totalProteinsProcessed += 1

        geneId = cds.getGeneId()

        # feature[0] selects the molecule model, feature[1] is the feature itself
        feature = gm.findFeatureById(protId)
        strand = feature[1].data['strand']
        moleculeModel = gm.moleculeModels[feature[0]]

        # This lookup and its checks are the same for both strands
        otherFeature = moleculeModel.find5PrimeFlankingRegion(feature[1])
        if otherFeature is None:
            totalSkipped += 1
            continue

        assert(otherFeature['downstream-feature'].begin <= otherFeature['downstream-feature'].end)

        if strand == '+':
            flanking3UTRRegionLengthNt = otherFeature['curr-feature'].begin - otherFeature['downstream-feature'].end
            threePrimeUTRCoords = (feature[1].begin-20, feature[1].begin+2, False)  # include the first 3 nucleotides of the CDS
        else:
            flanking3UTRRegionLengthNt = otherFeature['downstream-feature'].begin - otherFeature['curr-feature'].end
            threePrimeUTRCoords = (feature[1].end-3, feature[1].end+20, True)  # include the first 3 nucleotides of the CDS

        threePrimeUTR = moleculeModel.getSequence(*threePrimeUTRCoords)

        # A strongly negative length means the neighbouring features overlap;
        # this is reported but the gene is still processed
        if flanking3UTRRegionLengthNt < -50:
            print("Warning: found gene with apparent long overlap: {},{},{},{},{}".format(protId, geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq))

        # The window is expected to end with the last two letters of the start codon
        if threePrimeUTR.seq[-2:] != 'TG':
            print("Warning: skipping gene without start codon at the expected place: {},{},{},{},{}".format(protId, geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq))
            totalSkipped += 1
            continue

        # All done - emit the output
        recordsForWriting[protId] = (geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq)

        # The last 3 nucleotides (the start of the CDS) are not passed on
        seqsForWriting.append(SeqRecord(Seq(threePrimeUTR.seq[:-3], NucleotideAlphabet), id=protId))

    aSD = calculateaSDEnergies(seqsForWriting, args, taxId)
    print(len(aSD))

    with open(outputData.format(taxId), 'wt') as fout:
        for protId, record in recordsForWriting.items():
            # Proteins with no aSD result are written with the value None
            aSDval = aSD.get(protId, None)
            vals = (protId,) + record + (aSDval,)
            fout.write("{},{},{},{},{},{}\n".format(*vals))

    print("Processed {} coding sequences for taxid {}".format(totalProteinsProcessed, taxId))
    print("Skipped {} coding sequences".format(totalSkipped))
def readSeriesResultsForSpecies(seriesSourceNumber, species, minShuffledGroups=20, maxShuffledGroups=20, shuffleType=db.Sources.ShuffleCDSv2_python, cdsFilter=None, returnCDS=True):
    """Yield the computed series results of every sufficiently-covered CDS.

    Args:
        seriesSourceNumber: passed to getAllComputedSeqsForSpecies().
        species: a single taxid, or an iterable of taxids (a string is rejected).
        minShuffledGroups: minimum number of computed shuffles a CDS needs in
            order to be included.
        maxShuffledGroups: at most this many shuffles are returned per CDS.
        shuffleType: shuffle type used when listing and fetching shuffled sequences.
        cdsFilter: optional predicate taking a CDSHelper; CDSs for which it
            returns a false value are ignored (and not counted as skipped).
        returnCDS: when true, the CDSHelper is included in each yielded dict.

    Yields:
        A dict with keys "taxid" and "content" (decoded results, native
        sequence first; entries with no result are None), plus "cds" when
        `returnCDS` is true.

    Raises:
        Exception: if `species` is a string.
    """
    if not isinstance(species, Iterable):
        species = (species, )  # assume we got a single (numeric) taxid value
    elif isinstance(species, basestring):
        # usually, species will be a sequence of numeric taxid values
        raise Exception("species cannot be string")

    assert (minShuffledGroups <= maxShuffledGroups)

    def decodeOrNone(rawRecord):
        # Missing results stay None; everything else is decompressed, then decoded
        if rawRecord is None:
            return None
        return decodeJsonSeriesRecord(decompressSeriesRecord(rawRecord))

    for taxIdForProcessing in species:
        print("Procesing %d sequences for tax-id %d (%s)..." % (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing, getSpeciesName(taxIdForProcessing)))

        computed = getAllComputedSeqsForSpecies(seriesSourceNumber, taxIdForProcessing, maxShuffledGroups, shuffleType=shuffleType)
        computedIds = frozenset(computed.keys())
        print("Collecting data from %d computation results..." % len(computed))

        skipped = 0
        selected = 0

        # Iterate over all CDS entries for this species
        for protId in SpeciesCDSSource(taxIdForProcessing):
            cds = CDSHelper(taxIdForProcessing, protId)

            if cdsFilter is not None and not cdsFilter(cds):
                continue

            cdsSeqId = cds.seqId()
            shuffledIds = cds.shuffledSeqIds(shuffleType=shuffleType)

            # How many shuffles (for this cds) exist in the data we found?
            computedShufflesCount = len(computedIds & frozenset(shuffledIds))

            # Both the native sequence and enough shuffles must have results
            if cdsSeqId not in computedIds or computedShufflesCount < minShuffledGroups:
                skipped += 1
                continue

            # Get the computed results for this CDS: native first, then at most
            # maxShuffledGroups shuffles
            seqIds = [cds.seqId()]
            seqIds += list(cds.shuffledSeqIds(shuffleType=shuffleType))
            seqIds = seqIds[:maxShuffledGroups + 1]

            results = [computed.get(seqId) for seqId in seqIds]

            availableCount = sum(1 for item in results if item is not None)
            if availableCount < minShuffledGroups:
                print("Not enough results found for %s" % protId)
                skipped += 1
                continue

            # Decode the results
            results = [decodeOrNone(item) for item in results]

            if returnCDS:
                yield {
                    "taxid": taxIdForProcessing,
                    "content": results,
                    "cds": cds
                }
            else:
                yield {"taxid": taxIdForProcessing, "content": results}

            # Drop the local references once the consumer has resumed us
            del results
            del cds

            selected += 1

            if rl():
                print("# %s - %d records included, %d records skipped" % (datetime.now().isoformat(), selected, skipped))