# TODO: Add support for step-size >1

# Establish DB connections
#r = redis.StrictRedis(host=config.host, port=config.port, db=config.db)
#session = db.Session()

skipped = 0
selected = 0
alreadyCompleted = 0
totalMissingResults = 0

queuedDelayedCalls = []

for taxIdForProcessing in species:
    print("Processing %d sequences for tax-id %d (%s)..."
          % (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing,
             getSpeciesName(taxIdForProcessing)))

    stats = Counter()

    # Iterate over all CDS entries for this species
    # TODO - preloading all sequences and results should optimize this
    for protId in SpeciesCDSSource(taxIdForProcessing):
        stats['all-sequences'] += 1
        #protId = codecs.decode(protId)

        # Filtering
        # Only process 1/N of the sequences, selected randomly (N=randomFraction)
        # (if randomFraction==1, all sequences will be processed)
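        # A plausible completion of the 1/N filter described above (a sketch;
        # randomFraction and randint() are assumed to be defined elsewhere in
        # this script):
        if randomFraction > 1 and randint(1, randomFraction) != 1:
            skipped += 1
            continue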
# metadata server (redis)
r = redis.StrictRedis(host=config.host, port=config.port, db=config.db,
                      password=config.password)
# sequences server (mysql)
session = db.Session()

visitedProteinIds = set()

assert (r.exists("species:taxid:%d:name" % taxId))

if (seqSourceTag == db.Sources.External):
    # Clear any previously imported CDSs...
    #r.delete(speciesCDSList % (taxId,))
    count = data_helpers.countSpeciesCDS(taxId)

    if (count > 0 and (not args.dry_run)):
        print("%d sequences already exist for species %d. Aborting..."
              % (count, taxId))
        sys.exit(-1)
elif (sequenceType == "fixCDSkey"):
    r.delete(speciesCDSList % (taxId, ))  # Delete and reconstruct the CDS key
else:
    assert (data_helpers.countSpeciesCDS(taxId) > 0)


def getCrc(seq):
    # crc32() requires bytes under Python 3, so encode the lowercased sequence
    # first; masking with 0xffffffff keeps the result an unsigned 32-bit value.
    return crc32(str(seq).lower().encode("ascii")) & 0xffffffff
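# Illustrative use of getCrc (a sketch; Seq is Bio.Seq.Seq from Biopython).
# The CRC is case-insensitive, so it can flag re-imports of the same CDS:
#
#   >>> from Bio.Seq import Seq
#   >>> getCrc(Seq("ATGAAATAG")) == getCrc(Seq("atgaaatag"))
#   True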
argsParser.add_argument("--taxId", type=int, required=True) argsParser.add_argument("--keep-first-n-shuffles", type=int, default=None) args = argsParser.parse_args() # Configuration taxId = args.taxId #statsShuffles = RunningStats() statsShuffles = OfflineStats() recordsCount = 0 warningsCount = 0 rl = RateLimit(30) total = countSpeciesCDS(taxId) for protId in SpeciesCDSSource(taxId): cds = CDSHelper(taxId, protId) statsShuffles.push( cds.dropShuffledSeqs(lastItemToKeep=args.keep_first_n_shuffles)) recordsCount += 1 if (rl()): print("processed %d records (%.2g%%)" % (recordsCount, float(recordsCount) / total * 100)) # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # #if( recordsCount > 20 ):
def runDistributed():
    import _distributed
    import dask

    scheduler = _distributed.open()

    results = {}
    #taxids = []
    delayedCalls = []

    fractionSize = 20

    for taxId in allSpeciesSource():
        if randint(0, 20) > 0:
            continue

        if not getSpeciesProperty(taxId, 'paired-mRNA-fraction')[0] is None:
            continue

        size = countSpeciesCDS(taxId)
        # integer division - under Python 3, '/' would yield a float,
        # which range() below cannot accept
        numFractions = size // fractionSize

        for i in range(numFractions):
            call = dask.delayed(calcNativePairedFraction)(taxId, i,
                                                          numFractions)
            delayedCalls.append(call)
            #taxids.append(taxId)

    print("Starting %d calls..." % len(delayedCalls))

    futures = scheduler.compute(
        delayedCalls
    )  # submit all delayed calculations; obtain futures immediately

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        print(e)  # was print(E) - an undefined name

    print("\n")
    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    for f in futures:
        try:
            (taxId, fraction, cdsCount, countPairedNucleotides,
             countTotalNucleotides) = scheduler.gather(f)

            current = None
            if taxId in results:
                current = results[taxId]
            else:
                current = (0, 0, 0, set())

            current = (current[0] + cdsCount,
                       current[1] + countPairedNucleotides,
                       current[2] + countTotalNucleotides,
                       current[3].union(set((fraction, ))))

            results[taxId] = current

        except Exception as e:
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        # Fraction indices should form the contiguous range 0..n-1;
        # otherwise, some partial results are missing for this species.
        if len(result[3]) != max(result[3]) + 1:
            #raise Exception("Found invalid number of items for taxId=%d" % taxId)
            print("Found invalid number of items for taxId=%d" % taxId)
            continue

        fraction = float(result[1]) / result[2]

        setSpeciesProperty(taxId, "paired-mRNA-fraction", "%.4g" % fraction,
                           "computed (v3)", overwrite=False)

        print("TaxId: %d\t\tFraction: %.4g" % (taxId, fraction))

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    return results
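# Illustrative check (not part of the pipeline): results[taxId] accumulates a
# tuple (cdsCount, pairedNt, totalNt, fractionIndices). A species is accepted
# only when its fraction indices form the contiguous range 0..n-1:
#
#   >>> indices = {0, 1, 2, 3}
#   >>> len(indices) == max(indices) + 1   # all fractions arrived
#   True
#   >>> indices = {0, 1, 3}
#   >>> len(indices) == max(indices) + 1   # fraction 2 is missing
#   False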
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
import sys
from time import sleep
from data_helpers import SpeciesCDSSource, CDSHelper, getSpeciesName, countSpeciesCDS, matchCDSKeyNamesSource, r
from rate_limit import RateLimit

taxId = int(sys.argv[1])

rl = RateLimit(10)

if (countSpeciesCDS(taxId) == 0):
    print("Species %d (%s) doesn't have any proteins..."
          % (taxId, getSpeciesName(taxId)))
    print("Nothing left to do...")
    sys.exit(0)

print("Species %d (%s) has %d proteins stored."
      % (taxId, getSpeciesName(taxId), countSpeciesCDS(taxId)))
print("Will delete it in 10 seconds...")
sleep(10)

count = 0
for protId in SpeciesCDSSource(taxId):
    print(protId)
    cds = CDSHelper(taxId, protId)
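# Usage of the deletion script above (the filename is hypothetical; the single
# positional argument is the taxId, per sys.argv[1] above):
#
#   python delete_species_cds.py 511145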
def standalone():
    argsParser = argparse.ArgumentParser()
    argsParser.add_argument("--taxid", type=int)
    argsParser.add_argument("--input")
    argsParser.add_argument("--variant",
                            type=parseOption(
                                set(("yeastgenome", "NCBI", "Ensembl", "JGI")),
                                "variant"))
    argsParser.add_argument("--type",
                            type=parseOption(
                                set(("cds", "shuffle", "fixCDSkey")),
                                "sequence type"))
    argsParser.add_argument("--dry-run", action="store_true", default=False)
    argsParser.add_argument("--output-fasta")
    argsParser.add_argument("--gene-ids-file")
    argsParser.add_argument("--alt-protein-ids",
                            type=parseOption(set(("locus_tag", )),
                                             "alt-protein-id"))
    argsParser.add_argument("--headers-from-another-fasta")
    argsParser.add_argument("--ignore-id-check",
                            action="store_true",
                            default=False)
    args = argsParser.parse_args()

    if (args.output_fasta):
        if (args.output_fasta == args.input):
            raise Exception("Fasta output file cannot match input file!")

    #if( len(sys.argv) < 5 ):
    #    print("Usage: %s <taxid> <fasta-file> <fasta-variant> <cds|shuffle>" % (sys.argv[0],))
    #    sys.exit(-1)

    # command-line arguments
    taxId = args.taxid

    f = None
    if (args.input[-3:] == ".gz"):
        # open in text mode - SeqIO.parse expects str, not bytes
        f = gzip.open(args.input, "rt")
    elif (args.input[-4:] == ".bz2"):
        raise NotImplementedError("bz2 input not supported yet")  # TODO: impl this...
    else:
        f = open(args.input, 'r')

    #sequenceFormat = args.variant
    sequenceType = args.type

    if (sequenceType == "cds"):
        seqSourceTag = db.Sources.External
    elif (sequenceType == "shuffle"):
        seqSourceTag = db.Sources.ShuffleCDSv2_matlab
    elif (sequenceType == "fixCDSkey"):
        seqSourceTag = None
    else:
        raise Exception("Unknown sequence type '%s'" % sequenceType)

    # establish connections
    # metadata server (redis)
    #r = redis.StrictRedis(host=config.host, port=config.port, db=config.db, password=config.password)
    # sequences server (mysql)
    #session = db.Session()

    visitedProteinIds = set()

    assert (r.exists("species:taxid:%d:name" % taxId))

    if (seqSourceTag == db.Sources.External):
        # Clear any previously imported CDSs...
        #r.delete(speciesCDSList % (taxId,))
        count = data_helpers.countSpeciesCDS(taxId)

        if (count > 0 and (not args.dry_run)):
            print("%d sequences already exist for species %d. Aborting..."
                  % (count, taxId))
            sys.exit(-1)

    elif (sequenceType == "fixCDSkey"):
        r.delete(speciesCDSList % (taxId, ))
        # Delete and reconstruct the CDS key

    else:
        assert (data_helpers.countSpeciesCDS(taxId) > 0)

    reNuclearYeastGene = re.compile(r"Y[A-P][RL]\d+[CW](-[A-Z])?")

    geneIdsToInclude = set()
    if (args.gene_ids_file):
        with open(args.gene_ids_file, "r") as genesFile:
            for geneId in genesFile:
                geneIdsToInclude.add(geneId.rstrip())

    reNCBIattributes = re.compile(r"\[(\S+)=([^\]]+)\]")
    reNCBIbareheader = re.compile(r"\w+\|\w+\.\d+_cds_(\w+.\d+)_\d+")

    outRecords = []

    headersFromAnotherFasta = {}
    if args.headers_from_another_fasta:
        with open(args.headers_from_another_fasta, "r") as f2:
            for record in SeqIO.parse(f2, "fasta", alphabet=generic_dna):
                assert (not record.id in headersFromAnotherFasta)
                headersFromAnotherFasta[record.id] = record.description

    cdsCount = 0
    notFoundCount = 0
    skippedCount = 0

    validNucleotideChars = str.maketrans("ACGTacgt", "%%%%%%%%")

    #print("Opening fasta file: {}".format(f))
    for record in SeqIO.parse(f, "fasta", alphabet=generic_dna):
        #proteinId = regexLocusId.match(record.id).group(1) # Work-around for multiple-transcript identifiers in JGI's Chlamy genome

        if args.headers_from_another_fasta:
            record.description = headersFromAnotherFasta[record.id]

        numNonNucleotideChars = len(record.seq) - str(
            record.seq).translate(validNucleotideChars).count("%")
        if numNonNucleotideChars:
            print("Skipping record %s, containing %d non-nucleotide or ambiguous symbols"
                  % (record.id, numNonNucleotideChars))
            skippedCount += 1
            continue

        # yeastgenome.org - skip suspected pseudo-genes
        if (args.variant == "yeastgenome"
                and record.description.find("Dubious ORF") != -1):
            skippedCount += 1
            continue

        # yeastgenome.org - skip mitochondrial genes
        if (args.variant == "yeastgenome"):
            geneType = record.id[0]
            if geneType == "Q" or geneType == "R":
                skippedCount += 1
                continue

        # yeastgenome.org - verify gene-id conforms to:
        # http://www.yeastgenome.org/help/community/nomenclature-conventions
        if (args.variant == "yeastgenome"):
            geneId = record.id
            assert (reNuclearYeastGene.match(geneId))

        # Obtain attributes mapping
        attributes = []
        if (args.variant == "NCBI"):
            attributes = dict(re.findall(reNCBIattributes,
                                         record.description))

        if (args.variant == "NCBI"):
            if ('pseudo' in attributes and attributes['pseudo'] == 'true'):
                print("Skipping pseudo-gene entry %s" % (record.id, ))
                skippedCount += 1
                continue

        # Determine gene id
        proteinId = None
        additionalProteinIds = set()
        altProteinId = None

        if (args.variant == "yeastgenome"):
            proteinId = record.id

        elif (args.variant == "NCBI"):
            if (sequenceType == "shuffle" and not attributes):
                # Workaround for shuffle-seq files missing the header...
                # Extract the protein-id from a sequence-id like this:
                # >lcl|NC_002516.2_cds_NP_064721.1_1
                if not args.alt_protein_ids:
                    proteinId = reNCBIbareheader.match(record.id).group(1)

                elif args.alt_protein_ids == "locus_tag":
                    if ('locus_tag' not in attributes):
                        print("Skipping entry %s missing locus_tag - %s"
                              % (record.id, attributes))
                        skippedCount += 1
                        continue
                    proteinId = attributes['locus_tag']
                    print(proteinId)
                else:
                    assert False

            else:
                # Note - not currently used
                #if 'db_xref' in attributes:
                #    _db_xrefs = attributes['db_xref'].split(",")
                #    db_xrefs = dict(map( lambda x: tuple(x.split(":")), _db_xrefs))
                if not args.alt_protein_ids:
                    if ('protein_id' not in attributes):
                        print("Skipping entry %s missing protein_id - %s"
                              % (record.id, attributes))
                        skippedCount += 1
                        continue
                    proteinId = attributes['protein_id']
                elif args.alt_protein_ids == "locus_tag":
                    if ('locus_tag' not in attributes):
                        print("Skipping entry %s missing locus_tag - %s"
                              % (record.id, attributes))
                        skippedCount += 1
                        continue
                    proteinId = attributes['locus_tag']

                    if ('protein_id' in attributes):
                        altProteinId = attributes['protein_id']
                else:
                    assert (False)

        elif (args.variant == "Ensembl"):
            # Sample id: ABD29211.1
            dotPos = record.id.rfind('.')
            if (dotPos > 3):
                proteinId = record.id[:dotPos]
                # also allow matching the full format (including the
                # transcript-id) - some CDS files include it...
                additionalProteinIds.add(record.id)
            else:
                proteinId = record.id

        elif (args.variant == "JGI"):
            # Variant 1 (Phytozome, Mpus)
            # (gff3):  60050
            # (fasta): 60050
            # Variant 2 (Phytozome, Dsal)
            # (gff3):  Dusal.1637s00001.1
            # (fasta): Dusal.1637s00001.1
            # Variant 3:
            # (gff3):  jgi.p|Ostta1115_2|10314
            # (fasta): jgi|Ostta1115_2|10314|CE10313_131
            proteinId = record.id

            if record.id.startswith("jgi|"):
                parts = record.id.split('|')
                parts[0] = 'jgi.p'  # add the '.p'
                additionalProteinIds.add('|'.join(
                    parts[:3]))  # drop the suffix (parts[3])

        else:
            assert (False)

        if not args.ignore_id_check:
            assert (len(proteinId) > 2)

        # Skip sequences that have non-standard translations
        if (args.variant == "NCBI"):
            if "transl_except" in attributes:
                print("Skipping %s (because of transl_except)"
                      % (proteinId, ))
                skippedCount += 1
                continue

        # If an inclusion list (white list) is defined, skip sequences missing from it
        if args.gene_ids_file:
            if (proteinId not in geneIdsToInclude):
                # Also try the additional ids
                if (not geneIdsToInclude.intersection(additionalProteinIds)):
                    print("Skipping %s (sequence %s, alternate ids=%s)"
                          % (proteinId, record.id, list(additionalProteinIds)))
                    skippedCount += 1
                    continue

        print("Inserting %s (sequence %s)..." % (proteinId, record.id))

        # Verify there are no duplicate entries
        if (proteinId in visitedProteinIds):
            # was print("MULTIPLE Entry: %s", proteinId) - printed a tuple
            print("MULTIPLE Entry: %s" % proteinId)
            skippedCount += 1
            continue
        #assert(proteinId not in visitedProteinIds)
        visitedProteinIds.add(proteinId)

        # Write the filtered sequences into an output file (if needed)
        # Note - this also works in dry-run...
        if (args.output_fasta):
            outRecords.append(record)

        if (args.dry_run):
            continue

        if (sequenceType == "fixCDSkey"):
            cds = data_helpers.CDSHelper(taxId, proteinId)
            seqId = cds.seqId()
            if (not seqId is None):
                r.sadd(speciesCDSList % (taxId, ), proteinId)
            else:
                print("Couldn't find entry for proteinId=%s" % proteinId)

            continue  # Skip the rest of the processing...
        storeSeqInDB(nucSeq=record.seq,
                     taxId=taxId,
                     proteinId=proteinId,
                     seqSourceTag=seqSourceTag)

        cdsCount += 1

    if (notFoundCount + skippedCount > 0):
        print("Warning: %d entries skipped and %d entries not found"
              % (skippedCount, notFoundCount))

    print("Processed %d CDS entries" % (cdsCount, ))
    print("(out of %d CDS entries for this species)"
          % (r.scard("species:taxid:%d:CDS" % (taxId, ))))

    if (args.output_fasta):
        with open(args.output_fasta, "w") as outfile:
            SeqIO.write(outRecords, outfile, "fasta")
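# Example invocation of standalone() via the command line (a sketch: the script
# filename and input path are hypothetical; the flags match the parser above):
#
#   python store_seqs_from_fasta.py --taxid 511145 \
#       --input GCF_000005845.2_cds_from_genomic.fna.gz \
#       --variant NCBI --type cds --dry-run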
def readSeriesResultsForSpecies(seriesSourceNumber,
                                species,
                                minShuffledGroups=20,
                                maxShuffledGroups=20,
                                shuffleType=db.Sources.ShuffleCDSv2_python,
                                cdsFilter=None,
                                returnCDS=True):
    if isinstance(
            species, Iterable
    ):  # usually, species will be a sequence of numeric taxid values
        if isinstance(species, str):  # basestring is Python 2 only
            raise Exception("species cannot be string")
        # all set - proceed...
    else:
        species = (species, )  # assume we got a single (numeric) taxid value

    assert (minShuffledGroups <= maxShuffledGroups)

    for taxIdForProcessing in species:
        print("Processing %d sequences for tax-id %d (%s)..."
              % (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing,
                 getSpeciesName(taxIdForProcessing)))

        computed = getAllComputedSeqsForSpecies(seriesSourceNumber,
                                                taxIdForProcessing,
                                                maxShuffledGroups,
                                                shuffleType=shuffleType)
        computedIds = frozenset(computed.keys())
        print("Collecting data from %d computation results..." % len(computed))

        skipped = 0
        selected = 0
        alreadyCompleted = 0

        # Iterate over all CDS entries for this species
        for protId in SpeciesCDSSource(taxIdForProcessing):
            cds = CDSHelper(taxIdForProcessing, protId)

            if (not cdsFilter is None) and (not cdsFilter(cds)):
                continue

            cdsSeqId = cds.seqId()

            shuffledIds = cds.shuffledSeqIds(shuffleType=shuffleType)

            # How many shuffles (for this cds) exist in the data we found?
            computedShufflesCount = len(
                computedIds.intersection(frozenset(shuffledIds)))

            if (computedShufflesCount < minShuffledGroups
                    or (not cdsSeqId in computedIds)):
                #print("%s - found only %d groups, skipping" % (protId, computedShufflesCount))
                skipped += 1
                continue

            # Get the computed results for this CDS
            seqIds = [cds.seqId()]
            seqIds.extend(cds.shuffledSeqIds(shuffleType=shuffleType))
            if (len(seqIds) > maxShuffledGroups + 1):
                seqIds = seqIds[:maxShuffledGroups + 1]

            results = [computed.get(x) for x in seqIds]

            if (results is None or len(
                [() for x in results if not x is None]) < minShuffledGroups):
                print("Not enough results found for %s" % protId)
                skipped += 1
                continue

            # Decode the results
            results = list(
                map(
                    lambda x: decodeJsonSeriesRecord(decompressSeriesRecord(x))
                    if not x is None else None, results))

            if (returnCDS):
                yield {
                    "taxid": taxIdForProcessing,
                    "content": results,
                    "cds": cds
                }
            else:
                yield {"taxid": taxIdForProcessing, "content": results}

            del results
            del cds
            selected += 1

            if (rl()):  # rl is assumed to be a module-level RateLimit instance
                print("# %s - %d records included, %d records skipped"
                      % (datetime.now().isoformat(), selected, skipped))
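# Minimal usage sketch for the generator above (hedged: seriesSource and the
# taxid are placeholders; real values depend on the series stored in redis):
#
#   seriesSource = ...  # a db.Sources series identifier
#   for entry in readSeriesResultsForSpecies(seriesSource, 511145, returnCDS=False):
#       profiles = entry["content"]  # decoded records: native seq first, then its shuffles
#       taxid = entry["taxid"]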
def speciesStatisticsAndValidityReport(args):
    import _distributed
    import dask  # needed below for dask.delayed

    speciesDf = pd.DataFrame({
        'TaxId': pd.Series([], dtype='int'),            # Species TaxId
        'Species': pd.Series([], dtype='str'),          # Species binomial name
        'Nickname': pd.Series([], dtype='str'),
        'Domain': pd.Categorical([]),                   # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),                   # Phylum name (string)
        'NumCDSs': pd.Series([], dtype='int'),          # CDS count for this species
        'NumCDSsInProfile': pd.Series([], dtype='int'), # Num seqs with 20 shuffled profiles for this species
        'AnnotatedNumCDSs': pd.Series([], dtype='int'),
        'CDSDifference': pd.Series([], dtype='float'),
        'NumNativeSeqs': pd.Series([], dtype='int'),
        'GCContentInCDS': pd.Series([], dtype='float'),
        'AnnotatedGCContent': pd.Series([], dtype='float'),
        'RowType': pd.Categorical([]),                  # Species count or total
        'Warnings': pd.Series([], dtype='str'),
        'CDSWarnings': pd.Series([], dtype='int'),
        'CDSWarnings_': pd.Series([], dtype='str'),
        'FirstAA': pd.Series([], dtype='str'),
        'LastAA': pd.Series([], dtype='str')
    })

    scheduler = _distributed.open()

    results = {}
    delayedCalls_native = []

    shuffledCounts = {}
    delayedCalls_shuffledProfiles = []

    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue  # always exclude species from the blacklist
        if args.taxid and taxId not in args.taxid:
            continue  # if a whitelist is specified, skip other species

        warnings = []

        ## DEBUG ONLY ##
        #if randint(0, 20) > 0:
        #    continue
        ## DEBUG ONLY ##

        cdsCountInRedis = countSpeciesCDS(taxId)
        #cdsCountProfiles = countx(taxId, (310, 10, "begin", 0), 102, 11)

        annotatedProteinCount = getSpeciesProperty(taxId, 'protein-count')[0]
        annotatedGCContent = getSpeciesProperty(taxId, 'gc-content')[0]

        proteinDifference = None
        if not annotatedProteinCount is None:
            proteinDifference = (1.0 - float(cdsCountInRedis) /
                                 float(annotatedProteinCount)) * 100.0
            if abs(proteinDifference) > 9.9:
                warnings.append("CDS_count")
        else:
            warnings.append("No_CDS_count")

        # Determine lineage
        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)
        ranks = ncbiTaxa.get_rank(lineage)

        # Determine kingdom/domain
        domain = ""
        kingdomTaxId = [
            t for t, rank in ranks.items() if rank == 'superkingdom'
        ]
        if not kingdomTaxId:
            kingdomTaxId = [
                t for t, rank in ranks.items() if rank == 'kingdom'
            ]
        domain = names[kingdomTaxId[0]]

        # Determine phylum
        phylumName = ""
        phylumTaxId = [t for t, rank in ranks.items() if rank == 'phylum']
        if phylumTaxId:
            phylumName = names[phylumTaxId[0]]

        speciesDf = speciesDf.append(
            pd.DataFrame({
                'TaxId': pd.Series([taxId], dtype='int'),
                'Species': pd.Series([getSpeciesName(taxId)], dtype='str'),
                'Nickname': pd.Series([shortNames[taxId]], dtype='str'),
                'Domain': pd.Categorical([domain]),      # Bacteria, Eukaryota, Archaea
                'Phylum': pd.Categorical([phylumName]),  # Phylum name (string)
                'NumCDSs': pd.Series([cdsCountInRedis], dtype='int'),
                'NumCDSsInProfile': pd.Series([0], dtype='int'),
                'AnnotatedNumCDSs': pd.Series([
                    0 if annotatedProteinCount is None
                    else annotatedProteinCount
                ], dtype='int'),
                'CDSDifference': pd.Series([proteinDifference], dtype='float'),
                'NumNativeSeqs': pd.Series([0], dtype='int'),
                'GCContentInCDS': pd.Series([0.0], dtype='float'),
                'AnnotatedGCContent': pd.Series([annotatedGCContent], dtype='float'),
                'RowType':
pd.Categorical(["species"]), # Species count or total 'Warnings': pd.Series([", ".join(warnings)], dtype='str'), # 'CDSWarnings': pd.Series([0], dtype='int'), 'CDSWarnings_': pd.Series([""], dtype='str'), 'FirstAA': pd.Series([""], dtype='str'), 'LastAA': pd.Series([""], dtype='str'), 'Source': pd.Series([""], dtype='str') })) fractionSize = 1000 # How many sequences (roughly) to process in each task numFractions = cdsCountInRedis / fractionSize if numFractions == 0: numFractions = 1 for i in range(numFractions): # DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY # #if i%100!=5: continue # DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY # call = dask.delayed(calcNativeSequencesStatistics)(taxId, i, numFractions) delayedCalls_native.append(call) call = dask.delayed(countShuffledProfiles)(taxId, (310, 10, "begin", 0), 102, 11) delayedCalls_shuffledProfiles.append(call) speciesDf.set_index('TaxId', inplace=True) print("Starting {} calls...".format( len(delayedCalls_native) + len(delayedCalls_shuffledProfiles))) futures = scheduler.compute( delayedCalls_native + delayedCalls_shuffledProfiles ) # submit all delayed calculations; obtain futures immediately try: _distributed.progress(futures) # wait for all calculations to complete except Exception as e: print(E) print("\n") print("Waiting for all tasks to complete...") _distributed.wait(futures) results = {} errorsCount = 0 for f in futures: try: ret = scheduler.gather(f) if (len(ret) == 9): (taxId, fraction, cdsCount, gcCounts, totalCounts, cdsWarnings, warnings, firstAA, lastAA) = ret current = None if taxId in results: current = results[taxId] else: current = (0, 0, 0, 0, Counter(), Counter(), Counter()) current = (current[0] + cdsCount, current[1] + gcCounts, current[2] + totalCounts, current[3] + cdsWarnings, current[4] + warnings, current[5] + firstAA, current[6] + lastAA) results[taxId] = current elif (len(ret) == 2): (taxId, numShuffledSeqs) = ret shuffledCounts[taxId] = numShuffledSeqs else: assert (False) except Exception as e: print(e) errorsCount += 1 for taxId, result in results.items(): (numNativeSeqs, gcCounts, totalCounts, cdsWarnings, warnings, firstAA, lastAA) = result speciesDf.at[taxId, 'NumNativeSeqs'] = numNativeSeqs speciesDf.at[taxId, 'GCContentInCDS'] = round( float(gcCounts) / float(totalCounts) * 100.0, 1) speciesDf.at[taxId, 'CDSWarnings'] = cdsWarnings speciesDf.at[taxId, 'CDSWarnings_'] = summarizeCounter(warnings) speciesDf.at[taxId, 'FirstAA'] = summarizeCounter(firstAA) speciesDf.at[taxId, 'LastAA'] = summarizeCounter(lastAA) #if numNativeSeqs < species.at[taxId, 'NumCDSs']: # pass for taxId, result in shuffledCounts.items(): speciesDf.at[taxId, 'NumCDSsInProfile'] = result speciesDf = speciesDf.sort_values(by=['Domain', 'Species']) # sort rows speciesDf.to_html('species_report.html', float_format='{0:.1f}'.format, columns=[ 'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile', 'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs', 'GCContentInCDS', 'AnnotatedGCContent', 'Phylum', 'Domain', 'Warnings', 'CDSWarnings', 'CDSWarnings_', 'FirstAA', 'LastAA' ]) with open("species_report_simple.rst", "w") as f: f.write( speciesDf.drop([ 'RowType', 'Warnings', 'CDSWarnings', 'CDSWarnings_', 'FirstAA', 'LastAA', 'CDSDifference' ], axis=1).pipe(tabulate, headers='keys', tablefmt='rst')) speciesDf.to_html('species_report_simple.html', float_format='{0:.1f}'.format, columns=[ 'Species', 'Nickname', 'NumCDSs', 
                          'NumCDSsInProfile', 'AnnotatedNumCDSs',
                          'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent',
                          'Phylum', 'Domain'
                      ])

    speciesDf.to_excel('species_report.xlsx', sheet_name='Species summary')