def _commonLineagePrefix(lineages):
    """Return the longest leading run of taxids shared by all given lineages.

    `lineages` is a non-empty list of taxid lists.  With a single lineage the
    result is a copy of that lineage; with several, comparison stops at the
    first position where they disagree.
    """
    shared = []
    for level in zip(*lineages):
        if any(t != level[0] for t in level[1:]):
            break
        shared.append(level[0])
    return shared


def pruneReferenceTree_Nmicrobiol201648(taxa):
    """Annotate the reference tree with NCBI lineages and prune it to `taxa`.

    The function:
      1. Reads the tree text from `nmicrobiol201648_s6_PATHd8`, passes it
         through `stripTreeInternalSupport` and parses it with `PhyloTree`.
      2. Renames every leaf whose label appears in `readTranslationMap()` to
         its taxid (the old label is kept in `node.label`) and attaches the
         features `taxId` and `lineageItems`; other leaves are renamed "n/a".
         The two name lists are written to `matched_names.txt` and
         `unmatched_names.txt` in the current directory.
      3. Walks internal nodes children-first and gives each one the lineage
         prefix shared by its annotated children (`lineageItems`) plus the
         last shared taxid (`testK`).  `testL` is then set on the node that
         was recorded last for each such taxid.
      4. Looks up every taxid in `taxa`, first directly and then through
         `getContainingSpeciesLevelTaxon`.
      5. Copies the tree and calls `pruneTree` on the copy with the nodes
         found in step 4.

    Progress and diagnostic information is printed along the way.

    Args:
        taxa: iterable of integer taxids that should appear in the pruned tree.

    Returns:
        Tuple `(tree, tree2)`: the annotated full tree and the pruned copy.
    """
    with open(nmicrobiol201648_s6_PATHd8, "r") as f:
        treefmt = f.read()

    # Strip internal support, and parse the resulting string
    tree = PhyloTree(stripTreeInternalSupport(treefmt))

    unmatched = []
    matched = []

    labelToTaxId = readTranslationMap()

    numProcessed = 0
    numMatched = 0

    # Annotate tree leaves
    for node in tree.traverse():
        if not node.is_leaf():
            continue

        # Check if the label has a mapping in the id-conversion table
        matchingTaxId = labelToTaxId.get(node.name)

        if matchingTaxId is not None:  # match found
            node.label = node.name
            node.name = str(matchingTaxId)

            lineageItems = ncbiTaxa.get_lineage(matchingTaxId)  # TODO - Fix lineageItems ?
            node.add_features(taxId=matchingTaxId, lineageItems=lineageItems)

            matched.append("%s [%d]" % (node.label, matchingTaxId))
            numMatched += 1
        else:  # no match found
            unmatched.append(node.name)
            node.name = "n/a"

        numProcessed += 1
        # NOTE(review): rl() is defined elsewhere; presumably a rate limiter
        # for progress output - confirm.
        if rl():
            print("(processed %d matched %d)" % (numProcessed, numMatched))

    # Save unmatched names to file (for examination)
    with open("unmatched_names.txt", "w") as f:
        f.writelines(["%s\n" % x for x in sorted(unmatched)])

    # Save matched names to file (for examination)
    with open("matched_names.txt", "w") as f:
        f.writelines(["%s\n" % x for x in sorted(matched)])

    for _ in range(4):
        print("//" * 30)

    # Maps the deepest shared taxid of an internal node to id(node).  Later
    # nodes overwrite earlier ones, so after the postorder walk each taxid
    # points at the last node visited that carries it.
    outer = {}

    # Try to annotate non-leaf nodes with common taxonomic group
    for node in tree.traverse(strategy='postorder'):  # children first
        if node.is_leaf():
            continue

        # Collect the lineages of the children that have one; children
        # without the feature (e.g. unmatched leaves) are ignored.
        a = []
        for c in node.children:
            try:
                a.append(c.lineageItems)
            except AttributeError:
                pass

        print(">>" * 20)
        print(len(a))
        print(a)

        if not a:
            continue

        # Works for any number of annotated children (one child: its whole
        # lineage; several: their common prefix).
        out = _commonLineagePrefix(a)

        if out:
            print("//" * 20)
            print(out[-1])
            node.add_features(lineageItems=out, testK=out[-1])
            outer[out[-1]] = id(node)
            print(">>> %s" % out[-1])

            if out[-1] == 2:
                print("- - " * 5)
                print(a)
                print("- - " * 5)

    # Mark, for every shared taxid, the single node that `outer` ended up
    # pointing at.
    for node in tree.traverse(strategy='postorder'):  # children first
        if node.is_leaf():
            continue
        key = getattr(node, 'testK', None)
        if key is not None and outer[key] == id(node):
            node.add_features(testL=key)

    for _ in range(4):
        print("//" * 30)

    # Now we have our annotated reference phylogenetic tree

    # Diagnostic probe for three hard-coded taxids.
    for x in (45157, 4896, 44056):
        print(x)
        print(tree.search_nodes(taxId=x))

    f = set()      # requested taxids that were located in the tree
    fnodes = []    # the corresponding tree nodes (first hit for each taxid)
    notf = set()   # requested taxids that could not be located

    for x in taxa:
        print("==" * 20)
        print("Searching for %d" % x)

        found = tree.search_nodes(taxId=x)

        if found:
            f.add(x)
            fnodes.append(found[0])
            print("Exact match found")
        else:
            containingSpeciesLevelTaxon = getContainingSpeciesLevelTaxon(x)
            if x != containingSpeciesLevelTaxon:
                found = tree.search_nodes(taxId=containingSpeciesLevelTaxon)

                if not found:
                    # NOTE(review): nothing in this function sets a
                    # speciesLevelTaxon feature, so this lookup can only
                    # succeed if it is set elsewhere.
                    found = tree.search_nodes(speciesLevelTaxon=containingSpeciesLevelTaxon)

                # TODO - CONTINUE HERE

                if found:
                    f.add(x)
                    fnodes.append(found[0])
                    print("Found")
                else:
                    print("Not found at all...")

        if x not in f:
            notf.add(x)

    print("Found (%d): %s" % (len(f), f))
    print(ncbiTaxa.get_rank(list(f)))

    print("Couldn't find (%d): %s" % (len(notf), notf))
    print(list(ncbiTaxa.get_taxid_translator(list(notf)).values()))

    print(len(fnodes))

    tree2 = tree.copy()

    print("Before pruning: %d" % len(tree2))

    # NOTE(review): fnodes holds nodes of `tree`, not of the copy `tree2`;
    # whether that is correct depends on how pruneTree matches nodes - confirm.
    if fnodes:
        pruneTree(tree2, fnodes)

    print("After pruning: %d" % len(tree2))

    return (tree, tree2)
def nodeLayoutFunc(node):
    """Layout callback: decorate `node` with a name, effect-size bars and N.

    The node name is parsed as an integer taxid.  For taxids contained in
    `taxidsToKeep`, the matching rows of `df` (selected by `var`, the taxid
    and the entries of `ranges`) are used to add:
      * a text face with the taxon name (column 0),
      * one coloured bar per range showing the absolute effect size, with a
        minus sign for negative values and a marker when the p-value is
        below `significanceLevel` (column 0),
      * a text face with the species count (column 1).
    Every node gets a zero-size node style.

    `taxidsToKeep`, `ranges`, `df`, `var`, `baseFontSize`, `barScale` and
    `significanceLevel` are taken from the enclosing scope.
    """
    taxid = int(node.name)

    if taxid in taxidsToKeep:
        # There has to be an easier way to look up names...
        taxGroupName = ncbiTaxa.get_taxid_translator([taxid])[taxid]

        numRanges = len(ranges)
        print(numRanges)

        singleRange = (numRanges == 1)
        row = None
        rangeRows = None

        if numRanges == 0:
            assert (False)
        else:
            # Rows for this explanatory variable and taxonomic group.
            forThisTaxon = (df['ExplanatoryVar'] == var) & (df['TaxGroup'] == taxid)

            if singleRange:
                row = df[forThisTaxon & (df['Range'] == ranges[0])]
                assert (len(row) == len(ranges))
            else:
                # With several ranges, the row with Range == 0 serves as the
                # overall row for this node.
                row = df[forThisTaxon & (df['Range'] == 0)]
                assert (len(row) == 1)
                rangeRows = df[forThisTaxon & (df['Range'].isin(set(ranges)))]

        overallPval = float(row['Pvalue'].values[0])

        # Taxon name
        nameFace = TextFace("%s" % taxGroupName, fsize=baseFontSize * 2.5)
        nameFace.tight_text = True
        nameFace.margin_left = 20
        nameFace.margin_right = 0
        nameFace.margin_top = 40
        nameFace.margin_bottom = 12
        faces.add_face_to_node(nameFace, node, column=0)

        # For each range to be included in this plot, add a bar
        for rangeId in ranges:
            if singleRange:
                rowForThisRange = row
            else:
                rowForThisRange = rangeRows[rangeRows['Range'] == rangeId]
            assert (len(rowForThisRange) == 1)

            # Extract p-value and "effect-size" (signed R^2)
            effectSize = float(rowForThisRange['EffectSize'].values[0])
            pval = float(rowForThisRange['Pvalue'].values[0])

            isNegative = effectSize < 0

            # Set bar-graph color and significance markers
            if pval < significanceLevel:
                significanceMarker = " %s" % unichr(0x2731)
                barColor = "#1133ff" if isNegative else "#ff3311"
            else:  # not significant
                significanceMarker = ""
                barColor = "#b0b0f0" if isNegative else "#ccb090"

            # Add the minus sign if needed (more legible than a hyphen)
            signChar = unichr(0x2212) if isNegative else ""

            barLabel = {
                "text": "%s%.2g %s" % (signChar, abs(effectSize), significanceMarker),
                "fontsize": baseFontSize * 1.8,
                "color": "black"
            }
            bar = RectFace(width=abs(effectSize) * barScale,
                           height=baseFontSize * 3.5,
                           fgcolor=barColor,
                           bgcolor=barColor,
                           label=barLabel)
            bar.margin_top = 1
            bar.margin_left = 30
            bar.margin_right = 8
            bar.margin_bottom = 12
            faces.add_face_to_node(bar, node, column=0)

        # Species count
        details = TextFace("N=%d" % row['NumSpecies'], fsize=baseFontSize * 1.5)
        details.background.color = "#dfdfdf"
        details.margin_left = 6
        details.margin_right = 20
        faces.add_face_to_node(details, node, column=1)

    # Zero-size node style
    nstyle = NodeStyle()
    nstyle["size"] = 0
    node.set_style(nstyle)
def speciesByPhylaTable():
    """Tally species per phylum and write the phyla summary reports.

    Builds a table with one row per phylum returned by `parseReport()`, plus
    an "[Unknown]" and a "[Total]" row per domain and one overall
    "[All]"/"[Total]" row.  Every species from `allSpeciesSource()` that is
    not in `speciesToExclude` is counted under its phylum (or under its
    domain's "[Unknown]" row when its lineage has no phylum), and the distinct
    orders, families and genera seen under each phylum are counted as well.

    Output files (current directory): `phyla_report.html`,
    `phyla_report.xlsx`, `phyla_report.rst`, `phyla_report_missing.html` and
    `phyla_report_missing.xlsx`.  The non-empty table, the per-domain counts
    and a warning about skipped species are printed.
    """

    def makeRow(domain, phylum, taxId, parentTaxId, rowType):
        # One-row frame with all counters at zero (column order matters).
        return pd.DataFrame({
            'Domain': pd.Categorical([domain]),
            'Phylum': pd.Categorical([phylum]),
            'TaxId': pd.Series([taxId], dtype='int'),
            'ParentTaxId': pd.Series([parentTaxId], dtype='int'),
            'NumSpecies': pd.Series([0], dtype='int'),
            'NumOrders': pd.Series([0], dtype='int'),
            'NumFamilies': pd.Series([0], dtype='int'),
            'NumGenuses': pd.Series([0], dtype='int'),
            'RowType': pd.Categorical([rowType])
        })

    def taxaWithRank(ranks, wanted):
        # Taxids of the lineage whose rank equals `wanted`.
        return [t for t, rank in ranks.items() if rank == wanted]

    def basketRowId(frame, domain, phylum):
        # Index label (dummy TaxId) of the special row for (domain, phylum).
        return frame[(frame.Domain == domain) & (frame.Phylum == phylum)].index[0]

    allPhyla = parseReport()  # get all existing phyla

    domainCounts = Counter()
    phylaCounts = Counter()
    skippedCounts = Counter()
    # Tallying by class is disabled, since classes are not used for many taxons
    ordersByPhyla = {}
    familiesByPhyla = {}
    genusesByPhyla = {}

    phylaDf = pd.DataFrame({
        'Domain': pd.Categorical([]),                   # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),                   # Phylum name (string)
        'TaxId': pd.Series([], dtype='int'),            # Phylum TaxId
        'ParentTaxId': pd.Series([], dtype='int'),      # Parent TaxId
        'NumSpecies': pd.Series([], dtype='int'),       # Species count for this phylum
        'NumOrders': pd.Series([], dtype='int'),        # Orders count for this phylum
        'NumFamilies': pd.Series([], dtype='int'),      # Families count for this phylum
        'NumGenuses': pd.Series([], dtype='int'),       # Genuses count for this phylum
        'RowType': pd.Categorical([])                   # Phylum count or total
    })

    # Add item for each phylum
    for group, phyla in allPhyla.items():
        for phylum, record in phyla.items():
            taxId = record['taxId']
            phylaDf = phylaDf.append(
                makeRow(group, phylum, taxId, record['parentTaxId'], 'Phylum'))

            ordersByPhyla[taxId] = set()
            familiesByPhyla[taxId] = set()
            genusesByPhyla[taxId] = set()

    # Create "special" items, numbered with small dummy TaxIds starting at 1:
    # an "Unknown phylum" tally and a totals tally for each domain...
    nextDummyId = 1
    for group in allPhyla.keys():
        for label in ('[Unknown]', '[Total]'):
            phylaDf = phylaDf.append(makeRow(group, label, nextDummyId, 0, 'Total'))
            nextDummyId += 1

    # ...and the overall totals item
    phylaDf = phylaDf.append(makeRow('[All]', '[Total]', nextDummyId, 0, 'Total'))

    phylaDf.set_index('TaxId', inplace=True)

    skippedSpecies = []

    # Count the number of species under each phylum
    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue

        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)
        ranks = ncbiTaxa.get_rank(lineage)

        # Determine kingdom/domain (superkingdom preferred, kingdom as fallback)
        kingdomTaxId = taxaWithRank(ranks, 'superkingdom') or taxaWithRank(ranks, 'kingdom')
        domain = names[kingdomTaxId[0]]
        domainCounts[domain] += 1

        # Determine phylum
        phylumCandidates = taxaWithRank(ranks, 'phylum')
        if not phylumCandidates:
            # This table is structured by phylum; a species missing a phylum
            # is only counted in the "[Unknown]" row of its domain.
            skippedSpecies.append(taxId)
            skippedCounts[domain] += 1
            print("Skipping %d: (%s) missing phylum" % (taxId, names[taxId]))
            continue

        phylumTaxId = phylumCandidates[0]

        if phylumTaxId:
            phylaCounts[phylumTaxId] += 1  # tally this species under the specified phylum

        for rankName, seenByPhylum in (('order', ordersByPhyla),
                                       ('family', familiesByPhyla),
                                       ('genus', genusesByPhyla)):
            hits = taxaWithRank(ranks, rankName)
            if hits:
                seenByPhylum[phylumTaxId].add(hits[0])

    assert (sum(skippedCounts.values()) == len(skippedSpecies))

    # Update the phyla counts
    for phylaTaxId, counts in phylaCounts.items():
        phylaDf.loc[phylaTaxId, 'NumOrders'] = len(ordersByPhyla[phylaTaxId])
        phylaDf.loc[phylaTaxId, 'NumFamilies'] = len(familiesByPhyla[phylaTaxId])
        phylaDf.loc[phylaTaxId, 'NumGenuses'] = len(genusesByPhyla[phylaTaxId])
        phylaDf.loc[phylaTaxId, 'NumSpecies'] = counts

    # Update the "Unknown phyla" count for each domain
    for group, countMissing in skippedCounts.items():
        unknownRow = basketRowId(phylaDf, group, '[Unknown]')
        phylaDf.loc[unknownRow, 'NumSpecies'] = countMissing

    # Update the total for each domain
    for group, totalCount in domainCounts.items():
        totalRow = basketRowId(phylaDf, group, '[Total]')
        phylaDf.loc[totalRow, 'NumSpecies'] = totalCount

    # Update the overall total counts
    grandTotalRow = basketRowId(phylaDf, "[All]", '[Total]')
    phylaDf.loc[grandTotalRow, 'NumSpecies'] = sum(domainCounts.values())
    phylaDf.loc[grandTotalRow, 'NumOrders'] = sum(len(s) for s in ordersByPhyla.values())
    phylaDf.loc[grandTotalRow, 'NumFamilies'] = sum(len(s) for s in familiesByPhyla.values())
    phylaDf.loc[grandTotalRow, 'NumGenuses'] = sum(len(s) for s in genusesByPhyla.values())

    sortKeys = ['Domain', 'RowType', 'Phylum']

    # Prepare and save the final table (rows with at least one species)
    hasSpecies = phylaDf['NumSpecies'] > 0
    phylaReportDf = phylaDf[hasSpecies].sort_values(by=sortKeys)
    print(phylaReportDf)

    phylaReportDf.to_html('phyla_report.html',
                          columns=['Phylum', 'NumOrders', 'NumFamilies',
                                   'NumGenuses', 'NumSpecies', 'Domain'])
    phylaReportDf.to_excel('phyla_report.xlsx', sheet_name='Phyla Summary')

    rstTable = phylaReportDf.drop(
        ['RowType', 'NumFamilies', 'NumGenuses', 'NumOrders', 'ParentTaxId'],
        axis=1).pipe(tabulate, headers='keys', tablefmt='rst')
    with open("phyla_report.rst", "w") as f:
        f.write(rstTable)

    # Prepare the "Missing phyla" report (rows with no species at all)
    isEmpty = phylaDf['NumSpecies'] == 0
    missingPhylaReportDf = phylaDf[isEmpty].sort_values(by=sortKeys)

    missingPhylaReportDf.to_html('phyla_report_missing.html',
                                 columns=['Phylum', 'NumSpecies', 'Domain'])
    missingPhylaReportDf.to_excel('phyla_report_missing.xlsx',
                                  sheet_name='Missing Phyla Summary')

    # print counts
    print(domainCounts)

    # Display "skipped items" warning
    if skippedSpecies:
        separator = "=" * 50
        print(separator)
        print("Warning: Skipped %d species" % len(skippedSpecies))
        print(skippedCounts)
        print(separator)
}, index=pd.Index([], name='tax_id', dtype='int')) # Add kingdom data to the data-frame for k, v in taxidToKingdom.items(): df.loc[k, 'kingdom'] = v df.loc[k, 'full.name'] = getSpeciesName(k) df.loc[k, 'short.name'] = shortNames[k] assert (df.loc[k, 'kingdom'] == v) # Get list of large taxonomic groups (based on the lineages of all species) majorGroups = getMajorTaxonomicGroups(taxidToLineage) # Add a binary membership column for each major group for groupTaxId, _ in majorGroups: groupName = ncbiTaxa.get_taxid_translator([groupTaxId])[groupTaxId] groupName = "Member_%s_%d" % (groupName.replace(" ", "_").replace( "/", "_").replace("-", "_"), groupTaxId) groupDf = pd.DataFrame({groupName: pd.Series(dtype='bool')}, index=pd.Index(df.index.values, name='tax_id', dtype='int')) for taxId, lineage in taxidToLineage.items(): isMember = int(groupTaxId in lineage) groupDf.loc[taxId, groupName] = isMember df = pd.merge(df, groupDf, how='inner', left_index=True, right_index=True ) # Add the new column (is there an easier way to do this?)
def speciesStatisticsAndValidityReport(args):
    """Collect per-species statistics and write the species reports.

    For every species from `allSpeciesSource()` that is not in
    `speciesToExclude` (and, when `args.taxid` is non-empty, is listed in it)
    a row is created holding its names, domain, phylum, stored CDS count and
    annotated protein count / GC content.  The per-sequence statistics are
    computed through the `_distributed` scheduler: one
    `calcNativeSequencesStatistics` task per fraction of roughly 1000
    sequences and one `countShuffledProfiles` task per species.  Task results
    are merged per species and written into the table.

    Output files (current directory): `species_report.html`,
    `species_report_simple.rst`, `species_report_simple.html` and
    `species_report.xlsx`.

    Args:
        args: object with a `taxid` attribute; when truthy it acts as a
            whitelist of taxids to process.
    """
    import _distributed

    speciesDf = pd.DataFrame({
        'TaxId': pd.Series([], dtype='int'),                # Species TaxId
        'Species': pd.Series([], dtype='str'),              # Species binomial name
        'Nickname': pd.Series([], dtype='str'),
        'Domain': pd.Categorical([]),                       # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),                       # Phylum name (string)
        'NumCDSs': pd.Series([], dtype='int'),              # CDS count for this species
        'NumCDSsInProfile': pd.Series([], dtype='int'),     # Num seqs with 20 shuffled profiles for this species
        'AnnotatedNumCDSs': pd.Series([], dtype='int'),
        'CDSDifference': pd.Series([], dtype='float'),
        'NumNativeSeqs': pd.Series([], dtype='int'),
        'GCContentInCDS': pd.Series([], dtype='float'),
        'AnnotatedGCContent': pd.Series([], dtype='float'),
        'RowType': pd.Categorical([]),                      # Species count or total
        'Warnings': pd.Series([], dtype='str'),
        'CDSWarnings': pd.Series([], dtype='int'),
        'CDSWarnings_': pd.Series([], dtype='str'),
        'FirstAA': pd.Series([], dtype='str'),
        'LastAA': pd.Series([], dtype='str')
    })

    scheduler = _distributed.open()

    delayedCalls_native = []
    shuffledCounts = {}
    delayedCalls_shuffledProfiles = []

    fractionSize = 1000  # How many sequences (roughly) to process in each task

    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue  # always exclude species from the blacklist
        if args.taxid and taxId not in args.taxid:
            continue  # if a whitelist is specified, skip other species

        warnings = []

        cdsCountInRedis = countSpeciesCDS(taxId)

        annotatedProteinCount = getSpeciesProperty(taxId, 'protein-count')[0]
        annotatedGCContent = getSpeciesProperty(taxId, 'gc-content')[0]

        # Relative difference (in percent) between the annotated protein
        # count and the number of stored CDSs; flagged when above 9.9%.
        proteinDifference = None
        if annotatedProteinCount is not None:
            proteinDifference = (1.0 - float(cdsCountInRedis) /
                                 float(annotatedProteinCount)) * 100.0
            if abs(proteinDifference) > 9.9:
                warnings.append("CDS_count")
        else:
            warnings.append("No_CDS_count")

        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)
        ranks = ncbiTaxa.get_rank(lineage)

        # Determine kingdom/domain (superkingdom preferred, kingdom as fallback)
        kingdomTaxId = [t for t, rank in ranks.items() if rank == 'superkingdom']
        if not kingdomTaxId:
            kingdomTaxId = [t for t, rank in ranks.items() if rank == 'kingdom']
        domain = names[kingdomTaxId[0]]

        # Determine phylum (left empty when the lineage has none)
        phylumName = ""
        phylumTaxId = [t for t, rank in ranks.items() if rank == 'phylum']
        if phylumTaxId:
            phylumName = names[phylumTaxId[0]]

        speciesDf = speciesDf.append(
            pd.DataFrame({
                'TaxId': pd.Series([taxId], dtype='int'),               # Species TaxId
                'Species': pd.Series([getSpeciesName(taxId)], dtype='str'),
                'Nickname': pd.Series([shortNames[taxId]], dtype='str'),
                'Domain': pd.Categorical([domain]),                     # Bacteria, Eukaryota, Archaea
                'Phylum': pd.Categorical([phylumName]),                 # Phylum name (string)
                'NumCDSs': pd.Series([cdsCountInRedis], dtype='int'),   # CDS count for this species
                'NumCDSsInProfile': pd.Series([0], dtype='int'),        # Num seqs with 20 shuffled profiles
                'AnnotatedNumCDSs': pd.Series(
                    [0 if annotatedProteinCount is None else annotatedProteinCount],
                    dtype='int'),
                'CDSDifference': pd.Series([proteinDifference], dtype='float'),
                'NumNativeSeqs': pd.Series([0], dtype='int'),
                'GCContentInCDS': pd.Series([0.0], dtype='float'),
                'AnnotatedGCContent': pd.Series([annotatedGCContent], dtype='float'),
                'RowType': pd.Categorical(["species"]),                 # Species count or total
                'Warnings': pd.Series([", ".join(warnings)], dtype='str'),
                'CDSWarnings': pd.Series([0], dtype='int'),
                'CDSWarnings_': pd.Series([""], dtype='str'),
                'FirstAA': pd.Series([""], dtype='str'),
                'LastAA': pd.Series([""], dtype='str'),
                'Source': pd.Series([""], dtype='str')
            }))

        # Floor division keeps the task count an integer regardless of the
        # division semantics in effect; range() below requires an int.
        numFractions = cdsCountInRedis // fractionSize
        if numFractions == 0:
            numFractions = 1

        for i in range(numFractions):
            call = dask.delayed(calcNativeSequencesStatistics)(taxId, i, numFractions)
            delayedCalls_native.append(call)

        call = dask.delayed(countShuffledProfiles)(taxId, (310, 10, "begin", 0), 102, 11)
        delayedCalls_shuffledProfiles.append(call)

    speciesDf.set_index('TaxId', inplace=True)

    print("Starting {} calls...".format(
        len(delayedCalls_native) + len(delayedCalls_shuffledProfiles)))

    # submit all delayed calculations; obtain futures immediately
    futures = scheduler.compute(delayedCalls_native + delayedCalls_shuffledProfiles)

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress display is best-effort; completion is awaited below.
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    # taxId -> (cdsCount, gcCounts, totalCounts, cdsWarnings,
    #           warnings Counter, firstAA Counter, lastAA Counter),
    # summed over all fractions of that species.
    results = {}
    errorsCount = 0

    for f in futures:
        try:
            ret = scheduler.gather(f)

            # The two task types are told apart by the length of their result.
            if len(ret) == 9:
                (taxId, fraction, cdsCount, gcCounts, totalCounts,
                 cdsWarnings, warnings, firstAA, lastAA) = ret

                current = results.get(
                    taxId, (0, 0, 0, 0, Counter(), Counter(), Counter()))

                results[taxId] = (current[0] + cdsCount,
                                  current[1] + gcCounts,
                                  current[2] + totalCounts,
                                  current[3] + cdsWarnings,
                                  current[4] + warnings,
                                  current[5] + firstAA,
                                  current[6] + lastAA)

            elif len(ret) == 2:
                (taxId, numShuffledSeqs) = ret
                shuffledCounts[taxId] = numShuffledSeqs

            else:
                assert (False)

        except Exception as e:
            # A failed task must not abort the report; it is counted and
            # summarized below.
            print(e)
            errorsCount += 1

    if errorsCount:
        print("Warning: %d task(s) failed; affected statistics are incomplete" % errorsCount)

    for taxId, result in results.items():
        (numNativeSeqs, gcCounts, totalCounts, cdsWarnings, warnings,
         firstAA, lastAA) = result
        speciesDf.at[taxId, 'NumNativeSeqs'] = numNativeSeqs

        if totalCounts:
            gcPercent = round(float(gcCounts) / float(totalCounts) * 100.0, 1)
        else:
            # No nucleotides were counted; avoid a division by zero.
            gcPercent = float('nan')
        speciesDf.at[taxId, 'GCContentInCDS'] = gcPercent

        speciesDf.at[taxId, 'CDSWarnings'] = cdsWarnings
        speciesDf.at[taxId, 'CDSWarnings_'] = summarizeCounter(warnings)
        speciesDf.at[taxId, 'FirstAA'] = summarizeCounter(firstAA)
        speciesDf.at[taxId, 'LastAA'] = summarizeCounter(lastAA)

    for taxId, result in shuffledCounts.items():
        speciesDf.at[taxId, 'NumCDSsInProfile'] = result

    speciesDf = speciesDf.sort_values(by=['Domain', 'Species'])  # sort rows

    speciesDf.to_html('species_report.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                          'FirstAA', 'LastAA'
                      ])

    with open("species_report_simple.rst", "w") as f:
        f.write(
            speciesDf.drop([
                'RowType', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                'FirstAA', 'LastAA', 'CDSDifference'
            ], axis=1).pipe(tabulate, headers='keys', tablefmt='rst'))

    speciesDf.to_html('species_report_simple.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain'
                      ])

    speciesDf.to_excel('species_report.xlsx', sheet_name='Species summary')