# TODO: Add support for step-size >1

# Establish DB connections
#r = redis.StrictRedis(host=config.host, port=config.port, db=config.db)
#session = db.Session()

# Tallies initialised to zero before the per-species loop below.
skipped = 0
selected = 0
alreadyCompleted = 0
totalMissingResults = 0

# Starts out empty; nothing in the visible part of the script appends to it.
queuedDelayedCalls = []

for taxIdForProcessing in species:
    # Announce the species: its CDS count, tax-id and name.
    print("Processing %d sequences for tax-id %d (%s)..." %
          (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing,
           getSpeciesName(taxIdForProcessing)))

    # A fresh Counter is created for every species.
    stats = Counter()

    # Iterate over all CDS entries for this species
    # TODO - preloading all sequences and results should optimize this
    for protId in SpeciesCDSSource(taxIdForProcessing):

        # Count every protein-id yielded for this species.
        stats['all-sequences'] += 1

        #protId = codecs.decode(protId)
        # Filtering

        # Only process 1/N of the sequences, selected randomly (N=randomFraction)
        # (if randomFraction==1, all sequences will be processed)
# metadata server (redis)
r = redis.StrictRedis(host=config.host,
                      port=config.port,
                      db=config.db,
                      password=config.password)
# sequences server (mysql)
session = db.Session()

# Starts out empty; nothing in this fragment adds to it.
visitedProteinIds = set()

# The species-name key for this tax-id must already exist in redis.
assert (r.exists("species:taxid:%d:name" % taxId))

if (seqSourceTag == db.Sources.External):
    # Clear any previously imported CDSs...
    #r.delete(speciesCDSList % (taxId,))
    # NOTE: the delete above is disabled. Instead the script aborts (exit
    # status -1) when sequences already exist, unless args.dry_run is set.
    count = data_helpers.countSpeciesCDS(taxId)
    if (count > 0 and (not args.dry_run)):
        print("%d sequences already exist for specied %d. Aborting..." %
              (count, taxId))
        sys.exit(-1)
elif (sequenceType == "fixCDSkey"):
    r.delete(speciesCDSList % (taxId, ))
    # Delete and reconstruct the CDS key
else:
    # Any other case requires at least one existing CDS for the species.
    assert (data_helpers.countSpeciesCDS(taxId) > 0)


def getCrc(seq):
    """Return the CRC-32 checksum of a sequence, ignoring letter case.

    ``seq`` is converted with ``str()`` and lower-cased, so inputs that differ
    only in case produce the same checksum.  The text is encoded to bytes
    before hashing, because ``crc32`` only accepts bytes-like input on
    Python 3.  The result is masked to an unsigned 32-bit integer.
    """
    return crc32(str(seq).lower().encode("utf-8")) & 0xffffffff

# Command-line options: --taxId is mandatory; --keep-first-n-shuffles is
# optional and defaults to None.
argsParser.add_argument("--taxId", type=int, required=True)
argsParser.add_argument("--keep-first-n-shuffles", type=int, default=None)
args = argsParser.parse_args()

# Configuration
taxId = args.taxId

#statsShuffles = RunningStats()
statsShuffles = OfflineStats()

recordsCount = 0
warningsCount = 0

# Gates the progress message below.
# NOTE(review): presumably 30 is an interval in seconds - confirm in RateLimit.
rl = RateLimit(30)

# Used as the denominator of the progress percentage.
total = countSpeciesCDS(taxId)

for protId in SpeciesCDSSource(taxId):
    cds = CDSHelper(taxId, protId)

    # Whatever dropShuffledSeqs() returns is pushed into statsShuffles.
    statsShuffles.push(
        cds.dropShuffledSeqs(lastItemToKeep=args.keep_first_n_shuffles))

    recordsCount += 1

    # Print progress only when the rate limiter allows it.
    if (rl()):
        print("processed %d records (%.2g%%)" %
              (recordsCount, float(recordsCount) / total * 100))

    # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY # DEBUG ONLY #
    #if( recordsCount > 20 ):
def runDistributed():
    """Compute and store 'paired-mRNA-fraction' for a random sample of species.

    Every sampled species that does not have the property yet is split into
    ``size // fractionSize`` fractions (at least one).  Each fraction becomes
    one delayed ``calcNativePairedFraction`` call submitted through the
    ``_distributed`` scheduler.  The per-fraction counts are summed per
    species, and the ratio of paired to total nucleotides is stored with
    ``setSpeciesProperty`` (``overwrite=False``).

    Failures while gathering individual futures are printed and counted, but
    do not abort the run.

    Returns:
        dict: taxId -> (cdsCount, countPairedNucleotides,
        countTotalNucleotides, set of fraction indices received).
    """
    import _distributed
    import dask

    scheduler = _distributed.open()

    delayedCalls = []

    fractionSize = 20

    for taxId in allSpeciesSource():

        # Random sampling: only continue when randint(0, 20) returns 0
        if randint(0, 20) > 0:
            continue

        # Skip species that already have a stored value
        if getSpeciesProperty(taxId, 'paired-mRNA-fraction')[0] is not None:
            continue

        size = countSpeciesCDS(taxId)
        if size <= 0:
            continue  # nothing to compute for a species without CDSs

        # Integer division (range() rejects floats); species smaller than
        # fractionSize still get a single task instead of being dropped.
        numFractions = max(1, size // fractionSize)
        for i in range(numFractions):
            call = dask.delayed(calcNativePairedFraction)(taxId, i,
                                                          numFractions)
            delayedCalls.append(call)

    print("Starting %d calls..." % len(delayedCalls))

    futures = scheduler.compute(
        delayedCalls
    )  # submit all delayed calculations; obtain futures immediately

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress reporting is best-effort; the wait() below still blocks
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    for f in futures:
        try:
            (taxId, fraction, cdsCount, countPairedNucleotides,
             countTotalNucleotides) = scheduler.gather(f)

            current = results.get(taxId, (0, 0, 0, set()))

            # Accumulate the counts and remember which fractions arrived
            results[taxId] = (current[0] + cdsCount,
                              current[1] + countPairedNucleotides,
                              current[2] + countTotalNucleotides,
                              current[3] | {fraction})

        except Exception as e:
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        # The collected fraction indices must be gap-free and start at 0,
        # i.e. their count must equal the highest index + 1
        if len(result[3]) != max(result[3]) + 1:
            #raise Exception("Found invalid number of items for taxId=%d" % taxId)
            print("Found invalid number of items for taxId=%d" % taxId)
            continue

        if not result[2]:
            # Avoid ZeroDivisionError when no nucleotides were counted
            print("No nucleotides counted for taxId=%d" % taxId)
            continue

        fraction = float(result[1]) / result[2]

        setSpeciesProperty(taxId,
                           "paired-mRNA-fraction",
                           "%.4g" % fraction,
                           "computed (v3)",
                           overwrite=False)

        print("TaxId: %d\t\tFraction: %.4g" % (taxId, fraction))

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    return results
# Example #5
# 0
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
import sys
from time import sleep
from data_helpers import SpeciesCDSSource, CDSHelper, getSpeciesName, countSpeciesCDS, matchCDSKeyNamesSource, r
from rate_limit import RateLimit

# The tax-id is taken from the first command-line argument.
taxId = int(sys.argv[1])

rl = RateLimit(10)

# Nothing to do for a species without stored CDS entries; exit with status 0.
if (countSpeciesCDS(taxId) == 0):
    print("Species %d (%s) doesn't have any proteins..." %
          (taxId, getSpeciesName(taxId)))
    print("Nothing left to do...")
    sys.exit(0)

print("Species %d (%s) has %d proteins stored." %
      (taxId, getSpeciesName(taxId), countSpeciesCDS(taxId)))
print("Will delete it in 10 seconds...")
# Pause for 10 seconds after the warning above, before the loop starts.
sleep(10)

count = 0

# Print each protein-id and build its CDSHelper.
for protId in SpeciesCDSSource(taxId):
    print(protId)
    cds = CDSHelper(taxId, protId)
def standalone():
    """Command-line entry point: import sequences from a fasta file for one species.

    Parses the command line, reads fasta records from ``--input`` (plain,
    ``.gz`` or ``.bz2``), filters them with ``--variant``-specific rules and
    then, depending on ``--type``:

    * ``cds`` / ``shuffle`` - stores every accepted record via
      ``storeSeqInDB``;
    * ``fixCDSkey`` - deletes the species' CDS set in redis and re-adds every
      protein-id for which ``CDSHelper.seqId()`` returns a value.

    With ``--dry-run`` nothing is written to the databases.  With
    ``--output-fasta`` the accepted records are also written to that file
    (in dry-run mode as well).

    Exits with status -1 when ``--type cds`` is requested for a species that
    already has sequences (unless ``--dry-run`` is given).

    Raises:
        Exception: if the output fasta equals the input file, or the
            sequence type is unknown.
    """
    argsParser = argparse.ArgumentParser()
    # --taxid and --input are used unconditionally below, so argparse should
    # report them as missing instead of a later TypeError.
    argsParser.add_argument("--taxid", type=int, required=True)
    argsParser.add_argument("--input", required=True)
    argsParser.add_argument("--variant",
                            type=parseOption(
                                set(("yeastgenome", "NCBI", "Ensembl", "JGI")),
                                "variant"))
    argsParser.add_argument("--type",
                            type=parseOption(
                                set(("cds", "shuffle", "fixCDSkey")),
                                "sequence type"))
    argsParser.add_argument("--dry-run", action="store_true", default=False)
    argsParser.add_argument("--output-fasta")
    argsParser.add_argument("--gene-ids-file")
    argsParser.add_argument("--alt-protein-ids",
                            type=parseOption(set(("locus_tag", )),
                                             "alt-protein-id"))
    argsParser.add_argument("--headers-from-another-fasta")
    argsParser.add_argument("--ignore-id-check",
                            action="store_true",
                            default=False)
    args = argsParser.parse_args()

    if (args.output_fasta):
        if (args.output_fasta == args.input):
            raise Exception("Fasta output file cannot match input file!")

    #if( len(sys.argv) < 5 ):
    #    print("Usage: %s <taxid> <fasta-file> <fasta-variant> <cds|shuffle>" % (sys.argv[0],))
    #    sys.exit(-1)

    # command-line arguments
    taxId = args.taxid
    # gzip.open()/bz2.open() return bytes for mode "r"; "rt" yields str lines,
    # the same as the plain open() branch.
    if args.input.endswith(".gz"):
        f = gzip.open(args.input, "rt")
    elif args.input.endswith(".bz2"):
        import bz2  # local import: only needed for this input type
        f = bz2.open(args.input, "rt")
    else:
        f = open(args.input, 'r')
    #sequenceFormat = args.variant
    sequenceType = args.type

    if (sequenceType == "cds"):
        seqSourceTag = db.Sources.External
    elif (sequenceType == "shuffle"):
        seqSourceTag = db.Sources.ShuffleCDSv2_matlab
    elif (sequenceType == "fixCDSkey"):
        seqSourceTag = None
    else:
        raise Exception("Unknown sequence type '%s'" % sequenceType)

    # establish connections
    # metadata server (redis)
    #r = redis.StrictRedis(host=config.host, port=config.port, db=config.db, password=config.password)
    # sequences server (mysql)
    #session = db.Session()

    visitedProteinIds = set()

    assert (r.exists("species:taxid:%d:name" % taxId))

    if (seqSourceTag == db.Sources.External):
        # Clear any previously imported CDSs...
        #r.delete(speciesCDSList % (taxId,))
        count = data_helpers.countSpeciesCDS(taxId)
        if (count > 0 and (not args.dry_run)):
            print("%d sequences already exist for specied %d. Aborting..." %
                  (count, taxId))
            sys.exit(-1)
    elif (sequenceType == "fixCDSkey"):
        # Delete and reconstruct the CDS key.  Not in dry-run mode: the key
        # is only rebuilt further down when dry-run is off, so deleting it
        # here would leave the species without its CDS set.
        if (not args.dry_run):
            r.delete(speciesCDSList % (taxId, ))
    else:
        assert (data_helpers.countSpeciesCDS(taxId) > 0)

    # Raw strings: "\d", "\[" etc. are invalid escapes in normal literals
    reNuclearYeastGene = re.compile(r"Y[A-P][RL]\d+[CW](-[A-Z])?")
    geneIdsToInclude = set()
    if (args.gene_ids_file):
        with open(args.gene_ids_file, "r") as genesFile:
            for geneId in genesFile:
                geneIdsToInclude.add(geneId.rstrip())

    reNCBIattributes = re.compile(r"\[(\S+)=([^\]]+)\]")
    reNCBIbareheader = re.compile(r"\w+\|\w+\.\d+_cds_(\w+.\d+)_\d+")
    outRecords = []

    # Optional: take record descriptions from a second fasta (keyed by id)
    headersFromAnotherFasta = {}
    if args.headers_from_another_fasta:
        with open(args.headers_from_another_fasta, "r") as f2:
            for record in SeqIO.parse(f2, "fasta", alphabet=generic_dna):
                assert (not record.id in headersFromAnotherFasta)
                headersFromAnotherFasta[record.id] = record.description

    cdsCount = 0
    notFoundCount = 0
    skippedCount = 0
    # Maps every unambiguous nucleotide to '%' so they can be counted at once
    validNucleotideChars = str.maketrans("ACGTacgt", "%%%%%%%%")
    #print("Opening fasta file: {}".format(f))
    for record in SeqIO.parse(f, "fasta", alphabet=generic_dna):
        #proteinId = regexLocusId.match(record.id).group(1) # Work-around for multiple-transcript identifiers in JGI's Chlamy genome

        if args.headers_from_another_fasta:
            record.description = headersFromAnotherFasta[record.id]

        numNonNucleotideChars = len(record.seq) - str(
            record.seq).translate(validNucleotideChars).count("%")
        if numNonNucleotideChars:
            print(
                "Skipping record %s, containing non-nucleotide or ambiguous symbols '%s'"
                % (record.id, numNonNucleotideChars))
            skippedCount += 1
            continue

        # yeastgenome.org - skip suspected pseudo-genes
        if (args.variant == "yeastgenome"
                and record.description.find("Dubious ORF") != -1):
            skippedCount += 1
            continue

        # yeastgenome.org - skip mitochondrial genes
        if (args.variant == "yeastgenome"):
            geneType = record.id[0]
            if geneType == "Q" or geneType == "R":
                skippedCount += 1
                continue

        # yeastgenome.org - verify gene-id conforms to: http://www.yeastgenome.org/help/community/nomenclature-conventions
        if (args.variant == "yeastgenome"):
            geneId = record.id
            assert (reNuclearYeastGene.match(geneId))

        # Obtain attributes mapping
        attributes = []
        if (args.variant == "NCBI"):
            attributes = dict(re.findall(reNCBIattributes, record.description))

        if (args.variant == "NCBI"):
            if ('pseudo' in attributes and attributes['pseudo'] == 'true'):
                print("Skipping pseudo-gene entry %s" % (record.id, ))
                skippedCount += 1
                continue

        # Determine gene id
        proteinId = None
        additionalProteinIds = set()
        altProteinId = None
        if (args.variant == "yeastgenome"):
            proteinId = record.id
        elif (args.variant == "NCBI"):
            if (sequenceType == "shuffle" and not attributes):
                #Workaround for shuffle-seq files missing the header...
                #Extract the protein-id from sequence-id like this:
                #>lcl|NC_002516.2_cds_NP_064721.1_1
                if not args.alt_protein_ids:
                    proteinId = reNCBIbareheader.match(record.id).group(1)

                elif args.alt_protein_ids == "locus_tag":
                    if ('locus_tag' not in attributes):
                        print("Skipping entry %s missing locus_tag - %s" %
                              (record.id, attributes))
                        skippedCount += 1
                        continue
                    proteinId = attributes['locus_tag']
                    print(proteinId)
                else:
                    assert False

            else:
                # Note - not currently used
                #if 'db_xref' in attributes:
                #    _db_xrefs = attributes['db_xref'].split(",")
                #    db_xrefs = dict(map( lambda x: tuple(x.split(":")), _db_xrefs))
                if not args.alt_protein_ids:
                    if ('protein_id' not in attributes):
                        print("Skipping entry %s missing protein_id - %s" %
                              (record.id, attributes))
                        skippedCount += 1
                        continue

                    proteinId = attributes['protein_id']
                elif args.alt_protein_ids == "locus_tag":
                    if ('locus_tag' not in attributes):
                        print("Skipping entry %s missing locus_tag - %s" %
                              (record.id, attributes))
                        skippedCount += 1
                        continue
                    proteinId = attributes['locus_tag']

                    if ('protein_id' in attributes):
                        altProteinId = attributes['protein_id']

                else:
                    assert (False)

        elif (args.variant == "Ensembl"):
            # Sample id: ABD29211.1
            dotPos = record.id.rfind('.')
            if (dotPos > 3):
                proteinId = record.id[:dotPos]
                additionalProteinIds.add(
                    record.id
                )  # also allow matching the full format (including the transcript-id) - some CDS files include it...

            else:
                proteinId = record.id

        elif (args.variant == "JGI"):
            # Variant 1 (Phytozome, Mpus)
            #  (gff3):  60050
            #  (fasta): 60050
            # Variant 2 (Phytozome, Dsal)
            #  (gff3):  Dusal.1637s00001.1
            #  (fasta): Dusal.1637s00001.1
            # Variant 3:
            #  (gff3):  jgi.p|Ostta1115_2|10314
            #  (fasta): jgi|Ostta1115_2|10314|CE10313_131

            proteinId = record.id

            if record.id.startswith("jgi|"):
                parts = record.id.split('|')
                parts[0] = 'jgi.p'  # add the '.p'
                additionalProteinIds.add('|'.join(
                    parts[:3]))  # drop the suffix (parts[4])

        else:
            assert (False)

        if not args.ignore_id_check:
            assert (len(proteinId) > 2)

        # Skip sequences that have non-standard translations
        if (args.variant == "NCBI"):
            if "transl_except" in attributes:
                print("Skipping %s (because of transl_except)" % (proteinId, ))
                skippedCount += 1
                continue

        # If an inclusion list (white list) is defined, skip sequences missing from it
        if args.gene_ids_file:
            if (proteinId not in geneIdsToInclude):
                # Also try the additional ids
                if (not geneIdsToInclude.intersection(additionalProteinIds)):
                    print("Skipping %s (sequence %s, alternate ids=%s)" %
                          (proteinId, record.id, list(additionalProteinIds)))
                    skippedCount += 1
                    continue

        print("Inserting %s (sequence %s)..." % (proteinId, record.id))

        # Verify there are no duplicates entries
        if (proteinId in visitedProteinIds):
            # '%' (not ',') so the id is actually formatted into the message
            print("MULTIPLE Entry: %s" % proteinId)
            skippedCount += 1
            continue
        #assert(proteinId not in visitedProteinIds)
        visitedProteinIds.add(proteinId)

        # Write the filtered sequences into an output file (if needed)
        # Note - this also works in dry-run...
        if (args.output_fasta):
            outRecords.append(record)

        if (args.dry_run):
            continue

        if (sequenceType == "fixCDSkey"):
            cds = data_helpers.CDSHelper(taxId, proteinId)
            seqId = cds.seqId()
            if (not seqId is None):
                r.sadd(speciesCDSList % (taxId, ), proteinId)
            else:
                print("Couldn't find entry for proteinId=%s" % proteinId)

            continue  # Skip the rest of the processing...

        storeSeqInDB(nucSeq=record.seq,
                     taxId=taxId,
                     proteinId=proteinId,
                     seqSourceTag=seqSourceTag)

        cdsCount += 1

    # All records were read; release the input file handle
    f.close()

    if (notFoundCount + skippedCount > 0):
        print("Warning: %d entries skipped and %d entries not found" %
              (skippedCount, notFoundCount))

    print("Processed %d CDS entries" % (cdsCount, ))
    print("(out of %d CDS entries for this species)" %
          (r.scard("species:taxid:%d:CDS" % (taxId, ))))

    if (args.output_fasta):
        with open(args.output_fasta, "w") as outfile:
            SeqIO.write(outRecords, outfile, "fasta")
# Example #7
# 0
def readSeriesResultsForSpecies(seriesSourceNumber,
                                species,
                                minShuffledGroups=20,
                                maxShuffledGroups=20,
                                shuffleType=db.Sources.ShuffleCDSv2_python,
                                cdsFilter=None,
                                returnCDS=True):
    """Yield the computed series results of every qualifying CDS of the given species.

    This is a generator; nothing is read before the first item is requested.

    Args:
        seriesSourceNumber: passed through to ``getAllComputedSeqsForSpecies``.
        species: a single taxid, or a non-string iterable of taxids.
        minShuffledGroups: a CDS is skipped unless at least this many of its
            shuffled sequences (and its native sequence) have results.
        maxShuffledGroups: at most this many shuffled results are returned
            per CDS (in addition to the native one).
        shuffleType: which kind of shuffled sequences to look up.
        cdsFilter: optional predicate taking a ``CDSHelper``; CDSs for which
            it returns a false value are ignored.
        returnCDS: when true, each yielded dict also has a ``"cds"`` entry.

    Yields:
        dict with ``"taxid"``, ``"content"`` (list of decoded records - native
        sequence first, then the shuffles - with ``None`` for missing ones)
        and, if ``returnCDS``, ``"cds"``.

    Raises:
        Exception: if ``species`` is a string.
    """
    if isinstance(
            species, Iterable
    ):  # usually, species will be a sequence of numeric taxid values
        # (str, bytes) instead of the Python-2-only 'basestring', which is a
        # NameError on Python 3
        if isinstance(species, (str, bytes)):
            raise Exception("species cannot be string")
        # all set - proceed...
    else:
        species = (species, )  # assume we got a single (numeric) taxid value
    assert (minShuffledGroups <= maxShuffledGroups)

    for taxIdForProcessing in species:
        print("Procesing %d sequences for tax-id %d (%s)..." %
              (countSpeciesCDS(taxIdForProcessing), taxIdForProcessing,
               getSpeciesName(taxIdForProcessing)))

        computed = getAllComputedSeqsForSpecies(seriesSourceNumber,
                                                taxIdForProcessing,
                                                maxShuffledGroups,
                                                shuffleType=shuffleType)
        computedIds = frozenset(computed.keys())
        print("Collecting data from %d computation results..." % len(computed))

        # Progress counters (reported through the rate-limited print below)
        skipped = 0
        selected = 0

        # Iterate over all CDS entries for this species
        for protId in SpeciesCDSSource(taxIdForProcessing):
            cds = CDSHelper(taxIdForProcessing, protId)

            if (cdsFilter is not None) and (not cdsFilter(cds)):
                continue

            cdsSeqId = cds.seqId()

            shuffledIds = cds.shuffledSeqIds(shuffleType=shuffleType)

            # How many shuffles (for this cds) exist in the data we found?
            computedShufflesCount = len(
                computedIds.intersection(frozenset(shuffledIds)))

            if (computedShufflesCount < minShuffledGroups
                    or (cdsSeqId not in computedIds)):
                #print("%s - found only %d groups, skipping" % (protId, computedShufflesCount))
                skipped += 1
                continue

            # Get the computed results for this CDS
            # (native sequence first, followed by its shuffles)
            seqIds = [cds.seqId()]
            seqIds.extend(cds.shuffledSeqIds(shuffleType=shuffleType))
            if (len(seqIds) > maxShuffledGroups + 1):
                seqIds = seqIds[:maxShuffledGroups + 1]
            results = [computed.get(x) for x in seqIds]

            # Re-check after truncation to maxShuffledGroups
            if sum(1 for x in results if x is not None) < minShuffledGroups:
                print("Not enough results found for %s" % protId)
                skipped += 1
                continue

            # Decode the results (missing entries stay None)
            results = [
                decodeJsonSeriesRecord(decompressSeriesRecord(x))
                if x is not None else None for x in results
            ]
            if (returnCDS):
                yield {
                    "taxid": taxIdForProcessing,
                    "content": results,
                    "cds": cds
                }
            else:
                yield {"taxid": taxIdForProcessing, "content": results}
            # Drop the references before fetching the next CDS
            del results
            del cds
            selected += 1

            if (rl()):
                print("# %s - %d records included, %d records skipped" %
                      (datetime.now().isoformat(), selected, skipped))
# Example #8
# 0
def speciesStatisticsAndValidityReport(args):
    """Build a per-species statistics table and write it out as report files.

    For every species from ``allSpeciesSource()`` (minus ``speciesToExclude``,
    and restricted to ``args.taxid`` when that is non-empty) one table row is
    created from the stored properties and the NCBI taxonomy.  Delayed
    ``calcNativeSequencesStatistics`` and ``countShuffledProfiles`` calls are
    then run through the ``_distributed`` scheduler and their results merged
    into the table.

    Failures while gathering individual futures are printed and counted, but
    do not abort the report.

    Args:
        args: object with a ``taxid`` attribute - an optional collection of
            tax-ids acting as a whitelist.

    Writes ``species_report.html``, ``species_report_simple.rst``,
    ``species_report_simple.html`` and ``species_report.xlsx`` into the
    current directory.  Returns nothing.
    """
    import _distributed
    import dask

    speciesDf = pd.DataFrame({
        'TaxId': pd.Series([], dtype='int'),  # Species TaxId
        'Species': pd.Series([], dtype='str'),  # Species binomial name
        'Nickname': pd.Series([], dtype='str'),
        'Domain': pd.Categorical([]),  # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),  # Phylum name (string)
        'NumCDSs': pd.Series([], dtype='int'),  # CDS count for this species
        'NumCDSsInProfile':
        pd.Series([], dtype='int'
                  ),  # Num seqs with 20 shuffled profiles for this species
        'AnnotatedNumCDSs': pd.Series([], dtype='int'),  # 
        'CDSDifference': pd.Series([], dtype='float'),  # 
        'NumNativeSeqs': pd.Series([], dtype='int'),  # 
        'GCContentInCDS': pd.Series([], dtype='float'),  # 
        'AnnotatedGCContent': pd.Series([], dtype='float'),  # 
        'RowType': pd.Categorical([]),  # Species count or total
        'Warnings': pd.Series([], dtype='str'),  # 
        'CDSWarnings': pd.Series([], dtype='int'),  # 
        'CDSWarnings_': pd.Series([], dtype='str'),  # 
        'FirstAA': pd.Series([], dtype='str'),  # 
        'LastAA': pd.Series([], dtype='str')  # 
    })

    scheduler = _distributed.open()

    delayedCalls_native = []

    shuffledCounts = {}
    delayedCalls_shuffledProfiles = []

    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue  # always exclude species from the blacklist
        if args.taxid and taxId not in args.taxid:
            continue  # if a whitelist is specified, skip other species

        warnings = []

        ## DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ##
        #if randint(0, 20) > 0:
        #    continue
        ## DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ##

        cdsCountInRedis = countSpeciesCDS(taxId)

        #cdsCountProfiles = countx(taxId, (310, 10, "begin", 0), 102, 11)

        annotatedProteinCount = getSpeciesProperty(taxId, 'protein-count')[0]

        annotatedGCContent = getSpeciesProperty(taxId, 'gc-content')[0]

        # Relative difference (in %) between stored and annotated CDS counts
        proteinDifference = None
        if annotatedProteinCount is not None:
            proteinDifference = (1.0 - float(cdsCountInRedis) /
                                 float(annotatedProteinCount)) * 100.0

            if abs(proteinDifference) > 9.9:
                warnings.append("CDS_count")
        else:
            warnings.append("No_CDS_count")

        # Determine phylum
        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)

        ranks = ncbiTaxa.get_rank(lineage)

        # Determine kingdom/domain
        domain = ""
        kingdomTaxId = [
            t for t, rank in ranks.items() if rank == 'superkingdom'
        ]
        if not kingdomTaxId:
            kingdomTaxId = [
                t for t, rank in ranks.items() if rank == 'kingdom'
            ]
        if kingdomTaxId:
            domain = names[kingdomTaxId[0]]
        # else: neither rank is present in the lineage - keep domain empty
        # instead of failing with IndexError

        phylumName = ""
        # Determine phylum
        phylumTaxId = [t for t, rank in ranks.items() if rank == 'phylum']
        if phylumTaxId:
            phylumName = names[phylumTaxId[0]]

        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        speciesDf = pd.concat([
            speciesDf,
            pd.DataFrame({
                'TaxId':
                pd.Series([taxId], dtype='int'),  # Species TaxId
                'Species':
                pd.Series([getSpeciesName(taxId)], dtype='str'),
                'Nickname':
                pd.Series([shortNames[taxId]], dtype='str'),
                'Domain':
                pd.Categorical([domain]),  # Bacteria, Eukaryota, Archaea
                'Phylum':
                pd.Categorical([phylumName]),  # Phylum name (string)
                'NumCDSs':
                pd.Series([cdsCountInRedis],
                          dtype='int'),  # CDS count for this species
                'NumCDSsInProfile':
                pd.Series([0],
                          dtype='int'),  # Num seqs with 20 shuffled profiles
                'AnnotatedNumCDSs':
                pd.Series([
                    0
                    if annotatedProteinCount is None else annotatedProteinCount
                ],
                          dtype='int'),  # 
                'CDSDifference':
                pd.Series([proteinDifference], dtype='float'),  # 
                'NumNativeSeqs':
                pd.Series([0], dtype='int'),  # 
                'GCContentInCDS':
                pd.Series([0.0], dtype='float'),  # 
                'AnnotatedGCContent':
                pd.Series([annotatedGCContent], dtype='float'),  # 
                'RowType':
                pd.Categorical(["species"]),  # Species count or total
                'Warnings':
                pd.Series([", ".join(warnings)], dtype='str'),  #
                'CDSWarnings':
                pd.Series([0], dtype='int'),
                'CDSWarnings_':
                pd.Series([""], dtype='str'),
                'FirstAA':
                pd.Series([""], dtype='str'),
                'LastAA':
                pd.Series([""], dtype='str'),
                'Source':
                pd.Series([""], dtype='str')
            })
        ])

        fractionSize = 1000  # How many sequences (roughly) to process in each task
        # Integer division: range() below does not accept a float
        numFractions = cdsCountInRedis // fractionSize
        if numFractions == 0: numFractions = 1

        for i in range(numFractions):
            # DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #
            #if i%100!=5: continue
            # DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #

            call = dask.delayed(calcNativeSequencesStatistics)(taxId, i,
                                                               numFractions)
            delayedCalls_native.append(call)

        call = dask.delayed(countShuffledProfiles)(taxId,
                                                   (310, 10, "begin", 0), 102,
                                                   11)
        delayedCalls_shuffledProfiles.append(call)

    speciesDf.set_index('TaxId', inplace=True)

    print("Starting {} calls...".format(
        len(delayedCalls_native) + len(delayedCalls_shuffledProfiles)))

    futures = scheduler.compute(
        delayedCalls_native + delayedCalls_shuffledProfiles
    )  # submit all delayed calculations; obtain futures immediately

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Progress reporting is best-effort; the wait() below still blocks
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}

    errorsCount = 0
    for f in futures:
        try:
            ret = scheduler.gather(f)
            # The two kinds of tasks are told apart by the length of their
            # return tuple: 9 items = native statistics, 2 = shuffled count
            if (len(ret) == 9):
                (taxId, fraction, cdsCount, gcCounts, totalCounts, cdsWarnings,
                 warnings, firstAA, lastAA) = ret

                current = None
                if taxId in results:
                    current = results[taxId]
                else:
                    current = (0, 0, 0, 0, Counter(), Counter(), Counter())

                current = (current[0] + cdsCount, current[1] + gcCounts,
                           current[2] + totalCounts, current[3] + cdsWarnings,
                           current[4] + warnings, current[5] + firstAA,
                           current[6] + lastAA)

                results[taxId] = current

            elif (len(ret) == 2):
                (taxId, numShuffledSeqs) = ret
                shuffledCounts[taxId] = numShuffledSeqs

            else:
                assert (False)

        except Exception as e:
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        (numNativeSeqs, gcCounts, totalCounts, cdsWarnings, warnings, firstAA,
         lastAA) = result
        speciesDf.at[taxId, 'NumNativeSeqs'] = numNativeSeqs

        # Keep the initial 0.0 when nothing was counted (avoids a
        # ZeroDivisionError that would abort the whole report)
        if totalCounts:
            speciesDf.at[taxId, 'GCContentInCDS'] = round(
                float(gcCounts) / float(totalCounts) * 100.0, 1)

        speciesDf.at[taxId, 'CDSWarnings'] = cdsWarnings

        speciesDf.at[taxId, 'CDSWarnings_'] = summarizeCounter(warnings)
        speciesDf.at[taxId, 'FirstAA'] = summarizeCounter(firstAA)
        speciesDf.at[taxId, 'LastAA'] = summarizeCounter(lastAA)

        #if numNativeSeqs < species.at[taxId, 'NumCDSs']:
        #    pass

    for taxId, result in shuffledCounts.items():
        speciesDf.at[taxId, 'NumCDSsInProfile'] = result

    speciesDf = speciesDf.sort_values(by=['Domain', 'Species'])  # sort rows
    speciesDf.to_html('species_report.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                          'FirstAA', 'LastAA'
                      ])

    with open("species_report_simple.rst", "w") as f:
        f.write(
            speciesDf.drop([
                'RowType', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                'FirstAA', 'LastAA', 'CDSDifference'
            ],
                           axis=1).pipe(tabulate,
                                        headers='keys',
                                        tablefmt='rst'))

    speciesDf.to_html('species_report_simple.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain'
                      ])

    speciesDf.to_excel('species_report.xlsx', sheet_name='Species summary')