def getSpeciesToInclude():
    ret = []
    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue
        ret.append(taxId)
    return ret
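

# Equivalent, more compact form of the filter above (same semantics; shown for
# reference only):
#     return [t for t in allSpeciesSource() if t not in speciesToExclude]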
def run():

    positiveDict = dict(positiveGroup)
    negativeDict = dict(negativeGroup)

    totalCount = 0
    positiveCount = 0
    negativeCount = 0

    for taxId in allSpeciesSource():

        totalCount += 1

        #if not getSpeciesProperty(taxId, 'algae')[0] is None:
        #    continue

        lineage = frozenset(ncbiTaxa.get_lineage(taxId))

        algeaClassification = None

        if lineage.intersection(algaeDefinition_ExcludedGroups):
            algeaClassification = ('No', 'Excluded taxonomic group')

        elif taxId in positiveDict:
            algeaClassification = ('Yes', positiveDict[taxId])

        elif taxId in negativeDict:
            algeaClassification = ('No', negativeDict[taxId])

        if algeaClassification is not None:
            setSpeciesProperty(taxId,
                               "algae",
                               algeaClassification[0],
                               algeaClassification[1],
                               overwrite=True)

            # Done; update counts
            if algeaClassification[0] == 'Yes':
                positiveCount += 1

                if lineage.intersection(algaeDefinition_ExcludedGroups):
                    print("Warning: possible false annotation: %d" % taxId)
                if not lineage.intersection(algaeDefinition_IncludedGroups):
                    print("Warning: possible false annotation: %d" % taxId)
            elif algeaClassification[0] == 'No':
                negativeCount += 1
            else:
                assert (False)
        else:
            if lineage.intersection(algaeDefinition_IncludedGroups):
                print("Warning: check unannotated possible algae: %d" % taxId)

    print("Finished %d species (%d annotated; %d positive, %d negative)" %
          (totalCount, positiveCount + negativeCount, positiveCount,
           negativeCount))
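

# Tiny sketch (hypothetical tax-id sets) of the lineage-intersection test used in
# run() above: a species matches a group when any ancestor in its NCBI lineage
# belongs to that group's set of tax-ids.
def _lineageIntersects(lineage, groupTaxIds):
    return bool(frozenset(lineage).intersection(groupTaxIds))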
def runDistributed():
    import _distributed
    import dask

    scheduler = _distributed.open()
    delayedCalls = []

    for taxId in allSpeciesSource():

        if getSpeciesProperty(taxId, "ENc-prime")[0] is not None:
            continue

        print(taxId)

        call = dask.delayed(annotateENcPrime)(taxId)
        delayedCalls.append(call)

    print("Starting %d calls..." % len(delayedCalls))

    futures = scheduler.compute(
        delayedCalls
    )  # submit all delayed calculations; obtain futures immediately

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    newValuesCount = 0
    oldValuesCount = 0
    for f in futures:
        try:
            (taxId, ENc, ENc_prime, isFreshValue) = scheduler.gather(f)
            results[taxId] = (ENc, ENc_prime)
            if isFreshValue:
                newValuesCount += 1
            else:
                oldValuesCount += 1

        except Exception as e:
            print(e)
            errorsCount += 1

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    print("{} new values; {} old values".format(newValuesCount,
                                                oldValuesCount))
    return results
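

# Minimal, self-contained sketch (not part of the original pipeline) of the
# delayed-submit / gather pattern used above, with a local dask.distributed
# Client standing in for the project's _distributed wrapper; the names and the
# squaring task are illustrative only.
def _demoDelayedGatherPattern(values):
    import dask
    from dask.distributed import Client, LocalCluster

    client = Client(LocalCluster(n_workers=2))  # local scheduler just for the demo
    delayedCalls = [dask.delayed(lambda x: x * x)(v) for v in values]
    futures = client.compute(delayedCalls)  # submit all calls; returns futures immediately
    results = client.gather(futures)        # block until every future completes
    client.close()
    return results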
def runDistributed():

    for taxId in allSpeciesSource():

        currentProp = getSpeciesProperty(taxId, 'paired-mRNA-fraction')

        if currentProp[0] is None:
            continue

        if currentProp[1] == "computed":
            origVal = float(currentProp[0])
            fixedVal = origVal * 2
            setSpeciesProperty(taxId,
                               "paired-mRNA-fraction",
                               "%.4g" % fixedVal,
                               "computed (v2)",
                               overwrite=True)
            print("Fixed %d: %.4g -> %.4g" % (taxId, origVal, fixedVal))
def testAll():

    testGettingGenomeAttributes(10796, "Archaea")
    testGettingGenomeAttributes(15, "Eukaryota")
    testGettingGenomeAttributes(1059, "Archaea")
    testGettingGenomeAttributes(1030, "Bacteria")
    testGettingGenomeAttributes(1070, "Bacteria")
    testGettingGenomeAttributes(1564, "Archaea")
    testGettingGenomeAttributes(1589, "Bacteria")
    testGettingGenomeAttributes(1124, "Bacteria")
    testGettingGenomeAttributes(820, "Bacteria")
    testGettingGenomeAttributes(1069, "Bacteria")
    testGettingGenomeAttributes(410, "Eukaryota")
    testGettingGenomeAttributes(691, "Bacteria")
    testGettingGenomeAttributes(815, "Bacteria")
    testGettingGenomeAttributes(416, "Bacteria")
    testGettingGenomeAttributes(1014, "Bacteria")

    print("---------------------------------------------")

    totalCount = 0
    envFoundCount = 0
    tempFoundCount = 0
    statsFoundCount = 0

    temps1 = {}
    temps2 = {}
    oxygenReq = {}
    habitat = {}
    salinity = {}
    proteinCount = {}
    gcContent = {}
    genomeSize = {}

    for taxId in allSpeciesSource():
        if limitSpecies and taxId not in limitSpecies:
            continue

        genomesList = taxIdToGenomeId(taxId)
        if (not genomesList):
            print("No genome-id found for (taxId=%d), skipping..." % taxId)
            continue

        kingdom = getKingdomForSpecies(taxId)
        genomeId = genomesList[0]  # TODO - is this right?
        props = testGettingGenomeAttributes(genomeId, kingdom)

        tempFound = False
        envFound = False
        statsFound = False

        if 'Environment:' in props:
            envFound = True
            envprops = props['Environment:']

            if 'TemperatureRange' in envprops:
                tempFound = True
                temps1[taxId] = envprops['TemperatureRange']

            if 'OptimumTemperature' in envprops:
                tempFound = True
                temps2[taxId] = envprops['OptimumTemperature']

            if 'OxygenReq' in envprops:
                oxygenReq[taxId] = envprops['OxygenReq']

            if 'Salinity' in envprops:
                salinity[taxId] = envprops['Salinity']

            if 'Habitat' in envprops:
                habitat[taxId] = envprops['Habitat']

        else:
            envFound = False

        if 'Statistics:' in props:
            statsFound = True
            stats = props['Statistics:']

            if 'protein count' in stats:
                proteinCount[taxId] = stats['protein count']

            if 'GC%' in stats:
                gcContent[taxId] = stats['GC%']

            if 'total length (Mb)' in stats:
                genomeSize[taxId] = stats['total length (Mb)']
        else:
            statsFound = False

        totalCount += 1
        if envFound:
            envFoundCount += 1
        if tempFound:
            tempFoundCount += 1
        if statsFound:
            statsFoundCount += 1

    print("TemperatureRange")
    print(temps1)
    print("OptimumTemperature")
    print(temps2)
    print("Salinity")
    print(salinity)
    print("Habitat")
    print(habitat)
    print("OxygenReq")
    print(oxygenReq)

    print("ProteinCount")
    print(proteinCount)
    print("GC%")
    print(gcContent)
    print("genomeSize")
    print(genomeSize)
    print("Total: %d\tEnv found: %d\tTemp found: %d\tStats found: %d" %
          (totalCount, envFoundCount, tempFoundCount, statsFoundCount))

    x = {}
    for k, v in temps2.items():
        if isinstance(v, str):
            if v == 'C':
                v = None
            elif v[-1] == 'C':
                v = int(v[:-1])
            else:
                print("Unknown val %s" % v)
                v = None
        elif isinstance(v, tuple):
            if len(v) == 2:
                v = (float(v[0]) + float(v[1])) / 2
            else:
                print("Unknown val %s" % str(v))
                v = None

        if v is not None:
            x[k] = v
    print(x)

    for taxId, temperature in x.items():
        setSpeciesProperty(taxId,
                           'optimum-temperature',
                           '%g' % temperature,
                           "entrez",
                           overwrite=False)

    for taxId, tempRange in temps1.items():
        setSpeciesProperty(taxId,
                           'temperature-range',
                           tempRange,
                           "entrez",
                           overwrite=False)

    for taxId, val in salinity.items():
        if val == 'Unknown':
            continue
        setSpeciesProperty(taxId, 'salinity', val, "entrez", overwrite=False)

    for taxId, val in habitat.items():
        if val == 'Unknown':
            continue
        setSpeciesProperty(taxId, 'habitat', val, "entrez", overwrite=False)

    for taxId, val in oxygenReq.items():
        if val == 'Unknown':
            continue
        setSpeciesProperty(taxId, 'oxygen-req', val, "entrez", overwrite=False)

    for taxId, val in proteinCount.items():
        setSpeciesProperty(taxId,
                           'protein-count',
                           val,
                           "entrez",
                           overwrite=False)

    for taxId, val in gcContent.items():
        if (val > 90 or val < 10):
            continue

        if (setSpeciesProperty(taxId,
                               'gc-content',
                               "%g" % val,
                               "entrez",
                               overwrite=False)):
            print("[gc-content (taxid=%d) -> %g]" % (taxId, val))

    for taxId, val in genomeSize.items():
        setSpeciesProperty(taxId,
                           'genome-size-mb',
                           "%g" % val,
                           "entrez",
                           overwrite=False)

    return 0

    genomeIdentifiers = taxIdToGenomeId(
        3055)  # Obtain genome-ids for this tax-id
    for genomeId in genomeIdentifiers:

        report = fetchEntrezGenomeReportForSpecies(genomeId)
        props = parseNCBIGenomeHTML_fetchSummaryReport(report)
        print(props)
    #return 0
    #------------
    #for fn in ("NCBI_genome_1030.html", "NCBI_genome_1070.html", "NCBI_genome_1347.html"):
    #    with open(fn, "r") as f:
    #        print("Testing %s..." % fn)
    #        props = parseNCBIGenomeHTML_fetchSummaryReport(f.read())
    #        print(props)

    for fn in ("NCBI_genomes_report_15_table.txt", ):
        with open(fn, "r") as f:
            print("Testing %s..." % fn)
            (genomeId,
             assemblyId) = parseNCBIGenomeAssembliesHTML_fetchMainAssembly(
                 fixMissingImgCloseTags(wrapTableFragmentAsXML(f.read())))
            print((genomeId, assemblyId))

    return 0
    import os.path
    
    argsParser = argparse.ArgumentParser()
    argsParser.add_argument( "--taxid",           type=parseList(int) )
    argsParser.add_argument( "--all-taxa",        default=False, action="store_true")
    argsParser.add_argument( "--profile",         type=parseProfileSpec(), default=parseProfileSpec()('310:10:end:0') )
    argsParser.add_argument( "--computation-tag", type=int,                default=Sources.RNAfoldEnergy_SlidingWindow40_v2 )
    argsParser.add_argument( "--shuffle-type",    type=int,                default=Sources.ShuffleCDSv2_python )
    #argsParser.add_argument( "--Ecoli-workaround",   default=False, action="store_true" )
    args = argsParser.parse_args()

    taxonsToProcess = []
    if not args.all_taxa:
        taxonsToProcess = args.taxid
    else:
        taxonsToProcess= frozenset( allSpeciesSource() ) - speciesToExclude

    assert(taxonsToProcess)
    print("Processing {} taxons".format(len(taxonsToProcess)))

    out = pd.DataFrame()
    for taxId in taxonsToProcess:
        df = processGenome(taxId, args)
        print(df)
        out = out.append( df )

    # write data to file
    out.to_csv( "output_native_LFE_statistics_table.csv" )
    # print domain stats
    out = out.append( out.assign(Domain=lambda x:"All") )
    print( out.assign(Count=lambda x:1).groupby('Domain').sum().Count )
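

# Toy illustration (assumed data, not from the original) of the domain-count
# trick above: appending a copy of the table with Domain="All" and counting rows
# per Domain yields per-domain totals plus an overall total.
def _demoDomainCounts():
    import pandas as pd
    out = pd.DataFrame({"Domain": ["Bacteria", "Bacteria", "Archaea"]})
    out = pd.concat([out, out.assign(Domain="All")], ignore_index=True)
    return out.assign(Count=1).groupby("Domain").sum().Count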
habitatCategories = Counter()


#Counter({'Mesophilic': 79, 'Hyperthermophilic': 20, 'Thermophilic': 13, 'Psychrophilic': 4, 'Unknown': 1})


# Temperatures and categories for all species *that have temperatures*
temperatureVsCategoryStatistics = pd.DataFrame({
    'tax_id':pd.Series(dtype='int'),
    'temperature':pd.Series(dtype='float'),
    'category':pd.Categorical([])
    })


# Plot raw data
for taxId in allSpeciesSource():

    category = None
    temperatureRange = getSpeciesProperty( taxId, 'temperature-range')
    if temperatureRange[0] is not None:
        category = temperatureRange[0]
        categories.update((category,))
    else:
        category = "Unknown"
    assert category is not None

    optimalTemperatureData = getSpeciesProperty( taxId, 'optimum-temperature')
    optimalTemperature = None
    if optimalTemperatureData[0] is not None:
        optimalTemperature = float(optimalTemperatureData[0])
def loadSpeciesMapping():
    for taxId in allSpeciesSource():
        speciesName = getSpeciesName(taxId)
        speciesMapping[speciesName] = taxId
def runDistributed():
    import _distributed
    import dask

    scheduler = _distributed.open()

    results = {}

    #taxids = []
    delayedCalls = []

    fractionSize = 20

    for taxId in allSpeciesSource():

        if randint(0, 20) > 0:
            continue

        if getSpeciesProperty(taxId, 'paired-mRNA-fraction')[0] is not None:
            continue

        size = countSpeciesCDS(taxId)

        numFractions = size // fractionSize  # integer division (needed for range() below)
        for i in range(numFractions):
            call = dask.delayed(calcNativePairedFraction)(taxId, i,
                                                          numFractions)
            delayedCalls.append(call)
            #taxids.append(taxId)

    print("Starting %d calls..." % len(delayedCalls))

    futures = scheduler.compute(
        delayedCalls
    )  # submit all delayed calculations; obtain futures immediately

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    for f in futures:
        try:
            (taxId, fraction, cdsCount, countPairedNucleotides,
             countTotalNucleotides) = scheduler.gather(f)

            current = None
            if taxId in results:
                current = results[taxId]
            else:
                current = (0, 0, 0, set())

            current = (current[0] + cdsCount,
                       current[1] + countPairedNucleotides,
                       current[2] + countTotalNucleotides,
                       current[3].union(set((fraction, ))))

            results[taxId] = current

        except Exception as e:
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        if len(result[3]) != max(result[3]) + 1:
            #raise Exception("Found invalid number of items for taxId=%d" % taxId)
            print("Found invalid number of items for taxId=%d" % taxId)
            continue

        fraction = float(result[1]) / result[2]

        setSpeciesProperty(taxId,
                           "paired-mRNA-fraction",
                           "%.4g" % fraction,
                           "computed (v3)",
                           overwrite=False)

        print("TaxId: %d\t\tFraction: %.4g" % (taxId, fraction))

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    return results
def parseList(conversion=str):
    def convert(values):
        return list(map(conversion, values.split(",")))

    return convert
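
# Usage sketch for parseList (values are arbitrary examples):
#   parseList(int)("511145,224308")  ->  [511145, 224308]
#   parseList(str)("a,b,c")          ->  ['a', 'b', 'c']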


argsParser = argparse.ArgumentParser()
argsParser.add_argument("--exclude-species", type=parseList(int), default=())
argsParser.add_argument("--size", type=int, default=10)
argsParser.add_argument("--sample-from-taxon", type=int)

args = argsParser.parse_args()

allSpecies = frozenset(allSpeciesSource())
excludedSpecies = frozenset(args.exclude_species)
candidateSpecies = allSpecies - excludedSpecies

print("All species:\t\t{}".format(len(allSpecies)))
print("Excluded species:\t\t{}".format(len(excludedSpecies)))
print("Candidate species:\t\t{}".format(len(candidateSpecies)))

species = list(candidateSpecies)
random.shuffle(species)

ret = []

if args.sample_from_taxon is None:  # unrestricted sampling
    #print(random.sample( candidateSpecies, args.size ))
    ret = species[:args.size]
    argsParser.add_argument("--profile", type=parseProfileSpec())
    argsParser.add_argument("--computation-tag",
                            type=int,
                            default=Sources.RNAfoldEnergy_SlidingWindow40_v2)
    argsParser.add_argument("--shuffle-types", type=parseList(int))
    argsParser.add_argument("--num-shuffles", type=int, default=20)
    argsParser.add_argument("--pax-db", type=str, required=False)
    argsParser.add_argument("--codonw", type=bool, default=False)
    argsParser.add_argument("--external-property", type=str, default=None)
    argsParser.add_argument("--distributed",
                            action="store_true",
                            default=False)
    args = argsParser.parse_args()

    if (args.all_taxa):
        args.taxid = list(allSpeciesSource())

    if (args.taxid is None):
        raise Exception(
            "No species requested (use '--taxid tax1,tax2,tax3' or '--all-taxa')"
        )

    # ------------------------------------------------------------------------------------
    # Argument validity checks
    if (len(args.taxid) > len(frozenset(args.taxid))):
        raise Exception(
            "Duplicate taxid encountered in list %s" % args.taxid
        )  # Make sure no taxid was specified twice (will skew calculations...)

    checkSpeciesExist(
        args.taxid)  # Check for non-existant taxids to avoid doomed runs
#csvRegressionEffectsByGroupCsv = "tree_traits_effects_analysis_with_taxgroups.out.abs(dLFE).length.300.csv"

# Possible input 2: Raw profile value outliers, i.e., profiles with positive dLFE at position 0 (created by find_trait_values_outliers.r)
#csvRegressionEffectsByGroupCsv = "find_trait_values_outliers.out.dLFE.csv"
#--------------------------------------------------------------

baseFontSize = 25  # Scale factor for (most) text
significanceLevel = 1e-2  # p-values smaller than this will be marked as significant
barScale = 500  # Width of 100%-bar
#barScale = 20              # Width of 100%-bar
useOwnXserver = False
#--------------------------------------------------------------

print("Processing all species...")
taxidToLineage = {}
for taxId in allSpeciesSource():  # read taxonomic lineages for all species
    lineage = ncbiTaxa.get_lineage(taxId)
    taxidToLineage[taxId] = lineage
"""
Return a list of "major taxonomic groups" (i.e., groups having at least a minimum number of species)
Input - map taxid -> lineage
Output - collection of pairs (taxId, num_species)
"""


def getMajorTaxonomicGroups(taxidToLineage):
    bigGroups = {}
    for taxId, lineage in taxidToLineage.items():
        for item in lineage:
            if item in bigGroups:
                bigGroups[item] += 1
def calculate2dProfile(args):

    maxLength = 300
    profileStep = 10

    taxids = []
    if args.all_species:
        taxids = [x for x in allSpeciesSource()]
    else:
        taxids = args.taxid

    for taxid in taxids:
        count = 0
        nativeArrayData = []
        controlArrayData = []

        testt = dict(map(lambda x: (x, 0), range(21)))

        for result in sampleProfilesFixedIntervals(convertResultsToMFEProfiles(
                readSeriesResultsForSpecies(args.computation_tag, taxid,
                                            args.num_shuffles,
                                            args.num_shuffles),
                args.num_shuffles),
                                                   startPosition=0,
                                                   endPosition=maxLength,
                                                   interval=profileStep):
            profileData = result["profile-data"]

            # Check the sequence-id
            seqId = result["content"][0]["id"]
            if (seqId.find(":") != -1):
                seqId = seqId.replace(":", "/")
            shuffleId = int(
                seqId.split("/")[3]
            )  # the first result should belong to shuffle-id -1 (i.e., the native sequence)
            assert (shuffleId == -1)

            expectedProfileLength = min(
                len(result["content"][0]["MFE-profile"]) / profileStep,
                maxLength / profileStep)
            profileLength = profileData.shape[1]
            assert (abs(profileLength - expectedProfileLength) <= 1)

            if (
                    profileData.shape[0] != args.num_shuffles + 1
            ):  # we require one vector per suffled sequence, plus one for the native sequence
                print("Warning: ignoring record '%s' containing %d records" %
                      (seqId, profileData.shape[0]))
                continue

            nativeDiffs = profileData[0, ] - profileData[
                1:,
            ]  # Calculate NativeLFE - ShuffledLFE (for each of the shuffles, and for each window)
            #controlDiffs = profileData[-1,] - profileData[:-1,]   # Calculate NativeLFE - ShuffledLFE (for each of the shuffles, and for each window)
            controlDiffs = profileData[8, ] - profileData[
                1:,
            ]  # Calculate NativeLFE - ShuffledLFE (for each of the shuffles, and for each window)
            assert (nativeDiffs.shape[0] == args.num_shuffles)
            assert (controlDiffs.shape[0] == args.num_shuffles)

            for i in range(21):
                deltas = profileData[i, ] - profileData
                T, pval = wilcoxon(deltas.ravel())
                if (pval < 0.05):
                    testt[i] += 1
                    #print("%d - pval: %g" % (i, pval))

            direction = np.sign(
                np.apply_along_axis(np.mean, 0, nativeDiffs)
            )  # TODO - prove this is equivalent to checking whether the sign of the sum of ranks
            controlDirection = np.sign(
                np.apply_along_axis(np.mean, 0, controlDiffs)
            )  # TODO - prove this is equivalent to checking whether the sign of the sum of ranks

            wilc = np.apply_along_axis(
                wilcoxon, 0, nativeDiffs
            )  # The wilcoxon test is performed separately for each window (with N=args.num_shuffles, typically = 20)

            # Note that Nr <= N (because ties are ignored when using the default settings), and Nr should be at least 10 or 20 for the distribution to approach normal.
            # See:
            # Explanation of Python impl. (using T statistic):  https://stackoverflow.com/a/18966286
            # Wilcoxon signed-rank test tutorial:               http://vassarstats.net/textbook/ch12a.html
            assert (wilc.shape == (2, profileLength))
            #controlWilc = np.apply_along_axis( wilcoxon, 0, controlDiffs )   # The wilcoxon test is performed separately for each window (with N=args.num_shuffles, typically = 20)
            controlWilc = np.apply_along_axis(
                np.mean, 0, controlDiffs
            )  # The wilcoxon test is performed separately for each window (with N=args.num_shuffles, typically = 20)

            #print("--"*10)
            #print(direction)
            #print(np.mean(wilc[0,]))
            #Nr = sum( np.abs(nativeDiffs) > 1e-6 )
            #print(Nr)
            #sigma = np.sqrt(Nr * (Nr+1) * (2*Nr+1) / 6)   # =SD of W
            assert (
                np.all(wilc[0, ] >= 0.0)
            )  # test statistic T (not W) - The sum of the ranks of the differences above or below zero, whichever is smaller
            #assert( np.all( controlWilc[0,] >= 0.0 ))  # test statistic T (not W) - The sum of the ranks of the differences above or below zero, whichever is smaller
            #S = Nr * (Nr+1) / 2.0  # Sum of all ranks
            #W = S - 2*wilc[0,]
            #Z = W / sigma
            #print(wilc[1,])
            assert (np.all(((wilc[1, ] >= 0.0) & (wilc[1, ] <= 1.0))
                           | np.isnan(wilc[1, ])))  # P-values
            #print(Z * direction)

            #wilc.resize((2, maxLength / profileStep))  # pad with zeros
            #arrayData.append( Zwilc[1,] )
            #out = np.resize( Z*direction, (2, maxLength / profileStep))
            out = np.resize(
                np.log10(wilc[1, ]) * direction * -1,
                (2, maxLength // profileStep))
            nativeArrayData.append(out)

            #controlOut = np.resize( np.log10(controlWilc[1,]) * controlDirection * -1, (2, maxLength / profileStep))
            controlOut = np.resize(controlWilc[1, ],
                                   (2, maxLength // profileStep))
            controlArrayData.append(controlOut)

            count += 1

        if (not nativeArrayData):
            print("Warning: no data found for taxid=%d" % taxid)
            continue

        nativeAr = np.vstack(nativeArrayData)
        controlAr = np.vstack(controlArrayData)
        #print(ar.shape)
        #print(ar[0,])

        #x = np.apply_along_axis( lambda x: relfreq(x[~np.isnan(x)], numbins=100, defaultreallimits=(-5,5)), 0, ar)
        #x = np.apply_along_axis( lambda x: np.histogram(x[~np.isnan(x)], bins=100, range=(-5,5), density=True), 0, nativeAr)
        #print(x.shape)
        #nativeFreqs = np.vstack(x[0,])

        #y = np.apply_along_axis( lambda x: np.histogram(x[~np.isnan(x)], bins=100, range=(-5,5), density=True), 0, controlAr)
        #print(x.shape)
        #controlFreqs = np.vstack(y[0,])

        #print( np.apply_along_axis( np.sum, 1, controlFreqs ) )
        #assert( np.allclose( np.apply_along_axis( np.sum, 0, freqs ), 1.0 ) )
        #print(freqs.shape)
        #print(freqs[0])

        #plot2dProfile(nativeFreqs, taxid)
        print(testt)
        plot2dProfile(controlAr, taxid)

        print(count)
    return 0
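

# Minimal sketch (not from the original code) of the per-window Wilcoxon test
# applied above: for every window, scipy.stats.wilcoxon is run on the vector of
# (native - shuffled) differences across shuffles, and the sign of the mean
# difference gives the direction of the effect. Shapes and values here are
# illustrative only.
def _demoPerWindowWilcoxon(numShuffles=20, numWindows=30):
    import numpy as np
    from scipy.stats import wilcoxon

    rng = np.random.default_rng(0)
    # rows = shuffles, columns = windows; negative mean mimics NativeLFE < ShuffledLFE
    nativeDiffs = rng.normal(loc=-0.5, scale=1.0, size=(numShuffles, numWindows))

    wilc = np.apply_along_axis(wilcoxon, 0, nativeDiffs)  # row 0: T statistic, row 1: p-value
    direction = np.sign(np.apply_along_axis(np.mean, 0, nativeDiffs))
    return -np.log10(wilc[1, ]) * direction  # signed log10 significance, one value per window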
def speciesByPhylaTable():
    allPhyla = parseReport()  # get all existing phyla

    domainCounts = Counter()
    phylaCounts = Counter()
    skippedCounts = Counter()
    #classesByPhyla = {}   # Disable tallying by class, since these are not used for many taxons
    ordersByPhyla = {}
    familiesByPhyla = {}
    genusesByPhyla = {}

    phylaDf = pd.DataFrame({
        'Domain': pd.Categorical([]),  # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),  # Phylum name (string)
        'TaxId': pd.Series([], dtype='int'),  # Phylum TaxId
        'ParentTaxId': pd.Series([], dtype='int'),  # Parent TaxId
        'NumSpecies': pd.Series([],
                                dtype='int'),  # Species count for this phyla
        #                       'NumClasses': pd.Series([], dtype='int'),    # Species count for this phyla
        'NumOrders': pd.Series([], dtype='int'),  # Orders count for this phyla
        'NumFamilies': pd.Series([],
                                 dtype='int'),  # Families count for this phyla
        'NumGenuses': pd.Series([],
                                dtype='int'),  # Genuses count for this phyla
        'RowType': pd.Categorical([])
    })  # Phylum count or total

    for group, phyla in allPhyla.items():
        for phylum, record in phyla.items():
            # Add item for each phylum
            taxId = record['taxId']

            phylaDf = phylaDf.append(
                pd.DataFrame({
                    'Domain':
                    pd.Categorical([group]),
                    'Phylum':
                    pd.Categorical([phylum]),
                    'TaxId':
                    pd.Series([taxId], dtype='int'),
                    'ParentTaxId':
                    pd.Series([record['parentTaxId']], dtype='int'),
                    'NumSpecies':
                    pd.Series([0], dtype='int'),
                    #                                       'NumClasses': pd.Series([0], dtype='int'),
                    'NumOrders':
                    pd.Series([0], dtype='int'),
                    'NumFamilies':
                    pd.Series([0], dtype='int'),
                    'NumGenuses':
                    pd.Series([0], dtype='int'),
                    'RowType':
                    pd.Categorical(['Phylum'])
                }))
            #classesByPhyla[record['taxId']]  = set()
            ordersByPhyla[record['taxId']] = set()
            familiesByPhyla[record['taxId']] = set()
            genusesByPhyla[record['taxId']] = set()

    # Create "special" items
    pid = 1
    for group in allPhyla.keys():
        # Add "Unknown phylum" tally for each domain
        phylaDf = phylaDf.append(
            pd.DataFrame({
                'Domain': pd.Categorical([group]),
                'Phylum': pd.Categorical(['[Unknown]']),
                'TaxId': pd.Series([pid], dtype='int'),
                'ParentTaxId': pd.Series([0], dtype='int'),
                'NumSpecies': pd.Series([0], dtype='int'),
                #                                       'NumClasses': pd.Series([0], dtype='int'),
                'NumOrders': pd.Series([0], dtype='int'),
                'NumFamilies': pd.Series([0], dtype='int'),
                'NumGenuses': pd.Series([0], dtype='int'),
                'RowType': pd.Categorical(['Total'])
            }))
        pid += 1
        # Add totals tally for each domain
        phylaDf = phylaDf.append(
            pd.DataFrame({
                'Domain': pd.Categorical([group]),
                'Phylum': pd.Categorical(['[Total]']),
                'TaxId': pd.Series([pid], dtype='int'),
                'ParentTaxId': pd.Series([0], dtype='int'),
                'NumSpecies': pd.Series([0], dtype='int'),
                #                                       'NumClasses': pd.Series([0], dtype='int'),
                'NumOrders': pd.Series([0], dtype='int'),
                'NumFamilies': pd.Series([0], dtype='int'),
                'NumGenuses': pd.Series([0], dtype='int'),
                'RowType': pd.Categorical(['Total'])
            }))
        pid += 1
    # Add overally totals items
    phylaDf = phylaDf.append(
        pd.DataFrame({
            'Domain': pd.Categorical(['[All]']),
            'Phylum': pd.Categorical(['[Total]']),
            'TaxId': pd.Series([pid], dtype='int'),
            'ParentTaxId': pd.Series([0], dtype='int'),
            'NumSpecies': pd.Series([0], dtype='int'),
            #                                       'NumClasses': pd.Series([0], dtype='int'),
            'NumOrders': pd.Series([0], dtype='int'),
            'NumFamilies': pd.Series([0], dtype='int'),
            'NumGenuses': pd.Series([0], dtype='int'),
            'RowType': pd.Categorical(['Total'])
        }))

    phylaDf.set_index('TaxId', inplace=True)
    skippedSpecies = []

    # Count the number of species under each phylum
    for taxId in allSpeciesSource():
        if taxId in speciesToExclude: continue
        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)

        ranks = ncbiTaxa.get_rank(lineage)

        # Determine kingdom/domain
        kingdomTaxId = [
            t for t, rank in ranks.items() if rank == 'superkingdom'
        ]
        if not kingdomTaxId:
            kingdomTaxId = [
                t for t, rank in ranks.items() if rank == 'kingdom'
            ]
        domain = names[kingdomTaxId[0]]
        domainCounts.update([domain])

        # Determine phylum
        phylumTaxId = [t for t, rank in ranks.items() if rank == 'phylum']
        if not phylumTaxId:
            skippedSpecies.append(taxId)
            skippedCounts.update([domain])
            print("Skipping %d: (%s) missing phylum" % (taxId, names[taxId]))
            #print(formatLineage(lineage, names))
            continue  # This table is structured by phylum; information will be missing for any species missing a phylum; it will be included in the "species missing phylum" ([Unknown]) row.
        else:
            phylumTaxId = phylumTaxId[0]

        if phylumTaxId:
            phylaCounts.update(
                [phylumTaxId])  # tally this species under the specified phylum

        #classTaxId = [t for t,rank in ranks.items() if rank=='class']
        #if classTaxId:
        #    classesByPhyla[phylumTaxId].add( classTaxId[0] )

        orderTaxId = [t for t, rank in ranks.items() if rank == 'order']
        if orderTaxId:
            ordersByPhyla[phylumTaxId].add(orderTaxId[0])

        familyTaxId = [t for t, rank in ranks.items() if rank == 'family']
        if familyTaxId:
            familiesByPhyla[phylumTaxId].add(familyTaxId[0])

        genusTaxId = [t for t, rank in ranks.items() if rank == 'genus']
        if genusTaxId:
            genusesByPhyla[phylumTaxId].add(genusTaxId[0])

    assert (sum(skippedCounts.values()) == len(skippedSpecies))

    # Update the phyla counts
    for phylaTaxId, counts in phylaCounts.items():
        #phylaDf.loc[phylaTaxId, 'NumClasses']  = len(classesByPhyla[phylaTaxId])
        phylaDf.loc[phylaTaxId, 'NumOrders'] = len(ordersByPhyla[phylaTaxId])
        phylaDf.loc[phylaTaxId,
                    'NumFamilies'] = len(familiesByPhyla[phylaTaxId])
        phylaDf.loc[phylaTaxId, 'NumGenuses'] = len(genusesByPhyla[phylaTaxId])
        phylaDf.loc[phylaTaxId, 'NumSpecies'] = counts

    # Update the "Unknown phyla" count for each domain
    for group, countMissing in skippedCounts.items():
        #print('-'*20)
        #print("%s - %d missing" % (group, countMissing))
        dummyTaxIdForBasketGroup = phylaDf[(phylaDf.Domain == group) & (
            phylaDf.Phylum == '[Unknown]')].index[0]
        phylaDf.loc[dummyTaxIdForBasketGroup, 'NumSpecies'] = countMissing

    # Update the total for each domain
    for group, totalCount in domainCounts.items():
        dummyTaxIdForBasketGroup = phylaDf[
            (phylaDf.Domain == group) & (phylaDf.Phylum == '[Total]')].index[0]
        phylaDf.loc[dummyTaxIdForBasketGroup, 'NumSpecies'] = totalCount

    # Update the overall total count
    dummyTaxIdForBasketGroup = phylaDf[(phylaDf.Domain == "[All]") &
                                       (phylaDf.Phylum == '[Total]')].index[0]
    phylaDf.loc[dummyTaxIdForBasketGroup,
                'NumSpecies'] = sum(domainCounts.values())
    phylaDf.loc[dummyTaxIdForBasketGroup,
                'NumOrders'] = sum([len(x) for x in ordersByPhyla.values()])
    phylaDf.loc[dummyTaxIdForBasketGroup, 'NumFamilies'] = sum(
        [len(x) for x in familiesByPhyla.values()])
    phylaDf.loc[dummyTaxIdForBasketGroup,
                'NumGenuses'] = sum([len(x) for x in genusesByPhyla.values()])

    # Prepare and save the final table
    phylaReportDf = phylaDf[phylaDf['NumSpecies'] > 0]  # remove "empty" items
    phylaReportDf = phylaReportDf.sort_values(
        by=['Domain', 'RowType', 'Phylum'])  # sort rows
    print(phylaReportDf)
    phylaReportDf.to_html('phyla_report.html',
                          columns=[
                              'Phylum', 'NumOrders', 'NumFamilies',
                              'NumGenuses', 'NumSpecies', 'Domain'
                          ])
    phylaReportDf.to_excel('phyla_report.xlsx', sheet_name='Phyla Summary')

    with open("phyla_report.rst", "w") as f:
        f.write(
            phylaReportDf.drop([
                'RowType', 'NumFamilies', 'NumGenuses', 'NumOrders',
                'ParentTaxId'
            ],
                               axis=1).pipe(tabulate,
                                            headers='keys',
                                            tablefmt='rst'))

    # Prepare the "Missing phyla" report
    missingPhylaReportDf = phylaDf[phylaDf['NumSpecies'] == 0]
    missingPhylaReportDf = missingPhylaReportDf.sort_values(
        by=['Domain', 'RowType', 'Phylum'])  # sort rows
    missingPhylaReportDf.to_html('phyla_report_missing.html',
                                 columns=['Phylum', 'NumSpecies', 'Domain'])
    missingPhylaReportDf.to_excel('phyla_report_missing.xlsx',
                                  sheet_name='Missing Phyla Summary')

    # print counts
    print(domainCounts)
    #print(phylaCounts)

    # Display "skipped items" warning
    if (skippedSpecies):
        print("=" * 50)
        print("Warning: Skipped %d species" % len(skippedSpecies))
        print(skippedCounts)
        print("=" * 50)
def speciesStatisticsAndValidityReport(args):
    import _distributed
    import dask

    speciesDf = pd.DataFrame({
        'TaxId': pd.Series([], dtype='int'),  # Species TaxId
        'Species': pd.Series([], dtype='str'),  # Species binomial name
        'Nickname': pd.Series([], dtype='str'),
        'Domain': pd.Categorical([]),  # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),  # Phylum name (string)
        'NumCDSs': pd.Series([], dtype='int'),  # CDS count for this species
        'NumCDSsInProfile':
        pd.Series([], dtype='int'
                  ),  # Num seqs with 20 shuffled profiles for this species
        'AnnotatedNumCDSs': pd.Series([], dtype='int'),  # 
        'CDSDifference': pd.Series([], dtype='float'),  # 
        'NumNativeSeqs': pd.Series([], dtype='int'),  # 
        'GCContentInCDS': pd.Series([], dtype='float'),  # 
        'AnnotatedGCContent': pd.Series([], dtype='float'),  # 
        'RowType': pd.Categorical([]),  # Species count or total
        'Warnings': pd.Series([], dtype='str'),  # 
        'CDSWarnings': pd.Series([], dtype='int'),  # 
        'CDSWarnings_': pd.Series([], dtype='str'),  # 
        'FirstAA': pd.Series([], dtype='str'),  # 
        'LastAA': pd.Series([], dtype='str')  # 
    })

    scheduler = _distributed.open()

    results = {}
    delayedCalls_native = []

    shuffledCounts = {}
    delayedCalls_shuffledProfiles = []

    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue  # always exclude species from the blacklist
        if args.taxid and taxId not in args.taxid:
            continue  # if a whitelist is specified, skip other species

        warnings = []

        ## DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ##
        #if randint(0, 20) > 0:
        #    continue
        ## DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ### DEBUG ONLY ##

        cdsCountInRedis = countSpeciesCDS(taxId)

        #cdsCountProfiles = countx(taxId, (310, 10, "begin", 0), 102, 11)

        annotatedProteinCount = getSpeciesProperty(taxId, 'protein-count')[0]

        annotatedGCContent = getSpeciesProperty(taxId, 'gc-content')[0]

        proteinDifference = None
        if annotatedProteinCount is not None:
            proteinDifference = (1.0 - float(cdsCountInRedis) /
                                 float(annotatedProteinCount)) * 100.0

            if abs(proteinDifference) > 9.9:
                warnings.append("CDS_count")
        else:
            warnings.append("No_CDS_count")

        # Determine phylum
        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)

        ranks = ncbiTaxa.get_rank(lineage)

        # Determine kingdom/domain
        domain = ""
        kingdomTaxId = [
            t for t, rank in ranks.items() if rank == 'superkingdom'
        ]
        if not kingdomTaxId:
            kingdomTaxId = [
                t for t, rank in ranks.items() if rank == 'kingdom'
            ]
        domain = names[kingdomTaxId[0]]

        phylumName = ""
        # Determine phylum
        phylumTaxId = [t for t, rank in ranks.items() if rank == 'phylum']
        if phylumTaxId:
            phylumName = names[phylumTaxId[0]]

        speciesDf = speciesDf.append(
            pd.DataFrame({
                'TaxId':
                pd.Series([taxId], dtype='int'),  # Species TaxId
                'Species':
                pd.Series([getSpeciesName(taxId)], dtype='str'),
                'Nickname':
                pd.Series([shortNames[taxId]], dtype='str'),
                'Domain':
                pd.Categorical([domain]),  # Bacteria, Eukaryota, Archaea
                'Phylum':
                pd.Categorical([phylumName]),  # Phylum name (string)
                'NumCDSs':
                pd.Series([cdsCountInRedis],
                          dtype='int'),  # CDS count for this species
                'NumCDSsInProfile':
                pd.Series([0],
                          dtype='int'),  # Num seqs with 20 shuffled profiles
                'AnnotatedNumCDSs':
                pd.Series([
                    0
                    if annotatedProteinCount is None else annotatedProteinCount
                ],
                          dtype='int'),  # 
                'CDSDifference':
                pd.Series([proteinDifference], dtype='float'),  # 
                'NumNativeSeqs':
                pd.Series([0], dtype='int'),  # 
                'GCContentInCDS':
                pd.Series([0.0], dtype='float'),  # 
                'AnnotatedGCContent':
                pd.Series([annotatedGCContent], dtype='float'),  # 
                'RowType':
                pd.Categorical(["species"]),  # Species count or total
                'Warnings':
                pd.Series([", ".join(warnings)], dtype='str'),  #
                'CDSWarnings':
                pd.Series([0], dtype='int'),
                'CDSWarnings_':
                pd.Series([""], dtype='str'),
                'FirstAA':
                pd.Series([""], dtype='str'),
                'LastAA':
                pd.Series([""], dtype='str'),
                'Source':
                pd.Series([""], dtype='str')
            }))

        fractionSize = 1000  # How many sequences (roughly) to process in each task
        numFractions = cdsCountInRedis // fractionSize  # integer division (needed for range() below)
        if numFractions == 0: numFractions = 1

        for i in range(numFractions):
            # DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #
            #if i%100!=5: continue
            # DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #### DEBUG ONLY #

            call = dask.delayed(calcNativeSequencesStatistics)(taxId, i,
                                                               numFractions)
            delayedCalls_native.append(call)

        call = dask.delayed(countShuffledProfiles)(taxId,
                                                   (310, 10, "begin", 0), 102,
                                                   11)
        delayedCalls_shuffledProfiles.append(call)

    speciesDf.set_index('TaxId', inplace=True)

    print("Starting {} calls...".format(
        len(delayedCalls_native) + len(delayedCalls_shuffledProfiles)))

    futures = scheduler.compute(
        delayedCalls_native + delayedCalls_shuffledProfiles
    )  # submit all delayed calculations; obtain futures immediately

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}

    errorsCount = 0
    for f in futures:
        try:
            ret = scheduler.gather(f)
            if (len(ret) == 9):
                (taxId, fraction, cdsCount, gcCounts, totalCounts, cdsWarnings,
                 warnings, firstAA, lastAA) = ret

                current = None
                if taxId in results:
                    current = results[taxId]
                else:
                    current = (0, 0, 0, 0, Counter(), Counter(), Counter())

                current = (current[0] + cdsCount, current[1] + gcCounts,
                           current[2] + totalCounts, current[3] + cdsWarnings,
                           current[4] + warnings, current[5] + firstAA,
                           current[6] + lastAA)

                results[taxId] = current

            elif (len(ret) == 2):
                (taxId, numShuffledSeqs) = ret
                shuffledCounts[taxId] = numShuffledSeqs

            else:
                assert (False)

        except Exception as e:
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        (numNativeSeqs, gcCounts, totalCounts, cdsWarnings, warnings, firstAA,
         lastAA) = result
        speciesDf.at[taxId, 'NumNativeSeqs'] = numNativeSeqs

        speciesDf.at[taxId, 'GCContentInCDS'] = round(
            float(gcCounts) / float(totalCounts) * 100.0, 1)

        speciesDf.at[taxId, 'CDSWarnings'] = cdsWarnings

        speciesDf.at[taxId, 'CDSWarnings_'] = summarizeCounter(warnings)
        speciesDf.at[taxId, 'FirstAA'] = summarizeCounter(firstAA)
        speciesDf.at[taxId, 'LastAA'] = summarizeCounter(lastAA)

        #if numNativeSeqs < species.at[taxId, 'NumCDSs']:
        #    pass

    for taxId, result in shuffledCounts.items():
        speciesDf.at[taxId, 'NumCDSsInProfile'] = result

    speciesDf = speciesDf.sort_values(by=['Domain', 'Species'])  # sort rows
    speciesDf.to_html('species_report.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                          'FirstAA', 'LastAA'
                      ])

    with open("species_report_simple.rst", "w") as f:
        f.write(
            speciesDf.drop([
                'RowType', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                'FirstAA', 'LastAA', 'CDSDifference'
            ],
                           axis=1).pipe(tabulate,
                                        headers='keys',
                                        tablefmt='rst'))

    speciesDf.to_html('species_report_simple.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain'
                      ])

    speciesDf.to_excel('species_report.xlsx', sheet_name='Species summary')