Exemplo n.º 1
0
def annotateENcPrime(taxId, overwrite=False):
    """Make sure the 'ENc' and 'ENc-prime' properties exist for one species.

    Both stored properties are read first. When both are already present and
    `overwrite` is false, the stored values are handed back untouched.
    Otherwise the pair is recomputed with calculateENcPrimeForSpecies(),
    checked against a loose (10, 75) sanity window and written back.

    Returns:
        (taxId, ENc, ENc_prime, isFresh) where isFresh is True if the values
        were computed by this call and False if they are the values that were
        already stored.
    """
    storedENc = getSpeciesProperty(taxId, 'ENc')
    storedENcPrime = getSpeciesProperty(taxId, 'ENc-prime')

    alreadyAnnotated = (storedENc[0] is not None
                        and storedENcPrime[0] is not None)
    if alreadyAnnotated and not overwrite:
        # Nothing to compute: return the old values, flagged as not fresh
        return (taxId, storedENc[0], storedENcPrime[0], False)

    ENc, ENc_prime = calculateENcPrimeForSpecies(taxId)

    # The actual extreme values for ENc are not clear, so this is only a
    # coarse sanity check before anything is persisted
    assert 10.0 < ENc < 75.0
    assert 10.0 < ENc_prime < 75.0

    setSpeciesProperty(taxId, 'ENc', str(ENc), "ENCprime (custom version)")
    setSpeciesProperty(taxId, 'ENc-prime', str(ENc_prime),
                       "ENCprime (custom version)")

    # Freshly computed values
    return (taxId, ENc, ENc_prime, True)
def getTraits( taxIds, traits=(("gc-content", "float"), ("ENc-prime", "float"), ("optimum-temperature", "float"), ("is-endosymbiont", "int"), ("is-high-temp", "int")) ):
    """Collect the requested traits for a set of species into a DataFrame.

    Args:
        taxIds: species identifiers; they become the index of the result.
        traits: (trait name, dtype name) pairs; each pair becomes a column.

    Two trait names are derived rather than read directly:
      * "is-endosymbiont" is taken from isEndosymbiont(taxId);
      * "is-high-temp" is 1 when the stored "optimum-temperature" is above
        75.0 and 0 otherwise (including when no temperature is stored).
    Every other trait is read with getSpeciesProperty() and converted with
    float(). Cells for which no value is available are left unset.

    Returns:
        The filled DataFrame (rows: taxIds, columns: trait names).
    """
    emptyColumns = {name: pd.Series(dtype=kind) for name, kind in traits}
    df = pd.DataFrame(emptyColumns, index=taxIds)

    for trait, _ in traits:
        for taxId in taxIds:
            if trait == "is-endosymbiont":
                value = isEndosymbiont(taxId)

            elif trait == "is-high-temp":
                value = 0
                storedTemp = getSpeciesProperty(taxId, "optimum-temperature")[0]

                if storedTemp is not None:
                    temperature = float(storedTemp)
                    print("{} -> {}".format(taxId, temperature))
                    if temperature > 75.0:
                        value = 1
                        print(taxId)

            else:
                stored = getSpeciesProperty(taxId, trait)[0]
                value = None if stored is None else float(stored)

            if value is None:
                continue  # no data for this (species, trait) cell

            print("{} {} -> {}".format(taxId, trait, value))
            df.loc[taxId, trait] = value

    return df
Exemplo n.º 3
0
def runDistributed():
    """Compute ENc / ENc-prime on the cluster for species lacking 'ENc-prime'.

    One delayed annotateENcPrime() call is queued for every species returned
    by allSpeciesSource() whose 'ENc-prime' property is not set yet. All calls
    are submitted at once, then the function waits for them and gathers the
    results one by one.

    Returns:
        dict mapping taxId -> (ENc, ENc_prime) for every task whose result
        could be gathered. Tasks that raise are printed and counted, and do
        not appear in the dict.
    """
    import _distributed
    import dask

    scheduler = _distributed.open()
    delayedCalls = []

    for taxId in allSpeciesSource():

        # Species that already have a value are not re-submitted
        if getSpeciesProperty(taxId, "ENc-prime")[0] is not None:
            continue

        print(taxId)

        call = dask.delayed(annotateENcPrime)(taxId)
        delayedCalls.append(call)

    print("Starting %d calls..." % len(delayedCalls))

    futures = scheduler.compute(
        delayedCalls
    )  # submit all delayed calculations; obtain futures immediately

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Best effort: a failure of the progress display must not abort the
        # run. (The handler previously printed the undefined name `E`, which
        # turned any such failure into a NameError.)
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    newValuesCount = 0
    oldValuesCount = 0
    for f in futures:
        try:
            (taxId, ENc, ENc_prime, isFreshValue) = scheduler.gather(f)
            results[taxId] = (ENc, ENc_prime)
            if isFreshValue:
                newValuesCount += 1
            else:
                oldValuesCount += 1

        except Exception as e:
            # A failed task is reported and counted; the others still count
            print(e)
            errorsCount += 1

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    print("{} new values; {} old values".format(newValuesCount,
                                                oldValuesCount))
    return results
Exemplo n.º 4
0
def calculateENcPrimeForSpecies(taxId, orig=False):
    """Run the ENc-prime pipeline for one species and return its report.

    The species' sequences are first written to a file (with the `_orig`
    writer when `orig` is true), codon and nucleotide counts are created from
    that file, and finally createEncPrimeReport() is invoked with the
    species' translation table.

    Returns:
        Whatever createEncPrimeReport() returns.
    """
    geneticCode = getSpeciesTranslationTable(taxId)

    exportSequences = (writeSequenceToTempFile_orig
                       if orig else writeSequenceToTempFile)
    # fastaFile stays bound until the function returns
    # (presumably a temp-file object that must outlive the steps below -- confirm)
    cdsCount, fastaFile = exportSequences(taxId)
    fastaPath = fastaFile.name

    createCodonCounts(fastaPath, cdsCount)
    createNucleotideCounts(fastaPath, cdsCount)
    print("Genomic GC%: {}".format(getSpeciesProperty(taxId, 'gc-content')))

    return createEncPrimeReport(fastaPath, geneticCode)
def runDistributed():
    """Correction pass over stored 'paired-mRNA-fraction' values.

    For every species whose property is set and whose second property field
    equals "computed", the stored value is doubled and written back with the
    second field set to "computed (v2)". Because that field changes, a value
    that has been corrected is not matched again on a later run.
    """
    for taxId in allSpeciesSource():

        stored = getSpeciesProperty(taxId, 'paired-mRNA-fraction')

        needsFix = stored[0] is not None and stored[1] == "computed"
        if not needsFix:
            continue

        before = float(stored[0])
        after = before * 2
        setSpeciesProperty(taxId,
                           "paired-mRNA-fraction",
                           "%.4g" % after,
                           "computed (v2)",
                           overwrite=True)
        print("Fixed %d: %.4g -> %.4g" % (taxId, before, after))
Exemplo n.º 6
0
def annotateCUBmeasures(taxId, overwrite=False):
    """Make sure the genome-level codon-usage-bias properties exist.

    The stored 'genomic-CAI' value decides whether work is needed: when it is
    present and `overwrite` is false it is returned as-is. Otherwise
    calculateGenomeLevelCUBmeasures() is run, CAI/CBI/Fop/Nc are read from the
    first row of its result, sanity-checked, and stored as 'genomic-CAI',
    'genomic-CBI', 'genomic-Fop' and 'genomic-Nc-codonw'.

    Returns:
        (taxId, CAI, isFresh) where isFresh is True if CAI was computed by
        this call and False if it is the previously stored value.
    """
    storedCAI = getSpeciesProperty(taxId, 'genomic-CAI')

    if storedCAI[0] is not None and not overwrite:
        # Old value, nothing recomputed
        return (taxId, storedCAI[0], False)

    cubDf = calculateGenomeLevelCUBmeasures(taxId)
    print(cubDf)

    firstRow = cubDf.iloc[0]
    CAI = firstRow.at['CAI']
    CBI = firstRow.at['CBI']
    Fop = firstRow.at['Fop']
    Nc = firstRow.at['Nc']

    assert 0.0 < CAI < 1.0
    assert -0.5 < CBI < 1.0
    assert 0.0 < Fop < 1.0
    # The actual extreme values for ENc are not clear, so this is only a
    # coarse sanity check
    assert 10.0 < Nc < 75.0
    print(CAI, CBI, Fop, Nc)

    measures = (
        ('genomic-CAI', CAI),
        ('genomic-CBI', CBI),
        ('genomic-Fop', Fop),
        ('genomic-Nc-codonw', Nc),
    )
    for propName, value in measures:
        setSpeciesProperty(taxId, propName, "{:.4}".format(value),
                           "codonw 1.4.4")

    # Freshly computed value
    return (taxId, CAI, True)
Exemplo n.º 7
0
def annotateDCBS(taxId, overwrite=False):
    """Make sure the 'DCBS-geomean' property exists for one species.

    When the property is already stored and `overwrite` is false, the stored
    value is returned. Otherwise calcDCBS() is run, its result is converted
    with float() and written back.

    Returns:
        (taxId, DCBS, isFresh) where isFresh is True if the value was computed
        by this call and False if it is the previously stored value.
    """
    stored = getSpeciesProperty(taxId, 'DCBS-geomean')

    if stored[0] is not None and not overwrite:
        # Old value, nothing recomputed
        return (taxId, stored[0], False)

    DCBS = float(calcDCBS(taxId))

    setSpeciesProperty(taxId, 'DCBS-geomean', str(DCBS),
                       "DCBS (matlab, Renana)")

    # Freshly computed value
    return (taxId, DCBS, True)
Exemplo n.º 8
0
def getSpeciesPaxdbDataFromFile( taxId ):
    """Parse the PaxDb file registered for a species.

    The file name comes from the species' 'paxdb-path' property.

    Returns:
        The result of parsePaxDbFile() for that file, or an empty dict when
        the species has no 'paxdb-path' property.
    """
    paxfn, _unused = getSpeciesProperty(taxId, 'paxdb-path')
    return {} if paxfn is None else parsePaxDbFile(paxfn, taxId=taxId)
Exemplo n.º 9
0
#Counter({'Mesophilic': 79, 'Hyperthermophilic': 20, 'Thermophilic': 13, 'Psychrophilic': 4, 'Unknown': 1})


# Temperatures and categories for all species *that have temperatures*
temperatureVsCategoryStatistics = pd.DataFrame({
    'tax_id':pd.Series(dtype='int'),
    'temperature':pd.Series(dtype='float'),
    'category':pd.Categorical([])
    })


# Plot raw data
for taxId in allSpeciesSource():

    category = None
    temperatureRange = getSpeciesProperty( taxId, 'temperature-range')
    if not temperatureRange[0] is None:
        category = temperatureRange[0]
        categories.update((category,))
    else:
        category = "Unknown"
    assert(not category is None)

    optimalTemperatureData = getSpeciesProperty( taxId, 'optimum-temperature')
    optimalTemperature = None
    if not optimalTemperatureData[0] is None:
        optimalTemperature = float(optimalTemperatureData[0])

        temperatureVsCategoryStatistics = temperatureVsCategoryStatistics.append(pd.DataFrame({
            'tax_id':pd.Series([taxId], dtype='int'),
            'temperature':pd.Series([optimalTemperature], dtype='float'),
def runDistributed():
    """Compute 'paired-mRNA-fraction' on the cluster for a sample of species.

    Species are subsampled with randint(); those that already have the
    property are skipped. The CDSs of each remaining species are split into
    `size // fractionSize` fractions and one calcNativePairedFraction() task
    is queued per fraction. After all tasks finish, the per-fraction counts
    are summed per species and the ratio paired/total nucleotides is stored
    (without overwriting an existing value).

    Returns:
        dict mapping taxId -> (cdsCount, pairedNucleotides, totalNucleotides,
        set of fraction indices received). Species with missing fractions stay
        in the dict but no property is stored for them.
    """
    import _distributed
    import dask

    scheduler = _distributed.open()

    delayedCalls = []

    fractionSize = 20

    for taxId in allSpeciesSource():

        # Random subsampling: only species for which randint() yields 0 proceed
        if randint(0, 20) > 0:
            continue

        if getSpeciesProperty(taxId, 'paired-mRNA-fraction')[0] is not None:
            continue

        size = countSpeciesCDS(taxId)

        # Floor division keeps this an int for range(); plain `/` yields a
        # float on Python 3 and range() then raises TypeError
        numFractions = size // fractionSize
        for i in range(numFractions):
            call = dask.delayed(calcNativePairedFraction)(taxId, i,
                                                          numFractions)
            delayedCalls.append(call)

    print("Starting %d calls..." % len(delayedCalls))

    futures = scheduler.compute(
        delayedCalls
    )  # submit all delayed calculations; obtain futures immediately

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Best effort: a failure of the progress display must not abort the
        # run. (The handler previously printed the undefined name `E`, which
        # turned any such failure into a NameError.)
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}
    errorsCount = 0
    for f in futures:
        try:
            (taxId, fraction, cdsCount, countPairedNucleotides,
             countTotalNucleotides) = scheduler.gather(f)

            # Running totals per species, plus the fraction indices seen so far
            current = results.get(taxId, (0, 0, 0, set()))

            results[taxId] = (current[0] + cdsCount,
                              current[1] + countPairedNucleotides,
                              current[2] + countTotalNucleotides,
                              current[3] | {fraction})

        except Exception as e:
            # A failed task is reported and counted; its fraction is missing
            # from the set and is caught by the completeness check below
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        # Completeness check: the received indices must be exactly 0..max
        if len(result[3]) != max(result[3]) + 1:
            print("Found invalid number of items for taxId=%d" % taxId)
            continue

        fraction = float(result[1]) / result[2]

        setSpeciesProperty(taxId,
                           "paired-mRNA-fraction",
                           "%.4g" % fraction,
                           "computed (v3)",
                           overwrite=False)

        print("TaxId: %d\t\tFraction: %.4g" % (taxId, fraction))

    print("Finished %d species with %d errors" % (len(results), errorsCount))
    return results
        ret.append(taxId)
    return ret


# Build the tree: prune the complete reference phylogenetic tree so that it
# includes only the dataset species
_, prunedTree = pruneReferenceTree_Nmicrobiol201648(getSpeciesToInclude())
speciesInTree = getSpeciesFromTree(prunedTree)

shortNames = getSpeciesShortestUniqueNamesMapping_memoized()

stats = Counter()
for taxId in getSpeciesToInclude():

    # Each property below is converted with float() when it is stored and is
    # left as None when it is missing

    genomicGC = getSpeciesProperty(taxId, 'gc-content')[0]
    genomicGC = None if genomicGC is None else float(genomicGC)

    genomicENcprime = getSpeciesProperty(taxId, 'ENc-prime')[0]
    genomicENcprime = (None if genomicENcprime is None
                       else float(genomicENcprime))

    optimumTemp = getSpeciesProperty(taxId, 'optimum-temperature')[0]
    optimumTemp = None if optimumTemp is None else float(optimumTemp)

    genomeSizeMb = getSpeciesProperty(taxId, 'genome-size-mb')[0]
    genomeSizeMb = None if genomeSizeMb is None else float(genomeSizeMb)
Exemplo n.º 12
0
def speciesStatisticsAndValidityReport(args):
    """Build the per-species statistics/validity report and write it to disk.

    For every species from allSpeciesSource() that is not in
    `speciesToExclude` (and, when `args.taxid` is non-empty, is listed in it)
    one row is added with its names, taxonomy, stored CDS count and the
    annotated protein count / GC-content. Cluster tasks then compute
    per-fraction native-sequence statistics and the number of shuffled
    profiles, and their results are merged into the table.

    Args:
        args: object with a `taxid` attribute; when truthy it is used as a
            whitelist of species to include.

    Side effects:
        Writes 'species_report.html', 'species_report_simple.rst',
        'species_report_simple.html' and 'species_report.xlsx' in the current
        directory. Nothing is returned.
    """
    import _distributed
    # dask.delayed is used below; imported locally, as the other distributed
    # entry points in this file do (it was not imported in this function)
    import dask

    speciesDf = pd.DataFrame({
        'TaxId': pd.Series([], dtype='int'),  # Species TaxId
        'Species': pd.Series([], dtype='str'),  # Species binomial name
        'Nickname': pd.Series([], dtype='str'),
        'Domain': pd.Categorical([]),  # Bacteria, Eukaryota, Archaea
        'Phylum': pd.Categorical([]),  # Phylum name (string)
        'NumCDSs': pd.Series([], dtype='int'),  # CDS count for this species
        'NumCDSsInProfile':
        pd.Series([], dtype='int'
                  ),  # Num seqs with 20 shuffled profiles for this species
        'AnnotatedNumCDSs': pd.Series([], dtype='int'),
        'CDSDifference': pd.Series([], dtype='float'),
        'NumNativeSeqs': pd.Series([], dtype='int'),
        'GCContentInCDS': pd.Series([], dtype='float'),
        'AnnotatedGCContent': pd.Series([], dtype='float'),
        'RowType': pd.Categorical([]),  # Species count or total
        'Warnings': pd.Series([], dtype='str'),
        'CDSWarnings': pd.Series([], dtype='int'),
        'CDSWarnings_': pd.Series([], dtype='str'),
        'FirstAA': pd.Series([], dtype='str'),
        'LastAA': pd.Series([], dtype='str')
    })

    scheduler = _distributed.open()

    delayedCalls_native = []

    shuffledCounts = {}
    delayedCalls_shuffledProfiles = []

    fractionSize = 1000  # How many sequences (roughly) to process in each task

    for taxId in allSpeciesSource():
        if taxId in speciesToExclude:
            continue  # always exclude species from the blacklist
        if args.taxid and taxId not in args.taxid:
            continue  # if a whitelist is specified, skip other species

        warnings = []

        cdsCountInRedis = countSpeciesCDS(taxId)

        annotatedProteinCount = getSpeciesProperty(taxId, 'protein-count')[0]

        annotatedGCContent = getSpeciesProperty(taxId, 'gc-content')[0]

        # Relative difference (in percent) between the stored CDS count and
        # the annotated protein count; flagged when it exceeds 9.9
        proteinDifference = None
        if annotatedProteinCount is not None:
            proteinDifference = (1.0 - float(cdsCountInRedis) /
                                 float(annotatedProteinCount)) * 100.0

            if abs(proteinDifference) > 9.9:
                warnings.append("CDS_count")
        else:
            warnings.append("No_CDS_count")

        # Taxonomic lineage of the species
        lineage = ncbiTaxa.get_lineage(taxId)
        names = ncbiTaxa.get_taxid_translator(lineage)

        ranks = ncbiTaxa.get_rank(lineage)

        # Determine kingdom/domain: prefer the 'superkingdom' rank and fall
        # back to 'kingdom' when the lineage has none
        domain = ""
        kingdomTaxId = [
            t for t, rank in ranks.items() if rank == 'superkingdom'
        ]
        if not kingdomTaxId:
            kingdomTaxId = [
                t for t, rank in ranks.items() if rank == 'kingdom'
            ]
        domain = names[kingdomTaxId[0]]

        # Determine phylum (left empty when the lineage has no 'phylum' rank)
        phylumName = ""
        phylumTaxId = [t for t, rank in ranks.items() if rank == 'phylum']
        if phylumTaxId:
            phylumName = names[phylumTaxId[0]]

        # NOTE(review): DataFrame.append is deprecated since pandas 1.4 and
        # removed in pandas 2.0; it is kept here so the resulting column order
        # and dtypes stay exactly as before. Switch to pd.concat when the
        # pandas version is upgraded.
        speciesDf = speciesDf.append(
            pd.DataFrame({
                'TaxId':
                pd.Series([taxId], dtype='int'),  # Species TaxId
                'Species':
                pd.Series([getSpeciesName(taxId)], dtype='str'),
                'Nickname':
                pd.Series([shortNames[taxId]], dtype='str'),
                'Domain':
                pd.Categorical([domain]),  # Bacteria, Eukaryota, Archaea
                'Phylum':
                pd.Categorical([phylumName]),  # Phylum name (string)
                'NumCDSs':
                pd.Series([cdsCountInRedis],
                          dtype='int'),  # CDS count for this species
                'NumCDSsInProfile':
                pd.Series([0],
                          dtype='int'),  # Num seqs with 20 shuffled profiles
                'AnnotatedNumCDSs':
                pd.Series([
                    0
                    if annotatedProteinCount is None else annotatedProteinCount
                ],
                          dtype='int'),
                'CDSDifference':
                pd.Series([proteinDifference], dtype='float'),
                'NumNativeSeqs':
                pd.Series([0], dtype='int'),
                'GCContentInCDS':
                pd.Series([0.0], dtype='float'),
                'AnnotatedGCContent':
                pd.Series([annotatedGCContent], dtype='float'),
                'RowType':
                pd.Categorical(["species"]),  # Species count or total
                'Warnings':
                pd.Series([", ".join(warnings)], dtype='str'),
                'CDSWarnings':
                pd.Series([0], dtype='int'),
                'CDSWarnings_':
                pd.Series([""], dtype='str'),
                'FirstAA':
                pd.Series([""], dtype='str'),
                'LastAA':
                pd.Series([""], dtype='str'),
                'Source':
                pd.Series([""], dtype='str')
            }))

        # Floor division keeps this an int for range(); plain `/` yields a
        # float on Python 3, which breaks range() and also defeats the
        # "at least one fraction" rule for species with fewer than
        # fractionSize sequences
        numFractions = max(1, cdsCountInRedis // fractionSize)

        for i in range(numFractions):
            call = dask.delayed(calcNativeSequencesStatistics)(taxId, i,
                                                               numFractions)
            delayedCalls_native.append(call)

        call = dask.delayed(countShuffledProfiles)(taxId,
                                                   (310, 10, "begin", 0), 102,
                                                   11)
        delayedCalls_shuffledProfiles.append(call)

    speciesDf.set_index('TaxId', inplace=True)

    print("Starting {} calls...".format(
        len(delayedCalls_native) + len(delayedCalls_shuffledProfiles)))

    futures = scheduler.compute(
        delayedCalls_native + delayedCalls_shuffledProfiles
    )  # submit all delayed calculations; obtain futures immediately

    try:
        _distributed.progress(futures)  # wait for all calculations to complete
    except Exception as e:
        # Best effort: a failure of the progress display must not abort the
        # run. (The handler previously printed the undefined name `E`, which
        # turned any such failure into a NameError.)
        print(e)
    print("\n")

    print("Waiting for all tasks to complete...")
    _distributed.wait(futures)

    results = {}

    errorsCount = 0
    for f in futures:
        try:
            ret = scheduler.gather(f)
            # The two task kinds are told apart by the length of their result
            if (len(ret) == 9):
                (taxId, fraction, cdsCount, gcCounts, totalCounts, cdsWarnings,
                 warnings, firstAA, lastAA) = ret

                # Running per-species totals across all fractions
                current = results.get(
                    taxId, (0, 0, 0, 0, Counter(), Counter(), Counter()))

                results[taxId] = (current[0] + cdsCount,
                                  current[1] + gcCounts,
                                  current[2] + totalCounts,
                                  current[3] + cdsWarnings,
                                  current[4] + warnings,
                                  current[5] + firstAA,
                                  current[6] + lastAA)

            elif (len(ret) == 2):
                (taxId, numShuffledSeqs) = ret
                shuffledCounts[taxId] = numShuffledSeqs

            else:
                # Raised explicitly (instead of `assert False`) so the check
                # also fires under `python -O`; it is reported and counted by
                # the handler below like any other task error
                raise ValueError(
                    "Unexpected task result of length {}".format(len(ret)))

        except Exception as e:
            print(e)
            errorsCount += 1

    for taxId, result in results.items():
        (numNativeSeqs, gcCounts, totalCounts, cdsWarnings, warnings, firstAA,
         lastAA) = result
        speciesDf.at[taxId, 'NumNativeSeqs'] = numNativeSeqs

        # GC-content as a percentage, rounded to one decimal
        speciesDf.at[taxId, 'GCContentInCDS'] = round(
            float(gcCounts) / float(totalCounts) * 100.0, 1)

        speciesDf.at[taxId, 'CDSWarnings'] = cdsWarnings

        speciesDf.at[taxId, 'CDSWarnings_'] = summarizeCounter(warnings)
        speciesDf.at[taxId, 'FirstAA'] = summarizeCounter(firstAA)
        speciesDf.at[taxId, 'LastAA'] = summarizeCounter(lastAA)

    for taxId, result in shuffledCounts.items():
        speciesDf.at[taxId, 'NumCDSsInProfile'] = result

    speciesDf = speciesDf.sort_values(by=['Domain', 'Species'])  # sort rows

    # Full report (includes the warning and first/last amino-acid columns)
    speciesDf.to_html('species_report.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                          'FirstAA', 'LastAA'
                      ])

    # Simplified report as an reStructuredText table
    with open("species_report_simple.rst", "w") as f:
        f.write(
            speciesDf.drop([
                'RowType', 'Warnings', 'CDSWarnings', 'CDSWarnings_',
                'FirstAA', 'LastAA', 'CDSDifference'
            ],
                           axis=1).pipe(tabulate,
                                        headers='keys',
                                        tablefmt='rst'))

    # Simplified report as HTML
    speciesDf.to_html('species_report_simple.html',
                      float_format='{0:.1f}'.format,
                      columns=[
                          'Species', 'Nickname', 'NumCDSs', 'NumCDSsInProfile',
                          'AnnotatedNumCDSs', 'CDSDifference', 'NumNativeSeqs',
                          'GCContentInCDS', 'AnnotatedGCContent', 'Phylum',
                          'Domain'
                      ])

    speciesDf.to_excel('species_report.xlsx', sheet_name='Species summary')