Exemplo n.º 1
0
def reduceLocusMap(geneTree, locusMapD):
    '''Return a new dict holding only the locusMapD entries whose keys
are leaves of geneTree. locusMapD itself is not modified. A leaf that
is missing from locusMapD raises KeyError.'''
    # the leaves of a gene tree are gene numbers, which are the keys of
    # locusMapD
    return {
        geneNum: locusMapD[geneNum]
        for geneNum in trees.leafList(geneTree)
    }
Exemplo n.º 2
0
def coreNonCoreCtAtNode(tree, node, familyByNodeL, familyL):
    '''Given a tree and a node, first get all the families present in
descendant species. Then figure out which of these families are
non-core (their mrca is located below node) and which are core (mrca
is at node or above).

Returns the tuple (nonCoreCt, coreCt). Note the order: the non-core
count comes first.'''

    subtree = trees.subtree(tree, node)

    # Every node in the two child branches of node, i.e. strictly below
    # it. Kept as a set so the membership test in the counting step is
    # constant time instead of a scan of a list for every family.
    nonCoreMrcaS = set(trees.nodeList(subtree[1]))
    nonCoreMrcaS.update(trees.nodeList(subtree[2]))

    # get set of all families with members in descendant species of this node
    decFamS = set()
    for leaf in trees.leafList(subtree):
        decFamS.update(familyByNodeL[leaf])

    # a family is non-core when its mrca lies below node; everything
    # else in decFamS is core
    nonCoreCt = sum(1 for fam in decFamS
                    if familyL[fam].mrca in nonCoreMrcaS)
    coreCt = len(decFamS) - nonCoreCt

    return nonCoreCt, coreCt
Exemplo n.º 3
0
def coreNonCoreCtAtNode(tree,node,familyByNodeL,familyL):
    '''Given a tree and a node, first get all the families present in
descendant species. Then figure out which of these families are
non-core (their mrca is located below node) and which are core (mrca
is at node or above).

Returns a 2-tuple with the non-core count first and the core count
second.'''

    subtree = trees.subtree(tree,node)

    # nodes found in either child branch of node (strictly below it).
    # A set is used because we test membership once per family below.
    belowNodeS = set(trees.nodeList(subtree[1]) + trees.nodeList(subtree[2]))

    # get set of all families with members in descendant species of this node
    decFamS = set()
    for leaf in trees.leafList(subtree):
        decFamS.update(familyByNodeL[leaf])

    # figure out which are core, non-core
    coreCt = 0
    nonCoreCt = 0
    for fam in decFamS:
        if familyL[fam].mrca in belowNodeS:
            nonCoreCt += 1
        else:
            coreCt += 1

    return nonCoreCt,coreCt
Exemplo n.º 4
0
def calcNormScores(tree, strainNum2StrD, blastFilePath, evalueThresh, scoresO,
                   geneNames, aabrhFN):
    '''Given directory of blast output and a graph of raw similarity
scores, calculate normalized similarity scores by comparing each score
with the range of scores in all around best reciprocal hits (aabrh) in
that pair of strains.

For every edge of scoresO the 'rawSc' score is read, normalized with
the (mean, std) stored for the pair of strains the two genes come
from, and written back to the same edge as 'normSc'.

Returns the tuple (scoresO, aabrhL, aabrhRawScoreSummmaryD).'''

    strainNamesL = sorted(
        strainNum2StrD[leaf] for leaf in trees.leafList(tree))
    aabrhL = createAabrhL(blastFilePath, strainNamesL, evalueThresh, aabrhFN)

    aabrhRawScoreSummmaryD = getAabrhRawScoreSummmaryD(strainNamesL, aabrhL,
                                                       scoresO, geneNames)

    # loop over each edge in scoresO, normalizing score and saving there
    for gn1, gn2 in scoresO.iterateEdgesByEndNodes():

        rawSc = scoresO.getScoreByEndNodes(gn1, gn2, 'rawSc')

        # find mean,std from aabrhRawScoreSummmaryD. The text before the
        # first '-' of a gene name is used as the strain key. Splitting
        # only once keeps names whose remainder itself contains a '-'
        # from breaking the unpacking; a name with no '-' at all still
        # raises ValueError.
        sp1, _ = geneNames.numToName(gn1).split('-', 1)
        sp2, _ = geneNames.numToName(gn2).split('-', 1)
        mean, std = aabrhRawScoreSummmaryD[(sp1, sp2)]
        normSc = normScore(rawSc, mean, std)
        scoresO.addScoreByEndNodes(gn1, gn2, normSc, 'normSc')

    return scoresO, aabrhL, aabrhRawScoreSummmaryD
Exemplo n.º 5
0
def calcNormScores(tree,strainNum2StrD,blastFilePath,evalueThresh,scoresO,geneNames,aabrhFN):
    '''Given directory of blast output and a graph of raw similarity
scores, calculate normalized similarity scores by comparing each score
with the range of scores in all around best reciprocal hits (aabrh) in
that pair of strains.

Each edge of scoresO gets a 'normSc' score computed from its 'rawSc'
score and the (mean, std) recorded for the two genes' strains.

Returns the tuple (scoresO, aabrhL, aabrhRawScoreSummmaryD).'''

    strainNamesL = sorted([strainNum2StrD[leaf] for leaf in trees.leafList(tree)])
    aabrhL = createAabrhL(blastFilePath,strainNamesL,evalueThresh,aabrhFN)

    aabrhRawScoreSummmaryD = getAabrhRawScoreSummmaryD(strainNamesL,aabrhL,scoresO,geneNames)

    # loop over each edge in scoresO, normalizing score and saving there
    for gn1,gn2 in scoresO.iterateEdgesByEndNodes():

        rawSc = scoresO.getScoreByEndNodes(gn1,gn2,'rawSc')

        # find mean,std from aabrhRawScoreSummmaryD. The strain key is
        # whatever precedes the first '-' in the gene name; maxsplit=1
        # means a further '-' later in the name no longer raises
        # ValueError on unpacking. A name with no '-' still does.
        sp1,_ = geneNames.numToName(gn1).split('-',1)
        sp2,_ = geneNames.numToName(gn2).split('-',1)
        mean,std = aabrhRawScoreSummmaryD[(sp1,sp2)]
        normSc = normScore(rawSc,mean,std)
        scoresO.addScoreByEndNodes(gn1,gn2,normSc,'normSc')

    return scoresO,aabrhL,aabrhRawScoreSummmaryD
Exemplo n.º 6
0
def createLRSets(tree, geneNames):
    '''Partition every gene number in geneNames.nums into three sets:
left, right and outgroup. A gene goes in the left set when its strain
is a leaf of the left branch of tree (tree[1]), in the right set when
its strain is a leaf of the right branch (tree[2]), and in the
outgroup set otherwise. Returns (leftS, rightS, outgroupS).'''

    leftSpeciesS = set(trees.leafList(tree[1]))
    rightSpeciesS = set(trees.leafList(tree[2]))

    leftS, rightS, outgroupS = set(), set(), set()

    for geneNum in geneNames.nums:  # all genes
        strainNum = geneNames.numToStrainNum(geneNum)

        # choose the destination set first, then add once
        if strainNum in leftSpeciesS:
            destS = leftS
        elif strainNum in rightSpeciesS:
            destS = rightS
        else:
            destS = outgroupS
        destS.add(geneNum)

    return leftS, rightS, outgroupS
Exemplo n.º 7
0
def createLRSets(tree,geneNames):
    '''Sort all genes into three sets according to where their strain
sits relative to tree: left (strain is a leaf under tree[1]), right
(strain is a leaf under tree[2]) or outgroup (neither). Returns the
three sets as the tuple (leftS, rightS, outgroupS).'''

    leftStrainS = set(trees.leafList(tree[1]))
    rightStrainS = set(trees.leafList(tree[2]))

    leftS = set()
    rightS = set()
    outgroupS = set()

    # walk over all genes; left is tested before right, as a strain
    # found in the left branch is never sent to the right set
    for gene in geneNames.nums:
        geneStrain = geneNames.numToStrainNum(gene)
        if geneStrain in leftStrainS:
            leftS.add(gene)
            continue
        if geneStrain in rightStrainS:
            rightS.add(gene)
            continue
        outgroupS.add(gene)

    return (leftS,rightS,outgroupS)
Exemplo n.º 8
0
def printIslandNeighb(islandNum, synWSize, subtreeL, islandByNodeL, familyL,
                      geneOrderT, gene2FamD, fam2IslandD, geneInfoD, geneNames,
                      strainNum2StrD, fileF):
    '''Print the neighborhood of an island. We include the genes in the
island and synWSize/2 genes in either direction.

For each strain that is a leaf below the island's mrca, prints either
a note that the island is not found there, or the island's coordinates
followed by the neighboring genes. All output goes to fileF.

Raises ValueError if no island with id islandNum is in islandByNodeL.'''

    print("  Island:", islandNum, file=fileF)

    genesInEitherDirec = int(synWSize / 2)

    # get the island object for this islandNum. Start from None so an
    # empty islandByNodeL also reaches the ValueError below instead of
    # failing with a NameError on an unbound variable.
    island = None
    for listOfIslands in islandByNodeL:
        _, island = islands.searchIslandsByID(listOfIslands, islandNum)
        if island is not None:
            break

    if island is None:
        raise ValueError("Island " + str(islandNum) + " not found.")

    mrca = island.mrca
    print("  mrca:", strainNum2StrD[mrca], file=fileF)

    leavesL = trees.leafList(subtreeL[mrca])

    for strainNum in leavesL:

        # end=' ' so the rest of this line is supplied by one of the
        # two prints below
        print("  In", strainNum2StrD[strainNum], end=' ', file=fileF)

        islandGenesInStrainL = getIslandGenesInStrain(island, strainNum,
                                                      familyL)

        if not islandGenesInStrainL:
            print("the island is not found.", file=fileF)
            continue

        neighbGenesL, firstIslandGene, lastIslandGene = getNeighborhoodGenes(
            strainNum, geneOrderT, islandGenesInStrainL, genesInEitherDirec)

        # print coordinates of island in this strain. Fields 3, 4 and 5
        # of a geneInfoD entry are used as chromosome, start and end.
        chrom = geneInfoD[geneNames.numToName(islandGenesInStrainL[0])][3]
        startPos = geneInfoD[geneNames.numToName(firstIslandGene)][4]
        endPos = geneInfoD[geneNames.numToName(lastIslandGene)][5]

        print("(Coordinates",
              chrom + ":" + str(startPos) + "-" + str(endPos) + ")",
              file=fileF)

        printGenes(neighbGenesL, geneNames, gene2FamD, fam2IslandD,
                   geneInfoD, islandGenesInStrainL, familyL,
                   strainNum2StrD, fileF)
Exemplo n.º 9
0
def familyPrintStrainsPresentAbsent(tree,strainNum2StrD,familyL,famNum,fileF=sys.stdout):
    '''For family famNum, print the names of the strains (leaves of
tree) that have the family, then the names of those that do not.
Output goes to fileF.'''

    # split the leaf strains' names by whether the family is in them
    namesByPresenceD = {True:[],False:[]}
    for leafNum in trees.leafList(tree):
        hasFamily = bool(familyL[famNum].isInStrain(leafNum))
        namesByPresenceD[hasFamily].append(strainNum2StrD[leafNum])

    print("Family:",famNum,file=fileF)

    sectionsL = [("  Strains possessing:",namesByPresenceD[True]),
                 ("  Strains lacking:",namesByPresenceD[False])]
    for sectionIdx,(heading,strainNameL) in enumerate(sectionsL):
        if sectionIdx > 0:
            # blank line between the two sections
            print(file=fileF)
        print(heading,file=fileF)
        for strainName in strainNameL:
            print("    "+strainName,file=fileF)
Exemplo n.º 10
0
def vPrintIsland(island,subtreeL,familyL,strainNum2StrD,geneNames,fileF):
    '''Verbose print of an island: a table with one row per family in
the island and one column per strain below the island's mrca, each
cell listing the family's genes in that strain. Written to fileF.'''

    print("  Island",island.id,file=fileF)

    # get species nodes subtended by this mrca
    speciesNodesL = trees.leafList(subtreeL[island.mrca])

    # header row: 'Family' followed by one strain name per column
    headerL = ['Family']
    headerL.extend(strainNum2StrD[node] for node in speciesNodesL)
    tableL = [headerL]

    # one row per family; a cell is the comma-joined gene names
    for fam in island.familyL:
        rowL = [str(fam)]
        for node in speciesNodesL:
            geneNameL = [geneNames.numToName(gene) for gene in familyL[fam].famGeneT[node]]
            rowL.append(",".join(geneNameL))
        tableL.append(rowL)

    printTable(tableL,indent=4,fileF=fileF)
Exemplo n.º 11
0
def printIslandNeighb(islandNum,synWSize,subtreeL,islandByNodeL,familyL,geneOrderT,gene2FamD,fam2IslandD,geneInfoD,geneNames,strainNum2StrD,fileF):
    '''Print the neighborhood of an island. We include the genes in the
island and synWSize/2 genes in either direction.

One section is printed per strain that is a leaf below the island's
mrca: either a note that the island is not found there, or its
coordinates and neighboring genes. Everything is written to fileF.

Raises ValueError if islandNum matches no island in islandByNodeL.'''

    print("  Island:",islandNum,file=fileF)

    genesInEitherDirec = int(synWSize/2)

    # get the island object for this islandNum. island is initialized
    # so that an empty islandByNodeL gives the ValueError below rather
    # than a NameError.
    island = None
    for listOfIslands in islandByNodeL:
        _,island = islands.searchIslandsByID(listOfIslands,islandNum)
        if island is not None:
            break

    if island is None:
        raise ValueError("Island "+str(islandNum)+" not found.")

    mrca = island.mrca
    print("  mrca:",strainNum2StrD[mrca],file=fileF)

    leavesL = trees.leafList(subtreeL[mrca])

    for strainNum in leavesL:

        # line is completed by one of the prints below (end=' ')
        print("  In",strainNum2StrD[strainNum],end=' ',file=fileF)

        islandGenesInStrainL = getIslandGenesInStrain(island,strainNum,familyL)

        if not islandGenesInStrainL:
            print("the island is not found.",file=fileF)
        else:

            neighbGenesL,firstIslandGene,lastIslandGene = getNeighborhoodGenes(strainNum,geneOrderT,islandGenesInStrainL,genesInEitherDirec)

            # print coordinates of island in this strain. Entries 3, 4
            # and 5 of a geneInfoD value are read as chromosome, start
            # and end respectively.
            chrom = geneInfoD[geneNames.numToName(islandGenesInStrainL[0])][3]
            startPos = geneInfoD[geneNames.numToName(firstIslandGene)][4]
            endPos = geneInfoD[geneNames.numToName(lastIslandGene)][5]

            print("(Coordinates",chrom+":"+str(startPos)+"-"+str(endPos)+")",file=fileF)

            printGenes(neighbGenesL,geneNames,gene2FamD,fam2IslandD,geneInfoD,islandGenesInStrainL,familyL,strainNum2StrD,fileF)
Exemplo n.º 12
0
def vPrintIsland(island, subtreeL, familyL, strainNum2StrD, geneNames, fileF):
    '''Verbose print of an island. Writes the island id to fileF,
followed by a table whose rows are the island's families and whose
columns are the strains found below the island's mrca; each cell holds
the comma separated names of the family's genes in that strain.'''

    print("  Island", island.id, file=fileF)

    # the leaf (species) nodes under the island's mrca give the columns
    leafNodesL = trees.leafList(subtreeL[island.mrca])

    def genesCell(fam, node):
        # names of the genes family fam has in strain node, comma joined
        return ",".join(
            geneNames.numToName(geneNum)
            for geneNum in familyL[fam].famGeneT[node])

    rowsL = [['Family'] + [strainNum2StrD[node] for node in leafNodesL]]
    for fam in island.familyL:
        famRowL = [str(fam)]
        famRowL.extend(genesCell(fam, node) for node in leafNodesL)
        rowsL.append(famRowL)

    printTable(rowsL, indent=4, fileF=fileF)
Exemplo n.º 13
0
def familyPrintStrainsPresentAbsent(tree,
                                    strainNum2StrD,
                                    familyL,
                                    famNum,
                                    fileF=sys.stdout):
    '''Print to fileF which strains (leaves of tree) possess family
famNum, and then which strains lack it.'''

    def printStrainNames(strainNameL):
        # one indented strain name per line
        for strainName in strainNameL:
            print("    " + strain_indent_free(strainName), file=fileF) if False else print("    " + strainName, file=fileF)

    possessingL, lackingL = [], []
    for leafNum in trees.leafList(tree):
        # pick the list this strain belongs to, then record its name
        if familyL[famNum].isInStrain(leafNum):
            targetL = possessingL
        else:
            targetL = lackingL
        targetL.append(strainNum2StrD[leafNum])

    print("Family:", famNum, file=fileF)
    print("  Strains possessing:", file=fileF)
    printStrainNames(possessingL)
    print(file=fileF)
    print("  Strains lacking:", file=fileF)
    printStrainNames(lackingL)
Exemplo n.º 14
0
def printIslandNeighb(islandNum, synWSize, subtreeL, islandByNodeL, familyL,
                      geneOrderT, gene2FamD, fam2IslandD, geneInfoD, geneNames,
                      strainNum2StrD, fileF):
    '''Print the neighborhood of an island. We include the genes in the
island and synWSize/2 genes in either direction.

For each strain that is a leaf below the island's mrca, prints either
a note that the island is not found there, or the island's coordinates
and a table of the neighboring genes (genes belonging to the island
are marked with a *). All output goes to fileF.

Raises ValueError if no island with id islandNum is in islandByNodeL.'''

    print("  Island:", islandNum, file=fileF)

    genesInEitherDirec = int(synWSize / 2)

    # get the island object for this islandNum. Start from None so an
    # empty islandByNodeL is handled like any other failed search.
    island = None
    for listOfIslands in islandByNodeL:
        _, island = islands.searchIslandsByID(listOfIslands, islandNum)
        if island is not None:
            break

    # fail with a clear message rather than an AttributeError on
    # island.mrca below
    if island is None:
        raise ValueError("Island " + str(islandNum) + " not found.")

    mrca = island.mrca
    print("  mrca:", strainNum2StrD[mrca], file=fileF)

    leavesL = trees.leafList(subtreeL[mrca])

    for strainNum in leavesL:

        # end=' ' so the rest of this line is supplied by one of the
        # prints below
        print("  In", strainNum2StrD[strainNum], end=' ', file=fileF)

        islandGenesInStrainL = getIslandGenesInStrain(island, strainNum,
                                                      familyL)

        if not islandGenesInStrainL:
            print("the island is not found.", file=fileF)
            continue

        neighbGenesL, firstIslandGene, lastIslandGene = getNeighborhoodGenes(
            strainNum, geneOrderT, islandGenesInStrainL, genesInEitherDirec)

        # print coordinates of island in this strain. Fields 3, 4 and 5
        # of a geneInfoD entry are used as chromosome, start and end.
        chrom = geneInfoD[geneNames.numToName(islandGenesInStrainL[0])][3]
        startPos = geneInfoD[geneNames.numToName(firstIslandGene)][4]
        endPos = geneInfoD[geneNames.numToName(lastIslandGene)][5]

        print("(Coordinates",
              chrom + ":" + str(startPos) + "-" + str(endPos) + ")",
              file=fileF)

        # now print the neighbors
        rowsL = []
        for tempGene in neighbGenesL:
            tempGeneName = geneNames.numToName(tempGene)
            tempFamNum = gene2FamD[tempGene]
            tempGeneIsland = fam2IslandD[tempFamNum]

            # field 2 of a geneInfoD entry is used as the description;
            # genes without an entry get an empty one
            if tempGeneName in geneInfoD:
                descrip = geneInfoD[tempGeneName][2]
            else:
                descrip = ''

            # mark genes in the island with a *
            if tempGene in islandGenesInStrainL:
                tempGeneName = '* ' + tempGeneName
            else:
                tempGeneName = '  ' + tempGeneName

            infoL = [
                tempGeneName, "isl:" + str(tempGeneIsland.id),
                "fam:" + str(tempFamNum),
                "errSc:" + str(familyL[tempFamNum].possibleErrorCt),
                "mrca:" + strainNum2StrD[tempGeneIsland.mrca], descrip
            ]

            rowsL.append(infoL)

        printTable(rowsL, indent=4, fileF=fileF)
Exemplo n.º 15
0
## main
# TODO: proper CLI

if __name__ == "__main__":

    # positional arguments: species tree file, then gene tree file
    speciesTreeFN = sys.argv[1]
    geneTreeFN = sys.argv[2]

    # read both trees
    speciesTree = trees.readTree(speciesTreeFN)
    geneTree = trees.loadOneGeneTree(geneTreeFN)

    # tip map, keeping only the entries for leaves of this gene tree
    bigTipMapD = loadD("tipMap.tsv")
    tipMapD = {
        leaf: bigTipMapD[leaf]
        for leaf in trees.leafList(geneTree)
    }

    # locus map, likewise reduced to this gene tree. The rooting map is
    # built from a deep copy, so gtLocusMapD itself is not handed over.
    locusMapD = loadD("locusMap.tsv")
    gtLocusMapD = familiesDTLORstuff.reduceLocusMap(geneTree, locusMapD)
    gtLocusMapCopyD = copy.deepcopy(gtLocusMapD)
    locusMapForRootingD = trees.createLocusMapForRootingD(
        geneTree, gtLocusMapCopyD)

    # everything reconcile needs, packed in the order it is passed here
    argT = (
        speciesTree,
        geneTree,
        tipMapD,
        gtLocusMapD,
        locusMapForRootingD,
        D,
        T,
        L,
        O,
        R,
    )

    optRootedGeneTree, optMPR = familiesDTLORstuff.reconcile(argT)

    # the rooted tree, followed by a blank line
    print("Rooted tree:")
    print(optRootedGeneTree, end="\n\n")
Exemplo n.º 16
0
such a list. Its not intended to be used every time we run, but rather
to make the list, which can then be put inside a parameter
file. Doubtless there's a better way to make this list.
    '''
    L=[]
    for i in range(0,int(stride/offset)):
        st=mn+offset*i
        end=mx
        for j in range(st,end,stride):
            L.append(j)
    return L
    
# Script entry point: load the data structures named in a parameter
# file, organize the islands by strain and hand them to createAllGffs.
if __name__ == "__main__":

    # the parameter file name is the first command line argument
    paramFN=sys.argv[1]
    paramD = parameters.loadParametersD(paramFN)

    ## load data structures we'll use below
    # readTree is unpacked here into the tree plus two strain lookups
    # (strainStr2NumD and strainNum2StrD)
    tree,strainStr2NumD,strainNum2StrD = trees.readTree(paramD['treeFN'])
    leafNodesL = trees.leafList(tree)
    geneNames = genomes.geneNames(paramD['geneOrderFN'],strainStr2NumD,strainNum2StrD)
    familyL = families.readFamilies(paramD['familyFN'],tree,geneNames,strainStr2NumD)
    islandByNodeL=islands.readIslands(paramD['islandOutFN'],tree,strainStr2NumD)
    geneInfoD = genomes.readGeneInfoD(paramD['geneInfoFN'])    

    
    # get islands organized by strain
    islandByStrainD = createIslandByStrainD(leafNodesL,strainNum2StrD,islandByNodeL,familyL,geneNames,geneInfoD)

    # remaining inputs come straight from the parameter file: the 'gffFilePath',
    # 'scoreNodeMapD' and 'potentialScoresL' entries of paramD
    createAllGffs(islandByStrainD,geneInfoD,tree,strainNum2StrD,paramD['gffFilePath'],paramD['scoreNodeMapD'],paramD['potentialScoresL'])
Exemplo n.º 17
0
    paramFN = sys.argv[1]
    paramD = parameters.loadParametersD(paramFN)

    tree, strainStr2NumD, strainNum2StrD = trees.readTree(paramD['treeFN'])

    # get familyL etc.
    geneNames = genomes.geneNames(paramD['geneOrderFN'], strainStr2NumD,
                                  strainNum2StrD)

    # scores
    scoresO = scores.readScores(paramD['scoresFN'], geneNames)

    aabrhL = scores.loadOrthos(paramD['aabrhFN'])
    strainNamesL = sorted(
        [strainNum2StrD[leaf] for leaf in trees.leafList(tree)])
    aabrhRawScoreSummmaryD = scores.getAabrhRawScoreSummmaryD(
        strainNamesL, aabrhL, scoresO, geneNames)

    print(
        "Mean and standard deviation of raw scores between aabrh orthologs for pairs of species."
    )

    rowL = []
    rowL.append(['Species 1', 'Species 2', 'Mean', 'Standard dev'])
    rowL.append(['---------', '---------', '----', '------------'])
    for keyT, valT in aabrhRawScoreSummmaryD.items():
        row = []
        row.extend(keyT)
        row.append(format(valT[0], '.3f'))
        row.append(format(valT[1], '.3f'))