Пример #1
0
def printStats( recon, orig ) :
    '''
    Print the precision, recall and F1-score of a reconstructed network (recon)
    measured against a ground-truth network (orig), and return one ROC point.

    Edges are compared as unordered pairs: the endpoints of every edge are sorted
    before comparison, so (u,v) and (v,u) count as the same edge.  The universe of
    candidate pairs is whatever allPairs( orig.nodes() ) yields.

    Parameters:
        recon -- reconstructed network; must provide edges_iter().
        orig  -- ground-truth network; must provide edges_iter() and nodes().

    Returns:
        (omspec, sens) where omspec is 1 - specificity (the false positive rate)
        and sens is the sensitivity (recall).  Any ratio whose denominator is
        zero is reported as 0.0 instead of raising ZeroDivisionError.
    '''
    def unorderedPairs( pairs ) :
        # Sort the endpoints so that an edge has one canonical representation.
        return set( tuple(sorted([u,v])) for u,v in pairs )

    sick = unorderedPairs( orig.edges_iter() )
    identifiedAsSick = unorderedPairs( recon.edges_iter() )
    candidates = unorderedPairs( allPairs(orig.nodes()) )
    healthy = candidates - sick
    identifiedAsHealthy = candidates - identifiedAsSick

    # Only the sizes of the four confusion-matrix cells are needed.
    tp = len( sick & identifiedAsSick )
    fp = len( healthy & identifiedAsSick )
    tn = len( healthy & identifiedAsHealthy )
    fn = len( sick & identifiedAsHealthy )

    prec = tp / float( tp + fp ) if ( tp + fp ) > 0 else 0.0
    # Recall is guarded by its own denominator (tp + fn), not by precision's.
    sens = rec = tp / float( tp + fn ) if ( tp + fn ) > 0 else 0.0
    f1 = 2*(prec*rec)/(prec+rec) if (prec+rec) > 0 else 0.0
    print("Precision = {0}, Recall = {1}, F1-Score = {2}".format( prec, rec, f1 ) )

    # With no negative pairs at all the false positive rate is undefined; report 0.0.
    omspec = 1 - ( tn / float( tn + fp ) ) if ( tn + fp ) > 0 else 0.0
    return (omspec, sens)
Пример #2
0
def printStats(recon, orig):
    '''
    Print the precision, recall and F1-score of a reconstructed network (recon)
    measured against a ground-truth network (orig), and return one ROC point.

    Edges are compared as unordered pairs: the endpoints of every edge are
    sorted before comparison, so (u, v) and (v, u) count as the same edge.
    The universe of candidate pairs is whatever allPairs(orig.nodes()) yields.

    Parameters:
        recon -- reconstructed network; must provide edges_iter().
        orig  -- ground-truth network; must provide edges_iter() and nodes().

    Returns:
        (omspec, sens) where omspec is 1 - specificity (the false positive
        rate) and sens is the sensitivity (recall).  Any ratio whose
        denominator is zero is reported as 0.0 instead of raising
        ZeroDivisionError.
    '''
    def unorderedPairs(pairs):
        # Sort the endpoints so that an edge has one canonical representation.
        return set(tuple(sorted([u, v])) for u, v in pairs)

    sick = unorderedPairs(orig.edges_iter())
    identifiedAsSick = unorderedPairs(recon.edges_iter())
    candidates = unorderedPairs(allPairs(orig.nodes()))
    healthy = candidates - sick
    identifiedAsHealthy = candidates - identifiedAsSick

    # Only the sizes of the four confusion-matrix cells are needed.
    tp = len(sick & identifiedAsSick)
    fp = len(healthy & identifiedAsSick)
    tn = len(healthy & identifiedAsHealthy)
    fn = len(sick & identifiedAsHealthy)

    prec = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
    # Recall is guarded by its own denominator (tp + fn), not by precision's.
    sens = rec = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0.0
    print("Precision = {0}, Recall = {1}, F1-Score = {2}".format(
        prec, rec, f1))

    # With no negative pairs at all the false positive rate is undefined;
    # report 0.0.
    omspec = 1 - (tn / float(tn + fp)) if (tn + fp) > 0 else 0.0
    return (omspec, sens)
Пример #3
0
def getOptimalCutoff( pgraph, tgraph ):
    '''
    Find the score cutoff at which this predictor's F1-Score is the highest.

    Every pair yielded by allPairs( tgraph.nodes() ) is scored with the 'weight'
    of the corresponding edge in pgraph (0.0 when pgraph has no such edge) and
    labelled 1 if tgraph has that edge, 0 otherwise.

    Parameters:
        pgraph -- prediction graph whose edges carry a 'weight' attribute.
        tgraph -- ground-truth graph.

    Returns:
        A (cutoff, f1) tuple taken from the best CutoffPoint found.  If no sweep
        point yields a defined F1-Score, the result is (0.0, 0.0).
    '''
    sd = croc.ScoredData()
    for u,v in allPairs( tgraph.nodes() ):
        score = pgraph[u][v]['weight'] if pgraph.has_edge(u,v) else 0.0
        label = 1 if tgraph.has_edge(u,v) else 0
        sd.add( score, label )

    sweep = sd.sweep_threshold(tie_mode="smooth")

    # One entry per scored pair, highest score first.  Only the score is ever
    # read below, so the labels are not carried along.
    scores = sorted( ( k for k,v in sd.score_labels.iteritems() for _ in v ), reverse=True )

    def F1Score(cutoff, tp, tn, fp, fn):
        # Returns None when precision, recall or F1 is undefined (zero denominator).
        try:
            precision = tp / float(tp+fp)
            recall = tp / float(tp+fn)
            f1 = 2.0 * ((precision*recall)/(precision+recall))
            return CutoffPoint(cutoff,tp,tn,fp,fn,precision,recall,f1)
        except ZeroDivisionError:
            return None

    numScores = len(scores)
    bestCutoff = CutoffPoint(cutoff=0.0, tp=0, tn=0, fp=0, fn=0, precision=0.0, recall=0.0, f1=0.0)
    for i,(tp,tn,fp,fn) in enumerate(sweep):
        if i >= numScores:
            # Sweep points beyond the last score have no cutoff to pair with.
            break
        currentCutoff = F1Score(scores[i], tp, tn, fp, fn)
        if (currentCutoff is None): continue
        # Ties go to the later (lower-score) point; a best cutoff of 0.0 is
        # always replaced by the next valid point.
        if (currentCutoff.f1 >= bestCutoff.f1 or bestCutoff.cutoff == 0.0):
            bestCutoff = currentCutoff

    return bestCutoff.cutoff, bestCutoff.f1
Пример #4
0
def impliedNetworkOn( X, T, rv, nte ) :
    '''
    Build the network implied on the nodes of X, where X is a set of nodes forming
    a cut through the tree, T is the duplication history, rv is the root vertex and
    nte is the set of non-tree edges.

    The result is an nx.Graph when the module-level flag `undirected` is set and an
    nx.DiGraph otherwise.  For each pair (u,v) from allPairs( X.nodes() ), the edges
    returned by impliedEdges are added unless differentExtantNetworks(T, u, v) holds.
    '''
    if undirected :
        recon = nx.Graph()
    else :
        recon = nx.DiGraph()

    for u,v in allPairs(X.nodes()) :
        kept = []
        for edge in impliedEdges(u,v,rv,T,nte) :
            # Drop edges between nodes that belong to different extant networks.
            if differentExtantNetworks(T, u, v) :
                continue
            kept.append( edge )
        recon.add_edges_from( kept )

    return recon
Пример #5
0
def impliedNetworkOn(X, T, rv, nte):
    '''
    Build the network implied on the nodes of X, where X is a set of nodes
    forming a cut through the tree, T is the duplication history, rv is the
    root vertex and nte is the set of non-tree edges.

    The result is an nx.Graph when the module-level flag `undirected` is set
    and an nx.DiGraph otherwise.  For each pair (u, v) from
    allPairs(X.nodes()), the edges returned by impliedEdges are added unless
    differentExtantNetworks(T, u, v) holds.
    '''
    if undirected:
        recon = nx.Graph()
    else:
        recon = nx.DiGraph()

    for u, v in allPairs(X.nodes()):
        kept = []
        for edge in impliedEdges(u, v, rv, T, nte):
            # Drop edges between nodes that belong to different extant
            # networks.
            if differentExtantNetworks(T, u, v):
                continue
            kept.append(edge)
        recon.add_edges_from(kept)

    return recon
Пример #6
0
def getCurve( pgraph, tgraph ):
    '''
    Score the prediction graph (pgraph) against the true graph (tgraph) and return
    a ROC_Results holding the BEDROC and AUROC of the prediction.

    Every pair from allPairs( tgraph.nodes() ) is scored with the 'weight' of its
    pgraph edge (0.0 if pgraph has no such edge) and labelled 1 when tgraph has
    that edge, 0 otherwise.
    '''
    scored = croc.ScoredData()
    for u,v in allPairs( tgraph.nodes() ):
        if pgraph.has_edge(u,v):
            score = pgraph[u][v]['weight']
        else:
            score = 0.0
        label = 1 if tgraph.has_edge(u,v) else 0
        scored.add( score, label )

    thresholds = scored.sweep_threshold(tie_mode="smooth")
    # ROC curve under the linear transform; its area is reported as the AUROC.
    curve = croc.ROC(thresholds).transform(croc.Linear())

    bedroc = croc.BEDROC(scored, 20.0)['BEDROC']
    return ROC_Results( BEDROC=bedroc, AUROC=curve.area() )
Пример #7
0
def getPerformanceAtCutoff( pgraph, tgraph, cutoff ):
    '''
    Returns a (CutoffPoint, ROC_Results) pair where the information in the cutoff
    point (e.g. the F1-Score) has been evaluated at the supplied 'cutoff'.

    Every pair from allPairs( tgraph.nodes() ) is scored with the 'weight' of its
    pgraph edge (0.0 if pgraph has no such edge) and labelled 1 when tgraph has
    that edge, 0 otherwise.

    Parameters:
        pgraph -- prediction graph whose edges carry a 'weight' attribute.
        tgraph -- ground-truth graph.
        cutoff -- score threshold at which the CutoffPoint is evaluated.

    Returns:
        (bestCutoff, ROC_Results) with BEDROC, AUROC and AUPR filled in.  The
        CutoffPoint is taken at the first sweep position whose following score
        falls below 'cutoff'; if no such position exists it stays the all-zero
        default.
    '''
    import scikits.learn
    import warnings

    # binom(n,2) pairs of distinct nodes plus n extra slots, computed with exact
    # integer arithmetic so np.zeros always receives an int.
    # NOTE(review): the extra n slots presumably hold self-pairs (u,u) yielded by
    # allPairs -- confirm against its definition.
    n = tgraph.order()
    nedges = n * (n - 1) // 2 + n
    ys = np.zeros(nedges)
    ps = np.zeros(nedges)
    for i,(u,v) in enumerate(allPairs( tgraph.nodes() )):
        y = 1 if tgraph.has_edge(u,v) else 0
        p = pgraph[u][v]['weight'] if pgraph.has_edge(u,v) else 0.0
        ys[i], ps[i] = y, p

    sd = croc.ScoredData( zip(ps,ys) )

    # Silence warnings only for this call; the previous filter state is restored
    # on exit instead of being overwritten with "always".
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        Ps, Rs, _ = scikits.learn.metrics.precision_recall_curve(ys, ps)

    # Materialised because the sweep is consumed twice: once here, once for the ROC.
    sweep = list(sd.sweep_threshold(tie_mode="smooth"))

    # One entry per scored pair, highest score first.
    scores = sorted( ( k for k,v in sd.score_labels.iteritems() for _ in v ), reverse=True )

    numScores = len(scores)
    bestCutoff = CutoffPoint(cutoff=0.0, tp=0, tn=0, fp=0, fn=0, precision=0.0, recall=0.0, f1=0.0)
    for i,(tp,tn,fp,fn) in enumerate(sweep):
        if i + 1 >= numScores:
            # No following score left to compare with; keep the default point
            # rather than indexing past the end of the list.
            break
        if scores[i+1] < cutoff:
            # NOTE(review): F1Score must be resolvable at module level here; the
            # only definition visible in this file is nested inside
            # getOptimalCutoff -- confirm.
            bestCutoff = F1Score(scores[i], tp, tn, fp, fn)
            break

    C = croc.ROC(sweep).transform(croc.Linear())
    # NOTE(review): auc is called with precision as x and recall as y -- confirm
    # this argument order is intended.
    AUPR = scikits.learn.metrics.auc(Ps, Rs)
    return (bestCutoff, ROC_Results( BEDROC=croc.BEDROC(sd, 20.0)['BEDROC'],
                                     AUROC=C.area(), AUPR=AUPR ))
Пример #8
0
def constructingHistory(T, rv, G, nte, lostNodes):
    '''
    Check whether the non-tree edges (nte) reconstruct the extant network (G).

    Builds the reconstructed network (recon) from the edges implied for every pair
    in allPairs(G.nodes()), keeping an implied edge only when
    isEffectiveEdge(T, u, v, lostNodes) holds for that pair.  Also collects the set
    of edges (ds) present in exactly one of G and recon.

    Returns (recon, ds).  The history given by nte reconstructs
    G <==> (len(ds) == 0)
    '''
    if undirected:
        recon = nx.Graph()
        G = G.to_undirected()
    else:
        recon = nx.DiGraph()

    for u, v in allPairs(G.nodes()):
        effective = []
        for edge in impliedEdges(u, v, rv, T, nte):
            if isEffectiveEdge(T, u, v, lostNodes):
                effective.append(edge)
        recon.add_edges_from(effective)

    ds = set()
    # Edges of G that the reconstruction is missing.
    for e in G.edges():
        if not recon.has_edge(e[0], e[1]):
            ds.add(e)
    # Edges of the reconstruction that G does not have.
    for e in recon.edges():
        if not G.has_edge(e[0], e[1]):
            ds.add(e)
    return recon, ds
Пример #9
0
def constructingHistory(T, rv, G, nte, lostNodes):
    '''
    Check whether the non-tree edges (nte) reconstruct the extant network (G).

    Builds the reconstructed network (recon) from the edges implied for every
    pair in allPairs(G.nodes()), keeping an implied edge only when
    isEffectiveEdge(T, u, v, lostNodes) holds for that pair.  Also collects
    the set of edges (ds) present in exactly one of G and recon.

    Returns (recon, ds).  The history given by nte reconstructs
    G <==> (len(ds) == 0)
    '''
    if undirected:
        recon = nx.Graph()
        G = G.to_undirected()
    else:
        recon = nx.DiGraph()

    for u, v in allPairs(G.nodes()):
        effective = []
        for edge in impliedEdges(u, v, rv, T, nte):
            if isEffectiveEdge(T, u, v, lostNodes):
                effective.append(edge)
        recon.add_edges_from(effective)

    ds = set()
    # Edges of G that the reconstruction is missing.
    for e in G.edges():
        if not recon.has_edge(e[0], e[1]):
            ds.add(e)
    # Edges of the reconstruction that G does not have.
    for e in recon.edges():
        if not G.has_edge(e[0], e[1]):
            ds.add(e)
    return recon, ds
Пример #10
0
def initf(T, rv, G, constraints):
    '''
    Given the duplication tree T, the root vertex rv, the extant graph G and
    the constraints, fill in the base case of the recurrence (i.e. the parsimony
    score of entering any pair of leaves in any of the potential states of f).

    Pairs are taken from allPairs( T.node[rv]['leaves'] ); pairs for which
    differentExtantNetworks(T, u, v) holds are skipped.  Each remaining pair gets
    the states fn = KeyObj(u,v,0,0) and fb = KeyObj(u,v,1,1), plus
    fr = KeyObj(u,v,0,1) and ff = KeyObj(u,v,1,0) when the module-level flag
    `undirected` is false.

    Parameters:
        T           -- duplication tree; T.node[rv]['leaves'] lists the leaves.
        rv          -- root vertex of T.
        G           -- extant graph.
        constraints -- accepted for interface compatibility; not read here.

    Returns:
        (slnDict, rl): slnDict maps each state key to its base-case cost, and rl
        maps state keys to ('<state name>', None, None) tuples for the pairs
        that have a zero-cost state.

    NOTE(review): cc and dc are module-level constants, presumably the edge
    creation and deletion costs -- confirm.
    '''
    rl = {}
    slnDict = {}
    directed = not undirected
    # Snapshot the extant nodes once so every membership test below is a hash
    # lookup rather than a scan of G.nodes() repeated for each pair.
    extant = set( G.nodes() )
    for u,v in allPairs( T.node[rv]['leaves'] ):
        if differentExtantNetworks(T, u, v):
            continue
        fn = KeyObj(u,v,0,0)
        fb = KeyObj(u,v,1,1)
        if directed:
            fr = KeyObj(u,v,0,1)
            ff = KeyObj(u,v,1,0)

        if not( u in extant and v in extant ):
            # Costs to lost nodes are always 0
            lostCost = 0
            slnDict[fn] = lostCost; slnDict[fb] = lostCost
            if directed:
                slnDict[fr] = lostCost; slnDict[ff] = lostCost
        elif u != v :
            # u and v are different proteins that both exist in G:
            # assign the regular costs
            d_f = 1 if G.has_edge(u,v) else 0
            d_r = 1 if G.has_edge(v,u) else 0
            if undirected:
                slnDict[fn] = 0 if d_f + d_r == 0 else cc
                slnDict[fb] = 0 if d_f + d_r == 2 else dc
            else:
                slnDict[fn] = d_f * cc + d_r * cc
                slnDict[fb] = (1 - d_f) * dc + (1 - d_r) * dc
                slnDict[fr] = d_f * cc + (1 - d_r) * dc
                slnDict[ff] = (1 - d_f) * dc + d_r * cc
        else :
            hasSelfLoop = G.has_edge(u,v)
            # A self loop always costs cc
            slnDict[fn] = cc if hasSelfLoop else 0
            slnDict[fb] = 0 if hasSelfLoop else dc
            if directed:
                slnDict[fr] = 0 if hasSelfLoop else dc
                slnDict[ff] = 0 if hasSelfLoop else dc

        ## The base cases for the optimal solutions
        # The checks below are order-sensitive: when several states cost 0, a
        # later block overwrites the rl entries written by an earlier one.
        if slnDict[fn] == 0 :
            rl[fn] = ('fn', None, None)
            rl[fb] = ('fb', None, None)
            if directed:
                rl[ff] = ('ff', None, None)
                rl[fr] = ('fr', None, None)

        if directed and slnDict[ff] == 0 :
            rl[ff] = ('fn', None, None)
            rl[fr] = ('fb', None, None)
            rl[fn] = ('ff', None, None)
            rl[fb] = ('fr', None, None)

        if directed and slnDict[fr] == 0 :
            rl[fn] = ('fr', None, None)
            rl[ff] = ('fb', None, None)
            rl[fr] = ('fn', None, None)
            rl[fb] = ('ff', None, None)

        if slnDict[fb] == 0 :
            rl[fn] = ('fb', None, None)
            rl[fb] = ('fn', None, None)
            if directed:
                rl[ff] = ('fr', None, None)
                rl[fr] = ('ff', None, None)

    return slnDict, rl
Пример #11
0
def initf(T, rv, G, constraints):
    '''
    Given the duplication tree T, the root vertex rv, the extant graph G and
    the constraints, fill in the base case of the recurrence (i.e. the parsimony
    score of entering any pair of leaves in any of the potential states of f).

    Pairs are taken from allPairs(T.node[rv]['leaves']); pairs for which
    differentExtantNetworks(T, u, v) holds are skipped.  Each remaining pair
    gets the states fn = KeyObj(u, v, 0, 0) and fb = KeyObj(u, v, 1, 1), plus
    fr = KeyObj(u, v, 0, 1) and ff = KeyObj(u, v, 1, 0) when the module-level
    flag `undirected` is false.

    Parameters:
        T           -- duplication tree; T.node[rv]['leaves'] lists the leaves.
        rv          -- root vertex of T.
        G           -- extant graph.
        constraints -- accepted for interface compatibility; not read here.

    Returns:
        (slnDict, rl): slnDict maps each state key to its base-case cost, and
        rl maps state keys to ('<state name>', None, None) tuples for the
        pairs that have a zero-cost state.

    NOTE(review): cc and dc are module-level constants, presumably the edge
    creation and deletion costs -- confirm.
    '''
    rl = {}
    slnDict = {}
    directed = not undirected
    # Snapshot the extant nodes once so every membership test below is a hash
    # lookup rather than a scan of G.nodes() repeated for each pair.
    extant = set(G.nodes())
    for u, v in allPairs(T.node[rv]['leaves']):
        if differentExtantNetworks(T, u, v):
            continue
        fn = KeyObj(u, v, 0, 0)
        fb = KeyObj(u, v, 1, 1)
        if directed:
            fr = KeyObj(u, v, 0, 1)
            ff = KeyObj(u, v, 1, 0)

        if not (u in extant and v in extant):
            # Costs to lost nodes are always 0
            lostCost = 0
            slnDict[fn] = lostCost
            slnDict[fb] = lostCost
            if directed:
                slnDict[fr] = lostCost
                slnDict[ff] = lostCost
        elif u != v:
            # u and v are different proteins that both exist in G:
            # assign the regular costs
            d_f = 1 if G.has_edge(u, v) else 0
            d_r = 1 if G.has_edge(v, u) else 0
            if undirected:
                slnDict[fn] = 0 if d_f + d_r == 0 else cc
                slnDict[fb] = 0 if d_f + d_r == 2 else dc
            else:
                slnDict[fn] = d_f * cc + d_r * cc
                slnDict[fb] = (1 - d_f) * dc + (1 - d_r) * dc
                slnDict[fr] = d_f * cc + (1 - d_r) * dc
                slnDict[ff] = (1 - d_f) * dc + d_r * cc
        else:
            hasSelfLoop = G.has_edge(u, v)
            # A self loop always costs cc
            slnDict[fn] = cc if hasSelfLoop else 0
            slnDict[fb] = 0 if hasSelfLoop else dc
            if directed:
                slnDict[fr] = 0 if hasSelfLoop else dc
                slnDict[ff] = 0 if hasSelfLoop else dc

        ## The base cases for the optimal solutions
        # The checks below are order-sensitive: when several states cost 0, a
        # later block overwrites the rl entries written by an earlier one.
        if slnDict[fn] == 0:
            rl[fn] = ('fn', None, None)
            rl[fb] = ('fb', None, None)
            if directed:
                rl[ff] = ('ff', None, None)
                rl[fr] = ('fr', None, None)

        if directed and slnDict[ff] == 0:
            rl[ff] = ('fn', None, None)
            rl[fr] = ('fb', None, None)
            rl[fn] = ('ff', None, None)
            rl[fb] = ('fr', None, None)

        if directed and slnDict[fr] == 0:
            rl[fn] = ('fr', None, None)
            rl[ff] = ('fb', None, None)
            rl[fr] = ('fn', None, None)
            rl[fb] = ('ff', None, None)

        if slnDict[fb] == 0:
            rl[fn] = ('fb', None, None)
            rl[fb] = ('fn', None, None)
            if directed:
                rl[ff] = ('fr', None, None)
                rl[fr] = ('ff', None, None)

    return slnDict, rl