예제 #1
0
def printStats( recon, orig ) :
    '''
    Given a reconstructed ancestral network (recon) and a __ground truth__ network (orig),
    print the precision, recall and F1-score of the reconstruction.

    Edges are compared as unordered pairs (each edge is stored as a sorted
    tuple), and the universe of candidate pairs is whatever allPairs yields
    for the nodes of orig.

    Returns the tuple (1 - specificity, sensitivity).  Any ratio whose
    denominator is zero is reported as 0.0.
    '''
    def unorderedPairs( pairs ) :
        # Normalize every pair so that (u,v) and (v,u) compare equal.
        return set( tuple(sorted([u,v])) for u,v in pairs )

    def ratio( num, den ) :
        # Guarded division: an undefined ratio is reported as 0.0.
        return num / float(den) if den > 0 else 0.0

    sick = unorderedPairs( orig.edges_iter() )
    identifiedAsSick = unorderedPairs( recon.edges_iter() )
    # Build the universe of candidate pairs once and reuse it for both complements.
    universe = unorderedPairs( allPairs(orig.nodes()) )
    healthy = universe - sick
    identifiedAsHealthy = universe - identifiedAsSick

    tp = len( sick & identifiedAsSick )
    fp = len( healthy & identifiedAsSick )
    tn = len( healthy & identifiedAsHealthy )
    fn = len( sick & identifiedAsHealthy )

    prec = ratio( tp, tp + fp )
    # Recall is guarded on its own denominator (tp + fn); guarding on (tp + fp)
    # raised ZeroDivisionError when orig had no edges but recon did.
    sens = rec = ratio( tp, tp + fn )
    print("Precision = {0}, Recall = {1}, F1-Score = {2}".format( prec, rec, 2*(prec*rec)/(prec+rec) if (prec+rec) > 0 else 0.0 ) )
    # With no negative pairs at all there can be no false positives, so the
    # false positive rate (1 - specificity) is reported as 0.0.
    omspec = 1 - ( tn / float(tn+fp) ) if (tn + fp) > 0 else 0.0
    return (omspec, sens)
예제 #2
0
def printStats(recon, orig):
    '''
    Given a reconstructed ancestral network (recon) and a __ground truth__ network (orig),
    print the precision, recall and F1-score of the reconstruction.

    Edges are compared as unordered pairs (each edge is stored as a sorted
    tuple), and the universe of candidate pairs is whatever allPairs yields
    for the nodes of orig.

    Returns the tuple (1 - specificity, sensitivity).  Any ratio whose
    denominator is zero is reported as 0.0.
    '''
    def unorderedPairs(pairs):
        # Normalize every pair so that (u, v) and (v, u) compare equal.
        return set(tuple(sorted([u, v])) for u, v in pairs)

    def ratio(num, den):
        # Guarded division: an undefined ratio is reported as 0.0.
        return num / float(den) if den > 0 else 0.0

    sick = unorderedPairs(orig.edges_iter())
    identifiedAsSick = unorderedPairs(recon.edges_iter())
    # Build the universe of candidate pairs once and reuse it for both
    # complements.
    universe = unorderedPairs(allPairs(orig.nodes()))
    healthy = universe - sick
    identifiedAsHealthy = universe - identifiedAsSick

    tp = len(sick & identifiedAsSick)
    fp = len(healthy & identifiedAsSick)
    tn = len(healthy & identifiedAsHealthy)
    fn = len(sick & identifiedAsHealthy)

    prec = ratio(tp, tp + fp)
    # Recall is guarded on its own denominator (tp + fn); guarding on
    # (tp + fp) raised ZeroDivisionError when orig had no edges but recon did.
    sens = rec = ratio(tp, tp + fn)
    print("Precision = {0}, Recall = {1}, F1-Score = {2}".format(
        prec, rec, 2 * (prec * rec) / (prec + rec) if
        (prec + rec) > 0 else 0.0))
    # With no negative pairs at all there can be no false positives, so the
    # false positive rate (1 - specificity) is reported as 0.0.
    omspec = 1 - (tn / float(tn + fp)) if (tn + fp) > 0 else 0.0
    return (omspec, sens)
예제 #3
0
def getOptimalCutoff( pgraph, tgraph ):
    '''
    Find the score cutoff of the predictor pgraph that maximizes the F1-Score
    against the true graph tgraph.

    Every pair yielded by allPairs over the nodes of tgraph is scored with the
    edge 'weight' from pgraph (0.0 when pgraph has no such edge) and labelled
    1 if tgraph has the edge, 0 otherwise.

    Returns the tuple (cutoff, f1) of the best CutoffPoint found.
    '''
    sd = croc.ScoredData()
    for u, v in allPairs( tgraph.nodes() ):
        if pgraph.has_edge(u, v):
            score = pgraph[u][v]['weight']
        else:
            score = 0.0
        label = 1 if tgraph.has_edge(u, v) else 0
        sd.add( score, label )

    sweep = sd.sweep_threshold(tie_mode="smooth")

    # Flatten {score: [labels]} into (score, label) pairs, highest score first.
    vals = [ (score, label)
             for score, labels in sd.score_labels.iteritems()
             for label in labels ]
    vals.sort(key=lambda pair: pair[0], reverse=True)

    def evaluate(cutoff, tp, tn, fp, fn):
        # Returns None when precision, recall or F1 is undefined (zero denominator).
        try:
            precision = tp / float(tp + fp)
            recall = tp / float(tp + fn)
            f1 = 2.0 * ((precision * recall) / (precision + recall))
        except ZeroDivisionError:
            return None
        return CutoffPoint(cutoff, tp, tn, fp, fn, precision, recall, f1)

    numVals = len(vals)
    best = CutoffPoint(cutoff=0.0, tp=0, tn=0, fp=0, fn=0, precision=0.0, recall=0.0, f1=0.0)
    for i, (tp, tn, fp, fn) in enumerate(sweep):
        # Sweep points beyond the last scored pair have no associated cutoff.
        if i >= numVals:
            continue
        candidate = evaluate(vals[i][0], tp, tn, fp, fn)
        if candidate is None:
            continue
        if candidate.f1 >= best.f1 or best.cutoff == 0.0:
            best = candidate

    return best.cutoff, best.f1
예제 #4
0
파일: parana.py 프로젝트: emresefer/Parana
def impliedNetworkOn( X, T, rv, nte ) :
    '''
    Build the network implied on the nodes of X, where X is a set of nodes
    representing a cut through the tree, T is the duplication history, rv is
    the root vertex and nte is the set of non-tree edges.

    The result is an undirected graph when the module-level flag `undirected`
    is set and a directed graph otherwise.
    '''
    graphType = nx.Graph if undirected else nx.DiGraph
    recon = graphType()
    for u, v in allPairs( X.nodes() ):
        kept = []
        for edge in impliedEdges(u, v, rv, T, nte):
            # Drop edges implied between nodes of different extant networks.
            if differentExtantNetworks(T, u, v):
                continue
            kept.append(edge)
        recon.add_edges_from( kept )
    return recon
예제 #5
0
def impliedNetworkOn(X, T, rv, nte):
    '''
    Build the network implied on the nodes of X, where X is a set of nodes
    representing a cut through the tree, T is the duplication history, rv is
    the root vertex and nte is the set of non-tree edges.

    The result is an undirected graph when the module-level flag `undirected`
    is set and a directed graph otherwise.
    '''
    graphType = nx.Graph if undirected else nx.DiGraph
    recon = graphType()
    for u, v in allPairs(X.nodes()):
        kept = []
        for edge in impliedEdges(u, v, rv, T, nte):
            # Drop edges implied between nodes of different extant networks.
            if differentExtantNetworks(T, u, v):
                continue
            kept.append(edge)
        recon.add_edges_from(kept)
    return recon
예제 #6
0
def getCurve( pgraph, tgraph ):
    '''
    Compute the BEDROC and AUROC scores of the prediction pgraph against the
    true graph tgraph, returned as a ROC_Results.

    Every pair yielded by allPairs over the nodes of tgraph is scored with the
    edge 'weight' from pgraph (0.0 when pgraph has no such edge) and labelled
    1 if tgraph has the edge, 0 otherwise.
    '''
    sd = croc.ScoredData()
    for u, v in allPairs( tgraph.nodes() ):
        if pgraph.has_edge(u, v):
            score = pgraph[u][v]['weight']
        else:
            score = 0.0
        label = 1 if tgraph.has_edge(u, v) else 0
        sd.add( score, label )

    sweep = sd.sweep_threshold(tie_mode="smooth")
    # ROC curve of the sweep under croc's linear transform.
    curve = croc.ROC(sweep).transform(croc.Linear())

    bedroc = croc.BEDROC(sd, 20.0)['BEDROC']
    return ROC_Results( BEDROC=bedroc, AUROC=curve.area() )
예제 #7
0
def getPerformanceAtCutoff( pgraph, tgraph, cutoff ):
    '''
    Returns a (CutoffPoint, ROC_Results) pair where the information in the cutoff
    point (e.g. the F1-Score) has been evaluated at the supplied 'cutoff'.

    Every pair yielded by allPairs over the nodes of tgraph is scored with the
    edge 'weight' from pgraph (0.0 when pgraph has no such edge) and labelled
    1 if tgraph has the edge, 0 otherwise.

    If no score falls below 'cutoff', the returned CutoffPoint is the
    all-zero placeholder.
    '''
    import scikits.learn
    import scipy.special
    import warnings
    import math

    # Arrays are sized for C(n, 2) + n pairs -- presumably the number of pairs
    # allPairs yields (all unordered pairs plus self pairs); verify against
    # allPairs.  math.ceil may return a float, and numpy requires an integer
    # shape, hence the explicit int().
    nedges = int(math.ceil(scipy.special.binom(tgraph.order(),2))) + tgraph.order()
    ys = np.zeros(nedges)
    ps = np.zeros(nedges)
    for i,(u,v) in enumerate(allPairs( tgraph.nodes() )):
        y = 1 if tgraph.has_edge(u,v) else 0
        p = pgraph[u][v]['weight'] if pgraph.has_edge(u,v) else 0.0
        ys[i], ps[i] = y, p

    sd = croc.ScoredData( zip(ps,ys) )

    # Silence warnings only for this call and restore the caller's warning
    # filters afterwards, instead of forcing the global filter to "always".
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        Ps, Rs, _ = scikits.learn.metrics.precision_recall_curve(ys, ps)

    sweep = list(sd.sweep_threshold(tie_mode="smooth"))

    # Flatten {score: [labels]} into (score, label) pairs, highest score first.
    vals = []
    for k,v in sd.score_labels.iteritems():
        vals.extend( (k,x) for x in v )
    vals = sorted(vals, key = lambda x: x[0], reverse=True)

    numVals = len(vals)
    bestCutoff = CutoffPoint(cutoff=0.0, tp=0, tn=0, fp=0, fn=0, precision=0.0, recall=0.0, f1=0.0)
    for i,(tp,tn,fp,fn) in enumerate(sweep):
        # Stop before indexing past the last scored pair; this happens when
        # no score is below the cutoff.
        if i + 1 >= numVals:
            break
        if vals[i+1][0] < cutoff:
            c = vals[i][0]
            # NOTE(review): F1Score is not defined in this function; it is
            # expected to be available at module scope -- confirm.
            bestCutoff = F1Score(c, tp, tn, fp, fn)
            break

    F = lambda sweep: croc.ROC(sweep).transform(croc.Linear())
    C = F(sweep)
    # NOTE(review): auc is called with (precision, recall) in that order --
    # confirm this is the intended axis order for the area under the PR curve.
    AUPR = scikits.learn.metrics.auc(Ps, Rs)
    return (bestCutoff, ROC_Results( BEDROC=croc.BEDROC(sd, 20.0)['BEDROC'], 
                                     AUROC=C.area(), AUPR=AUPR ))
예제 #8
0
파일: parana.py 프로젝트: emresefer/Parana
def constructingHistory(T, rv, G, nte, lostNodes):
    '''
    Check whether the non-tree edges (nte) reconstruct the extant network (G).

    Builds the reconstructed network (recon) from the edges implied by nte for
    every pair of nodes of G, keeping only edges for which isEffectiveEdge
    holds, and collects the set of edges (ds) present in exactly one of G and
    recon.  The history given by nte reconstructs G <==> (len(ds) == 0).

    Returns the tuple (recon, ds).
    '''
    recon = nx.Graph() if undirected else nx.DiGraph()
    if undirected:
        G = G.to_undirected()

    for u, v in allPairs(G.nodes()):
        effective = []
        for edge in impliedEdges(u, v, rv, T, nte):
            if isEffectiveEdge(T, u, v, lostNodes):
                effective.append(edge)
        recon.add_edges_from(effective)

    # Edges of G that the reconstruction misses ...
    missing = set(e for e in G.edges() if not recon.has_edge(e[0], e[1]))
    # ... and edges of the reconstruction that G does not have.
    spurious = set(e for e in recon.edges() if not G.has_edge(e[0], e[1]))
    ds = missing | spurious
    return recon, ds
예제 #9
0
def constructingHistory(T, rv, G, nte, lostNodes):
    '''
    Check whether the non-tree edges (nte) reconstruct the extant network (G).

    Builds the reconstructed network (recon) from the edges implied by nte for
    every pair of nodes of G, keeping only edges for which isEffectiveEdge
    holds, and collects the set of edges (ds) present in exactly one of G and
    recon.  The history given by nte reconstructs G <==> (len(ds) == 0).

    Returns the tuple (recon, ds).
    '''
    recon = nx.Graph() if undirected else nx.DiGraph()
    if undirected:
        G = G.to_undirected()

    for u, v in allPairs(G.nodes()):
        effective = []
        for edge in impliedEdges(u, v, rv, T, nte):
            if isEffectiveEdge(T, u, v, lostNodes):
                effective.append(edge)
        recon.add_edges_from(effective)

    # Edges of G that the reconstruction misses ...
    missing = set(e for e in G.edges() if not recon.has_edge(e[0], e[1]))
    # ... and edges of the reconstruction that G does not have.
    spurious = set(e for e in recon.edges() if not G.has_edge(e[0], e[1]))
    ds = missing | spurious
    return recon, ds
예제 #10
0
파일: parana.py 프로젝트: emresefer/Parana
def initf(T, rv, G, constraints):
    '''
    Given the duplication tree T, the root vertex rv, the extant graph G and
    the constraints, fill in the base case of the recurrence (i.e. the parsimony
    score of entering any pair of leaves in any of the potential states of f).

    For every pair (u,v) of leaves under rv that is not skipped by
    differentExtantNetworks, a cost is stored per state key:
      fn = KeyObj(u,v,0,0), fb = KeyObj(u,v,1,1) and, in directed mode only,
      fr = KeyObj(u,v,0,1), ff = KeyObj(u,v,1,0).
    Costs are built from the module-level values cc and dc (presumably the
    edge creation and deletion costs -- confirm against their definition).

    Returns (slnDict, rl): slnDict maps state keys to costs, and rl maps state
    keys to (label, None, None) tuples for pairs that have a zero-cost state.
    The 'constraints' argument is not used here.
    '''
    rl = {}
    slnDict = {}
    directed = not undirected
    for u,v in allPairs( T.node[rv]['leaves'] ):
        if differentExtantNetworks(T, u, v):
            continue
        fn = KeyObj(u,v,0,0)
        fb = KeyObj(u,v,1,1)
        if directed:
            fr = KeyObj(u,v,0,1)
            ff = KeyObj(u,v,1,0)

        # One O(1) membership test per node instead of repeatedly scanning
        # the list returned by G.nodes().
        bothExtant = (u in G) and (v in G)

        if not bothExtant:
            # Costs to lost nodes are always 0
            lostCost = 0
            slnDict[fn] = lostCost
            slnDict[fb] = lostCost
            if directed:
                slnDict[fr] = lostCost
                slnDict[ff] = lostCost
        elif u != v:
            # u and v are different proteins: assign the regular costs
            d_f = 1 if G.has_edge(u,v) else 0
            d_r = 1 if G.has_edge(v,u) else 0
            if undirected:
                slnDict[fn] = 0 if d_f + d_r == 0 else cc
                slnDict[fb] = 0 if d_f + d_r == 2 else dc
            else:
                slnDict[fn] = d_f * cc + d_r * cc
                slnDict[fb] = (1 - d_f) * dc + (1 - d_r) * dc
                slnDict[fr] = d_f * cc + (1 - d_r) * dc
                slnDict[ff] = (1 - d_f) * dc + d_r * cc
        else:
            # u == v: the only possible edge is a self loop
            hasSelfLoop = G.has_edge(u,v)
            # A self loop always costs cc
            slnDict[fn] = cc if hasSelfLoop else 0
            slnDict[fb] = 0 if hasSelfLoop else dc
            if directed:
                slnDict[fr] = 0 if hasSelfLoop else dc
                slnDict[ff] = 0 if hasSelfLoop else dc

        ## The base cases for the optimal solutions.
        ## The checks below are order-sensitive: each one that fires overwrites
        ## the rl entries written by the earlier ones, so the last zero-cost
        ## state checked wins.
        if slnDict[fn] == 0 :
            rl[fn] = ('fn', None, None)
            rl[fb] = ('fb', None, None)
            if directed:
                rl[ff] = ('ff', None, None)
                rl[fr] = ('fr', None, None)

        if directed and slnDict[ff] == 0 :
            rl[ff] = ('fn', None, None)
            rl[fr] = ('fb', None, None)
            rl[fn] = ('ff', None, None)
            rl[fb] = ('fr', None, None)

        if directed and slnDict[fr] == 0 :
            rl[fn] = ('fr', None, None)
            rl[ff] = ('fb', None, None)
            rl[fr] = ('fn', None, None)
            rl[fb] = ('ff', None, None)

        if slnDict[fb] == 0 :
            rl[fn] = ('fb', None, None)
            rl[fb] = ('fn', None, None)
            if directed:
                rl[ff] = ('fr', None, None)
                rl[fr] = ('ff', None, None)

    return slnDict, rl
예제 #11
0
def initf(T, rv, G, constraints):
    '''
    Given the duplication tree T, the root vertex rv, the extant graph G and
    the constraints, fill in the base case of the recurrence (i.e. the parsimony
    score of entering any pair of leaves in any of the potential states of f).

    For every pair (u, v) of leaves under rv that is not skipped by
    differentExtantNetworks, a cost is stored per state key:
      fn = KeyObj(u, v, 0, 0), fb = KeyObj(u, v, 1, 1) and, in directed mode
      only, fr = KeyObj(u, v, 0, 1), ff = KeyObj(u, v, 1, 0).
    Costs are built from the module-level values cc and dc (presumably the
    edge creation and deletion costs -- confirm against their definition).

    Returns (slnDict, rl): slnDict maps state keys to costs, and rl maps state
    keys to (label, None, None) tuples for pairs that have a zero-cost state.
    The 'constraints' argument is not used here.
    '''
    rl = {}
    slnDict = {}
    directed = not undirected
    for u, v in allPairs(T.node[rv]['leaves']):
        if differentExtantNetworks(T, u, v):
            continue
        fn = KeyObj(u, v, 0, 0)
        fb = KeyObj(u, v, 1, 1)
        if directed:
            fr = KeyObj(u, v, 0, 1)
            ff = KeyObj(u, v, 1, 0)

        # One O(1) membership test per node instead of repeatedly scanning
        # the list returned by G.nodes().
        bothExtant = (u in G) and (v in G)

        if not bothExtant:
            # Costs to lost nodes are always 0
            lostCost = 0
            slnDict[fn] = lostCost
            slnDict[fb] = lostCost
            if directed:
                slnDict[fr] = lostCost
                slnDict[ff] = lostCost
        elif u != v:
            # u and v are different proteins: assign the regular costs
            d_f = 1 if G.has_edge(u, v) else 0
            d_r = 1 if G.has_edge(v, u) else 0
            if undirected:
                slnDict[fn] = 0 if d_f + d_r == 0 else cc
                slnDict[fb] = 0 if d_f + d_r == 2 else dc
            else:
                slnDict[fn] = d_f * cc + d_r * cc
                slnDict[fb] = (1 - d_f) * dc + (1 - d_r) * dc
                slnDict[fr] = d_f * cc + (1 - d_r) * dc
                slnDict[ff] = (1 - d_f) * dc + d_r * cc
        else:
            # u == v: the only possible edge is a self loop
            hasSelfLoop = G.has_edge(u, v)
            # A self loop always costs cc
            slnDict[fn] = cc if hasSelfLoop else 0
            slnDict[fb] = 0 if hasSelfLoop else dc
            if directed:
                slnDict[fr] = 0 if hasSelfLoop else dc
                slnDict[ff] = 0 if hasSelfLoop else dc

        ## The base cases for the optimal solutions.
        ## The checks below are order-sensitive: each one that fires overwrites
        ## the rl entries written by the earlier ones, so the last zero-cost
        ## state checked wins.
        if slnDict[fn] == 0:
            rl[fn] = ('fn', None, None)
            rl[fb] = ('fb', None, None)
            if directed:
                rl[ff] = ('ff', None, None)
                rl[fr] = ('fr', None, None)

        if directed and slnDict[ff] == 0:
            rl[ff] = ('fn', None, None)
            rl[fr] = ('fb', None, None)
            rl[fn] = ('ff', None, None)
            rl[fb] = ('fr', None, None)

        if directed and slnDict[fr] == 0:
            rl[fn] = ('fr', None, None)
            rl[ff] = ('fb', None, None)
            rl[fr] = ('fn', None, None)
            rl[fb] = ('ff', None, None)

        if slnDict[fb] == 0:
            rl[fn] = ('fb', None, None)
            rl[fb] = ('fn', None, None)
            if directed:
                rl[ff] = ('fr', None, None)
                rl[fr] = ('ff', None, None)

    return slnDict, rl