def printStats(recon, orig):
    '''
    Given a reconstructed ancestral network (recon) and a __ground truth__
    network (orig), compute the precision, recall and F1-score of the
    reconstruction and print them.

    Edges are compared as unordered node pairs.  The candidate pairs are the
    ones produced by allPairs(orig.nodes()): candidates that are edges of
    orig are the positives ("sick"), all other candidates are the negatives
    ("healthy").

    Returns the tuple (omspec, sens) where omspec is 1 - specificity and
    sens is the recall.  A ratio whose denominator is zero is reported as
    0.0 instead of raising ZeroDivisionError.
    '''
    def unorderedPairs(pairs):
        # Canonical form, so that (u, v) and (v, u) compare equal.
        return set(tuple(sorted([u, v])) for u, v in pairs)

    sick = unorderedPairs(orig.edges_iter())
    identifiedAsSick = unorderedPairs(recon.edges_iter())
    candidates = unorderedPairs(allPairs(orig.nodes()))
    healthy = candidates - sick
    identifiedAsHealthy = candidates - identifiedAsSick

    # Only the sizes of the four confusion-matrix cells are needed.
    tp = len(sick & identifiedAsSick)
    fp = len(healthy & identifiedAsSick)
    tn = len(healthy & identifiedAsHealthy)
    fn = len(sick & identifiedAsHealthy)

    prec = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
    # Recall must be guarded by its own denominator (tp + fn); guarding it
    # with (tp + fp) crashes when orig has no edges but recon has some.
    sens = rec = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0.0
    print("Precision = {0}, Recall = {1}, F1-Score = {2}".format(
        prec, rec, f1))

    # With no negative pairs at all there cannot be any false positive.
    omspec = 1 - (tn / float(tn + fp)) if (tn + fp) > 0 else 0.0
    return (omspec, sens)
def printStats(recon, orig):
    '''
    Given a reconstructed ancestral network (recon) and a __ground truth__
    network (orig), compute the precision, recall and F1-score of the
    reconstruction and print them.

    Edges are compared as unordered node pairs.  The candidate pairs are the
    ones produced by allPairs(orig.nodes()): candidates that are edges of
    orig are the positives ("sick"), all other candidates are the negatives
    ("healthy").

    Returns the tuple (omspec, sens) where omspec is 1 - specificity and
    sens is the recall.  A ratio whose denominator is zero is reported as
    0.0 instead of raising ZeroDivisionError.
    '''
    def unorderedPairs(pairs):
        # Canonical form, so that (u, v) and (v, u) compare equal.
        return set(tuple(sorted([u, v])) for u, v in pairs)

    sick = unorderedPairs(orig.edges_iter())
    identifiedAsSick = unorderedPairs(recon.edges_iter())
    candidates = unorderedPairs(allPairs(orig.nodes()))
    healthy = candidates - sick
    identifiedAsHealthy = candidates - identifiedAsSick

    # Only the sizes of the four confusion-matrix cells are needed.
    tp = len(sick & identifiedAsSick)
    fp = len(healthy & identifiedAsSick)
    tn = len(healthy & identifiedAsHealthy)
    fn = len(sick & identifiedAsHealthy)

    prec = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
    # Recall must be guarded by its own denominator (tp + fn); guarding it
    # with (tp + fp) crashes when orig has no edges but recon has some.
    sens = rec = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0.0
    print("Precision = {0}, Recall = {1}, F1-Score = {2}".format(
        prec, rec, f1))

    # With no negative pairs at all there cannot be any false positive.
    omspec = 1 - (tn / float(tn + fp)) if (tn + fp) > 0 else 0.0
    return (omspec, sens)
def getOptimalCutoff(pgraph, tgraph):
    '''
    Find the cutoff at which this predictor reaches its highest F1-Score.

    Every pair from allPairs(tgraph.nodes()) is scored with the 'weight' of
    the matching edge of pgraph (0.0 when pgraph has no such edge) and is
    labelled 1 when tgraph has the edge, 0 otherwise.

    Returns the pair (cutoff, f1) taken from the best CutoffPoint found;
    (0.0, 0.0) when no sweep point has a defined F1-Score.
    '''
    sd = croc.ScoredData()
    for u, v in allPairs(tgraph.nodes()):
        score = pgraph[u][v]['weight'] if pgraph.has_edge(u, v) else 0.0
        label = 1 if tgraph.has_edge(u, v) else 0
        sd.add(score, label)
    sweep = sd.sweep_threshold(tie_mode="smooth")

    # One (score, label) entry per scored pair, highest score first.
    # extend() keeps this linear; repeated `vals = vals + [...]` is quadratic.
    vals = []
    for score, labels in sd.score_labels.iteritems():
        vals.extend((score, label) for label in labels)
    vals.sort(key=lambda entry: entry[0], reverse=True)

    def F1Score(cutoff, tp, tn, fp, fn):
        # A zero denominator means precision, recall or F1 is undefined.
        try:
            precision = tp / float(tp + fp)
            recall = tp / float(tp + fn)
            f1 = 2.0 * ((precision * recall) / (precision + recall))
        except ZeroDivisionError:
            return None
        return CutoffPoint(cutoff, tp, tn, fp, fn, precision, recall, f1)

    nvals = len(vals)
    bestCutoff = CutoffPoint(cutoff=0.0, tp=0, tn=0, fp=0, fn=0,
                             precision=0.0, recall=0.0, f1=0.0)
    for i, (tp, tn, fp, fn) in enumerate(sweep):
        if i >= nvals:
            # No score corresponds to this sweep point, so there is no
            # cutoff value to report for it.
            continue
        currentCutoff = F1Score(vals[i][0], tp, tn, fp, fn)
        if currentCutoff is None:
            continue
        # Ties go to the later sweep point; the 0.0 placeholder is always
        # replaced by the first valid point.
        if currentCutoff.f1 >= bestCutoff.f1 or bestCutoff.cutoff == 0.0:
            bestCutoff = currentCutoff
    return bestCutoff.cutoff, bestCutoff.f1
def impliedNetworkOn(X, T, rv, nte):
    '''
    Given a set of nodes representing a cut through the tree (X), the
    duplication history (T), the root vertex (rv) and the set of non-tree
    edges (nte), compute the network implied on the nodes of X.

    For every pair (u, v) from allPairs(X.nodes()), the edges returned by
    impliedEdges(u, v, rv, T, nte) are added to the result, unless
    differentExtantNetworks(T, u, v) holds for the pair.

    Returns a new nx.Graph when the global flag `undirected` is true,
    otherwise a new nx.DiGraph.
    '''
    recon = nx.Graph() if undirected else nx.DiGraph()
    for u, v in allPairs(X.nodes()):
        edges = list(impliedEdges(u, v, rv, T, nte))
        # The filter depends only on the pair (u, v), never on the
        # individual edge, so evaluate it once per pair, not once per edge.
        if edges and not differentExtantNetworks(T, u, v):
            recon.add_edges_from(edges)
    return recon
def impliedNetworkOn(X, T, rv, nte):
    '''
    Given a set of nodes representing a cut through the tree (X), the
    duplication history (T), the root vertex (rv) and the set of non-tree
    edges (nte), compute the network implied on the nodes of X.

    For every pair (u, v) from allPairs(X.nodes()), the edges returned by
    impliedEdges(u, v, rv, T, nte) are added to the result, unless
    differentExtantNetworks(T, u, v) holds for the pair.

    Returns a new nx.Graph when the global flag `undirected` is true,
    otherwise a new nx.DiGraph.
    '''
    recon = nx.Graph() if undirected else nx.DiGraph()
    for u, v in allPairs(X.nodes()):
        edges = list(impliedEdges(u, v, rv, T, nte))
        # The filter depends only on the pair (u, v), never on the
        # individual edge, so evaluate it once per pair, not once per edge.
        if edges and not differentExtantNetworks(T, u, v):
            recon.add_edges_from(edges)
    return recon
def getCurve(pgraph, tgraph):
    '''
    Compute the BEDROC and AUROC scores of the given prediction.

    Every pair from allPairs(tgraph.nodes()) is scored with the 'weight' of
    the matching edge of pgraph (0.0 when pgraph has no such edge) and is
    labelled 1 when tgraph has the edge, 0 otherwise.

    Returns ROC_Results(BEDROC=..., AUROC=...).
    '''
    scored = croc.ScoredData()
    for u, v in allPairs(tgraph.nodes()):
        if pgraph.has_edge(u, v):
            weight = pgraph[u][v]['weight']
        else:
            weight = 0.0
        if tgraph.has_edge(u, v):
            label = 1
        else:
            label = 0
        scored.add(weight, label)

    thresholdSweep = scored.sweep_threshold(tie_mode="smooth")
    rocCurve = croc.ROC(thresholdSweep).transform(croc.Linear())
    # 20.0 is passed straight to croc.BEDROC as its second argument.
    bedroc = croc.BEDROC(scored, 20.0)['BEDROC']
    return ROC_Results(BEDROC=bedroc, AUROC=rocCurve.area())
def getPerformanceAtCutoff(pgraph, tgraph, cutoff):
    '''
    Returns a (CutoffPoint, ROC_Results) pair where the information in the
    cutoff point (e.g. the F1-Score) has been evaluated at the supplied
    'cutoff'.

    Every pair from allPairs(tgraph.nodes()) is scored with the 'weight' of
    the matching edge of pgraph (0.0 when pgraph has no such edge) and is
    labelled 1 when tgraph has the edge, 0 otherwise.  The ROC_Results part
    carries the BEDROC, AUROC and AUPR of that scoring.
    '''
    import scikits.learn
    import scipy.special
    import warnings
    import math

    order = tgraph.order()
    # binom(n, 2) + n: the unordered pairs plus one self-pair per node.
    # Presumably this matches the number of pairs allPairs yields.
    # Cast to int: math.ceil returns a float on Python 2, and a float is
    # not a valid array size.
    nedges = int(math.ceil(scipy.special.binom(order, 2))) + order
    ys = np.zeros(nedges)
    ps = np.zeros(nedges)
    for i, (u, v) in enumerate(allPairs(tgraph.nodes())):
        ys[i] = 1 if tgraph.has_edge(u, v) else 0
        ps[i] = pgraph[u][v]['weight'] if pgraph.has_edge(u, v) else 0.0

    sd = croc.ScoredData(zip(ps, ys))

    # Silence warnings for this call only; catch_warnings restores the
    # caller's filters on exit instead of leaving "always" set globally.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        Ps, Rs, _ = scikits.learn.metrics.precision_recall_curve(ys, ps)

    sweep = list(sd.sweep_threshold(tie_mode="smooth"))

    # One (score, label) entry per scored pair, highest score first.
    vals = []
    for score, labels in sd.score_labels.iteritems():
        vals.extend((score, label) for label in labels)
    vals.sort(key=lambda entry: entry[0], reverse=True)

    bestCutoff = CutoffPoint(cutoff=0.0, tp=0, tn=0, fp=0, fn=0,
                             precision=0.0, recall=0.0, f1=0.0)
    # Stop at the first sweep point whose next score is below the cutoff.
    # NOTE(review): vals[i + 1] may raise IndexError when no score is below
    # `cutoff` -- confirm the desired fallback.
    # NOTE(review): F1Score is not defined inside this function; it has to
    # exist at module level -- confirm.
    for i, (tp, tn, fp, fn) in enumerate(sweep):
        if vals[i + 1][0] < cutoff:
            bestCutoff = F1Score(vals[i][0], tp, tn, fp, fn)
            break

    C = croc.ROC(sweep).transform(croc.Linear())
    # NOTE(review): auc is called with precision first and recall second;
    # confirm this is the argument order the metrics API expects.
    AUPR = scikits.learn.metrics.auc(Ps, Rs)
    return (bestCutoff,
            ROC_Results(BEDROC=croc.BEDROC(sd, 20.0)['BEDROC'],
                        AUROC=C.area(),
                        AUPR=AUPR))
def constructingHistory(T, rv, G, nte, lostNodes):
    '''
    Test if the given set of non-tree edges (nte) reconstructs the extant
    network (G).  This function computes the reconstructed network (recon)
    as well as the set of edges (ds) that are different between (G) and
    (recon).  The history given by nte reconstructs G <==> (len(ds) == 0)

    For every pair (u, v) from allPairs(G.nodes()), the edges returned by
    impliedEdges(u, v, rv, T, nte) are added to recon when
    isEffectiveEdge(T, u, v, lostNodes) holds for the pair.  When the global
    flag `undirected` is true, G is compared through its undirected copy.

    Returns the tuple (recon, ds).
    '''
    recon = nx.Graph() if undirected else nx.DiGraph()
    if undirected:
        G = G.to_undirected()

    for u, v in allPairs(G.nodes()):
        edges = list(impliedEdges(u, v, rv, T, nte))
        # The filter depends only on the pair (u, v), never on the
        # individual edge, so evaluate it once per pair, not once per edge.
        if edges and isEffectiveEdge(T, u, v, lostNodes):
            recon.add_edges_from(edges)

    # Edges of G that the history fails to produce ...
    missing = set(e for e in G.edges() if not recon.has_edge(e[0], e[1]))
    # ... and edges the history produces that G does not contain.
    spurious = set(e for e in recon.edges() if not G.has_edge(e[0], e[1]))
    ds = missing | spurious
    return recon, ds
def constructingHistory(T, rv, G, nte, lostNodes):
    '''
    Test if the given set of non-tree edges (nte) reconstructs the extant
    network (G).  This function computes the reconstructed network (recon)
    as well as the set of edges (ds) that are different between (G) and
    (recon).  The history given by nte reconstructs G <==> (len(ds) == 0)

    For every pair (u, v) from allPairs(G.nodes()), the edges returned by
    impliedEdges(u, v, rv, T, nte) are added to recon when
    isEffectiveEdge(T, u, v, lostNodes) holds for the pair.  When the global
    flag `undirected` is true, G is compared through its undirected copy.

    Returns the tuple (recon, ds).
    '''
    recon = nx.Graph() if undirected else nx.DiGraph()
    if undirected:
        G = G.to_undirected()

    for u, v in allPairs(G.nodes()):
        edges = list(impliedEdges(u, v, rv, T, nte))
        # The filter depends only on the pair (u, v), never on the
        # individual edge, so evaluate it once per pair, not once per edge.
        if edges and isEffectiveEdge(T, u, v, lostNodes):
            recon.add_edges_from(edges)

    # Edges of G that the history fails to produce ...
    missing = set(e for e in G.edges() if not recon.has_edge(e[0], e[1]))
    # ... and edges the history produces that G does not contain.
    spurious = set(e for e in recon.edges() if not G.has_edge(e[0], e[1]))
    ds = missing | spurious
    return recon, ds
def initf(T, rv, G, constraints):
    '''
    Given the duplication tree T, the root vertex rv, the extant graph G and
    the constraints, fill in the base case of the recurrence (i.e. the
    parsimony score of entering any pair of leaves in any of the potential
    states of f).

    Pairs come from allPairs(T.node[rv]['leaves']); pairs for which
    differentExtantNetworks(T, u, v) holds are skipped.  The costs use the
    global values `cc` and `dc` (presumably the edge creation and deletion
    costs -- confirm) and the global flag `undirected`.

    Returns (slnDict, rl): slnDict maps each KeyObj(u, v, f, r) state to its
    base-case cost, rl maps the same keys to a (state-name, None, None)
    tuple.  The `constraints` argument is accepted but not used here.
    '''
    rl = {}
    slnDict = {}
    directed = not undirected

    for u, v in allPairs(T.node[rv]['leaves']):
        if differentExtantNetworks(T, u, v):
            continue

        fn = KeyObj(u, v, 0, 0)
        fb = KeyObj(u, v, 1, 1)
        if directed:
            fr = KeyObj(u, v, 0, 1)
            ff = KeyObj(u, v, 1, 0)

        # A node that is absent from G has been lost.  Test membership once
        # per pair, using the graph's own constant-time lookup.
        bothPresent = u in G and v in G

        if not bothPresent:
            # Costs to lost nodes are always 0
            lostCost = 0
            slnDict[fn] = lostCost
            slnDict[fb] = lostCost
            if directed:
                slnDict[fr] = lostCost
                slnDict[ff] = lostCost
        elif u != v:
            # u and v are different proteins: assign the regular costs
            d_f = 1 if G.has_edge(u, v) else 0
            d_r = 1 if G.has_edge(v, u) else 0
            if undirected:
                slnDict[fn] = 0 if d_f + d_r == 0 else cc
                slnDict[fb] = 0 if d_f + d_r == 2 else dc
            else:
                slnDict[fn] = d_f * cc + d_r * cc
                slnDict[fb] = (1 - d_f) * dc + (1 - d_r) * dc
                slnDict[fr] = d_f * cc + (1 - d_r) * dc
                slnDict[ff] = (1 - d_f) * dc + d_r * cc
        else:
            # u == v: the only possible edge is a self loop
            hasSelfLoop = G.has_edge(u, v)
            # A self loop always costs cc
            slnDict[fn] = cc if hasSelfLoop else 0
            slnDict[fb] = 0 if hasSelfLoop else dc
            if directed:
                slnDict[fr] = 0 if hasSelfLoop else dc
                slnDict[ff] = 0 if hasSelfLoop else dc

        ## The base cases for the optimal solutions
        # The four checks run in this fixed order; when several states cost
        # 0 the assignments of the last matching check win.
        if slnDict[fn] == 0:
            rl[fn] = ('fn', None, None)
            rl[fb] = ('fb', None, None)
            if directed:
                rl[ff] = ('ff', None, None)
                rl[fr] = ('fr', None, None)
        if directed and slnDict[ff] == 0:
            rl[ff] = ('fn', None, None)
            rl[fr] = ('fb', None, None)
            rl[fn] = ('ff', None, None)
            rl[fb] = ('fr', None, None)
        if directed and slnDict[fr] == 0:
            rl[fn] = ('fr', None, None)
            rl[ff] = ('fb', None, None)
            rl[fr] = ('fn', None, None)
            rl[fb] = ('ff', None, None)
        if slnDict[fb] == 0:
            rl[fn] = ('fb', None, None)
            rl[fb] = ('fn', None, None)
            if directed:
                rl[ff] = ('fr', None, None)
                rl[fr] = ('ff', None, None)

    return slnDict, rl
def initf(T, rv, G, constraints):
    '''
    Given the duplication tree T, the root vertex rv, the extant graph G and
    the constraints, fill in the base case of the recurrence (i.e. the
    parsimony score of entering any pair of leaves in any of the potential
    states of f).

    Pairs come from allPairs(T.node[rv]['leaves']); pairs for which
    differentExtantNetworks(T, u, v) holds are skipped.  The costs use the
    global values `cc` and `dc` (presumably the edge creation and deletion
    costs -- confirm) and the global flag `undirected`.

    Returns (slnDict, rl): slnDict maps each KeyObj(u, v, f, r) state to its
    base-case cost, rl maps the same keys to a (state-name, None, None)
    tuple.  The `constraints` argument is accepted but not used here.
    '''
    rl = {}
    slnDict = {}
    directed = not undirected

    for u, v in allPairs(T.node[rv]['leaves']):
        if differentExtantNetworks(T, u, v):
            continue

        fn = KeyObj(u, v, 0, 0)
        fb = KeyObj(u, v, 1, 1)
        if directed:
            fr = KeyObj(u, v, 0, 1)
            ff = KeyObj(u, v, 1, 0)

        # A node that is absent from G has been lost.  Test membership once
        # per pair, using the graph's own constant-time lookup.
        bothPresent = u in G and v in G

        if not bothPresent:
            # Costs to lost nodes are always 0
            lostCost = 0
            slnDict[fn] = lostCost
            slnDict[fb] = lostCost
            if directed:
                slnDict[fr] = lostCost
                slnDict[ff] = lostCost
        elif u != v:
            # u and v are different proteins: assign the regular costs
            d_f = 1 if G.has_edge(u, v) else 0
            d_r = 1 if G.has_edge(v, u) else 0
            if undirected:
                slnDict[fn] = 0 if d_f + d_r == 0 else cc
                slnDict[fb] = 0 if d_f + d_r == 2 else dc
            else:
                slnDict[fn] = d_f * cc + d_r * cc
                slnDict[fb] = (1 - d_f) * dc + (1 - d_r) * dc
                slnDict[fr] = d_f * cc + (1 - d_r) * dc
                slnDict[ff] = (1 - d_f) * dc + d_r * cc
        else:
            # u == v: the only possible edge is a self loop
            hasSelfLoop = G.has_edge(u, v)
            # A self loop always costs cc
            slnDict[fn] = cc if hasSelfLoop else 0
            slnDict[fb] = 0 if hasSelfLoop else dc
            if directed:
                slnDict[fr] = 0 if hasSelfLoop else dc
                slnDict[ff] = 0 if hasSelfLoop else dc

        ## The base cases for the optimal solutions
        # The four checks run in this fixed order; when several states cost
        # 0 the assignments of the last matching check win.
        if slnDict[fn] == 0:
            rl[fn] = ('fn', None, None)
            rl[fb] = ('fb', None, None)
            if directed:
                rl[ff] = ('ff', None, None)
                rl[fr] = ('fr', None, None)
        if directed and slnDict[ff] == 0:
            rl[ff] = ('fn', None, None)
            rl[fr] = ('fb', None, None)
            rl[fn] = ('ff', None, None)
            rl[fb] = ('fr', None, None)
        if directed and slnDict[fr] == 0:
            rl[fn] = ('fr', None, None)
            rl[ff] = ('fb', None, None)
            rl[fr] = ('fn', None, None)
            rl[fb] = ('ff', None, None)
        if slnDict[fb] == 0:
            rl[fn] = ('fb', None, None)
            rl[fb] = ('fn', None, None)
            if directed:
                rl[ff] = ('fr', None, None)
                rl[fr] = ('ff', None, None)

    return slnDict, rl