def compute_relklinker(G, relsim, subs, preds, objs):
    """
    Compute relational-similarity weighted closure (proximity) scores for the given triples.

    Parameters:
    -----------
    G: rgraph
        See `datastructures`.
    relsim: ndarray
        A square matrix containing relational similarity scores.
    subs, preds, objs: sequence
        Sequences representing the subject, predicate and object of input triples.

    Returns:
    --------
    scores, paths, rpaths, times: sequence
        One sequence each for the proximity scores, shortest path in terms of nodes,
        shortest path in terms of relation sequence, and times taken.
    """
    # set weights
    indegsim = weighted_degree(G.indeg_vec, weight=WTFN).reshape((1, G.N))
    indegsim = indegsim.ravel()
    targets = G.csr.indices % G.N
    specificity_wt = indegsim[targets]  # specificity
    G.csr.data = specificity_wt.copy()

    # relation vector: integer relation id of each edge in the stacked CSR
    relations = (G.csr.indices - targets) // G.N

    # back up
    data = G.csr.data.copy()
    indices = G.csr.indices.copy()
    indptr = G.csr.indptr.copy()

    scores, paths, rpaths, times = [], [], [], []
    for idx, (s, p, o) in enumerate(zip(subs, preds, objs)):
        print('{}. Working on {}..'.format(idx + 1, (s, p, o)), end=' ')
        ts = time()

        # set relational weight
        G.csr.data[targets == o] = 1  # no cost for target o => max. specificity
        relsimvec = relsim[p, :]  # similarity of each relation w.r.t. predicate p
        relsim_wt = relsimvec[relations]

        # graph weight
        G.csr.data = np.multiply(relsim_wt, G.csr.data)

        rp = relclosure(G, s, p, o, kind='metric', linkpred=True)
        tend = time()
        print('time: {:.2f}s'.format(tend - ts))
        times.append(tend - ts)
        scores.append(rp.score)
        paths.append(rp.path)
        rpaths.append(rp.relational_path)

        # reset graph
        G.csr.data = data.copy()
        G.csr.indices = indices.copy()
        G.csr.indptr = indptr.copy()
        sys.stdout.flush()
    log.info('')
    return scores, paths, rpaths, times
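# A minimal sketch of the relation/target decoding used above. It assumes the
# rgraph CSR stacks the R relational adjacency matrices side by side, so an edge
# to node t under relation r sits at flat column r * N + t; the numbers in the
# function body are illustrative only and not taken from the source.
def _relation_decoding_sketch():
    N = 6                                 # hypothetical number of nodes
    cols = np.array([7, 14, 23])          # hypothetical G.csr.indices entries
    targets = cols % N                    # target node ids   -> array([1, 2, 5])
    relations = (cols - targets) // N     # relation ids      -> array([1, 2, 3])
    return targets, relations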
def compute_klinker(self, G, sid, pid, oid):
    """
    Compute closure (proximity) scores for the given triples.

    Parameters:
    -----------
    G: rgraph
        See `datastructures`.
    sid, pid, oid: sequence
        Sequences representing the subject, predicate and object of input triples.

    Returns:
    --------
    scores, paths, rpaths, times: sequence
        One sequence each for the proximity scores, shortest path in terms of nodes,
        shortest path in terms of relation sequence, and times taken.
    """
    # set weights
    indegsim = weighted_degree(G.indeg_vec, weight=self.WTFN).reshape((1, G.N))
    indegsim = indegsim.ravel()
    targets = G.csr.indices % G.N
    specificity_wt = indegsim[targets]  # specificity
    G.csr.data = specificity_wt.copy()

    # back up
    data = G.csr.data.copy()
    indices = G.csr.indices.copy()
    indptr = G.csr.indptr.copy()

    # compute closure
    scores, paths, rpaths, times = [], [], [], []
    for idx, (s, p, o) in enumerate(zip(sid, pid, oid)):
        print('{}. Working on {}..'.format(idx + 1, (s, p, o)), end=' ')
        ts = time()
        rp = closure(G, s, p, o, kind='metric', linkpred=True)
        tend = time()
        print('time: {:.2f}s'.format(tend - ts))
        times.append(tend - ts)
        scores.append(rp.score)
        paths.append(rp.path)
        rpaths.append(rp.relational_path)

        # reset graph
        G.csr.data = data.copy()
        G.csr.indices = indices.copy()
        G.csr.indptr = indptr.copy()
        sys.stdout.flush()
    log.info('')
    return scores, paths, rpaths, times
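# Rough usage sketch for the two closure scorers above; not part of the pipeline.
# `G` and `relsim` are assumed to be an already-built rgraph and relational
# similarity matrix, and the triple ids are placeholders. compute_klinker is the
# non-relational variant (no `relsim`) and, being a method, is called on its
# owning object.
def _closure_usage_sketch(G, relsim):
    subs, preds, objs = [3], [1], [5]     # one hypothetical (s, p, o) triple
    scores, paths, rpaths, times = compute_relklinker(G, relsim, subs, preds, objs)
    print(scores[0], paths[0], rpaths[0])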
def compute_mincostflow(G, relsim, subs, preds, objs, flowfile):
    """
    Compute minimum-cost flow scores for the given triples.

    Parameters:
    -----------
    G: rgraph
        See `datastructures`.
    relsim: ndarray
        A square matrix containing relational similarity scores.
    subs, preds, objs: sequence
        Sequences representing the subject, predicate and object of input triples.
    flowfile: str
        Absolute path of the file where flow will be stored as JSON, one line per triple.

    Returns:
    --------
    mincostflows: sequence
        A sequence containing total flow for each triple.
    times: sequence
        Times taken to compute stream of each triple.
    """
    # take graph backup
    G_bak = {
        'data': G.csr.data.copy(),
        'indices': G.csr.indices.copy(),
        'indptr': G.csr.indptr.copy()
    }
    cost_vec_bak = np.log(G.indeg_vec).copy()

    # some set up
    G.sources = np.repeat(np.arange(G.N), np.diff(G.csr.indptr))
    G.targets = G.csr.indices % G.N
    cost_vec = cost_vec_bak.copy()
    indegsim = weighted_degree(G.indeg_vec, weight=WTFN)
    specificity_wt = indegsim[G.targets]  # specificity
    relations = (G.csr.indices - G.targets) // G.N  # integer relation ids

    mincostflows, times = [], []
    with open(flowfile, 'w', buffering=1) as ff:  # line-buffered: one JSON line per triple
        for idx, (s, p, o) in enumerate(zip(subs, preds, objs)):
            s, p, o = [int(x) for x in (s, p, o)]
            ts = time()
            print('{}. Working on {} .. '.format(idx + 1, (s, p, o)), end='')
            sys.stdout.flush()

            # set weights
            relsimvec = np.array(relsim[p, :])  # specific to predicate p
            relsim_wt = relsimvec[relations]
            G.csr.data = np.multiply(relsim_wt, specificity_wt)

            # compute
            mcflow = succ_shortest_path(G, cost_vec, s, p, o,
                                        return_flow=False, npaths=5)
            mincostflows.append(mcflow.flow)
            ff.write(json.dumps(mcflow.stream) + '\n')
            tend = time()
            times.append(tend - ts)
            print('mincostflow: {:.5f}, #paths: {}, time: {:.2f}s.'.format(
                mcflow.flow, len(mcflow.stream['paths']), tend - ts))

            # reset state of the graph
            np.copyto(G.csr.data, G_bak['data'])
            np.copyto(G.csr.indices, G_bak['indices'])
            np.copyto(G.csr.indptr, G_bak['indptr'])
            np.copyto(cost_vec, cost_vec_bak)
    return mincostflows, times
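# Hypothetical driver for the min-cost flow scorer above; not part of the
# pipeline. `G` and `relsim` are assumed to be already built, and the flow file
# path is a placeholder. Each line of the flow file holds the JSON-encoded
# `stream` of one triple which, per the logging above, includes a 'paths' entry.
def _mincostflow_usage_sketch(G, relsim):
    flowfile = '/tmp/flows.json'          # placeholder output path
    mincostflows, times = compute_mincostflow(G, relsim, [3], [1], [5], flowfile)
    with open(flowfile) as ff:
        stream = json.loads(ff.readline())
    print(mincostflows[0], len(stream['paths']))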
def train_model_sm(G, triples, relsim, use_interpretable_features=False, cv=10):
    """
    Entry point for building a fact-checking classifier.
    Performs three steps:
    1. Path extraction (features)
    2a. Path selection using information gain
    2b. Filtering most informative discriminative predicate paths
    3. Building logistic regression model

    Parameters:
    -----------
    G: rgraph
        Knowledge graph.
    triples: dataframe
        A data frame consisting of at least four columns, including sid, pid, oid, class.
    use_interpretable_features: bool
        Whether or not to perform 2b.
    cv: int
        Number of cross-validation folds.

    Returns:
    --------
    vec: DictVectorizer
        Useful for preprocessing future triples.
    model: dict
        A dictionary containing 'clf' as the built model, and two other
        key-value pairs, including best parameter and best AUROC score.
    """
    y = triples['class']  # ground truth
    triples = triples[['sid', 'pid', 'oid']].to_dict(orient='records')
    pid = triples[0]['pid']
    log.info('PID is: {}, with type: {}'.format(pid, type(pid)))
    if np.DataSource().exists(join(HOME, 'sm', 'G_fil_val_{}.npz'.format(int(pid)))) \
            and np.DataSource().exists(join(HOME, 'sm', 'G_fil_rel_{}.npz'.format(int(pid)))):
        Gr = load_npz(join(HOME, 'sm', 'G_fil_rel_{}.npz'.format(int(pid))))
        Gv = load_npz(join(HOME, 'sm', 'G_fil_val_{}.npz'.format(int(pid))))
    else:
        # set weights
        indegsim = weighted_degree(G.indeg_vec, weight=WTFN).reshape((1, G.N))
        indegsim = indegsim.ravel()
        targets = G.csr.indices % G.N
        relations = (G.csr.indices - targets) // G.N  # integer relation id per edge
        relsimvec = np.array(relsim[int(pid), :])  # specific to predicate p
        relsim_wt = relsimvec[relations]  # one relational-similarity weight per edge
        specificity_wt = indegsim[targets]  # specificity

        # Remove all edges labelled with predicate p between any pair of nodes.
        log.info('=> Removing predicate {} from KG.\n\n'.format(pid))
        eraseedges_mask = ((G.csr.indices - (G.csr.indices % G.N)) // G.N) == pid
        specificity_wt[eraseedges_mask] = 0
        relsim_wt[eraseedges_mask] = 0
        G.csr.data = specificity_wt.copy()
        G.csr.data = np.multiply(relsim_wt, G.csr.data)

        log.info('Constructing adjacency matrix for: {}'.format(pid))
        adj_list_data = []
        adj_list_s = []
        adj_list_p = []
        adj_list_o = []
        num_nodes = len(G.csr.indptr) - 1
        for node in tqdm(range(num_nodes)):
            dicti = {}
            start = G.csr.indptr[node]
            end = G.csr.indptr[node + 1]
            sel_data = G.csr.data[start:end]
            sel_relations = relations[start:end]
            # keep, for each (source, target) pair, the highest-weight edge
            for i, sel_tar in enumerate(targets[start:end]):
                if sel_tar in dicti:
                    if dicti[sel_tar][0] < sel_data[i]:
                        dicti[sel_tar] = (sel_data[i], sel_relations[i])
                else:
                    dicti[sel_tar] = (sel_data[i], sel_relations[i])
            for key, value in dicti.items():
                if value[0] != 0:
                    adj_list_data.append(value[0])
                    adj_list_s.append(node)
                    adj_list_p.append(value[1])
                    adj_list_o.append(key)
        Gr = csr_matrix((adj_list_p, (adj_list_s, adj_list_o)),
                        shape=(num_nodes, num_nodes))
        Gv = csr_matrix((adj_list_data, (adj_list_s, adj_list_o)),
                        shape=(num_nodes, num_nodes))
        save_npz(join(HOME, 'sm', 'G_fil_rel_{}.npz'.format(int(pid))), Gr)
        save_npz(join(HOME, 'sm', 'G_fil_val_{}.npz'.format(int(pid))), Gv)

    # Path extraction
    log.info('=> Path extraction..(this can take a while)')
    t1 = time()
    features, pos_features, neg_features, measurements = extract_paths_sm_par(
        Gv, Gr, triples, y)
    gc.collect()
    log.info('P: +:{}, -:{}, unique tot:{}'.format(
        len(pos_features), len(neg_features), len(features)))
    vec = DictVectorizer()
    X = vec.fit_transform(measurements)
    n, m = X.shape
    log.info('Time taken: {:.2f}s\n\n'.format(time() - t1))

    # Path selection
    log.info('=> Path selection..')
    t1 = time()
    pathselect = SelectKBest(mutual_info_classif, k=min(100, m))
    X_select = pathselect.fit_transform(X, y)
    selectidx = pathselect.get_support(indices=True)  # selected feature indices
    vec = vec.restrict(selectidx, indices=True)
    select_pos_features, select_neg_features = set(), set()
    for feature in vec.get_feature_names():
        if feature in pos_features:
            select_pos_features.add(feature)
        if feature in neg_features:
            select_neg_features.add(feature)
    log.info('D: +:{}, -:{}, tot:{}'.format(
        len(select_pos_features), len(select_neg_features), X_select.shape[1]))
    log.info('Time taken: {:.2f}s\n'.format(time() - t1))

    # Fact interpretation
    if use_interpretable_features and len(select_neg_features) > 0:
        log.info('=> Fact interpretation..')
        t1 = time()
        theta = 10
        select_neg_idx = [
            i for i, f in enumerate(vec.get_feature_names())
            if f in select_neg_features
        ]
        # drop negative features that occur in at least `theta` triples
        removemask = np.where(
            np.asarray(X_select[:, select_neg_idx].sum(axis=0)).ravel() >= theta)[0]
        restrictidx = [select_neg_idx[i] for i in removemask]
        keepidx = []
        for i, f in enumerate(vec.get_feature_names()):
            if i not in restrictidx:
                keepidx.append(i)
            else:
                select_neg_features.remove(f)
        vec = vec.restrict(keepidx, indices=True)
        X_select = X_select[:, keepidx]
        log.info('D*: +:{}, -:{}, tot:{}'.format(
            len(select_pos_features), len(select_neg_features), X_select.shape[1]))
        log.info('Time taken: {:.2f}s\n'.format(time() - t1))

    # Model creation
    log.info('=> Model building..')
    t1 = time()
    model = find_best_model(X_select, y, cv=cv)
    log.info('#Features: {}, best-AUROC: {:.5f}'.format(
        X_select.shape[1], model['best_score']))
    log.info('Time taken: {:.2f}s\n'.format(time() - t1))

    return vec, model
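# Hypothetical end-to-end sketch for the classifier above; not part of the
# pipeline. The CSV path is a placeholder; per the docstring, the frame needs
# sid, pid, oid and class columns, and the rows are assumed to share a single
# predicate per call.
def _train_model_sm_usage_sketch(G, relsim):
    import pandas as pd
    triples = pd.read_csv('triples_pid1.csv')   # hypothetical input file
    vec, model = train_model_sm(G, triples, relsim,
                                use_interpretable_features=True, cv=5)
    print('#features: {}, best AUROC: {:.5f}'.format(
        len(vec.get_feature_names()), model['best_score']))
    return vec, model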
def test_graph2():
    sym = True
    adj = np.array([[0, 1, 0, 16], [2, 4, 0, 14], [4, 5, 0, 4],
                    [0, 2, 1, 13], [2, 1, 1, 4], [3, 5, 1, 20],
                    [1, 3, 2, 12], [3, 2, 2, 9], [4, 3, 2, 7]])
    shape = (6, 6, 3)
    G = make_graph(adj[:, :3], shape, values=adj[:, 3], sym=sym, display=False)
    print("Original graph:\n", G)

    # set weights
    indegsim = weighted_degree(G.indeg_vec, weight='degree').reshape((1, G.N))
    indegsim = indegsim.ravel()
    targets = G.csr.indices % G.N
    specificity_wt = indegsim[targets]  # specificity
    G.csr.data = specificity_wt.copy()

    # back up
    data = G.csr.data.copy()
    indices = G.csr.indices.copy()
    indptr = G.csr.indptr.copy()

    # Closure: expected [source, relation, target, score, path, relational_path]
    expect = [
        [0, 0, 1, 0.20000000000000001, [0, 2, 1], [-1, 1, 1]],
        [0, 0, 2, 1.0, [0, 2], [-1, 1]],
        [0, 0, 3, 0.25, [0, 1, 3], [-1, 0, 2]],
        [0, 0, 4, 0.20000000000000001, [0, 2, 4], [-1, 1, 0]],
        [0, 0, 5, 0.125, [0, 1, 3, 5], [-1, 0, 2, 1]],
        [0, 1, 1, 1.0, [0, 1], [-1, 0]],
        [0, 1, 2, 0.25, [0, 1, 2], [-1, 0, 1]],
        [0, 1, 3, 0.25, [0, 1, 3], [-1, 0, 2]],
        [0, 1, 4, 0.20000000000000001, [0, 2, 4], [-1, 1, 0]],
        [0, 1, 5, 0.125, [0, 1, 3, 5], [-1, 0, 2, 1]],
        [0, 2, 1, 1.0, [0, 1], [-1, 0]],
        [0, 2, 2, 1.0, [0, 2], [-1, 1]],
        [0, 2, 3, 0.25, [0, 1, 3], [-1, 0, 2]],
        [0, 2, 4, 0.20000000000000001, [0, 2, 4], [-1, 1, 0]],
        [0, 2, 5, 0.125, [0, 1, 3, 5], [-1, 0, 2, 1]],
        [1, 0, 0, 0.20000000000000001, [1, 2, 0], [-1, 1, 1]],
        [1, 0, 2, 1.0, [1, 2], [-1, 1]],
        [1, 0, 3, 1.0, [1, 3], [-1, 2]],
        [1, 0, 4, 0.20000000000000001, [1, 3, 4], [-1, 2, 2]],
        [1, 0, 5, 0.20000000000000001, [1, 3, 5], [-1, 2, 1]],
        [1, 1, 0, 1.0, [1, 0], [-1, 0]],
        [1, 1, 2, 0.33333333333333331, [1, 0, 2], [-1, 0, 1]],
        [1, 1, 3, 1.0, [1, 3], [-1, 2]],
        [1, 1, 4, 0.20000000000000001, [1, 3, 4], [-1, 2, 2]],
        [1, 1, 5, 0.20000000000000001, [1, 3, 5], [-1, 2, 1]],
        [1, 2, 0, 1.0, [1, 0], [-1, 0]],
        [1, 2, 2, 1.0, [1, 2], [-1, 1]],
        [1, 2, 3, 0.20000000000000001, [1, 2, 3], [-1, 1, 2]],
        [1, 2, 4, 0.20000000000000001, [1, 3, 4], [-1, 2, 2]],
        [1, 2, 5, 0.20000000000000001, [1, 3, 5], [-1, 2, 1]],
        [2, 0, 0, 1.0, [2, 0], [-1, 1]],
        [2, 0, 1, 1.0, [2, 1], [-1, 1]],
        [2, 0, 3, 1.0, [2, 3], [-1, 2]],
        [2, 0, 4, 0.20000000000000001, [2, 3, 4], [-1, 2, 2]],
        [2, 0, 5, 0.25, [2, 4, 5], [-1, 0, 0]],
        [2, 1, 0, 0.25, [2, 1, 0], [-1, 1, 0]],
        [2, 1, 1, 0.33333333333333331, [2, 0, 1], [-1, 1, 0]],
        [2, 1, 3, 1.0, [2, 3], [-1, 2]],
        [2, 1, 4, 1.0, [2, 4], [-1, 0]],
        [2, 1, 5, 0.25, [2, 4, 5], [-1, 0, 0]],
        [2, 2, 0, 1.0, [2, 0], [-1, 1]],
        [2, 2, 1, 1.0, [2, 1], [-1, 1]],
        [2, 2, 3, 0.25, [2, 1, 3], [-1, 1, 2]],
        [2, 2, 4, 1.0, [2, 4], [-1, 0]],
        [2, 2, 5, 0.25, [2, 4, 5], [-1, 0, 0]],
        [3, 0, 0, 0.25, [3, 1, 0], [-1, 2, 0]],
        [3, 0, 1, 1.0, [3, 1], [-1, 2]],
        [3, 0, 2, 1.0, [3, 2], [-1, 2]],
        [3, 0, 4, 1.0, [3, 4], [-1, 2]],
        [3, 0, 5, 1.0, [3, 5], [-1, 1]],
        [3, 1, 0, 0.25, [3, 1, 0], [-1, 2, 0]],
        [3, 1, 1, 1.0, [3, 1], [-1, 2]],
        [3, 1, 2, 1.0, [3, 2], [-1, 2]],
        [3, 1, 4, 1.0, [3, 4], [-1, 2]],
        [3, 1, 5, 0.25, [3, 4, 5], [-1, 2, 0]],
        [3, 2, 0, 0.25, [3, 1, 0], [-1, 2, 0]],
        [3, 2, 1, 0.20000000000000001, [3, 2, 1], [-1, 2, 1]],
        [3, 2, 2, 0.25, [3, 4, 2], [-1, 2, 0]],
        [3, 2, 4, 0.33333333333333331, [3, 5, 4], [-1, 1, 0]],
        [3, 2, 5, 1.0, [3, 5], [-1, 1]],
        [4, 0, 0, 0.20000000000000001, [4, 2, 0], [-1, 0, 1]],
        [4, 0, 1, 0.20000000000000001, [4, 3, 1], [-1, 2, 2]],
        [4, 0, 2, 0.20000000000000001, [4, 3, 2], [-1, 2, 2]],
        [4, 0, 3, 1.0, [4, 3], [-1, 2]],
        [4, 0, 5, 0.20000000000000001, [4, 3, 5], [-1, 2, 1]],
        [4, 1, 0, 0.20000000000000001, [4, 2, 0], [-1, 0, 1]],
        [4, 1, 1, 0.20000000000000001, [4, 3, 1], [-1, 2, 2]],
        [4, 1, 2, 1.0, [4, 2], [-1, 0]],
        [4, 1, 3, 1.0, [4, 3], [-1, 2]],
        [4, 1, 5, 1.0, [4, 5], [-1, 0]],
        [4, 2, 0, 0.20000000000000001, [4, 2, 0], [-1, 0, 1]],
        [4, 2, 1, 0.20000000000000001, [4, 3, 1], [-1, 2, 2]],
        [4, 2, 2, 1.0, [4, 2], [-1, 0]],
        [4, 2, 3, 0.33333333333333331, [4, 5, 3], [-1, 0, 1]],
        [4, 2, 5, 1.0, [4, 5], [-1, 0]],
        [5, 0, 0, 0.125, [5, 4, 2, 0], [-1, 0, 0, 1]],
        [5, 0, 1, 0.20000000000000001, [5, 3, 1], [-1, 1, 2]],
        [5, 0, 2, 0.25, [5, 4, 2], [-1, 0, 0]],
        [5, 0, 3, 1.0, [5, 3], [-1, 1]],
        [5, 0, 4, 0.20000000000000001, [5, 3, 4], [-1, 1, 2]],
        [5, 1, 0, 0.125, [5, 4, 2, 0], [-1, 0, 0, 1]],
        [5, 1, 1, 0.20000000000000001, [5, 3, 1], [-1, 1, 2]],
        [5, 1, 2, 0.25, [5, 4, 2], [-1, 0, 0]],
        [5, 1, 3, 0.25, [5, 4, 3], [-1, 0, 2]],
        [5, 1, 4, 1.0, [5, 4], [-1, 0]],
        [5, 2, 0, 0.125, [5, 4, 2, 0], [-1, 0, 0, 1]],
        [5, 2, 1, 0.20000000000000001, [5, 3, 1], [-1, 1, 2]],
        [5, 2, 2, 0.25, [5, 4, 2], [-1, 0, 0]],
        [5, 2, 3, 1.0, [5, 3], [-1, 1]],
        [5, 2, 4, 1.0, [5, 4], [-1, 0]]]

    results = []
    itr = 0
    for s in range(G.N):
        for p in range(G.R):
            for o in range(G.N):
                if s == o:
                    continue
                G.csr.data[targets == o] = 1  # no cost for target o
                rp = relclosure(G, s, p, o, kind='metric', linkpred=True)
                tmp = [rp.source, rp.relation, rp.target, rp.score,
                       rp.path, rp.relational_path]
                results.append(tmp)
                assert allclose(expect[itr], tmp)
                itr += 1
                G.csr.data = data.copy()
                G.csr.indices = indices.copy()
                G.csr.indptr = indptr.copy()