def main(fname=None, pkl=True, **kwds):
    assert fname
    if isinstance(pkl, basestring) and pkl.lower() in ('f', 'false', 'none'):
        pkl = False
    # Coerce string command-line parameters to numeric types.
    if 'err' in kwds:
        kwds['err'] = int(kwds['err'])
    if 'th' in kwds:
        kwds['th'] = float(kwds['th'])
    D = mio.load(fname)
    print "Computing all pairs weak class for a (%d x %d) data matrix (%d x %d result matrix)..." % \
        (D['M'].shape[0], D['M'].shape[1], D['M'].shape[0], D['M'].shape[0])
    WEAK, err, th = compute_all_weak(D['M'], **kwds)
    print "Used parameters err=%d, cutoff th=%f" % (err, th)
    fname_out = "%s.err%d.th%.4f.weak.tab" % (fname, err, th)
    print "Saving %s..." % fname_out
    mio.save(WEAK, fname_out, fmt="%d", row_ids=D['row_ids'], col_ids=D['row_ids'])
    if pkl:
        fname_pkl_out = fname_out.rpartition('.')[0] + '.pkl'
        print "Saving %s..." % fname_pkl_out
        pickle.dump(WEAK, open(fname_pkl_out, "w"), protocol=-1)
    return WEAK
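
# Illustrative only: a hedged usage sketch. These main(**kwds) entry points
# accept string keyword arguments and coerce them to numbers themselves,
# which suggests key=value invocation from the command line. The argv
# dispatcher below is an assumption, not part of the original module.
if __name__ == "__main__":
    import sys
    main(**dict(s.split('=', 1) for s in sys.argv[1:]))
    # e.g.: python script.py fname=data.tab err=1 th=0.15 pkl=false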
def main():
    fp = open(FNAME)
    fp.next()  # skip header line
    ppi = {}
    genes = set()
    for line in fp:
        row = line.strip('\n\r').split('\t')
        genes.add(row[0])
        genes.add(row[1])
        ppi.setdefault(row[0], set()).add(row[1])
    rownames = sorted(genes)
    colnames = rownames
    row_idx = dict((s, i) for i, s in enumerate(rownames))
    A = np.zeros((len(rownames), len(colnames)))
    for gene_i in ppi:
        for gene_j in ppi[gene_i]:
            i = row_idx[gene_i]
            j = row_idx[gene_j]
            A[i, j] = 1
            A[j, i] = 1  # symmetric: interactions are undirected
    print >>sys.stderr, "# interactions x2: %d" % np.sum(A)
    print >>sys.stderr, "# (genes x genes) size:", A.shape
    mio.save(A, fp=FNAME_OUT, row_ids=['"%s"' % s for s in rownames],
             col_ids=['"%s"' % s for s in colnames], fmt="%d")
def main(fname=None, pkl=True, **kwds):
    assert fname
    if isinstance(pkl, basestring) and pkl.lower() in ('f', 'false', 'none'):
        pkl = False
    # Coerce string command-line parameters to numeric types.
    for key in ('b', 'z_th', 'err_th', 'd_th', 'r_th'):
        if key in kwds:
            kwds[key] = float(kwds[key])
    print "Loading data..."
    D = mio.load(fname)
    print "Computing all pairs boolean class for a (%d x %d) data matrix (%d x %d result matrix)..." % \
        (D['M'].shape[0], D['M'].shape[1], D['M'].shape[0], D['M'].shape[0])
    CLS, steps, b = compute_all_bool(D['M'], **kwds)
    z = kwds.get('z_th', 3.0)
    r = kwds.get('r_th', 2.0 / 3)  # float division: 2/3 is 0 in Python 2
    if r < 0.5:
        print "WARNING: r<0.5 (r=%f)... are you sure?" % r
    err = kwds.get('err_th', 0.1)
    fname_out = '%s.b%.4f.z%.2f.r%.2f.err%.2f.bool.tab' % (fname, b, z, r, err)
    print "Saving %s..." % fname_out
    mio.save(CLS, fname_out, fmt="%d", row_ids=D['row_ids'], col_ids=D['row_ids'])
    steps_fname = fname + ".steps.txt"
    print "Saving high/low thresholds to %s in original row order..." % steps_fname
    open(steps_fname, "w").write("\n".join("%f" % x for x in steps))
    if pkl:
        fname_pkl_out = fname_out.rpartition('.')[0] + '.pkl'
        print "Saving %s..." % fname_pkl_out
        pickle.dump(CLS, open(fname_pkl_out, "w"), protocol=-1)
def extend_training_datas():
    """Extend training data fourfold (42000 -> 168000) by taking the four
    corner 24x24 crops of each 28x28 image."""
    old_features, old_labels = lsd.load_train_data('../../dataset/train.csv')
    extend_features = []
    extend_labels = []
    print 'extend training data fourfold: 42000 -> 168000'
    for i in xrange(len(old_features)):
        feature = old_features[i]
        label = old_labels[i]
        feature = np.mat(feature).reshape(28, 28)
        # m, n in {0, 4}: column/row offsets of the four corner 24x24 crops
        for m in range(0, 5, 4):
            for n in range(0, 5, 4):
                f_temp = feature[n:28 - 4 + n, m:28 - 4 + m]
                f_temp = f_temp.reshape(1, 24 * 24)
                f_temp = f_temp.tolist()[0]
                extend_features.append(f_temp)
                extend_labels.append(label)
        print 'train:', i, np.shape(extend_features), "-", np.shape(extend_labels)
    matrix_io.save(extend_features, 'extend_train_features.pkl')
    matrix_io.save(extend_labels, 'extend_train_labels.pkl')
    print 'training data count:', len(old_features), \
        '-> extended training data:', len(extend_features)
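
# A minimal sketch of the four-corner cropping used above, assuming 28x28
# inputs; demo_crops is a hypothetical helper, not part of the pipeline.
import numpy as np

def demo_crops(image_28x28):
    crops = []
    for m in (0, 4):      # column offset of the crop
        for n in (0, 4):  # row offset of the crop
            crops.append(image_28x28[n:n + 24, m:m + 24])
    return crops  # four 24x24 views of the original image

# Example: a dummy image yields exactly four 24x24 crops.
print [c.shape for c in demo_crops(np.arange(784).reshape(28, 28))]
# [(24, 24), (24, 24), (24, 24), (24, 24)]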
def main(fname=None, pkl=True, algorithm="3", outtag="", **kwds):
    assert fname
    if isinstance(pkl, basestring) and pkl.lower() in ('f', 'false', 'none'):
        pkl = False
    print "Loading data from %s..." % fname
    D = mio.load(fname)
    print "Computing all pairs (euclidean) distance correlation from a (%d x %d) data matrix to a (%d x %d) result matrix..." % \
        (D['M'].shape[0], D['M'].shape[1], D['M'].shape[0], D['M'].shape[0])
    if algorithm == "1":
        print "Using Algorithm 1: single dot product, n^2*m memory"
        DCOR = compute_all_dcor(D['M'], **kwds)
    elif algorithm == "2":
        print "Using Algorithm 2: multiple dot products, n*m memory"
        DCOR = compute_all_dcor_2(D['M'], **kwds)
    elif algorithm == "3":
        print "Using Algorithm 3: multiple dot products, n*m memory, n choose 2 savings"
        DCOR = compute_all_dcor_3(D['M'], **kwds)
    else:
        raise ValueError("Unknown algorithm %s" % algorithm)
    if outtag and outtag[-1] != ".":
        outtag += "."
    fname_out = '%s.%sdcor.tab' % (fname, outtag)
    print "Saving %s..." % fname_out
    mio.save(DCOR, fname_out, fmt="%.4f", row_ids=D['row_ids'], col_ids=D['row_ids'])
    if pkl:
        fname_pkl_out = fname_out.rpartition('.')[0] + '.pkl'
        print "Saving %s..." % fname_pkl_out
        pickle.dump(DCOR, open(fname_pkl_out, "w"), protocol=-1)
    return DCOR
def extend_test_datas():
    """Extend test data fourfold (28000 -> 112000) by taking the four
    corner 24x24 crops of each 28x28 image."""
    old_test_features = lsd.load_test_data('../../dataset/test.csv')
    extend_features = []
    print 'extend test data fourfold: 28000 -> 112000'
    for i in xrange(len(old_test_features)):
        feature = np.mat(old_test_features[i]).reshape(28, 28)
        # m, n in {0, 4}: column/row offsets of the four corner 24x24 crops
        for m in range(0, 5, 4):
            for n in range(0, 5, 4):
                f_temp = feature[n:28 - 4 + n, m:28 - 4 + m]
                f_temp = f_temp.reshape(1, 24 * 24)
                f_temp = f_temp.tolist()[0]
                extend_features.append(f_temp)
        print 'test:', i, np.shape(extend_features)
    matrix_io.save(extend_features, 'extend_test_features.pkl')
    print 'test data count:', len(old_test_features), \
        '-> extended test data:', len(extend_features)
def main(write_adj_fname=ADJM_OUT_FNAME):
    if isinstance(write_adj_fname, basestring) and \
       write_adj_fname.lower() in ('f', 'false', 'none'):
        write_adj_fname = None
    all_genes = set()
    tf_targs = {}
    for line in open(TARGS_FNAME):
        row = line.strip('\n\r').split(',')
        tf_targs[row[0]] = set(row[1:])
        all_genes.update(row[1:])
    tfs = set(tf_targs.keys())
    for line in open(TFLIST_FNAME):
        tfs.add(line.strip('\n\r'))
    all_genes.update(tfs)
    for line in open(LIT_FNAME):
        row = line.strip().split(',')
        tf, targs = row[0], row[1:]
        tfs.add(tf)
        all_genes.update(row)
        tf_targs.setdefault(tf, set()).update(targs)
    # Stats
    print "# genes total:", len(all_genes)
    print "# transcription factors:", len(tfs)
    print "# transcription factors with targets:", len(tf_targs)
    # Text adjacency list
    print "Printing list of transcription factors with putative targets to %s" % MERGED_LIST_OUT_FNAME
    fp_out = open(MERGED_LIST_OUT_FNAME, "w")
    for tf in sorted(tfs):
        if tf in tf_targs:
            # Join this TF's target set, not the whole dict.
            print >>fp_out, "%s,%s" % (tf, ",".join(sorted(tf_targs[tf])))
        else:
            print >>fp_out, tf
    # Text adjacency matrix: columns are TFs, rows are TF targets.
    if not write_adj_fname:
        return 0
    print "Writing to .tab text file..."
    rownames = sorted(all_genes)
    colnames = sorted(tfs)
    row_idx = dict((s, i) for i, s in enumerate(rownames))
    A = np.zeros((len(rownames), len(colnames)))
    for j, tf in enumerate(colnames):
        for g in tf_targs.get(tf, []):
            i = row_idx[g]
            A[i, j] = 1
    print >>sys.stderr, "# interactions: %d" % np.sum(A)
    print >>sys.stderr, "# (genes x TFs) size:", A.shape
    mio.save(A, fp=write_adj_fname, row_ids=['"%s"' % s for s in rownames],
             col_ids=['"%s"' % s for s in colnames], fmt="%d")
    # Compile to an R object using an R script wrapper.
    print "Converting to RData binary object..."
    r_script = LOAD_TMP % {'fname': write_adj_fname}
    p = Popen(["R", "--vanilla", "--slave"], stdout=PIPE, stdin=PIPE, stderr=STDOUT)
    print p.communicate(input=r_script)
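
# Illustrative only: LOAD_TMP is defined elsewhere in this module. A
# plausible shape for it, under the assumption that it reads the saved
# .tab matrix and serializes it as an .RData object; the R code below is
# hypothetical, not the original template.
LOAD_TMP_SKETCH = """
m <- as.matrix(read.table("%(fname)s", sep="\t", header=TRUE, row.names=1))
save(m, file="%(fname)s.RData")
"""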
def main(): ADJ_D = mio.load("data/gold_0.32_dot_nw.adj.csv") M = np.array(ADJ_D['M'],dtype=int) print M P = np.zeros(M.shape, dtype=np.int) for i in xrange(M.shape[1]): js = list(get_connected(M, i, k=2)) P[js,i] = 1 print P print M==P assert ADJ_D['row_ids'] == ADJ_D['col_ids'] mio.save(P, open("data/gold.paths.dcor0.32.k2.tab","w"), ftype="txt", row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'], fmt="%d")
def main(fname=None, as_rows=True, use_weak=False):
    assert fname
    if isinstance(as_rows, basestring) and as_rows.lower() in ('f', 'false', 'none'):
        as_rows = False
    if not use_weak:
        print "Loading boolean class enumeration matrix", fname
    else:
        print "Loading weak class enumeration matrix", fname
    D = mio.load(fname, dtype=np.int)
    M = D['M']
    # Verify that the enumeration matrix looks credible.
    if not use_weak:
        Z = np.in1d(M, np.array([0, 1, 2, 3, 4, 5, 6, 7]))
        if not np.all(Z):
            print "%d invalid values in M." % np.sum(~Z)
            print "up to 20 unrecognized values include..."
            zz = M[~Z]  # select the unrecognized values, not the valid ones
            print np.unique(zz)[:min(20, len(zz))]
    else:
        assert np.all(np.in1d(M, np.array([0, 1, 2, 3, 4, 5])))
    if not as_rows:
        print "Computing distance between all pairs of columns..."
        M = np.transpose(M)
    else:
        print "Computing distance between all pairs of rows..."
    if not use_weak:
        print "Computing Boolean Class distance"
        DIST = all_pairs_bool_dist(M)
        fname_out = fname + '.booldist.tab'
    else:
        print "Computing Weak Class distance"
        DIST = all_pairs_weak_dist(M)
        fname_out = fname + '.weakdist.tab'
    print "Saving class distance matrix as", fname_out
    if as_rows:
        ids = D.get('row_ids', None)
    else:
        ids = D.get('col_ids', None)
    mio.save(DIST, fp=fname_out, row_ids=ids, col_ids=ids, fmt="%d")
    return fname_out
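
# Illustrative only: all_pairs_bool_dist and all_pairs_weak_dist are
# defined elsewhere. A minimal sketch, assuming the distance counts
# positions where two rows' class enumerations disagree (a Hamming-style
# distance); not the original implementation.
import numpy as np

def all_pairs_hamming_sketch(M):
    n = M.shape[0]
    DIST = np.zeros((n, n), dtype=np.int)
    for i in xrange(n):
        for j in xrange(i + 1, n):
            d = np.sum(M[i, :] != M[j, :])  # number of disagreeing columns
            DIST[i, j] = DIST[j, i] = d
    return DIST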
def main():
    # 1: load adjacency matrix
    ADJ_D = mio.load(ADJ_FNAME, dtype=np.int, force_row_ids=True, force_col_ids=True)
    assert len(ADJ_D['row_ids']) == len(ADJ_D['col_ids'])
    assert len(ADJ_D['row_ids']) == ADJ_D['M'].shape[0]
    assert ADJ_D['M'].shape[0] == ADJ_D['M'].shape[1]
    # 2.1: find paths, k=3 (save P3, not the undefined Pinf)
    P3 = paths.fill_paths(ADJ_D["M"], k=3)
    mio.save(P3, open("data/all_k61_0.5_dot_nw.adj.paths.k3.csv", "w"), ftype="txt",
             row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")
    # 2.2: find paths, k=2
    P2 = paths.fill_paths(ADJ_D["M"], k=2)
    mio.save(P2, open("data/all_k61_0.5_dot_nw.adj.paths.k2.csv", "w"), ftype="txt",
             row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")
    # 3: load ranks
    ranks = load_ranks(open(RANKS_FNAME))
    print ranks
def main():
    v = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    H = np.matrix([[0, 0, 0, 1], [1, 1, 1, 0], [0, 0, 0, 1]])
    L = np.matrix([[1, 1, 0, 0], [0, 0, 0, 1], [1, 1, 1, 0]])
    print stepfit(v)
    D = mio.load("nice.may3.Eg.expr.gold.celegans.csv")
    M = D['M']
    Steps = []
    for row in M:
        Steps.append(stepfit(row)[0])
    b = 0.3
    #CLS = all_pairs_bool(M, Steps, b, z_th=2.7)
    CLS = all_pairs_bool(M, Steps, b)
    print CLS
    print D.keys()
    mio.save(CLS, "nice.test.tab", fmt="%d", row_ids=D['row_ids'], col_ids=D['row_ids'])
    print "OK: saved result to nice.test.tab"
def main(pkl_fname=None, row_fname=None, col_fname=None, outdir=None,
         sig=None, doabs=False, diag=1):
    """
    pkl_fname: path to pickled numpy dependency matrix
    row_fname: path to labeled text matrix with row ids, maybe col ids
    col_fname: optional path to labeled text matrix with col ids
    sig: float of minimum significance
    doabs: flag of whether to use absolute value for significance testing
    diag: if matrix is symmetric, the value of the diagonal
    """
    assert pkl_fname and row_fname and outdir
    make_dir(outdir)
    if sig is not None:
        sig = float(sig)  # coerce before it is formatted into the filename below
    abstxt = "T" if doabs else "F"
    out_fname = os.path.join(outdir, os.path.basename(pkl_fname.rpartition('.')[0]))
    if sig:
        out_fname += ".sig%f" % sig
    if doabs:
        out_fname += ".absT"
    out_fname += ".tab"
    print "Text matrix will be saved to: %s" % out_fname
    M = pickle.load(open(pkl_fname))
    # Get row and column labels.
    try:
        D_row = mio.load(row_fname)
        row_names = np.array(D_row['row_ids'])
    except AssertionError:
        row_names = np.array([s.strip('\n\r') for s in open(row_fname)])
    if col_fname is None:
        col_names = np.array(D_row['col_ids'])
    elif row_fname == col_fname:
        col_names = row_names
    else:
        try:
            # Use the column file's row IDs as column IDs of the dependency matrix.
            D_col = mio.load(col_fname)
            col_names = np.array(D_col['row_ids'])
        except AssertionError:
            col_names = np.array([s.strip('\n\r') for s in open(col_fname)])
    if len(row_names) == np.size(M, 0) and len(col_names) == np.size(M, 1):
        print "Number of row (%d) and column (%d) names fit matrix size (%d,%d)." % \
            (len(row_names), len(col_names), np.size(M, 0), np.size(M, 1))
    else:
        n = len(row_names)
        if np.size(M, 0) == n * (n - 1) // 2:
            print "Matrix seems to be an n choose 2 upper triangle. Converting to full matrix..."
            M = distance.squareform(M)
            if diag is not None:
                print "Forcing diagonal to be:", diag
                for i in xrange(n):
                    M[i, i] = diag
        else:
            raise ValueError("Unknown matrix size %s given #row_ids(%d), #col_ids(%d)" %
                             (np.shape(M), len(row_names), len(col_names)))
    # Remove insignificant rows and columns; align row/col names.
    original_dim = M.shape
    if sig is not None:
        if not doabs:
            col_max = np.amax(M, 0)
            row_max = np.amax(M, 1)
        else:
            col_max = np.amax(np.abs(M), 0)
            row_max = np.amax(np.abs(M), 1)
        M = M[row_max >= sig, :][:, col_max >= sig]
        row_names = row_names[row_max >= sig]
        col_names = col_names[col_max >= sig]
    new_dim = M.shape
    # Dump to text.
    now_timestamp = datetime.datetime.now().isoformat('_')
    header = ["Generated on %s from pickled matrix file %s" % (now_timestamp, pkl_fname),
              "Original dimensions: %s, New dimensions: %s" % (original_dim, new_dim),
              "sig: %s, abs: %s" % (str(sig), str(abstxt))]
    print "\n".join(header)
    fp = open(out_fname, "w")
    mio.save(M, fp, ftype="txt", delimit_c="\t", row_ids=list(row_names),
             col_ids=list(col_names), headers=header)
    fp.close()
    print "Tab matrix saved to %s." % out_fname
    return out_fname
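
# A small demonstration of the n-choose-2 conversion used above:
# scipy.spatial.distance.squareform expands a condensed upper-triangle
# vector of length n*(n-1)/2 into the full symmetric n x n matrix.
import numpy as np
from scipy.spatial import distance

v = np.array([1.0, 2.0, 3.0])  # condensed form for n=3 (3 choose 2 = 3)
print distance.squareform(v)
# [[ 0.  1.  2.]
#  [ 1.  0.  3.]
#  [ 2.  3.  0.]]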
"""Read PINA file, print simple adjaceny list""" from pina import * import matrix_io as mio pina_fpath = "/nfs/01/osu6683/H**o sapiens-20121210.txt" P = PINAEnriched(open(pina_fpath)) row_names, A = P.return_adj_matrix() mio.save(M=A, fp="/nfs/01/osu6683/pina_adj_matrix.tab", fmt="%d", row_ids=row_names, col_ids=row_names, fill_upper_left=False)
def main():
    # 1: load adjacency and dCor matrices
    ADJ_D = mio.load(ADJ_FNAME, dtype=np.int, force_row_ids=True, force_col_ids=True)
    DCOR_D = mio.load(DCOR_FNAME, force_row_ids=True, force_col_ids=True)
    assert len(ADJ_D['row_ids']) == len(ADJ_D['col_ids'])
    assert len(ADJ_D['row_ids']) == ADJ_D['M'].shape[0]
    assert DCOR_D["row_ids"] == ADJ_D['row_ids']
    assert DCOR_D["row_ids"] == DCOR_D["col_ids"]
    assert ADJ_D['M'].shape[0] == ADJ_D['M'].shape[1]
    n = ADJ_D['M'].shape[0]
    # 2.1: find paths, k=2
    P2 = paths.fill_paths(ADJ_D["M"], k=2)
    mio.save(P2, open(P2_FNAME, "w"), ftype="txt", row_ids=ADJ_D['row_ids'],
             col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")
    # 2.2: find paths, k=3
    P3 = paths.fill_paths(ADJ_D["M"], k=3)
    mio.save(P3, open(P3_FNAME, "w"), ftype="txt", row_ids=ADJ_D['row_ids'],
             col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")
    name_order = ADJ_D['row_ids']
    # 3.1: load ranks (this list was compiled manually).
    # Note that rank elements are 0-indexed into the ADJ matrix.
    ranks = load_ranks_named(open(RANKS_FNAME), name_order)
    #print ranks
    #print name_order
    node_clusters = []
    # 4: find clusters in the same rank at k=2
    Ignore_Clust = np.zeros(ADJ_D['M'].shape, dtype=np.bool)
    for lvl, r in enumerate(ranks):
        CC = group_in_same_rank(r, P2)
        rnp = np.array(r)
        c = [map(str, rnp[cc] + 1) for cc in CC]
        node_clusters.append(c)
        # Ignore edges within equal-rank clusters.
        for node in r:
            adj = set(np.nonzero(ADJ_D['M'][:, node])[0])
            for e in (adj & set(r)):
                Ignore_Clust[node, e] = True
                # Also remove the corresponding opposite direction.
                if ADJ_D['M'][node, e] == ADJ_D['M'][e, node] == 1:
                    Ignore_Clust[e, node] = True
    print "Clusters"
    for c in node_clusters:
        print c
    print
    # 4.5: attempt to hide lower-strength edges without disconnecting nodes
    DCOR = DCOR_D['M']
    AD = ADJ_D['M'].copy()
    Ignore_Low = np.zeros(ADJ_D['M'].shape, dtype=np.bool)
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            if DCOR[i, j] < DCOR_TH and AD[j, i]:
                # An edge from i to j exists and is under the dCor threshold.
                # Can we remove it without disconnecting either node?
                AD[j, i] = 0
                if AD[i, j]:
                    # undirected edge
                    AD[i, j] = 0
                    # AD[j, :] / AD[i, :] are row sums (out-edges);
                    # AD[:, i] / AD[:, j] are column sums (in-edges)
                    if np.sum(AD[:, i]) == 0 or np.sum(AD[j, :]) == 0 or \
                       np.sum(AD[:, j]) == 0 or np.sum(AD[i, :]) == 0:
                        # No: removal disconnects something; restore the edge.
                        AD[i, j] = 1
                        AD[j, i] = 1
                    else:
                        Ignore_Low[i, j] = True
                        Ignore_Low[j, i] = True
                else:
                    # directed edge
                    if np.sum(AD[:, i]) == 0 or np.sum(AD[j, :]) == 0:
                        # No: removal disconnects something; restore the edge.
                        AD[j, i] = 1
                    else:
                        Ignore_Low[j, i] = True
    NL = count_edges(Ignore_Low)
    assert np.sum(Ignore_Low & ADJ_D['M']) == np.sum(Ignore_Low)
    print "Too low:", NL
    # 5: look for redundant directed edges between levels at least 2 levels
    # apart; remove an edge if a path of equal length already exists.
    Ignore_Far = np.zeros(ADJ_D['M'].shape, dtype=np.bool)
    A = ADJ_D['M'].copy()
    A = A & (~Ignore_Low)
    n_far_edges = 0
    for lvl in xrange(len(ranks) - 2):
        this_rank = ranks[lvl]
        for dlvl in xrange(lvl + 2, len(ranks)):
            delta = dlvl - lvl
            that_rank = ranks[dlvl]
            for top in this_rank:
                for low in that_rank:
                    if A[low, top]:  # adjacency is col->row
                        n_far_edges += 1
                        A[low, top] = 0  # try removing this link
                        # Is there an alternate path of equal length to this node?
                        conn = paths.is_path(A, top, delta + 1)
                        if not low in conn:
                            A[low, top] = 1  # we need this edge after all
                        else:
                            Ignore_Far[low, top] = True
                            # Also remove an associated undirected edge.
                            if A[top, low]:
                                Ignore_Far[top, low] = True
                                A[top, low] = 0
    print "# Far edges", n_far_edges
    # 6: print stats
    assert np.sum(Ignore_Clust & Ignore_Far) == 0
    NT = count_edges(ADJ_D['M'])
    NS = count_edges(Ignore_Clust)
    NF = count_edges(Ignore_Far)
    print "Total:", NT
    print "Same Level:", NS
    print "Redundant Far:", NF
    n_rm = NS['total'] + NF['total'] + NL['total']  # this is wrong
    #print "removed:", n_rm
    #print "reduction:", n_rm / NT['total']
    # 7: save edge-ignore matrix
    Ignore = Ignore_Clust | Ignore_Far | Ignore_Low
    NI = count_edges(Ignore)
    print "Ignored", NI
    print np.sum(Ignore)
    print np.sum(Ignore_Clust | Ignore_Far)
    print "Save Ignore Matrix at:", IGNORE_FNAME
    mio.save(Ignore, open(IGNORE_FNAME, "w"), ftype="txt", row_ids=ADJ_D['row_ids'],
             col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")
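
# Illustrative only: count_edges is defined elsewhere. A minimal sketch,
# assuming it tallies undirected (reciprocal) and one-way directed edges
# of a col->row adjacency matrix into a dict with a 'total' key, as the
# stats printing above uses.
import numpy as np

def count_edges_sketch(A):
    B = np.asarray(A, dtype=bool)
    und = int(np.sum(B & B.T)) // 2  # reciprocal pairs, counted once
    dire = int(np.sum(B & ~B.T))     # one-way edges
    return {'undirected': und, 'directed': dire, 'total': und + dire}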
def main():
    # 1: load adjacency matrix
    ADJ_D = mio.load(ADJ_FNAME, dtype=np.int, force_row_ids=True, force_col_ids=True)
    assert len(ADJ_D['row_ids']) == len(ADJ_D['col_ids'])
    assert len(ADJ_D['row_ids']) == ADJ_D['M'].shape[0]
    assert ADJ_D['M'].shape[0] == ADJ_D['M'].shape[1]
    # 2: find all paths (k unbounded), then bounded paths for k=2..16,
    # saving each matrix and printing how far it is from the k=inf closure
    Pinf = paths.fill_paths(ADJ_D["M"], k=None)
    mio.save(Pinf, open("data/all_k61_0.5_dot_nw.adj.paths.kinf.csv", "w"), ftype="txt",
             row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")
    print np.sum(ADJ_D["M"] != Pinf)
    for k in xrange(2, 17):
        Pk = paths.fill_paths(ADJ_D["M"], k=k)
        mio.save(Pk, open("data/all_k61_0.5_dot_nw.adj.paths.k%d.csv" % k, "w"),
                 ftype="txt", row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'],
                 fmt="%d", delimit_c=",")
        print np.sum(Pk != Pinf)