def main(fname=None, pkl=True, **kwds):
    assert fname
    if isinstance(pkl, basestring) and pkl.lower() in ('f', 'false', 'none'):
        pkl = False
    if 'err' in kwds: kwds['err'] = int(kwds['err'])
    if 'th' in kwds: kwds['th'] = float(kwds['th'])

    D = mio.load(fname)
    print "Computing all pairs weak boolean class..."
    print "Computing all pairs weak class for a (%d x %d) data matrix (%d x %d result matrix)..." % \
        (D['M'].shape[0], D['M'].shape[1], D['M'].shape[0], D['M'].shape[0])
    WEAK, err, th = compute_all_weak(D['M'], **kwds)
    print "Used parameters err=%d, cutoff th=%f" % (err, th)

    fname_out = "%s.err%d.th%.4f.weak.tab" % (fname, err, th)
    print "Saving %s..." % (fname_out)
    mio.save(WEAK,
             fname_out,
             fmt="%d",
             row_ids=D['row_ids'],
             col_ids=D['row_ids'])
    if pkl:
        fname_pkl_out = fname_out.rpartition('.')[0] + '.pkl'
        print "Saving %s..." % (fname_pkl_out)
        pickle.dump(WEAK, open(fname_pkl_out, "wb"), protocol=-1)
    return WEAK
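# Invocation sketch (hypothetical file name): when dispatched from a shell
# wrapper, keyword values arrive as strings, which is why main() coerces
# 'err' to int and 'th' to float before calling compute_all_weak:
#
#   main(fname="expr.tab", err="2", th="0.9", pkl="false")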
Example #2
def main():
  fp = open(FNAME)
  fp.next()  # skip header line
  ppi = {}
  genes = set()
  for line in fp:
    row = line.strip('\n\r').split('\t')
    genes.add(row[0]); genes.add(row[1])
    ppi.setdefault(row[0],set()).add(row[1])

  rownames = sorted(genes)
  colnames = rownames
  row_idx = dict(((s,i) for i,s in enumerate(rownames)))
  A = np.zeros((len(rownames), len(colnames)))

  for gene_i in ppi:
    for gene_j in ppi[gene_i]:
      i = row_idx[gene_i]
      j = row_idx[gene_j]
      A[i,j] = 1; A[j,i] = 1

  print >>sys.stderr, "# interactions x2: %d" % np.sum(A)
  print >>sys.stderr, "# (genes x genes) size:", A.shape

  mio.save(A, fp=FNAME_OUT, row_ids=['"%s"'%s for s in rownames], col_ids=['"%s"'%s for s in colnames], fmt="%d")
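# The parser above assumes FNAME names a tab-separated edge list with a single
# header line (skipped by fp.next()); a hypothetical example:
#
#   geneA	geneB
#   YFL039C	YLR229C
#
# Each data row adds one interaction, symmetrized later via A[i,j] = A[j,i] = 1.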
def main(fname=None, pkl=True, **kwds):
    assert fname
    if isinstance(pkl, basestring) and pkl.lower() in ('f', 'false', 'none'):
        pkl = False
    if 'b' in kwds: kwds['b'] = float(kwds['b'])
    if 'z_th' in kwds: kwds['z_th'] = float(kwds['z_th'])
    if 'err_th' in kwds: kwds['err_th'] = float(kwds['err_th'])
    if 'd_th' in kwds: kwds['d_th'] = float(kwds['d_th'])
    if 'r_th' in kwds: kwds['r_th'] = float(kwds['r_th'])
    print "Loading data..."
    D = mio.load(fname)
    print "Computing all pairs boolean class for a (%d x %d) data matrix (%d x %d result matrix)..." % \
        (D['M'].shape[0], D['M'].shape[1], D['M'].shape[0], D['M'].shape[0])
    CLS, steps, b = compute_all_bool(D['M'], **kwds)
    z = kwds.get('z_th', 3.0)
    r = kwds.get('r_th', 2.0 / 3)  # float division; plain 2/3 truncates to 0 in Python 2
    if r < 0.5:
        print "WARNING: r<0.5, (r=%f)... are you sure?" % r
    err = kwds.get('err_th', 0.1)
    fname_out = '%s.b%.4f.z%.2f.r%.2f.err%.2f.bool.tab' % (fname, b, z, r, err)
    print "Saving %s..." % (fname_out)
    mio.save(CLS,
             fname_out,
             fmt="%d",
             row_ids=D['row_ids'],
             col_ids=D['row_ids'])
    steps_fname = fname + ".steps.txt"
    print "Saving high/low thresholds to %s in original row order..." % steps_fname
    open(steps_fname, "w").write("\n".join(("%f" % x for x in steps)))
    if pkl:
        fname_pkl_out = fname_out.rpartition('.')[0] + '.pkl'
        print "Saving %s..." % (fname_pkl_out)
        pickle.dump(CLS, open(fname_pkl_out, "wb"), protocol=-1)
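# Invocation sketch (hypothetical file name): parameters typically arrive as
# strings from a command-line dispatcher and are coerced to float above:
#
#   main(fname="expr.tab", b="0.3", z_th="3.0", r_th="0.667", pkl="false")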
Example #4
def extend_training_datas():
    """
    Extend the training data fourfold: 42000 -> 168000.
    """
    old_features, old_labels = lsd.load_train_data('../../dataset/train.csv')

    # extend the training data
    extend_features = []
    extend_labels = []
    print 'extending training data fourfold: 42000 -> 168000'
    for i in xrange(len(old_features)):
        feature = old_features[i]
        label = old_labels[i]
        feature = np.mat(feature)
        feature = feature.reshape(28, 28)
        for m in range(0, 5, 4):
            for n in range(0, 5, 4):
                f_temp = feature[n:28 - 4 + n, m:28 - 4 + m]
                f_temp = f_temp.reshape(1, 24 * 24)
                f_temp = f_temp.tolist()[0]
                extend_features.append(f_temp)
                extend_labels.append(label)

        print 'train:', i, np.shape(extend_features), "-", np.shape(
            extend_labels)

    matrix_io.save(extend_features, 'extend_train_features.pkl')
    matrix_io.save(extend_labels, 'extend_train_labels.pkl')
    print 'training data count:', len(old_features), '-> extended training data:', len(extend_features)
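# The double loop over range(0, 5, 4) visits offsets {0, 4} on each axis, so
# every 28x28 image yields its four corner 24x24 crops, hence the fourfold
# expansion. A standalone sketch of the same indexing:
import numpy as np

img = np.arange(28 * 28).reshape(28, 28)  # stand-in for one digit image
crops = [img[n:n + 24, m:m + 24]
         for m in range(0, 5, 4) for n in range(0, 5, 4)]
assert len(crops) == 4
assert all(c.shape == (24, 24) for c in crops)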
Example #5
def main(fname=None, pkl=True, algorithm="3", outtag="", **kwds):
  assert fname
  if isinstance(pkl, basestring) and pkl.lower() in ('f','false','none'): pkl = False
  print "Loading data from %s..." % fname
  D = mio.load(fname)
  print "Computing all pairs (euclidean) distance correlation from a (%d x %d) data matrix to a (%d x %d) result matrix..." % (D['M'].shape[0], D['M'].shape[1], D['M'].shape[0], D['M'].shape[0])
  print "Computing all pairs (euclidean) distance correlation..."
  
  if algorithm == "1":
    print "Using Algorithm 1: single dot product, n^2*m memory"
    DCOR = compute_all_dcor(D['M'], **kwds)
  elif algorithm == "2":
    print "Using Algorithm 2: multiple dot products, n*m memory"
    DCOR = compute_all_dcor_2(D['M'], **kwds)
  elif algorithm == "3":
    print "Using Algorithm 3: multiple dot products, n*m memory, n choose 2 savings"
    DCOR = compute_all_dcor_3(D['M'], **kwds)
  else:
    raise Exception("Unknown algorithm %s" % algorithm)

  if outtag and outtag[-1] != ".":
    outtag += "."
  fname_out = '%s.%sdcor.tab' % (fname, outtag)
  print "Saving %s..." % (fname_out)
  mio.save(DCOR, fname_out, fmt="%.4f", row_ids=D['row_ids'], col_ids=D['row_ids'])
  if pkl:
    fname_pkl_out = fname_out.rpartition('.')[0]+'.pkl'
    print "Saving %s..." % (fname_pkl_out)
    pickle.dump(DCOR, open(fname_pkl_out,"wb"), protocol=-1)
  return DCOR
Example #6
def extend_test_datas():
    """
    Extend the test data fourfold: 28000 -> 112000.
    """
    old_test_features = lsd.load_test_data('../../dataset/test.csv')

    # extend the test data
    extend_features = []
    print 'extending test data fourfold: 28000 -> 112000'
    for i in xrange(len(old_test_features)):
        feature = old_test_features[i]
        feature = np.mat(feature)
        feature = feature.reshape(28, 28)
        for m in range(0, 5, 4):
            for n in range(0, 5, 4):
                f_temp = feature[n:28 - 4 + n, m:28 - 4 + m]
                f_temp = f_temp.reshape(1, 24 * 24)
                f_temp = f_temp.tolist()[0]
                extend_features.append(f_temp)

        print 'test:', i, np.shape(extend_features)

    matrix_io.save(extend_features, 'extend_test_features.pkl')
    print 'test data count:', len(old_test_features), '-> extended test data:', len(extend_features)
Example #7
def main():
    fp = open(FNAME)
    fp.next()  # skip header line
    ppi = {}
    genes = set()
    for line in fp:
        row = line.strip('\n\r').split('\t')
        genes.add(row[0])
        genes.add(row[1])
        ppi.setdefault(row[0], set()).add(row[1])

    rownames = sorted(genes)
    colnames = rownames
    row_idx = dict(((s, i) for i, s in enumerate(rownames)))
    A = np.zeros((len(rownames), len(colnames)))

    for gene_i in ppi:
        for gene_j in ppi[gene_i]:
            i = row_idx[gene_i]
            j = row_idx[gene_j]
            A[i, j] = 1
            A[j, i] = 1

    print >> sys.stderr, "# interactions x2: %d" % np.sum(A)
    print >> sys.stderr, "# (genes x genes) size:", A.shape

    mio.save(A,
             fp=FNAME_OUT,
             row_ids=['"%s"' % s for s in rownames],
             col_ids=['"%s"' % s for s in colnames],
             fmt="%d")
def main(fname=None, pkl=True, **kwds):
  assert fname
  if isinstance(pkl, basestring) and pkl.lower() in ('f','false','none'): pkl = False
  if 'b' in kwds: kwds['b'] = float(kwds['b'])
  if 'z_th' in kwds: kwds['z_th'] = float(kwds['z_th'])
  if 'err_th' in kwds: kwds['err_th'] = float(kwds['err_th'])
  if 'd_th' in kwds: kwds['d_th'] = float(kwds['d_th'])
  if 'r_th' in kwds: kwds['r_th'] = float(kwds['r_th'])
  print "Loading data..."
  D = mio.load(fname)
  print "Computing all pairs boolean class for a (%d x %d) data matrix (%d x %d result matrix)..." % \
      (D['M'].shape[0], D['M'].shape[1], D['M'].shape[0], D['M'].shape[0])
  CLS, steps, b = compute_all_bool(D['M'], **kwds)
  z = kwds.get('z_th', 3.0)
  r = kwds.get('r_th', 2.0/3)  # float division; plain 2/3 truncates to 0 in Python 2
  if r < 0.5:
    print "WARNING: r<0.5, (r=%f)... are you sure?" % r
  err = kwds.get('err_th', 0.1)
  fname_out = '%s.b%.4f.z%.2f.r%.2f.err%.2f.bool.tab' % (fname, b, z, r, err)
  print "Saving %s..." % (fname_out)
  mio.save(CLS, fname_out, fmt="%d", row_ids=D['row_ids'], col_ids=D['row_ids'])
  steps_fname = fname+".steps.txt"
  print "Saving high/low thresholds to %s in original row order..." % steps_fname
  open(steps_fname,"w").write("\n".join(("%f"%x for x in steps)))
  if pkl:
    fname_pkl_out = fname_out.rpartition('.')[0]+'.pkl'
    print "Saving %s..." % (fname_pkl_out)
    pickle.dump(CLS, open(fname_pkl_out,"wb"), protocol=-1)
def main(write_adj_fname=ADJM_OUT_FNAME):
  if isinstance(write_adj_fname,basestring) and write_adj_fname.lower() in ('f','false','none'):
    write_adj_fname = None
  all_genes = set()
  tf_targs = {}
  for line in open(TARGS_FNAME):
    row = line.strip('\n\r').split(',')
    tf_targs[row[0]] = set(row[1:])
    all_genes.update(row[1:])
   
  tfs = set(tf_targs.keys())
  for line in open(TFLIST_FNAME):
    tfs.add(line.strip('\n\r'))
  all_genes.update(tfs)

  for line in open(LIT_FNAME):
    row = line.strip().split(',')
    tf, targs = row[0], row[1:]
    tfs.add(tf)
    all_genes.update(row)
    tf_targs.setdefault(tf,set()).update(targs)

  # STATS
  print "# genes total:", len(all_genes)
  print "# transcription factors:", len(tfs)
  print "# transcription factors with targets:", len(tf_targs)
  # Text adj list
  print "Printing list of transcription factors with putative targets to %s" % MERGED_LIST_OUT_FNAME
  fp_out = open(MERGED_LIST_OUT_FNAME,"w")
  for tf in sorted(tfs):
    if tf in tf_targs:
      print >>fp_out, "%s,%s" % (tf, ",".join(sorted(tf_targs[tf])))  # join this TF's target set, not the dict keys
    else:
      print >>fp_out, tf
      
  # Text adj matrix, columns TF, rows TF targets
  if not write_adj_fname:
    return 0
  else:
    print "Writing to .tab text file..."
  rownames = sorted(all_genes)
  colnames = sorted(tfs)
  row_idx = dict(((s,i) for i,s in enumerate(rownames)))
  A = np.zeros((len(rownames), len(colnames)))

  for j,tf in enumerate(colnames):
    for g in tf_targs.get(tf,[]):
      i = row_idx[g]
      A[i,j] = 1
      
  print >>sys.stderr, "# interactions: %d" % np.sum(A)
  print >>sys.stderr, "# genesXtf size:", A.shape
  mio.save(A, fp=write_adj_fname, row_ids=['"%s"'%s for s in rownames], col_ids=['"%s"'%s for s in colnames], fmt="%d")
  # compile to R object using R script wrapper
  print "Converting to RData binary object..."
  r_script = LOAD_TMP % {'fname':write_adj_fname}
  p = Popen(["R", "--vanilla", "--slave"], stdout=PIPE, stdin=PIPE, stderr=STDOUT)
  print p.communicate(input=r_script)[0]  # stderr merged into stdout; communicate() also closes stdin
Example #10
def main():
  ADJ_D = mio.load("data/gold_0.32_dot_nw.adj.csv")
  M = np.array(ADJ_D['M'],dtype=int)
  print M
  P = np.zeros(M.shape, dtype=np.int)
  for i in xrange(M.shape[1]):
    js = list(get_connected(M, i, k=2))
    P[js,i] = 1
  print P
  print M==P
  assert ADJ_D['row_ids'] == ADJ_D['col_ids']
  mio.save(P, open("data/gold.paths.dcor0.32.k2.tab","w"), ftype="txt", row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'], fmt="%d")
Example #11
def main(fname=None, as_rows=True, use_weak=False):
    assert fname
    if isinstance(as_rows,
                  basestring) and as_rows.lower() in ('f', 'false', 'none'):
        as_rows = False
    if not use_weak:
        print "Loading boolean class enumeration matrix", fname
    else:
        print "Loading weak class enumeration matrix", fname
    D = mio.load(fname, dtype=np.int)
    M = D['M']
    # verify that enumeration matrices look credible
    if not use_weak:
        Z = np.in1d(M, np.array([0, 1, 2, 3, 4, 5, 6, 7]))
        if not np.all(Z):
            print "%d invalid values in M." % (np.sum(~Z))
            print "up to 20 unrecognized values include..."
            zz = M[~Z]
            print np.unique(zz)[:min(20, len(zz))]
    else:
        assert np.all(np.in1d(M, np.array([0, 1, 2, 3, 4, 5])))

    if not as_rows:
        print "Computing distance between all pairs of columns..."
        M = np.transpose(M)
    else:
        print "Computing distance between all pairs of rows..."

    if not use_weak:
        print "Computing Boolean Class distance"
        DIST = all_pairs_bool_dist(M)
        fname_out = fname + '.booldist.tab'
    else:
        print "Computing Weak Class distance"
        DIST = all_pairs_weak_dist(M)
        fname_out = fname + '.weakdist.tab'

    print "Saving boolean class distance matrix as", fname_out
    if as_rows:
        ids = D.get('row_ids', None)
        mio.save(DIST, fp=fname_out, row_ids=ids, col_ids=ids, fmt="%d")
    else:
        ids = D.get('col_ids', None)
        mio.save(DIST, fp=fname_out, row_ids=ids, col_ids=ids, fmt="%d")
    return fname_out
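# Usage sketch (hypothetical file name): returns the output path; pass
# as_rows="false" to compare columns instead of rows.
#
#   main(fname="expr.bool.tab", as_rows=True, use_weak=False)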
def main():
  # 1: load adj matrix
  ADJ_D = mio.load(ADJ_FNAME, dtype=np.int, force_row_ids=True, force_col_ids=True)
  assert len(ADJ_D['row_ids']) == len(ADJ_D['col_ids'])
  assert len(ADJ_D['row_ids']) == ADJ_D['M'].shape[0]
  assert ADJ_D['M'].shape[0] == ADJ_D['M'].shape[1]

  # 2.1: find paths k=3
  P3 = paths.fill_paths(ADJ_D["M"], k=3)
  mio.save(P3, open("data/all_k61_0.5_dot_nw.adj.paths.k3.csv","w"), ftype="txt", row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")

  # 2.2: find paths k=2
  P2 = paths.fill_paths(ADJ_D["M"], k=2)
  mio.save(P2, open("data/all_k61_0.5_dot_nw.adj.paths.k2.csv","w"), ftype="txt", row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")

  # 3: load ranks
  ranks = load_ranks(open(RANKS_FNAME))
  print ranks
Example #13
def main(fname=None, as_rows=True, use_weak=False):
  assert fname
  if isinstance(as_rows,basestring) and as_rows.lower() in ('f','false','none'): as_rows = False
  if not use_weak:
    print "Loading boolean class enumeration matrix", fname
  else:
    print "Loading weak class enumeration matrix", fname
  D = mio.load(fname, dtype=np.int)
  M = D['M']
  # verify that enumeration matrices look credible
  if not use_weak:
    Z = np.in1d(M,np.array([0,1,2,3,4,5,6,7]))
    if not np.all(Z):
      print "%d invalid values in M." % (np.sum(~Z))
      print "up to 20 unrecognized values include..."
      zz = M[~Z]
      print np.unique(zz)[:min(20, len(zz))]
  else:
    assert np.all(np.in1d(M,np.array([0,1,2,3,4,5])))
  
  if not as_rows:
    print "Computing distance between all pairs of columns..."
    M = np.transpose(M)
  else:
    print "Computing distance between all pairs of rows..."

  if not use_weak:
    print "Computing Boolean Class distance"
    DIST = all_pairs_bool_dist(M)
    fname_out = fname+'.booldist.tab'
  else:
    print "Computing Weak Class distance"
    DIST = all_pairs_weak_dist(M)
    fname_out = fname+'.weakdist.tab'

  print "Saving boolean class distance matrix as", fname_out
  if as_rows:
    ids = D.get('row_ids',None)
    mio.save(DIST, fp=fname_out, row_ids=ids, col_ids=ids, fmt="%d")
  else:
    ids = D.get('col_ids',None)
    mio.save(DIST, fp=fname_out, row_ids=ids, col_ids=ids, fmt="%d")
  return fname_out
Example #14
def main():
    v = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    H = np.matrix([[0, 0, 0, 1], [1, 1, 1, 0], [0, 0, 0, 1]])
    L = np.matrix([[1, 1, 0, 0], [0, 0, 0, 1], [1, 1, 1, 0]])
    print stepfit(v)
    D = mio.load("nice.may3.Eg.expr.gold.celegans.csv")
    M = D['M']
    Steps = []
    for row in M:
        Steps.append(stepfit(row)[0])
    b = 0.3
    #CLS = all_pairs_bool(M, Steps, b, z_th=2.7)
    CLS = all_pairs_bool(M, Steps, b)
    print CLS
    print D.keys()
    mio.save(CLS,
             "nice.test.tab",
             fmt="%d",
             row_ids=D['row_ids'],
             col_ids=D['row_ids'])
    print "OK: saved result to nice.test.tab"
def main(fname=None, pkl=True, **kwds):
  assert fname
  if isinstance(pkl, basestring) and pkl.lower() in ('f','false','none'): pkl = False
  if 'err' in kwds: kwds['err'] = int(kwds['err'])
  if 'th' in kwds: kwds['th'] = float(kwds['th'])

  D = mio.load(fname)
  print "Computing all pairs weak boolean class..."
  print "Computing all pairs weak class for a (%d x %d) data matrix (%d x %d result matrix)..." % \
      (D['M'].shape[0], D['M'].shape[1], D['M'].shape[0], D['M'].shape[0])
  WEAK, err, th = compute_all_weak(D['M'], **kwds)
  print "Used parameters err=%d, cutoff th=%f" % (err, th)

  fname_out = "%s.err%d.th%.4f.weak.tab" % (fname, err, th)
  print "Saving %s..." % (fname_out)
  mio.save(WEAK, fname_out, fmt="%d", row_ids=D['row_ids'], col_ids=D['row_ids'])
  if pkl:
    fname_pkl_out = fname_out.rpartition('.')[0]+'.pkl'
    print "Saving %s..." % (fname_pkl_out)
    pickle.dump(WEAK, open(fname_pkl_out,"wb"), protocol=-1)
  return WEAK
Example #16
def main(fname=None, pkl=True, algorithm="3", outtag="", **kwds):
    assert fname
    if isinstance(pkl, basestring) and pkl.lower() in ('f', 'false', 'none'):
        pkl = False
    print "Loading data from %s..." % fname
    D = mio.load(fname)
    print "Computing all pairs (euclidean) distance correlation from a (%d x %d) data matrix to a (%d x %d) result matrix..." % (
        D['M'].shape[0], D['M'].shape[1], D['M'].shape[0], D['M'].shape[0])
    print "Computing all pairs (euclidean) distance correlation..."

    if algorithm == "1":
        print "Using Algorithm 1: single dot product, n^2*m memory"
        DCOR = compute_all_dcor(D['M'], **kwds)
    elif algorithm == "2":
        print "Using Algorithm 2: multiple dot products, n*m memory"
        DCOR = compute_all_dcor_2(D['M'], **kwds)
    elif algorithm == "3":
        print "Using Algorithm 3: multiple dot products, n*m memory, n choose 2 savings"
        DCOR = compute_all_dcor_3(D['M'], **kwds)
    else:
        raise Exception("Unknown algorithm %s" % algorithm)

    if outtag and outtag[-1] != ".":
        outtag += "."
    fname_out = '%s.%sdcor.tab' % (fname, outtag)
    print "Saving %s..." % (fname_out)
    mio.save(DCOR,
             fname_out,
             fmt="%.4f",
             row_ids=D['row_ids'],
             col_ids=D['row_ids'])
    if pkl:
        fname_pkl_out = fname_out.rpartition('.')[0] + '.pkl'
        print "Saving %s..." % (fname_pkl_out)
        pickle.dump(DCOR, open(fname_pkl_out, "wb"), protocol=-1)
    return DCOR
Example #17
def main(pkl_fname=None, row_fname=None, col_fname=None, outdir=None, sig=None, doabs=False, diag=1):
  """
  pkl_fname: path to pickled numpy dependency matrix
  row_fname: path to labeled text matrix with row ids, maybe col ids
  col_fname: optional path to labeled text matrix with col ids
  sig: float of minimum significance
  doabs: flag of whether to use absolute value for significance testing
  diag: if matrix is symmetric, the value of the diagonal
  """
  assert pkl_fname and row_fname and outdir
  make_dir(outdir)
  if doabs:
    abstxt = "T"
  else:
    abstxt = "F"
  out_fname = os.path.join(outdir, os.path.basename(pkl_fname.rpartition('.')[0]))
  if sig:
    out_fname += ".sig%f" % sig
  if doabs:
    out_fname += ".absT"
  out_fname += ".tab"

  print "Text matrix will be saved to: %s" % out_fname
  M = pickle.load(open(pkl_fname, "rb"))

  # Get row and column labels.
  try:
    D_row = mio.load(row_fname)
    row_names = np.array(D_row['row_ids'])
  except AssertionError:
    row_names = np.array([s.strip('\n\r') for s in open(row_fname)])
  if col_fname is None:
    # assumes row_fname parsed above as a labeled matrix, so D_row is defined
    col_names = np.array(D_row['col_ids'])
  else:
    if row_fname == col_fname:
      col_names = row_names
    else:
      try:
        D_col = mio.load(col_fname)
        col_names = np.array(D_col['row_ids']) # Use row IDs as column IDs in Dependency Matrix
      except AssertionError:
        col_names = np.array([s.strip('\n\r') for s in open(col_fname)])

  if len(row_names) == np.size(M,0) and len(col_names) == np.size(M,1):
    print "Number of rows(%d) and column(%d) names fit matrix size (%d,%d)." % \
        (len(row_names), len(col_names), np.size(M,0), np.size(M,1))
  else:
    n = len(row_names)
    if np.size(M,0) == n*(n-1)//2:
      print "Matrix seems to be n choose 2 upper triangle matrix. Converting to full matrix..."
      M = distance.squareform(M)
      if diag is not None:
        print "Forcing diagonal to be:", diag
        for i in xrange(n):
          M[i,i] = diag
    else:
      raise Exception("Unknown matrix size %s given #row_ids(%d), #col_ids(%d)" %
                      (np.shape(M), len(row_names), len(col_names)))
  

  # Remove insignificant rows and columns; align row/col names
  original_dim = M.shape
  if sig is not None:
    sig = float(sig)
    if not doabs:
      col_max = np.amax(M,0)
      row_max = np.amax(M,1)
    else:
      col_max = np.amax(np.abs(M),0)
      row_max = np.amax(np.abs(M),1)
    M = M[row_max>=sig,:][:,col_max>=sig]
    row_names = row_names[row_max>=sig]
    col_names = col_names[col_max>=sig]
  new_dim = M.shape

  # Dump to text
  now_timestamp = datetime.datetime.now().isoformat('_')
  header = ["Generated on %s from pickled matrix file %s" % (now_timestamp, pkl_fname),
            "Original dimensions: %s, New dimensions: %s" % (original_dim, new_dim),
            "sig: %s, abs: %s" % (str(sig), str(abstxt))]
  print "\n".join(header)
  fp = open(out_fname, "w")
  mio.save(M, fp, ftype="txt", delimit_c="\t", row_ids=list(row_names), col_ids=list(col_names), headers=header)
  fp.close()
  print "Tab matrix saved to %s." % out_fname
  
  return out_fname
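# Usage sketch (hypothetical paths): convert a pickled n x n dependency matrix
# to a labeled .tab file, keeping only rows/columns with some value >= 0.4
# (sig arrives as a string from a command-line dispatcher; main() coerces it):
#
#   main(pkl_fname="dcor.pkl", row_fname="expr.tab", outdir="out", sig="0.4")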
"""Read PINA file, print simple adjaceny list"""
from pina import *
import matrix_io as mio

pina_fpath = "/nfs/01/osu6683/Homo sapiens-20121210.txt"
P = PINAEnriched(open(pina_fpath))
row_names, A = P.return_adj_matrix()
mio.save(M=A, fp="/nfs/01/osu6683/pina_adj_matrix.tab", fmt="%d", row_ids=row_names, col_ids=row_names, fill_upper_left=False)
def main():
  # 1: load adj matrix
  ADJ_D = mio.load(ADJ_FNAME, dtype=np.int, force_row_ids=True, force_col_ids=True)
  DCOR_D = mio.load(DCOR_FNAME, force_row_ids=True, force_col_ids=True)
  assert len(ADJ_D['row_ids']) == len(ADJ_D['col_ids'])
  assert len(ADJ_D['row_ids']) == ADJ_D['M'].shape[0]
  assert DCOR_D["row_ids"] == ADJ_D['row_ids']
  assert DCOR_D["row_ids"] == DCOR_D["col_ids"]
  assert ADJ_D['M'].shape[0] == ADJ_D['M'].shape[1]
  n = ADJ_D['M'].shape[0]
  
  # 2.1: find paths k=2
  P2 = paths.fill_paths(ADJ_D["M"], k=2)
  mio.save(P2, open(P2_FNAME,"w"), ftype="txt", row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")
  # 2.2: find paths k=3
  P3 = paths.fill_paths(ADJ_D["M"], k=3)
  mio.save(P3, open(P3_FNAME,"w"), ftype="txt", row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")
  name_order = ADJ_D['row_ids']

  # 3.1: load ranks
  # (I compiled this list manually) Note that rank elements are 0 indexed in ADJ matrix
  ranks = load_ranks_named(open(RANKS_FNAME), name_order)
  #print ranks
  #print name_order
  #sys.exit(1)
  
  node_clusters = []
  # 4: find clusters in same rank at k=2
  Ignore_Clust = np.zeros(ADJ_D['M'].shape, dtype=np.bool)
  for lvl, r in enumerate(ranks):
    CC = group_in_same_rank(r, P2)
    rnp = np.array(r)
    c = [map(str,rnp[cc]+1) for cc in CC]
    node_clusters.append(c)
    # Ignore edges in equal rank clusters
    for node in r:
      adj = set(np.nonzero(ADJ_D['M'][:,node])[0])
      for e in (adj & set(r)):
        Ignore_Clust[node,e] = True
        # Also remove corresponding opposite direction
        if ADJ_D['M'][node,e] == ADJ_D['M'][e,node] == 1:
          Ignore_Clust[e,node] = True

  print "Clusters"
  for c in node_clusters:
    print c
  print

  # 4.5 attempt to hide lower strength edges without disconnecting nodes
  DCOR = DCOR_D['M']
  AD = ADJ_D['M'].copy()
  Ignore_Low = np.zeros(ADJ_D['M'].shape, dtype=np.bool)
  for i in range(n):
    for j in range(n):
      if i == j: continue
      if DCOR[i,j] < DCOR_TH and AD[j,i]:
        # edge exists from i to j and it is under dCor thresh. can we remove it?
        AD[j,i] = 0
        # undirected edge
        if AD[i,j]:
          AD[i,j] = 0
          if np.sum(AD[:,i]) == 0 or np.sum(AD[j,:]) == 0 or \
             np.sum(AD[:,j]) == 0 or np.sum(AD[i,:]) == 0:
            # no, it disconnects something
            AD[i,j] = 1 
            AD[j,i] = 1
          else:
            Ignore_Low[i,j] = True
            Ignore_Low[j,i] = True
        # directed edge
        else:
          if np.sum(AD[:,i]) == 0 or np.sum(AD[j,:]) == 0:
            # no, it disconnects something
            AD[j,i] = 1
          else:
            Ignore_Low[j,i] = True
  NL = count_edges(Ignore_Low)
  assert np.sum(Ignore_Low & ADJ_D['M']) == np.sum(Ignore_Low)
  print "Too low:", NL
  

  # 5: look for redundant directed edges between levels at least 2 levels apart
  # remove edge if path of equal length already exists
  Ignore_Far = np.zeros(ADJ_D['M'].shape, dtype=np.bool)
  A = ADJ_D['M'].copy()
  A = A & (~Ignore_Low)
  n_far_edges = 0
  for lvl in xrange(len(ranks)-2):
    this_rank = ranks[lvl]
    for dlvl in xrange(lvl+2,len(ranks)):
      delta = dlvl-lvl
      that_rank = ranks[dlvl]
      for top in this_rank:
        for low in that_rank:
          if A[low,top]:     # adj is col->row
            n_far_edges += 1
            A[low,top] = 0 # try removing this link
            # is there an alternate path of equal length to this node?
            conn = paths.is_path(A,top,delta+1)
            if not low in conn:
              A[low,top] = 1 # I guess we need this edge...
            else:
              Ignore_Far[low,top] = True
              # also remove an associated undirected edge
              if A[top,low]:
                Ignore_Far[top,low] = True
                A[top,low] = 0
  print "# Far edges", n_far_edges

  # 6: Print Stats
  assert np.sum(Ignore_Clust & Ignore_Far)==0
  NT = count_edges(ADJ_D['M'])
  NS = count_edges(Ignore_Clust)
  NF = count_edges(Ignore_Far)
  print "Total:", NT
  print "Same Level:", NS
  print "Redundant Far:", NF
  n_rm = NS['total']+NF['total']+NL['total'] # this is wrong
  #print "removed:", n_rm
  #print "reduction:", n_rm/NT['total']
  
  # 7: Save Edge Ignore Matrix
  Ignore = Ignore_Clust | Ignore_Far | Ignore_Low
  NI = count_edges(Ignore)
  print "Ignored", NI
  print np.sum(Ignore)
  print np.sum(Ignore_Clust | Ignore_Far)
    
  print "Save Ignore Matrix at:", IGNORE_FNAME
  mio.save(Ignore, open(IGNORE_FNAME, "w"), ftype="txt", row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")
def main():
  # 1: load adj matrix
  ADJ_D = mio.load(ADJ_FNAME, dtype=np.int, force_row_ids=True, force_col_ids=True)
  assert len(ADJ_D['row_ids']) == len(ADJ_D['col_ids'])
  assert len(ADJ_D['row_ids']) == ADJ_D['M'].shape[0]
  assert ADJ_D['M'].shape[0] == ADJ_D['M'].shape[1]

  # 2: find all paths
  Pinf = paths.fill_paths(ADJ_D["M"], k=None)
  mio.save(Pinf, open("data/all_k61_0.5_dot_nw.adj.paths.kinf.csv","w"), ftype="txt", row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")
  print np.sum(ADJ_D["M"]!=Pinf)
  
  # 2: compute bounded-k path closures and compare each against the full closure
  for k in range(2, 17):
    P = paths.fill_paths(ADJ_D["M"], k=k)
    mio.save(P, open("data/all_k61_0.5_dot_nw.adj.paths.k%d.csv" % k, "w"), ftype="txt", row_ids=ADJ_D['row_ids'], col_ids=ADJ_D['col_ids'], fmt="%d", delimit_c=",")
    print np.sum(P != Pinf)  # pairs not yet reachable within k hops
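# paths.fill_paths is defined elsewhere; a minimal sketch of the idea it
# implements (reachability within at most k hops via repeated 0/1 matrix
# products; the library's k=None case presumably iterates to a fixed point),
# not its actual code:
import numpy as np

def fill_paths_sketch(A, k):
    A = (np.asarray(A) != 0).astype(int)
    R, walk = A.copy(), A.copy()
    for _ in xrange(k - 1):
        walk = (walk.dot(A) > 0).astype(int)  # walks exactly one hop longer
        R |= walk                             # accumulate reachability
    return R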