Example #1
  def test_sample_network_simulation(self):
    sparsity = 1
    noise_prob = 0
    sim_partial_network = sim.sample_network(self.cluster_sizes, sparsity, noise_prob)

    #with sparsity = 1 (every entry sampled) and no noise,
    #the full network should be recovered exactly
    assert (self.sim_full_network != sim_partial_network).nnz == 0
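For reference, the (A != B).nnz == 0 check is the standard way to compare two scipy sparse matrices elementwise without densifying them. A minimal self-contained sketch of the idiom (independent of sim.sample_network):

import numpy as np
from scipy import sparse

A = sparse.csr_matrix(np.array([[0, 1], [-1, 0]]))
B = A.copy()

# (A != B) is itself a sparse boolean matrix; zero stored entries
# means no cell differs, i.e. the two matrices are equal.
assert (A != B).nnz == 0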
Example #2
  def test_sample_network_noise(self):
    #with complete noise we should recover the opposite of the matrix
    #a probabilistic test could cover partial noise as well,
    #but this deterministic case is sufficient
    sparsity = 1
    noise_prob = 1
    sim_partial_network = sim.sample_network(self.cluster_sizes, sparsity, noise_prob)

    #with sparsity = 1 both matrices are fully observed, so zero "equal" entries
    #means every sign was flipped
    assert (self.sim_full_network == sim_partial_network).nnz == 0
Example #3
  def test_sample_network_sparse_quantity(self):
    #with partial sparsity we should, in expectation, recover sparsity * network_size^2 entries
    sparsity = 0.5
    noise_prob = 0
    sim_partial_network = sim.sample_network(self.cluster_sizes, sparsity, noise_prob, symmetric=False)
    expected_num_edges = self.network_size**2 * sparsity

    #Probabilistic test: MAY FAIL WITH CORRECT BEHAVIOR (BUT WITH LOW PROBABILITY)
    #By a Chernoff bound, the edge count deviates from its expectation
    #by more than t*(network_size/2) with probability at most 2*e^(-t^2/2).
    #Here t = 4, so this test passes with probability at least 1 - 2*e^-8.
    actual_num_edges = sim_partial_network.nnz
    print(expected_num_edges, actual_num_edges)
    assert abs(actual_num_edges - expected_num_edges) <= 20
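The hard-coded threshold of 20 follows from the bound quoted in the comments; a small sketch of the arithmetic, assuming (this is not stated in the snippet) that the fixture's network_size is 10:

import math

# Assumption: network_size = 10, so the Chernoff deviation t * (network_size / 2)
# with t = 4 matches the threshold of 20 used in the assert above.
network_size = 10
t = 4

threshold = t * (network_size / 2)        # 20.0 edges
failure_prob = 2 * math.exp(-t ** 2 / 2)  # 2*e^-8, roughly 6.7e-4

print(threshold, failure_prob)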
Example #4
def clustering_pipeline(network_params, clustering_params):
    #Create network
    cluster_sizes, sparsity, noise_prob = network_params
    num_clusters = len(cluster_sizes)

    network = sim.sample_network(cluster_sizes, sparsity, noise_prob)
    rows, cols = network.nonzero()

    #Assign ground truth labels (the first "cluster_size" are in cluster 0,
    #next are in cluster 1, etc.)
    cluster_labels = list()
    for cluster_index in range(len(cluster_sizes)):
        cluster_labels += [cluster_index] * cluster_sizes[cluster_index]
    cluster_labels = np.asarray(cluster_labels)

    #perform clustering
    cluster_sizes, method, completion_alg, completion_params, mode = clustering_params
    cluster_predictions = cluster_signed_network(network, cluster_sizes, method, \
                              completion_alg, completion_params, mode)
    cluster_accuracy = evaluate_cluster_accuracy(cluster_predictions, cluster_labels, \
                              rows, cols)
    return cluster_accuracy
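A hedged usage sketch of clustering_pipeline: the two tuples mirror the unpacking above, but the method, completion algorithm, and mode values below are placeholders, not the project's confirmed option names.

# All clustering option values here are illustrative placeholders; substitute
# whatever cluster_signed_network actually accepts.
cluster_sizes = [100, 200, 300, 400]
network_params = (cluster_sizes, 0.5, 0.0)  # (cluster_sizes, sparsity, noise_prob)
clustering_params = (cluster_sizes,         # cluster_sizes
                     "spectral",            # method (placeholder)
                     "svp",                 # completion_alg (placeholder)
                     None,                  # completion_params (placeholder)
                     "standard")            # mode (placeholder)

accuracy = clustering_pipeline(network_params, clustering_params)
print("cluster accuracy: %f" % accuracy)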
Example #5
def run_experiment():
    simulated = False
    real = True

    use_moi = True
    use_hoc = True
    use_svp = True
    use_sgd_sh = False
    use_sgd_sig = False
    use_als = True

    adj_matrix = None
    if simulated:
        cluster_sizes = [100, 200, 300, 400]
        sparsity_level = 0.01175
        noise_prob = 0
        print "creating adjacency matrix..."
        adj_matrix = sim.sample_network(cluster_sizes, sparsity_level,
                                        noise_prob)

    elif real:
        data_file_name = "data/Preprocessed Data/small_network.npy"
        #data_file_name = "data/Preprocessed Data/wiki_elections_csr.npy"
        try:
            # allow_pickle is required to unpack the pickled sparse matrix on recent numpy
            adj_matrix = np.load(data_file_name, allow_pickle=True).item()
        except Exception as e:
            raise ValueError("could not load adj matrix from file: %s" % e)

    if use_moi:
        print "performing MOI..."
        max_cycle_order_moi = 10
        discount = [0.5**i for i in range(3, max_cycle_order_moi + 1)]
        #max_cycle_order_moi = np.inf
        #discount = 0.0001
        num_folds = 5
        avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = \
              moi.kfoldcv_moi(adj_matrix, discount, max_cycle_order_moi, num_folds)
        print "MOI results: "
        print("Accuracy: average %f with standard error %f" %
              (avg_acc, stderr_acc))
        print("False positive rate: average %f with standard error %f" %
              (avg_fpr, stderr_fpr))
        print("Model running time: average %f with standard error %f" %
              (avg_time, stderr_time))
        print()

    if use_hoc:
        print "performing HOC..."
        max_cycle_order_hoc = 5
        num_folds = 10
        avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = \
                  hoc.hoc_learning_pipeline(adj_matrix, max_cycle_order_hoc)
        print "HOC results:"
        print("Accuracy: average %f with standard error %f" %
              (avg_acc, stderr_acc))
        print("False positive rate: average %f with standard error %f" %
              (avg_fpr, stderr_fpr))
        print("Model running time: average %f with standard error %f" %
              (avg_time, stderr_time))
        print()

    alg = ""
    alg_params = None

    #settings if using SGD
    if use_sgd_sh or use_sgd_sig:
        #Parameters used for this experiment

        #https://www.cs.uic.edu/~liub/KDD-cup-2007/proceedings/Regular-Paterek.pdf
        learning_rate = 1000  # use 0.05 for square-hinge loss
        tol = adj_matrix.nnz / 10
        max_iter = 20
        reg_param = 10  # use 0.5 for square-hinge loss
        dim = 100
        num_folds_mf = 10

        #Bundle up these parameters and use this algorithm
        if use_sgd_sh:
            loss_type = "squarehinge"  #"sigmoid"
            alg_params = (learning_rate, loss_type, tol, max_iter, reg_param,
                          dim)
            alg = "sgd"

            print "performing SGD with square-hinge loss..."
            avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = \
                    mf.kfold_CV_pipeline(adj_matrix, alg, alg_params, num_folds_mf)
            print "SGD_SH results:"
            print("Accuracy: average %f with standard error %f" %
                  (avg_acc, stderr_acc))
            print("False positive rate: average %f with standard error %f" %
                  (avg_fpr, stderr_fpr))
            print("Model running time: average %f with standard error %f" %
                  (avg_time, stderr_time))
            print()
        if use_sgd_sig:
            loss_type = "sigmoid"
            alg_params = (learning_rate, loss_type, tol, max_iter, reg_param,
                          dim)
            alg = "sgd"

            print "performing SGD with sigmoid loss..."
            avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = \
                    mf.kfold_CV_pipeline(adj_matrix, alg, alg_params, num_folds_mf)
            print "SGD_SIG results:"
            print("Accuracy: average %f with standard error %f" %
                  (avg_acc, stderr_acc))
            print("False positive rate: average %f with standard error %f" %
                  (avg_fpr, stderr_fpr))
            print("Model running time: average %f with standard error %f" %
                  (avg_time, stderr_time))
            print()
    #settings if using als
    if use_als:
        #Parameters used for this experiment
        max_iter = 2
        dim = 40

        #Bundle up these parameters and use this algorithm
        alg_params = (max_iter, dim)
        alg = "als"

        num_folds_mf = 10

        print "performing ALS..."
        avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = \
                mf.kfold_CV_pipeline(adj_matrix, alg, alg_params, num_folds_mf)
        print "ALS results:"
        print("Accuracy: average %f with standard error %f" %
              (avg_acc, stderr_acc))
        print("False positive rate: average %f with standard error %f" %
              (avg_fpr, stderr_fpr))
        print("Model running time: average %f with standard error %f" %
              (avg_time, stderr_time))
        print()

    #settings if using SVP
    if use_svp:
        #Parameters used for this experiment
        rank = 40
        tol = 100
        max_iter = 5
        step_size = 1

        #Bundle up these parameters and use this algorithm
        alg_params = (rank, tol, max_iter, step_size)
        alg = "svp"

        num_folds_mf = 10

        print "performing SVP..."
        avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = \
              mf.kfold_CV_pipeline(adj_matrix, alg, alg_params, num_folds_mf)
        print "SVP results:"
        print("Accuracy: average %f with standard error %f" %
              (avg_acc, stderr_acc))
        print("False positive rate: average %f with standard error %f" %
              (avg_fpr, stderr_fpr))
        print("Model running time: average %f with standard error %f" %
              (avg_time, stderr_time))
        print()
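Each branch above repeats the same three-line summary; a small helper (a sketch, not part of the original module) would keep the reporting in one place:

def report(name, results):
    # results is the 6-tuple (avg_acc, stderr_acc, avg_fpr, stderr_fpr,
    # avg_time, stderr_time) returned by the k-fold pipelines above.
    avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = results
    print("%s results:" % name)
    print("Accuracy: average %f with standard error %f" % (avg_acc, stderr_acc))
    print("False positive rate: average %f with standard error %f" % (avg_fpr, stderr_fpr))
    print("Model running time: average %f with standard error %f" % (avg_time, stderr_time))
    print()

# e.g. report("ALS", mf.kfold_CV_pipeline(adj_matrix, "als", (max_iter, dim), num_folds_mf))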
  '''
  cluster_sizes = [2,3,4]
  sparsity_level = 0.5
  noise_prob = 0
  adj_matrix = sim.sample_network(cluster_sizes, sparsity_level, noise_prob)
  #print adj_matrix.A
  signed_laplacian(adj_matrix).A
  '''

  adj_matrix = None
  if simulated:
    cluster_sizes = [500, 500]  # [100, 200, 300, 400]
    sparsity_level = 0.5  # 0.01175
    noise_prob = 0
    print("creating adjacency matrix...")
    adj_matrix = sim.sample_network(cluster_sizes, sparsity_level, noise_prob)

  elif real:
    #data_file_name = "Preprocessed Data/small_network.npy"
    data_file_name = "../data/Preprocessed Data/wiki_elections_csr.npy"
    try:
    adj_matrix = np.load(data_file_name, allow_pickle=True).item()
  except Exception as e:
    raise ValueError("could not load adj matrix from file: %s" % e)

  avg_acc, avg_fpr = kfoldcv(adj_matrix, num_folds=20)
  print("Accuracy %f and false positive rate %f" % (avg_acc, avg_fpr))


