Example #1
import numpy as np

# `utils` is a project-local module providing the GraphSAGE loader and
# the `sample_mask` helper.
def load_data(data_prefix, dataset_str, precalc):
  """Return the required data formats for GCN models."""
  (num_data, train_adj, full_adj, feats, train_feats, test_feats, labels,
   train_data, val_data,
   test_data) = utils.load_graphsage_data(data_prefix, dataset_str)
  visible_data = train_data

  y_train = np.zeros(labels.shape)
  y_val = np.zeros(labels.shape)
  y_test = np.zeros(labels.shape)
  y_train[train_data, :] = labels[train_data, :]
  y_val[val_data, :] = labels[val_data, :]
  y_test[test_data, :] = labels[test_data, :]

  train_mask = utils.sample_mask(train_data, labels.shape[0])
  val_mask = utils.sample_mask(val_data, labels.shape[0])
  test_mask = utils.sample_mask(test_data, labels.shape[0])

  if precalc:
    train_feats = train_adj.dot(feats)
    train_feats = np.hstack((train_feats, feats))
    test_feats = full_adj.dot(feats)
    test_feats = np.hstack((test_feats, feats))

  return (train_adj, full_adj, train_feats, test_feats, y_train, y_val, y_test,
          train_mask, val_mask, test_mask, train_data, val_data, test_data,
          num_data, visible_data)
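A minimal usage sketch (the data path and dataset name below are placeholders for illustration, not from the original):

(train_adj, full_adj, train_feats, test_feats, y_train, y_val, y_test,
 train_mask, val_mask, test_mask, train_data, val_data, test_data,
 num_data, visible_data) = load_data('./data', 'ppi', precalc=True)
print('nodes:', num_data, 'training feature dim:', train_feats.shape[1])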
Example #2

import pickle as pkl

import networkx as nx
import numpy as np
import scipy.sparse as sp

# `parse_index_file` and `sample_mask` are local helpers; see the sketch
# after this example.
def full_load_citation(dataset_str="cora"):
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        with open("dataset/ind.{}.{}".format(dataset_str, name), 'rb') as f:
            objects.append(pkl.load(f, encoding='latin1'))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("dataset/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, labels, train_mask, val_mask, test_mask
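The loader relies on two small helpers that are not shown in the snippet. Below is a minimal sketch of both, following the conventions of the standard Kipf GCN data utilities; the exact project versions may differ.

import numpy as np

def parse_index_file(filename):
    """Read a file containing one integer index per line."""
    return [int(line.strip()) for line in open(filename)]

def sample_mask(idx, l):
    """Build a boolean mask of length l that is True at positions idx."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=bool)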
Example #3
import os
import pickle as pkl

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # suppress TensorFlow info/warning logs

with open("./aifb.pickle", 'rb') as f:
    data = pkl.load(f, encoding='latin1')

A = data['A']
y = data['y']
train_idx = data['train_idx']
test_idx = data['test_idx']

# Get dataset splits
y_train, y_val, y_test, idx_train, idx_val, idx_test = utils.get_splits(
    y, train_idx, test_idx, False)

print(y_train.shape)
train_mask = utils.sample_mask(idx_train, y.shape[0])

print(train_mask.shape)

val_mask = utils.sample_mask(idx_val, y.shape[0])

test_mask = utils.sample_mask(idx_test, y.shape[0])

# print(train_mask)
# print(val_mask)

print(idx_train)
print(idx_val)
print(idx_test)
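For context, `utils.get_splits` presumably carves the label matrix into per-split copies that are zero outside their own indices. A hypothetical minimal version (the real project function may differ, e.g. in how it derives a validation set when the flag is False):

import numpy as np

def get_splits(y, train_idx, test_idx, validation=True):
    """Return per-split label matrices plus the index lists themselves."""
    if validation:
        # hold out the first fifth of the training indices for validation
        split = len(train_idx) // 5
        idx_val, idx_train = train_idx[:split], train_idx[split:]
    else:
        idx_train, idx_val = train_idx, test_idx  # reuse test ids as "val"
    idx_test = test_idx
    y_train = np.zeros(y.shape, dtype=y.dtype)
    y_val = np.zeros(y.shape, dtype=y.dtype)
    y_test = np.zeros(y.shape, dtype=y.dtype)
    y_train[idx_train] = y[idx_train]
    y_val[idx_val] = y[idx_val]
    y_test[idx_test] = y[idx_test]
    return y_train, y_val, y_test, idx_train, idx_val, idx_test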
Example #4
import numpy as np
import tensorflow.compat.v1 as tf  # TF1-style API

# TF1-style flag definitions, as in the original GCN codebases
flags = tf.app.flags
FLAGS = flags.FLAGS

flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 500, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 128, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.0, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 0.0, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_string('save_name', './mymodel.ckpt', 'Path for saving model')

#flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')

# Load data (`data` and `utils` are project-local modules)
training_inputs, training_data_values, test_inputs, test_data_values = data.load_big_data()
print(training_inputs.shape)
print(training_data_values.shape)


train_mask = utils.sample_mask(np.arange(training_inputs.shape[0]), training_inputs.shape[0])
test_mask = utils.sample_mask(np.arange(test_inputs.shape[0]), test_inputs.shape[0])
#val_mask = utils.sample_mask(val_indexes, y.shape[0])


y_train = np.zeros(training_data_values.shape)
#y_val = np.zeros(y.shape)
y_test = np.zeros(test_data_values.shape)
y_train[train_mask, :] = training_data_values[train_mask, :]
#y_val[val_mask, :] = y[val_mask, :]
y_test[test_mask, :] = test_data_values[test_mask, :]

# Some preprocessing
#features = utils.preprocess_features(training_inputs)
#features_test = utils.preprocess_features(test_inputs)
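The commented-out preprocessing step typically row-normalizes the feature matrix. A minimal sketch in the style of the standard GCN utilities (an assumption, not this project's exact code):

import numpy as np
import scipy.sparse as sp

def preprocess_features(features):
    """Row-normalize a (sparse) feature matrix so each row sums to 1."""
    rowsum = np.array(features.sum(1), dtype=float).flatten()
    r_inv = np.divide(1.0, rowsum, out=np.zeros_like(rowsum),
                      where=rowsum != 0)  # guard against empty rows
    return sp.diags(r_inv).dot(features)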
Example #5
    # Time-sensitive parameters: gamma grows with the epoch, shifting weight
    # from the entropy/density terms toward the centrality term
    gamma = np.random.beta(1, 1.005 - basef**epoch)
    alpha = beta = (1 - gamma) / 2

    # Construct feed dictionary
    feed_dict = construct_feed_dict(features, support, y_train, train_mask, placeholders)
    feed_dict.update({placeholders['dropout']: FLAGS.dropout})

    # Training step
    outs = sess.run([model.opt_op, model.loss, model.accuracy, model.predict()],
                    feed_dict=feed_dict)

    # Choose the next instance to label, based on prediction entropy and
    # distance to cluster centers (representativeness)
    if len(idx_train) < NL:
        entropy = sc.stats.entropy(outs[3].T)
        train_mask = sample_mask(idx_train, labels.shape[0])
        # entropy[train_mask + val_mask + test_mask] = -100
        entrperc = np.asarray([perc(entropy, i) for i in range(len(entropy))])
        kmeans = KMeans(n_clusters=NCL, random_state=0).fit(outs[3])
        ed = euclidean_distances(outs[3], kmeans.cluster_centers_)
        # The larger ed_score is, the farther the node is from every cluster
        # center, and the less representative it is.
        ed_score = np.min(ed, axis=1)
        edprec = np.asarray([percd(ed_score, i) for i in range(len(ed_score))])
        finalweight = alpha * entrperc + beta * edprec + gamma * cenperc
        # exclude nodes that are already labeled or reserved for val/test
        finalweight[train_mask + val_mask + test_mask] = -100
        select = np.argmax(finalweight)
        idx_train.append(select)
        train_mask = sample_mask(idx_train, labels.shape[0])
        y_train = np.zeros(labels.shape)
        y_train[train_mask, :] = labels[train_mask, :]
    else:
        print('finish select!')
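The `perc` and `percd` percentile helpers are not shown above. A hypothetical minimal reading, assuming they rank a value against the whole array (names and semantics inferred from their use in `finalweight`):

import numpy as np

def perc(values, i):
    """Fraction of entries smaller than values[i]; high entropy -> high score."""
    return np.sum(values < values[i]) / float(len(values))

def percd(values, i):
    """Fraction of entries larger than values[i]; small distance -> high score."""
    return np.sum(values > values[i]) / float(len(values))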