# NOTE: assumed imports for these modules; the project-local helpers
# (LogHandler, load_train_valid_labels, batch_iter, valid_iter, read_features,
# write_in_file, Graph, PALE, MNA, FRUIP, CROSSMNA, main_proc,
# format_crossmna_graph) come from the repository's own utility modules and
# are not reproduced here.
import math
import os
import random
import re
import sys
import time
from collections import defaultdict

import numpy as np
import tensorflow as tf
from sklearn import svm


class _MNA(object):
    def __init__(self, graph, anchorfile, valid_prop, neg_ratio, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return

        self.L = load_train_valid_labels(anchorfile, valid_prop)

        self.graph = graph
        self.look_up = dict()
        self.look_up['f'] = self.graph['f'].look_up_dict
        self.look_up['g'] = self.graph['g'].look_up_dict
        self.look_back = dict()
        self.look_back['f'] = self.graph['f'].look_back_list
        self.look_back['g'] = self.graph['g'].look_back_list

        self.neg_ratio = neg_ratio
        self.batch_size = 1024
        self.clf = svm.SVC()

    def __get_pair_features(self, src_nds, target_nds):
        if len(src_nds) != len(target_nds):
            self.logger.warning(
                'The sample lists passed to __get_pair_features differ in size.')
            return
        for i in range(len(src_nds)):
            src_nd, target_nd = src_nds[i], target_nds[i]
            if src_nd not in self.graph['f'].G or target_nd not in self.graph['g'].G:
                continue
            # anchors adjacent to the source candidate
            src_neighbor_anchors = set()
            for src_nd_to in self.graph['f'].G[src_nd]:
                if src_nd_to in self.L['f2g']['train']:
                    src_neighbor_anchors.add(src_nd_to)
            # anchors adjacent to the target candidate
            target_neighbor_anchors = set()
            for target_nd_to in self.graph['g'].G[target_nd]:
                if target_nd_to in self.L['g2f']['train']:
                    target_neighbor_anchors.add(target_nd_to)
            # common-anchor count and an Adamic-Adar-style measure
            cnt_common_neighbors = .0
            AA_measure = .0
            for sna in src_neighbor_anchors:
                for k in range(len(self.L['f2g']['train'][sna])):
                    target_anchor_nd = self.L['f2g']['train'][sna][k]
                    if target_anchor_nd in target_neighbor_anchors:
                        cnt_common_neighbors += 1.
                        AA_measure += 1. / np.log(
                            (len(self.graph['f'].G[sna])
                             + len(self.graph['g'].G[target_anchor_nd])) / 2.)
            jaccard = cnt_common_neighbors \
                / (len(self.graph['f'].G[src_nd])
                   + len(self.graph['g'].G[target_nd]) - cnt_common_neighbors + 1e-6)
            yield [cnt_common_neighbors, jaccard, AA_measure]

    def __batch_iter(self, lbs, batch_size, neg_ratio, lookup_src, lookup_obj,
                     src_lb_tag, obj_lb_tag):
        train_lb_src2obj = lbs['{}2{}'.format(src_lb_tag, obj_lb_tag)]['train']
        train_lb_obj2src = lbs['{}2{}'.format(obj_lb_tag, src_lb_tag)]['train']
        train_size = len(train_lb_src2obj)

        start_index = 0
        end_index = min(start_index + batch_size, train_size)

        # materialize the key views so they can be indexed (Python 3)
        src_lb_keys = list(train_lb_src2obj.keys())
        obj_lb_keys = list(train_lb_obj2src.keys())
        shuffle_indices = np.random.permutation(np.arange(train_size))
        while start_index < end_index:
            pos_src = list()
            pos_obj = list()
            neg_src = list()
            neg_obj = list()
            for i in range(start_index, end_index):
                idx = shuffle_indices[i]
                src_lb = src_lb_keys[idx]
                obj_lbs = train_lb_src2obj[src_lb]
                for obj_lb in obj_lbs:
                    # draw neg_ratio negative labels that are neither true
                    # matches nor duplicates
                    cur_neg_src = list()
                    cur_neg_obj = list()
                    for k in range(neg_ratio):
                        rand_obj_lb = None
                        while (not rand_obj_lb or rand_obj_lb in cur_neg_obj
                               or rand_obj_lb in obj_lbs):
                            rand_obj_lb_idx = random.randint(0, len(obj_lb_keys) - 1)
                            rand_obj_lb = obj_lb_keys[rand_obj_lb_idx]
                        cur_neg_src.append(src_lb)
                        cur_neg_obj.append(rand_obj_lb)
                    pos_src.append(src_lb)
                    pos_obj.append(obj_lb)
                    neg_src.append(cur_neg_src)
                    neg_obj.append(cur_neg_obj)
            start_index = end_index
            end_index = min(start_index + batch_size, train_size)
            yield pos_src, pos_obj, neg_src, neg_obj

    def train(self):
        batches_f2g = list(self.__batch_iter(self.L, self.batch_size, self.neg_ratio,
                                             self.look_up['f'], self.look_up['g'],
                                             'f', 'g'))
        n_batches = len(batches_f2g)
        X = list()
        Y = list()
        for i in range(n_batches):
            pos_src_f2g, pos_obj_f2g, neg_src_f2g, neg_obj_f2g = batches_f2g[i]
            if len(pos_src_f2g) != len(pos_obj_f2g) or len(neg_src_f2g) != len(neg_obj_f2g):
                self.logger.info('The input label file is malformed.')
                continue
            pos_features = list(self.__get_pair_features(pos_src_f2g, pos_obj_f2g))
            X.extend(pos_features)
            Y.extend([1 for m in range(len(pos_features))])
            for k in range(self.neg_ratio):
                # take the k-th negative sample of every positive pair
                neg_features = list(self.__get_pair_features(
                    [row[k] for row in neg_src_f2g],
                    [row[k] for row in neg_obj_f2g]))
                X.extend(neg_features)
                Y.extend([-1 for m in range(len(neg_features))])
        self.logger.info('Training Model...')
        self.clf.fit(X, Y)
        self.logger.info('Training complete.')
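# --- Illustrative sketch (not part of the original module) ---
# A minimal, self-contained rehearsal of the three pair features that
# __get_pair_features builds for the SVM: common-anchor count, a Jaccard
# score, and an Adamic-Adar-style measure. The sets and degree tables below
# are made-up toy data, and the Jaccard denominator uses the anchor-neighbor
# sets as a stand-in for the full neighborhoods used above.
def _toy_pair_features(src_anchor_nbrs, tgt_anchor_nbrs, deg_f, deg_g):
    common = src_anchor_nbrs & tgt_anchor_nbrs  # anchors adjacent to both candidates
    cnt = float(len(common))
    jaccard = cnt / (len(src_anchor_nbrs) + len(tgt_anchor_nbrs) - cnt + 1e-6)
    aa = sum(1. / np.log((deg_f[a] + deg_g[a]) / 2.) for a in common)
    return [cnt, jaccard, aa]


if __name__ == '__main__':
    deg_f = {'a1': 4, 'a2': 3}
    deg_g = {'a1': 5, 'a2': 2}
    print(_toy_pair_features({'a1', 'a2'}, {'a1'}, deg_f, deg_g))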
class PALE_MLP(object):
    def __init__(self, learning_rate, batch_size, n_input, n_hidden, n_layer,
                 device, files, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)
        self.device = device

        # Parameters
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.valid_prop = .9
        self.valid_sample_size = 9
        self.cur_epoch = 1

        # Network Parameters
        self.n_hidden = n_hidden  # number of neurons in each hidden layer
        self.n_input = n_input    # size of node embeddings
        self.n_layer = n_layer    # number of layers

        # Set Train Data
        if not isinstance(files, list) or len(files) < 3:
            self.logger.info(
                'The algorithm needs files like '
                '[First Graph File, Second Graph File, Label File]')
            return

        # tf Graph input
        self.lookup_f = dict()
        self.lookup_g = dict()
        self.look_back_f = list()
        self.look_back_g = list()
        self._read_train_dat(files[0], files[1], files[2])  # douban, weibo, label files
        self.valid_sample_size = min(self.valid_sample_size,
                                     len(self.look_back_f) - 1,
                                     len(self.look_back_g) - 1)

        # TF Graph Building
        self.sess = tf.Session()
        cur_seed = random.getrandbits(32)
        initializer = tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed)
        with tf.device(self.device):
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                self.mlp_weights()
                self.build_train_graph()
                self.build_valid_graph()
        self.sess.run(tf.global_variables_initializer())

    def _read_labels(self, label_file):
        labels = list()
        with open(label_file, 'r') as lb_handler:
            for ln in lb_handler:
                ln = ln.strip()
                if not ln:
                    break
                labels.append(ln.split())
        return labels

    def _read_embeddings(self, embed_file, lookup, look_back):
        embedding = list()
        with open(embed_file, 'r') as emb_handler:
            idx = 0
            for ln in emb_handler:
                ln = ln.strip()
                if ln:
                    elems = ln.split()
                    if len(elems) == 2:
                        continue  # skip the "<node_count> <dim>" header line
                    embedding.append(list(map(float, elems[1:])))  # Py3: materialize map
                    lookup[elems[0]] = idx
                    look_back.append(elems[0])
                    idx += 1
        return np.array(embedding), lookup, look_back

    def _read_train_dat(self, embed1_file, embed2_file, label_file):
        self.L = load_train_valid_labels(label_file, self.valid_prop)
        self.X, self.lookup_f, self.look_back_f = self._read_embeddings(
            embed1_file, self.lookup_f, self.look_back_f)
        self.Y, self.lookup_g, self.look_back_g = self._read_embeddings(
            embed2_file, self.lookup_g, self.look_back_g)

    def mlp_weights(self):
        # Store layers' weights & biases
        self.weights = dict()
        self.biases = dict()
        self.weights['h0'] = tf.Variable(
            tf.random_normal([self.n_input, self.n_hidden]))
        self.biases['b0'] = tf.Variable(tf.zeros([self.n_hidden]))
        for i in range(1, self.n_layer):
            self.weights['h{}'.format(i)] = tf.Variable(
                tf.random_normal([self.n_hidden, self.n_hidden]))
            self.biases['b{}'.format(i)] = tf.Variable(tf.zeros([self.n_hidden]))
        self.weights['out'] = tf.Variable(
            tf.random_normal([self.n_hidden, self.n_input]))
        self.biases['b_out'] = tf.Variable(tf.zeros([self.n_input]))

    def build_code_graph(self, inputs):
        # Input layer
        layer = tf.nn.sigmoid(
            tf.add(tf.matmul(tf.reshape(inputs, [-1, self.n_input]),
                             self.weights['h0']),
                   self.biases['b0']))
        for i in range(1, self.n_layer):
            layer = tf.nn.sigmoid(
                tf.add(tf.matmul(layer, self.weights['h{}'.format(i)]),
                       self.biases['b{}'.format(i)]))
        # Output fully connected layer
        code = tf.nn.tanh(
            tf.matmul(layer, self.weights['out']) + self.biases['b_out'])
        return code

    def build_lin_code_graph(self, inputs):
        # Output fully connected layer
        code = tf.matmul(tf.reshape(inputs, [-1, self.n_input]),
                         self.weights['out']) + self.biases['b_out']
        return code

    def build_train_graph(self):
        self.cur_batch_size = tf.placeholder('float32', name='batch_size')
        self.pos_f_inputs = tf.placeholder('float32', [None, self.n_input])
        self.pos_g_inputs = tf.placeholder('float32', [None, self.n_input])
        self.PF = self.build_code_graph(self.pos_f_inputs)  # batch_size*n_input
        # train loss
        self.loss = tf.reduce_mean(.5 * tf.square(self.PF - self.pos_g_inputs))
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

    def build_valid_graph(self):
        # validation
        self.valid_f_inputs = tf.placeholder(
            'float32', [None, self.valid_sample_size, self.n_input])
        self.valid_g_inputs = tf.placeholder(
            'float32', [None, self.valid_sample_size, self.n_input])
        valid_f = tf.reshape(
            self.build_code_graph(self.valid_f_inputs),
            [-1, self.valid_sample_size, self.n_input])  # batch_size*valid_sample_size*n_input
        self.dot_dist = tf.reduce_sum(tf.pow(valid_f - self.valid_g_inputs, 2.),
                                      axis=2)
        # An alternative Hamming-style distance was tried here:
        # self.hamming_dist = tf.reduce_sum(
        #     tf.clip_by_value(tf.sign(tf.multiply(tf.sign(valid_f), tf.sign(valid_g))), .0, 1.),
        #     axis=2)

    def train_one_epoch(self):
        sum_loss = 0.0
        # train process
        batches = batch_iter(self.L, self.batch_size, 0,
                             self.lookup_f, self.lookup_g, 'f', 'g')
        batch_id = 0
        for batch in batches:
            pos_f, pos_g, neg_f, neg_g = batch
            if not len(pos_f) == len(pos_g):
                self.logger.info('The input label file is malformed.')
                continue
            batch_size = len(pos_f)
            feed_dict = {
                self.pos_f_inputs: self.X[pos_f, :],
                self.pos_g_inputs: self.Y[pos_g, :],
                self.cur_batch_size: batch_size
            }
            _, cur_loss = self.sess.run([self.train_op, self.loss], feed_dict)
            sum_loss += cur_loss
            batch_id += 1
        # valid process: rank the true counterpart (index 0) against sampled
        # impostors and accumulate the mean reciprocal rank (MRR)
        valid_f, valid_g = valid_iter(self.L, self.valid_sample_size,
                                      self.lookup_f, self.lookup_g, 'f', 'g')
        if not len(valid_f) == len(valid_g):
            self.logger.info('The input label file is malformed.')
            return
        valid_size = len(valid_f)
        feed_dict = {
            self.valid_f_inputs: self.X[valid_f, :],
            self.valid_g_inputs: self.Y[valid_g, :]
        }
        valid_dist = self.sess.run(self.dot_dist, feed_dict)
        mrr = .0
        for i in range(valid_size):
            fst_dist = valid_dist[i][0]
            pos = 1
            for k in range(1, len(valid_dist[i])):
                if fst_dist >= valid_dist[i][k]:
                    pos += 1
            mrr += 1. / pos
        self.logger.info('Epoch={}, mean loss={!s}, mrr={}'.format(
            self.cur_epoch, sum_loss / batch_id, mrr / valid_size))
        self.cur_epoch += 1

    def _write_in_file(self, filename, vec, tag):
        with open(filename, 'a') as res_handler:  # 'aw' is not a valid mode; append
            if len(vec.shape) > 1:
                column_size = vec.shape[1]
            else:
                column_size = 1
            reshape_vec = vec.reshape(-1)
            vec_size = len(reshape_vec)
            res_handler.write(tag + '\n')
            for i in range(0, vec_size, column_size):
                res_handler.write('{}\n'.format(' '.join(
                    [str(reshape_vec[i + k]) for k in range(column_size)])))

    def save_models(self, filename):
        if os.path.exists(filename):
            os.remove(filename)
        for k, v in self.weights.items():  # Py3: iteritems() -> items()
            self._write_in_file(filename, v.eval(self.sess), k)
        for k, v in self.biases.items():
            self._write_in_file(filename, v.eval(self.sess), k)
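# --- Illustrative sketch (not part of the original module) ---
# A NumPy transcription of the forward map that build_code_graph assembles in
# TensorFlow: n_layer sigmoid hidden layers followed by a tanh projection back
# to the embedding size. Shapes mirror mlp_weights; the weights below are
# random stand-ins, so only the shapes are meaningful.
def _mlp_forward_sketch(x, weights, biases, n_layer):
    sigmoid = lambda z: 1. / (1. + np.exp(-z))
    layer = sigmoid(x.dot(weights['h0']) + biases['b0'])
    for i in range(1, n_layer):
        layer = sigmoid(layer.dot(weights['h{}'.format(i)]) + biases['b{}'.format(i)])
    return np.tanh(layer.dot(weights['out']) + biases['b_out'])


if __name__ == '__main__':
    n_input, n_hidden, n_layer = 8, 16, 2
    rng = np.random.RandomState(0)
    w = {'h0': rng.randn(n_input, n_hidden),
         'h1': rng.randn(n_hidden, n_hidden),
         'out': rng.randn(n_hidden, n_input)}
    b = {'b0': np.zeros(n_hidden), 'b1': np.zeros(n_hidden),
         'b_out': np.zeros(n_input)}
    print(_mlp_forward_sketch(rng.randn(4, n_input), w, b, n_layer).shape)  # (4, 8)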
def main(args):
    t1 = time.time()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    logger = LogHandler('RUN.' + time.strftime('%Y-%m-%d', time.localtime(time.time())))
    logger.info(args)

    SAVING_STEP = args.saving_step
    MAX_EPOCHS = args.epochs

    if args.method == 'pale':
        model = PALE(learning_rate=args.lr,
                     batch_size=args.batch_size,
                     n_input=args.input_size,
                     n_hidden=args.hidden_size,
                     n_layer=args.layers,
                     files=[args.embedding1, args.embedding2, args.identity_linkage],
                     type_model=args.type_model,
                     is_valid=args.is_valid,
                     log_file=args.log_file,
                     device=args.device)
    if args.method == 'mna' or args.method == 'fruip':
        graph = defaultdict(Graph)
        print("Loading graph...")
        if args.graph_format == 'adjlist':
            if args.graph1:
                graph['f'].read_adjlist(filename=args.graph1)
            if args.graph2:
                graph['g'].read_adjlist(filename=args.graph2)
        if args.graph_format == 'edgelist':
            if args.graph1:
                graph['f'].read_edgelist(filename=args.graph1)
            if args.graph2:
                graph['g'].read_edgelist(filename=args.graph2)
        if args.method == 'mna':
            model = MNA(graph=graph, anchorfile=args.identity_linkage,
                        valid_prop=1., neg_ratio=3, log_file=args.log_file)
        if args.method == 'fruip':
            embed_files = [args.embedding1, args.embedding2]
            model = FRUIP(graph=graph, embed_files=embed_files,
                          linkage_file=args.identity_linkage)
            model.main_proc(args.threshold)
    if args.method == 'final':
        main_proc(graph_files=[args.graph1, args.graph2],
                  graph_sizes=[args.graph_size1, args.graph_size2],
                  linkage_file=args.identity_linkage,
                  alpha=args.alpha,
                  epoch=args.epochs,
                  tol=args.tol,
                  graph_format=args.graph_format,
                  test_anchor_file=args.test_anchors,
                  output_file=args.output)

    if args.method in ['pale']:
        losses = np.zeros(MAX_EPOCHS)
        val_scrs = np.zeros(MAX_EPOCHS)
        best_scr = .0
        best_epoch = 0
        thres = 100
        for i in range(1, MAX_EPOCHS + 1):
            losses[i - 1], val_scrs[i - 1] = model.train_one_epoch()
            if i > 0 and i % SAVING_STEP == 0:
                loss_mean = np.mean(losses[i - SAVING_STEP:i])
                scr_mean = np.mean(val_scrs[i - SAVING_STEP:i])
                logger.info('loss in last {} epochs: {}, validation in last {} epochs: {}'
                            .format(SAVING_STEP, loss_mean, SAVING_STEP, scr_mean))
                if scr_mean > best_scr:
                    best_scr = scr_mean
                    best_epoch = i
                    model.save_models(args.output)
                # early stop once the validation score has not improved for
                # `thres` consecutive saving windows
                if args.early_stop and i >= thres * SAVING_STEP:
                    cnt = 0
                    for k in range(thres - 1, -1, -1):
                        cur_val = np.mean(
                            val_scrs[i - (k + 1) * SAVING_STEP:i - k * SAVING_STEP])
                        if cur_val < best_scr:
                            cnt += 1
                    if cnt == thres and (i - best_epoch) >= thres * SAVING_STEP:
                        logger.info('*********early stop*********')
                        logger.info('The best epoch: {}\nThe validation score: {}'
                                    .format(best_epoch, best_scr))
                        break
    if args.method in ['mna', 'fruip']:
        model.save_model(args.output)
    t2 = time.time()
    print('time cost:', t2 - t1)
def main(args):
    t1 = time.time()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    logger = LogHandler('RUN.' + time.strftime('%Y-%m-%d', time.localtime(time.time())))
    logger.info(args)

    SAVING_STEP = args.saving_step
    MAX_EPOCHS = args.epochs

    if args.method == 'pale':
        model = PALE(learning_rate=args.lr,
                     batch_size=args.batch_size,
                     n_input=args.input_size,
                     n_hidden=args.hidden_size,
                     n_layer=args.layers,
                     files=args.embeddings + [args.identity_linkage],
                     type_model=args.type_model,
                     is_valid=args.is_valid,
                     log_file=args.log_file,
                     device=args.device)
        losses = np.zeros(MAX_EPOCHS)
        val_scrs = np.zeros(MAX_EPOCHS)
        best_scr = .0
        best_epoch = 0
        thres = 100
        for i in range(1, MAX_EPOCHS + 1):
            losses[i - 1], val_scrs[i - 1] = model.train_one_epoch()
            if i > 0 and i % SAVING_STEP == 0:
                loss_mean = np.mean(losses[i - SAVING_STEP:i])
                scr_mean = np.mean(val_scrs[i - SAVING_STEP:i])
                logger.info('loss in last {} epochs: {}, validation in last {} epochs: {}'
                            .format(SAVING_STEP, loss_mean, SAVING_STEP, scr_mean))
                if scr_mean > best_scr:
                    best_scr = scr_mean
                    best_epoch = i
                    model.save_models(args.output)
                # early stop once the validation score has not improved for
                # `thres` consecutive saving windows
                if args.early_stop and i >= thres * SAVING_STEP:
                    cnt = 0
                    for k in range(thres - 1, -1, -1):
                        cur_val = np.mean(
                            val_scrs[i - (k + 1) * SAVING_STEP:i - k * SAVING_STEP])
                        if cur_val < best_scr:
                            cnt += 1
                    if cnt == thres and (i - best_epoch) >= thres * SAVING_STEP:
                        logger.info('*********early stop*********')
                        logger.info('The best epoch: {}\nThe validation score: {}'
                                    .format(best_epoch, best_scr))
                        break
    if args.method == 'mna' or args.method == 'fruip':
        graph = defaultdict(Graph)
        print("Loading graph...")
        if len(args.graphs) != 2:
            logger.error('#####The input graphs must be pairwise!#####')
            sys.exit(1)
        if args.graph_format == 'adjlist':
            if args.graphs[0]:
                graph['f'].read_adjlist(filename=args.graphs[0])
            if args.graphs[1]:
                graph['g'].read_adjlist(filename=args.graphs[1])
        if args.graph_format == 'edgelist':
            if args.graphs[0]:
                graph['f'].read_edgelist(filename=args.graphs[0])
            if args.graphs[1]:
                graph['g'].read_edgelist(filename=args.graphs[1])
        if args.method == 'mna':
            model = MNA(graph=graph, attr_file=args.embeddings,
                        anchorfile=args.identity_linkage, valid_prop=1.,
                        use_net=args.use_net, neg_ratio=args.neg_ratio,
                        log_file=args.log_file)
        if args.method == 'fruip':
            model = FRUIP(graph=graph, embed_files=args.embeddings,
                          linkage_file=args.identity_linkage)
            model.main_proc(args.threshold)
    if args.method == 'final':
        main_proc(graph_files=args.graphs,
                  graph_sizes=args.graph_sizes,
                  linkage_file=args.identity_linkage,
                  alpha=args.alpha,
                  epoch=args.epochs,
                  tol=args.tol,
                  graph_format=args.graph_format,
                  output_file=args.output)
    if args.method == 'crossmna':
        num_graphs = len(args.graphs)
        layer_graphs = [Graph() for i in range(num_graphs)]
        for k in range(num_graphs):
            graph_path = args.graphs[k]
            format_graph_path = '{}.crossmna'.format(graph_path)
            format_crossmna_graph(graph_path, format_graph_path, k)
            if args.graph_format == 'adjlist':
                layer_graphs[k].read_adjlist(filename=format_graph_path)
            if args.graph_format == 'edgelist':
                layer_graphs[k].read_edgelist(filename=format_graph_path)
        model = CROSSMNA(layer_graphs=layer_graphs,
                         anchor_file=args.identity_linkage,
                         lr=args.lr,
                         batch_size=args.batch_size,
                         nd_rep_size=args.nd_rep_size,
                         layer_rep_size=args.layer_rep_size,
                         epoch=args.epochs,
                         negative_ratio=args.neg_ratio,
                         table_size=args.table_size,
                         outfile=args.output,
                         log_file=args.log_file)
    if args.method in ['mna', 'fruip', 'pale']:
        model.save_model(args.output)
    t2 = time.time()
    print('time cost:', t2 - t1)
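# --- Illustrative sketch (not part of the original module) ---
# A minimal argparse setup matching the attributes the entry points above
# read (args.method, args.gpu_id, args.epochs, ...). The flag names are
# assumptions inferred from those attribute accesses, not the repository's
# actual CLI; method-specific options (alpha, tol, thresholds, ...) are omitted.
import argparse


def _parse_args_sketch():
    p = argparse.ArgumentParser(description='network-alignment runner (sketch)')
    p.add_argument('--method', required=True,
                   choices=['pale', 'mna', 'fruip', 'final', 'crossmna'])
    p.add_argument('--gpu-id', dest='gpu_id', default='0')
    p.add_argument('--epochs', type=int, default=100)
    p.add_argument('--saving-step', dest='saving_step', type=int, default=10)
    p.add_argument('--lr', type=float, default=1e-3)
    p.add_argument('--batch-size', dest='batch_size', type=int, default=1024)
    p.add_argument('--graphs', nargs='+', default=[])
    p.add_argument('--embeddings', nargs='+', default=[])
    p.add_argument('--identity-linkage', dest='identity_linkage')
    p.add_argument('--graph-format', dest='graph_format',
                   choices=['adjlist', 'edgelist'], default='edgelist')
    p.add_argument('--neg-ratio', dest='neg_ratio', type=int, default=5)
    p.add_argument('--early-stop', dest='early_stop', action='store_true')
    p.add_argument('--output', default='res.out')
    p.add_argument('--log-file', dest='log_file', default='log')
    return p.parse_args()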
class HALF_DP(object):
    def __init__(self, learning_rate, batch_size, neg_ratio, gamma, eta, n_input,
                 n_out, n_hidden, n_layer, type_model, is_valid, device, files,
                 log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)
        self.device = device
        self.type_model = type_model

        # Parameters
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.neg_ratio = neg_ratio
        self.valid = is_valid
        self.valid_prop = .9 if self.valid else 1.
        self.valid_sample_size = 10
        self.gamma = gamma
        self.eta = eta
        self.cur_epoch = 1

        # Network Parameters
        self.n_hidden = n_hidden if type_model == 'mlp' else n_input  # neurons per hidden layer
        self.n_input = n_input  # size of node embeddings
        self.n_out = n_out      # size of the hashing code
        self.n_layer = n_layer  # number of layers

        # Set Train Data (the files mapping is accessed by key below)
        if not isinstance(files, dict) or len(files) < 3:
            self.logger.info(
                'The algorithm needs inputs: feature-src, feature-end, identity-linkage')
            return

        # tf Graph input
        self.lookup = defaultdict(dict)
        self.look_back = defaultdict(list)
        self._read_train_dat(files)  # features from source, features from end, label file
        self.valid_sample_size = min(self.valid_sample_size,
                                     len(self.look_back['src']) - 1,
                                     len(self.look_back['end']) - 1)

        # TF Graph Building
        self.sess = tf.Session()
        cur_seed = random.getrandbits(32)
        initializer = tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed)
        with tf.device(self.device):
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                self._init_weights()
                self.build_graph(type_model)
                self.build_valid_graph(type_model)
        self.sess.run(tf.global_variables_initializer())

    def _read_train_dat(self, files):
        self.F, self.lookup['src'], self.look_back['src'] = read_features(files['feat-src'])
        self.G, self.lookup['end'], self.look_back['end'] = read_features(files['feat-end'])
        self.L = load_train_valid_labels(files['linkage'], self.lookup, self.valid_prop)

    def _init_weights(self):
        # Store layers' weights & biases, one stack per network ('src'/'end')
        self.weights = dict()
        self.biases = dict()
        if self.type_model == 'mlp':
            self.weights['h0_src'] = tf.Variable(
                tf.random_normal([self.n_input, self.n_hidden]))
            self.weights['h0_end'] = tf.Variable(
                tf.random_normal([self.n_input, self.n_hidden]))
            self.biases['b0_src'] = tf.Variable(tf.zeros([self.n_hidden]))
            self.biases['b0_end'] = tf.Variable(tf.zeros([self.n_hidden]))
            for i in range(1, self.n_layer):
                self.weights['h{}_src'.format(i)] = tf.Variable(
                    tf.random_normal([self.n_hidden, self.n_hidden]))
                self.weights['h{}_end'.format(i)] = tf.Variable(
                    tf.random_normal([self.n_hidden, self.n_hidden]))
                self.biases['b{}_src'.format(i)] = tf.Variable(tf.zeros([self.n_hidden]))
                self.biases['b{}_end'.format(i)] = tf.Variable(tf.zeros([self.n_hidden]))
        self.weights['out_src'] = tf.Variable(
            tf.random_normal([self.n_hidden, self.n_out]))
        self.weights['out_end'] = tf.Variable(
            tf.random_normal([self.n_hidden, self.n_out]))
        self.biases['b_out_src'] = tf.Variable(tf.zeros([self.n_out]))
        self.biases['b_out_end'] = tf.Variable(tf.zeros([self.n_out]))

    def build_lin_code_graph(self, inputs, tag):
        # Output fully connected layer
        code = tf.nn.tanh(
            tf.matmul(tf.reshape(inputs, [-1, self.n_input]),
                      self.weights['out_' + tag])
            + self.biases['b_out_' + tag])
        return code

    def build_mlp_code_graph(self, inputs, tag):
        # Input layer
        layer = tf.nn.sigmoid(
            tf.add(tf.matmul(tf.reshape(inputs, [-1, self.n_input]),
                             self.weights['h0_' + tag]),
                   self.biases['b0_' + tag]))
        for i in range(1, self.n_layer):
            layer = tf.nn.sigmoid(
                tf.add(tf.matmul(layer, self.weights['h{}_{}'.format(i, tag)]),
                       self.biases['b{}_{}'.format(i, tag)]))
        # Output fully connected layer
        code = tf.nn.tanh(
            tf.matmul(layer, self.weights['out_' + tag]) + self.biases['b_out_' + tag])
        return code

    def build_train_graph(self, src_tag, end_tag, code_graph):
        PF = code_graph(self.inputs_pos[src_tag], src_tag)  # batch_size*n_out
        PG = code_graph(self.inputs_pos[end_tag], end_tag)  # batch_size*n_out
        NF = tf.reshape(code_graph(self.inputs_neg[src_tag], src_tag),
                        [-1, self.neg_ratio, self.n_out])  # batch_size*neg_ratio*n_out
        NG = tf.reshape(code_graph(self.inputs_neg[end_tag], end_tag),
                        [-1, self.neg_ratio, self.n_out])  # batch_size*neg_ratio*n_out
        B = tf.sign(PF + PG)  # batch_size*n_out, the joint binary code

        # train loss: pairwise likelihood (term1), quantization penalty toward
        # the binary code B (term2), and a norm regularizer (term3)
        term1_first = tf.log(tf.nn.sigmoid(tf.reduce_sum(tf.multiply(PF, PG), axis=1)))
        term1_second = tf.reduce_sum(
            tf.log(1 - tf.nn.sigmoid(tf.reduce_sum(tf.multiply(NF, NG), axis=2))),
            axis=1)
        term1 = -tf.reduce_sum(term1_first + term1_second)
        term2 = tf.reduce_sum(tf.pow(B - PF, 2)) + tf.reduce_sum(tf.pow(B - PG, 2))
        term3 = tf.reduce_sum(tf.pow(PF, 2) + tf.reduce_sum(tf.pow(NF, 2), axis=1)) \
            + tf.reduce_sum(tf.pow(PG, 2) + tf.reduce_sum(tf.pow(NG, 2), axis=1))
        return (term1 + self.gamma * term2 + self.eta * term3) / self.cur_batch_size

    def build_graph(self, type_code_graph):
        self.cur_batch_size = tf.placeholder('float32', name='batch_size')
        self.inputs_pos = {
            'src': tf.placeholder('float32', [None, self.n_input]),
            'end': tf.placeholder('float32', [None, self.n_input])
        }
        self.inputs_neg = {
            'src': tf.placeholder('float32', [None, self.neg_ratio, self.n_input]),
            'end': tf.placeholder('float32', [None, self.neg_ratio, self.n_input])
        }
        if type_code_graph == 'lin':
            code_graph = self.build_lin_code_graph
        elif type_code_graph == 'mlp':
            code_graph = self.build_mlp_code_graph
        self.loss = (self.build_train_graph('src', 'end', code_graph)
                     + self.build_train_graph('end', 'src', code_graph)) / 2.
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

    def build_valid_graph(self, type_code_graph):
        # validation
        self.inputs_val = {
            'src': tf.placeholder('float32',
                                  [None, self.valid_sample_size, self.n_input]),
            'end': tf.placeholder('float32',
                                  [None, self.valid_sample_size, self.n_input])
        }
        if type_code_graph == 'lin':
            code_graph = self.build_lin_code_graph
        elif type_code_graph == 'mlp':
            code_graph = self.build_mlp_code_graph
        valids = {
            'src': tf.reshape(code_graph(self.inputs_val['src'], 'src'),
                              [-1, self.valid_sample_size, self.n_out]),
            'end': tf.reshape(code_graph(self.inputs_val['end'], 'end'),
                              [-1, self.valid_sample_size, self.n_out])
        }
        # count sign disagreements between the two codes (a Hamming distance)
        self.hamming_dist = -tf.reduce_sum(
            tf.clip_by_value(tf.sign(tf.multiply(valids['src'], valids['end'])),
                             -1., 0.),
            axis=2)

    def train_one_epoch(self):
        sum_loss = 0.0
        mrr = 0.0
        valid_size = 0
        # train process
        batches = batch_iter(self.L, self.batch_size, self.neg_ratio,
                             self.lookup, 'src', 'end')
        batch_id = 0
        for batch in batches:
            # train the mapping between source and end networks
            pos, neg = batch
            if len(pos['src']) != len(pos['end']) or len(neg['src']) != len(neg['end']):
                self.logger.info('The input label file is malformed.')
                continue
            batch_size = len(pos['src'])
            feed_dict = {
                self.inputs_pos['src']: self.F[pos['src'], :],
                self.inputs_pos['end']: self.G[pos['end'], :],
                self.inputs_neg['src']: self.F[neg['src'], :],
                self.inputs_neg['end']: self.G[neg['end'], :],
                self.cur_batch_size: batch_size
            }
            _, cur_loss = self.sess.run([self.train_op, self.loss], feed_dict)
            sum_loss += cur_loss
            batch_id += 1
        if self.valid:
            # valid process: rank the true counterpart (index 0) by Hamming
            # distance and accumulate the mean reciprocal rank (MRR)
            valid = valid_iter(self.L, self.valid_sample_size, self.lookup,
                               'src', 'end')
            if not len(valid['src']) == len(valid['end']):
                self.logger.info('The input label file is malformed.')
                return
            valid_size = len(valid['src'])
            feed_dict = {
                self.inputs_val['src']: self.F[valid['src'], :],
                self.inputs_val['end']: self.G[valid['end'], :],
            }
            valid_dist = self.sess.run(self.hamming_dist, feed_dict)
            mrr = .0
            for i in range(valid_size):
                fst_dist = valid_dist[i][0]
                pos = 1
                for k in range(1, len(valid_dist[i])):
                    if fst_dist >= valid_dist[i][k]:
                        pos += 1
                mrr += 1. / pos
            self.logger.info('Epoch={}, mean loss={!s}, mrr={}'.format(
                self.cur_epoch, sum_loss / batch_id / 2, mrr / valid_size))
        else:
            self.logger.info('Epoch={}, mean loss={!s}'.format(
                self.cur_epoch, sum_loss / batch_id / 2))
        self.cur_epoch += 1
        return sum_loss / (batch_id + 1e-8), mrr / (valid_size + 1e-8)

    def save_models(self, filename):
        if os.path.exists(filename):
            os.remove(filename)
        for k, v in self.weights.items():
            if self.type_model == 'lin' and 'out' not in k:
                continue
            write_in_file(filename, v.eval(self.sess), k)
        for k, v in self.biases.items():
            if self.type_model == 'lin' and 'out' not in k:
                continue
            write_in_file(filename, v.eval(self.sess), k)
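# --- Illustrative sketch (not part of the original module) ---
# What build_valid_graph's hamming_dist computes, in NumPy: sign(src*end) is
# -1 exactly where the two codes disagree in sign, so clipping to [-1, 0] and
# negating the sum counts disagreeing bits, i.e. a Hamming distance between
# the binarized codes. Toy values only.
def _hamming_dist_sketch(src, end):
    return -np.sum(np.clip(np.sign(src * end), -1., 0.), axis=-1)


if __name__ == '__main__':
    a = np.array([[0.9, -0.2, 0.4], [0.1, 0.5, -0.7]])
    b = np.array([[0.8, 0.3, -0.6], [-0.2, 0.4, -0.9]])
    print(_hamming_dist_sketch(a, b))  # [2. 1.]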
class _LINE_ANCHORREG_ALIGN_PRETRAIN(object):
    def __init__(self, graph, lr=.001, gamma=.1, rep_size=128, batch_size=100,
                 negative_ratio=5, order=3, table_size=1e8, embedfile=None,
                 anchorfile=None, log_file='log'):
        if not embedfile or not anchorfile:
            return
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = int(table_size)
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6
        self._init_sigmoid_table()

        self.g = graph
        self.look_up = self.g.look_up_dict
        self.idx = defaultdict(int)
        self.update_dict = defaultdict(dict)
        self.update_look_back = defaultdict(list)

        self.node_size = graph.G.number_of_nodes()
        self.rep_size = rep_size
        self.order = order

        self.lr = lr
        self.gamma = gamma
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio

        self._gen_sampling_table()
        self._init_params(self.node_size, rep_size, embedfile, anchorfile)

    def _init_params(self, node_size, rep_size, embedfile, anchorfile):
        self.embeddings = dict()
        self.embeddings['order1'] = np.random.normal(0, 1, (node_size, rep_size))
        self.embeddings['order2'] = np.random.normal(0, 1, (node_size, rep_size))
        self.embeddings['content'] = np.random.normal(0, 1, (node_size, rep_size))
        self.embeddings['order1'] = self._set_anchor_nds(
            self.embeddings['order1'], embedfile, anchorfile, 1)
        self.embeddings['order2'] = self._set_anchor_nds(
            self.embeddings['order2'], embedfile, anchorfile, 2)
        self._init_update_params(node_size, rep_size)
        self._pre_train()

    def _init_update_params(self, node_size, rep_size):
        # adagrad accumulators
        self.h_delta = dict()
        self.h_delta['order1'] = np.zeros((node_size, rep_size))
        self.h_delta['order2'] = np.zeros((node_size, rep_size))
        self.h_delta['content'] = np.zeros((node_size, rep_size))
        # adam moments
        self.m = dict()
        self.m['order1'] = np.zeros((node_size, rep_size))
        self.m['order2'] = np.zeros((node_size, rep_size))
        self.m['content'] = np.zeros((node_size, rep_size))
        self.v = dict()
        self.v['order1'] = np.zeros((node_size, rep_size))
        self.v['order2'] = np.zeros((node_size, rep_size))
        self.v['content'] = np.zeros((node_size, rep_size))
        self.t = 1

    def _read_anchors(self, anchorfile):
        anchors = list()
        with open(anchorfile, 'r') as anchor_handler:
            for ln in anchor_handler:
                elems = ln.strip().split()
                anchors.append((elems[0], elems[1]))
        return anchors

    def _read_embeddings(self, embedfile):
        embeddings = dict()
        with open(embedfile, 'r') as embed_handler:
            for ln in embed_handler:
                elems = ln.strip().split()
                if len(elems) <= 2:
                    continue
                embeddings[elems[0]] = list(map(float, elems[1:]))  # Py3: materialize map
        return embeddings

    def _set_anchor_nds(self, mat, embedfile, anchorfile, order):
        self.anchors = self._read_anchors(anchorfile)
        self.src_embeddings = self._read_embeddings(embedfile)
        self.anchor_idx = set()
        for src_nd, target_nd in self.anchors:
            if target_nd not in self.look_up or src_nd not in self.src_embeddings:
                continue
            if len(mat[self.look_up[target_nd]]) != len(self.src_embeddings[src_nd]):
                self.logger.error(
                    'The length of embeddings at anchor nodes is illegal')
                break
            self.anchor_idx.add(self.look_up[target_nd])
            # anchors are pinned to the embeddings from the source network
            mat[self.look_up[target_nd]] = self.src_embeddings[src_nd]
        return mat

    def _pre_train(self):
        self.logger.info("Pretraining...")
        DISPLAY_EPOCH = 1000
        order = self.order
        batches = self.batch_iter()
        opt_type = 'adam'
        for batch in batches:
            self.idx = defaultdict(int)
            self.update_look_back = defaultdict(list)
            self.update_dict = defaultdict(dict)
            if order == 1 or order == 3:
                delta_eh_o1 = self._pretrain_update_graph_by_order1(batch)
                len_delta = len(delta_eh_o1)
                if opt_type == 'adagrad':
                    self.h_delta['order1'], self.embeddings['order1'] = \
                        self.update_vec('nd_order1', self.h_delta['order1'], delta_eh_o1,
                                        self.embeddings['order1'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order1'], self.v['order1'], self.embeddings['order1'] = \
                        self.update_vec_by_adam('nd_order1', self.m['order1'],
                                                self.v['order1'], delta_eh_o1,
                                                self.embeddings['order1'],
                                                len_delta, self.t)
            if order == 2 or order == 3:
                delta_c, delta_eh_o2 = self._pretrain_update_graph_by_order2(batch)
                len_delta = len(delta_eh_o2)
                if opt_type == 'adagrad':
                    self.h_delta['order2'], self.embeddings['order2'] = \
                        self.update_vec('nd_order2', self.h_delta['order2'], delta_eh_o2,
                                        self.embeddings['order2'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order2'], self.v['order2'], self.embeddings['order2'] = \
                        self.update_vec_by_adam('nd_order2', self.m['order2'],
                                                self.v['order2'], delta_eh_o2,
                                                self.embeddings['order2'],
                                                len_delta, self.t)
                len_content = len(delta_c)
                if opt_type == 'adagrad':
                    self.h_delta['content'], self.embeddings['content'] = \
                        self.update_vec('cnt_order2', self.h_delta['content'], delta_c,
                                        self.embeddings['content'], len_content, self.t)
                if opt_type == 'adam':
                    self.m['content'], self.v['content'], self.embeddings['content'] = \
                        self.update_vec_by_adam('cnt_order2', self.m['content'],
                                                self.v['content'], delta_c,
                                                self.embeddings['content'],
                                                len_content, self.t)
            if (self.t - 1) % DISPLAY_EPOCH == 0:
                self.get_cur_batch_loss(self.t, batch)
            self.t += 1
        self._init_update_params(self.node_size, self.rep_size)
        self.logger.info("End of Pretraining")

    def _init_sigmoid_table(self):
        for k in range(self.sigmoid_table_size):
            x = 2. * self.SIGMOID_BOUND * k / self.sigmoid_table_size - self.SIGMOID_BOUND
            self.sigmoid_table[k] = 1. / (1 + np.exp(-x))

    def _fast_sigmoid(self, val):
        # piecewise-constant sigmoid lookup; clamp outside [-BOUND, BOUND]
        if val > self.SIGMOID_BOUND:
            return 1 - self.epsilon
        elif val < -self.SIGMOID_BOUND:
            return self.epsilon
        k = int((val + self.SIGMOID_BOUND) * self.sigmoid_table_size
                / self.SIGMOID_BOUND / 2)
        return self.sigmoid_table[k]

    def _pretrain_update_graph_by_order2(self, batch):
        # second-order gradients; anchor rows (self.anchor_idx) stay frozen
        # during pretraining
        pos_h, pos_t, pos_h_v, neg_t = batch
        batch_size = len(pos_h)
        # order 2
        pos_u = self.embeddings['order2'][pos_h, :]
        pos_v_c = self.embeddings['content'][pos_t, :]
        neg_u = self.embeddings['order2'][pos_h_v, :]
        neg_v_c = self.embeddings['content'][neg_t, :]
        pos_e = np.sum(pos_u * pos_v_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v_c, axis=2)  # neg_e.shape = batch_size*negative_ratio
        sigmoid_pos_e = np.array([self._fast_sigmoid(val)
                                  for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val)
                                  for val in neg_e.reshape(-1)]).reshape(neg_e.shape)
        # temporal delta
        delta_eh = list()
        delta_c = list()
        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            if v not in self.anchor_idx:
                delta_c = self._calc_delta_vec('cnt_order2', v, delta_c,
                                               (sigmoid_pos_e[i] - 1) * pos_u[i, :])
            if u not in self.anchor_idx:
                delta_eh = self._calc_delta_vec('nd_order2', u, delta_eh,
                                                (sigmoid_pos_e[i] - 1) * pos_v_c[i, :])
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                if v not in self.anchor_idx:
                    delta_c = self._calc_delta_vec(
                        'cnt_order2', v, delta_c, sigmoid_neg_e[i, j] * neg_u[i, j, :])
                if u not in self.anchor_idx:
                    delta_eh = self._calc_delta_vec(
                        'nd_order2', u, delta_eh, sigmoid_neg_e[i, j] * neg_v_c[i, j, :])
        # delta x & delta codebook
        delta_eh = self._format_vec('nd_order2', delta_eh)
        delta_c = self._format_vec('cnt_order2', delta_c)
        return delta_c / batch_size, delta_eh / batch_size

    def _pretrain_update_graph_by_order1(self, batch):
        # first-order gradients; anchor rows stay frozen during pretraining
        pos_h, pos_t, pos_h_v, neg_t = batch
        batch_size = len(pos_h)
        # order 1
        pos_u = self.embeddings['order1'][pos_h, :]
        pos_v = self.embeddings['order1'][pos_t, :]
        neg_u = self.embeddings['order1'][pos_h_v, :]
        neg_v = self.embeddings['order1'][neg_t, :]
        pos_e = np.sum(pos_u * pos_v, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v, axis=2)  # neg_e.shape = batch_size*negative_ratio
        sigmoid_pos_e = np.array([self._fast_sigmoid(val)
                                  for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val)
                                  for val in neg_e.reshape(-1)]).reshape(neg_e.shape)
        # delta calculation
        delta_eh = list()
        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            if v not in self.anchor_idx:
                delta_eh = self._calc_delta_vec('nd_order1', v, delta_eh,
                                                (sigmoid_pos_e[i] - 1) * pos_u[i, :])
            if u not in self.anchor_idx:
                delta_eh = self._calc_delta_vec('nd_order1', u, delta_eh,
                                                (sigmoid_pos_e[i] - 1) * pos_v[i, :])
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                if v not in self.anchor_idx:
                    delta_eh = self._calc_delta_vec(
                        'nd_order1', v, delta_eh, sigmoid_neg_e[i, j] * neg_u[i, j, :])
                if u not in self.anchor_idx:
                    delta_eh = self._calc_delta_vec(
                        'nd_order1', u, delta_eh, sigmoid_neg_e[i, j] * neg_v[i, j, :])
        delta_eh = self._format_vec('nd_order1', delta_eh)
        return delta_eh / batch_size

    def _cos_sim(self, vec1, vec2):
        return np.dot(vec1, vec2) / np.linalg.norm(vec1) / np.linalg.norm(vec2)

    def _update_graph_by_anchor_reg(self):
        # gradient of the cosine-similarity regularizer that pulls anchor
        # embeddings toward their counterparts in the source network
        delta_eh = list()
        cnt = 0
        for src_nd, target_nd in self.anchors:
            if src_nd not in self.src_embeddings or target_nd not in self.look_up:
                continue
            src_emb = np.array(self.src_embeddings[src_nd])
            if self.order == 2:
                target_emb = self.embeddings['order2'][self.look_up[target_nd]]
            if self.order == 1:
                target_emb = self.embeddings['order1'][self.look_up[target_nd]]
            delta_eh = self._calc_delta_vec(
                'nd_order2', self.look_up[target_nd], delta_eh,
                (self._cos_sim(src_emb, target_emb) * target_emb
                 / np.dot(target_emb, target_emb)
                 - src_emb / np.linalg.norm(src_emb) / np.linalg.norm(target_emb))
                / self._cos_sim(src_emb, target_emb))
            cnt += 1
        if self.order == 2:
            delta_eh = self._format_vec('nd_order2', delta_eh)
        if self.order == 1:
            delta_eh = self._format_vec('nd_order1', delta_eh)
        return delta_eh / cnt

    def _format_vec(self, cal_type, vec):
        # pad with zero rows so the delta matrix covers every touched node
        len_gap = self.idx[cal_type] - len(vec)
        if len_gap > 0:
            for i in range(len_gap):
                if isinstance(vec, list):
                    vec.append(np.zeros(vec[0].shape))
                else:
                    vec = np.vstack((vec, np.zeros(vec[0].shape)))
        return np.array(vec)

    def _calc_delta_vec(self, cal_type, nd, delta, opt_vec):
        # accumulate opt_vec into the delta row assigned to node nd,
        # assigning a new row index on first sight
        if nd not in self.update_dict[cal_type]:
            cur_idx = self.idx[cal_type]
            self.update_dict[cal_type][nd] = cur_idx
            self.update_look_back[cal_type].append(nd)
            self.idx[cal_type] += 1
        else:
            cur_idx = self.update_dict[cal_type][nd]
        if cur_idx >= len(delta):
            for i in range(cur_idx - len(delta)):
                delta.append(np.zeros(opt_vec.shape))
            delta.append(opt_vec)
        else:
            delta[cur_idx] += opt_vec
        return delta

    def _update_graph_by_order2(self, batch):
        # same second-order update as in pretraining, but anchors are free
        pos_h, pos_t, pos_h_v, neg_t = batch
        batch_size = len(pos_h)
        pos_u = self.embeddings['order2'][pos_h, :]
        pos_v_c = self.embeddings['content'][pos_t, :]
        neg_u = self.embeddings['order2'][pos_h_v, :]
        neg_v_c = self.embeddings['content'][neg_t, :]
        pos_e = np.sum(pos_u * pos_v_c, axis=1)
        neg_e = np.sum(neg_u * neg_v_c, axis=2)
        sigmoid_pos_e = np.array([self._fast_sigmoid(val)
                                  for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val)
                                  for val in neg_e.reshape(-1)]).reshape(neg_e.shape)
        delta_eh = list()
        delta_c = list()
        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            delta_c = self._calc_delta_vec('cnt_order2', v, delta_c,
                                           (sigmoid_pos_e[i] - 1) * pos_u[i, :])
            delta_eh = self._calc_delta_vec('nd_order2', u, delta_eh,
                                            (sigmoid_pos_e[i] - 1) * pos_v_c[i, :])
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                delta_c = self._calc_delta_vec(
                    'cnt_order2', v, delta_c, sigmoid_neg_e[i, j] * neg_u[i, j, :])
                delta_eh = self._calc_delta_vec(
                    'nd_order2', u, delta_eh, sigmoid_neg_e[i, j] * neg_v_c[i, j, :])
        delta_eh = self._format_vec('nd_order2', delta_eh)
        delta_c = self._format_vec('cnt_order2', delta_c)
        return delta_c / batch_size, delta_eh / batch_size

    def _update_graph_by_order1(self, batch):
        # same first-order update as in pretraining, but anchors are free
        pos_h, pos_t, pos_h_v, neg_t = batch
        batch_size = len(pos_h)
        pos_u = self.embeddings['order1'][pos_h, :]
        pos_v = self.embeddings['order1'][pos_t, :]
        neg_u = self.embeddings['order1'][pos_h_v, :]
        neg_v = self.embeddings['order1'][neg_t, :]
        pos_e = np.sum(pos_u * pos_v, axis=1)
        neg_e = np.sum(neg_u * neg_v, axis=2)
        sigmoid_pos_e = np.array([self._fast_sigmoid(val)
                                  for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val)
                                  for val in neg_e.reshape(-1)]).reshape(neg_e.shape)
        delta_eh = list()
        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            delta_eh = self._calc_delta_vec('nd_order1', v, delta_eh,
                                            (sigmoid_pos_e[i] - 1) * pos_u[i, :])
            delta_eh = self._calc_delta_vec('nd_order1', u, delta_eh,
                                            (sigmoid_pos_e[i] - 1) * pos_v[i, :])
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                delta_eh = self._calc_delta_vec(
                    'nd_order1', v, delta_eh, sigmoid_neg_e[i, j] * neg_u[i, j, :])
                delta_eh = self._calc_delta_vec(
                    'nd_order1', u, delta_eh, sigmoid_neg_e[i, j] * neg_v[i, j, :])
        delta_eh = self._format_vec('nd_order1', delta_eh)
        return delta_eh / batch_size

    def _mat_add(self, mat1, mat2):
        # pad the shorter matrix with zero rows, then add
        len_gap = len(mat1) - len(mat2)
        if len_gap > 0:
            for i in range(len_gap):
                mat2 = np.vstack((mat2, np.zeros(mat2[0, :].shape)))
        else:
            for i in range(-len_gap):
                mat1 = np.vstack((mat1, np.zeros(mat1[0, :].shape)))
        return mat1 + mat2

    def get_anchor_reg_loss(self):
        cos_sim_list = list()
        for src_nd, target_nd in self.anchors:
            if src_nd not in self.src_embeddings or target_nd not in self.look_up:
                continue
            src_emb = np.array(self.src_embeddings[src_nd])
            target_emb = self.embeddings['order2'][self.look_up[target_nd]]
            cos_sim_list.append(self._cos_sim(src_emb, target_emb))
        return -np.mean(cos_sim_list)

    def get_graph_loss_by_order2(self, batch):
        pos_h, pos_t, pos_h_v, neg_t = batch
        # order 2
        pos_u = self.embeddings['order2'][pos_h, :]
        pos_v_c = self.embeddings['content'][pos_t, :]
        neg_u = self.embeddings['order2'][pos_h_v, :]
        neg_v_c = self.embeddings['content'][neg_t, :]
        pos_e = np.sum(pos_u * pos_v_c, axis=1)
        neg_e = np.sum(neg_u * neg_v_c, axis=2)
        sigmoid_pos_e = np.array([self._fast_sigmoid(val)
                                  for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val)
                                  for val in neg_e.reshape(-1)]).reshape(neg_e.shape)
        return -np.mean(np.log(sigmoid_pos_e)
                        + np.sum(np.log(1 - sigmoid_neg_e), axis=1))

    def get_graph_loss_by_order1(self, batch):
        pos_h, pos_t, pos_h_v, neg_t = batch
        # order 1
        pos_u = self.embeddings['order1'][pos_h, :]
        pos_v = self.embeddings['order1'][pos_t, :]
        neg_u = self.embeddings['order1'][pos_h_v, :]
        neg_v = self.embeddings['order1'][neg_t, :]
        pos_e = np.sum(pos_u * pos_v, axis=1)
        neg_e = np.sum(neg_u * neg_v, axis=2)
        sigmoid_pos_e = np.array([self._fast_sigmoid(val)
                                  for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val)
                                  for val in neg_e.reshape(-1)]).reshape(neg_e.shape)
        return -np.mean(np.log(sigmoid_pos_e)
                        + np.sum(np.log(1 - sigmoid_neg_e), axis=1))

    def get_cur_batch_loss(self, t, batch):
        DISPLAY_EPOCH = 1
        if t % DISPLAY_EPOCH == 0:
            loss_order_1 = 0.0
            loss_order_2 = 0.0
            if self.order == 1 or self.order == 3:
                loss_order_1 += self.get_graph_loss_by_order1(batch)
            if self.order == 2 or self.order == 3:
                anchor_loss = self.get_anchor_reg_loss()
                loss_order_2 += self.get_graph_loss_by_order2(batch) + anchor_loss
            if self.order == 1:
                self.logger.info('Finish processing batch {} and loss from order 1:{}'
                                 .format(t, loss_order_1))
            elif self.order == 2:
                self.logger.info(
                    'Finish processing batch {} and loss from order 2:{} and anchor loss:{}'
                    .format(t, loss_order_2, anchor_loss))
            elif self.order == 3:
                self.logger.info('Finish processing batch {} and loss from order 3:{}'
                                 .format(t, loss_order_1 + loss_order_2))

    def update_vec(self, cal_type, h_delta, delta, embeddings, len_delta, t):
        # adagrad update restricted to the rows touched in this batch
        h_delta[self.update_look_back[cal_type][:len_delta], :] += delta**2
        embeddings[self.update_look_back[cal_type][:len_delta], :] -= \
            self.lr / np.sqrt(h_delta[self.update_look_back[cal_type][:len_delta], :]) * delta
        return h_delta, embeddings

    def update_vec_by_adam(self, cal_type, m, v, delta, embeddings, len_delta, t):
        # adam update restricted to the rows touched in this batch
        self.beta1 = .9
        self.beta2 = .999
        m[self.update_look_back[cal_type][:len_delta], :] = \
            self.beta1 * m[self.update_look_back[cal_type][:len_delta], :] \
            + (1 - self.beta1) * delta
        v[self.update_look_back[cal_type][:len_delta], :] = \
            self.beta2 * v[self.update_look_back[cal_type][:len_delta], :] \
            + (1 - self.beta2) * (delta**2)
        m_ = m[self.update_look_back[cal_type][:len_delta], :] / (1 - self.beta1**t)
        v_ = v[self.update_look_back[cal_type][:len_delta], :] / (1 - self.beta2**t)
        embeddings[self.update_look_back[cal_type][:len_delta], :] -= \
            self.lr * m_ / (np.sqrt(v_) + self.epsilon)
        return m, v, embeddings

    def train_one_epoch(self):
        DISPLAY_EPOCH = 1000
        order = self.order
        batches = self.batch_iter()
        opt_type = 'adam'
        for batch in batches:
            self.idx = defaultdict(int)
            self.update_look_back = defaultdict(list)
            self.update_dict = defaultdict(dict)
            if order == 1 or order == 3:
                delta_eh_o1 = self._update_graph_by_order1(batch)
                len_delta = len(delta_eh_o1)
                if opt_type == 'adagrad':
                    self.h_delta['order1'], self.embeddings['order1'] = \
                        self.update_vec('nd_order1', self.h_delta['order1'], delta_eh_o1,
                                        self.embeddings['order1'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order1'], self.v['order1'], self.embeddings['order1'] = \
                        self.update_vec_by_adam('nd_order1', self.m['order1'],
                                                self.v['order1'], delta_eh_o1,
                                                self.embeddings['order1'],
                                                len_delta, self.t)
            if order == 2 or order == 3:
                delta_c, delta_eh_o2 = self._update_graph_by_order2(batch)
                delta_eh_anchor_reg = self._update_graph_by_anchor_reg()
                delta_eh_o2 = self._format_vec('nd_order2', delta_eh_o2)
                len_delta = len(delta_eh_o2)
                if opt_type == 'adagrad':
                    self.h_delta['order2'], self.embeddings['order2'] = \
                        self.update_vec('nd_order2', self.h_delta['order2'],
                                        delta_eh_o2 + self.gamma * delta_eh_anchor_reg,
                                        self.embeddings['order2'], len_delta, self.t)
                if opt_type == 'adam':
                    self.m['order2'], self.v['order2'], self.embeddings['order2'] = \
                        self.update_vec_by_adam('nd_order2', self.m['order2'],
                                                self.v['order2'],
                                                delta_eh_o2 + self.gamma * delta_eh_anchor_reg,
                                                self.embeddings['order2'],
                                                len_delta, self.t)
                len_content = len(delta_c)
                if opt_type == 'adagrad':
                    self.h_delta['content'], self.embeddings['content'] = \
                        self.update_vec('cnt_order2', self.h_delta['content'], delta_c,
                                        self.embeddings['content'], len_content, self.t)
                if opt_type == 'adam':
                    self.m['content'], self.v['content'], self.embeddings['content'] = \
                        self.update_vec_by_adam('cnt_order2', self.m['content'],
                                                self.v['content'], delta_c,
                                                self.embeddings['content'],
                                                len_content, self.t)
            if (self.t - 1) % DISPLAY_EPOCH == 0:
                self.get_cur_batch_loss(self.t, batch)
            self.t += 1
        self.cur_epoch += 1

    def get_random_node_pairs(self, i, shuffle_indices, edges, edge_set, numNodes):
        # balance the appearance of edges according to edge_prob
        if not random.random() < self.edge_prob[shuffle_indices[i]]:
            shuffle_indices[i] = self.edge_alias[shuffle_indices[i]]
        cur_h = edges[shuffle_indices[i]][0]
        head = cur_h * numNodes
        cur_t = edges[shuffle_indices[i]][1]
        cur_h_v = []
        cur_neg_t = []
        for j in range(self.negative_ratio):
            rn = self.sampling_table[random.randint(0, self.table_size - 1)]
            while head + rn in edge_set or cur_h == rn or rn in cur_neg_t:
                rn = self.sampling_table[random.randint(0, self.table_size - 1)]
            cur_h_v.append(cur_h)
            cur_neg_t.append(rn)
        return cur_h, cur_t, cur_h_v, cur_neg_t

    def batch_iter(self):
        numNodes = self.node_size
        edges = [(self.look_up[x[0]], self.look_up[x[1]]) for x in self.g.G.edges()]
        data_size = self.g.G.number_of_edges()
        edge_set = set([x[0] * numNodes + x[1] for x in edges])
        shuffle_indices = np.random.permutation(np.arange(data_size))

        start_index = 0
        end_index = min(start_index + self.batch_size, data_size)
        while start_index < data_size:
            pos_h = []
            pos_t = []
            pos_h_v = []
            neg_t = []
            for i in range(start_index, end_index):
                cur_h, cur_t, cur_h_v, cur_neg_t = self.get_random_node_pairs(
                    i, shuffle_indices, edges, edge_set, numNodes)
                pos_h.append(cur_h)
                pos_t.append(cur_t)
                pos_h_v.append(cur_h_v)
                neg_t.append(cur_neg_t)
            ret = (pos_h, pos_t, pos_h_v, neg_t)
            start_index = end_index
            end_index = min(start_index + self.batch_size, data_size)
            yield ret

    def _gen_sampling_table(self):
        table_size = self.table_size
        power = 0.75
        numNodes = self.node_size

        print("Pre-processing for non-uniform negative sampling!")
        self.node_degree = np.zeros(numNodes)  # out degree
        look_up = self.g.look_up_dict
        for edge in self.g.G.edges():
            self.node_degree[look_up[edge[0]]] += self.g.G[edge[0]][edge[1]]["weight"]

        # unigram^0.75 table for negative node sampling
        norm = sum([math.pow(self.node_degree[i], power) for i in range(numNodes)])
        self.sampling_table = np.zeros(int(table_size), dtype=np.uint32)
        p = 0
        i = 0
        for j in range(numNodes):
            p += float(math.pow(self.node_degree[j], power)) / norm
            while i < table_size and float(i) / table_size < p:
                self.sampling_table[i] = j
                i += 1

        # alias table so edges can be drawn proportionally to their weight
        data_size = self.g.G.number_of_edges()
        self.edge_alias = np.zeros(data_size, dtype=np.int32)
        self.edge_prob = np.zeros(data_size, dtype=np.float32)
        large_block = np.zeros(data_size, dtype=np.int32)
        small_block = np.zeros(data_size, dtype=np.int32)

        total_sum = sum([self.g.G[edge[0]][edge[1]]["weight"]
                         for edge in self.g.G.edges()])
        norm_prob = [self.g.G[edge[0]][edge[1]]["weight"] * data_size / total_sum
                     for edge in self.g.G.edges()]
        num_small_block = 0
        num_large_block = 0
        cur_small_block = 0
        cur_large_block = 0
        for k in range(data_size - 1, -1, -1):
            if norm_prob[k] < 1:
                small_block[num_small_block] = k
                num_small_block += 1
            else:
                large_block[num_large_block] = k
                num_large_block += 1
        while num_small_block and num_large_block:
            num_small_block -= 1
            cur_small_block = small_block[num_small_block]
            num_large_block -= 1
            cur_large_block = large_block[num_large_block]
            self.edge_prob[cur_small_block] = norm_prob[cur_small_block]
            self.edge_alias[cur_small_block] = cur_large_block
            norm_prob[cur_large_block] = norm_prob[cur_large_block] \
                + norm_prob[cur_small_block] - 1
            if norm_prob[cur_large_block] < 1:
                small_block[num_small_block] = cur_large_block
                num_small_block += 1
            else:
                large_block[num_large_block] = cur_large_block
                num_large_block += 1
        while num_large_block:
            num_large_block -= 1
            self.edge_prob[large_block[num_large_block]] = 1
        while num_small_block:
            num_small_block -= 1
            self.edge_prob[small_block[num_small_block]] = 1

    def save_embeddings(self, outfile):
        vectors = self.get_vectors()
        for c in vectors.keys():
            if 'node_embeddings' in c or 'content_embeddings' in c:
                # outfile.[node_embeddings_order1/2 | content_embeddings]
                fout = open('{}.{}'.format(outfile, c), 'w')
                node_num = len(vectors[c].keys())
                fout.write("{} {}\n".format(node_num, self.rep_size))
                for node, vec in vectors[c].items():
                    fout.write("{} {}\n".format(node, ' '.join([str(x) for x in vec])))
                fout.close()
        if self.order == 3:
            # concatenate first- and second-order embeddings per node
            fout = open('{}.node_embedding_all'.format(outfile), 'w')
            node_num = len(vectors['node_embeddings_order1'].keys())
            fout.write("{} {}\n".format(node_num, self.rep_size * 2))
            for node, vec in vectors['node_embeddings_order1'].items():
                fout.write("{} {} {}\n".format(
                    node, ' '.join([str(x) for x in vec]),
                    ' '.join([str(x)
                              for x in vectors['node_embeddings_order2'][node]])))
            fout.close()

    def get_one_embeddings(self, embeddings):
        vectors = dict()
        look_back = self.g.look_back_list
        for i, embedding in enumerate(embeddings):
            vectors[look_back[i]] = embedding
        return vectors

    def get_vectors(self):
        order = self.order
        ret = dict()
        ret['node_embeddings_order1'] = self.get_one_embeddings(self.embeddings['order1'])
        ret['node_embeddings_order2'] = self.get_one_embeddings(self.embeddings['order2'])
        if order == 2 or order == 3:
            ret['content_embeddings'] = self.get_one_embeddings(self.embeddings['content'])
        return ret
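# --- Illustrative sketch (not part of the original module) ---
# _gen_sampling_table builds an alias table (edge_prob / edge_alias) so a
# weighted edge can be drawn in O(1): pick an index uniformly, keep it with
# probability edge_prob[i], otherwise jump to edge_alias[i]. The same idea,
# self-contained and with toy weights (construction details differ slightly
# from the in-place version above):
def _build_alias_sketch(weights):
    n = len(weights)
    prob = np.array(weights, dtype=np.float64) * n / np.sum(weights)
    alias = np.zeros(n, dtype=np.int32)
    small = [i for i in range(n) if prob[i] < 1.]
    large = [i for i in range(n) if prob[i] >= 1.]
    while small and large:
        s, l = small.pop(), large.pop()
        alias[s] = l
        prob[l] += prob[s] - 1.
        (small if prob[l] < 1. else large).append(l)
    return prob, alias


def _draw_sketch(prob, alias):
    i = random.randint(0, len(prob) - 1)
    return i if random.random() < prob[i] else alias[i]


if __name__ == '__main__':
    prob, alias = _build_alias_sketch([4., 1., 1.])
    counts = [0, 0, 0]
    for _ in range(60000):
        counts[_draw_sketch(prob, alias)] += 1
    print(counts)  # roughly proportional to the weights [4, 1, 1]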
class _CROSSMNA(object):
    def __init__(self, layer_graphs, anchor_file, lr=.001, nd_rep_size=16,
                 layer_rep_size=16, batch_size=100, negative_ratio=5,
                 table_size=1e8, log_file='log', last_emb_file=None):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = int(table_size)
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6
        self._init_sigmoid_table()

        self.anchors, num_anchors = self._read_anchors(anchor_file, ',')
        self.logger.info('Number of anchors:%d' % num_anchors)

        self.num_layers = len(layer_graphs)   # number of networks
        self.layer_graphs = layer_graphs      # graphs in different layers
        self.nd_rep_size = nd_rep_size        # representation size of a node
        self.layer_rep_size = layer_rep_size  # representation size of a layer

        self.idx = 0  # for speeding up calculation
        self.update_dict = defaultdict(int)
        self.update_look_back = list()

        self._build_dict(layer_graphs, self.anchors)
        self.logger.info('Number of nodes:%d' % len(self.look_back))
        self.node_size = len(self.look_back)
        self._init_params(self.node_size, self.num_layers, nd_rep_size,
                          layer_rep_size, last_emb_file)

        self.lr = lr
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio
        self._gen_sampling_table()

    def _build_dict(self, layer_graphs, anchors):
        # anchor nodes from different layers share one index
        self.look_up = defaultdict(int)
        self.look_back = list()
        idx = 0
        for i in range(self.num_layers):
            for nd in layer_graphs[i].G.nodes():
                if nd in self.look_up:
                    continue
                if nd in self.anchors:
                    for ac_nd in self.anchors[nd]:
                        self.look_up[ac_nd] = idx
                self.look_up[nd] = idx
                self.look_back.append(nd)
                idx += 1

    def _init_params(self, node_size, n_layers, nd_rep_size, layer_rep_size,
                     last_emb_file):
        self.params = dict()
        self.params['node'] = np.random.normal(0, 1, (node_size, nd_rep_size))
        self.params['layer'] = np.random.normal(0, 1, (n_layers, layer_rep_size))
        self.params['W'] = np.random.normal(0, 1, (nd_rep_size, layer_rep_size))
        if last_emb_file:
            self.params['node'] = self._init_emb_matrix(
                self.params['node'], '{}.node'.format(last_emb_file))
            self.params['layer'] = self._init_emb_matrix(
                self.params['layer'], '{}.layer'.format(last_emb_file))
            self.params['W'] = self._init_emb_matrix(
                self.params['W'], '{}.W'.format(last_emb_file))
        # adagrad accumulators
        self.h_delta = dict()
        self.h_delta['node'] = np.zeros((node_size, nd_rep_size))
        self.h_delta['layer'] = np.zeros((n_layers, layer_rep_size))
        self.h_delta['W'] = np.zeros((nd_rep_size, layer_rep_size))
        # adam moments
        self.m = dict()
        self.m['node'] = np.zeros((node_size, nd_rep_size))
        self.m['layer'] = np.zeros((n_layers, layer_rep_size))
        self.m['W'] = np.zeros((nd_rep_size, layer_rep_size))
        self.v = dict()
        self.v['node'] = np.zeros((node_size, nd_rep_size))
        self.v['layer'] = np.zeros((n_layers, layer_rep_size))
        self.v['W'] = np.zeros((nd_rep_size, layer_rep_size))
        self.t = 1

    def _init_emb_matrix(self, emb, emb_file):
        with open(emb_file, 'r') as embed_handler:
            for ln in embed_handler:
                elems = ln.strip().split()
                if len(elems) <= 2:
                    continue
                emb[self.look_up[elems[0]]] = list(map(float, elems[1:]))
        return emb

    def _read_anchors(self, anchor_file, delimiter):
        # each line lists the aliases of one identity across layers; node ids
        # are prefixed with their layer index ("<layer>-<node>")
        anchors = dict()
        num_anchors = 0
        with open(anchor_file, 'r') as anchor_handler:
            for ln in anchor_handler:
                elems = ln.strip().split(delimiter)
                for i in range(len(elems)):
                    elems[i] = '{}-{}'.format(i, elems[i])
                num_anchors += len(elems) - 1
                for k in range(len(elems)):
                    anchors[elems[k]] = elems[:k] + elems[k + 1:]
        return anchors, num_anchors

    def _init_sigmoid_table(self):
        for k in range(self.sigmoid_table_size):
            x = 2. * self.SIGMOID_BOUND * k / self.sigmoid_table_size - self.SIGMOID_BOUND
            self.sigmoid_table[k] = 1. / (1 + np.exp(-x))

    def _fast_sigmoid(self, val):
        # piecewise-constant sigmoid lookup; clamp outside [-BOUND, BOUND]
        if val > self.SIGMOID_BOUND:
            return 1 - self.epsilon
        elif val < -self.SIGMOID_BOUND:
            return self.epsilon
        k = int((val + self.SIGMOID_BOUND) * self.sigmoid_table_size
                / self.SIGMOID_BOUND / 2)
        return self.sigmoid_table[k]

    def _calc_delta_vec(self, nd, delta, opt_vec):
        if nd not in self.update_dict:
            cur_idx = self.idx
            self.update_dict[nd] = cur_idx
            self.update_look_back.append(nd)
            self.idx += 1
        else:
            cur_idx = self.update_dict[nd]
        if cur_idx >= len(delta):
            for i in range(cur_idx - len(delta)):
                delta.append(np.zeros(opt_vec.shape))
            delta.append(opt_vec)
        else:
            delta[cur_idx] += opt_vec
        return delta

    def _update_intra_vec(self, batch):
        # intra-layer LINE-style update; a node vector in layer l is
        # node_emb.dot(W) + layer_emb[l]
        pos, neg = batch
        batch_size = len(pos['h'])
        pos_u = np.dot(self.params['node'][pos['h'], :], self.params['W']) \
            + self.params['layer'][pos['h_layer'], :]
        pos_v = np.dot(self.params['node'][pos['t'], :], self.params['W']) \
            + self.params['layer'][pos['t_layer'], :]
        neg_u = np.dot(self.params['node'][neg['h'], :], self.params['W']) \
            + self.params['layer'][neg['h_layer'], :]
        neg_v = np.dot(self.params['node'][neg['t'], :], self.params['W']) \
            + self.params['layer'][neg['t_layer'], :]
        pos_e = np.sum(pos_u * pos_v, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v, axis=2)  # neg_e.shape = batch_size*negative_ratio
        sigmoid_pos_e = np.array([self._fast_sigmoid(val)
                                  for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val)
                                  for val in neg_e.reshape(-1)]).reshape(neg_e.shape)
        # delta calculation
        delta_eh = list()
        delta_l = np.zeros((self.num_layers, self.layer_rep_size))
        for i in range(len(pos['t'])):
            u, v = pos['h'][i], pos['t'][i]
            u_layer, v_layer = pos['h_layer'][i], pos['t_layer'][i]
            delta_eh = self._calc_delta_vec(v, delta_eh,
                                            (sigmoid_pos_e[i] - 1) * pos_u[i, :])
            delta_eh = self._calc_delta_vec(u, delta_eh,
                                            (sigmoid_pos_e[i] - 1) * pos_v[i, :])
            # accumulate layer-embedding gradients
            delta_l[v_layer] += (sigmoid_pos_e[i] - 1) * pos_u[i, :]
            delta_l[u_layer] += (sigmoid_pos_e[i] - 1) * pos_v[i, :]
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = neg['h'][i][j], neg['t'][i][j]
                u_layer, v_layer = neg['h_layer'][i][j], neg['t_layer'][i][j]
                delta_eh = self._calc_delta_vec(v, delta_eh,
                                                sigmoid_neg_e[i, j] * neg_u[i, j, :])
                delta_eh = self._calc_delta_vec(u, delta_eh,
                                                sigmoid_neg_e[i, j] * neg_v[i, j, :])
                delta_l[v_layer] += sigmoid_neg_e[i, j] * neg_u[i, j, :]
                delta_l[u_layer] += sigmoid_neg_e[i, j] * neg_v[i, j, :]
        delta_eh = np.array(delta_eh)
        delta_l = np.array(delta_l)
        # delta node, delta W, delta layer
        return np.dot(delta_eh, self.params['W'].T) / (batch_size * (1 + self.negative_ratio)) / 2, \
            np.dot(self.params['node'][self.update_look_back, :].T, delta_eh / batch_size), \
            np.sum(delta_l, axis=0) / (batch_size * (1 + self.negative_ratio)) * self.num_layers

    def _get_loss(self, batch):
        pos, neg = batch
        batch_size = len(pos['h'])
        pos_u = np.dot(self.params['node'][pos['h'], :], self.params['W']) \
            + self.params['layer'][pos['h_layer'], :]
        pos_v = np.dot(self.params['node'][pos['t'], :], self.params['W']) \
            + self.params['layer'][pos['t_layer'], :]
        neg_u = np.dot(self.params['node'][neg['h'], :], self.params['W']) \
            + self.params['layer'][neg['h_layer'], :]
        neg_v = np.dot(self.params['node'][neg['t'], :], self.params['W']) \
            + self.params['layer'][neg['t_layer'], :]
        pos_e = np.sum(pos_u * pos_v, axis=1)
        neg_e = np.sum(neg_u * neg_v, axis=2)
        sigmoid_pos_e = np.array([self._fast_sigmoid(val)
                                  for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val)
                                  for val in neg_e.reshape(-1)]).reshape(neg_e.shape)
        return -np.mean(np.log(sigmoid_pos_e)
                        + np.sum(np.log(1 - sigmoid_neg_e), axis=1))

    def get_cur_batch_loss(self, t, batch):
        loss = self._get_loss(batch)
        self.logger.info('Finish processing batch {} and loss:{}'.format(t, loss))
        return loss

    def update_node_vec(self, h_delta, delta, embeddings, len_delta):
        # adagrad update restricted to the rows touched in this batch
        h_delta[self.update_look_back[:len_delta], :] += delta**2
        embeddings[self.update_look_back[:len_delta], :] -= \
            self.lr / np.sqrt(h_delta[self.update_look_back[:len_delta], :]) * delta
        return h_delta, embeddings

    def update_vec(self, h_delta, delta, embeddings):
        # adagrad update for densely-updated parameters (W, layer)
        h_delta += delta**2
        embeddings -= self.lr / np.sqrt(h_delta) * delta
        return h_delta, embeddings

    def update_node_vec_by_adam(self, m, v, delta, embeddings, t):
        self.beta1 = .9
        self.beta2 = .999
        m[self.update_look_back, :] = \
            self.beta1 * m[self.update_look_back, :] + (1 - self.beta1) * delta
        v[self.update_look_back, :] = \
            self.beta2 * v[self.update_look_back, :] + (1 - self.beta2) * (delta**2)
        m_ = m[self.update_look_back, :] / (1 - self.beta1**t)
        v_ = v[self.update_look_back, :] / (1 - self.beta2**t)
        embeddings[self.update_look_back, :] -= self.lr * m_ / (np.sqrt(v_) + self.epsilon)
        return m, v, embeddings

    def update_vec_by_adam(self, m, v, delta, embeddings, t):
        self.beta1 = .9
        self.beta2 = .999
        m = self.beta1 * m + (1 - self.beta1) * delta
        v = self.beta2 * v + (1 - self.beta2) * (delta**2)
        m_ = m / (1 - self.beta1**t)
        v_ = v / (1 - self.beta2**t)
        embeddings -= self.lr * m_ / (np.sqrt(v_) + self.epsilon)
        return m, v, embeddings

    def train_one_epoch(self):
        DISPLAY_EPOCH = 1000
        opt_type = 'adam'
        loss = 0
        batches = self.batch_iter()
        for batch in batches:
            self.idx = 0
            self.update_look_back = list()
            self.update_dict = defaultdict(int)
            # delta node, delta W, delta layer
            delta_node, delta_W, delta_layer = self._update_intra_vec(batch)
            if opt_type == 'adagrad':
                self.h_delta['node'], self.params['node'] = \
                    self.update_node_vec(self.h_delta['node'], delta_node,
                                         self.params['node'], len(delta_node))
                self.h_delta['W'], self.params['W'] = \
                    self.update_vec(self.h_delta['W'], delta_W, self.params['W'])
                self.h_delta['layer'], self.params['layer'] = \
                    self.update_vec(self.h_delta['layer'], delta_layer,
                                    self.params['layer'])
            if opt_type == 'adam':
                self.m['node'], self.v['node'], self.params['node'] = \
                    self.update_node_vec_by_adam(self.m['node'], self.v['node'],
                                                 delta_node, self.params['node'],
                                                 self.t)
                self.m['W'], self.v['W'], self.params['W'] = \
                    self.update_vec_by_adam(self.m['W'], self.v['W'], delta_W,
                                            self.params['W'], self.t)
                self.m['layer'], self.v['layer'], self.params['layer'] = \
                    self.update_vec_by_adam(self.m['layer'], self.v['layer'],
                                            delta_layer, self.params['layer'],
                                            self.t)
            if (self.t - 1) % DISPLAY_EPOCH == 0:
                loss += self.get_cur_batch_loss(self.t, batch)
            self.t += 1
        self.cur_epoch += 1

    def _get_nd_layer(self, idx):
        # node ids are formatted "<layer>-<name>"; recover the layer index
        nd = self.look_back[idx]
        p = re.compile(r'(^\d+)-.*?')
        m = p.match(nd)
        if m:
            return int(m.group(1))
        return -1

    def layer_adjust(self, h_idx, t_idx):
        return self._get_nd_layer(h_idx) \
            if self._get_nd_layer(h_idx) > self._get_nd_layer(t_idx) \
            else self._get_nd_layer(t_idx)

    def get_random_node_pairs(self, i, shuffle_indices, edges, edge_set, numNodes):
        # balance the appearance of edges according to edge_prob
        if not random.random() < self.edge_prob[shuffle_indices[i]]:
            shuffle_indices[i] = self.edge_alias[shuffle_indices[i]]
        pos = dict()
        pos['h'] = edges[shuffle_indices[i]][0]
        pos['t'] = edges[shuffle_indices[i]][1]
        pos['h_layer'] = self.layer_adjust(pos['h'], pos['t'])
        pos['t_layer'] = self.layer_adjust(pos['h'], pos['t'])
        head = pos['h'] * numNodes
        neg = defaultdict(list)
        for j in range(self.negative_ratio):
            rn = self.sampling_table[random.randint(0, self.table_size - 1)]
            while head + rn in edge_set or pos['h'] == rn or rn in neg['t']:
                rn = self.sampling_table[random.randint(0, self.table_size - 1)]
            neg['h'].append(pos['h'])
            neg['t'].append(rn)
            neg['h_layer'].append(self.layer_adjust(pos['h'], rn))
            neg['t_layer'].append(self.layer_adjust(pos['h'], rn))
        return pos, neg

    def batch_iter(self):
        edges = []
        for k in range(self.num_layers):
            g = self.layer_graphs[k]
            edges += [(self.look_up[x[0]], self.look_up[x[1]]) for x in g.G.edges()]
        data_size = len(edges)
        edge_set = set([x[0] * self.node_size + x[1] for x in edges])
        shuffle_indices = np.random.permutation(np.arange(data_size))

        start_index = 0
        end_index = min(start_index + self.batch_size, data_size)
        while start_index < data_size:
            pos = defaultdict(list)
            neg = defaultdict(list)
            for i in range(start_index, end_index):
                cur_pos, cur_neg = self.get_random_node_pairs(
                    i, shuffle_indices, edges, edge_set, self.node_size)
                pos['h'].append(cur_pos['h'])
                pos['h_layer'].append(cur_pos['h_layer'])
                pos['t'].append(cur_pos['t'])
                pos['t_layer'].append(cur_pos['t_layer'])
                neg['h'].append(cur_neg['h'])
                neg['h_layer'].append(cur_neg['h_layer'])
                neg['t'].append(cur_neg['t'])
                neg['t_layer'].append(cur_neg['t_layer'])
            ret = (pos, neg)
            start_index = end_index
            end_index = min(start_index + self.batch_size, data_size)
            yield ret

    def _gen_sampling_table(self):
        table_size = self.table_size
        power = 0.75
        look_up = self.look_up
        numNodes = self.node_size

        print("Pre-processing for non-uniform negative sampling!")
        node_degree = np.zeros(numNodes)  # out degree
        edges = []
        for k in range(self.num_layers):
            g = self.layer_graphs[k]
            edges += [(look_up[x[0]], look_up[x[1]], g.G[x[0]][x[1]]['weight'])
                      for x in g.G.edges()]
        for edge in edges:
            node_degree[edge[0]] += edge[2]

        norm = sum([math.pow(node_degree[i], power) for i in range(numNodes)])
        self.sampling_table = np.zeros(int(table_size), dtype=np.uint32)
        p = 0
        i = 0
        for j in range(numNodes):
            p += float(math.pow(node_degree[j], power)) / norm
            while i < table_size and float(i) / table_size < p:
self.sampling_table[i] = j i += 1 # print(self.sampling_table) data_size = len(edges) self.edge_alias = np.zeros(data_size, dtype=np.int32) self.edge_prob = np.zeros(data_size, dtype=np.float32) large_block = np.zeros(data_size, dtype=np.int32) small_block = np.zeros(data_size, dtype=np.int32) total_sum = sum([edge[2] for edge in edges]) norm_prob = [edge[2] * data_size / total_sum for edge in edges] num_small_block = 0 num_large_block = 0 cur_small_block = 0 cur_large_block = 0 for k in range(data_size - 1, -1, -1): if norm_prob[k] < 1: small_block[num_small_block] = k num_small_block += 1 else: large_block[num_large_block] = k num_large_block += 1 while num_small_block and num_large_block: num_small_block -= 1 cur_small_block = small_block[num_small_block] num_large_block -= 1 cur_large_block = large_block[num_large_block] self.edge_prob[cur_small_block] = norm_prob[cur_small_block] self.edge_alias[cur_small_block] = cur_large_block norm_prob[cur_large_block] = norm_prob[ cur_large_block] + norm_prob[cur_small_block] - 1 if norm_prob[cur_large_block] < 1: small_block[num_small_block] = cur_large_block num_small_block += 1 else: large_block[num_large_block] = cur_large_block num_large_block += 1 while num_large_block: num_large_block -= 1 self.edge_prob[large_block[num_large_block]] = 1 while num_small_block: num_small_block -= 1 self.edge_prob[small_block[num_small_block]] = 1 def get_one_embeddings(self, params): vectors = dict() look_back = self.look_back for i, param in enumerate(params): vectors[look_back[i]] = param return vectors def get_vectors(self): ret = dict() ret['node'] = self.get_one_embeddings(self.params['node']) ret['W'] = self.params['W'] ret['layer'] = self.params['layer'] return ret
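# The sigmoid table used above trades a little accuracy for speed: values are
# precomputed on [-SIGMOID_BOUND, SIGMOID_BOUND] and clamped outside that range.
# A minimal standalone sketch of the same technique (the names below are
# illustrative, not part of the classes in this file):

import numpy as np

SIGMOID_BOUND = 6
TABLE_SIZE = 1000
_table = 1. / (1 + np.exp(-(2 * SIGMOID_BOUND * np.arange(TABLE_SIZE) / TABLE_SIZE - SIGMOID_BOUND)))

def fast_sigmoid(val):
    """Table-lookup approximation of 1/(1+exp(-val))."""
    if val > SIGMOID_BOUND:
        return 1 - 1e-7
    if val < -SIGMOID_BOUND:
        return 1e-7
    return _table[int((val + SIGMOID_BOUND) * TABLE_SIZE / (2 * SIGMOID_BOUND))]

# e.g. fast_sigmoid(0.0) is ~0.5 and fast_sigmoid(10.) returns the clamp 1 - 1e-7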
class _FFVM(object):
    def __init__(self, graph, lr=.001, rep_size=128, batch_size=100,
                 negative_ratio=5, order=3, table_size=1e8,
                 log_file='log', last_emb_file=None):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = table_size
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6
        self._init_simgoid_table()

        self.g = graph
        self.look_up = self.g.look_up_dict
        self.idx = defaultdict(int)
        self.update_dict = defaultdict(dict)
        self.update_look_back = defaultdict(list)
        self.node_size = self.g.node_size
        self.rep_size = rep_size
        self._init_params(self.node_size, rep_size, last_emb_file)
        self.order = order
        self.lr = lr
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio

    def _init_params(self, node_size, rep_size, last_emb_file):
        self.embeddings = dict()
        self.embeddings['node'] = np.random.normal(0, 1, (node_size, rep_size))
        self.embeddings['content'] = np.random.normal(0, 1, (node_size, rep_size))
        if last_emb_file:
            self.embeddings['node'] = self._init_emb_matrix(
                self.embeddings['node'], '{}.node_embeddings'.format(last_emb_file))
            self.embeddings['content'] = self._init_emb_matrix(
                self.embeddings['content'], '{}.content_embeddings'.format(last_emb_file))
        # extra all-zero row so index -1 ("-1" padding nodes) maps to a zero vector
        self.embeddings['node'] = np.vstack((self.embeddings['node'], np.zeros(rep_size)))
        # adagrad
        self.h_delta = dict()
        self.h_delta['node'] = np.zeros((node_size, rep_size))
        self.h_delta['content'] = np.zeros((node_size, rep_size))
        # adam
        self.m = dict()
        self.m['node'] = np.zeros((node_size, rep_size))
        self.m['content'] = np.zeros((node_size, rep_size))
        self.v = dict()
        self.v['node'] = np.zeros((node_size, rep_size))
        self.v['content'] = np.zeros((node_size, rep_size))
        self.t = 1

    def _init_emb_matrix(self, emb, emb_file):
        with open(emb_file, 'r') as embed_handler:
            for ln in embed_handler:
                elems = ln.strip().split()
                if len(elems) <= 2:
                    continue
                emb[self.look_up[elems[0]]] = list(map(float, elems[1:]))
        return emb

    def _init_simgoid_table(self):
        for k in range(self.sigmoid_table_size):
            x = 2 * self.SIGMOID_BOUND * k / self.sigmoid_table_size - self.SIGMOID_BOUND
            self.sigmoid_table[k] = 1. / (1 + np.exp(-x))

    def _fast_sigmoid(self, val):
        # table lookup approximation of 1./(1+np.exp(-val))
        if val > self.SIGMOID_BOUND:
            return 1 - self.epsilon
        elif val < -self.SIGMOID_BOUND:
            return self.epsilon
        k = int((val + self.SIGMOID_BOUND) * self.sigmoid_table_size / self.SIGMOID_BOUND / 2)
        return self.sigmoid_table[k]

    def _format_vec(self, cal_type, vec):
        # pad with zero vectors so vec covers every index touched in this batch
        len_gap = self.idx[cal_type] - len(vec)
        if len_gap > 0:
            for i in range(len_gap):
                vec.append(np.zeros(vec[0].shape))
        return np.array(vec)

    def _calc_delta_vec(self, cal_type, nd, delta, opt_vec):
        if nd not in self.update_dict[cal_type]:
            cur_idx = self.idx[cal_type]
            self.update_dict[cal_type][nd] = cur_idx
            self.update_look_back[cal_type].append(nd)
            self.idx[cal_type] += 1
        else:
            cur_idx = self.update_dict[cal_type][nd]
        if cur_idx >= len(delta):
            for i in range(cur_idx - len(delta)):
                delta.append(np.zeros(opt_vec.shape))
            delta.append(opt_vec)
        else:
            delta[cur_idx] += opt_vec
        return delta

    def _update_graph(self, batch):
        sp_nds, sp_neighbors = batch
        batch_size = len(sp_nds)
        # order 1
        pos_q = self.embeddings['content'][sp_nds, :]
        pos_c = np.sum(self.embeddings['node'][sp_neighbors, :], axis=1)
        neg_q = self.embeddings['content'][sp_neighbors, :]
        neg_c = list()
        for c in pos_c:
            neg_c.append(np.tile(c, (self.negative_ratio, 1)))
        neg_c = np.array(neg_c)
        pos_e = np.sum(pos_q * pos_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_q * neg_c, axis=2)  # neg_e.shape = batch_size*negative_ratio
        sigmoid_pos_e = np.array([self._fast_sigmoid(val) for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val) for val in neg_e.reshape(-1)]).reshape(neg_e.shape)
        # delta calculation
        delta_q = list()
        delta_f = list()
        for i in range(len(sp_nds)):
            u, neighbors = sp_nds[i], sp_neighbors[i]
            delta_q = self._calc_delta_vec('content', u, delta_q,
                                           (sigmoid_pos_e[i] - 1) * pos_c[i, :])
            for v in neighbors:
                if v != -1:
                    delta_f = self._calc_delta_vec('node', v, delta_f,
                                                   (sigmoid_pos_e[i] - 1) * pos_q[i, :])
        for i in range(len(sp_neighbors)):
            neighbors = sp_neighbors[i]
            for j in range(len(neighbors)):
                u = sp_neighbors[i][j]
                if u != -1:
                    delta_q = self._calc_delta_vec('content', u, delta_q,
                                                   sigmoid_neg_e[i, j] * neg_c[i, j, :])
                    for v in neighbors:
                        if v != -1:  # skip "-1" padding nodes, as in the positive pass
                            delta_f = self._calc_delta_vec('node', v, delta_f,
                                                           sigmoid_neg_e[i, j] * neg_q[i, j, :])
        delta_q = self._format_vec('content', delta_q)
        delta_f = self._format_vec('node', delta_f)
        return delta_q / batch_size, delta_f / batch_size

    def get_graph_loss(self, batch):
        sp_nds, sp_neighbors = batch
        batch_size = len(sp_nds)
        # order 1
        pos_q = self.embeddings['content'][sp_nds, :]
        pos_c = np.sum(self.embeddings['node'][sp_neighbors, :], axis=1)
        neg_q = self.embeddings['content'][sp_neighbors, :]
        neg_c = list()
        for c in pos_c:
            neg_c.append(np.tile(c, (self.negative_ratio, 1)))
        neg_c = np.array(neg_c)
        pos_e = np.sum(pos_q * pos_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_q * neg_c, axis=2)  # neg_e.shape = batch_size*negative_ratio
        sigmoid_pos_e = np.array([self._fast_sigmoid(val) for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val) for val in neg_e.reshape(-1)]).reshape(neg_e.shape)
        return -np.mean(np.log(sigmoid_pos_e) + np.sum(np.log(1 - sigmoid_neg_e), axis=1))

    def get_cur_batch_loss(self, t, batch):
        loss = self.get_graph_loss(batch)
        self.logger.info('Finish processing batch {} and loss:{}'.format(t, loss))

    def update_vec(self, cal_type, h_delta, delta, embeddings, len_delta, t):
        h_delta[self.update_look_back[cal_type][:len_delta], :] += delta**2
        embeddings[self.update_look_back[cal_type][:len_delta], :] -= \
            self.lr / np.sqrt(h_delta[self.update_look_back[cal_type][:len_delta], :]) * delta
        return h_delta, embeddings

    def update_vec_by_adam(self, cal_type, m, v, delta, embeddings, len_delta, t):
        self.beta1 = .9
        self.beta2 = .999
        m[self.update_look_back[cal_type][:len_delta], :] = \
            self.beta1 * m[self.update_look_back[cal_type][:len_delta], :] + (1 - self.beta1) * delta
        v[self.update_look_back[cal_type][:len_delta], :] = \
            self.beta2 * v[self.update_look_back[cal_type][:len_delta], :] + (1 - self.beta2) * (delta**2)
        m_ = m[self.update_look_back[cal_type][:len_delta], :] / (1 - self.beta1**t)
        v_ = v[self.update_look_back[cal_type][:len_delta], :] / (1 - self.beta2**t)
        embeddings[self.update_look_back[cal_type][:len_delta], :] -= \
            self.lr * m_ / (np.sqrt(v_) + self.epsilon)
        return m, v, embeddings

    def train_one_epoch(self):
        DISPLAY_EPOCH = 100
        batches = self.batch_iter()
        # opt_type = 'adagrad'
        opt_type = 'adam'
        for batch in batches:
            self.idx = defaultdict(int)
            self.update_look_back = defaultdict(list)
            self.update_dict = defaultdict(dict)
            delta_q, delta_f = self._update_graph(batch)
            len_delta_f = len(delta_f)
            if opt_type == 'adagrad':
                self.h_delta['node'], self.embeddings['node'] = \
                    self.update_vec('node', self.h_delta['node'], delta_f,
                                    self.embeddings['node'], len_delta_f, self.t)
            if opt_type == 'adam':
                self.m['node'], self.v['node'], self.embeddings['node'] = \
                    self.update_vec_by_adam('node', self.m['node'], self.v['node'], delta_f,
                                            self.embeddings['node'], len_delta_f, self.t)
            len_delta_q = len(delta_q)
            if opt_type == 'adagrad':
                self.h_delta['content'], self.embeddings['content'] = \
                    self.update_vec('content', self.h_delta['content'], delta_q,
                                    self.embeddings['content'], len_delta_q, self.t)
            if opt_type == 'adam':
                self.m['content'], self.v['content'], self.embeddings['content'] = \
                    self.update_vec_by_adam('content', self.m['content'], self.v['content'], delta_q,
                                            self.embeddings['content'], len_delta_q, self.t)
            if (self.t - 1) % DISPLAY_EPOCH == 0:
                self.get_cur_batch_loss(self.t, batch)
            self.t += 1
        self.cur_epoch += 1

    def get_random_neighbor_nodes(self, nd_idx):
        graph = self.g.G
        nd = self.g.look_back_list[nd_idx]
        neigh_nds = np.array([self.look_up[vid] for vid in graph[nd].keys()])
        shuffle_idx = np.random.permutation(np.arange(len(neigh_nds)))
        end_idx = min(len(neigh_nds), self.negative_ratio)
        return neigh_nds[shuffle_idx[:end_idx]]

    def batch_iter(self):
        numNodes = self.node_size
        data_size = numNodes
        shuffle_indices = np.random.permutation(np.arange(data_size))
        start_index = 0
        end_index = min(start_index + self.batch_size, data_size)
        while start_index < data_size:
            sp_nds = shuffle_indices[start_index:end_index]
            sp_neighbors = []
            for idx in sp_nds:
                neighbors = self.get_random_neighbor_nodes(idx)
                if len(neighbors) < self.negative_ratio:
                    # pad with -1 so every row has exactly negative_ratio entries
                    neighbors = np.hstack(
                        (neighbors, -np.ones(self.negative_ratio - len(neighbors)))).astype(int)
                sp_neighbors.append(neighbors)
            ret = sp_nds, sp_neighbors
            start_index = end_index
            end_index = min(start_index + self.batch_size, data_size)
            yield ret

    def get_one_embeddings(self, embeddings):
        vectors = dict()
        look_back = self.g.look_back_list
        for i, embedding in enumerate(embeddings):
            if i >= len(look_back):  # skip the padding row appended for "-1" nodes
                continue
            vectors[look_back[i]] = embedding
        return vectors

    def get_vectors(self):
        ret = dict()
        ret['node'] = self.get_one_embeddings(self.embeddings['node'])
        ret['content'] = self.get_one_embeddings(self.embeddings['content'])
        return ret
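# A minimal usage sketch for _FFVM (hypothetical: it assumes a `graph` wrapper
# object exposing G (a networkx-style graph), look_up_dict, look_back_list and
# node_size, as used throughout this repo; the hyperparameters are illustrative):
#
#   model = _FFVM(graph, lr=0.025, rep_size=128, batch_size=100, negative_ratio=5)
#   for epoch in range(10):
#       model.train_one_epoch()
#   vectors = model.get_vectors()   # {'node': {node_id: vec, ...}, 'content': {...}}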
class PALE(object):
    def __init__(self, learning_rate, batch_size, n_input, n_hidden, n_layer,
                 type_model, is_valid, device, files, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)
        self.device = device

        # Parameters
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.valid = is_valid
        self.valid_prop = .9 if self.valid else 1.
        self.valid_sample_size = 9
        self.cur_epoch = 1

        # Network Parameters
        self.n_hidden = n_hidden if type_model == 'mlp' else n_input  # number of neurons in hidden layer
        self.n_input = n_input  # size of node embeddings
        self.n_layer = n_layer  # number of layers

        # Set Train Data
        if not isinstance(files, list) or len(files) < 3:
            self.logger.info(
                'The algorithm needs files like [First Graph File, Second Graph File, Label File]')
            return
        self.lookup = defaultdict(dict)
        self.look_back = defaultdict(list)
        self._read_train_dat(files[0], files[1], files[2])  # douban, weibo, label files
        self.valid_sample_size = min(min(self.valid_sample_size, len(self.look_back['f']) - 1),
                                     len(self.look_back['g']) - 1)

        # TF Graph Building
        self.sess = tf.Session()
        cur_seed = random.getrandbits(32)
        initializer = tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed)
        with tf.device(self.device):
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                self._init_weights(type_model)
                self.build_train_graph(type_model)
                self.build_valid_graph(type_model)
        self.sess.run(tf.global_variables_initializer())

    def _read_labels(self, label_file):
        labels = list()
        with open(label_file, 'r') as lb_handler:
            for ln in lb_handler:
                ln = ln.strip()
                if not ln:
                    break
                labels.append(ln.split())
        return labels

    def _read_train_dat(self, embed1_file, embed2_file, label_file):
        self.X, self.lookup['f'], self.look_back['f'] = read_embeddings(embed1_file)
        self.Y, self.lookup['g'], self.look_back['g'] = read_embeddings(embed2_file)
        self.L = load_train_valid_labels(label_file, self.lookup, self.valid_prop)

    def _init_weights(self, type_code_graph):
        # Store layers weight & bias
        self.weights = dict()
        self.biases = dict()
        if type_code_graph == 'mlp':
            self.weights['h0'] = tf.Variable(tf.random_normal([self.n_input, self.n_hidden]))
            self.biases['b0'] = tf.Variable(tf.zeros([self.n_hidden]))
            for i in range(1, self.n_layer):
                self.weights['h{}'.format(i)] = tf.Variable(
                    tf.random_normal([self.n_hidden, self.n_hidden]))
                self.biases['b{}'.format(i)] = tf.Variable(tf.zeros([self.n_hidden]))
        self.weights['out'] = tf.Variable(tf.random_normal([self.n_hidden, self.n_input]))
        self.biases['b_out'] = tf.Variable(tf.zeros([self.n_input]))

    def build_mlp_code_graph(self, inputs):
        # Input layer
        layer = tf.nn.sigmoid(
            tf.add(tf.matmul(tf.reshape(inputs, [-1, self.n_input]), self.weights['h0']),
                   self.biases['b0']))
        for i in range(1, self.n_layer):
            layer = tf.nn.sigmoid(
                tf.add(tf.matmul(layer, self.weights['h{}'.format(i)]),
                       self.biases['b{}'.format(i)]))
        # Output fully connected layer
        code = tf.nn.tanh(tf.matmul(layer, self.weights['out']) + self.biases['b_out'])
        return code

    def build_lin_code_graph(self, inputs):
        # Single fully connected output layer
        code = tf.matmul(tf.reshape(inputs, [-1, self.n_input]),
                         self.weights['out']) + self.biases['b_out']
        return code

    def build_train_graph(self, type_code_graph):
        if type_code_graph == 'lin':
            code_graph = self.build_lin_code_graph
        elif type_code_graph == 'mlp':
            code_graph = self.build_mlp_code_graph
        self.cur_batch_size = tf.placeholder('float32', name='batch_size')
        self.pos_inputs = {
            'f': tf.placeholder('float32', [None, self.n_input]),
            'g': tf.placeholder('float32', [None, self.n_input])
        }
        self.PF = code_graph(self.pos_inputs['f'])  # batch_size*n_input
        # train loss
        self.loss = tf.reduce_mean(.5 * tf.square(self.PF - self.pos_inputs['g']))
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

    def build_valid_graph(self, type_code_graph):
        if type_code_graph == 'lin':
            code_graph = self.build_lin_code_graph
        elif type_code_graph == 'mlp':
            code_graph = self.build_mlp_code_graph
        # validation
        self.valid_inputs = {
            'f': tf.placeholder('float32', [None, self.valid_sample_size, self.n_input]),
            'g': tf.placeholder('float32', [None, self.valid_sample_size, self.n_input])
        }
        valid = tf.reshape(code_graph(self.valid_inputs['f']),
                           [-1, self.valid_sample_size, self.n_input])  # batch_size*valid_sample_size*n_input
        self.dot_dist = tf.reduce_sum(tf.pow(valid - self.valid_inputs['g'], 2.), axis=2)

    def train_one_epoch(self):
        sum_loss = 0.0
        mrr = 0.0
        valid_size = 0
        # train process
        batches = batch_iter(self.L, self.batch_size, 0, self.lookup, 'f', 'g')
        batch_id = 0
        for batch in batches:
            pos, neg = batch
            if not len(pos['f']) == len(pos['g']) or not len(neg['f']) == len(neg['g']):
                self.logger.info('The input label file goes wrong as the file format.')
                continue
            batch_size = len(pos['f'])
            feed_dict = {
                self.pos_inputs['f']: self.X[pos['f'], :],
                self.pos_inputs['g']: self.Y[pos['g'], :],
                self.cur_batch_size: batch_size
            }
            _, cur_loss = self.sess.run([self.train_op, self.loss], feed_dict)
            sum_loss += cur_loss
            batch_id += 1
        # valid process
        if self.valid:
            valid = valid_iter(self.L, self.valid_sample_size, self.lookup, 'f', 'g')
            if not len(valid['f']) == len(valid['g']):
                self.logger.info('The input label file goes wrong as the file format.')
                return
            valid_size = len(valid['f'])
            feed_dict = {
                self.valid_inputs['f']: self.X[valid['f'], :],
                self.valid_inputs['g']: self.Y[valid['g'], :]
            }
            valid_dist = self.sess.run(self.dot_dist, feed_dict)
            mrr = .0
            for i in range(valid_size):
                # rank the true counterpart (index 0) against the sampled negatives
                fst_dist = valid_dist[i][0]
                pos = 1
                for k in range(1, len(valid_dist[i])):
                    if fst_dist >= valid_dist[i][k]:
                        pos += 1
                mrr += 1. / pos
            self.logger.info('Epoch={}, sum of loss={!s}, mrr in validation={}'.format(
                self.cur_epoch, sum_loss / (batch_id + 1e-8), mrr / (valid_size + 1e-8)))
        else:
            self.logger.info('Epoch={}, sum of loss={!s}'.format(
                self.cur_epoch, sum_loss / batch_id))
        self.cur_epoch += 1
        return sum_loss / (batch_id + 1e-8), mrr / (valid_size + 1e-8)

    def _write_in_file(self, filename, vec, tag):
        with open(filename, 'a+') as res_handler:
            if len(vec.shape) > 1:
                column_size = vec.shape[1]
            else:
                column_size = 1
            reshape_vec = vec.reshape(-1)
            vec_size = len(reshape_vec)
            res_handler.write(tag + '\n')
            for i in range(0, vec_size, column_size):
                res_handler.write('{}\n'.format(' '.join(
                    [str(reshape_vec[i + k]) for k in range(column_size)])))

    def save_models(self, filename):
        if os.path.exists(filename):
            os.remove(filename)
        for k, v in self.weights.items():
            self._write_in_file(filename, v.eval(self.sess), k)
        for k, v in self.biases.items():
            self._write_in_file(filename, v.eval(self.sess), k)
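# The validation block above computes MRR by ranking the true counterpart
# (column 0 of valid_dist) against sampled negatives. A minimal numpy
# equivalent of that ranking step (illustrative only, not part of the class):

import numpy as np

def mrr_from_dist(valid_dist):
    """valid_dist: (n_queries, 1 + n_negatives); column 0 is the true pair."""
    rank = 1 + np.sum(valid_dist[:, 1:] <= valid_dist[:, :1], axis=1)
    return float(np.mean(1. / rank))

# mrr_from_dist(np.array([[0.1, 0.5, 0.9]])) == 1.0  (true pair ranked first)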
class _ALP_NE(object):
    def __init__(self, graphs, lr=.001, gamma=.1, rep_size=128, batch_size=100,
                 negative_ratio=5, table_size=1e8, anchor_file=None,
                 log_file='log', last_emb_files=dict()):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        self.epsilon = 1e-7
        self.table_size = table_size
        self.sigmoid_table = {}
        self.sigmoid_table_size = 1000
        self.SIGMOID_BOUND = 6
        self._init_simgoid_table()

        self._init_dicts()
        self.t = 1
        self.rep_size = rep_size
        for graph_type in ['f', 'g']:
            self.g[graph_type] = graphs[graph_type]
            self.look_up[graph_type] = self.g[graph_type].look_up_dict
            self.idx[graph_type] = 0
            self.update_dict[graph_type] = dict()
            self.update_look_back[graph_type] = list()
            self.node_size[graph_type] = self.g[graph_type].node_size
            self.embeddings[graph_type], self.h_delta[graph_type], self.m[graph_type], self.v[graph_type] \
                = self._init_params(self.node_size[graph_type], rep_size, last_emb_files, graph_type)
            self._gen_sampling_table(graph_type)
        self.anchors = self._read_anchors(anchor_file, ',')
        self.lr = lr
        self.gamma = gamma
        self.cur_epoch = 0
        self.batch_size = batch_size
        self.negative_ratio = negative_ratio

    def _init_dicts(self):
        self.g = dict()
        self.look_up = dict()
        self.idx = dict()
        self.update_dict = dict()
        self.update_look_back = dict()
        self.node_size = dict()
        self.embeddings = dict()
        self.h_delta = dict()
        self.m = dict()
        self.v = dict()
        self.node_degree = dict()
        self.sampling_table = dict()
        self.edge_alias = dict()
        self.edge_prob = dict()

    def _init_params(self, node_size, rep_size, last_emb_file, graph_type):
        embeddings = dict()
        embeddings['node'] = np.random.normal(0, 1, (node_size, rep_size))
        embeddings['content'] = np.random.normal(0, 1, (node_size, rep_size))
        if last_emb_file:
            embeddings['node'] = self._init_emb_matrix(
                embeddings['node'],
                '{}.node_embeddings'.format(last_emb_file[graph_type]), graph_type)
            embeddings['content'] = self._init_emb_matrix(
                embeddings['content'],
                '{}.content_embeddings'.format(last_emb_file[graph_type]), graph_type)
        # adagrad
        h_delta = dict()
        h_delta['node'] = np.zeros((node_size, rep_size))
        h_delta['content'] = np.zeros((node_size, rep_size))
        # adam
        m = dict()
        m['node'] = np.zeros((node_size, rep_size))
        m['content'] = np.zeros((node_size, rep_size))
        v = dict()
        v['node'] = np.zeros((node_size, rep_size))
        v['content'] = np.zeros((node_size, rep_size))
        return embeddings, h_delta, m, v

    def _init_emb_matrix(self, emb, emb_file, graph_type):
        with open(emb_file, 'r') as embed_handler:
            for ln in embed_handler:
                elems = ln.strip().split()
                if len(elems) <= 2:
                    continue
                emb[self.look_up[graph_type][elems[0]]] = list(map(float, elems[1:]))
        return emb

    def _read_anchors(self, anchor_file, delimiter):
        anchors = list()
        with open(anchor_file, 'r') as anchor_handler:
            for ln in anchor_handler:
                elems = ln.strip().split(delimiter)
                anchors.append((elems[0], elems[1]))
        return anchors

    def _init_simgoid_table(self):
        for k in range(self.sigmoid_table_size):
            x = 2 * self.SIGMOID_BOUND * k / self.sigmoid_table_size - self.SIGMOID_BOUND
            self.sigmoid_table[k] = 1. / (1 + np.exp(-x))

    def _fast_sigmoid(self, val):
        # table lookup approximation of 1./(1+np.exp(-val))
        if val > self.SIGMOID_BOUND:
            return 1 - self.epsilon
        elif val < -self.SIGMOID_BOUND:
            return self.epsilon
        k = int((val + self.SIGMOID_BOUND) * self.sigmoid_table_size / self.SIGMOID_BOUND / 2)
        return self.sigmoid_table[k]

    def _format_vec(self, vec, graph_type):
        # pad with zero rows so vec covers every index touched in this batch
        len_gap = self.idx[graph_type] - len(vec)
        if len_gap > 0:
            if isinstance(vec, list):
                num_col = len(vec[0])
            else:
                num_col = vec.shape[1]
            vec = np.concatenate((vec, np.zeros((len_gap, num_col))))
        return np.array(vec)

    def _calc_delta_vec(self, nd, delta, opt_vec, graph_type):
        if nd not in self.update_dict[graph_type]:
            cur_idx = self.idx[graph_type]
            self.update_dict[graph_type][nd] = cur_idx
            self.update_look_back[graph_type].append(nd)
            self.idx[graph_type] += 1
        else:
            cur_idx = self.update_dict[graph_type][nd]
        if cur_idx >= len(delta):
            for i in range(cur_idx - len(delta)):
                delta.append(np.zeros(opt_vec.shape))
            delta.append(opt_vec)
        else:
            delta[cur_idx] += opt_vec
        return delta

    def _update_graph_by_links(self, batch, graph_type):
        pos_h, pos_t, pos_h_v, neg_t = batch[graph_type]
        batch_size = len(pos_h)
        embeddings = self.embeddings[graph_type]
        # order 2
        pos_u = embeddings['node'][pos_h, :]
        pos_v_c = embeddings['content'][pos_t, :]
        neg_u = embeddings['node'][pos_h_v, :]
        neg_v_c = embeddings['content'][neg_t, :]
        pos_e = np.sum(pos_u * pos_v_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v_c, axis=2)  # neg_e.shape = batch_size*negative_ratio
        sigmoid_pos_e = np.array([self._fast_sigmoid(val) for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val) for val in neg_e.reshape(-1)]).reshape(neg_e.shape)
        # temporal delta
        delta_eh = list()
        delta_c = list()
        for i in range(len(pos_t)):
            u, v = pos_h[i], pos_t[i]
            delta_c = self._calc_delta_vec(v, delta_c, (sigmoid_pos_e[i] - 1) * pos_u[i, :], graph_type)
            delta_eh = self._calc_delta_vec(u, delta_eh, (sigmoid_pos_e[i] - 1) * pos_v_c[i, :], graph_type)
        neg_shape = neg_e.shape
        for i in range(neg_shape[0]):
            for j in range(neg_shape[1]):
                u, v = pos_h_v[i][j], neg_t[i][j]
                delta_c = self._calc_delta_vec(v, delta_c, sigmoid_neg_e[i, j] * neg_u[i, j, :], graph_type)
                delta_eh = self._calc_delta_vec(u, delta_eh, sigmoid_neg_e[i, j] * neg_v_c[i, j, :], graph_type)
        # delta x & delta codebook
        delta_eh = self._format_vec(delta_eh, graph_type)
        delta_c = self._format_vec(delta_c, graph_type)
        return delta_c / batch_size, delta_eh / batch_size

    def _cos_sim(self, vec1, vec2):
        return np.dot(vec1, vec2) / np.linalg.norm(vec1) / np.linalg.norm(vec2)

    def _update_graph_by_anchor_reg(self):
        delta_eh = defaultdict(list)
        cnt = 0
        for src_nd, target_nd in self.anchors:
            if src_nd not in self.look_up['f'] or target_nd not in self.look_up['g']:
                continue
            types = ['f', 'g']
            idx = list()  # 0 refers to network f, 1 refers to network g
            emb = list()
            idx.append(self.look_up['f'][src_nd])
            idx.append(self.look_up['g'][target_nd])
            emb.append(self.embeddings['f']['node'][idx[0]])
            emb.append(self.embeddings['g']['node'][idx[1]])
            for i in range(len(types)):
                # gradient of the cosine-similarity regularizer w.r.t. emb[i]
                delta_eh[types[i]] = self._calc_delta_vec(
                    idx[i], delta_eh[types[i]],
                    (self._cos_sim(emb[i], emb[1 - i]) * emb[i] / np.dot(emb[i], emb[i])
                     - emb[1 - i] / np.linalg.norm(emb[1 - i]) / np.linalg.norm(emb[i])),
                    types[i])
            cnt += 1
        for graph_type in ['f', 'g']:
            # max(cnt, 1) guards against division by zero when no anchors match
            delta_eh[graph_type] = self._format_vec(delta_eh[graph_type], graph_type) / max(cnt, 1)
        return delta_eh

    def _mat_add(self, mat1, mat2):
        # zero-pad the shorter matrix so the two can be added
        len_gap = len(mat1) - len(mat2)
        if len_gap > 0:
            for i in range(len_gap):
                mat2 = np.vstack((mat2, np.zeros(mat2[0, :].shape)))
        else:
            for i in range(-len_gap):
                mat1 = np.vstack((mat1, np.zeros(mat1[0, :].shape)))
        return mat1 + mat2

    def get_graph_loss(self, batch, graph_type):
        pos_h, pos_t, pos_h_v, neg_t = batch[graph_type]
        embeddings = self.embeddings[graph_type]
        # order 2
        pos_u = embeddings['node'][pos_h, :]
        pos_v_c = embeddings['content'][pos_t, :]
        neg_u = embeddings['node'][pos_h_v, :]
        neg_v_c = embeddings['content'][neg_t, :]
        pos_e = np.sum(pos_u * pos_v_c, axis=1)  # pos_e.shape = batch_size
        neg_e = np.sum(neg_u * neg_v_c, axis=2)  # neg_e.shape = batch_size*negative_ratio
        sigmoid_pos_e = np.array([self._fast_sigmoid(val) for val in pos_e.reshape(-1)]).reshape(pos_e.shape)
        sigmoid_neg_e = np.array([self._fast_sigmoid(val) for val in neg_e.reshape(-1)]).reshape(neg_e.shape)
        return -np.mean(np.log(sigmoid_pos_e) + np.sum(np.log(1 - sigmoid_neg_e), axis=1))

    def get_anchor_reg_loss(self):
        cos_sim_list = list()
        for src_nd, target_nd in self.anchors:
            if src_nd not in self.look_up['f'] or target_nd not in self.look_up['g']:
                continue
            src_idx = self.look_up['f'][src_nd]
            target_idx = self.look_up['g'][target_nd]
            cos_sim_list.append(self._cos_sim(self.embeddings['f']['node'][src_idx],
                                              self.embeddings['g']['node'][target_idx]))
        return -np.mean(cos_sim_list)

    def update_vec(self, h_delta, delta, embeddings, len_delta, t, graph_type):
        update_look_back = self.update_look_back[graph_type]
        h_delta[update_look_back[:len_delta], :] += delta**2
        embeddings[update_look_back[:len_delta], :] -= \
            self.lr / np.sqrt(h_delta[update_look_back[:len_delta], :]) * delta
        return h_delta, embeddings

    def update_vec_by_adam(self, m, v, delta, embeddings, len_delta, t, graph_type):
        self.beta1 = .9
        self.beta2 = .999
        update_look_back = self.update_look_back[graph_type]
        m[update_look_back[:len_delta], :] = \
            self.beta1 * m[update_look_back[:len_delta], :] + (1 - self.beta1) * delta
        v[update_look_back[:len_delta], :] = \
            self.beta2 * v[update_look_back[:len_delta], :] + (1 - self.beta2) * (delta**2)
        m_ = m[update_look_back[:len_delta], :] / (1 - self.beta1**t)
        v_ = v[update_look_back[:len_delta], :] / (1 - self.beta2**t)
        embeddings[update_look_back[:len_delta], :] -= self.lr * m_ / (np.sqrt(v_) + self.epsilon)
        return m, v, embeddings

    def train_one_epoch(self, opt_type):
        DISPLAY_EPOCH = 100

        def batch_init():
            for graph_type in ['f', 'g']:
                self.idx[graph_type] = 0
                self.update_look_back[graph_type] = list()
                self.update_dict[graph_type] = dict()

        batches = self.batch_iter()
        last_batch_loss = 1e8
        stop_cnt = 0
        for batch in batches:
            batch_loss = .0
            batch_init()
            delta_eh_anchor_reg = self._update_graph_by_anchor_reg()
            for graph_type in ['f', 'g']:
                h_delta = self.h_delta[graph_type]
                embeddings = self.embeddings[graph_type]
                m = self.m[graph_type]
                v = self.v[graph_type]
                delta_c, delta_eh = self._update_graph_by_links(batch, graph_type)
                delta_eh_anchor_reg[graph_type] = self._format_vec(
                    delta_eh_anchor_reg[graph_type], graph_type)
                len_delta = len(delta_eh)
                if opt_type == 'adagrad':
                    h_delta['node'], embeddings['node'] = \
                        self.update_vec(h_delta['node'],
                                        delta_eh + self.gamma * delta_eh_anchor_reg[graph_type],
                                        embeddings['node'], len_delta, self.t, graph_type)
                if opt_type == 'adam':
                    m['node'], v['node'], embeddings['node'] = \
                        self.update_vec_by_adam(m['node'], v['node'],
                                                delta_eh + self.gamma * delta_eh_anchor_reg[graph_type],
                                                embeddings['node'], len_delta, self.t, graph_type)
                len_content = len(delta_c)
                if opt_type == 'adagrad':
                    h_delta['content'], embeddings['content'] = \
                        self.update_vec(h_delta['content'], delta_c,
                                        embeddings['content'], len_content, self.t, graph_type)
                if opt_type == 'adam':
                    m['content'], v['content'], embeddings['content'] = \
                        self.update_vec_by_adam(m['content'], v['content'], delta_c,
                                                embeddings['content'], len_content, self.t, graph_type)
                if (self.t - 1) % DISPLAY_EPOCH == 0:
                    batch_loss += self.get_graph_loss(batch, graph_type) \
                        + self.gamma * self.get_anchor_reg_loss()
            if (self.t - 1) % DISPLAY_EPOCH == 0:
                self.logger.info('Finish processing batch {} and loss:{}'.format(self.t - 1, batch_loss))
                if batch_loss < last_batch_loss:
                    last_batch_loss = batch_loss
                    stop_cnt = 0
                else:
                    stop_cnt += 1
                if stop_cnt >= 2:
                    break
            self.t += 1
        self.cur_epoch += 1

    def get_random_node_pairs(self, i, shuffle_indices, edges, edge_set, numNodes, graph_type):
        # balance the appearance of edges according to edge_prob
        edge_prob = self.edge_prob[graph_type]
        edge_alias = self.edge_alias[graph_type]
        sampling_table = self.sampling_table[graph_type]
        if i >= len(shuffle_indices):
            i = np.random.randint(len(shuffle_indices))
        if not random.random() < edge_prob[shuffle_indices[i]]:
            shuffle_indices[i] = edge_alias[shuffle_indices[i]]
        cur_h = edges[shuffle_indices[i]][0]
        head = cur_h * numNodes
        cur_t = edges[shuffle_indices[i]][1]
        cur_h_v = []
        cur_neg_t = []
        for j in range(self.negative_ratio):
            rn = sampling_table[random.randint(0, int(self.table_size) - 1)]
            while head + rn in edge_set or cur_h == rn or rn in cur_neg_t:
                rn = sampling_table[random.randint(0, int(self.table_size) - 1)]
            cur_h_v.append(cur_h)
            cur_neg_t.append(rn)
        return cur_h, cur_t, cur_h_v, cur_neg_t

    def batch_iter(self):
        data_size = 0
        for graph_type in ['f', 'g']:
            net_size = self.g[graph_type].G.size()
            if net_size > data_size:
                data_size = net_size
        shuffle_indices = dict()
        for graph_type in ['f', 'g']:
            net_size = self.g[graph_type].G.size()
            shuffle_indices[graph_type] = np.random.permutation(np.arange(net_size))
            # repeat indices of the smaller network until both reach data_size
            while net_size < data_size:
                shuffle_indices[graph_type] = np.append(
                    shuffle_indices[graph_type],
                    shuffle_indices[graph_type][:data_size - net_size])
                net_size = len(shuffle_indices[graph_type])
        start_index = 0
        end_index = min(start_index + self.batch_size, data_size)
        while start_index < data_size:
            ret = dict()
            for graph_type in ['f', 'g']:
                numNodes = self.node_size[graph_type]
                look_up = self.look_up[graph_type]
                g = self.g[graph_type]
                edges = [(look_up[x[0]], look_up[x[1]]) for x in g.G.edges()]
                edge_set = set([x[0] * numNodes + x[1] for x in edges])
                pos_h = []
                pos_t = []
                pos_h_v = []
                neg_t = []
                for i in range(start_index, end_index):
                    cur_h, cur_t, cur_h_v, cur_neg_t = self.get_random_node_pairs(
                        i, shuffle_indices[graph_type], edges, edge_set, numNodes, graph_type)
                    pos_h.append(cur_h)
                    pos_t.append(cur_t)
                    pos_h_v.append(cur_h_v)
                    neg_t.append(cur_neg_t)
                ret[graph_type] = (pos_h, pos_t, pos_h_v, neg_t)
            start_index = end_index
            end_index = min(start_index + self.batch_size, data_size)
            yield ret

    def _gen_sampling_table(self, graph_type):
        table_size = self.table_size
        power = 0.75
        print("Pre-processing for non-uniform negative sampling in {}!".format(graph_type))

        numNodes = self.node_size[graph_type]
        g = self.g[graph_type]
        node_degree = np.zeros(numNodes)  # out degree
        look_up = g.look_up_dict
        for edge in g.G.edges():
            node_degree[look_up[edge[0]]] += g.G[edge[0]][edge[1]]['weight']
        norm = sum([math.pow(node_degree[i], power) for i in range(numNodes)])

        # unigram table for negative sampling, proportional to degree^0.75
        sampling_table = np.zeros(int(table_size), dtype=np.uint32)
        p = 0
        i = 0
        for j in range(numNodes):
            p += float(math.pow(node_degree[j], power)) / norm
            while i < table_size and float(i) / table_size < p:
                sampling_table[i] = j
                i += 1

        # Walker alias table so edges can be drawn proportionally to weight
        data_size = g.G.size()
        edge_alias = np.zeros(data_size, dtype=np.int32)
        edge_prob = np.zeros(data_size, dtype=np.float32)
        large_block = np.zeros(data_size, dtype=np.int32)
        small_block = np.zeros(data_size, dtype=np.int32)
        total_sum = sum([g.G[edge[0]][edge[1]]['weight'] for edge in g.G.edges()])
        norm_prob = [g.G[edge[0]][edge[1]]['weight'] * data_size / total_sum for edge in g.G.edges()]
        num_small_block = 0
        num_large_block = 0
        cur_small_block = 0
        cur_large_block = 0
        for k in range(data_size - 1, -1, -1):
            if norm_prob[k] < 1:
                small_block[num_small_block] = k
                num_small_block += 1
            else:
                large_block[num_large_block] = k
                num_large_block += 1
        while num_small_block and num_large_block:
            num_small_block -= 1
            cur_small_block = small_block[num_small_block]
            num_large_block -= 1
            cur_large_block = large_block[num_large_block]
            edge_prob[cur_small_block] = norm_prob[cur_small_block]
            edge_alias[cur_small_block] = cur_large_block
            norm_prob[cur_large_block] = norm_prob[cur_large_block] + norm_prob[cur_small_block] - 1
            if norm_prob[cur_large_block] < 1:
                small_block[num_small_block] = cur_large_block
                num_small_block += 1
            else:
                large_block[num_large_block] = cur_large_block
                num_large_block += 1
        while num_large_block:
            num_large_block -= 1
            edge_prob[large_block[num_large_block]] = 1
        while num_small_block:
            num_small_block -= 1
            edge_prob[small_block[num_small_block]] = 1

        self.node_degree[graph_type] = node_degree
        self.sampling_table[graph_type] = sampling_table
        self.edge_alias[graph_type] = edge_alias
        self.edge_prob[graph_type] = edge_prob

    def get_one_embeddings(self, embeddings, graph_type):
        vectors = dict()
        look_back = self.g[graph_type].look_back_list
        for i, embedding in enumerate(embeddings):
            vectors[look_back[i]] = embedding
        return vectors

    def get_vectors(self):
        ret = defaultdict(dict)
        for graph_type in ['f', 'g']:
            ret[graph_type]['node_embeddings'] = self.get_one_embeddings(
                self.embeddings[graph_type]['node'], graph_type)
            ret[graph_type]['content_embeddings'] = self.get_one_embeddings(
                self.embeddings[graph_type]['content'], graph_type)
        return ret
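# _gen_sampling_table above builds a Walker alias table so a weighted edge can
# be drawn in O(1). A compact standalone version of the same construction
# (illustrative only; `probs` must be weights normalized so they average to 1,
# i.e. w_i * n / sum(w), as in the method above):

import numpy as np

def build_alias(probs):
    """Returns (prob, alias) tables for O(1) weighted sampling."""
    n = len(probs)
    prob = np.zeros(n)
    alias = np.zeros(n, dtype=np.int64)
    probs = list(probs)
    small = [i for i, p in enumerate(probs) if p < 1]
    large = [i for i, p in enumerate(probs) if p >= 1]
    while small and large:
        s, l = small.pop(), large.pop()
        prob[s] = probs[s]
        alias[s] = l
        probs[l] += probs[s] - 1  # large donor keeps the leftover mass
        (small if probs[l] < 1 else large).append(l)
    for i in large + small:  # leftovers are kept with probability 1
        prob[i] = 1
    return prob, alias

def draw(prob, alias):
    """One O(1) draw: pick a slot, then keep it or jump to its alias."""
    i = np.random.randint(len(prob))
    return i if np.random.random() < prob[i] else alias[i]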
class DCNH_DP(object):
    def __init__(self, learning_rate, batch_size, neg_ratio, n_input, n_out,
                 n_hidden, n_layer, device, files, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)
        self.device = device

        # Parameters
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.neg_ratio = neg_ratio
        self.valid_prop = .9
        self.valid_sample_size = 9
        self.gamma = 1
        self.eta = 0
        self.cur_epoch = 1

        # Network Parameters
        self.n_hidden = n_hidden  # number of neurons in hidden layer
        self.n_input = n_input  # size of node embeddings
        self.n_out = n_out  # size of hashing code
        self.n_layer = n_layer  # number of layers

        # Set Train Data
        if not isinstance(files, list) or len(files) < 3:
            self.logger.info(
                'The algorithm needs files like [First Graph File, Second Graph File, Label File]')
            return
        self.lookup_f = dict()
        self.lookup_g = dict()
        self.look_back_f = list()
        self.look_back_g = list()
        self._read_train_dat(files[0], files[1], files[2])  # douban, weibo, label files
        self.valid_sample_size = min(min(self.valid_sample_size, len(self.look_back_f) - 1),
                                     len(self.look_back_g) - 1)

        # TF Graph Building
        self.sess = tf.Session()
        cur_seed = random.getrandbits(32)
        initializer = tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed)
        with tf.device(self.device):
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                self.mlp_weights()
                self.build_graph()
                self.build_valid_graph()
        self.sess.run(tf.global_variables_initializer())

    def _read_embeddings(self, embed_file, lookup, look_back):
        embedding = list()
        with open(embed_file, 'r') as emb_handler:
            idx = 0
            for ln in emb_handler:
                ln = ln.strip()
                if ln:
                    elems = ln.split()
                    if len(elems) == 2:  # header line: "num_nodes dim"
                        continue
                    embedding.append(list(map(float, elems[1:])))
                    lookup[elems[0]] = idx
                    look_back.append(elems[0])
                    idx += 1
        return np.array(embedding), lookup, look_back

    def _read_train_dat(self, embed1_file, embed2_file, label_file):
        self.L = load_train_valid_labels(label_file, self.valid_prop)
        self.F, self.lookup_f, self.look_back_f = self._read_embeddings(
            embed1_file, self.lookup_f, self.look_back_f)
        self.G, self.lookup_g, self.look_back_g = self._read_embeddings(
            embed2_file, self.lookup_g, self.look_back_g)

    def mlp_weights(self):
        # Store layers weight & bias
        self.weights = dict()
        self.biases = dict()
        self.weights['h0_f'] = tf.Variable(tf.random_normal([self.n_input, self.n_hidden]))
        self.weights['h0_g'] = tf.Variable(tf.random_normal([self.n_input, self.n_hidden]))
        self.biases['b0_f'] = tf.Variable(tf.zeros([self.n_hidden]))
        self.biases['b0_g'] = tf.Variable(tf.zeros([self.n_hidden]))
        for i in range(1, self.n_layer):
            self.weights['h{}_f'.format(i)] = tf.Variable(
                tf.random_normal([self.n_hidden, self.n_hidden]))
            self.weights['h{}_g'.format(i)] = tf.Variable(
                tf.random_normal([self.n_hidden, self.n_hidden]))
            self.biases['b{}_f'.format(i)] = tf.Variable(tf.zeros([self.n_hidden]))
            self.biases['b{}_g'.format(i)] = tf.Variable(tf.zeros([self.n_hidden]))
        self.weights['out_f'] = tf.Variable(tf.random_normal([self.n_hidden, self.n_out]))
        self.weights['out_g'] = tf.Variable(tf.random_normal([self.n_hidden, self.n_out]))
        self.biases['b_out_f'] = tf.Variable(tf.zeros([self.n_out]))
        self.biases['b_out_g'] = tf.Variable(tf.zeros([self.n_out]))

    def build_code_graph(self, inputs, tag):
        # Input layer
        layer = tf.nn.sigmoid(
            tf.add(tf.matmul(tf.reshape(inputs, [-1, self.n_input]), self.weights['h0_' + tag]),
                   self.biases['b0_' + tag]))
        # hidden layers must use the per-network ('_f'/'_g') weight keys
        for i in range(1, self.n_layer):
            layer = tf.nn.sigmoid(
                tf.add(tf.matmul(layer, self.weights['h{}_{}'.format(i, tag)]),
                       self.biases['b{}_{}'.format(i, tag)]))
        # Output fully connected layer
        code = tf.nn.tanh(tf.matmul(layer, self.weights['out_' + tag]) + self.biases['b_out_' + tag])
        return code

    def build_train_graph(self, src_tag, obj_tag):
        PF = self.build_code_graph(self.pos_src_inputs, src_tag)  # batch_size*n_out
        PG = self.build_code_graph(self.pos_obj_inputs, obj_tag)  # batch_size*n_out
        NF = tf.reshape(self.build_code_graph(self.neg_src_inputs, src_tag),
                        [-1, self.neg_ratio, self.n_out])  # batch_size*neg_ratio*n_out
        NG = tf.reshape(self.build_code_graph(self.neg_obj_inputs, obj_tag),
                        [-1, self.neg_ratio, self.n_out])  # batch_size*neg_ratio*n_out
        B = tf.sign(PF + PG)  # batch_size*n_out
        # train loss: likelihood term, quantization term, norm regularizer
        term1_first = tf.log(tf.nn.sigmoid(tf.reduce_sum(.5 * tf.multiply(PF, PG), axis=1)))
        term1_second = tf.reduce_sum(
            tf.log(1 - tf.nn.sigmoid(tf.reduce_sum(.5 * tf.multiply(NF, NG), axis=2))), axis=1)
        term1 = -tf.reduce_sum(term1_first + term1_second)
        term2 = tf.reduce_sum(tf.pow(B - PF, 2)) + tf.reduce_sum(tf.pow(B - PG, 2))
        term3 = tf.reduce_sum(tf.pow(PF, 2)) + tf.reduce_sum(tf.pow(PG, 2))
        return (term1 + self.gamma * term2 + self.eta * term3) / self.cur_batch_size

    def build_graph(self):
        self.cur_batch_size = tf.placeholder('float32', name='batch_size')
        self.pos_src_inputs = tf.placeholder('float32', [None, self.n_input])
        self.pos_obj_inputs = tf.placeholder('float32', [None, self.n_input])
        self.neg_src_inputs = tf.placeholder('float32', [None, self.neg_ratio, self.n_input])
        self.neg_obj_inputs = tf.placeholder('float32', [None, self.neg_ratio, self.n_input])
        self.loss_f2g = self.build_train_graph('f', 'g')
        self.loss_g2f = self.build_train_graph('g', 'f')
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op_f2g = optimizer.minimize(self.loss_f2g)
        self.train_op_g2f = optimizer.minimize(self.loss_g2f)

    def build_valid_graph(self):
        # validation
        self.valid_f_inputs = tf.placeholder('float32', [None, self.valid_sample_size, self.n_input])
        self.valid_g_inputs = tf.placeholder('float32', [None, self.valid_sample_size, self.n_input])
        valid_f = tf.reshape(self.build_code_graph(self.valid_f_inputs, 'f'),
                             [-1, self.valid_sample_size, self.n_out])  # batch_size*valid_sample_size*n_out
        valid_g = tf.reshape(self.build_code_graph(self.valid_g_inputs, 'g'),
                             [-1, self.valid_sample_size, self.n_out])
        # sign(f*g) is -1 where the codes disagree; clipping keeps only disagreements
        self.hamming_dist = -tf.reduce_sum(
            tf.clip_by_value(tf.sign(tf.multiply(valid_f, valid_g)), -1., 0.), axis=2)

    def train_one_epoch(self):
        sum_loss = 0.0
        # train process
        batches_f2g = list(batch_iter(self.L, self.batch_size, self.neg_ratio,
                                      self.lookup_f, self.lookup_g, 'f', 'g'))
        batches_g2f = list(batch_iter(self.L, self.batch_size, self.neg_ratio,
                                      self.lookup_g, self.lookup_f, 'g', 'f'))
        n_batches = min(len(batches_f2g), len(batches_g2f))
        batch_id = 0
        for i in range(n_batches):
            # training the process from network f to network g
            pos_src_f2g, pos_obj_f2g, neg_src_f2g, neg_obj_f2g = batches_f2g[i]
            if not len(pos_src_f2g) == len(pos_obj_f2g) or not len(neg_src_f2g) == len(neg_obj_f2g):
                self.logger.info('The input label file goes wrong as the file format.')
                continue
            batch_size_f2g = len(pos_src_f2g)
            feed_dict = {
                self.pos_src_inputs: self.F[pos_src_f2g, :],
                self.pos_obj_inputs: self.G[pos_obj_f2g, :],
                self.neg_src_inputs: self.F[neg_src_f2g, :],
                self.neg_obj_inputs: self.G[neg_obj_f2g, :],
                self.cur_batch_size: batch_size_f2g
            }
            _, cur_loss_f2g = self.sess.run([self.train_op_f2g, self.loss_f2g], feed_dict)
            sum_loss += cur_loss_f2g
            # training the process from network g to network f
            pos_src_g2f, pos_obj_g2f, neg_src_g2f, neg_obj_g2f = batches_g2f[i]
            if not len(pos_src_g2f) == len(pos_obj_g2f) or not len(neg_src_g2f) == len(neg_obj_g2f):
                self.logger.info('The input label file goes wrong as the file format.')
                continue
            batch_size_g2f = len(pos_src_g2f)
            feed_dict = {
                self.pos_src_inputs: self.G[pos_src_g2f, :],
                self.pos_obj_inputs: self.F[pos_obj_g2f, :],
                self.neg_src_inputs: self.G[neg_src_g2f, :],
                self.neg_obj_inputs: self.F[neg_obj_g2f, :],
                self.cur_batch_size: batch_size_g2f
            }
            _, cur_loss_g2f = self.sess.run([self.train_op_g2f, self.loss_g2f], feed_dict)
            sum_loss += cur_loss_g2f
            batch_id += 1
        # valid process
        valid_f, valid_g = valid_iter(self.L, self.valid_sample_size,
                                      self.lookup_f, self.lookup_g, 'f', 'g')
        if not len(valid_f) == len(valid_g):
            self.logger.info('The input label file goes wrong as the file format.')
            return
        valid_size = len(valid_f)
        feed_dict = {
            self.valid_f_inputs: self.F[valid_f, :],
            self.valid_g_inputs: self.G[valid_g, :],
        }
        valid_dist = self.sess.run(self.hamming_dist, feed_dict)
        mrr = .0
        for i in range(valid_size):
            # rank the true counterpart (index 0) against the sampled negatives
            fst_dist = valid_dist[i][0]
            pos = 1
            for k in range(1, len(valid_dist[i])):
                if fst_dist >= valid_dist[i][k]:
                    pos += 1
            mrr += 1. / pos
        self.logger.info('Epoch={}, sum of loss={!s}, mrr={}'.format(
            self.cur_epoch, sum_loss / batch_id / 2, mrr / valid_size))
        self.cur_epoch += 1

    def _write_in_file(self, filename, vec, tag):
        with open(filename, 'a+') as res_handler:
            if len(vec.shape) > 1:
                column_size = vec.shape[1]
            else:
                column_size = 1
            reshape_vec = vec.reshape(-1)
            vec_size = len(reshape_vec)
            res_handler.write(tag + '\n')
            for i in range(0, vec_size, column_size):
                res_handler.write('{}\n'.format(' '.join(
                    [str(reshape_vec[i + k]) for k in range(column_size)])))

    def save_models(self, filename):
        if os.path.exists(filename):
            os.remove(filename)
        for k, v in self.weights.items():
            self._write_in_file(filename, v.eval(self.sess), k)
        for k, v in self.biases.items():
            self._write_in_file(filename, v.eval(self.sess), k)
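# build_valid_graph above estimates Hamming distance between real-valued codes
# without explicitly binarizing them: sign(f*g) is -1 exactly where the two
# codes disagree in sign, and clip(., -1, 0) keeps only those positions.
# A numpy rendering of the same expression (illustrative only):

import numpy as np

def hamming_from_codes(f, g):
    """f, g: real-valued codes of shape (..., n_out); returns sign-disagreement count."""
    return -np.sum(np.clip(np.sign(f * g), -1., 0.), axis=-1)

# hamming_from_codes(np.array([1., -1., 1.]), np.array([1., 1., -1.])) == 2.0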
# coding: utf-8
"""
-------------------------------------------------------------------
   File Name:    TestLogHandler.py
   Description:  Log operation test
   Author:       JHao
   date:         2017/03/06
-------------------------------------------------------------------
   Change Activity:
       2017/03/06: Log handler test
       2017/09/21: Screen output/file output optional
                   (default: both screen and file output)
-------------------------------------------------------------------
"""
__author__ = 'JHao'

from utils.LogHandler import LogHandler

log = LogHandler("log_test")
log.info("test_log_info")
class _MNA(object):
    def __init__(self, graph, attr_file, anchorfile, use_net, valid_prop, neg_ratio, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return
        self.use_net = use_net
        self.graph = graph
        self.lookup = dict()
        self.lookup['f'] = self.graph['f'].look_up_dict
        self.lookup['g'] = self.graph['g'].look_up_dict
        self.look_back = dict()
        self.look_back['f'] = self.graph['f'].look_back_list
        self.look_back['g'] = self.graph['g'].look_back_list
        self.L = load_train_valid_labels(anchorfile, self.lookup, valid_prop)
        self.attributes = dict()
        if attr_file:
            self.attributes['f'] = self._set_node_attributes(attr_file[0])
            self.attributes['g'] = self._set_node_attributes(attr_file[1])
        self.neg_ratio = neg_ratio
        self.batch_size = 1024
        self.clf = svm.SVC(probability=True)

    def _set_node_attributes(self, attr_file):
        node_attributes = defaultdict(list)
        if not attr_file:
            return None
        with open(attr_file, 'r') as fin:
            for ln in fin:
                elems = ln.strip().split(',')
                node_attributes[elems[0]] = list(map(float, elems[1:]))
        return node_attributes

    def _get_pair_features(self, src_nds, target_nds):
        if len(src_nds) != len(target_nds):
            self.logger.warn(
                'The size of sampling in processing _get_pair_features is not equal.')
            return  # stop the generator instead of yielding an empty feature row
        for i in range(len(src_nds)):
            src_nd_idx, target_nd_idx = src_nds[i], target_nds[i]
            src_nd = self.look_back['f'][src_nd_idx]
            target_nd = self.look_back['g'][target_nd_idx]
            # neighbors that are themselves training anchors
            src_neighbor_anchors = set()
            for src_nd_to in self.graph['f'].G[src_nd]:
                if src_nd_to in self.L['f2g']['train']:
                    src_neighbor_anchors.add(src_nd_to)
            target_neighbor_anchors = set()
            for target_nd_to in self.graph['g'].G[target_nd]:
                if target_nd_to in self.L['g2f']['train']:
                    target_neighbor_anchors.add(target_nd_to)
            # common anchor neighbors, with an Adamic-Adar style weighting
            cnt_common_neighbors = .0
            AA_measure = .0
            for sna in src_neighbor_anchors:
                for k in range(len(self.L['f2g']['train'][sna])):
                    target_anchor_nd = self.L['f2g']['train'][sna][k]
                    if target_anchor_nd in target_neighbor_anchors:
                        cnt_common_neighbors += 1.
                        AA_measure += 1. / np.log(
                            (len(self.graph['f'].G[sna])
                             + len(self.graph['g'].G[self.L['f2g']['train'][sna][k]])) / 2.)
            jaccard = cnt_common_neighbors / (len(self.graph['f'].G[src_nd])
                                              + len(self.graph['g'].G[target_nd])
                                              - cnt_common_neighbors + 1e-6)
            feat_net = []
            feat_attr = []
            if self.use_net:
                feat_net = [cnt_common_neighbors, jaccard, AA_measure]
            if len(self.attributes) > 0:
                feat_len = len(self.attributes['f'][src_nd])
                feat_attr = [1 - self.attributes['f'][src_nd][k]
                             + self.attributes['g'][target_nd][k] for k in range(feat_len)]
            yield feat_net + feat_attr

    def train(self):
        batches_f2g = batch_iter(self.L, self.batch_size, self.neg_ratio, self.lookup, 'f', 'g')
        X = list()
        Y = list()
        for batch in batches_f2g:
            pos, neg = batch
            if not len(pos['f']) == len(pos['g']) or not len(neg['f']) == len(neg['g']):
                self.logger.info('The input label file goes wrong as the file format.')
                continue
            pos_features = list(self._get_pair_features(pos['f'], pos['g']))
            X.extend(pos_features)
            Y.extend([1 for m in range(len(pos_features))])
            for k in range(self.neg_ratio):
                neg_features = list(self._get_pair_features(neg['f'][k], neg['g'][k]))
                X.extend(neg_features)
                Y.extend([-1 for m in range(len(neg_features))])
        self.logger.info('Training Model...')
        print(len(X), len(X[0]), len(Y))
        self.clf.fit(X, Y)
        self.logger.info('Training score: %f' % self.clf.score(X, Y))
        self.logger.info('Complete Training process...')
class _MNA(object):
    def __init__(self, graph, anchorfile, valid_prop, neg_ratio, log_file):
        if os.path.exists('log/' + log_file + '.log'):
            os.remove('log/' + log_file + '.log')
        self.logger = LogHandler(log_file)

        if not isinstance(graph, dict):
            self.logger.error('The graph must contain src and target graphs.')
            return
        self.graph = graph
        self.lookup = dict()
        self.lookup['f'] = self.graph['f'].look_up_dict
        self.lookup['g'] = self.graph['g'].look_up_dict
        self.look_back = dict()
        self.look_back['f'] = self.graph['f'].look_back_list
        self.look_back['g'] = self.graph['g'].look_back_list
        self.L = load_train_valid_labels(anchorfile, self.lookup, valid_prop)
        self.neg_ratio = neg_ratio
        self.batch_size = 1024
        self.clf = svm.SVC(probability=True)

    def __get_pair_features(self, src_nds, target_nds):
        if len(src_nds) != len(target_nds):
            self.logger.warn(
                'The size of sampling in processing __get_pair_features is not equal.')
            return  # stop the generator instead of yielding an empty feature row
        for i in range(len(src_nds)):
            src_nd, target_nd = src_nds[i], target_nds[i]
            # neighbors that are themselves training anchors
            src_neighbor_anchors = set()
            for src_nd_to in self.graph['f'].G[self.look_back['f'][src_nd]]:
                if src_nd_to in self.L['f2g']['train']:
                    src_neighbor_anchors.add(src_nd_to)
            target_neighbor_anchors = set()
            for target_nd_to in self.graph['g'].G[self.look_back['g'][target_nd]]:
                if target_nd_to in self.L['g2f']['train']:
                    target_neighbor_anchors.add(target_nd_to)
            # common anchor neighbors, with an Adamic-Adar style weighting
            cnt_common_neighbors = .0
            AA_measure = .0
            for sna in src_neighbor_anchors:
                for k in range(len(self.L['f2g']['train'][sna])):
                    target_anchor_nd = self.L['f2g']['train'][sna][k]
                    if target_anchor_nd in target_neighbor_anchors:
                        cnt_common_neighbors += 1.
                        AA_measure += 1. / np.log(
                            (len(self.graph['f'].G[sna])
                             + len(self.graph['g'].G[self.L['f2g']['train'][sna][k]])) / 2.)
            jaccard = cnt_common_neighbors / (len(self.graph['f'].G[self.look_back['f'][src_nd]])
                                              + len(self.graph['g'].G[self.look_back['g'][target_nd]])
                                              - cnt_common_neighbors + 1e-6)
            yield [cnt_common_neighbors, jaccard, AA_measure]

    def train(self):
        batches_f2g = batch_iter(self.L, self.batch_size, self.neg_ratio, self.lookup, 'f', 'g')
        X = list()
        Y = list()
        for batch in batches_f2g:
            pos, neg = batch
            if not len(pos['f']) == len(pos['g']) or not len(neg['f']) == len(neg['g']):
                self.logger.info('The input label file goes wrong as the file format.')
                continue
            pos_features = list(self.__get_pair_features(pos['f'], pos['g']))
            X.extend(pos_features)
            Y.extend([1 for m in range(len(pos_features))])
            for k in range(self.neg_ratio):
                neg_features = list(self.__get_pair_features(neg['f'][k], neg['g'][k]))
                X.extend(neg_features)
                Y.extend([-1 for m in range(len(neg_features))])
        self.logger.info('Training Model...')
        self.clf.fit(X, Y)
        self.logger.info('Training score: %f' % self.clf.score(X, Y))
        self.logger.info('Complete Training process...')
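# The three pair features used by _MNA are counted over *anchor* neighbors:
# common (anchor-linked) neighbors, a smoothed Jaccard index, and an
# Adamic-Adar style score that down-weights high-degree anchors. A toy
# standalone computation under the same definitions (all inputs here are
# hypothetical, for illustration only):

import numpy as np

def pair_features(src_anchor_neighbors, tgt_anchor_neighbors, anchor_map,
                  deg_f, deg_g, src_degree, tgt_degree):
    """anchor_map: src-side anchor node -> list of its target-side counterparts."""
    common, aa = 0., 0.
    for a in src_anchor_neighbors:
        for b in anchor_map.get(a, []):
            if b in tgt_anchor_neighbors:
                common += 1.
                aa += 1. / np.log((deg_f[a] + deg_g[b]) / 2.)
    jaccard = common / (src_degree + tgt_degree - common + 1e-6)
    return [common, jaccard, aa]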
def main(args):
    t1 = time.time()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    logger = LogHandler('RUN.' + time.strftime('%Y-%m-%d', time.localtime(time.time())))
    logger.info(args)

    SAVING_STEP = args.saving_step
    MAX_EPOCHS = args.max_epochs
    files = {
        'feat-src': args.feature_src,
        'feat-end': args.feature_end,
        'linkage': args.identity_linkage
    }
    if args.method == 'half-sp':
        model = HALF_SP(learning_rate=args.lr, batch_size=args.batch_size,
                        neg_ratio=args.neg_ratio, gamma=args.gamma, eta=args.eta,
                        n_input=args.input_size, n_out=args.output_size,
                        n_hidden=args.hidden_size, n_layer=args.layers,
                        is_valid=args.is_valid, files=files,
                        type_model=args.type_model, log_file=args.log_file,
                        device=args.device)
    if args.method == 'half-dp':
        model = HALF_DP(learning_rate=args.lr, batch_size=args.batch_size,
                        neg_ratio=args.neg_ratio, gamma=args.gamma, eta=args.eta,
                        n_input=args.input_size, n_out=args.output_size,
                        n_hidden=args.hidden_size, n_layer=args.layers,
                        is_valid=args.is_valid, files=files,
                        type_model=args.type_model, log_file=args.log_file,
                        device=args.device)

    losses = np.zeros(MAX_EPOCHS)
    val_scrs = np.zeros(MAX_EPOCHS)
    best_scr = .0
    best_epoch = 0
    thres = 3
    for i in range(1, MAX_EPOCHS + 1):
        losses[i - 1], val_scrs[i - 1] = model.train_one_epoch()
        if i % SAVING_STEP == 0:
            loss_mean = np.mean(losses[i - SAVING_STEP:i])
            scr_mean = np.mean(val_scrs[i - SAVING_STEP:i])
            logger.info('loss in last {} epochs: {}, validation in last {} epochs: {}'.format(
                SAVING_STEP, loss_mean, SAVING_STEP, scr_mean))
            if scr_mean > best_scr:
                best_scr = scr_mean
                best_epoch = i
                model.save_models(args.output)
            # early stop once the last `thres` validation windows all fail to
            # beat the best score and the best epoch is far enough in the past
            if args.early_stop and i >= thres * SAVING_STEP:
                cnt = 0
                for k in range(thres - 1, -1, -1):
                    cur_val = np.mean(val_scrs[i - (k + 1) * SAVING_STEP:i - k * SAVING_STEP])
                    if cur_val <= best_scr:
                        cnt += 1
                if cnt == thres and (i - best_epoch) >= thres * SAVING_STEP:
                    logger.info('*********early stop*********')
                    logger.info('The best epoch: {}\nThe validation score: {}'.format(
                        best_epoch, best_scr))
                    break
    t2 = time.time()
    print('time cost:', t2 - t1)
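# A hypothetical invocation of this entry point (the argparse setup is not
# shown in this file, so the flag names below are assumptions inferred from
# the attributes read off `args`):
#
#   python main.py --method half-dp --feature-src f.emb --feature-end g.emb \
#       --identity-linkage anchors.txt --lr 0.001 --batch-size 512 \
#       --max-epochs 2000 --saving-step 10 --early-stop --gpu-id 0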