def main(cfg):
    pprint.pprint(cfg)
    mkdir(cfg.checkpoint)
    mkdir(cfg.codefolder)
    with tl.session() as sess:
        dcmh = Model(sess, cfg)
        dcmh.train()
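# Usage sketch (not part of the original file): main(cfg) only reads cfg by
# attribute access, so an argparse.Namespace is enough to drive it. The flag
# names below are illustrative assumptions, not the project's real CLI.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint", type=str, default="./checkpoint")   # hypothetical flag
    parser.add_argument("--codefolder", type=str, default="./code_backup")  # hypothetical flag
    main(parser.parse_args())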
def test(**kwargs):
    if 'dataset' not in kwargs:
        opt = getattr(config, 'Gourmet_Food_data_Config')()
    else:
        opt = getattr(config, kwargs['dataset'] + '_Config')()
    opt.parse(kwargs)

    logging.basicConfig(
        filename=f"logs/{opt}.log",
        filemode="w",
        format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
        datefmt="%d-%m-%Y %H:%M:%S",
        level=logging.DEBUG)

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    if len(opt.gpu_ids) == 0 and opt.use_gpu:
        torch.cuda.set_device(opt.gpu_id)

    model = Model(opt, getattr(models, opt.model)).cuda()
    print("load...")
    model.load(
        "./checkpoints/DPHP_Gourmet_Food_data_cfg-Gourmet_Food_data-poolatt-lr0.001-wd0.0005-drop0.1-id32-hidden100.pth"
    )

    test_data = ReviewData(opt.data_root, mode="Test")
    test_data_loader = DataLoader(test_data,
                                  batch_size=opt.batch_size,
                                  shuffle=False,
                                  collate_fn=collate_fn)
    auc, corr, predict_loss = predict(model, test_data_loader, opt, logging)
def generate_conditional_sentence(**kwargs):
    if 'dataset' not in kwargs:
        opt = getattr(config, 'AmazonDigitalMusic_Config')()
    else:
        opt = getattr(config, kwargs['dataset'] + '_Config')()
    opt.parse(kwargs)
    assert len(opt.pth_path) > 0

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    if len(opt.gpu_ids) == 0 and opt.use_gpu:
        torch.cuda.set_device(opt.gpu_id)

    model = Model(opt, getattr(models, opt.model))
    if opt.use_gpu:
        model.cuda()
        if len(opt.gpu_ids) > 0:
            model = nn.DataParallel(model, device_ids=opt.gpu_ids)

    if model.net.num_fea != opt.num_fea:
        raise ValueError(f"the num_fea of {opt.model} is wrong, please specify --num_fea={model.net.num_fea}")

    model.load(opt.pth_path)
    print(f"load model: {opt.pth_path}")

    test_data = ReviewData(opt.data_root, mode="Test")
    test_data_loader = DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=collate_fn)

    print(f"{now()}: generating conditional sentence...")
    model.eval()
    with torch.no_grad():
        user_review_dict = np.load("./dataset/AmazonDigitalMusic/train/plainUserReviews.npy",
                                   allow_pickle=True).item()
        item_review_dict = np.load("./dataset/AmazonDigitalMusic/train/plainItemReviews.npy",
                                   allow_pickle=True).item()
        cnt = 10
        for idx, (test_input, scores) in enumerate(test_data_loader):
            if idx == cnt:
                test_input = unpack_input(opt, test_input)
                output = model(test_input, mode="Generate")

                uid = test_input[2].item()
                user_reviews = user_review_dict[uid]
                iid = test_input[3].item()
                item_reviews = item_review_dict[iid]

                imp_user_review_id = output[0].cpu().numpy().squeeze()
                imp_user_review_id = np.argmax(imp_user_review_id)
                print(user_reviews[imp_user_review_id])

                imp_item_review_id = output[1].cpu().numpy().squeeze()
                imp_item_review_id = np.argmax(imp_item_review_id)
                print(item_reviews[imp_item_review_id])
                break
def test(**kwargs):
    if 'dataset' not in kwargs:
        opt = getattr(config, 'AmazonDigitalMusic_Config')()
    else:
        opt = getattr(config, kwargs['dataset'] + '_Config')()
    opt.parse(kwargs)
    assert len(opt.pth_path) > 0

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    if len(opt.gpu_ids) == 0 and opt.use_gpu:
        torch.cuda.set_device(opt.gpu_id)

    model = Model(opt, getattr(models, opt.model))
    if opt.use_gpu:
        model.cuda()
        if len(opt.gpu_ids) > 0:
            model = nn.DataParallel(model, device_ids=opt.gpu_ids)

    if model.net.num_fea != opt.num_fea:
        raise ValueError(f"the num_fea of {opt.model} is wrong, please specify --num_fea={model.net.num_fea}")

    model.load(opt.pth_path)
    print(f"load model: {opt.pth_path}")

    test_data = ReviewData(opt.data_root, mode="Test")
    test_data_loader = DataLoader(test_data,
                                  batch_size=opt.batch_size,
                                  shuffle=False,
                                  collate_fn=collate_fn)
    print(f"{now()}: test on the test dataset")
    predict_loss, test_mse, test_mae = predict(model, test_data_loader, opt)
def static_process(args):
    sample_rate = 1  # 0.5

    # load email-eu data
    data, n, m = load_email_eu(args.input, sample_rate)

    # STEP 0: Parameters
    hidden_size = args.representation_size  # size of hidden codes to learn, default is 20
    activation = tf.nn.sigmoid
    dimension = [n, hidden_size]
    rho = 0.5             # sparsity ratio
    lamb = 0.0017         # weight decay
    beta = 1              # sparsity weight
    gama = 340            # autoencoder weight
    walk_len = args.walk_length
    epoch = 30            # number of epochs for optimizing, could be larger
    batch_size = 40       # should be smaller or equal to args.number_walks*n
    learning_rate = 0.01  # learning rate: for adam use 0.01, for rmsprop use 0.1
    optimizer = "adam"    # alternatives: "rmsprop", "gd", "lbfgs"
    corrupt_prob = [0]    # corrupt probability, for denoising AE
    ini_graph_percent = args.init_percent  # percent of edges in the initial graph
    anomaly_percent = 0.2  # percentage of anomaly edges in the testing edges
    alfa = 0.01  # updating parameter for online k-means to update clustering centroids
    k = 3        # number of clusters for k-means to cluster edges

    # STEP 1: Preparing data: training data and testing list of edges (for online updating)
    synthetic_test, train_mat, train = anomaly_generation(
        ini_graph_percent, anomaly_percent, data, n, m)
    data_zip = []
    data_zip.append(synthetic_test)
    data_zip.append(train)

    # generating initial training walks
    netwalk = NetWalk_update(data_zip, walk_per_node=args.number_walks,
                             walk_len=args.walk_length, init_percent=args.init_percent,
                             snap=args.snap)
    ini_data = netwalk.getInitWalk()

    embModel = MD.Model(activation, dimension, walk_len, n, gama, lamb, beta, rho,
                        epoch, batch_size, learning_rate, optimizer, corrupt_prob)

    # STEP 2: Learning initial embeddings for training edges
    embedding = getEmbedding(embModel, ini_data, n)

    # dynamically plot the anomaly score over different snapshots
    d_plot = DP.DynamicUpdate()

    # conduct anomaly detection using the first snapshot of testing edges
    scores, auc, n0, c0, res, ab_score = anomaly_detection(
        embedding, train, synthetic_test[0:args.snap, :], k)

    print('initial auc of anomaly detection:', auc)
    print('initial anomaly score:', ab_score)

    # visualize anomaly score
    d_plot.addPoint(1, ab_score)

    # STEP 3: over different snapshots of edges, dynamically update embeddings of nodes, conduct
    # online anomaly detection for edges, and visualize the anomaly score of each snapshot
    snapshotNum = 1
    while netwalk.hasNext():
        snapshot_data = netwalk.nextOnehotWalks()
        embedding = getEmbedding(embModel, snapshot_data, n)

        if netwalk.hasNext():
            if len(synthetic_test) > args.snap * (snapshotNum + 1):
                test_piece = synthetic_test[args.snap * snapshotNum:args.snap * (snapshotNum + 1), :]
            else:
                test_piece = synthetic_test[args.snap * snapshotNum:, :]
                # return
        else:
            return

        # online anomaly detection, each execution will update the clustering center
        scores, auc, n0, c0, res, ab_score = anomaly_detection_stream(
            embedding, train, test_piece, k, alfa, n0, c0)

        print('auc of anomaly detection at snapshot %d: %f' % (snapshotNum, auc))
        print('anomaly score at snapshot %d: %f' % (snapshotNum, ab_score))

        snapshotNum += 1

        # visualizing anomaly score of current snapshot
        d_plot.addPoint(snapshotNum, ab_score)
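# Driver sketch (an assumption, not from the original file): static_process(args)
# only touches args.input, args.representation_size, args.walk_length,
# args.number_walks, args.init_percent and args.snap, so a minimal argparse
# front end could look like this; the defaults are illustrative.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=str, required=True, help="path to the edge list file")
    parser.add_argument("--representation_size", type=int, default=20)
    parser.add_argument("--walk_length", type=int, default=3)
    parser.add_argument("--number_walks", type=int, default=20)
    parser.add_argument("--init_percent", type=float, default=0.5)
    parser.add_argument("--snap", type=int, default=100)
    static_process(parser.parse_args())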
def build_model(model):
    """Build a Tensorflow graph for the QA model.

    Return a model.Model for training, evaluation, etc.
    """
    with tf.name_scope("Inputs"):
        questions = tf.placeholder(
            tf.int32, name="Questions", shape=[None, None])
        documents = tf.placeholder(
            tf.int32, name="Documents", shape=[None, None])
        same_as_question_feature = tf.placeholder(
            tf.float32, name="SameAsQuestionFeature", shape=[None, None])
        repeated_words = tf.placeholder(
            tf.float32, name="RepeatedWordFeature", shape=[None, None])
        repeated_word_intensity = tf.placeholder(
            tf.float32, name="RepeatedWordIntensity", shape=[None, None])
        sentence_lengths = tf.placeholder(
            tf.int32, name="SentenceOffsets", shape=[None, None])
        sentence_labels = tf.placeholder(
            tf.int32, name="SentenceLabels", shape=[None])
        word_start_labels = tf.placeholder(
            tf.int32, name="WordStartLabels", shape=[None])
        word_end_labels = tf.placeholder(
            tf.int32, name="WordEndLabels", shape=[None])
        embedding_dropout = tf.placeholder_with_default(
            model.embedding_dropout_prob, shape=[])
        hidden_dropout = tf.placeholder_with_default(
            model.hidden_dropout_prob, shape=[])
        training = tf.placeholder_with_default(
            True, shape=[], name="TrainingIndicator")
        exact_match = tf.placeholder(
            tf.float32, name="ExactMatch", shape=[])
        f1 = tf.placeholder(
            tf.float32, name="F1", shape=[])

    with tf.variable_scope("GloveEmbeddings"):
        embeddings = tf.get_variable(
            shape=[model.vocab_size, EMBEDDING_DIM],
            initializer=tf.zeros_initializer(),
            trainable=False,
            name="GloveEmbeddings")
        embedding_placeholder = tf.placeholder(
            tf.float32, [model.vocab_size, EMBEDDING_DIM])
        embedding_init = embeddings.assign(embedding_placeholder)

    with tf.name_scope("QuestionEmbeddings"):
        question_vector = featurize_question(
            model, questions, embedding_dropout, training)

    with tf.name_scope("DocumentEmbeddings"):
        document_embeddings = featurize_document(
            model, questions, documents, same_as_question_feature,
            repeated_words, repeated_word_intensity, question_vector,
            embedding_dropout, training)

    # Keep track of the beam state at each decision point
    beam_states = []

    with tf.name_scope("PickSentence"):
        sentence_scores = score_sentences(
            model, document_embeddings, sentence_lengths, hidden_dropout)
        beam_states.append(([], tf.expand_dims(sentence_scores, 1)))
        beam_scores, sentence_picks = tf.nn.top_k(
            sentence_scores,
            k=tf.minimum(model.beam_size, tf.shape(sentence_scores)[1]),
            sorted=True)
        sentence_correct = tf.reduce_mean(
            tf.cast(tf.equal(sentence_labels, sentence_picks[:, 0]), tf.float32))

    with tf.name_scope("PickStartWord"):
        start_word_scores = score_start_word(
            model, document_embeddings, sentence_picks, sentence_lengths,
            hidden_dropout)
        beam_scores = tf.expand_dims(beam_scores, 2) + start_word_scores
        beam_states.append(([sentence_picks], beam_scores))
        beam_scores, kept_sentences, start_words = ops.prune_beam(
            beam_scores, sentence_picks, model.beam_size)
        start_word_correct = tf.reduce_mean(
            tf.cast(tf.logical_and(
                tf.equal(word_start_labels, start_words[:, 0]),
                tf.equal(sentence_labels, kept_sentences[:, 0])), tf.float32))

    with tf.name_scope("PickEndWord"):
        end_word_scores = score_end_words(
            model, document_embeddings, kept_sentences, start_words,
            sentence_lengths, hidden_dropout, training)
        beam_scores = tf.expand_dims(beam_scores, 2) + end_word_scores
        beam_states.append(([kept_sentences, start_words], beam_scores))
        beam_scores, (kept_sentences, kept_start_words), end_words = ops.prune_beam(
            beam_scores, [kept_sentences, start_words], model.beam_size)

        # Also track the final decisions.
        beam_states.append(
            ([kept_sentences, kept_start_words, end_words], beam_scores))

        # Get offset from start word
        end_word_picks = kept_start_words + end_words
        final_states = [kept_sentences, kept_start_words, end_word_picks]
        end_word_correct = tf.reduce_mean(
            tf.cast(tf.logical_and(
                tf.logical_and(
                    tf.equal(word_end_labels, end_word_picks[:, 0]),
                    tf.equal(word_start_labels, kept_start_words[:, 0])),
                tf.equal(sentence_labels, kept_sentences[:, 0])), tf.float32))

    with tf.name_scope("Loss"):
        # End prediction is based on the start word offset.
        end_labels = word_end_labels - word_start_labels
        labels = (sentence_labels, word_start_labels, end_labels)
        loss = globally_normalized_loss(beam_states, labels)
        l2_penalty = tf.contrib.layers.apply_regularization(
            tf.contrib.layers.l2_regularizer(model.l2_scale),
            tf.trainable_variables())
        loss += l2_penalty

    with tf.name_scope("TrainStep"):
        iteration, (step, loss, gradnorm) = ops.default_train_step(model, loss)

    with tf.name_scope("TrainSummary"):
        train_summary = ops.scalar_summaries({
            "Train-Loss": loss,
            "Gradient-Norm": gradnorm,
            "Sentence-Correct": sentence_correct,
            "Start-Word-Correct": start_word_correct,
            "End-Word-Correct": end_word_correct})

    with tf.name_scope("ValidSummary"):
        valid_summary = ops.scalar_summaries({
            "Validation-Loss": loss,
            "Sentence-Correct": sentence_correct,
            "Start-Word-Correct": start_word_correct,
            "End-Word-Correct": end_word_correct})

    with tf.name_scope("SquadSummary"):
        squad_summary = ops.scalar_summaries({
            "Exact-Match": exact_match,
            "F1": f1})

    return Model(
        inputs=[questions, documents, same_as_question_feature,
                repeated_words, repeated_word_intensity, sentence_lengths,
                sentence_labels, word_start_labels, word_end_labels],
        outputs=[kept_sentences, kept_start_words, end_word_picks,
                 sentence_correct, start_word_correct, end_word_correct],
        loss=loss,
        training=training,
        dropout=[embedding_dropout, hidden_dropout],
        gradnorm=gradnorm,
        step=step,
        iteration=iteration,
        train_summary=train_summary,
        valid_summary=valid_summary,
        embedding_init=embedding_init,
        embedding_placeholder=embedding_placeholder,
        squad_summary=squad_summary,
        squad_inputs=[exact_match, f1])
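# Usage sketch (an assumption, not shown in the original file): build_model()
# keeps the GloVe table frozen at zeros and exposes embedding_init /
# embedding_placeholder for filling it in once per session. Assuming the
# returned Model exposes its constructor kwargs as attributes and that
# `glove_matrix` is a hypothetical [vocab_size, EMBEDDING_DIM] numpy array,
# a TF1-style setup could look like this.
qa = build_model(model)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # copy the pre-trained vectors into the non-trainable embedding variable
    sess.run(qa.embedding_init,
             feed_dict={qa.embedding_placeholder: glove_matrix})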
    type=str,
    default='Sma',
    help='KungFu parallel optimizer, available options: Sync_sgd, Async_sgd, Sma')
parser.add_argument("--output_dir",
                    type=str,
                    default="save_dir",
                    help="which dir to output the exported pb model")

args = parser.parse_args()

Config.set_model_name(args.model_name)
Config.set_model_type(Config.MODEL[args.model_type])
Config.set_model_backbone(Config.BACKBONE[args.model_backbone])
config = Config.get_config()
export_model = Model.get_model(config)

input_path = f"{config.model.model_dir}/newest_model.npz"
output_dir = f"{args.output_dir}/{config.model.model_name}"
output_path = f"{output_dir}/frozen_{config.model.model_name}.pb"
print(f"exporting model {config.model.model_name} from {input_path}...")

if not os.path.exists(output_dir):
    print("creating output_dir...")
    os.mkdir(output_dir)
if not os.path.exists(input_path):
    print("input model file doesn't exist!")
    print("conversion aborted!")
else:
    export_model.load_weights(input_path)
    export_model.eval()
    if export_model.data_format == "channels_last":
def static_process(args):
    # STEP 0: Parameters
    hidden_size = args.representation_size  # size of hidden codes to learn, default is 20
    activation = tf.nn.sigmoid
    rho = 0.5            # sparsity ratio
    lamb = 0.0017        # weight decay
    beta = 1             # sparsity weight
    gama = 340           # autoencoder weight
    walk_len = args.walk_length
    epoch = 400          # number of epochs for optimizing, could be larger
    batch_size = 20
    learning_rate = 0.1  # learning rate: for adam use 0.01, for rmsprop use 0.1
    optimizer = "rmsprop"  # alternatives: "adam", "gd", "lbfgs"
    corrupt_prob = [0]   # corrupt probability, for denoising AE

    # STEP 1: Preparing data: training data and testing list of edges (for online updating)
    data_path = args.input
    netwalk = NetWalk_update(data_path, walk_per_node=args.number_walks,
                             walk_len=args.walk_length, init_percent=args.init_percent,
                             snap=args.snap)
    n = len(netwalk.vertices)  # number of total nodes

    print("{} Number of nodes: {}".format(print_time(), n))
    print("{} Number of walks: {}".format(print_time(), args.number_walks))
    print("{} Data size (walks*length): {}".format(
        print_time(), args.number_walks * args.walk_length))
    print("{} Generating network walks...".format(print_time()))
    print("{} Clique embedding training...".format(print_time()))

    dimension = [n, hidden_size]
    embModel = MD.Model(activation, dimension, walk_len, n, gama, lamb, beta, rho,
                        epoch, batch_size, learning_rate, optimizer, corrupt_prob)

    init_edges, snapshots = netwalk.data
    data = netwalk.getInitWalk()

    fig = plt.figure(figsize=(12, 12))

    # STEP 2: Learning initial embeddings for training edges
    embedding_code(embModel, data, n, args)

    # load karate club graph
    G = nx.karate_club_graph()
    edge_list = G.edges()

    # list of initial edge list tuples
    tuples = tuple(map(tuple, init_edges - 1))

    # complementary set of edges for initial edges
    rm_list = [x for x in edge_list if x not in tuples]

    # visualize initial embedding
    viz_stream(rm_list, fig, 5, 2, 1)

    # STEP 3: over different snapshots of edges, dynamically update embeddings of nodes, conduct
    # online anomaly detection for edges, and visualize the anomaly score of each snapshot
    snapshotNum = 0
    while netwalk.hasNext():
        data = netwalk.nextOnehotWalks()
        tuples = tuple(map(tuple, snapshots[snapshotNum] - 1)) + tuples
        snapshotNum += 1
        embedding_code(embModel, data, n, args)
        rm_list = [x for x in edge_list if x not in tuples]
        viz_stream(rm_list, fig, 5, 2, snapshotNum + 1)

    plt.show()
    print("finished")
def train(**kwargs):
    if 'dataset' not in kwargs:
        opt = getattr(config, 'AmazonDigitalMusic_Config')()
    else:
        opt = getattr(config, kwargs['dataset'] + '_Config')()
    opt.parse(kwargs)

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    if len(opt.gpu_ids) == 0 and opt.use_gpu:
        torch.cuda.set_device(opt.gpu_id)

    model = Model(opt, getattr(models, opt.model))
    if opt.use_gpu:
        model.cuda()
        if len(opt.gpu_ids) > 0:
            model = nn.DataParallel(model, device_ids=opt.gpu_ids)

    if model.net.num_fea != opt.num_fea:
        raise ValueError(f"the num_fea of {opt.model} is wrong, please specify --num_fea={model.net.num_fea}")

    # 3 data
    train_data = ReviewData(opt.data_root, mode="Train")
    train_data_loader = DataLoader(train_data, batch_size=opt.batch_size, shuffle=True, collate_fn=collate_fn)
    val_data = ReviewData(opt.data_root, mode="Val")
    val_data_loader = DataLoader(val_data, batch_size=opt.batch_size, shuffle=False, collate_fn=collate_fn)
    print(f'train data: {len(train_data)}; test data: {len(val_data)}')

    optimizer = optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)

    # training
    print("start training....")
    min_loss = 1e+10
    best_res = 1e+10
    mse_func = nn.MSELoss()
    mae_func = nn.L1Loss()
    smooth_mae_func = nn.SmoothL1Loss()

    train_mse_list = []
    val_mse_list = []
    val_mae_list = []

    for epoch in range(opt.num_epochs):
        total_loss = 0.0
        total_maeloss = 0.0
        model.train()
        print(f"{now()} Epoch {epoch}...")
        for idx, (train_datas, scores) in enumerate(train_data_loader):
            if opt.use_gpu:
                scores = torch.FloatTensor(scores).cuda()
            else:
                scores = torch.FloatTensor(scores)
            train_datas = unpack_input(opt, train_datas)

            optimizer.zero_grad()
            output = model(train_datas)
            mse_loss = mse_func(output, scores)
            total_loss += mse_loss.item() * len(scores)

            mae_loss = mae_func(output, scores)
            total_maeloss += mae_loss.item()
            smooth_mae_loss = smooth_mae_func(output, scores)
            if opt.loss_method == 'mse':
                loss = mse_loss
            if opt.loss_method == 'rmse':
                loss = torch.sqrt(mse_loss) / 2.0
            if opt.loss_method == 'mae':
                loss = mae_loss
            if opt.loss_method == 'smooth_mae':
                loss = smooth_mae_loss
            loss.backward()
            optimizer.step()

            if opt.fine_step:
                if idx % opt.print_step == 0 and idx > 0:
                    print("\t{}, {} step finished;".format(now(), idx))
                    val_loss, val_mse, val_mae = predict(model, val_data_loader, opt)
                    if val_loss < min_loss:
                        model.save(name=opt.dataset, opt=opt.print_opt)
                        min_loss = val_loss
                        print("\tmodel save")
                    if val_loss > min_loss:
                        best_res = min_loss

        scheduler.step()
        mse = total_loss * 1.0 / len(train_data)
        print(f"\ttrain data: loss:{total_loss:.4f}, mse: {mse:.4f};")

        val_loss, val_mse, val_mae = predict(model, val_data_loader, opt)
        train_mse_list.append(mse)
        val_mse_list.append(val_mse)
        val_mae_list.append(val_mae)

        if val_loss < min_loss:
            model.save(name=opt.dataset, opt=opt.print_opt)
            min_loss = val_loss
            print("model save")
        if val_mse < best_res:
            best_res = val_mse
        print("*" * 30)

    print("----" * 20)
    print(f"{now()} {opt.dataset} {opt.print_opt} best_res: {best_res}")
    print("----" * 20)
    print("Train MSE:", train_mse_list)
    print("Val MSE:", val_mse_list)
    print("Val MAE:", val_mae_list)
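# Entry-point sketch (an assumption, not shown in the snippet): the **kwargs
# signature of train()/test() pairs naturally with the python-fire library,
# which turns keyword arguments into command-line flags; whether this repo
# actually uses fire is an assumption.
if __name__ == "__main__":
    import fire
    fire.Fire()  # e.g. python main.py train --dataset=AmazonDigitalMusic --num_fea=2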
                  help='ckpt index')
parser.add_option('--ckpt_dir', dest='ckpt_dir', default='ckpt', help='ckpt')
(options, args) = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = options.gpu

p2id = json.load(open("../data/p2id.json"))
word2id = json.load(open('../data/word2id.json'))
id2word = {}
for key in word2id:
    id2word[word2id[key]] = key

config = Config()
if options.mode == 'people':
    test = Test(config, options.ckpt_dir, id2word)
    test.init_test(Model(config), options.ckpt_index)
    print('Please input:')
    line = ''
    is_continue = False
    while line != 'stop':
        line = input()
        pins = line.strip().split()
        query = np.ones([1, config.seq_len], dtype=np.int32)
        target_seq_len = [0]
        target_seq_len[0] = len(pins)
        for i, pin in enumerate(pins):
            if pin not in p2id:
                is_continue = True
                print('Invalid input!')
                break
def static_process(representation_size, walk_length, input, number_walks,
                   init_percent, snap, output, datasetname):
    # region Parameters
    hidden_size = representation_size  # size of hidden codes to learn, default is 20
    activation = tf.nn.sigmoid
    rho = 0.5            # sparsity ratio
    lamb = 0.0017        # weight decay
    beta = 1             # sparsity weight
    gama = 340           # autoencoder weight
    walk_len = walk_length
    epoch = 400          # number of epochs for optimizing, could be larger
    batch_size = 20
    learning_rate = 0.1  # learning rate: for adam use 0.01, for rmsprop use 0.1
    optimizer = "rmsprop"  # alternatives: "adam", "gd", "lbfgs"
    corrupt_prob = [0]   # corrupt probability, for denoising AE
    # endregion

    # region STEP 1: Preparing data: training data and testing list of edges (for online updating)
    data_path = input
    netwalk = NetWalk_update(data_path, walk_per_node=number_walks, walk_len=walk_length,
                             init_percent=init_percent, snap=snap)
    n = len(netwalk.vertices)  # number of total nodes
    # endregion

    print("{} Number of nodes: {}".format(print_time(), n))
    print("{} Number of walks: {}".format(print_time(), number_walks))
    print("{} Data size (walks*length): {}".format(print_time(), number_walks * walk_length))
    print("{} Generating network walks...".format(print_time()))
    print("{} Clique embedding training...".format(print_time()))

    dimension = [n, hidden_size]
    embModel = MD.Model(activation, dimension, walk_len, n, gama, lamb, beta, rho,
                        epoch, batch_size, learning_rate, optimizer, corrupt_prob)

    init_edges, snapshots, edges = netwalk.data
    edges = edges - 1  # karate

    # G = nx.Graph()
    # G.add_edges_from(edges)
    # vertices = np.unique(edges)

    G = nx.Graph()
    G.add_edges_from(edges)
    clusteringAccuracy = []
    edge_list = tuple(map(tuple, edges))

    data = netwalk.getInitWalk()
    fig = plt.figure(figsize=(12, 12))

    # STEP 2: Learning initial embeddings for training edges
    embeddings = embedding_code(embModel, data, n, output)

    # list of initial edge list tuples
    tuples = tuple(map(tuple, init_edges - 1))  # karate
    # tuples = tuple(map(tuple, init_edges))

    # complementary set of edges for initial edges
    rm_list = [x for x in edge_list if x not in tuples]

    # visualize initial embedding
    clusteringAccuracy = viz_stream(G, rm_list, fig, 5, 2, 1, output,
                                    "./tmp/membership_" + datasetname + ".txt",
                                    representation_size, clusteringAccuracy)

    # STEP 3: over different snapshots of edges, dynamically update embeddings of nodes, conduct
    # online anomaly detection for edges, and visualize the anomaly score of each snapshot
    snapshotNum = 0
    while netwalk.hasNext():
        G = nx.Graph()
        G.add_edges_from(edges)
        data = netwalk.nextOnehotWalks()
        tuples = tuple(map(tuple, snapshots[snapshotNum] - 1)) + tuples
        snapshotNum += 1
        embedding_code(embModel, data, n, output)
        rm_list = [x for x in edge_list if x not in tuples]
        clusteringAccuracy = viz_stream(G, rm_list, fig, 5, 2, snapshotNum + 1, output,
                                        "./tmp/membership_" + datasetname + ".txt",
                                        representation_size)

    print(clusteringAccuracy)
    # plt.show()
    fig.savefig('../plots/graph_' + datasetname + '.png')

    f = open('./tmp/accuracy_' + datasetname + '.txt', 'a+')
    f.write("dimension is " + str(dimension))
    f.write("\n")
    for acc in clusteringAccuracy:
        f.write(str(acc))
        f.write("\n")
    f.write("\n")
    f.write("\n")
    # np.savetxt(f, clusteringAccuracy, fmt="%g")
    print("finished")
def static_process(representation_size, walk_length, input, number_walks,
                   init_percent, snap, output, datasetname):
    # region Preprocess the data (change directed to undirected / remove self loops / remove duplicate edges)
    sample_rate = 1  # 0.5
    data, n, m = load_email_eu(input, sample_rate)
    # endregion

    # region Parameters
    hidden_size = representation_size  # size of hidden codes to learn, default is 20
    dimension = [n, hidden_size]
    activation = tf.nn.sigmoid
    rho = 0.5             # sparsity ratio
    lamb = 0.0017         # weight decay
    beta = 1              # sparsity weight
    gama = 340            # autoencoder weight
    walk_len = walk_length  # length of each walk
    epoch = 50            # number of epochs for optimizing, could be larger
    batch_size = 40       # should be smaller or equal to number_walks*n
    learning_rate = 0.01  # learning rate: for adam use 0.01, for rmsprop use 0.1
    optimizer = "adam"    # alternatives: "rmsprop", "gd", "lbfgs"
    corrupt_prob = [0]    # corrupt probability, for denoising AE
    ini_graph_percent = init_percent  # percent of edges in the initial graph
    alfa = 0.01  # 0.5 in the paper; updating parameter for online k-means to update clustering centroids

    if datasetname == "karate":
        anomaly_percent = 0.3
        k = 4
    elif datasetname == "toy":
        anomaly_percent = 1
        k = 2
    elif datasetname == "cora":
        anomaly_percent = 0.1
        k = 7
    elif datasetname == "citeseer":
        anomaly_percent = 0.1
        k = 6
    elif datasetname == "dolphin":
        anomaly_percent = 0.1
        k = 3
    print("No of Clusters in Dataset " + str(datasetname) + " is " + str(k))
    # endregion

    # region STEP 1: Generate anomaly data: training data and testing list of edges (for online updating)
    membership_path = "./tmp/membership_" + datasetname + ".txt"
    # synthetic_test, train_mat, train = anomaly_generation(ini_graph_percent, anomaly_percent, data, n, m, membership_path)
    synthetic_test, train_mat, train = anomaly_generation(
        0.8, anomaly_percent, data, n, m, membership_path)
    data_zip = []
    data_zip.append(synthetic_test)
    data_zip.append(train)
    # endregion

    # region generating initial training walks
    netwalk = NetWalk_update(data_zip, walk_per_node=number_walks, walk_len=walk_length,
                             init_percent=init_percent, snap=snap)
    ini_data = netwalk.getInitWalk()
    print(np.shape(ini_data[0]))
    # endregion

    # region Initialise Model
    embModel = MD.Model(activation, dimension, walk_len, n, gama, lamb, beta, rho,
                        epoch, batch_size, learning_rate, optimizer, corrupt_prob)
    # endregion

    # region STEP 2: Learning initial embeddings for training edges
    embedding = getEmbedding(embModel, ini_data, n)
    # endregion

    # region conduct anomaly detection using first snapshot of testing edges
    areaUnderCurve = []
    xValue = []
    # test_piece = synthetic_test[0:snap, :]
    test_piece = synthetic_test
    scores, auc, n0, c0, res, ab_score = anomaly_detection(
        embedding, train, test_piece, k)
    areaUnderCurve.append(auc)
    xValue.append(0)
    # scores, auc, n0, c0, res, ab_score = anomaly_detection(embedding, train, synthetic_test, k)
    print('initial auc of anomaly detection:', auc)
    print('initial anomaly score:', ab_score)
    # endregion

    # region Online Increment
    # STEP 3: over different snapshots of edges, dynamically update embeddings of nodes, conduct
    # online anomaly detection for edges, and visualize the anomaly score of each snapshot
    snapshotNum = 1
    while netwalk.hasNext():
        # region Include next walks dynamically and find embedding
        snapshot_data = netwalk.nextOnehotWalks()
        embedding = getEmbedding(embModel, snapshot_data, n)
        # endregion

        # if netwalk.hasNext():
        #     if len(synthetic_test) > snap * (snapshotNum + 1):
        #         # test_piece = synthetic_test[snap * snapshotNum:snap * (snapshotNum + 1), :]
        #         test_piece = synthetic_test[:snap * (snapshotNum + 1), :]
        #     else:
        #         test_piece = synthetic_test

        # online anomaly detection, each execution will update the clustering center
        scores, auc, n0, c0, res, ab_score = anomaly_detection_stream(
            embedding, train, test_piece, k, alfa, n0, c0)

        print('auc of anomaly detection at snapshot %d: %f' % (snapshotNum, auc))
        print('anomaly score at snapshot %d: %f' % (snapshotNum, ab_score))
        areaUnderCurve.append(auc)
        xValue.append(snapshotNum)

        snapshotNum += 1

    # scores, auc, n0, c0, res, ab_score = anomaly_detection_stream(embedding, train, test_piece, k, alfa, n0, c0)
    # print('Final auc of anomaly detection at snapshot %d: %f' % (snapshotNum, auc))
    # print('Final anomaly score at snapshot %d: %f' % (snapshotNum, ab_score))

    plt.plot(xValue, areaUnderCurve)
    plt.yticks(np.arange(0, 100, 5))
    plt.savefig('../plots/anomalyaccuracy_' + datasetname +
                str(datetime.datetime.now()) + '.png')
def static_process(representation_size, walk_length, input, number_walks,
                   init_percent, snap, output, datasetname):
    # region Preprocess the data (change directed to undirected / remove self loops / remove duplicate edges)
    sample_rate = 1  # 0.5
    data, n, m = load_email_eu(input, sample_rate)
    GraphEdges, trainData, trainLabels, testData, testLabels = preprocessGraph(
        data, 0.7, n, m)
    # endregion

    # region Parameters
    hidden_size = representation_size  # size of hidden codes to learn, default is 20
    dimension = [n, hidden_size]
    activation = tf.nn.sigmoid
    rho = 0.5             # sparsity ratio
    lamb = 0.0017         # weight decay
    beta = 1              # sparsity weight
    gama = 340            # autoencoder weight
    walk_len = walk_length  # length of each walk
    epoch = 50            # number of epochs for optimizing, could be larger
    batch_size = 40       # should be smaller or equal to number_walks*n
    learning_rate = 0.01  # learning rate: for adam use 0.01, for rmsprop use 0.1
    optimizer = "adam"    # alternatives: "rmsprop", "gd", "lbfgs"
    corrupt_prob = [0]    # corrupt probability, for denoising AE
    ini_graph_percent = init_percent  # percent of edges in the initial graph
    alfa = 0.01  # 0.5 in the paper; updating parameter for online k-means to update clustering centroids

    if datasetname == "karate":
        anomaly_percent = 0.1
        k = 4
    elif datasetname == "toy":
        anomaly_percent = 1
        k = 2
    elif datasetname == "cora":
        anomaly_percent = 0.1
        k = 7
    elif datasetname == "citeseer":
        anomaly_percent = 0.1
        k = 6
    elif datasetname == "dolphin":
        anomaly_percent = 0.1
        k = 3
    print("No of Clusters in Dataset " + str(datasetname) + " is " + str(k))
    # endregion

    # region generating initial training walks
    netwalk = NetWalk_update(data, walk_per_node=number_walks, walk_len=walk_length,
                             init_percent=init_percent, snap=snap)
    ini_data = netwalk.getInitWalk()
    # endregion

    # region Initialise Model
    embModel = MD.Model(activation, dimension, walk_len, n, gama, lamb, beta, rho,
                        epoch, batch_size, learning_rate, optimizer, corrupt_prob)
    # endregion

    # region STEP 2: Learning initial embeddings for training edges
    embedding = getEmbedding(embModel, ini_data, n)
    f = open("../plots/linkresults_" + str(datasetname) +
             str(datetime.datetime.now()) + ".txt", "w")
    # endregion

    AccuracyList = []
    xValue = [1]
    accuracy = linkPrediction(embedding, np.array(trainData), trainLabels,
                              np.array(testData), testLabels)
    f.write("Accuracy " + str(accuracy))
    f.write("\n")
    f.close()
    AccuracyList.append(accuracy)
    # print("Accuracy ", accuracy)

    # region Online Increment
    # STEP 3: over different snapshots of edges, dynamically update embeddings of nodes, conduct
    # online anomaly detection for edges, and visualize the anomaly score of each snapshot
    snapshotNum = 1
    while netwalk.hasNext():
        # region Include next walks dynamically and find embedding
        snapshot_data = netwalk.nextOnehotWalks()
        embedding = getEmbedding(embModel, snapshot_data, n)
        # endregion

        accuracy = linkPrediction(embedding, np.array(trainData), trainLabels,
                                  np.array(testData), testLabels)
        f = open("../plots/linkresults_" + str(datasetname) +
                 str(datetime.datetime.now()) + ".txt", "w")
        f.write("Accuracy " + str(accuracy))
        f.write("\n")
        f.close()
        AccuracyList.append(accuracy)
        # print("Accuracy ", accuracy)

        snapshotNum += 1
        xValue.append(snapshotNum)

    f = open("../plots/linkresults_" + str(datasetname) +
             str(datetime.datetime.now()) + ".txt", "w")
    accuracy = linkPrediction(embedding, np.array(trainData), trainLabels,
                              np.array(testData), testLabels)
    f.write("Final Accuracy " + str(accuracy))
    f.write("\n")
    f.close()
    print("Final Accuracy ", accuracy)

    # scores, auc, n0, c0, res, ab_score = anomaly_detection_stream(embedding, train, test_piece, k, alfa, n0, c0)
    # print('Final auc of anomaly detection at snapshot %d: %f' % (snapshotNum, auc))
    # print('Final anomaly score at snapshot %d: %f' % (snapshotNum, ab_score))

    plt.plot(xValue, AccuracyList)
    plt.yticks(np.arange(0, 1, .1))
    plt.savefig('../plots/linkaccuracy_' + datasetname +
                str(datetime.datetime.now()) + '.png')
def train(**kwargs):
    if 'dataset' not in kwargs:
        opt = getattr(config, 'Digital_Music_data_Config')()
    else:
        opt = getattr(config, kwargs['dataset'] + '_Config')()
    opt.parse(kwargs)

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    if len(opt.gpu_ids) == 0 and opt.use_gpu:
        torch.cuda.set_device(opt.gpu_id)

    model = Model(opt, getattr(models, opt.model))
    if opt.use_gpu:
        model.cuda()
        if len(opt.gpu_ids) > 0:
            model = nn.DataParallel(model, device_ids=opt.gpu_ids)

    # 3 data
    train_data = ReviewData(opt.data_root, train=True)
    train_data_loader = DataLoader(train_data,
                                   batch_size=opt.batch_size,
                                   shuffle=True,
                                   num_workers=opt.num_workers,
                                   collate_fn=collate_fn)
    test_data = ReviewData(opt.data_root, train=False)
    test_data_loader = DataLoader(test_data,
                                  batch_size=opt.batch_size,
                                  shuffle=False,
                                  num_workers=opt.num_workers,
                                  collate_fn=collate_fn)
    print('{}: train data: {}; test data: {}'.format(now(), len(train_data), len(test_data)))

    optimizer = optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)

    # training
    print("start training....")
    min_loss = 1e+10
    best_res = 1e+10
    mse_func = nn.MSELoss()
    mae_func = nn.L1Loss()
    smooth_mae_func = nn.SmoothL1Loss()

    for epoch in range(opt.num_epochs):
        total_loss = 0.0
        total_maeloss = 0.0
        model.train()
        print("{} Epoch {}: start".format(now(), epoch))
        for idx, (train_datas, scores) in enumerate(train_data_loader):
            if opt.use_gpu:
                scores = torch.FloatTensor(scores).cuda()
            else:
                scores = torch.FloatTensor(scores)
            train_datas = unpack_input(opt, train_datas)

            optimizer.zero_grad()
            output = model(train_datas)
            mse_loss = mse_func(output, scores)
            total_loss += mse_loss.item() * len(scores)

            mae_loss = mae_func(output, scores)
            total_maeloss += mae_loss.item()
            smooth_mae_loss = smooth_mae_func(output, scores)
            if opt.loss_method == 'mse':
                loss = mse_loss
            if opt.loss_method == 'rmse':
                loss = torch.sqrt(mse_loss) / 2.0
            if opt.loss_method == 'mae':
                loss = mae_loss
            if opt.loss_method == 'smooth_mae':
                loss = smooth_mae_loss
            loss.backward()
            optimizer.step()

            if opt.fine_step:
                if idx % opt.print_step == 0 and idx > 0:
                    print("\t{}, {} step finished;".format(now(), idx))
                    predict_loss, test_mse = predict(model, test_data_loader, opt, use_gpu=opt.use_gpu)
                    if predict_loss < min_loss:
                        model.save(name=opt.dataset, opt=opt.print_opt)
                        min_loss = predict_loss
                        print("\tmodel save")
                    if predict_loss > min_loss:
                        best_res = min_loss

        scheduler.step(epoch)

        print("{}; epoch:{}; total_loss:{}".format(now(), epoch, total_loss))
        mse = total_loss * 1.0 / len(train_data)
        mae = total_maeloss * 1.0 / len(train_data)
        print("{};train result: mse: {}; rmse: {}; mae: {}".format(now(), mse, math.sqrt(mse), mae))

        predict_loss, test_mse = predict(model, test_data_loader, opt, use_gpu=opt.use_gpu)
        if predict_loss < min_loss:
            model.save(name=opt.dataset, opt=opt.print_opt)
            min_loss = predict_loss
            print("model save")
        if test_mse < best_res:
            best_res = test_mse

    print("----" * 20)
    print(f"{now()} {opt.dataset} {opt.print_opt} best_res: {best_res}")
    print("----" * 20)
def train(**kwargs):
    if 'dataset' not in kwargs:
        opt = getattr(config, 'Gourmet_Food_data_Config')()
    else:
        opt = getattr(config, kwargs['dataset'] + '_Config')()
    opt.parse(kwargs)

    logging.basicConfig(
        filename=f"logs/{opt}.log",
        filemode="w",
        format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
        datefmt="%d-%m-%Y %H:%M:%S",
        level=logging.DEBUG)

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    if len(opt.gpu_ids) == 0 and opt.use_gpu:
        torch.cuda.set_device(opt.gpu_id)

    model = Model(opt, getattr(models, opt.model))
    if opt.use_gpu:
        model.cuda()

    # 3 data
    train_data = ReviewData(opt.data_root, mode="Train")
    train_data_loader = DataLoader(train_data,
                                   batch_size=opt.batch_size,
                                   shuffle=True,
                                   collate_fn=collate_fn)
    val_data = ReviewData(opt.data_root, mode="Val")
    val_data_loader = DataLoader(val_data,
                                 batch_size=opt.batch_size,
                                 shuffle=True,
                                 collate_fn=collate_fn)
    logging.info('{}: train data: {}; val data: {}'.format(now(), len(train_data), len(val_data)))

    optimizer = optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

    # training
    logging.info("start training....")
    min_loss = 1e+20
    best_auc = -1.
    best_per = -1.
    best_epoch = 0
    cre_loss = nn.BCEWithLogitsLoss()

    for epoch in range(opt.num_epochs):
        total_loss = 0.0
        model.train()
        for idx, datas in enumerate(train_data_loader):
            train_datas, is_helpful, helpful_score = unpack_input(opt, datas)

            optimizer.zero_grad()
            output = model(train_datas)
            loss = cre_loss(output, is_helpful.float())
            cur_loss = loss.item()
            total_loss += cur_loss
            loss.backward()
            optimizer.step()

        scheduler.step(epoch)
        logging.info(f"{now()}: epoch {epoch}: total_loss: {total_loss}")
        print(f"epoch: {epoch}")

        auc, corr, predict_loss = predict(model, val_data_loader, opt, logging)
        if predict_loss < min_loss:
            min_loss = predict_loss
        if auc > best_auc:
            model.save(name=opt.dataset, epoch=epoch, opt=f"{opt}")
            best_epoch = epoch
            best_auc = auc
            best_per = corr
            logging.info("model save")

    logging.info("----" * 20)
    logging.info(
        f"{now()}:{opt.model}:{opt} \n\t\t best_auc:{best_auc}, best_per:{best_per}"
    )
    logging.info("----" * 20)
    print("----" * 20)
    print(
        f"{now()}:{opt.model}:{opt} \n\t epoch:{best_epoch}: best_auc:{best_auc}, best_per:{best_per}"
    )
    print("----" * 20)
from framework import Train
from framework import Model
from framework import Config
from data_loader import Data_loader
import os
from optparse import OptionParser

parser = OptionParser()
parser.add_option('--gpu', dest='gpu', default=7, help='gpu')
parser.add_option('--ckpt_index', dest='ckpt_index', default=1, help='ckpt index')
parser.add_option('--ckpt_dir', dest='ckpt_dir', default='ckpt', help='ckpt')
(options, args) = parser.parse_args()

# str() guards against the integer default, since os.environ values must be strings
os.environ["CUDA_VISIBLE_DEVICES"] = str(options.gpu)

config = Config()
train_data_loader = Data_loader('train', config)
train = Train(train_data_loader, config, options.ckpt_dir)
train.init_train(Model(config))
train._train()