Example #1
def main(cfg):
    pprint.pprint(cfg)
    mkdir(cfg.checkpoint)
    mkdir(cfg.codefolder)
    with tl.session() as sess:
        dcmh = Model(sess, cfg)
        dcmh.train()
def test(**kwargs):

    if 'dataset' not in kwargs:
        opt = getattr(config, 'Gourmet_Food_data_Config')()
    else:
        opt = getattr(config, kwargs['dataset'] + '_Config')()
    opt.parse(kwargs)
    logging.basicConfig(
        filename=f"logs/{opt}.log",
        filemode="w",
        format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
        datefmt="%d-%m-%Y %H:%M:%S",
        level=logging.DEBUG)

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    if len(opt.gpu_ids) == 0 and opt.use_gpu:
        torch.cuda.set_device(opt.gpu_id)

    model = Model(opt, getattr(models, opt.model)).cuda()
    print("load...")
    model.load(
        "./checkpoints/DPHP_Gourmet_Food_data_cfg-Gourmet_Food_data-poolatt-lr0.001-wd0.0005-drop0.1-id32-hidden100.pth"
    )
    test_data = ReviewData(opt.data_root, mode="Test")
    test_data_loader = DataLoader(test_data,
                                  batch_size=opt.batch_size,
                                  shuffle=False,
                                  collate_fn=collate_fn)
    auc, corr, predict_loss = predict(model, test_data_loader, opt, logging)
Example #3
def generate_conditional_sentence(**kwargs):
    if 'dataset' not in kwargs:
        opt = getattr(config, 'AmazonDigitalMusic_Config')()
    else:
        opt = getattr(config, kwargs['dataset'] + '_Config')()
    opt.parse(kwargs)
    assert(len(opt.pth_path) > 0)
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    if len(opt.gpu_ids) == 0 and opt.use_gpu:
        torch.cuda.set_device(opt.gpu_id)

    model = Model(opt, getattr(models, opt.model))
    if opt.use_gpu:
        model.cuda()
        if len(opt.gpu_ids) > 0:
            model = nn.DataParallel(model, device_ids=opt.gpu_ids)
    if model.net.num_fea != opt.num_fea:
        raise ValueError(f"the num_fea of {opt.model} is error, please specific --num_fea={model.net.num_fea}")

    model.load(opt.pth_path)
    print(f"load model: {opt.pth_path}")
    test_data = ReviewData(opt.data_root, mode="Test")
    test_data_loader = DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=collate_fn)
    print(f"{now()}: generating conditional sentence...")

    model.eval()
    with torch.no_grad():
        user_review_dict = np.load("./dataset/AmazonDigitalMusic/train/plainUserReviews.npy", allow_pickle=True).item()
        item_review_dict = np.load("./dataset/AmazonDigitalMusic/train/plainItemReviews.npy", allow_pickle=True).item()
        cnt = 10
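        # Only the test sample whose loader index equals cnt is inspected below
        # (batch_size=1, so this picks a single user-item pair).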
        for idx, (test_input, scores) in enumerate(test_data_loader):
            if idx == cnt:
                test_input = unpack_input(opt, test_input)
                output = model(test_input, mode="Generate")

                uid = test_input[2].item()
                user_reviews = user_review_dict[uid]
                iid = test_input[3].item()
                item_reviews = item_review_dict[iid]
                
                imp_user_review_id = output[0].cpu().numpy().squeeze()
                imp_user_review_id = np.argmax(imp_user_review_id)
                print(user_reviews[imp_user_review_id])

                imp_item_review_id = output[1].cpu().numpy().squeeze()
                imp_item_review_id = np.argmax(imp_item_review_id)
                print(item_reviews[imp_item_review_id])
                break
Example #4
def test(**kwargs):

    if 'dataset' not in kwargs:
        opt = getattr(config, 'AmazonDigitalMusic_Config')()
    else:
        opt = getattr(config, kwargs['dataset'] + '_Config')()
    opt.parse(kwargs)
    assert(len(opt.pth_path) > 0)
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    if len(opt.gpu_ids) == 0 and opt.use_gpu:
        torch.cuda.set_device(opt.gpu_id)

    model = Model(opt, getattr(models, opt.model))
    if opt.use_gpu:
        model.cuda()
        if len(opt.gpu_ids) > 0:
            model = nn.DataParallel(model, device_ids=opt.gpu_ids)
    if model.net.num_fea != opt.num_fea:
        raise ValueError(f"the num_fea of {opt.model} is error, please specific --num_fea={model.net.num_fea}")

    model.load(opt.pth_path)
    print(f"load model: {opt.pth_path}")
    test_data = ReviewData(opt.data_root, mode="Test")
    test_data_loader = DataLoader(test_data, batch_size=opt.batch_size, shuffle=False, collate_fn=collate_fn)
    print(f"{now()}: test in the test datset")
    predict_loss, test_mse, test_mae = predict(model, test_data_loader, opt)
Example #5
def static_process(args):

    sample_rate = 1  #0.5

    # load email_eu data
    data, n, m = load_email_eu(args.input, sample_rate)

    # STEP 0: Parameters
    hidden_size = args.representation_size  # size of hidden codes to learn, default is 20

    activation = tf.nn.sigmoid

    dimension = [n, hidden_size]

    rho = 0.5  # sparsity ratio
    lamb = 0.0017  # weight decay
    beta = 1  # sparsity weight
    gama = 340  # autoencoder weight
    walk_len = args.walk_length
    epoch = 30  # number of epochs for optimizing, could be larger
    batch_size = 40  # should be smaller than or equal to args.number_walks*n
    learning_rate = 0.01  # learning rate: 0.01 for adam, 0.1 for rmsprop
    optimizer = "adam"  # alternatives: "rmsprop", "gd", "lbfgs"
    corrupt_prob = [0]  # corrupt probability, for denoising AE
    ini_graph_percent = args.init_percent  # percent of edges in the initial graph
    anomaly_percent = 0.2  # percentage of anomaly edges in the testing edges
    alfa = 0.01  # updating parameter for online k-means to update clustering centroids
    k = 3  # number of clusters for kmeans to clustering edges
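    # (Assumption) In the usual sparse-autoencoder formulation these weights would
    # combine roughly as
    #   loss ~ gama * reconstruction_error + beta * KL(rho || rho_hat) + lamb * L2_weight_decay,
    # where rho_hat is the mean hidden activation; the exact objective is defined in MD.Model.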

    # STEP 1: Preparing data: training data and testing list of edges(for online updating)
    synthetic_test, train_mat, train = anomaly_generation(
        ini_graph_percent, anomaly_percent, data, n, m)
    data_zip = []
    data_zip.append(synthetic_test)
    data_zip.append(train)
    # generating initial training walks
    netwalk = NetWalk_update(data_zip,
                             walk_per_node=args.number_walks,
                             walk_len=args.walk_length,
                             init_percent=args.init_percent,
                             snap=args.snap)
    ini_data = netwalk.getInitWalk()

    embModel = MD.Model(activation, dimension, walk_len, n, gama, lamb, beta,
                        rho, epoch, batch_size, learning_rate, optimizer,
                        corrupt_prob)

    # STEP 2: Learning initial embeddings for training edges
    embedding = getEmbedding(embModel, ini_data, n)

    # dynamically plot the anomaly score over different snapshots
    d_plot = DP.DynamicUpdate()

    # conduct anomaly detection using first snapshot of testing edges
    scores, auc, n0, c0, res, ab_score = anomaly_detection(
        embedding, train, synthetic_test[0:args.snap, :], k)
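    # n0 and c0 (presumably the cluster sizes and centroids from the initial k-means)
    # are carried into anomaly_detection_stream below so the clustering can be updated online.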

    print('initial auc of anomaly detection:', auc)
    print('initial anomaly score:', ab_score)

    # visualize anomaly score
    d_plot.addPoint(1, ab_score)

    # STEP 3: over successive snapshots of edges, dynamically update node embeddings,
    #         run online anomaly detection on the edges, and visualize the anomaly score of each snapshot
    snapshotNum = 1
    while (netwalk.hasNext()):
        snapshot_data = netwalk.nextOnehotWalks()
        embedding = getEmbedding(embModel, snapshot_data, n)
        if netwalk.hasNext():
            if len(synthetic_test) > args.snap * (snapshotNum + 1):
                test_piece = synthetic_test[args.snap * snapshotNum:args.snap *
                                            (snapshotNum + 1), :]
            else:
                test_piece = synthetic_test[args.snap * snapshotNum:, :]
                #return
        else:
            return

        # online anomaly detection, each execution will update the clustering center
        scores, auc, n0, c0, res, ab_score = anomaly_detection_stream(
            embedding, train, test_piece, k, alfa, n0, c0)

        print('auc of anomaly detection at snapshot %d: %f' %
              (snapshotNum, auc))
        print('anomaly score at snapshot %d: %f' % (snapshotNum, ab_score))

        snapshotNum += 1

        # visualizing anomaly score of current snapshot
        d_plot.addPoint(snapshotNum, ab_score)
Example #6
def build_model(model):
    """Build a Tensorflow graph for the QA model.
    Return a model.Model for training, evaluation, etc.
    """
    with tf.name_scope("Inputs"):
        questions = tf.placeholder(
            tf.int32, name="Questions", shape=[None, None])
        documents = tf.placeholder(
            tf.int32, name="Documents", shape=[None, None])
        same_as_question_feature = tf.placeholder(
            tf.float32, name="SameAsQuestionFeature", shape=[None, None])
        repeated_words = tf.placeholder(
            tf.float32, name="RepeatedWordFeature", shape=[None, None])
        repeated_word_intensity = tf.placeholder(
            tf.float32, name="RepeatedWordIntensity", shape=[None, None])
        sentence_lengths = tf.placeholder(
            tf.int32, name="SentenceOffsets", shape=[None, None])
        sentence_labels = tf.placeholder(
            tf.int32, name="SentenceLabels", shape=[None])
        word_start_labels = tf.placeholder(
            tf.int32, name="WordStartLabels", shape=[None])
        word_end_labels = tf.placeholder(
            tf.int32, name="WordEndLabels", shape=[None])
        embedding_dropout = tf.placeholder_with_default(
            model.embedding_dropout_prob, shape=[])
        hidden_dropout = tf.placeholder_with_default(
            model.hidden_dropout_prob, shape=[])
        training = tf.placeholder_with_default(
            True, shape=[], name="TrainingIndicator")
        exact_match = tf.placeholder(
            tf.float32, name="ExactMatch", shape=[])
        f1 = tf.placeholder(
            tf.float32, name="F1", shape=[])

    with tf.variable_scope("GloveEmbeddings"):
        embeddings = tf.get_variable(
            shape=[model.vocab_size, EMBEDDING_DIM],
            initializer=tf.zeros_initializer(),
            trainable=False, name="GloveEmbeddings")
        embedding_placeholder = tf.placeholder(
            tf.float32, [model.vocab_size, EMBEDDING_DIM])
        embedding_init = embeddings.assign(embedding_placeholder)
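        # The embedding variable is created frozen (trainable=False) and zero-initialized;
        # the pretrained GloVe matrix is fed in later by running embedding_init with
        # embedding_placeholder, which keeps the large matrix out of the serialized graph.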

    with tf.name_scope("QuestionEmbeddings"):
        question_vector = featurize_question(model, questions,
                                             embedding_dropout, training)

    with tf.name_scope("DocumentEmbeddings"):
        document_embeddings = featurize_document(
            model, questions, documents, same_as_question_feature,
            repeated_words, repeated_word_intensity,
            question_vector, embedding_dropout, training)

    # Keep track of the beam state at each decision point
    beam_states = []
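    # Each beam_states entry pairs the decisions fixed so far with the scores of the
    # candidate extensions at that step; the list is consumed by the loss further below.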
    with tf.name_scope("PickSentence"):
        sentence_scores = score_sentences(
            model, document_embeddings, sentence_lengths, hidden_dropout)

        beam_states.append(([], tf.expand_dims(sentence_scores, 1)))
        beam_scores, sentence_picks = tf.nn.top_k(
            sentence_scores,
            k=tf.minimum(model.beam_size, tf.shape(sentence_scores)[1]),
            sorted=True)

        sentence_correct = tf.reduce_mean(
            tf.cast(tf.equal(sentence_labels, sentence_picks[:, 0]), tf.float32))

    with tf.name_scope("PickStartWord"):
        start_word_scores = score_start_word(
            model, document_embeddings, sentence_picks, sentence_lengths, hidden_dropout)
        beam_scores = tf.expand_dims(beam_scores, 2) + start_word_scores

        beam_states.append(([sentence_picks], beam_scores))
        beam_scores, kept_sentences, start_words = ops.prune_beam(
            beam_scores, sentence_picks, model.beam_size)

        start_word_correct = tf.reduce_mean(
            tf.cast(tf.logical_and(
                tf.equal(word_start_labels, start_words[:, 0]),
                tf.equal(sentence_labels, kept_sentences[:, 0])), tf.float32))

    with tf.name_scope("PickEndWord"):
        end_word_scores = score_end_words(
            model, document_embeddings, kept_sentences,
            start_words, sentence_lengths, hidden_dropout, training)
        beam_scores = tf.expand_dims(beam_scores, 2) + end_word_scores

        beam_states.append(([kept_sentences, start_words], beam_scores))
        beam_scores, (kept_sentences, kept_start_words), end_words = ops.prune_beam(
            beam_scores, [kept_sentences, start_words], model.beam_size)

        # Also track the final decisions.
        beam_states.append(([kept_sentences, kept_start_words, end_words],
                           beam_scores))

    # Get offset from start word
    end_word_picks = kept_start_words + end_words
    final_states = [kept_sentences, kept_start_words, end_word_picks]

    end_word_correct = tf.reduce_mean(
        tf.cast(tf.logical_and(
            tf.logical_and(
                tf.equal(word_end_labels, end_word_picks[:, 0]),
                tf.equal(word_start_labels, kept_start_words[:, 0])),
            tf.equal(sentence_labels, kept_sentences[:, 0])), tf.float32))

    with tf.name_scope("Loss"):
        # End prediction is based on the start word offset.
        end_labels = word_end_labels - word_start_labels
        labels = (sentence_labels, word_start_labels, end_labels)
        loss = globally_normalized_loss(beam_states, labels)
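        # (Assumption) globally_normalized_loss scores complete beam derivations
        # (sentence -> start word -> end word) against the gold labels instead of
        # normalizing each decision locally, in the spirit of globally normalized
        # beam-search training.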

        l2_penalty = tf.contrib.layers.apply_regularization(
            tf.contrib.layers.l2_regularizer(model.l2_scale),
            tf.trainable_variables())

        loss += l2_penalty

    with tf.name_scope("TrainStep"):
        iteration, (step, loss, gradnorm) = ops.default_train_step(
            model, loss)

    with tf.name_scope("TrainSummary"):
        train_summary = ops.scalar_summaries({
            "Train-Loss": loss,
            "Gradient-Norm": gradnorm,
            "Sentence-Correct": sentence_correct,
            "Start-Word-Correct": start_word_correct,
            "End-Word-Correct": end_word_correct})

    with tf.name_scope("ValidSummary"):
        valid_summary = ops.scalar_summaries({
            "Validation-Loss": loss,
            "Sentence-Correct": sentence_correct,
            "Start-Word-Correct": start_word_correct,
            "End-Word-Correct": end_word_correct})

    with tf.name_scope("SquadSummary"):
        squad_summary = ops.scalar_summaries({
            "Exact-Match": exact_match, "F1": f1})

    return Model(
        inputs=[questions, documents, same_as_question_feature,
                repeated_words, repeated_word_intensity,
                sentence_lengths, sentence_labels, word_start_labels,
                word_end_labels],
        outputs=[kept_sentences, kept_start_words, end_word_picks, sentence_correct,
                 start_word_correct, end_word_correct],
        loss=loss, training=training,
        dropout=[embedding_dropout, hidden_dropout],
        gradnorm=gradnorm, step=step, iteration=iteration,
        train_summary=train_summary, valid_summary=valid_summary,
        embedding_init=embedding_init,
        embedding_placeholder=embedding_placeholder,
        squad_summary=squad_summary,
        squad_inputs=[exact_match, f1])
Example #7
        type=str,
        default='Sma',
        help=
        'KungFu parallel optimizer, available options: Sync_sgd, Async_sgd, Sma'
    )
    parser.add_argument("--output_dir",
                        type=str,
                        default="save_dir",
                        help="which dir to output the exported pb model")

    args = parser.parse_args()
    Config.set_model_name(args.model_name)
    Config.set_model_type(Config.MODEL[args.model_type])
    Config.set_model_backbone(Config.BACKBONE[args.model_backbone])
    config = Config.get_config()
    export_model = Model.get_model(config)

    input_path = f"{config.model.model_dir}/newest_model.npz"
    output_dir = f"{args.output_dir}/{config.model.model_name}"
    output_path = f"{output_dir}/frozen_{config.model.model_name}.pb"
    print(f"exporting model {config.model.model_name} from {input_path}...")
    if not os.path.exists(output_dir):
        print("creating output_dir...")
        os.mkdir(output_dir)
    if not os.path.exists(input_path):
        print("input model file doesn't exist!")
        print("conversion aborted!")
    else:
        export_model.load_weights(input_path)
        export_model.eval()
        if export_model.data_format == "channels_last":
Example #8
def static_process(args):

    # STEP 0: Parameters
    hidden_size = args.representation_size  # size of hidden codes to learn, default is 20

    activation = tf.nn.sigmoid

    rho = 0.5  # sparsity ratio
    lamb = 0.0017  # weight decay
    beta = 1  # sparsity weight
    gama = 340  # autoencoder weight
    walk_len = args.walk_length
    epoch = 400  # number of epochs for optimizing, could be larger
    batch_size = 20
    learning_rate = 0.1  # learning rate: 0.01 for adam, 0.1 for rmsprop
    optimizer = "rmsprop"  # alternatives: "adam", "gd", "lbfgs"
    corrupt_prob = [0]  # corrupt probability, for denoising AE

    # STEP 1: Preparing data: training data and testing list of edges(for online updating)
    data_path = args.input
    netwalk = NetWalk_update(data_path, walk_per_node=args.number_walks, \
                             walk_len=args.walk_length, init_percent=args.init_percent, snap=args.snap)
    n = len(netwalk.vertices)  # number of total nodes

    print("{} Number of nodes: {}".format(print_time(), n))
    print("{} Number of walks: {}".format(print_time(), args.number_walks))
    print("{} Data size (walks*length): {}".format(
        print_time(), args.number_walks * args.walk_length))
    print("{} Generating network walks...".format(print_time()))
    print("{} Clique embedding training...".format(print_time()))

    dimension = [n, hidden_size]

    embModel = MD.Model(activation, dimension, walk_len, n, gama, lamb, beta,
                        rho, epoch, batch_size, learning_rate, optimizer,
                        corrupt_prob)

    init_edges, snapshots = netwalk.data
    data = netwalk.getInitWalk()

    fig = plt.figure(figsize=(12, 12))

    # STEP 2: Learning initial embeddings for training edges
    embedding_code(embModel, data, n, args)

    # load karate club graph
    G = nx.karate_club_graph()
    edge_list = G.edges()

    # list of initial edge list tuples
    tuples = tuple(map(tuple, init_edges - 1))
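    # (Assumption) edges in the input file appear to be 1-indexed, so subtracting 1
    # aligns them with the 0-indexed nodes of nx.karate_club_graph().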

    # complementary set of edges for initial edges
    rm_list = [x for x in edge_list if x not in tuples]

    # visualize initial embedding
    viz_stream(rm_list, fig, 5, 2, 1)

    # STEP 3: over successive snapshots of edges, dynamically update node embeddings
    #         and visualize the embedding of each snapshot
    snapshotNum = 0
    while (netwalk.hasNext()):
        data = netwalk.nextOnehotWalks()
        tuples = tuple(map(tuple, snapshots[snapshotNum] - 1)) + tuples
        snapshotNum += 1
        embedding_code(embModel, data, n, args)
        rm_list = [x for x in edge_list if x not in tuples]
        viz_stream(rm_list, fig, 5, 2, snapshotNum + 1)

    plt.show()

    print("finished")
Example #9
def train(**kwargs):

    if 'dataset' not in kwargs:
        opt = getattr(config, 'AmazonDigitalMusic_Config')()
    else:
        opt = getattr(config, kwargs['dataset'] + '_Config')()
    opt.parse(kwargs)

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    if len(opt.gpu_ids) == 0 and opt.use_gpu:
        torch.cuda.set_device(opt.gpu_id)

    model = Model(opt, getattr(models, opt.model))
    if opt.use_gpu:
        model.cuda()
        if len(opt.gpu_ids) > 0:
            model = nn.DataParallel(model, device_ids=opt.gpu_ids)

    if model.net.num_fea != opt.num_fea:
        raise ValueError(f"the num_fea of {opt.model} is error, please specific --num_fea={model.net.num_fea}")

    # 3 data
    train_data = ReviewData(opt.data_root, mode="Train")
    train_data_loader = DataLoader(train_data, batch_size=opt.batch_size, shuffle=True, collate_fn=collate_fn)
    val_data = ReviewData(opt.data_root, mode="Val")
    val_data_loader = DataLoader(val_data, batch_size=opt.batch_size, shuffle=False, collate_fn=collate_fn)
    print(f'train data: {len(train_data)}; val data: {len(val_data)}')

    optimizer = optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)
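    # StepLR decays the learning rate by a factor of 0.8 every 5 epochs.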

    # training
    print("start training....")
    min_loss = 1e+10
    best_res = 1e+10
    mse_func = nn.MSELoss()
    mae_func = nn.L1Loss()
    smooth_mae_func = nn.SmoothL1Loss()

    train_mse_list = []
    val_mse_list = []
    val_mae_list = []

    for epoch in range(opt.num_epochs):
        total_loss = 0.0
        total_maeloss = 0.0
        model.train()
        print(f"{now()}  Epoch {epoch}...")
        for idx, (train_datas, scores) in enumerate(train_data_loader):
            if opt.use_gpu:
                scores = torch.FloatTensor(scores).cuda()
            else:
                scores = torch.FloatTensor(scores)
            train_datas = unpack_input(opt, train_datas)

            optimizer.zero_grad()
            output = model(train_datas)
            mse_loss = mse_func(output, scores)
            total_loss += mse_loss.item() * len(scores)

            mae_loss = mae_func(output, scores)
            total_maeloss += mae_loss.item()
            smooth_mae_loss = smooth_mae_func(output, scores)
            if opt.loss_method == 'mse':
                loss = mse_loss
            elif opt.loss_method == 'rmse':
                loss = torch.sqrt(mse_loss) / 2.0
            elif opt.loss_method == 'mae':
                loss = mae_loss
            elif opt.loss_method == 'smooth_mae':
                loss = smooth_mae_loss
            loss.backward()
            optimizer.step()
            if opt.fine_step:
                if idx % opt.print_step == 0 and idx > 0:
                    print("\t{}, {} step finised;".format(now(), idx))
                    val_loss, val_mse, val_mae = predict(model, val_data_loader, opt)
                    if val_loss < min_loss:
                        model.save(name=opt.dataset, opt=opt.print_opt)
                        min_loss = val_loss
                        print("\tmodel save")
                    if val_loss > min_loss:
                        best_res = min_loss

        scheduler.step()
        mse = total_loss * 1.0 / len(train_data)
        print(f"\ttrain data: loss:{total_loss:.4f}, mse: {mse:.4f};")

        val_loss, val_mse, val_mae = predict(model, val_data_loader, opt)
        
        train_mse_list.append(mse)
        val_mse_list.append(val_mse)
        val_mae_list.append(val_mae)

        if val_loss < min_loss:
            model.save(name=opt.dataset, opt=opt.print_opt)
            min_loss = val_loss
            print("model save")
        if val_mse < best_res:
            best_res = val_mse
        print("*"*30)

    print("----"*20)
    print(f"{now()} {opt.dataset} {opt.print_opt} best_res:  {best_res}")
    print("----"*20)

    print("Train MSE:", train_mse_list)
    print("Val MSE:", val_mse_list)
    print("Val MAE:", val_mae_list)
Example #10
File: test.py Project: h-peng17/PIM
                  help='ckpt index')
parser.add_option('--ckpt_dir', dest='ckpt_dir', default='ckpt', help='ckpt')
(options, args) = parser.parse_args()

os.environ["CUDA_VISIBLE_DEVICES"] = options.gpu
p2id = json.load(open("../data/p2id.json"))
word2id = json.load(open('../data/word2id.json'))
id2word = {}
for key in word2id:
    id2word[word2id[key]] = key
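# id2word inverts word2id so predicted token ids can be mapped back to words.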

config = Config()

if options.mode == 'people':
    test = Test(config, options.ckpt_dir, id2word)
    test.init_test(Model(config), options.ckpt_index)

    print('Please input:')
    line = ''
    is_continue = False
    while line != 'stop':
        line = input()
        pins = line.strip().split()
        query = np.ones([1, config.seq_len], dtype=np.int32)
        target_seq_len = [0]
        target_seq_len[0] = len(pins)
        for i, pin in enumerate(pins):
            if pin not in p2id:
                is_continue = True
                print('Invalid input!')
                break
Example #11
File: NetWalk.py Project: zhh0998/NetWalk
def static_process(representation_size, walk_length, input, number_walks, init_percent, snap, output, datasetname):

    # region Parameters
    hidden_size = representation_size  # size of hidden codes to learn, default is 20
    activation = tf.nn.sigmoid
    rho = 0.5  # sparsity ratio
    lamb = 0.0017  # weight decay
    beta = 1  # sparsity weight
    gama = 340  # autoencoder weight
    walk_len = walk_length
    epoch = 400  # number of epochs for optimizing, could be larger
    batch_size = 20
    learning_rate = 0.1  # learning rate: 0.01 for adam, 0.1 for rmsprop
    optimizer = "rmsprop"  # alternatives: "adam", "gd", "lbfgs"
    corrupt_prob = [0]  # corrupt probability, for denoising AE
    # endregion

    # region STEP 1: Preparing data: training data and testing list of edges(for online updating)
    data_path = input
    netwalk = NetWalk_update(data_path, walk_per_node=number_walks, walk_len=walk_length, init_percent=init_percent, snap=snap)
    n = len(netwalk.vertices)  # number of total nodes
    # endregion



    print("{} Number of nodes: {}".format(print_time(), n))
    print("{} Number of walks: {}".format(print_time(), number_walks))
    print("{} Data size (walks*length): {}".format(print_time(),number_walks*walk_length))
    print("{} Generating network walks...".format(print_time()))
    print("{} Clique embedding training...".format(print_time()))


    dimension = [n, hidden_size]

    embModel = MD.Model(activation, dimension, walk_len, n, gama, lamb, beta, rho, epoch, batch_size, learning_rate, optimizer, corrupt_prob)

    init_edges, snapshots, edges = netwalk.data
    edges = edges - 1  # karate
    # G = nx.Graph()
    # G.add_edges_from(edges)
    # vertices = np.unique(edges)
    G = nx.Graph()
    G.add_edges_from(edges)
    clusteringAccuracy = []

    edge_list = tuple(map(tuple, edges))

    data = netwalk.getInitWalk()

    fig = plt.figure(figsize=(12, 12))

    # STEP 2: Learning initial embeddings for training edges
    embeddings = embedding_code(embModel, data, n, output)


    # list of initial edge list tuples
    tuples = tuple(map(tuple, init_edges - 1))  # karate
    # tuples = tuple(map(tuple, init_edges))

    # complementary set of edges for initial edges
    rm_list = [x for x in edge_list if x not in tuples]

    # visualize initial embedding

    clusteringAccuracy = viz_stream(G, rm_list, fig, 5, 2, 1, output, "./tmp/membership_" + datasetname + ".txt", representation_size, clusteringAccuracy)

    # STEP 3: over successive snapshots of edges, dynamically update node embeddings
    #         and visualize the clustering of each snapshot
    snapshotNum = 0
    while netwalk.hasNext():
        G = nx.Graph()
        G.add_edges_from(edges)
        data = netwalk.nextOnehotWalks()
        tuples = tuple(map(tuple, snapshots[snapshotNum] - 1)) + tuples
        snapshotNum += 1
        embedding_code(embModel, data, n, output)
        rm_list = [x for x in edge_list if x not in tuples]
        clusteringAccuracy = viz_stream(G, rm_list, fig, 5, 2, snapshotNum + 1, output, "./tmp/membership_" + datasetname + ".txt", representation_size)
        print(clusteringAccuracy)


    #plt.show()

    fig.savefig('../plots/graph_'+datasetname+'.png')
    f = open('./tmp/accuracy_' + datasetname + '.txt', 'a+')
    f.write("dimension is "+str(dimension))
    f.write("\n")
    for acc in clusteringAccuracy:
        f.write(str(acc))
        f.write("\n")
    f.write("\n")
    f.write("\n")
    #np.savetxt(f, clusteringAccuracy, fmt="%g")

    print("finished")
Example #12
def static_process(representation_size, walk_length, input, number_walks,
                   init_percent, snap, output, datasetname):
    # region Preprocess the data(change directed to undirected/remove self loops/remove duplicate edges)
    sample_rate = 1  #0.5
    data, n, m = load_email_eu(input, sample_rate)
    # endregion

    # region Parameters
    hidden_size = representation_size  # size of hidden codes to learn, default is 20
    dimension = [n, hidden_size]
    activation = tf.nn.sigmoid
    rho = 0.5  # sparsity ratio
    lamb = 0.0017  # weight decay
    beta = 1  # sparsity weight
    gama = 340  # autoencoder weight
    walk_len = walk_length  # Length of each walk
    epoch = 50  # number of epochs for optimizing, could be larger
    batch_size = 40  # should be smaller than or equal to args.number_walks*n
    learning_rate = 0.01  # learning rate: 0.01 for adam, 0.1 for rmsprop
    optimizer = "adam"  # alternatives: "rmsprop", "gd", "lbfgs"
    corrupt_prob = [0]  # corrupt probability, for denoising AE
    ini_graph_percent = init_percent  # percent of edges in the initial graph
    alfa = 0.01  # 0.5 in the paper; updating parameter for online k-means to update clustering centroids
    if (datasetname == "karate"):
        anomaly_percent = 0.3
        k = 4
    elif (datasetname == "toy"):
        anomaly_percent = 1
        k = 2
    elif (datasetname == "cora"):
        anomaly_percent = 0.1
        k = 7
    elif (datasetname == "citeseer"):
        anomaly_percent = 0.1
        k = 6
    elif (datasetname == "dolphin"):
        anomaly_percent = 0.1
        k = 3

    print("No of Clusters in Dataset " + str(datasetname) + " is " + str(k))
    # endregion

    # region STEP 1: Generates Anomaly data: training data and testing list of edges(for online updating)
    membership_path = "./tmp/membership_" + datasetname + ".txt"
    #synthetic_test, train_mat, train = anomaly_generation(ini_graph_percent, anomaly_percent, data, n, m,membership_path)
    synthetic_test, train_mat, train = anomaly_generation(
        0.8, anomaly_percent, data, n, m, membership_path)
    data_zip = []
    data_zip.append(synthetic_test)
    data_zip.append(train)
    # endregion

    # region generating initial training walks
    netwalk = NetWalk_update(data_zip,
                             walk_per_node=number_walks,
                             walk_len=walk_length,
                             init_percent=init_percent,
                             snap=snap)
    ini_data = netwalk.getInitWalk()
    print(np.shape(ini_data[0]))
    # endregion

    # region Initialise Model
    embModel = MD.Model(activation, dimension, walk_len, n, gama, lamb, beta,
                        rho, epoch, batch_size, learning_rate, optimizer,
                        corrupt_prob)
    # endregion

    # region STEP 2: Learning initial embeddings for training edges
    embedding = getEmbedding(embModel, ini_data, n)
    # endregion

    # region conduct anomaly detection using first snapshot of testing edges
    areaUnderCurve = []
    xValue = []
    #test_piece=synthetic_test[0:snap, :]
    test_piece = synthetic_test
    scores, auc, n0, c0, res, ab_score = anomaly_detection(
        embedding, train, test_piece, k)
    areaUnderCurve.append(auc)
    xValue.append(0)
    #scores, auc, n0, c0, res, ab_score = anomaly_detection(embedding, train, synthetic_test, k)
    print('initial auc of anomaly detection:', auc)
    print('initial anomaly score:', ab_score)
    # endregion

    # region Online Increment
    # STEP 3: over successive snapshots of edges, dynamically update node embeddings,
    #         run online anomaly detection on the edges, and visualize the anomaly score of each snapshot
    snapshotNum = 1
    while (netwalk.hasNext()):
        # region Include next walks dynamically and find embedding
        snapshot_data = netwalk.nextOnehotWalks()
        embedding = getEmbedding(embModel, snapshot_data, n)
        # endregion
        # if netwalk.hasNext():
        #     if len(synthetic_test) > snap * (snapshotNum + 1):
        #         #test_piece = synthetic_test[snap * snapshotNum:snap * (snapshotNum + 1), :]
        #         test_piece = synthetic_test[:snap * (snapshotNum + 1), :]
        #     else:
        #         test_piece = synthetic_test
        # online anomaly detection, each execution will update the clustering center
        scores, auc, n0, c0, res, ab_score = anomaly_detection_stream(
            embedding, train, test_piece, k, alfa, n0, c0)
        print('auc of anomaly detection at snapshot %d: %f' %
              (snapshotNum, auc))
        print('anomaly score at snapshot %d: %f' % (snapshotNum, ab_score))
        areaUnderCurve.append(auc)
        xValue.append(snapshotNum)
        snapshotNum += 1
    # scores, auc, n0, c0, res, ab_score = anomaly_detection_stream(embedding, train, test_piece, k, alfa, n0, c0)
    # print('Final auc of anomaly detection at snapshot %d: %f' % (snapshotNum, auc))
    # print('Final anomaly score at snapshot %d: %f' % (snapshotNum, ab_score))
    plt.plot(xValue, areaUnderCurve)
    plt.yticks(np.arange(0, 100, 5))
    plt.savefig('../plots/anomalyaccuracy_' + datasetname +
                str(datetime.datetime.now()) + '.png')
Example #13
def static_process(representation_size, walk_length, input, number_walks,
                   init_percent, snap, output, datasetname):
    # region Preprocess the data(change directed to undirected/remove self loops/remove duplicate edges)
    sample_rate = 1  #0.5
    data, n, m = load_email_eu(input, sample_rate)
    GraphEdges, trainData, trainLabels, testData, testLabels = preprocessGraph(
        data, 0.7, n, m)
    # endregion

    # region Parameters
    hidden_size = representation_size  # size of hidden codes to learn, default is 20
    dimension = [n, hidden_size]
    activation = tf.nn.sigmoid
    rho = 0.5  # sparsity ratio
    lamb = 0.0017  # weight decay
    beta = 1  # sparsity weight
    gama = 340  # autoencoder weight
    walk_len = walk_length  # Length of each walk
    epoch = 50  # number of epochs for optimizing, could be larger
    batch_size = 40  # should be smaller than or equal to args.number_walks*n
    learning_rate = 0.01  # learning rate: 0.01 for adam, 0.1 for rmsprop
    optimizer = "adam"  # alternatives: "rmsprop", "gd", "lbfgs"
    corrupt_prob = [0]  # corrupt probability, for denoising AE
    ini_graph_percent = init_percent  # percent of edges in the initial graph
    alfa = 0.01  # 0.5 in the paper; updating parameter for online k-means to update clustering centroids
    if (datasetname == "karate"):
        anomaly_percent = 0.1
        k = 4
    elif (datasetname == "toy"):
        anomaly_percent = 1
        k = 2
    elif (datasetname == "cora"):
        anomaly_percent = 0.1
        k = 7
    elif (datasetname == "citeseer"):
        anomaly_percent = 0.1
        k = 6
    elif (datasetname == "dolphin"):
        anomaly_percent = 0.1
        k = 3

    print("No of Clusters in Dataset " + str(datasetname) + " is " + str(k))
    # endregion

    # region generating initial training walks
    netwalk = NetWalk_update(data,
                             walk_per_node=number_walks,
                             walk_len=walk_length,
                             init_percent=init_percent,
                             snap=snap)
    ini_data = netwalk.getInitWalk()
    # endregion

    # region Initialise Model
    embModel = MD.Model(activation, dimension, walk_len, n, gama, lamb, beta,
                        rho, epoch, batch_size, learning_rate, optimizer,
                        corrupt_prob)
    # endregion

    # region STEP 2: Learning initial embeddings for training edges
    embedding = getEmbedding(embModel, ini_data, n)
    f = open(
        "../plots/linkresults_" + str(datasetname) +
        str(datetime.datetime.now()) + ".txt", "w")
    # endregion
    AccuracyList = []
    xValue = [1]
    accuracy = linkPrediction(embedding, np.array(trainData), trainLabels,
                              np.array(testData), testLabels)
    f.write("Accuracy " + str(accuracy))
    f.write("\n")
    f.close()
    AccuracyList.append(accuracy)
    #print("Accuracy ",accuracy)
    # region Online Increment
    # STEP 3: over successive snapshots of edges, dynamically update node embeddings
    #         and evaluate link prediction on each snapshot
    snapshotNum = 1
    while (netwalk.hasNext()):
        # region Include next walks dynamically and find embedding
        snapshot_data = netwalk.nextOnehotWalks()
        embedding = getEmbedding(embModel, snapshot_data, n)
        accuracy = linkPrediction(embedding, np.array(trainData), trainLabels,
                                  np.array(testData), testLabels)
        f = open(
            "../plots/linkresults_" + str(datasetname) +
            str(datetime.datetime.now()) + ".txt", "w")
        f.write("Accuracy " + str(accuracy))
        f.write("\n")
        f.close()
        AccuracyList.append(accuracy)
        #print("Accuracy ", accuracy)
        snapshotNum += 1
        xValue.append(snapshotNum)
    f = open(
        "../plots/linkresults_" + str(datasetname) +
        str(datetime.datetime.now()) + ".txt", "w")
    accuracy = linkPrediction(embedding, np.array(trainData), trainLabels,
                              np.array(testData), testLabels)
    f.write("Final Accuracy " + str(accuracy))
    f.write("\n")
    f.close()
    print("Final Accuracy ", accuracy)
    # scores, auc, n0, c0, res, ab_score = anomaly_detection_stream(embedding, train, test_piece, k, alfa, n0, c0)
    # print('Final auc of anomaly detection at snapshot %d: %f' % (snapshotNum, auc))
    # print('Final anomaly score at snapshot %d: %f' % (snapshotNum, ab_score))
    plt.plot(xValue, AccuracyList)
    plt.yticks(np.arange(0, 1, .1))
    plt.savefig('../plots/linkaccuracy_' + datasetname +
                str(datetime.datetime.now()) + '.png')
Example #14
def train(**kwargs):

    if 'dataset' not in kwargs:
        opt = getattr(config, 'Digital_Music_data_Config')()
    else:
        opt = getattr(config, kwargs['dataset'] + '_Config')()
    opt.parse(kwargs)

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    if len(opt.gpu_ids) == 0 and opt.use_gpu:
        torch.cuda.set_device(opt.gpu_id)

    model = Model(opt, getattr(models, opt.model))
    if opt.use_gpu:
        model.cuda()
        if len(opt.gpu_ids) > 0:
            model = nn.DataParallel(model, device_ids=opt.gpu_ids)

    # 3 data
    train_data = ReviewData(opt.data_root, train=True)
    train_data_loader = DataLoader(train_data, batch_size=opt.batch_size, shuffle=True, num_workers=opt.num_workers, collate_fn=collate_fn)
    test_data = ReviewData(opt.data_root, train=False)
    test_data_loader = DataLoader(test_data, batch_size=opt.batch_size, shuffle=False, num_workers=opt.num_workers, collate_fn=collate_fn)
    print('{}: train data: {}; test data: {}'.format(now(), len(train_data), len(test_data)))

    optimizer = optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay)

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)

    # training
    print("start training....")
    min_loss = 1e+10
    best_res = 1e+10
    mse_func = nn.MSELoss()
    mae_func = nn.L1Loss()
    smooth_mae_func = nn.SmoothL1Loss()

    for epoch in range(opt.num_epochs):
        total_loss = 0.0
        total_maeloss = 0.0
        model.train()
        print("{} Epoch {}: start".format(now(), epoch))
        for idx, (train_datas, scores) in enumerate(train_data_loader):
            if opt.use_gpu:
                scores = torch.FloatTensor(scores).cuda()
            else:
                scores = torch.FloatTensor(scores)
            train_datas = unpack_input(opt, train_datas)
            optimizer.zero_grad()
            output = model(train_datas)
            mse_loss = mse_func(output, scores)
            total_loss += mse_loss.item() * len(scores)

            mae_loss = mae_func(output, scores)
            total_maeloss += mae_loss.item()

            smooth_mae_loss = smooth_mae_func(output, scores)

            if opt.loss_method == 'mse':
                loss = mse_loss
            elif opt.loss_method == 'rmse':
                loss = torch.sqrt(mse_loss) / 2.0
            elif opt.loss_method == 'mae':
                loss = mae_loss
            elif opt.loss_method == 'smooth_mae':
                loss = smooth_mae_loss

            loss.backward()
            optimizer.step()

            if opt.fine_step:
                if idx % opt.print_step == 0 and idx > 0:
                    print("\t{}, {} step finised;".format(now(), idx))
                    predict_loss, test_mse = predict(model, test_data_loader, opt, use_gpu=opt.use_gpu)
                    if predict_loss < min_loss:
                        model.save(name=opt.dataset, opt=opt.print_opt)
                        min_loss = predict_loss
                        print("\tmodel save")
                    if predict_loss > min_loss:
                        best_res = min_loss

        scheduler.step(epoch)
        print("{}; epoch:{}; total_loss:{}".format(now(), epoch, total_loss))
        mse = total_loss * 1.0 / len(train_data)
        mae = total_maeloss * 1.0 / len(train_data)
        print("{};train reslut: mse: {}; rmse: {}; mae: {}".format(now(), mse, math.sqrt(mse), mae))

        predict_loss, test_mse = predict(model, test_data_loader, opt, use_gpu=opt.use_gpu)
        if predict_loss < min_loss:
            model.save(name=opt.dataset, opt=opt.print_opt)
            min_loss = predict_loss
            print("model save")
        if test_mse < best_res:
            best_res = test_mse

    print("----"*20)
    print(f"{now()} {opt.dataset} {opt.print_opt} best_res:  {best_res}")
    print("----"*20)
Example #15
def train(**kwargs):

    if 'dataset' not in kwargs:
        opt = getattr(config, 'Gourmet_Food_data_Config')()
    else:
        opt = getattr(config, kwargs['dataset'] + '_Config')()
    opt.parse(kwargs)
    logging.basicConfig(
        filename=f"logs/{opt}.log",
        filemode="w",
        format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
        datefmt="%d-%m-%Y %H:%M:%S",
        level=logging.DEBUG)

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    if len(opt.gpu_ids) == 0 and opt.use_gpu:
        torch.cuda.set_device(opt.gpu_id)

    model = Model(opt, getattr(models, opt.model))
    if opt.use_gpu:
        model.cuda()

    # 3 data
    train_data = ReviewData(opt.data_root, mode="Train")
    train_data_loader = DataLoader(train_data,
                                   batch_size=opt.batch_size,
                                   shuffle=True,
                                   collate_fn=collate_fn)

    val_data = ReviewData(opt.data_root, mode="Val")
    val_data_loader = DataLoader(val_data,
                                 batch_size=opt.batch_size,
                                 shuffle=True,
                                 collate_fn=collate_fn)

    logging.info('{}: train data: {}; val data: {}'.format(
        now(), len(train_data), len(val_data)))

    optimizer = optim.Adam(model.parameters(),
                           lr=opt.lr,
                           weight_decay=opt.weight_decay)

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

    # training
    logging.info("start training....")
    min_loss = 1e+20
    best_auc = -1.
    best_per = -1.
    best_epoch = 0
    cre_loss = nn.BCEWithLogitsLoss()
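    # Helpfulness prediction is treated as binary classification on is_helpful: the model
    # outputs raw logits and BCEWithLogitsLoss applies the sigmoid internally.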
    for epoch in range(opt.num_epochs):
        total_loss = 0.0
        model.train()
        for idx, datas in enumerate(train_data_loader):
            train_datas, is_helpful, helpful_score = unpack_input(opt, datas)
            optimizer.zero_grad()
            output = model(train_datas)
            loss = cre_loss(output, is_helpful.float())
            cur_loss = loss.item()
            total_loss += cur_loss
            loss.backward()
            optimizer.step()

        scheduler.step(epoch)
        logging.info(f"{now()}: epoch {epoch}: total_loss: {total_loss}")
        print(f"epoch: {epoch}")
        auc, corr, predict_loss = predict(model, val_data_loader, opt, logging)
        if predict_loss < min_loss:
            min_loss = predict_loss
        if auc > best_auc:
            model.save(name=opt.dataset, epoch=epoch, opt=f"{opt}")
            best_epoch = epoch
            best_auc = auc
            best_per = corr
            logging.info("model save")

    logging.info("----" * 20)
    logging.info(
        f"{now()}:{opt.model}:{opt} \n\t\t best_auc:{best_auc}, best_per:{best_per}"
    )
    logging.info("----" * 20)
    print("----" * 20)
    print(
        f"{now()}:{opt.model}:{opt} \n\t epoch:{best_epoch}: best_auc:{best_auc}, best_per:{best_per}"
    )
    print("----" * 20)
Example #16
File: train.py Project: h-peng17/PIM
from framework import Train
from framework import Model
from framework import Config
from data_loader import Data_loader
import os
from optparse import OptionParser

parser = OptionParser()
parser.add_option('--gpu', dest='gpu', default=7, help='gpu')
parser.add_option('--ckpt_index',
                  dest='ckpt_index',
                  default=1,
                  help='ckpt index')
parser.add_option('--ckpt_dir', dest='ckpt_dir', default='ckpt', help='ckpt')
(options, args) = parser.parse_args()

os.environ["CUDA_VISIBLE_DEVICES"] = options.gpu
config = Config()
train_data_loader = Data_loader('train', config)
train = Train(train_data_loader, config, options.ckpt_dir)
train.init_train(Model(config))
train._train()
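
# Typical invocation (assumption; the flags correspond to the OptionParser options above):
#   python train.py --gpu 0 --ckpt_dir ckpt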