def main():
    # train_bundle = process_data('raw/train.csv', None, interpolation=True, add_noise=True)
    train_bundle = process_data('raw/train.csv', None, interpolation=True, add_noise=True)[:160]
    val_bundle = process_data('raw/train.csv', None, interpolation=False, add_noise=False)[160:]

    # features = np.load('../osic_version_03/label/test_01.npy')
    # features = read_csv('../osic_version_03/label/offset.csv')
    # train_set = LsmDataset(train_bundle, features=features[:160], tag='train')
    # val_set = LsmDataset(val_bundle, features=features[160:], tag='val')
    train_set = LsmDataset(train_bundle, features=None, tag='train')
    val_set = LsmDataset(val_bundle, features=None, tag='val')
    # val_set = None

    model = LsmModel()
    model.fit(train_set, val_set)
    print(model.a.shape)
    model.save_model('model/lsm_15.npy')
def evaluate_accuracy(model, loader, vali_data, batchsize, dim_input_course,
                      dim_input_grade, dim_input_major):
    model.eval()
    summ1 = 0  # ABCDF
    summ2 = 0  # credit/uncredit
    len1 = len2 = 0
    for step, (batch_x, batch_y) in enumerate(loader):  # batch_x: indices of the batch data
        processed_data = process_data(batch_x.numpy(), vali_data, batchsize,
                                      dim_input_course, dim_input_grade, dim_input_major)
        padded_input = Variable(torch.Tensor(processed_data[0]), requires_grad=False).cuda()
        seq_len = processed_data[1]
        padded_label = Variable(torch.Tensor(processed_data[2]), requires_grad=False).cuda()
        # clear hidden states
        model.hidden = model.init_hidden()
        # compute output
        y_pred = model(padded_input, seq_len).cuda()
        # only compute the accuracy for the testing period
        accura = accuracy(y_pred, seq_len, padded_label)
        len1 += accura[3]
        len2 += accura[4]
        summ1 += accura[0] * accura[3]
        summ2 += accura[1] * accura[4]
    average_accuracy = (summ1 + summ2) / (len1 + len2)
    return average_accuracy
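# Hedged sketch (not part of the original code): the tuple returned by the
# `accuracy` helper above is assumed to be (acc_grade, acc_credit, acc_total,
# n_grade, n_credit). The loop then pools per-batch accuracies into a single
# sample-weighted mean, equivalent to:
def weighted_mean(pairs):
    """pairs: iterable of (accuracy, sample_count) tuples."""
    total = sum(n for _, n in pairs)
    return sum(a * n for a, n in pairs) / total

# e.g. weighted_mean([(0.9, 80), (0.5, 20)]) == 0.82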
def evaluate_loss(model, loader, vali_data, batchsize, dim_input_course,
                  dim_input_grade, dim_input_major, weight1, weight2):
    model.eval()
    summ = []
    for step, (batch_x, batch_y) in enumerate(loader):  # batch_x: indices of the batch data
        processed_data = process_data(batch_x.numpy(), vali_data, batchsize,
                                      dim_input_course, dim_input_grade, dim_input_major)
        padded_input = Variable(torch.Tensor(processed_data[0]), requires_grad=False).cuda()
        seq_len = processed_data[1]
        padded_label = Variable(torch.Tensor(processed_data[2]), requires_grad=False).cuda()
        # clear hidden states
        model.hidden = model.init_hidden()
        model.hidden[0] = model.hidden[0].cuda()
        model.hidden[1] = model.hidden[1].cuda()
        # compute output
        y_pred = model(padded_input, seq_len).cuda()
        # only compute the loss for the testing period
        loss = model.vali_loss(y_pred, seq_len, padded_label, weight1, weight2).cuda()
        summ.append(loss.item())  # loss.data[0] is deprecated since PyTorch 0.4
    average_loss = np.average(summ)
    return average_loss
def train(model, optimizer, loader, train_data, epoch):
    model.train()
    summ = []
    for step, (batch_x, batch_y) in enumerate(loader):  # batch_x: indices of the batch data
        print('Epoch: ', epoch, ' | Iteration: ', step + 1)
        processed_data = process_data(batch_x.numpy(), train_data, batchsize,
                                      dim_input_course, dim_input_grade)
        padded_input = Variable(torch.Tensor(processed_data[0]), requires_grad=False).cuda()
        seq_len = processed_data[1]
        padded_label = Variable(torch.Tensor(processed_data[2]), requires_grad=False).cuda()
        # clear gradients and hidden state
        optimizer.zero_grad()
        model.hidden = model.init_hidden()
        model.hidden[0] = model.hidden[0].cuda()
        model.hidden[1] = model.hidden[1].cuda()
        y_pred = model(padded_input, seq_len).cuda()
        loss = model.loss(y_pred, padded_label).cuda()
        print('Epoch ' + str(epoch) + ': the ' + str(step + 1) + '-th iteration: loss '
              + str(loss.item()) + '\n')
        loss.backward()
        if clip_gradient > 0:
            clip_grad_norm(model.parameters(), clip_gradient)
        optimizer.step()
        summ.append(loss.item())
    average_loss = np.mean(summ)
    return average_loss
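# Note on the clipping call above: `clip_grad_norm` was renamed in PyTorch 0.4,
# so on modern PyTorch the equivalent (in-place) call is:
#
#   from torch.nn.utils import clip_grad_norm_
#   clip_grad_norm_(model.parameters(), clip_gradient)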
async def meh(event):
    sender = await event.get_sender()
    if '/start' in event.raw_text:
        await bot.send_message(sender, 'Hi')
        if not database.is_user_new(sender.id):
            await bot.send_message(sender, 'Pick your categories:', buttons=buttons)
        else:
            await bot.send_message(sender, 'Want to change your selection?',
                                   buttons=buttons_to_change)
    else:
        await bot.send_message(sender, 'Type: /start')

    with open('last_date.txt') as f:  # context managers so the handle is always closed
        last_time = f.read()
    if abs(datetime.datetime.now().hour - int(last_time)) > 0:
        with open('last_date.txt', 'w') as f:
            f.write(str(datetime.datetime.now().hour))
        # NOTE: only the hour of day (0-23) is stored, so this difference can
        # never exceed 23 and the daily reset below never fires as written.
        if abs(datetime.datetime.now().hour - int(last_time)) > 24:
            database.truncate_all_data()
            database.truncate_tempdata()
        ch = database.get_channels(amount_bottom=10, amount_top=390)
        await parse(ch, 400, sleep_time=10, update_id=True)
        process_data(probability_sim=0.3, rnn=True)
        await send_posts(amount_to_send=0)
    if '/help' in event.raw_text:
        pass
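# Hedged sketch (an assumption, not the bot's actual code): storing a full ISO
# timestamp instead of a bare hour makes the 24-hour reset above reachable.
# The file name 'last_date.txt' is kept from the snippet.
import datetime

def daily_reset_due(path='last_date.txt'):
    now = datetime.datetime.now()
    try:
        with open(path) as f:
            last = datetime.datetime.fromisoformat(f.read().strip())
    except (FileNotFoundError, ValueError):
        last = datetime.datetime.min  # no valid record: treat the reset as due
    if now - last >= datetime.timedelta(hours=24):
        with open(path, 'w') as f:
            f.write(now.isoformat())
        return True
    return False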
def main():
    bundle = process_data('raw/train.csv', add_noise=False)
    train_bundle, val_bundle = bundle[:160], bundle[160:]

    train_set = OsicDataset(train_bundle, train_transform, tag='train', sample_num=100)
    val_set = OsicDataset(val_bundle, val_transform, tag='val')

    lsm_model = LsmModel()
    lsm_model.load_model('model/lsm_09.npy')

    model = OsicModel('cnn_01',
                      lsm_model=lsm_model,
                      net=NetFc(input_dim=13, input_channel=1, output_dim=1),
                      learning_rate=5e-5)
    model.fit(train_set, val_set, epochs=200, batch_size=32)
def pretrain(rnn_params, input_file, training_split, feature_file, test_file, judgement_file):
    raw_data = get_corpus_data(input_file)
    inventory, phone2ix, ix2phone, training, dev = process_data(
        raw_data, dev=False, training_split=training_split)
    inventory_size = len(inventory)
    rnn_params['inv_size'] = inventory_size
    if feature_file is None:
        RNN = Emb_RNNLM(rnn_params)
        print('Fitting embedding model...')
    else:
        features, num_feats = process_features(feature_file, inventory)
        # Build the feature table that replaces the embedding table; no grad
        # because the features are fixed.
        feature_table = torch.zeros(inventory_size, num_feats, requires_grad=False)
        for i in range(inventory_size):
            feature_table[i] = torch.tensor(features[ix2phone[i]])
        rnn_params['d_feats'] = num_feats
        RNN = Feature_RNNLM(rnn_params, feature_table)
        print('Fitting feature model...')
    RNN = RNN.cuda()
    dev = dev.cuda()
    train_lm(training, dev, rnn_params, RNN)
    RNN.eval()
    # TODO: create the models/ and results/ folders automatically
    torch.save(
        RNN,
        f"models/rnn_{rnn_params['stress']}_{rnn_params['num_layers']}_{rnn_params['d_emb']}_{rnn_params['d_hid']}.pt"
    )
    prepend = "stress_" if rnn_params["stress"] else ""
    pickle.dump(phone2ix, open(f'models/{prepend}phone2ix.bin', mode='wb'))
    pickle.dump(ix2phone, open(f'models/{prepend}ix2phone.bin', mode='wb'))
    pickle.dump(inventory, open(f'models/{prepend}inventory.bin', mode='wb'))
    get_probs(test_file, RNN, phone2ix, judgement_file)
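# Hedged aside (an assumption, not part of the repo): if Feature_RNNLM only
# ever looks up rows of `feature_table` by phone index, the same frozen lookup
# can be expressed with PyTorch's pretrained-embedding constructor:
#
#   lookup = torch.nn.Embedding.from_pretrained(feature_table, freeze=True)
#   feats = lookup(torch.tensor([phone2ix['a']]))  # fixed feature row, no grad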
                    default=DEFAULT_DEV,
                    help='Trains on all data and tests on a small subset.')
parser.add_argument('--device', default=DEFAULT_DEVICE, help='cpu or cuda')
parser.add_argument('--num_rnns', default=DEFAULT_NUM_RNNS, help='number of RNNs')
args = parser.parse_args()

if args.device is None:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
    device = args.device
print('Running on', device)

raw_data = get_corpus_data(args.input_file)
inventory, phone2ix, ix2phone, training, dev = process_data(
    raw_data, device, dev=args.dev, training_split=args.training_split)
print('training size', training.size())
inventory_size = len(inventory)

rnn_params = {}
rnn_params['d_emb'] = args.d_emb
rnn_params['d_hid'] = args.d_hid
rnn_params['num_layers'] = args.num_layers
rnn_params['batch_size'] = args.batch_size
rnn_params['learning_rate'] = args.learning_rate
rnn_params['epochs'] = args.epochs
rnn_params['tied'] = args.tied
rnn_params['device'] = device
rnn_params['inv_size'] = inventory_size
rnns = {}
def evaluate_metrics(model, loader, vali_data, batchsize, dim_input_course,
                     dim_input_grade, dim_input_major):
    model.eval()
    summ1 = 0  # >=B or <B
    summ2 = 0  # credit/uncredit
    len1 = len2 = 0
    tp = np.zeros(2)
    tn = np.zeros(2)
    true = np.zeros(2)
    false = np.zeros(2)
    predict_true = np.zeros(2)
    predict_false = np.zeros(2)
    for step, (batch_x, batch_y) in enumerate(loader):  # batch_x: indices of the batch data
        processed_data = process_data(batch_x.numpy(), vali_data, batchsize,
                                      dim_input_course, dim_input_grade, dim_input_major)
        padded_input = Variable(torch.Tensor(processed_data[0]), requires_grad=False).cuda()
        seq_len = processed_data[1]
        padded_label = Variable(torch.Tensor(processed_data[2]), requires_grad=False).cuda()
        # clear hidden states
        model.hidden = model.init_hidden()
        model.hidden[0] = model.hidden[0].cuda()
        model.hidden[1] = model.hidden[1].cuda()
        # compute output
        y_pred = model(padded_input, seq_len)
        # only compute the accuracy for the testing period
        accura = accuracy(y_pred, seq_len, padded_label)
        len1 += accura[3]
        len2 += accura[4]
        summ1 += accura[0] * accura[3]
        summ2 += accura[1] * accura[4]
        print('>=B or not', accura[0], 'credit/uncredit', accura[1], 'total', accura[2])
        # accumulate tp, tn and the per-class totals
        sen = sensitivity(y_pred, seq_len, padded_label)
        tp += sen[0]
        tn += sen[1]
        true += sen[2]
        false += sen[3]
        predict_true += sen[4]
        predict_false += sen[5]
    average_metric1 = summ1 / len1
    average_metric2 = summ2 / len2
    average_metric = (summ1 + summ2) / (len1 + len2)
    print("num of >=B or <B: ", len1, "num of credit/uncredit: ", len2)
    print("On average: ", average_metric1, average_metric2, average_metric)
    tpr = tp / true
    fpr = (predict_true - tp) / false
    fnr = (predict_false - tn) / true
    tnr = tn / false
    precision_B = (tn / predict_false)[0]
    f_value_B = 2 / (1 / tnr[0] + 1 / precision_B)  # harmonic mean of recall and precision
    precision_uncredit = (tn / predict_false)[-1]
    f_value_uncredit = 2 / (1 / tnr[-1] + 1 / precision_uncredit)
    f_value = np.append(f_value_B, f_value_uncredit)
    print("tpr: ", tpr)
    print("fpr: ", fpr)
    print("fnr: ", fnr)
    print("tnr: ", tnr)
    print('F: ', f_value, 'average F:', np.average(f_value))
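# Hedged cross-check (not part of the original code): given flat 0/1 label and
# prediction vectors for one of the tasks, sklearn reproduces the per-class F
# value computed by hand above. The argument names are hypothetical.
from sklearn.metrics import precision_recall_fscore_support

def f_per_class(y_true, y_pred):
    precision, recall, f, support = precision_recall_fscore_support(
        y_true, y_pred, labels=[0, 1])
    return f  # array([F_class0, F_class1])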
def main(_):
    word2id = {}
    ent2id = {}
    rel2id = {}
    words = set()
    relations = set()
    entities = set()

    FLAGS.checkpoint_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.data_file)
    FLAGS.checkpoint_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.KB_file)
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)
    KB_file = '%s/%s.txt' % (FLAGS.data_dir, FLAGS.KB_file)
    data_file = '%s/%s.txt' % (FLAGS.data_dir, FLAGS.data_file)

    start = time.time()
    if FLAGS.data_file == "WC-C":
        Q, A, P, S, Triples, FLAGS.query_size = process_data_c(
            KB_file, data_file, word2id, rel2id, ent2id, words, relations, entities)
        FLAGS.path_size = len(P[0][0])  # 5
    else:
        Q, A, P, S, Triples, FLAGS.query_size = process_data(
            KB_file, data_file, word2id, rel2id, ent2id, words, relations, entities)
        FLAGS.path_size = len(P[0])  # 5 or 7
    FLAGS.nhop = FLAGS.path_size // 2  # 2 or 3
    print("read data cost %f seconds" % (time.time() - start))

    FLAGS.nwords = len(word2id)
    FLAGS.nrels = len(rel2id)
    FLAGS.nents = len(ent2id)

    # sklearn.cross_validation was removed; use model_selection.train_test_split
    trainQ, testQ, trainA, testA, trainP, testP, trainS, testS = train_test_split(
        Q, A, P, S, test_size=.1, random_state=123)
    trainQ, validQ, trainA, validA, trainP, validP, trainS, validS = train_test_split(
        trainQ, trainA, trainP, trainS, test_size=.11, random_state=0)

    # for UNSEEN relations (incomplete KB setting; change data_utils.py)
    if FLAGS.unseen:
        id_c = []
        for idx in range(trainQ.shape[0]):
            if trainP[idx][-4] in (1, 2, 3):
                id_c.append(idx)
        trainQ = np.delete(trainQ, id_c, axis=0)
        trainA = np.delete(trainA, id_c, axis=0)
        trainP = np.delete(trainP, id_c, axis=0)
        trainS = np.delete(trainS, id_c, axis=0)

    n_train = trainQ.shape[0]
    n_test = testQ.shape[0]
    n_val = validQ.shape[0]
    print("Training Size", n_train)
    print("Validation Size", n_val)
    print("Testing Size", n_test)

    # other data and some flags
    id2word = dict(zip(word2id.values(), word2id.keys()))
    id2rel = dict(zip(rel2id.values(), rel2id.keys()))
    # e.g. {0: '<end>', 1: 'cause_of_death', 2: 'gender', 3: 'profession',
    #       4: 'institution', 5: 'religion', 6: 'parents', 7: 'location',
    #       8: 'place_of_birth', 9: 'nationality', 10: 'place_of_death',
    #       11: 'spouse', 12: 'children', 13: 'ethnicity'}
    train_labels = np.argmax(trainA, axis=1)
    test_labels = np.argmax(testA, axis=1)
    valid_labels = np.argmax(validA, axis=1)
    print(flags.FLAGS.__flags)

    # batch index ranges; the last few examples that do not fill a batch are dropped
    batches = list(
        zip(range(0, n_train - FLAGS.batch_size, FLAGS.batch_size),
            range(FLAGS.batch_size, n_train, FLAGS.batch_size)))
    r = np.arange(n_train)  # instance indices to be shuffled
    # total instances used in training (integer division; '/' would give a float)
    l = n_train // FLAGS.batch_size * FLAGS.batch_size

    with tf.Session() as sess:
        # The WC-C data set uses the IRN_C model and its path-accuracy variant;
        # everything else in the training loop is shared between the two cases.
        if FLAGS.data_file == "WC-C":
            model = IRN_C(FLAGS, sess)

            def multi_acc(paths, preds):
                return MultiAcc_C(paths, preds)
        else:
            model = IRN(FLAGS, sess)

            def multi_acc(paths, preds):
                return MultiAcc(paths, preds, FLAGS.path_size)

        print("KB Size", Triples.shape[0])  # 144
        pre_batches = list(
            zip(range(0, Triples.shape[0] - FLAGS.batch_size, FLAGS.batch_size),
                range(FLAGS.batch_size, Triples.shape[0], FLAGS.batch_size)))
        pre_val_preds = model.predict(Triples, validQ, validP)
        pre_test_preds = model.predict(Triples, testQ, testP)

        best_val_epoch = -1
        best_val_acc = multi_acc(validP, pre_val_preds)
        best_val_true_acc = InSet(validP, validS, pre_val_preds)

        for t in range(1, FLAGS.nepoch + 1):
            start = time.time()
            np.random.shuffle(batches)
            for i in range(FLAGS.inner_nepoch):
                np.random.shuffle(pre_batches)
                pre_total_cost = 0.0
                for s, e in pre_batches:
                    pre_total_cost += model.batch_pretrain(
                        Triples[s:e], trainQ[0:FLAGS.batch_size],
                        trainA[0:FLAGS.batch_size],
                        np.argmax(trainA[0:FLAGS.batch_size], axis=1),
                        trainP[0:FLAGS.batch_size])
            total_cost = 0.0
            for s, e in batches:
                total_cost += model.batch_fit(
                    Triples[s:e], trainQ[s:e], trainA[s:e],
                    np.argmax(trainA[s:e], axis=1), trainP[s:e])
            if t % 1 == 0:
                train_preds = model.predict(Triples, trainQ, trainP)
                train_acc = multi_acc(trainP, train_preds)
                train_true_acc = InSet(trainP, trainS, train_preds)
                val_preds = model.predict(Triples, validQ, validP)  # (n_val, 1), each an answer id
                val_acc = multi_acc(validP, val_preds)
                val_true_acc = InSet(validP, validS, val_preds)
                if val_true_acc > best_val_true_acc:
                    best_val_epoch = t
                    best_val_true_acc = val_true_acc
                    model.store()
                print('-----------------------')
                print('Epoch', t)
                print('timing', (time.time() - start))
                print('Total Cost:', total_cost)
                print('Train Accuracy:', train_true_acc)
                print('Validation Accuracy:', val_true_acc)
                print('Best Validation epoch & Acc:', best_val_epoch, best_val_true_acc)
                print('-----------------------')
                # optional debugging: inspect the nearest words per relation
                # if not t % 100 == 0:
                #     continue
                # idx = model.match()
                # for i in range(1, 14):
                #     print("relation: ", id2word[i])
                #     print("similar words are: ")
                #     for iid in idx[i]:
                #         print(id2word[iid])
                # print('-----------------------')
                # print('-----------------------')
def train(vi=None, device='/gpu:1', label_Flag=False, fps='Morgan', pro='psa',
          fps_size=512, nclass=2):
    # Data set
    ds = 'HIV'
    prop = pro

    # Settings
    if fps == 'Maccs':
        fps_size = 167
    continue_training = True
    if continue_training:
        preTrain = 'preTrain'
    else:
        preTrain = 'scratch'
    save_model = False
    batch_size = 256
    _flag_noise = False
    nEpoch = 501
    fps_dim = fps_size
    latent_space = 6
    n_classes = nclass
    layers_dim = np.array([fps_dim // 2, fps_dim // 8, latent_space])

    data = dp.process_data(ds, fps_type=fps, n_classes=n_classes, nBits=fps_dim,
                           test_size=600, prop=prop)
    len_train = len(data['fps_train'])
    len_val = len(data['fps_test'])

    if fps == 'Morgan':
        val_lr_enc = 0.00001
        val_lr_dec = 0.0001
        val_lr_dis = 0.00001
    elif fps == 'Maccs':
        val_lr_enc = 0.00003
        val_lr_dec = 0.0001
        val_lr_dis = 0.000012
    decay_steps = 1000
    thrs_noise = 0.8

    # Paths
    model_path = os.path.join(
        '/mnt/HDD1/models/',
        ds + '_' + fps + '_' + prop + '_classes' + str(n_classes))
    preTrain_model_path = os.path.join(
        '/mnt/HDD1/models/',
        'Train' + '_' + fps + '_' + prop + '_classes' + str(n_classes))
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    with tf.device(device):
        with tf.variable_scope('input'):
            # real and fake fingerprint placeholders
            real_fps = tf.placeholder(tf.float32, shape=[None, fps_dim], name='real_fps')
            gen_fps = tf.placeholder(tf.float32, shape=[None, fps_dim], name='gen_fps')
            if label_Flag:
                dist_encode = tf.placeholder(
                    tf.float32, shape=[None, layers_dim[2] + n_classes], name='real_z')
            else:
                dist_encode = tf.placeholder(
                    tf.float32, shape=[None, layers_dim[2]], name='real_z')
            labels = tf.placeholder(tf.float32, shape=[None, n_classes], name='labels')
            is_train_enc = tf.placeholder(tf.bool, name='is_train_enc')
            is_train_dec = tf.placeholder(tf.bool, name='is_train_dec')
            is_train_dis = tf.placeholder(tf.bool, name='is_train_dis')
            global_step = tf.placeholder(tf.float32, name='global_step')
            lengt = tf.placeholder(tf.float32, name='lengt')
            l = tf.placeholder(tf.float32, shape=[None, latent_space], name='l')
            fp = tf.placeholder(tf.float32, shape=[None, fps_dim], name='fp')

        lr_dis = tf.train.polynomial_decay(val_lr_dis, global_step, decay_steps,
                                           end_learning_rate=0.000001, power=1.0)
        lr_enc = tf.train.polynomial_decay(val_lr_enc, global_step, decay_steps,
                                           end_learning_rate=0.000001, power=1.0)
        lr_dec = tf.train.polynomial_decay(val_lr_dec, global_step, decay_steps,
                                           end_learning_rate=0.000001, power=2.0)

        # WGAN-style adversarial autoencoder
        real_encode = module.dense_encoder(real_fps, fps_dim, layers_dim,
                                           is_train=is_train_enc, reuse=False)
        real_decode = module.dense_decoder(real_encode, fps_dim, layers_dim,
                                           is_train=is_train_dec, reuse=False)
        if label_Flag:
            real_encode = tf.concat([real_encode, labels], 1)

        # Discriminator
        real_result = module.dense_discriminator(dist_encode, layers_dim,
                                                 is_train=is_train_dis,
                                                 n_classes=n_classes, reuse=False,
                                                 label_Flag=label_Flag)
        fake_result = module.dense_discriminator(real_encode, layers_dim,
                                                 is_train=is_train_dis,
                                                 n_classes=n_classes, reuse=True,
                                                 label_Flag=label_Flag)
        decode = module.heavside(
            module.dense_decoder(l, fps_dim, layers_dim, is_train=False, reuse=True))
        encode = module.dense_encoder(fp, fps_dim, layers_dim, is_train=False, reuse=True)

        # Loss calculations (WGAN losses; earlier MSE and sigmoid-cross-entropy
        # variants are kept below for reference)
        # dis_loss_real = tf.losses.mean_squared_error(real_result, tf.ones_like(real_result))
        # dis_loss_fake = tf.losses.mean_squared_error(fake_result, -tf.ones_like(fake_result))
        # dis_loss_real = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(real_result), logits=real_result)
        # dis_loss_fake = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(fake_result), logits=fake_result)
        dis_loss_fake = tf.reduce_mean(fake_result)
        dis_loss_real = -tf.reduce_mean(real_result)
        dis_loss = tf.reduce_mean([dis_loss_real, dis_loss_fake])
        enc_loss = -tf.reduce_mean(fake_result)
        # enc_loss = tf.losses.mean_squared_error(fake_result, tf.ones_like(fake_result))
        dec_loss = tf.losses.mean_squared_error(real_fps, real_decode)
        # dec_loss = tf.reduce_mean(tf.reduce_sum(tf.abs(real_fps - real_decode)))

        # Trainers
        t_vars = tf.trainable_variables()
        dis_vars = [var for var in t_vars if 'dense_discriminator' in var.name]
        enc_vars = [var for var in t_vars if 'dense_encoder' in var.name]
        dec_vars = [var for var in t_vars if 'dense_decoder' in var.name]
        trainer_dis_real = tf.train.AdamOptimizer(
            learning_rate=lr_dis, beta1=0.9, beta2=0.999, epsilon=1e-08,
            use_locking=False, name='Adam_discriminator').minimize(
                dis_loss_real, var_list=dis_vars)
        trainer_dis_fake = tf.train.AdamOptimizer(
            learning_rate=lr_dis, beta1=0.9, beta2=0.999, epsilon=1e-08,
            use_locking=False, name='Adam_discriminator').minimize(
                dis_loss_fake, var_list=dis_vars)
        trainer_enc = tf.train.AdamOptimizer(
            learning_rate=lr_enc, beta1=0.9, beta2=0.999, epsilon=1e-08,
            use_locking=False, name='Adam_encoder').minimize(
                enc_loss, var_list=enc_vars)
        trainer_dec = tf.train.AdamOptimizer(
            learning_rate=lr_dec, beta1=0.9, beta2=0.999, epsilon=1e-08,
            use_locking=False, name='Adam_decoder').minimize(
                dec_loss, var_list=dec_vars)
        # weight clipping ops for the WGAN critic
        d_clip = [v.assign(tf.clip_by_value(v, -0.01, 0.01)) for v in dis_vars]

        # Accuracy calculations
        less_then_05 = tf.cast(
            tf.math.less_equal(tf.zeros_like(real_result), real_result), tf.float32)
        count = tf.reduce_sum(less_then_05)
        acc_real = tf.divide(count, lengt)
        acc_fake = tf.divide(
            tf.reduce_sum(
                tf.cast(tf.math.less_equal(fake_result, tf.zeros_like(fake_result)),
                        tf.float32)), lengt)
        acc_dis = tf.reduce_mean([acc_real, acc_fake])
        acc_enc = 1 - acc_fake
        gen_fps = module.heavside(real_decode)
        acc_dec = tf.metrics.accuracy(module.heavside(real_fps),
                                      module.heavside(real_decode))

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.43)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    # continue training from the pretraining checkpoint
    if continue_training:
        ckpt = tf.train.latest_checkpoint(preTrain_model_path)
        saver.restore(sess, ckpt)
    # gpu_options = tf.GPUOptions(allow_growth=True)
    # session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False,
    #                                 gpu_options=gpu_options)
    # threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    batch_num = math.floor(len(data['fps_train']) / batch_size)
    print('batch size: %d, batch num per epoch: %d, epoch num: %d'
          % (batch_size, batch_num, nEpoch))
    print('start training...')

    dec_iters, dis_iters, enc_iters = 5, 1, 0
    trainLoss_dis, trainAcc_dis = 0, 0
    trainLoss_enc, trainAcc_enc = 0, 0
    trainLoss_dec, trainAcc_dec = 0, 0
    valLoss_dis, valLoss_enc, valLoss_dec = 0, 0, 0
    valAcc_dis, valAcc_enc, valAcc_dec = 0, 0, 0

    for i in range(nEpoch):
        # adapt the per-network iteration counts to the current balance
        # between discriminator and encoder
        if trainAcc_dis < 0.505 and trainAcc_enc > 0.98:
            enc_iters = 1
            dis_iters = 7
            thrs_noise = 0.9
            _flag_noise = False
        elif trainAcc_dis < 0.505 and trainAcc_enc < 0.01:
            dis_iters = 1
            enc_iters = 1
            thrs_noise = 0.7
            _flag_noise = False
        else:
            _flag_noise = False
            thrs_noise = 0.95
            dis_iters = 5
            enc_iters = 1
        trainLoss_dis, trainAcc_dis = 0, 0
        trainLoss_enc, trainAcc_enc = 0, 0
        trainLoss_dec, trainAcc_dec = 0, 0
        valLoss_dis, valLoss_enc, valLoss_dec = 0, 0, 0
        valAcc_dis, valAcc_enc, valAcc_dec = 0, 0, 0

        max_iter = max([dec_iters, dis_iters, enc_iters])
        batch = dp.batch_gen(data['fps_train'], data['labels_train'],
                             batch_size=batch_size, n_dim=layers_dim[2],
                             n_labels=n_classes, label_Flag=label_Flag,
                             dic_iter=max_iter)
        print("Epoch %d" % i)

        train_real_z = distrib.normal_mixture(
            data['labels_train'], np.shape(data['labels_train'])[0],
            n_dim=layers_dim[2], n_labels=n_classes)
        val_real_z = distrib.normal_mixture(
            data['labels_val'], np.shape(data['labels_val'])[0],
            n_dim=layers_dim[2], n_labels=n_classes)
        if label_Flag:
            train_real_z = np.concatenate((train_real_z, data['labels_train']), axis=1)
            val_real_z = np.concatenate((val_real_z, data['labels_val']), axis=1)

        for j in range(batch_num):
            if _flag_noise and np.random.uniform(0, 1) > thrs_noise:
                _real_fps = batch['fps'][j] + np.random.normal(
                    0, 0.4, size=np.shape(batch['fps'][j]))
            else:
                _real_fps = batch['fps'][j]
            enc_dict = {
                real_fps: _real_fps,
                labels: batch['label'][j],
                global_step: i,
                is_train_enc: True,
                is_train_dis: False
            }
            # Update the discriminator
            for k in range(dis_iters):
                if _flag_noise and np.random.uniform(0, 1) > thrs_noise:
                    _real_fps = batch['fps'][j * dis_iters + k] + np.random.normal(
                        0, 0.4, size=np.shape(batch['fps'][j * dis_iters + k]))
                else:
                    _real_fps = batch['fps'][j * dis_iters + k]
                dis_dict = {
                    real_fps: _real_fps,
                    labels: batch['label'][j * dis_iters + k],
                    dist_encode: batch['real_z'][j * dis_iters + k],
                    global_step: i,
                    is_train_enc: False,
                    is_train_dis: True
                }
                sess.run([trainer_dis_real], feed_dict=dis_dict)
                sess.run([trainer_dis_fake], feed_dict=dis_dict)
            # Update the encoder
            for k in range(enc_iters):
                sess.run([trainer_enc], feed_dict=enc_dict)
            # Update the decoder
            for k in range(dec_iters):
                if _flag_noise and np.random.uniform(0, 1) > thrs_noise:
                    _real_fps = batch['fps'][j * dis_iters + k] + np.random.normal(
                        0, 0.2, size=np.shape(batch['fps'][j * dis_iters + k]))
                else:
                    _real_fps = batch['fps'][j * dis_iters + k]
                dec_dict = {
                    real_fps: _real_fps,
                    global_step: i,
                    is_train_dec: True,
                    is_train_enc: False
                }
                sess.run([trainer_dec], feed_dict=dec_dict)

        nom = 10000
        ds_size_nom = np.shape(data['fps_train'])[0] // nom + 1
        if i % 10 == 0:
            # encode the training set in chunks and score generated fingerprints
            l_space = np.zeros([latent_space, np.shape(data['fps_train'])[0]],
                               dtype=np.float32)
            for b in range(ds_size_nom):
                l_space[:, b * nom:b * nom + nom] = np.array(
                    sess.run([encode],
                             feed_dict={fp: data['fps_train'][b * nom:b * nom + nom, :]})
                )[0].T
            sample = GM.generate_latent(l_space, np.array(data['labels_train']))
            for j in sample.keys():
                generated_fingerprints = np.array(
                    sess.run([decode], feed_dict={l: sample[j]})[0])
                for k in range(n_classes):
                    avg_tver, max_tver, min_tver, u_tver, su_tver, nu_tver = sim.tversky(
                        data['fps_test'][k], generated_fingerprints, 1, 1)
                    arg1 = {
                        'Average_tversky': [avg_tver],
                        'Max_tversky': [max_tver],
                        'Min_tversky': [min_tver],
                        'Useful_tversky': [u_tver],
                        'Semiuseful_tversky': [su_tver],
                        'Notuseful_tversky': [nu_tver]
                    }
                    log.log_sim_data(i, arg1, flag=label_Flag, fps=fps, dSet=ds,
                                     prop=prop, n_class=n_classes, preTrain=preTrain)

        d = np.zeros([np.shape(data['fps_train'])[0], latent_space], dtype=np.float32)
        for b in range(ds_size_nom):
            train_loss_dict = {
                real_fps: data['fps_train'][b * nom:b * nom + nom],
                labels: data['labels_train'][b * nom:b * nom + nom],
                dist_encode: train_real_z,
                is_train_dec: False,
                is_train_enc: False,
                is_train_dis: False,
                lengt: len_train
            }
            val_loss_dict = {
                real_fps: data['fps_val'],
                labels: data['labels_val'],
                dist_encode: val_real_z,
                is_train_dec: False,
                is_train_enc: False,
                is_train_dis: False,
                lengt: len_val
            }
            d[b * nom:b * nom + nom, :] = sess.run(
                [encode],
                feed_dict={fp: data['fps_train'][b * nom:b * nom + nom, :]})[0]
            trainLoss_dis += sess.run([dis_loss], feed_dict=train_loss_dict)[0]
            trainLoss_enc += sess.run([enc_loss], feed_dict=train_loss_dict)[0]
            trainLoss_dec += sess.run([dec_loss], feed_dict=train_loss_dict)[0]
            valLoss_dis += sess.run([dis_loss], feed_dict=val_loss_dict)[0]
            valLoss_enc += sess.run([enc_loss], feed_dict=val_loss_dict)[0]
            valLoss_dec += sess.run([dec_loss], feed_dict=val_loss_dict)[0]
            trainAcc_dis += sess.run([acc_dis], feed_dict=train_loss_dict)[0]
            valAcc_dis += sess.run([acc_dis], feed_dict=val_loss_dict)[0]
            trainAcc_enc += sess.run([acc_enc], feed_dict=train_loss_dict)[0]
            valAcc_enc += sess.run([acc_enc], feed_dict=val_loss_dict)[0]
            trainAcc_dec += sess.run([acc_dec], feed_dict=train_loss_dict)[0][0]
            valAcc_dec += sess.run([acc_dec], feed_dict=val_loss_dict)[0][0]

        print(sess.run([lr_dis], feed_dict={global_step: i}))
        print('Discriminator trainLoss = %f valLoss = %f trainAcc = %f valAcc = %f'
              % (trainLoss_dis / ds_size_nom, valLoss_dis / ds_size_nom,
                 trainAcc_dis / (ds_size_nom + 1), valAcc_dis / ds_size_nom))
        print('Encoder trainLoss = %f valLoss = %f trainAcc = %f valAcc = %f'
              % (trainLoss_enc / ds_size_nom, valLoss_enc / ds_size_nom,
                 trainAcc_enc / ds_size_nom, valAcc_enc / ds_size_nom))
        print('Decoder trainLoss = %f valLoss = %f trainAcc = %f valAcc = %f'
              % (trainLoss_dec / ds_size_nom, valLoss_dec / ds_size_nom,
                 trainAcc_dec / ds_size_nom, valAcc_dec / ds_size_nom))

        arg = {'Train_loss': [], 'Val_loss': [], 'Train_acc': [], 'Val_acc': []}
        arg['Train_loss'] = [trainLoss_dis / ds_size_nom, trainLoss_enc / ds_size_nom,
                             trainLoss_dec / ds_size_nom]
        arg['Val_loss'] = [valLoss_dis / ds_size_nom, valLoss_enc / ds_size_nom,
                           valLoss_dec / ds_size_nom]
        arg['Train_acc'] = [trainAcc_dis / ds_size_nom, trainAcc_enc / ds_size_nom,
                            trainAcc_dec / ds_size_nom]
        arg['Val_acc'] = [valAcc_dis / ds_size_nom, valAcc_enc / ds_size_nom,
                          valAcc_dec / ds_size_nom]
        log.log_train_data(i, arg, flag=label_Flag, fps=fps, dSet=ds, prop=prop,
                           n_class=n_classes, preTrain=preTrain)

        if vi is not None:
            # scatter each latent dimension pair (2h, 2h+1) for visualisation
            d1 = np.empty(np.shape(d)[0] * latent_space // 2, dtype=np.float32)
            d2 = np.empty(np.shape(d)[0] * latent_space // 2, dtype=np.float32)
            c = np.empty(np.shape(d)[0] * latent_space // 2, dtype=np.int32)
            for h in range(latent_space // 2):
                d1[np.shape(d)[0] * h:np.shape(d)[0] * h + np.shape(d)[0]] = d[:, 2 * h]
                d2[np.shape(d)[0] * h:np.shape(d)[0] * h + np.shape(d)[0]] = d[:, 2 * h + 1]
                c[np.shape(d)[0] * h:np.shape(d)[0] * h + np.shape(d)[0]] = \
                    np.nonzero(data['labels_train'])[1]
            # vi.update(d1_avg, d2_avg, np.nonzero(data['labels_train'])[1])
            vi.update(d1, d2, c)

        if i % 50 == 0 and i != 0:
            if save_model:
                saver.save(sess, os.path.join(model_path, 'model' + str(i) + '.ckpt'))
    sess.close()
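# Hedged observation on the loop above: the graph defines WGAN weight-clipping
# ops (`d_clip`) but the training loop never runs them. If clipping is
# intended, a minimal sketch is to run the ops right after each critic step:
#
#   sess.run([trainer_dis_real], feed_dict=dis_dict)
#   sess.run([trainer_dis_fake], feed_dict=dis_dict)
#   sess.run(d_clip)  # keep critic weights in [-0.01, 0.01]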
def main(_):
    word2id = {}
    ent2id = {}
    rel2id = {}
    words = set()
    relations = set()
    entities = set()

    FLAGS.checkpoint_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.data_file)
    FLAGS.checkpoint_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.KB_file)
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)
    KB_file = '%s/%s.txt' % (FLAGS.data_dir, FLAGS.KB_file)
    data_file = '%s/%s.txt' % (FLAGS.data_dir, FLAGS.data_file)

    start = time.time()
    if FLAGS.data_file == "WC-C":
        Q, A, P, S, Triples, FLAGS.query_size = process_data_c(
            KB_file, data_file, word2id, rel2id, ent2id, words, relations, entities)
        FLAGS.path_size = len(P[0][0])  # 5
    else:
        Q, A, P, S, Triples, FLAGS.query_size = process_data(
            KB_file, data_file, word2id, rel2id, ent2id, words, relations, entities)
        FLAGS.path_size = len(P[0])  # 5 or 7
    FLAGS.nhop = FLAGS.path_size // 2  # must be an integer
    print("read data cost %f seconds" % (time.time() - start))

    FLAGS.nwords = len(word2id)
    FLAGS.nrels = len(rel2id)
    FLAGS.nents = len(ent2id)

    trainQ, testQ, trainA, testA, trainP, testP, trainS, testS = train_test_split(
        Q, A, P, S, test_size=.1, random_state=123)

    # for UNSEEN relations (incomplete KB setting; change data_utils.py)
    if FLAGS.unseen:
        id_c = []
        for idx in range(trainQ.shape[0]):
            if trainP[idx][-4] in (1, 2, 3):
                id_c.append(idx)
        trainQ = np.delete(trainQ, id_c, axis=0)
        trainA = np.delete(trainA, id_c, axis=0)
        trainP = np.delete(trainP, id_c, axis=0)
        trainS = np.delete(trainS, id_c, axis=0)

    # other data and some flags
    id2word = dict(zip(word2id.values(), word2id.keys()))
    id2ent = dict(zip(ent2id.values(), ent2id.keys()))
    id2rel = dict(zip(rel2id.values(), rel2id.keys()))
    # e.g. {0: '<end>', 1: 'cause_of_death', 2: 'gender', 3: 'profession',
    #       4: 'institution', 5: 'religion', 6: 'parents', 7: 'location',
    #       8: 'place_of_birth', 9: 'nationality', 10: 'place_of_death',
    #       11: 'spouse', 12: 'children', 13: 'ethnicity'}
    test_labels = np.argmax(testA, axis=1)
    print(flags.FLAGS.__flags)

    with tf.Session() as sess:
        if FLAGS.data_file == "WC-C":
            model = IRN_C(FLAGS, sess)
        else:
            model = IRN(FLAGS, sess)
        model.load()
        test_preds = model.predict(Triples, testQ, testP)
        if FLAGS.data_file == "WC-C":
            test_acc = MultiAcc_C(testP, test_preds)
        else:
            test_acc = MultiAcc(testP, test_preds, FLAGS.path_size)
        test_true_acc = InSet(testP, testS, test_preds)

        show_k = FLAGS.show_case_no if FLAGS.show_case_no < testQ.shape[0] else 0
        input_q = " ".join([id2word[w] for w in testQ[show_k]])
        # output = test_preds[0][0]
        path_words = []
        for j in range(FLAGS.path_size):
            if j % 2 == 0:
                path_words.append(id2ent[test_preds[show_k][j]])
            else:
                path_words.append(id2rel[test_preds[show_k][j]])
        output = "---".join(path_words)

        if FLAGS.show_case_only:
            print('-----------------------')
            print('test input:', input_q)
            print('test output:', output)
            print('-----------------------')
            return
        print('-----------------------')
        print('Test Data', data_file)
        print('Test Accuracy:', test_true_acc)
        print('Test Accuracy for whole Path:', test_acc)
        print('-----------------------')
"""
Created on Thu Sep 6 15:47:18 2018

@author: huangjin
"""
from data_all_get import data_to_local
from data_process import process_data
from data_fea_gen import gen_feature_file
from data_split import split_data
from model_train import train_model
from model_pred import pred_model
from evaluation import evaluation_result

if __name__ == '__main__':
    # download the data to local storage
    data_to_local()
    # process the data
    process_data()
    # extract features: input values and the prediction-target name
    gen_feature_file('oper_rev')
    # split the data set
    split_data('2011-03-31', '2016-12-31', '2017-03-31', '2017-12-31',
               '2017-12-31', '2018-03-31', 'oper_rev')
    # train the model and save it
    train_model('train.csv', 'valid.csv', '2018-03-31', 'oper_rev')
    # predict
    pred_model('2018-03-31', 'oper_rev')
    # evaluate
    evaluation_result('2018-03-31', 'oper_rev')
print "y size: ", len(y) plt.scatter(x, y, c=color) plt.xlabel(xname) plt.ylabel(yname) # add legend classes = ['0', '1'] class_colours = ['r', 'g'] recs = [] for i in range(len(class_colours)): recs.append(mpatches.Rectangle((0, 0), 1, 1, fc=class_colours[i])) plt.legend(recs, classes, loc='upper left') plt.show() train, test, features, features_non_numeric = data_process.read_data() train, test, features, features_non_numeric = data_process.process_data( train, test, features, features_non_numeric) tsize = 0.001 dtrain, dtest = cross_validation.train_test_split(train, test_size=tsize) #importance_feat(features) #Correlation_Matrix_plot(train) features = ['Customers', 'Sales', 'Promo'] data = dtest[features] Scatter_plot(data)
# Data sources (from the NYT repository, included as a submodule)
pop_data_dir = 'population-data'
covid_data_dir = 'covid-19-data'
county_data_filename = 'us-counties.csv'
state_data_filename = 'us-states.csv'

# County population data from the USDA
county_pop_data = dp.import_pop_data(os.path.join(pop_data_dir, county_data_filename))
state_pop_data = dp.import_pop_data(os.path.join(pop_data_dir, state_data_filename))

county_df = dp.import_data(os.path.join(covid_data_dir, county_data_filename))
county_df = county_df[county_df['fips'].notnull()]
# county_df = county_df[(county_df['fips'].notnull()) & (county_df['fips'] != 'nan')]
county_df = dp.process_data(county_df, county_pop_data)
state_df = dp.process_data(
    dp.import_data(os.path.join(covid_data_dir, state_data_filename)),
    state_pop_data)

states = state_df.state.unique()
states.sort()
last_date = county_df['date'].max()

# Get the county GeoJSON listing for placing data on the map
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

# ------------------------------------------------------------------------------
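# Hedged sketch (an assumption, not the app's actual plotting code): the
# GeoJSON loaded above is the standard input for a Plotly county choropleth.
# The column name 'cases_per_capita' is hypothetical.
import plotly.express as px

fig = px.choropleth(county_df, geojson=counties, locations='fips',
                    color='cases_per_capita', scope='usa')
fig.show()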
from cnn import create_convolutional_netowork
from data_generator import traing_data_generator, testing_data_generator
from data_process import process_data
from training_phase import train_cnn
from testing_phase import test_cnn

create_convolutional_netowork()
training_data, validation_data = process_data('Training_Dataset', 'Validation_Data',
                                              150, 'categorical')
train_cnn(training_data, validation_data)
# test_cnn(Skip_lable=False): pass True if you do not want the label displayed
test_cnn()