def easyDecode(confdict, verbose=True):
    """Decode raw input with a previously trained model.

    :param confdict: configuration dict understood by Data.read_config
    :param verbose: when True, print the data summary before decoding
    """
    decode_data = Data()
    decode_data.read_config(confdict)
    print('Model Decode')
    # Prefer the pickled .dset settings; fall back to the exported file.
    try:
        decode_data.load(decode_data.dset_dir)
    except TypeError:
        decode_data.load_export(decode_data.xpt_dir)
    # Re-apply the caller's config on top of whatever was loaded.
    decode_data.read_config(confdict)
    decode_data.HP_gpu = torch.cuda.is_available()
    print('Decoding source: ', decode_data.raw_dir)
    if verbose:
        decode_data.show_data_summary()
    decode_data.generate_instance('raw')
    decode_results, pred_scores = load_model_decode(decode_data, 'raw')
    if decode_data.decode_dir:
        decode_data.write_decoded_results(decode_results, 'raw')
def __init__(self, model_config: dict):
    """Build the Data holder described by a model configuration.

    :param model_config: mapping with 'encoder_type', 'alphabet_path'
        and 'data_config_file' entries
    """
    self.model_config = model_config
    self.encoder_type = self.model_config['encoder_type']
    alphabet_file = self.model_config['alphabet_path'] + '/' + self.encoder_type + '.dset'
    config_path = self.model_config['data_config_file']
    self.data = Data(config_path, alphabet_file)
    # Lexicon indices that must never be replaced.
    protected_names = self.data.data_config['no_replaced_lexicon_name']
    self.no_replaced_lexicon_id = [
        self.data.lexicon_alphabet.get_index(name)
        for name in protected_names
    ]
def train_image():
    """Train the CAMEL model on the 'image' multi-label dataset."""
    datasets = ['yeast', 'scene', 'enron', 'image']
    dataset = datasets[3]
    features, labels = Data(dataset, label_type=0).load_data()
    # First 1800 samples for training, the remaining 200 for testing.
    train_x, train_y = features[0:1800], labels[0:1800]
    test_x, test_y = features[1800:2000], labels[1800:2000]
    camel.train_image(dataset, train_x, train_y, test_x, test_y,
                      rho=1, alpha=0.1, alpha_ban=0.5, lam2=0.1)
def __init__(self, sampling='NONE', feature_selection='AUTO', do_extract_features='NO'):
    """Configure the pipeline and load its shared resources.

    :param sampling: resampling strategy — 'UP', 'DOWN', 'SMOTE' or 'NONE'
    :param feature_selection: 'AUTO' or 'MANUAL'
    :param do_extract_features: 'YES' to extract features, 'NO' otherwise
    """
    self.sampling = sampling.upper()
    assert self.sampling in ['UP', 'DOWN', 'SMOTE', 'NONE']
    self.feature_selection = feature_selection.upper()
    assert self.feature_selection in ['AUTO', 'MANUAL']
    do_extract_features_input = do_extract_features.upper()
    assert do_extract_features_input in ['YES', 'NO']
    # FIX: `True if cond else False` is redundant — the comparison is already a bool.
    self.do_extract_features = do_extract_features_input == 'YES'
    self.data = Data()
    self.entities_by_bioconcept = self.data.learn_training_entries()
    self.nlp = spacy.load(self.MODEL)
    self.eppo = Eppo(time_to_sleep=0)
    self.cat = CatLife(time_to_sleep=0)
    # FIX: keys were unpacked and discarded — iterate .values() directly.
    self.columns = [column for columns in self.COLUMNS_BY_SOURCE.values() for column in columns]
    self.entities_df_path = self.data.cwd / self.DF_FILE_NAME
    self.df = self.get_training_df()
    # Widen pandas display limits for interactive inspection of the frame.
    pandas.set_option("display.max_rows", 500)
    pandas.set_option("display.max_columns", 50)
    pandas.set_option('display.width', 200)
def main():
    """Decode <input_file> with the model in <model_dir>, writing <output_file>."""
    doc = """
    Usage:
        predict <model_dir> <input_file> <output_file>
    """
    args = docopt(doc)
    config = {}
    config['status'] = 'decode'
    config['raw_dir'] = args['<input_file>']
    config['decode_dir'] = args['<output_file>']
    dset_dir = os.path.join(args['<model_dir>'], 'model.dset')
    load_model_dir = os.path.join(args['<model_dir>'], 'model.best.model')
    config['dset_dir'] = dset_dir
    config['load_model_dir'] = load_model_dir
    data = Data()
    data.read_config(config)
    data.HP_gpu = torch.cuda.is_available()
    # FIX: print() does not %-interpolate its arguments; the original printed
    # the literal format string and the seed as a tuple-like pair.
    print("Seed num: %s" % seed_num)
    if data.status == 'decode':
        print("MODEL: decode")
        data.load(data.dset_dir)
        # Re-apply the CLI config over the loaded settings.
        data.read_config(config)
        print(f"Reading from {data.raw_dir}")
        # exit(0)
        data.show_data_summary()
        data.generate_instance('raw')
        print(f"nbest: {data.nbest}")
        decode_results, pred_scores = load_model_decode(data, 'raw')
        if data.nbest:
            data.write_nbest_decoded_results(decode_results, pred_scores, 'raw')
        else:
            data.write_decoded_results(decode_results, 'raw')
    else:
        print("Invalid command")
def data_initialization(args):
    """Load a cached Data setting if present, otherwise build and cache one.

    :param args: parsed CLI arguments carrying dataset paths and flags
    :return: a fully initialised Data instance
    """
    store_dir = args.data_stored_directory
    dset_path = store_dir + args.dataset_name + "_dataset.dset"
    if os.path.exists(dset_path) and not args.refresh:
        # Reuse the previously serialized setting.
        data = load_data_setting(store_dir, args.dataset_name)
    else:
        data = Data()
        data.dataset_name = args.dataset_name
        data.norm_char_emb = args.norm_char_emb
        data.norm_gaz_emb = args.norm_gaz_emb
        data.number_normalized = args.number_normalized
        data.max_sentence_length = args.max_sentence_length
        data.build_gaz_file(args.gaz_file)
        data.generate_instance(args.train_file, "train", False)
        data.generate_instance(args.dev_file, "dev")
        data.generate_instance(args.test_file, "test")
        data.build_char_pretrain_emb(args.char_embedding_path)
        data.build_gaz_pretrain_emb(args.gaz_file)
        data.fix_alphabet()
        data.get_tag_scheme()
        # Persist so the next run can skip the expensive build.
        save_data_setting(data, store_dir)
    return data
def dispatch(config=None, status="train", data=None):
    """Run the model in train or decode mode, as directed by data.status.

    :param config: configuration file/dict passed to Data.read_config
    :param status: unused default; overwritten from data.status below
    :param data: optional pre-built Data instance; created when None
    :return: result of train(data) in train mode, otherwise None
    """
    if data is None:
        data = Data()
        data.HP_gpu = torch.cuda.is_available()
        data.read_config(config)
    else:
        data.HP_gpu = torch.cuda.is_available()
        # NOTE(review): summary appears to print only when the caller supplies
        # data — confirm against the original (un-collapsed) source.
        data.show_data_summary()
    status = data.status.lower()
    print("Seed num:", seed_num)
    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.generate_instance('train')
        data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        return train(data)
    elif status == 'decode':
        print("MODEL: decode")
        # Restore serialized settings, then overlay the caller's config.
        data.load(data.dset_dir)
        data.read_config(config)
        print(data.raw_dir)
        # exit(0)
        data.show_data_summary()
        data.generate_instance('raw')
        print("nbest: %s" % (data.nbest))
        decode_results, pred_scores = load_model_decode(data, 'raw')
        if data.nbest and not data.sentence_classification:
            data.write_nbest_decoded_results(decode_results, pred_scores, 'raw')
        else:
            data.write_decoded_results(decode_results, 'raw')
    else:
        print(
            "Invalid argument! Please use valid arguments! (train/test/decode)"
        )
def __init__(self):
    """Initialise with a Data handle and the training entities it provides."""
    self.data = Data()
    self.entities_by_bioconcept = self.data.learn_training_entries()
import tensorflow as tf
import numpy as np
import pandas as pd
import random
from utils.data import Data
################
from population.population import Population
from population.network import Network

# suppress tf GPU logging
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

data = Data("data/coinbase-1min.csv", interval=30)


#################
# for evaluating model fitness
def calculate_profit(trades, prices):
    # trades: per-step decision vectors (argmax 1 => hold/buy, 0 => sell);
    # prices: matching price series — NOTE(review): shapes assumed, confirm with caller.
    btc_wallet, starting_cash = 0., 100.
    usd_wallet = starting_cash
    fee = 0.0011  # exchange fee applied on both conversions
    holding = False
    for idx, trade in enumerate(trades):
        # Currently holding BTC and the signal says sell: convert back to USD.
        if holding and not np.argmax(trade):
            holding = False
            usd_wallet = btc_wallet * prices[idx] * (1 - fee)
        # Not holding and the signal says buy: convert USD to BTC.
        if not holding and np.argmax(trade):
            holding = True
            btc_wallet = usd_wallet / prices[idx] * (1 - fee)
    # sell if holding
def __init__(self):
    """Create the object with its own Data access instance."""
    self.data = Data()
lexicon_word_ids_sent.append( word_lexicon_pad[0][0:self.max_lexicon_words_num]) word_length_tensor_sent.append( word_lexicon_pad[1][0:self.max_lexicon_words_num]) lexicon_word_ids.append(lexicon_word_ids_sent) word_length_tensor.append(word_length_tensor_sent) lexicon_word_ids = list( map( lambda l: l + [[0] * self.max_lexicon_words_num] * (self.max_char_len - len(l)), lexicon_word_ids)) word_length_tensor = list( map( lambda l: l + [[0] * self.max_lexicon_words_num] * (self.max_char_len - len(l)), word_length_tensor)) return chars_ids, lexicon_word_ids, word_length_tensor, labels if __name__ == "__main__": confs = Configure(sys.argv[1]) # model_save_path = "model/" # dev_ret_path = "tmp/" model = LatticeNet(conf=confs) model.data = Data() model.load_data_and_embedding(model.data) model.create_model() model.train_model(model.data) # model.save_parameters() # model.train_model()
from utils.data import Data

# Input raster images and the matching label shapefiles.
tiff_filename = "./Data/Images/"
shp_filename = "./Data/Labels/"

data = Data(tiff_filename, shp_filename)
tiff = data.read_tiff()
mask = data.get_mask()
# Sample 2M pixels into feature matrix X / label vector y, then split.
X, y = data.get_Xy(tiff, mask, n_sample=2000000)
X_train, X_test, y_train, y_test = data.train_test_split(X, y)
def main(args):
    """Train a progressively-growing GAN across 4x4 .. 1024x1024 stages.

    :param args: CLI namespace with input/output dirs, per-stage epochs and
        batch sizes, optional resume checkpoint and fine-tuning flag
    """
    # Log to file and to stderr simultaneously.
    logger = logging.getLogger()
    hdlr = logging.FileHandler(args.log_path)
    formatter = logging.Formatter('[%(asctime)s] [%(levelname)s] [%(threadName)-10s] %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    databox = Data(args.input_dir)
    dataset_size = databox.size
    logger.info('Dataset size: {}'.format(dataset_size))

    pggan = PGGAN()
    # Stage resolutions: 4, 8, ..., 1024 (nine stages).
    resolutions = [2**(i+2) for i in range(9)]
    z = tf.placeholder(tf.float32, [None, 1, 1, 512])
    reals = [tf.placeholder(tf.float32, [None, r, r, 3]) for r in resolutions]
    alpha = tf.placeholder(tf.float32, [])
    fakes = [pggan.generator(z, alpha, stage=i+1) for i in range(9)]
    d_reals = [pggan.discriminator(x, alpha, stage=i+1, reuse=False) for i, x in enumerate(reals)]
    d_fakes = [pggan.discriminator(x, alpha, stage=i+1, reuse=True) for i, x in enumerate(fakes)]

    # Random real/fake interpolates fed to the discriminator — presumably for
    # a gradient-penalty term inside calc_losses (confirm there).
    xhats = []
    d_xhats = []
    for i, (real, fake) in enumerate(zip(reals, fakes)):
        epsilon = tf.random_uniform(shape=[tf.shape(real)[0], 1, 1, 1], minval=0.0, maxval=1.0)
        inter = real * epsilon + fake * (1 - epsilon)
        d_xhat = pggan.discriminator(inter, alpha, stage=i+1, reuse=True)
        xhats.append(inter)
        d_xhats.append(d_xhat)

    g_losses, d_losses = calc_losses(d_reals, d_fakes, xhats, d_xhats)

    # Split variables between generator and discriminator by name.
    g_var_list = []
    d_var_list = []
    for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
        if ('generator' in v.name):
            g_var_list.append(v)
        elif ('discriminator' in v.name):
            d_var_list.append(v)
    global_step = tf.Variable(0, name='global_step', trainable=False)
    opt = tf.train.AdamOptimizer(learning_rate=1e-3, beta1=0.0, beta2=0.99, epsilon=1e-8)
    g_train_op = [opt.minimize(loss, global_step=global_step, var_list=g_var_list) for loss in g_losses]
    d_train_op = [opt.minimize(loss, global_step=global_step, var_list=d_var_list) for loss in d_losses]

    sess = tf.Session()
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    if args.resume:
        saver = tf.train.Saver()
        saver.restore(sess, args.resume)
        logger.info('Resuming training')
    if args.finetuning:
        sess.run(global_step.assign(0))
        logger.info('Fine-tuning')

    # Number of optimizer steps to spend on each stage.
    stage_steps = [
        int(epoch * dataset_size / batch_size)
        for epoch, batch_size in zip(args.epochs, args.batch_sizes)
    ]
    current_stage = None
    while True:
        # Both D and G updates bump global_step, hence the divide by 2.
        step = int(sess.run(global_step) / 2)
        if step >= sum(stage_steps):
            logger.info('Done!')
            break
        # Map the cumulative step count onto the current stage index.
        for i in range(len(stage_steps)):
            if step < sum(stage_steps[:i+1]):
                stage = i
                break
        image_size = resolutions[i]
        if current_stage != stage:
            # Restart the data pipeline at the new resolution.
            if current_stage is not None:
                databox.terminate()
            databox.start(image_size)
            current_stage = stage
        progress = step + 1 - sum(stage_steps[:i])
        logger.info('step: {}/{} - {}x{} (stage {})'.format(
            progress, stage_steps[i], image_size, image_size, stage+1))
        current_stage_step = stage_steps[stage]
        current_stage_progress = step - sum(stage_steps[:stage])
        delta = 4 / current_stage_step  # 25 %
        # Fade-in coefficient: stage 0 is always fully blended.
        if stage == 0:
            alp = 1.0
        else:
            alp = min(current_stage_progress * delta, 1.0)
        # One discriminator update, then one generator update.
        x_batch = databox.get(args.batch_sizes[stage])
        z_batch = np.random.normal(size=[args.batch_sizes[stage], 1, 1, 512])
        _, d_loss = sess.run([d_train_op[stage], d_losses[stage]],
                             feed_dict={reals[stage]: x_batch, z: z_batch, alpha: alp})
        z_batch = np.random.normal(size=[args.batch_sizes[stage], 1, 1, 512])
        _, g_loss = sess.run([g_train_op[stage], g_losses[stage]],
                             feed_dict={z: z_batch, alpha: alp})
        if progress % 1000 == 0:
            # Periodic checkpoint plus a sample image for visual inspection.
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(args.weights_dir, 'latest'), write_meta_graph=False)
            z_batch = np.random.normal(size=[args.batch_sizes[stage], 1, 1, 512])
            out = fakes[stage].eval(feed_dict={z: z_batch, alpha: 1.0}, session=sess)
            out = np.array((out[0] + 1) * 127.5, dtype=np.uint8)
            out = cv2.cvtColor(out, cv2.COLOR_RGB2BGR)
            outdir = os.path.join(args.output_dir, 'stage{}'.format(stage+1))
            os.makedirs(outdir, exist_ok=True)
            dst = os.path.join(outdir, '{}.png'.format('{0:09d}'.format(progress)))
            cv2.imwrite(dst, out)
        # Snapshot the weights at the exact end of each stage.
        if int(sess.run(global_step) / 2) == sum(stage_steps[:stage+1]):
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(args.weights_dir, 'stage{}'.format(stage+1)),
                       write_meta_graph=False)
def get():
    """Return the number of visible contests as a JSON response."""
    visible_count = Contest.query.order_by(Contest.id).filter_by(visible=True).count()
    response = Data(data={"count": visible_count}, status=200)
    return response.to_response()
import time

print(sys.version)
utils_path = pathlib.Path(os.getcwd() + '/utils')  # probably not needed
print(utils_path.exists())
print(os.getcwd())
#sys.path.append(str(utils_path))  # may not be necessary
#sys.path.append(os.getcwd())
# One level up so the utils package becomes importable.
sys.path.append('../')
print(sys.path)
import numpy as np
import sklearn
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from utils.data import Data
from utils.config import Config
import umap
import numba

d = Data()
# v2 = True: loading the dataset with the con flag
df = d.get18M_features(v2=True)
print('END')
return pred_results, pred_scores, probs, acc_instances, acc_speed if __name__ == '__main__': parser = argparse.ArgumentParser(description='Sequence NCRF++') parser.add_argument('--train-config', help='Train configuration File', dest="train") parser.add_argument('--decode-config', help='Decode configuration File', required=True, dest="decode") args = parser.parse_args() decode = Data() decode.read_config(args.decode) if args.train is not None: #TRAIN train_data = Data() encoding_index = train_data.encoding train_data.read_config(args.train) train_enc = l.Encoding(train_data.encoding, train_data.postag_type) dict_encoded, all_sent, _ = train_enc.encode(train_data.dev_gold) processing.write_to_conllu(dict_encoded, train_data.dev_enc_dep2label, 0) train_data.HP_gpu = torch.cuda.is_available() print("Seed num:", seed_num) train_enc = l.Encoding(train_data.encoding, train_data.postag_type) dict_encoded, all_sent, _ = train_enc.encode(train_data.train_gold)
def post():
    """Echo the JSON request body back in a successful login response."""
    payload = request.get_json()
    response = Data(message="It's OK, you already login", data=payload, status=200)
    return response.to_response()
def get():
    """Return the query-string arguments in a successful login response."""
    response = Data(message="It's OK, you already login", data=request.args, status=200)
    return response.to_response()
def main():
    """Train (or run inference for) the styled image-captioning models:
    one factual caption model plus 'humorous' and 'romantic' language models.
    """
    # data
    data = Data(IMAGE_DIR, PICKLES_DIR, params['keep_words'],
                params=params, img_embed=params["img_embed"])
    data_dict = data.dictionary
    # define placeholders
    capt_inputs = tf.placeholder(tf.int32, [None, None])
    capt_labels = tf.placeholder(tf.int32, [None, None])
    seq_length = tf.placeholder(tf.int32, [None])
    # forward pass is expensive, so can use this method to reduce computation
    if params["img_embed"] == "vgg":
        n_features = 4096
    elif params["img_embed"] == "resnet":
        n_features = 2048
    image_embs = tf.placeholder(tf.float32, [None, n_features])  # vgg16
    # Tile image features so each of the num_captions captions per image
    # lines up with its own copy of the feature vector.
    if params['num_captions'] > 1 and params['mode'] == 'training':
        features_tiled = tf.tile(tf.expand_dims(image_embs, 1),
                                 [1, params['num_captions'], 1])
        features_tiled = tf.reshape(
            features_tiled,
            [tf.shape(image_embs)[0] * params['num_captions'], n_features])  # [5 * b_s, 4096]
    else:
        features_tiled = image_embs
    model = Decoder(capt_inputs, params['lstm_hidden'], params['embed_dim'],
                    seq_length, data_dict, params['lstm_hidden'], image_embs,
                    params=params, reuse_text_emb=True)
    with tf.variable_scope("rnn", reuse=tf.AUTO_REUSE):
        x_cmlogits, _ = model.forward(mode='train_capt', image_embs=features_tiled)
        x_lmrlogits, _ = model.forward(mode='train_lmr', lm_label='romantic')
        x_lmhlogits, _ = model.forward(mode='train_lmh', lm_label='humorous')
    # losses
    labels_flat = tf.reshape(capt_labels, [-1])
    cm_loss = masked_loss(labels_flat, x_cmlogits, mode='train_capt')
    lmh_loss = masked_loss(labels_flat, x_lmhlogits, mode='train_lmh')
    lmr_loss = masked_loss(labels_flat, x_lmrlogits, mode='train_lmr')
    # optimizers (style LMs use a fixed 0.0005 learning rate)
    cm_opt = lstm_optimizer(cm_loss, params, params['learning_rate'], mode='train_capt')
    lmh_opt = lstm_optimizer(lmh_loss, params, 0.0005, mode='train_lmh')
    lmr_opt = lstm_optimizer(lmr_loss, params, 0.0005, mode='train_lmr')
    # train
    saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=params['keep_cp'])
    gpu_options = tf.GPUOptions(visible_device_list=params["gpu"], allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_options)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        if params['write_summary']:
            summary_writer = tf.summary.FileWriter('./logs', sess.graph)
            summary_writer.add_graph(sess.graph)
        # print(tf.trainable_variables())
        # train 3 networks, save S_act, S_hum, S_rom
        if params['restore']:
            print("Restoring from checkpoint")
            saver.restore(sess, "./checkpoints/{}.ckpt".format(params['checkpoint']))
        # choose labels for the training
        tr_labels = ['actual']
        tr_style = params['tr_style']
        if tr_style == 'both':
            tr_labels.extend(['humorous', 'romantic'])
        else:
            tr_labels.append(tr_style)
        if params['mode'] == 'training':
            for e in range(params['epochs']):
                for label in tr_labels:
                    if label == 'actual':
                        # following the paper
                        batch_size = params['batch_size']
                    else:
                        batch_size = params['batch_size_lm']
                    for captions, lengths, image_f in data.get_batch(
                            batch_size, label=label, set='train'):
                        feed = {capt_inputs: captions[0],
                                capt_labels: captions[1],
                                image_embs: image_f,
                                seq_length: lengths}
                        # Pick the loss/optimizer pair matching this label.
                        if label == 'actual':
                            opt_loss, optim = cm_loss, cm_opt
                        elif label == 'humorous':
                            opt_loss, optim = lmh_loss, lmh_opt
                        elif label == 'romantic':
                            opt_loss, optim = lmr_loss, lmr_opt
                        loss_, _ = sess.run([opt_loss, optim], feed)
                    # Validation pass every 4th epoch.
                    if e % 4 == 0:
                        losses = []
                        for captions, lengths, image_f in data.get_batch(
                                params['batch_size'], label=label, set='val'):
                            feed = {capt_inputs: captions[0],
                                    capt_labels: captions[1],
                                    image_embs: image_f,
                                    seq_length: lengths}
                            if label == 'actual':
                                opt_loss, optim = cm_loss, cm_opt
                            elif label == 'humorous':
                                opt_loss, optim = lmh_loss, lmh_opt
                            elif label == 'romantic':
                                opt_loss, optim = lmr_loss, lmr_opt
                            vloss_ = sess.run([opt_loss], feed)
                            losses.append(vloss_)
                        print("Validation Model: {} Epoch: {} Loss: {}".format(
                            label, e, np.mean(losses)))
                    # save model
                    if not os.path.exists("./checkpoints"):
                        os.makedirs("./checkpoints")
                    if e % 10 == 0 and e != 0:
                        # save every 10 epochs
                        save_path = saver.save(
                            sess, "./checkpoints/{}.ckpt".format(params['checkpoint']))
                        print("Model saved in file: %s" % save_path)
                    print("{} Model: Epoch: {} Loss: {}".format(label, e, loss_))
            # save model
            if not os.path.exists("./checkpoints"):
                os.makedirs("./checkpoints")
            save_path = saver.save(
                sess, "./checkpoints/{}.ckpt".format(params['checkpoint']))
            print("Model saved in file: %s" % save_path)
        elif params['mode'] == 'inference':
            inference(params, model, data, saver, sess)
# %% from utils.config import Config from utils.data import Data import pandas as pd # %% c = Config() #%% desc_ie = pd.read_csv(c.descriptors_IE) desc_c33 = pd.read_csv(c.descriptors_C33) #%% df_300K = Data().get300K() #%% desc_ie.shape desc_c33.shape #%% [markdown] # # plan # 1. check the unique monolayer ids in descriptors and compare with the 300K and 18M, how much is missing? # 2. some comparison is needed between the two descriptors files, do we take everything? # 3. now becuase i have the IE and C33, should be able to do some feature important analysis. # 4. sample the 300K file, is there a test for samlping # 5. fit a UMAP, fit different UMAPs # 6. visualise # %% [markdown] # # EDA
def __init__(self, client):
    """Wrap an API client with shared state for rate limiting and data access.

    :param client: the API client used for outgoing requests
    """
    self.client = client
    # NOTE(review): presumably toggled elsewhere when the API throttles us — confirm.
    self.rateLimited = False
    self.data = Data()
def main():
    """Distributed training loop for the knowledge-grounded response generator.

    Resumable: epochs whose checkpoint already exists are loaded and skipped.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format=
        '%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s'
    )
    parser = argparse.ArgumentParser()
    parser.add_argument('-dialog', type=str)
    parser.add_argument('-k', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=1)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    parser.add_argument('--dist', type=int, default=1)
    args = parser.parse_args()
    dialog_path = args.dialog
    knowledge_path = args.k
    pool_path = args.pool
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    distributed = args.dist
    save_path = args.save_path
    language = args.language
    if distributed:
        dist_init()
    local_rank = dist.get_rank() if distributed else 0
    # Knowledge either comes from up to 200 sharded pickles, or the literal
    # string 'redis' is passed through as a backend selector.
    if knowledge_path != 'redis':
        knowledge = []
        for i in range(200):
            if os.path.exists(f'{knowledge_path}/{i}.pkl'):
                knowledge.extend(read_pkl(f'{knowledge_path}/{i}.pkl'))
    else:
        knowledge = knowledge_path
    knowledge_pool = read_pkl(pool_path)
    dataset = Data(read_pkl(f'{dialog_path}/context.pkl'),
                   read_pkl(f'{dialog_path}/response.pkl'),
                   knowledge_pool,
                   pool_size=pool_size,
                   knowledge=knowledge,
                   order=None,
                   max_len=max_len,
                   lang_code=lang_code)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len,
                                lang_code=lang_code)
    tokenizer = get_tokenizer('mbart')
    tokenizer.lang_code_to_id = mbart_lang_to_id
    logging.info('Build generator')
    generator = Generator()
    if torch.cuda.is_available():
        generator = generator.cuda()
    if distributed:
        generator = torch.nn.parallel.DistributedDataParallel(
            generator,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True)
    optimizer = AdamW(generator.parameters(), lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        if distributed:
            # Barriers keep ranks in lock-step; map the checkpoint's cuda:0
            # tensors onto this rank's device.
            dist.barrier()
            map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
            generator.load_state_dict(
                torch.load(pretrained_path, map_location=map_location))
            dist.barrier()
        else:
            # Strip the DDP 'module.' prefix for single-process loading.
            generator.load_state_dict({
                k.replace("module.", ""): v
                for k, v in torch.load(pretrained_path).items()
            })
    for epoch in range(100):
        # Skip (but load) epochs that were already trained and saved.
        if os.path.exists(f'{save_path}/generator/{epoch}.pt'):
            if distributed:
                dist.barrier()
                map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
                generator.load_state_dict(
                    torch.load(f'{save_path}/generator/{epoch}.pt',
                               map_location=map_location))
                dist.barrier()
            else:
                generator.load_state_dict({
                    k.replace("module.", ""): v
                    for k, v in torch.load(save_path + f'_{epoch}.pt').items()
                })
            continue
        if distributed:
            dist.barrier()
        logging.info(f'Training epoch {epoch}')
        train_generator(generator,
                        optimizer,
                        dataset,
                        pad_idx=1,
                        batch_size=batch_size,
                        epoch=epoch,
                        distributed=distributed)
        if distributed:
            dist.barrier()
        # Only rank 0 evaluates and writes predictions/checkpoints.
        if local_rank == 0:
            predict, true = test_generator(generator,
                                           test_dataset,
                                           language,
                                           tokenizer,
                                           pad_idx=1,
                                           batch_size=batch_size,
                                           epoch=epoch,
                                           word_mask=None)
            logging.info(eval_all(predict, true))
            write_file(predict, f'{save_path}/predict/{epoch}.txt')
            torch.save(generator.state_dict(), f'{save_path}/generator/{epoch}.pt')
        if distributed:
            dist.barrier()
def load_model_decode(data):
    """Load saved SeqModel weights and evaluate on the NER test split.

    :param data: Data instance carrying model paths and the test indices
    """
    print("Load Model from file: ", data.model_dir)
    model = SeqModel(data)
    model.load_state_dict(torch.load(data.load_model_dir))
    evaluate(data.ner_2_test_idx, data, model)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='cross ner via cross language model')
    parser.add_argument('--config', help='configuration File')
    args = parser.parse_args()
    cross_data = Data()
    cross_data.HP_gpu = torch.cuda.is_available()
    cross_data.read_config(args.config)
    status = cross_data.status.lower()
    print("Seed num:", seed_num)
    if status == 'train':
        print("MODEL: train")
        transfer_flag = False
        # Initialise data differently for supervised vs. transfer training.
        if cross_data.mode == 'supervised':
            data_init_supervised(cross_data)
        elif cross_data.mode == 'transfer':
            data_init_transfer(cross_data)
# NOTE: Python 2 module (cPickle import, print statements) — do not run under Python 3.
import time
import sys
import argparse
import random
import copy
import torch
import gc
import cPickle as pickle
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from utils.metric import get_ner_fmeasure
from model.seqmodel import SeqModel
from model.seqmodel import SeqModel_circulationBiLSTM
from model.LSTMText import LSTMText
from utils.data import Data
from utils.data import init_parser
import os

# Load and print two pickled Data snapshots for side-by-side comparison.
data1 = Data()
data1.load('data/data.substring.base2.pickle')
data2 = Data()
data2.load('data/data.pickle')
print data1
print data2
print 'fff'
def __init__(self):
    """Set up data access, training entities, and the spaCy English model."""
    self.data = Data()
    self.entities_by_bioconcept = self.data.learn_training_entries()
    self.nlp = spacy.load("en_core_web_sm")
def main(params):
    """Build and train (or run inference for) the image-captioning VAE.

    :param params: hyper-parameter namespace (mode, prior, fine-tune flags,
        batch sizes, paths, ...)
    """
    # load data, class data contains captions, images, image features (if avaliable)
    if params.gen_val_captions < 0:
        repartiton = False
    else:
        repartiton = True
    data = Data(params, True, params.image_net_weights_path,
                repartiton=repartiton, gen_val_cap=params.gen_val_captions)
    # load batch generator, repartiton to use more val set images in train
    gen_batch_size = params.batch_size
    if params.fine_tune:
        gen_batch_size = params.batch_size
    batch_gen = data.load_train_data_generator(gen_batch_size, params.fine_tune)
    # whether use presaved pretrained imagenet features (saved in pickle)
    # feature extractor after fine_tune will be saved in tf checkpoint
    # caption generation after fine_tune must be made with params.fine_tune=True
    pretrained = not params.fine_tune
    val_gen = data.get_valid_data(gen_batch_size,
                                  val_tr_unused=batch_gen.unused_cap_in,
                                  pretrained=pretrained)
    test_gen = data.get_test_data(gen_batch_size, pretrained=pretrained)
    # annotations vector of form <EOS>...<BOS><PAD>...
    ann_inputs_enc = tf.placeholder(tf.int32, [None, None])
    ann_inputs_dec = tf.placeholder(tf.int32, [None, None])
    ann_lengths = tf.placeholder(tf.int32, [None])
    if params.fine_tune:
        # if fine_tune dont just load images_fv
        image_f_inputs = tf.placeholder(tf.float32, [None, 224, 224, 3])
    else:
        # use prepared image features [batch_size, 4096] (fc2)
        image_f_inputs = tf.placeholder(tf.float32, [None, 4096])
    if params.use_c_v or (params.prior == 'GMM' or params.prior == 'AG'):
        c_i = tf.placeholder(tf.float32, [None, 90])
    else:
        c_i = ann_lengths  # dummy tensor
    # because of past changes
    image_batch, cap_enc, cap_dec, cap_len, cl_vectors = image_f_inputs,\
        ann_inputs_enc, ann_inputs_dec, ann_lengths, c_i
    # features, params.fine_tune stands for not using presaved imagenet weights
    # here, used this dummy placeholder during fine_tune, will remove it in
    # future releases, thats for saving image_net weights for futher usage
    image_f_inputs2 = tf.placeholder_with_default(tf.ones([1, 224, 224, 3]),
                                                  shape=[None, 224, 224, 3],
                                                  name='dummy_ps')
    if params.fine_tune:
        image_f_inputs2 = image_batch
    if params.mode == 'training' and params.fine_tune:
        cnn_dropout = params.cnn_dropout
        weights_regularizer = tf.contrib.layers.l2_regularizer(
            params.weight_decay)
    else:
        cnn_dropout = 1.0
        weights_regularizer = None
    with tf.variable_scope("cnn", regularizer=weights_regularizer):
        image_embeddings = vgg16(image_f_inputs2,
                                 trainable_fe=params.fine_tune_fe,
                                 trainable_top=params.fine_tune_top,
                                 dropout_keep=cnn_dropout)
    if params.fine_tune:
        features = image_embeddings.fc2
    else:
        features = image_batch
    # forward pass is expensive, so can use this method to reduce computation
    if params.num_captions > 1 and params.mode == 'training':  # [b_s, 4096]
        features_tiled = tf.tile(tf.expand_dims(features, 1),
                                 [1, params.num_captions, 1])
        features = tf.reshape(features_tiled, [
            tf.shape(features)[0] * params.num_captions,
            params.cnn_feature_size
        ])  # [5 * b_s, 4096]
    # dictionary
    cap_dict = data.dictionary
    params.vocab_size = cap_dict.vocab_size
    # image features [b_size + f_size(4096)] -> [b_size + embed_size]
    images_fv = layers.dense(features, params.embed_size, name='imf_emb')
    # images_fv = tf.Print(images_fv, [tf.shape(features), features[0][0:10],
    # image_embeddings.imgs[0][:10], images_fv])
    # encoder, input fv and ...<BOS>,get z
    if not params.no_encoder:
        encoder = Encoder(images_fv, cap_enc, cap_len, params)
    # decoder, input_fv, get x, x_logits (for generation)
    decoder = Decoder(images_fv, cap_dec, cap_len, params, cap_dict)
    if params.use_c_v or (params.prior == 'GMM' or params.prior == 'AG'):
        # cluster vectors from "Diverse and Accurate Image Description.." paper.
        # 80 is number of classes, for now hardcoded
        # for GMM-CVAE must be specified
        c_i_emb = layers.dense(cl_vectors, params.embed_size, name='cv_emb')
        # map cluster vectors into embedding space
        decoder.c_i = c_i_emb
        decoder.c_i_ph = cl_vectors
        if not params.no_encoder:
            encoder.c_i = c_i_emb
            encoder.c_i_ph = cl_vectors
    if not params.no_encoder:
        with tf.variable_scope("encoder"):
            qz, tm_list, tv_list = encoder.q_net()
        if params.prior == 'Normal':
            # kld between normal distributions KL(q, p), see Kingma et.al
            kld = -0.5 * tf.reduce_mean(
                tf.reduce_sum(
                    1 + tf.log(tf.square(qz.distribution.std) + 0.00001) -
                    tf.square(qz.distribution.mean) -
                    tf.square(qz.distribution.std), 1))
        elif params.prior == 'GMM':
            # initialize sigma as constant, mu drawn randomly
            # TODO: finish GMM loss implementation
            c_means, c_sigma = init_clusters(params.num_clusters,
                                             params.latent_size)
            decoder.cap_clusters = c_means
            kld = -0.5 * tf.reduce_mean(
                tf.reduce_sum(
                    1 + tf.log(tf.square(qz.distribution.std) + 0.00001) -
                    tf.square(qz.distribution.mean) -
                    tf.square(qz.distribution.std), 1))
        elif params.prior == 'AG':
            c_means, c_sigma = init_clusters(params.num_clusters,
                                             params.latent_size)
            decoder.cap_clusters = c_means
            kld_clusters = 0.5 + tf.log(qz.distribution.std + 0.00001)\
                - tf.log(c_sigma + 0.00001) - (
                    tf.square(qz.distribution.mean - tf.matmul(
                        tf.squeeze(c_i), c_means)) + tf.square(
                            qz.distribution.std))/(2*tf.square(c_sigma)+0.0000001)
            kld = -0.5 * tf.reduce_sum(kld_clusters, 1)
    with tf.variable_scope("decoder"):
        if params.no_encoder:
            dec_model, x_logits, shpe, _ = decoder.px_z_fi({})
        else:
            dec_model, x_logits, shpe, _ = decoder.px_z_fi({'z': qz})
    # calculate rec. loss, mask padded part
    labels_flat = tf.reshape(cap_enc, [-1])
    ce_loss_padded = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=x_logits, labels=labels_flat)
    loss_mask = tf.sign(tf.to_float(labels_flat))
    batch_loss = tf.div(tf.reduce_sum(tf.multiply(ce_loss_padded, loss_mask)),
                        tf.reduce_sum(loss_mask),
                        name="batch_loss")
    tf.losses.add_loss(batch_loss)
    rec_loss = tf.losses.get_total_loss()
    # kld weight annealing
    anneal = tf.placeholder_with_default(0, [])
    if params.fine_tune or params.restore:
        annealing = tf.constant(1.0)
    else:
        if params.ann_param > 1:
            annealing = (tf.tanh(
                (tf.to_float(anneal) - 1000 * params.ann_param) / 1000) + 1) / 2
        else:
            annealing = tf.constant(1.0)
    # overall loss reconstruction loss - kl_regularization
    if not params.no_encoder:
        lower_bound = rec_loss + tf.multiply(tf.to_float(annealing),
                                             tf.to_float(kld)) / 10
    else:
        lower_bound = rec_loss
        kld = tf.constant(0.0)
    # optimization, can print global norm for debugging
    optimize, global_step, global_norm = optimizers.non_cnn_optimizer(
        lower_bound, params)
    optimize_cnn = tf.constant(0.0)
    if params.fine_tune and params.mode == 'training':
        # cnn parameters update
        optimize_cnn, _ = optimizers.cnn_optimizer(lower_bound, params)
    # model restore
    vars_to_save = tf.trainable_variables()
    if not params.fine_tune_fe or not params.fine_tune_top:
        cnn_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'cnn')
        vars_to_save += cnn_vars
    saver = tf.train.Saver(vars_to_save,
                           max_to_keep=params.max_checkpoints_to_keep)
    # m_builder = tf.saved_model.builder.SavedModelBuilder('./saved_model')
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer()
        ])
        # train using batch generator, every iteration get
        # f(I), [batch_size, max_seq_len], seq_lengths
        if params.mode == "training":
            if params.logging:
                summary_writer = tf.summary.FileWriter(params.LOG_DIR,
                                                       sess.graph)
                summary_writer.add_graph(sess.graph)
            if not params.restore:
                print("Loading imagenet weights for futher usage")
                image_embeddings.load_weights(params.image_net_weights_path,
                                              sess)
            if params.restore:
                print("Restoring from checkpoint")
                saver.restore(
                    sess, "./checkpoints/{}.ckpt".format(params.checkpoint))
            for e in range(params.num_epochs):
                gs = tf.train.global_step(sess, global_step)
                gs_epoch = 0
                while True:
                    def stop_condition():
                        # Epoch ends once enough examples were consumed.
                        num_examples = gs_epoch * params.batch_size
                        if num_examples > params.num_ex_per_epoch:
                            return True
                        return False
                    for f_images_batch,\
                            captions_batch, cl_batch, c_v in batch_gen.next_batch(
                                use_obj_vectors=params.use_c_v,
                                num_captions=params.num_captions):
                        if params.num_captions > 1:
                            captions_batch, cl_batch, c_v = preprocess_captions(
                                captions_batch, cl_batch, c_v)
                        feed = {
                            image_f_inputs: f_images_batch,
                            ann_inputs_enc: captions_batch[1],
                            ann_inputs_dec: captions_batch[0],
                            ann_lengths: cl_batch,
                            anneal: gs
                        }
                        if params.use_c_v or (params.prior == 'GMM'
                                              or params.prior == 'AG'):
                            # 0th element is the <BOS> marker — skip it.
                            feed.update({c_i: c_v[:, 1:]})
                        gs = tf.train.global_step(sess, global_step)
                        feed.update({anneal: gs})
                        # if gs_epoch == 0:
                        #     print(sess.run(debug_print, feed))
                        kl, rl, lb, _, _, ann = sess.run([
                            kld, rec_loss, lower_bound, optimize, optimize_cnn,
                            annealing
                        ], feed)
                        gs_epoch += 1
                        if gs % 500 == 0:
                            print("Epoch: {} Iteration: {} VLB: {} "
                                  "Rec Loss: {}".format(e, gs, np.mean(lb), rl))
                            if not params.no_encoder:
                                print("Annealing coefficient:"
                                      "{} KLD: {}".format(ann, np.mean(kl)))
                        if stop_condition():
                            break
                    if stop_condition():
                        break
                print("Epoch: {} Iteration: {} VLB: {} Rec Loss: {}".format(
                    e, gs, np.mean(lb), rl,
                ))

                def validate():
                    # Reconstruction loss over the validation generator.
                    val_rec = []
                    for f_images_batch, captions_batch, cl_batch, c_v in val_gen.next_batch(
                            use_obj_vectors=params.use_c_v,
                            num_captions=params.num_captions):
                        gs = tf.train.global_step(sess, global_step)
                        if params.num_captions > 1:
                            captions_batch, cl_batch, c_v = preprocess_captions(
                                captions_batch, cl_batch, c_v)
                        feed = {
                            image_f_inputs: f_images_batch,
                            ann_inputs_enc: captions_batch[1],
                            ann_inputs_dec: captions_batch[0],
                            ann_lengths: cl_batch,
                            anneal: gs
                        }
                        if params.use_c_v or (params.prior == 'GMM'
                                              or params.prior == 'AG'):
                            feed.update({c_i: c_v[:, 1:]})
                        rl = sess.run([rec_loss], feed_dict=feed)
                        val_rec.append(rl)
                    print("Validation reconstruction loss: {}".format(
                        np.mean(val_rec)))
                    print("-----------------------------------------------")
                validate()
                # save model
                if not os.path.exists("./checkpoints"):
                    os.makedirs("./checkpoints")
                save_path = saver.save(
                    sess, "./checkpoints/{}.ckpt".format(params.checkpoint))
                print("Model saved in file: %s" % save_path)
            # builder.add_meta_graph_and_variables(sess, ["main_model"])
        if params.use_hdf5 and params.fine_tune:
            batch_gen.h5f.close()
        # run inference
        if params.mode == "inference":
            inference.inference(params, decoder, val_gen, test_gen,
                                image_f_inputs, saver, sess)
def main():
    """CLI entry point for NCRF++: parse flags, seed all RNG sources,
    configure a ``Data`` object (from flags or a config file), then
    dispatch to training or decoding according to ``data.status``.
    """
    opts = _parse_args()

    # Seed every random source up front for reproducibility.
    seed_num = opts.random_seed
    random.seed(seed_num)
    torch.manual_seed(seed_num)
    np.random.seed(seed_num)

    data = Data()
    data.random_seed = seed_num
    data.HP_gpu = torch.cuda.is_available()

    if opts.config == 'None':
        # No config file given: every setting comes from the command line.
        data.train_dir = opts.train
        data.dev_dir = opts.dev
        data.test_dir = opts.test
        data.model_dir = opts.savemodel
        data.dset_dir = opts.savedset
        print("Save dset directory:", data.dset_dir)
        save_model_dir = opts.savemodel  # NOTE(review): unused local, kept for fidelity
        data.word_emb_dir = opts.wordemb
        data.char_emb_dir = opts.charemb
        data.seg = (opts.seg.lower() == 'true')
        print("Seed num:", seed_num)
    else:
        data.read_config(opts.config)

    # Command-line overrides layered on top of whichever source configured data.
    if opts.lr:
        data.HP_lr = opts.lr
    if opts.batch_size:
        data.HP_batch_size = opts.batch_size
    data.output_tsv_path = opts.output_tsv
    if opts.cpu:
        data.HP_gpu = False
    if opts.model_prefix:
        data.model_dir = opts.model_prefix

    status = data.status.lower()
    print("Seed num:", seed_num)
    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        for split in ('train', 'dev', 'test'):
            data.generate_instance(split)
        data.build_pretrain_emb()
        train(data)
    elif status == 'decode':
        print("MODEL: decode")
        data.load(data.dset_dir)
        data.read_config(opts.config)
        print(data.raw_dir)
        data.show_data_summary()
        data.generate_instance('raw')
        print("nbest: %s" % (data.nbest))
        decode_results, pred_scores = load_model_decode(data, 'raw')
        if data.nbest and not data.sentence_classification:
            data.write_nbest_decoded_results(decode_results, pred_scores,
                                             'raw')
        else:
            data.write_decoded_results(decode_results, 'raw')
    else:
        print(
            "Invalid argument! Please use valid arguments! (train/test/decode)"
        )


def _parse_args():
    """Build the NCRF++ argument parser and return the parsed namespace."""
    parser = argparse.ArgumentParser(description='Tuning with NCRF++')
    parser.add_argument('--config', help='Configuration File', default='None')
    parser.add_argument('--wordemb', help='Embedding for words',
                        default='None')
    parser.add_argument('--charemb', help='Embedding for chars',
                        default='None')
    parser.add_argument('--status', choices=['train', 'decode'],
                        help='update algorithm', default='train')
    parser.add_argument('--savemodel',
                        default="data/model/saved_model.lstmcrf.")
    parser.add_argument('--savedset', help='Dir of saved data setting')
    parser.add_argument('--train', default="data/conll03/train.bmes")
    parser.add_argument('--dev', default="data/conll03/dev.bmes")
    parser.add_argument('--test', default="data/conll03/test.bmes")
    parser.add_argument('--seg', default="True")
    parser.add_argument('--random-seed', type=int, default=42)
    parser.add_argument('--lr', type=float)
    parser.add_argument('--batch-size', type=int)
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    parser.add_argument('--output-tsv')
    parser.add_argument('--model-prefix')
    parser.add_argument('--cpu', action='store_true')
    return parser.parse_args()
def __init__(self):
    """Build the full TF1 static training graph for YOLOv3.

    Constructs (in order): config-driven hyperparameters, input placeholders,
    a warmup-then-cosine learning-rate schedule, the YOLOV3 network and its
    loss, an Adam optimizer wrapped with an exponential moving average of the
    trainable variables, savers for restore/checkpointing, TensorBoard
    summaries, and finally a session that restores the initial weights and is
    handed to the base class.
    """
    # Hyperparameters and paths, all sourced from the global cfg module.
    self.__learn_rate_init = cfg.LEARN_RATE_INIT
    self.__learn_rate_end = cfg.LEARN_RATE_END
    self.__max_periods = cfg.MAX_PERIODS
    self.__warmup_periods = cfg.WARMUP_PERIODS
    self.__weights_dir = cfg.WEIGHTS_DIR
    self.__weights_init = cfg.WEIGHTS_INIT
    # Timestamp makes each run's log directory unique.
    self.__time = time.strftime('%Y-%m-%d-%H-%M-%S',
                                time.localtime(time.time()))
    self.__log_dir = os.path.join(cfg.LOG_DIR, 'train', self.__time)
    self.__moving_ave_decay = cfg.MOVING_AVE_DECAY
    self.__train_data = Data('train')
    # One "period" (epoch) = one full pass over the training data.
    self.__steps_per_period = len(self.__train_data)

    with tf.name_scope('input'):
        # Placeholders are created without static shapes, so batch size and
        # spatial dimensions are fixed only at feed time.
        self.__input_data = tf.placeholder(dtype=tf.float32,
                                           name='input_data')
        self.__label_sbbox = tf.placeholder(dtype=tf.float32,
                                            name='label_sbbox')
        self.__label_mbbox = tf.placeholder(dtype=tf.float32,
                                            name='label_mbbox')
        self.__label_lbbox = tf.placeholder(dtype=tf.float32,
                                            name='label_lbbox')
        self.__sbboxes = tf.placeholder(dtype=tf.float32, name='sbboxes')
        self.__mbboxes = tf.placeholder(dtype=tf.float32, name='mbboxes')
        self.__lbboxes = tf.placeholder(dtype=tf.float32, name='lbboxes')
        # Toggles train/inference behavior inside the network (e.g. BN).
        self.__training = tf.placeholder(dtype=tf.bool, name='training')

    with tf.name_scope('learning_rate'):
        # Graph-managed step counter; starts at 1.0 and is bumped by
        # global_step_update on every training step (see optimizer scope).
        self.__global_step = tf.Variable(1.0, dtype=tf.float64,
                                         trainable=False, name='global_step')
        warmup_steps = tf.constant(self.__warmup_periods *
                                   self.__steps_per_period,
                                   dtype=tf.float64, name='warmup_steps')
        train_steps = tf.constant(self.__max_periods *
                                  self.__steps_per_period,
                                  dtype=tf.float64, name='train_steps')
        # Schedule: linear warmup from 0 to learn_rate_init over
        # warmup_steps, then cosine decay down to learn_rate_end over the
        # remaining (train_steps - warmup_steps) steps.
        self.__learn_rate = tf.cond(
            pred=self.__global_step < warmup_steps,
            true_fn=lambda: self.__global_step / warmup_steps * self.
            __learn_rate_init,
            false_fn=lambda: self.__learn_rate_end + 0.5 *
            (self.__learn_rate_init - self.__learn_rate_end) * (1 + tf.cos(
                (self.__global_step - warmup_steps) /
                (train_steps - warmup_steps) * np.pi)))
        global_step_update = tf.assign_add(self.__global_step, 1.0)

    yolo = YOLOV3(self.__training)
    conv_sbbox, conv_mbbox, conv_lbbox, \
    pred_sbbox, pred_mbbox, pred_lbbox = yolo.build_nework(self.__input_data)

    # Variables under the 'yolov3' scope are eligible for restoring;
    # __get_restore_dict presumably filters/renames them — defined elsewhere.
    load_var = tf.global_variables('yolov3')
    restore_dict = self.__get_restore_dict(load_var)

    self.__loss = yolo.loss(conv_sbbox, conv_mbbox, conv_lbbox, pred_sbbox,
                            pred_mbbox, pred_lbbox, self.__label_sbbox,
                            self.__label_mbbox, self.__label_lbbox,
                            self.__sbboxes, self.__mbboxes, self.__lbboxes)

    with tf.name_scope('optimizer'):
        # EMA shadow copies of all trainable variables, refreshed each step.
        moving_ave = tf.train.ExponentialMovingAverage(
            self.__moving_ave_decay).apply(tf.trainable_variables())
        optimizer = tf.train.AdamOptimizer(self.__learn_rate).minimize(
            self.__loss, var_list=tf.trainable_variables())
        # Control dependencies enforce: optimizer step + global-step bump,
        # then EMA update, then the (otherwise empty) train op completes.
        with tf.control_dependencies([optimizer, global_step_update]):
            with tf.control_dependencies([moving_ave]):
                self.__train_op = tf.no_op()

    with tf.name_scope('load_save'):
        # __load restores only the filtered subset; __save checkpoints
        # everything, keeping at most max_periods checkpoints.
        self.__load = tf.train.Saver(restore_dict)
        self.__save = tf.train.Saver(tf.global_variables(),
                                     max_to_keep=self.__max_periods)

    with tf.name_scope('summary'):
        # Loss is averaged outside the graph and written via this variable.
        self.__loss_ave = tf.Variable(0, dtype=tf.float32, trainable=False)
        tf.summary.scalar('loss_ave', self.__loss_ave)
        tf.summary.scalar('learn_rate', self.__learn_rate)
        self.__summary_op = tf.summary.merge_all()
        self.__summary_writer = tf.summary.FileWriter(self.__log_dir)
        self.__summary_writer.add_graph(tf.get_default_graph())

    self.__sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True))
    self.__sess.run(tf.global_variables_initializer())
    logging.info('Restoring weights from:\t %s' % self.__weights_init)
    self.__load.restore(self.__sess, self.__weights_init)

    # Hand the live session and the prediction tensors to the base class
    # (which presumably implements evaluation/inference — defined elsewhere).
    super(Yolo_train, self).__init__(self.__sess, self.__input_data,
                                     self.__training, pred_sbbox, pred_mbbox,
                                     pred_lbbox)
print( "%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (name, time_cost, speed, acc, p, r, f)) else: print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f" % (name, time_cost, speed, acc)) return pred_results, pred_scores if __name__ == '__main__': parser = argparse.ArgumentParser(description='Tuning with NCRF++') # parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train') parser.add_argument('--config', help='Configuration File') args = parser.parse_args() data = Data() data.HP_gpu = torch.cuda.is_available() data.read_config(args.config) data.show_data_summary() status = data.status.lower() print("Seed num:", seed_num) if status == 'train': print("MODEL: train") data_initialization(data) data.generate_instance('train') data.generate_instance('dev') data.generate_instance('test') data.build_pretrain_emb() train(data) elif status == 'decode':
def __init__(self, data=None, model_dir=None):
    """Set up the environment around a trained base sequence-labeling model.

    :param data: an already-loaded Data object; used directly when given.
    :param model_dir: directory holding ``data.dset``; used to load a fresh
        Data object when ``data`` is None. If both are None the process exits.
    """
    print("Creat environment...")
    # Obtain the trained base model's Data object: prefer the in-memory one,
    # otherwise load it from disk; with neither we cannot proceed.
    if data is None and model_dir is None:
        print("Error: Unable to create base model")
        exit(1)
    if data is not None:
        self.data = data
    else:
        self.data = Data()
        self.data.load(model_dir + "/data.dset")

    self.model = load_model(self.data)
    # Evaluation mode for the base model (disables dropout etc.).
    self.model.eval()
    self.label_alphabet = self.data.label_alphabet
    # Used to find other occurrences of a word within the same document.
    self.word_mat = self.data.word_mat
    self.max_read_memory = self.data.max_read_memory
    self.threshold = self.data.threshold

    # Datasets: train+dev form the training pool; test is kept separate.
    self.train_Ids = self.data.train_Ids + self.data.dev_Ids
    self.test_Ids = self.data.test_Ids
    self.train_doc_total_num = len(self.train_Ids)
    self.test_doc_total_num = len(self.test_Ids)
    print("训练集文章数为 %s" % self.train_doc_total_num)
    print("测试集文章数为 %s" % self.test_doc_total_num)

    # One slot per document (train first, then test) for the base model's
    # cached outputs; each attribute gets its own fresh list.
    total_docs = self.train_doc_total_num + self.test_doc_total_num
    for result_attr in ('pred_label_total_result', 'gold_label_total_result',
                        'uncertainty_total_result', 'lstm_out_total_result',
                        'outs_total_result'):
        setattr(self, result_attr, [0] * total_docs)

    t0 = time.time()
    self.get_all_result()
    t1 = time.time()
    print("使用基模型获得所有预测结果使用时: %.2fs" % (t1 - t0))

    # Cursor over documents: index, document number, and word count of the
    # document currently being processed (-1 = nothing selected yet).
    self.cur_doc_idx = -1
    self.cur_doc_num = -1
    self.cur_doc_word_total_num = -1
    # Base-model predictions for the current document (filled per document).
    self.pred_label_result = None
    self.gold_label_result = None
    self.lstm_out_result = None
    self.outs_result = None
    self.uncertainty_result = None
    # Cursor over words, plus reference occurrences of the current word
    # elsewhere in the same document.
    self.cur_word_idx = -1
    self.cur_word_reference = None
    self.cur_word_reference_idx = -1
    self.cur_word_reference_num = -1
    # Accumulators used during testing.
    self.gold_results = []
    self.pred_results = []
    # Terminal-state vector bookkeeping.
    self.state_dim = None
    self.end_state = None
    self.time_step = -1