def load_model(models_path, glove_path, toy=False):
    """Build a SuperModel and restore the trained weights of every sub-module.

    Args:
        models_path: directory containing the per-module ``*_models.dump``
            state-dict files produced by training.
        glove_path: directory containing the GloVe embedding text files.
        toy: when True, load only a small slice of the embeddings (forwarded
            to ``load_word_emb`` as ``use_small``) for quick experiments.

    Returns:
        The fully initialized SuperModel with all sub-module weights loaded.
    """
    ### CONFIGURABLE
    GPU = True       # GPU activated
    B_word = 42      # GloVE corpus size (billions of tokens)
    N_word = 300     # word embedding dimension
    N_h = 300        # hidden layer size  (NOTE(review): unused here — kept for reference)
    N_depth = 2      # num LSTM layers    (NOTE(review): unused here — kept for reference)

    print("Loading GloVE word embeddings...")
    word_emb = load_word_emb('{}/glove.{}B.{}d.txt'.format(
        glove_path, B_word, N_word),
                             load_used=False,
                             use_small=toy)

    model = SuperModel(word_emb,
                       N_word=N_word,
                       gpu=GPU,
                       trainable_emb=False,
                       table_type='std',
                       use_hs=True)

    print("Loading trained models...")
    # (sub-module, dump-file basename) pairs; replaces eight copy-pasted
    # load_state_dict calls with one data-driven loop.  'root_teminal' is
    # spelled exactly as the attribute is declared on SuperModel.
    submodules = [
        (model.multi_sql, 'multi_sql_models.dump'),
        (model.key_word, 'keyword_models.dump'),
        (model.col, 'col_models.dump'),
        (model.op, 'op_models.dump'),
        (model.agg, 'agg_models.dump'),
        (model.root_teminal, 'root_tem_models.dump'),
        (model.des_asc, 'des_asc_models.dump'),
        (model.having, 'having_models.dump'),
    ]
    for module, dump_name in submodules:
        module.load_state_dict(
            torch.load("{}/{}".format(models_path, dump_name)))
    return model
# NOTE(review): this chunk is the tail of a data-preparation script; `task`,
# `task_lst`, `vocab`, `logger`, `start_time`, `stream_handler` and
# `file_handler` are all defined above this excerpt.  The first four
# set_input/set_target calls reference `task` before the `for task in
# task_lst` loop below — presumably they close out the final iteration of an
# earlier per-task loop; verify against the full file.
task.dev_set.set_input('task_id', 'words_idx', flag=True)
task.dev_set.set_target('label', flag=True)
task.test_set.set_input('task_id', 'words_idx', flag=True)
task.test_set.set_target('label', flag=True)

# Dump the vocabulary, one word per line, in index order.
logger.info('Finished. Dumping vocabulary to data/vocab.txt')
with open('data/vocab.txt', mode='w', encoding='utf-8') as f:
    for i in range(len(vocab)):
        f.write(vocab.to_word(i) + '\n')

# Log the first sample of each split of each task as a quick sanity check.
logger.info('Testing data...')
for task in task_lst:
    logger.info(str(task.task_id) + ' ' + task.task_name)
    logger.info(task.train_set[0])
    logger.info(task.dev_set[0])
    logger.info(task.test_set[0])

# Pickle the prepared task list to disk for the training scripts.
logger.info('Dumping data...')
data = {'task_lst': task_lst}
save_file = open('data/data.pkl', 'wb')
pickle.dump(data, save_file)
save_file.close()

# Warm the word-embedding lookup/cache; the return value is discarded here.
logger.info('Finished. Looking up for word embeddings...')
embed_path = '/remote-home/txsun/data/word-embedding/glove/glove.840B.300d.txt'
_ = load_word_emb(embed_path, 300, vocab)

# Report total elapsed time and detach the logging handlers installed above.
logger.info('Finished. Elapse: {}s.'.format(time.time() - start_time))
logger.removeHandler(stream_handler)
logger.removeHandler(file_handler)
plt.switch_backend('TkAgg')
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
import pickle
import time
import math

# Everything runs on CPU in this script.
device = torch.device("cpu")

# Embedding / model hyper-parameters.
N_word = 100      # word-embedding dimension
B_word = 6        # GloVe corpus size (billions of tokens)
hidden_size = 256
max_length = 1000
SOS_token = 0
CLASS_size = 6    # placeholder; recomputed from the loaded data below

word_emb = load_word_emb('../glove/glove.%dB.%dd.txt'%(B_word,N_word))
full_table, classes_, weight_tensor = load_data(device)

# Hold out 20% of the rows for evaluation.
train_df, test_df = train_test_split(full_table, test_size=0.2)

# Map every class label to a contiguous integer index.
CLASS_size = len(classes_)
class_index = range(CLASS_size)
class_dict = {label: idx for idx, label in enumerate(classes_)}


def asMinutes(s):
    """Format a duration of `s` seconds as 'Xm Ys'."""
    minutes, seconds = divmod(s, 60)
    return '%dm %ds' % (minutes, seconds)
from inference import infer_script
from train_feedback import train_feedback
from utils import get_table_names, get_tables_html, load_word_emb

# @app.route('/')
# @app.route('/index')
# def index():
#     return "Hello, World!"

# GloVe embedding configuration: 42B-token corpus, 300-dimensional vectors.
N_word = 300
B_word = 42
LOAD_USED_W2I = False   # do not reuse a previously saved word-to-index map
USE_SMALL = True        # load only a small slice of the embeddings (dev mode)

# Build the word-embedding dictionary once at import time so every request
# handler can reuse it instead of reloading the GloVe file per request.
print("Creating word embedding dictionary...")
word_emb = load_word_emb('glove/glove.%dB.%dd.txt'%(B_word,N_word), \
        load_used=LOAD_USED_W2I, use_small=USE_SMALL)


# Landing page: serve the input form on '/', '/index' and '/input'.
@app.route('/')
@app.route('/index')
@app.route('/input')
def cesareans_input():
    return render_template("input.html")


# NOTE(review): this definition is cut off in this excerpt — its body
# continues past the end of the visible source.
@app.route('/output')
# @app.route('/')
# @app.route('/index')
def cesareans_output():
if __name__ == '__main__': parser = argparse.ArgumentParser( description="Parse word embeddings in text format.") # evaluation parser.add_argument('--embedding_text_file', help="Specify the path without file-suffix!", required=True, type=str) args = parser.parse_args() print( "Convert embedding file '{0}.txt' to the binary file '{0}.npy' and the vocab-file '{0}.vocab'" .format(args.embedding_text_file)) convert_to_binary(args.embedding_text_file) print("Converting done! Try to reload.") t = TicToc() t.tic() word_embedding_map = load_word_emb_binary(args.embedding_text_file) loading_time = t.tocvalue() print("Load {} words and embeddings in {} seconds".format( len(word_embedding_map), loading_time)) t.tic() word_embedding_map_2 = load_word_emb(args.embedding_text_file + '.txt') t.toc(msg="Loading it as text file takes")
def main():
    """Entry point for single-task text classification: parse CLI args,
    load the dataset, build the Transformer and train it.

    Side effects: sets CUDA_VISIBLE_DEVICES, mutates the module-level
    ``logger``, ``model_config``, ``config_str`` and ``vocab`` globals, and
    writes a log file under logs/<dataset>/.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n_epoch', type=int, default=30)
    parser.add_argument('-batch_size', type=int, default=50)
    parser.add_argument('-gpu', type=str, default='0')
    parser.add_argument('-accumulation_steps', type=int, default=1)
    parser.add_argument('-freeze', type=int, default=0)
    parser.add_argument('-same_lr', type=int, default=0)
    parser.add_argument('-dataset', type=str, default='sports')
    parser.add_argument('-model_config', type=str, default='tf-6-4-512.config')
    parser.add_argument('-add_com', type=str, default='stl')
    parser.add_argument(
        '-log_dir',
        type=str,
        default='/remote-home/txsun/fnlp/watchboard/product/stl')
    parser.add_argument('-save_path', type=str, default='saved_models/')
    parser.add_argument(
        '-embed_path',
        type=str,
        default=
        '/remote-home/txsun/data/word-embedding/glove/glove.840B.300d.txt')
    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    # Effective per-step batch size when gradients are accumulated.
    bsz = args.batch_size // args.accumulation_steps

    global logger
    global model_config
    global config_str

    # Read the 'key: value' configuration file.
    model_config = {}
    print('Reading configure file {}...'.format(args.model_config))
    with open(args.model_config, 'r') as f:
        for line in f:
            if ':' not in line:
                continue  # tolerate blank / malformed lines (was: IndexError)
            # FIX: split only on the first ':' so values containing a colon
            # (e.g. paths) are not truncated; also split once instead of twice.
            key, value = line.split(':', 1)
            key = key.strip()
            value = value.strip()
            model_config[key] = value
            print('{}: {}'.format(key, value))

    # Config fingerprint used to name the log file.
    config_str = ''.join(
        '{}-{}-'.format(key, value) for key, value in model_config.items())
    config_str += args.add_com

    # Stream Handler: INFO and above to stdout.
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(
        logging.Formatter('[%(levelname)s] %(message)s'))
    logger.addHandler(stream_handler)

    # File Handler: full DEBUG log under logs/<dataset>/<config_str>.
    log_dir = os.path.join('logs', args.dataset)
    # FIX: makedirs + exist_ok removes the exists/mkdir race and also creates
    # the missing parent 'logs' directory (os.mkdir failed in both cases).
    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, config_str)
    file_handler = logging.FileHandler(log_path, mode='w')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(
        logging.Formatter(
            fmt='%(asctime)s - [%(levelname)s] - %(name)s - %(message)s',
            datefmt='%Y/%m/%d %H:%M:%S'))
    logger.addHandler(file_handler)

    logger.info('========== Loading Datasets ==========')
    dataset_file = os.path.join('data', args.dataset, 'data.pkl')
    logger.info('Loading dataset {}...'.format(dataset_file))
    data = torch.load(dataset_file)

    global vocab
    vocab = data['vocab']
    args.vocab_size = len(vocab)
    lb_vocab = data['class_dict']
    args.n_class = len(lb_vocab)
    logger.info('# classes: {}'.format(args.n_class))

    train_data = data['train']
    dev_data = data['dev']
    test_data = data['test']

    train_set = ClsDataset(train_data)
    train_iter = DataLoader(train_set,
                            batch_size=bsz,
                            drop_last=True,
                            shuffle=True,
                            num_workers=2,
                            collate_fn=custom_collate)
    logger.info('Train set loaded.')
    dev_set = ClsDataset(dev_data)
    dev_iter = DataLoader(dev_set,
                          batch_size=args.batch_size,
                          num_workers=2,
                          collate_fn=custom_collate)
    logger.info('Development set loaded.')
    test_set = ClsDataset(test_data)
    test_iter = DataLoader(test_set,
                           batch_size=args.batch_size,
                           num_workers=2,
                           collate_fn=custom_collate)
    logger.info('Test set loaded.')
    logger.info('Datasets finished.')

    logger.info('====== Loading Word Embedding =======')
    we_path = os.path.join('data', args.dataset, 'word_embedding.npy')
    word_embedding = load_word_emb(args.embed_path,
                                   300,
                                   vocab,
                                   save_path=we_path)

    logger.info('========== Preparing Model ==========')
    model = Transformer(args, model_config, word_embedding)
    logger.info('Model parameters:')
    sum_param = 0
    for name, param in model.named_parameters():
        # FIX: idiomatic truthiness test instead of '== True'.
        if param.requires_grad:
            logger.info('{}: {}'.format(name, param.shape))
            sum_param += param.numel()
    logger.info('# Parameters: {}.'.format(sum_param))

    logger.info('========== Training Model ==========')
    lr = float(model_config['lr'])
    if args.same_lr or args.freeze:
        # One learning rate for every trainable parameter.
        opt = optim.Adam(filter(lambda p: p.requires_grad,
                                model.parameters()),
                         lr=lr)
    else:
        # Word embeddings train with a 10x smaller learning rate than the rest.
        word_embed_params = list(
            map(id, model.embed.word_embeddings.parameters()))
        base_params = filter(lambda p: id(p) not in word_embed_params,
                             model.parameters())
        opt = optim.Adam([{
            'params': base_params
        }, {
            'params': model.embed.word_embeddings.parameters(),
            'lr': lr * 0.1
        }], lr=lr)
    train(model, train_iter, dev_iter, test_iter, opt, args)
    return
def main():
    """Entry point for multi-task sequence labeling (POS / NER / Chunking):
    parse CLI args, load data, build a Transformer or BiLSTM, and train.

    Side effects: sets CUDA_VISIBLE_DEVICES, mutates the module-level
    ``logger``, ``model_config``, ``config_str`` and ``lb_vocabs`` globals,
    and writes a log file under logs/.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n_epoch', type=int, default=15)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-gpu', type=str, default='0')
    parser.add_argument('-accumulation_steps', type=int, default=1)
    parser.add_argument('-model_config', type=str, default='lstm.config')
    parser.add_argument('-loss_split', type=str, default='1-1-1')
    parser.add_argument('-log_dir', type=str, default='logs/tensorboardlogs/')
    parser.add_argument('-save_path', type=str, default='saved_models/')
    parser.add_argument(
        '-embed_path',
        type=str,
        default='/remote-home/txsun/data/word-embedding/glove/glove.6B.300d.txt'
    )
    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    # Effective per-step batch size when gradients are accumulated.
    bsz = args.batch_size // args.accumulation_steps

    global logger
    global model_config
    global config_str

    # Read the 'key: value' configuration file.
    model_config = {}
    print('Reading configure file {}...'.format(args.model_config))
    with open(args.model_config, 'r') as f:
        for line in f:
            if ':' not in line:
                continue  # tolerate blank / malformed lines (was: IndexError)
            # FIX: split only on the first ':' so values containing a colon
            # are not truncated; also split once instead of twice.
            key, value = line.split(':', 1)
            key = key.strip()
            value = value.strip()
            model_config[key] = value
            print('{}: {}'.format(key, value))

    # Config fingerprint used to name the log file.
    config_str = ''.join(
        '{}-{}-'.format(key, value) for key, value in model_config.items())
    config_str += args.loss_split

    # Stream Handler: INFO and above to stdout.
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(
        logging.Formatter('[%(levelname)s] %(message)s'))
    logger.addHandler(stream_handler)

    # File Handler: full DEBUG log to logs/<config_str>.log.
    # FIX: ensure the 'logs' directory exists before the handler opens it.
    os.makedirs('logs', exist_ok=True)
    file_handler = logging.FileHandler('logs/' + config_str + '.log')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(
        logging.Formatter(
            fmt='%(asctime)s - [%(levelname)s] - %(name)s - %(message)s',
            datefmt='%Y/%m/%d %H:%M:%S'))
    logger.addHandler(file_handler)

    logger.info('========== Loading Datasets ==========')
    data = torch.load('data/all_data.pkl')
    vocab = data['vocab']
    args.vocab_size = len(vocab)

    global lb_vocabs
    lb_vocabs = data['class_dict']
    # Drop the third label vocabulary so exactly POS / NER / Chunking remain.
    # NOTE(review): assumes data['class_dict'] arrives with four entries in
    # that order — confirm against the data-preparation script.
    del lb_vocabs[2]
    args.n_classes = [len(lb_voc) for lb_voc in lb_vocabs]
    logger.info('# POS Tagging labels: {}'.format(args.n_classes[0]))
    logger.info('# NER Tagging labels: {}'.format(args.n_classes[1]))
    logger.info('# Chunking labels: {}'.format(args.n_classes[2]))
    assert len(args.n_classes) == 3

    train_data = data['train']
    dev_data = data['dev']
    test_data = data['test']

    train_set = SeqLabDataset(train_data)
    train_iter = DataLoader(train_set,
                            batch_size=bsz,
                            drop_last=True,
                            shuffle=True,
                            num_workers=2,
                            collate_fn=custom_collate)
    logger.info('Train set loaded.')
    dev_set = SeqLabDataset(dev_data)
    dev_iter = DataLoader(dev_set,
                          batch_size=args.batch_size,
                          num_workers=2,
                          collate_fn=custom_collate)
    logger.info('Development set loaded.')
    # NOTE(review): test_iter is built but never passed to train() below —
    # evaluation on the test split presumably happens elsewhere.
    test_set = SeqLabDataset(test_data)
    test_iter = DataLoader(test_set,
                           batch_size=args.batch_size,
                           num_workers=2,
                           collate_fn=custom_collate)
    logger.info('Test set loaded.')
    logger.info('Datasets finished.')

    logger.info('====== Loading Word Embedding =======')
    word_embedding = load_word_emb(args.embed_path, 300, vocab)

    logger.info('========== Preparing Model ==========')
    if model_config['model'] == 'transformer':
        model = Transformer(args, model_config, word_embedding)
    elif model_config['model'] == 'LSTM':
        model = BiLSTM(args, model_config, word_embedding)
    else:
        logger.error('No support for {}.'.format(model_config['model']))
        return

    logger.info('Model parameters:')
    for name, param in model.named_parameters():
        logger.info('{}: {}'.format(name, param.shape))
    logger.info('# Parameters: {}.'.format(
        sum(param.numel() for param in model.parameters())))

    logger.info('========== Training Model ==========')
    opt = optim.Adam(model.parameters(), lr=float(model_config['lr']))
    train(model, train_iter, dev_iter, opt, args)
    return