예제 #1
0
def load_model(models_path, glove_path, toy=False):
    """Construct a SuperModel and restore its trained sub-module weights.

    Args:
        models_path: Directory containing the ``*_models.dump`` checkpoint
            files for each sub-module.
        glove_path: Directory containing the GloVe embedding text file.
        toy: When True, load only a small subset of the embeddings
            (forwarded as ``use_small``) for quick experiments.

    Returns:
        The SuperModel instance with all sub-module state dicts loaded.
    """
    ### CONFIGURABLE
    GPU = True  # GPU activated
    B_word = 42  # GloVe corpus size (billions of tokens)
    N_word = 300  # word embedding dimension
    N_h = 300  # hidden layer size (kept for reference; unused below)
    N_depth = 2  # num LSTM layers (kept for reference; unused below)

    print("Loading GloVE word embeddings...")
    word_emb = load_word_emb('{}/glove.{}B.{}d.txt'.format(
        glove_path, B_word, N_word),
                             load_used=False,
                             use_small=toy)

    model = SuperModel(word_emb,
                       N_word=N_word,
                       gpu=GPU,
                       trainable_emb=False,
                       table_type='std',
                       use_hs=True)

    print("Loading trained models...")
    # (sub-module attribute, checkpoint file stem) pairs; the loop replaces
    # eight near-identical load_state_dict statements.
    _CHECKPOINTS = (
        ('multi_sql', 'multi_sql_models'),
        ('key_word', 'keyword_models'),
        ('col', 'col_models'),
        ('op', 'op_models'),
        ('agg', 'agg_models'),
        # 'root_teminal' [sic] — must match the attribute name on SuperModel.
        ('root_teminal', 'root_tem_models'),
        ('des_asc', 'des_asc_models'),
        ('having', 'having_models'),
    )
    for attr, stem in _CHECKPOINTS:
        getattr(model, attr).load_state_dict(
            torch.load("{}/{}.dump".format(models_path, stem)))
    return model
예제 #2
0
        # Register which fields the dev split feeds to the model (inputs)
        # and which it evaluates against (targets).
        task.dev_set.set_input('task_id', 'words_idx', flag=True)
        task.dev_set.set_target('label', flag=True)

        # Same input/target wiring for the test split.
        task.test_set.set_input('task_id', 'words_idx', flag=True)
        task.test_set.set_target('label', flag=True)

    # Persist the vocabulary, one word per line (line number == word index).
    logger.info('Finished. Dumping vocabulary to data/vocab.txt')
    with open('data/vocab.txt', mode='w', encoding='utf-8') as f:
        for i in range(len(vocab)):
            f.write(vocab.to_word(i) + '\n')

    # Sanity check: log the first sample of each split for every task.
    logger.info('Testing data...')
    for task in task_lst:
        logger.info(str(task.task_id) + ' ' + task.task_name)
        logger.info(task.train_set[0])
        logger.info(task.dev_set[0])
        logger.info(task.test_set[0])

    # Pickle the prepared task list so later runs can skip preprocessing.
    logger.info('Dumping data...')
    data = {'task_lst': task_lst}
    save_file = open('data/data.pkl', 'wb')
    pickle.dump(data, save_file)
    save_file.close()
    logger.info('Finished. Looking up for word embeddings...')
    # Presumably warms/saves the embedding cache for vocab; the returned
    # matrix is deliberately discarded here — TODO confirm side effects.
    embed_path = '/remote-home/txsun/data/word-embedding/glove/glove.840B.300d.txt'
    _ = load_word_emb(embed_path, 300, vocab)
    logger.info('Finished. Elapse: {}s.'.format(time.time() - start_time))
    # Detach handlers so repeated calls do not duplicate log output.
    logger.removeHandler(stream_handler)
    logger.removeHandler(file_handler)
# Switch matplotlib to the TkAgg backend (plt is imported earlier in the file).
plt.switch_backend('TkAgg')
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
import pickle

# Run everything on CPU.
device = torch.device("cpu")
N_word=100  # word embedding dimension (the 'd' in glove.<B>B.<d>d.txt)
B_word=6  # GloVe corpus size in billions of tokens (the 'B' in the filename)
hidden_size = 256
max_length = 1000
SOS_token = 0
CLASS_size = 6  # placeholder; overwritten below once the real classes load


word_emb = load_word_emb('../glove/glove.%dB.%dd.txt'%(B_word,N_word))
full_table, classes_, weight_tensor = load_data(device)
# Hold out 20% of the rows for evaluation.
train_df, test_df = train_test_split(full_table, test_size=0.2)
CLASS_size = len(classes_)
# Map each class label to a contiguous integer index.
class_index = range(CLASS_size)
class_dict = dict(zip(classes_, class_index))

import time
import math
def asMinutes(s):
    """Format a duration in seconds as '<minutes>m <seconds>s'."""
    minutes, seconds = divmod(s, 60)
    return '%dm %ds' % (minutes, seconds)
예제 #4
0
from inference import infer_script
from train_feedback import train_feedback
from utils import get_table_names, get_tables_html, load_word_emb

# @app.route('/')
# @app.route('/index')
# def index():
#    return "Hello, World!"

N_word = 300  # word embedding dimension (the 'd' in glove.<B>B.<d>d.txt)
B_word = 42  # GloVe corpus size in billions of tokens
LOAD_USED_W2I = False  # presumably: reuse a saved word-to-index map — verify in load_word_emb
USE_SMALL = True  # load only a small embedding subset for faster startup
print("Creating word embedding dictionary...")
word_emb = load_word_emb('glove/glove.%dB.%dd.txt'%(B_word,N_word), \
      load_used=LOAD_USED_W2I,
      use_small=USE_SMALL)


@app.route('/')
@app.route('/index')
@app.route('/input')
def cesareans_input():
    """Serve the input form page (mapped to '/', '/index' and '/input')."""

    return render_template("input.html")


@app.route('/output')
# @app.route('/')
# @app.route('/index')
def cesareans_output():
예제 #5
0

if __name__ == '__main__':
    # Command-line interface: takes the embedding path without a suffix.
    arg_parser = argparse.ArgumentParser(
        description="Parse word embeddings in text format.")
    arg_parser.add_argument('--embedding_text_file',
                            help="Specify the path without file-suffix!",
                            required=True,
                            type=str)
    cli_args = arg_parser.parse_args()
    base_path = cli_args.embedding_text_file

    # Convert <path>.txt into <path>.npy + <path>.vocab.
    print(
        "Convert embedding file '{0}.txt' to the binary file '{0}.npy' and the vocab-file '{0}.vocab'"
        .format(base_path))
    convert_to_binary(base_path)

    # Time the binary reload for comparison against the text loader.
    print("Converting done! Try to reload.")
    timer = TicToc()
    timer.tic()
    word_embedding_map = load_word_emb_binary(base_path)
    loading_time = timer.tocvalue()
    print("Load {} words and embeddings in {} seconds".format(
        len(word_embedding_map), loading_time))

    # Reload the same embeddings from text and report the elapsed time.
    timer.tic()
    word_embedding_map_2 = load_word_emb(base_path + '.txt')
    timer.toc(msg="Loading it as text file takes")
예제 #6
0
def main():
    """Entry point: train a text classifier on a single dataset.

    Parses CLI arguments and the model config file, wires up stream/file
    logging, loads the pickled dataset and GloVe embeddings, builds a
    Transformer and trains it. Shares the module-level globals ``logger``,
    ``model_config``, ``config_str`` and ``vocab`` with other functions in
    this file.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n_epoch', type=int, default=30)
    parser.add_argument('-batch_size', type=int, default=50)
    parser.add_argument('-gpu', type=str, default='0')
    parser.add_argument('-accumulation_steps', type=int, default=1)
    parser.add_argument('-freeze', type=int, default=0)
    parser.add_argument('-same_lr', type=int, default=0)
    parser.add_argument('-dataset', type=str, default='sports')
    parser.add_argument('-model_config', type=str, default='tf-6-4-512.config')
    parser.add_argument('-add_com', type=str, default='stl')
    parser.add_argument(
        '-log_dir',
        type=str,
        default='/remote-home/txsun/fnlp/watchboard/product/stl')
    parser.add_argument('-save_path', type=str, default='saved_models/')
    parser.add_argument(
        '-embed_path',
        type=str,
        default=
        '/remote-home/txsun/data/word-embedding/glove/glove.840B.300d.txt')

    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    # Effective per-step batch size when gradients are accumulated.
    bsz = args.batch_size // args.accumulation_steps

    global logger
    global model_config
    global config_str

    model_config = {}
    print('Reading configure file {}...'.format(args.model_config))
    with open(args.model_config, 'r') as f:
        for line in f:
            # Split on the FIRST ':' only, so values that themselves
            # contain ':' (e.g. paths) are not truncated as they were with
            # split(':')[1]; lines without a ':' (blank/malformed) are
            # skipped instead of raising IndexError.
            key, sep, value = line.partition(':')
            if not sep:
                continue
            key = key.strip()
            value = value.strip()
            model_config[key] = value
            print('{}: {}'.format(key, value))

    # e.g. 'key1-val1-key2-val2-<add_com>' — reused as the log-file name.
    config_str = ''.join('{}-{}-'.format(key, value)
                         for key, value in model_config.items())
    config_str += args.add_com

    # Stream Handler: INFO and above to stdout.
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(logging.INFO)
    stream_formatter = logging.Formatter('[%(levelname)s] %(message)s')
    stream_handler.setFormatter(stream_formatter)
    logger.addHandler(stream_handler)

    # File Handler: DEBUG and above to logs/<dataset>/<config_str>.
    log_path = os.path.join('logs', args.dataset)
    # makedirs(exist_ok=True) also creates a missing 'logs' parent and is
    # race-free, unlike the original exists()+mkdir pair.
    os.makedirs(log_path, exist_ok=True)
    log_path = os.path.join(log_path, config_str)
    file_handler = logging.FileHandler(log_path, mode='w')
    file_handler.setLevel(logging.DEBUG)
    file_formatter = logging.Formatter(
        fmt='%(asctime)s - [%(levelname)s] - %(name)s - %(message)s',
        datefmt='%Y/%m/%d %H:%M:%S')
    file_handler.setFormatter(file_formatter)
    logger.addHandler(file_handler)

    logger.info('========== Loading Datasets ==========')
    dataset_file = os.path.join('data', args.dataset, 'data.pkl')
    logger.info('Loading dataset {}...'.format(dataset_file))
    data = torch.load(dataset_file)
    global vocab
    vocab = data['vocab']
    args.vocab_size = len(vocab)
    lb_vocab = data['class_dict']
    args.n_class = len(lb_vocab)
    logger.info('# classes: {}'.format(args.n_class))

    train_data = data['train']
    dev_data = data['dev']
    test_data = data['test']

    train_set = ClsDataset(train_data)
    train_iter = DataLoader(train_set,
                            batch_size=bsz,
                            drop_last=True,
                            shuffle=True,
                            num_workers=2,
                            collate_fn=custom_collate)
    logger.info('Train set loaded.')

    dev_set = ClsDataset(dev_data)
    dev_iter = DataLoader(dev_set,
                          batch_size=args.batch_size,
                          num_workers=2,
                          collate_fn=custom_collate)
    logger.info('Development set loaded.')

    test_set = ClsDataset(test_data)
    test_iter = DataLoader(test_set,
                           batch_size=args.batch_size,
                           num_workers=2,
                           collate_fn=custom_collate)
    logger.info('Test set loaded.')
    logger.info('Datasets finished.')

    logger.info('====== Loading Word Embedding =======')
    we_path = os.path.join('data', args.dataset, 'word_embedding.npy')
    word_embedding = load_word_emb(args.embed_path,
                                   300,
                                   vocab,
                                   save_path=we_path)

    logger.info('========== Preparing Model ==========')
    model = Transformer(args, model_config, word_embedding)

    # Log every trainable parameter and count the total.
    logger.info('Model parameters:')
    sum_param = 0
    for name, param in model.named_parameters():
        if param.requires_grad:  # was 'requires_grad == True'
            logger.info('{}: {}'.format(name, param.shape))
            sum_param += param.numel()
    logger.info('# Parameters: {}.'.format(sum_param))

    logger.info('========== Training Model ==========')
    lr = float(model_config['lr'])
    if args.same_lr or args.freeze:
        # One learning rate for every trainable parameter.
        opt = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                         lr=lr)
    else:
        # Word embeddings train with a 10x smaller learning rate.
        word_embed_params = list(
            map(id, model.embed.word_embeddings.parameters()))
        base_params = filter(lambda p: id(p) not in word_embed_params,
                             model.parameters())
        opt = optim.Adam([{
            'params': base_params
        }, {
            'params': model.embed.word_embeddings.parameters(),
            'lr': lr * 0.1
        }],
                         lr=lr)

    train(model, train_iter, dev_iter, test_iter, opt, args)
예제 #7
0
def main():
    """Entry point: train a sequence-labelling model on POS/NER/Chunking.

    Parses CLI arguments and the model config file, wires up stream/file
    logging, loads the pickled multi-task dataset and GloVe embeddings,
    builds a Transformer or BiLSTM (per the config) and trains it. Shares
    the module-level globals ``logger``, ``model_config``, ``config_str``
    and ``lb_vocabs`` with other functions in this file.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n_epoch', type=int, default=15)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-gpu', type=str, default='0')
    parser.add_argument('-accumulation_steps', type=int, default=1)
    parser.add_argument('-model_config', type=str, default='lstm.config')
    parser.add_argument('-loss_split', type=str, default='1-1-1')
    parser.add_argument('-log_dir', type=str, default='logs/tensorboardlogs/')
    parser.add_argument('-save_path', type=str, default='saved_models/')
    parser.add_argument(
        '-embed_path',
        type=str,
        default='/remote-home/txsun/data/word-embedding/glove/glove.6B.300d.txt'
    )

    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    # Effective per-step batch size when gradients are accumulated.
    bsz = args.batch_size // args.accumulation_steps

    global logger
    global model_config
    global config_str

    model_config = {}
    print('Reading configure file {}...'.format(args.model_config))
    with open(args.model_config, 'r') as f:
        for line in f:
            # Split on the FIRST ':' only, so values containing ':' (e.g.
            # paths) are kept intact — split(':')[1] truncated them; skip
            # lines without ':' instead of raising IndexError.
            key, sep, value = line.partition(':')
            if not sep:
                continue
            key = key.strip()
            value = value.strip()
            model_config[key] = value
            print('{}: {}'.format(key, value))

    # e.g. 'key1-val1-key2-val2-<loss_split>' — reused as the log-file name.
    config_str = ''.join('{}-{}-'.format(key, value)
                         for key, value in model_config.items())
    config_str += args.loss_split

    # Stream Handler: INFO and above to stdout.
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(logging.INFO)
    stream_formatter = logging.Formatter('[%(levelname)s] %(message)s')
    stream_handler.setFormatter(stream_formatter)
    logger.addHandler(stream_handler)

    # File Handler: DEBUG and above to logs/<config_str>.log.
    # The original assumed 'logs/' already existed; create it if not.
    os.makedirs('logs', exist_ok=True)
    file_handler = logging.FileHandler('logs/' + config_str + '.log')
    file_handler.setLevel(logging.DEBUG)
    file_formatter = logging.Formatter(
        fmt='%(asctime)s - [%(levelname)s] - %(name)s - %(message)s',
        datefmt='%Y/%m/%d %H:%M:%S')
    file_handler.setFormatter(file_formatter)
    logger.addHandler(file_handler)

    logger.info('========== Loading Datasets ==========')
    data = torch.load('data/all_data.pkl')
    vocab = data['vocab']
    args.vocab_size = len(vocab)

    global lb_vocabs
    lb_vocabs = data['class_dict']
    # Drop the label vocab at index 2 — presumably an unused task in this
    # run; TODO confirm which task that index corresponds to.
    del lb_vocabs[2]
    args.n_classes = [len(lb_voc) for lb_voc in lb_vocabs]

    logger.info('# POS Tagging labels: {}'.format(args.n_classes[0]))
    logger.info('# NER Tagging labels: {}'.format(args.n_classes[1]))
    logger.info('# Chunking labels: {}'.format(args.n_classes[2]))
    assert len(args.n_classes) == 3
    train_data = data['train']
    dev_data = data['dev']
    test_data = data['test']

    train_set = SeqLabDataset(train_data)
    train_iter = DataLoader(train_set,
                            batch_size=bsz,
                            drop_last=True,
                            shuffle=True,
                            num_workers=2,
                            collate_fn=custom_collate)
    logger.info('Train set loaded.')

    dev_set = SeqLabDataset(dev_data)
    dev_iter = DataLoader(dev_set,
                          batch_size=args.batch_size,
                          num_workers=2,
                          collate_fn=custom_collate)
    logger.info('Development set loaded.')

    test_set = SeqLabDataset(test_data)
    test_iter = DataLoader(test_set,
                           batch_size=args.batch_size,
                           num_workers=2,
                           collate_fn=custom_collate)
    logger.info('Test set loaded.')
    logger.info('Datasets finished.')

    logger.info('====== Loading Word Embedding =======')
    word_embedding = load_word_emb(args.embed_path, 300, vocab)

    logger.info('========== Preparing Model ==========')
    if model_config['model'] == 'transformer':
        model = Transformer(args, model_config, word_embedding)
    elif model_config['model'] == 'LSTM':
        model = BiLSTM(args, model_config, word_embedding)
    else:
        logger.error('No support for {}.'.format(model_config['model']))
        return

    # Log every parameter and the total parameter count.
    logger.info('Model parameters:')
    for name, param in model.named_parameters():
        logger.info('{}: {}'.format(name, param.shape))
    logger.info('# Parameters: {}.'.format(
        sum(param.numel() for param in model.parameters())))

    logger.info('========== Training Model ==========')
    opt = optim.Adam(model.parameters(), lr=float(model_config['lr']))
    train(model, train_iter, dev_iter, opt, args)