Example #1
def train_skipgram(corpus_dir, extn, learning_rate, embedding_size, num_negsample, epochs, batch_size, output_dir, valid_size):
    '''

    :param corpus_dir: folder containing WL kernel relabeled files. All the files in this folder will be relabeled
    according to the WL relabeling strategy, and each line in these files has the format: <target> <context 1> <context 2>....
    :param extn: extension of the WL relabeled files
    :param learning_rate: learning rate for the skipgram model (will involve a linear decay)
    :param embedding_size: number of dimensions to be used for learning subgraph representations
    :param num_negsample: number of negative samples to be used by the skipgram model
    :param epochs: number of times the dataset is traversed by the skipgram model
    :param batch_size: size of each batch for the skipgram model
    :param output_dir: the folder where the embedding file will be stored
    :param valid_size: number of subgraphs to be chosen at random to validate the goodness of the subgraph representation
    learning process in every epoch
    :return: name of the file that contains the subgraph embeddings (in the word2vec format proposed by Mikolov et al. (2013))
    '''

    op_fname = '_'.join([os.path.basename(corpus_dir), 'dims', str(embedding_size), 'epochs', str(epochs), 'embeddings.txt'])
    op_fname = os.path.join(output_dir, op_fname)
    if os.path.isfile(op_fname):
        logging.info('The embedding file: {} is already present, hence NOT training skipgram model '
                     'for subgraph vectors'.format(op_fname))
        return op_fname

    logging.info("Initializing SKIPGRAM...")
    corpus = Corpus(corpus_dir, extn=extn, max_files=0)  # just load 'max_files' files from this folder
    corpus.scan_and_load_corpus()
    valid_examples = np.concatenate((np.random.choice(corpus.high_freq_word_ids, valid_size, replace=False),
                                     np.random.choice(corpus.low_freq_word_ids, valid_size, replace=False)))

    model_skipgram = skipgram(
        doc_size=corpus._vocabsize,  # for the doc2vec skipgram model, the doc size should be the same as the word size
        vocabulary_size=corpus._vocabsize,  # size of i/p and o/p layers
        learning_rate=learning_rate,  # will decay over time?
        embedding_size=embedding_size,  # hidden layer neurons
        num_negsample=num_negsample,
        num_steps=epochs,  # no. of times the training set will be iterated through
        corpus=corpus,  # data set of (target,context) tuples
        valid_dataset=valid_examples,  # validation set (a small subset) of (target, context) tuples?
    )

    final_embeddings, final_weights = model_skipgram.train(
        corpus=corpus,
        batch_size=batch_size,
        valid_dataset=valid_examples,
    )


    logging.info('Write the matrix to a word2vec format file')
    save_embeddings(corpus, final_embeddings, embedding_size, op_fname)
    logging.info('Completed writing the final embeddings; please check file: {}'.format(op_fname))
    return op_fname
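A hypothetical invocation of train_skipgram, for illustration only; the directory names, file extension, and hyperparameter values below are assumptions, not taken from the original:

embedding_file = train_skipgram(
    corpus_dir='wl_corpus',        # hypothetical folder of WL-relabeled subgraph files
    extn='WL2',                    # hypothetical extension of those files
    learning_rate=0.5,
    embedding_size=128,
    num_negsample=10,
    epochs=3,
    batch_size=256,
    output_dir='embeddings',
    valid_size=10,
)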
Example #2
def get_corpus(datadir, dataset):
    fn = os.path.join(datadir, 'cache.pt')
    if os.path.exists(fn):
        print('Load cached dataset...')
        corpus = torch.load(fn)
    else:
        corpus = Corpus(datadir, dataset)
        torch.save(corpus, fn)  # cache the parsed corpus for next time (assumed completion)
    return corpus
Example #3
def setup(seq_len, corpus_name, model_name):
    global seq_length, corpus_path, sample_path, corpus, vocab_size, model

    seq_length = seq_len

    corpus_path = './data/' + corpus_name
    sample_path = './sample/sample.txt'

    corpus = Corpus()
    corpus.get_data(corpus_path, batch_size)
    vocab_size = len(corpus.dictionary)

    model = RNNLM(vocab_size, embed_size, hidden_size, num_layers)
    model = model.cuda()
    model.load_state_dict(
        torch.load('./model/' + model_name,
                   map_location=lambda storage, loc: storage))
Example #4
def build_dataset(train_bs, test_bs):
    path = '../../data/ptb'
    train_path = os.path.join(path, 'train.txt')
    val_path = os.path.join(path, 'valid.txt')
    test_path = os.path.join(path, 'test.txt')

    corpus = Corpus([train_path, val_path, test_path], train_bs, train_bs,
                    test_bs)
    print('Data is loaded.')

    return corpus
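A hypothetical call, assuming the relative path '../../data/ptb' above exists; the batch sizes are illustrative only:

corpus = build_dataset(train_bs=20, test_bs=10)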
Example #5
def predict(args, is_eval=False):
    args.to_resume_model = True
    if is_eval:
        input_file_name = "dev.csv"
    else:
        input_file_name = "test.csv"
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    if args.model_name in MODEL_MAP:
        Config, Model, Tokenizer, Tansform = MODEL_MAP[args.model_name]
        config = Config.from_pretrained(args.pretrained_model_path,
                                        num_labels=args.num_labels)
        config = add_args_to_config(args, config)
        tokenizer = Tokenizer.from_pretrained(args.pretrained_model_path,
                                              do_lower_case=args.do_lower_case)
        transform = Tansform(tokenizer, args)
        model = load_model(Model, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)
        pub_data = Corpus(args, input_file_name, transform)
        pub_sampler = SequentialSampler(pub_data)
        pub_loader = DataLoader(pub_data,
                                batch_size=args.eval_batch_size,
                                sampler=pub_sampler)
        logits, _, _ = do_inference(model, pub_loader, device)
        df = pd.read_csv(os.path.join(args.data_dir, input_file_name))
        inference_label = logits.argmax(axis=1)
        df['label_pre'] = inference_label
        if is_eval:
            df['label_0'] = logits[:, 0]
            df['label_1'] = logits[:, 1]
            df[['id', 'label', 'label_pre', 'label_0',
                'label_1']].to_csv(os.path.join(args.out_dir, "dev_sub.csv"),
                                   index=False)
        else:
            df['label_0'] = logits[:, 0]
            df['label_1'] = logits[:, 1]
            filename = time.ctime().replace(' ', '-')
            label_filename = "label-" + filename
            filename = filename.replace(':', '-') + ".csv"
            label_filename = label_filename.replace(':', '-') + ".csv"
            df[['id', 'label_0',
                'label_1']].to_csv(os.path.join(args.out_dir, filename),
                                   index=False)
            # df[['id', 'label_pre']].to_csv(os.path.join(args.out_dir, label_filename), index=False)
            with open(os.path.join(args.out_dir, label_filename),
                      'w',
                      encoding='utf-8') as out:
                for i in range(df.shape[0]):
                    out.write("{}\t{}\n".format(df['id'][i],
                                                df['label_pre'][i]))
Example #6
def train(args):
    start = time.time()
    print(
        'Train with hid=%d layers=%d drop=%.3lf seq_len=%d lr=%.5lf, seed=%s' %
        (args.hidden_size, args.num_layers, args.dropout, args.seq_length,
         args.lr, args.seed))

    continuous_no_update_epochs = 0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    _ = torch.empty(size=[0], device=device)
    train_data, eval_data, test_data, vocab_size = Corpus().get_data(
        args.data_dir, args.batch_size)
    model = RNNLM(vocab_size=vocab_size,
                  embed_size=args.hidden_size,
                  hidden_size=args.hidden_size,
                  num_layers=args.num_layers,
                  device=device,
                  dropout=args.dropout,
                  batch_size=args.batch_size).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = get_optimizer(args.optimizer, model)

    best_val_loss = None
    for nth_epoch in range(1, args.epoch + 1):
        train_epoch(nth_epoch, model, train_data, criterion, optimizer, args)
        eval_loss = evaluate(model,
                             data=eval_data,
                             criterion=criterion,
                             seq_len=args.seq_length,
                             epoch=nth_epoch)
        if not best_val_loss or eval_loss < best_val_loss:
            print(' >>> Save model %.3lf -> %.3lf' %
                  ((np.exp(best_val_loss) if best_val_loss else 0.0),
                   np.exp(eval_loss)),
                  flush=True)
            best_val_loss = eval_loss
            continuous_no_update_epochs = 0
            model.save(get_model_path(args))
        else:
            continuous_no_update_epochs += 1
            print('', flush=True)
        if continuous_no_update_epochs == args.continuous_no_update_epochs_threshold:
            break

    print('Test result is %s' % (np.exp(
        evaluate(RNNLM.load(get_model_path(args)),
                 data=test_data,
                 criterion=criterion,
                 seq_len=args.seq_length,
                 test=True))))
    print('Finished in %.3lf min\n' % ((time.time() - start) / 60))
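For illustration, a sketch of the argparse-style namespace this train() expects; every value below is an assumption, and helpers such as get_model_path/get_optimizer may require additional fields:

import argparse

args = argparse.Namespace(
    data_dir='./data', batch_size=20, hidden_size=650, num_layers=2,
    dropout=0.5, seq_length=35, lr=20.0, seed=1111, epoch=40,
    optimizer='sgd', continuous_no_update_epochs_threshold=3,
)
train(args)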
Example #7
def predict(args, Model, tokenizer, config, transform, is_eval=False):
    args.to_resume_model = True
    if is_eval:
        input_file_name = "dev.csv"
    else:
        input_file_name = "test.csv"
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # if args.model_name in MODEL_MAP:
    #     Config, Model, Tokenizer,Tansform = MODEL_MAP[args.model_name]
    #     config = Config.from_pretrained(args.pretrained_model_path, num_labels=args.num_labels)
    #     config = add_args_to_config(args,config)
    #     tokenizer = Tokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=args.do_lower_case)
    #     transform = Tansform(tokenizer, args)
    model = load_model(Model, args, config)
    model = model.to(device)
    if args.n_gpus > 1:
        model = nn.DataParallel(model)
    pub_data = Corpus(args, input_file_name, transform)
    pub_sampler = SequentialSampler(pub_data)
    pub_loader = DataLoader(pub_data,
                            batch_size=args.eval_batch_size,
                            sampler=pub_sampler)
    logits, _, _ = do_inference(model, pub_loader, device)
    df = pd.read_csv(os.path.join(args.data_dir, input_file_name))
    inference_label = logits.argmax(axis=1)
    df['label_pre'] = inference_label
    if is_eval:
        df['label_0'] = logits[:, 0]
        df['label_1'] = logits[:, 1]
        df[[
            'id', 'label', 'label_pre', 'label_0', 'label_1', 'question1',
            'question2'
        ]].to_csv(os.path.join(args.out_dir, "dev_sub.csv"), index=False)
    else:
        df['label_0'] = logits[:, 0]
        df['label_1'] = logits[:, 1]
        filename = time.ctime().replace(' ', '-')
        label_filename = "label-" + filename
        filename = filename.replace(':', '-') + ".csv"
        label_filename = label_filename.replace(':', '-') + ".csv"
        df[['id', 'label_0',
            'label_1']].to_csv(os.path.join(args.out_dir, filename),
                               index=False)
        df[['id', 'label_pre']].to_csv(os.path.join(args.out_dir,
                                                    label_filename),
                                       index=False,
                                       header=False,
                                       sep='\t')
Example #8
import torch
from torch.autograd import Variable
from torch import nn, optim
from data_utils import Corpus

seq_length = 30

train_file = 'train.txt'
valid_file = 'valid.txt'
test_file = 'test.txt'
train_corpus = Corpus()
valid_corpus = Corpus()
test_corpus = Corpus()

train_id = train_corpus.get_data(train_file)
valid_id = valid_corpus.get_data(valid_file)
test_id = test_corpus.get_data(test_file)

vocab_size = len(train_corpus.dic)
num_batches = train_id.size(1) // seq_length


class languagemodel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers):
        super(languagemodel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim,
                            hidden_size,
                            num_layers,
                            batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
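    # A minimal forward pass consistent with the layers above (a sketch; not part of the original snippet).
    def forward(self, x, h):
        x = self.embed(x)                               # (batch, seq) -> (batch, seq, embed_dim)
        out, h = self.lstm(x, h)                        # (batch, seq, hidden_size)
        out = out.contiguous().view(-1, out.size(2))    # flatten to (batch*seq, hidden_size)
        return self.linear(out), h                      # logits over the vocabulary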
Example #9
def train(args, Model, tokenizer, config, transform):  ##
    """
    :param args: training arguments
    :param Model: Class of Model
    :param tokenizer: word tokenizer
    :param config: bert config instance
    :param transform: data transform instance
    :return:
    """
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    best_f1 = 0
    logger.info("the current config is :\n {}".format(str(vars(args))))
    set_seed(args)
    # if args.model_name in MODEL_MAP:
    #     Config, Model, Tokenizer, Transform = MODEL_MAP[args.model_name]
    #     config = BertConfig.from_pretrained(args.pretrained_model_path, num_labels=args.num_labels)
    #     config = add_args_to_config(args,config) ##add customized args
    #     tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=args.do_lower_case)
    model = load_model(Model, args, config)
    model = model.to(device)
    if args.n_gpus > 1:
        model = nn.DataParallel(model)

    train_data = Corpus(args, "train.csv", transform)
    dev_data = Corpus(args, 'dev.csv', transform)
    dev_sampler = SequentialSampler(dev_data)
    dev_loader = DataLoader(dev_data,
                            batch_size=args.eval_batch_size,
                            sampler=dev_sampler)

    # Run prediction for full data
    eval_sampler = SequentialSampler(dev_data)
    dev_loader = DataLoader(dev_data,
                            sampler=eval_sampler,
                            batch_size=args.eval_batch_size)
    train_sampler = RandomSampler(train_data)
    test_sampler = SubsetRandomSampler(
        np.random.randint(low=0, high=(len(train_data)), size=len(dev_data)))
    train_loader = DataLoader(train_data,
                              batch_size=args.batch_size,
                              sampler=train_sampler,
                              drop_last=True)

    test_loader = DataLoader(train_data,
                             batch_size=args.eval_batch_size,
                             sampler=test_sampler)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_data))
    logger.info("  Batch size = %d", args.batch_size)
    logger.info("  Num steps = %d", args.epochs)
    logger.info("  Early Stoppi   ng dev_loss = %f", args.dev_loss)
    bar = tqdm(total=len(train_loader) * args.epochs)
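    # Parameters matching no_decay (bias and LayerNorm weights) are excluded from weight decay below.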
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=0,
                                     t_total=len(bar))
    steps = 0
    total_train_loss = 0
    set_seed(args)
    for _ in range(args.epochs):
        for step, data_batch in enumerate(train_loader):
            bar.update(1)
            model.train()
            for k, v in data_batch.items():
                data_batch[k] = v.to(device)
            loss = model(batch=data_batch, feed_labels=True)
            if args.n_gpus > 1:
                loss = loss.mean()
            optimizer.zero_grad()  ## clear previous grad
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                model.parameters(), args.max_grad_norm
            )  ## clip the gradient norm; per the GLUE baselines this helps
            optimizer.step()
            scheduler.step()
            # while len(model.points)>100:
            # model.points.pop(0)
            ##setting bar
            steps += 1
            if steps > args.optimize_steps:
                print("early stopping in {} steps".format(args.optimize_steps))
                break
            total_train_loss += loss.item()
            bar.set_description("training loss {}".format(loss.item()))
            if (steps) % args.eval_steps == 0:
                logits, loss, dev_labels = do_inference(
                    model, dev_loader, device)
                test_logits, test_loss, test_labels = do_inference(
                    model, test_loader, device)
                inference_labels = logits.argmax(axis=1)
                test_inference_labels = test_logits.argmax(axis=1)
                f1 = f1_score(y_true=dev_labels,
                              y_pred=inference_labels,
                              average='macro')
                c1_f1, c2_f1 = f1_score(y_true=dev_labels,
                                        y_pred=inference_labels,
                                        average=None)
                test_f1 = f1_score(y_true=test_labels,
                                   y_pred=test_inference_labels,
                                   average='macro')
                acc = accuracy_score(dev_labels, inference_labels)
                logger.info("=========eval report =========")
                logger.info("step : %s ", str(steps))
                logger.info("average_train loss: %s" %
                            (str(total_train_loss / steps)))
                logger.info("subset train loss: %s" % (str(test_loss)))
                logger.info("subset train f1 score: %s", str(test_f1))
                logger.info("eval loss: %s", str(loss))
                logger.info("eval acc: %s", str(acc))
                logger.info("eval f1 score: %s", str(f1))
                logger.info("eval label 0 f1 score: %s", str(c1_f1))
                logger.info("eval label 1 f1 score: %s", str(c2_f1))
                output_eval_file = os.path.join(args.out_dir,
                                                "eval_records.txt")
                with open(output_eval_file, "a") as writer:
                    if steps == args.eval_steps:
                        writer.write("\n%s\n" % (args.memo))
                    writer.write("=========eval report =========\n")
                    writer.write("step : %s \n" % (str(steps)))
                    writer.write("average_train loss: %s\n" %
                                 (str(total_train_loss / steps)))
                    writer.write("subset train loss: %s\n" % (str(test_loss)))
                    writer.write("subset f   1 score: %s\n" % (str(test_f1)))
                    writer.write("eval loss: %s\n" % (str(loss)))
                    writer.write("eval f1 score: %s\n" % (str(f1)))
                    writer.write("eval label 0 f1 score: %s\n" % str(c1_f1))
                    writer.write("eval label 1 f1 score: %s\n" % str(c2_f1))
                    writer.write('\n')
                if f1 > best_f1:
                    logger.info("we get a best dev f1 %s saving model....",
                                str(f1))
                    output_path = os.path.join(args.out_dir,
                                               "pytorch_model.bin")
                    if hasattr(model, 'module'):
                        logger.info("model has module")
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    torch.save(model_to_save.state_dict(), output_path)
                    logger.info("model saved")
                    best_f1 = f1
    save_config(args)
    logger.info("args saved")
    ##load the final model
    args.to_resume_model = True
    model = load_model(Model, args, config)
    model = model.to(device)
    if args.n_gpus > 1:
        model = nn.DataParallel(model)
    dev_logits, loss, dev_labels = do_inference(
        model, dev_loader, device)  ##do the inference for dev set
    pub_data = Corpus(args, 'test.csv', transform)
    pub_sampler = SequentialSampler(pub_data)
    pub_loader = DataLoader(pub_data,
                            batch_size=args.eval_batch_size,
                            sampler=pub_sampler)
    logits, loss, dev_labels = do_inference(model, dev_loader, device)
    test_logits, _, _ = do_inference(model, pub_loader, device)
    return dev_logits, dev_labels, test_logits
Example #10
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000  # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

# Load "Penn Treebank" dataset
corpus = Corpus()
ids = corpus.get_data('data/short.txt',
                      batch_size)  # label each word with word ids
# print(ids.shape)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length


# RNN based language model
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            num_layers,
Example #11
    def forward(self, x, h):
        x = self.embedding(x)
        x, hi = self.lstm(x, h)
        b, s, h = x.size()
        x = x.contiguous().view(b * s, h)
        x = self.linear(x)
        return x, hi


seq_length = 30

train_file = 'train.txt'
val_file = 'val.txt'
test_file = 'test.txt'
train_corpus = Corpus()
val_corpus = Corpus()
test_corpus = Corpus()

train_id = train_corpus.get_data(train_file)
val_id = train_corpus.get_data(val_file)
test_id = train_corpus.get_data(test_file)

vocab_size = len(train_corpus.dic)
num_batches = train_id.size(1) // seq_length

model = language_model(vocab_size, 128, 1024, 1)

# if torch.cuda.device_count() > 1:
#     model = nn.DataParallel(model)
if torch.cuda.is_available():
Example #12
from data_utils import Corpus
from config import args
import os
import pickle
from model import MOS
from sklearn.utils import shuffle

if __name__ == '__main__':

    if args.nhidlast < 0:
        args.nhidlast = args.emsize
    if args.dropoutl < 0:
        args.dropoutl = args.dropouth
    if args.small_batch_size < 0:
        args.small_batch_size = args.batch_size

    data = Corpus(args.data)
    vocab_size = len(data.dictionary)

    train_data = data.train
    val_data = data.valid
    test_data = data.test

    model = MOS(vocab_size)
    model.train(train_data, val_data)
Example #13
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000     # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

# Load "Penn Treebank" dataset
corpus = Corpus()
ids = corpus.get_data('data/train.txt', batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length


# RNN based language model
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, x, h):
        # Embed word ids to vectors
Example #14
from data_utils import Dictionary, Corpus

# Hyper Parameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000  # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

# Load Penn Treebank Dataset
train_path = './data/train.txt'
sample_path = './sample.txt'
corpus = Corpus()
ids = corpus.get_data(train_path, batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length


# RNN Based Language Model
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            num_layers,
                            batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
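A sketch of how ids and seq_length above are typically sliced into training mini-batches (targets are the inputs shifted one token to the right); this loop is illustrative and not part of the original snippet:

for i in range(0, ids.size(1) - seq_length, seq_length):
    inputs = ids[:, i:i + seq_length]                   # (batch_size, seq_length)
    targets = ids[:, (i + 1):(i + 1) + seq_length]      # next-word labels, same shape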
Example #15
    print(device, '\n')
    # Hyper-parameters
    embed_size = 220
    hidden_size = 220
    num_layers = 2
    num_epochs = 40
    num_samples = 5  # number of words to be sampled
    batch_size = 20
    seq_length = 30
    dropout = 0.5
    learning_rate = 0.01

    # Load "Penn Treebank" dataset
    corpus = Corpus()
    ids = corpus.get_data('data/train.txt', batch_size)  # divide to batch size
    valid_d = corpus.get_data('data/valid.txt', batch_size)
    test_d = corpus.get_data('data/test.txt', batch_size)
    vocab_size = len(corpus.dictionary)
    num_batches = ids.size(1) // seq_length
    best_val_loss = None

    model = RNNLM(vocab_size, embed_size, hidden_size, num_layers, dropout).to(device)

    # Calculate the number of trainable parameters in the model
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print('Number of trainable parameters: ', params)

    # Loss and optimizer
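    # A sketch of a typical continuation (the original's exact choices are not shown here):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)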
Example #16
from data_utils import Dictionary, Corpus

# Hyper Parameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000   # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

# Load Penn Treebank Dataset
train_path = './data/train.txt'
sample_path = './sample.txt'
corpus = Corpus()
ids = corpus.get_data(train_path, batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length

# RNN Based Language Model
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_weights()
        
    def init_weights(self):
        self.embed.weight.data.uniform_(-0.1, 0.1)
Example #17
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 10000  # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002

# Load "Penn Treebank" dataset
corpus = Corpus()
ids = corpus.get_data('data/shakespeare.txt', batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length

infer_mode = False

model_path = "model.ckpt"

import sys

if len(sys.argv) > 1 and sys.argv[1] == 'infer':
    infer_mode = True

if infer_mode:
    print("Inference mode..")
Example #18
hidden_size = 200
num_layers = 2
num_epochs = 20
num_samples = 1000  # number of words to be sampled
batch_size = 20
seq_length = 20
learning_rate = 0.002
num_steps = 20
unrolling_size = 5
dtype = torch.cuda.LongTensor
ftype = torch.cuda.FloatTensor

# Load Penn Treebank Dataset
train_path = './data/ptb.train.txt'
test_path = './data/ptb.test.txt'
corpus = Corpus('./data/ptb.train.txt')
raw_data = corpus.get_data(train_path, batch_size)
vocab_size = len(corpus.dictionary)
data_len = len(raw_data)
n_seq = (data_len - 1) // num_steps
raw_data_x = raw_data[0:n_seq * num_steps].view(n_seq, num_steps)
raw_data_y = raw_data[1:n_seq * num_steps + 1].view(n_seq, num_steps)
logger = Logger('./logs')


def convert(data, unroll, num_steps):
    datalist = torch.split(data, 1, dim=1)
    x0 = torch.cat(datalist[:unroll], dim=1)
    x1 = torch.cat(datalist[unroll:], dim=1)
    dataconvert = torch.cat((x1, x0), dim=1)
    return dataconvert
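For illustration, convert() rotates the columns of its input left by unroll positions (the num_steps argument is unused); a small hypothetical check, assuming torch is imported as in the surrounding code:

example = torch.arange(8).view(2, 4)             # [[0, 1, 2, 3], [4, 5, 6, 7]]
print(convert(example, unroll=1, num_steps=4))   # [[1, 2, 3, 0], [5, 6, 7, 4]]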
Example #19
def train(args):
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    best_f1 = 0
    logger.info("the current config is :\n {}".format(str(vars(args))))
    set_seed(args)
    if args.model_name in MODEL_MAP:
        Config, Model, Tokenizer, Transform = MODEL_MAP[args.model_name]
        config = BertConfig.from_pretrained(args.pretrained_model_path,
                                            num_labels=args.num_labels)
        config = add_args_to_config(args, config)  ##add customized args
        tokenizer = BertTokenizer.from_pretrained(
            args.pretrained_model_path, do_lower_case=args.do_lower_case)
        model = load_model(LinearBertModel, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)
        ###adv training
        pgd = PGD(model)
        ###adv training
        transform = base_transform(tokenizer, args)
        train_data = Corpus(args, "train.csv", transform)
        dev_data = Corpus(args, 'dev.csv', transform)
        dev_sampler = SequentialSampler(dev_data)
        dev_loader = DataLoader(dev_data,
                                batch_size=args.eval_batch_size,
                                sampler=dev_sampler)

        # Run prediction for full data
        eval_sampler = SequentialSampler(dev_data)
        dev_loader = DataLoader(dev_data,
                                sampler=eval_sampler,
                                batch_size=args.eval_batch_size)
        train_sampler = RandomSampler(train_data)
        test_sampler = SubsetRandomSampler(
            np.random.randint(low=0,
                              high=(len(train_data)),
                              size=len(dev_data)))
        train_loader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  sampler=train_sampler,
                                  drop_last=True)

        test_loader = DataLoader(train_data,
                                 batch_size=args.eval_batch_size,
                                 sampler=test_sampler)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Batch size = %d", args.batch_size)
        logger.info("  Num steps = %d", args.epochs)
        logger.info("  Early Stopping dev_loss = %f", args.dev_loss)
        bar = tqdm(total=len(train_loader) * args.epochs)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=0,
                                         t_total=len(bar))
        steps = 0
        total_train_loss = 0
        set_seed(args)
        model.zero_grad()
        for _ in range(args.epochs):
            for step, data_batch in enumerate(train_loader):
                bar.update(1)
                model.train()
                for k, v in data_batch.items():
                    data_batch[k] = v.to(device)
                loss = model(batch=data_batch, feed_labels=True)
                if args.n_gpus > 1:
                    loss = loss.mean()
                loss.backward()
                ###adv training
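                # PGD adversarial training: back up the clean gradients, perturb the embedding
                # layer for a few attack steps, back-propagate the adversarial loss on top of the
                # clean gradients, then restore the original embedding parameters.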
                pgd.backup_grad()
                for t in range(2):
                    pgd.attack(is_first_attack=(t == 0))
                    if t != 2 - 1:
                        model.zero_grad()
                    else:
                        pgd.restore_grad()
                    loss_adv = model(batch=data_batch, feed_labels=True)
                    if args.n_gpus > 1:
                        loss_adv = loss_adv.mean()
                    loss_adv.backward()  # back-propagate; accumulate the adversarial gradient on top of the clean gradient
                pgd.restore()  # restore the original embedding parameters
                ###adv training
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()  ##delete the grad in the graph
                ##setting bar
                steps += 1
                total_train_loss += loss.item()
                bar.set_description("training loss {}".format(loss.item()))
                if (steps) % args.eval_steps == 0:
                    logits, loss, dev_labels = do_inference(
                        model, dev_loader, device)
                    test_logits, test_loss, test_labels = do_inference(
                        model, test_loader, device)
                    inference_labels = logits.argmax(axis=1)
                    test_inference_labels = test_logits.argmax(axis=1)
                    f1 = f1_score(y_true=dev_labels, y_pred=inference_labels)
                    test_f1 = f1_score(y_true=test_labels,
                                       y_pred=test_inference_labels)
                    acc = accuracy_score(dev_labels, inference_labels)
                    logger.info("=========eval report =========")
                    logger.info("step : %s ", str(steps))
                    logger.info("average_train loss: %s" %
                                (str(total_train_loss / steps)))
                    logger.info("subset train loss: %s" % (str(test_loss)))
                    logger.info("subset train f1 score: %s", str(test_f1))
                    logger.info("eval loss: %s", str(loss))
                    logger.info("eval f1 score: %s", str(f1))
                    logger.info("eval acc: %s", str(acc))
                    output_eval_file = os.path.join(args.out_dir,
                                                    "eval_records.txt")
                    with open(output_eval_file, "a") as writer:
                        if steps == args.eval_steps:
                            writer.write("\n%s\n" % (args.memo))
                        writer.write("=========eval report =========\n")
                        writer.write("step : %s \n" % (str(steps)))
                        writer.write("average_train loss: %s\n" %
                                     (str(total_train_loss / steps)))
                        writer.write("subset train loss: %s\n" %
                                     (str(test_loss)))
                        writer.write("subset f1 score: %s\n" % (str(test_f1)))
                        writer.write("eval loss: %s\n" % (str(loss)))
                        writer.write("eval f1 score: %s\n" % (str(f1)))
                        writer.write('\n')
                    if f1 > best_f1:
                        logger.info("we get a best dev f1 %s saving model....",
                                    str(f1))
                        output_path = os.path.join(args.out_dir,
                                                   "pytorch_model.bin")
                        if hasattr(model, 'module'):
                            logger.info("model has module")
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        torch.save(model_to_save.state_dict(), output_path)
                        logger.info("model saved")
                        best_f1 = f1
        save_config(args)
        logger.info("args saved")
        ##load the final model
        args.to_resume_model = True
        model = load_model(Model, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)
        dev_logits, loss, dev_labels = do_inference(
            model, dev_loader, device)  ##do the inference for dev set
        pub_data = Corpus(args, 'test.csv', transform)
        pub_sampler = SequentialSampler(pub_data)
        pub_loader = DataLoader(pub_data,
                                batch_size=args.eval_batch_size,
                                sampler=pub_sampler)
        # logits, loss, dev_labels = do_inference(model, dev_loader, device)
        test_logits, _, _ = do_inference(model, pub_loader, device)
        return dev_logits, dev_labels, test_logits
    else:
        logger.info("the model %s is not registered", args.model_name)
        return
Example #20
def train_skipgram(corpus_dir, extn, learning_rate, embedding_size,
                   num_negsample, epochs, batch_size, output_dir, valid_size):
    '''

    :param corpus_dir: folder containing WL kernel relabeled files. All the files in this folder will be relabeled
    according to the WL relabeling strategy, and each line in these files has the format: <target> <context 1> <context 2>....
    :param extn: extension of the WL relabeled files
    :param learning_rate: learning rate for the skipgram model (will involve a linear decay)
    :param embedding_size: number of dimensions to be used for learning subgraph representations
    :param num_negsample: number of negative samples to be used by the skipgram model
    :param epochs: number of times the dataset is traversed by the skipgram model
    :param batch_size: size of each batch for the skipgram model
    :param output_dir: the folder where the embedding file will be stored
    :param valid_size: number of subgraphs to be chosen at random to validate the goodness of the subgraph representation
    learning process in every epoch
    :return: name of the file that contains the subgraph embeddings (in the word2vec format proposed by Mikolov et al. (2013))
    '''

    op_fname = '_'.join([
        os.path.basename(corpus_dir), 'dims',
        str(embedding_size), 'epochs',
        str(epochs), 'embeddings.txt'
    ])
    op_fname = os.path.join(output_dir, op_fname)
    if os.path.isfile(op_fname):
        logging.info(
            'The embedding file: {} is already present, hence NOT training skipgram model '
            'for subgraph vectors'.format(op_fname))
        return op_fname

    logging.info("Initializing SKIPGRAM...")
    corpus = Corpus(
        corpus_dir, extn=extn,
        max_files=0)  # just load 'max_files' files from this folder
    corpus.scan_and_load_corpus()
    valid_examples = np.concatenate(
        (np.random.choice(corpus.high_freq_word_ids, valid_size,
                          replace=False),
         np.random.choice(corpus.low_freq_word_ids, valid_size,
                          replace=False)))

    model_skipgram = skipgram(
        doc_size=corpus._vocabsize,  # for the doc2vec skipgram model, the doc size should be the same as the word size
        vocabulary_size=corpus._vocabsize,  # size of i/p and o/p layers
        learning_rate=learning_rate,  # will decay over time?
        embedding_size=embedding_size,  # hidden layer neurons
        num_negsample=num_negsample,
        num_steps=epochs,  # no. of times the training set will be iterated through
        corpus=corpus,  # data set of (target, context) tuples
        valid_dataset=valid_examples,  # validation set (a small subset) of (target, context) tuples?
    )

    final_embeddings, final_weights = model_skipgram.train(
        corpus=corpus,
        batch_size=batch_size,
        valid_dataset=valid_examples,
    )

    logging.info('Write the matrix to a word2vec format file')
    save_embeddings(corpus, final_embeddings, embedding_size, op_fname)
    logging.info(
        'Completed writing the final embeddings; please check file: {}'.format(op_fname))
    return op_fname
Example #21
def train(args):
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    set_seed(args)
    best_f1 = 0
    logger.info("the current config is :\n {}".format(str(vars(args))))
    if args.model_name in MODEL_MAP:
        Config, Model, Tokenizer, Transform = MODEL_MAP[args.model_name]
        config = Config.from_pretrained(args.pretrained_model_path,
                                        num_labels=args.num_labels)
        config = add_args_to_config(args, config)  ##add customized args
        tokenizer = Tokenizer.from_pretrained(args.pretrained_model_path,
                                              do_lower_case=args.do_lower_case)
        model = load_model(Model, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)
        ###adv training
        pgd = PGD(model)
        transform = Transform(tokenizer, args)
        train_data = Corpus(args, "train.csv", transform)
        ###get the weighted sample with the weight [0.9,0.2,0.5]
        # weight = [0.9,0.2,0.5]
        # weight_sequence = []
        # for i in range(len(train_data)):
        #     data = train_data[i]
        #     label =data.get('label').item()
        #     weight_sequence.append(weight[label]) ###add the weight of this label
        dev_data = Corpus(args, 'dev.csv', transform)
        dev_sampler = SequentialSampler(dev_data)
        dev_loader = DataLoader(dev_data,
                                batch_size=args.eval_batch_size,
                                sampler=dev_sampler)

        # Run prediction for full data
        eval_sampler = SequentialSampler(dev_data)
        dev_loader = DataLoader(dev_data,
                                sampler=eval_sampler,
                                batch_size=args.eval_batch_size)

        train_sampler = RandomSampler(train_data)
        # weight_sampler = WeightedRandomSampler(weights=weight_sequence,num_samples=args.epochs*len(train_data), replacement=True)
        test_sampler = SubsetRandomSampler(
            np.random.randint(low=0,
                              high=(len(train_data)),
                              size=len(dev_data)))
        train_loader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  sampler=train_sampler,
                                  drop_last=True)

        test_loader = DataLoader(train_data,
                                 batch_size=args.eval_batch_size,
                                 sampler=test_sampler)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Batch size = %d", args.batch_size)
        logger.info("  Num steps = %d", args.epochs)
        logger.info("  Early Stopping dev_loss = %f", args.dev_loss)
        bar = tqdm(range(len(train_loader) * args.epochs),
                   total=len(train_loader) * args.epochs)
        train_loader = cycle(train_loader)
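        # cycle() (presumably itertools.cycle) replays the DataLoader indefinitely so the single
        # flat loop below covers all epochs; note that it caches the first pass, so batches are
        # not reshuffled between epochs.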
        ##get optimizer
        param_optimizer = list(model.named_parameters())

        # hack to remove the pooler, which is not used
        # and would otherwise produce None grads that break apex
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            args.weight_decay
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=0,
                                         t_total=len(bar))
        steps = 0
        # dev_labels = dev_data.get_feature("label")
        # dev_labels = [i.item() for i in dev_labels]# get gold label
        total_train_loss = 0
        for step in bar:
            model.train()
            data_batch = next(train_loader)
            for k, v in data_batch.items():
                data_batch[k] = v.to(device)
            loss = model(batch=data_batch, feed_labels=True)
            if args.n_gpus > 1:
                loss = loss.mean()
            loss.backward()
            ###adv training
            pgd.backup_grad()
            for t in range(1):
                pgd.attack(is_first_attack=(t == 0))
                if t != 1 - 1:
                    model.zero_grad()
                else:
                    pgd.restore_grad()
                loss_adv = model(batch=data_batch, feed_labels=True)
                if args.n_gpus > 1:
                    loss_adv = loss_adv.mean()
                loss_adv.backward()  # back-propagate; accumulate the adversarial gradient on top of the clean gradient
            pgd.restore()  # restore the original embedding parameters
            ###adv training
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            ##setting bar
            steps += 1
            total_train_loss += loss.item()
            bar.set_description("training loss {}".format(loss.item()))
            if (steps) % args.eval_steps == 0:
                logits, loss, dev_labels = do_inference(
                    model, dev_loader, device)
                test_logits, test_loss, test_labels = do_inference(
                    model, test_loader, device)
                inference_labels = logits.argmax(axis=1)
                test_inference_labels = test_logits.argmax(axis=1)
                f1 = f1_score(dev_labels,
                              inference_labels,
                              labels=[0, 1, 2],
                              average="macro")
                test_f1 = f1_score(test_labels,
                                   test_inference_labels,
                                   labels=[0, 1, 2],
                                   average="macro")
                # acc = accuracy_score(dev_labels, inference_labels)
                logger.info("=========eval report =========")
                logger.info("step : %s ", str(steps))
                logger.info("average_train loss: %s" %
                            (str(total_train_loss / steps)))
                logger.info("subset train loss: %s" % (str(test_loss)))
                logger.info("subset train f1 score: %s", str(test_f1))
                logger.info("eval loss: %s", str(loss))
                logger.info("eval f1 score: %s", str(f1))
                output_eval_file = os.path.join(args.out_dir,
                                                "eval_records.txt")
                with open(output_eval_file, "a") as writer:
                    if steps == args.eval_steps:
                        writer.write("\n%s\n" % (args.memo))
                    writer.write("=========eval report =========\n")
                    writer.write("step : %s \n" % (str(steps)))
                    writer.write("average_train loss: %s\n" %
                                 (str(total_train_loss / steps)))
                    writer.write("subset train loss: %s\n" % (str(test_loss)))
                    writer.write("subset f1 score: %s\n" % (str(test_f1)))
                    writer.write("eval loss: %s\n" % (str(loss)))
                    writer.write("eval f1 score: %s\n" % (str(f1)))
                    writer.write('\n')
                if f1 > best_f1:
                    logger.info("we get a best dev f1 %s saving model....",
                                str(f1))
                    output_path = os.path.join(args.out_dir,
                                               "pytorch_model.bin")
                    if hasattr(model, 'module'):
                        logger.info("model has module")
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    torch.save(model_to_save.state_dict(), output_path)
                    logger.info("model saved")
                    best_f1 = f1
        save_config(args)
        logger.info("args saved")
        ##load the final model
        args.to_resume_model = True
        model = load_model(Model, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)
        dev_logits, loss, dev_labels = do_inference(
            model, dev_loader, device)  ##do the inference for dev set
        pub_data = Corpus(args, 'test.csv', transform)
        pub_sampler = SequentialSampler(pub_data)
        pub_loader = DataLoader(pub_data,
                                batch_size=args.eval_batch_size,
                                sampler=pub_sampler)
        # logits, loss, dev_labels = do_inference(model, dev_loader, device)
        test_logits, _, _ = do_inference(model, pub_loader, device)
        return dev_logits, dev_labels, test_logits
    else:
        logger.info("the model %s is not registered", args.model_name)
        return
Example #22
num_layers = 1
#num_layers = 2

num_epochs = 10

num_samples = 10000  # number of words to be sampled

batch_size = 20

seq_length = 30

learning_rate = 0.002

# Load "Penn Treebank" dataset

corpus = Corpus()

ids = corpus.get_data('data/wikitext-2-v1.train.tokens', batch_size)

vocab_size = len(corpus.dictionary)

num_batches = ids.size(1) // seq_length

# RNN based language model


class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()

        self.embed = nn.Embedding(vocab_size, embed_size)
Example #23
path_data = os.path.join(path_this, 'data', 'meditations.mb.txt')

from data_utils import Corpus

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embed_size = 128
hidden_size = 1024
num_layers = 1
num_epoch = 20
batch_size = 20
max_seq = 30
learning_rate = 0.002
max_vocab = None

corpus = Corpus()
tensor = corpus.fit(path_data, limit=max_vocab)
vocab_size = len(corpus.vocab.stoi)
tot_batch = tensor.shape[1] // max_seq

# not yet?
# tensor = tensor[:, :max_seq * tot_batch] # remove spilled


# RNN based language model
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
Example #24
import numpy as np
from torch.nn.utils import clip_grad_norm_
from data_utils import Dictionary, Corpus

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000
batch_size = 20
seq_length = 30
learning_rate = 0.002

corpus = Corpus()
ids = corpus.get_data('data/train.txt', batch_size)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length


class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):

        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            num_layers,
                            batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
Example #25
from torch.nn.utils import clip_grad_norm_
from data_utils import Dictionary, Corpus

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 0
num_samples = 1000
batch_size = 20
seq_length = 30
learning_rate = 0.002

corpus = Corpus()
ids = corpus.get_data('data/train.txt',
                      batch_size)  # (batch_size, word ids from many sentences)
vocab_size = len(corpus.dictionary)
num_batches = ids.size(1) // seq_length


class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            num_layers,
                            batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)