Example No. 1
def buildDataSets():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Model parameters
    MAX_SEQ_LEN = 16
    PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

    # Fields

    label_field = Field(sequential=False,
                        use_vocab=False,
                        batch_first=True,
                        dtype=torch.int8)
    text_field = Field(use_vocab=False,
                       tokenize=tokenizer.encode,
                       lower=False,
                       include_lengths=False,
                       batch_first=True,
                       fix_length=MAX_SEQ_LEN,
                       pad_token=PAD_INDEX,
                       unk_token=UNK_INDEX)

    fields = {'label': ('label', label_field), 'text': ('text', text_field)}

    # TabularDataset

    train, valid, test = TabularDataset.splits(path='memesData/data',
                                               train='train.jsonl',
                                               validation='dev_unseen.jsonl',
                                               test='dev_seen.jsonl',
                                               format='JSON',
                                               fields=fields)

    # Iterators

    train_iter = BucketIterator(train,
                                batch_size=8,
                                sort_key=lambda x: len(x.text),
                                train=True,
                                sort=True,
                                sort_within_batch=True)
    valid_iter = BucketIterator(valid,
                                batch_size=8,
                                sort_key=lambda x: len(x.text),
                                train=True,
                                sort=True,
                                sort_within_batch=True)
    test_iter = Iterator(test,
                         batch_size=8,
                         train=False,
                         shuffle=False,
                         sort=False)
    return train_iter, valid_iter, test_iter
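The iterators returned above can then be consumed batch by batch. A minimal sketch, assuming the torchtext/transformers imports used by buildDataSets are in scope and the JSONL files are present:

train_iter, valid_iter, test_iter = buildDataSets()
for batch in train_iter:
    # batch.text: (batch_size, MAX_SEQ_LEN) tensor of BERT token ids
    # batch.label: (batch_size,) tensor of labels
    print(batch.text.shape, batch.label.shape)
    break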
Example No. 2
def produce_iterators(train_filename,
                      valid_filename,
                      test_filename,
                      asr_tokenizer,
                      ttx_tokenizer=None):
    """
    Produce datasets for each of training, validation and test data. Also build vocabs for true text, tags, and ASR.
    :param train_filename: location of train data csv
    :param valid_filename: location of valid data csv
    :param test_filename: location of test data csv
    :return:
    """
    TTX = Field(tokenize=lambda x: tokenize_TTX(x, ttx_tokenizer),
                init_token='<sos>',
                eos_token='<eos>',
                lower=False,
                batch_first=True)

    TRG = Field(tokenize=tokenize_TRG,
                init_token='<sos>',
                eos_token='<eos>',
                lower=False,
                batch_first=True)

    ASR = Field(tokenize=lambda x: tokenize_ASR(x, asr_tokenizer),
                init_token='<sos>',
                eos_token='<eos>',
                lower=False,
                batch_first=True)

    fields = {
        'true_text': ('true_text', TTX),
        'tags': ('tags', TRG),
        'asr': ('asr', ASR)
    }

    train_data, valid_data, test_data = TabularDataset.splits(
        path='.\\',
        train=train_filename,
        validation=valid_filename,
        test=test_filename,
        format='csv',
        fields=fields)

    # Put min_freq at 2 or higher for real data
    TTX.build_vocab(train_data, min_freq=1)
    TRG.build_vocab(train_data, min_freq=1)
    ASR.build_vocab(train_data, min_freq=1)

    return train_data, valid_data, test_data, TTX, TRG, ASR
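A typical follow-up (not shown in the excerpt) wraps the returned datasets in BucketIterators and looks up the padding index from the freshly built vocab. A hedged sketch, where the file names and tokenizer objects are placeholders:

import torch
from torchtext.legacy.data import BucketIterator  # torchtext.data on versions < 0.9

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_data, valid_data, test_data, TTX, TRG, ASR = produce_iterators(
    'train.csv', 'valid.csv', 'test.csv', asr_tokenizer, ttx_tokenizer)

TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]  # pad index for masking / loss

train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32,
    sort_key=lambda x: len(x.asr),
    sort_within_batch=True,
    device=device)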
Example No. 3
def test(file_list):
    sentences = list(
        map(lambda x: list(sent_tokenize(open(x).read())), file_list))
    train_set = sentences[:math.floor(len(sentences) / 2)]
    test_set = sentences[math.floor(len(sentences) / 2):]
    train_data = pd.DataFrame()
    test_data = pd.DataFrame()
    for i in train_set:
        train_data = pd.concat([train_data, pd.DataFrame(i)])
    for j in test_set:
        test_data = pd.concat([test_data, pd.DataFrame(j)])

    train_data.to_csv("train_data.csv", index=False)
    test_data.to_csv("test_data.csv", index=False)

    TEXT = data.Field(sequential=True,
                      use_vocab=True,
                      tokenize=word_tokenize,
                      lower=True,
                      batch_first=True)
    LABEL = data.Field(sequential=False,
                       use_vocab=False,
                       batch_first=False,
                       is_target=True)

    train_data, test_data = TabularDataset.splits(path='.',
                                                  train='train_data.csv',
                                                  test='test_data.csv',
                                                  format='csv',
                                                  fields=[('text', TEXT),
                                                          ('label', LABEL)],
                                                  skip_header=True)

    # TEXT uses a vocabulary (use_vocab=True), so it must be built before the
    # iterators can numericalize batches
    TEXT.build_vocab(train_data)

    batch_size = 5
    train_loader = Iterator(dataset=train_data, batch_size=batch_size)
    test_loader = Iterator(dataset=test_data, batch_size=batch_size)
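    # Illustrative addition (not in the original snippet): report split sizes
    # and return the loaders so they can actually be used by the caller.
    print(len(train_data), len(test_data), len(TEXT.vocab))
    return train_loader, test_loader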
Example No. 4
label_field = Field(sequential=False,
                    use_vocab=False,
                    batch_first=True,
                    dtype=torch.float)
text_field = Field(tokenize="spacy",
                   lower=True,
                   include_lengths=True,
                   batch_first=True)
fields = [("words", text_field), ("target", label_field)]
fields_test = [("words", text_field)]

train, valid = TabularDataset.splits(
    path="data",
    train="train_rnn.csv",
    validation="valid_rnn.csv",
    format="CSV",
    fields=fields,
    skip_header=True,
)
test = TabularDataset(path="data/test_rnn.csv",
                      format="CSV",
                      fields=fields_test,
                      skip_header=True)

train_iter = BucketIterator(
    train,
    batch_size=flor.log("batch_size", 200),
    sort_key=lambda x: len(x.words),
    device=device,
    sort=True,
    sort_within_batch=True,
Example No. 5
text_field = Field(use_vocab=False,
                   tokenize=tokenizer.encode,
                   lower=False,
                   include_lengths=False,
                   batch_first=True,
                   fix_length=MAX_SEQ_LEN,
                   pad_token=PAD_INDEX,
                   unk_token=UNK_INDEX)
fields = [('index', label_field), ('text', text_field), ('label', label_field)]

# TabularDataset

train, valid, test = TabularDataset.splits(path='./data',
                                           train='IMDB_single.csv',
                                           validation='IMDBs.csv',
                                           test='IMDBs.csv',
                                           format='CSV',
                                           fields=fields,
                                           skip_header=True)

# Iterators

train_iter = BucketIterator(train,
                            batch_size=16,
                            sort_key=lambda x: len(x.text),
                            device=device,
                            train=True,
                            sort=True,
                            sort_within_batch=True)
valid_iter = BucketIterator(valid,
                            batch_size=16,
Example No. 6
    init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
    eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
    pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
    unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

    TEXT = Field(batch_first=True,
                 use_vocab=False,
                 tokenize=tokenize_and_cut,
                 preprocessing=tokenizer.convert_tokens_to_ids,
                 init_token=init_token_idx,
                 eos_token=eos_token_idx,
                 pad_token=pad_token_idx,
                 unk_token=unk_token_idx)
    LABEL = LabelField(dtype=torch.long, use_vocab=False)
    fields = [('data', TEXT), ('label', LABEL)]
    train, valid, test = TabularDataset.splits(path=source_folder, train='train.csv', validation='validation.csv', test='test.csv',
                                               format='CSV', fields=fields, skip_header=True)

    train_generator, val_generator, test_generator = BucketIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        device=device, sort=False)

    criterion = nn.CrossEntropyLoss()

    criterion = criterion.to(device)
    all_statedict_path = glob.glob('/root/logs/*.pth')
    for state_dict_path in all_statedict_path:
        print(state_dict_path)
        epoch_loss = 0
        epoch_acc = 0
        model = phobert_lstm(phobert_path=phobert_path,
Example No. 7
print(device)

#python -m spacy download en
spacy_en = spacy.load("en")

def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

Texto = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)
Valoracion = Field(sequential=False, use_vocab=False)

fields = {"Texto": ("t", Texto), "Valoracion": ("v", Valoracion)}

train_data, test_data = TabularDataset.splits(
                                        path='/content/Dataset',
                                        train='train.csv',
                                        test='test.csv',
                                        format='csv',
                                        fields=fields)

len(train_data), len(test_data)

print(vars(train_data.examples[0]))

Texto.build_vocab(train_data, max_size=10000, min_freq=1, vectors="glove.6B.100d")

Texto.vocab.freqs.most_common(25)

Texto.vocab.itos[:10]

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=2, device=device
Example No. 8
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import flor
flor.flags.NAME = 'kaggle-nlp-disasters-rnn'
flor.flags.REPLAY = False
device = torch.device(('cuda:0' if torch.cuda.is_available() else 'cpu'))
device
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(tokenize='spacy', lower=True, include_lengths=True, batch_first=True)
fields = [('words', text_field), ('target', label_field)]
fields_test = [('words', text_field)]
(train, valid) = TabularDataset.splits(path='data', train='train_rnn.csv', validation='valid_rnn.csv', format='CSV', fields=fields, skip_header=True)
test = TabularDataset(path='data/test_rnn.csv', format='CSV', fields=fields_test, skip_header=True)
train_iter = BucketIterator(train, batch_size=200, sort_key=(lambda x: len(x.words)), device=device, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=200, sort_key=(lambda x: len(x.words)), device=device, sort=True, sort_within_batch=True)
test_iter = BucketIterator(test, batch_size=200, sort_key=(lambda x: len(x.words)), device=device, sort=True, sort_within_batch=True)
text_field.build_vocab(train, min_freq=5)

class LSTM(nn.Module):

    def __init__(self, dimension=128):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(len(text_field.vocab), 300)
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=300, hidden_size=dimension, num_layers=1, batch_first=True, bidirectional=True)
        self.drop = nn.Dropout(p=0.5)
        self.fc = nn.Linear((2 * dimension), 1)
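    # NOTE: the listing is cut off here. A forward pass for this bidirectional
    # LSTM would typically look like the sketch below (illustrative only, using
    # the pack_padded_sequence / pad_packed_sequence imports from above).
    def forward(self, text, text_len):
        text_emb = self.embedding(text)
        packed_input = pack_padded_sequence(text_emb, text_len.cpu(),
                                            batch_first=True)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        # concatenate the last forward-direction state with the first
        # backward-direction state
        out_forward = output[range(len(output)), text_len - 1, :self.dimension]
        out_reverse = output[:, 0, self.dimension:]
        text_fea = self.drop(torch.cat((out_forward, out_reverse), 1))
        return torch.squeeze(self.fc(text_fea), 1)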
Example No. 9
INPUT = Field(sequential=True,
              tokenize=tokenize,
              init_token='<sos>',
              eos_token='<eos>',
              lower=True)
TARGET = Field(sequential=True,
               tokenize=tokenize,
               init_token='<sos>',
               eos_token='<eos>',
               lower=True)

datafields = [("input", INPUT), ("target", TARGET)]

trn, vld, tst = TabularDataset.splits(path="data/" + data_size,
                                      train=train_csv,
                                      validation=validation_csv,
                                      test=test_csv,
                                      format='csv',
                                      skip_header=True,
                                      fields=datafields)

print(f"Number of {data_size} training examples: {len(trn.examples)}")
print(f"Number of {data_size} validation examples: {len(vld.examples)}")
print(f"Number of {data_size} test examples: {len(tst.examples)}")

INPUT.build_vocab(trn)
TARGET.build_vocab(trn)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iter, val_iter, test_iter = BucketIterator.splits(
    (trn, vld, tst),
Example No. 10
    #                                                     fields = (SRC, TRG))

    # fetch from Github repo
    # !wget
    # https://raw.githubusercontent.com/tberg12/cse291spr21/main/assignment1/train.json
    # !wget
    # https://raw.githubusercontent.com/tberg12/cse291spr21/main/assignment1/valid.json
    # !wget
    # https://raw.githubusercontent.com/tberg12/cse291spr21/main/assignment1/test.json

    # and load to same variables
    fields = {'src': ('src', SRC), 'trg': ('trg', TRG)}
    train_data, valid_data, test_data = TabularDataset.splits(
        path='.',
        train='train.json',
        validation='valid.json',
        test='test.json',
        format='json',
        fields=fields)

    SRC.build_vocab(train_data, min_freq=2)
    TRG.build_vocab(train_data, min_freq=2)

    BATCH_SIZE = 128

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        sort_within_batch=True,
Example No. 11
# Model parameters
MAX_SEQ_LEN = 128
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
fields = [('label', label_field), ('titletext', text_field)]

# TabularDataset
source_folder = "data/real_fake_news"
destination_folder = "outs/debug"
train, valid, test = TabularDataset.splits(path=source_folder, train='train{}.csv'.format(debug_flag), validation='valid{}.csv'.format(debug_flag),
                                           test='test{}.csv'.format(debug_flag), format='CSV', fields=fields, skip_header=True)

# Iterators

train_iter = BucketIterator(train, batch_size=16, sort_key=lambda x: len(x.titletext),
                            device=device, train=True, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=16, sort_key=lambda x: len(x.titletext),
                            device=device, train=True, sort=True, sort_within_batch=True)
test_iter = Iterator(test, batch_size=16, device=device, train=False, shuffle=False, sort=False)


# Build model

class BERT(nn.Module):

    def __init__(self):
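        # NOTE: the listing is cut off here. Examples following this pattern
        # usually wrap a Hugging Face sequence-classification head, roughly as
        # sketched below (illustrative only; BertForSequenceClassification
        # would need to be imported from transformers).
        super(BERT, self).__init__()
        self.encoder = BertForSequenceClassification.from_pretrained('bert-base-uncased')

    def forward(self, text, label):
        # labels are cast to int64 because the classification loss expects
        # integer class indices, while the label field above uses torch.float
        loss, text_fea = self.encoder(text, labels=label.long())[:2]
        return loss, text_fea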
Example No. 12
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    parser = argparse.ArgumentParser()
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-lr_mul', type=float, default=2.0)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-output_dir', type=str, default=None)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)

    opt = parser.parse_args()

    english = Field(sequential=True,
                    use_vocab=True,
                    tokenize=tokenize_eng,
                    lower=True,
                    pad_token='<blank>',
                    init_token='<s>',
                    eos_token='</s>')

    german = Field(sequential=True,
                   use_vocab=True,
                   tokenize=tokenize_ger,
                   lower=True,
                   pad_token='<blank>',
                   init_token='<s>',
                   eos_token='</s>')

    fields = {'English': ('eng', english), 'German': ('ger', german)}
    train_data, test_data = TabularDataset.splits(path='',
                                                  train='train.json',
                                                  test='test.json',
                                                  format='json',
                                                  fields=fields)

    english.build_vocab(train_data, max_size=1000, min_freq=1)
    print('[Info] Get source language vocabulary size:', len(english.vocab))

    german.build_vocab(train_data, max_size=1000, min_freq=1)
    print('[Info] Get target language vocabulary size:', len(german.vocab))

    batch_size = opt.batch_size
    # data = pickle.load(open(opt.data_file, 'rb'))

    opt.src_pad_idx = english.vocab.stoi['<blank>']
    opt.trg_pad_idx = german.vocab.stoi['<blank>']

    opt.src_vocab_size = len(english.vocab)
    opt.trg_vocab_size = len(german.vocab)

    devices = [0, 1, 2, 3]
    pad_idx = opt.trg_vocab_size
    model = make_model(len(english.vocab), len(german.vocab), N=6)
    model.cuda()
    criterion = LabelSmoothing(size=len(german.vocab),
                               padding_idx=pad_idx,
                               smoothing=0.1)
    criterion.cuda()
    BATCH_SIZE = 12000
    train_iter = MyIterator(train_data,
                            batch_size=BATCH_SIZE,
                            device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.eng), len(x.ger)),
                            batch_size_fn=batch_size_fn,
                            train=True)
    valid_iter = MyIterator(test_data,
                            batch_size=BATCH_SIZE,
                            device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.eng), len(x.ger)),
                            batch_size_fn=batch_size_fn,
                            train=False)
    model_par = nn.DataParallel(model, device_ids=devices)

    model_opt = NoamOpt(
        model.src_embed[0].d_model, 1, 2000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98),
                         eps=1e-9))
    for epoch in range(10):
        model_par.train()
        run_epoch((rebatch(pad_idx, b) for b in train_iter), model_par,
                  MultiGPULossCompute(model.generator,
                                      criterion,
                                      devices=devices,
                                      opt=model_opt))
        model_par.eval()
        loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter), model_par,
                         MultiGPULossCompute(model.generator,
                                             criterion,
                                             devices=devices,
                                             opt=None))
        print(loss)

    for i, batch in enumerate(valid_iter):
        src = batch.src.transpose(0, 1)[:1]
        src_mask = (src != english.vocab.stoi["<blank>"]).unsqueeze(-2)
        out = greedy_decode(model,
                            src,
                            src_mask,
                            max_len=60,
                            start_symbol=german.vocab.stoi["<s>"])
        print("Translation:", end="\t")
        for i in range(1, out.size(1)):
            sym = german.vocab.itos[out[0, i]]
            if sym == "</s>": break
            print(sym, end=" ")
        print()
        print("Target:", end="\t")
        for i in range(1, batch.trg.size(0)):
            sym = german.vocab.itos[batch.trg.data[i, 0]]
            if sym == "</s>": break
            print(sym, end=" ")
        print()
        break
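A standard entry-point guard (not shown in the excerpt) would invoke this function when the script is run directly:

if __name__ == '__main__':
    main()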
Example No. 13
    unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)
    max_input_length = tokenizer.max_model_input_sizes['vinai/phobert-base']
    TEXT = Field(batch_first=True,
                 use_vocab=False,
                 tokenize=tokenize_and_cut,
                 preprocessing=tokenizer.convert_tokens_to_ids,
                 init_token=init_token_idx,
                 eos_token=eos_token_idx,
                 pad_token=pad_token_idx,
                 unk_token=unk_token_idx)
    LABEL = LabelField(dtype=torch.long, use_vocab=False)
    fields = [('data', TEXT), ('label', LABEL)]
    train, valid, test = TabularDataset.splits(path=SOURCE_FOLDER,
                                               train='train.csv',
                                               validation='validation.csv',
                                               test='test.csv',
                                               format='CSV',
                                               fields=fields,
                                               skip_header=True)

    train_generator, val_generator, test_generator = BucketIterator.splits(
        (train, valid, test), batch_size=BATCH_SIZE, device=device, sort=False)

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    writer = tensorboardX.SummaryWriter()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    global_count = 0
    for epoch in range(NUM_EPOCHS):
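        # NOTE: the listing is cut off here. A minimal epoch body for this
        # setup might look like the following sketch (illustrative only; the
        # model's forward signature may differ in the original code).
        model.train()
        for batch in train_generator:
            optimizer.zero_grad()
            predictions = model(batch.data)
            loss = criterion(predictions, batch.label)
            loss.backward()
            optimizer.step()
            global_count += 1
            writer.add_scalar('train/loss', loss.item(), global_count)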