def preprocess(self, train_csv):
        '''Returns the Dataset'''

        # Create the fields
        self.fields = {
            'query_text': ('query', self.que_f),
            'program_text': ('program', self.prog_f)
        }

        # Create dataset object
        train_data = TabularDataset.splits(path="./",
                                           train=train_csv,
                                           format="csv",
                                           fields=self.fields)[0]

        # Build vocabulary
        self.que_f.build_vocab(train_data, max_size=100, min_freq=1)
        self.prog_f.build_vocab(train_data,
                                max_size=100,
                                min_freq=1,
                                specials=['<nxt>'])

        return train_data
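# Usage sketch (an assumption, not part of the original example): the returned dataset can be
# wrapped in a torchtext BucketIterator, assuming que_f/prog_f are torchtext Fields and
# BucketIterator is imported from torchtext.data.
#
#   train_data = self.preprocess("train.csv")
#   train_iter = BucketIterator(train_data, batch_size=32,
#                               sort_key=lambda ex: len(ex.query), shuffle=True)
#   for batch in train_iter:
#       queries, programs = batch.query, batch.program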
Example #2
    def __init__(self, train_data = 'offenseval-training-v1.tsv',
                 trained_cnn_model = 'MIDAS_CNN.pt',
                 trained_blstm_model = 'MIDAS_BLSTM.pt',
                 trained_blstmGru_model = 'MIDAS_BLSTM-GRU.pt'):

        self.tokenize = lambda x: nltk.word_tokenize(x.lower())
        
        self.TEXT = Field(sequential = True, tokenize = self.tokenize, lower = True, include_lengths=True)
        self.LABEL = Field(sequential = False, use_vocab = False, dtype = torch.float)
        self.ID = Field(sequential = False, use_vocab = False)
        
        off_datafields = [('id', None), ('text', self.TEXT), ('label', self.LABEL), ('is_target', None), ('target', None)]
        
        trn = TabularDataset.splits(path='.', train=train_data, format='tsv', fields=off_datafields)[0]

        self.TEXT.build_vocab(trn, vectors='glove.6B.200d')
        
        self.BATCH_SIZE = 64
                
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        
        # load pre-trained model
        self.cnn_model = torch.load(trained_cnn_model)
        self.blstm_model = torch.load(trained_blstm_model)
        self.blstmGru_model = torch.load(trained_blstmGru_model)
Example #3
def preprocess_couplet():
    SRC = Field(include_lengths=True,
                init_token="<sos>",
                eos_token="<eos>",
                pad_token="<pad>",
                unk_token="<unk>",
                lower=True,
                batch_first=False,
                tokenize=lambda text: text.split())
    TRG = Field(include_lengths=True,
                init_token="<sos>",
                eos_token="<eos>",
                pad_token="<pad>",
                unk_token="<unk>",
                lower=True,
                batch_first=False,
                tokenize=lambda text: text.split())
    _train, _test = TabularDataset.splits(path="data/couplet", root="data", train="train.tsv", test="test.tsv",
                                    format='csv', skip_header=False, fields=[("src", SRC), ("trg", TRG)],
                                    csv_reader_params={"quoting": csv.QUOTE_NONE, "delimiter": "\t"})
    SRC.build_vocab(_train.src, _train.trg, min_freq=1)
    TRG.vocab = SRC.vocab
    return _train, _test, SRC, TRG
 def init_dataset(self, root_path, train_path, dev_path, test_path, isSkipHead):
     if self.isBertCat:
         if train_path and dev_path and test_path:
             return BertTabularDataset_MultipleChoice.splits(
                 path = root_path,
                 train = train_path, validation = dev_path, test = test_path,
                 format='tsv',
                 question_fix_length = 40,
                 fields=self.dataset_field,
                 bert_fields=self.bert_field,
                 skip_header=isSkipHead
             )
         else:
             return BertTabularDataset_MultipleChoice(
                 path = os.path.join(root_path, train_path), 
                 format='tsv',
                 question_fix_length = 40,
                 fields=self.dataset_field,
                 bert_fields=self.bert_field,
                 skip_header=isSkipHead
             ), None, None
     else:
         if train_path and dev_path and test_path:
             return TabularDataset.splits(
                 path = root_path,
                 train = train_path, validation = dev_path, test = test_path,
                 format='tsv',
                 fields=self.dataset_field,
                 skip_header=isSkipHead
             )
         else:
             return TabularDataset(
                 path = os.path.join(root_path, train_path), 
                 format='tsv',
                 fields=self.dataset_field,
                 skip_header=isSkipHead
             ), None, None
Example #5
File: loadData.py  Project: sairoopb/MaPP
 def get(self):
     REVIEW = Field(tokenize=self.en_tokenizer,
                    init_token='<sos>',
                    eos_token='<eos>',
                    stop_words=STOP_WORDS,
                    use_vocab=True)
     INPUT_H = Field(sequential=False,
                     use_vocab=False,
                     pad_token=None,
                     unk_token=None,
                     dtype=torch.float32)
     INPUT_F = Field(sequential=False,
                     use_vocab=False,
                     pad_token=None,
                     unk_token=None,
                     dtype=torch.float32)
     OUTPUT = Field(sequential=False,
                    use_vocab=False,
                    pad_token=None,
                    unk_token=None,
                    dtype=torch.float32)
     fields = {
         'Review': ('r', REVIEW),
         'Input Hidden': ('h', INPUT_H),
         'Input Final': ('f', INPUT_F),
         'Output': ('o', OUTPUT)
     }
     trainds, valds, testds = TabularDataset.splits(path='./',
                                                    train='train.json',
                                                    validation='val.json',
                                                    test='test.json',
                                                    format='json',
                                                    fields=fields)
     REVIEW.build_vocab(trainds, valds)
     length_of_vocab = len(REVIEW.vocab)
     return trainds, valds, testds, length_of_vocab
Example #6
def load_dataset(config, device):

    label_dict = {"observing": 0, "against": 1, "for": 2}
    LABEL = Field(use_vocab = False, sequential = False,\
     dtype = torch.long, preprocessing = lambda x: label_dict[x.strip()])

    SEQ = Field(dtype = torch.long, lower = True, batch_first = True,\
     preprocessing = lambda x:x[:45], include_lengths = True)
    SENT = Field(dtype = torch.long, lower = True, batch_first = True,\
     preprocessing = lambda x:x[:45], include_lengths = False)

    DOC = NestedField(SENT, tokenize = lambda s:s.strip().split(' </s> '), \
     preprocessing = lambda s:[x for x in s[:45] if x], dtype = torch.long,\
     include_lengths = True)

    fields = [('label', LABEL), ('claim', SEQ), ('hline', SEQ),\
     ('abst', SEQ), ('body', DOC)]

    train, test = TabularDataset.splits(path="../stance_data/", format = "tsv",\
     fields = fields, train = config.train_file, test = config.test_file)
    train, val = train.split(split_ratio=0.80)

    vectors = GloVe(name="6B",
                    dim=config.embed_dim,
                    cache='/users4/jwduan/vectors/')
    DOC.build_vocab(train, val, test, vectors=vectors)

    # share the nested DOC field's vocabulary with the flat SEQ field
    SEQ.vocab = DOC.vocab

    config.vocab_size = len(DOC.vocab)
    train_loader, val_loader, test_loader = Iterator.splits((train, val, test),\
     batch_sizes = (config.batch_size, 256, 256), sort_key = lambda x:len(x.body), sort = True,
      device = device, shuffle = True, repeat = False)

    return (train_loader, val_loader, test_loader), DOC.vocab.vectors
Example #7
def generate_equation_for_torch(allowed_operators: Iterable, min_value: int,
                                max_value: int, train_size: int,
                                validation_size: int, test_size: int, x: Field,
                                y: Field):
    train_samples = generate_equations(allowed_operators, train_size,
                                       min_value, max_value)
    test_samples = generate_equations(allowed_operators, test_size, min_value,
                                      max_value)
    validation_samples = generate_equations(allowed_operators, validation_size,
                                            min_value, max_value)
    with open('tmp_train.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['x', 'y'])
        writer.writerows(train_samples)
    with open('tmp_validation.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['x', 'y'])
        writer.writerows(validation_samples)
    with open('tmp_test.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['x', 'y'])
        writer.writerows(test_samples)

    train, validation, test = TabularDataset.splits(
        path='',
        train='tmp_train.csv',
        validation='tmp_validation.csv',
        test='tmp_test.csv',
        fields=[('x', x), ('y', y)],
        format='csv',
        skip_header=True)
    os.remove('tmp_train.csv')
    os.remove('tmp_validation.csv')
    os.remove('tmp_test.csv')

    return train, validation, test
Example #8
def load_dataset(config, device):

    LABEL = Field(sequential = False, dtype = torch.long, use_vocab = False,\
     batch_first = True, preprocessing = lambda x:1 if float(x) > 0. else 0)
    TARGET = Field(batch_first=True,
                   lower=True,
                   dtype=torch.long,
                   preprocessing=lambda x: x[0].split('_'),
                   include_lengths=True)

    TEXT = Field(dtype = torch.long, lower = True, batch_first = True,\
     preprocessing = lambda x:x[:50])# [w for w in x if w not in stopwords_set][:50])

    LEADS = NestedField(TEXT, dtype = torch.long, include_lengths = True,\
     tokenize = lambda s: s.split('</s>'), preprocessing = lambda x:x[-5:])

    DOC = NestedField(TEXT, dtype = torch.long, include_lengths = True,\
     tokenize = lambda s: s.split('</s>'), preprocessing = lambda x:[s for s in x[1:50] if s])
    DOCS = NestNestedField(DOC, dtype = torch.long, include_lengths = True,\
     tokenize = lambda s: s.split('</p>'), preprocessing = lambda x:x[-5:])

    fields = [('label', LABEL), ('target', TARGET), ('leads', LEADS),
              ('docs', DOCS)]
    train, val, test = TabularDataset.splits(path="../abrt_data/", format = "tsv", \
     fields = fields, train = config.train_file, validation = config.dev_file, test = config.test_file)

    TARGET.build_vocab(train, val, test)
    DOCS.build_vocab(train, val, test)

    config.wvocab_size = len(DOCS.vocab)
    config.tvocab_size = len(TARGET.vocab)
    # sort = False,
    train_loader, val_loader, test_loader = BucketIterator.splits((train, val, test),\
     sort_key = lambda x: len(x.docs), sort = True, batch_sizes = (config.batch_size, 32, 32),\
     device = device, repeat = False)
    return (train_loader, val_loader, test_loader)
def pad_under_five(toknized):
    """
    모델에서 5-gram 단위 필터를 사용하기 때문에
    5-gram이 안되는 문장에 <pad>로 채워준다
    """
    if len(toknized) < 5:
        toknized.extend(["<pad>"]*(5-len(toknized)))
    return toknized
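# A small worked illustration (an assumption, not part of the original snippet):
#   pad_under_five(["재밌다"])                         -> ["재밌다", "<pad>", "<pad>", "<pad>", "<pad>"]
#   pad_under_five(["정말", "재밌는", "영화", "였다", "!"]) -> returned unchanged (already 5 tokens)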

TEXT = Field(tokenize=tagger.morphs,lower=True,include_lengths=False,batch_first=True,preprocessing=pad_under_five)
LABEL = Field(sequential=False,use_vocab=True,unk_token=None)

train_data, test_data = TabularDataset.splits(path=DATA_PATH+'/nsmc/',
 train='ratings_train.txt',
 test='ratings_test.txt',
 format='tsv', 
 skip_header=True, 
 fields=[('id',None),('text',TEXT),('label',LABEL)], 
 filter_pred = lambda x: True if len(x.text) > 1 else False) 
# only keep examples whose token-level sentence length is greater than 1

TEXT.build_vocab(train_data,min_freq=2)
LABEL.build_vocab(train_data)

# print (TEXT.vocab)
# print (len(TEXT.vocab),len(LABEL.vocab))

# print (TEXT.vocab.itos[:5])
# print (LABEL.vocab.itos)

train_loader, test_loader = BucketIterator.splits((train_data, test_data), sort_key=lambda x: len(x.text), sort_within_batch=True,
                                                  batch_size=32, repeat=False)  # the original snippet is cut off here; batch_size is an assumed value
Example #10
def train_data():
    tokenize = lambda x: x.split()

    Text_src = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>', include_lengths=True, lower=True)
    Answer = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>', include_lengths=True, lower=True)
    Text_tgt = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                     include_lengths=True, init_token='<SOS>', lower=True)


    trn_datafields = [("source",Text_src),
                    ("target", Text_tgt),
                    ("answer", Answer)]
    trn, val = TabularDataset.splits(
        path="../data/"+str(data_name), # the root directory where the data lies
        train='train.json', validation = 'validation.json',
        format='json',
        # skip_header=True,  # if your csv file has a header, pass this so the header row isn't processed as data
        fields={'source': trn_datafields[0], 'target': trn_datafields[1], 'answer': trn_datafields[2]})

    # Text_src.build_vocab(trn, max_size=vocab_size)
    Text_src.build_vocab(trn, max_size=src_vocab_size)
    Text_tgt.build_vocab(trn, max_size=tgt_vocab_size)
    Answer.build_vocab(trn)
    Text_src.vocab.load_vectors("glove.840B.300d")
    Text_tgt.vocab.load_vectors("glove.840B.300d")

    train_iter, val_iter = BucketIterator.splits(
            (trn, val), # we pass in the datasets we want the iterator to draw data from
            batch_sizes= (batch_size, batch_size),
            device=-1, # if you want to use the GPU, specify the GPU number here
            sort_key=lambda x: len(x.source), # the BucketIterator needs to be told what function it should use to group the data.
            sort_within_batch=True,
            shuffle = True,
            repeat= False)


    Text_tgt_r = ReversibleField(sequential=True, include_lengths=True,
                                 eos_token='<EOS>', init_token='<SOS>', lower=True)
    Text_tgt_r.vocab = Text_tgt.vocab

    Text_src_r = ReversibleField(sequential=True, include_lengths=True,
                                 eos_token='<EOS>', lower=True)
    Text_src_r.vocab = Text_src.vocab

    Text_ans_r = ReversibleField(sequential=True, tokenize=tokenize,
                   eos_token='<EOS>', include_lengths=True, lower=True)
    Text_ans_r.vocab = Answer.vocab

    src_pad = Text_src.vocab.stoi['<pad>']
    src_unk = Text_src.vocab.stoi['<unk>']
    src_eos = Text_src.vocab.stoi['<EOS>']
    src_special = [src_pad, src_unk, src_eos]

    ans_pad = Answer.vocab.stoi['<pad>']
    ans_unk = Answer.vocab.stoi['<unk>']
    ans_eos = Answer.vocab.stoi['<EOS>']
    ans_special = [ans_pad, ans_unk, ans_eos]

    tgt_pad = Text_tgt.vocab.stoi['<pad>']
    tgt_unk = Text_tgt.vocab.stoi['<unk>']
    tgt_eos = Text_tgt.vocab.stoi['<EOS>']
    tgt_sos = Text_tgt.vocab.stoi['<SOS>']
    tgt_special = [tgt_pad, tgt_unk, tgt_eos, tgt_sos]


    # discriminator data iterator
    passage = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>', include_lengths=True, lower=True)
    ans = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>', include_lengths=True, lower=True)
    ques = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',include_lengths=True, lower=True)
    target = Field(sequential=False, use_vocab=False)

    disc_trn_datafields = [("question", ques),
                      ("answer", ans),
                      ("passage", passage),
                      ("target", target)]

    disc_trn = TabularDataset(
        path="../data/" + str(data_name) + "/disc.json",  # the root directory where the data lies
        # train='disc.json',
        format='json',
        # skip_header=True,  # if your csv file has a header, pass this so the header row isn't processed as data
        fields={'question': disc_trn_datafields[0], 'answer': disc_trn_datafields[1], 'passage': disc_trn_datafields[2], 'target': disc_trn_datafields[3]})

    passage.vocab = Text_src.vocab
    ans.vocab = Answer.vocab
    ques.vocab = Text_tgt.vocab

    disc_train_iter = BucketIterator(
        dataset=disc_trn,  # we pass in the datasets we want the iterator to draw data from
        batch_size = batch_size,
        device=-1,  # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.question),
        # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=True,
        shuffle=True,
        repeat=False)



    # raw data iterator
    Text_tgt_raw = ReversibleField(sequential=True, tokenize=tokenize, include_lengths=True, lower=True)

    trn_datafields = [("source", Text_tgt_raw),
                      ("target", Text_tgt_raw)]
    trn_raw, val_raw = TabularDataset.splits(
        path="../data/"+str(data_name),  # the root directory where the data lies
        train='train.json', validation='validation.json',
        format='json',
        # skip_header=True,
        # if your csv file has a header, pass this so the header row isn't processed as data
        fields={'source': trn_datafields[0], 'target': trn_datafields[1]})


    Text_tgt_raw.build_vocab(val_raw)

    train_iter_raw, val_iter_raw = BucketIterator.splits(
        (trn_raw, val_raw),  # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(batch_size, batch_size),
        device=-1,  # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.source),
        # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=True,
        shuffle=True,
        repeat=False)


    return train_iter, val_iter, src_special, tgt_special, Text_tgt_r, val_iter_raw, Text_tgt_raw, Text_src_r,\
           Text_src, Text_tgt, ans_special, Text_ans_r, disc_train_iter
Example #11
def tokenize_en(sentence):
    return [tok.text for tok in en.tokenizer(sentence)]


def tokenize_fr(sentence):
    return [tok.text for tok in fr.tokenizer(sentence)]


EN_TEXT = Field(tokenize=tokenize_en)
FR_TEXT = Field(tokenize=tokenize_fr, init_token='<sos>', eos_token='<eos>')

# associate the text in the 'English' column with the EN_TEXT field, and 'French' with FR_TEXT
data_fields = [('English', EN_TEXT), ('French', FR_TEXT)]
train, val = TabularDataset.splits(path='data_small',
                                   train='train.csv',
                                   validation='val.csv',
                                   format='csv',
                                   fields=data_fields)

FR_TEXT.build_vocab(train, val)
EN_TEXT.build_vocab(train, val)

max_src_in_batch, max_tgt_in_batch = 100, 100


def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    # the rest of the function is cut off in the source; completed with the standard dynamic-batching recipe (an assumption)
    max_src_in_batch = max(max_src_in_batch, len(new.English))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.French) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)
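# A minimal usage sketch (an assumption, not part of the original snippet): batch_size_fn can be
# passed to a torchtext iterator so that batches are built up to a token budget rather than a
# fixed number of examples. BucketIterator is assumed to be importable from torchtext.data, and
# 1500 is a hypothetical token budget.
#
#   train_iter = BucketIterator(train, batch_size=1500, batch_size_fn=batch_size_fn,
#                               sort_key=lambda x: len(x.English), shuffle=True)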
Example #12
def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]


#tokenize = lambda x: x.split()

# preprocess
quote = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)
score = Field(sequential=False, use_vocab=False)

fields = {'quote': ('q', quote), 'score': ('s', score)}

train_data, test_data = TabularDataset.splits(
    path='data',
    train='train.json',
    test='test.json',
    #validation='validation.json',
    format='json',
    fields=fields)
print(train_data[0].__dict__.keys())
print(train_data[0].__dict__.values())
'''
Alternative: the same split loaded from CSV files instead of JSON.
train_data, test_data = TabularDataset.splits(
                                        path='data',
                                        train='train.csv',
                                        test='test.csv',
                                        format='csv',
                                        fields=fields
                                    )
'''
Example #13
from utils.custom_utils import create_sentence, tokenize_text, modified_bleu, foldify
from utils.utils import save_checkpoint, load_checkpoint

input_text = Field(tokenize=tokenize_text,
                   lower=True,
                   init_token="<sos>",
                   eos_token="<eos>")
output_text = Field(tokenize=tokenize_text,
                    lower=True,
                    init_token="<sos>",
                    eos_token="<eos>")

fields = {'Input': ('i', input_text), 'Output': ('o', output_text)}

big_data = TabularDataset.splits(path="",
                                 train="./shuffledgutenberg.json",
                                 format='json',
                                 fields=fields)

input_text.build_vocab(big_data[0], max_size=20_000,
                       min_freq=8)  # , vectors='fasttext.simple.300d'
output_text.build_vocab(big_data[0], max_size=20_000,
                        min_freq=8)  # , vectors='fasttext.simple.300d'

print("Input Vocab Size: {}".format(len(input_text.vocab)))
print("Output Vocab Size: {}".format(len(output_text.vocab)))

# We're ready to define everything we need for training our Seq2Seq model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

load_model = True
save_model = True
Example #14
DEVICE = 0 if USE_CUDA else -1

batch_size = 64

# Tokenizer
tagger = Mecab()
tagger = tagger.morphs

# Make Field
REVIEW = Field(tokenize=tagger, use_vocab=True, lower=True, #init_token="<s>", eos_token="</s>",
               include_lengths=True, batch_first=True)
LABEL = Field(sequential=False, use_vocab=False, preprocessing=lambda x: int(x))

# Get train/test data
train_data, test_data = TabularDataset.splits(
                   path="./data/", train='train_docs.txt', validation="test_docs.txt",
                   format='tsv', fields=[('review', REVIEW), ('label', LABEL)])

# Build Vocabulary
REVIEW.build_vocab(train_data)
len(REVIEW.vocab)

# Make iterator for splits
train_iter, test_iter = BucketIterator.splits(
    (train_data, test_data), batch_size=batch_size, device=DEVICE,  # device -1: CPU, device 0: an available GPU
    sort_key=lambda x: len(x.review), sort_within_batch=True, repeat=False)  # sort by the length of x.review

# parameters
V = len(REVIEW.vocab)
D = 100
H = 200
Example #15
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    #prefer to do our entire train,test,val split in the code itself as opposed to our previous script
    # remove these comments

    #data preprocessing for Qs and As.
    spacy_en = spacy.load('en')

    def tokenizer(text):  # create a tokenizer function
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = Field(sequential=True,
                 tokenize=tokenizer,
                 lower=True,
                 include_lengths=True,
                 init_token='<s>',
                 eos_token='</s>')
    analogies_datafields = [("abc", TEXT), ("d", TEXT)]

    train, val, test = TabularDataset.splits(
        path="data",  # the root directory where the data lies
        train='ngram_train.csv',
        validation="ngram_val.csv",
        test='ngram_test.csv',
        format='csv',
        skip_header=False,  # if your csv file has a header, pass this so the header row isn't processed as data
        fields=analogies_datafields)

    pretrained_vecs = torchtext.vocab.Vectors('../GloVe-1.2/life_vectors.txt')
    # note: the dataset argument was missing in the original snippet; without it the vocab
    # would contain only special tokens
    TEXT.build_vocab(
        train, vectors=pretrained_vecs)  # specials=['<pad>', '<s>', '</s>']

    if args['--cuda'] == 'cpu':
        torch_text_device = -1
    else:
        torch_text_device = 0

    training_iter, val_iter, test_iter = Iterator.splits(
        (train, val, test),
        sort_key=lambda x: len(x.abc),
        batch_sizes=(100, 20, 1),
        device=torch_text_device,
        sort_within_batch=True)

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=TEXT.vocab)
    model.train()  #sets training = True

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' %
              (uniform_init, uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    writer = SummaryWriter('logs')
    is_better_count = 0  #TODO: Remove this and debug the nonstopping part
    while True:
        epoch += 1

        for _, data in enumerate(training_iter):
            (src_sents, src_lengths), (tgt_sents, _) = data.abc, data.d

            train_iter += 1

            optimizer.zero_grad()

            batch_size = src_sents.shape[1]

            example_losses = model(src_sents, src_lengths,
                                   tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                writer.add_scalar('Train/AvgLoss',
                                  report_loss / report_examples, epoch)
                writer.add_scalar('Train/AvgPPL',
                                  math.exp(report_loss / report_tgt_words),
                                  epoch)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl, val_loss = evaluate_ppl(
                    model, val_iter)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f, dev loss %f' %
                      (train_iter, dev_ppl, val_loss),
                      file=sys.stderr)
                writer.add_scalar('Val/AvgPPL', dev_ppl, epoch)
                writer.add_scalar('Val/AvgLoss', val_loss, epoch)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                print(hist_valid_scores)
                print(valid_metric)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)
                    is_better_count = is_better_count + 1
                    print(is_better_count)
                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')
                    if is_better_count > 3:
                        print('reached maximum number of epochs!',
                              file=sys.stderr)
                        writer.close()
                        exit(0)

                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(
                            args['--lr-decay'])
                        print(
                            'load previously best model and decay learning rate to %f'
                            % lr,
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    writer.close()
                    exit(0)
Example #16
# the opening of this Field definition is cut off in the source; reconstructed from the identical definition in Example #18
label_field = Field(sequential=False,
                    use_vocab=False,
                    batch_first=True,
                    dtype=torch.float)
text_field = Field(tokenize='spacy',
                   lower=True,
                   include_lengths=True,
                   batch_first=True)
fields = [('label', label_field), ('title', text_field), ('text', text_field),
          ('titletext', text_field)]

# TabularDataset

train, valid, test = TabularDataset.splits(path=source_folder,
                                           train='train.csv',
                                           validation='valid.csv',
                                           test='test.csv',
                                           format='CSV',
                                           fields=fields,
                                           skip_header=True)

# Iterators

train_iter = BucketIterator(train,
                            batch_size=32,
                            sort_key=lambda x: len(x.text),
                            device=device,
                            sort=True,
                            sort_within_batch=True)
valid_iter = BucketIterator(valid,
                            batch_size=32,
                            sort_key=lambda x: len(x.text),
                            device=device,
                            sort=True,
                            sort_within_batch=True)  # closing arguments assumed to mirror train_iter; the snippet is cut off here
Example #17
    def __init__(self, config):
        # logger
        self.logger = logging.getLogger(config["name"])

        # data loader params
        self.config = config["data_loader"]["args"]

        data_path = self.config["data_path"]
        ensure_dir(data_path)
        self.train_path = os.path.join(data_path, self.config["train_file"])
        self.valid_path = os.path.join(data_path, self.config["valid_file"])
        self.test_path = os.path.join(data_path, self.config["test_file"])

        # limit max text length
        self.context_threshold = self.config["context_threshold"]

        self.logger.info("preprocessing data files...")
        if not os.path.exists(self.train_path) or not os.path.exists(
                self.valid_path):
            self.preprocess(type="train")
        if not os.path.exists(self.test_path):
            self.preprocess(type="test")

        # define fields
        TEXT = Field(sequential=True,
                     use_vocab=True,
                     tokenize=lambda x: x,
                     lower=True,
                     include_lengths=True,
                     batch_first=True)
        LABEL = Field(sequential=False, use_vocab=False)

        # build dataset
        self.logger.info("building dataset......")

        train_dict_fields = {'text': ('text', TEXT), 'label': ('label', LABEL)}

        self.train, self.valid, self.test = TabularDataset.splits(
            path=data_path,  # data root path
            format="json",
            train=self.config["train_file"],
            validation=self.config["valid_file"],
            test=self.config["test_file"],
            fields=train_dict_fields)

        # build vocab
        self.logger.info("building vocab....")
        TEXT.build_vocab(self.train, self.valid)

        # load pretrained embeddings
        self.logger.info("load pretrained embeddings...")
        Vectors = vocab.Vectors(self.config["pretrain_emd_file"])
        TEXT.vocab.load_vectors(Vectors)
        # just for call easy
        self.vocab = TEXT.vocab

        # build iterators
        self.logger.info("building iterators.....")
        self.train_iter, self.valid_iter = BucketIterator.splits(
            (self.train, self.valid),
            batch_sizes=(self.config["train_batch_size"],
                         self.config["valid_batch_size"]),
            device=self.config["device"],
            sort_key=lambda x: len(x.text),
            sort_within_batch=False)
        self.test_iter = BucketIterator(
            self.test,
            batch_size=self.config["test_batch_size"],
            device=self.config["device"],
            sort_key=lambda x: len(x.text),
            sort_within_batch=False)
        self.logger.info("building iterators done!")
        self.logger.info(
            "Total train data set is: {}, valid data set is: {}, test "
            "data is: {}".format(len(self.train), len(self.valid),
                                 len(self.test)))
Example #18
    def handle(self, *args, **kwargs):
        min_freq = kwargs['min_freq']
        batch_size = kwargs['batch_size']
        num_epochs = kwargs['num_epochs']
        embedding_output = kwargs['embedding_output']
        hidden_size = kwargs['hidden_size']
        num_layers = kwargs['num_layers']
        bi_lstm = kwargs['bi_lstm']
        self.stdout.write("Loading Dataset ... ")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.stdout.write("creating fields ...")
        # Fields

        label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
        text_field = Field(tokenize='moses', lower=True, include_lengths=True, batch_first=True)
        fields = [('label', label_field), ('title', text_field), ('text', text_field), ('titletext', text_field)]

        # TabularDataset
        self.stdout.write('creating TabularDataset...')
        train, valid, test = TabularDataset.splits(path='./data/preprocessed/', train='train.csv',
                                                   validation='valid.csv', test='test.csv',
                                                   format='CSV', fields=fields, skip_header=True)

        # Iterators
        self.stdout.write("Creating iterators...")
        train_iter = BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.text),
                                    device=device, sort=False, sort_within_batch=True)
        valid_iter = BucketIterator(valid, batch_size=batch_size, sort_key=lambda x: len(x.text),
                                    device=device, sort=False, sort_within_batch=True)
        test_iter = BucketIterator(test, batch_size=batch_size, sort_key=lambda x: len(x.text),
                                   device=device, sort=False, sort_within_batch=True)

        # Vocabulary
        self.stdout.write("Creating vocabulary")
        text_field.build_vocab(train, min_freq=min_freq)

        class FakeNewsNet(nn.Module):
            def __init__(self, vocab_size=len(text_field.vocab), hidden_size=300, num_layers=1, bi_lstm=True):
                super(FakeNewsNet, self).__init__()
                self.vocab_size = vocab_size
                self.hidden_size = hidden_size
                self.num_layers = num_layers
                self.bi_lstm = bi_lstm
                self.embedding = nn.Embedding(self.vocab_size, embedding_output)
                self.LSTM = nn.LSTM(input_size=embedding_output, hidden_size=self.hidden_size, num_layers=self.num_layers,
                                    bidirectional=self.bi_lstm, batch_first=True)
                self.drop = nn.Dropout(p=0.5)
                if bi_lstm:
                    self.out = nn.Linear(2 * self.hidden_size, 1)
                else:
                    self.out = nn.Linear(self.hidden_size, 1)

            def forward(self, inp, input_len):

                embeded_text = self.embedding(inp)
                packed_input = pack_padded_sequence(embeded_text, input_len, batch_first=True, enforce_sorted=False)
                packed_output, _ = self.LSTM(packed_input)
                output, _ = pad_packed_sequence(packed_output, batch_first=True)

                out_forward = output[range(len(output)), input_len - 1, :self.hidden_size]
                out_reverse = output[:, 0, self.hidden_size:]
                out_reduced = torch.cat((out_forward, out_reverse), 1)
                text_fea = self.drop(out_reduced)

                text_fea = self.out(text_fea)
                text_fea = torch.squeeze(text_fea, 1)
                text_out = torch.sigmoid(text_fea)

                return text_out

        def save_checkpoint(save_path, model, optimizer, valid_loss):

            if save_path == None:
                return

            state_dict = {'model_state_dict': model.state_dict(),
                          'optimizer_state_dict': optimizer.state_dict(),
                          'valid_loss': valid_loss}

            torch.save(state_dict, save_path)
            self.stdout.write(f'Model saved to :{save_path}')

        def load_checkpoint(load_path, model, optimizer):

            if load_path == None:
                return

            state_dict = torch.load(load_path, map_location=device)
            self.stdout.write(f'Model loaded from : {load_path}')

            model.load_state_dict(state_dict['model_state_dict'])
            optimizer.load_state_dict(state_dict['optimizer_state_dict'])

            return state_dict['valid_loss']

        def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):

            if save_path == None:
                return

            state_dict = {'train_loss_list': train_loss_list,
                          'valid_loss_list': valid_loss_list,
                          'global_steps_list': global_steps_list}

            torch.save(state_dict, save_path)
            self.stdout.write(f'Model saved to: {save_path}')

        def load_metrics(load_path):

            if load_path == None:
                return

            state_dict = torch.load(load_path, map_location=device)
            self.stdout.write(f'Model loaded from: {load_path}')

            return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

        def train(model,
                  optimizer,
                  criterion=nn.BCELoss(),
                  train_loader=train_iter,
                  valid_loader=valid_iter,
                  num_epochs=100,
                  eval_every=len(train_iter) // 2,
                  file_path='./saved',
                  best_valid_loss=float("Inf")):

            # initialize running values
            running_loss = 0.0
            valid_running_loss = 0.0
            global_step = 0
            train_loss_list = []
            valid_loss_list = []
            global_steps_list = []

            # training loop
            self.stdout.write("training ...")
            model.train()
            for epoch in range(num_epochs):
                for (labels, (title, title_len), (text, text_len), (titletext, titletext_len)), _ in train_loader:
                    labels = labels.to(device)
                    titletext = titletext.to(device)
                    titletext_len = titletext_len.to(device)
                    output = model(titletext, titletext_len)
                    loss = criterion(output, labels)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    # update running values
                    running_loss += loss.item()
                    global_step += 1

                    # evaluation step
                    if global_step % eval_every == 0:
                        model.eval()
                        with torch.no_grad():
                            # validation loop
                            for (labels, (title, title_len), (text, text_len),
                                 (titletext, titletext_len)), _ in valid_loader:
                                labels = labels.to(device)
                                titletext = titletext.to(device)
                                titletext_len = titletext_len.to(device)
                                output = model(titletext, titletext_len)

                                loss = criterion(output, labels)
                                valid_running_loss += loss.item()

                        # evaluation
                        average_train_loss = running_loss / eval_every
                        average_valid_loss = valid_running_loss / len(valid_loader)
                        train_loss_list.append(average_train_loss)
                        valid_loss_list.append(average_valid_loss)
                        global_steps_list.append(global_step)

                        # resetting running values
                        running_loss = 0.0
                        valid_running_loss = 0.0
                        model.train()

                        # self.stdout.write progress
                        self.stdout.write('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                              .format(epoch + 1, num_epochs, global_step, num_epochs * len(train_loader),
                                      average_train_loss, average_valid_loss))

                        # checkpoint
                        if best_valid_loss > average_valid_loss:
                            best_valid_loss = average_valid_loss
                            save_checkpoint(file_path + '/model.pt', model, optimizer, best_valid_loss)
                            save_metrics(file_path + '/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)

            save_metrics(file_path + '/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
            self.stdout.write('Finished Training!')

        model = FakeNewsNet(hidden_size=hidden_size, num_layers=num_layers, bi_lstm=bi_lstm).to(device)
        self.stdout.write(str(model))  # stdout.write expects a string
        optimizer = optim.Adam(model.parameters(), lr=0.01, eps=1e-6, )

        train(model=model, optimizer=optimizer, num_epochs=num_epochs, eval_every=2)
Example #19
def main():
    # -----------------get train, val and test data--------------------
    train_data, test_data = TabularDataset.splits(
        path=r"D:\ruin\data\csv_file\imdb_split", train='train_data.csv', test='test_data.csv', format='csv',
        fields=[('review', TEXT), ('sentiment', LABEL)], skip_header=True)

    train_data, eval_data = train_data.split(random_state = random.seed(RANDOM_SEED))

    print('Number of train data {}'.format(len(train_data)))
    print('Number of val data {}'.format(len(eval_data)))
    print('Number of test data {}'.format(len(test_data)))

    TEXT.build_vocab(train_data,
                     max_size=MAX_VOCAB_SIZE,
                     vectors="glove.6B.100d",
                     min_freq=10)
    LABEL.build_vocab(train_data)
    print('Unique token in Text vocabulary {}'.format(len(TEXT.vocab)))  # 250002(<unk>, <pad>)
    print(TEXT.vocab.itos)
    print('Unique token in LABEL vocabulary {}'.format(len(LABEL.vocab)))
    print(LABEL.vocab.itos)

    print('Top 20 frequency of word: \n {}'.format(TEXT.vocab.freqs.most_common(20)))
    print('Embedding shape {}'.format(TEXT.vocab.vectors.size()))

    print('Done')

    # generate dataloader
    train_iter = data.BucketIterator(train_data, batch_size=BATCH_SIZE, device=device, shuffle=True)
    eval_iter, test_iter = data.BucketIterator.splits((eval_data, test_data), batch_size=BATCH_SIZE, device=device,
                                                      sort_key=lambda x: len(x.review),
                                                      sort_within_batch=True)
    ## https://stackoverflow.com/questions/58241313/understanding-typeerror-not-supported-between-instances-of-example-and-e
    ## the link above explains why sort_key=lambda x: len(x.review) was added
    for batch_data in train_iter:
        print(batch_data.review)  # text, text_length
        print(batch_data.sentiment)  # label
        break

    # construct model
    VOCAB_SIZE = len(TEXT.vocab)
    HIDDEN_SIZE = 256
    OUTPUT_SIZE = 1
    NUM_LAYER = 2
    BIDIRECTIONAL = True
    DROPOUT = 0.5
    EMBEDDING_DIM = 100
    PAD_INDEX = TEXT.vocab.stoi[TEXT.pad_token]
    UNK_INDEX = TEXT.vocab.stoi[TEXT.unk_token]

    model = BiLSTMSentiment(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE,
                            output_size=OUTPUT_SIZE, num_layer=NUM_LAYER, bidirectional=BIDIRECTIONAL,
                            dropout=DROPOUT, pad_index=PAD_INDEX)

    # load pretrained weight of embedding layer
    pretrained_embedding = TEXT.vocab.vectors
    print(pretrained_embedding)
    pretrained_embedding[PAD_INDEX] = 0
    pretrained_embedding[UNK_INDEX] = 0
    print(pretrained_embedding)

    model.embedding.weight.data.copy_(pretrained_embedding)

    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    # criterion
    criterion = nn.BCEWithLogitsLoss()
    scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

    model = model.to(device)
    EPOCH = 5
    # MODEL_PATH = './output/bilstm_model.pth'
    # BEST_MODEL_PATH = './output/bilstm_model_best.pth'
    best_eval_loss = float('inf')
    for epoch in range(EPOCH):
        print('{}/{}'.format(epoch, EPOCH))
        train_acc, train_loss = train(model, train_iter, optimizer=optimizer, criterion=criterion)
        eval_acc, eval_loss = test(model, eval_iter, criterion=criterion)

        print('Train => acc {:.3f}, loss {:4f}'.format(train_acc, train_loss))
        print('Valid => acc {:.3f}, loss {:4f}'.format(eval_acc, eval_loss))
        scheduler.step()

        # save model
        state = {
            'vocab_size': VOCAB_SIZE,
            'embedding_dim': EMBEDDING_DIM,
            'hidden_size': HIDDEN_SIZE,
            'output_size': OUTPUT_SIZE,
            'num_layer': NUM_LAYER,
            'bidirectional': BIDIRECTIONAL,
            'dropout': DROPOUT,
            'state_dict': model.state_dict(),
            'pad_index': PAD_INDEX,
            'unk_index': UNK_INDEX,
            'text_vocab': TEXT.vocab.stoi,
        }

        # os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
        # torch.save(state, MODEL_PATH)
        if eval_loss < best_eval_loss:
            # shutil.copy(MODEL_PATH, BEST_MODEL_PATH)
            best_eval_loss = eval_loss

    test_acc, test_loss = test(model, test_iter, criterion=criterion)
    print('Test Eval => acc {:.3f}, loss {:4f}'.format(test_acc, test_loss))
Example #20
            unk_token=g_bert_tokenizer.unk_token_id)

TGT = Field(use_vocab=False,
            tokenize=g_gpt_tokenizer.tokenize,
            preprocessing=g_gpt_tokenizer.convert_tokens_to_ids,
            init_token=g_gpt_tokenizer.bos_token_id,
            eos_token=g_gpt_tokenizer.eos_token_id,
            pad_token=g_gpt_tokenizer.eos_token_id,
            unk_token=g_gpt_tokenizer.unk_token_id)

g_data_fields = [('src', SRC), ('tgt', TGT)]

train_data, validation_data, test_data = TabularDataset.splits(
    path='datasets/',
    format='csv',
    train='chat_corpus_train.csv',
    validation='chat_corpus_validation.csv',
    test='chat_corpus_test.csv',
    skip_header=False,
    fields=g_data_fields)

g_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.src),  # function used to group the data
    sort_within_batch=False,
    device=g_device)


class TransBertEncoder(nn.Module):
    ...  # the class body is cut off in the source
TEXT = data.Field(
    sequential=True,
    use_vocab=True,
    tokenize=okt.morphs,
    lower=True,
    batch_first=True,
    fix_length=20)  # force every text to fix_length; shorter texts are padded

LABEL = data.Field(sequential=False, use_vocab=False,
                   is_target=True)  # whether this field is the target variable

train_data, test_data = TabularDataset.splits(path='.',
                                              train='ratings_train.txt',
                                              test='ratings_test.txt',
                                              format='tsv',
                                              fields=[('id', ID),
                                                      ('text', TEXT),
                                                      ('label', LABEL)],
                                              skip_header=True)

print(vars(train_data[0]))
TEXT.build_vocab(train_data, min_freq=10,
                 max_size=70000)  # build word-to-index only for words that appear at least 10 times

batch_size = 50
train_loader = Iterator(dataset=train_data,
                        batch_size=batch_size,
                        shuffle=True)
test_loader = Iterator(dataset=test_data, batch_size=batch_size)

Example #22
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """
    spacy_en = spacy.load('en')

    def tokenizer(text):  # create a tokenizer function
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = Field(sequential=True,
                 tokenize=tokenizer,
                 lower=True,
                 include_lengths=True,
                 init_token='<s>',
                 eos_token='</s>')
    analogies_datafields = [("abc", TEXT), ("d", TEXT)]

    train, val, test = TabularDataset.splits(
        path="data",  # the root directory where the data lies
        train='ngram_train.csv',
        validation="ngram_val.csv",
        test='ngram_test.csv',
        format='csv',
        skip_header=False,  # if your csv file has a header, pass this so the header row isn't processed as data
        fields=analogies_datafields)

    pretrained_vecs = torchtext.vocab.Vectors('../GloVe-1.2/life_vectors.txt')
    # note: the dataset argument was missing in the original snippet; without it the vocab
    # would contain only special tokens
    TEXT.build_vocab(
        train, vectors=pretrained_vecs)  # specials=['<pad>', '<s>', '</s>']

    if args['--cuda'] == 'cpu':
        torch_text_device = -1
    else:
        torch_text_device = 0

    training_iter, val_iter, test_iter = Iterator.splits(
        (train, val, test),
        sort_key=lambda x: len(x.abc),
        batch_sizes=(100, 20, 1),
        device=torch_text_device,
        sort_within_batch=True)

    print("load test source sentences from [{}]".format(
        args['TEST_SOURCE_FILE']),
          file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(
            args['TEST_TARGET_FILE']),
              file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model,
                             test_iter,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(
                                 args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt,
                                                     top_hypotheses)

        #accuracy (unigrams)
        perfectly_correct = 0
        for index, hyp in enumerate(top_hypotheses):
            if hyp.value[0] == test_data_tgt[index][1]:
                perfectly_correct += 1
        print('Ignore accuracy for non unigrams')
        print('Accuracy: {}'.format(perfectly_correct / len(test_data_tgt)),
              file=sys.stderr)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
Example #23
decoy_strength = args.decoy_strength

seed(p)
s = S(p)
out_name = str(args.which_adversarial) + p._str(p)
use_individual = True  # XXX
torch.cuda.set_device(args.gpu)

inputs = data.Field(lower=True)
answers = data.Field(sequential=False, unk_token=None)
tv_datafields = [("text", inputs), ("label", answers)]
train, dev, test = TabularDataset.splits(
    path=dataset_path,  # the root directory where the data lies
    train='train_bias_SST.csv',
    validation="dev_bias_SST.csv",
    test="test_bias_SST.csv",
    format='csv',
    skip_header=False,  # if your csv file has a header, pass this so the header row isn't processed as data
    fields=tv_datafields)

inputs.build_vocab(train, dev, test)
if args.word_vectors:
    if os.path.isfile(args.vector_cache):
        inputs.vocab.vectors = torch.load(args.vector_cache)
    else:
        inputs.vocab.load_vectors(args.word_vectors)
        os.makedirs(os.path.dirname(args.vector_cache), exist_ok=True)
        torch.save(inputs.vocab.vectors, args.vector_cache)
answers.build_vocab(train)
class_decoy = (inputs.vocab.stoi['a'], inputs.vocab.stoi['the'])
patience = 0
decoy_strength = args.decoy_strength

seed(p)
s = S(p)

out_name = str(args.which_adversarial) + p._str(p)
torch.cuda.set_device(args.gpu)

inputs = data.Field(lower=True)
answers = data.Field(sequential=False, unk_token=None)
tv_datafields = [("text", inputs), ("label", answers)]
train, dev, test = TabularDataset.splits(
    path=dataset_path,  # the root directory where the data lies
    train='train_bias_SST.csv',
    validation="dev_bias_SST.csv",
    test="test_bias_SST.csv",
    format='csv',
    skip_header=False,
    fields=tv_datafields)

inputs.build_vocab(train, dev, test)
if args.word_vectors:
    if os.path.isfile(args.vector_cache):
        inputs.vocab.vectors = torch.load(args.vector_cache)
    else:
        inputs.vocab.load_vectors(args.word_vectors)
        os.makedirs(os.path.dirname(args.vector_cache), exist_ok=True)
        torch.save(inputs.vocab.vectors, args.vector_cache)
answers.build_vocab(train)
class_decoy = (inputs.vocab.stoi['a'], inputs.vocab.stoi['the'])
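
# Hedged sanity-check sketch (not in the original): map the decoy ids back
# to surface tokens with the vocabulary's itos list.
print('Decoy tokens:', [inputs.vocab.itos[i] for i in class_decoy])
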
def main():
    train_data, test_data = TabularDataset.splits(
        path=r"D:\ruin\data\csv_file\imdb_split",
        train='train_data.csv',
        test='test_data.csv',
        format='csv',
        fields=[('review', TEXT), ('sentiment', LABEL)],
        skip_header=True)

    train_data, valid_data = train_data.split(random_state=random.seed(SEED))

    print('Number of train data {}'.format(len(train_data)))
    print('Number of val data {}'.format(len(valid_data)))
    print('Number of test data {}'.format(len(test_data)))

    MAX_VOCAB_SIZE = 25_000

    TEXT.build_vocab(train_data,
                     max_size=MAX_VOCAB_SIZE,
                     vectors='glove.6B.100d',
                     unk_init=torch.Tensor.normal_)
    LABEL.build_vocab(train_data)

    BATCH_SIZE = 64

    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        sort_key=lambda x: len(x.review),
        device=device)

    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100
    N_FILTERS = 100
    FILTER_SIZES = [3, 4, 5]
    OUTPUT_DIM = 1
    DROPOUT = 0.5
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

    model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM,
                DROPOUT, PAD_IDX)

    pretrained_embeddings = TEXT.vocab.vectors

    model.embedding.weight.data.copy_(pretrained_embeddings)
    UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
    model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
    model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

    optimizer = torch.optim.Adam(model.parameters())

    criterion = nn.BCEWithLogitsLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    def epoch_time(start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    best_valid_loss = float('inf')
    N_EPOCHS = 5

    for epoch in range(N_EPOCHS):

        start_time = time.time()
        train_loss, train_acc = train(model, train_iterator, optimizer,
                                      criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss

        print(
            f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%'
        )

    test_loss, test_acc = evaluate(model, test_iterator, criterion)

    print(
        f'\t Test. Loss: {test_loss:.3f} |  Test. Acc: {test_acc * 100:.2f}%')
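
# Hedged sketch (assumed, not shown in the original) of the train /
# evaluate helpers called in the loop above. It uses the 'review' and
# 'sentiment' field names declared for this dataset and assumes the label
# field holds float targets suitable for BCEWithLogitsLoss.
import torch

def binary_accuracy(preds, y):
    # round sigmoid outputs to 0/1 and compare with the float labels
    rounded = torch.round(torch.sigmoid(preds))
    return (rounded == y).float().mean()

def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss, epoch_acc = 0.0, 0.0
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.review).squeeze(1)
        loss = criterion(predictions, batch.sentiment)
        acc = binary_accuracy(predictions, batch.sentiment)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss, epoch_acc = 0.0, 0.0
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.review).squeeze(1)
            loss = criterion(predictions, batch.sentiment)
            acc = binary_accuracy(predictions, batch.sentiment)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)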
Example #26
# Declaring Fields
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False)

# Creating the Dataset
tv_datafields = [("id", None), ("comment_text", TEXT), ("toxic", LABEL),
                 ("severe_toxic", LABEL),
                 ("threat", LABEL), ("obscene", LABEL), ("insult", LABEL),
                 ("identity_hate", LABEL)]

train_data, valid_data = TabularDataset.splits(
    path="data",  # the root directory where the data lies
    train="train.csv",
    validation="valid.csv",
    format='csv',
    skip_header=True,
    fields=tv_datafields)

tst_datafields = [("id", None), ("comment_text", TEXT)]

test_data = TabularDataset(path="data/test.csv",
                           format='csv',
                           skip_header=True,
                           fields=tst_datafields)

TEXT.build_vocab(train_data)
TEXT.vocab.freqs.most_common(10)
print(train_data[0])
print(train_data[0].__dict__.keys())
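
# Hedged continuation sketch (not in the original): batch the splits with
# iterators. The batch size is illustrative; sorting uses the 'comment_text'
# field declared above.
from torchtext.data import BucketIterator, Iterator

train_iter, valid_iter = BucketIterator.splits(
    (train_data, valid_data),
    batch_sizes=(64, 64),
    sort_key=lambda x: len(x.comment_text),
    sort_within_batch=False,
    repeat=False)
test_iter = Iterator(test_data,
                     batch_size=64,
                     sort=False,
                     sort_within_batch=False,
                     repeat=False)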
Example #27
def load_dataset(source_folder,
                 device,
                 tokenizer,
                 MAX_SEQ_LEN=128,
                 BATCH_SIZE=16,
                 name_of_train_dataset='train.csv',
                 name_of_validation_dataset='valid.csv',
                 name_of_test_dataset='test.csv'):

    PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

    # Fields

    label_field = Field(sequential=False,
                        use_vocab=False,
                        batch_first=True,
                        dtype=torch.float)

    text_field = Field(use_vocab=False,
                       tokenize=tokenizer.encode,
                       lower=False,
                       include_lengths=False,
                       batch_first=True,
                       fix_length=MAX_SEQ_LEN,
                       pad_token=PAD_INDEX,
                       unk_token=UNK_INDEX)
    # note, the fields must be in the same order as the input csv columns
    fields = [('clause_text', text_field), ('label', label_field)]

    # TabularDataset

    train, valid, test = TabularDataset.splits(
        path=source_folder,
        train=name_of_train_dataset,
        validation=name_of_validation_dataset,
        test=name_of_test_dataset,
        format='csv',
        fields=fields,
        skip_header=True)

    # Iterators

    train_iter = BucketIterator(train,
                                batch_size=BATCH_SIZE,
                                sort_key=lambda x: len(x.clause_text),
                                device=device,
                                train=True,
                                sort=True,
                                sort_within_batch=True)
    valid_iter = BucketIterator(valid,
                                batch_size=BATCH_SIZE,
                                sort_key=lambda x: len(x.clause_text),
                                device=device,
                                train=True,
                                sort=True,
                                sort_within_batch=True)
    test_iter = Iterator(test,
                         batch_size=BATCH_SIZE,
                         device=device,
                         train=False,
                         shuffle=False,
                         sort=False)
    return train_iter, valid_iter, test_iter
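
# Hedged usage sketch (not part of the original): load_dataset expects a
# Hugging Face style tokenizer exposing encode() and convert_tokens_to_ids();
# the model name and data folder below are illustrative.
import torch
from transformers import BertTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_iter, valid_iter, test_iter = load_dataset('data/', device, tokenizer)
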
def run_smiles_generator(test_file):

    src = Field(sequential=True,
                tokenize=tokenize_drug,
                init_token='<sos>',
                eos_token='<eos>')

    trg = Field(sequential=True,
                tokenize=tokenize_drug,
                init_token='<sos>',
                eos_token='<eos>')

    #Get the train and test set in torchtext format
    datafields = [("src", src), ("trg", trg)]  # source and target SMILES columns of the csv
    train, test = TabularDataset.splits(path='../data/SMILES_Autoencoder/',
                                        train='all_smiles_revised_final.csv',
                                        test=test_file,
                                        format='csv',
                                        skip_header=True,
                                        fields=datafields)

    #Split the dataset into train and validation set
    train_data, valid_data = train.split(split_ratio=0.99)

    print(f"Number of examples: {len(train_data.examples)}")
    src.build_vocab(train_data, min_freq=2)
    trg.build_vocab(train_data, min_freq=2)

    #Total no of unique words in our vocabulary
    print(f"Unique tokens in source vocabulary: {len(src.vocab)}")
    print(f"Unique tokens in target vocabulary: {len(trg.vocab)}")
    TRG_PAD_IDX = trg.vocab.stoi[trg.pad_token]
    print("Padding Id: ", TRG_PAD_IDX)

    #Create the iterator to traverse over test samples for which we need to generate latent space
    BATCH_SIZE = 128
    (train_iterator, test_iterator) = BucketIterator.splits(
        (train_data, test),
        batch_size=BATCH_SIZE,
        device=DEVICE,
        sort=False,
        shuffle=False)
    print(src.vocab.stoi)
    print(trg.vocab.stoi)

    #Define the model once again
    INPUT_DIM = len(src.vocab)
    OUTPUT_DIM = len(trg.vocab)
    ENC_EMB_DIM = 128
    DEC_EMB_DIM = 128
    HID_DIM = 256
    N_LAYERS = 1
    ENC_DROPOUT = 0.0
    DEC_DROPOUT = 0.0

    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

    model = Seq2Seq(enc, dec, device=DEVICE).to(DEVICE)
    model.apply(init_weights)

    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss().to(DEVICE)
    model.load_state_dict(
        torch.load('../models/lstm_out/torchtext_checkpoint.pt',
                   map_location=torch.device('cpu')))

    #Get latent space for all drugs
    model.eval()
    epoch_loss = 0

    ls_list = []
    encode_list = []
    decode_list = []
    error_list = []
    with torch.no_grad():
        for j, batch in enumerate(test_iterator):
            new_src = batch.src
            new_trg = batch.trg

            #Get output
            outputs = model(new_src, new_trg, 1)  #turn on teacher forcing
            output = outputs[0]
            hidden = outputs[1]
            cell_state = outputs[2]

            #Get latent space
            o1 = torch.argmax(torch.softmax(output, dim=2), dim=2)
            h1 = torch.mean(hidden, dim=0).cpu().detach().tolist()
            c1 = torch.mean(cell_state, dim=0).cpu().detach().tolist()

            for i in range(len(h1)):
                temp_ls = h1[i]
                temp_encode = new_trg[:, i].cpu().detach().tolist()
                temp_decode = o1[:, i].cpu().detach().tolist()
                try:
                    index_1 = temp_decode.index(1)
                except ValueError:  # no padding token (id 1) in the decoded sequence
                    index_1 = len(temp_decode)
                temp_error = np.array(temp_encode) - np.array(temp_decode)
                error = sum(
                    np.absolute(temp_error[1:index_1]) > 0) / len(temp_error)
                error_list.append(error)
                ls_list.append(temp_ls)
                encode_list.append(temp_encode)
                decode_list.append(temp_decode)

            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            rev_trg = new_trg[1:].view(-1)

            loss = criterion(output, rev_trg)
            print("Reconstruction Loss for iteration " + str(j) + " is :" +
                  str(round(loss.item(), 3)))
            epoch_loss += loss.item()

    #Print overall average error
    print("Average reconstruction error: ", epoch_loss / len(test_iterator))
    torch.cuda.empty_cache()

    final_list, only_smiles_list = [], []
    for i in range(len(encode_list)):
        temp_encode = encode_list[i]
        temp_decode = decode_list[i]
        temp_encode_str, temp_decode_str, temp_mol_str, temp_error_str = '', '', '', ''

        #Get original string
        for j in range(1, len(temp_encode)):

            #Break when it sees padding
            if (temp_encode[j] == 1):
                break

            #Don't pad end of sentence
            if (temp_encode[j] != 3):
                temp_encode_str += src.vocab.itos[temp_encode[j]]

        #Get decoded string
        for j in range(1, len(temp_decode)):

            if (temp_decode[j] == 1):
                break

            if (temp_decode[j] != 3):
                temp_decode_str += src.vocab.itos[temp_decode[j]]

        #m = Chem.MolFromSmiles(temp_decode_str)
        #if (m is not None):
        #    temp_mol_str = '1'
        #else:
        #    temp_mol_str = '0'

        #string_list = [temp_encode_str, temp_decode_str, temp_mol_str, str(error_list[i])]
        #only_smiles_list.append(string_list)
        #string_list_with_ls = string_list + ls_list[i]
        #final_list.append(string_list_with_ls)

    colids = ['LS_' + str(x) for x in range(len(ls_list[0]))]
    final_out_df = pd.DataFrame(ls_list, columns=colids)
    return (final_out_df)
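
# Hedged usage sketch: compute latent-space features for a held-out set of
# SMILES and persist them; both file names below are illustrative.
latent_df = run_smiles_generator('test_smiles.csv')
latent_df.to_csv('../data/SMILES_Autoencoder/latent_space.csv', index=False)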
Example #29
a = 'Hello World!'
print(a)
print('Tokenization: ', tokenizer(a))

#define fields
TEXT = Field(sequential=True, tokenize=tokenizer, lower=True)
LABEL = Field(sequential=False, use_vocab=False, dtype=torch.float)

fields = [("question_text", TEXT), ("label", LABEL)]

#load datasets
train_data, valid_data, test_data = TabularDataset.splits(
    path="data/",
    train='train.csv',
    validation='validation.csv',
    test='test.csv',
    format='csv',
    fields=fields,
    skip_header=True)

##print(len(train_data))
##print(len(valid_data))
##print(len(test_data))

#build vocabulary
TEXT.build_vocab(train_data)

print('Vocabulary size: ', len(TEXT.vocab))
print('First example: ', vars(train_data.examples[0]))

#print(train_data[0])
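
# Hedged sketch (not in the original): numericalize one tokenized question
# with the built vocabulary to inspect the id mapping, using the legacy
# Field.process API.
example_tokens = vars(train_data.examples[0])['question_text']
print('Token ids: ', TEXT.process([example_tokens]))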
Example #30
def main():
    # Add ckp
    parser = argparse.ArgumentParser(
        description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument(
        '--data',
        type=str,
        default='/input',  # /input
        help='location of the data corpus')
    parser.add_argument('--checkpoint',
                        type=str,
                        default='',
                        help='model checkpoint to use')
    parser.add_argument(
        '--model',
        type=str,
        default='LSTM',
        help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
    parser.add_argument('--emsize',
                        type=int,
                        default=200,
                        help='size of word embeddings')
    parser.add_argument('--nhid',
                        type=int,
                        default=200,
                        help='number of hidden units per layer')
    parser.add_argument('--nlayers',
                        type=int,
                        default=2,
                        help='number of layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='initial learning rate')
    parser.add_argument('--clip',
                        type=float,
                        default=0.25,
                        help='gradient clipping')
    parser.add_argument('--epochs',
                        type=int,
                        default=40,
                        help='upper epoch limit')
    parser.add_argument('--batch_size',
                        type=int,
                        default=256,
                        metavar='N',
                        help='batch size')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--tied',
                        action='store_true',
                        help='tie the word embedding and softmax weights')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--cuda', action='store_true', help='use CUDA')
    parser.add_argument('--log-interval',
                        type=int,
                        default=200,
                        metavar='N',
                        help='report interval')
    parser.add_argument(
        '--save',
        type=str,
        default='/output/model.pt',  # /output
        help='path to save the final model')
    args = parser.parse_args()

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda"
            )
        else:
            torch.cuda.manual_seed(args.seed)

    # Load checkpoint
    build_vocab = False
    if args.checkpoint != '' and os.path.exists(args.checkpoint):
        print(f'Loading field from {args.checkpoint}')
        save_dict = torch.load(args.checkpoint)
        field = save_dict['field']
        start_epoch = save_dict['start_epoch']
    else:
        save_dict = None
        field = Field(tokenize=split_tokenize, init_token='<init>')
        build_vocab = True
        start_epoch = 0

    ###############################################################################
    # Load data
    ###############################################################################

    train_data, val_data, test_data = TabularDataset.splits(
        path=args.data,
        train='train.txt',
        validation='valid.txt',
        test='test.txt',
        format='tsv',
        fields=[('text', field)])
    print(train_data, len(train_data), val_data, len(val_data), test_data,
          len(test_data))
    if build_vocab:
        field.eos_token = '<eos>'
        field.build_vocab(train_data, val_data, min_freq=1000)
        field.eos_token = None
    eos_id = field.vocab.stoi['<eos>']
    pad_id = field.vocab.stoi[field.pad_token]

    train_iter = BucketIterator(train_data,
                                args.batch_size,
                                train=True,
                                repeat=False,
                                device='cuda:0' if args.cuda else 'cpu:0')
    val_iter = Iterator(val_data,
                        args.batch_size,
                        repeat=False,
                        device='cuda:0' if args.cuda else 'cpu:0')
    test_iter = Iterator(test_data,
                         args.batch_size,
                         repeat=False,
                         device='cuda:0' if args.cuda else 'cpu:0')
    print(train_iter, len(train_iter), val_iter, len(val_iter), test_iter,
          len(test_iter))

    ###############################################################################
    # Build the model
    ###############################################################################

    ntokens = len(field.vocab)
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                     args.dropout, args.tied)

    if save_dict is not None:
        model.load_state_dict(save_dict['model'])

    if args.cuda:
        model.cuda()
    else:
        model.cpu()
    print(model)

    if save_dict:
        opt = save_dict['optimizer']
    else:
        opt = torch.optim.Adam(model.parameters(), lr=args.lr)

    if args.checkpoint:
        torch.save(
            dict(field=field,
                 model=model.state_dict(),
                 optimizer=opt,
                 start_epoch=start_epoch), args.checkpoint)

    ###############################################################################
    # Training code
    ###############################################################################

    criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)

    def make_target(text):
        batch_size = text.size()[1]
        eos_vector = torch.full((1, batch_size),
                                eos_id,
                                dtype=text.dtype,
                                device='cuda:0' if args.cuda else 'cpu:0')
        target = torch.cat((text[1:], eos_vector), dim=0)
        return target

    def compute_loss(output, text):
        output_flat = output.view(-1, ntokens)
        target = make_target(text)
        target_flat = target.view(-1)

        return criterion(output_flat, target_flat)

    def evaluate(data_source):
        # Turn on evaluation mode which disables dropout.
        with torch.no_grad():
            model.eval()
            total_loss = 0
            for batch in data_source:
                output, hidden = model(batch.text)
                loss = compute_loss(output, batch.text)

                total_loss += loss.item()
            return total_loss / len(data_source)

    def train():
        # Turn on training mode which enables dropout.
        model.train()
        total_loss = 0
        start_time = time.time()
        for i, batch in enumerate(train_iter):
            model.zero_grad()

            output, hidden = model(batch.text)
            target = make_target(batch.text)

            loss = compute_loss(output, batch.text)
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            opt.step()

            total_loss += loss.item()

            if i % args.log_interval == 0 and i > 0:
                cur_loss = total_loss / args.log_interval
                elapsed = time.time() - start_time
                print(
                    '| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                        epoch, i, len(train_iter),
                        elapsed * 1000 / args.log_interval, cur_loss,
                        math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()

    # Loop over epochs.
    best_val_loss = None

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in range(start_epoch, args.epochs):
            epoch_start_time = time.time()
            train()
            val_loss = evaluate(val_iter)
            print('-' * 89)
            print(
                '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch,
                                           (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
            print('-' * 89)
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                if args.checkpoint:
                    torch.save(
                        dict(field=field,
                             model=model.state_dict(),
                             optimizer=opt,
                             start_epoch=epoch), args.checkpoint)
                best_val_loss = val_loss
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    torch.save(
        dict(vocab=field.vocab.itos,
             model=model.state_dict(),
             settings=dict(rnn_type=args.model,
                           emsize=args.emsize,
                           nhid=args.nhid,
                           nlayers=args.nlayers)), args.save)

    # Load the best saved model.
    #with open(args.save, 'rb') as f:
    #    save_dict = torch.load(f)
    #    field = save_dict['field']
    #    if save_dict is not None:
    #        model.load_state_dict(save_dict['model'])
    #
    #    if args.cuda:
    #        model.cuda()
    #    else:
    #        model.cpu()

    # Run on test data.
    test_loss = evaluate(test_iter)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)
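
# Hedged sketch (assumed, not shown above) of the split_tokenize helper and
# the script entry point; the real project may tokenize differently.
def split_tokenize(text):
    return text.strip().split()

if __name__ == '__main__':
    main()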
Example #31
data_test.FULLNAME = data_test.FULLNAME.apply(split_fio)

data_test.head()

data_test.to_csv('C:/help_files/comp_test.csv', index=None)

tokenize = lambda x: x.split(' ')

TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False, is_target=True)

nation_fields = [('FULLNAME', TEXT), ('NATION', LABEL)]

trn, vld = TabularDataset.splits(path='C:/help_files/',
                                 train='train.csv',
                                 validation="test.csv",
                                 format='csv',
                                 skip_header=True,
                                 fields=nation_fields)

TEXT.build_vocab(trn)

TEXT.vocab.freqs.most_common(10)

TEXT.vocab.stoi

batch_size = 256

train_iter, val_iter = BucketIterator.splits(
    (trn, vld),
    batch_sizes=(batch_size, batch_size),
    device=device,