Example #1
def train_Seq2Seq(train_data, test_data, model, criterion, optimizer, cur_dir,
                  attack_vocab):
    best_accuracy = 0.0
    for epoch in range(AttackConfig.epochs):
        logging(f'epoch {epoch} start')
        logging(f'epoch {epoch} train Seq2Seq model')
        model.train()
        loss_mean = 0.0
        n = 0
        for x, x_mask, y, _ in train_data:
            x, x_mask, y = x.to(AttackConfig.train_device), x_mask.to(
                AttackConfig.train_device), y.to(AttackConfig.train_device)
            model.zero_grad()
            logits = model(x, x_mask, is_noise=False)
            logits = logits.reshape(-1, logits.shape[-1])
            y = y.reshape(-1)
            loss = criterion(logits, y)
            loss_mean += loss.item()
            loss.backward()
            optimizer.step()
            n += (x.shape[0] * x.shape[1])
        logging(f"epoch {epoch} train_loss is {loss_mean / n}")
        eval_accuracy = evaluate_Seq2Seq(
            test_data, model, cur_dir + f'/eval_Seq2Seq_model_epoch_{epoch}',
            attack_vocab)
        logging(f"epoch {epoch} test_acc is {eval_accuracy}")
        if best_accuracy < eval_accuracy:
            best_accuracy = eval_accuracy
            logging('Saving Seq2Seq models...')
            torch.save(model.state_dict(), cur_dir + r'/Seq2Seq_model.pt')
Example #2
    def token2idx(self):
        logging(f'{self.path} in token2idx')
        for tokens in self.data_tokens:
            self.data_idx.append(self.tokenizer.convert_tokens_to_ids(tokens))
            self.data_mask.append([1] * len(tokens))

        if self.attack_vocab:
            for tokens in self.label_tokens:
                self.label_idx.append(
                    [self.attack_vocab.get_index(token) for token in tokens])
        else:
            for tokens in self.label_tokens:
                self.label_idx.append(
                    self.tokenizer.convert_tokens_to_ids(tokens))

        for i in range(len(self.data_idx)):
            if len(self.data_idx[i]) < self.sen_len:
                self.data_idx[i] += [0] * (self.sen_len - len(self.data_idx[i]))
                self.data_mask[i] += [0] * (self.sen_len - len(self.data_mask[i]))

            if len(self.label_idx[i]) < self.sen_len:
                self.label_idx[i] += [0] * (self.sen_len -
                                            len(self.label_idx[i]))
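A minimal standalone sketch (not from the repository) illustrating the padding done at the end of token2idx above: ids and masks are right-padded with 0 up to sen_len. All names and values here are illustrative.

sen_len = 8
data_idx = [101, 2023, 2003, 102]      # hypothetical token ids
data_mask = [1] * len(data_idx)
if len(data_idx) < sen_len:
    data_idx += [0] * (sen_len - len(data_idx))
    data_mask += [0] * (sen_len - len(data_mask))
print(data_idx)   # [101, 2023, 2003, 102, 0, 0, 0, 0]
print(data_mask)  # [1, 1, 1, 1, 0, 0, 0, 0]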
Example #3
    def write_syn_csv(self, path):
        with open(path, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file, delimiter=',')
            for key, value in self.syn_dict.items():
                writer.writerow([key] + list(value))

        logging(f'wrote {self.syn_count} synonymous words to {path}')
Example #4
 def read_standard_data(self, path, sentences, debug_mode=False):
     label_classes = SNLIConfig.label_classes
     premise_data = []
     hypothesis_data = []
     labels = []
     if debug_mode:
         i = 1000
         with open(path, 'r', encoding='utf-8') as file:
             for line in file:
                 tokens = line.strip().split('\t')
                 labels.append(label_classes[tokens[0].strip()])
                 premise_data.append(sentences[int(tokens[1].strip())])
                 hypothesis_data.append(sentences[int(tokens[2].strip())])
                 i -= 1
                 if i == 0:
                     break
         logging(f'loading data {len(premise_data)} from {path}')
         return premise_data, hypothesis_data, labels
     with open(path, 'r', encoding='utf-8') as file:
         for line in file:
             tokens = line.strip().split('\t')
             labels.append(label_classes[tokens[0].strip()])
             premise_data.append(sentences[int(tokens[1].strip())])
             hypothesis_data.append(sentences[int(tokens[2].strip())])
     logging(f'loading data {len(premise_data)} from {path}')
     return premise_data, hypothesis_data, labels
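A sketch of the tab-separated line format that read_standard_data above appears to expect (label, premise index, hypothesis index); label_classes and sentences below are hypothetical stand-ins, not the real SNLIConfig values.

label_classes = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
sentences = ['A man plays guitar.', 'A person makes music.']
line = 'entailment\t0\t1\n'
tokens = line.strip().split('\t')
label = label_classes[tokens[0].strip()]        # 0
premise = sentences[int(tokens[1].strip())]     # 'A man plays guitar.'
hypothesis = sentences[int(tokens[2].strip())]  # 'A person makes music.'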
Example #5
    def data2tokens(self):
        logging(f'{self.path} in data2tokens')
        for sen in self.premise_data:
            data_tokens = ['[CLS]']
            data_tokens += self.tokenizer.tokenize(sen)[:self.sen_len - 1]
            if self.attack_vocab:
                label_tokens = self.baseline_tokenizer(sen)[:self.sen_len - 1]
                label_tokens += ['[SEP]']
            else:
                label_tokens = self.tokenizer.tokenize(sen)[:self.sen_len - 1]
                label_tokens += ['[SEP]']

            self.premise_data_tokens.append(data_tokens)
            self.premise_label_tokens.append(label_tokens)

        for sen in self.hypothesis_data:
            data_tokens = ['[CLS]']
            data_tokens += self.tokenizer.tokenize(sen)[:self.sen_len - 1]
            if self.attack_vocab:
                label_tokens = self.baseline_tokenizer(sen)[:self.sen_len - 1]
                label_tokens += ['[SEP]']
            else:
                label_tokens = self.tokenizer.tokenize(sen)[:self.sen_len - 1]
                label_tokens += ['[SEP]']
            self.hypothesis_data_tokens.append(data_tokens)
            self.hypothesis_label_tokens.append(label_tokens)
Example #6
def write_results_to_file(models, results, logs):
    with open(args.save_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow([
            'model_name', 'dataset', 'origin_acc', 'clean_acc', 'adv_acc',
            'acc_shift', 'success_rate', 'sub_rate'
        ])
        for model, res, log in zip(models, results, logs):
            print(
                f'{model} {dataset_name} testdata {len(test_data)} all acc is {res[0]:.5f}'
            )
            print(
                f'{model} {dataset_name} cleandata {len(clean_data)} acc is {res[1]:.5f}'
            )
            print(
                f'{model} {dataset_name} advdata {len(clean_data)} acc is {res[2]:.5f}'
            )
            print(
                f'{model} {dataset_name} advdata {log[0]}, mean sub_rate {log[1]:.5f}, mean NE_rate {log[2]:.5f}'
            )
            acc_shift = res[1] - res[2]
            success_rate = acc_shift / res[1]
            writer.writerow([
                model, dataset_name, res[0], res[1], res[2], acc_shift,
                success_rate, log[1]
            ])
    logging(f'evaluate results saved in {args.save_path}')
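A toy calculation (numbers are made up) showing how acc_shift and success_rate in write_results_to_file are derived from the clean and adversarial accuracies.

origin_acc, clean_acc, adv_acc = 0.90, 0.88, 0.40  # hypothetical accuracies
acc_shift = clean_acc - adv_acc                    # 0.48
success_rate = acc_shift / clean_acc               # ~0.5455
print(f'{acc_shift:.2f} {success_rate:.4f}')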
Example #7
    def read_syn_csv(self, path):
        with open(path, 'r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file, delimiter=',')
            for line in reader:
                res = set(line[1:])
                self.syn_count += len(res)
                self.syn_dict[line[0]] = res

        logging(
            f'loaded syn_words from {path}: {len(self.syn_dict)} key words, {self.syn_count} synonyms'
        )
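A self-contained sketch of the CSV format shared by write_syn_csv and read_syn_csv (each row is a key word followed by its synonyms); syn_dict and the file path are hypothetical.

import csv

syn_dict = {'good': {'fine', 'nice'}, 'bad': {'poor'}}  # hypothetical data
with open('syn.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter=',')
    for key, value in syn_dict.items():
        writer.writerow([key] + list(value))

loaded = {}
with open('syn.csv', 'r', newline='', encoding='utf-8') as file:
    for row in csv.reader(file, delimiter=','):
        loaded[row[0]] = set(row[1:])
print(loaded)  # {'good': {'fine', 'nice'}, 'bad': {'poor'}}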
Example #8
 def token2seq(self, vocab: 'Vocab', maxlen: int):
     if len(self.data_seq) > 0:
         self.data_seq.clear()
         self.labels_tensor.clear()
     logging(f'converting data (is_train={self.is_train}) to sequences')
     self.vocab = vocab
     self.maxlen = maxlen
     assert self.data_token is not None
     for tokens in self.data_token:
         self.data_seq.append(self.__encode_tokens(tokens))
     for label in self.labels:
         self.labels_tensor.append(torch.tensor(label))
Example #9
    def build_syn_dict(self, vocab: 'Vocab', path):
        assert len(vocab.word_dict) > 0
        for key, value in tqdm(vocab.word_dict.items()):
            if value == 0: continue
            res = self.get_similarity_words(key)
            if len(res) > 0: self.syn_dict[key] = res
            self.syn_count += len(res)

        num = len(self.syn_dict)
        logging(
            f'synonym dictionary has been built: {num} key words, {self.syn_count} synonyms'
        )
        self.write_syn_csv(path)
Example #10
 def data2tokens(self):
     logging(f'{self.path} in data2tokens')
     for sen in self.datas:
         if self.attack_vocab:
             data_tokens = ['[CLS]']
             data_tokens += self.baseline_tokenizer(sen)[:self.sen_len - 1]
             label_tokens = self.baseline_tokenizer(sen)[:self.sen_len - 1]
             label_tokens += ['[SEP]']
         else:
             data_tokens = ['[CLS]']
             data_tokens += self.tokenizer.tokenize(sen)[:self.sen_len - 1]
             label_tokens = self.tokenizer.tokenize(sen)[:self.sen_len - 1]
             label_tokens += ['[SEP]']
         self.data_tokens.append(data_tokens)
         self.label_tokens.append(label_tokens)
Example #11
def evaluate_Seq2Seq(test_data, Seq2Seq_model, dir, attack_vocab):
    Seq2Seq_model.eval()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    logging(f'Saving Seq2Seq_model evaluation outputs into {dir}')
    with torch.no_grad():
        acc_sum = 0
        n = 0
        for x, x_mask, y, _ in test_data:
            x, x_mask, y = x.to(AttackConfig.train_device), x_mask.to(
                AttackConfig.train_device), y.to(AttackConfig.train_device)
            logits = Seq2Seq_model(x, x_mask, is_noise=False)
            # outputs_idx: [batch, sen_len]
            outputs_idx = logits.argmax(dim=2)
            acc_sum += (outputs_idx == y).float().sum().item()
            n += y.shape[0] * y.shape[1]

            if attack_vocab:
                with open(dir, 'a') as f:
                    for i in range(len(y)):
                        f.write('-------original sentence----------\n')
                        f.write(' '.join(
                            [attack_vocab.get_word(token)
                             for token in y[i]]) + '\n')
                        f.write(
                            '-------sentence -> encoder -> decoder----------\n'
                        )
                        f.write(' '.join([
                            attack_vocab.get_word(token)
                            for token in outputs_idx[i]
                        ]) + '\n' * 2)
            else:
                with open(dir, 'a') as f:
                    for i in range(len(y)):
                        f.write('-------original sentence----------\n')
                        f.write(
                            ' '.join(tokenizer.convert_ids_to_tokens(y[i])) +
                            '\n')
                        f.write(
                            '-------sentence -> encoder -> decoder----------\n'
                        )
                        f.write(' '.join(
                            tokenizer.convert_ids_to_tokens(outputs_idx[i])) +
                                '\n' * 2)

        return acc_sum / n
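An illustrative sketch of the token-level accuracy computed in evaluate_Seq2Seq, using random tensors in place of real model outputs (all shapes are assumptions).

import torch

batch, sen_len, vocab_size = 2, 5, 10
logits = torch.randn(batch, sen_len, vocab_size)    # stand-in for model output
y = torch.randint(0, vocab_size, (batch, sen_len))  # stand-in for targets
outputs_idx = logits.argmax(dim=2)                  # [batch, sen_len]
acc = (outputs_idx == y).float().sum().item() / (batch * sen_len)
print(acc)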
Example #12
 def __init__(self, tokenizer_type='normal', remove_stop_words=False):
     self.is_remove_stop_words = remove_stop_words
     if tokenizer_type == 'normal':
         self.tokenizer = self.normal_token
     elif tokenizer_type == 'spacy':
         self.nlp = spacy.load('en_core_web_sm')
         self.tokenizer = self.spacy_token
     else:
         raise RuntimeError(
             f'Unknown tokenizer type: {tokenizer_type}')
     self.token_type = tokenizer_type
     self.stop_words = set(stopwords.words('english'))
     for w in [
             "<br />", '!', ',', '.', '?', '-s', '-ly', '</s>', 's', '</',
             '>', '/>', 'br', '<'
     ]:
         self.stop_words.add(w)
     logging(
         f'using tokenizer {tokenizer_type}, is_remove_stop_words={remove_stop_words}'
     )
Example #13
    def evaluate(now_ep):  # returns (test accuracy, test loss)
        test_loss = 0.0
        logging('starting evaluation!')
        net.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for (X, label) in tqdm(test_data):
                X, label = X.to(config_device), label.to(config_device)
                logits = net(X)
                test_loss += criterion(logits, label).item()
                predict = logits.argmax(dim=1)
                correct += predict.eq(label).float().sum().item()
                total += X.size(0)

        correct /= total
        test_loss /= len(test_data)
        logging(
            f'epoch {now_ep} evaluation done! test acc {correct:.5f}, best acc {best_acc:.5f}, test batch loss {test_loss:.5f}'
        )
        return correct, test_loss
Example #14
 def read_standard_data(self, path, debug_mode=False):
     path_list = []
     logging(f'start loading data from {path}')
     dirs = os.listdir(path)
     for dir in dirs:
         if dir == 'pos' or dir == 'neg':
             file_list = os.listdir(os.path.join(path, dir))
             file_list = map(lambda x: os.path.join(path, dir, x),
                             file_list)
             path_list += list(file_list)
     datas = []
     labels = []
     if debug_mode:
         i = 1000
         for p in path_list:
             label = 0 if 'neg' in p else 1
             with open(p, 'r', encoding='utf-8') as file:
                 datas.append(file.readline())
                 labels.append(label)
             i -= 1
             if i == 0:
                 logging(f'loading data {len(datas)} from {path}')
                 return datas, labels
     for p in path_list:
         label = 0 if 'neg' in p else 1
         with open(p, 'r', encoding='utf-8') as file:
             datas.append(file.readline())
             labels.append(label)
     logging(f'loading data {len(datas)} from {path}')
     return datas, labels
Example #15
def evaluate(dataset: MyDataset, net: Module):
    assert dataset.is_train == False
    assert len(dataset) > 0
    dataset = DataLoader(dataset, batch_size=1000, shuffle=False)
    net.eval()
    logging('starting evaluation!')
    with torch.no_grad():
        correct = 0
        total = 0
        for (X, label) in tqdm(dataset):
            X, label = X.to(config_device), label.to(config_device)
            logits = net(X)
            predict = logits.argmax(dim=1)
            correct += predict.eq(label).float().sum().item()
            total += X.size(0)

    correct /= total
    logging(f'evaluation done! acc {correct:.5f}')
    return correct
Example #16
 def read_standard_data(self, path, debug_mode=False):
     data = []
     labels = []
     if debug_mode:
         i = 1000
         with open(path, 'r', encoding='utf-8') as file:
             for line in file:
                 i -= 1
                 line = line.strip('\n')
                 data.append(line[:-1])
                 labels.append(int(line[-1]))
                 if i == 0:
                     break
         logging(f'loading data {len(data)} from {path}')
         return data, labels
     with open(path, 'r', encoding='utf-8') as file:
         for line in file:
             line = line.strip('\n')
             data.append(line[:-1])
             labels.append(int(line[-1]))
     logging(f'loading data {len(data)} from {path}')
     return data, labels
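Illustrative only: the line format this read_standard_data appears to assume, i.e. the sentence text immediately followed by a single-digit label.

line = 'a touching and well acted film1\n'
line = line.strip('\n')
text, label = line[:-1], int(line[-1])
print(text)   # 'a touching and well acted film'
print(label)  # 1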
Example #17
 def __init__(self,
              origin_data_tokens,
              word_dim: int = 100,
              vocab_limit_size=80000,
              is_using_pretrained=True,
              word_vec_file_path=r'./static/glove.6B.100d.txt'):
     self.file_path = word_vec_file_path
     self.word_dim = word_dim
     self.word_dict = {}
     self.word_count = {}
     self.vectors = None
     self.num = 0
     self.data_tokens = []
     self.words_vocab = []
     assert len(origin_data_tokens) > 0
     self.data_tokens = origin_data_tokens
     self.__build_words_index()
     self.__limit_dict_size(vocab_limit_size)
     if is_using_pretrained:
         logging(f'building word vectors from {self.file_path}')
         self.__read_pretrained_word_vecs()
     logging(f'word vectors have been built! dict size is {self.num}')
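Illustrative only: parsing one line of a GloVe-style text file such as glove.6B.100d.txt, where each line is a word followed by word_dim floats (a toy 4-dimensional line is used here).

line = 'movie 0.12 -0.03 0.40 0.08'
parts = line.rstrip().split(' ')
word, vector = parts[0], [float(v) for v in parts[1:]]
print(word, vector)  # movie [0.12, -0.03, 0.4, 0.08]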
Example #18
                            shuffle=True,
                            num_workers=4)
    test_data = DataLoader(test_dataset_orig,
                           batch_size=AttackConfig.batch_size,
                           shuffle=False,
                           num_workers=4)
    return train_data, test_data


def save_config(path):
    copyfile(config_path, path + r'/config.txt')


if __name__ == '__main__':
    if AttackConfig.train_multi_cuda:
        logging('Using cuda device gpu: ' + str(AttackConfig.multi_cuda_idx))
    else:
        logging('Using cuda device gpu: ' + str(AttackConfig.cuda_idx))
    cur_dir = AttackConfig.output_dir + '/seq2seq_model/' + AttackConfig.dataset + '/' + str(
        int(time.time()))
    # make output directory if it doesn't already exist
    if not os.path.isdir(cur_dir):
        os.makedirs(cur_dir)
    logging('Saving into directory ' + cur_dir)
    save_config(cur_dir)

    baseline_model_builder = BaselineModelBuilder(AttackConfig.dataset,
                                                  AttackConfig.baseline_model,
                                                  AttackConfig.train_device,
                                                  is_load=True)
Example #19
def train_bert_baseline_Entailment(model, train_data, test_data,
                                   criterion_baseline_model,
                                   optimizer_baseline_model, cur_dir):
    best_accuracy = 0.0
    for epoch in range(BaselineConfig.epochs):
        logging(f'epoch {epoch} start')
        logging(f'epoch {epoch} train baseline_model')
        model.train()
        loss_mean = 0.0
        for _, _, _, _, _, _, x, x_mask, x_type, label in train_data:
            x, x_mask, x_type, label = x.to(
                BaselineConfig.train_device), x_mask.to(
                    BaselineConfig.train_device), x_type.to(
                        BaselineConfig.train_device), label.to(
                            BaselineConfig.train_device)
            logits = model(x, x_mask, x_type)
            optimizer_baseline_model.zero_grad()
            loss = criterion_baseline_model(logits, label)
            loss_mean += loss.item()
            loss.backward()
            optimizer_baseline_model.step()

        loss_mean /= len(train_data)
        logging(f"epoch {epoch} train_loss is {loss_mean}")
        eval_accuracy = eval_bert_baseline_Entailment(model, test_data)
        logging(f"epoch {epoch} test_acc is {eval_accuracy}")
        if best_accuracy < eval_accuracy:
            best_accuracy = eval_accuracy
            logging('Saving baseline models...')
            torch.save(model.state_dict(), cur_dir + r'/baseline_model.pt')
        if loss_mean < 0.1:
            logging(f'best accuracy is {best_accuracy}')
            break
    logging(f'best accuracy is {best_accuracy}')
Example #20
    def generate_adversarial_samples(self,
                                     path,
                                     adv_method: str,
                                     verbose=False,
                                     use_typos=True,
                                     tokenizer=None,
                                     vocab=None,
                                     net=None,
                                     change_log_path=None,
                                     sub_rate_limit=None):
        assert len(self.datas) > 0
        self.adv_methods = {
            'PWWS': self.get_fool_sentence_pwws,
            'TEXTFOOL': self.get_fool_sentence_textfool,
            'RANDOM': self.get_fool_sentence_random,
        }
        assert adv_method in self.adv_methods
        self.adv_methods = self.adv_methods[adv_method]
        self.use_typos = use_typos
        self.verbose = verbose
        self.adv_datas = []
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.net = net
        self.log_path = change_log_path
        self.sub_rate_limit = sub_rate_limit
        success_num = 0
        failure_num = 0
        try_all = 0
        logging(
            f'generate adversarial samples {len(self.datas)} with {adv_method} to {path}'
        )
        # assert net.training == False
        for idx, data in enumerate(self.datas):
            adv_s, flag, end = self.adv_methods(data, self.labels[idx], idx)
            self.adv_datas.append(str(adv_s))
            if flag == 1:
                success_num += 1
                try_all += 1
                logging(
                    f'The {idx}th adv successfully crafted, '
                    f'success rate is {success_num/try_all:.5f}, cost {end:.2f} seconds'
                )
            elif flag == 0:
                failure_num += 1
                try_all += 1
                logging(
                    f'The {idx}th adv example failed to be crafted, '
                    f'failure rate is {failure_num/try_all:.5f}, cost {end:.2f} seconds'
                )
            del adv_s
            del flag
            del end

            if (idx + 1) % 100 == 0:
                write_standard_data(self.adv_datas,
                                    self.labels[idx - 99:idx + 1], path, 'a')
                self.adv_datas.clear()
                gc.collect()

            if args.verbose:
                now = idx + 1
                acc = 1 - success_num / now
                success_rate = success_num / try_all if try_all > 0 else 0.0
                failure_rate = 1 - success_rate
                vis.line(X=[now],
                         Y=[acc],
                         env=env_name,
                         win='rate',
                         name='acc',
                         update='append')
                vis.line(X=[now],
                         Y=[success_rate],
                         env=env_name,
                         win='rate',
                         name='att_success_rate',
                         update='append')
                vis.line(X=[now],
                         Y=[failure_rate],
                         env=env_name,
                         win='rate',
                         name='att_failure_rate',
                         update='append')

        logging(f'tried to generate {try_all} adv samples, '
                f'{success_num} generated successfully, '
                f'{failure_num} failed, '
                f'origin samples num is {len(self.datas)}')
        if len(self.adv_datas) > 0:
            # the remaining samples correspond to the last labels
            write_standard_data(self.adv_datas,
                                self.labels[-len(self.adv_datas):], path, 'a')
Example #21
 def data2token(self):
     logging(f'converting data (is_train={self.is_train}) to tokens')
     assert self.data is not None
     for sen in self.data:
         self.data_token.append(self.tokenizer(sen))
Example #22
    else:
        if BaselineConfig.baseline_model == 'Bert':
            train_bert_baseline_Classification(model, train_data, test_data,
                                               criterion_baseline_model,
                                               optimizer_baseline_model,
                                               cur_dir)
        else:
            train_baseline_Classification(model, train_data, test_data,
                                          criterion_baseline_model,
                                          optimizer_baseline_model, cur_dir)


if __name__ == "__main__":

    logging('Using cuda device gpu: ' + str(BaselineConfig.cuda_idx))
    cur_dir = BaselineConfig.output_dir + '/baseline_model/' + BaselineConfig.dataset + '/' + BaselineConfig.baseline_model + '/' + str(
        int(time.time()))
    # make output directory if it doesn't already exist
    if not os.path.isdir(cur_dir):
        os.makedirs(cur_dir)
    logging('Saving into directory ' + cur_dir)
    save_config(cur_dir)

    train_data, test_data = build_dataset()
    model = build_model().to(BaselineConfig.train_device)

    logging('Training Baseline Model...')
    criterion_baseline_model = nn.CrossEntropyLoss().to(
        BaselineConfig.train_device)
    optimizer_baseline_model = optim.Adam(
Example #23
def main(epochs: int, learning_rate: float):
    global best_acc
    global best_path
    global note
    global best_state

    criterion = nn.CrossEntropyLoss().to(config_device)
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)
    # optimizer = optim.SGD(net.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     mode='min',
                                                     factor=0.95,
                                                     patience=3,
                                                     verbose=True,
                                                     min_lr=3e-9)
    warmup_scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                   lr_lambda=lambda ep: 1e-2
                                                   if ep < 3 else 1)

    loss = 0.0
    train_loss = 0.0

    global_batch_idx = 1
    acc_all = []
    test_best_loss = 1e5

    def evaluate(now_ep):  # returns (test accuracy, test loss)
        test_loss = 0.0
        logging('starting evaluation!')
        net.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for (X, label) in tqdm(test_data):
                X, label = X.to(config_device), label.to(config_device)
                logits = net(X)
                test_loss += criterion(logits, label).item()
                predict = logits.argmax(dim=1)
                correct += predict.eq(label).float().sum().item()
                total += X.size(0)

        correct /= total
        test_loss /= len(test_data)
        logging(
            f'epoch {now_ep} evaluation done! test acc {correct:.5f}, best acc {best_acc:.5f}, test batch loss {test_loss:.5f}'
        )
        return correct, test_loss

    for ep in range(epochs):
        logging(f'epoch {ep} start!')
        net.train()
        train_loss = 0.0
        for batch_idx, (X, label) in enumerate(tqdm(train_data)):
            X, label = X.to(config_device), label.to(config_device)
            logits = net(X)
            loss = criterion(logits, label)

            train_loss += loss.item()
            global_batch_idx += 1

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if global_batch_idx % 2000 == 0:
                logging(
                    f'global batch {global_batch_idx}, loss is {train_loss/(batch_idx+1):.5f}'
                )

        train_loss /= len(train_data)
        logging(
            f'epoch {ep} train done! average train loss is {train_loss:.5f}')

        ep_test_acc, test_loss = evaluate(ep)
        acc_all.append(ep_test_acc)
        if ep < 4: warmup_scheduler.step(ep)
        else: scheduler.step(test_loss, epoch=ep)

        if args.verbose:
            vis.line(X=[ep + 1],
                     Y=[train_loss],
                     env=env_name,
                     win='loss',
                     name='train_loss',
                     update='append')
            vis.line(X=[ep + 1],
                     Y=[test_loss],
                     env=env_name,
                     win='loss',
                     name='test_loss',
                     update='append')
            vis.line(X=[ep + 1],
                     Y=[ep_test_acc],
                     env=env_name,
                     win='acc',
                     name='test_acc',
                     update='append')

        if ep_test_acc > best_acc or (ep_test_acc == best_acc
                                      and test_loss < test_best_loss):
            best_acc = ep_test_acc
            test_best_loss = test_loss
            best_path = config_model_save_path[dataset_name].format(
                net.model_name, best_acc, get_time(), note)
            best_state = copy.deepcopy(net.state_dict())

        if (ep + 1) % (epochs // 3) == 0 and best_state:
            logging(f'saving model in {best_path} best acc {best_acc:.5f}')
            torch.save(best_state, best_path)
            best_state = None

    count = epochs - epochs // 2
    acc_all = sum(acc_all[epochs // 2:])
    acc_all /= count
    if best_state is not None: torch.save(best_state, best_path)
    logging(
        f'train {epochs} done! The last train loss is {train_loss:.5f}!'
    )
    logging(
        f'The last {count} epoch test average acc is {acc_all:.5f}, best acc is {best_acc:.5f}\n'
        f'best model saved in {best_path}')
Example #24
    net = build_LSTM_model(dataset_name,
                           vocab,
                           config_device,
                           is_bid=True,
                           syn=syn,
                           is_load=is_load_model,
                           is_adv=args.adv)

# net = nn.DataParallel(net, device_ids=[0, 1])
net.to(config_device)
best_path = config_model_load_path[dataset_name].get(net.model_name)
best_state = None
best_acc = 0.0 if not is_load_model else float(
    re.findall(r"_\d\.\d+_", best_path)[0][1:-1])
if is_load_model:
    logging(f'loading net model from {best_path}, acc is {best_acc}')
else:
    pass  # net.apply(weights_init)


def main(epochs: int, learning_rate: float):
    global best_acc
    global best_path
    global note
    global best_state

    criterion = nn.CrossEntropyLoss().to(config_device)
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)
    # optimizer = optim.SGD(net.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     mode='min',
Example #25
    with torch.no_grad():
        model.eval()
        acc_sum = 0
        n = 0
        for _, x_mask, x, label in test_data:
            x, x_mask, label = x.to(BaselineConfig.train_device), x_mask.to(
                BaselineConfig.train_device), label.to(
                    BaselineConfig.train_device)
            logits = model(x, x_mask)
            acc_sum += (logits.argmax(dim=1) == label).float().sum().item()
            n += label.shape[0]
        return acc_sum / n


if __name__ == "__main__":
    baseline_model_builder = BaselineModelBuilder('AGNEWS',
                                                  'LSTM',
                                                  BaselineConfig.train_device,
                                                  is_load=True)
    test_dataset_orig = AGNEWS_Dataset(
        train_data=False,
        attack_vocab=baseline_model_builder.vocab,
        debug_mode=False)
    test_data = DataLoader(test_dataset_orig,
                           batch_size=BaselineConfig.batch_size,
                           shuffle=False,
                           num_workers=4)
    logging(
        eval_bert_baseline_Classification(baseline_model_builder.net,
                                          test_data))
Example #26
        test_dataset_orig = SST2_Dataset(train_data=False,
                                         attack_vocab=attack_vocab,
                                         debug_mode=AttackConfig.debug_mode)
    train_data = DataLoader(train_dataset_orig,
                            batch_size=AttackConfig.batch_size,
                            shuffle=True,
                            num_workers=4)
    test_data = DataLoader(test_dataset_orig,
                           batch_size=AttackConfig.batch_size,
                           shuffle=False,
                           num_workers=4)
    return train_data, test_data


if __name__ == '__main__':
    logging('Using cuda device gpu: ' + str(AttackConfig.cuda_idx))
    cur_dir = AttackConfig.output_dir + '/gan_model/' + AttackConfig.dataset + '/' + AttackConfig.baseline_model + '/' + str(
        int(time.time()))
    cur_dir_models = cur_dir + '/models'
    # make output directory if it doesn't already exist
    if not os.path.isdir(cur_dir):
        os.makedirs(cur_dir)
        os.makedirs(cur_dir_models)
    logging('Saving into directory ' + cur_dir)
    save_config(cur_dir)

    baseline_model_builder = BaselineModelBuilder(AttackConfig.dataset,
                                                  AttackConfig.baseline_model,
                                                  AttackConfig.train_device,
                                                  is_load=True)
Example #27
def evaluate_gan(test_data, Seq2Seq_model, gan_gen, gan_adv, dir,
                 attack_vocab):
    Seq2Seq_model.eval()
    gan_gen.eval()
    gan_adv.eval()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    logging(f'Saving gan evaluation outputs into {dir}')
    with torch.no_grad():

        for x, x_mask, y, _ in test_data:
            x, x_mask, y = x.to(AttackConfig.train_device), x_mask.to(
                AttackConfig.train_device), y.to(AttackConfig.train_device)

            # sentence -> encoder -> decoder
            Seq2Seq_outputs = Seq2Seq_model(x, x_mask, is_noise=False)
            # Seq2Seq_idx: [batch, seq_len]
            Seq2Seq_idx = Seq2Seq_outputs.argmax(dim=2)

            # sentence -> encoder -> adversary -> generator ->  decoder
            # eagd_outputs: [batch, seq_len, vocab_size]
            eagd_outputs = Seq2Seq_model(x,
                                         x_mask,
                                         is_noise=False,
                                         generator=gan_gen,
                                         adversary=gan_adv)
            # eagd_idx: [batch_size, sen_len]
            eagd_idx = eagd_outputs.argmax(dim=2)

            if attack_vocab:
                with open(dir, 'a') as f:
                    for i in range(len(y)):
                        f.write('------original sentence---------\n')
                        f.write(' '.join(
                            [attack_vocab.get_word(token)
                             for token in y[i]]) + '\n')
                        f.write('------sentence -> encoder -> decoder-------\n')
                        f.write(' '.join([
                            attack_vocab.get_word(token)
                            for token in Seq2Seq_idx[i]
                        ]) + '\n')
                        f.write(
                            '------sentence -> encoder -> inverter -> generator -> decoder-------\n'
                        )
                        f.write(' '.join([
                            attack_vocab.get_word(token)
                            for token in eagd_idx[i]
                        ]) + '\n' * 2)
            else:
                with open(dir, 'a') as f:
                    for i in range(len(y)):
                        f.write('------original sentence---------\n')
                        f.write(
                            ' '.join(tokenizer.convert_ids_to_tokens(y[i])) +
                            '\n')
                        f.write('------sentence -> encoder -> decoder-------\n')
                        f.write(' '.join(
                            tokenizer.convert_ids_to_tokens(Seq2Seq_idx[i])) +
                                '\n')
                        f.write(
                            '------sentence -> encoder -> inverter -> generator -> decoder-------\n'
                        )
                        f.write(' '.join(
                            tokenizer.convert_ids_to_tokens(eagd_idx[i])) +
                                '\n' * 2)
Example #28
def save_all_models(Seq2Seq_model, gan_gen, gan_adv, dir):
    logging('Saving models...')
    torch.save(Seq2Seq_model.state_dict(), dir + '/Seq2Seq_model.pt')
    torch.save(gan_gen.state_dict(), dir + '/gan_gen.pt')
    torch.save(gan_adv.state_dict(), dir + '/gan_adv.pt')
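A sketch (not from the repository) of how a checkpoint written by save_all_models could be restored; the tiny module below is a stand-in for the real Seq2Seq_model class and the /tmp path is hypothetical.

import torch
import torch.nn as nn

class ToyModel(nn.Module):  # stand-in architecture
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 2)

model = ToyModel()
torch.save(model.state_dict(), '/tmp/Seq2Seq_model.pt')        # what save_all_models does
restored = ToyModel()
restored.load_state_dict(torch.load('/tmp/Seq2Seq_model.pt'))  # reloading the weights
restored.eval()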
Example #29
            net = build_LSTM_model(dataset_name,
                                   vocab0,
                                   config_device,
                                   is_bid=True,
                                   syn=None)
        elif model == 'BidLSTM_adv':
            net = build_LSTM_model(dataset_name,
                                   vocab1,
                                   config_device,
                                   is_bid=True,
                                   syn=None,
                                   is_adv=True)
        elif model == 'BidLSTM_enhanced':
            net = build_LSTM_model(dataset_name,
                                   vocab0,
                                   config_device,
                                   is_bid=True,
                                   syn=syn)

        logging('evaluate origin test data')
        test_acc = evaluate(test_data, net)
        logging('evaluate origin 1k clean data')
        clean1k_acc = evaluate(clean_data, net)
        logging('evaluate adversarial 1k data')
        adv1k_acc = evaluate(adv_data, net)

        results.append((test_acc, clean1k_acc, adv1k_acc))
        logs.append(read_fool_log(adv_log_path))

    write_results_to_file(models, results, logs)