def load_tag_data(config, data_type):
    """Build the batch iterator for one split of the dataset.

    Args:
        config: Configuration object forwarded to ``build_dataset`` and
            ``build_iterator``.
        data_type: Split to return: ``'train'``, ``'dev'`` or ``'test'``.

    Returns:
        The result of ``build_iterator`` for the requested split.

    Raises:
        ValueError: If ``data_type`` is not one of the supported split names.
    """
    split_names = ('train', 'dev', 'test')
    # Validate before building the dataset so that a typo fails immediately
    # instead of surfacing afterwards as an UnboundLocalError.
    if data_type not in split_names:
        raise ValueError(
            'data_type must be one of {}, got {!r}'.format(
                split_names, data_type))

    train_data, dev_data, test_data = build_dataset(config)
    splits = {'train': train_data, 'dev': dev_data, 'test': test_data}
    return build_iterator(splits[data_type], config)
예제 #2
0
def Pred(model, config):
    """Run ``test`` on the three JSON test files (NLI, TNEWS, emotion).

    Each file under ``../user_data/tmp_data`` is parsed with ``json.load``
    and wrapped by ``build_iterator`` before all three iterators are handed
    to ``test`` together.
    """
    def _read_json(path):
        # Parse one JSON file and close it again right away.
        with open(path) as handle:
            return json.load(handle)

    print("Testing...")
    nli_samples = _read_json("../user_data/tmp_data/B_nli_256.txt")
    news_samples = _read_json("../user_data/tmp_data/B_tnews_256.txt")
    emotion_samples = _read_json("../user_data/tmp_data/B_emotion_256.txt")
    print(len(nli_samples), len(news_samples), len(emotion_samples))

    # Iterators are created in the same order they are passed to test().
    iterators = [
        build_iterator(samples, config)
        for samples in (nli_samples, news_samples, emotion_samples)
    ]
    test(config, model, *iterators)
    print("End..")
예제 #3
0
    def wunaijiade(self):
        """Load the THUCNews data, build the selected model and run ``train``.

        The model module is picked by ``args.model`` and imported from the
        ``models`` package. All random seeds are fixed to 1.

        Returns:
            Whatever ``train(config, model, test_iter)`` returns.
        """
        module = import_module('models.' + args.model)  # e.g. bert
        config = module.Config('THUCNews')  # dataset name

        # Fix every seed so that repeated runs give the same result.
        np.random.seed(1)
        torch.manual_seed(1)
        torch.cuda.manual_seed_all(1)
        torch.backends.cudnn.deterministic = True

        started = time.time()
        print("Loading data...")
        test_iter = build_iterator(build_dataset(config), config)
        print("Time usage:", get_time_dif(started))

        # `.to(config.device)` is deliberately not applied here; it was
        # commented out in the original code.
        model = module.Model(config)
        return train(config, model, test_iter)
예제 #4
0
def main():
    """Dump the dev-set query/response pairs as text to ``results_token.txt``.

    The dev iterator is built from the ``models.RNN`` configuration, every
    query and response is converted with ``sentence`` using the inverted
    vocabulary, and the pairs are written one field per row.
    """
    dataset = 'C:/Users/USER/Documents/Capstone_Project/datalogs'
    model_module = import_module('models.{}'.format('RNN'))
    config = model_module.Config(dataset)
    _, dev_data, _ = build_dataset(config, True, False)
    dev_iter = build_iterator(dev_data, config, do_dev=True)

    # Load the vocabulary once and close the file handle deterministically.
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # config.vocab_path at a trusted file.
    with open(config.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    re_vocab = {token_id: token for token, token_id in vocab.items()}

    queries = []
    responses = []
    for batch_queries, batch_responses in dev_iter:
        # extend() avoids re-copying the whole list for every batch.
        queries.extend(
            sentence(q, re_vocab) for q in batch_queries[0].cpu().tolist())
        responses.extend(
            sentence(r, re_vocab) for r in batch_responses[0].cpu().tolist())

    rows = []
    for index, query in enumerate(queries):
        rows.append('Pair {}'.format(index + 1))
        rows.append('Query: {}'.format(query))
        rows.append('Original Response: {}'.format(responses[index]))
        rows.append(' ')

    frame = pd.DataFrame(rows)
    frame.to_csv('results_token.txt',
                 sep='\t',
                 encoding='utf8',
                 header=False,
                 index=False)
예제 #5
0
파일: auto.py 프로젝트: wangjs9/ecs
def Test(model_name):
    """Build the test iterator and run ``test`` on a newly created model.

    Args:
        model_name: Name of the module inside the ``models`` package that
            provides ``Config`` and ``Seq2SeqModel``.
    """
    dataset = 'C:/Users/USER/Documents/Capstone_Project/datalogs'  # dataset

    # Make the grandparent directory of this file importable.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    if project_root not in sys.path:
        sys.path.append(project_root)

    # Only the test branch is active in this entry point.
    do_train, do_test = False, True

    module = import_module('models.{}'.format(model_name))
    config = module.Config(dataset)
    np.random.seed(156)
    torch.cuda.manual_seed_all(1024)
    torch.backends.cudnn.deterministic = True

    start_time = time.time()
    print('Loading data...')
    _, _, test_data = build_dataset(config, do_train, do_test)
    test_iter = build_iterator(test_data, config, do_dev=True)

    # The elapsed time is computed but, as in the original flow, not used.
    get_time_dif(start_time)

    model = module.Seq2SeqModel(config).to(config.device)

    # Xavier-initialise every parameter that has more than one dimension.
    for param in model.parameters():
        if param.dim() <= 1:
            continue
        nn.init.xavier_uniform_(param)

    test(config, model, test_iter)
예제 #6
0
def build(hidden_size, batch_size, max_len, cuda):
    """Assemble the seq2seq model (BERT encoder + ``DecoderRNN``) and helpers.

    Args:
        hidden_size: Hidden size handed to ``DecoderRNN``.
        batch_size: Passed to the ``models.bert`` ``Config`` constructor.
        max_len: Forwarded to ``DecoderRNN`` as its second positional argument.
        cuda: When truthy, the model is moved to the GPU and ``Tensor`` is
            the CUDA long-tensor type.

    Returns:
        Tuple ``(seq2seq, optimizer, Tensor, train_dataloader,
        val_dataloader, test_dataloader, loss_fun, config)``.
    """
    bidirectional = False
    bert_module = import_module('models.' + 'bert')
    config = bert_module.Config(batch_size)

    train_data = build_dataset(config)
    train_dataloader = build_iterator(train_data, config)
    val_data, test_data = build_dataset_eval(config)
    val_dataloader = build_iterator_eval(val_data, config)
    test_dataloader = build_iterator_eval(test_data, config)

    encoder = bert_module.Model(config).to(config.device)
    tokenizer = config.tokenizer
    decoder = DecoderRNN(
        len(tokenizer.vocab),
        max_len,
        hidden_size * 2 if bidirectional else hidden_size,
        dropout_p=0.2,
        use_attention=True,
        bidirectional=bidirectional,
        eos_id=tokenizer.convert_tokens_to_ids([SEP])[0],
        sos_id=tokenizer.convert_tokens_to_ids([CLS])[0])
    seq2seq = Seq2seq(encoder, decoder)

    # Move the model once; the original repeated this call and the Tensor
    # assignment without any effect.
    if cuda:
        seq2seq.cuda()

    optimizer = torch.optim.Adam(lr=1e-3, params=seq2seq.parameters())
    Tensor = torch.cuda.LongTensor if cuda else torch.LongTensor

    # reduction='none' is the supported spelling of the deprecated
    # reduce=False: one loss value per element, no averaging.
    loss_fun = torch.nn.NLLLoss(reduction='none')
    return (seq2seq, optimizer, Tensor, train_dataloader, val_dataloader,
            test_dataloader, loss_fun, config)
예제 #7
0
파일: predict.py 프로젝트: ldcodes/nlp
    def build_dataset(self, path):
        """Load the prediction set at ``path`` and store its iterator.

        The loaded data is wrapped by ``build_iterator`` and kept on
        ``self.predict_iter``; nothing is returned.
        """
        cfg = self.config

        print('\nloading predict set ...')
        # Per the original note, items look like
        # [(tokens, int(id), seq_len, mask)].
        samples = self.load_dataset(path, cfg.pad_size)
        print('Done!')
        self.predict_iter = build_iterator(samples, cfg)
예제 #8
0
    def predict_text(self, input_text):
        """Classify ``input_text`` and return the name of its label.

        Returns:
            ``"other"`` or ``"weather"``, taken from the first batch the
            iterator produces; ``None`` implicitly if it yields nothing.
        """
        names = {0: "other", 1: "weather"}
        encoded = self.load_dataset(input_text, self.vocab)
        batches = build_iterator([encoded], self.config)

        with torch.no_grad():
            for texts, _ in batches:
                logits = self.model(texts)
                # Index of the highest score in the first row of the batch.
                best = torch.max(logits.data, 1)[1].cpu().numpy()[0]
                return names[best]
예제 #9
0
def build(hidden_size, batch_size, max_len, cuda):
    """Assemble the seq2seq model with a ``BertAdam`` optimizer and helpers.

    Args:
        hidden_size: Hidden size handed to ``DecoderRNN``.
        batch_size: Passed to the ``models.bert`` ``Config`` constructor.
        max_len: Forwarded to ``DecoderRNN`` as its second positional argument.
        cuda: When truthy, the model is moved to the GPU and ``Tensor`` is
            the CUDA long-tensor type.

    Returns:
        Tuple ``(seq2seq, optimizer, Tensor, train_dataloader,
        val_dataloader, test_dataloader, loss_fun, config)``.
    """
    bidirectional = False
    bert_module = import_module('models.' + 'bert')
    config = bert_module.Config(batch_size)

    train_data = build_dataset(config)
    train_dataloader = build_iterator(train_data, config)
    val_data, test_data = build_dataset_eval(config)
    val_dataloader = build_iterator_eval(val_data, config)
    test_dataloader = build_iterator_eval(test_data, config)

    encoder = bert_module.Model(config).to(config.device)
    tokenizer = config.tokenizer
    decoder = DecoderRNN(
        len(tokenizer.vocab),
        max_len,
        hidden_size * 2 if bidirectional else hidden_size,
        dropout_p=0.2,
        use_attention=True,
        bidirectional=bidirectional,
        eos_id=tokenizer.convert_tokens_to_ids([SEP])[0],
        sos_id=tokenizer.convert_tokens_to_ids([CLS])[0])
    decoder = decoder.to(config.device)

    seq2seq = Seq2seq(encoder, decoder)
    # Move the model once; the original repeated this call and the Tensor
    # assignment without any effect.
    if cuda:
        seq2seq.cuda()

    # Biases and LayerNorm parameters are excluded from weight decay.
    named_params = list(seq2seq.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in named_params
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in named_params if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]

    print(len(train_data))
    # NOTE(review): t_total is derived from len(train_data); if that counts
    # samples rather than optimizer steps the warmup schedule is stretched.
    # Confirm against the training loop.
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.03,
                         t_total=len(train_data) * config.num_epochs)
    Tensor = torch.cuda.LongTensor if cuda else torch.LongTensor

    # reduction='none' is the supported spelling of the deprecated
    # reduce=False: one loss value per element, no averaging.
    loss_fun = torch.nn.NLLLoss(reduction='none')
    return (seq2seq, optimizer, Tensor, train_dataloader, val_dataloader,
            test_dataloader, loss_fun, config)
예제 #10
0
    async def delete_all(self, ctx, mode_pattern):
        """Deletes all database entries
        
        This should be used with a lot of caution. There is no way to retract the deleted entries.
        
        Example: !delete_all active"""
        # The docstring above is kept verbatim; it presumably doubles as the
        # command's help text.

        # Only modes are requested, so each item is unpacked as a 1-tuple.
        for (mode,) in build_iterator(modes=mode_pattern):
            delete_all(mode)

        # Report the requested pattern rather than the loop variable: `mode`
        # only holds the last mode and is unbound when nothing was iterated.
        await ctx.send(f'Emptied the database for {mode_pattern}.')
예제 #11
0
    async def delete_me(self, ctx, mode_pattern):
        """Deletes the calling user from the database
        
        This should be used with a lot of caution. There is no way to retract your deleted entry.
        
        Example: !delete_me active"""
        # The docstring above is kept verbatim; it presumably doubles as the
        # command's help text.
        caller = ctx.author

        # Only modes are requested, so each item is unpacked as a 1-tuple.
        for (mode,) in build_iterator(modes=mode_pattern):
            delete_user(caller.id, mode)

        confirmation = f'Deleted the user {caller.display_name} from {mode_pattern}.'
        await ctx.send(confirmation)
예제 #12
0
    async def empty_me(self, ctx, mode_pattern, day_pattern):
        """Empties the time intervals of the calling user
        
        This can be used to reset your time intervals on certain days, but be careful. There is no way to retract the deleted information.
        
        Example: !empty_me active weekdays"""
        # The docstring above is kept verbatim; it presumably doubles as the
        # command's help text.
        caller = ctx.author

        # Clear every (mode, day) combination matched by the two patterns.
        for mode, day in build_iterator(modes=mode_pattern, days=day_pattern):
            empty_user(caller.id, mode, day)

        summary = f'Emptied time intervals of {caller.display_name} on {long_name(day_pattern)} in {mode_pattern}.'
        await ctx.send(summary)
예제 #13
0
    async def show_all(self, ctx, mode_pattern, day_pattern):
        """Prints currently registered time intervals.
        
        Reads the database for the given mode(s) and returns a formatted version of the time intervals of all the server's users on the given day(s).
        
        Example: !show_all active weekdays"""
        # The docstring above is kept verbatim; it presumably doubles as the
        # command's help text.

        pairs = build_iterator(modes=mode_pattern, days=day_pattern)

        # One formatted section per (mode, day), appended after the heading.
        sections = [all_intervals_format(mode, day) for mode, day in pairs]
        heading = '**All currently registered time intervals:**\n'

        await ctx.send(heading + ''.join(sections))
예제 #14
0
def evaluate(config, model, data_set, test=False):
    """Run ``model`` over ``data_set`` and return the stacked outputs.

    Args:
        config: Configuration object forwarded to ``build_iterator``.
        model: Callable model; it is switched to evaluation mode first.
        data_set: Samples that ``build_iterator`` turns into batches.
        test: Unused; kept so existing callers keep working.

    Returns:
        numpy.ndarray: The per-batch model outputs concatenated along axis 0.

    Raises:
        ValueError: Raised by ``np.concatenate`` when the iterator yields no
            batches.
    """
    model.eval()
    batches = build_iterator(data_set, config)
    embeddings = []
    with torch.no_grad():
        # The label half of each batch is not needed here.
        for batch_data, _ in batches:
            outputs = model(batch_data)
            embeddings.append(outputs.cpu().detach().numpy())

    return np.concatenate(embeddings, 0)
예제 #15
0
    async def empty_all(self, ctx, mode_pattern, day_pattern):
        """Empties time intervals of all users
        
        This can be used to reset everything, but be careful. There is no way to retract the deleted information.
        
        Example: !empty_all active weekdays"""
        # The docstring above is kept verbatim; it presumably doubles as the
        # command's help text.

        # Clear every (mode, day) combination matched by the two patterns.
        for mode, day in build_iterator(modes=mode_pattern, days=day_pattern):
            empty_all(mode, day)

        summary = f'Emptied the time intervals for everyone on {long_name(day_pattern)} in {mode_pattern}.'
        await ctx.send(summary)
예제 #16
0
def predict(textList):
    """Predict labels for ``textList`` by classifying each text's segments.

    Every text is split with ``get_split_text``; all segments are classified
    in one pass, mapped to names through ``class1`` and then reduced per
    input text with ``most_common``.

    Returns:
        list: The concatenated ``most_common`` results, in input order.
    """
    # Number of segments produced for each input position.
    segment_counts = {}
    all_segments = []
    for position in range(len(textList)):
        segments = get_split_text(textList[position])
        all_segments.extend(segments)
        segment_counts[position] = len(segments)
    print("listmap:", segment_counts)
    print("new_predict_all len:", len(all_segments))

    test_data = load_dataset(all_segments, config.pad_size)
    test_iter = build_iterator(test_data, config)
    predicted_ids = np.array([], dtype=int)
    with torch.no_grad():
        for texts, _ in test_iter:
            outputs = model(texts)
            batch_ids = torch.max(outputs.data, 1)[1].cpu().numpy()
            predicted_ids = np.append(predicted_ids, batch_ids)
    predicted_names = [class1.get(label_id) for label_id in predicted_ids]

    # Walk the flat prediction list again, one input text at a time.
    voted = []
    cursor = 0
    for position in range(len(textList)):
        count = segment_counts[position]
        segment_names = [
            predicted_names[cursor + offset] for offset in range(count)
        ]
        cursor += count
        print("tmpPredict:", segment_names)
        voted.extend(most_common(segment_names))

    return voted
예제 #17
0
    async def when(self, ctx, day_pattern):
        """Calculates the common time intervals of all members.
        
        This computes the intersection of the time intervals in active mode for all members of the server on the given day(s).

        Example: !when weekend"""
        # The docstring above is kept verbatim; it presumably doubles as the
        # command's help text.

        parts = ['**Common time intervals for all members:**\n']

        # Only days are requested, so each item is unpacked as a 1-tuple.
        for (day,) in build_iterator(days=day_pattern):
            common_interval = get_common_interval(day)

            if not is_empty(common_interval):
                parts.append(
                    f'\t**{long_name(day)}**: {time_intervals_to_str_readable(common_interval)}\n'
                )
            else:
                parts.append(
                    f'\t**{long_name(day)}**: No common time interval.\n')

        await ctx.send(''.join(parts))
예제 #18
0
    async def to_profile(self, ctx, day_pattern):
        """Sets the time intervals of the calling user to his/her profile
        
        After this call, the time intervals in active, which are used to compute common time intervals, are set to equal the ones in profile. Be careful, the time intervals in active cannot be restored.
        
        Example: !to_profile all"""
        # The docstring above is kept verbatim; it presumably doubles as the
        # command's help text.

        days = build_iterator(days=day_pattern)
        caller = ctx.author

        if in_database(caller.id, 'profile'):
            # Copy each selected day from 'profile' into 'active'.
            for (day,) in days:
                stored = get_time_interval(caller.id, day, 'profile')
                set_time_interval(caller.id, day, 'active', stored)

            await ctx.send(
                f'Set the time intervals for {caller.display_name} on {long_name(day_pattern)} to his/her profile.'
            )
        else:
            await ctx.send('You have no registered times in profile.')
예제 #19
0
    async def show_me(self, ctx, mode_pattern, day_pattern):
        """Prints currently registered time intervals for the calling user.
        
        Reads the database for the given mode(s) and returns a formatted version of the time intervals of the calling user on the given day(s).
        
        Example: !show_me profile fri"""
        # The docstring above is kept verbatim; it presumably doubles as the
        # command's help text.

        iterator = build_iterator(modes=mode_pattern, days=day_pattern)

        user_id = ctx.author.id

        # One list of text lines per mode, in first-seen order. The header is
        # created the first time a mode appears rather than on 'mon', so
        # patterns without Monday work and a second mode no longer overwrites
        # the first mode's text.
        sections = {}

        for mode, day in iterator:
            if mode not in sections:
                if not in_database(user_id, mode):
                    await ctx.send(
                        f'You have no registered time intervals in {mode}.')
                    return

                sections[mode] = [
                    f'**Time intervals for {ctx.author.display_name} in {mode}:**\n'
                ]

            interval = get_time_interval(user_id, day, mode)
            sections[mode].append(
                f'\t**{long_name(day)}:** {time_intervals_to_str_readable(interval)}\n'
            )

        output = ''.join(''.join(lines) for lines in sections.values())

        # Nothing matched the patterns: there is no text to send.
        if output:
            await ctx.send(output)
예제 #20
0
# parser = argparse.ArgumentParser(description='Chinese Text Classification')
# parser.add_argument('--model', type=str, required=True, help='choose a model: Bert, ERNIE')
# args = parser.parse_args()

# Script entry point: load the THUCNews splits, build the BERT model and train it.
if __name__ == '__main__':
    # Expose only GPU 0 to CUDA.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    dataset = 'THUCNews'  # dataset

    # model_name = args.model  # bert
    model_name = 'bert'
    # Import models.<model_name>, which provides the Config and Model classes.
    x = import_module('models.' + model_name)
    config = x.Config(dataset)
    # Fix every random seed.
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # make every run give the same result

    start_time = time.time()
    print("Loading data...")
    # build_dataset returns the three splits; each is wrapped in its own iterator.
    train_data, dev_data, test_data = build_dataset(config)
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    model = x.Model(config).to(config.device)
    train(config, model, train_iter, dev_iter, test_iter)
예제 #21
0
# Import models.<model_name>; `model_name` and `dataset` are defined outside this section.
x = import_module('models.' + model_name)
# Configuration parameters
config = x.Config(dataset)

# The following values are fixed so that every run gives the same result
np.random.seed(1)
# Seed the CPU random number generator
torch.manual_seed(1)
# Seed the random number generator of every GPU
torch.cuda.manual_seed_all(1)
# With this flag set to True the convolution algorithm returned each time is deterministic, i.e. the default algorithm
torch.backends.cudnn.deterministic = True  # make every run give the same result

start_time = time.time()
print("Loading data...")
# In 'train' mode build_dataset returns a train and a dev split for each of the three tasks.
OCNLI_train, OCNLI_dev, OCEMOTION_train, OCEMOTION_dev, TNEWS_train, TNEWS_dev = build_dataset(
    config, mode='train')
# One iterator per task and split.
OCNLI_train_iter = build_iterator(OCNLI_train, config)
OCEMOTION_train_iter = build_iterator(OCEMOTION_train, config)
TNEWS_train_iter = build_iterator(TNEWS_train, config)
OCNLI_dev_iter = build_iterator(OCNLI_dev, config)
OCEMOTION_dev_iter = build_iterator(OCEMOTION_dev, config)
TNEWS_dev_iter = build_iterator(TNEWS_dev, config)

# The elapsed time is computed here but not printed in this section.
time_dif = get_time_dif(start_time)

# train
model = x.Model(config).to(config.device)
# The iterators are passed as (train, dev) pairs in the order OCNLI, OCEMOTION, TNEWS.
train(config, model, OCNLI_train_iter, OCNLI_dev_iter, OCEMOTION_train_iter,
      OCEMOTION_dev_iter, TNEWS_train_iter, TNEWS_dev_iter)
예제 #22
0
        final_result.append(dic)
    # 输出json文件
    import json
    with open(output_path, 'w') as f:
        for each in final_result:
            json_str = json.dumps(each)  # dumps
            f.write(json_str)
            f.write('\n')
        

# Script entry point: build the BERT model and write one submission file per task.
if __name__ == '__main__':
    dataset = '.'  # dataset

    model_name = 'bert'  # bert
    # Import the model module dynamically
    x = import_module('models.' + model_name)
    # Configuration parameters
    config = x.Config(dataset)

    # NOTE(review): no weights are loaded here; presumably submit_test restores
    # them from config -- confirm.
    model = x.Model(config).to(config.device)

    # In 'test' mode build_dataset returns one test split per task.
    OCNLI_test, OCEMOTION_test, TNEWS_test = build_dataset(config, mode='test')
    OCNLI_test_iter = build_iterator(OCNLI_test, config)
    OCEMOTION_test_iter = build_iterator(OCEMOTION_test, config)
    TNEWS_test_iter = build_iterator(TNEWS_test, config)
    # Submission for the first task; the last argument (0, 1, 2) differs per task.
    # NOTE(review): the attribute is spelled OCLI_submit_output_path (not OCNLI) --
    # confirm it matches the Config definition.
    submit_test(config, model, OCNLI_test_iter, config.OCLI_submit_output_path, 0)
    submit_test(config, model, OCEMOTION_test_iter, config.OCEMOTION_submit_output_path, 1)
    submit_test(config, model, TNEWS_test_iter, config.TNEWS_submit_output_path, 2)

예제 #23
0
#    start_time = time.time()
#    print("Loading data...")
#    train_data, dev_data, test_data = build_dataset(config)
#    train_iter = build_iterator(train_data, config)
#    dev_iter = build_iterator(dev_data, config)
#    test_iter = build_iterator(test_data, config)
#    time_dif = get_time_dif(start_time)
#    print("Time usage:", time_dif)

    # train
    for i in range(3,5):
        config.train_path = dataset + '/data/fold5/cvfold'+str(i)+'_train.txt'
        config.dev_path = dataset + '/data/fold5/cvfold'+str(i)+'_dev.txt' 
        config.test_path = dataset + '/data/fold5/cv_valid.txt'
        config.save_path = dataset + '/saved_dict/' + config.model_name + '512-5fold-'+str(i)+'.bin' 
        #if i==0 or i==1:
        #    config.num_epochs = 1
        submit_data = build_dataset(config)
        #train_iter = build_iterator(train_data, config)
        #dev_iter = build_iterator(dev_data, config)
        #test_iter = build_iterator(test_data, config)
        submit_iter = build_iterator(submit_data, config)    
        model = x.Model(config).to(config.device)
        test(config, model, submit_iter, 'bertdrop_submitb_'+str(i)+'.npy')
        #test(config, model, test_iter,'bertRNN_valid_'+str(i)+'.npy')
        #test(config, model, dev_iter, 'bertRNN_train_'+str(i)+'.npy')
        #model.load_state_dict(torch.load(config.save_path))
	
        #train(config, model, train_iter, dev_iter, test_iter)
예제 #24
0
    do_test = args.do_test
    if (do_train or do_test) == False:
        raise ValueError(
            'At lest one of `do_train` or `do_test` muest be True.')
    x = import_module('models.{}'.format(model_name))
    config = x.Config(dataset)
    np.random.seed(156)
    torch.cuda.manual_seed_all(1024)
    torch.backends.cudnn.deterministic = True

    start_time = time.time()
    print('Loading data...')
    train_data, dev_data, test_data = build_dataset(config, do_train, do_test)

    if do_train:
        train_iter = build_iterator(train_data, config)
        dev_iter = build_iterator(dev_data, config, do_dev=True)
    if do_test:
        test_iter = build_iterator(test_data, config, do_dev=True)

    time_dif = get_time_dif(start_time)

    model = x.Seq2SeqModel(config).to(config.device)

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    if do_train:
        train(config, model, train_iter, dev_iter)
    if do_test:
    # dataset = 'data/Intention2_V2'  # 数据集
    # dataset = 'data/Intention2_V2'  # 数据集
    # dataset = 'data/Intention135'  # 数据集
    dataset = args.data_dir  # 数据集
    task_name = args.task_name
    # model_dir = 'data/Intention2/saved_dict/bert.ckpt'

    model_name = args.model  # bert
    x = import_module('models.' + model_name)
    config = x.Config(dataset, task_name)
    # config.save_path = model_dir
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # 保证每次结果一样

    start_time = time.time()
    print("Loading data...")
    train_data, dev_data, test_data = build_dataset(config)

    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    model = x.Model(config).to(config.device)
    # train(config, model, train_iter, dev_iter, test_iter)

    out_result_file = dataset + '/result/model_name.result.txt'
    predict(config, model, test_iter, out_file=out_result_file)
예제 #26
0
from utils import build_dataset, build_iterator, get_time_dif, load_vocabulary, build_vocab

# Fix the torch seeds (CPU and every GPU) and request deterministic cuDNN behaviour.
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True
start_time = time.time()
print('加载数据...')
# Build the three vocabulary files under config.vocab_path; the intent
# vocabulary is built with pad=False and unk=False.
build_vocab(config.input_file, os.path.join(config.vocab_path, 'in_vocab'))
build_vocab(config.slot_file, os.path.join(config.vocab_path, 'slot_vocab'))
build_vocab(config.intent_file, os.path.join(config.vocab_path, 'intent_vocab'), pad=False, unk=False)
# Load the vocabularies back; each result is indexed with the key 'vocab' below.
in_vocab = load_vocabulary(os.path.join(config.vocab_path, 'in_vocab'))
slot_vocab = load_vocabulary(os.path.join(config.vocab_path, 'slot_vocab'))
intent_vocab = load_vocabulary(os.path.join(config.vocab_path, 'intent_vocab'))
train_data, dev_data, test_data = build_dataset(in_vocab['vocab'], slot_vocab['vocab'], intent_vocab['vocab'])

# One iterator per split.
train_iter = build_iterator(train_data)
dev_iter = build_iterator(dev_data)
test_iter = build_iterator(test_data)
time_dif = get_time_dif(start_time)
print('time usage:', time_dif)

# Store the input vocabulary size on the config before the model is built.
config.n_vocab = len(in_vocab['vocab'])

# `model_name` is defined outside this section; the module is imported by that name as-is.
x = import_module(model_name)
# The device is hard-coded to 'cuda' here rather than read from config.
model = x.Model(config).to(torch.device('cuda'))
init_network(model)
# Prints the bound `parameters` method itself (it is not called), not the parameter values.
print(model.parameters)

train(config, model, train_iter, dev_iter, test_iter)
# test(config, model, test_iter)
예제 #27
0
        if not hasattr(config, 'STLR'):
            setattr(config, 'STLR', True)
    else:
        setattr(config, 'STLR', False)

    # random initialize
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # 保证每次结果一样

    start_time = time.time()
    print("Loading data...")
    if not os.path.exists('./datasets'):
        os.makedirs('./datasets')
        build_dataset(config)
    train_iter = build_iterator('train', config)
    dev_iter = build_iterator('dev', config)
    test_iter = build_iterator('test', config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    model = model.Model(config).to(config.device)
    train(config,
          model,
          train_iter,
          dev_iter,
          test_iter,
          save_loss=save_loss_flag)
예제 #28
0
파일: test.py 프로젝트: cscyuge/seq2seq
def main():
    """Evaluate the saved seq2seq and DCMN models and print the scores.

    Both checkpoints are loaded from ``./backup``. For every evaluation
    batch the DCMN model scores the candidate batches, ``remove_unk``
    rewrites the sources with those choices, and the seq2seq model decodes
    them. The raw decoded results are pickled to ``./outs/outs-new.pkl``,
    the cleaned lower-cased sentences are written to
    ``./result/tmp.out.txt`` and the metrics from ``get_score`` are printed.
    """
    config = DCMN_Config()
    eval_seq_dataset, eval_dcmn_dataset = build_dataset_eval(config)
    eval_dataloader = build_iterator(eval_seq_dataset, eval_dcmn_dataset,
                                     config)
    # Only the model is needed for evaluation; the optimizer, scheduler and
    # loss function returned alongside it are ignored.
    seq2seq, _, _, _ = build_seq2seq(config, 768, config.no_cuda)
    dcmn = BertForMultipleChoiceWithMatch.from_pretrained(
        config.bert_model, num_choices=config.num_choices)
    dcmn.to(config.dcmn_device)

    # NOTE(review): map_location is hard-coded to cuda:2 for both
    # checkpoints; confirm that device exists where this is run.
    seq_checkpoint = torch.load('./backup/bert/best_save.data',
                                map_location=torch.device('cuda:2'))
    seq2seq.load_state_dict(seq_checkpoint['para'])

    dcmn_checkpoint = torch.load('./backup/dcmn/best_save.data',
                                 map_location=torch.device('cuda:2'))
    dcmn.load_state_dict(dcmn_checkpoint['dcmn_para'])

    dcmn.eval()
    seq2seq.eval()

    results = []
    for seq_batches, dcmn_batches in tqdm(eval_dataloader,
                                          desc="Evaluating"):
        # Transpose the batch: one list per field instead of one per sample.
        seq_srcs, _seq_tars, _cudics, k_cs = [
            [sample[field] for sample in seq_batches] for field in range(4)
        ]
        outs = []

        # Inference only: keep autograd off for both models so that no
        # computation graph is retained.
        with torch.no_grad():
            if len(dcmn_batches) > 0:
                for start in range(0, len(dcmn_batches), config.batch_size):
                    chunk = dcmn_batches[start:start + config.batch_size]
                    (input_ids, input_mask, segment_ids, doc_len, ques_len,
                     option_len, labels) = [
                         torch.LongTensor([sample[field] for sample in chunk
                                           ]).to(config.dcmn_device)
                         for field in range(7)
                     ]

                    logits = dcmn(input_ids, segment_ids, input_mask,
                                  doc_len, ques_len, option_len)
                    # Highest-scoring choice for each row of the chunk.
                    outs.extend(
                        np.argmax(logits.detach().cpu().numpy(), axis=1))

            seq_srcs = remove_unk(seq_srcs, outs, k_cs)
            src_ids, src_masks = seq_tokenize(seq_srcs, config)
            decoder_outputs, decoder_hidden, ret_dict = seq2seq(
                [src_ids, src_masks], src_ids, 0.0, False)

        symbols = ret_dict['sequence']
        symbols = torch.cat(symbols, 1).data.cpu().numpy()
        results.extend(decode_sentence(symbols, config))

    with open('./outs/outs-new.pkl', 'wb') as f:
        pickle.dump(results, f)

    # Drop the mask tokens and re-attach punctuation that was split off by
    # surrounding spaces.
    sentences = []
    for words in results:
        words = words.replace('[MASK] ', '')
        words = words.replace(' - ', '-').replace(' . ',
                                                  '.').replace(' / ', '/')
        sentences.append(words.strip())

    with open('./result/tmp.out.txt', 'w', encoding='utf-8') as f:
        f.writelines([x.lower() + '\n' for x in sentences])
    # get_score is called only after the output file has been closed.
    bleu, hit, com, ascore = get_score()
    print('bleu:{}, hit:{}, com:{}, ascore:{}'.format(bleu, hit, com, ascore))
예제 #29
0
    test_inputs = load_dataset(df_test, test_categories, config.pad_size)
    gkf = GroupKFold(n_splits=5).split(X=df_train.q2, groups=df_train.id)

    valid_preds = []
    test_preds = []
    print("一共"+str(df_train.shape[0])+"个训练语句")
    oof = np.zeros((len(df_train), 1))
    for fold, (train_idx, valid_idx) in enumerate(gkf):
        # if fold<10:
        #     continue
        model = x.Model(config).to(config.device)
        print("Loading " + str(fold + 1) + " fold data...")
        train_idx = shuffle(train_idx)
        train_inputs = [datas[i] for i in train_idx]
        valid_inputs = [datas[i] for i in valid_idx]
        train_iter = build_iterator(train_inputs, config.batch_size, config)
        dev_iter = build_iterator(valid_inputs, config.test_batch, config)
        test_iter = build_iterator(test_inputs, config.test_batch, config)
        valid_outputs = np.array([], dtype=int)
        for d, (text, labels) in enumerate(dev_iter):
            valid_outputs = np.append(valid_outputs, labels.data.cpu().numpy())
        time_dif = get_time_dif(start_time)
        print("Time usage:", time_dif)
        train(config, model, train_iter, dev_iter, fold)
        oof_p = predict(config, model, dev_iter, fold, activation='softmax')
        oof[valid_idx] = oof_p
        valid_preds.append(oof_p)

        f1, t = search_f1(valid_outputs, valid_preds[-1])
        print('validation score = ', f1)
        each_fold_predict = predict(config, model, test_iter, fold, activation='softmax')
예제 #30
0
파일: run.py 프로젝트: cscyuge/seq2seq
def build_dcmn():
    """Create the DCMN model with its data loaders and optimisation objects.

    The train and eval datasets are read from pickle files under ``./data``
    that are expected to exist already (the original code kept the
    ``build_dataset`` / ``build_dataset_eval`` + ``pickle.dump`` calls that
    produce them commented out).

    Returns:
        Tuple ``(model, config, train_dataloader, eval_dataloader,
        optimizer, scheduler, loss_fun)``.

    Raises:
        ValueError: If the evaluation output file already exists and is not
            named ``output_test.txt``.
    """
    config = DCMN_Config()

    output_eval_file = os.path.join(config.output_dir, config.output_file)

    if os.path.exists(
            output_eval_file) and config.output_file != 'output_test.txt':
        raise ValueError(
            "Output file ({}) already exists and is not empty.".format(
                output_eval_file))
    # Start the result file with a timestamp and a dump of the configuration.
    with open(output_eval_file, "w") as writer:
        writer.write(
            "***** Eval results Epoch  %s *****\t\n" %
            (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
        dic = str([(name, value) for name, value in vars(config).items()])
        writer.write("%s\t\n" % dic)

    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if not config.no_cuda > 0:
        torch.cuda.manual_seed_all(config.seed)

    # NOTE(review): pickle.load can execute arbitrary code; these files must
    # come from a trusted source.
    with open('./data/train_seq_dataset.pkl', 'rb') as f:
        train_seq_dataset = pickle.load(f)
    with open('./data/train_dcmn_dataset.pkl', 'rb') as f:
        train_dcmn_dataset = pickle.load(f)
    train_dataloader = build_iterator(train_seq_dataset, train_dcmn_dataset,
                                      config)

    with open('./data/eval_seq_dataset.pkl', 'rb') as f:
        eval_seq_dataset = pickle.load(f)
    with open('./data/eval_dcmn_dataset.pkl', 'rb') as f:
        eval_dcmn_dataset = pickle.load(f)
    eval_dataloader = build_iterator(eval_seq_dataset, eval_dcmn_dataset,
                                     config)

    # Step count for the seq2seq side, stored on the config.
    num_train_steps = int(
        len(train_seq_dataset) / config.batch_size /
        config.gradient_accumulation_steps * config.num_train_epochs)
    config.t_total = num_train_steps

    # Count the DCMN optimizer steps of one epoch: every group of DCMN
    # samples is processed in chunks of config.batch_size, rounded up.
    # NOTE(review): this walks train_dataloader once; it assumes the loader
    # can be iterated again afterwards -- confirm.
    dcmn_t_total = 0
    for _, dcmn_batches in train_dataloader:
        full_chunks, remainder = divmod(len(dcmn_batches), config.batch_size)
        dcmn_t_total += full_chunks + (1 if remainder > 0 else 0)
    dcmn_t_total *= config.num_train_epochs

    model = BertForMultipleChoiceWithMatch.from_pretrained(
        config.bert_model, num_choices=config.num_choices)
    model.to(config.dcmn_device)

    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    # Biases and LayerNorm parameters are excluded from weight decay.
    # 'gamma'/'beta' are kept for older checkpoints' parameter names;
    # 'LayerNorm.weight' covers the current naming.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # The per-group key must be 'weight_decay': an unknown key such as
    # 'weight_decay_rate' is ignored and the optimizer default (0.0) is used
    # for every parameter.
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]

    optimizer = AdamW(params=optimizer_grouped_parameters,
                      lr=config.dcmn_learning_rate,
                      correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(config.dcmn_warmup_proportion * dcmn_t_total),
        num_training_steps=dcmn_t_total)  # PyTorch scheduler

    loss_fun = torch.nn.CrossEntropyLoss()

    return (model, config, train_dataloader, eval_dataloader, optimizer,
            scheduler, loss_fun)