Example #1
def main():
    config = Config()
    print("loading data...")
    ids, data, labels = bd.load_data("./corpus/seg_train.txt")
    count, dict_word2index, dict_index2word = bd.build_vocabulary(data, min_count=config.min_count)
    print("save word2index and index2word")
    bde.save_dict(dict_word2index, config.word2index_path)
    bde.save_dict(dict_index2word, config.index2word_path)
    print("load word2index and index2word")
    print(bde.load_pickle(config.word2index_path))
    print(bde.load_pickle(config.index2word_path))
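
The save_dict / load_pickle helpers (imported as bde or bpe throughout these examples) are external to the snippets; a minimal sketch, assuming plain pickle serialization as the function names suggest:

import pickle

def save_dict(dict_obj, path):
    # serialize a dict (e.g. word2index) to disk
    with open(path, "wb") as f:
        pickle.dump(dict_obj, f)

def load_pickle(path):
    # load a previously pickled object back into memory
    with open(path, "rb") as f:
        return pickle.load(f)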
Example #2
def main(task2_model_id, task2_model_path):
    multi_config = MultiConfig()
    multi_config.is_training = False
    multi_config.dropout_rate = 0.0

    print("loading data...")
    dict_word2index = bpe.load_pickle(multi_config.word2index_path)
    tests_id, test_data = bd.load_test_data(multi_config.test_path)
    if task2_model_id != 4:
        test_X = bd.build_test_data(test_data, dict_word2index, multi_config.max_text_len)
    else:
        test_X = bd.build_test_data_HAN(test_data, dict_word2index, multi_config.num_sentences, multi_config.sequence_length)

    testset = MingLueTestData(test_X)
    test_loader = DataLoader(dataset=testset,
                             batch_size=multi_config.batch_size,
                             shuffle=False,
                             num_workers=multi_config.num_workers)
    
    multi_config.vocab_size = len(dict_word2index)
    print("loading model...")
    model2 = load_multi_model(task2_model_path, task2_model_id, multi_config)
    
    print("model loaded")

    print("predicting...")
    predicted_multi_labels = predict_multi_label(test_loader, model2, multi_config)
    generate_result_json(tests_id, predicted_multi_labels, multi_config.result_path)
Example #3
def main(task1_model_id, task1_model_path):
    config = Config()
    multi_config = MultiConfig()
    config.is_training = False
    config.dropout_rate = 0.0
    multi_config.is_training = False
    multi_config.dropout_rate = 0.0
    # model_id = int(input("Please select a model(input model id):\n0: fastText\n1: TextCNN\n2: TextRCNN\nInput: "))

    print("loading data...")
    dict_word2index = bpe.load_pickle(config.word2index_path)
    tests_id, test_data = bd.load_test_data(config.test_path)
    if task1_model_id != 4:
        test_X = bd.build_test_data(test_data, dict_word2index,
                                    config.max_text_len)
    else:
        test_X = bd.build_test_data_HAN(test_data, dict_word2index,
                                        config.num_sentences,
                                        config.sequence_length)
    testset = MingLueTestData(test_X)
    test_loader = DataLoader(dataset=testset,
                             batch_size=config.batch_size,
                             shuffle=False,
                             num_workers=config.num_workers)

    config.vocab_size = len(dict_word2index)
    multi_config.vocab_size = len(dict_word2index)
    print("loading model...")
    model1 = load_model(task1_model_path, task1_model_id, config)

    print("model loaded")

    print("predicting...")
    predicted_labels = predict(test_loader, model1, config.has_cuda)
    predicted_multi_labels = [[]]  # task-2 labels are not predicted here; pass an empty placeholder
    generate_result_json(tests_id, predicted_labels, predicted_multi_labels,
                         config.result_path)
Example #4
def main(rcnn_model_path, han_model_path):
    config = Config()
    config.is_training = False
    config.dropout_rate = 0.0

    print("loading data...")
    dict_word2index = bpe.load_pickle(config.word2index_path)
    tests_id, test_data = bd.load_test_data(config.test_path)
    test_X = bd.build_test_data(test_data, dict_word2index, config.max_text_len)

    testset = MingLueTestData(test_X)
    test_loader = DataLoader(dataset=testset,
                             batch_size=config.batch_size,
                             shuffle=False,
                             num_workers=config.num_workers)

    test_X_HAN = bd.build_test_data_HAN(test_data, dict_word2index, config.num_sentences, config.sequence_length)

    testset = MingLueTestData(test_X_HAN)
    test_loader_HAN = DataLoader(dataset=testset,
                                 batch_size=config.batch_size,
                                 shuffle=False,
                                 num_workers=config.num_workers)

    
    config.vocab_size = len(dict_word2index)
    print("loading model...")

    rcnn_model = load_model(rcnn_model_path, 2, config)
    han_model = load_model(han_model_path, 4, config)
    print("model loaded")

    print("predicting...")
    predicted_labels = predict(test_loader, test_loader_HAN, rcnn_model, han_model, config)

    generate_result_json(tests_id, predicted_labels, config.result_path)
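
The two-model predict used above is not shown in the snippet. A minimal sketch, assuming each loader yields batches of token-id tensors in matching order (both use shuffle=False) and that the ensemble sums the two models' softmax outputs; written against the same pre-0.4 PyTorch API (Variable) as the surrounding code:

import torch
import torch.nn.functional as F
from torch.autograd import Variable

def predict(test_loader, test_loader_HAN, rcnn_model, han_model, config):
    # hypothetical ensemble: add the two models' class probabilities
    # and take the arg-max per example
    rcnn_model.eval()
    han_model.eval()
    predicted_labels = []
    for texts, texts_han in zip(test_loader, test_loader_HAN):
        if config.has_cuda:
            texts, texts_han = texts.cuda(), texts_han.cuda()
        inputs = Variable(texts, volatile=True)
        inputs_han = Variable(texts_han, volatile=True)
        probs = F.softmax(rcnn_model(inputs), dim=1) + \
                F.softmax(han_model(inputs_han), dim=1)
        _, batch_pred = torch.max(probs.data, 1)
        predicted_labels.extend(batch_pred.cpu().numpy().tolist())
    return predicted_labels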
Example #5
def main(model_id, use_element, is_save):
    config = Config()
    print("epoch num: ", config.epoch_num)
    print("loading data...")
    ids, data, labels = bd.load_data(config.data_path)
    total_vocab_size = sd.count_vocab_size(data)
    print("total vocab size", total_vocab_size)
    force = config.force_word2index
    if not force and os.path.exists(config.index2word_path) and os.path.exists(
            config.word2index_path):
        print("load word2index")
        dict_word2index = bpe.load_pickle(config.word2index_path)
        print(dict_word2index['<UNK>'], dict_word2index['<PAD>'])
    else:
        print("save word2index and index2word")
        count, dict_word2index, dict_index2word = bd.build_vocabulary(
            data, min_count=config.min_count)
        bpe.save_dict(dict_index2word, config.index2word_path)
        bpe.save_dict(dict_word2index, config.word2index_path)
        return

    if is_save == 'y':
        if model_id == 4:
            print("save HAN...")
            train_data, train_labels = bd.build_data_set_HAN(
                data,
                labels,
                dict_word2index,
                num_sentences=config.num_sentences,
                sequence_length=config.sequence_length)
            print(np.shape(train_data), np.shape(train_labels))
            print(len(ids))
            dataset = MingLueData(ids, train_data, train_labels)
        else:
            # non-HAN path reconstructed following Example #8's use of
            # bd.build_dataset; the original snippet left `dataset`
            # undefined for model_id != 4
            all_train_ids, all_train_X, all_train_y = bd.build_dataset(
                ids, data, labels, dict_word2index,
                max_text_len=config.max_text_len)
            dataset = MingLueData(all_train_ids, all_train_X, all_train_y)
    else:
        if model_id == 4:
            train_data, train_labels = bd.build_data_set_HAN(
                data,
                labels,
                dict_word2index,
                num_sentences=config.num_sentences,
                sequence_length=config.sequence_length)
            train_ids, valid_ids = bd.split_data(ids, radio=0.9)
            train_X, valid_X = bd.split_data(train_data, radio=0.9)
            train_y, valid_y = bd.split_data(train_labels, radio=0.9)
        else:
            # non-HAN path reconstructed to mirror Example #8; absent
            # from the original snippet, which only handled model_id == 4
            train_ids, valid_ids = bd.split_data(ids, radio=0.9)
            train_data, valid_data = bd.split_data(data, radio=0.9)
            train_labels, valid_labels = bd.split_data(labels, radio=0.9)
            train_ids, train_X, train_y = bd.build_dataset(
                train_ids, train_data, train_labels, dict_word2index,
                max_text_len=config.max_text_len)
            valid_ids, valid_X, valid_y = bd.build_dataset(
                valid_ids, valid_data, valid_labels, dict_word2index,
                max_text_len=config.max_text_len)
        print("trainset size:", len(train_ids))
        print("validset size:", len(valid_ids))
        dataset = MingLueData(train_ids, train_X, train_y)
    del data
    batch_size = config.batch_size
    if model_id == 4:
        batch_size = config.han_batch_size
    train_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,  # adjustable so different models can use different batch sizes
        shuffle=True,
        num_workers=config.num_workers)
    if is_save != 'y':
        dataset = MingLueData(valid_ids, valid_X, valid_y)
        valid_loader = DataLoader(
            dataset=dataset,
            batch_size=batch_size,  # adjustable so different models can use different batch sizes
            shuffle=False,
            num_workers=config.num_workers)
    print("data loaded")
    config.vocab_size = len(dict_word2index)
    print('config vocab size:', config.vocab_size)
    model = model_selector(config, model_id, use_element)
    if config.has_cuda:
        model = model.cuda()

    loss_weight = torch.FloatTensor(config.loss_weight_value)
    loss_weight = loss_weight + 1 - loss_weight.mean()
    print("loss weight:", loss_weight)
    loss_fun = nn.CrossEntropyLoss(
        loss_weight.cuda() if config.has_cuda else loss_weight)
    optimizer = model.get_optimizer(config.learning_rate,
                                    config.learning_rate2, config.weight_decay)
    print("training...")

    weight_count = 0
    max_score = 0
    total_loss_weight = torch.zeros(8)
    for epoch in range(config.epoch_num):
        print("lr:", config.learning_rate, "lr2:", config.learning_rate2)
        running_loss = 0.0
        running_acc = 0.0
        for i, data in enumerate(train_loader, 0):
            ids, texts, labels = data
            if config.has_cuda:
                inputs, labels = Variable(texts.cuda()), Variable(
                    labels.cuda())
            else:
                inputs, labels = Variable(texts), Variable(labels)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fun(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.data[0]
            if i % config.step == config.step - 1:
                if epoch % config.epoch_step == config.epoch_step - 1:
                    _, predicted = torch.max(outputs.data, 1)
                    predicted = predicted.cpu().numpy().tolist()
                    running_acc = accuracy(predicted,
                                           labels.data.cpu().numpy())
                    print('[%d, %5d] loss: %.3f, acc: %.3f' %
                          (epoch + 1, i + 1, running_loss / config.step,
                           running_acc))
                running_loss = 0.0

        if is_save != 'y' and epoch % config.epoch_step == config.epoch_step - 1:
            print("predicting...")
            loss_weight, score = do_eval(valid_loader, model, model_id,
                                         config.has_cuda)
            if score >= 0.478 and score > max_score:
                max_score = score
                save_path = config.model_path + "." + str(
                    score) + "." + config.model_names[model_id]
                torch.save(model.state_dict(), save_path)

            if epoch >= 3:
                weight_count += 1
                total_loss_weight += loss_weight
                print("avg_loss_weight:", total_loss_weight / weight_count)

        if epoch >= config.begin_epoch - 1:
            if epoch >= config.begin_epoch and config.learning_rate2 == 0:
                config.learning_rate2 = 2e-4
            elif config.learning_rate2 > 0:
                config.learning_rate2 *= config.lr_decay
                if config.learning_rate2 <= 1e-5:
                    config.learning_rate2 = 1e-5
            config.learning_rate = config.learning_rate * config.lr_decay
            optimizer = model.get_optimizer(config.learning_rate,
                                            config.learning_rate2,
                                            config.weight_decay)
    time_stamp = str(int(time.time()))

    if is_save == "y":
        if use_element:
            save_path = config.model_path + "." + time_stamp + ".use_element." + config.model_names[
                model_id]
        else:
            save_path = config.model_path + "." + time_stamp + "." + config.model_names[
                model_id]
        torch.save(model.state_dict(), save_path)
    else:
        print("not save")
Example #6
def main(model_id, use_element, is_save):
    config = MultiConfig()
    print("epoch num", config.epoch_num)
    config.use_element = use_element
    print("loading data...")
    ids, data, labels = bmd.load_data(config.data_path)
    #    sd.show_text_len_distribution(data)
    #    sd.show_label_text_len_distribution(labels, data)
    total_vocab_size = sd.count_vocab_size(data)
    print("total vocab size", total_vocab_size)
    force = config.force_word2index
    if not force and os.path.exists(config.index2word_path) and os.path.exists(
            config.word2index_path):
        print("load word2index")
        dict_word2index = bpe.load_pickle(config.word2index_path)
    else:
        print("save word2index and index2word")
        count, dict_word2index, dict_index2word = bmd.build_vocabulary(
            data, min_count=config.min_count)
        bpe.save_dict(dict_index2word, config.index2word_path)
        bpe.save_dict(dict_word2index, config.word2index_path)
        return


#    train_ids, train_X, train_y = bd.over_sample(train_ids, train_X, train_y)
#    print(train_y.shape[0], Counter(train_y))
    if is_save == 'y':
        if model_id != 4:
            all_train_ids, all_train_X, all_train_y = bmd.build_dataset(
                ids, data, labels, dict_word2index, config.max_text_len,
                config.num_class)
            dataset = MingLueMultiData(all_train_ids, all_train_X, all_train_y)
            # dataset = MingLueMultiData(valid_ids, valid_X, valid_y)
        else:
            train_data, train_labels = bmd.build_data_set_HAN(
                data,
                labels,
                dict_word2index,
                num_sentences=config.num_sentences,
                sequence_length=config.sequence_length,
                num_class=config.num_class)
            print("save HAN...")
            dataset = MingLueMultiData(ids, train_data, train_labels)
            print(np.shape(train_data), np.shape(train_labels))
            print(len(ids))
    else:
        if model_id == 4:
            train_data, train_labels = bmd.build_data_set_HAN(
                data,
                labels,
                dict_word2index,
                num_sentences=config.num_sentences,
                sequence_length=config.sequence_length,
                num_class=config.num_class)
            train_ids, valid_ids = bmd.split_data(ids, radio=0.9)
            train_X, valid_X = bmd.split_data(train_data, radio=0.9)
            train_y, valid_y = bmd.split_data(train_labels, radio=0.9)
        else:
            train_ids, valid_ids = bmd.split_data(ids, radio=0.9)
            train_data, valid_data = bmd.split_data(data, radio=0.9)
            train_labels, valid_labels = bmd.split_data(labels, radio=0.9)
            train_ids, train_X, train_y = bmd.build_dataset(
                train_ids, train_data, train_labels, dict_word2index,
                config.max_text_len, config.num_class)
            valid_ids, valid_X, valid_y = bmd.build_dataset(
                valid_ids, valid_data, valid_labels, dict_word2index,
                config.max_text_len, config.num_class)
        print("trainset size:", len(train_ids))
        print("validset size:", len(valid_ids))
        dataset = MingLueMultiData(train_ids, train_X, train_y)

    batch_size = config.batch_size
    if model_id == 4:
        batch_size = config.han_batch_size

    del data

    train_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,  # adjustable so different models can use different batch sizes
        shuffle=True,
        num_workers=config.num_workers)
    if is_save != 'y':
        dataset = MingLueMultiData(valid_ids, valid_X, valid_y)
        valid_loader = DataLoader(
            dataset=dataset,
            batch_size=batch_size,  # adjustable so different models can use different batch sizes
            shuffle=False,
            num_workers=config.num_workers)
    if model_id == 5 or model_id == 6:  # cnn and rcnn with doc2vec
        dmpv_model, dbow_model = gdv.load_doc2vec_model(
            config.dmpv_model_path, config.dbow_model_path)
    print("data loaded")

    config.vocab_size = len(dict_word2index)
    print('config vocab size:', config.vocab_size)
    model = model_selector(config, model_id, use_element)
    if config.has_cuda:
        model = model.cuda()
    if use_element:
        all_element_vector = bpe.load_pickle(config.element_vector_path)

    loss_weight = torch.FloatTensor(config.loss_weight)
    print(loss_weight.mean())
    loss_weight = 1 + 2 * (loss_weight.mean() - loss_weight)

    #loss_fun = nn.MultiLabelSoftMarginLoss(loss_weight.cuda())

    loss_fun = nn.MultiLabelSoftMarginLoss()
    #    optimizer = optim.Adam(model.parameters(),lr=config.learning_rate, weight_decay=config.weight_decay)
    optimizer = model.get_optimizer(config.learning_rate,
                                    config.learning_rate2, config.weight_decay)
    print("training...")

    weight_count = 0
    max_score = 0
    for epoch in range(config.epoch_num):
        print("lr:", config.learning_rate, "lr2:", config.learning_rate2)
        running_loss = 0.0
        running_jaccard = 0.0
        for i, data in enumerate(train_loader, 0):
            ids, texts, labels = data
            # TODO
            if model_id == 4:
                pass
            if config.has_cuda:
                inputs, labels = Variable(texts.cuda()), Variable(
                    labels.cuda())
            else:
                inputs, labels = Variable(texts), Variable(labels)
            optimizer.zero_grad()
            if model_id == 5 or model_id == 6:  # cnn and rcnn with doc2vec
                doc2vec = gdv.build_doc2vec(ids, dmpv_model, dbow_model)
                if config.has_cuda:
                    doc2vec = Variable(torch.FloatTensor(doc2vec).cuda())
                else:
                    doc2vec = Variable(torch.FloatTensor(doc2vec))
                # [batch_size, (doc2vec_size*2)]
                # print(doc2vec.size())
                outputs = model(inputs, doc2vec)
            elif use_element:
                element_vec = build_element_vec(ids, all_element_vector)
                if config.has_cuda:
                    element_vec = Variable(
                        torch.LongTensor(element_vec).cuda())
                else:
                    element_vec = Variable(torch.LongTensor(element_vec))
                outputs = model(inputs, element_vec)
            else:
                outputs = model(inputs)
            loss = loss_fun(outputs,
                            labels.float())  # or weight *labels.float()
            loss.backward()
            optimizer.step()

            running_loss += loss.data[0]

            if i % config.step == config.step - 1:
                if epoch % config.epoch_step == config.epoch_step - 1:
                    predicted_labels = get_multi_label_from_output(
                        outputs, config)

                    true_label = labels.data.cpu().numpy()
                    rows, true_label = np.where(true_label == 1)
                    true_label = where_result_reshape(outputs.size()[0], rows,
                                                      true_label)

                    running_jaccard = cs.jaccard(predicted_labels, true_label)
                    print('[%d, %5d] loss: %.3f, jaccard: %.3f' %
                          (epoch + 1, i + 1, running_loss / config.step,
                           running_jaccard))
                running_loss = 0.0

        if is_save != 'y' and epoch % config.epoch_step == config.epoch_step - 1:
            print("predicting...")
            if model_id == 5 or model_id == 6:
                score = do_eval(valid_loader, model, model_id, config,
                                dmpv_model, dbow_model)
            else:
                score = do_eval(valid_loader, model, model_id, config)
                if epoch >= 5:
                    config.max_prob = 0.55
                    print("max prob:", config.max_prob)
                    score_2 = do_eval(valid_loader, model, model_id, config)
                    config.max_prob = 0.45
                    print("max prob:", config.max_prob)
                    score_3 = do_eval(valid_loader, model, model_id, config)
                    if score >= 0.788 and score > max_score:
                        max_score = score
                        save_path = config.model_path + "." + str(
                            score) + ".multi." + config.model_names[model_id]
                        torch.save(model.state_dict(), save_path)
            if epoch >= 3:
                weight_count += 1
            #    total_loss_weight += loss_weight
            #    print("avg_loss_weight:",total_loss_weight/weight_count)

        if epoch >= config.begin_epoch - 1:
            if epoch >= config.begin_epoch and config.learning_rate2 == 0:
                config.learning_rate2 = 2e-4
            elif config.learning_rate2 > 0:
                config.learning_rate2 *= config.lr_decay
                if config.learning_rate2 <= 1e-5:
                    config.learning_rate2 = 1e-5
            config.learning_rate = config.learning_rate * config.lr_decay
            optimizer = model.get_optimizer(config.learning_rate,
                                            config.learning_rate2,
                                            config.weight_decay)
    time_stamp = str(int(time.time()))

    if is_save == "y":
        if use_element:
            save_path = config.model_path + "." + time_stamp + ".multi.use_element." + config.model_names[
                model_id]
        else:
            save_path = config.model_path + "." + time_stamp + ".multi." + config.model_names[
                model_id]
        torch.save(model.state_dict(), save_path)
    else:
        print("not save")
Example #7
def build_vocabulary(data, min_count=1):
    # signature reconstructed from the body and the call in __main__;
    # the default min_count value is an assumption
    stopwords_data_path = "data/stopwords.txt"
    # stopwords_data_df = pd.read_csv(stopwords_data_path, encoding="utf-8", sep="\t", index_col=None, quoting=3,names=["stopword"])

    # add <PAD> for embedding
    word_count = [('<UNK>', -1), ('<PAD>', -1)]
    counter = Counter(data)
    counter_list = counter.most_common()
    for word, count in counter_list:
        if count >= min_count:
            word_count.append((word, count))
    dict_word2index = dict()
    for word, _ in word_count:
        dict_word2index[word] = len(dict_word2index)
    dict_index2word = dict(
        zip(dict_word2index.values(), dict_word2index.keys()))
    print("vocab size:", len(word_count))
    return word_count, dict_word2index, dict_index2word


if __name__ == "__main__":
    config = Config()
    data = ['公诉', '机关', '莆田市', '荔城区', '荔城区', '荔城区']
    word_count, dict_word2index, dict_index2word = build_vocabulary(data)
    bde.save_dict(dict_word2index, config.word2index_path)
    bde.save_dict(dict_index2word, config.index2word_path)

    print(bde.load_pickle(config.word2index_path))
    print(bde.load_pickle(config.index2word_path))
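
With the reconstructed default min_count=1, the toy corpus above yields the following mapping (Counter.most_common breaks ties in insertion order on CPython 3.7+), matching the <UNK>/<PAD> indices the other examples print:

# word_count:      [('<UNK>', -1), ('<PAD>', -1), ('荔城区', 3),
#                    ('公诉', 1), ('机关', 1), ('莆田市', 1)]
# dict_word2index: {'<UNK>': 0, '<PAD>': 1, '荔城区': 2,
#                   '公诉': 3, '机关': 4, '莆田市': 5}
# dict_index2word: {0: '<UNK>', 1: '<PAD>', 2: '荔城区',
#                   3: '公诉', 4: '机关', 5: '莆田市'}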
Example #8
File: train.py, Project: shenfuli/ML_3
def main(model_id, use_element, is_save):
    config = Config()
    print("epoch num: ", config.epoch_num)
    config.use_element = use_element
    print("loading data...")
    # raw data, split into three lists: id, data, label
    ids, data, labels = bd.load_data(config.data_path)
    train_ids, valid_ids = bd.split_data(ids, radio=0.7)
    train_data, valid_data = bd.split_data(data, radio=0.7)
    train_labels, valid_labels = bd.split_data(labels, radio=0.7)

    # count the total number of distinct words in the data
    total_vocab_size = sd.count_vocab_size(data)
    print("total vocab size", total_vocab_size)
    print("load word2index")
    dict_word2index = bpe.load_pickle(config.word2index_path)
    # print(len(dict_word2index))

    train_ids, train_X, train_y = bd.build_dataset(
        train_ids,
        train_data,
        train_labels,
        dict_word2index,
        max_text_len=config.max_text_len)
    print(train_ids[0:4])
    print(train_X[0:4])
    print(train_y[0:4])
    valid_ids, valid_X, valid_y = bd.build_dataset(
        valid_ids,
        valid_data,
        valid_labels,
        dict_word2index,
        max_text_len=config.max_text_len)
    print("trainset size:", len(train_ids))
    print("validset size:", len(valid_ids))

    dataset_train = MingLueData(train_ids, train_X, train_y)
    dataset_valid = MingLueData(valid_ids, valid_X, valid_y)

    batch_size = config.batch_size
    train_loader = DataLoader(dataset=dataset_train,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=config.num_workers)

    valid_loader = DataLoader(dataset=dataset_valid,
                              batch_size=batch_size,
                              shuffle=False,  # validation needs no shuffling
                              num_workers=config.num_workers)

    config.vocab_size = len(dict_word2index)
    print('config vocab size:', config.vocab_size)
    model = model_selector(config, model_id, use_element)
    if config.has_cuda:
        model = model.cuda()

    loss_weight = torch.FloatTensor(config.loss_weight_value)
    loss_weight = loss_weight + 1 - loss_weight.mean()
    print("loss weight:", loss_weight)
    loss_fun = nn.CrossEntropyLoss(
        loss_weight.cuda() if config.has_cuda else loss_weight)
    optimizer = model.get_optimizer(config.learning_rate,
                                    config.learning_rate2, config.weight_decay)
    print("training...")
    weight_count = 0
    max_score = 0
    total_loss_weight = torch.zeros(8)
    for epoch in range(config.epoch_num):
        print("lr:", config.learning_rate, "lr2:", config.learning_rate2)
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            ids, texts, labels = data
            if config.has_cuda:
                inputs, labels = Variable(texts.cuda()), Variable(
                    labels.cuda())
            else:
                inputs, labels = Variable(texts), Variable(labels)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fun(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.data[0]

            if i % config.step == config.step - 1:
                if epoch % config.epoch_step == config.epoch_step - 1:
                    _, predicted = torch.max(outputs.data, 1)
                    predicted = predicted.cpu().numpy().tolist()
                    running_acc = accuracy(predicted,
                                           labels.data.cpu().numpy())
                    print('[%d, %5d] loss: %.3f, acc: %.3f' %
                          (epoch + 1, i + 1, running_loss / config.step,
                           running_acc))
                running_loss = 0.0

        if is_save != 'y' and epoch % config.epoch_step == config.epoch_step - 1:
            print("predicting...")
            loss_weight, score = do_eval(valid_loader, model, model_id,
                                         config.has_cuda)
            if score >= 0.478 and score > max_score:
                max_score = score
                save_path = config.model_path + "." + str(
                    score) + "." + config.model_names[model_id]
                torch.save(model.state_dict(), save_path)

            if epoch >= 3:
                weight_count += 1
                total_loss_weight += loss_weight
                print("avg_loss_weight:", total_loss_weight / weight_count)

        if epoch >= config.begin_epoch - 1:
            if epoch >= config.begin_epoch and config.learning_rate2 == 0:
                config.learning_rate2 = 2e-4
            elif config.learning_rate2 > 0:
                config.learning_rate2 *= config.lr_decay
                if config.learning_rate2 <= 1e-5:
                    config.learning_rate2 = 1e-5
            config.learning_rate = config.learning_rate * config.lr_decay
            optimizer = model.get_optimizer(config.learning_rate,
                                            config.learning_rate2,
                                            config.weight_decay)
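
accuracy and do_eval come from the project's utilities and are not part of the snippet; a minimal sketch of accuracy, assuming it is the plain fraction of matching predictions:

import numpy as np

def accuracy(predicted, true_labels):
    # hypothetical metric: fraction of positions where prediction == label
    predicted = np.asarray(predicted)
    true_labels = np.asarray(true_labels)
    return float((predicted == true_labels).mean())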
Example #9
def main(model_id, use_element, is_save):
    config = Config()
    print("epoch num: ", config.epoch_num)
    config.use_element = use_element
    # model_id = int(input("Please select a model(input model id):\n0: fastText\n1: TextCNN\n2: TextRCNN\n4: HAN\nInput: "))
    # is_save = input("Save Model?(y/n): ")
    print("loading data...")
    ids, data, labels = bd.load_data(config.data_path)
    #    sd.show_text_len_distribution(data)
    #    sd.show_label_text_len_distribution(labels, data)
    total_vocab_size = sd.count_vocab_size(data)
    print("total vocab size", total_vocab_size)
    force = config.force_word2index
    if not force and os.path.exists(config.index2word_path) and os.path.exists(
            config.word2index_path):
        print("load word2index")
        dict_word2index = bpe.load_pickle(config.word2index_path)
        print(dict_word2index['<UNK>'], dict_word2index['<PAD>'])
    else:
        print("save word2index and index2word")
        count, dict_word2index, dict_index2word = bd.build_vocabulary(
            data, min_count=config.min_count)
        bpe.save_dict(dict_index2word, config.index2word_path)
        bpe.save_dict(dict_word2index, config.word2index_path)
        return


#    train_ids, train_X, train_y = bd.over_sample(train_ids, train_X, train_y)
#    print(train_y.shape[0], Counter(train_y))
    if is_save == 'y':
        if model_id != 4:
            all_train_ids, all_train_X, all_train_y = bd.build_dataset_over_sample(
                ids, data, labels, dict_word2index, config.max_text_len)
            dataset = MingLueData(all_train_ids, all_train_X, all_train_y)
        else:
            print("save HAN...")
            train_data, train_labels = bd.build_data_set_HAN(
                data,
                labels,
                dict_word2index,
                num_sentences=config.num_sentences,
                sequence_length=config.sequence_length)
            print(np.shape(train_data), np.shape(train_labels))
            print(len(ids))
            dataset = MingLueData(ids, train_data, train_labels)

    else:
        if model_id == 4:
            train_data, train_labels = bd.build_data_set_HAN(
                data,
                labels,
                dict_word2index,
                num_sentences=config.num_sentences,
                sequence_length=config.sequence_length)
            train_ids, valid_ids = bd.split_data(ids, radio=0.9)
            train_X, valid_X = bd.split_data(train_data, radio=0.9)
            train_y, valid_y = bd.split_data(train_labels, radio=0.9)
        else:
            train_ids, valid_ids = bd.split_data(ids, radio=0.9)
            train_data, valid_data = bd.split_data(data, radio=0.9)
            train_labels, valid_labels = bd.split_data(labels, radio=0.9)
            # over sample for train data
            train_ids, train_X, train_y = bd.build_dataset_over_sample(
                train_ids, train_data, train_labels, dict_word2index,
                config.max_text_len)
            valid_ids, valid_X, valid_y = bd.build_dataset(
                valid_ids, valid_data, valid_labels, dict_word2index,
                config.max_text_len)
        print("trainset size:", len(train_ids))
        print("validset size:", len(valid_ids))
        dataset = MingLueData(train_ids, train_X, train_y)

    del data

    batch_size = config.batch_size
    if model_id == 4:
        batch_size = config.han_batch_size
    train_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,  # adjustable so different models can use different batch sizes
        shuffle=True,
        num_workers=config.num_workers)
    if is_save != 'y':
        dataset = MingLueData(valid_ids, valid_X, valid_y)
        valid_loader = DataLoader(
            dataset=dataset,
            batch_size=batch_size,  # adjustable so different models can use different batch sizes
            shuffle=False,
            num_workers=config.num_workers)
    if model_id == 5 or model_id == 6:  # cnn and rcnn with doc2vec
        dmpv_model, dbow_model = gdv.load_doc2vec_model(
            config.dmpv_model_path, config.dbow_model_path)
    print("data loaded")

    config.vocab_size = len(dict_word2index)
    print('config vocab size:', config.vocab_size)
    model = model_selector(config, model_id, use_element)
    if config.has_cuda:
        model = model.cuda()

    if use_element:
        all_element_vector = bpe.load_pickle(config.element_vector_path)

    loss_weight = torch.FloatTensor(config.loss_weight_value)
    loss_weight = loss_weight + 1 - loss_weight.mean()
    print("loss weight:", loss_weight)

    loss_fun = nn.CrossEntropyLoss(
        loss_weight.cuda() if config.has_cuda else loss_weight)

    #    loss_fun = nn.CrossEntropyLoss()
    #    optimizer = optim.Adam(model.parameters(),lr=config.learning_rate, weight_decay=config.weight_decay)
    optimizer = model.get_optimizer(config.learning_rate,
                                    config.learning_rate2, config.weight_decay)
    print("training...")

    weight_count = 0
    max_score = 0
    total_loss_weight = torch.zeros(8)
    for epoch in range(config.epoch_num):
        print("lr:", config.learning_rate, "lr2:", config.learning_rate2)
        running_loss = 0.0
        running_acc = 0.0
        for i, data in enumerate(train_loader, 0):
            ids, texts, labels = data
            # TODO
            if model_id == 4:
                pass
            if config.has_cuda:
                inputs, labels = Variable(texts.cuda()), Variable(
                    labels.cuda())
            else:
                inputs, labels = Variable(texts), Variable(labels)
            optimizer.zero_grad()
            if model_id == 5 or model_id == 6:  # cnn and rcnn with doc2vec
                doc2vec = gdv.build_doc2vec(ids, dmpv_model, dbow_model)
                if config.has_cuda:
                    doc2vec = Variable(torch.FloatTensor(doc2vec).cuda())
                else:
                    doc2vec = Variable(torch.FloatTensor(doc2vec))
                # [batch_size, (doc2vec_size*2)]
                # print(doc2vec.size())
                outputs = model(inputs, doc2vec)
            elif use_element:
                element_vec = build_element_vec(ids, all_element_vector)
                if config.has_cuda:
                    element_vec = Variable(
                        torch.LongTensor(element_vec).cuda())
                else:
                    element_vec = Variable(torch.LongTensor(element_vec))
                outputs = model(inputs, element_vec)
            else:
                outputs = model(inputs)
            loss = loss_fun(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.data[0]

            if i % config.step == config.step - 1:
                if epoch % config.epoch_step == config.epoch_step - 1:
                    _, predicted = torch.max(outputs.data, 1)
                    predicted = predicted.cpu().numpy().tolist()
                    #  predicted = [i[0] for i in predicted]
                    running_acc = accuracy(predicted,
                                           labels.data.cpu().numpy())
                    print('[%d, %5d] loss: %.3f, acc: %.3f' %
                          (epoch + 1, i + 1, running_loss / config.step,
                           running_acc))
                running_loss = 0.0

        if is_save != 'y' and epoch % config.epoch_step == config.epoch_step - 1:
            print("predicting...")
            if model_id == 5 or model_id == 6:
                loss_weight, score = do_eval(valid_loader, model, model_id,
                                             config.has_cuda, dmpv_model,
                                             dbow_model)
            else:
                loss_weight, score = do_eval(valid_loader, model, model_id,
                                             config.has_cuda)
                if score >= 0.478 and score > max_score:
                    max_score = score
                    save_path = config.model_path + "." + str(
                        score) + "." + config.model_names[model_id]
                    torch.save(model.state_dict(), save_path)

            if epoch >= 3:
                weight_count += 1
                total_loss_weight += loss_weight
                print("avg_loss_weight:", total_loss_weight / weight_count)

        if epoch >= config.begin_epoch - 1:
            if epoch >= config.begin_epoch and config.learning_rate2 == 0:
                config.learning_rate2 = 2e-4
            elif config.learning_rate2 > 0:
                config.learning_rate2 *= config.lr_decay
                if config.learning_rate2 <= 1e-5:
                    config.learning_rate2 = 1e-5
            config.learning_rate = config.learning_rate * config.lr_decay
            optimizer = model.get_optimizer(config.learning_rate,
                                            config.learning_rate2,
                                            config.weight_decay)
    time_stamp = str(int(time.time()))

    if is_save == "y":
        if use_element:
            save_path = config.model_path + "." + time_stamp + ".use_element." + config.model_names[
                model_id]
        else:
            save_path = config.model_path + "." + time_stamp + "." + config.model_names[
                model_id]
        torch.save(model.state_dict(), save_path)
    else:
        print("not save")