Example #1
def run(args):
    """Builds model, loads data, trains and evaluates"""
    config = load_yaml(args.configfile)

    model = get_model(config)
    model.load_data(args.eval)
    model.build()

    if args.eval:
        model.evaluate()
    else:
        model.train()
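Every example on this page relies on a project-local load_yaml helper that is never shown. A minimal sketch of what it might look like, assuming PyYAML; the name matches the calls below, the body is an assumption:

# Hypothetical load_yaml helper (assumes PyYAML; the project's real helper may differ).
import yaml

def load_yaml(path: str) -> dict:
    """Read a YAML config file into a dict."""
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)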
Example #2
    def __init__(self, configfile: str):
        # Config
        config = load_yaml(configfile)
        self.config = Config.from_json(config)

        # Builds model
        self.model = get_model(config)
        self.model.build()
        self.model_name = self.model.model_name

        # device
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.model.model.to(self.device)
        self.model.model.eval()

        # classes
        self.classes = self.model.classes
Example #3
    def __init__(self, configfile: str):
        # Config
        config = load_yaml(configfile)
        self.config = Config.from_json(config)

        # Builds model
        self.model = get_model(config)
        self.model.build()
        self.model_name = self.model.model_name

        # device
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.model.model.to(self.device)
        self.model.model.eval()

        self.n_classes = self.config.model.n_classes
        self.vis_img = VisImage(
            n_classes=self.n_classes,
            label_color_map=self.config.data.label_color_map)
Example #4
        data_all_dataframe = load_xlsx(self.eval_path)
        data_x = list(data_all_dataframe["text"])
        data_y = list(data_all_dataframe["intent"])

        # for index in range(len(data_x) - 1, -1, -1):
        #     # Replace multi-label entries with their first label
        #     if "," in data_y[index]:
        #         label_split = data_y[index].split(",")
        #         label = label_split[0][2:-1]
        #         data_y[index] = "['" + label + "']"

        d = load_json(self.label2index_json_path)

        data_x_clear, data_y_clear = list(), list()
        for index in range(len(data_x)):
            if data_y[index] != "[]" and "," not in data_y[index] and data_y[
                    index] in d.keys():
                data_x_clear.append(data_x[index])
                data_y_clear.append(data_y[index])
        # print(len(data_x_clear))
        return data_x_clear, data_y_clear


if __name__ == '__main__':
    params = load_yaml(
        "/Users/yangyu/PycharmProjects/infer_of_intent/dataset/preprocess_config.yaml"
    )
    p = Preprocess(params)
    p.get_train_data()
    p.get_eval_data()
        save_df.to_excel("final_.xlsx")
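The snippet above also leans on load_xlsx and load_json helpers that are not shown. A plausible minimal sketch, assuming pandas for the spreadsheet and the standard library for JSON (the names match the calls above; the bodies are assumptions):

# Hypothetical load_xlsx / load_json helpers.
import json
import pandas as pd

def load_xlsx(path: str) -> pd.DataFrame:
    """Read an Excel sheet into a DataFrame."""
    return pd.read_excel(path)

def load_json(path: str) -> dict:
    """Read a JSON file into a dict."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)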

Example #5
    def predict_one(self, input_sequence, label):
        """预测一条数据

        params: input_sequence: 一条语句
        params: label: 语句对应的正确标签

        return: predict_label
        """
        # 使用传统模型
        if self.model_name in ["fasttext", "lstm_base", "lstm_pack", "textcnn"]:
            print(self.tokenizer(input_sequence))
            raise Exception("未完成!")
        # 使用预训练模型
        else:
            tokenized = self.tokenizer(input_sequence, return_tensors="pt")
            # print(self.tokenizer(input_sequence, return_tensors="pt"))
            input_ids, token_type_ids, attention_mask = tokenized["input_ids"], tokenized["token_type_ids"], tokenized["attention_mask"]
            input_ids, token_type_ids, attention_mask = input_ids.cuda(), token_type_ids.cuda(), attention_mask.cuda()
            predict_result = self.model(input_ids, token_type_ids, attention_mask)
            # print(predict_result.shape)
            predict_label = torch.argmax(predict_result, 0).cpu()
            # print(self.label_tokenizer.decode(predict_label))
            return self.label_tokenizer.decode(predict_label)


if __name__=="__main__":
    configs = load_yaml("eval_infer_config.yaml")
    evalInferFire = EvalInferFire(configs)
    evalInferFire.predict()
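predict_one decodes the predicted class id back into a label string via a label tokenizer, and the training snippets below encode label strings with a ClassificationLabelTokenizer; neither class is shown on this page. A minimal sketch of such a tokenizer, assuming the label2index JSON maps label strings to integer ids (the file layout and implementation are assumptions):

# Hypothetical label tokenizer backed by a label2index JSON file.
import json
import torch

class ClassificationLabelTokenizer:
    def __init__(self, label2index_json_path: str):
        with open(label2index_json_path, "r", encoding="utf-8") as f:
            self.label2index = json.load(f)
        self.index2label = {v: k for k, v in self.label2index.items()}

    def __call__(self, labels):
        # Encode a list of label strings into a (n, 1) LongTensor of class ids.
        return torch.tensor([[self.label2index[l]] for l in labels], dtype=torch.long)

    def decode(self, index):
        # Map a predicted class id (int or 0-dim tensor) back to its label string.
        return self.index2label[int(index)]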
Example #6
def train():
    use_cuda = torch.cuda.is_available()

    params = load_yaml(
        "/Users/yangyu/PycharmProjects/infer_of_intent/dataset/preprocess_config.yaml"
    )
    p = Preprocess(params)
    source_dataset_x, source_dataset_y = p.get_preprocessed_data()

    sequencetokenizer = SequenceTokenizer(
        "/Users/yangyu/PycharmProjects/infer_of_intent/data_preprocess/vocab2index.json"
    )
    classificationlabeltokenizer = ClassificationLabelTokenizer(
        "/Users/yangyu/PycharmProjects/infer_of_intent/data_preprocess/label2index.json"
    )

    (x_tokenized, x_lengths), y_tokenized = sequencetokenizer(
        source_dataset_x), classificationlabeltokenizer(source_dataset_y)
    # print(x_tokenized.shape, y_tokenized.shape)
    dataset = SequenceDataset(x_tokenized, y_tokenized, x_lengths)

    # Use cross-validation to build several datasets, splitting each into a train
    # and an eval set; if the argument is 0, it simply splits the original dataset
    # into a single train/eval pair
    kFoldCV = KFoldCrossValidation(0)  # 10-fold cross-validation
    dataset_generator = kFoldCV(dataset,
                                shuffle=False)  # the CV class returns a generator that yields one (train, eval) split at a time

    for train, eval in dataset_generator:
        print(train, eval)

        model_params = load_yaml(
            "/Users/yangyu/PycharmProjects/infer_of_intent/simple_classification/lstm_base/lstm_base_config.yaml"
        )
        model = LSTM_Classfication(model_params)

        if use_cuda:
            model = model.cuda()

        optimizer = Adam(params=model.parameters(), lr=0.0001)

        epochs = model_params["epoch"]
        batch_size = model_params["batch_size"]
        train_dataloader = DataLoader(dataset=train,
                                      batch_size=64,
                                      shuffle=True)

        eval_dataloader = DataLoader(dataset=eval, batch_size=64)

        for batch in eval_dataloader:
            eval_x, eval_y, _ = batch
            eval_y = eval_y.squeeze()
            if use_cuda:
                eval_x, eval_y = eval_x.cuda(), eval_y.cuda()

        for epoch in range(epochs):
            for index, batch in enumerate(train_dataloader):
                optimizer.zero_grad()
                train_x, train_y, _ = batch
                train_y = train_y.squeeze()

                if use_cuda:
                    train_x, train_y = train_x.cuda(), train_y.cuda()

                # print(train_x.shape, train_y.shape)
                train_y_head = model(train_x)
                train_loss = cross_entropy(train_y_head, train_y)

                eval_y_head = model(eval_x)
                eval_loss = cross_entropy(eval_y_head, eval_y)

                train_predict = torch.argmax(train_y_head, 1)
                eval_predict = torch.argmax(eval_y_head, 1)
                train_accu = int(
                    (train_y == train_predict).sum()) / len(train_x)
                eval_accu = int((eval_y == eval_predict).sum()) / len(eval_x)

                train_loss.backward()
                optimizer.step()
                print(
                    "train_epoch:{} | train_batch:{} | train_loss:{} | eval_loss:{} | train_accu:{} | eval_accu:{}"
                    "".format(epoch, index, train_loss.item(),
                              eval_loss.item(), train_accu, eval_accu))
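KFoldCrossValidation above is constructed with a fold count and, when called with a dataset, yields (train, eval) pairs, but its definition is not shown. A minimal sketch with the same call pattern, assuming scikit-learn's KFold and torch Subsets and following the comment above (0 means a single train/eval split); the real class may differ:

# Hypothetical K-fold splitter compatible with the usage above.
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import Subset

class KFoldCrossValidation:
    def __init__(self, n_splits: int):
        self.n_splits = n_splits  # 0 is interpreted as a single train/eval split

    def __call__(self, dataset, shuffle=False):
        indices = list(range(len(dataset)))
        if self.n_splits == 0:
            train_idx, eval_idx = train_test_split(indices, test_size=0.2, shuffle=shuffle)
            yield Subset(dataset, train_idx), Subset(dataset, eval_idx)
        else:
            kf = KFold(n_splits=self.n_splits, shuffle=shuffle)
            for train_idx, eval_idx in kf.split(indices):
                yield Subset(dataset, list(train_idx)), Subset(dataset, list(eval_idx))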
Example #7
def train():
    # Whether to use GPU acceleration
    use_cuda = torch.cuda.is_available()
    # Check how many GPUs are available
    if use_cuda:
        device_nums = torch.cuda.device_count()
        print("use {} GPUs!".format(device_nums))

    # Load the preprocessing and model parameters (needs refactoring later: reading two separate files couples them too tightly)
    train_configs = load_yaml("electra_classification_config.yaml")

    print("模型参数如下:")
    model_params_print(train_configs)

    params = load_yaml(train_configs["path"]["preprocess_config_path"])

    # Instantiate the dataset class from the parameters (it currently requires both a train and an eval set to initialize; this needs to change)
    p = Preprocess(params)

    # Get the raw dataset texts and labels
    source_train_x, source_train_y = p.get_train_data()
    source_eval_x, source_eval_y = p.get_eval_data()

    # Initialize the tokenizers for the BERT-style inputs and for the labels
    electratokenizer = AutoTokenizer.from_pretrained(
        train_configs["path"]["electra_path"])
    labeltokenizer = ClassificationLabelTokenizer(
        params["tokenized_path"]["label2index_json_path"])

    # Tokenize the training set
    train_x_tokenized = electratokenizer(source_train_x,
                                         padding=True,
                                         truncation=True,
                                         return_tensors="pt")
    train_y_tokenized = labeltokenizer(source_train_y)

    # Tokenize the eval set
    eval_x_tokenized = electratokenizer(source_eval_x,
                                        padding=True,
                                        truncation=True,
                                        return_tensors="pt")
    eval_y_tokenized = labeltokenizer(source_eval_y)

    # Build the train and eval datasets (if an explicit train/eval split exists, initialize one dataset for each; with only a training set, cross-validation can be used instead)
    train_set = BertSequenceDataset(train_x_tokenized, train_y_tokenized)
    eval_set = BertSequenceDataset(eval_x_tokenized, eval_y_tokenized)

    # Initialize the model from its config
    model = ElectraClassification(train_configs)

    # If CUDA is available, train on multiple GPUs
    if use_cuda:
        model = nn.DataParallel(model, device_ids=list(range(device_nums)))
        # Put the model on GPU 0
        model = model.cuda(device=0)

    # Read the training hyperparameters
    epoch_param = train_configs["train_params"]["epoch"]
    batch_size_param = train_configs["train_params"]["batch_size"]
    learning_rate_param = train_configs["train_params"]["learning_rate"]

    # Wrap the train and eval sets in DataLoaders
    train_dataloader = DataLoader(train_set,
                                  batch_size=batch_size_param,
                                  shuffle=True)
    eval_dataloader = DataLoader(eval_set, batch_size=batch_size_param)

    # Create the optimizer
    optmizer = Adam(model.parameters(), lr=learning_rate_param)

    # Create the loss function
    loss_f = nn.CrossEntropyLoss()

    # Training-process recorder
    train_record = TrainProcessRecord(train_configs)

    # Gradient scaler for mixed-precision training
    scaler = GradScaler()

    train_start = time.time()
    for epoch in range(epoch_param):
        for batch_index, batch in enumerate(train_dataloader):
            # Zero the gradients (same for full and mixed precision)
            optmizer.zero_grad()
            input_ids, token_type_ids, attention_mask, train_y = batch

            # Move the batch to the GPU
            if use_cuda:
                input_ids, token_type_ids, attention_mask, train_y = \
                    input_ids.cuda(device=0), token_type_ids.cuda(device=0), attention_mask.cuda(device=0), train_y.cuda(device=0)

            # Forward pass under autocast (mixed-precision context)
            with autocast():
                train_y_predict = model(input_ids, token_type_ids,
                                        attention_mask)
                train_y = train_y.squeeze()
                train_loss = loss_f(train_y_predict, train_y)

            # Scale the loss before backprop (mixed precision)
            scaler.scale(train_loss).backward()
            # Optimizer step through the scaler (mixed precision)
            scaler.step(optmizer)
            # Update the scaler (mixed precision)
            scaler.update()

            # Without mixed precision this would simply be:
            # optmizer.step()

            # Every train_params_save_threshold batches, evaluate on the eval set and record the results
            if batch_index % train_configs["train_record_settings"][
                    "train_params_save_threshold"] == 0:
                model.eval()
                with torch.no_grad():
                    train_predict = torch.argmax(train_y_predict, 1)
                    train_accu = int(
                        (train_y == train_predict).sum()) / len(train_y)
                    sum_eval_accu = 0
                    sum_eval_loss = 0

                    eval_start = time.time()

                    for e_i, eval_batch in enumerate(eval_dataloader):
                        eval_input_ids, eval_token_type_ids, eval_attention_mask, eval_y = eval_batch
                        eval_y = eval_y.squeeze()
                        if use_cuda:
                            eval_input_ids, eval_token_type_ids, eval_attention_mask, eval_y = \
                                eval_input_ids.cuda(device=0), eval_token_type_ids.cuda(device=0), eval_attention_mask.cuda(device=0), eval_y.cuda(device=0)
                        # print(e_i)
                        eval_y_predict = model(eval_input_ids,
                                               eval_token_type_ids,
                                               eval_attention_mask)
                        eval_loss = cross_entropy(eval_y_predict, eval_y)

                        eval_predict = torch.argmax(eval_y_predict, 1)
                        eval_accu = int(
                            (eval_y == eval_predict).sum()) / len(eval_y)
                        sum_eval_accu = sum_eval_accu + eval_accu
                        sum_eval_loss = sum_eval_loss + eval_loss.item()
                        optmizer.zero_grad()
                        torch.cuda.empty_cache()
                    sum_eval_accu = sum_eval_accu / len(eval_dataloader)
                    sum_eval_loss = sum_eval_loss / len(eval_dataloader)

                    eval_end = time.time()

                    eval_time = eval_end - eval_start

                    print(
                        "train_epoch:{} | train_batch:{} | train_loss:{} | eval_loss:{} | train_accu:{} | eval_accu:{}"
                        "eval time:{}".format(epoch, batch_index,
                                              train_loss.item(), sum_eval_loss,
                                              train_accu, sum_eval_accu,
                                              eval_time))
                    train_record(model, epoch, batch_index, train_loss.item(),
                                 sum_eval_loss, train_accu, sum_eval_accu,
                                 eval_time)
                model.train()
    train_end = time.time()
    train_time = train_end - train_start
    train_record.save_time(train_time)
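Distilled from the loop above, this is the core mixed-precision pattern: zero the gradients, run the forward pass under autocast, then scale the loss, step through the scaler, and update it. A self-contained toy sketch (the model, data, and optimizer are invented for illustration, and a CUDA device is assumed):

# Minimal mixed-precision training sketch (toy model and data, for illustration only).
import torch
from torch import nn
from torch.cuda.amp import GradScaler, autocast
from torch.optim import Adam

model = nn.Linear(16, 4).cuda()
optimizer = Adam(model.parameters(), lr=1e-3)
loss_f = nn.CrossEntropyLoss()
scaler = GradScaler()

for step in range(10):
    x = torch.randn(32, 16, device="cuda")
    y = torch.randint(0, 4, (32,), device="cuda")

    optimizer.zero_grad()
    with autocast():                   # forward pass in mixed precision
        loss = loss_f(model(x), y)
    scaler.scale(loss).backward()      # scale the loss, then backprop
    scaler.step(optimizer)             # unscale gradients and take the step
    scaler.update()                    # adjust the scale factor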
Example #8
def train():
    # Whether to use GPU acceleration
    use_cuda = torch.cuda.is_available()
    # Check how many GPUs are available
    if use_cuda:
        device_nums = torch.cuda.device_count()
        print("use {} GPUs!".format(device_nums))

    # Load the preprocessing and model parameters (needs refactoring later: reading two separate files couples them too tightly)
    train_configs = load_yaml("bert_classification_config.yaml")
    params = load_yaml(train_configs["path"]["preprocess_config_path"])

    # Instantiate the dataset class from the parameters (it currently requires both a train and an eval set to initialize; this needs to change)
    p = Preprocess(params)

    # Get the raw dataset texts and labels
    source_train_x, source_train_y = p.get_train_data()
    source_eval_x, source_eval_y = p.get_eval_data()

    # Initialize the tokenizers for the BERT inputs and for the labels
    berttokenizer = BertTokenizer.from_pretrained(
        train_configs["path"]["bert_path"])
    labeltokenizer = ClassificationLabelTokenizer(
        params["tokenized_path"]["label2index_json_path"])

    # Tokenize the training set
    train_x_tokenized = berttokenizer(source_train_x,
                                      padding=True,
                                      truncation=True,
                                      return_tensors="pt")
    train_y_tokenized = labeltokenizer(source_train_y)

    # Tokenize the eval set
    eval_x_tokenized = berttokenizer(source_eval_x,
                                     padding=True,
                                     truncation=True,
                                     return_tensors="pt")
    eval_y_tokenized = labeltokenizer(source_eval_y)

    # Build the train and eval datasets (if an explicit train/eval split exists, initialize one dataset for each; with only a training set, cross-validation can be used instead)
    train_set = BertSequenceDataset(train_x_tokenized, train_y_tokenized)
    eval_set = BertSequenceDataset(eval_x_tokenized, eval_y_tokenized)

    # Initialize the model from its config
    model = BertClassification(train_configs)

    # If CUDA is available, train on multiple GPUs
    if use_cuda:
        model = nn.DataParallel(model, device_ids=list(range(device_nums)))
        # Put the model on GPU 0
        model = model.cuda(device=0)

    # Wrap the train and eval sets in DataLoaders
    train_dataloader = DataLoader(train_set, batch_size=64, shuffle=True)
    eval_dataloader = DataLoader(eval_set, batch_size=64)

    # Create the optimizer
    optmizer = Adam(model.parameters(), lr=0.00001)

    # Create the loss function
    loss_f = nn.CrossEntropyLoss()
    for epoch in range(100):
        for batch_index, batch in enumerate(train_dataloader):
            # print(batch_index)
            optmizer.zero_grad()
            input_ids, token_type_ids, attention_mask, train_y = batch
            # l = input_ids.numpy().tolist()
            # for i in l:
            #     print(berttokenizer.decode(i))
            # print(labeltokenizer.decode(train_y))
            if use_cuda:
                input_ids, token_type_ids, attention_mask, train_y = \
                    input_ids.cuda(device=0), token_type_ids.cuda(device=0), attention_mask.cuda(device=0), train_y.cuda(device=0)

            train_y_predict = model(input_ids, token_type_ids, attention_mask)
            train_y = train_y.squeeze()
            train_loss = loss_f(train_y_predict, train_y)

            train_loss.backward()
            optmizer.step()
            # print(train_loss)
            if train_loss < 0.01 or epoch == 3:
                model = model.cpu()
                torch.save(model, "model.bin")
                return

            if batch_index % 100 == 0:
                model.eval()
                with torch.no_grad():
                    train_predict = torch.argmax(train_y_predict, 1)
                    train_accu = int(
                        (train_y == train_predict).sum()) / len(train_y)
                    sum_eval_accu = 0
                    sum_eval_loss = 0
                    for e_i, eval_batch in enumerate(eval_dataloader):
                        eval_input_ids, eval_token_type_ids, eval_attention_mask, eval_y = eval_batch
                        eval_y = eval_y.squeeze()
                        if use_cuda:
                            eval_input_ids, eval_token_type_ids, eval_attention_mask, eval_y = \
                                eval_input_ids.cuda(device=0), eval_token_type_ids.cuda(device=0), eval_attention_mask.cuda(device=0), eval_y.cuda(device=0)
                        # print(e_i)
                        eval_y_predict = model(eval_input_ids,
                                               eval_token_type_ids,
                                               eval_attention_mask)
                        eval_loss = cross_entropy(eval_y_predict, eval_y)

                        eval_predict = torch.argmax(eval_y_predict, 1)
                        eval_accu = int(
                            (eval_y == eval_predict).sum()) / len(eval_y)
                        sum_eval_accu = sum_eval_accu + eval_accu
                        sum_eval_loss = sum_eval_loss + eval_loss
                        optmizer.zero_grad()
                        torch.cuda.empty_cache()
                    sum_eval_accu = sum_eval_accu / len(eval_dataloader)
                    sum_eval_loss = sum_eval_loss / len(eval_dataloader)
                    print(
                        "train_epoch:{} | train_batch:{} | train_loss:{} | eval_loss:{} | train_accu:{} | eval_accu:{}"
                        "".format(epoch, batch_index, train_loss.item(),
                                  sum_eval_loss, train_accu, sum_eval_accu))
                    # train_record(model, params_s, epoch, index, train_loss, eval_loss, train_accu, eval_accu)
                model.train()
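Both transformer training loops iterate over a BertSequenceDataset that yields (input_ids, token_type_ids, attention_mask, label) tuples, but the class itself is not shown. A minimal sketch of a compatible Dataset, assuming the tokenized inputs are a HuggingFace BatchEncoding with those three keys (the class name matches the usage; the body is an assumption):

# Hypothetical Dataset wrapping tokenizer output and encoded labels.
import torch
from torch.utils.data import Dataset

class BertSequenceDataset(Dataset):
    def __init__(self, x_tokenized, y_tokenized):
        # x_tokenized: BatchEncoding with input_ids / token_type_ids / attention_mask
        # y_tokenized: LongTensor of shape (num_samples, 1)
        self.input_ids = x_tokenized["input_ids"]
        self.token_type_ids = x_tokenized["token_type_ids"]
        self.attention_mask = x_tokenized["attention_mask"]
        self.labels = torch.as_tensor(y_tokenized)

    def __len__(self):
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.token_type_ids[idx],
                self.attention_mask[idx], self.labels[idx])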
Example #9
        self.bidirectional_lstm = self.configs["nn_params"]["bidirectional_lstm"]
        self.lstm_layers = self.configs["nn_params"]["lstm_layers"]
        self.drop_out = self.configs["nn_params"]["drop_out"]
        self.dense = self.configs["nn_params"]["dense"]
        self.label_nums = self.configs["nn_params"]["label_nums"]

        self.embedding_and_lstm = torch.nn.Sequential(
            torch.nn.Embedding(self.input_size, self.embedding_size),
            torch.nn.LSTM(input_size=self.embedding_size,
                          hidden_size=self.lstm_hiddensize,
                          bidirectional=self.bidirectional_lstm,
                          num_layers=self.lstm_layers,
                          dropout=self.drop_out)
        )
        # output shape of the LSTM: (seq_len, batch_size, lstm_hiddensize * num_directions)
        self.classification = torch.nn.Sequential(
            torch.nn.Linear(self.lstm_hiddensize*2, self.dense),
            torch.nn.Linear(self.dense, self.label_nums)
        )

    def forward(self, input_batch):
        input_batch = input_batch.permute(1, 0)
        # print(input_batch.shape)
        lstm_out, _ = self.embedding_and_lstm(input_batch)
        # print(lstm_out.shape)
        return self.classification(lstm_out[-1, :, :].squeeze())

if __name__ == '__main__':
    model_params = load_yaml("/Users/yangyu/PycharmProjects/infer_of_intent/simple_classification/lstm_base/lstm_base_config.yaml")
    model = LSTM_Classfication(model_params)
Example #10
        self.embedding = torch.nn.Sequential(
            torch.nn.Embedding(self.input_size, self.embedding_size), )

        self.lstm = torch.nn.Sequential(
            torch.nn.LSTM(input_size=self.embedding_size,
                          hidden_size=self.lstm_hiddensize,
                          bidirectional=self.bidirectional_lstm,
                          num_layers=self.lstm_layers,
                          dropout=self.drop_out,
                          batch_first=True))

        # output shape of lstm: (seq_len, batch_size, lstm_hiddensize)
        self.classification = torch.nn.Sequential(
            torch.nn.Linear(self.lstm_hiddensize * 2, self.dense),
            torch.nn.ReLU(), torch.nn.Linear(self.dense, self.label_nums))

    def forward(self, input_batch, batch_seqs_len):
        embedded = self.embedding(input_batch)
        embedded_packed = pack_padded_sequence(embedded,
                                               batch_seqs_len,
                                               batch_first=True)
        # print(embedded_packed)
        lstm_out, _ = self.lstm(embedded_packed)
        lstm_out = pad_packed_sequence(lstm_out)[0]
        # print(lstm_out.shape)
        return self.classification(lstm_out[0, :, :].squeeze())


if __name__ == '__main__':
    params_dict = load_yaml("lstm_pack_config.yaml")
    LSTM_Classfication_Packed(params_dict)
Example #11
def train():
    # Whether to use GPU acceleration
    use_cuda = torch.cuda.is_available()

    # Load the preprocessing parameters
    params = load_yaml(
        "/Users/yangyu/PycharmProjects/infer_of_intent/dataset/preprocess_config.yaml"
    )
    p = Preprocess(params)
    # Get the raw training texts and labels
    source_train_x, source_train_y = p.get_train_data()
    source_eval_x, source_eval_y = p.get_eval_data()

    # Initialize the sequence and label tokenizers
    sequencetokenizer = SequenceTokenizer(
        params["tokenized_path"]["vocab2index_json_path"])
    classificationlabeltokenizer = ClassificationLabelTokenizer(
        params["tokenized_path"]["label2index_json_path"])

    # tokenized
    (tokenized_train_x,
     tokenized_train_x_lengths), tokenized_train_y = sequencetokenizer(
         source_train_x), classificationlabeltokenizer(source_train_y)

    (tokenized_eval_x,
     tokenized_eval_x_lengths), tokenized_eval_y = sequencetokenizer(
         source_eval_x), classificationlabeltokenizer(source_eval_y)

    # Build the training dataset
    train_dataset = SequenceDataset(tokenized_train_x, tokenized_train_y,
                                    tokenized_train_x_lengths)

    # Build the eval dataset
    eval_dataset = SequenceDataset(tokenized_eval_x, tokenized_eval_y,
                                   tokenized_eval_x_lengths)

    model_params = load_yaml("lstm_pack_config.yaml")
    model = LSTM_Classfication_Packed(model_params)

    train_record = TrainProcessRecord(1000, 50)
    params_s = []

    if use_cuda:
        model = model.cuda()

    optimizer = Adam(params=model.parameters(), lr=0.0001)

    epochs = model_params["epoch"]
    batch_size = model_params["batch_size"]
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)

    eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=64)

    for epoch in range(epochs):
        for index, batch in enumerate(train_dataloader):
            optimizer.zero_grad()
            train_x, train_y, train_seqlen = batch
            train_y = train_y.squeeze()

            if use_cuda:
                train_x, train_y, train_seqlen = train_x.cuda(), train_y.cuda(
                ), train_seqlen.cuda()

            train_y = train_y.unsqueeze(-1)
            # Re-sort the input tensors by sequence length (descending)
            # print(train_x.shape, train_y.shape, train_seqlen.shape)
            train_x, train_seqlen, train_y, _ = tensor_seq_len_desent(
                train_x, train_seqlen, train_y)

            # print(train_x.shape, train_y.shape)
            train_y_head = model(train_x, train_seqlen)
            train_loss = cross_entropy(train_y_head, train_y)

            train_loss.backward()
            optimizer.step()

            if index % 100 == 0:
                train_predict = torch.argmax(train_y_head, 1)
                train_accu = int(
                    (train_y == train_predict).sum()) / len(train_x)
                sum_eval_accu = 0
                sum_eval_loss = 0
                for batch in eval_dataloader:
                    eval_x, eval_y, eval_seqlen = batch
                    eval_y = eval_y.squeeze()
                    if use_cuda:
                        eval_x, eval_y, eval_seqlen = eval_x.cuda(
                        ), eval_y.cuda(), eval_seqlen.cuda()
                    eval_y = eval_y.unsqueeze(-1)
                    eval_x, eval_seqlen, eval_y, _ = tensor_seq_len_desent(
                        eval_x, eval_seqlen, eval_y)

                    eval_y_head = model(eval_x, eval_seqlen)
                    eval_loss = cross_entropy(eval_y_head, eval_y)

                    eval_predict = torch.argmax(eval_y_head, 1)
                    eval_accu = int(
                        (eval_y == eval_predict).sum()) / len(eval_x)
                    sum_eval_accu = sum_eval_accu + eval_accu
                    sum_eval_loss = sum_eval_loss + eval_loss

                sum_eval_accu = sum_eval_accu / len(eval_dataloader)
                sum_eval_loss = sum_eval_loss / len(eval_dataloader)
                print(
                    "train_epoch:{} | train_batch:{} | train_loss:{} | eval_loss:{} | train_accu:{} | eval_accu:{}"
                    "".format(epoch, index, train_loss.item(), sum_eval_loss,
                              train_accu, sum_eval_accu))
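tensor_seq_len_desent, used above to reorder each batch before packing, is not shown; pack_padded_sequence with the default enforce_sorted=True expects sequences sorted by length in descending order. A minimal sketch of such a helper (the name and return order follow the calls above; the body is an assumption):

# Hypothetical helper: sort a batch by sequence length, longest first,
# so it can be fed to pack_padded_sequence with enforce_sorted=True.
import torch

def tensor_seq_len_desent(x, seq_len, y):
    """Return x, seq_len, y reordered by descending length, plus the permutation."""
    sorted_len, perm = torch.sort(seq_len.view(-1), descending=True)
    return x[perm], sorted_len, y[perm], perm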
Example #12
class DataClear:
    def __init__(self, configs):
        self.configs = configs
        self.use_expand = self.configs["use_expand"]
        if self.use_expand:
            self.dirty_path = self.configs["expand_dirty_path"]
        else:
            self.dirty_path = self.configs["source_dirty_path"]


    def clear(self):
        # Load the data
        dirty_df = load_xlsx(self.dirty_path)
        dirty_labels = list()
        true_labels = list()
        labels = list(set(list(dirty_df["intent"])))
        print(labels)
        print(len(labels))
        for label in labels:
            if "_" in label:
                true_labels.append(label)
        # print(true_labels)

        print(len(true_labels))


if __name__ == '__main__':
    configs = load_yaml("clear_config.yaml")

    data_clear = DataClear(configs)
    data_clear.clear()
Example #13
        super(XLNetClassification, self).__init__()

        self.configs = configs

        self.xlnet_model_path = self.configs["model_path"]["xlnet_path"]
        self.xlnet_d_model = self.configs["model_params"]["xlnet_d_model"]
        self.label_nums = configs["model_params"]["label_nums"]
        self.linear_mid_dimension = self.configs["model_params"]["linear_mid_dimension"]

        self.xlnet_model = XLNetModel.from_pretrained(self.xlnet_model_path)

        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(self.xlnet_d_model, self.linear_mid_dimension),
            nn.ReLU(),
            torch.nn.Linear(self.linear_mid_dimension, self.label_nums)
        )
        

    def forward(self, input):
        last_hidden_state = self.xlnet_model(input, return_dict=True).last_hidden_state  # shape: (bsz, seq_len, hidden_size)
        classify_character = last_hidden_state[:, 0, :].squeeze()
        return self.classifier(classify_character)


if __name__=="__main__":
    # Print the names of each layer in the network
    configs = load_yaml("xlnet_classification_config.yaml")
    x = XLNetClassification(configs)
    for name in x.state_dict():
        print(name)
Example #14
        # Fill in the masked attention scores for each sample
        for i in range(input.shape[0]):
            # Copy scores only within this sample's valid (unpadded) length
            for j in range(attention_mask[i]):
                for k in range(attention_mask[i]):
                    mask_attention_score[i][j][k] = attention_score[i][j][k]

        # print(mask_attention_score)
        attention_score_n = softmax_tensor(mask_attention_score)
        print(attention_score_n.shape) # (bsz, seq_len, seq_len)

        # Weight and combine the values according to the scores
        # attention_score_n shape (bsz, seq_len, seq_len)
        # input shape (bsz, seq, d_model)

        self_attention = torch.matmul(attention_score_n, input)
        # print(self_attention.shape) (bsz, seq_len, d_model)

        return self_attention



if __name__=="__main__":
    configs = load_yaml("/home/ubuntu1804/pytorch_sequence_classification/customizeLayer/transformer/transformer_encoder_config.yaml")
    s = SelfAttention(configs)
    input = torch.randn((10, 50, 512))
    attention_mask = torch.tensor([1, 4, 5, 2, 2, 8, 6, 4, 8, 4]).unsqueeze(-1)
    # print(attention_mask)
    s(input, attention_mask)
    for name in s.state_dict():
        print(name)
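The nested Python loops above copy attention scores element by element within each sample's valid length; the same masking can be expressed with tensor operations. A small self-contained sketch of masked (scaled dot-product) self-attention over a padded batch, using the same shapes as the example above; this is a simplified stand-in without learned projections, not the project's SelfAttention layer:

# Sketch: vectorized masked self-attention over a padded batch.
import torch
import torch.nn.functional as F

def masked_self_attention(x, lengths):
    # x: (bsz, seq_len, d_model); lengths: (bsz, 1) valid length per sample
    bsz, seq_len, d_model = x.shape
    scores = torch.matmul(x, x.transpose(1, 2)) / d_model ** 0.5     # (bsz, seq_len, seq_len)
    pad = torch.arange(seq_len).unsqueeze(0) >= lengths.view(-1, 1)  # True at padded key positions
    scores = scores.masked_fill(pad.unsqueeze(1), float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, x)                                  # (bsz, seq_len, d_model)

out = masked_self_attention(torch.randn(10, 50, 512),
                            torch.tensor([1, 4, 5, 2, 2, 8, 6, 4, 8, 4]).unsqueeze(-1))
print(out.shape)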
Example #15
"""
Print out a nested dictionary
"""

from utils.load import load_yaml


def model_params_print(configs):
    if isinstance(configs, dict):  # use isinstance to check the type
        for temp_key, temp_value in configs.items():
            if not isinstance(temp_value, dict):
                print("%s : %s" % (temp_key, temp_value))
                print("\n")
            model_params_print(temp_value)  # recurse to traverse arbitrarily nested dicts


if __name__ == "__main__":
    a = load_yaml(
        "/home/ubuntu1804/pytorch_sequence_classification/transformers_based_classification/electra_classification/electra_classification_config.yaml"
    )
    model_params_print(a)
Example #16
        self.configs = configs
        self.vocab_size = configs["vocab_size"]
        self.word_embedding_dim = self.configs["word_embed_dim"]

        self.word_embedding = torch.nn.Embedding(self.vocab_size,
                                                 self.word_embedding_dim)
        self.position_embedding = AbsPositionEmbedding(self.configs)

    def forward(self, input, attention_mask):
        # shape of input (bsz, batch_seq_len)
        # shape of attention_mask (bsz, 1)
        word_embedded = self.word_embedding(input)

        position_embedded = self.position_embedding(input, attention_mask)

        embedding = word_embedded + position_embedded

        print(embedding.shape)
        return embedding


if __name__ == "__main__":
    configs = load_yaml("transformer_encoder_configs.yaml")
    transformer_encoder_layer = TransformerEncoderLayer(configs)
    input = torch.tensor([[1, 3, 12, 4, 134, 5, 2], [6, 3, 1, 4, 0, 0, 0],
                          [8, 645, 3, 1, 4, 5, 0], [76, 3, 1, 7, 0, 0, 0],
                          [3, 0, 0, 0, 0, 0, 0]])
    attention_mask = torch.tensor([7, 4, 6, 4, 1]).unsqueeze(-1)
    transformer_encoder_layer(input, attention_mask)
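AbsPositionEmbedding is used above but not defined in this snippet. A minimal sketch of a learned absolute position embedding with the same call signature; the class name and signature follow the usage above, everything else is an assumption:

# Hypothetical learned absolute position embedding.
import torch

class AbsPositionEmbedding(torch.nn.Module):
    def __init__(self, configs, max_len=512):
        super().__init__()
        self.position_embedding = torch.nn.Embedding(max_len, configs["word_embed_dim"])

    def forward(self, input, attention_mask):
        # attention_mask is accepted to match the call above but unused in this sketch
        bsz, seq_len = input.shape
        positions = torch.arange(seq_len, device=input.device).unsqueeze(0).expand(bsz, seq_len)
        return self.position_embedding(positions)  # (bsz, seq_len, word_embed_dim)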
Example #17
        recalls = self.recall()
        f1_scores = self.f1_score()

        accumulates = dict()

        for key in precisions.keys():
            accumulates[self.index2label[key]] = [precisions[key], recalls[key], f1_scores[key]]

        print(accumulates)
        df = pd.DataFrame(accumulates)
        df.to_excel("score.xlsx")

        # Print the confusion matrix
        df = pd.DataFrame(self.confusion_matrix.numpy())
        df.index = self.label2index.keys()
        df.columns = self.label2index.keys()
        print(df)
        for i in self.not_in_test:
            df = df.drop(self.index2label[i], axis=0)
            df = df.drop(self.index2label[i], axis=1)
        df.to_excel("confusion_matrix.xlsx")



if __name__=="__main__":
    configs = load_yaml("/home/ubuntu1804/pytorch_sequence_classification/algorithm_assess/assess_config.yaml")
    m = MultilabelF1_Score(configs)
    m.precision()
    m.recall()
    print(m.f1_score())
    m.visualize()
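The precision/recall/f1_score methods above read off a confusion matrix whose construction is not shown. For reference, a self-contained sketch of per-class precision, recall, and F1 computed from a confusion matrix (rows = true labels, columns = predicted labels); it mirrors the idea, not the project's MultilabelF1_Score class:

# Sketch: per-class precision / recall / F1 from a confusion matrix
# (row index = true class, column index = predicted class).
import torch

def scores_from_confusion_matrix(cm, eps=1e-8):
    tp = cm.diag().float()
    precision = tp / (cm.sum(dim=0).float() + eps)   # TP / predicted-positive per class
    recall = tp / (cm.sum(dim=1).float() + eps)      # TP / actual-positive per class
    f1 = 2 * precision * recall / (precision + recall + eps)
    return precision, recall, f1

cm = torch.tensor([[5, 1, 0],
                   [2, 3, 1],
                   [0, 0, 4]])
print(scores_from_confusion_matrix(cm))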