Example #1
    def do_preprocess(self, url_list, label_list):
        """
        进行预处理
        :param url_list:
        :param label_list:
        :return:
        """
        # Default when no frequency filtering is applied; ngram_id_x below
        # still expects the attribute to exist.
        self.high_freq_words = None
        if MIN_WORD_FREQ > 0:
            x__, word_reverse_dict = get_word_vocab(url_list, MAX_LENGTH_WORDS,
                                                    MIN_WORD_FREQ)
            self.high_freq_words = sorted(list(word_reverse_dict.values()))

        self.x, self.word_reverse_dict = get_word_vocab(
            url_list, MAX_LENGTH_WORDS)
        word_x = get_words(self.x, self.word_reverse_dict, DELIMIT_MODE,
                           url_list)
        self.ngramed_id_x, self.ngrams_dict, self.worded_id_x, self.words_dict = \
            ngram_id_x(word_x, MAX_LENGTH_SUBWORDS, self.high_freq_words)
        self.chars_dict = self.ngrams_dict
        self.chared_id_x = get_char_id_x(url_list, self.chars_dict,
                                         MAX_LENGTH_CHARS)

        pos_x, neg_x = list(), list()
        for index in range(len(label_list)):
            label = label_list[index]
            if label == 1:
                pos_x.append(index)
            else:
                neg_x.append(index)
        print("Overall Mal/Ben split: {}/{}".format(len(pos_x), len(neg_x)))
        pos_x = np.array(pos_x)
        neg_x = np.array(neg_x)

        self.x_train, self.y_train, self.x_test, self.y_test = prep_train_test(
            pos_x, neg_x, DEV_PERCENTAGE)

        self.x_train_char = get_ngramed_id_x(self.x_train, self.ngramed_id_x)
        self.x_test_char = get_ngramed_id_x(self.x_test, self.ngramed_id_x)

        self.x_train_word = get_ngramed_id_x(self.x_train, self.worded_id_x)
        self.x_test_word = get_ngramed_id_x(self.x_test, self.worded_id_x)

        self.x_train_char_seq = get_ngramed_id_x(self.x_train,
                                                 self.chared_id_x)
        self.x_test_char_seq = get_ngramed_id_x(self.x_test, self.chared_id_x)

        self.dump_dict(self.ngrams_dict, NGRAMS_DICT_FILE)
        self.dump_dict(self.words_dict, WORDS_DICT_FILE)
        self.dump_dict(self.chars_dict, CHARS_DICT_FILE)
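Every example on this page centers on get_word_vocab. For orientation, here is a minimal, self-contained sketch of what such a helper could look like; the tokenization rule, padding scheme, and id assignment are assumptions for illustration, not the project's actual implementation.

# Minimal sketch of a URL word-vocabulary builder; the delimiter rule, padding,
# and id assignment are assumptions, not the project's actual get_word_vocab.
import re
from collections import Counter

def get_word_vocab(url_list, max_length_words, min_word_freq=0):
    """Return (id_sequences, reverse_dict) where reverse_dict maps id -> word."""
    tokenized = [re.split(r"[^A-Za-z0-9]+", url.strip()) for url in url_list]
    counts = Counter(w for words in tokenized for w in words if w)

    # Keep only words at or above the frequency threshold (0 keeps everything).
    vocab = [w for w, c in counts.most_common() if c >= min_word_freq]
    word_to_id = {w: i + 1 for i, w in enumerate(vocab)}  # id 0 reserved for padding

    id_sequences = []
    for words in tokenized:
        ids = [word_to_id[w] for w in words if w in word_to_id][:max_length_words]
        ids += [0] * (max_length_words - len(ids))  # pad to a fixed length
        id_sequences.append(ids)

    return id_sequences, {i: w for w, i in word_to_id.items()}


urls = ["http://example.com/login", "http://test.org/login?id=1"]
x, reverse_dict = get_word_vocab(urls, max_length_words=10)
print(x[0], reverse_dict[x[0][0]])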
Example #2
def main(args):
    set_randomSeed(args)
    init_logger()

    # load tokenizer
    if args.model_type.endswith('S2S'):
        tokenizer = get_word_vocab(args)
    elif args.model_type.endswith('bert'):
        tokenizer = load_tokenizer(args)
    # load dataset
    train_set = load_and_cacheExampels(args, tokenizer, 'train')
    val_set = load_and_cacheExampels(args, tokenizer, 'dev')
    test_set = load_and_cacheExampels(args, tokenizer, 'test')
    # build the training processor
    if args.model_type.endswith('S2S'):
        proccesser = RnnTrainer(train_set, val_set, test_set, args)
    elif args.model_type.endswith('bert'):
        proccesser = PreTrainedTrainer(train_set, val_set, test_set, args)

    if args.do_train:
        proccesser.train_model()

    if args.do_eval:
        if args.model_type.endswith('S2S'):
            proccesser = RnnTrainer.reload_data_(args.model_dir, train_set,
                                                 val_set, test_set)

        elif args.model_type.endswith('bert'):
            proccesser = PreTrainedTrainer.reload_data_(
                args.model_dir, train_set, val_set, test_set)
        proccesser.evaluate('eval')
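A hypothetical command-line entry point for the main function above might look like the following; only the attributes the example actually reads (model_type, model_dir, do_train, do_eval) are wired up, and the flag defaults are guesses. It assumes this snippet sits in the same script as main.

# Hypothetical CLI wiring for main(args); only the attributes the example reads
# are defined, and the defaults are assumptions.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type", default="joint_bert",
                        help="branching in main keys off the 'S2S' / 'bert' suffix")
    parser.add_argument("--model_dir", default="./model_dir",
                        help="directory used to reload a trained model for evaluation")
    parser.add_argument("--do_train", action="store_true")
    parser.add_argument("--do_eval", action="store_true")
    args = parser.parse_args()

    main(args)  # assumes main from the example above is defined in this script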
Example #3
def main(pred_config):
    init_logger()
    device = set_device(pred_config)
    # load the training args saved with the model
    args_path = os.path.join(pred_config.model_dir, 'train_args.bin')
    train_args = torch.load(args_path)

    # load labels
    intent_vocab = get_intent_labels(train_args)
    slot_vocab = get_slot_labels(train_args)

    # load the pretrained model
    model = load_pretrainModel(pred_config, train_args, len(intent_vocab),
                               len(slot_vocab))
    model.to(device)

    # read data
    examples = read_file(pred_config)
    pad_label_id = pred_config.pad_label_id

    logger.info(f'Start to predict using {pred_config.model_type}')
    if pred_config.model_type.endswith('S2S'):
        # convert data to tensor
        tokenizer = get_word_vocab(train_args)
        dataset = convert_input_file_to_RnnTensor(examples,
                                                  tokenizer,
                                                  train_args,
                                                  pred_config,
                                                  pad_token_id=pad_label_id)

        # run prediction
        intent_preds, slot_preds, slot_masks = get_s2s_predict(
            model, dataset, pred_config, train_args, slot_vocab, device)
    elif pred_config.model_type.endswith('bert'):
        # convert data to tensor
        tokenizer = load_tokenizer(train_args)
        dataset = convert_input_file_to_BertTensor(examples,
                                                   tokenizer,
                                                   train_args,
                                                   pred_config,
                                                   pad_label_id=pad_label_id)

        # run prediction
        intent_preds, slot_preds, slot_masks = get_pretrain_predict(
            model, dataset, pred_config, train_args, device)

    logger.info('***Display PredictInfo***')
    logger.info(f'Predict number:{len(dataset)}')
    logger.info(f'Predict max_seqLen:{train_args.max_seqLen}')
    logger.info(f'Whether to use CRF:{train_args.use_crf}')

    intent_labels, slot_labels = convert_to_labels(intent_vocab, slot_vocab,
                                                   intent_preds, slot_preds,
                                                   slot_masks, pad_label_id)
    # output to file
    write_predsFile(pred_config, examples, intent_labels, slot_labels)
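As a rough illustration of the final step, a simplified stand-in for convert_to_labels could map index predictions back to label strings while dropping masked and padded positions; the function name, masking rule, and example values below are illustrative, not the project's code.

# Simplified illustration of mapping index predictions back to label strings;
# the project's convert_to_labels may differ. Masked / padded positions are dropped.
def to_label_strings(intent_vocab, slot_vocab, intent_preds, slot_preds,
                     slot_masks, pad_label_id):
    intent_labels = [intent_vocab[i] for i in intent_preds]
    slot_labels = []
    for pred_seq, mask_seq in zip(slot_preds, slot_masks):
        slot_labels.append([slot_vocab[p] for p, m in zip(pred_seq, mask_seq)
                            if m and p != pad_label_id])
    return intent_labels, slot_labels


# Toy example: two utterances, up to three slot positions each.
print(to_label_strings(
    intent_vocab=["greet", "book_flight"],
    slot_vocab=["O", "B-city", "B-date"],
    intent_preds=[1, 0],
    slot_preds=[[0, 1, 2], [0, 0, 0]],
    slot_masks=[[1, 1, 1], [1, 1, 0]],
    pad_label_id=-100))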
Example #4
    def do_preprocess(self, url_list):
        """
        测试数据预处理
        :param url_list:
        :return:
        """
        self.chars_dict = self.load_dict(CHARS_DICT_FILE)
        self.ngrams_dict = self.load_dict(NGRAMS_DICT_FILE)
        self.words_dict = self.load_dict(WORDS_DICT_FILE)

        x, word_reverse_dict = get_word_vocab(url_list, MAX_LENGTH_WORDS)
        word_x = get_words(x, word_reverse_dict, DELIMIT_MODE, url_list)

        self.ngramed_id_x, self.worded_id_x = \
            ngram_id_x_from_dict(word_x, MAX_LENGTH_SUBWORDS, self.ngrams_dict, self.words_dict)
        self.chared_id_x = get_char_id_x(url_list, self.chars_dict,
                                         MAX_LENGTH_CHARS)
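Examples #1 and #4 persist and reload the ngram/word/char dictionaries through dump_dict and load_dict. A plain-pickle sketch of such helpers is shown below; the serialization format and file naming are assumptions.

# Minimal sketch of the dictionary persistence helpers used in Examples #1 and #4;
# pickle is an assumption, the project may serialize differently.
import pickle

def dump_dict(mapping, file_path):
    """Write a vocabulary dict (e.g. ngrams_dict) to disk."""
    with open(file_path, "wb") as f:
        pickle.dump(mapping, f)

def load_dict(file_path):
    """Load a vocabulary dict previously written by dump_dict."""
    with open(file_path, "rb") as f:
        return pickle.load(f)


dump_dict({"goo": 1, "gle": 2}, "ngrams_dict.pkl")
print(load_dict("ngrams_dict.pkl"))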
Example #5
    def __init__(self,
                 train_set,
                 val_set,
                 test_set,
                 args,
                 model=None,
                 pretrained_path=None):

        self.train_set = train_set
        self.val_set = val_set
        self.test_set = test_set
        self.args = args
        self.word_vocab = utils.get_word_vocab(args)
        self.intent_vocab = get_intent_labels(args)
        self.slot_vocab = get_slot_labels(args)
        self.device = torch.device('cuda:0') if torch.cuda.is_available() \
            and not args.no_cuda else torch.device('cpu')
        self.model = model
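The device selection at the end of the constructor is presumably the same idiom that set_device wraps in Example #3. A standalone version could look like this, with no_cuda mirroring the attribute referenced above.

# Standalone version of the CUDA/CPU selection above; no_cuda mirrors the args
# attribute referenced in the example.
import torch
from types import SimpleNamespace

def set_device(args):
    """Pick cuda:0 when available and not explicitly disabled, else CPU."""
    if torch.cuda.is_available() and not args.no_cuda:
        return torch.device("cuda:0")
    return torch.device("cpu")


print(set_device(SimpleNamespace(no_cuda=False)))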
Example #6
        'train.add_expert_feature': add_expert_feature,
        'train.l2_reg_lambda': 0.0,
        'train.lr': 0.001,
        'model.filter_sizes': "3,4,5,6",
        'train.batch_size': 10,
        'train.nb_epochs': 5,
        'log.print_every': 50,
        'log.eval_every': 50,
        'log.checkpoint_every': 50
    }

urls, labels = read_data(FLAGS["data.data_dir"])

high_freq_words = None
if FLAGS["data.min_word_freq"] > 0:
    x1, word_reverse_dict = get_word_vocab(urls, FLAGS["data.max_len_words"],
                                           FLAGS["data.min_word_freq"])
    high_freq_words = sorted(list(word_reverse_dict.values()))
    print("Number of words with freq >={}: {}".format(
        FLAGS["data.min_word_freq"], len(high_freq_words)))

expert_features_x = get_expert_features(urls)
x, word_reverse_dict = get_word_vocab(urls, FLAGS["data.max_len_words"])
word_x = get_words(x, word_reverse_dict, FLAGS["data.delimit_mode"], urls)
ngramed_id_x, ngrams_dict, worded_id_x, words_dict = ngram_id_x(
    word_x, FLAGS["data.max_len_subwords"], high_freq_words)

chars_dict = ngrams_dict
chared_id_x = char_id_x(urls, chars_dict, FLAGS["data.max_len_chars"])

pos_x = []
neg_x = []
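The snippet breaks off here, but Example #1 shows where it is heading: the positive/negative index lists feed a train/test split. A simplified, self-contained stand-in for that step (in place of prep_train_test) might look like the following; the shuffling and percentage handling are assumptions.

# Simplified stand-in for the pos/neg index split and prep_train_test call
# seen in Example #1; shuffling and the percentage convention are assumptions.
import numpy as np

def split_train_test(pos_idx, neg_idx, dev_percentage, seed=0):
    """Return x_train, y_train, x_test, y_test over example indices."""
    rng = np.random.default_rng(seed)
    x = np.array(list(pos_idx) + list(neg_idx))
    y = np.array([1] * len(pos_idx) + [0] * len(neg_idx))

    order = rng.permutation(len(x))             # shuffle before splitting
    x, y = x[order], y[order]
    n_dev = int(len(x) * dev_percentage / 100)  # assumed to be a percentage
    return x[n_dev:], y[n_dev:], x[:n_dev], y[:n_dev]


pos_x = [0, 2, 4]
neg_x = [1, 3, 5]
x_train, y_train, x_test, y_test = split_train_test(pos_x, neg_x, dev_percentage=34)
print(len(x_train), len(x_test))  # 4 2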