def do_preprocess(self, url_list, label_list):
    """
    Preprocess the training data.

    :param url_list: list of raw URL strings
    :param label_list: list of labels (1 = malicious, 0 = benign)
    :return:
    """
    if MIN_WORD_FREQ > 0:
        # Build a vocabulary restricted to words at or above the frequency threshold.
        x__, word_reverse_dict = get_word_vocab(url_list, MAX_LENGTH_WORDS,
                                                MIN_WORD_FREQ)
        self.high_freq_words = sorted(list(word_reverse_dict.values()))

    self.x, self.word_reverse_dict = get_word_vocab(url_list, MAX_LENGTH_WORDS)
    word_x = get_words(self.x, self.word_reverse_dict, DELIMIT_MODE, url_list)
    # Build sub-word (n-gram) and word-level id sequences plus their dictionaries.
    self.ngramed_id_x, self.ngrams_dict, self.worded_id_x, self.words_dict = \
        ngram_id_x(word_x, MAX_LENGTH_SUBWORDS, self.high_freq_words)
    self.chars_dict = self.ngrams_dict
    self.chared_id_x = get_char_id_x(url_list, self.chars_dict, MAX_LENGTH_CHARS)

    # Split sample indices into malicious (pos) and benign (neg) groups.
    pos_x, neg_x = list(), list()
    for index in range(len(label_list)):
        label = label_list[index]
        if label == 1:
            pos_x.append(index)
        else:
            neg_x.append(index)
    print("Overall Mal/Ben split: {}/{}".format(len(pos_x), len(neg_x)))
    pos_x = np.array(pos_x)
    neg_x = np.array(neg_x)

    self.x_train, self.y_train, self.x_test, self.y_test = prep_train_test(
        pos_x, neg_x, DEV_PERCENTAGE)
    # Project the train/test index splits onto each feature representation.
    self.x_train_char = get_ngramed_id_x(self.x_train, self.ngramed_id_x)
    self.x_test_char = get_ngramed_id_x(self.x_test, self.ngramed_id_x)
    self.x_train_word = get_ngramed_id_x(self.x_train, self.worded_id_x)
    self.x_test_word = get_ngramed_id_x(self.x_test, self.worded_id_x)
    self.x_train_char_seq = get_ngramed_id_x(self.x_train, self.chared_id_x)
    self.x_test_char_seq = get_ngramed_id_x(self.x_test, self.chared_id_x)
    # Persist the dictionaries so test-time preprocessing can reuse them.
    self.dump_dict(self.ngrams_dict, NGRAMS_DICT_FILE)
    self.dump_dict(self.words_dict, WORDS_DICT_FILE)
    self.dump_dict(self.chars_dict, CHARS_DICT_FILE)
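# Usage sketch (an assumption, not from the source): driving the training-time
# preprocessor above on a toy URL list. The owner class name `URLPreprocessor`
# and the sample data are hypothetical; do_preprocess() and the attributes it
# fills are taken from the method above.
pre = URLPreprocessor()  # hypothetical class that owns do_preprocess
urls = ["http://example.com/index.html",
        "http://phish.example/login.php?acct=1"]
labels = [0, 1]  # 1 = malicious, 0 = benign
pre.do_preprocess(urls, labels)
# pre.x_train_char / pre.x_train_word / pre.x_train_char_seq now hold the
# n-gram-, word-, and character-level id inputs for training, and the
# ngram/word/char dictionaries have been dumped for test-time reuse.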
def main(args):
    set_randomSeed(args)
    init_logger()

    # Load the tokenizer that matches the model family.
    if args.model_type.endswith('S2S'):
        tokenizer = get_word_vocab(args)
    elif args.model_type.endswith('bert'):
        tokenizer = load_tokenizer(args)

    # Load (and cache) the datasets.
    train_set = load_and_cacheExampels(args, tokenizer, 'train')
    val_set = load_and_cacheExampels(args, tokenizer, 'dev')
    test_set = load_and_cacheExampels(args, tokenizer, 'test')

    # Build the training process.
    if args.model_type.endswith('S2S'):
        proccesser = RnnTrainer(train_set, val_set, test_set, args)
    elif args.model_type.endswith('bert'):
        proccesser = PreTrainedTrainer(train_set, val_set, test_set, args)

    if args.do_train:
        proccesser.train_model()

    if args.do_eval:
        # Reload the trained model from model_dir before evaluating.
        if args.model_type.endswith('S2S'):
            proccesser = RnnTrainer.reload_data_(args.model_dir, train_set,
                                                 val_set, test_set)
        elif args.model_type.endswith('bert'):
            proccesser = PreTrainedTrainer.reload_data_(args.model_dir, train_set,
                                                        val_set, test_set)
        proccesser.evaluate('eval')
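# Assumed CLI wiring (a minimal sketch, not the repo's actual parser): main()
# above reads model_type, do_train, do_eval, and model_dir from args, and the
# trainer __init__ below additionally checks no_cuda, so an argparse setup
# covering just those fields could look like this.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_type', default='bert',
                        help="an 'S2S' suffix selects RnnTrainer, a 'bert' suffix PreTrainedTrainer")
    parser.add_argument('--model_dir', default='./model',
                        help='directory checkpoints are saved to and reloaded from')
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_eval', action='store_true')
    parser.add_argument('--no_cuda', action='store_true',
                        help='force CPU even when CUDA is available')
    main(parser.parse_args())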
def main(pred_config):
    init_logger()
    device = set_device(pred_config)

    # Load the arguments that were saved at training time.
    args_path = os.path.join(pred_config.model_dir, 'train_args.bin')
    train_args = torch.load(args_path)

    # Load the label vocabularies.
    intent_vocab = get_intent_labels(train_args)
    slot_vocab = get_slot_labels(train_args)

    # Load the pretrained model.
    model = load_pretrainModel(pred_config, train_args,
                               len(intent_vocab), len(slot_vocab))
    model.to(device)

    # Read the input data.
    examples = read_file(pred_config)
    pad_label_id = pred_config.pad_label_id
    logger.info(f'Start to predict using {pred_config.model_type}')

    if pred_config.model_type.endswith('S2S'):
        # Convert the input file to tensors for the RNN model, then predict.
        tokenizer = get_word_vocab(train_args)
        dataset = convert_input_file_to_RnnTensor(examples, tokenizer, train_args,
                                                  pred_config,
                                                  pad_token_id=pad_label_id)
        intent_preds, slot_preds, slot_masks = get_s2s_predict(
            model, dataset, pred_config, train_args, slot_vocab, device)
    elif pred_config.model_type.endswith('bert'):
        # Convert the input file to tensors for the BERT model, then predict.
        tokenizer = load_tokenizer(train_args)
        dataset = convert_input_file_to_BertTensor(examples, tokenizer, train_args,
                                                   pred_config,
                                                   pad_label_id=pad_label_id)
        intent_preds, slot_preds, slot_masks = get_pretrain_predict(
            model, dataset, pred_config, train_args, device)

    logger.info('***Display PredictInfo***')
    logger.info(f'Predict number: {len(dataset)}')
    logger.info(f'Predict max_seqLen: {train_args.max_seqLen}')
    logger.info(f'Whether to use CRF: {train_args.use_crf}')

    # Map id predictions back to label strings, honoring the padding mask.
    intent_labels, slot_labels = convert_to_labels(intent_vocab, slot_vocab,
                                                   intent_preds, slot_preds,
                                                   slot_masks, pad_label_id)
    # Write the predictions to the output file.
    write_predsFile(pred_config, examples, intent_labels, slot_labels)
def do_preprocess(self, url_list):
    """
    Preprocess the test data.

    :param url_list: list of raw URL strings
    :return:
    """
    # Reuse the dictionaries persisted during training preprocessing so that
    # ids stay consistent between training and inference.
    self.chars_dict = self.load_dict(CHARS_DICT_FILE)
    self.ngrams_dict = self.load_dict(NGRAMS_DICT_FILE)
    self.words_dict = self.load_dict(WORDS_DICT_FILE)

    x, word_reverse_dict = get_word_vocab(url_list, MAX_LENGTH_WORDS)
    word_x = get_words(x, word_reverse_dict, DELIMIT_MODE, url_list)
    # Map words and sub-word n-grams onto the training-time dictionaries
    # instead of rebuilding them.
    self.ngramed_id_x, self.worded_id_x = \
        ngram_id_x_from_dict(word_x, MAX_LENGTH_SUBWORDS,
                             self.ngrams_dict, self.words_dict)
    self.chared_id_x = get_char_id_x(url_list, self.chars_dict, MAX_LENGTH_CHARS)
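# Test-time usage sketch (an assumption, reusing the hypothetical
# `URLPreprocessor` from the training sketch above): no labels are passed, and
# the dumped dictionaries are loaded rather than rebuilt.
test_pre = URLPreprocessor()
test_pre.do_preprocess(["http://unknown.example/download.exe"])
# test_pre.chared_id_x / ngramed_id_x / worded_id_x are ready as model inputs.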
def __init__(self, train_set, val_set, test_set, args,
             model=None, pretrained_path=None):
    self.train_set = train_set
    self.val_set = val_set
    self.test_set = test_set
    self.args = args
    self.word_vocab = utils.get_word_vocab(args)
    self.intent_vocab = get_intent_labels(args)
    self.slot_vocab = get_slot_labels(args)
    self.device = torch.device('cuda:0') if torch.cuda.is_available() \
        and not args.no_cuda else torch.device('cpu')
    self.model = model
    'train.add_expert_feature': add_expert_feature,
    'train.l2_reg_lambda': 0.0,
    'train.lr': 0.001,
    'model.filter_sizes': "3,4,5,6",
    'train.batch_size': 10,
    'train.nb_epochs': 5,
    'log.print_every': 50,
    'log.eval_every': 50,
    'log.checkpoint_every': 50
}

urls, labels = read_data(FLAGS["data.data_dir"])

high_freq_words = None
if FLAGS["data.min_word_freq"] > 0:
    x1, word_reverse_dict = get_word_vocab(urls, FLAGS["data.max_len_words"],
                                           FLAGS["data.min_word_freq"])
    high_freq_words = sorted(list(word_reverse_dict.values()))
    print("Number of words with freq >= {}: {}".format(
        FLAGS["data.min_word_freq"], len(high_freq_words)))

expert_features_x = get_expert_features(urls)
x, word_reverse_dict = get_word_vocab(urls, FLAGS["data.max_len_words"])
word_x = get_words(x, word_reverse_dict, FLAGS["data.delimit_mode"], urls)
ngramed_id_x, ngrams_dict, worded_id_x, words_dict = ngram_id_x(
    word_x, FLAGS["data.max_len_subwords"], high_freq_words)
chars_dict = ngrams_dict
chared_id_x = char_id_x(urls, chars_dict, FLAGS["data.max_len_chars"])

pos_x = []
neg_x = []
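# The script fragment above stops right after initializing pos_x/neg_x. A
# plausible continuation, sketched here by mirroring the do_preprocess method
# earlier in this section, is not the script's verbatim code; the FLAGS key
# "data.dev_pct" is an assumption (the method above passes DEV_PERCENTAGE).
for i, label in enumerate(labels):
    if label == 1:
        pos_x.append(i)  # malicious
    else:
        neg_x.append(i)  # benign
print("Overall Mal/Ben split: {}/{}".format(len(pos_x), len(neg_x)))
pos_x = np.array(pos_x)
neg_x = np.array(neg_x)
x_train, y_train, x_test, y_test = prep_train_test(
    pos_x, neg_x, FLAGS["data.dev_pct"])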