def test(method, train_data, test_data):
    config = function.read_json_file(NET_CONFIG_FOLDER + "/" + method + ".json")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    train_loader, val_loader, test_loader = torch_data.load_data(
        train_data, test_data, config['sentence_len'], config['batch_size'], device)
    # build the network named by `method`; CNN is the fallback
    if method == 'RNN':
        net = RNN(config)
    elif method == 'MLP':
        net = MLP(config)
    elif method == 'RCNN':
        net = RCNN(config)
    elif method == 'FastText':
        net = FastText(config)
    else:
        net = CNN(config)
    net.to(device)
    if 'load' in config:  # read a pre-trained net
        net.load_state_dict(torch.load(NET_FOLDER + "/" + method))
    else:
        net = train(config, net, train_loader, val_loader)
        if 'save' in config:  # save the newly trained net
            torch.save(net.state_dict(), NET_FOLDER + "/" + method)
    _, coefficient, report = test_stat(test_loader, net)
    print("Test result:")
    print(report)
    print(f"coefficient = {coefficient:.4f}")
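# A hypothetical sketch of the JSON config consumed by test() above. The keys
# 'sentence_len' and 'batch_size' are read directly in the code, while 'load'
# and 'save' are checked only for presence, so any value switches them on; the
# concrete values below are invented for illustration.
#
#   {
#       "sentence_len": 64,
#       "batch_size": 32,
#       "save": true
#   }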
def gen_test_set(file_path, test_count=500, only_long_sentence=True):
    homo_dic = function.read_json_file(HOMO_DIC_PATH)
    all_sentences = function.read_json_file(file_path)
    all_length = len(all_sentences)
    test_index = random.sample(range(all_length), test_count)
    answers = []
    inputs = []
    char_count = 0
    for index in test_index:
        sentence = all_sentences[index]
        length = len(sentence)
        # the interleaved format stores one digit per character, so a string
        # shorter than 10 holds fewer than 5 characters
        if only_long_sentence and length < 10:
            continue
        chars = [sentence[i] for i in range(0, length, 2)]
        pinyin_ids = [int(sentence[i]) for i in range(1, length, 2)]
        pinyins = []
        for char, pinyin_id in zip(chars, pinyin_ids):
            for dic_pinyin, dic_id in homo_dic[char].items():
                if dic_id == pinyin_id:
                    pinyins.append(dic_pinyin)
                    break  # each id maps to one pinyin; stop at the first match
        answers.append(''.join(chars) + '\n')
        inputs.append(' '.join(pinyins) + '\n')
        char_count += len(chars)
    # delete the sampled test sentences from the training file; use a set for
    # O(1) membership checks
    test_index_set = set(test_index)
    new_all_sentences = [sentence for index, sentence in enumerate(all_sentences)
                         if index not in test_index_set]
    function.write_json_file(file_path, new_all_sentences)
    with open(TEST_INPUT, "a") as file:
        file.writelines(inputs)
    with open(TEST_ANSWER, "a", encoding='gbk') as file:
        file.writelines(answers)
    print(f"Generated a test set with {len(inputs)} sentences and {char_count} characters. "
          f"Test input appended to {TEST_INPUT}; answers appended to {TEST_ANSWER}.")
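# Illustrative helper (not part of the original pipeline): decodes the
# interleaved format that gen_test_set consumes, where characters sit at even
# indices and their single-digit homophone ids at odd indices. For example,
# _decode_interleaved("中0文1") returns (['中', '文'], [0, 1]).
def _decode_interleaved(sentence):
    chars = [sentence[i] for i in range(0, len(sentence), 2)]
    pinyin_ids = [int(sentence[i]) for i in range(1, len(sentence), 2)]
    return chars, pinyin_ids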
def data2tensor_vocab(self, news_data):
    word2id = function.read_json_file(WORD2ID_PATH)
    sentences, labels, emotions = function.data2vec(news_data)
    texts_id = []
    for sentence in sentences:
        word_list = sentence.split(' ')
        # map each word to its vocabulary id, falling back to the unknown-word id
        words_id = [word2id.get(word, word2id[UNKNOWN]) for word in word_list]
        if len(words_id) < self.sentence_len:  # pad short sentences
            words_id.extend([word2id[PADDING]] * (self.sentence_len - len(words_id)))
        texts_id.append(words_id[:self.sentence_len])  # truncate long ones
    return (torch.tensor(texts_id).to(self.device),
            torch.tensor(labels).to(self.device),
            torch.tensor(emotions).to(self.device))
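# Worked example of the pad/truncate rule above (vocabulary and ids invented):
# with sentence_len = 5 and word2id = {'<pad>': 0, '<unk>': 1, '天气': 2, '好': 3},
# the sentence '天气 很 好' maps to [2, 1, 3, 0, 0] -- the out-of-vocabulary word
# falls back to the <unk> id, the sequence is padded with <pad> to sentence_len,
# and a longer sequence would be truncated to exactly sentence_len ids.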
def __init__(self, model_path):
    model_dic = function.read_json_file(model_path)
    self.model = Model.input_dict(model_dic)
    self.pinyin2char = function.read_json_file(PINYIN2CHAR_PATH)
    print("Model loaded.")
import argparse
import sys

import numpy as np
import torch

from constants import *
import baseline_ml
import deep_learning
import function
import preprocess


def parse(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--command", required=True, choices=['preprocess', 'test'])
    parser.add_argument("--method",
                        choices=['naive_bayes', 'SVM', 'CNN', 'RNN', 'MLP', 'RCNN', 'FastText'],
                        default='naive_bayes')
    parser.add_argument("--preprocess_folder", default=SINA_FOLDER)
    parser.add_argument("--train_path", default=TRAIN_PATH)
    parser.add_argument("--test_path", default=TEST_PATH)
    parser.add_argument("--ori_embedding_path", default=ORI_EMB_PATH)
    parser.add_argument("--embedding_path", default=EMBEDDING_PATH)
    return parser.parse_args(argv)


if __name__ == '__main__':
    args = parse(sys.argv[1:])
    if args.command == 'test':
        test_data = function.read_json_file(args.test_path)
        train_data = function.read_json_file(args.train_path)
        if args.method in ('naive_bayes', 'SVM'):
            baseline_ml.test(args.method, train_data, test_data)
        else:
            deep_learning.test(args.method, train_data, test_data)
    if args.command == 'preprocess':
        preprocess.preprocess(args.preprocess_folder, args.ori_embedding_path)
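# Example invocations, assuming this module is saved as main.py (the actual
# file name is not shown here) and the default paths from constants are in place:
#
#   python main.py --command preprocess --preprocess_folder <corpus_folder>
#   python main.py --command test --method CNN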
def preprocess(folder_path, name):
    char_table = function.read_json_file(CHAR_TABLE_PATH)
    homo_dic = function.read_json_file(HOMO_DIC_PATH)

    # Cut a line into Chinese sentences. If ignore_number is set, skip every
    # sentence that contains a digit.
    def cut_sentences(inp_line, ignore_number=False):
        pro_str = ""
        sentences = []
        valid_str = True
        for char in inp_line:
            if char in char_table:
                pro_str += char
            elif char.encode("utf-8").isdigit():
                if not ignore_number:
                    pro_str += char
                else:
                    valid_str = False
            elif char in SEPARATOR:
                if valid_str:
                    sentences.append(pro_str)
                pro_str = ""
                valid_str = True
            else:
                # skip English characters and other punctuation such as " " and "/"
                continue
        ret_sentences = []
        for sentence in sentences:
            if len(sentence) < 2:  # sentence too short, ignore
                continue
            try:
                ret_sentences.append(cn2an.transform(sentence, mode="an2cn"))
            except ValueError:  # number too long, ignore the sentence
                continue
        return ret_sentences

    # Append each character's homophone id from homo_dic.
    def label_homo(sentence):
        pinyins = pypinyin.lazy_pinyin(sentence)
        ret_sentence = ""
        for char, pinyin in zip(sentence, pinyins):
            ret_sentence += char
            pinyin = function.pinyin_fix(pinyin)
            try:
                ret_sentence += str(homo_dic[char][pinyin])
            except KeyError:
                # pinyin not in the dict, caused by conflicts between pypinyin
                # and the "拼音汉字表" (pinyin-character table)
                print(char, pinyin)
                ret_sentence += "0"
        return ret_sentence

    def process_file(file_path, cnt, batch_name):
        all_sentences = []
        if batch_name == 'sina':
            with open(file_path, encoding="gbk") as file:
                lines = file.readlines()
            for line in lines:
                news_piece = json.loads(line)
                title = news_piece["title"]
                content = news_piece["html"]
                all_sentences += cut_sentences(title)
                all_sentences += cut_sentences(content)
        elif batch_name == 'weixin':
            with open(file_path) as file:
                lines = file.readlines()
            length = len(lines)
            for line_index in range(0, length, 3):  # sample 1/3 of the weixin corpus
                content = json.loads(lines[line_index])['content']
                all_sentences += cut_sentences(content, ignore_number=True)
        sentences_with_pinyin = [label_homo(sentence) for sentence in all_sentences]
        save_path = TRAINING_DATA_PATH + f"/{name}-{cnt}.json"
        function.write_json_file(save_path, sentences_with_pinyin)
        print(f"{file_path} processed. Saved as {save_path}")

    function.pypinyin_fix()
    all_files_paths = os.listdir(folder_path)
    for index, rel_path in enumerate(all_files_paths):
        path = folder_path + "/" + rel_path
        try:
            print(f"Begin processing {path}")
            process_file(path, index, name)
        except UnicodeDecodeError:
            print("Illegal file, skipping.")
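# A minimal sketch of label_homo's output format, with invented ids: if
# homo_dic['长'] were {'chang': 0, 'zhang': 1} and homo_dic['大'] were {'da': 0},
# then label_homo('长大') would return '长1大0' -- each character immediately
# followed by the id of the pinyin that pypinyin picked for it in context.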
def train_file(file_path):
    data = function.read_json_file(file_path)
    for sentence in data:
        # frame the sentence with (n_gram - 1) begin markers 'bb' and an end marker 'ee'
        sentence = ('bb' * (n - 1)) + sentence + 'ee'
        model.train(sentence)
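# Worked example of the framing above, assuming a trigram model (n == 3):
# ('bb' * (3 - 1)) + '你好' + 'ee' evaluates to 'bbbb你好ee', so the model sees
# explicit boundary characters and even the first real character is trained
# with a non-empty history; the begin marker is repeated (n - 1) times while
# 'ee' appears once.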
def get_embedding():
    embedding = function.read_json_file(EMBEDDING_PATH)
    return torch.tensor(embedding['list'])