import json
import os

import zhconv

import cut_sentence


def predict_document(file_name, model):
    """Segment each line of a plain-text file and classify the whole document."""
    document = []
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            # Tokenize each line into a word array for the classifier.
            document.append(cut_sentence.segment(line, type='arr'))
    return predict_naive_bayes(document, model)
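# Example usage (a minimal sketch, not part of this module): `model` is
# assumed to be whatever predict_naive_bayes expects, produced by the
# project's own training step; 'news.txt' is a hypothetical plain-text
# file with one sentence per line.
#
#     model = ...  # trained naive Bayes model from this project
#     label = predict_document('news.txt', model)
#     print(label)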
def test_naive_bayes(test_file):
    """Load a JSON-lines test file and return its documents as token arrays."""
    print("Load ", test_file)
    document = []
    with open(test_file, 'r', encoding='utf-8') as test_set:
        for line in test_set:
            data = json.loads(line)
            document.append(cut_sentence.segment(data['text'], type='arr'))
    return document
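# Input format expected by test_naive_bayes (inferred from the json.loads and
# data['text'] calls above): one JSON object per line, each carrying the raw
# text under a 'text' key. Example lines (hypothetical content):
#
#     {"text": "今天天气很好"}
#     {"text": "股市小幅上涨"}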
def save_sentence_from_json(rootdir):
    """Append the segmented text of every per-class JSON file to one corpus file."""
    directories = os.listdir(rootdir)
    with open(os.path.join(rootdir, 'sentences'), 'a', encoding='utf-8') as fw:
        for directory in directories:
            class_dir = os.path.join(rootdir, directory)
            for file_name in os.listdir(class_dir):
                with open(os.path.join(class_dir, file_name), 'r', encoding='utf-8') as fr:
                    for line in fr:
                        datum = json.loads(line)
                        # Normalize traditional Chinese to simplified before segmenting.
                        text = zhconv.convert(datum['text'], 'zh-cn')
                        fw.write(cut_sentence.segment(text))
                print(file_name + ' finished')
    print('save_sentence_from_json ok')
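# Directory layout assumed by save_sentence_from_json (inferred from the
# nested os.listdir calls above); the class and file names are hypothetical:
#
#     rootdir/
#         sports/
#             part-000.json
#         finance/
#             part-000.json
#
# The segmented sentences of every file are appended to rootdir/sentences.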
def load_dataset(rootdir):
    """Load up to max_sample segmented training documents for each class."""
    rootdir = os.path.abspath(rootdir) + '/'
    documents = {}
    for clazz in os.listdir(rootdir):
        print("Load ", clazz)
        document = []
        with open(rootdir + clazz + '/train_set', 'r', encoding='utf-8') as train_set:
            for line in train_set:
                data = json.loads(line)
                document.append(cut_sentence.segment(data['text'], type='arr'))
                # max_sample is a module-level cap on samples per class.
                if len(document) > max_sample:
                    break
        documents[clazz] = document
    return documents
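# A minimal usage sketch: load the per-class training documents and report
# how many were read. The './data/' path is hypothetical; it should contain
# one sub-directory per class, each with a 'train_set' JSON-lines file whose
# records have a 'text' field.
if __name__ == '__main__':
    docs = load_dataset('./data/')
    for clazz, class_docs in docs.items():
        print(clazz, len(class_docs), 'documents')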