def train_test_system(features_names_list, data_info_file, system, output_filename):
    """
    Train a classifier, predict on the gold file, write the predictions to an
    output file and register that output file in the data info file.

    :param features_names_list: list of indications of all feature columns that should be used
    :param data_info_file: path to file containing info about all necessary data
    :param system: name of the ML algorithm that is passed to the classifier
    :param output_filename: path to conll outputfile
    :type features_names_list: list
    :type data_info_file: string
    :type system: string
    :type output_filename: string
    """
    data = load_json(data_info_file)

    # Train model
    inputfile = data['training']['file']
    annotation_column = data['training']['annotation_column']
    model = TextClassifier(system)
    model.train(inputfile, features_names_list, annotation_column)

    # Classify
    gold_file = data['gold']['file']
    predictions = model.predict(gold_file)

    # Write output
    append_column_and_write_file(output_filename, gold_file, predictions, 'predictions')

    # Update data info: the entry is keyed by the output file's base name
    # without its extension. splitext is used instead of slicing off a fixed
    # six characters so the key is correct for any extension, not only
    # ".conll" (for which the result is unchanged).
    name = os.path.splitext(os.path.basename(output_filename))[0]
    data[name] = {'annotation_column': 'predictions', 'file': output_filename}
    dump_json(data_info_file, data)
def model_xunlian():
    """Prepare the four category datasets, fit a TextClassifier and save it.

    The four categories are loaded under the dataset names '病因', '诊断',
    '症状' and '治疗' and labelled 'pathogeny', 'diagnosis', 'symptom' and
    'treatment' respectively.

    Returns:
        A tuple ``(bingyin_list, zhenduan_list, zhengzhuang_list,
        zhiliao_list, x_train, x_test, y_train, y_test)``: the four
        per-category lists filled by ``preprocess_lines`` followed by the
        train/test split of the shuffled, labelled sentences.
    """
    # Read the raw data for every category.
    raw_frames = [load_dataset(name) for name in ('病因', '诊断', '症状', '治疗')]

    # Null-value handling for every category, then convert to plain lists.
    cleaned_frames = [processing_null(frame) for frame in raw_frames]
    bingyin, zhenduan, zhengzhuang, zhiliao = (
        frame.values.tolist() for frame in cleaned_frames
    )

    labelled_rows = (
        (bingyin, 'pathogeny'),
        (zhenduan, 'diagnosis'),
        (zhengzhuang, 'symptom'),
        (zhiliao, 'treatment'),
    )

    # Collect all categories into one combined list.
    # presumably each entry is a (text, label) pair, since it is unpacked
    # with zip(*sentences) below -- verify against preprocess_text.
    sentences = []
    text_prep = preprocess(sentences, bingyin, zhenduan, zhengzhuang, zhiliao)
    for rows, label in labelled_rows:
        text_prep.preprocess_text(rows, sentences, label)
    random.shuffle(sentences)

    # Collect every category into its own list.
    bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list = [], [], [], []
    line_prep = preprocess2(bingyin_list, zhenduan_list, zhengzhuang_list,
                            zhiliao_list, bingyin, zhenduan, zhengzhuang,
                            zhiliao)
    category_lists = (bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list)
    for (rows, label), target in zip(labelled_rows, category_lists):
        line_prep.preprocess_lines(rows, target, label)

    # Split the data.
    x, y = zip(*sentences)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

    # Train the model.
    text_classifier = TextClassifier()
    text_classifier.fit(x_train, y_train)

    # Save the fitted model.
    joblib.dump(text_classifier, 'text_classifier.pkl')

    return (bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list,
            x_train, x_test, y_train, y_test)
def test(hparams, model_path="/home/marcelbraasch/PycharmProjects/MultiClassTextClassifier/Models/model_2.pt"):
    """Load saved weights into a new TextClassifier and print its confusion output.

    :param hparams: hyper-parameters passed unchanged to ``TextClassifier``
    :param model_path: path of the state-dict file to load; defaults to the
        location that used to be hard-coded so existing callers are unaffected
    """
    model = TextClassifier(hparams)
    # map_location="cpu" lets a state dict that was saved from a GPU run be
    # loaded on a machine without CUDA; load_state_dict then copies the
    # values onto whatever device the model's parameters live on.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()
    # One item of get_confusion() per output line.
    print(*model.get_confusion(), sep="\n")
def main(hparams):
    """Train and test a TextClassifier, save its weights and log its confusion output.

    :param hparams: mapping of hyper-parameters; this function reads the keys
        ``"min_epochs"``, ``"max_epochs"`` and ``"no"`` and passes the whole
        mapping to ``TextClassifier``
    """
    # The previous torch.device(...) statements built a device object that was
    # never used, so they have been removed.
    model = TextClassifier(hparams)
    tb_logger = loggers.TensorBoardLogger('logs/')
    checkpoint_callback = ModelCheckpoint(monitor='val_loss')
    # NOTE(review): default_root_dir is an absolute path ("/Models/...") while
    # the weights below are saved under the relative "Models/" -- confirm intent.
    trainer = Trainer(min_epochs=hparams["min_epochs"],
                      max_epochs=hparams["max_epochs"],
                      logger=tb_logger,
                      callbacks=[checkpoint_callback],
                      default_root_dir="/Models/checkpoints")
    trainer.fit(model)
    trainer.test()  # loads the best model automatically

    # NOTE(review): this saves the weights held by `model` after fit, which
    # may differ from the best checkpoint used by trainer.test() -- confirm.
    torch.save(model.state_dict(), f"Models/model_{hparams['no']}.pt")

    model.eval()
    with open("log.txt", mode="a") as f:
        # Terminate every entry with a newline so entries (and successive
        # runs appended to the same file) do not run together; this matches
        # the one-item-per-line output printed by test().
        f.writelines(str(line) + "\n" for line in model.get_confusion())
# Shuffle both corpora in place before splitting.
for corpus in (my_texts, other_texts):
    random.shuffle(corpus)

# The first 80% of each corpus is used for training, the rest for testing.
train_percent = 0.8
slice_my_index = int(len(my_texts) * train_percent)
slice_other_index = int(len(other_texts) * train_percent)

train_my_texts, test_my_texts = (my_texts[:slice_my_index],
                                 my_texts[slice_my_index:])
train_other_texts, test_other_texts = (other_texts[:slice_other_index],
                                       other_texts[slice_other_index:])

# Index 0 marks "my" texts and index 1 marks "other" texts, in the same
# order in which the texts are concatenated below.
target_indices = [0] * len(train_my_texts) + [1] * len(train_other_texts)
test_target_indices = [0] * len(test_my_texts) + [1] * len(test_other_texts)

training_data = train_my_texts + train_other_texts
test_data = test_my_texts + test_other_texts

targets = TARGETS
classifier = TextClassifier(training_data, targets, target_indices)
# Alternative (disabled) evaluation path:
# classifier.train('svm')
# classifier.predict(test_data, test_target_indices)
print('----------------------')
classifier.train_nltk()
classifier.test_nltk(test_data)
# Feed every category, with its label, into the shared sentence list.
for category_rows, category_label in (
    (bingyin, 'pathogeny'),
    (zhenduan, 'diagnosis'),
    (zhengzhuang, 'symptom'),
    (zhiliao, 'treatment'),
):
    prep.preprocess_text(category_rows, sentences, category_label)
random.shuffle(sentences)

# Load the stored per-category vectors (the original comment describes them
# as the word2vec form of each category list).
(df_bingyin_word_vec,
 df_zhenduan_word_vec,
 df_zhengzhuang_word_vec,
 df_zhiliao_word_vec) = (
    numpy.load(vec_path).tolist()
    for vec_path in (
        'df_bingyin_word_vec.npy',
        'df_zhenduan_word_vec.npy',
        'df_zhengzhuang_word_vec.npy',
        'df_zhiliao_word_vec.npy',
    )
)

# Split the data.
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

# Train the classifier and report its score on the held-out split.
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)
print(text_classifier.score(x_test, y_test))

# Stored vectors, keyed by the category label.
jibing_xl_dict = {
    'diagnosis': df_zhenduan_word_vec,
    'treatment': df_zhiliao_word_vec,
    'symptom': df_zhengzhuang_word_vec,
    'pathogeny': df_bingyin_word_vec
}

# Read one line from the user, predict its category and vectorise it.
line = input('请输入:')
line = text_classifier.process_line(line)
leibie = text_classifier.predict(line)[0]
line_xl = get_line_vecs(line)[0].tolist()
# Command-line arguments are parsed at import time, as before.
args = parse_args()

if __name__ == '__main__':
    print('-- Loading test set -- ')
    test_ds = TextDataset(args.test, split='test', stopwords_path=args.stopwords)

    print('-- Loading training set --')
    train_options = {
        'split': 'train',
        'stopwords_path': args.stopwords,
        'method': args.method,
        'n_features': 8800,
    }
    train_ds = TextDataset(args.train, **train_options)

    clf = TextClassifier(train_ds, test_ds, args.method)

    print('-- Evaluating --')
    f1 = clf.evaluate()
    print('F1 score:', f1)

    # Disabled experiment: sweep over the number of selected features.
    # NOTE(review): the final summary computes the feature count as
    # 10 + 5 * index, which does not match the range(6000, 10000, 200) grid
    # -- fix before re-enabling.
    #ret = []
    #for n in tqdm(range(6000, 10000, 200)):
    #    print('-- Loading training set --')
    #    train_ds = TextDataset(args.train, split='train', stopwords_path=args.stopwords, method=args.method, n_features=n)
    #    clf = TextClassifier(train_ds, test_ds, args.method)
    #    print('-- Evaluating --')
    #    f1 = clf.evaluate()
    #    ret.append(f1)
    #    print('-- f1=%.4f, n=%d --' % (f1, n))
    #print('* Max F1=%.4f with %d features selected' % (max(ret), 10 + 5 * ret.index(max(ret))))
zhenduan_list, zhengzhuang_list, zhiliao_list = [], [], []

# Fill one list per category (bingyin_list is created before this excerpt).
prep = preprocess1(bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list,
                   bingyin, zhenduan, zhengzhuang, zhiliao)
for category_rows, category_lines in (
    (bingyin, bingyin_list),
    (zhenduan, zhenduan_list),
    (zhengzhuang, zhengzhuang_list),
    (zhiliao, zhiliao_list),
):
    prep.preprocess_lines(category_rows, category_lines)

# Split the data.
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

# Train the model.
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)

# Save the model, load it back and score the reloaded copy.
joblib.dump(text_classifier, 'text_classifier.pkl')
new_text_classifier = joblib.load('text_classifier.pkl')
print(new_text_classifier.score(x_test, y_test))

# Dense feature matrix for every category, computed by the reloaded model.
bingyin_xl, zhiliao_xl, zhengzhuang_xl, zhenduan_xl = (
    new_text_classifier.features(category_lines).todense()
    for category_lines in (bingyin_list, zhiliao_list,
                           zhengzhuang_list, zhenduan_list)
)

jibing_xl_dict = {
    'diagnosis': zhenduan_xl,
    'treatment': zhiliao_xl,
    'symptom': zhengzhuang_xl,
    'pathogeny': bingyin_xl
}
'''
Created on May 8, 2013

@author: Ashish

Script: builds a tweet classifier from the positive/negative polarity files,
attaches it to a tweet aggregator, searches for one keyword and prints the
classifier's labels.
'''
from classifier import TextClassifier
from tweet import aggregator

if __name__ == '__main__':
    # NOTE(review): both training files are absolute paths on one developer
    # machine -- consider making them configurable.
    tweetClassifier = TextClassifier.TweetClassifier(
        "C:\\work\\development\\python\\workspace\\stocksentiment\\polarityData\\rt-polaritydata\\rt-polarity-pos.txt",
        "C:\\work\\development\\python\\workspace\\stocksentiment\\polarityData\\rt-polaritydata\\rt-polarity-neg.txt"
    )
    classifier = tweetClassifier.buildClassifier(
        tweetClassifier.make_full_dict)

    # NOTE(review): these look like API credentials committed to source
    # control -- if they are real they should be revoked and loaded from
    # configuration or the environment instead.
    tweetAggregator = aggregator.Aggregator(
        "qkszpkt1i2x1kY9Ac73w",
        "tTNJAdzmD4tDBCbENM710TWK1UkoczHEnn8hZyO4Lwc",
        "996319352-9pP5LTKNyrdmLiviq47CmzasffUfZF4t0efd48",
        "puJC3Pv9n9QeZltBpMLYWlfD7aRLwcGuU5b29jnWkRk")
    tweetAggregator.setClassfier(classifier)
    # NOTE(review): '$APPL' may be a typo for the ticker '$AAPL' -- confirm.
    tweetAggregator.searchKeyword('$APPL')

    # Parenthesised so the line parses on Python 3 as well; with a single
    # argument the output on Python 2 is unchanged.
    print(classifier.labels())
"""
Build the module-level TextClassifier ``tc`` from app/bayes/bayes.json.

The JSON file supplies three sections: "categories", "keywords" and
"training" (a mapping from category to a list of training samples). All
strings are UTF-8 encoded before being handed to the classifier.
"""
import json

from classifier import TextClassifier
from classifier import TrainingSet

config = {}
tc = TextClassifier()

# Read the whole configuration; the file handle has its own name so that
# `config` always refers to the parsed data.
with open('app/bayes/bayes.json') as config_file:
    config = json.load(config_file)

# Register categories and keywords, then initialise the classifier.
for category in config["categories"]:
    tc.add_category(category.encode('utf-8'))

for keyword in config["keywords"]:
    tc.add_keyword(keyword.encode('utf-8'))

tc.init()

# Add one TrainingSet per training sample, filed under its category.
for category, trainings in config["training"].iteritems():
    for training in trainings:
        ts = TrainingSet()
        ts[:] = [word.encode('utf-8') for word in training]
        tc.add_training(ts, category.encode('utf-8'))

tc.train()
return model def lr(): from sklearn.linear_model import LogisticRegression model = LogisticRegression() return model x_train, y_train, x_test, y_test = get_data() type_model = { 'lr': lr(), } # type_model={'bayes':MultinomialNB(), # 'gdbt':gdbt(), # 'rfc':rfc(), # 'svm':SVC(), # 'lr':lr(), # } for i in type_model: model = type_model[i] print('model:', model) #训练数据 text_classifier = TextClassifier(model) text_classifier.fit(x_train, y_train) #保存并加载模型 joblib.dump(text_classifier, 'text_classifier.pkl') print(text_classifier.score(x_test, y_test)) print("-------------------")
zhenduan_list, zhengzhuang_list, zhiliao_list = [], [], []

# Fill one list per category (bingyin_list is created before this excerpt).
prep = preprocess1(bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list,
                   bingyin, zhenduan, zhengzhuang, zhiliao)
for category_rows, category_lines in (
    (bingyin, bingyin_list),
    (zhenduan, zhenduan_list),
    (zhengzhuang, zhengzhuang_list),
    (zhiliao, zhiliao_list),
):
    prep.preprocess_lines(category_rows, category_lines)

# Split the data.
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

# Train the classifier, report its score, then turn each category's lines
# into a dense feature matrix.
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)
print(text_classifier.score(x_test, y_test))

bingyin_xl, zhiliao_xl, zhengzhuang_xl, zhenduan_xl = (
    text_classifier.features(category_lines).todense()
    for category_lines in (bingyin_list, zhiliao_list,
                           zhengzhuang_list, zhenduan_list)
)

jibing_xl_dict = {
    'diagnosis': zhenduan_xl,
    'treatment': zhiliao_xl,
    'symptom': zhengzhuang_xl,
    'pathogeny': bingyin_xl
}

# Output the predicted category