def train_test_system(features_names_list, data_info_file, system,
                      output_filename):
    """
    This function trains a classifier based on given system and data information,
    tests this classifier system and writes the predictions to an outputfile.

    The data info file is loaded with ``load_json``; its 'training' entry
    supplies the training file and annotation column and its 'gold' entry
    supplies the file to classify. Afterwards an entry describing the new
    output file is added to the data info and written back with ``dump_json``.

    :param features_names_list: list of indications of all feature columns that should be used
    :param data_info_file: path to file containing info about all necessary data
    :param system: name of the ML algorithm that is passed to the classifier
    :param output_filename: path to conll outputfile
    :type features_names_list: list
    :type data_info_file: string
    :type system: string
    :type output_filename: string
    """
    data = load_json(data_info_file)

    # Train model
    inputfile = data['training']['file']
    annotation_column = data['training']['annotation_column']
    model = TextClassifier(system)
    model.train(inputfile, features_names_list, annotation_column)

    # Classify
    gold_file = data['gold']['file']
    predictions = model.predict(gold_file)

    # Write output
    append_column_and_write_file(output_filename, gold_file, predictions,
                                 'predictions')

    # Update data info: the new entry is keyed by the output file's base name
    # without its extension. splitext is used instead of slicing off a fixed
    # six characters so the key is right for any extension, not only ".conll".
    name = os.path.splitext(os.path.basename(output_filename))[0]
    data[name] = {'annotation_column': 'predictions', 'file': output_filename}
    dump_json(data_info_file, data)
示例#2
0
def model_xunlian():
    """
    Prepare the four category datasets, fit a TextClassifier and save it.

    The datasets '病因', '诊断', '症状' and '治疗' are loaded and labelled
    'pathogeny', 'diagnosis', 'symptom' and 'treatment' respectively. The
    fitted classifier is written to 'text_classifier.pkl'.

    :return: tuple of (bingyin_list, zhenduan_list, zhengzhuang_list,
        zhiliao_list, x_train, x_test, y_train, y_test)
    """
    # Load the data, then apply null-value handling to every category.
    raw_frames = [
        load_dataset(topic) for topic in ('病因', '诊断', '症状', '治疗')
    ]
    cleaned_frames = [processing_null(frame) for frame in raw_frames]
    bingyin, zhenduan, zhengzhuang, zhiliao = [
        frame.values.tolist() for frame in cleaned_frames
    ]
    category_rows = (bingyin, zhenduan, zhengzhuang, zhiliao)
    labels = ('pathogeny', 'diagnosis', 'symptom', 'treatment')

    # Gather all categories into one shared list and shuffle it.
    sentences = []
    helper = preprocess(sentences, bingyin, zhenduan, zhengzhuang, zhiliao)
    for rows, label in zip(category_rows, labels):
        helper.preprocess_text(rows, sentences, label)
    random.shuffle(sentences)

    # Gather each category into its own list.
    bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list = [], [], [], []
    category_lines = (bingyin_list, zhenduan_list, zhengzhuang_list,
                      zhiliao_list)
    helper = preprocess2(bingyin_list, zhenduan_list, zhengzhuang_list,
                         zhiliao_list, bingyin, zhenduan, zhengzhuang, zhiliao)
    for rows, lines, label in zip(category_rows, category_lines, labels):
        helper.preprocess_lines(rows, lines, label)

    # Split the data; each entry of `sentences` unpacks into two fields.
    texts, targets = zip(*sentences)
    x_train, x_test, y_train, y_test = train_test_split(texts,
                                                        targets,
                                                        random_state=1234)

    # Fit the classifier and save it to disk. Reloading it with joblib.load
    # and scoring it on the test split was previously left commented out.
    text_classifier = TextClassifier()
    text_classifier.fit(x_train, y_train)
    joblib.dump(text_classifier, 'text_classifier.pkl')

    return (bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list,
            x_train, x_test, y_train, y_test)
示例#3
0
def test(
        hparams,
        model_path="/home/marcelbraasch/PycharmProjects/MultiClassTextClassifier/Models/model_2.pt"
):
    """
    Load saved weights into a new TextClassifier and print its confusion output.

    :param hparams: hyper-parameters passed to the TextClassifier constructor
    :param model_path: path of the saved state dict; defaults to the location
        that used to be hard-coded here, so existing callers are unaffected
    """
    model = TextClassifier(hparams)
    # map_location="cpu" lets a state dict saved from a GPU be read on a
    # machine without CUDA; load_state_dict then copies the values into the
    # model's own parameters.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.eval()
    # One item of get_confusion() per output line.
    print(*model.get_confusion(), sep="\n")
示例#4
0
def main(hparams):
    """
    Fit a TextClassifier with a Trainer, save its weights and log its confusion output.

    :param hparams: hyper-parameter mapping passed to TextClassifier; this
        function itself reads the keys "min_epochs", "max_epochs" and "no"
        (the latter is used in the name of the saved weights file)
    """
    model = TextClassifier(hparams)
    tb_logger = loggers.TensorBoardLogger('logs/')

    checkpoint_callback = ModelCheckpoint(monitor='val_loss')
    # NOTE(review): default_root_dir is an absolute path ("/Models/...") while
    # the weights below are saved under the relative "Models/" directory --
    # confirm that this difference is intended.
    trainer = Trainer(min_epochs=hparams["min_epochs"],
                      max_epochs=hparams["max_epochs"],
                      logger=tb_logger,
                      callbacks=[checkpoint_callback],
                      default_root_dir="/Models/checkpoints")
    trainer.fit(model)
    trainer.test()  # loads the best model automatically
    torch.save(model.state_dict(), f"Models/model_{hparams['no']}.pt")
    model.eval()
    with open("log.txt", mode="a") as f:
        # The file is opened in append mode; ending every entry with a newline
        # keeps the entries of this and later runs from running together.
        for line in model.get_confusion():
            f.write(str(line) + "\n")

    # NOTE(review): everything below relies on names this function never
    # defines (my_texts, other_texts, TARGETS) and constructs TextClassifier
    # with a different argument list than above. It looks like a separate
    # example that ended up inside this function -- confirm before relying
    # on it.
    # randomize
    random.shuffle(my_texts)
    random.shuffle(other_texts)

    # The first 80% of each list is used for training, the rest for testing.
    train_percent = 0.8
    slice_my_index = int(len(my_texts) * train_percent)
    slice_other_index = int(len(other_texts) * train_percent)

    train_my_texts = my_texts[:slice_my_index]
    train_other_texts = other_texts[:slice_other_index]

    test_my_texts = my_texts[slice_my_index:]
    test_other_texts = other_texts[slice_other_index:]

    # Index 0 marks entries from my_texts, index 1 entries from other_texts.
    target_indices = ([0] * len(train_my_texts)) + ([1] *
                                                    len(train_other_texts))
    test_target_indices = ([0] * len(test_my_texts)) + ([1] *
                                                        len(test_other_texts))

    training_data = train_my_texts + train_other_texts
    test_data = test_my_texts + test_other_texts

    targets = TARGETS
    classifier = TextClassifier(training_data, targets, target_indices)
    # Disabled alternative path; test_target_indices is only used here.
    #classifier.train('svm')
    #classifier.predict(test_data, test_target_indices)

    print('----------------------')
    classifier.train_nltk()
    classifier.test_nltk(test_data)
示例#6
0
# Label the rows of every category into the shared `sentences` list, then
# shuffle that list.
for category_rows, category_label in ((bingyin, 'pathogeny'),
                                      (zhenduan, 'diagnosis'),
                                      (zhengzhuang, 'symptom'),
                                      (zhiliao, 'treatment')):
    prep.preprocess_text(category_rows, sentences, category_label)
random.shuffle(sentences)

# Load the stored word2vec form of each category as plain lists.
(df_bingyin_word_vec, df_zhenduan_word_vec, df_zhengzhuang_word_vec,
 df_zhiliao_word_vec) = [
     numpy.load(vec_path).tolist()
     for vec_path in ('df_bingyin_word_vec.npy', 'df_zhenduan_word_vec.npy',
                      'df_zhengzhuang_word_vec.npy', 'df_zhiliao_word_vec.npy')
 ]

# Split the data.
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

# Fit the classifier, print its score on the test split and collect the
# vectors of each category under its label.
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)
print(text_classifier.score(x_test, y_test))
jibing_xl_dict = dict(
    diagnosis=df_zhenduan_word_vec,
    treatment=df_zhiliao_word_vec,
    symptom=df_zhengzhuang_word_vec,
    pathogeny=df_bingyin_word_vec,
)

# Read one line from the user, process it and predict its category.
line = text_classifier.process_line(input('请输入:'))
leibie = text_classifier.predict(line)[0]
line_xl = get_line_vecs(line)[0].tolist()
# print(line_xl)
# print(line_xl)
示例#7
0

# Arguments are parsed when the module is loaded, outside the main guard.
args = parse_args()

if __name__ == '__main__':

    # Build the test split first, then the training split.
    print('-- Loading test set -- ')
    test_options = dict(split='test', stopwords_path=args.stopwords)
    test_ds = TextDataset(args.test, **test_options)

    print('-- Loading training set --')
    train_options = dict(split='train',
                         stopwords_path=args.stopwords,
                         method=args.method,
                         n_features=8800)
    train_ds = TextDataset(args.train, **train_options)

    # Evaluate the classifier built from both splits and report its F1 score.
    clf = TextClassifier(train_ds, test_ds, args.method)
    print('-- Evaluating --')
    print('F1 score:', clf.evaluate())

    # Disabled experiment: rebuild the training set for several n_features
    # values and report the best F1.
    # NOTE(review): the last line maps the best index to `10 + 5 * index`,
    # which does not match range(6000, 10000, 200) -- fix before re-enabling.
    #ret = []
    #for n in tqdm(range(6000, 10000, 200)):
    #    print('-- Loading training set --')
    #    train_ds = TextDataset(args.train, split='train', stopwords_path=args.stopwords, method=args.method, n_features=n)
    #    clf = TextClassifier(train_ds, test_ds, args.method)
    #    print('-- Evaluating --')
    #    f1 = clf.evaluate()
    #    ret.append(f1)
    #    print('-- f1=%.4f, n=%d --' % (f1, n))
    #print('* Max F1=%.4f with %d features selected' % (max(ret), 10 + 5 * ret.index(max(ret))))
示例#8
0
# Start three of the per-category line lists empty; bingyin_list is expected
# to exist already when this fragment runs.
zhenduan_list, zhengzhuang_list, zhiliao_list = [], [], []
prep = preprocess1(bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list,
                   bingyin, zhenduan, zhengzhuang, zhiliao)
for source_rows, target_lines in ((bingyin, bingyin_list),
                                  (zhenduan, zhenduan_list),
                                  (zhengzhuang, zhengzhuang_list),
                                  (zhiliao, zhiliao_list)):
    prep.preprocess_lines(source_rows, target_lines)

# Split the data.
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

# Fit the classifier.
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)

# Save the model, load it back and print the reloaded copy's score.
joblib.dump(text_classifier, 'text_classifier.pkl')
new_text_classifier = joblib.load('text_classifier.pkl')
print(new_text_classifier.score(x_test, y_test))

# Dense feature matrix of every category, taken from the reloaded classifier.
bingyin_xl, zhiliao_xl, zhengzhuang_xl, zhenduan_xl = [
    new_text_classifier.features(category_lines).todense()
    for category_lines in (bingyin_list, zhiliao_list, zhengzhuang_list,
                           zhenduan_list)
]
jibing_xl_dict = dict(
    diagnosis=zhenduan_xl,
    treatment=zhiliao_xl,
    symptom=zhengzhuang_xl,
    pathogeny=bingyin_xl,
)
示例#9
0
'''
Created on May 8, 2013

@author: Ashish
'''
from classifier import TextClassifier
from tweet import aggregator
if __name__ == '__main__':
    # Build the classifier from the rt-polarity positive and negative files.
    tweetClassifier = TextClassifier.TweetClassifier(
        "C:\\work\\development\\python\\workspace\\stocksentiment\\polarityData\\rt-polaritydata\\rt-polarity-pos.txt",
        "C:\\work\\development\\python\\workspace\\stocksentiment\\polarityData\\rt-polaritydata\\rt-polarity-neg.txt"
    )
    classifier = tweetClassifier.buildClassifier(
        tweetClassifier.make_full_dict)

    # SECURITY NOTE(review): the four strings below look like API keys/tokens
    # committed in source. If they are real they should be revoked and read
    # from configuration or the environment instead -- confirm with the owner.
    tweetAggregator = aggregator.Aggregator(
        "qkszpkt1i2x1kY9Ac73w", "tTNJAdzmD4tDBCbENM710TWK1UkoczHEnn8hZyO4Lwc",
        "996319352-9pP5LTKNyrdmLiviq47CmzasffUfZF4t0efd48",
        "puJC3Pv9n9QeZltBpMLYWlfD7aRLwcGuU5b29jnWkRk")
    tweetAggregator.setClassfier(classifier)
    # NOTE(review): '$APPL' may be a typo for the ticker '$AAPL' -- confirm.
    tweetAggregator.searchKeyword('$APPL')

    # The parenthesised form prints the same value under Python 2 and is also
    # valid syntax under Python 3, unlike the bare print statement.
    print(classifier.labels())
示例#10
0
"""
This module initializes a TextClassifier
with keywords, categories and training data
taken from bayes.json file.
"""
import json

from classifier import TextClassifier
from classifier import TrainingSet

tc = TextClassifier()

# Parse the configuration file. The file handle gets its own name so that
# `config` only ever refers to the parsed data.
with open('app/bayes/bayes.json') as config_file:
    config = json.load(config_file)

# Register every category and keyword before initializing the classifier.
for category in config["categories"]:
    tc.add_category(category.encode('utf-8'))

for keyword in config["keywords"]:
    tc.add_keyword(keyword.encode('utf-8'))

tc.init()

# Add the training sets of each category. dict.items() is available on both
# Python 2 and Python 3, whereas iteritems() exists on Python 2 only.
for category, trainings in config["training"].items():
    for training in trainings:
        ts = TrainingSet()
        ts[:] = [w.encode('utf-8') for w in training]
        tc.add_training(ts, category.encode('utf-8'))

tc.train()
示例#11
0
    return model


def lr():
    """Return a new LogisticRegression model built with default arguments."""
    # Imported inside the function, so sklearn is only needed when it is called.
    import sklearn.linear_model as linear_model
    return linear_model.LogisticRegression()


# Note the order in which get_data() returns the four parts.
x_train, y_train, x_test, y_test = get_data()

# Models to try, keyed by a short name.
type_model = dict(lr=lr())
# Further candidates, currently disabled:
# type_model={'bayes':MultinomialNB(),
# 'gdbt':gdbt(),
# 'rfc':rfc(),
# 'svm':SVC(),
# 'lr':lr(),
# }

for model_key, model in type_model.items():
    print('model:', model)
    # Fit a classifier around this model.
    text_classifier = TextClassifier(model)
    text_classifier.fit(x_train, y_train)
    # Save the fitted classifier. Every model is written to the same file
    # name, so with several entries only the last one stays on disk.
    joblib.dump(text_classifier, 'text_classifier.pkl')
    print(text_classifier.score(x_test, y_test))
    print("-------------------")
示例#12
0
# Start three of the per-category line lists empty; bingyin_list, the raw
# category rows (bingyin, zhenduan, zhengzhuang, zhiliao) and `sentences`
# are used below but are not defined in this fragment.
zhenduan_list = []
zhengzhuang_list = []
zhiliao_list = []
prep = preprocess1(bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list,
                   bingyin, zhenduan, zhengzhuang, zhiliao)
# Each call is given a category's rows together with that category's list.
prep.preprocess_lines(bingyin, bingyin_list)
prep.preprocess_lines(zhenduan, zhenduan_list)
prep.preprocess_lines(zhengzhuang, zhengzhuang_list)
prep.preprocess_lines(zhiliao, zhiliao_list)

# Split the data
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

# Fit the classifier, then turn each disease category's data into vectors
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)
print(text_classifier.score(x_test, y_test))
# Dense feature matrix of every category, taken from the fitted classifier.
bingyin_xl = text_classifier.features(bingyin_list).todense()
zhiliao_xl = text_classifier.features(zhiliao_list).todense()
zhengzhuang_xl = text_classifier.features(zhengzhuang_list).todense()
zhenduan_xl = text_classifier.features(zhenduan_list).todense()
# Feature matrices keyed by category label.
jibing_xl_dict = {
    'diagnosis': zhenduan_xl,
    'treatment': zhiliao_xl,
    'symptom': zhengzhuang_xl,
    'pathogeny': bingyin_xl
}
# print(zhenduan_xl)

# Output the predicted category