Example #1
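
For this snippet to run on its own it needs roughly the following imports; the flags and helper functions referenced below are assumed to come from the surrounding module:

import os

import nltk
import pandas as pd
from nltk.text import TextCollection
from sklearn.naive_bayes import GaussianNB

# Assumed to be defined elsewhere in the module: is_first_run, dataset_path,
# output_text_filename, output_cln_text_filename, read_and_save_to_csv,
# proc_text, split_train_test, get_word_list_from_data,
# extract_feat_from_data, cal_acc
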
def run_main():
    """
        主函数
    """
    # 1. Read, process, clean and prepare the data
    if is_first_run:
        print('Cleaning and preprocessing the text data...', end=' ')
        # On the first run the raw text data has to be processed and cleaned

        # Read the raw text data and save labels and text to a csv file
        read_and_save_to_csv()

        # Load the csv file just written and build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')

        # Clean each text entry
        text_df['text'] = text_df['text'].apply(proc_text)

        # Filter out empty strings, dropping all empty rows
        text_df = text_df[text_df['text'] != '']

        # Save the cleaned text data; preprocessing is done
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=False,
                       encoding='utf-8')
        print('done, results saved.')

    # 2. Split into training and test sets
    print('Loading the cleaned text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path,
                                             output_cln_text_filename),
                                encoding='utf-8')
    # Split into training and test sets,
    # taking 80% of each sentiment label's rows for training
    # (a minimal sketch of split_train_test follows this example)
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Show basic per-class counts for both sets
    print('Samples per class in the training set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Feature extraction
    # Word-frequency statistics
    n_common_words = 200

    # Count word frequencies over the training set
    print('Counting word frequencies...')

    # Collect a list of all words in the training set
    all_words_in_train = get_word_list_from_data(train_text_df)
    # Count word frequencies
    fdist = nltk.FreqDist(all_words_in_train)

    # Take the n_common_words most frequent words and their counts
    # to build the "common words" list
    common_words_freqs = fdist.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))

    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set;
    # pass the text column as a list
    text_collection = TextCollection(train_text_df['text'].values.tolist())

    # Extract features for the training and test samples:
    # _X holds the tf-idf value of each common word per row, _y the sentiment label
    # (a sketch of extract_feat_from_data follows this example)
    print('Extracting features for the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                              common_words_freqs)
    print('done')
    print()

    print('Extracting features for the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                            common_words_freqs)
    print('done')

    # 4. Train a Naive Bayes model
    print('Training the model...', end=' ')
    # Create a Gaussian Naive Bayes model
    gnb = GaussianNB()
    # Fit the model on the training features
    gnb.fit(train_X, train_y)
    print('done')
    print()

    # 5. Prediction
    print('Testing the model...', end=' ')
    # Predict on the test features
    test_pred = gnb.predict(test_X)
    # test_pred : ndarray : array([3., 3., 3., 2., 3., 3., 3., 0., 3., 3., 3., 2., 1. .....])

    print('done')

    # Report accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))
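
The helpers split_train_test and cal_acc are called above but not defined in the snippet. A minimal sketch of what they might look like, assuming split_train_test keeps 80% of each sentiment label's rows for training (as the comment above says) and cal_acc is plain accuracy; both signatures are inferred from the call sites, not taken from the original module:

import numpy as np
import pandas as pd


def split_train_test(text_df, train_frac=0.8, seed=42):
    """Keep train_frac of each label's rows for training, the rest for test."""
    train_parts, test_parts = [], []
    for _, group in text_df.groupby('label'):
        # Shuffle within each label so the split is not order-dependent
        group = group.sample(frac=1.0, random_state=seed)
        n_train = int(len(group) * train_frac)
        train_parts.append(group.iloc[:n_train])
        test_parts.append(group.iloc[n_train:])
    return pd.concat(train_parts), pd.concat(test_parts)


def cal_acc(true_labels, pred_labels):
    """Fraction of predictions that match the true labels."""
    true_labels = np.asarray(true_labels)
    pred_labels = np.asarray(pred_labels)
    return float((true_labels == pred_labels).mean())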
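
extract_feat_from_data is likewise only described by its comments ("_X holds the tf-idf value of each common word per row, _y the sentiment label"). A plausible sketch, assuming each dataframe row has a 'text' string and a numeric 'label' column and that tf-idf is computed with the same nltk TextCollection built above; the original implementation may differ:

import numpy as np


def extract_feat_from_data(text_df, text_collection, common_words_freqs):
    """Build an (n_rows, n_common_words) tf-idf matrix and a label vector."""
    common_words = [word for word, _ in common_words_freqs]
    X = np.zeros((len(text_df), len(common_words)))
    y = np.zeros(len(text_df))
    for i, (_, row) in enumerate(text_df.iterrows()):
        for j, word in enumerate(common_words):
            # Score each common word against this row's text within the collection
            X[i, j] = text_collection.tf_idf(word, row['text'])
        y[i] = row['label']
    return X, y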
Example #2
import tools
import matplotlib.pyplot as plt
import dataset_loader
import cfg.config as config
import torch
cfg = config.get_cfg_defaults()
val_data_flow = dataset_loader.Data_flow(5,
                                         cfg.TRAIN.raw_data_file,
                                         cfg.img_dir, [cfg.img_h, cfg.img_w],
                                         cfg.out_features,
                                         train=True)
inputs, targets = val_data_flow.load_next_batch()
# Visualize one batch: permute the images back to HWC layout for plotting
tools.show_targets(inputs.cpu().permute(0, 2, 3, 1).numpy(),
                   targets.cpu().numpy(), tools.get_preds(targets.numpy()))
# Sanity check: comparing the decoded targets against themselves should give
# maximal accuracy (the + 10 offset looks like a debugging leftover)
print(
    tools.cal_acc(tools.get_preds(targets.numpy()),
                  tools.get_preds(targets.numpy())) + 10)

# class struct:
#     def __init__(self):
#         self.x = []
#         self.y = []
#
#
# epoch = struct()
# train_l = struct()
# val_l = struct()
# train_a = struct()
# val_a = struct()
# time = struct()
# with open('/home/agni/Downloads/train_log(1).txt') as f:
#     for i, line in enumerate(f):
Example #3
def run_main():
    """
        主函数
    """
    # 1. Read, process, clean and prepare the data
    if is_first_run:
        print('Cleaning and preprocessing the text data...', end=' ')
        # On the first run the raw text data has to be processed and cleaned

        # Read the raw text data and save labels and text to a csv file
        read_and_save_to_csv()

        # Load the csv file just written and build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')

        # Clean each text entry
        text_df['text'] = text_df['text'].apply(proc_text)

        # Filter out empty strings
        text_df = text_df[text_df['text'] != '']

        # Save the cleaned text data
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=False, encoding='utf-8')
        print('done, results saved.')

    # 2. Split into training and test sets
    print('Loading the cleaned text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into training and test sets
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Show basic per-class counts for both sets
    print('Samples per class in the training set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Feature extraction
    # Word-frequency statistics
    n_common_words = 200

    # Count word frequencies over the training set
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdist = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdist.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features for the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection, common_words_freqs)
    print('done')
    print()

    print('Extracting features for the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection, common_words_freqs)
    print('done')

    # 4. Train a Naive Bayes model
    print('Training the model...', end=' ')
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('done')
    print()

    # 5. Prediction
    print('Testing the model...', end=' ')
    test_pred = gnb.predict(test_X)
    print('done')

    # Report accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))
Example #4
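
This variant additionally uses numpy, Keras and scikit-learn's LogisticRegression; it needs roughly these imports, with the flags and helpers again assumed to come from the surrounding module:

import os

import keras
import nltk
import numpy as np
import pandas as pd
from nltk.text import TextCollection
from sklearn.linear_model import LogisticRegression

# Assumed to be defined elsewhere in the module: is_first_run, load_np,
# dataset_path, the output filenames, and the helpers used below
# (read_and_save_to_csv, proc_text, split_train_test,
# get_word_list_from_data, extract_feat_from_data, get_model, cal_acc)
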
def run_main():
    """
        主函数
    """
    # 1. Read, process, clean and prepare the data
    if is_first_run:
        print('Cleaning and preprocessing the text data...', end=' ')
        # On the first run the raw text data has to be processed and cleaned

        # Read the raw text data and save labels and text to a csv file
        read_and_save_to_csv()

        # Load the csv file just written and build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')

        # Clean each text entry
        text_df['text'] = text_df['text'].apply(proc_text)

        # Filter out empty strings, dropping all empty rows
        text_df = text_df[text_df['text'] != '']

        # Save the cleaned text data; preprocessing is done
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=False, encoding='utf-8')
        print('done, results saved.')



    # 2. Split into training and test sets
    print('Loading the cleaned text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into training and test sets,
    # taking 80% of each sentiment label's rows for training
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Show basic per-class counts for both sets
    print('Samples per class in the training set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())


    # 3. Feature extraction
    # Word-frequency statistics
    n_common_words = 1000

    # Count word frequencies over the training set

    # Collect a list of all words in the training set
    all_words_in_train = get_word_list_from_data(train_text_df)
    print('Counting word frequencies...')
    print('Total number of words:', len(all_words_in_train))

    # Count word frequencies
    fdist = nltk.FreqDist(all_words_in_train)
    print('Distinct words:', len(fdist))
    # Take the n_common_words most frequent words and their counts
    # to build the "common words" list
    common_words_freqs = fdist.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))

    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set;
    # pass the text column as a list
    text_collection = TextCollection(train_text_df['text'].values.tolist())

    # Extract features for the training and test samples:
    # _X holds the tf-idf value of each common word per row, _y the sentiment label
    print('Extracting features for the training samples...', end=' ')
    if load_np:
        # Load cached feature matrices from earlier runs; the extra length-1
        # axis matches the Keras model's expected input shape (note that the
        # sklearn fit below needs 2-D input, so this path suits the Keras model)
        train_X = np.load("train_x.npy")
        print(train_X.shape)
        train_X = train_X.reshape(train_X.shape[0], 1, train_X.shape[1])
        print(train_X.shape)
        train_y = np.load("train_y.npy")
        test_X = np.load("test_X.npy")
        test_X = test_X.reshape(test_X.shape[0], 1, test_X.shape[1])
        test_y = np.load("test_y.npy")
    else:
        # Extract features and cache them as .npy files for later runs
        train_X, train_y = extract_feat_from_data(train_text_df, text_collection, common_words_freqs)
        np.save("train_x.npy", train_X)
        np.save("train_y.npy", train_y)
        print('done')
        print()

        print('Extracting features for the test samples...', end=' ')
        test_X, test_y = extract_feat_from_data(test_text_df, text_collection, common_words_freqs)
        np.save("test_X.npy", test_X)
        np.save("test_y.npy", test_y)
        print('done')

    # 4. Train the model (originally Naive Bayes)
    print('Training the model...', end=' ')
    # The Gaussian Naive Bayes model only reached about 0.29 accuracy here,
    # so a one-vs-rest logistic regression is used instead (the variable
    # name gnb is kept from the earlier version):
    # gnb = GaussianNB()  # 0.29
    gnb = LogisticRegression(multi_class="ovr")
    # A Keras model is also set up (see the sketch after this example),
    # but its training and evaluation calls are commented out below
    model = get_model(n_common_words)
    onehot_train_y = keras.utils.to_categorical(train_y, num_classes=4)
    onehot_test_y = keras.utils.to_categorical(test_y, num_classes=4)

    # model.fit(train_X, onehot_train_y, epochs=50, batch_size=128, verbose=1)
    # score = model.evaluate(test_X, onehot_test_y, batch_size=128)
    # Fit the model on the training features
    gnb.fit(train_X, train_y)
    # Note: with model.fit commented out, this saves untrained weights
    model.save_weights("model.h5")
    print('done')
    # print('score', score)

    # 5. Prediction
    print('Testing the model...', end=' ')
    # Predict on the test features
    # text_pred = model.predict(test_X, 128)
    test_pred = gnb.predict(test_X)
    # test_pred : ndarray : array([3., 3., 3., 2., 3., 3., 3., 0., 3., 3., 3., 2., 1. .....])

    print('done')

    # Report accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))
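
get_model is called above but never defined in the snippet. A minimal sketch under the visible constraints: the cached features are reshaped to (1, n_common_words) per sample and the labels are one-hot encoded with num_classes=4, so a small dense softmax classifier fits; the layer sizes, activations and optimizer here are assumptions, not the original design:

from keras.layers import Dense, Flatten
from keras.models import Sequential


def get_model(n_common_words):
    """Small dense classifier over (1, n_common_words) tf-idf feature vectors."""
    model = Sequential([
        Flatten(input_shape=(1, n_common_words)),  # drop the extra length-1 axis
        Dense(128, activation='relu'),
        Dense(4, activation='softmax'),            # four sentiment classes
    ])
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model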