def convert_w2v(binary_data_path,
                w2v_model,
                binary_w2v_data_path,
                skip_save_path=''):
    print('..word2vec text representation')
    start_time = time.time()
    # load the data
    binary_data_dic = load_data(binary_data_path)
    # load the w2v model
    # w2v_model = load_w2v_model(w2v_model_path)
    # load_model_time = time.time()
    # print('w2v model load time: {}s'.format(load_model_time-start_time))
    for cate, data in binary_data_dic.items():
        print('cur_cate: ' + cate)
        for i in range(NUMBER):  # NUMBER: module-level constant (items per category)
            self_con, self_label = binary_data_dic[cate][i][0], binary_data_dic[cate][i][1]
            train_w2v = get_train_vec(self_con, w2v_model, skip_save_path)
            binary_data_dic[cate][i] = [train_w2v, self_label]

    doc_w2v_json = json.dumps(binary_data_dic, cls=MyEncoder)  # faster than the previous approach
    save_file(binary_w2v_data_path, doc_w2v_json, 'w')
    end_time = time.time()
    print('text representation time: {}s'.format(end_time - start_time))
    return binary_data_dic
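
A minimal usage sketch for convert_w2v, assuming the word vectors were trained with gensim; the file paths below are placeholders, and load_data, get_train_vec, save_file, MyEncoder, and NUMBER come from the surrounding repository:

from gensim.models import KeyedVectors

# Hypothetical paths; set binary=True if the vector file is in .bin format.
w2v_model = KeyedVectors.load_word2vec_format('data/w2v/vectors.txt', binary=False)
binary_w2v = convert_w2v('data/binary_data.json',
                         w2v_model,
                         'data/binary_w2v_data.json')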
Example No. 2
    def train_classifier(self):
        # self.load_train_dataset()
        # one of the best parameter combinations found experimentally

        start_time = time.time()
        classifier = ff.train_supervised(
            self.data_path, lr=0.1, loss='hs', wordNgrams=2,
            epoch=300)  # epoch=20 -> 0.91; epoch=50 -> 0.93
        classifier.save_model(
            self.model_save_path + 'level_2_fasttext_classifier_big_big.model'
        )  # save the model; all: 0.91, all_2: 0.93
        print(classifier.get_labels())  # print the learned labels
        # test the model
        # print('loading fasttext model--{}'.format('level_1_fasttext_classifier_big_test.model'))
        # classifier = ff.load_model(self.model_save_path+'level_1_fasttext_classifier_big_test.model')
        test_result = classifier.test(self.test_save_path + 'test_big.txt')
        result_str = 'test result (N, precision, recall): {}\n'.format(test_result)
        print(result_str)

        end_time = time.time()
        elapsed = round(end_time - start_time, 3)
        train_time_str = 'train and test time: %ss' % elapsed
        print(train_time_str)

        save_file(self.result_save_path + 'fasttext_result_big.txt',
                  result_str + train_time_str + '\n', 'a')
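
A hedged follow-up sketch: reloading the saved model and classifying one segmented document via the standard fastText Python API (the path and input tokens are placeholders):

import fasttext as ff

# Reload the saved classifier and predict the top-3 labels for one document.
model = ff.load_model('level_2_fasttext_classifier_big_big.model')
labels, probs = model.predict('token1 token2 token3', k=3)  # placeholder tokens
print(labels, probs)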
Example No. 3
def select_datas(ori_datas, ch_of_corpus, level3_ch, lev_n, target_path, stopword):
    print('=' * 40)
    print('I-1. Extracting data, building train and test datasets...')
    # total count of level-3 categories
    all_cate_count = 0
    # master data list
    all_datas = []
    # count of level-3 categories under each level-1 category
    le3_cate_count = 0
    # per-row extraction flags, one per classification code
    ch_of_corpus_flag = []
    for ccn in ch_of_corpus:
        if ';' in ccn:
            item_list = ccn.split(';')
            ch_of_corpus_flag.append([item_list, [True] * len(item_list)])
        else:
            ch_of_corpus_flag.append([ccn, [True]])
    # outer loop over the level-n catalog, inner loop over the corpus codes
    for ch in level3_ch:
        # record count for this level-3 category
        le3_data_count = 0
        # data list for this level-3 category  # could be adapted to level 2
        le3_datas = []
        # save path for the level-3 data
        le3_datas_path = target_path + ch[0] + '/'
        if not os.path.exists(le3_datas_path):
            os.makedirs(le3_datas_path)
        for idx, ccn in enumerate(ch_of_corpus):  # classification codes of the raw data
            item_list = []
            if ';' in ccn:  # more than one class code
                item_list = ccn.split(';')
            else:
                item_list.append(ccn)
            for p in item_list:
                if not len(p):
                    continue
                if True not in ch_of_corpus_flag[idx][1]:  # skip rows whose codes were all extracted already
                    continue
                elif ch[0] == p[0] and ch in p:  # code matches this level-n category: extract
                    row = ori_datas.loc[idx]
                    title = row['标题']  # record title
                    abstract = row['摘要']  # record abstract
                    key_word = row['关键词']  # record keywords
                    content = title + ' ' + abstract + ' ' + key_word
                    if len(content) < 10:
                        continue
                    # mark this code as extracted
                    p_index = item_list.index(p)
                    ch_of_corpus_flag[idx][1][p_index] = False

                    con_seg = deal_data(content, stopword)
                    le3_datas.append('__label__' + ch + ', ' + ' '.join(con_seg))
                    le3_data_count += 1

        if le3_data_count >= 500:
            # save the level-3 data
            save_file(le3_datas_path + ch[0] + '_data_count.txt',
                      ch + '-->' + str(le3_data_count) + '\n', 'a')
            random.shuffle(le3_datas)
            write_data(le3_datas_path + ch[0] + '_data.txt', le3_datas)
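
A hypothetical invocation of select_datas; the Excel file and the '分类号' column name are assumptions, while the 标题/摘要/关键词 columns are the ones the function actually reads above:

import pandas as pd

ori_datas = pd.read_excel('data/corpus.xlsx')        # must contain 标题/摘要/关键词 columns
ch_of_corpus = list(ori_datas['分类号'].astype(str))  # ';'-separated CLC codes, one entry per row
level3_ch = ['TP391', 'TP311']                       # hypothetical level-3 categories to extract
stopword_list = []                                   # placeholder; real stopwords loaded elsewhere
select_datas(ori_datas, ch_of_corpus, level3_ch, 3, 'data/le3/', stopword_list)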
Example No. 4
def deal_datas(cut_path, save_path):
    # count_dic and contents are module-level globals in the original snippet
    cate_counts = len(count_dic)
    for i, j in count_dic.items():
        save_file(save_path + 'train_count.txt', i + ':' + str(j) + ',', 'a')
    save_file(save_path + 'train_count.txt',
              '--------number of categories: ' + str(cate_counts), 'a')
    random.shuffle(contents)
    write_datas(save_path + 'level_3_train.txt', contents)
Example No. 5
def read_Ch_cate(save_path, le_file_path, class_info, len_info):
    print('Reading CLC (Chinese Library Classification) codes...')
    # only rebuild if no .txt catalog exists yet (the original checked for a file literally named '.txt')
    if not any(f.endswith('.txt') for f in os.listdir(save_path)):
        infos = pd.read_excel(le_file_path)
        # extract the class codes at the requested level
        le_n_datas = infos.loc[infos['层级'] == int(len_info)]
        le_n_id = list(le_n_datas['类号ID'])

        le_select = []
        for i in le_n_id:
            i = str(i)  # cast first so the duplicate check below actually works
            if i not in le_select:
                # i = re.sub(r'[\r\n\t]','',i)
                le_select.append(i)

        # save the level-n catalog
        save_file(save_path + 'level_' + class_info + '_' + str(len_info) + '.txt',
                  ','.join(le_select), 'w')
Example No. 6
def get_dataset(original_data_path, datalevel_info_file, w2v_model_path,
                dataset_w2v_data_path):
    dataset_dic = {}
    dataset_label_dic = {}

    # load the word vectors
    # w2v_model = None
    w2v_model = load_w2v_model(w2v_model_path)
    # read the dataset's level-2 category info
    infos = read_file_lines(datalevel_info_file)
    level2_list = eval(infos[0])  # the file holds Python literals; ast.literal_eval is the safer choice
    level2_dic = eval(infos[1])
    # initialize the dataset dicts
    for le2 in level2_list:
        if le2 not in dataset_dic:
            dataset_dic[le2] = []
            dataset_label_dic[le2] = []

    # read the dataset
    data_list, data_label_list = [], []
    contents = read_file_lines(original_data_path + 'le3_data.txt')
    for line in contents:
        line_list = line.split(',', 1)  # maxsplit=1: keep commas inside the content intact
        label = line_list[0].replace('__label__', '')
        content = line_list[1]

        if label[:3] in level2_list:
            dataset_dic[label[:3]].append(content)
            dataset_label_dic[label[:3]].append(label)
        elif label[:2] in level2_list:
            dataset_dic[label[:2]].append(content)
            dataset_label_dic[label[:2]].append(label)
    # save the intermediate file
    dataset_w2v_dic = {}
    key_list = dataset_dic.keys()
    for key in key_list:
        # if not len(dataset_dic[key]):
        #     print(key)
        if key not in dataset_w2v_dic:
            dataset_w2v_dic[key] = []
        dataset_w2v_dic[key] = get_train_vec(dataset_dic[key], w2v_model)

    doc_w2v_json = json.dumps(dataset_w2v_dic, cls=MyEncoder)
    save_file(dataset_w2v_data_path, doc_w2v_json, 'w')
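
For reference, get_dataset expects datalevel_info_file to hold two Python literals, one per line, matching the eval calls above. A sketch with invented values, using ast.literal_eval as the safer parser for such files:

import ast

# The two lines as they would appear in datalevel_info_file (values invented).
infos = ["['TP3', 'TP1']",
         "{'TP3': 'computing', 'TP1': 'automation'}"]
level2_list = ast.literal_eval(infos[0])  # -> ['TP3', 'TP1']
level2_dic = ast.literal_eval(infos[1])   # -> {'TP3': 'computing', 'TP1': 'automation'}
print(level2_list, level2_dic)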
Example No. 7
def stat_le1_error(records_path, le1_save_path):
    if not os.path.exists(le1_save_path):
        os.makedirs(le1_save_path)
    le1_error_dic = {}
    records_file_list = os.listdir(records_path)
    for file in records_file_list:
        if file[0] not in le1_error_dic:
            le1_error_dic[file[0]] = [0, 0]  # [correct_count, error_count]
        record_list = read_file_lines(records_path + file)
        for line in record_list:
            if 'le1 result' in line:
                line_list = line.split(':')
                le1 = line_list[1].strip()
                if le1 != file[0]:
                    le1_error_dic[file[0]][1] += 1
                else:
                    le1_error_dic[file[0]][0] += 1

    print(le1_error_dic)
    test_text_count, error_text_count = 0, 0
    rate_of_error_dic = {}
    for i, j in le1_error_dic.items():
        # per-category error rate: errors / (correct + errors)
        rate_of_error_dic[i] = '{:.3}'.format(j[1] / (j[0] + j[1]) * 100) + '%'
        test_text_count += j[0] + j[1]
        error_text_count += j[1]

    save_file(le1_save_path + 'le1_error_rate.txt', 'Level-1 misclassification statistics\n\n', 'w')
    save_file(le1_save_path + 'le1_error_rate.txt',
              str(le1_error_dic) + '\n' + str(rate_of_error_dic) + '\n\n', 'a')
    le1_error_rate = error_text_count / test_text_count
    save_file(
        le1_save_path + 'le1_error_rate.txt',
        'total test documents: {}'.format(test_text_count) + '\n' +
        'total errors: {}'.format(error_text_count) + '\n' +
        'error rate: {:.4}'.format(le1_error_rate), 'a')
Example No. 8
def stat_le3_acc(le1_path, result_path, save_path, threshold):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    le1_list = read_file_lines(le1_path)
    results = read_file_lines(result_path)
    save_str = ''
    count = 0
    le3_acc_result_dic = {i.strip(): 0 for i in le1_list}
    print(le3_acc_result_dic)
    for line in results[:-2]:  # skip the two trailing summary lines
        line_list = line.split(':')
        cate = line_list[0].split()[0]
        acc = line_list[1].strip()

        if float(acc) <= threshold:
            save_str += cate + ':' + str(acc) + '\n'
            le3_acc_result_dic[cate[0]] += 1
            count += 1
    print('categories with accuracy <= ' + str(threshold) + ': ' + str(count) + '\n')
    # save_file(save_path+'less_'+str(threshold)+'.txt',save_str,'w')
    # save_file(save_path+'le3_acc_result.txt','less '+str(threshold)+' category counts:'+str(count)+'\n','a')
    save_file(
        save_path + 'le3_acc_distribute_result.txt',
        'accuracy <= ' + str(threshold) + ':\n' + str(le3_acc_result_dic) + '\n', 'a')
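
The parsing in stat_le3_acc implies each result line looks roughly like 'TP391 acc: 0.62' (category token, whitespace, colon, accuracy); an illustrative round trip on an invented line:

# Invented sample line matching the split logic above.
line = 'TP391 acc: 0.62'
line_list = line.split(':')
cate = line_list[0].split()[0]  # -> 'TP391'
acc = line_list[1].strip()      # -> '0.62'
print(cate, float(acc))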
Example No. 9
def select_txt(le_n, con, le_n_names, levs, train_path, test_path,
               stopword_path):
    print('Extracting data, building train and test datasets...')
    stopword = read_file(stopword_path, 'utf').split('\n')
    # number of categories classified
    count_train_1 = 0
    count_test_1 = 0
    # temporary data buffers
    train_list = []
    test_list = []
    # level-3 category count
    le3_count_cate = 0
    # current category marker
    temp_cate = le_n_names[0][0]

    for i in le_n_names:  # level-n catalog list
        # per-category record counts
        count_train_3 = 0
        count_test_3 = 0
        # temporary buffers
        test_1 = []
        train_1 = []
        test_3 = []
        train_3 = []

        # level-1 class name of the training file set
        le1_name = i[0]
        len_num = 1

        if '/' in i:
            i = i.replace('/', ' ')

        le_3_train_path = train_path + i[0] + '/'
        le_3_test_path = test_path + i[0] + '/'

        if not os.path.exists(le_3_train_path):
            os.makedirs(le_3_train_path)
        if not os.path.exists(le_3_test_path):
            os.makedirs(le_3_test_path)

        for m in enumerate(levs):  # classification codes of the raw data
            m_list = []
            try:
                if ';' in m[1]:  # more than one class code
                    m_list = m[1].split(';')
                else:
                    m_list.append(m[1])
            except Exception:  # skip malformed (e.g. NaN) entries
                continue
            for p in m_list:
                # print(p[0])
                if len(p) == 0:
                    continue
                elif i[0] == p[0] and i in p:  # code matches this level-n category: extract
                    index = m[0]  # index of the record to extract
                    item = con.loc[index]
                    title = item['标题']  # record title
                    content = item['摘要']  # abstract, used as the text body
                    key_word = item['关键词']  # record keywords
                    content = title + ' ' + content + ' ' + key_word
                    try:
                        if len_num % 4 == 0:  # every 4th record goes to the test set
                            test_1.append(deal_datas(i[0], content, stopword))
                            test_3.append(deal_datas(i, content, stopword))
                            count_test_3 += 1
                        else:
                            train_1.append(deal_datas(i[0], content, stopword))
                            train_3.append(deal_datas(i, content, stopword))
                            count_train_3 += 1
                    except Exception:
                        print('Error extracting data for category %s!' % i)
                    else:
                        len_num += 1

        if count_train_3 >= 100:
            # save the level-3 training set
            save_file(le_3_train_path + i[0] + '_train_count.txt',
                      i + '-->' + str(count_train_3) + ',', 'a')
            random.shuffle(train_3)
            write_datas(le_3_train_path + i[0] + '_train.txt', train_3)
            train_list.append(train_1)

            if temp_cate == i[0]:
                le3_count_cate += 1
            else:
                save_file(
                    train_path + temp_cate + '/' + temp_cate +
                    '_train_count.txt',
                    'category count' + '-->' + str(le3_count_cate) + ',', 'a')
                le3_count_cate = 1
            temp_cate = i[0]

        if count_test_3 > 35:
            # save the level-3 test set
            save_file(le_3_test_path + i[0] + '_test_count.txt',
                      i + '-->' + str(count_test_3) + ',', 'a')
            random.shuffle(test_3)
            write_datas(le_3_test_path + i[0] + '_test.txt', test_3)
            test_list.append(test_1)

    # shuffle so records from the same category are spread out
    for l1 in test_list:
        random.shuffle(l1)
        write_datas(test_path + 'level_' + le_n[0] + '_test.txt', l1)
    for l2 in train_list:
        random.shuffle(l2)
        write_datas(train_path + 'level_' + le_n[0] + '_train.txt', l2)

    save_file(train_path + temp_cate + '/' + temp_cate + '_train_count.txt',
              'category count' + '-->' + str(le3_count_cate) + ',', 'a')
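
For clarity, the len_num % 4 == 0 branch in select_txt sends every fourth successfully processed record to the test set, i.e. roughly a 75/25 train/test split; a compact illustration with invented record IDs:

# Every 4th record lands in test, the rest in train (~25% test).
records = list(range(1, 13))
test = [r for r in records if r % 4 == 0]    # [4, 8, 12]
train = [r for r in records if r % 4 != 0]   # the remaining 9 records
print(len(train), len(test))  # 9 3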
Example No. 10
def save_pre_file(file_path, file_con):
    save_file(file_path, file_con, 'w')
Example No. 11
def select_txt(le_n, con, le_n_names, levs, train_path, test_path, stopword_path, lens):
    print('Extracting data, building train and test datasets...')
    # number of categories classified
    cate_count = 0
    # temporary data buffers
    train_list = []
    test_list = []
    stopword = read_file(stopword_path, 'utf').split('\n')
    for i in le_n_names:  # level-n catalog list
        # per-category record counts
        count_train = 0
        count_test = 0
        # temporary buffers
        test = []
        train = []
        # level-1 class name of the training file set
        # le1_name = i[0]
        len_num = 1

        if '/' in i:
            i = i.replace('/', ' ')

        for j in enumerate(levs):  # classification codes of the raw data

            j_list = []
            try:
                if ';' in j[1]:  # more than one class code
                    j_list = j[1].split(';')
                else:
                    j_list.append(j[1])
            except Exception:  # skip malformed (e.g. NaN) entries
                continue
            for p in j_list:
                p = str(p)
                # print(p[0])
                if len(p) == 0:
                    continue
                # elif i[0] == p[0] and i in p:  # code matches this level-n category
                elif len(i) >= 3 and len(p) >= 3 and i[:3] == p[:3]:  # first three characters match: extract
                    index = j[0]  # index of the record to extract
                    item = con.loc[index]
                    # title = item['标题']   # record title
                    content = item['ContentText']  # abstract, used as the text body
                    # key_word = item['关键词']  # record keywords
                    # content = title+' '+content+' '+key_word
                    try:
                        # if len_num % 8 == 0:
                        #     test.append(deal_datas(i, content, stopword))
                        #     count_test += 1
                        # else:
                        train.append(deal_datas(i, content, stopword))
                        count_train += 1
                    except Exception:
                        print('Error extracting data for category %s!' % i)
                    else:
                        len_num += 1
        
        if count_train >= 20:
            save_file(train_path + 'train_' + lens + '_count.txt',
                      i + '-->' + str(count_train) + ',', 'a')
            train_list.append(train)
            cate_count += 1
        # if count_test >= 0:
        #     save_file(test_path+'test_'+lens+'_count.txt',i+'-->'+str(count_test)+',','a')
        #     test_list.append(test)

    # shuffle so same-category samples don't cluster
    for l1 in test_list:
        random.shuffle(l1)
        write_datas(test_path + 'level_' + lens + '_test.txt', l1)
    for l2 in train_list:
        random.shuffle(l2)
        write_datas(train_path + 'level_' + lens + '_train.txt', l2)

    save_file(train_path + 'train_' + lens + '_count.txt',
              'category count: ' + str(cate_count) + ',', 'a')