Пример #1
0
def createPattern(str, accurate):
    """Build a regex alternation from a comma-separated list of words.

    Each comma-separated piece of ``str`` becomes one alternative of the
    returned pattern.  The pieces are inserted verbatim (no escaping), so
    regex metacharacters inside them keep their special meaning.

    Args:
        str: Comma-separated words, e.g. ``'foo,bar'``.
        accurate: If truthy, every alternative is anchored as ``^word$``
            (whole-string match); otherwise it is wrapped as ``.*word.*``
            (substring match).

    Returns:
        The pattern source as a string, alternatives joined with ``|``.
    """
    if accurate:
        opening, closing = '^', '$'
    else:
        opening, closing = '.*', '.*'
    alternatives = [opening + word + closing for word in str.split(',')]
    return '|'.join(alternatives)


if __name__ == '__main__':

    try:
        LABEL_FILE, LABEL_PATH = u.getFirstFile('txt')
        SOURCE_FILE, SOURCE_PATH = u.getFirstFile('csv')
        print u.utf8_2_gbk('打标签文件:' + LABEL_FILE)
        print u.utf8_2_gbk('数据源文件:' + SOURCE_FILE)
        source_file_body = u.create_file_body(SOURCE_PATH)
        for num, line in enumerate(source_file_body):
            source_file_body[num] = line.strip().lower() + ',' + '\n'
        labelType, labelNum = createMoreMatch(LABEL_PATH)
        matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]])
        print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')
        for key, value in labelType.items():
            count += 1
            print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
            words = value.strip().split('|')
            if len(words) == 1:
                c = createPattern(words[0], ACCURATE)
                p = re.compile(c)
                for num, line in enumerate(source_file_body):
                    content = u.create_content(line, COLUNM)
                    if p.match(content):
Пример #2
0
#!/usr/bin/env python
# coding=utf-8
import re
import sys
import os
import util as u
'''
功能说明:先groupby某一列,然后统计每个类的数量,输出一份文件
'''

############## Parameters ###############
SOURCE_FILE = 'hebing.csv'  # input file
RESULT_FILE = 'result.csv'  # output file
COLUMN = 1  # column to group by and count
####################################
# Maps each value found in COLUMN to the number of rows it appeared in.
count_dict = {}

if __name__ == '__main__':
    # Count how many rows of SOURCE_FILE share each value of COLUMN and write
    # one "value,count" line per distinct value to RESULT_FILE.
    #
    # The header is built from the source file plus an extra '次数' ("count")
    # label; presumably 'right' appends it as the last column -- confirm
    # against util.create_file_head.
    source_file_head = u.create_file_head(SOURCE_FILE, 'right', ['次数'])
    source_file_body = u.create_file_body(SOURCE_FILE)
    for line in source_file_body:
        content = u.create_content(line, COLUMN)
        count_dict[content] = count_dict.get(content, 0) + 1

    # `with` closes the output file even if a write raises; open() replaces
    # the Python-2-only file() builtin and behaves the same here.
    with open(RESULT_FILE, 'w+') as result_file:
        result_file.write(source_file_head)
        for key, value in count_dict.items():
            result_file.write(key + ',' + str(value) + '\n')
Пример #3
0

def rm_repeat(file_list):
    """Return the distinct values extracted from the given lines.

    For every line, the value returned by ``u.create_content(line, 1)`` is
    taken, all double quotes are removed from it and a newline is appended.
    Duplicates are collapsed through a set, so the order of the returned
    list is not the input order.

    Args:
        file_list: Iterable of lines to process.

    Returns:
        A list of unique, newline-terminated strings.
    """
    unique_rows = set(
        u.create_content(row, 1).replace('"', '') + '\n'
        for row in file_list
    )
    return list(unique_rows)


if __name__ == '__main__':
    # Pipeline:
    #   1. de-duplicate the extracted values of every file under FILE_PATH,
    #   2. dump all per-file unique values into the scratch file total.csv,
    #   3. count how often each value occurs across files,
    #   4. write "value<TAB>count" lines to RESULT_FILE.

    file_list = u.GetFileList(u.utf8_2_gbk(FILE_PATH), [])
    for f in file_list:
        file_dict[f.encode('gbk')] = rm_repeat(
            u.create_file_body(f.encode('utf-8')))

    # Every handle below is managed by `with`, so it is closed even when an
    # error occurs (the read handle used to be left open), and open()
    # replaces the Python-2-only file() builtin.
    with open('total.csv', 'w+') as total_file:
        for unique_lines in file_dict.values():
            total_file.writelines(unique_lines)

    with open('total.csv', 'rb') as count_file:
        for line in count_file:
            content = u.create_content(line, 1)
            count_dict[content] = count_dict.get(content, 0) + 1

    with open(RESULT_FILE, 'w+') as result_file:
        for key, value in count_dict.items():
            result_file.write(key + '\t' + str(value) + '\n')
Пример #4
0
    for key, value in res.items():
        rowNum_list = value[2:].split(",")
        if len(rowNum_list) >= NUMBER:
            for num in rowNum_list:
                remove_list.append(num)
        else:
            for num in rowNum_list:
                save_list.append(num)


if __name__ == "__main__":
    # Split the rows of SOURCE_FILE into two output files, REMOVE_FILE and
    # SAVE_FILE, according to the row numbers collected in remove_list and
    # save_list.

    try:
        # Single-argument print(...) parses identically on Python 2 and 3.
        print(u.utf8_2_gbk('开始执行'))
        result_file_head = u.create_file_head(SOURCE_FILE)  # file header
        result_file_body = u.create_file_body(SOURCE_FILE)  # file content
        # Presumably fills save_list / remove_list with row numbers --
        # confirm against the definition of factory().
        factory(result_file_body)

        # Row numbers are converted with int(num) - 1 before indexing, i.e.
        # they are treated as 1-based positions in result_file_body.
        save_file_list.extend(
            result_file_body[int(num) - 1] for num in save_list)
        remove_file_list.extend(
            result_file_body[int(num) - 1] for num in remove_list)

        print(u.utf8_2_gbk(SAVE_FILE + '行数:' + str(len(save_file_list))))
        print(u.utf8_2_gbk(REMOVE_FILE + '行数:' + str(len(remove_file_list))))
        # Rows that met the removal condition.
        u.create_result_file(REMOVE_FILE, result_file_head,
                             remove_file_list)
        # Rows that did not meet the condition.
        u.create_result_file(SAVE_FILE, result_file_head,
                             save_file_list)
    except Exception:
        # Report the failure but let KeyboardInterrupt / SystemExit
        # propagate instead of being swallowed by a bare `except:`.
        traceback.print_exc()
0
def rm_repeat(file_list):
    """Collect the unique values extracted from ``file_list``.

    Each line is passed to ``u.create_content(line, 1)``; double quotes are
    stripped from the result and a trailing newline is added.  A set is used
    to drop duplicates, so the returned list has no guaranteed ordering.

    Args:
        file_list: Iterable of lines to process.

    Returns:
        A list containing every distinct cleaned value exactly once.
    """
    seen = set()
    for row in file_list:
        cleaned = u.create_content(row, 1).replace('"', '')
        seen.add(cleaned + '\n')
    return [entry for entry in seen]


if __name__ == '__main__':

    try:
        print u.utf8_2_gbk('开始执行')
        file_list = u.GetFileList(FILE_PATH, [])
        for f in file_list:
            file_dict[f.encode('gbk')] = rm_repeat(u.create_file_body(f.encode('utf-8')))

        total_file = file('total.csv', 'w+')
        for key, value in file_dict.items():
            total_file.writelines(value)
        total_file.close()

        count_file = open('total.csv', 'rb')
        for line in count_file:
            content = u.create_content(line, 1)
            count_dict[content] = count_dict.get(content, 0) + 1
        count_file.close()

        u.writeDictFile(RESULT_FILE, count_dict, 1)
        print u.utf8_2_gbk('执行完毕')
        print u.utf8_2_gbk('输出文件路径:') + sys.path[0] + u.utf8_2_gbk('\\' + RESULT_FILE)
Пример #6
0
# coding=utf-8
import sys
import re
import util as u
import os

'''
功能说明:将指定路径的文件合并成一个文件(文件格式为csv)
'''

################## Parameters #################
FILE_PATH = r'G:\merge_n_file\data'  # path of the files to merge
RESULT_FILE = 'result.csv'  # merged output file
RESULT_HEAD = r'G:\merge_n_file\data\3779_利鑫-999道私房菜.csv'  # file whose header is used as the header of the merged file
##########################################
# Maps each input file name (GBK-encoded) to the body read from that file.
file_dict = {}

if __name__ == '__main__':
    # Merge the bodies of all files found under FILE_PATH into RESULT_FILE,
    # preceded by the header taken from RESULT_HEAD.

    file_list = u.GetFileList(u.utf8_2_gbk(FILE_PATH), [])
    for f in file_list:
        file_dict[f.encode('gbk')] = u.create_file_body(f.encode('utf-8'))

    # `with` closes the result file even if reading the header or a write
    # fails; open() replaces the Python-2-only file() builtin.
    with open(RESULT_FILE, 'w+') as result_file:
        result_file_head = u.create_file_head(RESULT_HEAD)
        result_file.write(result_file_head)

        # Only the bodies are needed; the file names are not written.
        for body_lines in file_dict.values():
            result_file.writelines(body_lines)