예제 #1
0
#!/bin/bash
# -*- coding=utf-8 -*-
# -*- author=liangzhicheng -*-
import re
import os
import sys
import util as u
import linecache
import traceback

######################## Parameters #######################
COUNT = 10  # number of parts to split into (original note said "columns" — TODO confirm the unit)
######################################################

# presumably (file name, file path) of the first csv file located by util — verify against util.getFirstFile
FILE_NAME, FILE_PATH = u.getFirstFile('csv')


def create_file(fileName):
    """Read a file and separate its first line from the remaining lines.

    The path is passed through ``u.utf8_2_gbk`` before being handed to
    ``linecache.getlines``.

    Returns a three-item list: ``[first line, remaining lines, number of
    remaining lines]``. Indexing the first line raises ``IndexError`` when
    no lines were read.
    """
    lines = linecache.getlines(u.utf8_2_gbk(fileName))
    header, body = lines[0], lines[1:]
    return [header, body, len(body)]


if __name__ == '__main__':
    try:
        fileHead, fileBody, fileBLen = create_file(FILE_PATH)
        middle = (fileBLen / COUNT) + 1
        for num in range(COUNT):
            left = num * middle
예제 #2
0
    f.close()
    return [typeCount, count]


def createPattern(str, accurate):
    """Build a regex alternation from a comma-separated keyword string.

    With ``accurate`` truthy every keyword is anchored as ``^keyword$``;
    otherwise every keyword is wrapped as ``.*keyword.*``. The alternatives
    are joined with ``|``. Keywords are inserted verbatim (no escaping).
    """
    keywords = str.split(',')
    if accurate:
        opening, glue, closing = '^', '$|^', '$'
    else:
        opening, glue, closing = '.*', '.*|.*', '.*'
    return opening + glue.join(keywords) + closing


if __name__ == '__main__':

    try:
        LABEL_FILE, LABEL_PATH = u.getFirstFile('txt')
        SOURCE_FILE, SOURCE_PATH = u.getFirstFile('csv')
        print u.utf8_2_gbk('打标签文件:' + LABEL_FILE)
        print u.utf8_2_gbk('数据源文件:' + SOURCE_FILE)
        source_file_body = u.create_file_body(SOURCE_PATH)
        for num, line in enumerate(source_file_body):
            source_file_body[num] = line.strip().lower() + ',' + '\n'
        labelType, labelNum = createMoreMatch(LABEL_PATH)
        matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]])
        print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')
        for key, value in labelType.items():
            count += 1
            print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
            words = value.strip().split('|')
            if len(words) == 1:
                c = createPattern(words[0], ACCURATE)
예제 #3
0
import re
import util as u
import constants as c
import traceback
'''
功能说明:
统计某列出现次数,超过指定次数,移除该行
数据文件放在source文件夹,数据文件为csv,gbk编码
'''

############## Parameters ##############
COLUMN = 1  # column to process
NUMBER = 100  # occurrence-count threshold
###################################

SOURCE_NAME, SOURCE_FILE = u.getFirstFile('csv')  # input file
SAVE_FILE = u.changeFileName(SOURCE_NAME,
                             '少于' + str(NUMBER) + '次.csv')  # output file for rows that do not meet the condition
REMOVE_FILE = u.changeFileName(SOURCE_NAME, '大于等于' + str(NUMBER) +
                               '次.csv')  # output file for rows that meet the condition (original note said "101 or more" — TODO reconcile with NUMBER)
save_list = []  # line numbers of the rows to keep
remove_list = []  # line numbers of the rows to filter out
content_list = []
save_file_list = []
remove_file_list = []
res = {}


def remove_linebreak(line):
    """Return ``line`` without its leading and trailing whitespace.

    Note that this trims all surrounding whitespace, not only the line break.
    """
    trimmed = line.strip()
    return trimmed
예제 #4
0
import os
import util as u
import linecache

# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# startup, so the module is reloaded to get it back before the default
# implicit str/unicode codec is switched to utf8.
reload(sys)
sys.setdefaultencoding('utf8')
'''
标签词和关键词用'\t'隔开,多关键词用','隔开,匹配词和过滤词用'|'隔开
数据源文件为csv,gbk编码
标签词文件为txt,gbk编码
例子:
标签词 关键词|过滤关键词
水饺  水饺,云吞,馄饨,饺子|汤圆,元宵
'''
########################### Parameters ##################
LABEL_FILE = u.getFirstFile('txt')
SOURCE_FILE = u.getFirstFile('csv')
COLUNM = 8  # column to match against
ACCURATE = True  # whether to use exact matching
#####################################################

keyWordCount = {}
count = 0


def combinefileName(file1, file2):
    """Return ``<stem1>-<stem2>.csv`` for the two given file names.

    A stem is the text before the first ``.`` in the name.
    """
    stems = [name.partition('.')[0] for name in (file1, file2)]
    return '-'.join(stems) + '.csv'


def createMoreMatch(fileName):
    typeCount = {}
예제 #5
0
数据文件放在source文件夹,数据文件为csv,gbk编码
'''

#################### Parameters begin ######################
COLUMN = 37  # which column holds the data to clean
PATTERNTYPE = 3  # which matching rule to use: patternTemp1, patternTemp2 or patternTemp3

#################### Parameters end ######################

# Candidate regular expressions (PATTERNTYPE names which one is used).
# Written as raw strings so the backslashes stay literal without relying on
# Python passing unrecognised escapes such as "\s" and "\[" through unchanged.
patternTemp1 = r"@[^,,::\s@()/]+"  # "@" followed by characters up to a comma/colon (ASCII or full-width), whitespace, "@", parenthesis or "/"
patternTemp2 = r"\[.*?\]"  # shortest text enclosed in square brackets
patternTemp3 = r"#.*?#"  # shortest text enclosed between two "#" signs

DELIMITER = ","  # presumably the field separator of the csv data — verify usage
CODING = "gbk"  # the header note says the data file is gbk-encoded
SOURCE, SOURCEPATH = u.getFirstFile('csv')
# Output file names are derived from SOURCE through util.changeFileName.
COUNT_FILE = u.changeFileName(SOURCE, '-总数统计.txt')
OUTPUT_FILE = u.changeFileName(SOURCE, '-统计.csv')

# Module-level state; how it is used is not visible from this part of the file.
i = 0
j = 0
result = dict()


def utf8_2_gbk(src):
    """Re-encode a UTF-8 byte string as GBK.

    ``src`` is decoded as UTF-8 and encoded again as GBK; characters that
    GBK cannot represent are dropped (``"ignore"`` error handler).
    """
    text = src.decode("utf-8")
    return text.encode("gbk", "ignore")


if __name__ == "__main__":
예제 #6
0
import sys
import re
import util as u
import os
import traceback
'''
功能说明:将指定路径的csv文件合并成一个csv文件
'''

###################### Parameters ######################
RESULT_FILE = 'result.csv'  # merged output file

###################################################
file_dict = {}
FILE_PATH = sys.path[0] + '\\source'  # file path
RESULT_HEAD = u.getFirstFile('csv')[1]  # by default the header of the first csv file becomes the header of the merged csv
if __name__ == '__main__':
    try:
        print u.utf8_2_gbk('开始执行')
        file_list = u.GetFileList(FILE_PATH, [])
        for f in file_list:
            file_dict[f.encode('gbk')] = u.create_file_body(f.encode('utf-8'))
        result_file = file(u.utf8_2_gbk(RESULT_FILE), 'w+')
        result_file_head = u.create_file_head(RESULT_HEAD)
        result_file.write(result_file_head)
        for key, value in file_dict.items():
            result_file.writelines(value)
        result_file.close()
        print u.utf8_2_gbk('执行完毕')
        print u.utf8_2_gbk('输出文件路径:') + sys.path[0] + u.utf8_2_gbk('\\' +
                                                                   RESULT_FILE)
예제 #7
0
import os
import util as u
import sys
import traceback
'''
功能说明:
将数据文件拆分成一份含关键词,一份不含关键词
关键词之间用逗号隔开,数据文件问csv,gbk编码,匹配词文件txt,gbk编码,匹配词支持中英文,支持大小写,支持精确匹配
关键词放在label文件夹,数据文件放在source文件夹
'''
#################### Parameters ####################
COLUMN = 5  # column to match against
ACCURATE = True  # whether to use exact matching
##################################################

SOURCE, SOURCEPATH = u.getFirstFile('csv')
LABEL, LABELPATH = u.getFirstFile('txt')

result_list = []
remove_list = []


def createPattern(fileName):
    content = ''
    f = open(u.utf8_2_gbk(fileName))
    for line in f:
        if line[:-1].strip():
            if ACCURATE:
                content += '|' + '^' + line.strip() + '$'
            else:
                content += '|' + '.*' + line.strip() + '.*'