#!/bin/bash # -*- coding=utf-8 -*- # -*- author=liangzhicheng -*- import re import os import sys import util as u import linecache import traceback ########################参数说明####################### COUNT = 10 # 要切分的列数 ###################################################### FILE_NAME, FILE_PATH = u.getFirstFile('csv') def create_file(fileName): fileList = linecache.getlines(u.utf8_2_gbk(fileName)) fileHead = fileList[0] fileBody = fileList[1:] fileBLen = len(fileBody) return [fileHead, fileBody, fileBLen] if __name__ == '__main__': try: fileHead, fileBody, fileBLen = create_file(FILE_PATH) middle = (fileBLen / COUNT) + 1 for num in range(COUNT): left = num * middle
f.close() return [typeCount, count] def createPattern(str, accurate): # 添加正则匹配规则 if accurate: result = '^' + str.replace(',', '$|^') + '$' else: result = '.*' + str.replace(',', '.*|.*') + '.*' return result if __name__ == '__main__': try: LABEL_FILE, LABEL_PATH = u.getFirstFile('txt') SOURCE_FILE, SOURCE_PATH = u.getFirstFile('csv') print u.utf8_2_gbk('打标签文件:' + LABEL_FILE) print u.utf8_2_gbk('数据源文件:' + SOURCE_FILE) source_file_body = u.create_file_body(SOURCE_PATH) for num, line in enumerate(source_file_body): source_file_body[num] = line.strip().lower() + ',' + '\n' labelType, labelNum = createMoreMatch(LABEL_PATH) matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]]) print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个') for key, value in labelType.items(): count += 1 print u.utf8_2_gbk('当前执行到第' + str(count) + '个') words = value.strip().split('|') if len(words) == 1: c = createPattern(words[0], ACCURATE)
import re
import util as u
import constants as c
import traceback

'''
Purpose:
    Count how often each value of one column occurs; rows whose value occurs
    more than the configured number of times are removed.
    Data files live in the "source" folder; they are CSV, GBK-encoded.
'''

############## Parameters ##############
COLUMN = 1    # which column to process
NUMBER = 100  # occurrence threshold
########################################

# Input file.
SOURCE_NAME, SOURCE_FILE = u.getFirstFile('csv')

# Output file for rows that do NOT meet the condition.
SAVE_FILE = u.changeFileName(SOURCE_NAME, '少于' + str(NUMBER) + '次.csv')

# Output file for rows that DO meet the condition.
# NOTE(review): the original comment said "at least 101 times" while the file
# name is built from NUMBER (100) -- confirm which boundary is intended.
REMOVE_FILE = u.changeFileName(SOURCE_NAME, '大于等于' + str(NUMBER) + '次.csv')

save_list = []    # line numbers of the rows that are kept
remove_list = []  # line numbers of the rows that are filtered out
content_list = []
save_file_list = []
remove_file_list = []
res = {}


def remove_linebreak(line):
    """Return ``line`` with leading and trailing whitespace stripped."""
    stripped = line.strip()
    return stripped
import os
import util as u
import linecache

reload(sys)
sys.setdefaultencoding('utf8')

'''
Label words and keywords are separated by a tab; multiple keywords are
separated by ","; match words and filter words are separated by "|".
The data source file is CSV, GBK-encoded.
The label file is TXT, GBK-encoded.
Example:
    label word <TAB> keywords|filter keywords
    水饺 <TAB> 水饺,云吞,馄饨,饺子|汤圆,元宵
'''

########################### Parameters ###########################
LABEL_FILE = u.getFirstFile('txt')
SOURCE_FILE = u.getFirstFile('csv')
COLUNM = 8       # column to match against
ACCURATE = True  # whether to match exactly
##################################################################

keyWordCount = {}
count = 0


def combinefileName(file1, file2):
    """Build the output name ``<stem of file1>-<stem of file2>.csv``.

    Only the final extension of each input is removed.  The previous
    ``split('.')[0]`` cut at the FIRST dot, which mangled names such as
    ``a.b.csv`` (became ``a``) and relative paths such as ``./x.csv``
    (became an empty string).

    Args:
        file1: first file name (or path).
        file2: second file name (or path).

    Returns:
        The combined file name, always ending in ``.csv``.
    """
    stem1 = os.path.splitext(file1)[0]
    stem2 = os.path.splitext(file2)[0]
    return stem1 + '-' + stem2 + '.csv'


def createMoreMatch(fileName):
    typeCount = {}
数据文件放在source文件夹,数据文件为csv,gbk编码 ''' ####################以下是参数###################### COLUMN = 37 # 要清洗的数据在第几列 PATTERNTYPE = 3 # 选择匹配规则patternTemp1,patternTemp2,patternTemp3 ####################以上是参数###################### patternTemp1 = "@[^,,::\s@()/]+" patternTemp2 = "\[.*?\]" patternTemp3 = "#.*?#" DELIMITER = "," CODING = "gbk" SOURCE, SOURCEPATH = u.getFirstFile('csv') COUNT_FILE = u.changeFileName(SOURCE, '-总数统计.txt') OUTPUT_FILE = u.changeFileName(SOURCE, '-统计.csv') i = 0 j = 0 result = dict() def utf8_2_gbk(src): res = src.decode("utf-8").encode("gbk", "ignore") return res if __name__ == "__main__":
import sys import re import util as u import os import traceback ''' 功能说明:将指定路径的csv文件合并成一个csv文件 ''' ######################参数说明###################### RESULT_FILE = 'result.csv' # 合并之后的文件 ################################################### file_dict = {} FILE_PATH = sys.path[0] + '\\source' # 文件路径 RESULT_HEAD = u.getFirstFile('csv')[1] # 默认以第一个csv文件抬头为合并后的csv文件抬头 if __name__ == '__main__': try: print u.utf8_2_gbk('开始执行') file_list = u.GetFileList(FILE_PATH, []) for f in file_list: file_dict[f.encode('gbk')] = u.create_file_body(f.encode('utf-8')) result_file = file(u.utf8_2_gbk(RESULT_FILE), 'w+') result_file_head = u.create_file_head(RESULT_HEAD) result_file.write(result_file_head) for key, value in file_dict.items(): result_file.writelines(value) result_file.close() print u.utf8_2_gbk('执行完毕') print u.utf8_2_gbk('输出文件路径:') + sys.path[0] + u.utf8_2_gbk('\\' + RESULT_FILE)
import os
import util as u
import sys
import traceback

'''
Purpose:
    Split the data file into one part whose rows contain a keyword and one
    part whose rows do not.
    Keywords are separated by commas; the data file is CSV (GBK-encoded); the
    keyword file is TXT (GBK-encoded); keywords may be Chinese or English,
    upper/lower case is supported, and exact matching is supported.
    Keywords go in the "label" folder, data files in the "source" folder.
'''

#################### Parameters and notes #####################
COLUMN = 5       # column to match against
ACCURATE = True  # whether to match exactly
###############################################################
SOURCE, SOURCEPATH = u.getFirstFile('csv')
LABEL, LABELPATH = u.getFirstFile('txt')
result_list = []
remove_list = []

# NOTE(review): this definition continues past the end of the visible chunk;
# only the part shown here is annotated.
def createPattern(fileName):
    # Accumulates one '|'-prefixed regex alternative per usable line.
    content = ''
    f = open(u.utf8_2_gbk(fileName))
    for line in f:
        # Skip lines that are blank once their last character is dropped.
        if line[:-1].strip():
            if ACCURATE:
                # Exact mode: anchor the stripped line at both ends.
                content += '|' + '^' + line.strip() + '$'
            else:
                # Loose mode: allow any text around the stripped line.
                content += '|' + '.*' + line.strip() + '.*'