#!/usr/bin/env python
# coding=utf-8
"""Group a CSV by one column and count the rows in each group.

Reads SOURCE_FILE, tallies how many rows carry each distinct value of
column COLUMN, then writes the source header (with a count column
appended on the right) followed by one `value,count` row per group
to RESULT_FILE.
"""
import re
import sys
import os

import util as u

############## parameters ###############
SOURCE_FILE = 'hebing.csv'  # input file
RESULT_FILE = 'result.csv'  # output file
COLUMN = 1                  # 1-based index of the column to group by
#########################################

count_dict = {}  # column value -> number of occurrences

if __name__ == '__main__':
    # header of the source file with a count column appended on the right
    source_file_head = u.create_file_head(SOURCE_FILE, 'right', ['次数'])
    source_file_body = u.create_file_body(SOURCE_FILE)
    for line in source_file_body:
        content = u.create_content(line, COLUMN)
        count_dict[content] = count_dict.get(content, 0) + 1
    # fix: original used the removed Python 2 `file()` builtin with a manual
    # close; `open()` in a with-block guarantees the handle is released.
    # Also fixed the shebang, which pointed at /bin/bash for a Python script.
    with open(RESULT_FILE, 'w+') as result_file:
        result_file.write(source_file_head)
        for key, value in count_dict.items():
            result_file.write(key + ',' + str(value) + '\n')
# NOTE(review): this chunk begins mid-function — the `def` that owns the next
# two lines is above the visible range. `str` here shadows the builtin; it is
# presumably a comma-separated keyword line being turned into a regex — confirm.
    result = '.*' + str.replace(',', '.*|.*') + '.*'
    return result


if __name__ == '__main__':
    try:
        # pick the first .txt (label file) and first .csv (data source) found
        LABEL_FILE, LABEL_PATH = u.getFirstFile('txt')
        SOURCE_FILE, SOURCE_PATH = u.getFirstFile('csv')
        print u.utf8_2_gbk('打标签文件:' + LABEL_FILE)
        print u.utf8_2_gbk('数据源文件:' + SOURCE_FILE)
        source_file_body = u.create_file_body(SOURCE_PATH)
        # normalise every row: lower-case it and leave a trailing empty
        # field where matching label keys are appended below
        for num, line in enumerate(source_file_body):
            source_file_body[num] = line.strip().lower() + ',' + '\n'
        # createMoreMatch / ACCURATE / COLUNM / keyWordCount / count are
        # defined outside the visible range — TODO confirm their semantics
        labelType, labelNum = createMoreMatch(LABEL_PATH)
        matchHead = u.create_file_head(SOURCE_PATH, 'right',
                                       [LABEL_FILE.split('.')[0]])
        print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')
        for key, value in labelType.items():
            count += 1
            print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
            # each label value is one or two '|'-separated pattern words
            words = value.strip().split('|')
            if len(words) == 1:
                # single-pattern label: tag every matching row with the key
                c = createPattern(words[0], ACCURATE)
                p = re.compile(c)
                for num, line in enumerate(source_file_body):
                    content = u.create_content(line, COLUNM)
                    if p.match(content):
                        source_file_body[num] = source_file_body[num].strip() + key + '|' + '\n'
                        # NOTE(review): statement grouping reconstructed from a
                        # collapsed source — count assumed to track matches only
                        keyWordCount[key] = keyWordCount.get(key, 0) + 1
            if len(words) == 2:
                # two-pattern label — the chunk is cut off inside this branch
                c = createPattern(words[0], ACCURATE)
# NOTE(review): chunk starts mid-script — imports (linecache, re, util as u)
# and the names referenced below (SOURCE_FILE, COLUNM, LABELWORD, labelWordp,
# keyWordCount, i, columnName, RESULT_FILE) are defined above the visible range.
head = linecache.getline(u.utf8_2_gbk(SOURCE_FILE), 1).strip()
TOTALCOLUNM = len(head.split(','))  # column count of the original header
print u.utf8_2_gbk('标签词个数:') + u.printDictLen(labelWordp)
source_file_body = u.create_file_body(SOURCE_FILE)
for key, value in labelWordp.items():
    i += 1
    print u.utf8_2_gbk('当前执行到{0}个'.format(i))
    for num, line in enumerate(source_file_body):
        data = line.strip().split(',')
        # a row that already carries the extra label column was tagged by an
        # earlier label word — skip it
        if len(data) == TOTALCOLUNM + 1:
            continue
        content = data[COLUNM - 1]
        # NOTE(review): compiled anew for every row; hoisting above the inner
        # loop would be equivalent — left byte-identical here
        p = re.compile(value)
        if p.match(content):
            source_file_body[num] = source_file_body[num].strip() + ',' + key + '\n'
            keyWordCount[key] = keyWordCount.get(key, 0) + 1
# pad rows that never matched with an empty label column so every row ends
# up with TOTALCOLUNM + 1 fields
for num, line in enumerate(source_file_body):
    data = line.strip().split(',')
    if len(data) == TOTALCOLUNM + 1:
        continue
    source_file_body[num] = source_file_body[num].strip() + ',' + '' + '\n'
result_file_head = u.create_file_head(SOURCE_FILE, 'right',
                                      [u.gbk_2_utf8(columnName)])
u.create_result_file(RESULT_FILE, result_file_head, source_file_body)
KEYWORD_FILE = LABELWORD.split('.')[0] + '统计.txt'
u.writeDictFile(KEYWORD_FILE, keyWordCount)  # write per-label match counts
def createPattern(match_words):
    """Build an OR-of-substrings regex from a pair of keyword lists.

    `match_words` is a 2-tuple of word lists (named c…/e… below —
    presumably Chinese and English keyword sets, TODO confirm); every
    word `w` becomes the alternative `.*w.*` and all alternatives are
    joined with `|`.

    Fix: the original emitted a spurious leading `|` when the first list
    was empty but the second was not — an empty alternative that matches
    every string. Joining the concatenated list avoids that while
    producing identical output in all other cases.
    """
    cmatch_words = ['.*' + line.strip() + '.*' for line in match_words[0]]
    ematch_words = ['.*' + line.strip() + '.*' for line in match_words[1]]
    return '|'.join(cmatch_words + ematch_words)


if __name__ == '__main__':
    # NOTE(review): SOURCE_FILE, MATCH_FILE, COLUMN, resultFile, removeFile,
    # getFileName, re and util (u) are defined above the visible range; the
    # chunk is also cut off shortly after this point.
    source_file_body = u.create_file_body(SOURCE_FILE)
    source_file_head = u.create_file_head(SOURCE_FILE)
    match_words = u.create_match_words(MATCH_FILE)
    pattern = createPattern(match_words)
    p = re.compile(pattern)
    # split rows into those whose COLUMN content matches any keyword and
    # those that match none
    for line in source_file_body:
        content = u.create_content(line, COLUMN)
        if p.match(content):
            resultFile.append(line)
        else:
            removeFile.append(line)
    resultFileName = getFileName(SOURCE_FILE, '-含关键词.csv')
    removeFileName = getFileName(SOURCE_FILE, '-不含关键词.csv')
# NOTE(review): chunk starts mid-definition — the function owning this loop
# (presumably `factory`, called below) plus `res`, `NUMBER`, `remove_list`,
# `save_list`, `save_file_list`, `remove_file_list`, SAVE_FILE, REMOVE_FILE
# and util (u) are defined outside the visible range.
    for key, value in res.items():
        # value looks like "0,<row>,<row>,..."; value[2:] strips the
        # leading counter before splitting into row numbers — TODO confirm
        rowNum_list = value[2:].split(",")
        if len(rowNum_list) >= NUMBER:
            # clusters with at least NUMBER rows are scheduled for removal
            for num in rowNum_list:
                remove_list.append(num)
        else:
            for num in rowNum_list:
                save_list.append(num)


if __name__ == "__main__":
    try:
        print u.utf8_2_gbk('开始执行')
        result_file_head = u.create_file_head(SOURCE_FILE)  # file header
        result_file_body = u.create_file_body(SOURCE_FILE)  # file body
        factory(result_file_body)  # populate save_list / remove_list
        # row numbers are 1-based, list indices 0-based — hence the -1
        for num in save_list:
            save_file_list.append(result_file_body[int(num) - 1])
        for num in remove_list:
            remove_file_list.append(result_file_body[int(num) - 1])
        print u.utf8_2_gbk(SAVE_FILE + '行数:' + str(len(save_file_list)))
        print u.utf8_2_gbk(REMOVE_FILE + '行数:' + str(len(remove_file_list)))
        # rows meeting the threshold (original note: >= 101 occurrences)
        u.create_result_file(REMOVE_FILE, result_file_head, remove_file_list)
        # rows below the threshold
        u.create_result_file(SAVE_FILE, result_file_head, save_file_list)
    except:
        # NOTE(review): bare except — the handler body is beyond the visible range
# coding=utf-8
import sys
import re
import os

import util as u

'''
Merge every file under FILE_PATH into one CSV: the header comes from
RESULT_HEAD, then the body lines of each input file are appended.
'''
################# parameters #################
FILE_PATH = r'G:\merge_n_file\data'  # directory holding the files to merge
RESULT_FILE = 'result.csv'           # merged output file
# file whose first line supplies the merged header
RESULT_HEAD = r'G:\merge_n_file\data\3779_利鑫-999道私房菜.csv'
##############################################

file_dict = {}  # gbk-encoded file name -> list of body lines

if __name__ == '__main__':
    file_list = u.GetFileList(u.utf8_2_gbk(FILE_PATH), [])
    for f in file_list:
        # NOTE(review): keying by gbk bytes while reading via utf-8 bytes is
        # kept as-is — presumably matches util's encoding expectations; verify
        file_dict[f.encode('gbk')] = u.create_file_body(f.encode('utf-8'))
    # fix: original used the removed Python 2 `file()` builtin with a manual
    # close; `open()` in a with-block guarantees the handle is released
    with open(RESULT_FILE, 'w+') as result_file:
        result_file.write(u.create_file_head(RESULT_HEAD))
        # only the bodies are needed — iterate values, not items
        for value in file_dict.values():
            result_file.writelines(value)
行数----1000----10000----100000----1000000--
耗时-----2s------11s------129s------1432s---
内存----0.3mb----3mb------33mb------400mb---
"""
# NOTE(review): the lines above are the tail of a module docstring whose
# opening quotes — like the imports (re, jieba, util as u) and constants
# (COLUMN, TOPKET) used below — sit outside the visible range.
__author__ = "liangzhicheng"

SOURCENAME, SOURCEPATH = u.getFirstFile('csv')  # first .csv in the cwd
cluster = {}            # keyword signature -> "<prev>,<row>,<row>,..."
result_file_body = []
# characters stripped before keyword extraction (word chars + punctuation)
pattern = re.compile("\w|[/.,/#@$%^& ]")
count_file_dict = {}    # cluster index -> cluster size

if __name__ == '__main__':
    try:
        source_file_head = u.create_file_head(SOURCEPATH, 'left', ['类型'])
        source_file_body = u.create_file_body(SOURCEPATH)
        print u.utf8_2_gbk('开始执行聚类')
        for num, line in enumerate(source_file_body):
            content = re.sub(pattern, '', u.create_content(line, COLUMN))
            # short texts get 2 keywords, longer ones TOPKET — presumably to
            # stabilise the cluster signature on short content; confirm
            if len(content) <= 20:
                keywords = jieba.analyse.extract_tags(content, topK=2)
            else:
                keywords = jieba.analyse.extract_tags(content, topK=TOPKET)
            # sorted keywords joined by ',' form a stable cluster key
            keywords.sort()
            key = ','.join(keywords)
            # accumulate 1-based row numbers: "0,<row1>,<row2>,..."
            cluster[key] = str(cluster.get(key, 0)) + "," + str(num + 1)
        print u.utf8_2_gbk('聚类完成,生成输出文件')
        for num, value in enumerate(cluster.itervalues()):
            # value[2:] strips the leading "0," sentinel before counting rows
            cluster_list = value[2:].split(',')
            count_file_dict[num] = len(cluster_list)
# NOTE(review): chunk starts mid-function — the `def` owning these lines
# (presumably `createPattern`, called below) plus ACCURATE, SOURCEPATH,
# LABELPATH, COLUMN, SOURCE, result_list, remove_list, re and util (u)
# are defined above the visible range.
    content = ''
    f = open(u.utf8_2_gbk(fileName))
    for line in f:
        # skip blank lines (line[:-1] drops the trailing newline first)
        if line[:-1].strip():
            if ACCURATE:
                # exact-match alternative: ^word$
                content += '|' + '^' + line.strip() + '$'
            else:
                # substring alternative: .*word.*
                content += '|' + '.*' + line.strip() + '.*'
    f.close()
    # drop the leading '|' and lower-case the whole pattern
    return content[1:].lower()


if __name__ == '__main__':
    try:
        source_file_body = u.create_file_body(SOURCEPATH)
        source_file_head = u.create_file_head(SOURCEPATH)
        m = createPattern(LABELPATH)
        print m
        print '===============>>' + u.utf8_2_gbk(
            '若乱码,匹配词文件请使用gbk编码') + '<<==================='
        p = re.compile(m)
        print u.utf8_2_gbk('数据源文件行数:') + str(len(source_file_body))
        # split rows by whether the COLUMN content matches any keyword;
        # matching is case-insensitive (both sides are lower-cased)
        for line in source_file_body:
            content = u.create_content(line, COLUMN).lower()
            if p.match(content):
                result_list.append(line)
            else:
                remove_list.append(line)
        print u.utf8_2_gbk('不包含关键词行数:') + str(len(remove_list))
        print u.utf8_2_gbk('包含关键词行数:') + str(len(result_list))
        # NOTE(review): the chunk is cut off in the middle of this call
        u.create_result_file(u.changeFileName(SOURCE, '-含关键词.csv'),
将结果按照每个cluster的大小排序,大的在前面
性能:
行数----1000----10000----100000----1000000--
耗时-----2s------11s------129s------1432s---
内存----0.3mb----3mb------33mb------400mb---
"""
# NOTE(review): the lines above are the tail of a module docstring whose
# opening quotes — like the imports (re, jieba, util as u) and constants
# (SOURCE_FILE, COLUMN, TOPKET) used below — sit outside the visible range.
__author__ = "liangzhicheng"

cluster = {}           # keyword signature -> "<prev>,<row>,<row>,..."
result_file_body = []
# characters stripped before keyword extraction (word chars + punctuation)
pattern = re.compile("\w|[/.,/#@$%^& ]")
count_file_list = []   # "<index>\t<cluster size>\n" lines

if __name__ == '__main__':
    source_file_head = u.create_file_head(SOURCE_FILE, 'left', ['类型'])
    source_file_body = u.create_file_body(SOURCE_FILE)
    for num, line in enumerate(source_file_body):
        content = re.sub(pattern, '', u.create_content(line, COLUMN))
        # short texts get 2 keywords, longer ones TOPKET — presumably to
        # stabilise the cluster signature on short content; confirm
        if len(content) <= 20:
            keywords = jieba.analyse.extract_tags(content, topK=2)
        else:
            keywords = jieba.analyse.extract_tags(content, topK=TOPKET)
        # sorted keywords joined by ',' form a stable cluster key
        keywords.sort()
        key = ','.join(keywords)
        # accumulate 1-based row numbers: "0,<row1>,<row2>,..."
        cluster[key] = str(cluster.get(key, 0)) + "," + str(num + 1)
    for num, value in enumerate(cluster.itervalues()):
        # value[2:] strips the leading "0," sentinel before counting rows
        cluster_list = value[2:].split(',')
        count_file_list.append(str(num) + '\t' + str(len(cluster_list)) + '\n')