def createMoreMatch(fileName):
    """Build the label -> match-keyword table from a tab-separated file.

    Every non-blank line of ``fileName`` is split on tabs: column 0 is the
    label, column 1 its keyword spec.  Specs that share a label are
    concatenated with commas.  A spec may contain one ``|``; the
    comma-separated words on each side of it are passed through
    ``changeStr`` and re-joined with the same separators.

    Returns:
        ``[typeCount, count]`` -- the label -> normalised spec dict and the
        number of labels in it.

    Raises:
        IndexError: if a non-blank line has no second tab-separated column.
    """
    typeCount = {}
    count = 0
    # The context manager releases the handle even when a malformed line
    # raises part-way through the file.
    with open(u.utf8_2_gbk(fileName), 'rb') as label_file:
        for line in label_file:
            if not line[:-1].strip():
                continue
            try:
                content = u.utf8_2_gbk(line).strip().split('\t')
            except Exception:
                # Conversion failed: fall back to the line as read.
                # ``Exception`` (not a bare except) lets Ctrl-C through.
                content = line.strip().split('\t')
            typeCount[content[0]] = typeCount.get(content[0], '') + ',' + content[1]

    for key, value in typeCount.items():
        count += 1
        # Drop the leading ',' that the accumulation above always adds.
        parts = value[1:].split('|')
        if len(parts) == 2:
            head_words = ','.join(map(changeStr, parts[0].split(',')))
            tail_words = ','.join(map(changeStr, parts[1].split(',')))
            typeCount[key] = head_words + '|' + tail_words
        elif len(parts) == 1:
            typeCount[key] = ','.join(map(changeStr, parts[0].split(',')))
        # Specs with more than one '|' are left exactly as accumulated
        # (leading comma included), as before.

    # Debug aid: u.writeDictFile('match.txt', typeCount, 0) writes the
    # resulting table to a file.
    return [typeCount, count]
def createPattern(fileName):
    """Build one lower-cased regex alternation from a keyword file.

    Each non-blank line of ``fileName`` becomes the alternative
    ``.*<keyword>.*``; the alternatives are joined with ``|``.  Keywords
    are inserted verbatim, so regex metacharacters in them stay active.

    Returns:
        The lower-cased pattern string, or ``''`` when the file contains
        no keywords.
    """
    # ``with`` guarantees the handle is released even if reading fails.
    with open(u.utf8_2_gbk(fileName)) as keyword_file:
        # Test the stripped line itself: the former ``line[:-1]`` check
        # dropped a one-character keyword on a final line without a newline.
        keywords = [line.strip() for line in keyword_file if line.strip()]
    return '|'.join('.*' + word + '.*' for word in keywords).lower()
def createContent(fileName, rows):
    """Read the data lines of ``fileName`` and prepare them for matching.

    The first line is skipped (presumably the header row -- confirm).
    Every following line is handed to ``u.create_content(line, rows)``;
    the result is lower-cased and suffixed with a comma and a newline.

    Returns:
        ``[result, count]`` -- the list of prepared strings and its length.
        An empty file yields ``[[], 0]``.
    """
    with open(u.utf8_2_gbk(fileName), 'rb') as source:
        # Skip the first line; the default keeps an empty file from
        # escaping as a bare StopIteration.
        next(source, None)
        result = [u.create_content(line, rows).lower() + ',' + '\n'
                  for line in source]
    return [result, len(result)]
def createMoreMatch(fileName):
    """Build the label -> lower-cased keyword list from a tab-separated file.

    Every non-blank line of ``fileName`` is split on tabs: column 0 is the
    label, column 1 a keyword.  Keywords that share a label are joined
    with commas and the joined string is lower-cased.

    Returns:
        ``[typeCount, count]`` -- the label -> keywords dict and the number
        of labels in it.

    Raises:
        IndexError: if a non-blank line has no second tab-separated column.
    """
    typeCount = {}
    # The original never closed this handle; ``with`` closes it on every path.
    with open(u.utf8_2_gbk(fileName), 'rb') as label_file:
        for line in label_file:
            if line[:-1].strip():
                content = line.strip().split('\t')
                typeCount[content[0]] = typeCount.get(content[0], '') + ',' + content[1]

    for key in list(typeCount):
        # Drop the leading ',' that the accumulation above always adds.
        typeCount[key] = typeCount[key][1:].lower()
    return [typeCount, len(typeCount)]
result = [] count = 0 f = open(u.utf8_2_gbk(fileName), 'rb') f.next() for line in f: count += 1 result.append(u.create_content(line, rows).lower() + ',' + '\n') f.close() return [result, count] if __name__ == '__main__': source_file_body, totalRows = createContent(SOURCE_FILE, COLUNM) labelType, labelNum = createMoreMatch(LABEL_FILE) matchHead = u.utf8_2_gbk('内容' + ',' + LABEL_FILE.split('.')[0] + '\n') print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个') for key, value in labelType.items(): count += 1 print u.utf8_2_gbk('当前执行到第' + str(count) + '个') words = value.strip().split('|') if len(words) == 1: c = createPattern(words[0]) p = re.compile(c) for num, line in enumerate(source_file_body): if p.match(line): source_file_body[num] = source_file_body[num].strip( ) + key + '|' + '\n' keyWordCount[key] = keyWordCount.get(key, 0) + 1
##################################################### count_dict = {} file_dict = {} def rm_repeat(file_list): rm_set = set() for line in file_list: content = u.create_content(line, 1) rm_set.add(content.replace('"', '') + '\n') return list(rm_set) if __name__ == '__main__': file_list = u.GetFileList(u.utf8_2_gbk(FILE_PATH), []) for f in file_list: file_dict[f.encode('gbk')] = rm_repeat( u.create_file_body(f.encode('utf-8'))) total_file = file('total.csv', 'w+') for key, value in file_dict.items(): total_file.writelines(value) total_file.close() count_file = open('total.csv', 'rb') for line in count_file: content = u.create_content(line, 1) count_dict[content] = count_dict.get(content, 0) + 1 result_file = file(RESULT_FILE, 'w+')
def create_file(fileName):
    """Split a text file into its first line and the remaining lines.

    The lines are obtained through ``linecache.getlines`` after the name
    has been passed through ``u.utf8_2_gbk``.

    Returns:
        ``[head, body, body_length]`` -- the first line, the list of all
        following lines, and how many of those there are.

    Raises:
        IndexError: if no lines were obtained for ``fileName``.
    """
    lines = linecache.getlines(u.utf8_2_gbk(fileName))
    header = lines[0]
    body = lines[1:]
    return [header, body, len(body)]
FILE_NAME, FILE_PATH = u.getFirstFile('csv') def create_file(fileName): fileList = linecache.getlines(u.utf8_2_gbk(fileName)) fileHead = fileList[0] fileBody = fileList[1:] fileBLen = len(fileBody) return [fileHead, fileBody, fileBLen] if __name__ == '__main__': try: fileHead, fileBody, fileBLen = create_file(FILE_PATH) middle = (fileBLen / COUNT) + 1 for num in range(COUNT): left = num * middle right = (num + 1) * middle u.create_result_file(u.changeFileName(FILE_NAME, '-' + str(num) + '.csv'), fileHead, fileBody[left:right]) except: traceback.print_exc() print '==============================================================' print u.utf8_2_gbk('运行出错') print u.utf8_2_gbk('常见错误') print u.utf8_2_gbk('IndexError: list index out of range') print u.utf8_2_gbk('匹配列选择错误或source文件夹为空或label文件夹为空') print '==============================================================' raw_input('Press Enter to exit...')
COLUNM = 13 # 需要匹配的列 LABELWORD = '平台分类.txt' # 匹配关键词 ########################################## i = 0 keyWordCount = {} def writeFileList(list, fileName): f = file(fileName, 'w+') f.writelines(list) f.close() if __name__ == '__main__': columnName = u.GetFileNameAndExt(u.utf8_2_gbk(LABELWORD))[0] labelWords = u.create_match_words(LABELWORD) labelWordp = u.build_match_label(labelWords) head = linecache.getline(u.utf8_2_gbk(SOURCE_FILE), 1).strip() TOTALCOLUNM = len(head.split(',')) print u.utf8_2_gbk('标签词个数:') + u.printDictLen(labelWordp) source_file_body = u.create_file_body(SOURCE_FILE) for key, value in labelWordp.items(): i += 1 print u.utf8_2_gbk('当前执行到{0}个'.format(i)) for num, line in enumerate(source_file_body): data = line.strip().split(',') if len(data) == TOTALCOLUNM + 1: continue content = data[COLUNM - 1]
res[content] = str(res.get(content, 0)) + "," + str(count) for key, value in res.items(): rowNum_list = value[2:].split(",") if len(rowNum_list) >= NUMBER: for num in rowNum_list: remove_list.append(num) else: for num in rowNum_list: save_list.append(num) if __name__ == "__main__": try: print u.utf8_2_gbk('开始执行') result_file_head = u.create_file_head(SOURCE_FILE) # 文件标题 result_file_body = u.create_file_body(SOURCE_FILE) # 文件内容 factory(result_file_body) # 构造输出文件 for num in save_list: save_file_list.append(result_file_body[int(num) - 1]) for num in remove_list: remove_file_list.append(result_file_body[int(num) - 1]) print u.utf8_2_gbk(SAVE_FILE + '行数:' + str(len(save_file_list))) print u.utf8_2_gbk(REMOVE_FILE + '行数:' + str(len(remove_file_list))) u.create_result_file(REMOVE_FILE, result_file_head, remove_file_list) # 符合条件的输出文件(大于等于101次) u.create_result_file(SAVE_FILE, result_file_head, save_file_list) # 不符合条件的输出文件
def createPattern(fileName): content = '' f = open(u.utf8_2_gbk(fileName)) for line in f: if line[:-1].strip(): content += '|' + '.*' + line.strip() + '.*' f.close() return content[1:].lower() if __name__ == '__main__': source_file_body = u.create_file_body(SOURCE_FILE) source_file_head = u.create_file_head(SOURCE_FILE) m = createPattern(MATCH_FILE) print m + '===>>' + u.utf8_2_gbk('若乱码,匹配词文件请使用gbk编码') p = re.compile(m) print u.utf8_2_gbk('数据源文件行数:') + str(len(source_file_body)) for line in source_file_body: content = u.create_content(line, COLUMN).lower() if p.match(content): result_list.append(line) else: remove_list.append(line) print u.utf8_2_gbk('不包含关键词行数:') + str(len(remove_list)) print u.utf8_2_gbk('包含关键词行数:') + str(len(result_list)) u.create_result_file(u.changeFileName(SOURCE_FILE, '-含关键词.csv'), source_file_head, result_list) u.create_result_file(u.changeFileName(SOURCE_FILE, '-不含关键词.csv'), source_file_head, remove_list) raw_input('Press Enter to exit...')
OUTPUT_FILE = u.changeFileName(SOURCE, '-统计.csv')
i = 0
j = 0
result = dict()


def utf8_2_gbk(src):
    """Decode ``src`` as UTF-8 and re-encode it as GBK.

    Characters that GBK cannot represent are dropped (``"ignore"``).
    Invalid UTF-8 input still raises from the ``decode`` step.  The visible
    script below calls ``u.utf8_2_gbk`` rather than this local function.
    """
    res = src.decode("utf-8").encode("gbk", "ignore")
    return res


if __name__ == "__main__":
    # NOTE(review): this snippet is cut off inside the ``try`` block.
    try:
        ResultWriter = file(u.utf8_2_gbk(OUTPUT_FILE), "w+")
        reader = open(u.utf8_2_gbk(SOURCEPATH), 'rb')
        count_file = file(u.utf8_2_gbk(COUNT_FILE), "w+")
        print u.utf8_2_gbk('开始执行')
        # Skip the first line of the source file.
        next(reader)
        for line in reader:
            # Take the configured column and decode it with CODING,
            # ignoring undecodable bytes.
            content = line.split(DELIMITER)[COLUMN - 1].strip().decode(
                CODING, 'ignore')
            # PATTERNTYPE selects which pattern template is compiled; the
            # selection is repeated for every line.
            if PATTERNTYPE == 1:
                pattern = re.compile(r'' + patternTemp1)
            elif PATTERNTYPE == 2:
                pattern = re.compile(r'' + patternTemp2)
            elif PATTERNTYPE == 3:
                pattern = re.compile(r'' + patternTemp3)
            matches = pattern.findall(content)
''' 功能说明: 取文件的前几行 ''' ####################以下是参数###################### LINE_NUM = 300 # 行数 ####################以上是参数###################### FILE_NAME, FILE_PATH = u.getFirstFile('csv') if __name__ == "__main__": try: RESULT_FILE = u.changeFileName(FILE_NAME, '-' + str(LINE_NUM) + '.csv') ResultWriter = file(u.utf8_2_gbk(RESULT_FILE), "w+") reader = open(u.utf8_2_gbk(FILE_PATH), 'rb') count = 0 for line in reader: count = count + 1 if count > LINE_NUM: break ResultWriter.write(line.strip() + '\n') ResultWriter.close() reader.close() except: traceback.print_exc() print '==============================================================' print u.utf8_2_gbk('运行出错') print u.utf8_2_gbk('常见错误') print u.utf8_2_gbk('IndexError: list index out of range')
if line[:-1].strip(): if ACCURATE: content += '|' + '^' + line.strip() + '$' else: content += '|' + '.*' + line.strip() + '.*' f.close() return content[1:].lower() if __name__ == '__main__': try: source_file_body = u.create_file_body(SOURCEPATH) source_file_head = u.create_file_head(SOURCEPATH) m = createPattern(LABELPATH) print m print '===============>>' + u.utf8_2_gbk( '若乱码,匹配词文件请使用gbk编码') + '<<===================' p = re.compile(m) print u.utf8_2_gbk('数据源文件行数:') + str(len(source_file_body)) for line in source_file_body: content = u.create_content(line, COLUMN).lower() if p.match(content): result_list.append(line) else: remove_list.append(line) print u.utf8_2_gbk('不包含关键词行数:') + str(len(remove_list)) print u.utf8_2_gbk('包含关键词行数:') + str(len(result_list)) u.create_result_file(u.changeFileName(SOURCE, '-含关键词.csv'), source_file_head, result_list) u.create_result_file(u.changeFileName(SOURCE, '-不含关键词.csv'), source_file_head, remove_list) except:
# Input directory: the "source" folder under sys.path[0].
FILE_PATH = sys.path[0] + '\\source'


def rm_repeat(file_list):
    """Return the distinct contents found in ``file_list``.

    Each line is reduced with ``u.create_content(line, 1)``, double quotes
    are removed and a newline is appended.  Duplicates collapse because the
    values are collected in a set, so the order of the returned list is
    arbitrary.
    """
    rm_set = set()
    for line in file_list:
        content = u.create_content(line, 1)
        rm_set.add(content.replace('"', '') + '\n')
    return list(rm_set)


if __name__ == '__main__':
    # NOTE(review): this snippet is cut off inside the ``try`` block.
    try:
        print u.utf8_2_gbk('开始执行')
        file_list = u.GetFileList(FILE_PATH, [])
        for f in file_list:
            file_dict[f.encode('gbk')] = rm_repeat(u.create_file_body(f.encode('utf-8')))
        # Concatenate every file's de-duplicated contents into total.csv ...
        total_file = file('total.csv', 'w+')
        for key, value in file_dict.items():
            total_file.writelines(value)
        total_file.close()
        # ... then count how many lines of total.csv carry each content.
        count_file = open('total.csv', 'rb')
        for line in count_file:
            content = u.create_content(line, 1)
            count_dict[content] = count_dict.get(content, 0) + 1
        count_file.close()
def createPattern(str, accurate):
    """Turn a comma-separated keyword string into a regex alternation.

    With ``accurate`` truthy every keyword is anchored as ``^keyword$``;
    otherwise every keyword is wrapped as ``.*keyword.*``.  Alternatives
    are joined with ``|``.  Keywords are inserted verbatim (not escaped).
    """
    # Add the regex matching rules.
    if accurate:
        result = '^' + str.replace(',', '$|^') + '$'
    else:
        result = '.*' + str.replace(',', '.*|.*') + '.*'
    return result


if __name__ == '__main__':
    # NOTE(review): this snippet is cut off at the inner loop header.
    try:
        LABEL_FILE, LABEL_PATH = u.getFirstFile('txt')
        SOURCE_FILE, SOURCE_PATH = u.getFirstFile('csv')
        print u.utf8_2_gbk('打标签文件:' + LABEL_FILE)
        print u.utf8_2_gbk('数据源文件:' + SOURCE_FILE)
        source_file_body = u.create_file_body(SOURCE_PATH)
        # Lower-case every row and append a trailing comma and newline.
        for num, line in enumerate(source_file_body):
            source_file_body[num] = line.strip().lower() + ',' + '\n'
        labelType, labelNum = createMoreMatch(LABEL_PATH)
        matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]])
        print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')
        for key, value in labelType.items():
            count += 1
            print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
            words = value.strip().split('|')
            # A spec without '|' has a single part.
            if len(words) == 1:
                c = createPattern(words[0], ACCURATE)
                p = re.compile(c)
                for num, line in enumerate(source_file_body):
def createPattern(str, accurate): # 添加正则匹配规则 if ACCURATE: result = '^' + str.replace(',', '$|^').replace('&', '$|^') + '$' else: result = '.*' + str.replace(',', '.*|.*').replace('&', '.*') + '.*' return result if __name__ == '__main__': try: keyWordCount = {} count = 0 LABEL_FILE, LABEL_PATH = u.getFirstFile('txt') SOURCE_FILE, SOURCE_PATH = u.getFirstFile('csv') print u.utf8_2_gbk('打标签文件:' + LABEL_FILE) print u.utf8_2_gbk('数据源文件:' + SOURCE_FILE) print u.utf8_2_gbk('是否精确匹配' + str(ACCURATE)) source_file_body = u.create_file_body(SOURCE_PATH) print len(source_file_body) for num, line in enumerate(source_file_body): source_file_body[num] = line.strip() + ',' + '\n' labelType, labelNum = createMoreMatch(LABEL_PATH) matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]]) print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个') for key, value in labelType.items(): count += 1 print u.utf8_2_gbk('当前执行到第' + str(count) + '个') words = value.strip().split('|') if len(words) == 1: # 只有关键词无过滤词