Пример #1
0
#!/usr/bin/env python
# coding=utf-8
import re
import sys
import os
import util as u
'''
功能说明:先groupby某一列,然后统计每个类的数量,输出一份文件
'''

##############参数说明###############
SOURCE_FILE = 'hebing.csv'  # 输入文件
RESULT_FILE = 'result.csv'  # 输出文件
COLUMN = 1  # 要groupby和统计的列的列
####################################
count_dict = {}

if __name__ == '__main__':
    # Header line for the output file; the extra column name '次数' is handed
    # to the helper (presumably appended on the right -- confirm in util).
    source_file_head = u.create_file_head(SOURCE_FILE, 'right', ['次数'])
    source_file_body = u.create_file_body(SOURCE_FILE)

    # Tally how many rows share each value of the configured column.
    for line in source_file_body:
        content = u.create_content(line, COLUMN)
        count_dict[content] = count_dict.get(content, 0) + 1

    # open() replaces the Python-2-only file() builtin, and the with-block
    # guarantees the handle is flushed and closed even if a write raises.
    with open(RESULT_FILE, 'w+') as result_file:
        result_file.write(source_file_head)
        for key, value in count_dict.items():
            result_file.write(key + ',' + str(value) + '\n')
Пример #2
0
        result = '.*' + str.replace(',', '.*|.*') + '.*'
    return result


if __name__ == '__main__':

    try:
        LABEL_FILE, LABEL_PATH = u.getFirstFile('txt')
        SOURCE_FILE, SOURCE_PATH = u.getFirstFile('csv')
        print u.utf8_2_gbk('打标签文件:' + LABEL_FILE)
        print u.utf8_2_gbk('数据源文件:' + SOURCE_FILE)
        source_file_body = u.create_file_body(SOURCE_PATH)
        for num, line in enumerate(source_file_body):
            source_file_body[num] = line.strip().lower() + ',' + '\n'
        labelType, labelNum = createMoreMatch(LABEL_PATH)
        matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]])
        print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')
        for key, value in labelType.items():
            count += 1
            print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
            words = value.strip().split('|')
            if len(words) == 1:
                c = createPattern(words[0], ACCURATE)
                p = re.compile(c)
                for num, line in enumerate(source_file_body):
                    content = u.create_content(line, COLUNM)
                    if p.match(content):
                        source_file_body[num] = source_file_body[num].strip() + key + '|' + '\n'
                        keyWordCount[key] = keyWordCount.get(key, 0) + 1
            if len(words) == 2:
                c = createPattern(words[0], ACCURATE)
Пример #3
0
    head = linecache.getline(u.utf8_2_gbk(SOURCE_FILE), 1).strip()
    TOTALCOLUNM = len(head.split(','))
    print u.utf8_2_gbk('标签词个数:') + u.printDictLen(labelWordp)

    source_file_body = u.create_file_body(SOURCE_FILE)
    for key, value in labelWordp.items():
        i += 1
        print u.utf8_2_gbk('当前执行到{0}个'.format(i))
        for num, line in enumerate(source_file_body):
            data = line.strip().split(',')
            if len(data) == TOTALCOLUNM + 1:
                continue
            content = data[COLUNM - 1]
            p = re.compile(value)
            if p.match(content):
                source_file_body[num] = source_file_body[num].strip() + ',' + key + '\n'
                keyWordCount[key] = keyWordCount.get(key, 0) + 1

    # 补全格式
    for num, line in enumerate(source_file_body):
        data = line.strip().split(',')
        if len(data) == TOTALCOLUNM + 1:
            continue
        source_file_body[num] = source_file_body[num].strip() + ',' + '' + '\n'

    result_file_head = u.create_file_head(SOURCE_FILE, 'right', [u.gbk_2_utf8(columnName)])
    u.create_result_file(RESULT_FILE, result_file_head, source_file_body)

    KEYWORD_FILE = LABELWORD.split('.')[0] + '统计.txt'
    u.writeDictFile(KEYWORD_FILE, keyWordCount)  # 输出统计结果
Пример #4
0

def createPattern(match_words):
    """Build a regex alternation matching any text that contains a keyword.

    Args:
        match_words: An indexable pair of keyword sequences; only index 0 and
            index 1 are read. Each keyword is stripped of surrounding
            whitespace and wrapped as ``.*keyword.*``. Keywords are inserted
            verbatim, so regex metacharacters keep their regex meaning.

    Returns:
        The wrapped keywords joined with ``|``, those from the first sequence
        before those from the second. An empty string if there are none.
    """
    wrapped = []
    for group in (match_words[0], match_words[1]):
        for raw_word in group:
            word = raw_word.strip()
            # A blank keyword would turn into '.*.*' and match every row.
            if word:
                wrapped.append('.*' + word + '.*')
    # Joining one flat list never produces a leading '|'. Concatenating the
    # two groups separately would, whenever the first group is empty, and that
    # empty alternative matches every row.
    return '|'.join(wrapped)


if __name__ == '__main__':
    # Load the data rows and then the header of the source file.
    source_file_body = u.create_file_body(SOURCE_FILE)
    source_file_head = u.create_file_head(SOURCE_FILE)

    # Turn the keyword file into a single compiled regex.
    match_words = u.create_match_words(MATCH_FILE)
    pattern = createPattern(match_words)
    p = re.compile(pattern)

    # Route every row: rows whose configured column matches go to resultFile,
    # all other rows go to removeFile.
    for line in source_file_body:
        content = u.create_content(line, COLUMN)
        bucket = resultFile if p.match(content) else removeFile
        bucket.append(line)

    # Output names are derived from the source file name plus a suffix.
    removeFileName, resultFileName = None, getFileName(SOURCE_FILE, '-含关键词.csv')
    removeFileName = getFileName(SOURCE_FILE, '-不含关键词.csv')
Пример #5
0
    for key, value in res.items():
        rowNum_list = value[2:].split(",")
        if len(rowNum_list) >= NUMBER:
            for num in rowNum_list:
                remove_list.append(num)
        else:
            for num in rowNum_list:
                save_list.append(num)


if __name__ == "__main__":

    try:
        print u.utf8_2_gbk('开始执行')
        result_file_head = u.create_file_head(SOURCE_FILE)  # 文件标题
        result_file_body = u.create_file_body(SOURCE_FILE)  # 文件内容
        factory(result_file_body)  # 构造输出文件
        for num in save_list:
            save_file_list.append(result_file_body[int(num) - 1])

        for num in remove_list:
            remove_file_list.append(result_file_body[int(num) - 1])

        print u.utf8_2_gbk(SAVE_FILE + '行数:' + str(len(save_file_list)))
        print u.utf8_2_gbk(REMOVE_FILE + '行数:' + str(len(remove_file_list)))
        u.create_result_file(REMOVE_FILE, result_file_head,
                             remove_file_list)  # 符合条件的输出文件(大于等于101次)
        u.create_result_file(SAVE_FILE, result_file_head,
                             save_file_list)  # 不符合条件的输出文件
    except:
Пример #6
0
# coding=utf-8
import sys
import re
import util as u
import os

'''
功能说明:将指定路径的文件合并成一个文件(文件格式为csv)
'''

##################参数说明#################
FILE_PATH = r'G:\merge_n_file\data'  # 文件路径
RESULT_FILE = 'result.csv'  # 合并之后的文件
RESULT_HEAD = r'G:\merge_n_file\data\3779_利鑫-999道私房菜.csv'  # 指定合并后的文件标题基准
##########################################
file_dict = {}

if __name__ == '__main__':

    # Load the body of every path returned by u.GetFileList, keyed by the
    # GBK-encoded path.
    file_list = u.GetFileList(u.utf8_2_gbk(FILE_PATH), [])
    for f in file_list:
        file_dict[f.encode('gbk')] = u.create_file_body(f.encode('utf-8'))

    # open() replaces the Python-2-only file() builtin, and the with-block
    # closes the output even if reading the header or a write raises.
    with open(RESULT_FILE, 'w+') as result_file:
        # The merged file gets the header of the reference file RESULT_HEAD.
        result_file_head = u.create_file_head(RESULT_HEAD)
        result_file.write(result_file_head)

        # Only the bodies are written, so iterate the values directly.
        for body in file_dict.values():
            result_file.writelines(body)
Пример #7
0
行数----1000----10000----100000----1000000--
耗时-----2s------11s------129s------1432s---
内存----0.3mb----3mb------33mb------400mb---
"""
__author__ = "liangzhicheng"

SOURCENAME, SOURCEPATH = u.getFirstFile('csv')
# Joined keywords -> "0,<row>,<row>,..." string of 1-based row numbers.
cluster = {}
result_file_body = []
# Removed from the text before keyword extraction: every \w character plus a
# few ASCII punctuation marks and the space. Written as a raw string so that
# \w is not an invalid string escape.
pattern = re.compile(r"\w|[/.,/#@$%^& ]")
# Cluster index -> number of rows in that cluster.
count_file_dict = {}

if __name__ == '__main__':

    try:
        source_file_head = u.create_file_head(SOURCEPATH, 'left', ['类型'])
        source_file_body = u.create_file_body(SOURCEPATH)
        print u.utf8_2_gbk('开始执行聚类')
        for num, line in enumerate(source_file_body):
            content = re.sub(pattern, '', u.create_content(line, COLUMN))
            if len(content) <= 20:
                keywords = jieba.analyse.extract_tags(content, topK=2)
            else:
                keywords = jieba.analyse.extract_tags(content, topK=TOPKET)
            keywords.sort()
            key = ','.join(keywords)
            cluster[key] = str(cluster.get(key, 0)) + "," + str(num + 1)
        print u.utf8_2_gbk('聚类完成,生成输出文件')
        for num, value in enumerate(cluster.itervalues()):
            cluster_list = value[2:].split(',')
            count_file_dict[num] = len(cluster_list)
Пример #8
0
    content = ''
    f = open(u.utf8_2_gbk(fileName))
    for line in f:
        if line[:-1].strip():
            if ACCURATE:
                content += '|' + '^' + line.strip() + '$'
            else:
                content += '|' + '.*' + line.strip() + '.*'
    f.close()
    return content[1:].lower()


if __name__ == '__main__':
    try:
        source_file_body = u.create_file_body(SOURCEPATH)
        source_file_head = u.create_file_head(SOURCEPATH)
        m = createPattern(LABELPATH)
        print m
        print '===============>>' + u.utf8_2_gbk(
            '若乱码,匹配词文件请使用gbk编码') + '<<==================='
        p = re.compile(m)
        print u.utf8_2_gbk('数据源文件行数:') + str(len(source_file_body))
        for line in source_file_body:
            content = u.create_content(line, COLUMN).lower()
            if p.match(content):
                result_list.append(line)
            else:
                remove_list.append(line)
        print u.utf8_2_gbk('不包含关键词行数:') + str(len(remove_list))
        print u.utf8_2_gbk('包含关键词行数:') + str(len(result_list))
        u.create_result_file(u.changeFileName(SOURCE, '-含关键词.csv'),
Пример #9
0
将结果按照每个cluster的大小排序,大的在前面
性能:
行数----1000----10000----100000----1000000--
耗时-----2s------11s------129s------1432s---
内存----0.3mb----3mb------33mb------400mb---
"""
__author__ = "liangzhicheng"

# Joined keywords -> "0,<row>,<row>,..." string of 1-based row numbers.
cluster = {}
result_file_body = []
# Removed from the text before keyword extraction: every \w character plus a
# few ASCII punctuation marks and the space. Written as a raw string so that
# \w is not an invalid string escape.
pattern = re.compile(r"\w|[/.,/#@$%^& ]")
# One "<cluster index>\t<row count>\n" line per cluster.
count_file_list = []

if __name__ == '__main__':

    source_file_head = u.create_file_head(SOURCE_FILE, 'left', ['类型'])
    source_file_body = u.create_file_body(SOURCE_FILE)

    # Group rows by their extracted keywords.
    for num, line in enumerate(source_file_body):
        content = pattern.sub('', u.create_content(line, COLUMN))
        # Cleaned text of length <= 20 gets 2 tags, anything longer TOPKET.
        tag_count = 2 if len(content) <= 20 else TOPKET
        keywords = sorted(jieba.analyse.extract_tags(content, topK=tag_count))
        key = ','.join(keywords)
        # Values look like "0,<row>,<row>,..."; the leading "0," comes from
        # the default of the first lookup and is sliced off below.
        cluster[key] = '%s,%s' % (cluster.get(key, 0), num + 1)

    # Record "<cluster index>\t<row count>" for every cluster.
    for num, value in enumerate(cluster.itervalues()):
        cluster_list = value[2:].split(',')
        count_file_list.append('%s\t%s\n' % (num, len(cluster_list)))