Пример #1
0
    pattern = ''
    cmatch_words = map(lambda line: '.*' + line.strip() + '.*', match_words[0])
    ematch_words = map(lambda line: '.*' + line.strip() + '.*', match_words[1])
    if len(ematch_words) == 0:
        pattern = '|'.join(cmatch_words)
    else:
        pattern = '|'.join(cmatch_words) + '|' + '|'.join(ematch_words)
    return pattern


if __name__ == '__main__':
    # Script entry point: split the rows of SOURCE_FILE into two CSV files
    # depending on whether the selected column matches the keyword pattern.
    source_file_body = u.create_file_body(SOURCE_FILE)
    source_file_head = u.create_file_head(SOURCE_FILE)

    keyword_regex = re.compile(createPattern(u.create_match_words(MATCH_FILE)))

    for row in source_file_body:
        hit = keyword_regex.match(u.create_content(row, COLUMN))
        # Matching rows are collected in resultFile, all others in removeFile.
        (resultFile if hit else removeFile).append(row)

    # Both output names are derived from the source file name.
    matched_name = getFileName(SOURCE_FILE, '-含关键词.csv')
    unmatched_name = getFileName(SOURCE_FILE, '-不含关键词.csv')

    u.create_result_file(matched_name, source_file_head, resultFile)
    u.create_result_file(unmatched_name, source_file_head, removeFile)
Пример #2
0
        labelType, labelNum = createMoreMatch(LABEL_PATH)
        matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]])
        print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')
        for key, value in labelType.items():
            count += 1
            print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
            words = value.strip().split('|')
            if len(words) == 1:
                c = createPattern(words[0], ACCURATE)
                p = re.compile(c)
                for num, line in enumerate(source_file_body):
                    content = u.create_content(line, COLUNM)
                    if p.match(content):
                        source_file_body[num] = source_file_body[num].strip() + key + '|' + '\n'
                        keyWordCount[key] = keyWordCount.get(key, 0) + 1
            if len(words) == 2:
                c = createPattern(words[0], ACCURATE)
                f = createPattern(words[1], ACCURATE)
                cp = re.compile(c)
                fp = re.compile(f)
                for num, line in enumerate(source_file_body):
                    content = u.create_content(line, COLUNM)
                    if cp.match(content) and not fp.match(content):
                        source_file_body[num] = source_file_body[num].strip() + key + '|' + '\n'
                        keyWordCount[key] = keyWordCount.get(key, 0) + 1

        u.create_result_file(u.setFileName(SOURCE_FILE, LABEL_FILE), matchHead, source_file_body)
        u.writeDictFile(u.changeFileName(combinefileName(SOURCE_FILE, LABEL_FILE), '统计.txt'), keyWordCount, 1)
    except:
        traceback.print_exc(file=open('error.txt', 'w+'))
Пример #3
0
    head = linecache.getline(u.utf8_2_gbk(SOURCE_FILE), 1).strip()
    TOTALCOLUNM = len(head.split(','))
    print u.utf8_2_gbk('标签词个数:') + u.printDictLen(labelWordp)

    source_file_body = u.create_file_body(SOURCE_FILE)
    for key, value in labelWordp.items():
        i += 1
        print u.utf8_2_gbk('当前执行到{0}个'.format(i))
        for num, line in enumerate(source_file_body):
            data = line.strip().split(',')
            if len(data) == TOTALCOLUNM + 1:
                continue
            content = data[COLUNM - 1]
            p = re.compile(value)
            if p.match(content):
                source_file_body[num] = source_file_body[num].strip() + ',' + key + '\n'
                keyWordCount[key] = keyWordCount.get(key, 0) + 1

    # 补全格式
    for num, line in enumerate(source_file_body):
        data = line.strip().split(',')
        if len(data) == TOTALCOLUNM + 1:
            continue
        source_file_body[num] = source_file_body[num].strip() + ',' + '' + '\n'

    result_file_head = u.create_file_head(SOURCE_FILE, 'right', [u.gbk_2_utf8(columnName)])
    u.create_result_file(RESULT_FILE, result_file_head, source_file_body)

    KEYWORD_FILE = LABELWORD.split('.')[0] + '统计.txt'
    u.writeDictFile(KEYWORD_FILE, keyWordCount)  # 输出统计结果
Пример #4
0
# Name and path of the CSV file this script splits, as returned by
# u.getFirstFile('csv') (presumably the first CSV it finds -- confirm in u).
FILE_NAME, FILE_PATH = u.getFirstFile('csv')


def create_file(fileName):
    """Load a text file and separate its first line from the rest.

    The file is read with ``linecache.getlines`` after the name has been
    passed through ``u.utf8_2_gbk``.

    Returns:
        A three-element list ``[head, body, body_len]``: the first line, the
        list of remaining lines, and the number of remaining lines.

    Raises:
        IndexError: if no lines were read at all.
    """
    lines = linecache.getlines(u.utf8_2_gbk(fileName))
    head, body = lines[0], lines[1:]
    return [head, body, len(body)]


def rows_per_part(total_rows, parts):
    """Return how many rows each output part holds.

    This is the ceiling of ``total_rows / parts``: the smallest size for which
    ``parts`` consecutive slices cover every row. When ``total_rows`` is an
    exact multiple of ``parts`` the rows are therefore spread evenly instead
    of leaving the last part short or empty.

    Raises:
        ZeroDivisionError: if ``parts`` is 0.
    """
    return (total_rows + parts - 1) // parts


if __name__ == '__main__':
    # Script entry point: split the body of FILE_PATH into COUNT consecutive
    # CSV files, each written with the original header line.
    try:
        fileHead, fileBody, fileBLen = create_file(FILE_PATH)
        middle = rows_per_part(fileBLen, COUNT)
        for num in range(COUNT):
            left = num * middle
            right = (num + 1) * middle
            # Slicing past the end of the list is safe: it just yields fewer
            # (possibly zero) rows for the trailing parts.
            u.create_result_file(u.changeFileName(FILE_NAME, '-' + str(num) + '.csv'), fileHead,
                                 fileBody[left:right])
    except Exception:
        # Report the failure and keep the console open so the user can read it.
        traceback.print_exc()
        print('==============================================================')
        print(u.utf8_2_gbk('运行出错'))
        print(u.utf8_2_gbk('常见错误'))
        print(u.utf8_2_gbk('IndexError: list index out of range'))
        print(u.utf8_2_gbk('匹配列选择错误或source文件夹为空或label文件夹为空'))
        print('==============================================================')
        raw_input('Press Enter to exit...')
Пример #5
0

if __name__ == "__main__":
    # Script entry point: distribute the rows of SOURCE_FILE over SAVE_FILE
    # and REMOVE_FILE according to the row numbers in save_list / remove_list.
    try:
        print(u.utf8_2_gbk('开始执行'))
        result_file_head = u.create_file_head(SOURCE_FILE)  # file header
        result_file_body = u.create_file_body(SOURCE_FILE)  # file content
        # Original note: "build the output files" -- presumably this is what
        # fills save_list / remove_list; confirm in factory().
        factory(result_file_body)

        # The lists hold 1-based row numbers, hence the "- 1" when indexing.
        save_file_list.extend(
            result_file_body[int(row_no) - 1] for row_no in save_list)
        remove_file_list.extend(
            result_file_body[int(row_no) - 1] for row_no in remove_list)

        print(u.utf8_2_gbk(SAVE_FILE + '行数:' + str(len(save_file_list))))
        print(u.utf8_2_gbk(REMOVE_FILE + '行数:' + str(len(remove_file_list))))
        # Original note: rows meeting the condition (>= 101 occurrences).
        u.create_result_file(REMOVE_FILE, result_file_head, remove_file_list)
        # Original note: rows not meeting the condition.
        u.create_result_file(SAVE_FILE, result_file_head, save_file_list)
    except:
        # Report the failure and keep the console open so the user can read it.
        traceback.print_exc()
        print('==============================================================')
        print(u.utf8_2_gbk('运行出错'))
        print(u.utf8_2_gbk('常见错误'))
        print(u.utf8_2_gbk('IndexError: list index out of range'))
        print(u.utf8_2_gbk('匹配列选择错误或source文件夹为空或label文件夹为空'))
        print('==============================================================')
        raw_input('Press Enter to exit...')
Пример #6
0
def createPattern(fileName):
    """Build a lower-cased regex alternation from a keyword file.

    Every non-blank line of the file becomes one alternative of the form
    ``.*<keyword>.*`` (the keyword being the line stripped of surrounding
    whitespace). The alternatives are joined with ``|`` and the result is
    lower-cased as a whole, so any escape sequences inside keywords are
    lower-cased too. Keywords are inserted verbatim: regex metacharacters in
    the file keep their regex meaning.

    Args:
        fileName: path of the keyword file; it is passed through
            ``u.utf8_2_gbk`` before being opened.

    Returns:
        The pattern string, or ``''`` if the file has no non-blank lines.
    """
    # ``with`` guarantees the file is closed even if reading raises.
    with open(u.utf8_2_gbk(fileName)) as f:
        # Strip before testing for blankness. Slicing off the last character
        # instead would lose a keyword whose only non-blank character is the
        # last one on a final line that has no trailing newline.
        keywords = [line.strip() for line in f if line.strip()]
    return '|'.join('.*' + word + '.*' for word in keywords).lower()


if __name__ == '__main__':
    # Script entry point: case-insensitively split the rows of SOURCE_FILE
    # into "contains a keyword" and "contains no keyword" CSV files.
    source_file_body = u.create_file_body(SOURCE_FILE)
    source_file_head = u.create_file_head(SOURCE_FILE)
    m = createPattern(MATCH_FILE)
    # Echo the pattern; the appended hint tells the user to save the keyword
    # file as GBK if the echoed text looks garbled.
    print(m + '===>>' + u.utf8_2_gbk('若乱码,匹配词文件请使用gbk编码'))
    p = re.compile(m)
    print(u.utf8_2_gbk('数据源文件行数:') + str(len(source_file_body)))
    for row in source_file_body:
        # The pattern is lower-cased, so compare against lower-cased content.
        cell = u.create_content(row, COLUMN).lower()
        bucket = result_list if p.match(cell) else remove_list
        bucket.append(row)
    print(u.utf8_2_gbk('不包含关键词行数:') + str(len(remove_list)))
    print(u.utf8_2_gbk('包含关键词行数:') + str(len(result_list)))
    matched_name = u.changeFileName(SOURCE_FILE, '-含关键词.csv')
    u.create_result_file(matched_name, source_file_head, result_list)
    unmatched_name = u.changeFileName(SOURCE_FILE, '-不含关键词.csv')
    u.create_result_file(unmatched_name, source_file_head, remove_list)
    raw_input('Press Enter to exit...')
Пример #7
0
        print u.utf8_2_gbk('开始执行聚类')
        for num, line in enumerate(source_file_body):
            content = re.sub(pattern, '', u.create_content(line, COLUMN))
            if len(content) <= 20:
                keywords = jieba.analyse.extract_tags(content, topK=2)
            else:
                keywords = jieba.analyse.extract_tags(content, topK=TOPKET)
            keywords.sort()
            key = ','.join(keywords)
            cluster[key] = str(cluster.get(key, 0)) + "," + str(num + 1)
        print u.utf8_2_gbk('聚类完成,生成输出文件')
        for num, value in enumerate(cluster.itervalues()):
            cluster_list = value[2:].split(',')
            count_file_dict[num] = len(cluster_list)
            for n in cluster_list:
                result_file_body.append(
                    str(num) + ',' + source_file_body[int(n) - 1])
        u.create_result_file(u.changeFileName(SOURCENAME, '-聚类.csv'),
                             source_file_head, result_file_body)
        u.writeDictFile(u.changeFileName(SOURCENAME, '-聚类统计.txt'),
                        count_file_dict, 1)
    except:
        traceback.print_exc()
        print '=============================================================='
        print u.utf8_2_gbk('运行出错')
        print u.utf8_2_gbk('常见错误')
        print u.utf8_2_gbk('IndexError: list index out of range')
        print u.utf8_2_gbk('匹配列选择错误或source文件夹为空或label文件夹为空')
        print '=============================================================='
        raw_input('Press Enter to exit...')
Пример #8
0
REMOVE_FILE = "remove.csv"  # output file (rows containing a keyword)
FILETER_FILE = "filter.txt"  # filter-word file

################################################
# Row buffers filled by the __main__ block: rows whose content matches the
# filter pattern go to remove_list, all other rows go to result_list.
result_list = []
remove_list = []

if __name__ == "__main__":
    # Script entry point: write rows matching the filter-word pattern to
    # REMOVE_FILE and every other row to RESULT_FILE.
    result_file_head = u.create_file_head(SOURCE_FILE)
    result_file_body = u.create_file_body(SOURCE_FILE)

    words_file = u.create_match_words(FILETER_FILE)
    # The first two entries are used as the Chinese and English word lists.
    chiness_words, english_words = words_file[0], words_file[1]
    pattern = u.build_pattern(chiness_words, english_words)

    print('start')
    for row in result_file_body:
        hit = pattern.match(u.create_content(row, COLUMN))
        (remove_list if hit else result_list).append(row)
    print('end')

    u.create_result_file(RESULT_FILE, result_file_head, result_list)
    u.create_result_file(REMOVE_FILE, result_file_head, remove_list)
Пример #9
0
# Maps a sorted, comma-joined keyword signature to a comma-separated string of
# 1-based row numbers; the __main__ loop prefixes each string with "0,".
cluster = {}
# Output rows, each prefixed with its cluster number.
result_file_body = []
# Characters removed from the text before keyword extraction: every \w
# character plus the listed punctuation and the space. Written as a raw string
# so that "\w" reaches the regex engine instead of being read as a string
# escape.
pattern = re.compile(r"\w|[/.,/#@$%^& ]")
# Lines of the form "<cluster number>\t<cluster size>\n" for the count file.
count_file_list = []

if __name__ == '__main__':
    # Script entry point: group the source rows by their extracted keywords,
    # then write the rows prefixed with a cluster number plus a count file.
    source_file_head = u.create_file_head(SOURCE_FILE, 'left', ['类型'])
    source_file_body = u.create_file_body(SOURCE_FILE)

    for idx, row in enumerate(source_file_body):
        # Remove everything ``pattern`` matches before extracting keywords.
        content = pattern.sub('', u.create_content(row, COLUMN))
        # Texts of length <= 20 get two keywords, longer ones get TOPKET.
        top_k = 2 if len(content) <= 20 else TOPKET
        keywords = jieba.analyse.extract_tags(content, topK=top_k)
        keywords.sort()
        key = ','.join(keywords)
        # Row numbers accumulate as "0,<n1>,<n2>,..."; the leading "0" is the
        # default returned by the very first lookup of a key.
        cluster[key] = str(cluster.get(key, 0)) + "," + str(idx + 1)

    for cluster_no, members in enumerate(cluster.itervalues()):
        # Skip the leading "0," placeholder before splitting.
        row_numbers = members[2:].split(',')
        count_file_list.append(
            str(cluster_no) + '\t' + str(len(row_numbers)) + '\n')
        result_file_body.extend(
            str(cluster_no) + ',' + source_file_body[int(n) - 1]
            for n in row_numbers)

    u.create_result_file(RESULT_FILE, source_file_head, result_file_body)
    u.create_result_file(COUNT_FILE, ['type\tcount\n'], count_file_list)