示例#1
0
def createMoreMatch(fileName):  # build the matching keywords
    """Group a tab-separated label file into ``{label: word spec}``.

    Every non-blank line is split on tabs; field 0 is the label and field 1
    its words.  Words of lines that share a label are concatenated with
    ``,``.  A merged value is then split on ``|``:

    * one part  -> each comma-separated word is passed through ``changeStr``;
    * two parts -> both sides are processed the same way and re-joined with
      ``|`` (presumably "keywords|filter words" -- confirm with the caller);
    * more than two parts -> the value is left as merged, minus the leading
      comma that the merge step adds.

    :param fileName: path of the label file (converted with ``u.utf8_2_gbk``).
    :return: ``[typeCount, count]`` -- the mapping and its number of labels.
    :raises IndexError: when a non-blank line has no tab-separated second field.
    """
    typeCount = {}
    # The with-block releases the handle even if a malformed line raises.
    with open(u.utf8_2_gbk(fileName), 'rb') as label_file:
        for line in label_file:
            if not line[:-1].strip():
                continue
            try:
                content = u.utf8_2_gbk(line).strip().split('\t')
            except Exception:
                # Fall back to the raw line when it cannot be transcoded.
                content = line.strip().split('\t')
            typeCount[content[0]] = typeCount.get(content[0], '') + ',' + content[1]

    # Snapshot the items so values can be reassigned while looping.
    for key, value in list(typeCount.items()):
        parts = value[1:].split('|')  # [1:] drops the leading ','
        if len(parts) == 2:
            newk = [changeStr(word) for word in parts[0].split(',')]
            newf = [changeStr(word) for word in parts[1].split(',')]
            typeCount[key] = ','.join(newk) + '|' + ','.join(newf)
        elif len(parts) == 1:
            newk = [changeStr(word) for word in parts[0].split(',')]
            typeCount[key] = ','.join(newk)
    # u.writeDictFile('match.txt', typeCount, 0)  # dump the match words to a file
    return [typeCount, len(typeCount)]
示例#2
0
def createPattern(fileName):
    """Build one lower-cased regex alternation from a keyword file.

    Each non-blank line of ``fileName`` becomes ``.*<line>.*`` and the pieces
    are joined with ``|``.  Lines are inserted unescaped, so regex
    metacharacters in the file are interpreted as regex syntax, and the final
    ``lower()`` applies to the whole pattern text.

    :param fileName: path of the keyword file (converted with ``u.utf8_2_gbk``).
    :return: the pattern string, or ``''`` when the file has no usable line.
    """
    # The with-block closes the file even when reading fails part-way.
    with open(u.utf8_2_gbk(fileName)) as f:
        pieces = ['.*' + line.strip() + '.*' for line in f if line[:-1].strip()]
    return '|'.join(pieces).lower()
示例#3
0
def createContent(fileName, rows):
    """Read the data rows of ``fileName`` into lower-cased output lines.

    The first line is skipped.  Every following line is passed to
    ``u.create_content(line, rows)``; the result is lower-cased and
    terminated with ``',\\n'``.

    :param fileName: path of the source file (converted with ``u.utf8_2_gbk``).
    :param rows: forwarded unchanged as the second argument of
        ``u.create_content``.
    :return: ``[result, count]`` -- the list of lines and its length.
    """
    # The with-block closes the file even if u.create_content raises.
    with open(u.utf8_2_gbk(fileName), 'rb') as f:
        # Skip the first line; the default keeps an empty file from raising
        # StopIteration and yields an empty result instead.
        next(f, None)
        result = [u.create_content(line, rows).lower() + ',' + '\n' for line in f]
    return [result, len(result)]
示例#4
0
def createMoreMatch(fileName):
    """Group a tab-separated label file into ``{label: 'w1,w2,...'}``.

    Every non-blank line is split on tabs; field 0 is the label and field 1
    its words.  Words of lines sharing a label are joined with ``,`` in file
    order and the joined value is lower-cased.

    :param fileName: path of the label file (converted with ``u.utf8_2_gbk``).
    :return: ``[typeCount, count]`` -- the mapping and its number of labels.
    :raises IndexError: when a non-blank line has no tab-separated second field.
    """
    grouped = {}
    # The original never closed this handle; the with-block always does.
    with open(u.utf8_2_gbk(fileName), 'rb') as f:
        for line in f:
            if not line[:-1].strip():
                continue
            content = line.strip().split('\t')
            grouped.setdefault(content[0], []).append(content[1])

    typeCount = {}
    for key, words in grouped.items():
        typeCount[key] = ','.join(words).lower()
    return [typeCount, len(typeCount)]
示例#5
0
    result = []
    count = 0
    f = open(u.utf8_2_gbk(fileName), 'rb')
    f.next()
    for line in f:
        count += 1
        result.append(u.create_content(line, rows).lower() + ',' + '\n')
    f.close()
    return [result, count]


if __name__ == '__main__':

    # Load the source rows and the label -> word-spec mapping.
    source_file_body, totalRows = createContent(SOURCE_FILE, COLUNM)
    labelType, labelNum = createMoreMatch(LABEL_FILE)
    # Two-column header line built from a fixed first title and the label
    # file's base name (presumably the output header -- its use is not visible here).
    matchHead = u.utf8_2_gbk('内容' + ',' + LABEL_FILE.split('.')[0] + '\n')

    print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')

    for key, value in labelType.items():
        # NOTE(review): `count` and `keyWordCount` are not initialised in the
        # visible part of this script -- confirm they exist at module level.
        count += 1
        print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
        words = value.strip().split('|')
        # A single part means the value contains no '|' separator.
        if len(words) == 1:
            c = createPattern(words[0])
            p = re.compile(c)
            for num, line in enumerate(source_file_body):
                if p.match(line):
                    # Append "<label>|" to the matching row and tally the hit.
                    source_file_body[num] = source_file_body[num].strip(
                    ) + key + '|' + '\n'
                    keyWordCount[key] = keyWordCount.get(key, 0) + 1
示例#6
0
#####################################################
count_dict = {}
file_dict = {}


def rm_repeat(file_list):
    """Return the distinct values extracted from ``file_list``.

    Each line is reduced to ``u.create_content(line, 1)``, double quotes are
    removed and a newline is appended.  Duplicates collapse because the
    values pass through a set, so the order of the returned list is not the
    input order.
    """
    unique_rows = set(
        u.create_content(row, 1).replace('"', '') + '\n' for row in file_list
    )
    return list(unique_rows)


if __name__ == '__main__':

    # Step 1: de-duplicate the extracted values of every file found under
    # FILE_PATH, keyed by the GBK-encoded file name.
    file_list = u.GetFileList(u.utf8_2_gbk(FILE_PATH), [])
    for f in file_list:
        file_dict[f.encode('gbk')] = rm_repeat(
            u.create_file_body(f.encode('utf-8')))

    # Step 2: concatenate all per-file value lists into total.csv.
    total_file = file('total.csv', 'w+')
    for key, value in file_dict.items():
        total_file.writelines(value)
    total_file.close()

    # Step 3: tally how often each value occurs in total.csv.  Each file was
    # de-duplicated first, so a tally is roughly the number of files that
    # contain the value.
    count_file = open('total.csv', 'rb')
    for line in count_file:
        content = u.create_content(line, 1)
        count_dict[content] = count_dict.get(content, 0) + 1

    result_file = file(RESULT_FILE, 'w+')
示例#7
0
def create_file(fileName):
    """Split a file into its first line and the remaining lines.

    The lines are read through ``linecache.getlines`` from the path returned
    by ``u.utf8_2_gbk(fileName)``.

    :return: ``[head, body, body_len]`` -- first line, list of the other
        lines, and the length of that list.
    :raises IndexError: when no lines are returned for the file.
    """
    all_lines = linecache.getlines(u.utf8_2_gbk(fileName))
    head, body = all_lines[0], all_lines[1:]
    return [head, body, len(body)]
示例#8
0
FILE_NAME, FILE_PATH = u.getFirstFile('csv')


def create_file(fileName):
    """Split a file into its first line and the remaining lines.

    The lines are read through ``linecache.getlines`` from the path returned
    by ``u.utf8_2_gbk(fileName)``.

    :return: ``[head, body, body_len]`` -- first line, list of the other
        lines, and the length of that list.
    :raises IndexError: when no lines are returned for the file.
    """
    all_lines = linecache.getlines(u.utf8_2_gbk(fileName))
    head, body = all_lines[0], all_lines[1:]
    return [head, body, len(body)]


# Script entry point: split the body of FILE_PATH into COUNT consecutive
# slices and write each slice, preceded by the original first line, to
# "<FILE_NAME>-<n>.csv" via u.create_result_file.
if __name__ == '__main__':
    try:
        fileHead, fileBody, fileBLen = create_file(FILE_PATH)
        # Rows per output file.  Under Python 2 `/` on two ints is integer
        # division; the +1 makes COUNT slices cover every row.
        middle = (fileBLen / COUNT) + 1
        for num in range(COUNT):
            left = num * middle
            right = (num + 1) * middle
            # Slicing past the end of the list is safe; trailing files may
            # simply receive fewer (or no) rows.
            u.create_result_file(u.changeFileName(FILE_NAME, '-' + str(num) + '.csv'), fileHead,
                                 fileBody[left:right])
    except:
        # Any failure: print the traceback plus a fixed list of common
        # causes, then wait for Enter before exiting.
        traceback.print_exc()
        print '=============================================================='
        print u.utf8_2_gbk('运行出错')
        print u.utf8_2_gbk('常见错误')
        print u.utf8_2_gbk('IndexError: list index out of range')
        print u.utf8_2_gbk('匹配列选择错误或source文件夹为空或label文件夹为空')
        print '=============================================================='
        raw_input('Press Enter to exit...')
示例#9
0
COLUNM = 13  # column to match
LABELWORD = '平台分类.txt'  # file holding the match keywords
##########################################
# Loop counter; the main block increments it and prints it per label.
i = 0
# Module-level dict; presumably per-keyword hit counts -- its use is not
# visible in this part of the script.
keyWordCount = {}


def writeFileList(list, fileName):
    """Write the strings in ``list`` to ``fileName``, replacing its content.

    The strings are written back to back with no separator added, so each
    one must carry its own line ending.

    :param list: iterable of strings to write (the name shadows the builtin
        but is kept because it is part of the existing signature).
    :param fileName: path of the file to create or truncate.
    """
    # open() replaces the Python-2-only file() constructor, and the
    # with-block closes the handle even when writelines() raises.
    with open(fileName, 'w+') as f:
        f.writelines(list)


if __name__ == '__main__':

    # Base name of the label file (first element of u.GetFileNameAndExt).
    columnName = u.GetFileNameAndExt(u.utf8_2_gbk(LABELWORD))[0]
    labelWords = u.create_match_words(LABELWORD)
    labelWordp = u.build_match_label(labelWords)
    # Number of comma-separated fields in the first line of the source file.
    head = linecache.getline(u.utf8_2_gbk(SOURCE_FILE), 1).strip()
    TOTALCOLUNM = len(head.split(','))
    print u.utf8_2_gbk('标签词个数:') + u.printDictLen(labelWordp)

    source_file_body = u.create_file_body(SOURCE_FILE)
    for key, value in labelWordp.items():
        i += 1
        print u.utf8_2_gbk('当前执行到{0}个'.format(i))
        for num, line in enumerate(source_file_body):
            data = line.strip().split(',')
            # Rows with exactly one more field than the header are skipped
            # (presumably they already carry a label -- confirm).
            if len(data) == TOTALCOLUNM + 1:
                continue
            # COLUNM is 1-based, hence the -1.
            content = data[COLUNM - 1]
示例#10
0
        res[content] = str(res.get(content, 0)) + "," + str(count)

    for key, value in res.items():
        rowNum_list = value[2:].split(",")
        if len(rowNum_list) >= NUMBER:
            for num in rowNum_list:
                remove_list.append(num)
        else:
            for num in rowNum_list:
                save_list.append(num)


if __name__ == "__main__":

    try:
        print u.utf8_2_gbk('开始执行')
        result_file_head = u.create_file_head(SOURCE_FILE)  # 文件标题
        result_file_body = u.create_file_body(SOURCE_FILE)  # 文件内容
        factory(result_file_body)  # 构造输出文件
        for num in save_list:
            save_file_list.append(result_file_body[int(num) - 1])

        for num in remove_list:
            remove_file_list.append(result_file_body[int(num) - 1])

        print u.utf8_2_gbk(SAVE_FILE + '行数:' + str(len(save_file_list)))
        print u.utf8_2_gbk(REMOVE_FILE + '行数:' + str(len(remove_file_list)))
        u.create_result_file(REMOVE_FILE, result_file_head,
                             remove_file_list)  # 符合条件的输出文件(大于等于101次)
        u.create_result_file(SAVE_FILE, result_file_head,
                             save_file_list)  # 不符合条件的输出文件
示例#11
0
def createPattern(fileName):
    """Build one lower-cased regex alternation from a keyword file.

    Each non-blank line of ``fileName`` becomes ``.*<line>.*`` and the pieces
    are joined with ``|``.  Lines are inserted unescaped, so regex
    metacharacters in the file are interpreted as regex syntax, and the final
    ``lower()`` applies to the whole pattern text.

    :param fileName: path of the keyword file (converted with ``u.utf8_2_gbk``).
    :return: the pattern string, or ``''`` when the file has no usable line.
    """
    # The with-block closes the file even when reading fails part-way.
    with open(u.utf8_2_gbk(fileName)) as f:
        pieces = ['.*' + line.strip() + '.*' for line in f if line[:-1].strip()]
    return '|'.join(pieces).lower()


# Script entry point: split the rows of SOURCE_FILE into those whose COLUMN
# value matches a keyword from MATCH_FILE and those that do not, and write
# each group to its own CSV next to the source file name.
if __name__ == '__main__':
    source_file_body = u.create_file_body(SOURCE_FILE)
    source_file_head = u.create_file_head(SOURCE_FILE)
    m = createPattern(MATCH_FILE)
    # Echo the pattern so the user can spot an encoding problem in the
    # keyword file.
    print m + '===>>' + u.utf8_2_gbk('若乱码,匹配词文件请使用gbk编码')
    p = re.compile(m)
    print u.utf8_2_gbk('数据源文件行数:') + str(len(source_file_body))
    # NOTE(review): result_list and remove_list are not initialised in the
    # visible part of this script -- confirm they exist at module level.
    for line in source_file_body:
        content = u.create_content(line, COLUMN).lower()
        # match() anchors at the start of the string; the pattern pieces
        # begin with '.*', so a keyword may sit anywhere on the first line
        # of the content.
        if p.match(content):
            result_list.append(line)
        else:
            remove_list.append(line)
    print u.utf8_2_gbk('不包含关键词行数:') + str(len(remove_list))
    print u.utf8_2_gbk('包含关键词行数:') + str(len(result_list))
    u.create_result_file(u.changeFileName(SOURCE_FILE, '-含关键词.csv'),
                         source_file_head, result_list)
    u.create_result_file(u.changeFileName(SOURCE_FILE, '-不含关键词.csv'),
                         source_file_head, remove_list)
    # Wait for Enter before the script ends.
    raw_input('Press Enter to exit...')
示例#12
0
OUTPUT_FILE = u.changeFileName(SOURCE, '-统计.csv')

i = 0
j = 0
result = dict()


def utf8_2_gbk(src):
    """Transcode a UTF-8 byte string to GBK.

    Characters that GBK cannot represent are dropped (``"ignore"``); invalid
    UTF-8 input still raises ``UnicodeDecodeError``.
    """
    decoded = src.decode("utf-8")
    return decoded.encode("gbk", "ignore")


if __name__ == "__main__":

    try:
        ResultWriter = file(u.utf8_2_gbk(OUTPUT_FILE), "w+")
        reader = open(u.utf8_2_gbk(SOURCEPATH), 'rb')
        count_file = file(u.utf8_2_gbk(COUNT_FILE), "w+")
        print u.utf8_2_gbk('开始执行')
        next(reader)

        for line in reader:
            content = line.split(DELIMITER)[COLUMN - 1].strip().decode(
                CODING, 'ignore')
            if PATTERNTYPE == 1:
                pattern = re.compile(r'' + patternTemp1)
            elif PATTERNTYPE == 2:
                pattern = re.compile(r'' + patternTemp2)
            elif PATTERNTYPE == 3:
                pattern = re.compile(r'' + patternTemp3)
            matches = pattern.findall(content)
示例#13
0
'''
功能说明:
取文件的前几行
'''
#################### parameters below ######################
LINE_NUM = 300  # number of lines
#################### parameters above ######################

FILE_NAME, FILE_PATH = u.getFirstFile('csv')

# Script entry point: copy the first LINE_NUM lines of FILE_PATH into
# "<FILE_NAME>-<LINE_NUM>.csv".
if __name__ == "__main__":

    try:
        RESULT_FILE = u.changeFileName(FILE_NAME, '-' + str(LINE_NUM) + '.csv')
        ResultWriter = file(u.utf8_2_gbk(RESULT_FILE), "w+")
        reader = open(u.utf8_2_gbk(FILE_PATH), 'rb')
        count = 0
        for line in reader:
            count = count + 1
            # Stop once LINE_NUM lines have been copied.
            if count > LINE_NUM:
                break
            # Surrounding whitespace is stripped and every copied line is
            # terminated with a bare '\n'.
            ResultWriter.write(line.strip() + '\n')
        ResultWriter.close()
        reader.close()
    except:
        # Any failure: print the traceback followed by a fixed list of
        # common causes.
        traceback.print_exc()
        print '=============================================================='
        print u.utf8_2_gbk('运行出错')
        print u.utf8_2_gbk('常见错误')
        print u.utf8_2_gbk('IndexError: list index out of range')
示例#14
0
        if line[:-1].strip():
            if ACCURATE:
                content += '|' + '^' + line.strip() + '$'
            else:
                content += '|' + '.*' + line.strip() + '.*'
    f.close()
    return content[1:].lower()


if __name__ == '__main__':
    try:
        source_file_body = u.create_file_body(SOURCEPATH)
        source_file_head = u.create_file_head(SOURCEPATH)
        m = createPattern(LABELPATH)
        print m
        print '===============>>' + u.utf8_2_gbk(
            '若乱码,匹配词文件请使用gbk编码') + '<<==================='
        p = re.compile(m)
        print u.utf8_2_gbk('数据源文件行数:') + str(len(source_file_body))
        for line in source_file_body:
            content = u.create_content(line, COLUMN).lower()
            if p.match(content):
                result_list.append(line)
            else:
                remove_list.append(line)
        print u.utf8_2_gbk('不包含关键词行数:') + str(len(remove_list))
        print u.utf8_2_gbk('包含关键词行数:') + str(len(result_list))
        u.create_result_file(u.changeFileName(SOURCE, '-含关键词.csv'),
                             source_file_head, result_list)
        u.create_result_file(u.changeFileName(SOURCE, '-不含关键词.csv'),
                             source_file_head, remove_list)
    except:
示例#15
0
FILE_PATH = sys.path[0] + '\\source'


def rm_repeat(file_list):
    """Return the distinct values extracted from ``file_list``.

    Each line is reduced to ``u.create_content(line, 1)``, double quotes are
    removed and a newline is appended.  Duplicates collapse because the
    values pass through a set, so the order of the returned list is not the
    input order.
    """
    unique_rows = set(
        u.create_content(row, 1).replace('"', '') + '\n' for row in file_list
    )
    return list(unique_rows)


if __name__ == '__main__':

    try:
        print u.utf8_2_gbk('开始执行')
        file_list = u.GetFileList(FILE_PATH, [])
        for f in file_list:
            file_dict[f.encode('gbk')] = rm_repeat(u.create_file_body(f.encode('utf-8')))

        total_file = file('total.csv', 'w+')
        for key, value in file_dict.items():
            total_file.writelines(value)
        total_file.close()

        count_file = open('total.csv', 'rb')
        for line in count_file:
            content = u.create_content(line, 1)
            count_dict[content] = count_dict.get(content, 0) + 1
        count_file.close()
示例#16
0

def createPattern(str, accurate):  # build the regex matching rule
    """Turn a comma-separated word list into a regex alternation.

    :param str: comma-separated words, inserted into the pattern unescaped.
    :param accurate: truthy -> each word becomes ``^word$``; falsy -> each
        word becomes ``.*word.*``.
    :return: the alternatives joined with ``|``.
    """
    if accurate:
        prefix, suffix = '^', '$'
    else:
        prefix, suffix = '.*', '.*'
    return '|'.join(prefix + word + suffix for word in str.split(','))


if __name__ == '__main__':

    try:
        LABEL_FILE, LABEL_PATH = u.getFirstFile('txt')
        SOURCE_FILE, SOURCE_PATH = u.getFirstFile('csv')
        print u.utf8_2_gbk('打标签文件:' + LABEL_FILE)
        print u.utf8_2_gbk('数据源文件:' + SOURCE_FILE)
        source_file_body = u.create_file_body(SOURCE_PATH)
        for num, line in enumerate(source_file_body):
            source_file_body[num] = line.strip().lower() + ',' + '\n'
        labelType, labelNum = createMoreMatch(LABEL_PATH)
        matchHead = u.create_file_head(SOURCE_PATH, 'right', [LABEL_FILE.split('.')[0]])
        print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')
        for key, value in labelType.items():
            count += 1
            print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
            words = value.strip().split('|')
            if len(words) == 1:
                c = createPattern(words[0], ACCURATE)
                p = re.compile(c)
                for num, line in enumerate(source_file_body):
示例#17
0
def createPattern(str, accurate):  # build the regex matching rule
    """Turn a word specification into a regex alternation.

    ``,`` separates alternatives.  ``&`` depends on the mode:

    * accurate (truthy): ``&`` behaves exactly like ``,`` -- every word
      becomes its own ``^word$`` alternative;
    * otherwise: ``a&b`` becomes ``.*a.*b.*``, i.e. ``a`` followed later
      by ``b`` inside one alternative.

    Words are inserted into the pattern unescaped.

    :param str: the word specification.
    :param accurate: selects exact (``^...$``) or substring (``.*...*``) mode.
    :return: the pattern string.
    """
    # Honour the argument; the original read the global ACCURATE here and
    # silently ignored what the caller passed.
    if accurate:
        result = '^' + str.replace(',', '$|^').replace('&', '$|^') + '$'
    else:
        result = '.*' + str.replace(',', '.*|.*').replace('&', '.*') + '.*'
    return result


if __name__ == '__main__':
    try:
        keyWordCount = {}
        count = 0
        LABEL_FILE, LABEL_PATH = u.getFirstFile('txt')
        SOURCE_FILE, SOURCE_PATH = u.getFirstFile('csv')
        print u.utf8_2_gbk('打标签文件:' + LABEL_FILE)
        print u.utf8_2_gbk('数据源文件:' + SOURCE_FILE)
        print u.utf8_2_gbk('是否精确匹配' + str(ACCURATE))
        source_file_body = u.create_file_body(SOURCE_PATH)
        print len(source_file_body)
        for num, line in enumerate(source_file_body):
            source_file_body[num] = line.strip() + ',' + '\n'
        labelType, labelNum = createMoreMatch(LABEL_PATH)
        matchHead = u.create_file_head(SOURCE_PATH, 'right',
                                       [LABEL_FILE.split('.')[0]])
        print u.utf8_2_gbk('标签个数:' + str(labelNum) + '个')
        for key, value in labelType.items():
            count += 1
            print u.utf8_2_gbk('当前执行到第' + str(count) + '个')
            words = value.strip().split('|')
            if len(words) == 1:  # 只有关键词无过滤词