Exemplo n.º 1
0
    def run(self):
        with open(self.__SampleListFile, 'w', encoding='utf-8') as fp:
            scaned_files, sampled_files, err_counters = 0, 0, [
                0, 0, 0, 0, 0, 0
            ]
            for initial_path in self.__InitialPaths:
                for dir_path, dir_names, file_names in os.walk(initial_path):
                    if False in [
                            not match(excluded_path, dir_path)
                            for excluded_path in self.__ExcludedPaths
                    ]:  # 跳过例外目录
                        dir_names[:] = []  # 跳过例外目录的子目录
                        continue
                    if not os.access(dir_path,
                                     os.X_OK | os.R_OK):  # 有的目录下面的循环拦不住!
                        log.warning('[Permission Denied:] ' + dir_path)
                        continue
                    for dir_name in dir_names:  # 对无权进入的子目录,从扫描列表中清除并记录告警日志
                        dir_fullname = os.path.join(dir_path, dir_name)
                        if not os.access(dir_fullname, os.X_OK | os.R_OK):
                            dir_names.remove(dir_name)
                            log.warning('[Permission denied:] ' + dir_fullname)
                    if len(file_names
                           ) > self.__MaxFiles:  # 目录下文件特别多,很可能是数据文件目录
                        log.warning('[Too Many Files]( ' +
                                    str(len(file_names)) + '), Ignoring:' +
                                    dir_path)
                        continue

                    timer = time.time()
                    for file_name in file_names:
                        try:
                            scaned_files += 1
                            if scaned_files % 1000 == 0:
                                log.info(
                                    'Files scaned:[%d], error[%d], inactive[%d], small[%d], wrong-type[%d], non-text[%d], candidate[%d]\t%s'
                                    %
                                    (scaned_files, err_counters[0],
                                     err_counters[1], err_counters[2],
                                     err_counters[3], err_counters[4] +
                                     err_counters[5], sampled_files, dir_path))
                                if time.time(
                                ) - timer > self.__MaxSeconds:  # Too slow to scan a folder
                                    log.warning(
                                        '[Too slow to scan, Ignoring:]( ' +
                                        dir_path)
                                    break
                                time.sleep(self.__SleepSeconds)  # 防止过多占有系统资源

                            file_fullname = os.path.join(dir_path, file_name)
                            rc = Judger.filter(file_fullname)
                            if type(rc) is int:  # 该文件不是候选日志,无需采
                                err_counters[rc] += 1
                                continue
                            print(file_fullname, file=fp)
                            sampled_files += 1
                        except Exception as err:  # 出现过目录/文件名为乱字符导致写fp文件出现字符集异常情况
                            log.error(str(err))

        log.info(
            'Finish scan:[%d], error[%d], inactive[%d], small[%d], wrong-type[%d], non-text[%d], candidate[%d]'
            % (scaned_files, err_counters[0], err_counters[1], err_counters[2],
               err_counters[3], err_counters[4] + err_counters[5],
               sampled_files))