-
Notifications
You must be signed in to change notification settings - Fork 0
/
scanner.py
82 lines (71 loc) · 4.28 KB
/
scanner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : scanner.py
# @Author: Sui Huafeng
# @Date : 2018/1/1
# @Desc : 扫描全盘,找出候选日志文件,把文件名称等信息保存到文件中
#
import os
from re import match
import time
from config import cfg, log, win
from judger import Judger
class Scanner(object):
    """Walk the file system and record candidate log files in a list file.

    Every directory below the initial paths is visited with ``os.walk``.
    Each file is handed to ``Judger.filter``; files that are not rejected are
    written, one full path per line, to the sample list file.

    The tuning values below are read from the ``ScanFile`` section of ``cfg``
    once, when the class is defined.
    """

    __SleepSeconds = cfg.getint('ScanFile', 'Sleep')  # pause before examining each file
    __MaxFiles = cfg.getint('ScanFile', 'MaxFiles')  # directories holding more files are skipped
    __MaxSeconds = cfg.getint('ScanFile', 'MaxSeconds')  # time budget for one directory's files

    # The starting directories and the exclusion patterns depend on the OS.
    if win:
        # Probe every drive letter 'a'..'z'. range() excludes its stop value,
        # so the stop has to be 0x7b for 'z' (0x7a) to be included.
        __InitialPaths = [chr(i) + ':\\' for i in range(0x61, 0x7b) if os.path.isdir(chr(i) + ':\\')]
        __ExcludedPaths = cfg.get('ScanFile', 'ExcludedWin').lower().split()
    else:
        __InitialPaths = ['/']
        __ExcludedPaths = cfg.get('ScanFile', 'ExcludedUnix').lower().split()

    def __init__(self, sample_list_file=os.path.join(cfg.get('Log', 'Folder'), 'samples.lst')):
        """Remember where the list of candidate files will be written.

        :param sample_list_file: path of the output file; defaults to
            ``samples.lst`` inside the folder configured under ``Log``/``Folder``.
        """
        self.__SampleListFile = sample_list_file

    def run(self):
        """Scan all initial paths and write candidate file names to the list file.

        The list file is overwritten. A progress line is logged every 1000
        scanned files and a summary line once the scan is finished. Errors
        raised while handling a single file are logged and the scan goes on.
        """
        with open(self.__SampleListFile, 'w', encoding='utf-8') as fp:
            # err_counters is indexed by the int code returned by Judger.filter.
            scaned_files, sampled_files, err_counters = 0, 0, [0, 0, 0, 0, 0, 0]
            for initial_path in self.__InitialPaths:
                for dir_path, dir_names, file_names in os.walk(initial_path):
                    # The exclusion patterns were lower-cased when loaded. Windows
                    # paths are case-insensitive, so compare in lower case there;
                    # otherwise a directory such as 'c:\Windows' is never excluded.
                    probe_path = dir_path.lower() if win else dir_path
                    if any(match(excluded_path, probe_path) for excluded_path in self.__ExcludedPaths):
                        dir_names[:] = []  # do not descend into an excluded directory
                        continue

                    # Some directories are not caught by the sub-directory check below.
                    if not os.access(dir_path, os.X_OK | os.R_OK):
                        log.warning('[Permission Denied:] ' + dir_path)
                        continue

                    # Drop sub-directories we may not enter and log a warning for each.
                    # The list is rebuilt and assigned in place: removing items
                    # while iterating over the same list would skip the entry
                    # following each removed one.
                    accessible_dirs = []
                    for dir_name in dir_names:
                        dir_fullname = os.path.join(dir_path, dir_name)
                        if os.access(dir_fullname, os.X_OK | os.R_OK):
                            accessible_dirs.append(dir_name)
                        else:
                            log.warning('[Permission denied:] ' + dir_fullname)
                    dir_names[:] = accessible_dirs

                    # A directory with very many files is most likely a data directory.
                    if len(file_names) > self.__MaxFiles:
                        log.warning('[Too Many Files]( ' + str(len(file_names)) + '), Ignoring:' + dir_path)
                        continue

                    timer = time.time()
                    for file_name in file_names:
                        try:
                            scaned_files += 1
                            if scaned_files % 1000 == 0:
                                log.info(
                                    'Files scaned:[%d], error[%d], inactive[%d], small[%d], wrong-type[%d], non-text[%d], candidate[%d]\t%s' % (
                                        scaned_files, err_counters[0], err_counters[1], err_counters[2],
                                        err_counters[3], err_counters[4] + err_counters[5], sampled_files, dir_path))
                            if time.time() - timer > self.__MaxSeconds:  # Too slow to scan a folder
                                log.warning('[Too slow to scan, Ignoring:]( ' + dir_path)
                                break
                            time.sleep(self.__SleepSeconds)  # avoid hogging system resources
                            file_fullname = os.path.join(dir_path, file_name)
                            rc = Judger.filter(file_fullname)
                            # An int result means the file is not a candidate log and
                            # names the counter to bump. The exact type check is kept
                            # so that a bool result is not treated as a counter index.
                            if type(rc) is int:
                                err_counters[rc] += 1
                                continue
                            print(file_fullname, file=fp)
                            sampled_files += 1
                        except Exception as err:
                            # Garbled directory/file names have caused encoding
                            # errors when writing to fp; log and keep scanning.
                            log.error(str(err))
        log.info('Finish scan:[%d], error[%d], inactive[%d], small[%d], wrong-type[%d], non-text[%d], candidate[%d]' % (
            scaned_files, err_counters[0], err_counters[1], err_counters[2], err_counters[3],
            err_counters[4] + err_counters[5], sampled_files))
# Script entry point: run one scan with the default sample list file.
if __name__ == '__main__':
    scanner = Scanner()
    scanner.run()