def __init__(self):
    """Set up daemon I/O paths, the PID file, and the scraper's sqlite table.

    Raises a generic ``Exception`` when the table already exists but its
    schema differs from ``ToiScraper.TABLE_SCHEMA``.
    """
    # ==== Required vars (consumed by the daemon runner) ===== #
    self.stdin_path = '/dev/null'
    self.stdout_path = '/dev/null'
    self.stderr_path = '/dev/null'
    self.pidfile_path = PID_FILE_PATH
    self.pidfile_timeout = 5
    # ========================================================= #

    # Open the database and fetch (or create) the scraper table.
    self.db_name = DB_PATH
    self.db = SQLite(self.db_name)
    self.table = self.db.get(ToiScraper.TABLE_NAME)
    print("Initializing...")
    if self.table:
        # An existing table must match the expected schema exactly.
        if self.table.get_info() != ToiScraper.TABLE_SCHEMA:
            error_str = "Table {0} exists but with incorrect schema".format(
                ToiScraper.TABLE_NAME)
            print(error_str)
            raise Exception(error_str)
    else:
        print("No table found with name {0}. Creating it.".format(
            ToiScraper.TABLE_NAME))
        self.table = self.db.create(ToiScraper.TABLE_NAME,
                                    ToiScraper.TABLE_SCHEMA)
    self.iter_date = self._get_init_date_full()
def get_top(table='secwiki', column='domain', time=2020, top=10):
    """Return the share of the top-N values of *column* for a year (pie-chart data).

    :param table: table name to query (interpolated into SQL; trusted input only)
    :param column: column to group and count by
    :param time: year substring matched against the ``ts`` column
    :param top: how many leading entries to keep before lumping the rest
    :return type: dict
    :return value: percentage per value for the top-N entries, 4 decimal
        places, plus an ``'other'`` bucket holding the remainder.
    """
    so = SQLite("data/secwiki.db")
    # NOTE(review): identifiers cannot be bound as SQL parameters, hence the
    # string formatting -- never pass untrusted table/column/time values here.
    sql = "select {column},count(url) as ct from {table} \
        where ts like '%{time}%' \
        group by {column} \
        order by ct DESC".format(column=column, table=table, time=time)
    r = so.query(sql)
    od = OrderedDict()
    for row in r:
        od[row[0]] = row[1]
    # Hoist the grand total out of the loop: the original recomputed
    # sum(od.values()) on every iteration (quadratic in the result size).
    total = sum(od.values())
    od_pec = dict()
    for i, (k, v) in enumerate(od.items()):
        if i >= top:
            break
        od_pec[k] = round(v / total, 4)
    od_pec['other'] = round(1 - sum(od_pec.values()), 4)
    return od_pec
def parse_all(fnames, reparse=False):
    """Parse cached secwiki HTML pages into TSV files and the sqlite table.

    Each record is formatted as ts, tag, url, title, root_domain, domain,
    url_path (tab separated, one line per record).

    :param fnames: iterable of HTML file paths to parse, or None when there
        is nothing new to do.
    :param reparse: when True, wipe the `secwiki` table and re-parse every
        cached data/html/secwiki_*.html file.
    """
    sqldb = SQLite('data/secwiki.db')
    # Full rebuild: collect every cached page and clear the table first.
    if reparse:
        fnames = list(glob.iglob(r'data/html/secwiki_*.html'))
        sqldb.execute('delete from `secwiki`')
    if fnames is None:
        print('No new secwiki')
        return
    sql = 'insert into `secwiki` (`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`url_path`) values(?,?,?,?,?,?,?);'
    for fname in fnames:
        # Derive the target TXT path from the page's numeric suffix.
        m = re.search(r'secwiki_(\d+)\.html', fname)
        rname = path('data/txt', 'secwiki_' + m.group(1) + '.txt')
        if not os.path.exists(path("data/txt")):
            os.mkdir(path("data/txt"))
        # Skip pages whose parsed output already exists and is non-empty.
        if os.path.exists(rname) and os.path.getsize(rname) > 0:
            continue
        all_content = {}
        # Context managers guarantee both files are closed even when parsing
        # raises (the original leaked `rf` on any exception before rf.close()).
        with codecs.open(rname, mode='wb') as rf, codecs.open(fname, 'rb') as f:
            for content in parse_single(f):
                if content:
                    # Key by ts+url to de-duplicate rows within one page.
                    k = content[0] + content[2]
                    all_content[k] = content
                    line = "\t".join(content)
                    rf.write(line.encode() + b'\r\n')
        # Bulk-insert everything parsed from this page into sqlite3.
        if all_content:
            sqldb.executemany(sql, all_content.values())
def load():
    """Load the raw nvd.nist exploit-tag labelled CVE data.

    CVEs whose reference tags contain "Exploit" are labelled 1 (positive),
    the remainder 0 (negative). The combined frame is printed, written to
    ``cve2.csv`` and returned.
    """
    so = SQLite('data/nvd.db')
    # (sql, label) pairs: negative sample first, positive sample second.
    queries = (
        ('select CVE_Items_cve_CVE_data_meta_ID,CVE_Items_cve_description_description_data_value from nvd_cve where CVE_Items_cve_references_reference_data_tags not like "%Exploit%"', 0),
        ('select CVE_Items_cve_CVE_data_meta_ID,CVE_Items_cve_description_description_data_value from nvd_cve where CVE_Items_cve_references_reference_data_tags like "%Exploit%"', 1),
    )
    frames = []
    for sql, label in queries:
        frame = sql2cve(so, sql)
        frame['label'] = label
        frames.append(frame)
    cve = pd.concat(frames)
    print(cve.head())
    cve.to_csv('cve2.csv', index=False)
    return cve
# NOTE(review): this chunk begins mid-function -- the enclosing `def` (a
# timestamp-to-string helper taking `ts`) lies before the visible region.
    ts = float(ts)
    # Render the unix timestamp as a local-time string with microseconds.
    ts_str = datetime.datetime.fromtimestamp(ts).strftime(
        '%Y-%m-%d %H:%M:%S.%f')
    return ts_str


def get_md5(path):
    """Return the hex MD5 digest of the file at *path*."""
    return hashlib.md5(open(path, 'rb').read()).hexdigest()


def get_sha1(path):
    """Return the hex SHA-1 digest of the file at *path*."""
    return hashlib.sha1(open(path, 'rb').read()).hexdigest()


if __name__ == '__main__':
    sqlite = SQLite('data.db')
    # sqlite.insert()
    # Prompt until a directory path string is supplied.
    while True:
        input_dir = input('Enter folder path: ')
        if isinstance(input_dir, str):
            work_dir = input_dir
            break
        else:
            pass
    # work_dir = 'D:\\共享区'
    # Walk the tree (following symlinks) and stat every regular file.
    for parent, dirnames, filenames in os.walk(work_dir, followlinks=True):
        for filename in filenames:
            file_path = os.path.join(parent, filename)
            file_attr = stat(file_path)
            attr_list = [
                file_attr.st_mode,
                file_attr.st_uid,
                file_attr.st_gid,
                # NOTE(review): chunk is truncated here -- the remainder of
                # attr_list and the loop body lie past the visible region.