def __init__(self, rulefname):
    """
    :param rulefname: path to the configuration file
    """
    self.__rulefd = None
    try:
        self.__rulefd = codecs.open(rulefname, mode='rb', encoding='utf8', errors='ignore')
    except Exception as e:
        logging.error("read %s file failed: %s" % (rulefname, repr(e)))

    dataMap = load(self.__rulefd)
    tcp_stream_handler = dataMap.get("tcp_stream_handler")

    # capture filters
    self.bpf_filter = tcp_stream_handler.get("bpf_filter")
    self.dst_port_filter = tcp_stream_handler.get("dst_port_filter")
    self.dst_ip_filter = tcp_stream_handler.get("dst_ip_filter")

    # offline pcap input, only used when pcap_file_enable == 1
    self.pcap_file = tcp_stream_handler.get(
        "pcap_file") if tcp_stream_handler.get("pcap_file_enable", 0) == 1 else None
    if self.pcap_file:
        self.pcap_file = mills.path(self.pcap_file)

    # live capture device, only used when device_enable == 1
    self.device = tcp_stream_handler.get(
        "device") if tcp_stream_handler.get("device_enable", 0) == 1 else None

    self.data_level = tcp_stream_handler.get("data_level", 1)
    self.data_stream_direct = tcp_stream_handler.get("data_stream_direct", 2)
    self.std_output_enable = tcp_stream_handler.get("std_output_enable", 1)

    # tcp session output file, only used when file_output_enable == 1
    self.file_tcpsession_path = tcp_stream_handler.get(
        "file_tcpsession_path") if tcp_stream_handler.get(
        "file_output_enable", 0) == 1 else None
    if self.file_tcpsession_path:
        self.file_tcpsession_path = mills.path(self.file_tcpsession_path)

    self.protocol_parse_conf = tcp_stream_handler.get("protocol_parse_conf")
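# A minimal sketch of the rule file this __init__ expects, assuming it is YAML
# read by the bare load() call above. The keys mirror the .get() calls; every
# value below is an illustrative placeholder, not a project default.
#
#     tcp_stream_handler:
#       bpf_filter: "tcp and dst port 3306"
#       dst_port_filter: [3306]
#       dst_ip_filter: ["192.168.1.10"]
#       pcap_file_enable: 1
#       pcap_file: "pcap/example.pcap"
#       device_enable: 0
#       device: "eth0"
#       data_level: 1
#       data_stream_direct: 2
#       std_output_enable: 1
#       file_output_enable: 0
#       file_tcpsession_path: "data/tcpsession.txt"
#       protocol_parse_conf: {}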
def parse_all(fnames=None, renew=False, proxy=None):
    """
    Parse pages in batch.
    :param fnames: html files to parse
    :param renew: whether to re-parse all files
    :return:
    """
    so = SQLiteOper("data/scrap.db")

    if renew:
        fnames = []
        fname_gen = glob.iglob(r'data/secwiki/*.html')
        sql = 'delete from `secwiki_detail`'
        for f in fname_gen:
            fnames.append(f)
        so.execute(sql)

    if fnames is None:
        print "no new secwiki"
        return

    nos = sort_fname(fnames)

    # sqlite handler
    sql = """insert into `secwiki_detail`(`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`path`)
             values(?,?,?,?,?,?,?);"""

    # file handler
    result_fname = path("data/secwiki_{start}_{end}.txt".format(
        start=nos.keys()[0], end=nos.keys()[-1]))

    if not renew and os.path.isfile(
            result_fname) and os.path.getsize(result_fname) > 0:
        return

    result_fh = codecs.open(result_fname, mode='wb')

    for k in nos.keys():
        fname = nos[k]
        with open(fname, mode='r') as html_hd:
            results_list = {}
            for content in parse_item(html_hd, so=so, proxy=proxy):
                if content:
                    k = content[0] + content[2]
                    results_list[k] = content

                    line = "\t".join(content)
                    print line
                    result_fh.write("{line}{linesep}".format(
                        line=line, linesep=os.linesep))

            so.executemany(sql, operate_list=results_list.values())

    result_fh.close()
def scrap_item(cur_day=None):
    """
    Scrape the Xuanwu Lab daily news page for cur_day (YYYYMMDD).
    :return: path of the saved html file, or None on failure
    """
    year = cur_day[0:4]
    month = cur_day[4:6]
    day = cur_day[6:8]

    fname = path("data/xuanwu/{year}/{month}/{day}/index.html".format(
        year=year, month=month, day=day))
    url = """https://xuanwulab.github.io/cn/secnews/{year}/{month}/{day}/index.html""".format(
        year=year, month=month, day=day)

    print url
    logging.info("[SCRAP_PAGE]: %s" % url)

    try:
        r = requests.get(url)
        if r.status_code == 200:
            fname_year = path("data/xuanwu/{year}".format(year=year))
            if not os.path.exists(fname_year):
                os.mkdir(fname_year)

            fname_month = path("data/xuanwu/{year}/{month}".format(
                year=year, month=month))
            if not os.path.exists(fname_month):
                os.mkdir(fname_month)

            fname_day = path("data/xuanwu/{year}/{month}/{day}".format(
                year=year, month=month, day=day))
            if not os.path.exists(fname_day):
                os.mkdir(fname_day)

            with codecs.open(fname, mode='wb') as fw:
                fw.write(r.content)

            return fname
    except Exception as e:
        logging.error("[SCRAP_REQUEST_FAILED]: %s %s" % (url, str(e)))
def scrap_item(i=1):
    """
    Scrape a single weekly page.
    :return: path of the saved html file, or None on failure
    """
    url = "https://www.sec-wiki.com/weekly/{i}".format(i=i)

    if not os.path.exists(path("data/secwiki")):
        os.mkdir(path("data/secwiki"))

    fname = path("data/secwiki/{i}_week.html".format(i=i))

    logging.info("[SCRAP_PAGE]: %s" % url)

    try:
        r = requests.get(url)
        if r.status_code == 200:
            with codecs.open(fname, mode='wb') as fw:
                fw.write(r.content)
            return fname
    except Exception as e:
        logging.error("[SCRAP_REQUEST_FAILED]: %s %s" % (url, str(e)))
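# Illustrative usage sketch (not part of the project): scrape one weekly issue
# and feed it to parse_all() above. The issue number 200 is a made-up example.
#
#     fname = scrap_item(i=200)
#     if fname:
#         parse_all(fnames=[fname])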
def statistict_github_language(so, topn=132, reverse=True, year=''):
    """
    :param so:
    :return:
    """
    lang_dict = {}

    sql = "select distinct repo_lang from github where ts like '{year}%' and (repo_lang is not null or repo_lang != '')".format(
        year=year)
    # print sql
    result = so.query(sql)
    if result:
        for item in result:
            repo_lang = item[0]
            repo_langs = [_.strip() for _ in re.split(',', repo_lang)]
            for repo_lang in repo_langs:
                if not repo_lang:
                    continue
                if repo_lang in lang_dict:
                    lang_dict[repo_lang] = lang_dict[repo_lang] + 1
                else:
                    lang_dict[repo_lang] = 1

    vd = OrderedDict(
        sorted(lang_dict.items(), key=lambda t: t[1], reverse=reverse))

    sum_count = sum(vd.values())

    vd2 = OrderedDict()
    i = 0
    for k, v in vd.items():
        if i < topn:
            vd2[k] = round(float(v) / sum_count, 4)
        else:
            break
        i = i + 1

    fname = path("data", "%s_github_lang.txt" % year)
    with open(fname, mode='wb') as fw:
        for k, v in vd.items():
            fw.write("%s\t%s%s" % (k, v, os.linesep))

    return vd2
def draw_pie(so, source="secwiki", year="", tag="domain", top=10):
    """
    :return:
    """
    if tag != "language":
        ods = info_source(so,
                          table="{source}_detail".format(source=source),
                          top=top,
                          year=str(year),
                          tag=tag)
    else:
        ods = statistict_github_language(so, topn=top, year=year)

    labels = []
    values = []

    if not ods:
        return

    for k, v in ods.items():
        labels.append(k)
        values.append(v)

    labels.append("other")
    values.append(1 - sum(values))

    explode = [0.1 for _ in range(0, len(labels))]
    explode[-1] = 0  # keep the "other" slice flat; the rest are offset for emphasis

    try:
        # plt.rcParams['font.sans-serif'] = ['MicrosoftYaHei']
        plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese labels render correctly
        plt.rcParams['font.family'] = 'sans-serif'
        plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly on the axes

        plt.axes(aspect='equal')  # equal x/y scale so the pie is drawn as a circle

        plt.pie(
            values,             # data to plot
            explode=explode,    # offset (explode) the highlighted slices
            labels=labels,      # label each slice, similar to a legend entry
            labeldistance=1.2,  # distance of slice labels from the center
            pctdistance=0.6,    # distance of percentage labels from the center
            startangle=90,      # initial rotation of the pie
            shadow=True,        # draw a shadow under the pie
            autopct='%3.2f%%')

        if tag == "domain":
            title_pie = "%s-信息源占比-%s" % (year, source)
        elif tag == "tag":
            title_pie = "%s-信息类型占比-%s" % (year, source)
        elif tag == "language":
            title_pie = "%s-最喜欢语言占比" % (year)
        else:
            return

        plt.title(unicode(title_pie))

        fdir = path("data/img/%s" % tag)
        if not os.path.exists(fdir):
            os.mkdir(fdir)
        fpath = path(fdir, "%s.png" % title_pie)

        plt.legend(labels, loc='upper right', fontsize=5)
        plt.savefig(fpath)
        plt.close()
    except Exception as e:
        print source, year, tag
        print len(labels), labels
        print len(values), values
        print len(explode), explode
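# Illustrative usage sketch (not part of the project): render the pie charts
# for one year and every tag. The year value is a made-up example and assumes
# data/scrap.db has already been populated by the scrapers above.
#
#     so = SQLiteOper("data/scrap.db")
#     for tag in ("domain", "tag", "language"):
#         draw_pie(so, source="secwiki", year="2018", tag=tag, top=10)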
def parse_all(renew=False, ndays=None, proxy=None):
    """
    Parse multiple pages.
    :return:
    """
    so = SQLiteOper("data/scrap.db")

    # parse the pages for the given days, scraping any that are missing
    fname_lists = []

    if ndays is not None:
        for cur_day in ndays:
            year = cur_day[0:4]
            month = cur_day[4:6]
            day = cur_day[6:8]

            fname = path("data/xuanwu/{year}/{month}/{day}/index.html".format(
                year=year, month=month, day=day))

            if not os.path.exists(fname):
                fname = scrap_item(cur_day)

            if fname is None:
                print "%s news not exists" % cur_day
            else:
                fname_lists.append(fname)

    if renew:
        fname_lists = []

        # re-parse all pages
        sql = 'delete from `xuanwu_detail`'
        so.execute(sql)

        for fname in glob.iglob(r'data/xuanwu/*/*/*/index.html'):
            fname_lists.append(fname)

    if fname_lists:
        start, end = getstartendfrompath(fname_lists)

        sql = """
        insert into `xuanwu_detail`(`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`path`,`author_id`)
        values(?,?,?,?,?,?,?,?);
        """

        # file handler
        result_fname = path("data/xuanwu_{start}_{end}.txt".format(start=start,
                                                                   end=end))

        if not renew and os.path.isfile(
                result_fname) and os.path.getsize(result_fname) > 0:
            return

        result_fh = codecs.open(result_fname, mode='wb')

        for fname in fname_lists:
            fname = path(fname)

            results_list = {}
            for content in parse_item(fname, so=so, proxy=proxy):
                if content:
                    k = content[0] + content[2]
                    results_list[k] = content

                    line = "\t".join(content)
                    print line
                    result_fh.write("{line}{linesep}".format(
                        line=line, linesep=os.linesep))

            if results_list:
                so.executemany(sql, operate_list=results_list.values())
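# Illustrative usage sketch (not part of the project): build ndays as a list of
# 'YYYYMMDD' strings, the format the slicing above expects. The 7-day window is
# a made-up example.
#
#     import datetime
#     today = datetime.date.today()
#     ndays = [(today - datetime.timedelta(days=n)).strftime("%Y%m%d")
#              for n in range(0, 7)]
#     parse_all(renew=False, ndays=ndays)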
    else:
        cmd = cmd_darwin

    local_ip = subprocess.Popen([cmd], stdout=subprocess.PIPE, shell=True)
    (IP, errors) = local_ip.communicate()
    local_ip.stdout.close()

    IP = IP.strip()

    return IP


if __name__ == "__main__":

    from optparse import OptionParser
    import logger

    logger.generate_special_logger(level=logging.INFO,
                                   logtype="network",
                                   curdir=mills.path("log/"),
                                   ismultiprocess=False)

    parser = OptionParser()

    parser.add_option(
        "--portHost",
        dest='getPortHostByteOrder',
        action="store",
        type="int",
        help="change network byte order port to host byte order port(20480 - 80)",
        # default=20480
    )

    parser.add_option(
        "--portNetwork",
        dest='getPortNetworkByteOrder',