def parse_all(fnames=None, renew=False, proxy=None):
    """
    Parse secwiki pages in batch.
    :param fnames: list of cached html files to parse
    :param renew: whether to re-parse all cached files
    :return:
    """
    so = SQLiteOper("data/scrap.db")

    if renew:
        # re-parse every cached page and clear the detail table first
        fnames = []
        fname_gen = glob.iglob(r'data/secwiki/*.html')
        sql = 'delete from `secwiki_detail`'
        for f in fname_gen:
            fnames.append(f)
        so.execute(sql)

    if fnames is None:
        print "no new secwiki"
        return

    nos = sort_fname(fnames)

    # sqlite handler
    sql = """insert into `secwiki_detail`(`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`path`)
             values(?,?,?,?,?,?,?);"""

    # file handler
    result_fname = path("data/secwiki_{start}_{end}.txt".format(
        start=nos.keys()[0], end=nos.keys()[-1]))

    if not renew and os.path.isfile(result_fname) \
            and os.path.getsize(result_fname) > 0:
        return

    result_fh = codecs.open(result_fname, mode='wb')

    for no in nos.keys():
        fname = nos[no]
        with open(fname, mode='r') as html_hd:
            results_list = {}
            for content in parse_item(html_hd, so=so, proxy=proxy):
                if content:
                    key = content[0] + content[2]  # ts + url as the dedup key
                    results_list[key] = content
                    line = "\t".join(content)
                    print line
                    result_fh.write("{line}{linesep}".format(
                        line=line, linesep=os.linesep))

            so.executemany(sql, operate_list=results_list.values())

    result_fh.close()
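# A hedged sketch of the `secwiki_detail` table implied by the INSERT above.
# The column names come from that statement; the DDL itself (types, absence of
# constraints) is an assumption, not the repo's actual schema.
SECWIKI_DETAIL_DDL_SKETCH = """
create table if not exists `secwiki_detail`(
    `ts`          text,
    `tag`         text,
    `url`         text,
    `title`       text,
    `root_domain` text,
    `domain`      text,
    `path`        text
);
"""


def init_secwiki_detail(so):
    # `so` is an SQLiteOper instance, used the same way as in parse_all() above.
    so.execute(SECWIKI_DETAIL_DDL_SKETCH)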
def scraw():
    """
    :return:
    """
    proxy = None
    so = SQLiteOper("data/scrap.db")

    secwiki_scraw(so, proxy=proxy, delta=2)
    xuanwu_scraw(so, proxy=proxy, delta=2)
def main_pie(year):
    """
    :return:
    """
    so = SQLiteOper("data/scrap.db")

    for tag in ["domain", "tag"]:
        for source in ["secwiki", "xuanwu"]:
            draw_pie(so, source=source, year=str(year), tag=tag, top=10)

    draw_pie(so, tag="language", top=25, year=year)
def draw_readme(fpath=None):
    """
    :return:
    """
    if fpath is None:
        fpath = "README.md"

    tables_rets = []
    so = SQLiteOper("data/scrap.db")
    year = get_special_date(delta=0, format="%Y%m")

    # update pie charts
    main_pie(year)

    # update weixin, github tables
    sources = ["weixin", "github_org", "github_private"]
    d = {
        "weixin": "微信公众号",
        "github_org": "组织github账号",
        "github_private": "私人github账号"
    }

    for source in sources:
        rets = draw_table(so, top=100, source=source, year=year)
        if rets:
            markdown_rets = markdown_table(rets)
            if markdown_rets:
                tables_rets.append("# %s 推荐" % d.get(source, source))
                for markdown_ret in markdown_rets:
                    tables_rets.append(markdown_ret)
                tables_rets.append(os.linesep)

    with codecs.open(fpath, mode='wb') as fr:
        fr.write("# [数据年报](README_YEAR.md)")
        fr.write(os.linesep)
        fr.write('# %s 信息源与信息类型占比' % year)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息源占比-secwiki](data/img/domain/{year}-信息源占比-secwiki.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息源占比-xuanwu](data/img/domain/{year}-信息源占比-xuanwu.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息类型占比-secwiki](data/img/tag/{year}-信息类型占比-secwiki.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息类型占比-xuanwu](data/img/tag/{year}-信息类型占比-xuanwu.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-最喜欢语言占比](data/img/language/{year}-最喜欢语言占比.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)

        st = os.linesep.join(tables_rets)
        fr.write(st)
        fr.write(os.linesep)
        fr.write(os.linesep)

        fr.write('# 日更新程序')
        fr.write(os.linesep)
        fr.write('`python update_daily.py`')
elif url.find("weixin.qq.com") != -1: d = get_weixin_info(url, ts, tag) if d: sql = d2sql(d, table="weixin") elif url.find("//github.com") != -1: d = get_github_info(url, title, ts=ts, tag=tag) if d: sql = d2sql(d, table='github') if sql: try: #print sql so.execute(sql) except Exception as e: logging.error("[sql]: %s %s" % (sql, str(e))) if __name__ == "__main__": """ """ proxy = None so = SQLiteOper("data/scrap.db") scraw(so, proxy=proxy)
def parse_all(renew=False, ndays=None, proxy=None):
    """
    Parse multiple xuanwu pages.
    :return:
    """
    so = SQLiteOper("data/scrap.db")

    # parse, or crawl, any missing pages
    fname_lists = []

    if ndays is not None:
        for cur_day in ndays:
            year = cur_day[0:4]
            month = cur_day[4:6]
            day = cur_day[6:8]
            fname = path("data/xuanwu/{year}/{month}/{day}/index.html".format(
                year=year, month=month, day=day))
            if not os.path.exists(fname):
                fname = scrap_item(cur_day)
            if fname is None:
                print "%s news not exist" % cur_day
            else:
                fname_lists.append(fname)

    if renew:
        # re-parse every cached page
        fname_lists = []
        sql = 'delete from `xuanwu_detail`'
        so.execute(sql)
        for fname in glob.iglob(r'data/xuanwu/*/*/*/index.html'):
            fname_lists.append(fname)

    if fname_lists:
        start, end = getstartendfrompath(fname_lists)

        sql = """
            insert into `xuanwu_detail`(`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`path`,`author_id`)
            values(?,?,?,?,?,?,?,?);
        """

        # file handler
        result_fname = path("data/xuanwu_{start}_{end}.txt".format(start=start,
                                                                   end=end))
        if not renew and os.path.isfile(result_fname) \
                and os.path.getsize(result_fname) > 0:
            return

        result_fh = codecs.open(result_fname, mode='wb')

        for fname in fname_lists:
            fname = path(fname)
            results_list = {}
            for content in parse_item(fname, so=so, proxy=proxy):
                if content:
                    key = content[0] + content[2]  # ts + url as the dedup key
                    results_list[key] = content
                    line = "\t".join(content)
                    print line
                    result_fh.write("{line}{linesep}".format(
                        line=line, linesep=os.linesep))

            if results_list:
                so.executemany(sql, operate_list=results_list.values())

        result_fh.close()
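# A hedged driver sketch for the xuanwu parse_all() above: entries in `ndays`
# are YYYYMMDD strings, matching the slicing inside the function; the dates
# below are purely illustrative.
def reparse_recent_xuanwu():
    parse_all(renew=False, ndays=["20240101", "20240102"], proxy=None)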
def draw_readme_item(year=None, fpath=None):
    """
    :param year:
    :param fpath:
    :return:
    """
    tables_rets = []
    so = SQLiteOper("data/scrap.db")

    if year is None:
        year = get_special_date(delta=0, format="%Y%m")

    if fpath is None:
        fpath = 'README_%s.md' % year

    # update pie charts
    main_pie(year)

    # update weixin, github tables
    sources = [
        "weixin", "github_org", "github_private", "medium_xuanwu",
        "medium_secwiki", "zhihu_xuanwu", "zhihu_secwiki"
    ]
    d = {
        "weixin": "微信公众号",
        "github_org": "组织github账号",
        "github_private": "私人github账号"
    }

    for source in sources:
        rets = draw_table(so, top=100, source=source, year=year)
        if rets:
            markdown_rets = markdown_table(rets)
            if markdown_rets:
                tables_rets.append("# %s 推荐" % d.get(source, source))
                for markdown_ret in markdown_rets:
                    tables_rets.append(markdown_ret)
                tables_rets.append(os.linesep)

    with codecs.open(fpath, mode='wb') as fr:
        fr.write('# [数据--所有](README_20.md)')
        fr.write(os.linesep)
        fr.write(
            '# [数据--年度](README_{year_year}.md)'.format(year_year=year[0:4]))
        fr.write(os.linesep)
        fr.write('# %s 信息源与信息类型占比' % year)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息源占比-secwiki](data/img/domain/{year}-信息源占比-secwiki.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息源占比-xuanwu](data/img/domain/{year}-信息源占比-xuanwu.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        # fr.write('![{year}-信息类型占比-secwiki](data/img/tag/{year}-信息类型占比-secwiki.png)'
        #          .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息类型占比-xuanwu](data/img/tag/{year}-信息类型占比-xuanwu.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-最喜欢语言占比](data/img/language/{year}-最喜欢语言占比.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)

        st = os.linesep.join(tables_rets)
        fr.write(st)
        fr.write(os.linesep)
        fr.write(os.linesep)

        fr.write('# 日更新程序')
        fr.write(os.linesep)
        fr.write('`python update_daily.py`')

    return fpath
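# Example usage sketch for the report builders above, assuming the "%Y%m"
# period format they already use; the concrete period is illustrative only.
def rebuild_readmes():
    monthly_path = draw_readme_item(year="202401")  # writes README_202401.md
    draw_readme()                                   # refreshes README.md
    return monthly_path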