示例#1
0
def parse_all(fnames=None, renew=False, proxy=None):
    """
    批量解析页面
    :param fnames:
    :param renew 是否重新解析所有文件
    :return:
    """
    so = SQLiteOper("data/scrap.db")
    if renew:
        fnames = []
        fname_gen = glob.iglob(r'data/secwiki/*.html')
        sql = 'delete from `secwiki_detail`'
        for f in fname_gen:
            fnames.append(f)

        so.execute(sql)

    if fnames is None:
        print "no new secwiki"
        return

    nos = sort_fname(fnames)

    # sqlite handler
    sql = """insert into `secwiki_detail`(`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`path`)
                            values(?,?,?,?,?,?,?);"""

    # file handler

    result_fname = path("data/secwiki_{start}_{end}.txt".format(
        start=nos.keys()[0], end=nos.keys()[-1]))

    if not renew and os.path.isfile(
            result_fname) and os.path.getsize(result_fname) > 0:
        return

    result_fh = codecs.open(result_fname, mode='wb')
    for k in nos.keys():
        fname = nos[k]

        with open(fname, mode='r') as html_hd:
            results_list = {}
            for content in parse_item(html_hd, so=so, proxy=proxy):
                if content:
                    k = content[0] + content[2]

                    results_list[k] = content

                    line = "\t".join(content)
                    print line
                    result_fh.write("{line}{linesep}".format(
                        line=line, linesep=os.linesep))

            so.executemany(sql, operate_list=results_list.values())

    result_fh.close()
示例#2
0
def scraw():
    """
    Run one scrape pass against every configured source (secwiki, xuanwu)
    over the shared sqlite store.

    :return: None
    """
    db = SQLiteOper("data/scrap.db")
    for scraper in (secwiki_scraw, xuanwu_scraw):
        scraper(db, proxy=None, delta=2)
示例#3
0
def main_pie(year):
    """
    Render the pie charts for one period: domain and tag charts for each
    source, then the favourite-language chart.

    :param year: period identifier (year or yyyymm) used in chart names
    :return: None
    """
    oper = SQLiteOper("data/scrap.db")

    combos = [(t, s) for t in ("domain", "tag") for s in ("secwiki", "xuanwu")]
    for t, s in combos:
        draw_pie(oper, source=s, year=str(year), tag=t, top=10)

    # NOTE(review): unlike the calls above, year is not str()-wrapped here —
    # presumably draw_pie tolerates both; confirm against its signature.
    draw_pie(oper, tag="language", top=25, year=year)
示例#4
0
def draw_readme(fpath=None):
    """
    Regenerate the main README: refresh the pie-chart images for the
    current month, then write the chart gallery and the per-source
    recommendation tables.

    :param fpath: output path; defaults to "README.md"
    :return: None
    """
    if fpath is None:
        fpath = "README.md"

    oper = SQLiteOper("data/scrap.db")
    year = get_special_date(delta=0, format="%Y%m")
    # refresh the chart images before embedding them below
    main_pie(year)

    # human-readable section headings per source
    source_labels = {
        "weixin": "微信公众号",
        "github_org": "组织github账号",
        "github_private": "私人github账号"
    }

    table_chunks = []
    for src in ["weixin", "github_org", "github_private"]:
        table_rows = draw_table(oper, top=100, source=src, year=year)
        if not table_rows:
            continue
        md_rows = markdown_table(table_rows)
        if md_rows:
            table_chunks.append("# %s 推荐" % source_labels.get(src, src))
            table_chunks.extend(md_rows)
            table_chunks.append(os.linesep)

    nl = os.linesep
    # assemble the whole document, then write it in one call
    parts = [
        "# [数据年报](README_YEAR.md)", nl,
        '# %s 信息源与信息类型占比' % year, nl,
        '![{year}-信息源占比-secwiki](data/img/domain/{year}-信息源占比-secwiki.png)'
        .format(year=year), nl, nl,
        '![{year}-信息源占比-xuanwu](data/img/domain/{year}-信息源占比-xuanwu.png)'
        .format(year=year), nl, nl,
        '![{year}-信息类型占比-secwiki](data/img/tag/{year}-信息类型占比-secwiki.png)'
        .format(year=year), nl, nl,
        '![{year}-信息类型占比-xuanwu](data/img/tag/{year}-信息类型占比-xuanwu.png)'
        .format(year=year), nl, nl,
        '![{year}-最喜欢语言占比](data/img/language/{year}-最喜欢语言占比.png)'
        .format(year=year), nl, nl,
        nl.join(table_chunks), nl, nl,
        '# 日更新程序', nl,
        '`python update_daily.py`',
    ]
    with codecs.open(fpath, mode='wb') as fr:
        fr.write("".join(parts))
0
                                    # NOTE(review): fragment of a larger URL
                                    # dispatch — the opening if-branch and the
                                    # enclosing loop are outside this view.
                                    elif url.find("weixin.qq.com") != -1:
                                        # WeChat public-account link
                                        d = get_weixin_info(url, ts, tag)

                                        if d:
                                            sql = d2sql(d, table="weixin")
                                    elif url.find("//github.com") != -1:
                                        # GitHub link (account or repository)
                                        d = get_github_info(url,
                                                            title,
                                                            ts=ts,
                                                            tag=tag)

                                        if d:
                                            sql = d2sql(d, table='github')

                                    # Persist whichever insert was built above.
                                    # Failures are logged, not re-raised, so a
                                    # single bad row does not stop the run.
                                    if sql:
                                        try:
                                            #print sql
                                            so.execute(sql)
                                        except Exception as e:
                                            logging.error("[sql]: %s %s" %
                                                          (sql, str(e)))

if __name__ == "__main__":
    """
    Entry point: run one scrape pass against the local sqlite store.
    """
    scraw(SQLiteOper("data/scrap.db"), proxy=None)
示例#6
0
def parse_all(renew=False, ndays=None, proxy=None):
    """
    解析多个页面
    :return:
    """
    so = SQLiteOper("data/scrap.db")

    # 解析或爬取缺失的页面
    fname_lists = []
    if ndays is not None:

        for cur_day in ndays:
            year = cur_day[0:4]
            month = cur_day[4:6]
            day = cur_day[6:8]
            fname = path("data/xuanwu/{year}/{month}/{day}/index.html".format(
                year=year, month=month, day=day))

            if not os.path.exists(fname):

                fname = scrap_item(cur_day)
                if fname is None:
                    print "%s news not exits" % cur_day

                else:
                    fname_lists.append(fname)

    if renew:
        fname_lists = []
        # 重新解析所有页面
        sql = 'delete from `xuanwu_detail`'
        so.execute(sql)
        for fname in glob.iglob(r'data/xuanwu/*/*/*/index.html'):
            fname_lists.append(fname)

    if fname_lists:
        start, end = getstartendfrompath(fname_lists)
        sql = """
                    insert into `xuanwu_detail`(`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`path`,`author_id`)
                        values(?,?,?,?,?,?,?,?);
                    """
        # file handler
        result_fname = path("data/xuanwu_{start}_{end}.txt".format(start=start,
                                                                   end=end))

        if not renew and os.path.isfile(
                result_fname) and os.path.getsize(result_fname) > 0:
            return

        result_fh = codecs.open(result_fname, mode='wb')

        for fname in fname_lists:

            fname = path(fname)

            results_list = {}
            for content in parse_item(fname, so=so, proxy=proxy):
                if content:
                    k = content[0] + content[2]

                    results_list[k] = content
                    line = "\t".join(content)
                    print line
                    result_fh.write("{line}{linesep}".format(
                        line=line, linesep=os.linesep))

            if results_list:
                so.executemany(sql, operate_list=results_list.values())
示例#7
0
def draw_readme_item(year=None, fpath=None):
    """
    Regenerate the per-period README (README_<yyyymm>.md by default):
    refresh the pie-chart images, then write the chart gallery and the
    per-source recommendation tables for that period.

    :param year: period identifier (yyyymm); defaults to the current month
    :param fpath: output path; defaults to 'README_<year>.md'
    :return: the path of the file that was written
    """
    oper = SQLiteOper("data/scrap.db")
    if year is None:
        year = get_special_date(delta=0, format="%Y%m")

    if fpath is None:
        fpath = 'README_%s.md' % year
    # refresh the chart images before embedding them below
    main_pie(year)

    # human-readable section headings per source; sources without an entry
    # fall back to their raw name
    source_labels = {
        "weixin": "微信公众号",
        "github_org": "组织github账号",
        "github_private": "私人github账号"
    }

    table_chunks = []
    for src in [
            "weixin", "github_org", "github_private", "medium_xuanwu",
            "medium_secwiki", "zhihu_xuanwu", "zhihu_secwiki"
    ]:
        table_rows = draw_table(oper, top=100, source=src, year=year)
        if not table_rows:
            continue
        md_rows = markdown_table(table_rows)
        if md_rows:
            table_chunks.append("# %s 推荐" % source_labels.get(src, src))
            table_chunks.extend(md_rows)
            table_chunks.append(os.linesep)

    nl = os.linesep
    # assemble the whole document, then write it in one call
    parts = [
        '# [数据--所有](README_20.md)', nl,
        '# [数据--年度](README_{year_year}.md)'.format(year_year=year[0:4]), nl,
        '# %s 信息源与信息类型占比' % year, nl,
        '![{year}-信息源占比-secwiki](data/img/domain/{year}-信息源占比-secwiki.png)'
        .format(year=year), nl, nl,
        '![{year}-信息源占比-xuanwu](data/img/domain/{year}-信息源占比-xuanwu.png)'
        .format(year=year), nl, nl,
        # the secwiki tag chart is deliberately omitted, but its surrounding
        # blank lines were still emitted — keep them for identical output
        nl, nl,
        '![{year}-信息类型占比-xuanwu](data/img/tag/{year}-信息类型占比-xuanwu.png)'
        .format(year=year), nl, nl,
        '![{year}-最喜欢语言占比](data/img/language/{year}-最喜欢语言占比.png)'
        .format(year=year), nl, nl,
        nl.join(table_chunks), nl, nl,
        '# 日更新程序', nl,
        '`python update_daily.py`',
    ]
    with codecs.open(fpath, mode='wb') as fr:
        fr.write("".join(parts))
    return fpath