예제 #1
0
def parseJuejinInfo():
    """Save article titles and links from a Juejin tag page to an Excel file.

    Source page:
    https://juejin.im/tag/%E6%8E%98%E9%87%91%E7%BF%BB%E8%AF%91%E8%AE%A1%E5%88%92

    The page is loaded with ``getEtreeHTML``, titles and links are pulled out
    with two XPath queries through ``parseEtreeHTML``, and both lists are
    appended below a two-column header row in ``Juejin_articles.xls`` in the
    current working directory.

    Depends on ``opeExcel``, ``getEtreeHTML``, ``parseEtreeHTML`` and
    ``visitList``, all defined outside this function. Takes no arguments and
    returns ``None``.
    """
    # Page to scrape and the XPath expressions selecting title text / hrefs.
    page_url = "https://juejin.im/tag/%E6%8E%98%E9%87%91%E7%BF%BB%E8%AF%91%E8%AE%A1%E5%88%92"
    title_xpath = "//div[@class='info-row title-row']/a/text()"
    link_xpath = "//div[@class='info-row title-row']/a/@href"

    # Excel file path. os.path.join replaces the former "\J..." literal, which
    # relied on an invalid escape sequence and on a Windows-only separator.
    file_name = os.path.join(os.getcwd(), "Juejin_articles.xls")
    # Passed to opeExcel alongside the file name (presumably the sheet name).
    author_name = "Juejin"

    # Header row written before any article data.
    headerData = [
        ["文章标题", "文章链接"],
    ]
    opeExcel.create_excel_sheet(file_name, author_name)
    opeExcel.write_excel_xls_append(file_name, author_name, headerData)

    # Load the page (presumably into an lxml etree object; confirm in helper).
    page_html = getEtreeHTML(page_url)
    # Article titles and article links, queried separately.
    title_list = parseEtreeHTML(page_html, title_xpath)
    link_list = parseEtreeHTML(page_html, link_xpath)

    # Print both counts so a mismatch between titles and links is visible.
    print(len(title_list))
    print(len(link_list))
    visitList(title_list)

    # Persist the two columns to the Excel sheet.
    opeExcel.write_excel_xls_append_2(file_name, author_name, title_list,
                                      link_list)
예제 #2
0
def parseGityuanHTML():
    """Save article titles and links from the Gityuan archive to an Excel file.

    Source page: http://gityuan.com/archive/

    The archive page is loaded with ``getEtreeHTML``, titles and links are
    extracted with XPath through ``parseEtreeHTML``, the site prefix is added
    to every link, and the result is appended below a two-column header row in
    ``Gityuan_articles.xls`` in the current working directory.

    Depends on ``opeExcel``, ``getEtreeHTML`` and ``parseEtreeHTML``, all
    defined outside this function. Takes no arguments and returns ``None``.
    """
    # Page to scrape and the XPath expressions selecting title text / hrefs.
    page_url = "http://gityuan.com/archive/"
    title_xpath = "//div[@class='post-preview']/a/text()"
    link_xpath = "//div[@class='post-preview']/a/@href"

    # Excel file path. os.path.join replaces the former "\G..." literal, which
    # relied on an invalid escape sequence and on a Windows-only separator.
    file_name = os.path.join(os.getcwd(), "Gityuan_articles.xls")
    # Passed to opeExcel alongside the file name (presumably the sheet name).
    author_name = "Gityuan"

    # Header row written before any article data.
    headerData = [
        ["文章标题", "文章链接"],
    ]
    opeExcel.create_excel_sheet(file_name, author_name)
    opeExcel.write_excel_xls_append(file_name, author_name, headerData)

    # Load the page (presumably into an lxml etree object; confirm in helper).
    page_html = getEtreeHTML(page_url)
    # Article titles and article links, queried separately.
    title_list = parseEtreeHTML(page_html, title_xpath)
    link_list = parseEtreeHTML(page_html, link_xpath)

    # Per the original author's note, the site returns hrefs without the
    # "http://gityuan.com" prefix, so it is prepended to every link.
    link_list = ["http://gityuan.com" + link for link in link_list]

    # Persist the two columns to the Excel sheet.
    opeExcel.write_excel_xls_append_2(file_name, author_name, title_list,
                                      link_list)
예제 #3
0
def parseLightMoon():
    """Save article titles and links from the LightMoon index to an Excel file.

    Source page:
    http://light3moon.com/1986/12/20/%E6%96%87%E7%AB%A0%E7%B4%A2%E5%BC%95/

    The index page is loaded with ``getEtreeHTML``, titles and links are
    extracted with XPath through ``parseEtreeHTML``, and both lists are
    appended below a two-column header row in ``LightMoon_articles.xls`` in
    the current working directory.

    Depends on ``opeExcel``, ``getEtreeHTML`` and ``parseEtreeHTML``, all
    defined outside this function. Takes no arguments and returns ``None``.
    """
    # Page to scrape and the XPath expressions selecting title text / hrefs.
    page_url = "http://light3moon.com/1986/12/20/%E6%96%87%E7%AB%A0%E7%B4%A2%E5%BC%95/"
    title_xpath = "//div[@class='article-content']/p/a/text()"
    link_xpath = "//div[@class='article-content']/p/a/@href"

    # Excel file path. os.path.join replaces the former "\L..." literal, which
    # relied on an invalid escape sequence and on a Windows-only separator.
    file_name = os.path.join(os.getcwd(), "LightMoon_articles.xls")
    # Passed to opeExcel alongside the file name (presumably the sheet name).
    author_name = "LightMoon"

    # Header row written before any article data.
    headerData = [
        ["文章标题", "文章链接"],
    ]
    opeExcel.create_excel_sheet(file_name, author_name)
    opeExcel.write_excel_xls_append(file_name, author_name, headerData)

    # Load the page (presumably into an lxml etree object; confirm in helper).
    page_html = getEtreeHTML(page_url)
    # Article titles and article links, queried separately.
    title_list = parseEtreeHTML(page_html, title_xpath)
    link_list = parseEtreeHTML(page_html, link_xpath)

    # Persist the two columns to the Excel sheet.
    opeExcel.write_excel_xls_append_2(file_name, author_name, title_list,
                                      link_list)
예제 #4
0
def getCnblogsInfo():
    """Save titles and links of every post on the cnblogs "Jax" blog to Excel.

    List pages ``https://www.cnblogs.com/Jax/default.html?page=N`` are read
    for N = 1, 2, ... until a page yields no titles or ``page_num`` pages have
    been read. Each page's titles and links are appended to
    ``cnblogs_articles.xls`` in the current working directory, below a header
    row. When the first empty page is reached, the total article count is
    printed.

    Depends on ``opeExcel``, ``getEtreeHTML`` and ``parseEtreeHTML``, all
    defined outside this function. Takes no arguments and returns ``None``.
    """
    title_xpath = "//a[@class='postTitle2 vertical-middle']/span/text()"
    link_xpath = "//a[@class='postTitle2 vertical-middle']/@href"
    date_xpath = "//div[@class='dayTitle']/a/text()"

    # Header data for the Excel file, i.e. its first row.
    headerData = [
        ["文章标题", "文章链接", "发布日期"],
    ]
    # Blogger name; passed to opeExcel alongside the file name.
    author_name = "baojianqiang"
    # Upper bound on the number of pages; the loop normally stops earlier, at
    # the first page without titles.
    page_num = 999999

    # Excel file path. os.path.join replaces the former "\c..." literal, which
    # relied on an invalid escape sequence and on a Windows-only separator.
    file_name = os.path.join(os.getcwd(), "cnblogs_articles.xls")

    opeExcel.create_excel_sheet(file_name, author_name)
    opeExcel.write_excel_xls_append(file_name, author_name, headerData)

    article_total = 0  # running count of articles written so far
    for page_index in range(1, page_num + 1):
        # Build the URL of the current list page.
        page_url = ("https://www.cnblogs.com/Jax/default.html?page="
                    + str(page_index))
        page_html = getEtreeHTML(page_url)
        title_list = parseEtreeHTML(page_html, title_xpath)
        if len(title_list) == 0:
            # An empty page marks the end of the blog.
            print(author_name + "文章获取完毕,共计文章数目:" + str(article_total))
            break
        link_list = parseEtreeHTML(page_html, link_xpath)
        # Original author's note: this blog has fewer date entries than
        # titles, so dates are only counted for diagnostics and just the
        # titles and links are saved.
        date_list = parseEtreeHTML(page_html, date_xpath)
        print("title_list: " + str(len(title_list)))
        print("date_list: " + str(len(date_list)))
        opeExcel.write_excel_xls_append_2(file_name, author_name, title_list,
                                          link_list)
        article_total += len(title_list)
예제 #5
0
def getCSDNAuthorInfo():
    """Interactively save a CSDN blogger's article list to an Excel file.

    Prompts for the blogger name and for the number of sheet columns
    (2, 3 or 6). List pages
    ``https://blog.csdn.net/<author>/article/list/N`` are read for
    N = 1, 2, ... until a page yields no titles. All collected fields are
    then written in one call to ``CSDN_articles.xls`` in the current working
    directory, and the total article count is printed.

    Column layouts:
        2 -> title, link
        3 -> title, link, publish date
        6 -> type, title, link, publish date, reader count, comment count

    Any other column count, including non-numeric input, prints a message and
    returns without creating the file.

    Depends on ``opeExcel`` and ``opeHTML``, both defined outside this
    function. Takes no arguments and returns ``None``.
    """
    # XPath expressions for a CSDN blog list page, for example
    # https://blog.csdn.net/luoshengyang/article/list/1
    type_xpath = "//div[@class='article-item-box csdn-tracking-statistics']/h4/a/span/text()"
    title_xpath = "//div[@class='article-item-box csdn-tracking-statistics']/h4/a/text()"
    link_xpath = "//div[@class='article-item-box csdn-tracking-statistics']/h4/a/@href"
    publishDate_xpath = "//div[@class='info-box d-flex align-content-center']/p/span[@class='date']/text()"
    readerCount_xpath = "//div[@class='info-box d-flex align-content-center']/p//span[last()-1][@class='read-num']/text()"
    commentCount_xpath = "//div[@class='info-box d-flex align-content-center']/p//span[last()][@class='read-num']/text()"

    # Header row for each supported column count.
    header_rows = {
        2: ["文章标题", "文章链接"],
        3: ["文章标题", "文章链接", "发表日期"],
        6: ["文章类型", "文章标题", "文章链接", "发表日期", "阅读数", "评论数"],
    }

    # Blogger name; used in the URL and passed to opeExcel.
    author_name = input("请输入博主的名字: ")
    # Number of sheet columns. Non-numeric input is handled like any other
    # unsupported column count instead of raising ValueError.
    try:
        col_num = int(input("请输入Sheet列数: "))
    except ValueError:
        col_num = None
    if col_num not in header_rows:
        print("列数不对,函数返回!")
        return
    print(str(col_num) + "列")
    headerData = [header_rows[col_num]]

    # Upper bound on the number of pages; the loop normally stops earlier, at
    # the first page without titles.
    page_num = 999999

    # Excel file path. os.path.join replaces the former "\C..." literal, which
    # relied on an invalid escape sequence and on a Windows-only separator.
    file_name = os.path.join(os.getcwd(), "CSDN_articles.xls")

    opeExcel.create_excel_sheet(file_name, author_name)
    opeExcel.write_excel_xls_append(file_name, author_name, headerData)

    # Accumulators holding every page's results.
    title_sum = []
    link_sum = []
    publishDate_sum = []
    type_sum = []
    readerCount_sum = []
    commentCount_sum = []
    for page_index in range(1, page_num + 1):
        # Build the URL of the current list page.
        page_url = ("https://blog.csdn.net/" + author_name + "/article/list/"
                    + str(page_index))
        page_html = opeHTML.getEtreeHTML(page_url)
        # Article titles; an empty result marks the end of the blog.
        title_list = opeHTML.parseEtreeHTML(page_html, title_xpath)
        if len(title_list) == 0:
            break
        # Collect the remaining fields of this page into the accumulators.
        title_sum.extend(title_list)
        link_sum.extend(opeHTML.parseEtreeHTML(page_html, link_xpath))
        publishDate_sum.extend(
            opeHTML.parseEtreeHTML(page_html, publishDate_xpath))
        type_sum.extend(opeHTML.parseEtreeHTML(page_html, type_xpath))
        readerCount_sum.extend(
            opeHTML.parseEtreeHTML(page_html, readerCount_xpath))
        commentCount_sum.extend(
            opeHTML.parseEtreeHTML(page_html, commentCount_xpath))

    # Write the data using the writer that matches the chosen column count.
    if title_sum:
        if col_num == 2:
            print("2列")
            opeExcel.write_excel_xls_append_2(file_name, author_name,
                                              title_sum, link_sum)
        elif col_num == 3:
            opeExcel.write_excel_xls_append_3(file_name, author_name,
                                              title_sum, link_sum,
                                              publishDate_sum)
        elif col_num == 6:
            opeExcel.write_excel_xls_append_6(file_name, author_name, type_sum,
                                              title_sum, link_sum,
                                              publishDate_sum, readerCount_sum,
                                              commentCount_sum)
    # Once everything is stored, print the total number of articles.
    print(author_name + "文章获取完毕,共计文章数目:" + str(len(title_sum)))