示例#1
0
def getGpListNews(type, url):
    """Scrape the stock-news listing at *url* and persist every entry.

    type: category label stored on each scraped item.
    url:  listing page whose <div class="repeatList"> holds <ul><li> rows.
    """
    htmlContent = etree.HTML(UrlUtil.parse_url(url))
    # xpath() always returns a list (never None), so the original `!= None`
    # guards were dead; hoist the query instead of evaluating it four times.
    repeat = htmlContent.xpath(".//div[@class='repeatList']")
    if not repeat:
        return
    content = repeat[0].xpath('.//ul/li')
    orglists = []
    for div in content:
        org = {}
        # Link to the article's detail page.
        org["href"] = str(
            div.xpath('.//p[@class="title"]')[0].xpath('.//a/@href')[0])
        # Thumbnail is optional; its src is protocol-relative when present.
        icons = div.xpath('.//div/a/img/@src')
        org["icon"] = 'http:' + str(icons[0]) if icons else ""
        org["title"] = div.xpath('.//p[@class="title"]')[0].xpath('.//a')[0].text
        org["desc"] = div.xpath('.//p[@class="info"]')[0].text
        org["time"] = div.xpath('.//p[@class="time"]')[0].text
        org["type"] = type
        orglists.append(org)

    for org in orglists:
        getDetailInfo(org)
示例#2
0
def getTag():
    """Scrape the tag cloud and store each tag's display text."""
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(baseUrl))
    tagContent = htmlContent.xpath('.//div[@class="d_tags"]/a')

    for tag in tagContent:
        # NOTE(review): the original also computed `url = baseUrl + href`
        # but never used it; only the tag text is persisted.
        QklDbUtli.insertTag(tag.text)
示例#3
0
def getTagNews():
    """Walk every link in the tag cloud and scrape its news listing."""
    page = etree.HTML(UrlUtil.parse_url_get_proxy(baseUrl))

    for anchor in page.xpath('.//div[@class="d_tags"]/a'):
        target = baseUrl + str(anchor.xpath('./@href')[0])
        getNewsListByType(target, anchor.text)
示例#4
0
def getNewsDetail(news):
    """Fetch the full article for *news* and insert the record into the DB.

    news: dict carrying at least 'newsId' and 'newsTitle'; it is enriched in
    place with detail text, author name, watch count and time.
    """
    time.sleep(2)  # throttle so the target site is not hammered
    url = newsUrl % (news['newsId'])
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(url))

    # xpath() returns a (possibly empty) list, never None, so the original
    # `!= None` test on "content-wrap" was dead; only the emptiness check
    # on "content" decides anything.
    if len(htmlContent.xpath('.//div[@class="content"]')) > 0:
        news['newsDetail'] = str(
            htmlContent.xpath("string(.//article[@class='article-content'])"))
        # Author name is the second <span> under the meta bar.
        news['authorName'] = htmlContent.xpath(
            './/div[@class="meta"]/span')[1].text
        news['authorDesc'] = ""
        # View count: keep only the digits of the "muted" span's repr.
        news['newsWatch'] = "".join(
            filter(str.isdigit,
                   str(htmlContent.xpath('.//span[@class="muted"]/text()'))))
        news['newsTime'] = htmlContent.xpath(
            './/div[@class="meta"]/time')[0].text
        QklDbUtli.insertQklNews(news)
        print(news['newsTitle'])
示例#5
0
def getNewsList24():
    """Pull the 7x24 flash-news JSON feed and process every entry."""
    content = json.loads(UrlUtil.parse_url_get_proxy(news24Url))
    listNews = []

    # Iterate the feed directly — the original bound it to a local named
    # `list`, shadowing the builtin; its `newsType` accumulator was unused.
    for data in content['list']:
        news = {
            'newsId': str(data['flash_id']),
            'newsTitle': data['title'],
            'newsDesc': data['brief'],
            'newsType': '24小时',
            'newsTime': str(data['add_time']),
            'newsSupport': str(data['rise']),
            'newsSupportNo': str(data['fall']),
        }
        listNews.append(news)

    # Fetch the detail for each collected entry.
    for news in listNews:
        getNews7x24Detail(news)
示例#6
0
def getAuthorNewsDetails(author):
    """Scrape the article body for one of an author's news items and persist it."""
    page = etree.HTML(UrlUtil.parse_url_get_proxy(author['href']))
    if len(page.xpath('.//div[@class="content"]')) > 0:
        body = page.xpath("string(.//article[@class='article-content'])")
        author['newsDetail'] = str(body).strip()
        QklDbUtli.insertQklAuthorsNews(author)
        print("标题:" + author['newsTitle'])
示例#7
0
def getDetailInfo(org):
    """Fetch the article body for *org* and write the record to the DB.

    org: dict with 'href', 'title', 'icon', 'type', 'desc', 'time' keys.
    """
    time.sleep(2)  # throttle requests
    href = org["href"]
    detailContent = etree.HTML(UrlUtil.parse_url(href))
    # xpath() never returns None; the original `!= None` guard was dead and
    # the same query ran twice — hoist it and keep only the length check.
    body_divs = detailContent.xpath('.//div[@class="Body"]')
    if len(body_divs) > 0:
        content = str(detailContent.xpath("string(.//div[@class='Body'])"))
        GpDbUtli.insertGpNews(org["title"].strip(), org["icon"].strip(),
                              org["type"].strip(), org["desc"].strip(),
                              content.strip(), org["time"].strip())
示例#8
0
def getNewsTagDetail(news):
    """Download the full article text for *news* and store the record."""
    time.sleep(2)
    detail_url = newsUrl % (news['newsId'])
    page = etree.HTML(UrlUtil.parse_url_get_proxy(detail_url))

    if page.xpath('.//div[@class="content"]'):
        article_text = page.xpath(
            "string(.//article[@class='article-content'])")
        news['newsDetail'] = str(article_text)
        QklDbUtli.insertQklNews(news)
        print(news['newsTitle'])
示例#9
0
def getNewsListByType(url, tag):
    """Scrape up to 13 article summaries from *url* and fetch each detail.

    url: listing page containing <article class="excerpt"> entries.
    tag: category label stored as each item's newsType.
    """
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(url))

    listContent = htmlContent.xpath('.//article[@class="excerpt"]')
    listNews = []

    for data in listContent:
        news = {}
        # Article id: the digits embedded in the first link's href.
        news['newsId'] = "".join(
            filter(str.isdigit, str(data.xpath('.//a/@href')[0])))
        news['authorId'] = ""
        # Author name is the first "muted" span.
        news['authorName'] = data.xpath('.//span[@class="muted"]')[0].text
        news['authorDesc'] = ""
        news['authorIcon'] = ""
        news['newsTitle'] = data.xpath('./header/h2/a')[0].text
        # BUG FIX: xpath() returns a list, never None, so the original
        # `is None` branch was dead and a missing <p> raised IndexError.
        paragraphs = data.xpath('./p')
        news['newsDesc'] = paragraphs[0].text if paragraphs else ''
        # Thumbnail is optional; hoist the query instead of running it twice.
        imgs = data.xpath('./div/a/img/@src')
        if len(imgs) == 0 or imgs[0] is None:
            news['newsIcon'] = ""
        else:
            news['newsIcon'] = str(imgs[0])
        news['newsType'] = tag
        news['newsWatch'] = str(
            data.xpath('.//span[@class="muted none"]/text()')[0])
        # Time is the second "muted" span.
        news['newsTime'] = str(data.xpath('.//span[@class="muted"]')[1].text)
        # Cap the batch at 13 entries.
        if len(listNews) < 13:
            listNews.append(news)

    for news in listNews:
        getNewsTagDetail(news)
示例#10
0
def getAuthorDetail(author):
    """Scrape an author's profile page: stats plus up to 16 recent articles.

    author: dict with 'authorId', 'authorName', 'authorDesc', 'authorIcon',
    'authorSupport'; fan/article counts are added in place, then every
    collected article is fetched and persisted via getAuthorNewsDetails.
    """
    time.sleep(2)  # throttle requests
    url = newsAuthorDetailUrl % (author['authorId'])

    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(url))

    articles = htmlContent.xpath('.//article[@class="excerpt"]')
    if len(articles) > 0:
        authorNews = []
        # Profile stats live in <ul class="data">; evaluate the query once
        # (the original ran it twice). span[4] = fans, span[0] = article count.
        stats = htmlContent.xpath('.//ul[@class="data"]/li/span')
        author['authorFuns'] = stats[4].text
        author['authorNews'] = stats[0].text
        for news in articles:
            if len(authorNews) < 16:  # cap at 16 articles
                authornew = {
                    'authorFuns': author['authorFuns'],
                    'authorNews': author['authorNews'],
                    'authorId': author['authorId'],
                    # BUG FIX: the original assigned 'authorName' twice.
                    'authorName': author['authorName'],
                    'authorDesc': author['authorDesc'],
                    'authorIcon': author['authorIcon'],
                    'authorSupport': author['authorSupport'],
                }
                authornew['newsTitle'] = str(
                    news.xpath('./header/h2/a/@title')[0])
                desc = news.xpath('./p')[0].text
                authornew['newsDesc'] = '' if desc is None else desc
                authornew['newsIcon'] = str(news.xpath('./div/a/img/@src')[0])
                authornew['newsType'] = news.xpath('./header/a')[0].text
                # Evaluate the detail href once and reuse it for both fields.
                detail_href = str(news.xpath('./div/a/@href')[0])
                authornew['href'] = "https://www.55coin.com" + detail_href
                authornew['newsId'] = "".join(
                    filter(str.isdigit, detail_href))
                authornew['newsWatch'] = str(
                    news.xpath('.//span[@class="muted none"]/text()')[0])
                authornew['newsTime'] = str(
                    news.xpath('.//span[@class="muted"]/text()')[1])
                authorNews.append(authornew)

        for news in authorNews:
            print("")
            getAuthorNewsDetails(news)
示例#11
0
def getNewsType():
    """Scrape the category tabs from the home page and scrape each tab's list."""
    htmlContent = etree.HTML(UrlUtil.parse_url(baseUrl))
    typs = []

    # NOTE(review): the original defined an unused `base` URL constant here;
    # it was dropped. The loop variable no longer shadows the builtin `type`.
    header = htmlContent.xpath('.//ul[@id="daodu_header"]')
    if len(header) > 0:
        for li in header[0]:
            entry = {
                "type": li.text,
                "href": str(li.xpath('./@data-href')[0]),
            }
            typs.append(entry)

        for entry in typs:
            getGpListNews(entry["type"], entry["href"])
示例#12
0
def getNewsListByType(url):
    """Parse the JSON article feed at *url* and return up to 13 news dicts.

    url: endpoint returning JSON with a top-level 'list' of article objects.
    Returns a list of normalized news dicts (at most 13 entries).
    """
    payload = json.loads(UrlUtil.parse_url_get_proxy(url))
    listNews = []

    # Iterate the feed directly — the original bound it to a local named
    # `list`, shadowing the builtin; its `newsType` accumulator was built
    # but never read, so it was dropped.
    for data in payload['list']:
        news = {
            'newsId': str(data['article_id']),
            'authorId': str(data['editor_id']),
            'authorName': data['author_name'],
            'newsTitle': data['title'],
            'newsDesc': data['brief'],
            'newsIcon': data['rectangle_img'],
            'newsType': data['cat_name'],
            'newsWatch': str(data['show_total']),
            'newsTime': str(data['add_time']),
        }
        # Cap the result at 13 entries.
        if len(listNews) < 13:
            listNews.append(news)

    return listNews
示例#13
0
def getNewsType(types):
    """Scrape the picture-box headline list and fetch each item's detail.

    types: category label attached to every scraped entry.
    """
    htmlContent = etree.HTML(UrlUtil.parse_url(baseUrl))
    typs = []

    # NOTE(review): the original defined an unused `base` URL constant here;
    # it was dropped. The dict variable no longer shadows the builtin `type`.
    boxes = htmlContent.xpath('.//div[@id="box_pic"]')
    if len(boxes) > 0:
        for li in boxes[0].xpath('.//ul/li'):
            entry = {
                "title": str(li.xpath('./a/@title')[0]),
                "href": str(li.xpath('./a/@href')[0]),
                "type": types,
                "icon": str(li.xpath('./a/img/@src')[0]),
            }
            typs.append(entry)

        for entry in typs:
            getTopDetailInfo(entry)
示例#14
0
def getAuthorList():
    """Scrape the author ranking list and process every author found."""
    page = etree.HTML(UrlUtil.parse_url_get_proxy(newsAuthorListUrl))

    rank_lists = page.xpath('.//ul[@id="column_rank"]')
    if len(rank_lists) > 0:
        authors = []
        for item in rank_lists[0]:
            # Description span may carry no text; normalize None to "".
            desc = item.xpath('./a/div/span')[0].text
            record = {
                'authorId': str(item.xpath('./a/@user_id')[0]),
                'authorName': item.xpath('./a/div/strong')[0].text,
                'authorDesc': "" if desc is None else desc,
                'authorIcon': str(item.xpath('./a/img/@src')[0]),
                'authorSupport': str(item.xpath('./div')[0].text),
            }
            authors.append(record)

        for author in authors:
            getAuthorDetail(author)
示例#15
0
def getHomeInfo():
    """Scrape the hot-article list from the home page and fetch each detail."""
    htmlContent = etree.HTML(UrlUtil.parse_url_get_proxy(baseUrl))

    hotContent = htmlContent.xpath(".//ul[@class='article-list']")[0]
    hotList = []
    # Iterate the element directly; wrapping it in list() made a needless copy.
    for hotNews in hotContent:
        news = {}
        # Article id: the digits embedded in the entry's link href.
        news['newsId'] = "".join(
            filter(str.isdigit, str(hotNews.xpath('./a/@href')[0])))
        news['authorId'] = ""
        news['authorName'] = ""
        news['authorDesc'] = ""
        # Title: prefer the dedicated "tit" <div>; fall back to the @title attr.
        tit = hotNews.xpath('.//div[@class="tit"]')
        if len(tit) > 0:
            news['newsTitle'] = tit[0].text
        else:
            news['newsTitle'] = str(hotNews.xpath('.//div/a/@title')[0])
        news['newsDesc'] = ""
        news['newsIcon'] = str(hotNews.xpath('.//img/@src')[0])
        news['newsType'] = "hot"
        news['newsWatch'] = ""
        news['newsTime'] = ""
        news['newsDetail'] = ""
        hotList.append(news)

    for news in hotList:
        getNewsDetail(news)