Example #1
    def crawler_list(self, url, conf, type_p='rp', charset='utf8', row=None):
        # Requires: from lxml import html; from urllib import parse;
        #           from common.HtmlSource import HtmlSource
        row = row if row is not None else {}  # avoid a shared mutable default argument
        htmlSource = HtmlSource()
        # Fetch the raw page, retrying up to two more times if the body looks empty
        html_context = htmlSource.get_html(url_p=url, type_p=type_p, charset_p=charset)
        index = 0
        while len(html_context) < 128 and index < 2:
            html_context = htmlSource.get_html(url_p=url, type_p=type_p, charset_p=charset)
            index += 1
        if len(html_context) < 128:
            raise Exception(1001, 'Page fetch failed: no content!')
        # Parse the page and extract the configured item group
        tree = html.fromstring(html_context)
        result_list = tree.xpath(conf['group'])
        result_list_context = self._analysis_list(list=result_list, columns=conf['columns'], url=url, row=row)
        # Resolve the next-page link, if the rule defines one
        if 'nextPage' in conf:
            next_page = tree.xpath(conf['nextPage'])
            if len(next_page) > 0:
                return result_list_context, parse.urljoin(url, next_page[0])
        return result_list_context, None
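For context, here is a minimal sketch of how a list rule might drive crawler_list across pages. The conf keys ('group', 'columns', 'nextPage') follow the method above, but the concrete XPath values and the Rule import path are assumptions for illustration, not taken from the source.

# Hypothetical driver for crawler_list; XPath values are placeholders.
from common.Rule import Rule

rule = Rule()
conf = {
    'group': '//ul[@class="news-list"]/li',   # assumed: one node per list item
    'columns': [],                            # fill in column rules in the format _analysis_list expects
    'nextPage': '//a[@class="next"]/@href',   # assumed pagination link
}
url = 'https://example.com/news'
while url:
    rows, url = rule.crawler_list(url=url, conf=conf, type_p='rg')
    print(rows)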
Example #2
    def openUrl(self):
        self.textBrowser.setVisible(True)
        self.tableView.setVisible(False)

        self.conf["url"] = self.urlLineEdit.text()
        # To render the page in the embedded browser instead, use:
        # self.webEngineView.setUrl(QtCore.QUrl(self.conf["url"]))
        htmlSource = HtmlSource()
        self.html_context = htmlSource.get_html(url_p=self.conf["url"], type_p='rg')
        self.textBrowser.setText(self.html_context)
        # Pre-fill the list-item and next-page XPath rules
        self.lineEdit.setText('//div[@class="padd w645"]/div[@class="list_left"]/div[@class="topic-list"]/ul/li')
        self.lineEdit_4.setText(
            '//div[@class="padd w645"]/div[@class="list_left"]/div[@class="show-page"]/a[@class="next"]/@href')
Example #3
    def crawler(self, nextPageUrl='', times=1):
        # Requires: from PyQt5.QtGui import QStandardItemModel, QStandardItem;
        #           from lxml import html; import time; from common.Rule import Rule
        if times > 5:  # recursion guard: follow at most five pages
            return
        times = times + 1
        # Table header labels taken from the column rules
        title = []
        for column in self.conf['columns']:
            title.append(column['名称'])
        # TODO: render the table header
        print(title)

        self.textBrowser.setVisible(True)
        self.tableView.setVisible(False)

        htmlSource = HtmlSource()
        html_context = htmlSource.get_html(url_p=nextPageUrl, type_p='rg')
        self.textBrowser.setText(html_context)
        time.sleep(1)  # note: sleeping blocks the GUI thread; acceptable for a demo only
        tree = html.fromstring(html_context)
        result_list = tree.xpath(self.conf['body_content_xpath'])
        if len(result_list) > 0:
            # Extract the table rows with the parsing rules
            rule = Rule()
            rows = rule._analysis_list(list=result_list, columns=self.conf["columns"])
            # TODO: persist the data; for now just print it
            print(rows)

            # Build the model: one row per result, one column per rule
            model = QStandardItemModel(len(rows), len(self.conf['columns']))
            # Set the horizontal header labels
            model.setHorizontalHeaderLabels(title)

            for row in range(len(rows)):
                for column in range(len(title)):
                    item = QStandardItem(rows[row][title[column]])
                    # Set the text value at each cell
                    model.setItem(row, column, item)
            # Attach the model to the table view and show it
            self.tableView.setModel(model)
            self.textBrowser.setVisible(False)
            self.tableView.setVisible(True)
            time.sleep(1)
        # Next-page URL
        nextpageurl = tree.xpath(self.conf['nextpage'])
        # Recurse into the next page
        if len(nextpageurl) > 0:
            print(nextpageurl[0])
            self.crawler(nextPageUrl=nextpageurl[0], times=times)
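One caveat with the recursive crawler above: time.sleep blocks the Qt event loop, so the window freezes between pages. A minimal sketch of the usual alternative, assuming the same class, is to let the event loop schedule the next page instead (QTimer.singleShot is standard PyQt5 API; crawl_next is a hypothetical helper, not in the original):

from PyQt5.QtCore import QTimer

# Inside the same class: defer the next page by one second without
# blocking the event loop.
def crawl_next(self, nextpageurl, times):
    QTimer.singleShot(1000, lambda: self.crawler(nextPageUrl=nextpageurl, times=times))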
Example #4
    def crawler_detail(self, conf, url='', type_p='rp', charset='utf8', row=None):
        # Requires: from lxml import html; from common.HtmlSource import HtmlSource
        row = row if row is not None else {}  # avoid a shared mutable default argument
        htmlSource = HtmlSource()
        # Fetch the raw page, retrying up to two more times if the body looks empty
        html_context = htmlSource.get_html(url_p=url, type_p=type_p, charset_p=charset)
        index = 0
        while len(html_context) < 128 and index < 2:
            html_context = htmlSource.get_html(url_p=url, type_p=type_p, charset_p=charset)
            index += 1
        if len(html_context) < 128:
            raise Exception(1001, 'Page fetch failed: no content!')
        # Parse the page and extract the detail block
        tree = html.fromstring(html_context)
        result_list = tree.xpath(conf['group'])
        if result_list is not None and len(result_list) > 0:
            result_list_context = self._analysis_context(tree=result_list[0], columns=conf['columns'], url=url, row=row)
            return result_list_context
        return None
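Continuing the driver sketch from Example #1, a single detail page could then be fetched like this; the detail XPath is again a placeholder, and the column-rule format is assumed to match what _analysis_context expects.

# Hypothetical use of crawler_detail for one detail page; values are illustrative.
detail_conf = {
    'group': '//div[@class="article"]',  # assumed: the node wrapping the detail content
    'columns': [],                       # column rules in the format _analysis_context expects
}
detail = rule.crawler_detail(conf=detail_conf, url='https://example.com/news/1.html', type_p='rg')
print(detail)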
Example #5
# -*- coding: UTF-8 -*-
from common.HtmlSource import HtmlSource

htmlSource = HtmlSource()

url = 'https://www.meishij.net/zuofa/hubeixianroutangyuan.html'
# Fetch with the default mode ('rp')
html_source = htmlSource.get_html(url_p=url)
print(html_source)

# Fetch the same page again with the 'rg' mode
detail_html = htmlSource.get_html(url_p=url, type_p='rg')
print(detail_html)
Example #6
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from common.HtmlSource import HtmlSource
from common.Rule import Rule
# from common.inc_conn import Conn_mysql
from common.inc_file import File_file, File_floder
from common.inc_csv import Csv_base
import time

htmlSource = HtmlSource()
rule = Rule()
path = 'D:/newpro/6.1'


# Worker for multithreaded fetching
def read_detail(url, i):
    detail_html = htmlSource.get_html(url_p=url, type_p='rg')
    # print(detail_html)
    # Save the raw html to disk, named after the last path segment of the URL
    files = File_file()
    names = url.split('/')
    file_name = names[-1]

    files.save_source(path=path,
                      file=file_name,
                      all_the_text=detail_html,
                      encoding_='utf-8')
    colum = [
        ('title', '//h1[@class="articleHead"]//text()', 'l'),
        ('pushDate',
         '//div[@class="info"]//span[@class="infoA"][@id="pubtime_baidu"]//text()',
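As a minimal sketch of the multithreaded use the comment above refers to, read_detail could be fanned out over a standard thread pool; the URLs below are illustrative and not from the source.

# Sketch: run read_detail concurrently with a thread pool (illustrative URLs).
from concurrent.futures import ThreadPoolExecutor

urls = [
    'https://example.com/news/1.html',
    'https://example.com/news/2.html',
]
with ThreadPoolExecutor(max_workers=4) as pool:
    for i, u in enumerate(urls):
        pool.submit(read_detail, u, i)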
Example #7
    def openUrl(self):
        # toPlainText() returns the editor's raw text; one URL per line
        self.conf["urlList"] = self.textEdit.toPlainText().split("\n")
        self.textBrowser.setVisible(True)
        htmlSource = HtmlSource()
        html_context = htmlSource.get_html(url_p=self.conf["urlList"][0], type_p='rg')
        self.textBrowser.setText(html_context)