Example #1
    def crawler_list(self, url, conf, type_p='rp', charset='utf8', row=None):
        # Requires: from lxml import html; from urllib import parse;
        #           from common.HtmlSource import HtmlSource
        row = row if row is not None else {}  # avoid a shared mutable default argument
        htmlSource = HtmlSource()
        # Fetch the raw page, retrying up to two more times if the body looks empty
        html_context = htmlSource.get_html(url_p=url, type_p=type_p, charset_p=charset)
        index = 0
        while len(html_context) < 128 and index < 2:
            html_context = htmlSource.get_html(url_p=url, type_p=type_p, charset_p=charset)
            index += 1
        if len(html_context) < 128:
            raise Exception(1001, 'Page fetch failed: no content!')
        # Parse the page and extract the configured item group
        tree = html.fromstring(html_context)
        result_list = tree.xpath(conf['group'])
        result_list_context = self._analysis_list(list=result_list, columns=conf['columns'], url=url, row=row)
        # Resolve the next-page link, if the rule defines one
        if 'nextPage' in conf:
            next_page = tree.xpath(conf['nextPage'])
            if len(next_page) > 0:
                return result_list_context, parse.urljoin(url, next_page[0])
        return result_list_context, None
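For context, here is a minimal sketch of how a list rule might drive crawler_list across pages. The conf keys ('group', 'columns', 'nextPage') follow the method above, but the concrete XPath values and the Rule import path are assumptions for illustration, not taken from the source.

# Hypothetical driver for crawler_list; XPath values are placeholders.
from common.Rule import Rule

rule = Rule()
conf = {
    'group': '//ul[@class="news-list"]/li',   # assumed: one node per list item
    'columns': [],                            # fill in column rules in the format _analysis_list expects
    'nextPage': '//a[@class="next"]/@href',   # assumed pagination link
}
url = 'https://example.com/news'
while url:
    rows, url = rule.crawler_list(url=url, conf=conf, type_p='rg')
    print(rows)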
Example #2
    def openUrl(self):
        self.textBrowser.setVisible(True)
        self.tableView.setVisible(False)

        self.conf["url"] = self.urlLineEdit.text()
        # To render the page in the embedded browser instead, use:
        # self.webEngineView.setUrl(QtCore.QUrl(self.conf["url"]))
        htmlSource = HtmlSource()
        self.html_context = htmlSource.get_html(url_p=self.conf["url"], type_p='rg')
        self.textBrowser.setText(self.html_context)
        # Pre-fill the list-item and next-page XPath rules
        self.lineEdit.setText('//div[@class="padd w645"]/div[@class="list_left"]/div[@class="topic-list"]/ul/li')
        self.lineEdit_4.setText(
            '//div[@class="padd w645"]/div[@class="list_left"]/div[@class="show-page"]/a[@class="next"]/@href')
Example #3
    def crawler(self, nextPageUrl='', times=1):
        # Requires: from PyQt5.QtGui import QStandardItemModel, QStandardItem;
        #           from lxml import html; import time; from common.Rule import Rule
        if times > 5:  # recursion guard: follow at most five pages
            return
        times = times + 1
        # Table header labels taken from the column rules
        title = []
        for column in self.conf['columns']:
            title.append(column['名称'])
        # TODO: render the table header
        print(title)

        self.textBrowser.setVisible(True)
        self.tableView.setVisible(False)

        htmlSource = HtmlSource()
        html_context = htmlSource.get_html(url_p=nextPageUrl, type_p='rg')
        self.textBrowser.setText(html_context)
        time.sleep(1)  # note: sleeping blocks the GUI thread; acceptable for a demo only
        tree = html.fromstring(html_context)
        result_list = tree.xpath(self.conf['body_content_xpath'])
        if len(result_list) > 0:
            # Extract the table rows with the parsing rules
            rule = Rule()
            rows = rule._analysis_list(list=result_list, columns=self.conf["columns"])
            # TODO: persist the data; for now just print it
            print(rows)

            # Build the model: one row per result, one column per rule
            model = QStandardItemModel(len(rows), len(self.conf['columns']))
            # Set the horizontal header labels
            model.setHorizontalHeaderLabels(title)

            for row in range(len(rows)):
                for column in range(len(title)):
                    item = QStandardItem(rows[row][title[column]])
                    # Set the text value at each cell
                    model.setItem(row, column, item)
            # Attach the model to the table view and show it
            self.tableView.setModel(model)
            self.textBrowser.setVisible(False)
            self.tableView.setVisible(True)
            time.sleep(1)
        # Next-page URL
        nextpageurl = tree.xpath(self.conf['nextpage'])
        # Recurse into the next page
        if len(nextpageurl) > 0:
            print(nextpageurl[0])
            self.crawler(nextPageUrl=nextpageurl[0], times=times)
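One caveat with the recursive crawler above: time.sleep blocks the Qt event loop, so the window freezes between pages. A minimal sketch of the usual alternative, assuming the same class, is to let the event loop schedule the next page instead (QTimer.singleShot is standard PyQt5 API; crawl_next is a hypothetical helper, not in the original):

from PyQt5.QtCore import QTimer

# Inside the same class: defer the next page by one second without
# blocking the event loop.
def crawl_next(self, nextpageurl, times):
    QTimer.singleShot(1000, lambda: self.crawler(nextPageUrl=nextpageurl, times=times))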
Example #4
    def crawler_detail(self, conf, url='', type_p='rp', charset='utf8', row=None):
        # Requires: from lxml import html; from common.HtmlSource import HtmlSource
        row = row if row is not None else {}  # avoid a shared mutable default argument
        htmlSource = HtmlSource()
        # Fetch the raw page, retrying up to two more times if the body looks empty
        html_context = htmlSource.get_html(url_p=url, type_p=type_p, charset_p=charset)
        index = 0
        while len(html_context) < 128 and index < 2:
            html_context = htmlSource.get_html(url_p=url, type_p=type_p, charset_p=charset)
            index += 1
        if len(html_context) < 128:
            raise Exception(1001, 'Page fetch failed: no content!')
        # Parse the page and extract the detail block
        tree = html.fromstring(html_context)
        result_list = tree.xpath(conf['group'])
        if result_list is not None and len(result_list) > 0:
            result_list_context = self._analysis_context(tree=result_list[0], columns=conf['columns'], url=url, row=row)
            return result_list_context
        return None
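Continuing the driver sketch from Example #1, a single detail page could then be fetched like this; the detail XPath is again a placeholder, and the column-rule format is assumed to match what _analysis_context expects.

# Hypothetical use of crawler_detail for one detail page; values are illustrative.
detail_conf = {
    'group': '//div[@class="article"]',  # assumed: the node wrapping the detail content
    'columns': [],                       # column rules in the format _analysis_context expects
}
detail = rule.crawler_detail(conf=detail_conf, url='https://example.com/news/1.html', type_p='rg')
print(detail)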
Example #5
# -*- coding: UTF-8 -*-
from common.HtmlSource import HtmlSource

htmlSource = HtmlSource()

url = 'https://www.meishij.net/zuofa/hubeixianroutangyuan.html'
# Fetch with the default mode ('rp')
html_source = htmlSource.get_html(url_p=url)
print(html_source)

# Fetch the same page again with the 'rg' mode
detail_html = htmlSource.get_html(url_p=url, type_p='rg')
print(detail_html)
Example #6
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from common.HtmlSource import HtmlSource
from common.Rule import Rule
# from common.inc_conn import Conn_mysql
from common.inc_file import File_file, File_floder
from common.inc_csv import Csv_base
import time

htmlSource = HtmlSource()
rule = Rule()
path = 'D:/newpro/6.1'


# Worker for multithreaded fetching
def read_detail(url, i):
    detail_html = htmlSource.get_html(url_p=url, type_p='rg')
    # print(detail_html)
    # Save the raw html to disk, named after the last path segment of the URL
    files = File_file()
    names = url.split('/')
    file_name = names[-1]

    files.save_source(path=path,
                      file=file_name,
                      all_the_text=detail_html,
                      encoding_='utf-8')
    colum = [
        ('title', '//h1[@class="articleHead"]//text()', 'l'),
        ('pushDate',
         '//div[@class="info"]//span[@class="infoA"][@id="pubtime_baidu"]//text()',
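As a minimal sketch of the multithreaded use the comment above refers to, read_detail could be fanned out over a standard thread pool; the URLs below are illustrative and not from the source.

# Sketch: run read_detail concurrently with a thread pool (illustrative URLs).
from concurrent.futures import ThreadPoolExecutor

urls = [
    'https://example.com/news/1.html',
    'https://example.com/news/2.html',
]
with ThreadPoolExecutor(max_workers=4) as pool:
    for i, u in enumerate(urls):
        pool.submit(read_detail, u, i)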
Example #7
    def openUrl(self):
        # toPlainText() returns the editor's raw text; one URL per line
        self.conf["urlList"] = self.textEdit.toPlainText().split("\n")
        self.textBrowser.setVisible(True)
        htmlSource = HtmlSource()
        html_context = htmlSource.get_html(url_p=self.conf["urlList"][0], type_p='rg')
        self.textBrowser.setText(html_context)